Skip to content

Commit

Permalink
review labels, various minor updates
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Jul 29, 2023
1 parent 353aff5 commit f4dcebf
Show file tree
Hide file tree
Showing 12 changed files with 91 additions and 74 deletions.
20 changes: 20 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Funding.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ public class Funding {
private String grantNumber = null;
private List<LayoutToken> grantNumberLayoutTokens = new ArrayList<>();

private String grantName = null;
private List<LayoutToken> grantNameLayoutTokens = new ArrayList<>();

private String projectFullName = null;
private List<LayoutToken> projectFullNameLayoutTokens = new ArrayList<>();

Expand Down Expand Up @@ -90,6 +93,22 @@ public List<LayoutToken> getGrantNumberLayoutTokens() {
return this.grantNumberLayoutTokens;
}

/**
 * Returns the grant name of this funding.
 *
 * @return the current value of the grantName field, null if it was never set
 */
public String getGrantName() {
return this.grantName;
}

/**
 * Sets the grant name of this funding, replacing any previous value.
 *
 * @param grantName the grant name to store
 */
public void setGrantName(String grantName) {
this.grantName = grantName;
}

/**
 * Appends all the given layout tokens to the list of layout tokens
 * kept for the grant name.
 *
 * @param layoutTokens the layout tokens to add at the end of the list
 */
public void appendGrantNameLayoutTokens(List<LayoutToken> layoutTokens) {
this.grantNameLayoutTokens.addAll(layoutTokens);
}

/**
 * Returns the layout tokens accumulated for the grant name.
 * The internal list itself is returned, not a copy.
 *
 * @return the list of layout tokens of the grant name
 */
public List<LayoutToken> getGrantNameLayoutTokens() {
return this.grantNameLayoutTokens;
}

/**
 * Returns the raw grant number, i.e. the text produced by
 * LayoutTokensUtil.toText from the grant number layout tokens.
 *
 * @return the text of the grant number layout tokens
 */
public String getRawGrantNumber() {
return LayoutTokensUtil.toText(this.grantNumberLayoutTokens);
}
Expand Down Expand Up @@ -157,6 +176,7 @@ public void addLayoutTokens(List<LayoutToken> layoutTokens) {
public boolean isValid() {
if (funder != null ||
grantNumber != null ||
grantName != null ||
projectFullName != null ||
projectAbbreviatedName != null ||
programFullName != null ||
Expand Down
57 changes: 0 additions & 57 deletions grobid-core/src/main/java/org/grobid/core/engines/DateParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -96,63 +96,6 @@ protected List<Date> processCommon(List<String> input) {
throw new GrobidException("An exception on " + this.getClass().getName() + " occured while running Grobid.", e);
}
}

/**
 * Legacy extraction of dates from a raw labeled result. The result is read line by line
 * (lines separated by {@code \n}); on each non-blank line the last column, split on tabs
 * and spaces, is taken as the label. A blank line closes the current date sequence.
 * <p>
 * NOTE(review): the label branches below have empty bodies, so nothing in this method
 * ever fills the current {@code Date}. Whether a date gets collected therefore depends
 * only on what {@code Date.isNotNull()} returns for a fresh instance - to be confirmed
 * before relying on this method.
 *
 * @param result the labeled result to parse
 * @return the normalized dates, or {@code null} when no date has been collected
 */
public List<Date> resultExtractionOld(String result) {
    List<Date> dates = null;
    Date date = new Date();

    StringTokenizer lines = new StringTokenizer(result, "\n");
    while (lines.hasMoreTokens()) {
        String line = lines.nextToken();
        if (line.trim().isEmpty()) {
            // a blank line marks the end of the current date sequence
            if (date.isNotNull()) {
                if (dates == null) {
                    dates = new ArrayList<>();
                }
                dates.add(normalizeAndClean(date));
            }
            date = new Date();
            continue;
        }

        // the label is the last column, and only when the line has more than one column
        StringTokenizer columns = new StringTokenizer(line, "\t ");
        int lastColumn = columns.countTokens() - 1;
        String label = null;
        for (int i = 0; columns.hasMoreTokens(); i++) {
            String column = columns.nextToken().trim();
            if (i > 0 && i == lastColumn) {
                label = column;
            }
        }

        if ("<year>".equals(label) || "I-<year>".equals(label)) {
            // nothing done for year tokens
        } else if ("<month>".equals(label) || "I-<month>".equals(label)) {
            // nothing done for month tokens
        } else if ("<day>".equals(label) || "I-<day>".equals(label)) {
            // nothing done for day tokens
        }
    }

    // flush the last date, the result does not need to end with a blank line
    if (date.isNotNull()) {
        if (dates == null) {
            dates = new ArrayList<>();
        }
        dates.add(normalizeAndClean(date));
    }

    return dates;
}

public List<Date> resultExtraction(String result, List<LayoutToken> tokenizations) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public Figure processing(List<LayoutToken> tokenizationFigure, String featureVec
//System.out.println("---------------------res-----------------------");
//System.out.println(res);
} catch (Exception e) {
throw new GrobidException("CRF labeling with figure model fails.", e);
throw new GrobidException("Sequence labeling with figure model fails.", e);
}
if (res == null) {
return null;
Expand Down Expand Up @@ -94,7 +94,7 @@ public Pair<String, String> createTrainingData(List<LayoutToken> tokenizations,
try {
res = label(featureVector);
} catch (Exception e) {
LOGGER.error("CRF labeling in FigureParser fails.", e);
LOGGER.error("Sequence labeling in FigureParser fails.", e);
}
if (res == null) {
return Pair.of(null, featureVector);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1943,7 +1943,7 @@ protected List<Figure> processFigures(String rese, List<LayoutToken> layoutToken

/**
* Create training data for the figures as identified by the full text model.
* Return the pair (TEI fragment, CRF raw data).
* Return the pair (TEI fragment, sequence labeling raw data).
*/
protected Pair<String,String> processTrainingDataFigures(String rese,
List<LayoutToken> tokenizations, String id) {
Expand Down Expand Up @@ -1992,7 +1992,7 @@ protected Pair<String,String> processTrainingDataFigures(String rese,
openFigure = true;
tokenizationsFigure.addAll(tokenizationsBuffer);
}
// we remove the label in the CRF row
// we remove the label in the sequence labeling row
int ind = row.lastIndexOf("\t");
figureBlock.append(row, 0, ind).append("\n");
} else if (label.equals("I-<figure>") || openFigure) {
Expand Down Expand Up @@ -2121,7 +2121,7 @@ protected List<Table> processTables(String rese,

/**
* Create training data for the table as identified by the full text model.
* Return the pair (TEI fragment, CRF raw data).
* Return the pair (TEI fragment, sequence labeling raw data).
*/
protected Pair<String,String> processTrainingDataTables(String rese,
List<LayoutToken> tokenizations, String id) {
Expand Down Expand Up @@ -2170,7 +2170,7 @@ protected Pair<String,String> processTrainingDataTables(String rese,
if (!openTable) {
openTable = true;
tokenizationsTable.addAll(tokenizationsBuffer); }
// we remove the label in the CRF row
// we remove the label in the sequence labeling row
int ind = row.lastIndexOf("\t");
tableBlock.append(row.substring(0, ind)).append("\n");
} else if (label.equals("I-<table>") || openTable) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public class ModelMap {
private static Map<String, Model> models = null;

/**
* Return a CRFPP tagger created corresponding to the model given in argument.
* Return a model tagger created corresponding to the model given in argument.
*
* @param grobidModel
* the model to use for the creation of the tagger.
Expand All @@ -53,7 +53,7 @@ public static Tagger getTagger(GrobidModel grobidModel) {
}

/**
* Loading of the CRFPP models.
* Loading of the models.
*/
@Deprecated
public static synchronized void initModels() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ private String getFeatureVectorsAsString(Document doc, Map<String, Integer> patt
}
}

// we consider the first token of the line as usual lexical CRF token
// we consider the first token of the line as usual lexical token
// and the second token of the line as feature
StringTokenizer st2 = new StringTokenizer(line, " \t");
// alternatively, use a grobid analyser
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ public Pair<String,String> createTrainingData(Document doc, int id) {
res = label(featureVector);
}
catch(Exception e) {
throw new GrobidException("CRF labeling in ReferenceSegmenter fails.", e);
throw new GrobidException("Sequence labeling in ReferenceSegmenter fails.", e);
}
if (res == null) {
return null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ private String getFeatureVectorsAsString(Document doc, Map<String, Integer> patt
}
}

// we consider the first token of the line as usual lexical CRF token
// we consider the first token of the line as usual lexical token
// and the second token of the line as feature
StringTokenizer st2 = new StringTokenizer(line, " \t\f\u00A0");
// alternatively, use a grobid analyser
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public List<Table> processing(List<LayoutToken> tokenizationTable, String featur
try {
res = label(featureVector);
} catch (Exception e) {
throw new GrobidException("CRF labeling with table model fails.", e);
throw new GrobidException("Sequence labeling with table model fails.", e);
}

if (res == null) {
Expand Down Expand Up @@ -131,7 +131,7 @@ public Pair<String, String> createTrainingData(List<LayoutToken> tokenizations,
try {
res = label(featureVector);
} catch (Exception e) {
LOGGER.error("CRF labeling in TableParser fails.", e);
LOGGER.error("Sequence labeling in TableParser fails.", e);
}
if (res == null) {
return Pair.of(null, featureVector);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,15 @@ public class TaggingLabels {
public final static String PATENT_CITATION_PL_LABEL = "<refPatent>";
public final static String PATENT_CITATION_NPL_LABEL = "<refNPL>";

// raw label strings for funding information (funder, program, grant, project, url)
public final static String FUNDER_FULL_NAME_LABEL = "<funderFull>";
public final static String FUNDER_ABBRV_NAME_LABEL = "<funderAbbrv>";
public final static String PROGRAM_FULL_NAME_LABEL = "<programFull>";
public final static String PROGRAM_ABBRV_NAME_LABEL = "<programAbbrv>";
public final static String GRANT_NUMBER_LABEL = "<grantNumber>";
public final static String GRANT_NAME_LABEL = "<grantName>";
public final static String PROJECT_FULL_NAME_LABEL = "<projectFull>";
public final static String PROJECT_ABBRV_NAME_LABEL = "<projectAbbrv>";
public final static String URL_LABEL = "<url>";

/* title page (secondary title page)
* publisher page (publication information, including usually the copyrights info)
Expand Down Expand Up @@ -251,6 +260,17 @@ public class TaggingLabels {
public static final TaggingLabel MONOGRAPH_BACK = new TaggingLabelImpl(GrobidModels.MONOGRAPH, BACK_LABEL);
public static final TaggingLabel MONOGRAPH_OTHER = new TaggingLabelImpl(GrobidModels.MONOGRAPH, OTHER_LABEL);

// tagging labels of the funding model (GrobidModels.FUNDING), one per label string
public static final TaggingLabel FUNDING_FUNDER_FULL_NAME = new TaggingLabelImpl(GrobidModels.FUNDING, FUNDER_FULL_NAME_LABEL);
public static final TaggingLabel FUNDING_FUNDER_ABBRV_NAME = new TaggingLabelImpl(GrobidModels.FUNDING, FUNDER_ABBRV_NAME_LABEL);
public static final TaggingLabel FUNDING_PROGRAM_FULL_NAME = new TaggingLabelImpl(GrobidModels.FUNDING, PROGRAM_FULL_NAME_LABEL);
public static final TaggingLabel FUNDING_PROGRAM_ABBRV_NAME = new TaggingLabelImpl(GrobidModels.FUNDING, PROGRAM_ABBRV_NAME_LABEL);
public static final TaggingLabel FUNDING_GRANT_NUMBER = new TaggingLabelImpl(GrobidModels.FUNDING, GRANT_NUMBER_LABEL);
public static final TaggingLabel FUNDING_GRANT_NAME = new TaggingLabelImpl(GrobidModels.FUNDING, GRANT_NAME_LABEL);
public static final TaggingLabel FUNDING_PROJECT_FULL_NAME = new TaggingLabelImpl(GrobidModels.FUNDING, PROJECT_FULL_NAME_LABEL);
public static final TaggingLabel FUNDING_PROJECT_ABBRV_NAME = new TaggingLabelImpl(GrobidModels.FUNDING, PROJECT_ABBRV_NAME_LABEL);
public static final TaggingLabel FUNDING_URL = new TaggingLabelImpl(GrobidModels.FUNDING, URL_LABEL);
public static final TaggingLabel FUNDING_OTHER = new TaggingLabelImpl(GrobidModels.FUNDING, OTHER_LABEL);

/**
 * Registers the given tagging label in the cache, under a key made of its
 * model and its label string. The call goes through putIfAbsent, so a label
 * already cached for the same key is presumably left in place.
 *
 * @param label the tagging label to register
 */
protected static void register(TaggingLabel label) {
cache.putIfAbsent(new Pair<>(label.getGrobidModel(), label.getLabel()), label);
}
Expand Down Expand Up @@ -383,6 +403,18 @@ protected static void register(TaggingLabel label) {
register(MONOGRAPH_GLOSSARY);
register(MONOGRAPH_BACK);
register(MONOGRAPH_OTHER);

// funding
register(FUNDING_FUNDER_FULL_NAME);
register(FUNDING_FUNDER_ABBRV_NAME);
register(FUNDING_PROGRAM_FULL_NAME);
register(FUNDING_PROGRAM_ABBRV_NAME);
register(FUNDING_GRANT_NUMBER);
register(FUNDING_GRANT_NAME);
register(FUNDING_PROJECT_FULL_NAME);
register(FUNDING_PROJECT_ABBRV_NAME);
register(FUNDING_URL);
register(FUNDING_OTHER);
}

protected TaggingLabels() {
Expand Down
24 changes: 24 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ public class Lexicon {

private FastMatcher orgFormPattern = null;
private FastMatcher collaborationPattern = null;
private FastMatcher funderPattern = null;

private FastMatcher personTitlePattern = null;
private FastMatcher personSuffixPattern = null;
Expand Down Expand Up @@ -477,6 +478,17 @@ public void initPersonSuffix() {
}
}

/**
 * Builds the funder name matcher from the lexicon file
 * {@code lexicon/organisations/funders.txt} located under the Grobid home path
 * and stores it in {@code funderPattern}.
 *
 * @throws GrobidResourceException when a PatternSyntaxException is raised while building the matcher
 * @throws GrobidException when any other exception is raised
 */
public void initFunders() {
    try {
        final String funderLexiconPath =
            GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/funders.txt";
        final File funderLexiconFile = new File(funderLexiconPath);
        funderPattern = new FastMatcher(funderLexiconFile);
    } catch (PatternSyntaxException e) {
        throw new GrobidResourceException("Error when compiling lexicon matcher for funders.", e);
    } catch (Exception e) {
        throw new GrobidException("An exception occured while running Grobid Lexicon init.", e);
    }
}

/**
* Look-up in first name gazetteer
*/
Expand Down Expand Up @@ -654,6 +666,18 @@ public List<OffsetPosition> tokenPositionsCollaborationNames(List<LayoutToken> s
return results;
}

/**
 * Look-up in the funder name gazetteer (described as case sensitive) for a given
 * list of LayoutToken objects. The funder matcher is loaded on first use.
 *
 * @param s the layout tokens in which funder names are searched
 * @return the token positions of the matches found by the funder matcher
 */
public List<OffsetPosition> tokenPositionsFunderNames(List<LayoutToken> s) {
    // lazy loading of the funder lexicon
    if (funderPattern == null) {
        initFunders();
    }
    return funderPattern.matchLayoutToken(s, true, true);
}

/**
* Soft look-up in city name gazetteer for a given string with token positions
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@
import java.util.List;

/**
* SAX parser for author sequences encoded in the TEI format data.
* Segmentation of tokens must be identical as the one from pdf2xml files to that
* training and online input tokens are identical.
* SAX parser for date sequences encoded in the TEI format data.
*
* @author Patrice Lopez
*/
Expand Down Expand Up @@ -44,7 +42,7 @@ public ArrayList<String> getLabeledResult() {
public void endElement(java.lang.String uri,
java.lang.String localName,
java.lang.String qName) throws SAXException {
if (( (qName.equals("year")) | (qName.equals("month")) | (qName.equals("day"))) & (currentTag != null)) {
if (( (qName.equals("year")) || (qName.equals("month")) || (qName.equals("day"))) & (currentTag != null)) {
String text = getText();
writeField(text);
}
Expand Down

0 comments on commit f4dcebf

Please sign in to comment.