Skip to content

Commit

Permalink
coordinates for all title elements
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Nov 11, 2023
1 parent 0d2d5ba commit f30fb65
Show file tree
Hide file tree
Showing 6 changed files with 133 additions and 14 deletions.
2 changes: 1 addition & 1 deletion doc/Coordinates-in-PDF.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Since April 2017, GROBID version 0.4.2 and higher, coordinate areas can be obtai
* ```head``` for section titles,
* ```s``` for optional sentence structure (the GROBID fulltext service must be called with the `segmentSentences` parameter to provide the optional sentence-level elements),
* ```note``` for foot note elements,
* ```title``` for the title elements.
* ```title``` for the title elements (main article title and cited reference titles).

However, there is normally no particular limitation on the type of structures which can have their coordinates in the results. The implementation is ongoing (see [issue #69](https://github.com/kermitt2/grobid/issues/69)), and it is expected that more or less any structure could be associated with its coordinates in the original PDF.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,14 @@ public List<LayoutToken> retokenizeSubdigitsFromLayoutToken(List<LayoutToken> to
LayoutToken layoutToken = new LayoutToken();
layoutToken.setText(subtokens[i]);
layoutToken.setOffset(offset);

// coordinates - TODO: refine the width/X for the sub token
layoutToken.setX(token.getX());
layoutToken.setY(token.getY());
layoutToken.setHeight(token.getHeight());
layoutToken.setWidth(token.getWidth());
layoutToken.setPage(token.getPage());

offset += subtokens[i].length();
result.add(layoutToken);
}
Expand Down
113 changes: 105 additions & 8 deletions grobid-core/src/main/java/org/grobid/core/data/BiblioItem.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import org.grobid.core.utilities.KeyGen;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.GrobidModels;
import org.grobid.core.engines.label.TaggingLabel;
import org.grobid.core.engines.label.TaggingLabels;

import java.net.URLEncoder;
import java.util.*;
Expand Down Expand Up @@ -2251,12 +2253,39 @@ public String toTEI(int n, int indent, GrobidAnalysisConfig config) {
tei.append("<title");
if ((bookTitle == null) && (journal == null) && (serieTitle == null)) {
tei.append(" level=\"m\" type=\"main\"");
} else
if (config.isGenerateTeiCoordinates("title")) {
// monograph-level title: use the book title tokens if available, otherwise the title tokens
List<LayoutToken> titleTokens = getLayoutTokens(TaggingLabels.CITATION_BOOKTITLE);
if (titleTokens == null || titleTokens.size()==0) {
titleTokens = getLayoutTokens(TaggingLabels.CITATION_TITLE);
}

if (titleTokens != null && titleTokens.size()>0) {
String coords = LayoutTokensUtil.getCoordsString(titleTokens);
if (coords != null && coords.length()>0) {
tei.append(" coord=\"" + coords + "\"");
}
}
}
} else {
tei.append(" level=\"a\" type=\"main\"");

if (config.isGenerateTeiCoordinates("title")) {
// title for articles or chapters
List<LayoutToken> titleTokens = getLayoutTokens(TaggingLabels.CITATION_TITLE);
if (titleTokens != null && titleTokens.size()>0) {
String coords = LayoutTokensUtil.getCoordsString(titleTokens);
if (coords != null && coords.length()>0) {
tei.append(" coord=\"" + coords + "\"");
}
}
}
}
if (generateIDs) {
String divID = KeyGen.getKey().substring(0,7);
tei.append(" xml:id=\"_" + divID + "\"");
}

// here check the language ?
if (StringUtils.isEmpty(english_title)) {
tei.append(">").append(TextUtilities.HTMLEncode(title)).append("</title>\n");
Expand Down Expand Up @@ -2292,6 +2321,7 @@ else if (bookTitle == null) {
String divID = KeyGen.getKey().substring(0,7);
tei.append(" xml:id=\"_" + divID + "\"");
}

tei.append(" xml:lang=\"en\">")
.append(TextUtilities.HTMLEncode(english_title)).append("</title>\n");
}
Expand Down Expand Up @@ -2391,6 +2421,16 @@ else if (bookTitle == null) {
String divID = KeyGen.getKey().substring(0,7);
tei.append(" xml:id=\"_" + divID + "\"");
}
if (config.isGenerateTeiCoordinates("title")) {
List<LayoutToken> titleTokens = getLayoutTokens(TaggingLabels.CITATION_BOOKTITLE);
if (titleTokens != null && titleTokens.size()>0) {
String coords = LayoutTokensUtil.getCoordsString(titleTokens);
if (coords != null && coords.length()>0) {
tei.append(" coord=\"" + coords + "\"");
}
}
}

tei.append(">" + TextUtilities.HTMLEncode(bookTitle) + "</title>\n");

if (!StringUtils.isEmpty(serieTitle)) {
Expand All @@ -2403,6 +2443,18 @@ else if (bookTitle == null) {
String divID = KeyGen.getKey().substring(0,7);
tei.append(" xml:id=\"_" + divID + "\"");
}

if (config.isGenerateTeiCoordinates("title")) {
// series title
List<LayoutToken> titleTokens = getLayoutTokens(TaggingLabels.CITATION_SERIES);
if (titleTokens != null && titleTokens.size()>0) {
String coords = LayoutTokensUtil.getCoordsString(titleTokens);
if (coords != null && coords.length()>0) {
tei.append(" coord=\"" + coords + "\"");
}
}
}

tei.append(">" + TextUtilities.HTMLEncode(serieTitle) + "</title>\n");
}

Expand Down Expand Up @@ -2623,6 +2675,18 @@ else if (this.getYear().length() == 4)
String divID = KeyGen.getKey().substring(0,7);
tei.append(" xml:id=\"_" + divID + "\"");
}

if (config.isGenerateTeiCoordinates("title")) {
// journal title
List<LayoutToken> titleTokens = getLayoutTokens(TaggingLabels.CITATION_JOURNAL);
if (titleTokens != null && titleTokens.size()>0) {
String coords = LayoutTokensUtil.getCoordsString(titleTokens);
if (coords != null && coords.length()>0) {
tei.append(" coord=\"" + coords + "\"");
}
}
}

tei.append(">" + TextUtilities.HTMLEncode(journal) + "</title>\n");

if (!StringUtils.isEmpty(getJournalAbbrev())) {
Expand All @@ -2638,6 +2702,18 @@ else if (this.getYear().length() == 4)
String divID = KeyGen.getKey().substring(0,7);
tei.append(" xml:id=\"_" + divID + "\"");
}

if (config.isGenerateTeiCoordinates("title")) {
// series title
List<LayoutToken> titleTokens = getLayoutTokens(TaggingLabels.CITATION_SERIES);
if (titleTokens != null && titleTokens.size()>0) {
String coords = LayoutTokensUtil.getCoordsString(titleTokens);
if (coords != null && coords.length()>0) {
tei.append(" coord=\"" + coords + "\"");
}
}
}

tei.append(">" + TextUtilities.HTMLEncode(serieTitle) + "</title>\n");
}

Expand Down Expand Up @@ -4281,25 +4357,25 @@ public void setLabeledTokens(Map<String, List<LayoutToken>> labeledTokens) {
this.labeledTokens = labeledTokens;
}

public List<LayoutToken> getLayoutTokens(TaggingLabel headerLabel) {
public List<LayoutToken> getLayoutTokens(TaggingLabel biblioLabel) {
if (labeledTokens == null) {
LOGGER.debug("labeledTokens is null");
return null;
}
if (headerLabel.getLabel() == null) {
LOGGER.debug("headerLabel.getLabel() is null");
if (biblioLabel.getLabel() == null) {
LOGGER.debug("biblioLabel.getLabel() is null");
return null;
}
return labeledTokens.get(headerLabel.getLabel());
return labeledTokens.get(biblioLabel.getLabel());
}

public void setLayoutTokensForLabel(List<LayoutToken> tokens, TaggingLabel headerLabel) {
public void setLayoutTokensForLabel(List<LayoutToken> tokens, TaggingLabel biblioLabel) {
if (labeledTokens == null)
labeledTokens = new TreeMap<>();
labeledTokens.put(headerLabel.getLabel(), tokens);
labeledTokens.put(biblioLabel.getLabel(), tokens);
}

public void generalResultMapping(String labeledResult, List<LayoutToken> tokenizations) {
public void generalResultMappingHeader(String labeledResult, List<LayoutToken> tokenizations) {
if (labeledTokens == null)
labeledTokens = new TreeMap<>();

Expand All @@ -4320,6 +4396,27 @@ public void generalResultMapping(String labeledResult, List<LayoutToken> tokeniz
}
}

/**
 * Groups the layout tokens of a labeled citation result by label and stores
 * them in {@code labeledTokens}.
 *
 * The labeled result is clustered with the {@code GrobidModels.CITATION} model.
 * For every non-null cluster, its concatenated tokens are appended to the list
 * already registered under the cluster's label, or to a new list when none is
 * registered yet. {@code labeledTokens} is created on first use.
 *
 * @param labeledResult the labeled sequence passed to the token clusteror
 * @param tokenizations the layout tokens passed to the token clusteror
 */
public void generalResultMappingReference(String labeledResult, List<LayoutToken> tokenizations) {
    if (labeledTokens == null) {
        labeledTokens = new TreeMap<>();
    }

    List<TaggingTokenCluster> citationClusters =
        new TaggingTokenClusteror(GrobidModels.CITATION, labeledResult, tokenizations).cluster();

    for (TaggingTokenCluster citationCluster : citationClusters) {
        if (citationCluster != null) {
            TaggingLabel label = citationCluster.getTaggingLabel();
            List<LayoutToken> tokensOfCluster = citationCluster.concatTokens();

            // accumulate: several clusters may carry the same label
            String labelKey = label.getLabel();
            List<LayoutToken> accumulated = labeledTokens.get(labelKey);
            if (accumulated == null) {
                accumulated = new ArrayList<>();
            }
            accumulated.addAll(tokensOfCluster);
            labeledTokens.put(labelKey, accumulated);
        }
    }
}

public List<LayoutToken> getAuthorsTokensWorkingCopy() {
return authorsTokensWorkingCopy;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -308,10 +308,10 @@ public List<BibDataSet> processingReferenceSection(Document doc, ReferenceSegmen
cntManager.i(CitationParserCounters.SEGMENTED_REFERENCES, references.size());
}

// consolidation: if selected, is not done individually for each citation but
// consolidation: if selected, it is NOT done individually for each citation but
// in a second stage for all citations
if (references != null) {
List<String> refTexts = new ArrayList<>();
/*List<String> refTexts = new ArrayList<>();
for (LabeledReferenceResult ref : references) {
// paranoiac check
if (ref == null)
Expand All @@ -321,8 +321,20 @@ public List<BibDataSet> processingReferenceSection(Document doc, ReferenceSegmen
localRef = TextUtilities.removeLeadingAndTrailingChars(localRef, "[({.,])}: \n"," \n");
refTexts.add(localRef);
}
List<BiblioItem> bibList = processingStringMultiple(refTexts, 0);*/

List<List<LayoutToken>> allRefBlocks = new ArrayList<>();
for (LabeledReferenceResult ref : references) {
// paranoiac check
if (ref == null)
continue;

List<LayoutToken> localTokens = ref.getTokens();
localTokens = TextUtilities.removeLeadingAndTrailingCharsLayoutTokens(localTokens, "[({.,])}: \n"," \n");
allRefBlocks.add(localTokens);
}
List<BiblioItem> bibList = processingLayoutTokenMultiple(allRefBlocks, 0);

List<BiblioItem> bibList = processingStringMultiple(refTexts, 0);
if (bibList != null && bibList.size()>0) {
int i = 0;
for (LabeledReferenceResult ref : references) {
Expand Down Expand Up @@ -470,6 +482,7 @@ public BiblioItem resultExtractionLayoutTokens(String result,

TaggingLabel lastClusterLabel = null;
TaggingTokenClusteror clusteror = new TaggingTokenClusteror(GrobidModels.CITATION, result, tokenizations);
biblio.generalResultMappingReference(result, tokenizations);

String tokenLabel = null;
List<TaggingTokenCluster> clusters = clusteror.cluster();
Expand Down Expand Up @@ -574,7 +587,7 @@ else if (biblio.getJournal().length() >= clusterContent.length())
String clusterNonDehypenizedContent = LayoutTokensUtil.toText(cluster.concatTokens());
biblio.setWeb(clusterNonDehypenizedContent);
}
}
}

return biblio;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -796,7 +796,7 @@ public BiblioItem resultExtraction(String result, List<LayoutToken> tokenization

List<TaggingTokenCluster> clusters = clusteror.cluster();

biblio.generalResultMapping(result, tokenizations);
biblio.generalResultMappingHeader(result, tokenizations);
for (TaggingTokenCluster cluster : clusters) {
if (cluster == null) {
continue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ public void onSuccess(List<BiblioItem> res) {
for(BiblioItem oneRes : res) {
if ((GrobidProperties.getInstance().getConsolidationService() == GrobidConsolidationService.GLUTTON) ||
postValidation(theBiblio, oneRes)) {
oneRes.setLabeledTokens(theBiblio.getLabeledTokens());
results.put(Integer.valueOf(getRank()), oneRes);
if (cntManager != null) {
cntManager.i(ConsolidationCounters.CONSOLIDATION_SUCCESS);
Expand Down

0 comments on commit f30fb65

Please sign in to comment.