Skip to content

Commit

Permalink
review TEI serialization
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Aug 12, 2023
1 parent 3e501fd commit e799571
Show file tree
Hide file tree
Showing 6 changed files with 187 additions and 25 deletions.
27 changes: 27 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Funder.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,16 @@ public class Funder {
private String url = null;

private List<LayoutToken> layoutTokens = new ArrayList<>();

/**
 * Shared placeholder funder whose full name is "unknown".
 * Declared final so the shared reference cannot be reassigned.
 * NOTE(review): the instance itself may still be modifiable through the class's
 * mutators (not visible here) - confirm before treating it as a constant value.
 */
public static final Funder EMPTY = new Funder("unknown");

/**
 * Creates a funder without assigning any field; every field keeps the value
 * given by its declaration.
 */
public Funder() {
}

/**
 * Creates a funder and stores the given full name.
 *
 * @param fullName value assigned to this funder's full name (stored as is, no validation)
 */
public Funder(String fullName) {
this.fullName = fullName;
}

/**
 * Returns the full name currently stored for this funder.
 *
 * @return the stored full name, exactly as it was assigned
 */
public String getFullName() {
    return fullName;
}
Expand Down Expand Up @@ -179,4 +188,22 @@ public String toJson() {
json.append("\n}");
return json.toString();
}

/**
 * Serializes this funder as a TEI {@code <funder>} fragment.
 * <p>
 * The fragment always contains the opening and closing {@code <funder>} tags.
 * Each of the full name, the abbreviated name and the DOI contributes one
 * tab-indented child line only when it is non-null; text values go through
 * {@code TextUtilities.HTMLEncode} before being written.
 *
 * @return the TEI fragment, terminated by a newline
 */
public String toTEI() {
    final StringBuilder xml = new StringBuilder("<funder>\n");

    if (fullName != null) {
        xml.append("\t<orgName type=\"full\">")
           .append(TextUtilities.HTMLEncode(fullName))
           .append("</orgName>\n");
    }

    if (abbreviatedName != null) {
        xml.append("\t<orgName type=\"abbreviated\">")
           .append(TextUtilities.HTMLEncode(abbreviatedName))
           .append("</orgName>\n");
    }

    if (doi != null) {
        xml.append("\t<idno type=\"DOI\" subtype=\"crossref\">")
           .append(TextUtilities.HTMLEncode(doi))
           .append("</idno>\n");
    }

    return xml.append("</funder>\n").toString();
}
}
50 changes: 50 additions & 0 deletions grobid-core/src/main/java/org/grobid/core/data/Funding.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import org.grobid.core.utilities.TextUtilities;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.LayoutTokensUtil;
import org.grobid.core.utilities.KeyGen;

import java.util.ArrayList;
import java.util.List;
Expand All @@ -12,6 +13,9 @@
*/
public class Funding {
private Funder funder = null;

// this is an identifier for identifying and referencing the funding inside the full document
private String identifier = null;

// program or call
private String programFullName = null;
Expand Down Expand Up @@ -227,4 +231,50 @@ public String toJson() {
json.append("\n}");
return json.toString();
}

/**
 * Serializes this funding as a TEI {@code <org>} fragment.
 * <p>
 * The element type is {@code funded-project} when a full or abbreviated project
 * name is present, and {@code funding} otherwise. Grant number, program names,
 * project names, URL, start date and end date each contribute a child only when
 * the corresponding field is non-null.
 * <p>
 * Side effect: if this funding has no identifier yet, one is generated from
 * {@code KeyGen.getKey()} and stored, so later calls reuse the same
 * {@code xml:id}.
 *
 * @return the TEI fragment, terminated by a newline
 */
public String toTEI() {
    StringBuilder tei = new StringBuilder();

    // a funding that names a project is typed as a funded project
    String localType = "funding";
    if (projectFullName != null || projectAbbreviatedName != null) {
        localType = "funded-project";
    }

    // assign the identifier once and keep it, so the xml:id is stable across calls;
    // the underscore prefix presumably keeps the id from starting with a digit - confirm
    if (this.identifier == null) {
        String localId = KeyGen.getKey().substring(0, 7);
        this.identifier = "_" + localId;
    }

    tei.append("<org type=\"").append(localType)
       .append("\" xml:id=\"").append(this.identifier).append("\">\n");

    if (grantNumber != null) {
        tei.append("\t<idno type=\"grant-number\">")
           .append(TextUtilities.HTMLEncode(grantNumber))
           .append("</idno>\n");
    }
    if (programFullName != null) {
        tei.append("\t<orgName type=\"program\" subtype=\"full\">")
           .append(TextUtilities.HTMLEncode(programFullName))
           .append("</orgName>\n");
    }
    if (programAbbreviatedName != null) {
        tei.append("\t<orgName type=\"program\" subtype=\"abbreviated\">")
           .append(TextUtilities.HTMLEncode(programAbbreviatedName))
           .append("</orgName>\n");
    }
    if (projectFullName != null) {
        tei.append("\t<orgName type=\"project\" subtype=\"full\">")
           .append(TextUtilities.HTMLEncode(projectFullName))
           .append("</orgName>\n");
    }
    if (projectAbbreviatedName != null) {
        tei.append("\t<orgName type=\"project\" subtype=\"abbreviated\">")
           .append(TextUtilities.HTMLEncode(projectAbbreviatedName))
           .append("</orgName>\n");
    }
    if (url != null) {
        // indented with a tab like every other child of <org>
        tei.append("\t<ptr target=\"").append(TextUtilities.HTMLEncode(url)).append("\" />\n");
    }
    if (start != null) {
        // tag the date element produced by the date's own serializer as the start date
        tei.append(start.toTEI().replace("<date ", "<date type=\"start\" "));
    }
    if (end != null) {
        // same tagging for the end date
        tei.append(end.toTEI().replace("<date ", "<date type=\"end\" "));
    }
    tei.append("</org>\n");

    return tei.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
import nu.xom.Text;

import org.grobid.core.GrobidModels;
import org.grobid.core.data.*;
import org.grobid.core.data.Date;
import org.grobid.core.data.*;
import org.grobid.core.document.xml.XmlBuilderUtils;
import org.grobid.core.engines.Engine;
import org.grobid.core.engines.FullTextParser;
Expand Down Expand Up @@ -99,15 +99,17 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
String defaultPublicationStatement,
List<BibDataSet> bds,
List<MarkerType> markerTypes,
List<Funding> fundings,
GrobidAnalysisConfig config) {
return toTEIHeader(biblio, SchemaDeclaration.XSD, defaultPublicationStatement, bds, markerTypes, config);
return toTEIHeader(biblio, SchemaDeclaration.XSD, defaultPublicationStatement, bds, markerTypes, fundings, config);
}

public StringBuilder toTEIHeader(BiblioItem biblio,
SchemaDeclaration schemaDeclaration,
String defaultPublicationStatement,
List<BibDataSet> bds,
List<MarkerType> markerTypes,
List<Funding> fundings,
GrobidAnalysisConfig config) {
StringBuilder tei = new StringBuilder();
tei.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
Expand Down Expand Up @@ -157,7 +159,34 @@ public StringBuilder toTEIHeader(BiblioItem biblio,
tei.append(TextUtilities.HTMLEncode(biblio.getTitle()));
}

tei.append("</title>\n\t\t\t</titleStmt>\n");
tei.append("</title>\n");

if (fundings != null && fundings.size()>0) {

Map<Funder,List<Funding>> fundingRelation = new HashMap<>();
for(Funding funding : fundings) {
if (funding.getFunder() == null) {
List<Funding> localfundings = fundingRelation.get(Funder.EMPTY);
if (localfundings == null)
localfundings = new ArrayList<>();
localfundings.add(funding);
fundingRelation.put(Funder.EMPTY, localfundings);
} else {
List<Funding> localfundings = fundingRelation.get(funding.getFunder());
if (localfundings == null)
localfundings = new ArrayList<>();
localfundings.add(funding);
fundingRelation.put(funding.getFunder(), localfundings);
}
}

for (Map.Entry<Funder, List<Funding>> entry : fundingRelation.entrySet()) {
tei.append(entry.getKey().toTEI());
}
}

tei.append("\t\t\t</titleStmt>\n");

if ((biblio.getPublisher() != null) ||
(biblio.getPublicationDate() != null) ||
(biblio.getNormalizedPublicationDate() != null)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,14 @@
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.MutableTriple;
import org.apache.commons.io.FileUtils;

import java.nio.charset.StandardCharsets;

import org.grobid.core.GrobidModels;
import org.grobid.core.data.BibDataSet;
import org.grobid.core.data.BiblioItem;
import org.grobid.core.data.Figure;
import org.grobid.core.data.Table;
import org.grobid.core.data.Equation;
import org.grobid.core.data.Funding;
import org.grobid.core.data.Funder;
import org.grobid.core.data.*;
import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentPiece;
import org.grobid.core.document.DocumentPointer;
Expand Down Expand Up @@ -68,6 +64,8 @@
import java.util.TreeSet;
import java.util.regex.Matcher;

import nu.xom.Element;

import static org.apache.commons.lang3.StringUtils.*;

public class FullTextParser extends AbstractParser {
Expand Down Expand Up @@ -2461,16 +2459,11 @@ private void toTEI(Document doc,
}
List<BibDataSet> resCitations = doc.getBibDataSets();
TEIFormatter teiFormatter = new TEIFormatter(doc, this);
StringBuilder tei;
StringBuilder tei = new StringBuilder();
try {
tei = teiFormatter.toTEIHeader(resHeader, null, resCitations, markerTypes, config);

//System.out.println(rese);
//int mode = config.getFulltextProcessingMode();
tei = teiFormatter.toTEIBody(tei, reseBody, resHeader, resCitations,
layoutTokenization, figures, tables, equations, markerTypes, doc, config);
List<Funding> fundings = new ArrayList<>();

tei.append("\t\t<back>\n");
List<String> annexStatements = new ArrayList<>();

// funding in header
StringBuilder fundingStmt = new StringBuilder();
Expand All @@ -2492,7 +2485,25 @@ private void toTEI(Document doc,
config);
}
if (fundingStmt.length() > 0) {
tei.append(fundingStmt.toString());
MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affiliation>>> localResult =
parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config);

if (localResult != null && localResult.getLeft() != null) {
String local_tei = localResult.getLeft().toXML();
local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", "");
//tei.append(local_tei);
annexStatements.add(local_tei);
} else {
//tei.append(fundingStmt);
annexStatements.add(fundingStmt.toString());
}

if (localResult != null && localResult.getRight() != null && localResult.getRight().getLeft() != null) {
List<Funding> localFundings = localResult.getRight().getLeft();
if (localFundings.size()>0) {
fundings.addAll(localFundings);
}
}
}
}

Expand All @@ -2505,18 +2516,63 @@ private void toTEI(Document doc,
resCitations,
config);
if (fundingStmt.length() > 0) {
tei.append(fundingStmt);

parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config);
MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affiliation>>> localResult =
parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config);

if (localResult != null && localResult.getLeft() != null){
String local_tei = localResult.getLeft().toXML();
local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", "");
//tei.append(local_tei);
annexStatements.add(local_tei);
} else {
//tei.append(fundingStmt);
annexStatements.add(fundingStmt.toString());
}

if (localResult != null && localResult.getRight() != null && localResult.getRight().getLeft() != null) {
List<Funding> localFundings = localResult.getRight().getLeft();
if (localFundings.size()>0) {
fundings.addAll(localFundings);
}
}
}

// acknowledgement is in the back
StringBuilder acknowledgmentStmt = getSectionAsTEI("acknowledgement", "\t\t\t", doc, SegmentationLabels.ACKNOWLEDGEMENT,
teiFormatter, resCitations, config);

if (acknowledgmentStmt.length() > 0) {
tei.append(acknowledgmentStmt);
MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affiliation>>> localResult =
parsers.getFundingAcknowledgementParser().processingXmlFragment(acknowledgmentStmt.toString(), config);

if (localResult != null && localResult.getLeft() != null) {
String local_tei = localResult.getLeft().toXML();
local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", "");
//tei.append(local_tei);
annexStatements.add(local_tei);
}
else {
//tei.append(acknowledgmentStmt);
annexStatements.add(acknowledgmentStmt.toString());
}

if (localResult != null && localResult.getRight() != null && localResult.getRight().getLeft() != null) {
List<Funding> localFundings = localResult.getRight().getLeft();
if (localFundings.size()>0) {
fundings.addAll(localFundings);
}
}
}

tei.append(teiFormatter.toTEIHeader(resHeader, null, resCitations, markerTypes, fundings, config));

tei = teiFormatter.toTEIBody(tei, reseBody, resHeader, resCitations,
layoutTokenization, figures, tables, equations, markerTypes, doc, config);

tei.append("\t\t<back>\n");

for (String annexStatement : annexStatements) {
tei.append(annexStatement);
}

// availability statements in header
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ public MutablePair<Element, MutableTriple<List<Funding>,List<Person>,List<Affili
globalResult = MutablePair.of(root, localResult.getRight());
} else {
// concatenate members of the local results to the global ones

}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ public String processingHeaderSection(GrobidAnalysisConfig config, Document doc,
// we don't need to serialize if we process the full text (it would be done 2 times)
if (serialize) {
TEIFormatter teiFormatter = new TEIFormatter(doc, null);
StringBuilder tei = teiFormatter.toTEIHeader(resHeader, null, null, null, config);
StringBuilder tei = teiFormatter.toTEIHeader(resHeader, null, null, null, null, config);
tei.append("\t</text>\n");
tei.append("</TEI>\n");
return tei.toString();
Expand Down

0 comments on commit e799571

Please sign in to comment.