diff --git a/grobid-core/src/main/java/org/grobid/core/data/Funder.java b/grobid-core/src/main/java/org/grobid/core/data/Funder.java index e40bf8eabd..2886723eec 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Funder.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Funder.java @@ -33,7 +33,16 @@ public class Funder { private String url = null; private List layoutTokens = new ArrayList<>(); + + static public Funder EMPTY = new Funder("unknown"); + public Funder() { + } + + public Funder(String fullName) { + this.fullName = fullName; + } + public String getFullName() { return this.fullName; } @@ -179,4 +188,22 @@ public String toJson() { json.append("\n}"); return json.toString(); } + + public String toTEI() { + StringBuilder tei = new StringBuilder(); + + tei.append("\n"); + if (fullName != null) { + tei.append("\t"+TextUtilities.HTMLEncode(fullName)+"\n"); + } + if (abbreviatedName != null) { + tei.append("\t"+TextUtilities.HTMLEncode(abbreviatedName)+"\n"); + } + if (doi != null) { + tei.append("\t"+TextUtilities.HTMLEncode(doi)+"\n"); + } + tei.append("\n"); + + return tei.toString(); + } } \ No newline at end of file diff --git a/grobid-core/src/main/java/org/grobid/core/data/Funding.java b/grobid-core/src/main/java/org/grobid/core/data/Funding.java index a351866596..dfdac314df 100644 --- a/grobid-core/src/main/java/org/grobid/core/data/Funding.java +++ b/grobid-core/src/main/java/org/grobid/core/data/Funding.java @@ -3,6 +3,7 @@ import org.grobid.core.utilities.TextUtilities; import org.grobid.core.layout.LayoutToken; import org.grobid.core.utilities.LayoutTokensUtil; +import org.grobid.core.utilities.KeyGen; import java.util.ArrayList; import java.util.List; @@ -12,6 +13,9 @@ */ public class Funding { private Funder funder = null; + + // this is an identifier for identifying and referencing the funding inside the full document + private String identifier = null; // program or call private String programFullName = null; @@ -227,4 +231,50 @@ public String toJson() { json.append("\n}"); return json.toString(); } + + public String toTEI() { + StringBuilder tei = new StringBuilder(); + + String localType = "funding"; + if (projectFullName != null || projectAbbreviatedName != null) + localType = "funded-project"; + + if (this.identifier == null) { + String localId = KeyGen.getKey().substring(0, 7); + this.identifier = "_" + localId; + } + + tei.append("\n"); + if (grantNumber != null) { + tei.append("\t"+TextUtilities.HTMLEncode(grantNumber)+"\n"); + } + if (programFullName != null) { + tei.append("\t"+TextUtilities.HTMLEncode(programFullName)+"\n"); + } + if (programAbbreviatedName != null) { + tei.append("\t"+TextUtilities.HTMLEncode(programAbbreviatedName)+"\n"); + } + if (projectFullName != null) { + tei.append("\t"+TextUtilities.HTMLEncode(projectFullName)+"\n"); + } + if (projectAbbreviatedName != null) { + tei.append("\t"+TextUtilities.HTMLEncode(projectAbbreviatedName)+"\n"); + } + if (url != null) { + tei.append("\n"); + } + if (start != null) { + String dateString = start.toTEI(); + dateString = dateString.replace("\n"); + + return tei.toString(); + } } diff --git a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java index aa1d5e91ea..ca6962d2ca 100755 --- a/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java +++ b/grobid-core/src/main/java/org/grobid/core/document/TEIFormatter.java @@ -14,8 +14,8 @@ import nu.xom.Text; import org.grobid.core.GrobidModels; -import org.grobid.core.data.*; import org.grobid.core.data.Date; +import org.grobid.core.data.*; import org.grobid.core.document.xml.XmlBuilderUtils; import org.grobid.core.engines.Engine; import org.grobid.core.engines.FullTextParser; @@ -99,8 +99,9 @@ public StringBuilder toTEIHeader(BiblioItem biblio, String defaultPublicationStatement, List bds, List markerTypes, + List fundings, GrobidAnalysisConfig config) { - return toTEIHeader(biblio, SchemaDeclaration.XSD, defaultPublicationStatement, bds, markerTypes, config); + return toTEIHeader(biblio, SchemaDeclaration.XSD, defaultPublicationStatement, bds, markerTypes, fundings, config); } public StringBuilder toTEIHeader(BiblioItem biblio, @@ -108,6 +109,7 @@ public StringBuilder toTEIHeader(BiblioItem biblio, String defaultPublicationStatement, List bds, List markerTypes, + List fundings, GrobidAnalysisConfig config) { StringBuilder tei = new StringBuilder(); tei.append("\n"); @@ -157,7 +159,34 @@ public StringBuilder toTEIHeader(BiblioItem biblio, tei.append(TextUtilities.HTMLEncode(biblio.getTitle())); } - tei.append("\n\t\t\t\n"); + tei.append("\n"); + + if (fundings != null && fundings.size()>0) { + + Map> fundingRelation = new HashMap<>(); + for(Funding funding : fundings) { + if (funding.getFunder() == null) { + List localfundings = fundingRelation.get(Funder.EMPTY); + if (localfundings == null) + localfundings = new ArrayList<>(); + localfundings.add(funding); + fundingRelation.put(Funder.EMPTY, localfundings); + } else { + List localfundings = fundingRelation.get(funding.getFunder()); + if (localfundings == null) + localfundings = new ArrayList<>(); + localfundings.add(funding); + fundingRelation.put(funding.getFunder(), localfundings); + } + } + + for (Map.Entry> entry : fundingRelation.entrySet()) { + tei.append(entry.getKey().toTEI()); + } + } + + tei.append("\t\t\t\n"); + if ((biblio.getPublisher() != null) || (biblio.getPublicationDate() != null) || (biblio.getNormalizedPublicationDate() != null)) { diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java index 4b657135b0..0f1590d50c 100755 --- a/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FullTextParser.java @@ -5,18 +5,14 @@ import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.lang3.tuple.MutablePair; +import org.apache.commons.lang3.tuple.MutableTriple; import org.apache.commons.io.FileUtils; import java.nio.charset.StandardCharsets; import org.grobid.core.GrobidModels; -import org.grobid.core.data.BibDataSet; -import org.grobid.core.data.BiblioItem; -import org.grobid.core.data.Figure; -import org.grobid.core.data.Table; -import org.grobid.core.data.Equation; -import org.grobid.core.data.Funding; -import org.grobid.core.data.Funder; +import org.grobid.core.data.*; import org.grobid.core.document.Document; import org.grobid.core.document.DocumentPiece; import org.grobid.core.document.DocumentPointer; @@ -68,6 +64,8 @@ import java.util.TreeSet; import java.util.regex.Matcher; +import nu.xom.Element; + import static org.apache.commons.lang3.StringUtils.*; public class FullTextParser extends AbstractParser { @@ -2461,16 +2459,11 @@ private void toTEI(Document doc, } List resCitations = doc.getBibDataSets(); TEIFormatter teiFormatter = new TEIFormatter(doc, this); - StringBuilder tei; + StringBuilder tei = new StringBuilder(); try { - tei = teiFormatter.toTEIHeader(resHeader, null, resCitations, markerTypes, config); - - //System.out.println(rese); - //int mode = config.getFulltextProcessingMode(); - tei = teiFormatter.toTEIBody(tei, reseBody, resHeader, resCitations, - layoutTokenization, figures, tables, equations, markerTypes, doc, config); + List fundings = new ArrayList<>(); - tei.append("\t\t\n"); + List annexStatements = new ArrayList<>(); // funding in header StringBuilder fundingStmt = new StringBuilder(); @@ -2492,7 +2485,25 @@ private void toTEI(Document doc, config); } if (fundingStmt.length() > 0) { - tei.append(fundingStmt.toString()); + MutablePair,List,List>> localResult = + parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); + + if (localResult != null && localResult.getLeft() != null) { + String local_tei = localResult.getLeft().toXML(); + local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); + //tei.append(local_tei); + annexStatements.add(local_tei); + } else { + //tei.append(fundingStmt); + annexStatements.add(fundingStmt.toString()); + } + + if (localResult != null && localResult.getRight() != null && localResult.getRight().getLeft() != null) { + List localFundings = localResult.getRight().getLeft(); + if (localFundings.size()>0) { + fundings.addAll(localFundings); + } + } } } @@ -2505,10 +2516,25 @@ private void toTEI(Document doc, resCitations, config); if (fundingStmt.length() > 0) { - tei.append(fundingStmt); - - parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); + MutablePair,List,List>> localResult = + parsers.getFundingAcknowledgementParser().processingXmlFragment(fundingStmt.toString(), config); + + if (localResult != null && localResult.getLeft() != null){ + String local_tei = localResult.getLeft().toXML(); + local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); + //tei.append(local_tei); + annexStatements.add(local_tei); + } else { + //tei.append(fundingStmt); + annexStatements.add(fundingStmt.toString()); + } + if (localResult != null && localResult.getRight() != null && localResult.getRight().getLeft() != null) { + List localFundings = localResult.getRight().getLeft(); + if (localFundings.size()>0) { + fundings.addAll(localFundings); + } + } } // acknowledgement is in the back @@ -2516,7 +2542,37 @@ private void toTEI(Document doc, teiFormatter, resCitations, config); if (acknowledgmentStmt.length() > 0) { - tei.append(acknowledgmentStmt); + MutablePair,List,List>> localResult = + parsers.getFundingAcknowledgementParser().processingXmlFragment(acknowledgmentStmt.toString(), config); + + if (localResult != null && localResult.getLeft() != null) { + String local_tei = localResult.getLeft().toXML(); + local_tei = local_tei.replace(" xmlns=\"http://www.tei-c.org/ns/1.0\"", ""); + //tei.append(local_tei); + annexStatements.add(local_tei); + } + else { + //tei.append(acknowledgmentStmt); + annexStatements.add(acknowledgmentStmt.toString()); + } + + if (localResult != null && localResult.getRight() != null && localResult.getRight().getLeft() != null) { + List localFundings = localResult.getRight().getLeft(); + if (localFundings.size()>0) { + fundings.addAll(localFundings); + } + } + } + + tei.append(teiFormatter.toTEIHeader(resHeader, null, resCitations, markerTypes, fundings, config)); + + tei = teiFormatter.toTEIBody(tei, reseBody, resHeader, resCitations, + layoutTokenization, figures, tables, equations, markerTypes, doc, config); + + tei.append("\t\t\n"); + + for (String annexStatement : annexStatements) { + tei.append(annexStatement); } // availability statements in header diff --git a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java index e7be1ff118..24cc970b77 100644 --- a/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java +++ b/grobid-core/src/main/java/org/grobid/core/engines/FundingAcknowledgementParser.java @@ -140,7 +140,7 @@ public MutablePair,List,List\n"); tei.append("\n"); return tei.toString();