Skip to content

Commit

Permalink
add lightweight model training generation
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Aug 2, 2024
1 parent 2ece31e commit f30d6f6
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ public void createTrainingBlank(File inputFile, String pathRaw, String pathTEI,
*/
/**
 * Generates training data for the full processing cascade from a single PDF,
 * writing the raw feature file under {@code pathRaw} and the pre-annotated
 * TEI under {@code pathTEI}.
 *
 * @param inputFile the input PDF file
 * @param pathRaw   output directory for the raw (feature) training files
 * @param pathTEI   output directory for the generated TEI training files
 * @param id        identifier used to name the generated training files
 */
public void createTraining(File inputFile, String pathRaw, String pathTEI, int id) {
    System.out.println(inputFile.getPath());
    // NOTE(review): the flavour is hardcoded to ARTICLE_LIGHT here, which silently
    // changes this generic entry point for every existing caller — confirm this is
    // intended and not a leftover from testing; passing null keeps the default models.
    // The returned Document was previously assigned to an unused local; dropped.
    parsers.getFullTextParser().createTraining(inputFile, pathRaw, pathTEI, id, GrobidModels.ModelFlavour.ARTICLE_LIGHT);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1128,6 +1128,14 @@ public Document createTraining(File inputFile,
String pathFullText,
String pathTEI,
int id) {
return createTraining(inputFile, pathFullText, pathTEI, id, null);
}

public Document createTraining(File inputFile,
String pathFullText,
String pathTEI,
int id,
GrobidModels.ModelFlavour flavour) {
if (tmpPath == null)
throw new GrobidResourceException("Cannot process pdf file, because temp path is null.");
if (!tmpPath.exists()) {
Expand All @@ -1153,7 +1161,7 @@ public Document createTraining(File inputFile,
doc.produceStatistics();

String fulltext = //getAllTextFeatured(doc, false);
parsers.getSegmentationParser().getAllLinesFeatured(doc);
parsers.getSegmentationParser(flavour).getAllLinesFeatured(doc);
//List<LayoutToken> tokenizations = doc.getTokenizationsFulltext();
List<LayoutToken> tokenizations = doc.getTokenizations();

Expand All @@ -1174,8 +1182,8 @@ public Document createTraining(File inputFile,
FileUtils.writeStringToFile(new File(outPathRawtext), rawtxt.toString(), StandardCharsets.UTF_8);

if (isNotBlank(fulltext)) {
String rese = parsers.getSegmentationParser().label(fulltext);
StringBuffer bufferFulltext = parsers.getSegmentationParser().trainingExtraction(rese, tokenizations, doc);
String rese = parsers.getSegmentationParser(flavour).label(fulltext);
StringBuffer bufferFulltext = parsers.getSegmentationParser(flavour).trainingExtraction(rese, tokenizations, doc);

// write the TEI file to reflect the exact layout of the text as extracted from the pdf
writer = new OutputStreamWriter(new FileOutputStream(new File(pathTEI +
Expand All @@ -1189,7 +1197,7 @@ public Document createTraining(File inputFile,
writer.close();
}

doc = parsers.getSegmentationParser().processing(documentSource,
doc = parsers.getSegmentationParser(flavour).processing(documentSource,
GrobidAnalysisConfig.defaultInstance());

// REFERENCE SEGMENTER MODEL
Expand Down Expand Up @@ -1391,7 +1399,7 @@ public Document createTraining(File inputFile,
headerTokenizations.add(tokenizationsFull.get(i));
}
}
Pair<String, List<LayoutToken>> featuredHeader = parsers.getHeaderParser().getSectionHeaderFeatured(doc, documentHeaderParts);
Pair<String, List<LayoutToken>> featuredHeader = parsers.getHeaderParser(flavour).getSectionHeaderFeatured(doc, documentHeaderParts);
String header = featuredHeader.getLeft();

if ((header != null) && (header.trim().length() > 0)) {
Expand All @@ -1401,12 +1409,12 @@ public Document createTraining(File inputFile,
writer.write(header + "\n");
writer.close();

String rese = parsers.getHeaderParser().label(header);
String rese = parsers.getHeaderParser(flavour).label(header);
BiblioItem resHeader = new BiblioItem();
resHeader = parsers.getHeaderParser().resultExtraction(rese, headerTokenizations, resHeader);
resHeader = parsers.getHeaderParser(flavour).resultExtraction(rese, headerTokenizations, resHeader);

// buffer for the header block
StringBuilder bufferHeader = parsers.getHeaderParser().trainingExtraction(rese, headerTokenizations);
StringBuilder bufferHeader = parsers.getHeaderParser(flavour).trainingExtraction(rese, headerTokenizations);
Language lang = LanguageUtilities.getInstance().runLanguageId(bufferHeader.toString());
if (lang != null) {
doc.setLanguage(lang.getLang());
Expand Down Expand Up @@ -2734,8 +2742,19 @@ private void toTEI(Document doc,

tei.append(teiFormatter.toTEIHeader(resHeader, null, resCitations, markerTypes, fundings, config));

tei = teiFormatter.toTEIBody(tei, reseBody, resHeader, resCitations,
layoutTokenization, figures, tables, equations, markerTypes, doc, config);
tei = teiFormatter.toTEIBody(
tei,
reseBody,
resHeader,
resCitations,
layoutTokenization,
figures,
tables,
equations,
markerTypes,
doc,
config
);

tei.append("\t\t<back>\n");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,7 @@ public void processPDFAnnotation(final GrobidMainArgs pGbdArgs) throws Exception
* @return List<String> containing the list of the methods.
*/
public final static List<String> getUsableMethods() {
final Class<?> pClass = new ProcessEngine().getClass();
final Class<?> pClass = ProcessEngine.class;
final List<String> availableMethods = new ArrayList<String>();
for (final Method method : pClass.getMethods()) {
if (isUsableMethod(method.getName())) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ public class GrobidMainArgs {

private boolean addElementId = false;

private String modelFlavour = null;

/**
* @return the path2grobidHome
*/
Expand Down Expand Up @@ -241,4 +243,11 @@ public final void setSegmentSentences(final boolean pSegmentSentences) {
segmentSentences = pSegmentSentences;
}

/**
 * @return the model flavour selected on the command line, or {@code null}
 *         when the standard (default) models should be used
 */
public String getModelFlavour() {
    return this.modelFlavour;
}

/**
 * @param pModelFlavour the model flavour to use, or {@code null} for the
 *                      standard (default) models
 */
public void setModelFlavour(final String pModelFlavour) {
    modelFlavour = pModelFlavour;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,13 @@ public static File writeInputFile(InputStream inputStream) {
originFile = null;
} finally {
try {
if (inputStream != null) {
inputStream.close();
}

if (out != null) {
out.close();
}
inputStream.close();
} catch (IOException e) {
LOGGER.error("An internal error occurs, while writing to disk (file to write '"
+ originFile + "').", e);
Expand Down
3 changes: 3 additions & 0 deletions grobid-home/config/grobid.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,9 @@ grobid:
epsilon: 0.000001
window: 30
nbMaxIterations: 1500
delft:
architecture: "BidLSTM_ChainCRF_FEATURES"
useELMo: false

- name: "reference-segmenter"
engine: "wapiti"
Expand Down

0 comments on commit f30d6f6

Please sign in to comment.