Skip to content

Commit

Permalink
Merge branch 'master' into release-0.8.1
Browse files Browse the repository at this point in the history
  • Loading branch information
lfoppiano committed Jun 17, 2024
2 parents c408076 + d714650 commit 50860c5
Show file tree
Hide file tree
Showing 4 changed files with 359 additions and 15 deletions.
4 changes: 4 additions & 0 deletions Dockerfile.delft
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ RUN rm -rf grobid-home/lib/lin-32
RUN rm -rf grobid-home/lib/win-*
RUN rm -rf grobid-home/lib/mac-64

# Setting DL-powered configuration
RUN rm grobid-home/config/grobid.yaml && \
mv grobid-home/config/grobid-full.yaml grobid-home/config/grobid.yaml

RUN ./gradlew clean assemble --no-daemon --info --stacktrace

WORKDIR /opt/grobid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1274,6 +1274,11 @@ private StringBuilder toTEINote(StringBuilder tei,

org.apache.commons.lang3.tuple.Pair<String, List<LayoutToken>> noteProcess =
fullTextParser.processShort(noteTokens, doc);

if (noteProcess == null) {
continue;
}

String labeledNote = noteProcess.getLeft();
List<LayoutToken> noteLayoutTokens = noteProcess.getRight();

Expand Down Expand Up @@ -1512,7 +1517,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
int clusterPage = Iterables.getLast(clusterTokens).getPage();

List<Note> notesSamePage = null;
List<Triple<String,String, OffsetPosition>> matchedLabelPosition = new ArrayList<>();
List<Triple<String, String, OffsetPosition>> matchedLabelPositions = new ArrayList<>();

// map the matched note labels to their corresponding note objects
Map<String, Note> labels2Notes = new TreeMap<>();
Expand All @@ -1530,20 +1535,23 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
// map a note label (string) to a valid matching position in the sequence of Layout Tokens
// of the paragraph segment

int start = 0;
for (Note note : notesSamePage) {
Optional<LayoutToken> matching = clusterTokens
List<LayoutToken> clusterReduced = clusterTokens.subList(start, clusterTokens.size());
Optional<LayoutToken> matching = clusterReduced
.stream()
.filter(t -> t.getText().equals(note.getLabel()) && t.isSuperscript())
.findFirst();

if (matching.isPresent()) {
int idx = clusterTokens.indexOf(matching.get());
int idx = clusterReduced.indexOf(matching.get()) + start;
note.setIgnored(true);
OffsetPosition matchingPosition = new OffsetPosition();
matchingPosition.start = idx;
matchingPosition.end = idx+1; // to be reviewed: the label might span more than one layout token
matchedLabelPosition.add(Triple.of(note.getLabel(), "note", matchingPosition));
labels2Notes.put(note.getLabel(), note);
start = matchingPosition.end;
matchedLabelPositions.add(Triple.of(note.getIdentifier(), "note", matchingPosition));
labels2Notes.put(note.getIdentifier(), note);
}
}

Expand All @@ -1555,7 +1563,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
.forEach(opu -> {
// We correct the last token here, since later we will do a substring in the shared code,
// and we cannot add a +1 there.
matchedLabelPosition.add(
matchedLabelPositions.add(
Triple.of(LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens.subList(opu.start, opu.end)),
"url",
new OffsetPosition(opu.start, opu.end + 1)
Expand All @@ -1567,7 +1575,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
// We can add more elements to be extracted from the paragraphs here. Each labelPosition is a
// Triple with three main elements: the text of the item, the type, and the offsetPositions.

if (CollectionUtils.isEmpty(matchedLabelPosition)){
if (CollectionUtils.isEmpty(matchedLabelPositions)){
String clusterContent = LayoutTokensUtil.normalizeDehyphenizeText(clusterTokens);
if (isNewParagraph(lastClusterLabel, curParagraph)) {
if (curParagraph != null && config.isWithSentenceSegmentation()) {
Expand Down Expand Up @@ -1617,7 +1625,7 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
}

// sort the matches by position
Collections.sort(matchedLabelPosition, (m1, m2) -> {
Collections.sort(matchedLabelPositions, (m1, m2) -> {
return m1.getRight().start - m2.getRight().start;
}
);
Expand All @@ -1626,11 +1634,11 @@ public StringBuilder toTEITextPiece(StringBuilder buffer,
int pos = 0;

// build the paragraph segment, match by match
for (Triple<String, String, OffsetPosition> referenceInformation : matchedLabelPosition) {
for (Triple<String, String, OffsetPosition> referenceInformation : matchedLabelPositions) {
String type = referenceInformation.getMiddle();
OffsetPosition matchingPosition = referenceInformation.getRight();

if (pos >= matchingPosition.start)
if (pos > matchingPosition.start)
break;

List<LayoutToken> before = clusterTokens.subList(pos, matchingPosition.start);
Expand Down Expand Up @@ -1945,7 +1953,8 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
currentSentenceTokens = new ArrayList<>();
break;
}
sentenceChunk = text.substring(theSentences.get(currentSentenceIndex).start, theSentences.get(currentSentenceIndex).end);
int endPosition = Math.min(theSentences.get(currentSentenceIndex).end, text.length());
sentenceChunk = text.substring(theSentences.get(currentSentenceIndex).start, endPosition);
}
currentSentenceTokens = new ArrayList<>();
currentSentenceTokens.add(token);
Expand Down Expand Up @@ -2025,12 +2034,13 @@ public void segmentIntoSentences(Element curParagraph, List<LayoutToken> curPara
}
}

if (pos+posInSentence <= theSentences.get(i).end) {
String local_text_chunk = text.substring(pos+posInSentence, theSentences.get(i).end);
int endPosition = Math.min(theSentences.get(i).end, text.length());
if (pos+posInSentence <= endPosition) {
String local_text_chunk = text.substring(pos+posInSentence, endPosition);
local_text_chunk = XmlBuilderUtils.stripNonValidXMLCharacters(local_text_chunk);
sentenceElement.appendChild(local_text_chunk);
curParagraph.appendChild(sentenceElement);
}
curParagraph.appendChild(sentenceElement);
}

for(int i=curParagraph.getChildCount()-1; i>=0; i--) {
Expand Down
Loading

0 comments on commit 50860c5

Please sign in to comment.