Skip to content

Commit

Permalink
#4517 - Multi-value features are not included in BioC output
Browse files Browse the repository at this point in the history
- Added support for loading and saving multi-value string features
- Added test
  • Loading branch information
reckart committed Feb 16, 2024
1 parent d8ae85d commit b0fdc48
Show file tree
Hide file tree
Showing 17 changed files with 152 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,12 @@
import static de.tudarmstadt.ukp.inception.project.initializers.basic.BasicRelationLayerInitializer.BASIC_RELATION_LAYER_NAME;
import static de.tudarmstadt.ukp.inception.project.initializers.basic.BasicSpanLayerInitializer.BASIC_SPAN_LABEL_FEATURE_NAME;
import static de.tudarmstadt.ukp.inception.project.initializers.basic.BasicSpanLayerInitializer.BASIC_SPAN_LAYER_NAME;
import static org.apache.commons.collections.CollectionUtils.isNotEmpty;
import static org.apache.uima.cas.CAS.FEATURE_FULL_NAME_BEGIN;
import static org.apache.uima.cas.CAS.FEATURE_FULL_NAME_END;
import static org.apache.uima.cas.CAS.FEATURE_FULL_NAME_SOFA;

import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
Expand All @@ -34,6 +36,7 @@
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.util.FSUtil;
import org.apache.uima.jcas.JCas;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField;
Expand Down Expand Up @@ -92,7 +95,7 @@ static Optional<MetaDataStringField> getCollectionMetadataField(CAS aCas, String
.findFirst();
}

static Type guessBestRelationType(TypeSystem aTypeSystem, Map<String, String> aInfons)
static Type guessBestRelationType(TypeSystem aTypeSystem, Map<String, List<String>> aInfons)
{
var type = guessBestType(aTypeSystem, aInfons);
if (type != null && isRelationLayer(type)) {
Expand All @@ -107,7 +110,7 @@ static Type guessBestRelationType(TypeSystem aTypeSystem, Map<String, String> aI
return null;
}

static Type guessBestSpanType(TypeSystem aTypeSystem, Map<String, String> aInfons)
static Type guessBestSpanType(TypeSystem aTypeSystem, Map<String, List<String>> aInfons)
{
var type = guessBestType(aTypeSystem, aInfons);
if (type != null && isSpanLayer(type)) {
Expand All @@ -122,16 +125,16 @@ static Type guessBestSpanType(TypeSystem aTypeSystem, Map<String, String> aInfon
return null;
}

private static Type guessBestType(TypeSystem aTypeSystem, Map<String, String> aInfons)
private static Type guessBestType(TypeSystem aTypeSystem, Map<String, List<String>> aInfons)
{
var typeInfon = aInfons.get(I_TYPE);
if (typeInfon != null) {
var type = aTypeSystem.getType(typeInfon);
if (isNotEmpty(typeInfon)) {
var type = aTypeSystem.getType(typeInfon.get(0));
if (type != null) {
return type;
}

type = findTypeByShortName(aTypeSystem, typeInfon);
type = findTypeByShortName(aTypeSystem, typeInfon.get(0));
if (type != null) {
return type;
}
Expand All @@ -151,7 +154,7 @@ private static Type findTypeByShortName(TypeSystem aTypeSystem, String aBaseName
return null;
}

public static void transferFeatures(AnnotationFS aAnnotation, Map<String, String> aInfons)
public static void transferFeatures(AnnotationFS aAnnotation, Map<String, List<String>> aInfons)
{
var anyFeatureSet = false;
for (var infon : aInfons.entrySet()) {
Expand All @@ -164,8 +167,14 @@ public static void transferFeatures(AnnotationFS aAnnotation, Map<String, String
continue;
}

if (feature.getRange().isPrimitive()) {
aAnnotation.setFeatureValueFromString(feature, infon.getValue());
if (feature.getRange().isPrimitive() && isNotEmpty(infon.getValue())) {
aAnnotation.setFeatureValueFromString(feature, infon.getValue().get(0));
anyFeatureSet = true;
}

if (CAS.TYPE_NAME_STRING_ARRAY.equals(feature.getRange().getName())
&& isNotEmpty(infon.getValue())) {
FSUtil.setFeature(aAnnotation, feature, infon.getValue());
anyFeatureSet = true;
}
}
Expand All @@ -174,7 +183,10 @@ public static void transferFeatures(AnnotationFS aAnnotation, Map<String, String
var valueFeature = aAnnotation.getType()
.getFeatureByBaseName(BASIC_SPAN_LABEL_FEATURE_NAME);
if (!anyFeatureSet && aInfons.size() == 1 && valueFeature != null) {
aAnnotation.setFeatureValueFromString(valueFeature, aInfons.values().iterator().next());
var values = aInfons.values().iterator().next();
if (isNotEmpty(values)) {
aAnnotation.setFeatureValueFromString(valueFeature, values.get(0));
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@
* This class is exposed as a Spring Component via
* {@link BioCAutoConfiguration#bioCXmlDocumentFormatSupport}.
* </p>
*
* @deprecated Experimental code that was deprecated in favor of {@link BioCFormatSupport}
*/
@Deprecated
public class BioCXmlDocumentFormatSupport
implements FormatSupport
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@
import de.tudarmstadt.ukp.inception.io.xml.dkprocore.CasXmlHandler.ElementListener;
import de.tudarmstadt.ukp.inception.support.xml.XmlParserUtils;

/**
* @deprecated Experimental code that was deprecated in favor of {@link BioCReader}
*/
@Deprecated
public class BioCXmlDocumentReader
extends BioCReaderImplBase
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@
import de.tudarmstadt.ukp.inception.io.bioc.xml.Cas2BioCSaxEvents;
import de.tudarmstadt.ukp.inception.support.xml.XmlParserUtils;

/**
* @deprecated Experimental code that was deprecated in favor of {@link BioCWriter}
*/
@Deprecated
public class BioCXmlDocumentWriter
extends JCasFileWriter_ImplBase
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,8 @@
*/
package de.tudarmstadt.ukp.inception.io.bioc.model;

import static java.util.stream.Collectors.toMap;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
Expand Down Expand Up @@ -63,8 +62,15 @@ public Optional<String> infon(String aKey)
.map(BioCInfon::getValue);
}

public Map<String, String> infonMap()
public Map<String, List<String>> infonMap()
{
return infons.stream().collect(toMap(BioCInfon::getKey, BioCInfon::getValue));
var map = new LinkedHashMap<String, List<String>>();

for (var infon : infons) {
var list = map.computeIfAbsent(infon.getKey(), $ -> new ArrayList<>());
list.add(infon.getValue());
}

return map;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.fit.util.FSUtil;
import org.apache.uima.jcas.JCas;
Expand Down Expand Up @@ -199,6 +200,17 @@ private void serializeFeatures(Annotation aAnnotation, BioCObject aBioCAnnotatio
aBioCAnnotation.addInfon(feature.getShortName(), value);
}
}

if (CAS.TYPE_NAME_STRING_ARRAY.equals(feature.getRange().getName())) {
var values = FSUtil.getFeature(aAnnotation, feature, String[].class);
if (values != null) {
for (var value : values) {
if (value != null) {
aBioCAnnotation.addInfon(feature.getShortName(), value);
}
}
}
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@
import static java.util.stream.Collectors.toList;

import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.uima.cas.Type;
Expand All @@ -58,8 +60,13 @@

import de.tudarmstadt.ukp.clarin.webanno.tsv.internal.tsv3x.Tsv3XCasSchemaAnalyzer;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.inception.io.bioc.model.BioCToCas;
import de.tudarmstadt.ukp.inception.io.xml.dkprocore.XmlNodeUtils;

/**
* @deprecated Experimental code that was deprecated in favor of {@link BioCToCas}
*/
@Deprecated
public class BioC2XmlCas
{
private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
Expand All @@ -82,7 +89,7 @@ private void transferRelationAnnotations(JCas aJCas, Map<String, AnnotationFS> a
var infons = extractInfons(relationElement);
var nodes = extractNodes(relationElement);

Type uimaType = guessBestRelationType(aJCas.getTypeSystem(), infons);
var uimaType = guessBestRelationType(aJCas.getTypeSystem(), infons);
if (uimaType == null || !isRelationLayer(uimaType)) {
LOG.debug("Unable to find suitable UIMA type for relation annotation");
continue;
Expand Down Expand Up @@ -151,7 +158,7 @@ private Map<String, AnnotationFS> transferSpanAnnotations(JCas aJCas)
return id2Span;
}

private Map<String, String> extractInfons(XmlElement aElement)
private Map<String, List<String>> extractInfons(XmlElement aElement)
{
var children = aElement.getChildren();

Expand All @@ -161,13 +168,14 @@ private Map<String, String> extractInfons(XmlElement aElement)

var infonChildren = children.select(XmlElement.class) //
.filter(e -> E_INFON.equals(e.getQName())) //
.collect(toList());
.toList();

var infons = new LinkedHashMap<String, String>();
var infons = new LinkedHashMap<String, List<String>>();
for (var infonChild : infonChildren) {
var key = getMandatoryAttributeValue(infonChild, A_KEY);
var value = XmlNodeUtils.textContent(infonChild);
infons.put(key, value);
var list = infons.computeIfAbsent(key, $ -> new ArrayList<>());
list.add(value);
}

return infons;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@

import org.dkpro.core.api.xml.type.XmlElement;

/**
* @deprecated Experimental code that was deprecated.
*/
@Deprecated
public class BioCXmlUtils
{
public static Optional<XmlElement> getChildTextElement(XmlElement aContainer)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,13 @@
import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.MetaDataStringField;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.inception.io.bioc.model.CasToBioC;
import de.tudarmstadt.ukp.inception.io.xml.dkprocore.Cas2SaxEvents;

/**
* @deprecated Experimental code that was deprecated in favor of {@link CasToBioC}
*/
@Deprecated
public class Cas2BioCSaxEvents
extends Cas2SaxEvents
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
import javax.xml.stream.events.XMLEvent;
import javax.xml.stream.util.EventReaderDelegate;

/**
* @deprecated Experimental code that was deprecated.
*/
@Deprecated
public class DocumentWrappingXmlInputReader
extends EventReaderDelegate
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
import org.xml.sax.Locator;
import org.xml.sax.SAXException;

/**
* @deprecated Experimental code that was deprecated.
*/
@Deprecated
public class SplittingContentHandler
implements ContentHandler
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ NOTE: This format dynamically maps information from the imported files to the la
* If a document has not been imported from a BioC file containing passages and does not contain
`Div` annotations from any other source either, then on export a single passage containing the
entire document is created.
* Multi-value string features are supported. On export, they are serialized as a sequence of infons
that share the same key, one infon per value. On import, infons sharing a key are collected back
into the multi-value feature. If several infons share a key but the target feature is not
multi-valued, then only the first infon is considered and the others are ignored.

.Unsupported features
* Cross-passage relations are not supported.
* Sentence-level infons are not supported.
* Passage-level infons are not supported.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,7 @@
import java.io.File;

import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
Expand All @@ -63,21 +61,21 @@ public static Iterable<File> testSuiteFiles()
@MethodSource("testSuiteFiles")
public void runTest(File aReferenceFolder) throws Exception
{
TypeSystemDescription merged = createTestTypeSystem(aReferenceFolder);
var merged = createTestTypeSystem(aReferenceFolder);

String targetFolder = "target/test-output/bioc-suite/" + aReferenceFolder.getName();
var targetFolder = "target/test-output/bioc-suite/" + aReferenceFolder.getName();

CollectionReaderDescription reader = createReaderDescription( //
var reader = createReaderDescription( //
BioCReader.class, merged, BioCReader.PARAM_SOURCE_LOCATION, aReferenceFolder, //
BioCReader.PARAM_PATTERNS, DATA_XML);

AnalysisEngineDescription writer = createEngineDescription( //
var writer = createEngineDescription( //
BioCWriter.class, merged, //
BioCWriter.PARAM_TARGET_LOCATION, targetFolder, //
BioCWriter.PARAM_STRIP_EXTENSION, true, //
BioCWriter.PARAM_OVERWRITE, true);

AnalysisEngineDescription xmiWriter = createEngineDescription( //
var xmiWriter = createEngineDescription( //
XmiWriter.class, merged, //
XmiWriter.PARAM_TARGET_LOCATION, targetFolder, //
XmiWriter.PARAM_STRIP_EXTENSION, true, //
Expand All @@ -95,7 +93,7 @@ public void runTest(File aReferenceFolder) throws Exception
private TypeSystemDescription createTestTypeSystem(File aReferenceFolder)
throws ResourceInitializationException
{
TypeSystemDescription global = createTypeSystemDescription();
var global = createTypeSystemDescription();

TypeSystemDescription local;
if (new File(aReferenceFolder, TYPESYSTEM_XML).exists()) {
Expand All @@ -114,6 +112,7 @@ private TypeSystemDescription createTestTypeSystem() throws ResourceInitializati
var tsd = UIMAFramework.getResourceSpecifierFactory().createTypeSystemDescription();
var basicSpanType = tsd.addType(BASIC_SPAN_LAYER_NAME, null, TYPE_NAME_ANNOTATION);
basicSpanType.addFeature(BASIC_SPAN_LABEL_FEATURE_NAME, null, TYPE_NAME_STRING);
basicSpanType.addFeature("values", null, CAS.TYPE_NAME_STRING_ARRAY);

var basicRelationType = tsd.addType(BASIC_RELATION_LAYER_NAME, null, TYPE_NAME_ANNOTATION);
basicRelationType.addFeature(FEAT_REL_SOURCE, null, CAS.TYPE_NAME_ANNOTATION);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;

/**
* @deprecated Experimental code that was deprecated.
*/
@Deprecated
public class BioCXmlDocumentReaderTest
{
@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@
import org.dkpro.core.testing.TestOptions;
import org.junit.jupiter.api.Test;

/**
* @deprecated Experimental code that was deprecated.
*/
@Deprecated
public class BioCXmlDocumentReaderWriterTest
{
@Test
Expand Down
Loading

0 comments on commit b0fdc48

Please sign in to comment.