From e9a9065abda45c541ab64ae04587216948260365 Mon Sep 17 00:00:00 2001 From: Stefan Armbruster Date: Sat, 21 Jan 2017 16:53:29 +0100 Subject: [PATCH] fixes #256 * deal gracefully with xml mixed content * refactoring * deprecate apoc.load.xmlSimple --- docs/index.html | 23 +-- docs/loadxml.adoc | 9 +- src/main/java/apoc/load/Xml.java | 202 ++++++++++++++------------- src/test/java/apoc/load/XmlTest.java | 117 ++++++++++++++-- src/test/resources/mixedcontent.xml | 7 + 5 files changed, 233 insertions(+), 125 deletions(-) create mode 100644 src/test/resources/mixedcontent.xml diff --git a/docs/index.html b/docs/index.html index 8c235d60b5..f09458a691 100644 --- a/docs/index.html +++ b/docs/index.html @@ -5484,14 +5484,21 @@

Load XML Introduction

Many existing (enterprise) applications, endpoints and files use XML as data exchange format.

-

To make these datastructures available to Cypher, you can use apoc.load.xml(Simple). +

To make these datastructures available to Cypher, you can use apoc.load.xml. It takes a file or http URL and parses the XML into a map datastructure.

-
-

While apoc.load.xml has a more verbose format, it keeps the ordering of elements.

-
-
-

While apoc.load.xmlSimple provides a more compact representation that makes it easier to process.

+
+ + + + + +
+
Note
+
+in previous releases we’ve had apoc.load.xmlSimple. This is now deprecated and got superseeded by +apoc.load.xml(url, true). +

See the following usage-examples for the procedures.

@@ -5553,7 +5560,7 @@

Simple XML Format

-
call apoc.load.xmlSimple("https://raw.githubusercontent.com/neo4j-contrib/neo4j-apoc-procedures/master/src/test/resources/books.xml")
+
call apoc.load.xml("https://raw.githubusercontent.com/neo4j-contrib/neo4j-apoc-procedures/master/src/test/resources/books.xml", true)
@@ -6057,7 +6064,7 @@

Further Functions

diff --git a/docs/loadxml.adoc b/docs/loadxml.adoc index 17a206085d..76a1ce14a9 100644 --- a/docs/loadxml.adoc +++ b/docs/loadxml.adoc @@ -4,12 +4,11 @@ Many existing (enterprise) applications, endpoints and files use XML as data exchange format. -To make these datastructures available to Cypher, you can use `apoc.load.xml(Simple)`. +To make these datastructures available to Cypher, you can use `apoc.load.xml`. It takes a file or http URL and parses the XML into a map datastructure. -While `apoc.load.xml` has a more verbose format, it keeps the ordering of elements. - -While `apoc.load.xmlSimple` provides a more compact representation that makes it easier to process. +NOTE: in previous releases we've had `apoc.load.xmlSimple`. This is now deprecated and got superseeded by +`apoc.load.xml(url, true)`. See the following usage-examples for the procedures. @@ -60,7 +59,7 @@ Here is the example file from above loaded with `apoc.load.xmlSimple` [source,cypher] ---- -call apoc.load.xmlSimple("https://raw.githubusercontent.com/neo4j-contrib/neo4j-apoc-procedures/master/src/test/resources/books.xml") +call apoc.load.xml("https://raw.githubusercontent.com/neo4j-contrib/neo4j-apoc-procedures/master/src/test/resources/books.xml", true) ---- [source,javascript] diff --git a/src/main/java/apoc/load/Xml.java b/src/main/java/apoc/load/Xml.java index c19154c1a4..4476c583f7 100644 --- a/src/main/java/apoc/load/Xml.java +++ b/src/main/java/apoc/load/Xml.java @@ -9,19 +9,16 @@ import org.neo4j.procedure.Procedure; import javax.xml.stream.XMLInputFactory; -import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import java.io.IOException; import java.net.URL; import java.net.URLConnection; -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Stream; import static apoc.util.Util.cleanUrl; +import static javax.xml.stream.XMLStreamConstants.*; public class Xml { @@ -30,112 +27,121 @@ public class Xml { @Context public GraphDatabaseService db; @Procedure - @Description("apoc.load.xml('http://example.com/test.xml') YIELD value as doc CREATE (p:Person) SET p.name = doc.name load from XML URL (e.g. web-api) to import XML as single nested map with attributes and _type, _text and _childrenx fields.") - public Stream xml(@Name("url") String url) { - try { - FileUtils.checkReadAllowed(url); - URLConnection urlConnection = new URL(url).openConnection(); - FACTORY.setProperty("javax.xml.stream.isCoalescing", true); - XMLStreamReader reader = FACTORY.createXMLStreamReader(urlConnection.getInputStream()); - if (reader.nextTag()==XMLStreamConstants.START_ELEMENT) { - return Stream.of(new MapResult(handleElement(reader))); - } - throw new RuntimeException("Can't read url " + cleanUrl(url) + " as XML"); - } catch (IOException | XMLStreamException e) { - throw new RuntimeException("Can't read url " + cleanUrl(url) + " as XML", e); - } + @Description("apoc.load.xml('http://example.com/test.xml', false) YIELD value as doc CREATE (p:Person) SET p.name = doc.name load from XML URL (e.g. web-api) to import XML as single nested map with attributes and _type, _text and _childrenx fields.") + public Stream xml(@Name("url") String url, @Name(value = "simple", defaultValue = "false") boolean simpleMode) { + return xmlToMapResult(url, simpleMode); } - @Procedure - @Description("apoc.load.xml('http://example.com/test.xml') YIELD value as doc CREATE (p:Person) SET p.name = doc.name load from XML URL (e.g. web-api) to import XML as single nested map with attributes and _type, _text and _childrenx fields.") + + @Procedure(deprecatedBy = "apoc.load.xml") + @Deprecated + @Description("apoc.load.xmlSimple('http://example.com/test.xml') YIELD value as doc CREATE (p:Person) SET p.name = doc.name load from XML URL (e.g. web-api) to import XML as single nested map with attributes and _type, _text and _children fields. This method does intentionally not work with XML mixed content.") public Stream xmlSimple(@Name("url") String url) { + return xmlToMapResult(url, true); + } + + private Stream xmlToMapResult(@Name("url") String url, boolean simpleMode) { try { - FileUtils.checkReadAllowed(url); - URLConnection urlConnection = new URL(url).openConnection(); - FACTORY.setProperty("javax.xml.stream.isCoalescing", true); - XMLStreamReader reader = FACTORY.createXMLStreamReader(urlConnection.getInputStream()); - if (reader.nextTag()==XMLStreamConstants.START_ELEMENT) { - return Stream.of(new MapResult(handleElementSimple(null,reader))); - } - throw new RuntimeException("Can't read url " + cleanUrl(url) + " as XML"); + XMLStreamReader reader = getXMLStreamReaderFromUrl(url); + final Deque> stack = new LinkedList<>(); + do { + handleXmlEvent(stack, reader, simpleMode); + } while (proceedReader(reader)); + return Stream.of(new MapResult(stack.getFirst())); } catch (IOException | XMLStreamException e) { throw new RuntimeException("Can't read url " + cleanUrl(url) + " as XML", e); } } - private Map handleElement(XMLStreamReader reader) throws XMLStreamException { - LinkedHashMap row = null; - String element = null; - if (reader.isStartElement()) { - int attributes = reader.getAttributeCount(); - row = new LinkedHashMap<>(attributes + 3); - element = reader.getLocalName(); - row.put("_type", element); - for (int a = 0; a < attributes; a++) { - row.put(reader.getAttributeLocalName(a), reader.getAttributeValue(a)); - } - next(reader); - if (reader.hasText()) { - row.put("_text",reader.getText().trim()); - next(reader); - } - if (reader.isStartElement()) { - List> children = new ArrayList<>(100); - do { - Map child = handleElement(reader); - if (child != null && !child.isEmpty()) { - children.add(child); - } - } while (next(reader) == XMLStreamConstants.START_ELEMENT); - if (!children.isEmpty()) row.put("_children", children); - } - if (reader.isEndElement() || reader.getEventType() == XMLStreamConstants.END_DOCUMENT) { - return row; - } + private XMLStreamReader getXMLStreamReaderFromUrl(@Name("url") String url) throws IOException, XMLStreamException { + FileUtils.checkReadAllowed(url); + URLConnection urlConnection = new URL(url).openConnection(); + FACTORY.setProperty("javax.xml.stream.isCoalescing", true); + return FACTORY.createXMLStreamReader(urlConnection.getInputStream()); + } + + + private boolean proceedReader(XMLStreamReader reader) throws XMLStreamException { + if (reader.hasNext()) { + do { + reader.next(); + } while (reader.isWhiteSpace()); + return true; + } else { + return false; } - throw new IllegalStateException("Incorrect end-element state "+reader.getEventType()+" after "+element); } - private Map handleElementSimple(Map parent, XMLStreamReader reader) throws XMLStreamException { - LinkedHashMap row = null; - String element = null; - if (reader.isStartElement()) { - int attributes = reader.getAttributeCount(); - row = new LinkedHashMap<>(attributes + 3); - element = reader.getLocalName(); - row.put("_type", element); - for (int a = 0; a < attributes; a++) { - row.put(reader.getAttributeLocalName(a), reader.getAttributeValue(a)); - } - if (parent!=null) { - Object children = parent.get("_"+element); - if (children == null) parent.put("_"+element, row); - else if (children instanceof List) ((List)children).add(row); - else { - List list = new ArrayList<>(); - list.add(children); - list.add(row); - parent.put("_"+element, list); + + private void handleXmlEvent(Deque> stack, XMLStreamReader reader, boolean simpleMode) throws XMLStreamException { + Map elementMap; + switch (reader.getEventType()) { + case START_DOCUMENT: + case END_DOCUMENT: + // intentionally empty + break; + case START_ELEMENT: + int attributes = reader.getAttributeCount(); + elementMap = new LinkedHashMap<>(attributes+3); + elementMap.put("_type", reader.getLocalName()); + for (int a = 0; a < attributes; a++) { + elementMap.put(reader.getAttributeLocalName(a), reader.getAttributeValue(a)); } - } - next(reader); - if (reader.hasText()) { - row.put("_text",reader.getText().trim()); - next(reader); - } - if (reader.isStartElement()) { - do { - handleElementSimple(row, reader); - } while (next(reader) == XMLStreamConstants.START_ELEMENT); - } - if (reader.isEndElement() || reader.getEventType() == XMLStreamConstants.END_DOCUMENT) { - return row; - } + if (!stack.isEmpty()) { + final Map last = stack.getLast(); + String key = simpleMode ? "_" + reader.getLocalName() : "_children"; + amendToList(last, key, elementMap); + } + + stack.addLast(elementMap); + break; + + case END_ELEMENT: + elementMap = stack.size() > 1 ? stack.removeLast() : stack.getLast(); + + // maintain compatibility with previous implementation: + // if we only have text childs, return them in "_text" and not in "_children" + Object children = elementMap.get("_children"); + if (children!= null) { + if ((children instanceof String) || collectionIsAllStrings(children) ) { + elementMap.put("_text", children); + elementMap.remove("_children"); + } + } + break; + + case CHARACTERS: + final String text = reader.getText().trim(); + if (!text.isEmpty()) { + Map map = stack.getLast(); + amendToList(map, "_children", text); + } + break; + + default: + throw new RuntimeException("dunno know how to handle xml event type " + reader.getEventType()); } - throw new IllegalStateException("Incorrect end-element state "+reader.getEventType()+" after "+element); } - private int next(XMLStreamReader reader) throws XMLStreamException { - reader.next(); - while (reader.isWhiteSpace()) reader.next(); - return reader.getEventType(); + private boolean collectionIsAllStrings(Object collection) { + if (collection instanceof Collection) { + return ((Collection)collection).stream().allMatch(o -> o instanceof String); + } else { + return false; + } + } + + private void amendToList(Map map, String key, Object value) { + final Object element = map.get(key); + if (element == null ) { + map.put(key, value); + } else { + if (element instanceof List) { + ((List)element).add(value); + } else { + List list = new LinkedList<>(); + list.add(element); + list.add(value); + map.put(key, list); + } + } } } diff --git a/src/test/java/apoc/load/XmlTest.java b/src/test/java/apoc/load/XmlTest.java index 2f59a2dc7e..23c15309d5 100644 --- a/src/test/java/apoc/load/XmlTest.java +++ b/src/test/java/apoc/load/XmlTest.java @@ -1,14 +1,23 @@ package apoc.load; import apoc.util.TestUtil; +import org.apache.commons.lang.math.IntRange; import org.junit.After; import org.junit.Before; import org.junit.Test; import org.neo4j.graphdb.GraphDatabaseService; +import org.neo4j.helpers.collection.Iterators; import org.neo4j.test.TestGraphDatabaseFactory; +import java.util.Arrays; +import java.util.List; +import java.util.stream.IntStream; + import static apoc.util.TestUtil.testCall; +import static apoc.util.TestUtil.testResult; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertTrue; public class XmlTest { @@ -17,36 +26,116 @@ public class XmlTest { "_children=[" + "{_type=child, name=Neo4j, _text=Neo4j is a graph database}, " + "{_type=child, name=relational, _children=[" + - "{_type=grandchild, name=MySQL, _text=MySQL is a database & relational}, " + - "{_type=grandchild, name=Postgres, _text=Postgres is a relational database}]}]}"; + "{_type=grandchild, name=MySQL, _text=MySQL is a database & relational}, " + + "{_type=grandchild, name=Postgres, _text=Postgres is a relational database}]}]}"; public static final String XML_AS_NESTED_SIMPLE_MAP = "{_type=parent, name=databases, " + "_child=[" + "{_type=child, name=Neo4j, _text=Neo4j is a graph database}, " + "{_type=child, name=relational, _grandchild=[" + - "{_type=grandchild, name=MySQL, _text=MySQL is a database & relational}, " + - "{_type=grandchild, name=Postgres, _text=Postgres is a relational database}]}]}"; + "{_type=grandchild, name=MySQL, _text=MySQL is a database & relational}, " + + "{_type=grandchild, name=Postgres, _text=Postgres is a relational database}]}]}"; private GraphDatabaseService db; - @Before public void setUp() throws Exception { - db = new TestGraphDatabaseFactory().newImpermanentDatabaseBuilder().setConfig("apoc.import.file.enabled","true").newGraphDatabase(); - TestUtil.registerProcedure(db, Xml.class); - } - @After public void tearDown() { - db.shutdown(); + + @Before + public void setUp() throws Exception { + db = new TestGraphDatabaseFactory().newImpermanentDatabaseBuilder().setConfig("apoc.import.file.enabled", "true").newGraphDatabase(); + TestUtil.registerProcedure(db, Xml.class); } - @Test public void testLoadXml() throws Exception { - testCall(db, "CALL apoc.load.xml('file:databases.xml')", // YIELD value RETURN value + @After + public void tearDown() { + db.shutdown(); + } + + @Test + public void testLoadXml() throws Exception { + testCall(db, "CALL apoc.load.xml('file:databases.xml')", // YIELD value RETURN value (row) -> { Object value = row.get("value"); assertEquals(XML_AS_NESTED_MAP, value.toString()); }); } - @Test public void testLoadXmlSimple() throws Exception { - testCall(db, "CALL apoc.load.xmlSimple('file:databases.xml')", // YIELD value RETURN value + + @Test + public void testLoadXmlSimple() throws Exception { + testCall(db, "CALL apoc.load.xmlSimple('file:databases.xml')", // YIELD value RETURN value (row) -> { Object value = row.get("value"); assertEquals(XML_AS_NESTED_SIMPLE_MAP, value.toString()); }); } + + @Test + public void testMixedContent() { + testCall(db, "CALL apoc.load.xml('file:src/test/resources/mixedcontent.xml')", // YIELD value RETURN value + (row) -> { + Object value = row.get("value"); + assertEquals("{_type=root, _children=[{_type=text, _children=[text0, {_type=mixed}, text1]}, {_type=text, _text=text as cdata}]}", value.toString()); + }); + } + + @Test + public void testBookIds() { + testResult(db, "call apoc.load.xml('file:src/test/resources/books.xml') yield value as catalog\n" + + "UNWIND catalog._children as book\n" + + "RETURN book.id as id\n", result -> { + List ids = Iterators.asList(result.columnAs("id")); + assertTrue(IntStream.rangeClosed(1,12).allMatch(value -> ids.contains(String.format("bk1%02d",value)))); + }); + } + + @Test + public void testFilterIntoCollection() { + testResult(db, "call apoc.load.xml('file:src/test/resources/books.xml') yield value as catalog\n" + + " UNWIND catalog._children as book\n" + + " RETURN book.id, [attr IN book._children WHERE attr._type IN ['author','title'] | [attr._type, attr._text]] as pairs" + , result -> { + assertEquals("+----------------------------------------------------------------------------------------------------------------+\n" + + "| book.id | pairs |\n" + + "+----------------------------------------------------------------------------------------------------------------+\n" + + "| \"bk101\" | [[\"author\",\"Gambardella, Matthew\"],[\"author\",\"Arciniegas, Fabio\"],[\"title\",\"XML Developer's Guide\"]] |\n" + + "| \"bk102\" | [[\"author\",\"Ralls, Kim\"],[\"title\",\"Midnight Rain\"]] |\n" + + "| \"bk103\" | [[\"author\",\"Corets, Eva\"],[\"title\",\"Maeve Ascendant\"]] |\n" + + "| \"bk104\" | [[\"author\",\"Corets, Eva\"],[\"title\",\"Oberon's Legacy\"]] |\n" + + "| \"bk105\" | [[\"author\",\"Corets, Eva\"],[\"title\",\"The Sundered Grail\"]] |\n" + + "| \"bk106\" | [[\"author\",\"Randall, Cynthia\"],[\"title\",\"Lover Birds\"]] |\n" + + "| \"bk107\" | [[\"author\",\"Thurman, Paula\"],[\"title\",\"Splish Splash\"]] |\n" + + "| \"bk108\" | [[\"author\",\"Knorr, Stefan\"],[\"title\",\"Creepy Crawlies\"]] |\n" + + "| \"bk109\" | [[\"author\",\"Kress, Peter\"],[\"title\",\"Paradox Lost\"]] |\n" + + "| \"bk110\" | [[\"author\",\"O'Brien, Tim\"],[\"title\",\"Microsoft .NET: The Programming Bible\"]] |\n" + + "| \"bk111\" | [[\"author\",\"O'Brien, Tim\"],[\"title\",\"MSXML3: A Comprehensive Guide\"]] |\n" + + "| \"bk112\" | [[\"author\",\"Galos, Mike\"],[\"title\",\"Visual Studio 7: A Comprehensive Guide\"]] |\n" + + "+----------------------------------------------------------------------------------------------------------------+\n" + + "12 rows\n", result.resultAsString()); + }); + } + + @Test + public void testReturnCollectionElements() { + testResult(db, "call apoc.load.xml('file:src/test/resources/books.xml') yield value as catalog\n" + + "UNWIND catalog._children as book\n" + + "WITH book.id as id, [attr IN book._children WHERE attr._type IN ['author','title'] | attr._text] as pairs\n" + + "RETURN id, pairs[0] as author, pairs[1] as title" + , result -> { + assertEquals("+-----------------------------------------------------------------------------+\n" + + "| id | author | title |\n" + + "+-----------------------------------------------------------------------------+\n" + + "| \"bk101\" | \"Gambardella, Matthew\" | \"Arciniegas, Fabio\" |\n" + + "| \"bk102\" | \"Ralls, Kim\" | \"Midnight Rain\" |\n" + + "| \"bk103\" | \"Corets, Eva\" | \"Maeve Ascendant\" |\n" + + "| \"bk104\" | \"Corets, Eva\" | \"Oberon's Legacy\" |\n" + + "| \"bk105\" | \"Corets, Eva\" | \"The Sundered Grail\" |\n" + + "| \"bk106\" | \"Randall, Cynthia\" | \"Lover Birds\" |\n" + + "| \"bk107\" | \"Thurman, Paula\" | \"Splish Splash\" |\n" + + "| \"bk108\" | \"Knorr, Stefan\" | \"Creepy Crawlies\" |\n" + + "| \"bk109\" | \"Kress, Peter\" | \"Paradox Lost\" |\n" + + "| \"bk110\" | \"O'Brien, Tim\" | \"Microsoft .NET: The Programming Bible\" |\n" + + "| \"bk111\" | \"O'Brien, Tim\" | \"MSXML3: A Comprehensive Guide\" |\n" + + "| \"bk112\" | \"Galos, Mike\" | \"Visual Studio 7: A Comprehensive Guide\" |\n" + + "+-----------------------------------------------------------------------------+\n" + + "12 rows\n", result.resultAsString()); + }); + } + } diff --git a/src/test/resources/mixedcontent.xml b/src/test/resources/mixedcontent.xml new file mode 100644 index 0000000000..c2df198a31 --- /dev/null +++ b/src/test/resources/mixedcontent.xml @@ -0,0 +1,7 @@ + + + text0 text1 + + \ No newline at end of file