Skip to content

Commit

Permalink
fixes #256
Browse files Browse the repository at this point in the history
* deal gracefully with xml mixed content
* refactoring
* deprecate apoc.load.xmlSimple
  • Loading branch information
sarmbruster committed Jan 27, 2017
1 parent 3de42d5 commit e9a9065
Show file tree
Hide file tree
Showing 5 changed files with 233 additions and 125 deletions.
23 changes: 15 additions & 8 deletions docs/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -5484,14 +5484,21 @@ <h4 id="_load_xml_introduction">Load XML Introduction</h4>
<p>Many existing (enterprise) applications, endpoints and files use XML as data exchange format.</p>
</div>
<div class="paragraph">
<p>To make these datastructures available to Cypher, you can use <code>apoc.load.xml(Simple)</code>.
<p>To make these data structures available to Cypher, you can use <code>apoc.load.xml</code>.
It takes a file or http URL and parses the XML into a map data structure.</p>
</div>
<div class="paragraph">
<p>While <code>apoc.load.xml</code> has a more verbose format, it keeps the ordering of elements.</p>
</div>
<div class="paragraph">
<p>While <code>apoc.load.xmlSimple</code> provides a more compact representation that makes it easier to process.</p>
<div class="admonitionblock note">
<table>
<tr>
<td class="icon">
<div class="title">Note</div>
</td>
<td class="content">
In previous releases we&#8217;ve had <code>apoc.load.xmlSimple</code>. It is now deprecated and has been superseded by
<code>apoc.load.xml(url, true)</code>.
</td>
</tr>
</table>
</div>
<div class="paragraph">
<p>See the following usage-examples for the procedures.</p>
Expand Down Expand Up @@ -5553,7 +5560,7 @@ <h4 id="_simple_xml_format">Simple XML Format</h4>
</div>
<div class="listingblock">
<div class="content">
<pre class="highlight"><code class="language-cypher" data-lang="cypher">call apoc.load.xmlSimple("https://raw.githubusercontent.com/neo4j-contrib/neo4j-apoc-procedures/master/src/test/resources/books.xml")</code></pre>
<pre class="highlight"><code class="language-cypher" data-lang="cypher">call apoc.load.xml("https://raw.githubusercontent.com/neo4j-contrib/neo4j-apoc-procedures/master/src/test/resources/books.xml", true)</code></pre>
</div>
</div>
<div class="listingblock">
Expand Down Expand Up @@ -6057,7 +6064,7 @@ <h4 id="_further_functions">Further Functions</h4>
</div>
<div id="footer">
<div id="footer-text">
Last updated 2017-01-23 16:38:09 +00:00
Last updated 2017-01-25 13:14:42 +01:00
</div>
</div>
</body>
Expand Down
9 changes: 4 additions & 5 deletions docs/loadxml.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@

Many existing (enterprise) applications, endpoints and files use XML as data exchange format.

To make these datastructures available to Cypher, you can use `apoc.load.xml(Simple)`.
To make these data structures available to Cypher, you can use `apoc.load.xml`.
It takes a file or http URL and parses the XML into a map data structure.

While `apoc.load.xml` has a more verbose format, it keeps the ordering of elements.

While `apoc.load.xmlSimple` provides a more compact representation that makes it easier to process.
NOTE: In previous releases we've had `apoc.load.xmlSimple`. It is now deprecated and has been superseded by
`apoc.load.xml(url, true)`.

See the following usage-examples for the procedures.

Expand Down Expand Up @@ -60,7 +59,7 @@ Here is the example file from above loaded with `apoc.load.xmlSimple`

[source,cypher]
----
call apoc.load.xmlSimple("https://raw.githubusercontent.com/neo4j-contrib/neo4j-apoc-procedures/master/src/test/resources/books.xml")
call apoc.load.xml("https://raw.githubusercontent.com/neo4j-contrib/neo4j-apoc-procedures/master/src/test/resources/books.xml", true)
----

[source,javascript]
Expand Down
202 changes: 104 additions & 98 deletions src/main/java/apoc/load/Xml.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,16 @@
import org.neo4j.procedure.Procedure;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.stream.Stream;

import static apoc.util.Util.cleanUrl;
import static javax.xml.stream.XMLStreamConstants.*;

public class Xml {

Expand All @@ -30,112 +27,121 @@ public class Xml {
@Context public GraphDatabaseService db;

@Procedure
@Description("apoc.load.xml('http://example.com/test.xml') YIELD value as doc CREATE (p:Person) SET p.name = doc.name load from XML URL (e.g. web-api) to import XML as single nested map with attributes and _type, _text and _childrenx fields.")
public Stream<MapResult> xml(@Name("url") String url) {
try {
FileUtils.checkReadAllowed(url);
URLConnection urlConnection = new URL(url).openConnection();
FACTORY.setProperty("javax.xml.stream.isCoalescing", true);
XMLStreamReader reader = FACTORY.createXMLStreamReader(urlConnection.getInputStream());
if (reader.nextTag()==XMLStreamConstants.START_ELEMENT) {
return Stream.of(new MapResult(handleElement(reader)));
}
throw new RuntimeException("Can't read url " + cleanUrl(url) + " as XML");
} catch (IOException | XMLStreamException e) {
throw new RuntimeException("Can't read url " + cleanUrl(url) + " as XML", e);
}
@Description("apoc.load.xml('http://example.com/test.xml', false) YIELD value as doc CREATE (p:Person) SET p.name = doc.name load from XML URL (e.g. web-api) to import XML as single nested map with attributes and _type, _text and _childrenx fields.")
public Stream<MapResult> xml(@Name("url") String url, @Name(value = "simple", defaultValue = "false") boolean simpleMode) {
return xmlToMapResult(url, simpleMode);
}
@Procedure
@Description("apoc.load.xml('http://example.com/test.xml') YIELD value as doc CREATE (p:Person) SET p.name = doc.name load from XML URL (e.g. web-api) to import XML as single nested map with attributes and _type, _text and _childrenx fields.")

@Procedure(deprecatedBy = "apoc.load.xml")
@Deprecated
@Description("apoc.load.xmlSimple('http://example.com/test.xml') YIELD value as doc CREATE (p:Person) SET p.name = doc.name load from XML URL (e.g. web-api) to import XML as single nested map with attributes and _type, _text and _children fields. This method does intentionally not work with XML mixed content.")
public Stream<MapResult> xmlSimple(@Name("url") String url) {
return xmlToMapResult(url, true);
}

private Stream<MapResult> xmlToMapResult(@Name("url") String url, boolean simpleMode) {
try {
FileUtils.checkReadAllowed(url);
URLConnection urlConnection = new URL(url).openConnection();
FACTORY.setProperty("javax.xml.stream.isCoalescing", true);
XMLStreamReader reader = FACTORY.createXMLStreamReader(urlConnection.getInputStream());
if (reader.nextTag()==XMLStreamConstants.START_ELEMENT) {
return Stream.of(new MapResult(handleElementSimple(null,reader)));
}
throw new RuntimeException("Can't read url " + cleanUrl(url) + " as XML");
XMLStreamReader reader = getXMLStreamReaderFromUrl(url);
final Deque<Map<String, Object>> stack = new LinkedList<>();
do {
handleXmlEvent(stack, reader, simpleMode);
} while (proceedReader(reader));
return Stream.of(new MapResult(stack.getFirst()));
} catch (IOException | XMLStreamException e) {
throw new RuntimeException("Can't read url " + cleanUrl(url) + " as XML", e);
}
}

private Map<String, Object> handleElement(XMLStreamReader reader) throws XMLStreamException {
LinkedHashMap<String, Object> row = null;
String element = null;
if (reader.isStartElement()) {
int attributes = reader.getAttributeCount();
row = new LinkedHashMap<>(attributes + 3);
element = reader.getLocalName();
row.put("_type", element);
for (int a = 0; a < attributes; a++) {
row.put(reader.getAttributeLocalName(a), reader.getAttributeValue(a));
}
next(reader);
if (reader.hasText()) {
row.put("_text",reader.getText().trim());
next(reader);
}
if (reader.isStartElement()) {
List<Map<String, Object>> children = new ArrayList<>(100);
do {
Map<String, Object> child = handleElement(reader);
if (child != null && !child.isEmpty()) {
children.add(child);
}
} while (next(reader) == XMLStreamConstants.START_ELEMENT);
if (!children.isEmpty()) row.put("_children", children);
}
if (reader.isEndElement() || reader.getEventType() == XMLStreamConstants.END_DOCUMENT) {
return row;
}
private XMLStreamReader getXMLStreamReaderFromUrl(@Name("url") String url) throws IOException, XMLStreamException {
FileUtils.checkReadAllowed(url);
URLConnection urlConnection = new URL(url).openConnection();
FACTORY.setProperty("javax.xml.stream.isCoalescing", true);
return FACTORY.createXMLStreamReader(urlConnection.getInputStream());
}


private boolean proceedReader(XMLStreamReader reader) throws XMLStreamException {
if (reader.hasNext()) {
do {
reader.next();
} while (reader.isWhiteSpace());
return true;
} else {
return false;
}
throw new IllegalStateException("Incorrect end-element state "+reader.getEventType()+" after "+element);
}
private Map<String, Object> handleElementSimple(Map<String,Object> parent, XMLStreamReader reader) throws XMLStreamException {
LinkedHashMap<String, Object> row = null;
String element = null;
if (reader.isStartElement()) {
int attributes = reader.getAttributeCount();
row = new LinkedHashMap<>(attributes + 3);
element = reader.getLocalName();
row.put("_type", element);
for (int a = 0; a < attributes; a++) {
row.put(reader.getAttributeLocalName(a), reader.getAttributeValue(a));
}
if (parent!=null) {
Object children = parent.get("_"+element);
if (children == null) parent.put("_"+element, row);
else if (children instanceof List) ((List)children).add(row);
else {
List list = new ArrayList<>();
list.add(children);
list.add(row);
parent.put("_"+element, list);

private void handleXmlEvent(Deque<Map<String, Object>> stack, XMLStreamReader reader, boolean simpleMode) throws XMLStreamException {
Map<String, Object> elementMap;
switch (reader.getEventType()) {
case START_DOCUMENT:
case END_DOCUMENT:
// intentionally empty
break;
case START_ELEMENT:
int attributes = reader.getAttributeCount();
elementMap = new LinkedHashMap<>(attributes+3);
elementMap.put("_type", reader.getLocalName());
for (int a = 0; a < attributes; a++) {
elementMap.put(reader.getAttributeLocalName(a), reader.getAttributeValue(a));
}
}
next(reader);
if (reader.hasText()) {
row.put("_text",reader.getText().trim());
next(reader);
}
if (reader.isStartElement()) {
do {
handleElementSimple(row, reader);
} while (next(reader) == XMLStreamConstants.START_ELEMENT);
}
if (reader.isEndElement() || reader.getEventType() == XMLStreamConstants.END_DOCUMENT) {
return row;
}
if (!stack.isEmpty()) {
final Map<String, Object> last = stack.getLast();
String key = simpleMode ? "_" + reader.getLocalName() : "_children";
amendToList(last, key, elementMap);
}

stack.addLast(elementMap);
break;

case END_ELEMENT:
elementMap = stack.size() > 1 ? stack.removeLast() : stack.getLast();

// maintain compatibility with previous implementation:
// if we only have text childs, return them in "_text" and not in "_children"
Object children = elementMap.get("_children");
if (children!= null) {
if ((children instanceof String) || collectionIsAllStrings(children) ) {
elementMap.put("_text", children);
elementMap.remove("_children");
}
}
break;

case CHARACTERS:
final String text = reader.getText().trim();
if (!text.isEmpty()) {
Map<String, Object> map = stack.getLast();
amendToList(map, "_children", text);
}
break;

default:
throw new RuntimeException("dunno know how to handle xml event type " + reader.getEventType());
}
throw new IllegalStateException("Incorrect end-element state "+reader.getEventType()+" after "+element);
}

private int next(XMLStreamReader reader) throws XMLStreamException {
reader.next();
while (reader.isWhiteSpace()) reader.next();
return reader.getEventType();
private boolean collectionIsAllStrings(Object collection) {
if (collection instanceof Collection) {
return ((Collection<Object>)collection).stream().allMatch(o -> o instanceof String);
} else {
return false;
}
}

private void amendToList(Map<String, Object> map, String key, Object value) {
final Object element = map.get(key);
if (element == null ) {
map.put(key, value);
} else {
if (element instanceof List) {
((List)element).add(value);
} else {
List<Object> list = new LinkedList<>();
list.add(element);
list.add(value);
map.put(key, list);
}
}
}
}
Loading

0 comments on commit e9a9065

Please sign in to comment.