From 172371fd6b54ff40ec0f5b987540d3bae268274b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kamil=20Go=C5=82=C4=99biewski?= Date: Tue, 16 Jan 2024 02:13:54 +0100 Subject: [PATCH] Add support to optionally allow surrogate pair entities (#165) (#174) --- release-notes/CREDITS | 5 + release-notes/VERSION | 2 + .../java/com/ctc/wstx/api/ReaderConfig.java | 29 +++ .../com/ctc/wstx/api/WstxInputProperties.java | 9 + .../java/com/ctc/wstx/sr/StreamScanner.java | 103 ++++++----- .../org/codehaus/stax/test/BaseStaxTest.java | 10 ++ .../stax/test/stream/TestEntityRead.java | 166 ++++++++++++++++-- 7 files changed, 262 insertions(+), 62 deletions(-) diff --git a/release-notes/CREDITS b/release-notes/CREDITS index c2745a9f..a2159234 100644 --- a/release-notes/CREDITS +++ b/release-notes/CREDITS @@ -89,3 +89,8 @@ Tim Martin (@Orbisman) * Contributed fix for #67: Wrong line for XML event location in elements following DTD (6.6.0) + +Kamil Gołębiewski (@Magmaruss) + +* Contributed #165: Add support to optionally allow surrogate pair entities + (6.6.0) diff --git a/release-notes/VERSION b/release-notes/VERSION index 64cdd80e..44360840 100644 --- a/release-notes/VERSION +++ b/release-notes/VERSION @@ -9,6 +9,8 @@ Project: woodstox #67: Wrong line for XML event location in elements following DTD (reported by @m-g-sonar) (fix contributed by Tim M) +#165: Add support to optionally allow surrogate pair entities + (contributed by Kamil G) #176: Fix parser when not replacing entities and treating char references as entities (contributed by Guillaume N) diff --git a/src/main/java/com/ctc/wstx/api/ReaderConfig.java b/src/main/java/com/ctc/wstx/api/ReaderConfig.java index 5aa334d9..f6514dce 100644 --- a/src/main/java/com/ctc/wstx/api/ReaderConfig.java +++ b/src/main/java/com/ctc/wstx/api/ReaderConfig.java @@ -139,6 +139,11 @@ public final class ReaderConfig final static int PROP_MAX_DTD_DEPTH = 69; + /** + * @since 6.6 + */ + final static int PROP_ALLOW_SURROGATE_PAIR_ENTITIES = 70; + /* //////////////////////////////////////////////// // Limits for numeric properties @@ -361,6 +366,8 @@ public final class ReaderConfig PROP_UNDECLARED_ENTITY_RESOLVER); sProperties.put(WstxInputProperties.P_BASE_URL, PROP_BASE_URL); + sProperties.put(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES, + PROP_ALLOW_SURROGATE_PAIR_ENTITIES); sProperties.put(WstxInputProperties.P_INPUT_PARSING_MODE, PROP_INPUT_PARSING_MODE); } @@ -419,6 +426,13 @@ public final class ReaderConfig */ protected URL mBaseURL; + /** + * Whether to allow surrogate pairs as entities (2 code-points as one target character). + * + * @since 6.6 + */ + protected boolean mAllowSurrogatePairEntities = false; + /** * Parsing mode can be changed from the default xml compliant * behavior to one of alternate modes (fragment processing, @@ -583,6 +597,7 @@ public ReaderConfig createNonShared(SymbolTable sym) rc.mMaxEntityDepth = mMaxEntityDepth; rc.mMaxEntityCount = mMaxEntityCount; rc.mMaxDtdDepth = mMaxDtdDepth; + rc.mAllowSurrogatePairEntities = mAllowSurrogatePairEntities; if (mSpecialProperties != null) { int len = mSpecialProperties.length; Object[] specProps = new Object[len]; @@ -792,6 +807,10 @@ public XMLResolver getUndeclaredEntityResolver() { public URL getBaseURL() { return mBaseURL; } + public boolean allowsSurrogatePairEntities() { + return mAllowSurrogatePairEntities; + } + public WstxInputProperties.ParsingMode getInputParsingMode() { return mParsingMode; } @@ -1074,6 +1093,10 @@ public void setUndeclaredEntityResolver(XMLResolver r) { } public void setBaseURL(URL baseURL) { mBaseURL = baseURL; } + + public void doAllowSurrogatePairEntities(boolean state) { + mAllowSurrogatePairEntities = state; + } public void setInputParsingMode(WstxInputProperties.ParsingMode mode) { mParsingMode = mode; @@ -1533,6 +1556,8 @@ public Object getProperty(int id) return getUndeclaredEntityResolver(); case PROP_BASE_URL: return getBaseURL(); + case PROP_ALLOW_SURROGATE_PAIR_ENTITIES: + return allowsSurrogatePairEntities(); case PROP_INPUT_PARSING_MODE: return getInputParsingMode(); @@ -1757,6 +1782,10 @@ public boolean setProperty(String propName, int id, Object value) setBaseURL(u); } break; + + case PROP_ALLOW_SURROGATE_PAIR_ENTITIES: + doAllowSurrogatePairEntities(ArgUtil.convertToBoolean(propName, value)); + break; case PROP_INPUT_PARSING_MODE: setInputParsingMode((WstxInputProperties.ParsingMode) value); diff --git a/src/main/java/com/ctc/wstx/api/WstxInputProperties.java b/src/main/java/com/ctc/wstx/api/WstxInputProperties.java index 840e7ddd..40773f71 100644 --- a/src/main/java/com/ctc/wstx/api/WstxInputProperties.java +++ b/src/main/java/com/ctc/wstx/api/WstxInputProperties.java @@ -300,6 +300,15 @@ public final class WstxInputProperties * DTD subset). */ public final static String P_BASE_URL = "com.ctc.wstx.baseURL"; + + /** + * Property of type {@link java.lang.Boolean}, that will allow parsing + * high unicode characters written by surrogate pairs (2 code points) + * Default set as Boolean.FALSE, because it is not a standard behavior + * + * @since 6.6 + */ + public final static String P_ALLOW_SURROGATE_PAIR_ENTITIES = "com.ctc.wstx.allowSurrogatePairEntities"; // // // Alternate parsing modes diff --git a/src/main/java/com/ctc/wstx/sr/StreamScanner.java b/src/main/java/com/ctc/wstx/sr/StreamScanner.java index 645ab8dc..7ca155f2 100644 --- a/src/main/java/com/ctc/wstx/sr/StreamScanner.java +++ b/src/main/java/com/ctc/wstx/sr/StreamScanner.java @@ -1183,59 +1183,62 @@ protected int resolveSimpleEntity(boolean checkStd) char[] buf = mInputBuffer; int ptr = mInputPtr; char c = buf[ptr++]; + final boolean allowSurrogatePairs = mConfig.allowsSurrogatePairEntities(); // Numeric reference? if (c == '#') { - c = buf[ptr++]; int value = 0; + int pairValue = 0; int inputLen = mInputEnd; - if (c == 'x') { // hex - while (ptr < inputLen) { + + mInputPtr = ptr; + value = resolveCharEnt(null, false); + ptr = mInputPtr; + c = buf[ptr - 1]; + + // If resolving entity surrogate pairs enabled and if current entity + // is in range of high surrogate value, try to find surrogate pair + if (allowSurrogatePairs && value >= 0xD800 && value <= 0xDBFF) { + if (c == ';' && ptr + 1 < inputLen) { c = buf[ptr++]; - if (c == ';') { - break; - } - value = value << 4; - if (c <= '9' && c >= '0') { - value += (c - '0'); - } else if (c >= 'a' && c <= 'f') { - value += (10 + (c - 'a')); - } else if (c >= 'A' && c <= 'F') { - value += (10 + (c - 'A')); - } else { - mInputPtr = ptr; // so error points to correct char - throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F)."); - } - /* Need to check for overflow; easiest to do right as - * it happens... - */ - if (value > MAX_UNICODE_CHAR) { - reportUnicodeOverflow(); - } - } - } else { // numeric (decimal) - while (c != ';') { - if (c <= '9' && c >= '0') { - value = (value * 10) + (c - '0'); - // Overflow? - if (value > MAX_UNICODE_CHAR) { - reportUnicodeOverflow(); + if (c == '&' && ptr + 1 < inputLen) { + c = buf[ptr++]; + if (c == '#' && ptr + 1 < inputLen) { + try { + mInputPtr = ptr; + pairValue = resolveCharEnt(null, false); + ptr = mInputPtr; + c = buf[ptr -1]; + } catch (WstxUnexpectedCharException wuce) { + reportNoSurrogatePair(value); + } + } else { + reportNoSurrogatePair(value); } } else { - mInputPtr = ptr; // so error points to correct char - throwUnexpectedChar(c, "; expected a decimal number."); + reportNoSurrogatePair(value); } - if (ptr >= inputLen) { - break; - } - c = buf[ptr++]; + } else { + reportNoSurrogatePair(value); } } + // We get here either if we got it all, OR if we ran out of // input in current buffer. if (c == ';') { // got the full thing mInputPtr = ptr; - validateChar(value); + + if (allowSurrogatePairs && pairValue > 0) { + // [woodstox-core#165] + // If pair value is not in range of low surrogate values, then throw an error + if (pairValue < 0xDC00 || pairValue > 0xDFFF) { + reportInvalidSurrogatePair(value, pairValue); + } + value = 0x10000 + (value - 0xD800) * 0x400 + (pairValue - 0xDC00); + } else { + validateChar(value); + } + return value; } @@ -1352,7 +1355,7 @@ protected int resolveCharOnlyEntity(boolean checkStd) // A char reference? if (c == '#') { // yup ++mInputPtr; - return resolveCharEnt(null); + return resolveCharEnt(null, true); } // nope... except may be a pre-def? @@ -1518,7 +1521,7 @@ protected int fullyResolveEntity(boolean allowExt) // Do we have a (numeric) character entity reference? if (c == '#') { // numeric final StringBuffer originalSurface = new StringBuffer("#"); - int ch = resolveCharEnt(originalSurface); + int ch = resolveCharEnt(originalSurface, true); if (mCfgTreatCharRefsAsEntities) { final char[] originalChars = new char[originalSurface.length()]; originalSurface.getChars(0, originalSurface.length(), originalChars, 0); @@ -2314,7 +2317,7 @@ protected final void parseUntil(TextBuffer tb, char endChar, boolean convertLFs, /////////////////////////////////////////////////////////////////////// */ - private int resolveCharEnt(StringBuffer originalCharacters) + private int resolveCharEnt(StringBuffer originalCharacters, boolean validateChar) throws XMLStreamException { int value = 0; @@ -2369,7 +2372,9 @@ private int resolveCharEnt(StringBuffer originalCharacters) } } } - validateChar(value); + if (validateChar) { + validateChar(value); + } return value; } @@ -2455,7 +2460,19 @@ private void reportUnicodeOverflow() private void reportIllegalChar(int value) throws XMLStreamException { - throwParseError("Illegal character entity: expansion character (code 0x{0}", Integer.toHexString(value), null); + throwParseError("Illegal character entity: expansion character (code 0x{0})", Integer.toHexString(value), null); + } + + private void reportNoSurrogatePair(int highSurrogate) + throws XMLStreamException + { + throwParseError("Cannot find surrogate pair: high surrogate character (code 0x{0})", Integer.toHexString(highSurrogate), null); + } + + private void reportInvalidSurrogatePair(int firstSurrogate, int secondSurrogate) + throws XMLStreamException + { + throwParseError("Invalid surrogate pair: first surrogate character (code 0x{0}), second surrogate character (code 0x{1})", Integer.toHexString(firstSurrogate), Integer.toHexString(secondSurrogate)); } protected void verifyLimit(String type, long maxValue, long currentValue) diff --git a/src/test/java/org/codehaus/stax/test/BaseStaxTest.java b/src/test/java/org/codehaus/stax/test/BaseStaxTest.java index b5e1438d..fea81ec7 100644 --- a/src/test/java/org/codehaus/stax/test/BaseStaxTest.java +++ b/src/test/java/org/codehaus/stax/test/BaseStaxTest.java @@ -8,6 +8,8 @@ import javax.xml.stream.*; import javax.xml.stream.events.XMLEvent; +import com.ctc.wstx.api.WstxInputProperties; + /* Latest updates: * * - 07-Sep-2007, TSa: Updating based on latest understanding of @@ -275,6 +277,14 @@ protected static boolean setSupportExternalEntities(XMLInputFactory f, boolean s return false; } } + + protected static void setResolveEntitySurrogatePairs(XMLInputFactory f, boolean state) + throws XMLStreamException + { + Boolean b = state ? Boolean.TRUE : Boolean.FALSE; + f.setProperty(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES, b); + assertEquals(b, f.getProperty(WstxInputProperties.P_ALLOW_SURROGATE_PAIR_ENTITIES)); + } protected static void setResolver(XMLInputFactory f, XMLResolver resolver) throws XMLStreamException diff --git a/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java b/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java index cb92c6c9..1f91b9d6 100644 --- a/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java +++ b/src/test/java/org/codehaus/stax/test/stream/TestEntityRead.java @@ -1,5 +1,9 @@ package org.codehaus.stax.test.stream; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; + import javax.xml.stream.*; import org.codehaus.stax.test.SimpleResolver; @@ -27,7 +31,7 @@ public void testValidPredefdEntities() String EXP = "Testing \"this\" & 'that' !? !"; String XML = "Testing "this" & 'that' !? !"; - XMLStreamReader sr = getReader(XML, false, true, true); + XMLStreamReader sr = getReader(XML, false, true, true, false); assertTokenType(START_ELEMENT, sr.next()); assertTokenType(CHARACTERS, sr.next()); @@ -52,7 +56,7 @@ public void testValidCharEntities() throws XMLStreamException { String XML = "surrogates: 񐀀."; - XMLStreamReader sr = getReader(XML, true, true, true); + XMLStreamReader sr = getReader(XML, true, true, true, false); assertTokenType(START_ELEMENT, sr.next()); assertTokenType(CHARACTERS, sr.next()); @@ -73,6 +77,52 @@ public void testValidCharEntities() assertTokenType(END_ELEMENT, type); sr.close(); } + + /** + * This unit test checks that resolving of surrogate pairs works + * as expected, including different ways of writing entities + */ + public void testValidSurrogatePairEntities() + throws XMLStreamException + { + final Map xmlWithExp = new HashMap(); + // Numeric surrogate pairs + xmlWithExp.put("surrogate pair: ��.", + "surrogate pair: \uD83C\uDF85."); + // Hex and numeric surrogate pairs + xmlWithExp.put("surrogate pair: ��.", + "surrogate pair: \uD83C\uDF85."); + // Numeric and hex surrogate pairs + xmlWithExp.put("surrogate pair: ��.", + "surrogate pair: \uD83C\uDF85."); + // Hex surrogate pairs + xmlWithExp.put("surrogate pair: ��.", + "surrogate pair: \uD83C\uDF85."); + // Two surrogate pairs + xmlWithExp.put("surrogate pair: ����.", + "surrogate pair: \uD83C\uDF85\uD83C\uDF84."); + // Surrogate pair and simple entity + xmlWithExp.put("surrogate pair: ��™.", + "surrogate pair: \uD83C\uDF85\u2122."); + + for (Entry xmlExp: xmlWithExp.entrySet()) { + XMLStreamReader sr = getReader(xmlExp.getKey(), true, true, true, true); + assertTokenType(START_ELEMENT, sr.next()); + assertTokenType(CHARACTERS, sr.next()); + + StringBuffer sb = new StringBuffer(getAndVerifyText(sr)); + int type; + + while ((type = sr.next()) == CHARACTERS) { + sb.append(getAndVerifyText(sr)); + } + + String result = sb.toString(); + assertEquals(xmlExp.getValue(), result); + assertTokenType(END_ELEMENT, type); + sr.close(); + } + } public void testValidGeneralEntities() throws XMLStreamException @@ -85,7 +135,7 @@ public void testValidGeneralEntities() +"]>\n" +"&x; &both; &aa;&myAmp;"; - XMLStreamReader sr = getReader(XML, false, true, true); + XMLStreamReader sr = getReader(XML, false, true, true, false); assertTokenType(DTD, sr.next()); int type = sr.next(); @@ -126,7 +176,7 @@ public void testUnexpandedEntities() +" ]>\n" +"&Start"&myent;End!"; - XMLStreamReader sr = getReader(XML, false, true, false); + XMLStreamReader sr = getReader(XML, false, true, false, false); assertTokenType(DTD, sr.next()); int type = sr.next(); @@ -163,11 +213,11 @@ public void testUnexpandedEntities() +""; // First, no coalescing - sr = getReader(XML, false, false, false); + sr = getReader(XML, false, false, false, false); streamThrough(sr); // then with coalescing - sr = getReader(XML, false, true, false); + sr = getReader(XML, false, true, false, false); streamThrough(sr); } @@ -184,7 +234,7 @@ public void testUnexpandedEntities2() +" ]>" +"&myent;"; - XMLStreamReader sr = getReader(XML, false, true, false); + XMLStreamReader sr = getReader(XML, false, true, false, false); assertTokenType(DTD, sr.next()); assertTokenType(START_ELEMENT, sr.next()); @@ -225,7 +275,7 @@ public void testElementEntities() +"]>\n" +"&ent1;&ent2;&ent3;&ent4a;"; - XMLStreamReader sr = getReader(XML, true, true, true); + XMLStreamReader sr = getReader(XML, true, true, true, false); assertTokenType(DTD, sr.next()); // May or may not get whitespace @@ -289,7 +339,7 @@ public void testQuotedCDataEndMarker() +" and then alternatives: ]]>" +", ]]>" +""; - XMLStreamReader sr = getReader(XML, true, false, true); + XMLStreamReader sr = getReader(XML, true, false, true, false); streamThrough(sr); } catch (Exception e) { fail("Didn't except problems with pre-def/char entity quoted ']]>'; got: "+e); @@ -303,7 +353,7 @@ public void testQuotedCDataEndMarker() +"" +" &doubleBracket;> and &doubleBracket;>" +""; - XMLStreamReader sr = getReader(XML, true, false, true); + XMLStreamReader sr = getReader(XML, true, false, true, false); streamThrough(sr); } catch (Exception e) { fail("Didn't except problems with general entity quoted ']]>'; got: "+e); @@ -326,7 +376,7 @@ public void testInvalidEntityUndeclared() throws XMLStreamException { XMLStreamReader sr = getReader("&myent;", - true, false, true); + true, false, true, false); try { streamThrough(sr); fail("Expected an exception for invalid comment content"); @@ -341,7 +391,7 @@ public void testInvalidEntityRecursive() +"\n" +"\n" +"]> &ent1;", - false, true, true); + false, true, true, false); streamThroughFailing(sr, "recursive general entity/ies"); @@ -363,7 +413,7 @@ public void testInvalidEntityPEInIntSubset() +"\n" +"\n" +"]> ", - false, true, true); + false, true, true, false); streamThroughFailing(sr, "declaring a parameter entity in the internal DTD subset"); } @@ -382,7 +432,7 @@ public void testInvalidEntityPartial() ("\n" +"]>&partial;;", - false, false, true); + false, false, true, false); /* Hmmh. Actually, fully conforming implementations should throw * an exception when parsing internal DTD subset. But better @@ -407,6 +457,79 @@ public void testInvalidEntityPartial() assertTokenType(START_ELEMENT, type2); fail("Expected an exception for partial entity reference: current token after text: "+tokenTypeDesc(lastType)); } + + /** + * Test that ensures that an invalid surrogate pair entities is caught. + * It could be pair of high surrogate and simple entity, high surrogate + * with no pair, low surrogate as first or unclosed entity + */ + public void testInvalidSurrogatePairEntities() + throws XMLStreamException + { + final String[][] invalidSurrogatePairsAndExpectedErrors = { + // Invalid pair + {"surrogate pair: �ᙚ.", "Invalid surrogate pair"}, + // No pair + {"surrogate pair: �.", "Cannot find surrogate pair"}, + // Low surrogate as first + {"surrogate pair: ��.", "Illegal character entity"}, + // Unclosed second entity + {"surrogate pair: �ȩ", "Cannot find surrogate pair"} + }; + + for (String[] surrogateAndError: invalidSurrogatePairsAndExpectedErrors) { + final String invalidSurrogatePair = surrogateAndError[0]; + final String expectedErrorPhrase = surrogateAndError[1]; + + XMLStreamReader sr = getReader(invalidSurrogatePair, + true, false, true, true); + try { + streamThrough(sr); + fail("Expected an exception for invalid surrogate pair"); + } catch (XMLStreamException e) { + if (!e.getMessage().startsWith(expectedErrorPhrase)) { + fail(String.format( + "Expected an exception starting from phrase: '%s' for invalid surrogate test case: '%s', but the message was: '%s'", + expectedErrorPhrase, + invalidSurrogatePair, + e.getMessage() + )); + } + } + } + } + + /** + * Test that ensures that an exception is thrown when + * allow surrogate pair entities option is disabled. + * Expected default behavior should be an exception + * with message starting with: Illegal character entity + */ + public void testAllowSurrogatePairEntitiesDisabled() + throws XMLStreamException + { + final String expectedErrorPhrase = "Illegal character entity"; + final String surrogatePairEntitiesXML = "surrogate pair: ��."; + + final XMLStreamReader sr = getReader(surrogatePairEntitiesXML, true, true, true, false); + assertTokenType(START_ELEMENT, sr.next()); + assertTokenType(CHARACTERS, sr.next()); + + try { + streamThrough(sr); + fail("Expected an exception for illegal character entity when surrogate pair entities allowation is disabled"); + } catch (XMLStreamException e) { + if (!e.getMessage().startsWith(expectedErrorPhrase)) { + fail(String.format( + "Expected an exception starting from phrase: '%s' when surrogate pair entities allowation is disabled, but the message was: '%s'", + expectedErrorPhrase, + e.getMessage() + )); + } + } finally { + sr.close(); + } + } /** * This unit test checks that external entities can be resolved; and @@ -424,7 +547,7 @@ public void testExternalEntityWithResolver() +"]>ent='&extEnt;'"; // ns-aware, coalescing (to simplify verifying), entity expanding - XMLInputFactory f = doGetFactory(true, true, true); + XMLInputFactory f = doGetFactory(true, true, true, false); if (!setSupportExternalEntities(f, true)) { reportNADueToExtEnt("testExternalEntityWithResolver"); @@ -482,7 +605,7 @@ private void doTestProperties(boolean nsAware) +"\n" +"\n" +"]>&myent;&ent2;", - nsAware, false, false); + nsAware, false, false, false); assertTokenType(DTD, sr.next()); assertTokenType(START_ELEMENT, sr.next()); @@ -611,15 +734,19 @@ private void doTestProperties(boolean nsAware) * need to be enabled just for that purpose. */ private XMLStreamReader getReader(String contents, boolean nsAware, - boolean coalescing, boolean replEntities) + boolean coalescing, boolean replEntities, + boolean resolveSurrogatePairs) throws XMLStreamException { - XMLInputFactory f = doGetFactory(nsAware, coalescing, replEntities); + XMLInputFactory f = doGetFactory(nsAware, coalescing, + replEntities, resolveSurrogatePairs); return constructStreamReader(f, contents); } private XMLInputFactory doGetFactory(boolean nsAware, - boolean coalescing, boolean replEntities) + boolean coalescing, + boolean replEntities, + boolean resolveSurrogatePairs) throws XMLStreamException { XMLInputFactory f = getInputFactory(); @@ -629,6 +756,7 @@ private XMLInputFactory doGetFactory(boolean nsAware, setSupportExternalEntities(f, true); setReplaceEntities(f, replEntities); setValidating(f, false); + setResolveEntitySurrogatePairs(f, resolveSurrogatePairs); return f; } }