jhy · jhy · May 6, 2023 · Oct 1, 2022 · Jan 31, 2023 · Apr 13, 2023
diff --git a/src/main/java/org/jsoup/helper/W3CDom.java b/src/main/java/org/jsoup/helper/W3CDom.java
@@ -38,7 +38,8 @@
 import java.util.Map;
 import java.util.Properties;
 import java.util.Stack;
-
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import static javax.xml.transform.OutputKeys.METHOD;
 import static org.jsoup.nodes.Document.OutputSettings.Syntax;
 
@@ -96,6 +97,35 @@ public static Document convert(org.jsoup.nodes.Document in) {
         return (new W3CDom().fromJsoup(in));
     }
 
+    /**
+     * Matches the <code>xmlns="http://www.w3.org/1999/xhtml"</code> default namespace declaration
+     * inside the opening <code>&lt;html&gt;</code> tag of serialized output. Capture group 1 spans
+     * the declaration together with its leading whitespace, so it can be cut out cleanly. The dots
+     * of the URI are escaped so only the exact XHTML namespace matches. This is "good enough" for
+     * the output of the {@link Transformer} used here, but is not a complete solution: a manually
+     * constructed string (e.g. whitespace around <code>=</code>) may not match.
+     */
+    static final Pattern HTML_DEFAULT_NAMESPACE_PATTERN =
+        Pattern.compile("<html[^>]*(\\sxmlns=['\"]http://www\\.w3\\.org/1999/xhtml['\"])");
+
+    /**
+     * Strips the first default <code>xmlns="http://www.w3.org/1999/xhtml"</code> namespace
+     * declaration found by {@link #HTML_DEFAULT_NAMESPACE_PATTERN} from serialized HTML.
+     *
+     * @param html The serialized HTML.
+     * @return The input without that declaration (and its leading whitespace), or the input
+     *         unchanged when no declaration is found.
+     * @see <a href="https://github.com/jhy/jsoup/issues/1837">Issue #1837: Bug: DOM elements not
+     *      being placed in (X)HTML namespace.</a>
+     */
+    static String removeDefaultHtmlNamespaceDeclaration(String html) {
+        final Matcher declaration = HTML_DEFAULT_NAMESPACE_PATTERN.matcher(html);
+        if (!declaration.find()) {
+            return html; // nothing to strip
+        }
+        return new StringBuilder(html).delete(declaration.start(1), declaration.end(1)).toString();
+    }
+
     /**
      * Serialize a W3C document to a String. Provide Properties to define output settings including if HTML or XML. If
      * you don't provide the properties ({@code null}), the output will be auto-detected based on the content of the
@@ -140,7 +170,16 @@ else if (doctype.getName().equalsIgnoreCase("html")
             }
 
             transformer.transform(domSource, result);
-            return writer.toString();
+            String resultString = writer.toString();
+            String outputMethod = properties != null ? properties.get(METHOD) : null;
+            // Remove any default xmlns="http://www.w3.org/1999/xhtml" namespace declaration,
+            // but only if it was added during serialization (not if the document already had it).
+            // Monitor https://stackoverflow.com/q/73919306 for a better approach.
+            if ((outputMethod == null || outputMethod.equals("html"))   //only for HTML output
+                && !doc.getDocumentElement().hasAttribute("xmlns")) {
+              resultString = removeDefaultHtmlNamespaceDeclaration(resultString);
+            }
+            return resultString;
 
         } catch (TransformerException e) {
             throw new IllegalStateException(e);
@@ -348,6 +387,7 @@ protected static class W3CBuilder implements NodeVisitor {
         public W3CBuilder(Document doc) {
             this.doc = doc;
             namespacesStack.push(new HashMap<>());
+            namespacesStack.peek().put("", "http://www.w3.org/1999/xhtml"); // "" prefix maps to the XHTML namespace; presumably the default for unprefixed tags - TODO confirm and document
             dest = doc;
             contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty); // Track the context jsoup Element, so we can save the corresponding w3c element
         }
@@ -366,9 +406,9 @@ public void head(org.jsoup.nodes.Node source, int depth) {
                 tagname to something safe, because that isn't going to be meaningful downstream. This seems(?) to be
                 how browsers handle the situation, also. https://github.com/jhy/jsoup/issues/1093 */
                 try {
-                    Element el = namespace == null && tagName.contains(":") ?
-                        doc.createElementNS("", tagName) : // doesn't have a real namespace defined
-                        doc.createElementNS(namespace, tagName);
+                    // use an empty namespace if none is present but the tag name has a prefix
+                    String imputedNamespace = namespace == null && tagName.contains(":") ? "" : namespace;
+                    Element el = doc.createElementNS(imputedNamespace, tagName);
                     copyAttributes(sourceEl, el);
                     append(el, sourceEl);
                     if (sourceEl == contextElement)

diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -26,6 +26,7 @@
 import java.nio.charset.StandardCharsets;
 import java.util.Locale;
 import java.util.Map;
+import java.util.regex.Matcher;
 
 import static org.junit.jupiter.api.Assertions.*;
 
@@ -51,6 +52,43 @@ private static Document parseXml(String xml, boolean nameSpaceAware) {
         }
     }
 
+    @Test
+    public void htmlDefaultNamespaceRemovalPatternDoubleQuotes() {
+        // group 1 must capture the double-quoted declaration plus its leading space
+        String html = "<!doctype html system \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body><p>one</p></body></html>";
+        Matcher matcher = W3CDom.HTML_DEFAULT_NAMESPACE_PATTERN.matcher(html);
+        assertTrue(matcher.find());
+        assertEquals(" xmlns=\"http://www.w3.org/1999/xhtml\"", matcher.group(1));
+    }
+
+    @Test
+    public void htmlDefaultNamespaceRemovalPatternSingleQuotes() {
+        // group 1 must capture the single-quoted declaration plus its leading space
+        String html = "<!doctype html system \"about:legacy-compat\"><html xmlns='http://www.w3.org/1999/xhtml'><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body><p>one</p></body></html>";
+        Matcher matcher = W3CDom.HTML_DEFAULT_NAMESPACE_PATTERN.matcher(html);
+        assertTrue(matcher.find());
+        assertEquals(" xmlns='http://www.w3.org/1999/xhtml'", matcher.group(1));
+    }
+
+    @Test
+    public void htmlDefaultNamespaceRemovalPatternOtherAttributes() {
+        // attributes before and after the declaration must not end up in group 1
+        String html = "<!doctype html system \"about:legacy-compat\"><html foo=\"bar\" xmlns=\"http://www.w3.org/1999/xhtml\" test=\"example\"><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body><p>one</p></body></html>";
+        Matcher matcher = W3CDom.HTML_DEFAULT_NAMESPACE_PATTERN.matcher(html);
+        assertTrue(matcher.find());
+        assertEquals(" xmlns=\"http://www.w3.org/1999/xhtml\"", matcher.group(1));
+    }
+
+    @Test
+    public void removeDefaultHtmlNamespaceDeclaration() {
+        // the declaration is stripped when present; input without it passes through unchanged
+        String htmlWithXmlns =
+            "<!doctype html system \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body><p>one</p></body></html>";
+        String htmlWithoutXmlns = "<!doctype html system \"about:legacy-compat\"><html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body><p>one</p></body></html>";
+        assertEquals(htmlWithoutXmlns, W3CDom.removeDefaultHtmlNamespaceDeclaration(htmlWithXmlns), "Removes default HTML namespace declaration.");
+        assertEquals(htmlWithoutXmlns, W3CDom.removeDefaultHtmlNamespaceDeclaration(htmlWithoutXmlns), "Still works if no default HTML namespace declaration.");
+    }
+
     @Test
     public void simpleConversion() {
         String html = "<html><head><title>W3c</title></head><body><p class='one' id=12>Text</p><!-- comment --><invalid>What<script>alert('!')";
@@ -62,7 +100,7 @@ public void simpleConversion() {
         assertEquals(0, meta.getLength());
 
         String out = W3CDom.asString(wDoc, W3CDom.OutputXml());
-        String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html><head><title>W3c</title></head><body><p class=\"one\" id=\"12\">Text</p><!-- comment --><invalid>What<script>alert('!')</script></invalid></body></html>";
+        String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><title>W3c</title></head><body><p class=\"one\" id=\"12\">Text</p><!-- comment --><invalid>What<script>alert('!')</script></invalid></body></html>";
         assertEquals(expected, TextUtil.stripNewlines(out));
 
         Document roundTrip = parseXml(out, true);
@@ -74,7 +112,7 @@ public void simpleConversion() {
         String furtherOut = W3CDom.asString(wDoc, properties);
         assertTrue(furtherOut.length() > out.length()); // wanted to assert formatting, but actual indentation is platform specific so breaks in CI
         String furtherExpected =
-            "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html><head><title>W3c</title></head><body><p class=\"one\" id=\"12\">Text</p><!-- comment --><invalid>What<script>alert('!')</script></invalid></body></html>";
+            "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><title>W3c</title></head><body><p class=\"one\" id=\"12\">Text</p><!-- comment --><invalid>What<script>alert('!')</script></invalid></body></html>";
         assertEquals(furtherExpected, TextUtil.stripNewlines(furtherOut)); // on windows, DOM will write newlines as \r\n
     }
 
@@ -86,7 +124,7 @@ public void convertsGoogle() throws IOException {
         W3CDom w3c = new W3CDom();
         Document wDoc = w3c.fromJsoup(doc);
         Node htmlEl = wDoc.getChildNodes().item(1);
-        assertNull(htmlEl.getNamespaceURI());
+        assertEquals("http://www.w3.org/1999/xhtml", htmlEl.getNamespaceURI());
         assertEquals("html", htmlEl.getLocalName());
         assertEquals("html", htmlEl.getNodeName());
 
@@ -187,7 +225,7 @@ public void handlesInvalidAttributeNames() {
 
         Document w3Doc = W3CDom.convert(jsoupDoc);
         String xml = W3CDom.asString(w3Doc, W3CDom.OutputXml());
-        assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html><head/><body name=\"\" style=\"color: red\"/></html>", xml);
+        assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body name=\"\" style=\"color: red\"/></html>", xml);
     }
 
     @Test
@@ -221,7 +259,7 @@ public void handlesInvalidTagAsText() {
 
         Document w3Doc = W3CDom.convert(jsoup);
         String xml = W3CDom.asString(w3Doc, W3CDom.OutputXml());
-        assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html><head/><body>&lt;インセンティブで高収入！&gt;Text <p>More</p></body></html>", xml);
+        assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body>&lt;インセンティブで高収入！&gt;Text <p>More</p></body></html>", xml);
     }
 
     @Test
@@ -232,7 +270,7 @@ public void treatsUndeclaredNamespaceAsLocalName() {
         Document w3Doc = new W3CDom().fromJsoup(doc);
         Node htmlEl = w3Doc.getFirstChild();
 
-        assertNull(htmlEl.getNamespaceURI());
+        assertEquals("http://www.w3.org/1999/xhtml", htmlEl.getNamespaceURI());
         assertEquals("html", htmlEl.getLocalName());
         assertEquals("html", htmlEl.getNodeName());
 
@@ -247,7 +285,7 @@ public void xmlnsXpathTest() throws XPathExpressionException {
         W3CDom w3c = new W3CDom();
         String html = "<html><body><div>hello</div></body></html>";
         Document dom = w3c.fromJsoup(Jsoup.parse(html));
-        NodeList nodeList = xpath(dom, "//body");// no ns, so needs no prefix
+        NodeList nodeList = xpath(dom, "//*[local-name()=\"body\"]");// namespace aware; HTML namespace is default
         assertEquals("div", nodeList.item(0).getLocalName());
 
         // default output is namespace aware, so query needs to be as well
@@ -302,9 +340,8 @@ public void testRoundTripDoctype() {
         // TODO - not super happy with this output - but plain DOM doesn't let it out, and don't want to rebuild the writer
         // because we have Saxon on the test classpath, the transformer will change to that, and so case may change (e.g. Java base in META, Saxon is meta for HTML)
         String base = "<!DOCTYPE html><p>One</p>";
-        assertEqualsIgnoreCase("<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p>One</p></body></html>",
-            output(base, true));
-        assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><head/><body><p>One</p></body></html>", output(base, false));
+        assertEqualsIgnoreCase("<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p>One</p></body></html>", output(base, true));
+        assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body><p>One</p></body></html>", output(base, false));
 
         String publicDoc = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">";
         assertEqualsIgnoreCase("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body></body></html>", output(publicDoc, true));
@@ -313,15 +350,15 @@ public void testRoundTripDoctype() {
 
         String systemDoc = "<!DOCTYPE html SYSTEM \"exampledtdfile.dtd\">";
         assertEqualsIgnoreCase("<!DOCTYPE html SYSTEM \"exampledtdfile.dtd\"><html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body></body></html>", output(systemDoc, true));
-        assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"exampledtdfile.dtd\"><html><head/><body/></html>", output(systemDoc, false));
+        assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"exampledtdfile.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body/></html>", output(systemDoc, false));
 
         String legacyDoc = "<!DOCTYPE html SYSTEM \"about:legacy-compat\">";
         assertEqualsIgnoreCase("<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body></body></html>", output(legacyDoc, true));
-        assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><head/><body/></html>", output(legacyDoc, false));
+        assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body/></html>", output(legacyDoc, false));
 
         String noDoctype = "<p>One</p>";
         assertEqualsIgnoreCase("<html><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p>One</p></body></html>", output(noDoctype, true));
-        assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html><head/><body><p>One</p></body></html>", output(noDoctype, false));
+        assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body><p>One</p></body></html>", output(noDoctype, false));
     }
 
     private String output(String in, boolean modeHtml) {