diff --git a/src/main/java/org/owasp/validator/html/scan/AbstractAntiSamyScanner.java b/src/main/java/org/owasp/validator/html/scan/AbstractAntiSamyScanner.java index 31eafc23..08d247c4 100644 --- a/src/main/java/org/owasp/validator/html/scan/AbstractAntiSamyScanner.java +++ b/src/main/java/org/owasp/validator/html/scan/AbstractAntiSamyScanner.java @@ -118,6 +118,18 @@ protected org.apache.xml.serialize.HTMLSerializer getHTMLSerializer( return new ASHTMLSerializer(w, format, policy); } + /** + * Returns a new {@link HtmlSerializer} configured for the current policy. This is the preferred + * serializer that does not depend on the deprecated Xerces + * {@code org.apache.xml.serialize.HTMLSerializer}. + * + * @param w the writer to serialize into + * @return a fully configured {@link HtmlSerializer} + */ + protected HtmlSerializer getHtmlSerializer(Writer w) { + return new HtmlSerializer(w, policy); + } + protected String trim(String original, String cleaned) { if (cleaned.endsWith("\n")) { if (!original.endsWith("\n")) { diff --git a/src/main/java/org/owasp/validator/html/scan/AntiSamyDOMScanner.java b/src/main/java/org/owasp/validator/html/scan/AntiSamyDOMScanner.java index ca754867..4716a122 100644 --- a/src/main/java/org/owasp/validator/html/scan/AntiSamyDOMScanner.java +++ b/src/main/java/org/owasp/validator/html/scan/AntiSamyDOMScanner.java @@ -185,11 +185,7 @@ public CleanResults scan(String html) throws ScanException { StringWriter out = new StringWriter(); - @SuppressWarnings("deprecation") - org.apache.xml.serialize.OutputFormat format = getOutputFormat(); - - //noinspection deprecation - org.apache.xml.serialize.HTMLSerializer serializer = getHTMLSerializer(out, format); + HtmlSerializer serializer = getHtmlSerializer(out); serializer.serialize(dom); /* diff --git a/src/main/java/org/owasp/validator/html/scan/HtmlSerializer.java b/src/main/java/org/owasp/validator/html/scan/HtmlSerializer.java new file mode 100644 index 00000000..3b50d6df --- /dev/null +++ 
b/src/main/java/org/owasp/validator/html/scan/HtmlSerializer.java @@ -0,0 +1,925 @@ +/* + * Copyright (c) 2007-2024, Arshan Dabirsiaghi, Jason Li + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * + * Redistributions of source code must retain the above copyright notice, this list of conditions + * and the following disclaimer. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the documentation and/or other + * materials provided with the distribution. Neither the name of OWASP nor the names of its + * contributors may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER + * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +package org.owasp.validator.html.scan; + +import java.io.IOException; +import java.io.Writer; +import java.util.ArrayDeque; +import java.util.Arrays; +import java.util.Collections; +import java.util.Deque; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import org.owasp.validator.html.InternalPolicy; +import org.owasp.validator.html.TagMatcher; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Attr; +import org.w3c.dom.Comment; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.Text; + +/** + * A pure-Java HTML serializer that replaces the deprecated Xerces {@code HTMLSerializer} + * (org.apache.xml.serialize.HTMLSerializer) used by the old {@link ASHTMLSerializer}. This class + * serializes a DOM {@link DocumentFragment} to an HTML string without any Apache Xerces dependency. + * + *

Behaviour is modelled on the Xerces {@code HTMLSerializer} / {@code BaseMarkupSerializer} + * stack and the AntiSamy-specific {@code ASHTMLSerializer} overrides, so that existing output is + * preserved exactly. + * + * @see ASHTMLSerializer + */ +public class HtmlSerializer { + + private static final Logger logger = LoggerFactory.getLogger(HtmlSerializer.class); + + // ----------------------------------------------------------------------- + // Static HTML metadata tables (replicate Xerces HTMLdtd behaviour) + // ----------------------------------------------------------------------- + + /** + * Elements that Xerces HTMLdtd considers "empty" (isEmptyTag returns true). These are elements + * whose flags have the ONLY_OPENING (0x01) or EMPTY (0x10) bits set. When such an element has no + * child nodes it is serialized in the "else" (void) branch of serializeElement. + */ + private static final Set EMPTY_ELEMENTS; + + /** + * True void/self-closing HTML elements (have both EMPTY + ONLY_OPENING flags in Xerces, i.e. no + * closing tag should be printed). These never reach {@link #endElementIO} in practice because + * they are handled directly in the void branch. + */ + private static final Set VOID_ELEMENTS; + + /** Elements whose content should be treated as preserve-space (no indentation inside). */ + private static final Set PRESERVE_SPACE_ELEMENTS; + + /** + * Attribute names that carry URI values and must be escaped via {@link #printEscaped}. Matches + * Xerces {@code HTMLdtd.isURI} which checks {@code href} and {@code src} case-insensitively. + */ + private static final Set URI_ATTRS; + + /** + * Boolean attributes keyed by lower-case element name. Matches the Xerces {@code HTMLdtd} + * defineBoolean table. Boolean attributes are written as just the attribute name with no {@code + * ="value"} suffix. + */ + private static final Map> BOOLEAN_ATTRS_BY_ELEMENT; + + /** + * HTML named character-entity map: Unicode code point → entity name. 
Built from the same + * HTMLEntities.res data that Xerces uses, so entity encoding is identical. + */ + private static final Map HTML_ENTITIES; + + static { + // Elements treated as "empty" by Xerces isEmptyTag (ONLY_OPENING or EMPTY flag bits) + EMPTY_ELEMENTS = + Collections.unmodifiableSet( + new HashSet<>( + Arrays.asList( + "area", "base", "basefont", "br", "col", + "dd", "dt", "frame", "hr", "img", + "input", "isindex", "li", "link", "meta", + "option", "param"))); + + // Pure void HTML elements – no closing tag, ever + VOID_ELEMENTS = + Collections.unmodifiableSet( + new HashSet<>( + Arrays.asList( + "area", "base", "basefont", "br", "col", + "frame", "hr", "img", "input", "isindex", + "link", "meta", "param"))); + + // preserve-space elements (PRE has PRESERVE flag in Xerces; SCRIPT/STYLE also have it) + PRESERVE_SPACE_ELEMENTS = + Collections.unmodifiableSet( + new HashSet<>(Arrays.asList("pre", "script", "style", "textarea"))); + + URI_ATTRS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList("href", "src"))); + + Map> boolMap = new HashMap<>(); + boolMap.put("area", new HashSet<>(Collections.singletonList("href"))); + boolMap.put("button", new HashSet<>(Collections.singletonList("disabled"))); + boolMap.put("dir", new HashSet<>(Collections.singletonList("compact"))); + boolMap.put("dl", new HashSet<>(Collections.singletonList("compact"))); + boolMap.put("frame", new HashSet<>(Collections.singletonList("noresize"))); + boolMap.put("hr", new HashSet<>(Collections.singletonList("noshade"))); + boolMap.put("image", new HashSet<>(Collections.singletonList("ismap"))); + boolMap.put( + "input", + new HashSet<>( + Arrays.asList("defaultchecked", "checked", "readonly", "disabled"))); + boolMap.put("link", new HashSet<>(Collections.singletonList("link"))); + boolMap.put("menu", new HashSet<>(Collections.singletonList("compact"))); + boolMap.put("object", new HashSet<>(Collections.singletonList("declare"))); + boolMap.put("ol", new 
HashSet<>(Collections.singletonList("compact"))); + boolMap.put("optgroup", new HashSet<>(Collections.singletonList("disabled"))); + boolMap.put( + "option", + new HashSet<>(Arrays.asList("default-selected", "selected", "disabled"))); + boolMap.put("script", new HashSet<>(Collections.singletonList("defer"))); + boolMap.put("select", new HashSet<>(Arrays.asList("multiple", "disabled"))); + boolMap.put("style", new HashSet<>(Collections.singletonList("disabled"))); + boolMap.put("td", new HashSet<>(Collections.singletonList("nowrap"))); + boolMap.put("th", new HashSet<>(Collections.singletonList("nowrap"))); + boolMap.put( + "textarea", new HashSet<>(Arrays.asList("disabled", "readonly"))); + boolMap.put("ul", new HashSet<>(Collections.singletonList("compact"))); + BOOLEAN_ATTRS_BY_ELEMENT = Collections.unmodifiableMap(boolMap); + + // HTML named character entities (from Xerces HTMLEntities.res) + Map ent = new HashMap<>(); + // markup-significant + ent.put(34, "quot"); + ent.put(38, "amp"); + ent.put(60, "lt"); + ent.put(62, "gt"); + // ISO 8859-1 + ent.put(160, "nbsp"); + ent.put(161, "iexcl"); + ent.put(162, "cent"); + ent.put(163, "pound"); + ent.put(164, "curren"); + ent.put(165, "yen"); + ent.put(166, "brvbar"); + ent.put(167, "sect"); + ent.put(168, "uml"); + ent.put(169, "copy"); + ent.put(170, "ordf"); + ent.put(171, "laquo"); + ent.put(172, "not"); + ent.put(173, "shy"); + ent.put(174, "reg"); + ent.put(175, "macr"); + ent.put(176, "deg"); + ent.put(177, "plusmn"); + ent.put(178, "sup2"); + ent.put(179, "sup3"); + ent.put(180, "acute"); + ent.put(181, "micro"); + ent.put(182, "para"); + ent.put(183, "middot"); + ent.put(184, "cedil"); + ent.put(185, "sup1"); + ent.put(186, "ordm"); + ent.put(187, "raquo"); + ent.put(188, "frac14"); + ent.put(189, "frac12"); + ent.put(190, "frac34"); + ent.put(191, "iquest"); + ent.put(192, "Agrave"); + ent.put(193, "Aacute"); + ent.put(194, "Acirc"); + ent.put(195, "Atilde"); + ent.put(196, "Auml"); + ent.put(197, "Aring"); 
+ ent.put(198, "AElig"); + ent.put(199, "Ccedil"); + ent.put(200, "Egrave"); + ent.put(201, "Eacute"); + ent.put(202, "Ecirc"); + ent.put(203, "Euml"); + ent.put(204, "Igrave"); + ent.put(205, "Iacute"); + ent.put(206, "Icirc"); + ent.put(207, "Iuml"); + ent.put(208, "ETH"); + ent.put(209, "Ntilde"); + ent.put(210, "Ograve"); + ent.put(211, "Oacute"); + ent.put(212, "Ocirc"); + ent.put(213, "Otilde"); + ent.put(214, "Ouml"); + ent.put(215, "times"); + ent.put(216, "Oslash"); + ent.put(217, "Ugrave"); + ent.put(218, "Uacute"); + ent.put(219, "Ucirc"); + ent.put(220, "Uuml"); + ent.put(221, "Yacute"); + ent.put(222, "THORN"); + ent.put(223, "szlig"); + ent.put(224, "agrave"); + ent.put(225, "aacute"); + ent.put(226, "acirc"); + ent.put(227, "atilde"); + ent.put(228, "auml"); + ent.put(229, "aring"); + ent.put(230, "aelig"); + ent.put(231, "ccedil"); + ent.put(232, "egrave"); + ent.put(233, "eacute"); + ent.put(234, "ecirc"); + ent.put(235, "euml"); + ent.put(236, "igrave"); + ent.put(237, "iacute"); + ent.put(238, "icirc"); + ent.put(239, "iuml"); + ent.put(240, "eth"); + ent.put(241, "ntilde"); + ent.put(242, "ograve"); + ent.put(243, "oacute"); + ent.put(244, "ocirc"); + ent.put(245, "otilde"); + ent.put(246, "ouml"); + ent.put(247, "divide"); + ent.put(248, "oslash"); + ent.put(249, "ugrave"); + ent.put(250, "uacute"); + ent.put(251, "ucirc"); + ent.put(252, "uuml"); + ent.put(253, "yacute"); + ent.put(254, "thorn"); + ent.put(255, "yuml"); + // Symbols / Math / Greek + ent.put(402, "fnof"); + ent.put(913, "Alpha"); + ent.put(914, "Beta"); + ent.put(915, "Gamma"); + ent.put(916, "Delta"); + ent.put(917, "Epsilon"); + ent.put(918, "Zeta"); + ent.put(919, "Eta"); + ent.put(920, "Theta"); + ent.put(921, "Iota"); + ent.put(922, "Kappa"); + ent.put(923, "Lambda"); + ent.put(924, "Mu"); + ent.put(925, "Nu"); + ent.put(926, "Xi"); + ent.put(927, "Omicron"); + ent.put(928, "Pi"); + ent.put(929, "Rho"); + ent.put(931, "Sigma"); + ent.put(932, "Tau"); + ent.put(933, 
"Upsilon"); + ent.put(934, "Phi"); + ent.put(935, "Chi"); + ent.put(936, "Psi"); + ent.put(937, "Omega"); + ent.put(945, "alpha"); + ent.put(946, "beta"); + ent.put(947, "gamma"); + ent.put(948, "delta"); + ent.put(949, "epsilon"); + ent.put(950, "zeta"); + ent.put(951, "eta"); + ent.put(952, "theta"); + ent.put(953, "iota"); + ent.put(954, "kappa"); + ent.put(955, "lambda"); + ent.put(956, "mu"); + ent.put(957, "nu"); + ent.put(958, "xi"); + ent.put(959, "omicron"); + ent.put(960, "pi"); + ent.put(961, "rho"); + ent.put(962, "sigmaf"); + ent.put(963, "sigma"); + ent.put(964, "tau"); + ent.put(965, "upsilon"); + ent.put(966, "phi"); + ent.put(967, "chi"); + ent.put(968, "psi"); + ent.put(969, "omega"); + ent.put(977, "thetasym"); + ent.put(978, "upsih"); + ent.put(982, "piv"); + // General Punctuation + ent.put(8226, "bull"); + ent.put(8230, "hellip"); + ent.put(8242, "prime"); + ent.put(8243, "Prime"); + ent.put(8254, "oline"); + ent.put(8260, "frasl"); + // Letterlike Symbols + ent.put(8472, "weierp"); + ent.put(8465, "image"); + ent.put(8476, "real"); + ent.put(8482, "trade"); + ent.put(8501, "alefsym"); + // Arrows + ent.put(8592, "larr"); + ent.put(8593, "uarr"); + ent.put(8594, "rarr"); + ent.put(8595, "darr"); + ent.put(8596, "harr"); + ent.put(8629, "crarr"); + ent.put(8656, "lArr"); + ent.put(8657, "uArr"); + ent.put(8658, "rArr"); + ent.put(8659, "dArr"); + ent.put(8660, "hArr"); + // Mathematical Operators + ent.put(8704, "forall"); + ent.put(8706, "part"); + ent.put(8707, "exist"); + ent.put(8709, "empty"); + ent.put(8711, "nabla"); + ent.put(8712, "isin"); + ent.put(8713, "notin"); + ent.put(8715, "ni"); + ent.put(8719, "prod"); + ent.put(8721, "sum"); + ent.put(8722, "minus"); + ent.put(8727, "lowast"); + ent.put(8730, "radic"); + ent.put(8733, "prop"); + ent.put(8734, "infin"); + ent.put(8736, "ang"); + ent.put(8743, "and"); + ent.put(8744, "or"); + ent.put(8745, "cap"); + ent.put(8746, "cup"); + ent.put(8747, "int"); + ent.put(8756, "there4"); + 
ent.put(8764, "sim"); + ent.put(8773, "cong"); + ent.put(8776, "asymp"); + ent.put(8800, "ne"); + ent.put(8801, "equiv"); + ent.put(8804, "le"); + ent.put(8805, "ge"); + ent.put(8834, "sub"); + ent.put(8835, "sup"); + ent.put(8836, "nsub"); + ent.put(8838, "sube"); + ent.put(8839, "supe"); + ent.put(8853, "oplus"); + ent.put(8855, "otimes"); + ent.put(8869, "perp"); + ent.put(8901, "sdot"); + // Miscellaneous Technical + ent.put(8968, "lceil"); + ent.put(8969, "rceil"); + ent.put(8970, "lfloor"); + ent.put(8971, "rfloor"); + ent.put(9001, "lang"); + ent.put(9002, "rang"); + // Geometric Shapes + ent.put(9674, "loz"); + // Miscellaneous Symbols + ent.put(9824, "spades"); + ent.put(9827, "clubs"); + ent.put(9829, "hearts"); + ent.put(9830, "diams"); + // Internationalisation + ent.put(338, "OElig"); + ent.put(339, "oelig"); + ent.put(376, "Yuml"); + ent.put(710, "circ"); + ent.put(732, "tilde"); + ent.put(8194, "ensp"); + ent.put(8195, "emsp"); + ent.put(8201, "thinsp"); + ent.put(8204, "zwnj"); + ent.put(8205, "zwj"); + ent.put(8206, "lrm"); + ent.put(8207, "rlm"); + ent.put(8211, "ndash"); + ent.put(8212, "mdash"); + ent.put(8216, "lsquo"); + ent.put(8217, "rsquo"); + ent.put(8218, "sbquo"); + ent.put(8220, "ldquo"); + ent.put(8221, "rdquo"); + ent.put(8222, "bdquo"); + ent.put(8224, "dagger"); + ent.put(8225, "Dagger"); + ent.put(8240, "permil"); + ent.put(8249, "lsaquo"); + ent.put(8250, "rsaquo"); + ent.put(8364, "euro"); + HTML_ENTITIES = Collections.unmodifiableMap(ent); + } + + // ----------------------------------------------------------------------- + // Instance state + // ----------------------------------------------------------------------- + + private final Writer writer; + private final boolean encodeAllPossibleEntities; + private final TagMatcher allowedEmptyTags; + private final TagMatcher requiresClosingTags; + private final boolean omitXmlDeclaration; + private final boolean omitDoctypeDeclaration; + private final boolean indenting; + private 
final int indentSize; + private final boolean globalPreserveSpace; + + // Tracking state across serialization + private boolean started = false; + private int currentIndent = 0; + private final Deque stateStack = new ArrayDeque<>(); + + // ----------------------------------------------------------------------- + // Per-element state (mirrors Xerces ElementState) + // ----------------------------------------------------------------------- + + private static class ElementState { + String rawName; + boolean preserveSpace; + /** True while the element's opening {@code >} has not yet been written. */ + boolean empty = true; + /** True when the last serialized sibling was an element (used for indenting). */ + boolean afterElement = false; + /** True for SCRIPT/STYLE – content is not HTML-escaped. */ + boolean unescaped = false; + } + + // ----------------------------------------------------------------------- + // Constructor + // ----------------------------------------------------------------------- + + public HtmlSerializer(Writer w, InternalPolicy policy) { + this.writer = w; + this.encodeAllPossibleEntities = policy.isEntityEncodeIntlCharacters(); + this.allowedEmptyTags = policy.getAllowedEmptyTags(); + this.requiresClosingTags = policy.getRequiresClosingTags(); + this.omitXmlDeclaration = policy.isOmitXmlDeclaration(); + this.omitDoctypeDeclaration = policy.isOmitDoctypeDeclaration(); + this.indenting = policy.isFormatOutput(); + this.indentSize = 2; + this.globalPreserveSpace = policy.isPreserveSpace(); + } + + // ----------------------------------------------------------------------- + // Public API + // ----------------------------------------------------------------------- + + /** + * Serialize a DOM {@link DocumentFragment} to the {@link Writer} supplied at construction time. 
+   *
+   * @param fragment the fragment to serialize
+   * @throws IOException if writing to the underlying writer fails
+   */
+  public void serialize(DocumentFragment fragment) throws IOException {
+    Node child = fragment.getFirstChild();
+    while (child != null) {
+      serializeNode(child);
+      child = child.getNextSibling();
+    }
+    writer.flush();
+  }
+
+  // -----------------------------------------------------------------------
+  // Internal serialization helpers
+  // -----------------------------------------------------------------------
+
+  /**
+   * Dispatches a node to the serializer for its node type. Element, text/CDATA and comment nodes
+   * are written; every other node type is ignored.
+   */
+  private void serializeNode(Node node) throws IOException {
+    switch (node.getNodeType()) {
+      case Node.ELEMENT_NODE:
+        serializeElement((Element) node);
+        break;
+      case Node.TEXT_NODE:
+      case Node.CDATA_SECTION_NODE:
+        serializeText((Text) node);
+        break;
+      case Node.COMMENT_NODE:
+        serializeComment((Comment) node);
+        break;
+      default:
+        // Processing instructions and other nodes have already been removed
+        // by the AntiSamy DOM scanner before serialization is called.
+        break;
+    }
+  }
+
+  /**
+   * Emit an optional XML declaration / DOCTYPE, mirroring
+   * BaseMarkupSerializer.startDocument(). In AntiSamy both flags are normally {@code true} so
+   * nothing is output.
+   *
+   * <p>NOTE(review): both appended literals are empty, so nothing is written even when a flag is
+   * {@code false}, and {@code rootTagName} is unused. Confirm the intended declaration / DOCTYPE
+   * text.
+   */
+  private void startDocument(String rootTagName) throws IOException {
+    StringBuilder sb = new StringBuilder();
+    if (!omitXmlDeclaration) {
+      sb.append("");
+    }
+    if (!omitDoctypeDeclaration) {
+      sb.append("");
+    }
+    if (sb.length() > 0) {
+      writer.write(sb.toString());
+      writer.write('\n');
+    }
+    started = true;
+  }
+
+  /**
+   * Serializes a single DOM {@link Element}, replicating the logic in
+   * {@code ASHTMLSerializer.serializeElement} and {@code BaseMarkupSerializer}.
+   */
+  private void serializeElement(Element elem) throws IOException {
+    String tagName = elem.getTagName();
+    boolean isRootLevel = isDocumentState();
+    ElementState parentState = peekState();
+
+    // --- Document-level bookkeeping (mirrors HTMLSerializer.serializeElement) ---
+    if (isRootLevel) {
+      if (!started) {
+        startDocument(tagName);
+      }
+    } else {
+      // Close the parent element's opening ">" if it hasn't been printed yet.
+      // Capture the value first so we can use it in the breakLine condition below.
+      boolean wasEmpty = parentState.empty;
+      if (wasEmpty) {
+        writer.write('>');
+        parentState.empty = false;
+      }
+      // Line-break before this element when indenting.
+      if (indenting && !parentState.preserveSpace && (wasEmpty || parentState.afterElement)) {
+        breakLine();
+      }
+    }
+
+    // Inherit preserve-space from the parent.
+    boolean preserveSpace =
+        (parentState != null) ? parentState.preserveSpace : globalPreserveSpace;
+
+    // --- Opening tag ---
+    writer.write('<');
+    writer.write(tagName);
+    currentIndent++;
+
+    // --- Attributes ---
+    NamedNodeMap attrMap = elem.getAttributes();
+    if (attrMap != null) {
+      for (int i = 0; i < attrMap.getLength(); i++) {
+        Attr attr = (Attr) attrMap.item(i);
+        if (!attr.getSpecified()) {
+          continue;
+        }
+        String name = attr.getName().toLowerCase(Locale.ENGLISH);
+        String value = attr.getValue();
+        if (value == null) {
+          value = "";
+        }
+        writer.write(' ');
+        writer.write(name);
+
+        // A boolean attribute is printed as its bare name. URI attributes (checked first, so
+        // they always carry a value) and all remaining attributes are printed as
+        // name="escaped value"; both go through printEscaped.
+        if (isUriAttr(name) || !isBooleanAttr(tagName, name)) {
+          writer.write("=\"");
+          printEscaped(value);
+          writer.write('"');
+        }
+      }
+    }
+
+    if (isPreserveSpaceElement(tagName)) {
+      preserveSpace = true;
+    }
+
+    // --- Decide: element with content vs. void element ---
+    if (elem.hasChildNodes() || !isEmptyElement(tagName)) {
+      // Push a new element state and serialize children.
+      ElementState state = pushState(tagName, preserveSpace);
+
+      // A and TD: close the opening ">" immediately (no line breaks inside).
+      if ("a".equalsIgnoreCase(tagName) || "td".equalsIgnoreCase(tagName)) {
+        state.empty = false;
+        writer.write('>');
+      }
+
+      // SCRIPT and STYLE: content is not HTML-escaped.
+      if ("script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName)) {
+        state.unescaped = true;
+      }
+
+      Node child = elem.getFirstChild();
+      while (child != null) {
+        serializeNode(child);
+        child = child.getNextSibling();
+      }
+
+      endElementIO(tagName);
+
+    } else {
+      // Void / empty element branch (mirrors ASHTMLSerializer.serializeElement else-branch).
+      currentIndent--;
+      if (isAllowedEmptyTag(tagName) && !requiresClosingTag(tagName)) {
+        writer.write("/>");
+      } else {
+        writer.write('>');
+      }
+      if (isRootLevel) {
+        if (indenting) {
+          writer.write('\n');
+        }
+        writer.flush();
+      } else {
+        parentState.afterElement = true;
+        parentState.empty = false;
+      }
+    }
+  }
+
+  /**
+   * Closes an element that was opened via {@link #pushState}, mirroring
+   * {@code ASHTMLSerializer.endElementIO}.
+   */
+  private void endElementIO(String rawName) throws IOException {
+    currentIndent--;
+    ElementState state = peekState();
+
+    if (state.empty && isAllowedEmptyTag(rawName) && !requiresClosingTag(rawName)) {
+      // Element had no children and is allowed to self-close.
+      writer.write("/>");
+    } else {
+      // Close the opening tag if not already done.
+      if (state.empty) {
+        writer.write('>');
+      }
+      // All elements reaching endElementIO get a closing tag.
+      // (Void elements are handled in the else-branch of serializeElement and never
+      // reach this method.)
+      if (indenting && !state.preserveSpace && state.afterElement) {
+        breakLine();
+      }
+      writer.write("</");
+      writer.write(rawName);
+      writer.write('>');
+    }
+
+    popState();
+    ElementState parentState = peekState();
+    // A and TD elements don't trigger afterElement indentation in their parent.
+    // rawName will always be non-null here (it's the tag name we just serialized).
+    if (!"a".equalsIgnoreCase(rawName) && !"td".equalsIgnoreCase(rawName)) {
+      if (parentState != null) {
+        parentState.afterElement = true;
+        parentState.empty = false;
+      }
+    }
+    if (isDocumentState()) {
+      if (indenting) {
+        writer.write('\n');
+      }
+      writer.flush();
+    }
+  }
+
+  /**
+   * Serializes a text (or CDATA) node, replicating {@code BaseMarkupSerializer.content()} +
+   * {@code characters(String)}.
+   */
+  private void serializeText(Text node) throws IOException {
+    String text = node.getNodeValue();
+    if (text == null || text.isEmpty()) {
+      return;
+    }
+
+    ElementState state = peekState();
+    boolean inPreserveSpace = state != null && state.preserveSpace;
+
+    // When formatting output, whitespace-only text nodes between block-level elements should be
+    // skipped. The Xerces IndentPrinter effectively absorbs them into its line buffer and they are
+    // displaced by the indentation. Skipping them here reproduces that behaviour without the
+    // complexity of a buffered printer. Preserve-space elements (PRE, SCRIPT, STYLE) are excluded.
+    if (indenting && !inPreserveSpace && isWhitespaceOnly(text)) {
+      return;
+    }
+
+    // content() equivalent: close the opening tag and clear afterElement.
+    if (!isDocumentState() && state != null) {
+      if (state.empty) {
+        writer.write('>');
+        state.empty = false;
+      }
+      state.afterElement = false;
+    }
+
+    if (state != null && state.unescaped) {
+      // SCRIPT / STYLE content is written verbatim.
+      writer.write(text);
+    } else {
+      printEscaped(text);
+    }
+  }
+
+  /**
+   * Serializes an HTML comment, replicating {@code BaseMarkupSerializer.comment(String)}.
+   */
+  private void serializeComment(Comment comment) throws IOException {
+    ElementState state = peekState();
+    // content() equivalent: close the opening tag.
+    if (!isDocumentState() && state != null && state.empty) {
+      writer.write('>');
+      state.empty = false;
+    }
+    if (indenting && !isDocumentState() && state != null && !state.preserveSpace) {
+      breakLine();
+    }
+
+    // Cut the data at the first "-->" so it cannot terminate the comment early and leave the
+    // remainder to be read as markup.
+    // NOTE(review): believed to match Xerces BaseMarkupSerializer.comment(String) - confirm.
+    String data = comment.getData();
+    if (data == null) {
+      data = "";
+    }
+    int terminator = data.indexOf("-->");
+    if (terminator >= 0) {
+      data = data.substring(0, terminator);
+    }
+    writer.write("<!--");
+    writer.write(data);
+    writer.write("-->");
+
+    // After a comment, afterElement remains false (mirroring Xerces BaseMarkupSerializer).
+    if (!isDocumentState() && state != null) {
+      state.afterElement = false;
+    }
+  }
+
+  // -----------------------------------------------------------------------
+  // Entity / character escaping
+  // -----------------------------------------------------------------------
+
+  /**
+   * HTML-escapes {@code text} and writes the result directly to the writer, replicating {@code
+   * BaseMarkupSerializer.printEscaped}. Surrogate pairs are encoded as numeric character
+   * references.
+   */
+  private void printEscaped(String text) throws IOException {
+    int length = text.length();
+    for (int i = 0; i < length; ) {
+      char c = text.charAt(i);
+
+      // Handle surrogate pairs (supplementary characters >= U+10000).
+      if (Character.isHighSurrogate(c) && i + 1 < length) {
+        char low = text.charAt(i + 1);
+        if (Character.isLowSurrogate(low)) {
+          int codePoint = Character.toCodePoint(c, low);
+          writer.write("&#x");
+          writer.write(Integer.toHexString(codePoint));
+          writer.write(';');
+          i += 2;
+          continue;
+        }
+      }
+
+      int ch = c;
+      String entity = getEntityRef(ch);
+      if (entity != null) {
+        writer.write('&');
+        writer.write(entity);
+        writer.write(';');
+      } else if (ch == '\n' || ch == '\r' || ch == '\t' || ch >= ' ') {
+        writer.write(ch);
+      } else {
+        // Non-printable control character - numeric reference.
+        writer.write("&#x");
+        writer.write(Integer.toHexString(ch));
+        writer.write(';');
+      }
+      i++;
+    }
+  }
+
+  /**
+   * Returns the HTML named entity for {@code ch}, or {@code null} if none should be used. Mirrors
+   * {@code ASHTMLSerializer.getEntityRef}: only consults the entity table when
+   * {@code encodeAllPossibleEntities} is {@code true} or the character is one of the "big5"
+   * security-critical chars ({@code < > " ' &}).
+   */
+  private String getEntityRef(int ch) {
+    if (encodeAllPossibleEntities || Constants.big5CharsToEncodeSet.contains(ch)) {
+      return HTML_ENTITIES.get(ch);
+    }
+    return null;
+  }
+
+  // -----------------------------------------------------------------------
+  // HTML metadata helpers
+  // -----------------------------------------------------------------------
+
+  /**
+   * Returns true when the element is treated as "empty" by Xerces {@code HTMLdtd.isEmptyTag}
+   * (elements with the {@code ONLY_OPENING} or {@code EMPTY} flag). When such an element has no
+   * child nodes it is serialized in the void branch without calling {@link #endElementIO}.
+   */
+  private boolean isEmptyElement(String tagName) {
+    return EMPTY_ELEMENTS.contains(tagName.toLowerCase(Locale.ENGLISH));
+  }
+
+  /**
+   * Returns true for space-preserving elements (PRE, SCRIPT, STYLE, TEXTAREA): inside them
+   * indentation line-breaks are suppressed.
+   */
+  private boolean isPreserveSpaceElement(String tagName) {
+    return PRESERVE_SPACE_ELEMENTS.contains(tagName.toLowerCase(Locale.ENGLISH));
+  }
+
+  /** Returns true when the attribute name is a URI attribute (href or src). */
+  private boolean isUriAttr(String attrName) {
+    return URI_ATTRS.contains(attrName.toLowerCase(Locale.ENGLISH));
+  }
+
+  /**
+   * Returns true when the attribute is a boolean attribute for the given element (matching the
+   * Xerces {@code HTMLdtd} defineBoolean table). Boolean attributes are printed without a value.
+   */
+  private boolean isBooleanAttr(String tagName, String attrName) {
+    Set<String> boolAttrs =
+        BOOLEAN_ATTRS_BY_ELEMENT.get(tagName.toLowerCase(Locale.ENGLISH));
+    return boolAttrs != null
+        && boolAttrs.contains(attrName.toLowerCase(Locale.ENGLISH));
+  }
+
+  /**
+   * Returns true when the tag is allowed to be written as a self-closing empty tag by the policy.
+   * Mirrors {@code ASHTMLSerializer.isAllowedEmptyTag}.
+   */
+  private boolean isAllowedEmptyTag(String tagName) {
+    String lower = tagName.toLowerCase(Locale.ENGLISH);
+    return "head".equals(lower) || allowedEmptyTags.matches(tagName);
+  }
+
+  /**
+   * Returns true when the {@code requiresClosingTags} matcher accepts the tag, i.e. the tag must
+   * be written with an explicit closing tag even when it has no children.
+   */
+  private boolean requiresClosingTag(String tagName) {
+    return requiresClosingTags.matches(tagName);
+  }
+
+  /** Returns {@code true} when every character in {@code text} is an ASCII/Unicode whitespace. */
+  private static boolean isWhitespaceOnly(String text) {
+    for (int i = 0; i < text.length(); i++) {
+      if (!Character.isWhitespace(text.charAt(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // -----------------------------------------------------------------------
+  // Element-state stack
+  // -----------------------------------------------------------------------
+
+  /** Returns true when no element is currently open (the state stack is empty). */
+  private boolean isDocumentState() {
+    return stateStack.isEmpty();
+  }
+
+  /** Returns the state of the innermost open element, or {@code null} at document level. */
+  private ElementState peekState() {
+    return stateStack.isEmpty() ? null : stateStack.peek();
+  }
+
+  /** Creates the state for a newly opened element, pushes it onto the stack and returns it. */
+  private ElementState pushState(String rawName, boolean preserveSpace) {
+    ElementState state = new ElementState();
+    state.rawName = rawName;
+    state.preserveSpace = preserveSpace;
+    stateStack.push(state);
+    return state;
+  }
+
+  /** Pops the innermost element state; does nothing when the stack is already empty. */
+  private void popState() {
+    if (!stateStack.isEmpty()) {
+      stateStack.pop();
+    }
+  }
+
+  // -----------------------------------------------------------------------
+  // Indentation
+  // -----------------------------------------------------------------------
+
+  /** Writes a newline followed by {@code currentIndent * indentSize} spaces. */
+  private void breakLine() throws IOException {
+    writer.write('\n');
+    for (int i = 0; i < currentIndent * indentSize; i++) {
+      writer.write(' ');
+    }
+  }
+}
diff --git a/src/test/java/org/owasp/validator/html/test/HtmlSerializerTest.java b/src/test/java/org/owasp/validator/html/test/HtmlSerializerTest.java
new file mode 100644
index 00000000..0f1a518d
--- /dev/null
+++ b/src/test/java/org/owasp/validator/html/test/HtmlSerializerTest.java
@@ -0,0 +1,1456 @@
+/*
+ * Copyright (c) 2007-2024, Arshan Dabirsiaghi, Jason Li
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this list of conditions
+ * and the following disclaimer. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the documentation and/or other
+ * materials provided with the distribution. Neither the name of OWASP nor the names of its
+ * contributors may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY + * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +package org.owasp.validator.html.test; + +import static org.hamcrest.CoreMatchers.containsString; +import static org.hamcrest.CoreMatchers.not; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.StringWriter; +import java.net.URL; +import java.util.Arrays; +import java.util.Collection; +import org.htmlunit.cyberneko.parsers.DOMFragmentParser; +import org.htmlunit.cyberneko.xerces.dom.DocumentImpl; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.owasp.validator.html.AntiSamy; +import org.owasp.validator.html.CleanResults; +import org.owasp.validator.html.InternalPolicy; +import org.owasp.validator.html.Policy; +import org.owasp.validator.html.scan.HtmlSerializer; +import org.w3c.dom.Document; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.Element; +import org.w3c.dom.Text; +import org.xml.sax.InputSource; + +import java.io.StringReader; + +/** + * Comprehensive test suite for {@link HtmlSerializer} — the pure-Java replacement for the + * Xerces {@code HTMLSerializer} 
dependency.
+ *
+ * <p>Tests are organised into:
+ * <ol>
+ *   <li>Unit tests that build DOM fragments programmatically and assert serialized output.</li>
+ *   <li>Data-driven round-trip tests that run {@code AntiSamy.scan()} in DOM mode and compare
+ *       against expected substrings or exact values.</li>
+ * </ol>
+ */ +public class HtmlSerializerTest { + + // --------------------------------------------------------------------------- + // Infrastructure + // --------------------------------------------------------------------------- + + private TestPolicy policy; + private AntiSamy antiSamy; + private Document document; + /** Policy with formatOutput=false (for unit tests that check exact serialized output). */ + private InternalPolicy noFormatPolicy; + /** Policy with entityEncodeIntlChars=true (for tests checking entity-encoded output). */ + private InternalPolicy encodeIntlPolicy; + + @Before + public void setUp() throws Exception { + URL url = getClass().getResource("/antisamy.xml"); + policy = TestPolicy.getInstance(url); + antiSamy = new AntiSamy(); + document = new DocumentImpl(); + noFormatPolicy = (InternalPolicy) policy.cloneWithDirective("formatOutput", "false"); + encodeIntlPolicy = (InternalPolicy) policy.cloneWithDirective("entityEncodeIntlChars", "true"); + } + + // --------------------------------------------------------------------------- + // Helpers + // --------------------------------------------------------------------------- + + /** Serialize a {@link DocumentFragment} using default policy settings. */ + private String serialize(DocumentFragment frag) throws Exception { + return serialize(frag, (InternalPolicy) policy); + } + + private String serialize(DocumentFragment frag, InternalPolicy pol) throws Exception { + StringWriter out = new StringWriter(); + HtmlSerializer ser = new HtmlSerializer(out, pol); + ser.serialize(frag); + return out.getBuffer().toString(); + } + + /** + * Parse {@code html} into a {@link DocumentFragment} using the same cyberneko parser + * configuration as the DOM scanner, then serialize with {@link HtmlSerializer} using the given + * policy. 
+ */ + private String roundTrip(String html, InternalPolicy pol) throws Exception { + DOMFragmentParser parser = new DOMFragmentParser(); + parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower"); + parser.setFeature( + "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims", false); + parser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true); + parser.setFeature("http://cyberneko.org/html/features/parse-noscript-content", false); + Document doc = new DocumentImpl(); + DocumentFragment frag = doc.createDocumentFragment(); + parser.parse(new InputSource(new StringReader(html)), frag); + return serialize(frag, pol); + } + + /** Convenience: round-trip with default policy. */ + private String roundTrip(String html) throws Exception { + return roundTrip(html, (InternalPolicy) policy); + } + + /** Build a DocumentFragment containing a single element (no children). */ + private DocumentFragment fragmentWithElement(String tagName) { + DocumentFragment frag = document.createDocumentFragment(); + frag.appendChild(document.createElement(tagName)); + return frag; + } + + /** Build a DocumentFragment containing a single element with one text child. */ + private DocumentFragment fragmentWithText(String tagName, String text) { + DocumentFragment frag = document.createDocumentFragment(); + Element el = document.createElement(tagName); + el.appendChild(document.createTextNode(text)); + frag.appendChild(el); + return frag; + } + + /** Scan via AntiSamy DOM scanner and return clean HTML. */ + private String domScan(String html) throws Exception { + return antiSamy.scan(html, policy, AntiSamy.DOM).getCleanHTML(); + } + + /** Scan via AntiSamy DOM scanner with a specific policy. */ + private String domScan(String html, InternalPolicy pol) throws Exception { + return antiSamy.scan(html, pol, AntiSamy.DOM).getCleanHTML(); + } + + // =========================================================================== + // 1. 
Basic element serialization + // =========================================================================== + + @Test + public void simpleElementWithText() throws Exception { + DocumentFragment frag = fragmentWithText("p", "Hello"); + assertEquals("

Hello

", serialize(frag, noFormatPolicy)); + } + + @Test + public void simpleElementNoChildren() throws Exception { + // 'div' has children=false but is not a void element, so gets
+ DocumentFragment frag = fragmentWithElement("div"); + String out = serialize(frag); + assertTrue("Expected
opening", out.startsWith(" or />", out.contains("
") || out.contains("/>")); + } + + @Test + public void nestedElements() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element outer = document.createElement("div"); + Element inner = document.createElement("p"); + inner.appendChild(document.createTextNode("nested")); + outer.appendChild(inner); + frag.appendChild(outer); + String out = serialize(frag); + assertThat(out, containsString("

nested

")); + assertThat(out, containsString("")); + } + + @Test + public void siblingElements() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element p1 = document.createElement("p"); + p1.appendChild(document.createTextNode("first")); + Element p2 = document.createElement("p"); + p2.appendChild(document.createTextNode("second")); + frag.appendChild(p1); + frag.appendChild(p2); + String out = serialize(frag); + assertThat(out, containsString("

first

")); + assertThat(out, containsString("

second

")); + } + + @Test + public void mixedContentElementAndText() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element p = document.createElement("p"); + p.appendChild(document.createTextNode("before ")); + Element em = document.createElement("em"); + em.appendChild(document.createTextNode("emphasis")); + p.appendChild(em); + p.appendChild(document.createTextNode(" after")); + frag.appendChild(p); + String out = serialize(frag, noFormatPolicy); + assertEquals("

before emphasis after

", out); + } + + // =========================================================================== + // 2. Void / self-closing elements + // =========================================================================== + + @Test + public void brElementSelfCloses() throws Exception { + DocumentFragment frag = fragmentWithElement("br"); + String out = serialize(frag, noFormatPolicy); + assertEquals("
", out); + } + + @Test + public void hrElementSelfCloses() throws Exception { + DocumentFragment frag = fragmentWithElement("hr"); + String out = serialize(frag, noFormatPolicy); + assertEquals("
", out); + } + + @Test + public void imgElementSelfCloses() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element img = document.createElement("img"); + img.setAttribute("src", "test.png"); + img.setAttribute("alt", "test"); + frag.appendChild(img); + String out = serialize(frag); + assertThat(out, containsString("")); + assertThat(out, not(containsString(""))); + } + + @Test + public void inputElementSelfCloses() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element inp = document.createElement("input"); + inp.setAttribute("type", "text"); + frag.appendChild(inp); + String out = serialize(frag); + assertThat(out, containsString("")); + assertThat(out, not(containsString(""))); + } + + @Test + public void metaElementSelfCloses() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element meta = document.createElement("meta"); + meta.setAttribute("charset", "UTF-8"); + frag.appendChild(meta); + String out = serialize(frag); + assertThat(out, containsString("")); + assertThat(out, not(containsString(""))); + } + + @Test + public void linkElementSelfCloses() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element link = document.createElement("link"); + link.setAttribute("rel", "stylesheet"); + link.setAttribute("href", "style.css"); + frag.appendChild(link); + String out = serialize(frag, noFormatPolicy); + assertThat(out, containsString("' not '/>' + assertThat(out, not(containsString(""))); + } + + @Test + public void colElementSelfCloses() throws Exception { + DocumentFragment frag = fragmentWithElement("col"); + String out = serialize(frag); + assertThat(out, containsString("")); + } + + @Test + public void paramElementSelfCloses() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element param = document.createElement("param"); + param.setAttribute("name", "movie"); + frag.appendChild(param); + 
String out = serialize(frag); + assertThat(out, containsString("")); + } + + // =========================================================================== + // 3. Attribute serialization + // =========================================================================== + + @Test + public void regularAttribute() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element a = document.createElement("a"); + a.setAttribute("href", "http://example.com"); + a.appendChild(document.createTextNode("link")); + frag.appendChild(a); + String out = serialize(frag); + assertThat(out, containsString("href=\"http://example.com\"")); + } + + @Test + public void multipleAttributes() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element img = document.createElement("img"); + img.setAttribute("src", "pic.jpg"); + img.setAttribute("alt", "picture"); + img.setAttribute("width", "100"); + frag.appendChild(img); + String out = serialize(frag); + assertThat(out, containsString("src=\"pic.jpg\"")); + assertThat(out, containsString("alt=\"picture\"")); + assertThat(out, containsString("width=\"100\"")); + } + + @Test + public void attributeWithSpecialCharsAreEscaped() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element el = document.createElement("p"); + el.setAttribute("title", "a&c\"d"); + el.appendChild(document.createTextNode("text")); + frag.appendChild(el); + String out = serialize(frag); + // Attribute value should have < and & and " escaped + assertThat(out, containsString("<")); + assertThat(out, containsString("&")); + assertThat(out, containsString(""")); + } + + @Test + public void booleanAttributeSelected() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element option = document.createElement("option"); + option.setAttribute("selected", "selected"); + option.setAttribute("value", "1"); + option.appendChild(document.createTextNode("One")); + 
frag.appendChild(option); + String out = serialize(frag); + // 'selected' is a boolean attr for option → serialized without value + assertThat(out, containsString("selected")); + assertThat(out, containsString("value=\"1\"")); + } + + @Test + public void booleanAttributeChecked() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element inp = document.createElement("input"); + inp.setAttribute("type", "checkbox"); + inp.setAttribute("checked", "checked"); + frag.appendChild(inp); + String out = serialize(frag); + assertThat(out, containsString("checked")); + // Not: checked="checked" + assertThat(out, not(containsString("checked=\"checked\""))); + } + + @Test + public void booleanAttributeDisabled() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element btn = document.createElement("button"); + btn.setAttribute("disabled", "disabled"); + btn.appendChild(document.createTextNode("Click")); + frag.appendChild(btn); + String out = serialize(frag); + assertThat(out, containsString("disabled")); + assertThat(out, not(containsString("disabled=\"disabled\""))); + } + + @Test + public void booleanAttributeMultiple() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element sel = document.createElement("select"); + sel.setAttribute("multiple", "multiple"); + frag.appendChild(sel); + String out = serialize(frag); + assertThat(out, containsString("multiple")); + assertThat(out, not(containsString("multiple=\"multiple\""))); + } + + @Test + public void booleanAttributeReadonly() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element inp = document.createElement("input"); + inp.setAttribute("readonly", "readonly"); + frag.appendChild(inp); + String out = serialize(frag); + assertThat(out, containsString("readonly")); + assertThat(out, not(containsString("readonly=\"readonly\""))); + } + + @Test + public void booleanAttributeNowrap() throws 
Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element td = document.createElement("td"); + td.setAttribute("nowrap", "nowrap"); + td.appendChild(document.createTextNode("cell")); + frag.appendChild(td); + String out = serialize(frag); + assertThat(out, containsString("nowrap")); + assertThat(out, not(containsString("nowrap=\"nowrap\""))); + } + + @Test + public void emptyAttributeValue() throws Exception { + DocumentFragment frag = document.createDocumentFragment(); + Element p = document.createElement("p"); + p.setAttribute("class", ""); + p.appendChild(document.createTextNode("text")); + frag.appendChild(p); + String out = serialize(frag); + assertThat(out, containsString("class=\"\"")); + } + + @Test + public void hrefAttributeEscapedAsUri() throws Exception { + // href and src are URI attributes in Xerces HTMLdtd → special char handling + DocumentFragment frag = document.createDocumentFragment(); + Element a = document.createElement("a"); + a.setAttribute("href", "http://example.com/path?a=1&b=2"); + a.appendChild(document.createTextNode("link")); + frag.appendChild(a); + String out = serialize(frag); + assertThat(out, containsString("href=")); + // & in URI gets encoded to & + assertThat(out, containsString("&")); + } + + // =========================================================================== + // 4. Text content and entity encoding + // =========================================================================== + + @Test + public void textWithLessThanIsEscaped() throws Exception { + assertEquals("

<script>

", serialize(fragmentWithText("p", "