diff --git a/src/main/java/org/owasp/validator/html/scan/AbstractAntiSamyScanner.java b/src/main/java/org/owasp/validator/html/scan/AbstractAntiSamyScanner.java
index 31eafc23..08d247c4 100644
--- a/src/main/java/org/owasp/validator/html/scan/AbstractAntiSamyScanner.java
+++ b/src/main/java/org/owasp/validator/html/scan/AbstractAntiSamyScanner.java
@@ -118,6 +118,18 @@ protected org.apache.xml.serialize.HTMLSerializer getHTMLSerializer(
return new ASHTMLSerializer(w, format, policy);
}
+ /**
+ * Creates a new {@link HtmlSerializer} that writes to {@code w} and is configured from this
+ * scanner's {@code policy}. Prefer this over {@code getHTMLSerializer}: it does not depend on the
+ * deprecated Xerces {@code org.apache.xml.serialize.HTMLSerializer}.
+ *
+ * @param w the writer the serialized HTML is written to
+ * @return a new {@link HtmlSerializer} bound to {@code w} and the current policy
+ */
+ protected HtmlSerializer getHtmlSerializer(Writer w) {
+ return new HtmlSerializer(w, policy);
+ }
+
protected String trim(String original, String cleaned) {
if (cleaned.endsWith("\n")) {
if (!original.endsWith("\n")) {
diff --git a/src/main/java/org/owasp/validator/html/scan/AntiSamyDOMScanner.java b/src/main/java/org/owasp/validator/html/scan/AntiSamyDOMScanner.java
index ca754867..4716a122 100644
--- a/src/main/java/org/owasp/validator/html/scan/AntiSamyDOMScanner.java
+++ b/src/main/java/org/owasp/validator/html/scan/AntiSamyDOMScanner.java
@@ -185,11 +185,7 @@ public CleanResults scan(String html) throws ScanException {
StringWriter out = new StringWriter();
- @SuppressWarnings("deprecation")
- org.apache.xml.serialize.OutputFormat format = getOutputFormat();
-
- //noinspection deprecation
- org.apache.xml.serialize.HTMLSerializer serializer = getHTMLSerializer(out, format);
+ HtmlSerializer serializer = getHtmlSerializer(out);
serializer.serialize(dom);
/*
diff --git a/src/main/java/org/owasp/validator/html/scan/HtmlSerializer.java b/src/main/java/org/owasp/validator/html/scan/HtmlSerializer.java
new file mode 100644
index 00000000..3b50d6df
--- /dev/null
+++ b/src/main/java/org/owasp/validator/html/scan/HtmlSerializer.java
@@ -0,0 +1,925 @@
+/*
+ * Copyright (c) 2007-2024, Arshan Dabirsiaghi, Jason Li
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this list of conditions
+ * and the following disclaimer. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the documentation and/or other
+ * materials provided with the distribution. Neither the name of OWASP nor the names of its
+ * contributors may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.owasp.validator.html.scan;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.ArrayDeque;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+import org.owasp.validator.html.InternalPolicy;
+import org.owasp.validator.html.TagMatcher;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Attr;
+import org.w3c.dom.Comment;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.Text;
+
+/**
+ * A pure-Java HTML serializer that replaces the deprecated Xerces {@code HTMLSerializer}
+ * (org.apache.xml.serialize.HTMLSerializer) used by the old {@link ASHTMLSerializer}. This class
+ * serializes a DOM {@link DocumentFragment} to an HTML string without any Apache Xerces dependency.
+ *
+ *
+ * <p>Behaviour is modelled on the Xerces {@code HTMLSerializer} / {@code BaseMarkupSerializer}
+ * stack and the AntiSamy-specific {@code ASHTMLSerializer} overrides, so that existing output is
+ * preserved exactly.
+ *
+ * @see ASHTMLSerializer
+ */
+public class HtmlSerializer {
+
+ private static final Logger logger = LoggerFactory.getLogger(HtmlSerializer.class);
+
+ // -----------------------------------------------------------------------
+ // Static HTML metadata tables (replicate Xerces HTMLdtd behaviour)
+ // -----------------------------------------------------------------------
+
+  /**
+   * Lower-case names of elements that Xerces {@code HTMLdtd.isEmptyTag} considers "empty", i.e.
+   * whose flags have the ONLY_OPENING (0x01) or EMPTY (0x10) bits set. When such an element has no
+   * child nodes it is serialized in the "else" (void) branch of {@code serializeElement}.
+   */
+  private static final Set<String> EMPTY_ELEMENTS;
+
+  /**
+   * Lower-case names of true void/self-closing HTML elements (both EMPTY + ONLY_OPENING flags in
+   * Xerces, i.e. no closing tag should be printed). These never reach {@link #endElementIO} in
+   * practice because they are handled directly in the void branch.
+   */
+  private static final Set<String> VOID_ELEMENTS;
+
+  /** Lower-case names of elements whose content is preserve-space (no indentation inside). */
+  private static final Set<String> PRESERVE_SPACE_ELEMENTS;
+
+  /**
+   * Lower-case attribute names that carry URI values and are written via {@link #printEscaped}.
+   * Matches Xerces {@code HTMLdtd.isURI}, which checks {@code href} and {@code src}
+   * case-insensitively.
+   */
+  private static final Set<String> URI_ATTRS;
+
+  /**
+   * Boolean attribute names keyed by lower-case element name. Matches the Xerces {@code HTMLdtd}
+   * defineBoolean table. Boolean attributes are written as just the attribute name with no
+   * {@code ="value"} suffix.
+   */
+  private static final Map<String, Set<String>> BOOLEAN_ATTRS_BY_ELEMENT;
+
+  /**
+   * HTML named character-entity map: Unicode code point → entity name (without the leading
+   * {@code &} and trailing {@code ;}). Built from the same HTMLEntities.res data that Xerces
+   * uses, so entity encoding is identical.
+   */
+  private static final Map<Integer, String> HTML_ENTITIES;
+
+ static {
+ // Elements treated as "empty" by Xerces isEmptyTag (ONLY_OPENING or EMPTY flag bits)
+ EMPTY_ELEMENTS =
+ Collections.unmodifiableSet(
+ new HashSet<>(
+ Arrays.asList(
+ "area", "base", "basefont", "br", "col",
+ "dd", "dt", "frame", "hr", "img",
+ "input", "isindex", "li", "link", "meta",
+ "option", "param")));
+
+ // Pure void HTML elements – no closing tag, ever
+ VOID_ELEMENTS =
+ Collections.unmodifiableSet(
+ new HashSet<>(
+ Arrays.asList(
+ "area", "base", "basefont", "br", "col",
+ "frame", "hr", "img", "input", "isindex",
+ "link", "meta", "param")));
+
+ // preserve-space elements (PRE has PRESERVE flag in Xerces; SCRIPT/STYLE also have it)
+ PRESERVE_SPACE_ELEMENTS =
+ Collections.unmodifiableSet(
+ new HashSet<>(Arrays.asList("pre", "script", "style", "textarea")));
+
+ URI_ATTRS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList("href", "src")));
+
+    // Boolean attributes per element: lower-case element name -> lower-case attribute names.
+    // Entries replicate the Xerces HTMLdtd defineBoolean table, so they are kept verbatim.
+    Map<String, Set<String>> boolMap = new HashMap<>();
+    boolMap.put("area", new HashSet<>(Collections.singletonList("href")));
+    boolMap.put("button", new HashSet<>(Collections.singletonList("disabled")));
+    boolMap.put("dir", new HashSet<>(Collections.singletonList("compact")));
+    boolMap.put("dl", new HashSet<>(Collections.singletonList("compact")));
+    boolMap.put("frame", new HashSet<>(Collections.singletonList("noresize")));
+    boolMap.put("hr", new HashSet<>(Collections.singletonList("noshade")));
+    boolMap.put("image", new HashSet<>(Collections.singletonList("ismap")));
+    boolMap.put(
+        "input",
+        new HashSet<>(Arrays.asList("defaultchecked", "checked", "readonly", "disabled")));
+    boolMap.put("link", new HashSet<>(Collections.singletonList("link")));
+    boolMap.put("menu", new HashSet<>(Collections.singletonList("compact")));
+    boolMap.put("object", new HashSet<>(Collections.singletonList("declare")));
+    boolMap.put("ol", new HashSet<>(Collections.singletonList("compact")));
+    boolMap.put("optgroup", new HashSet<>(Collections.singletonList("disabled")));
+    boolMap.put(
+        "option", new HashSet<>(Arrays.asList("default-selected", "selected", "disabled")));
+    boolMap.put("script", new HashSet<>(Collections.singletonList("defer")));
+    boolMap.put("select", new HashSet<>(Arrays.asList("multiple", "disabled")));
+    boolMap.put("style", new HashSet<>(Collections.singletonList("disabled")));
+    boolMap.put("td", new HashSet<>(Collections.singletonList("nowrap")));
+    boolMap.put("th", new HashSet<>(Collections.singletonList("nowrap")));
+    boolMap.put("textarea", new HashSet<>(Arrays.asList("disabled", "readonly")));
+    boolMap.put("ul", new HashSet<>(Collections.singletonList("compact")));
+    BOOLEAN_ATTRS_BY_ELEMENT = Collections.unmodifiableMap(boolMap);
+
+    // HTML named character entities (from Xerces HTMLEntities.res): code point -> entity name.
+    Map<Integer, String> ent = new HashMap<>();
+ // markup-significant
+ ent.put(34, "quot");
+ ent.put(38, "amp");
+ ent.put(60, "lt");
+ ent.put(62, "gt");
+ // ISO 8859-1
+ ent.put(160, "nbsp");
+ ent.put(161, "iexcl");
+ ent.put(162, "cent");
+ ent.put(163, "pound");
+ ent.put(164, "curren");
+ ent.put(165, "yen");
+ ent.put(166, "brvbar");
+ ent.put(167, "sect");
+ ent.put(168, "uml");
+ ent.put(169, "copy");
+ ent.put(170, "ordf");
+ ent.put(171, "laquo");
+ ent.put(172, "not");
+ ent.put(173, "shy");
+ ent.put(174, "reg");
+ ent.put(175, "macr");
+ ent.put(176, "deg");
+ ent.put(177, "plusmn");
+ ent.put(178, "sup2");
+ ent.put(179, "sup3");
+ ent.put(180, "acute");
+ ent.put(181, "micro");
+ ent.put(182, "para");
+ ent.put(183, "middot");
+ ent.put(184, "cedil");
+ ent.put(185, "sup1");
+ ent.put(186, "ordm");
+ ent.put(187, "raquo");
+ ent.put(188, "frac14");
+ ent.put(189, "frac12");
+ ent.put(190, "frac34");
+ ent.put(191, "iquest");
+ ent.put(192, "Agrave");
+ ent.put(193, "Aacute");
+ ent.put(194, "Acirc");
+ ent.put(195, "Atilde");
+ ent.put(196, "Auml");
+ ent.put(197, "Aring");
+ ent.put(198, "AElig");
+ ent.put(199, "Ccedil");
+ ent.put(200, "Egrave");
+ ent.put(201, "Eacute");
+ ent.put(202, "Ecirc");
+ ent.put(203, "Euml");
+ ent.put(204, "Igrave");
+ ent.put(205, "Iacute");
+ ent.put(206, "Icirc");
+ ent.put(207, "Iuml");
+ ent.put(208, "ETH");
+ ent.put(209, "Ntilde");
+ ent.put(210, "Ograve");
+ ent.put(211, "Oacute");
+ ent.put(212, "Ocirc");
+ ent.put(213, "Otilde");
+ ent.put(214, "Ouml");
+ ent.put(215, "times");
+ ent.put(216, "Oslash");
+ ent.put(217, "Ugrave");
+ ent.put(218, "Uacute");
+ ent.put(219, "Ucirc");
+ ent.put(220, "Uuml");
+ ent.put(221, "Yacute");
+ ent.put(222, "THORN");
+ ent.put(223, "szlig");
+ ent.put(224, "agrave");
+ ent.put(225, "aacute");
+ ent.put(226, "acirc");
+ ent.put(227, "atilde");
+ ent.put(228, "auml");
+ ent.put(229, "aring");
+ ent.put(230, "aelig");
+ ent.put(231, "ccedil");
+ ent.put(232, "egrave");
+ ent.put(233, "eacute");
+ ent.put(234, "ecirc");
+ ent.put(235, "euml");
+ ent.put(236, "igrave");
+ ent.put(237, "iacute");
+ ent.put(238, "icirc");
+ ent.put(239, "iuml");
+ ent.put(240, "eth");
+ ent.put(241, "ntilde");
+ ent.put(242, "ograve");
+ ent.put(243, "oacute");
+ ent.put(244, "ocirc");
+ ent.put(245, "otilde");
+ ent.put(246, "ouml");
+ ent.put(247, "divide");
+ ent.put(248, "oslash");
+ ent.put(249, "ugrave");
+ ent.put(250, "uacute");
+ ent.put(251, "ucirc");
+ ent.put(252, "uuml");
+ ent.put(253, "yacute");
+ ent.put(254, "thorn");
+ ent.put(255, "yuml");
+ // Symbols / Math / Greek
+ ent.put(402, "fnof");
+ ent.put(913, "Alpha");
+ ent.put(914, "Beta");
+ ent.put(915, "Gamma");
+ ent.put(916, "Delta");
+ ent.put(917, "Epsilon");
+ ent.put(918, "Zeta");
+ ent.put(919, "Eta");
+ ent.put(920, "Theta");
+ ent.put(921, "Iota");
+ ent.put(922, "Kappa");
+ ent.put(923, "Lambda");
+ ent.put(924, "Mu");
+ ent.put(925, "Nu");
+ ent.put(926, "Xi");
+ ent.put(927, "Omicron");
+ ent.put(928, "Pi");
+ ent.put(929, "Rho");
+ ent.put(931, "Sigma");
+ ent.put(932, "Tau");
+ ent.put(933, "Upsilon");
+ ent.put(934, "Phi");
+ ent.put(935, "Chi");
+ ent.put(936, "Psi");
+ ent.put(937, "Omega");
+ ent.put(945, "alpha");
+ ent.put(946, "beta");
+ ent.put(947, "gamma");
+ ent.put(948, "delta");
+ ent.put(949, "epsilon");
+ ent.put(950, "zeta");
+ ent.put(951, "eta");
+ ent.put(952, "theta");
+ ent.put(953, "iota");
+ ent.put(954, "kappa");
+ ent.put(955, "lambda");
+ ent.put(956, "mu");
+ ent.put(957, "nu");
+ ent.put(958, "xi");
+ ent.put(959, "omicron");
+ ent.put(960, "pi");
+ ent.put(961, "rho");
+ ent.put(962, "sigmaf");
+ ent.put(963, "sigma");
+ ent.put(964, "tau");
+ ent.put(965, "upsilon");
+ ent.put(966, "phi");
+ ent.put(967, "chi");
+ ent.put(968, "psi");
+ ent.put(969, "omega");
+ ent.put(977, "thetasym");
+ ent.put(978, "upsih");
+ ent.put(982, "piv");
+ // General Punctuation
+ ent.put(8226, "bull");
+ ent.put(8230, "hellip");
+ ent.put(8242, "prime");
+ ent.put(8243, "Prime");
+ ent.put(8254, "oline");
+ ent.put(8260, "frasl");
+ // Letterlike Symbols
+ ent.put(8472, "weierp");
+ ent.put(8465, "image");
+ ent.put(8476, "real");
+ ent.put(8482, "trade");
+ ent.put(8501, "alefsym");
+ // Arrows
+ ent.put(8592, "larr");
+ ent.put(8593, "uarr");
+ ent.put(8594, "rarr");
+ ent.put(8595, "darr");
+ ent.put(8596, "harr");
+ ent.put(8629, "crarr");
+ ent.put(8656, "lArr");
+ ent.put(8657, "uArr");
+ ent.put(8658, "rArr");
+ ent.put(8659, "dArr");
+ ent.put(8660, "hArr");
+ // Mathematical Operators
+ ent.put(8704, "forall");
+ ent.put(8706, "part");
+ ent.put(8707, "exist");
+ ent.put(8709, "empty");
+ ent.put(8711, "nabla");
+ ent.put(8712, "isin");
+ ent.put(8713, "notin");
+ ent.put(8715, "ni");
+ ent.put(8719, "prod");
+ ent.put(8721, "sum");
+ ent.put(8722, "minus");
+ ent.put(8727, "lowast");
+ ent.put(8730, "radic");
+ ent.put(8733, "prop");
+ ent.put(8734, "infin");
+ ent.put(8736, "ang");
+ ent.put(8743, "and");
+ ent.put(8744, "or");
+ ent.put(8745, "cap");
+ ent.put(8746, "cup");
+ ent.put(8747, "int");
+ ent.put(8756, "there4");
+ ent.put(8764, "sim");
+ ent.put(8773, "cong");
+ ent.put(8776, "asymp");
+ ent.put(8800, "ne");
+ ent.put(8801, "equiv");
+ ent.put(8804, "le");
+ ent.put(8805, "ge");
+ ent.put(8834, "sub");
+ ent.put(8835, "sup");
+ ent.put(8836, "nsub");
+ ent.put(8838, "sube");
+ ent.put(8839, "supe");
+ ent.put(8853, "oplus");
+ ent.put(8855, "otimes");
+ ent.put(8869, "perp");
+ ent.put(8901, "sdot");
+ // Miscellaneous Technical
+ ent.put(8968, "lceil");
+ ent.put(8969, "rceil");
+ ent.put(8970, "lfloor");
+ ent.put(8971, "rfloor");
+ ent.put(9001, "lang");
+ ent.put(9002, "rang");
+ // Geometric Shapes
+ ent.put(9674, "loz");
+ // Miscellaneous Symbols
+ ent.put(9824, "spades");
+ ent.put(9827, "clubs");
+ ent.put(9829, "hearts");
+ ent.put(9830, "diams");
+ // Internationalisation
+ ent.put(338, "OElig");
+ ent.put(339, "oelig");
+ ent.put(376, "Yuml");
+ ent.put(710, "circ");
+ ent.put(732, "tilde");
+ ent.put(8194, "ensp");
+ ent.put(8195, "emsp");
+ ent.put(8201, "thinsp");
+ ent.put(8204, "zwnj");
+ ent.put(8205, "zwj");
+ ent.put(8206, "lrm");
+ ent.put(8207, "rlm");
+ ent.put(8211, "ndash");
+ ent.put(8212, "mdash");
+ ent.put(8216, "lsquo");
+ ent.put(8217, "rsquo");
+ ent.put(8218, "sbquo");
+ ent.put(8220, "ldquo");
+ ent.put(8221, "rdquo");
+ ent.put(8222, "bdquo");
+ ent.put(8224, "dagger");
+ ent.put(8225, "Dagger");
+ ent.put(8240, "permil");
+ ent.put(8249, "lsaquo");
+ ent.put(8250, "rsaquo");
+ ent.put(8364, "euro");
+ HTML_ENTITIES = Collections.unmodifiableMap(ent);
+ }
+
+ // -----------------------------------------------------------------------
+ // Instance state
+ // -----------------------------------------------------------------------
+
+  /** Destination for all serialized output. */
+  private final Writer writer;
+  /** When true, every character with an entry in {@code HTML_ENTITIES} is entity-encoded. */
+  private final boolean encodeAllPossibleEntities;
+  /** Policy matcher for tags that may be written in self-closing form. */
+  private final TagMatcher allowedEmptyTags;
+  /** Policy matcher for tags that must always get an explicit closing tag. */
+  private final TagMatcher requiresClosingTags;
+  private final boolean omitXmlDeclaration;
+  private final boolean omitDoctypeDeclaration;
+  /** When true, line breaks and indentation are inserted between elements. */
+  private final boolean indenting;
+  /** Number of spaces written per indentation level. */
+  private final int indentSize;
+  /** Preserve-space setting used for root-level elements, which have no parent state. */
+  private final boolean globalPreserveSpace;
+
+  // Tracking state across serialization
+  /** Set once {@code startDocument} has run. */
+  private boolean started = false;
+  /** Current nesting depth used by {@code breakLine}. */
+  private int currentIndent = 0;
+  /** Stack of open elements, innermost on top; empty at document level. */
+  private final Deque<ElementState> stateStack = new ArrayDeque<>();
+
+ // -----------------------------------------------------------------------
+ // Per-element state (mirrors Xerces ElementState)
+ // -----------------------------------------------------------------------
+
+ /** Mutable bookkeeping for one open element held on the serializer's state stack. */
+ private static class ElementState {
+ /** Tag name given to {@code pushState}; written back out in the element's closing tag. */
+ String rawName;
+ /** When true, indentation line breaks are suppressed inside this element. */
+ boolean preserveSpace;
+ /** True while the element's opening {@code >} has not yet been written. */
+ boolean empty = true;
+ /** True when the last serialized sibling was an element (used for indenting). */
+ boolean afterElement = false;
+ /** True for SCRIPT/STYLE – content is not HTML-escaped. */
+ boolean unescaped = false;
+ }
+
+ // -----------------------------------------------------------------------
+ // Constructor
+ // -----------------------------------------------------------------------
+
+  /**
+   * Creates a serializer that writes to {@code w} and takes all of its output options from
+   * {@code policy}. The indentation width is fixed at two spaces.
+   *
+   * @param w the writer that receives the serialized HTML
+   * @param policy the policy supplying entity-encoding, empty-tag, closing-tag, declaration,
+   *     formatting and preserve-space settings
+   */
+  public HtmlSerializer(Writer w, InternalPolicy policy) {
+    // Output destination.
+    this.writer = w;
+
+    // Layout options.
+    this.indenting = policy.isFormatOutput();
+    this.indentSize = 2;
+    this.globalPreserveSpace = policy.isPreserveSpace();
+
+    // Document prolog options.
+    this.omitXmlDeclaration = policy.isOmitXmlDeclaration();
+    this.omitDoctypeDeclaration = policy.isOmitDoctypeDeclaration();
+
+    // Escaping and tag-closing rules.
+    this.encodeAllPossibleEntities = policy.isEntityEncodeIntlCharacters();
+    this.allowedEmptyTags = policy.getAllowedEmptyTags();
+    this.requiresClosingTags = policy.getRequiresClosingTags();
+  }
+
+ // -----------------------------------------------------------------------
+ // Public API
+ // -----------------------------------------------------------------------
+
+  /**
+   * Writes every top-level child of {@code fragment}, in document order, to the {@link Writer}
+   * given at construction time, then flushes that writer.
+   *
+   * @param fragment the fragment whose children are serialized
+   * @throws IOException if writing to or flushing the underlying writer fails
+   */
+  public void serialize(DocumentFragment fragment) throws IOException {
+    for (Node current = fragment.getFirstChild();
+        current != null;
+        current = current.getNextSibling()) {
+      serializeNode(current);
+    }
+    writer.flush();
+  }
+
+ // -----------------------------------------------------------------------
+ // Internal serialization helpers
+ // -----------------------------------------------------------------------
+
+  /**
+   * Dispatches {@code node} to the serializer for its node type. Elements, text/CDATA and
+   * comments are written; every other node type is silently skipped.
+   *
+   * @param node the DOM node to serialize
+   * @throws IOException if writing to the underlying writer fails
+   */
+  private void serializeNode(Node node) throws IOException {
+    final short type = node.getNodeType();
+    if (type == Node.ELEMENT_NODE) {
+      serializeElement((Element) node);
+    } else if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE) {
+      serializeText((Text) node);
+    } else if (type == Node.COMMENT_NODE) {
+      serializeComment((Comment) node);
+    }
+    // Anything else is ignored: processing instructions and other nodes have already been
+    // removed by the AntiSamy DOM scanner before serialization is called.
+  }
+
+ /**
+ * Emit an optional XML declaration / DOCTYPE, mirroring
+ * BaseMarkupSerializer.startDocument(). In AntiSamy both flags are normally {@code true} so
+ * nothing is output. Always marks the serializer as started.
+ *
+ * <p>NOTE(review): as written, both {@code append} calls add an empty string, so the buffer
+ * stays empty and nothing is written even when a flag is {@code false}. Confirm whether the
+ * declaration / DOCTYPE literals were lost from this file.
+ *
+ * @param rootTagName tag name of the first root-level element; not read by this method
+ * @throws IOException if writing to the underlying writer fails
+ */
+ private void startDocument(String rootTagName) throws IOException {
+ StringBuilder sb = new StringBuilder();
+ if (!omitXmlDeclaration) {
+ sb.append("");
+ }
+ if (!omitDoctypeDeclaration) {
+ sb.append("");
+ }
+ // The prolog, if any, is followed by a single newline.
+ if (sb.length() > 0) {
+ writer.write(sb.toString());
+ writer.write('\n');
+ }
+ started = true;
+ }
+
+ /**
+ * Serializes a single DOM {@link Element}, replicating the logic in
+ * {@code ASHTMLSerializer.serializeElement} and {@code BaseMarkupSerializer}.
+ *
+ * <p>Writes {@code <tagName} and the specified attributes, then either pushes a new element
+ * state, serializes the children and closes the element via {@link #endElementIO}, or, for a
+ * childless element listed in {@code EMPTY_ELEMENTS}, closes the tag directly with {@code >} or
+ * {@code />}.
+ *
+ * @param elem the element to serialize
+ * @throws IOException if writing to the underlying writer fails
+ */
+ private void serializeElement(Element elem) throws IOException {
+ String tagName = elem.getTagName();
+ boolean isRootLevel = isDocumentState();
+ // Null exactly when isRootLevel is true (the state stack is empty).
+ ElementState parentState = peekState();
+
+ // --- Document-level bookkeeping (mirrors HTMLSerializer.serializeElement) ---
+ if (isRootLevel) {
+ if (!started) {
+ startDocument(tagName);
+ }
+ } else {
+ // Close the parent element's opening ">" if it hasn't been printed yet.
+ // Capture the value first so we can use it in the breakLine condition below.
+ boolean wasEmpty = parentState.empty;
+ if (wasEmpty) {
+ writer.write('>');
+ parentState.empty = false;
+ }
+ // Line-break before this element when indenting.
+ if (indenting && !parentState.preserveSpace && (wasEmpty || parentState.afterElement)) {
+ breakLine();
+ }
+ }
+
+ // Inherit preserve-space from the parent.
+ boolean preserveSpace =
+ (parentState != null) ? parentState.preserveSpace : globalPreserveSpace;
+
+ // --- Opening tag ---
+ writer.write('<');
+ writer.write(tagName);
+ // Children are one level deeper; undone in endElementIO or in the childless branch below.
+ currentIndent++;
+
+ // --- Attributes ---
+ NamedNodeMap attrMap = elem.getAttributes();
+ if (attrMap != null) {
+ for (int i = 0; i < attrMap.getLength(); i++) {
+ Attr attr = (Attr) attrMap.item(i);
+ // Skip attributes for which getSpecified() is false.
+ if (!attr.getSpecified()) {
+ continue;
+ }
+ // Attribute names are always written in lower case.
+ String name = attr.getName().toLowerCase(Locale.ENGLISH);
+ String value = attr.getValue();
+ if (value == null) {
+ value = "";
+ }
+ writer.write(' ');
+
+ if (isUriAttr(name)) {
+ // URI attribute: written as name="escaped-value". This produces the same output as the
+ // generic branch below, but is tested first so href/src never take the boolean branch.
+ // escapeURI in ASHTMLSerializer calls printEscaped directly and returns "".
+ writer.write(name);
+ writer.write("=\"");
+ printEscaped(value);
+ writer.write('"');
+ } else if (isBooleanAttr(tagName, name)) {
+ // Boolean attribute: print only the name
+ writer.write(name);
+ } else {
+ writer.write(name);
+ writer.write("=\"");
+ printEscaped(value);
+ writer.write('"');
+ }
+ }
+ }
+
+ if (isPreserveSpaceElement(tagName)) {
+ preserveSpace = true;
+ }
+
+ // --- Decide: element with content vs. void element ---
+ if (elem.hasChildNodes() || !isEmptyElement(tagName)) {
+ // Push a new element state and serialize children.
+ ElementState state = pushState(tagName, preserveSpace);
+
+ // A and TD: close the opening ">" immediately (no line breaks inside).
+ if ("a".equalsIgnoreCase(tagName) || "td".equalsIgnoreCase(tagName)) {
+ state.empty = false;
+ writer.write('>');
+ }
+
+ // SCRIPT and STYLE: content is not HTML-escaped.
+ if ("script".equalsIgnoreCase(tagName) || "style".equalsIgnoreCase(tagName)) {
+ state.unescaped = true;
+ }
+
+ Node child = elem.getFirstChild();
+ while (child != null) {
+ serializeNode(child);
+ child = child.getNextSibling();
+ }
+
+ endElementIO(tagName);
+
+ } else {
+ // Void / empty element branch (mirrors ASHTMLSerializer.serializeElement else-branch).
+ currentIndent--;
+ if (isAllowedEmptyTag(tagName) && !requiresClosingTag(tagName)) {
+ writer.write("/>");
+ } else {
+ writer.write('>');
+ }
+ // Record on the parent that an element was just written and that it now has content.
+ if (!isRootLevel) {
+ parentState.afterElement = true;
+ parentState.empty = false;
+ }
+ // A finished root-level element is followed by a newline (when indenting) and a flush.
+ if (isRootLevel) {
+ if (indenting) {
+ writer.write('\n');
+ }
+ writer.flush();
+ }
+ }
+ }
+
+  /**
+   * Closes the element most recently opened via {@link #pushState}, mirroring
+   * {@code ASHTMLSerializer.endElementIO}.
+   *
+   * <p>An element whose opening tag is still unterminated, that the policy allows to be empty
+   * and that does not require a closing tag is finished with {@code />}. Every other element
+   * gets an explicit {@code </name>} closing tag. The element's state is then popped and the
+   * parent state (if any) is updated.
+   *
+   * @param rawName tag name of the element being closed
+   * @throws IOException if writing to the underlying writer fails
+   */
+  private void endElementIO(String rawName) throws IOException {
+    currentIndent--;
+    ElementState state = peekState();
+
+    if (state.empty && isAllowedEmptyTag(rawName) && !requiresClosingTag(rawName)) {
+      // Element had no children and is allowed to self-close.
+      writer.write("/>");
+    } else {
+      // Close the opening tag if not already done.
+      if (state.empty) {
+        writer.write('>');
+      }
+      // All elements reaching endElementIO get a closing tag.
+      // (Void elements are handled in the else-branch of serializeElement and never
+      // reach this method.)
+      if (indenting && !state.preserveSpace && state.afterElement) {
+        breakLine();
+      }
+      // The closing tag must start with "</"; without it the output would be "name>".
+      writer.write("</");
+      writer.write(state.rawName);
+      writer.write('>');
+    }
+
+    popState();
+    ElementState parentState = peekState();
+    // A and TD elements don't trigger afterElement indentation in their parent.
+    // rawName will always be non-null here (it's the tag name we just serialized).
+    if (!"a".equalsIgnoreCase(rawName) && !"td".equalsIgnoreCase(rawName)) {
+      if (parentState != null) {
+        parentState.afterElement = true;
+        parentState.empty = false;
+      }
+    }
+    // A finished root-level element is followed by a newline (when indenting) and a flush.
+    if (isDocumentState()) {
+      if (indenting) {
+        writer.write('\n');
+      }
+      writer.flush();
+    }
+  }
+
+  /**
+   * Writes a text (or CDATA) node, following {@code BaseMarkupSerializer.content()} +
+   * {@code characters(String)}. Null or empty text is ignored. Inside an element flagged
+   * {@code unescaped} (SCRIPT / STYLE) the text is written verbatim; everywhere else it goes
+   * through {@link #printEscaped}.
+   *
+   * @param node the text node to serialize
+   * @throws IOException if writing to the underlying writer fails
+   */
+  private void serializeText(Text node) throws IOException {
+    final String data = node.getNodeValue();
+    if (data == null || data.isEmpty()) {
+      return;
+    }
+
+    final ElementState current = peekState();
+    final boolean preserving = current != null && current.preserveSpace;
+
+    // With formatted output, whitespace-only text outside preserve-space elements is dropped.
+    // This is intended to reproduce the effect of the Xerces IndentPrinter, which absorbs such
+    // text into its line buffer where it is displaced by the indentation, without needing a
+    // buffered printer here.
+    if (indenting && !preserving && isWhitespaceOnly(data)) {
+      return;
+    }
+
+    // Document level (no open element): there is no state to update and nothing to un-escape.
+    if (current == null) {
+      printEscaped(data);
+      return;
+    }
+
+    // content() equivalent: terminate the pending opening tag and clear afterElement.
+    if (current.empty) {
+      writer.write('>');
+      current.empty = false;
+    }
+    current.afterElement = false;
+
+    if (current.unescaped) {
+      // SCRIPT / STYLE content is written verbatim.
+      writer.write(data);
+    } else {
+      printEscaped(data);
+    }
+  }
+
+  /**
+   * Serializes an HTML comment as {@code <!--data-->}, replicating
+   * {@code BaseMarkupSerializer.comment(String)}.
+   *
+   * <p>As in the Xerces implementation, comment data is cut off at the first {@code -->} so the
+   * data cannot terminate the comment early. A {@code null} data value is written as an empty
+   * comment.
+   *
+   * @param comment the comment node to serialize
+   * @throws IOException if writing to the underlying writer fails
+   */
+  private void serializeComment(Comment comment) throws IOException {
+    ElementState state = peekState();
+    // content() equivalent: close the opening tag.
+    if (!isDocumentState() && state != null && state.empty) {
+      writer.write('>');
+      state.empty = false;
+    }
+    if (indenting && !isDocumentState() && state != null && !state.preserveSpace) {
+      breakLine();
+    }
+
+    String data = comment.getData();
+    if (data == null) {
+      data = "";
+    }
+    // Make sure the data itself does not contain the comment terminator.
+    int terminator = data.indexOf("-->");
+    if (terminator >= 0) {
+      data = data.substring(0, terminator);
+    }
+    writer.write("<!--");
+    writer.write(data);
+    writer.write("-->");
+
+    // After a comment, afterElement remains false (mirroring Xerces BaseMarkupSerializer).
+    if (!isDocumentState() && state != null) {
+      state.afterElement = false;
+    }
+  }
+
+ // -----------------------------------------------------------------------
+ // Entity / character escaping
+ // -----------------------------------------------------------------------
+
+  /**
+   * HTML-escapes {@code text} and writes the result directly to the writer, replicating
+   * {@code BaseMarkupSerializer.printEscaped}.
+   *
+   * <ul>
+   *   <li>A valid surrogate pair is written as one hexadecimal character reference
+   *       ({@code &#x...;}) for the combined code point.
+   *   <li>A character for which {@link #getEntityRef} returns a name is written as
+   *       {@code &name;}.
+   *   <li>Newline, carriage return, tab and every character at or above the space character are
+   *       written unchanged.
+   *   <li>Any other control character is written as a hexadecimal character reference.
+   * </ul>
+   *
+   * @param text the text to escape; must not be {@code null}
+   * @throws IOException if writing to the underlying writer fails
+   */
+  private void printEscaped(String text) throws IOException {
+    int length = text.length();
+    for (int i = 0; i < length; ) {
+      char c = text.charAt(i);
+
+      // Handle surrogate pairs (supplementary characters >= U+10000).
+      if (Character.isHighSurrogate(c) && i + 1 < length) {
+        char low = text.charAt(i + 1);
+        if (Character.isLowSurrogate(low)) {
+          int codePoint = Character.toCodePoint(c, low);
+          // A numeric reference needs the "&#x" prefix; the hex digits alone are plain text.
+          writer.write("&#x");
+          writer.write(Integer.toHexString(codePoint));
+          writer.write(';');
+          i += 2;
+          continue;
+        }
+      }
+
+      int ch = c;
+      String entity = getEntityRef(ch);
+      if (entity != null) {
+        writer.write('&');
+        writer.write(entity);
+        writer.write(';');
+      } else if (ch == '\n' || ch == '\r' || ch == '\t' || ch >= ' ') {
+        writer.write(ch);
+      } else {
+        // Non-printable control character – numeric reference.
+        writer.write("&#x");
+        writer.write(Integer.toHexString(ch));
+        writer.write(';');
+      }
+      i++;
+    }
+  }
+
+  /**
+   * Looks up the HTML entity name to use for {@code ch}, mirroring
+   * {@code ASHTMLSerializer.getEntityRef}. The entity table is consulted only when
+   * {@code encodeAllPossibleEntities} is set or {@code ch} is a member of
+   * {@code Constants.big5CharsToEncodeSet}; a character with no table entry yields {@code null}
+   * either way.
+   *
+   * @param ch the character (code unit) being written
+   * @return the entity name without {@code &} and {@code ;}, or {@code null} to write the
+   *     character without a named entity
+   */
+  private String getEntityRef(int ch) {
+    final boolean consultTable =
+        encodeAllPossibleEntities || Constants.big5CharsToEncodeSet.contains(ch);
+    return consultTable ? HTML_ENTITIES.get(ch) : null;
+  }
+
+ // -----------------------------------------------------------------------
+ // HTML metadata helpers
+ // -----------------------------------------------------------------------
+
+  /**
+   * Tells whether {@code tagName} (compared case-insensitively) is one of the elements Xerces
+   * {@code HTMLdtd.isEmptyTag} treats as "empty" ({@code ONLY_OPENING} or {@code EMPTY} flag).
+   * A childless element of this kind is written in the void branch of {@code serializeElement}
+   * and never reaches {@link #endElementIO}.
+   *
+   * @param tagName the element's tag name
+   * @return {@code true} if the lower-cased name is in {@code EMPTY_ELEMENTS}
+   */
+  private boolean isEmptyElement(String tagName) {
+    final String key = tagName.toLowerCase(Locale.ENGLISH);
+    return EMPTY_ELEMENTS.contains(key);
+  }
+
+  /**
+   * Tells whether {@code tagName} (compared case-insensitively) names a space-preserving element
+   * (PRE, SCRIPT, STYLE, TEXTAREA), inside which indentation line breaks are suppressed.
+   *
+   * @param tagName the element's tag name
+   * @return {@code true} if the lower-cased name is in {@code PRESERVE_SPACE_ELEMENTS}
+   */
+  private boolean isPreserveSpaceElement(String tagName) {
+    final String key = tagName.toLowerCase(Locale.ENGLISH);
+    return PRESERVE_SPACE_ELEMENTS.contains(key);
+  }
+
+  /**
+   * Tells whether {@code attrName} (compared case-insensitively) is a URI attribute, i.e.
+   * {@code href} or {@code src}.
+   *
+   * @param attrName the attribute name
+   * @return {@code true} if the lower-cased name is in {@code URI_ATTRS}
+   */
+  private boolean isUriAttr(String attrName) {
+    final String key = attrName.toLowerCase(Locale.ENGLISH);
+    return URI_ATTRS.contains(key);
+  }
+
+  /**
+   * Returns true when the attribute is a boolean attribute for the given element (matching the
+   * Xerces {@code HTMLdtd} defineBoolean table). Boolean attributes are printed without a value.
+   * Both names are compared case-insensitively.
+   *
+   * @param tagName the element's tag name
+   * @param attrName the attribute name
+   * @return {@code true} if {@code BOOLEAN_ATTRS_BY_ELEMENT} lists the attribute for the element
+   */
+  private boolean isBooleanAttr(String tagName, String attrName) {
+    Set<String> boolAttrs = BOOLEAN_ATTRS_BY_ELEMENT.get(tagName.toLowerCase(Locale.ENGLISH));
+    // Elements without an entry have no boolean attributes.
+    return boolAttrs != null && boolAttrs.contains(attrName.toLowerCase(Locale.ENGLISH));
+  }
+
+  /**
+   * Decides whether the policy allows {@code tagName} to be written as a self-closing empty tag,
+   * mirroring {@code ASHTMLSerializer.isAllowedEmptyTag}. {@code head} (in any letter case) is
+   * always allowed; every other name is decided by the policy's allowed-empty-tags matcher,
+   * which receives the name unchanged.
+   *
+   * @param tagName the element's tag name
+   * @return {@code true} if the tag may be self-closed
+   */
+  private boolean isAllowedEmptyTag(String tagName) {
+    final String lowered = tagName.toLowerCase(Locale.ENGLISH);
+    if ("head".equals(lowered)) {
+      return true;
+    }
+    return allowedEmptyTags.matches(tagName);
+  }
+
+ /**
+ * Returns true when the policy's requires-closing-tags matcher matches {@code tagName}, i.e.
+ * the tag must be written with an explicit closing tag even when it is empty.
+ *
+ * @param tagName the element's tag name, passed to the matcher unchanged
+ * @return {@code true} if the tag must not be self-closed
+ */
+ private boolean requiresClosingTag(String tagName) {
+ return requiresClosingTags.matches(tagName);
+ }
+
+  /**
+   * Checks whether {@code text} consists solely of characters for which
+   * {@link Character#isWhitespace(char)} is true. An empty string counts as whitespace-only.
+   *
+   * @param text the text to inspect; must not be {@code null}
+   * @return {@code false} as soon as a non-whitespace character is found, otherwise {@code true}
+   */
+  private static boolean isWhitespaceOnly(String text) {
+    for (char candidate : text.toCharArray()) {
+      if (!Character.isWhitespace(candidate)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+ // -----------------------------------------------------------------------
+ // Element-state stack
+ // -----------------------------------------------------------------------
+
+ /** Returns {@code true} when no element is currently open, i.e. the state stack is empty. */
+ private boolean isDocumentState() {
+ return stateStack.isEmpty();
+ }
+
+  /**
+   * Returns the state of the innermost open element without removing it.
+   *
+   * @return the top of the state stack, or {@code null} when no element is open
+   */
+  private ElementState peekState() {
+    // Deque.peek() already yields null for an empty deque, so no emptiness check is needed.
+    return stateStack.peek();
+  }
+
+  /**
+   * Opens a new element state and makes it the innermost one.
+   *
+   * @param rawName tag name recorded for the closing tag
+   * @param preserveSpace whether indentation is suppressed inside the element
+   * @return the newly pushed state, with its remaining flags at their defaults
+   */
+  private ElementState pushState(String rawName, boolean preserveSpace) {
+    final ElementState opened = new ElementState();
+    opened.preserveSpace = preserveSpace;
+    opened.rawName = rawName;
+    stateStack.push(opened);
+    return opened;
+  }
+
+  /** Discards the innermost element state; does nothing when no element is open. */
+  private void popState() {
+    // poll() removes the head like pop(), but returns null instead of throwing when empty.
+    stateStack.poll();
+  }
+
+ // -----------------------------------------------------------------------
+ // Indentation
+ // -----------------------------------------------------------------------
+
+  /**
+   * Starts a new output line and indents it by {@code currentIndent * indentSize} spaces.
+   *
+   * @throws IOException if writing to the underlying writer fails
+   */
+  private void breakLine() throws IOException {
+    writer.write('\n');
+    final int spaces = currentIndent * indentSize;
+    for (int written = 0; written < spaces; written++) {
+      writer.write(' ');
+    }
+  }
+}
diff --git a/src/test/java/org/owasp/validator/html/test/HtmlSerializerTest.java b/src/test/java/org/owasp/validator/html/test/HtmlSerializerTest.java
new file mode 100644
index 00000000..0f1a518d
--- /dev/null
+++ b/src/test/java/org/owasp/validator/html/test/HtmlSerializerTest.java
@@ -0,0 +1,1456 @@
+/*
+ * Copyright (c) 2007-2024, Arshan Dabirsiaghi, Jason Li
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this list of conditions
+ * and the following disclaimer. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the documentation and/or other
+ * materials provided with the distribution. Neither the name of OWASP nor the names of its
+ * contributors may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+package org.owasp.validator.html.test;
+
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.CoreMatchers.not;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.StringWriter;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.Collection;
+import org.htmlunit.cyberneko.parsers.DOMFragmentParser;
+import org.htmlunit.cyberneko.xerces.dom.DocumentImpl;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+import org.owasp.validator.html.AntiSamy;
+import org.owasp.validator.html.CleanResults;
+import org.owasp.validator.html.InternalPolicy;
+import org.owasp.validator.html.Policy;
+import org.owasp.validator.html.scan.HtmlSerializer;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentFragment;
+import org.w3c.dom.Element;
+import org.w3c.dom.Text;
+import org.xml.sax.InputSource;
+
+import java.io.StringReader;
+
+/**
+ * Comprehensive test suite for {@link HtmlSerializer} — the pure-Java replacement for the
+ * Xerces {@code HTMLSerializer} dependency.
+ *
+ * <p>Tests are organised into:
+ *
+ * <ul>
+ *   <li>Unit tests that build DOM fragments programmatically and assert serialized output.
+ *   <li>Data-driven round-trip tests that run {@code AntiSamy.scan()} in DOM mode and compare
+ *       against expected substrings or exact values.
+ * </ul>
+ */
+public class HtmlSerializerTest {
+
+ // ---------------------------------------------------------------------------
+ // Infrastructure
+ // ---------------------------------------------------------------------------
+
+ /** Base policy, loaded in {@code setUp()} from the {@code /antisamy.xml} resource. */
+ private TestPolicy policy;
+ /** Scanner instance used by the {@code domScan} helpers. */
+ private AntiSamy antiSamy;
+ /** Owner document used by the fragment-building helpers to create nodes. */
+ private Document document;
+ /** Policy with formatOutput=false (for unit tests that check exact serialized output). */
+ private InternalPolicy noFormatPolicy;
+ /** Policy with entityEncodeIntlChars=true (for tests checking entity-encoded output). */
+ private InternalPolicy encodeIntlPolicy;
+
+ /**
+  * Rebuilds the shared fixtures before each test: the base policy, the scanner, an empty owner
+  * document, and two policy clones that each override a single directive.
+  */
+ @Before
+ public void setUp() throws Exception {
+   policy = TestPolicy.getInstance(getClass().getResource("/antisamy.xml"));
+   antiSamy = new AntiSamy();
+   document = new DocumentImpl();
+
+   // Each derived policy differs from the base policy by exactly one directive.
+   InternalPolicy withoutFormatting =
+       (InternalPolicy) policy.cloneWithDirective("formatOutput", "false");
+   InternalPolicy withIntlEncoding =
+       (InternalPolicy) policy.cloneWithDirective("entityEncodeIntlChars", "true");
+   noFormatPolicy = withoutFormatting;
+   encodeIntlPolicy = withIntlEncoding;
+ }
+
+ // ---------------------------------------------------------------------------
+ // Helpers
+ // ---------------------------------------------------------------------------
+
+ /** Serializes {@code frag} with the base {@code policy} (no directive overrides). */
+ private String serialize(DocumentFragment frag) throws Exception {
+   InternalPolicy basePolicy = (InternalPolicy) policy;
+   return serialize(frag, basePolicy);
+ }
+
+ /**
+  * Serializes {@code frag} into a string using a fresh {@link HtmlSerializer} configured with
+  * {@code pol}.
+  */
+ private String serialize(DocumentFragment frag, InternalPolicy pol) throws Exception {
+   StringWriter sink = new StringWriter();
+   new HtmlSerializer(sink, pol).serialize(frag);
+   return sink.toString();
+ }
+
+ /**
+  * Parses {@code html} into a fresh {@link DocumentFragment} with a cyberneko {@link
+  * DOMFragmentParser} (configured, per the original note, like the DOM scanner's parser) and
+  * returns the result of serializing that fragment with {@code pol}.
+  */
+ private String roundTrip(String html, InternalPolicy pol) throws Exception {
+   // Target fragment lives in its own throwaway document.
+   Document owner = new DocumentImpl();
+   DocumentFragment parsed = owner.createDocumentFragment();
+
+   DOMFragmentParser fragmentParser = new DOMFragmentParser();
+   fragmentParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
+   fragmentParser.setFeature(
+       "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims", false);
+   fragmentParser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);
+   fragmentParser.setFeature("http://cyberneko.org/html/features/parse-noscript-content", false);
+
+   InputSource input = new InputSource(new StringReader(html));
+   fragmentParser.parse(input, parsed);
+   return serialize(parsed, pol);
+ }
+
+ /** Shortcut for {@code roundTrip(html, pol)} that uses the base {@code policy}. */
+ private String roundTrip(String html) throws Exception {
+   InternalPolicy basePolicy = (InternalPolicy) policy;
+   return roundTrip(html, basePolicy);
+ }
+
+ /** Creates a fragment whose only child is a childless {@code tagName} element. */
+ private DocumentFragment fragmentWithElement(String tagName) {
+   Element lone = document.createElement(tagName);
+   DocumentFragment holder = document.createDocumentFragment();
+   holder.appendChild(lone);
+   return holder;
+ }
+
+ /**
+  * Creates a fragment whose only child is a {@code tagName} element that wraps a single text
+  * node holding {@code text}.
+  */
+ private DocumentFragment fragmentWithText(String tagName, String text) {
+   Element host = document.createElement(tagName);
+   host.appendChild(document.createTextNode(text));
+
+   DocumentFragment holder = document.createDocumentFragment();
+   holder.appendChild(host);
+   return holder;
+ }
+
+ /** Runs {@code html} through the DOM scanner with the base policy and returns the clean HTML. */
+ private String domScan(String html) throws Exception {
+   CleanResults results = antiSamy.scan(html, policy, AntiSamy.DOM);
+   return results.getCleanHTML();
+ }
+
+ /** Runs {@code html} through the DOM scanner with {@code pol} and returns the clean HTML. */
+ private String domScan(String html, InternalPolicy pol) throws Exception {
+   CleanResults results = antiSamy.scan(html, pol, AntiSamy.DOM);
+   return results.getCleanHTML();
+ }
+
+ // ===========================================================================
+ // 1. Basic element serialization
+ // ===========================================================================
+
+ @Test
+ public void simpleElementWithText() throws Exception {
+ DocumentFragment frag = fragmentWithText("p", "Hello");
+ assertEquals("
Hello
", serialize(frag, noFormatPolicy));
+ }
+
+ @Test
+ public void simpleElementNoChildren() throws Exception {
+ // 'div' has children=false but is not a void element, so gets
+ DocumentFragment frag = fragmentWithElement("div");
+ String out = serialize(frag);
+ assertTrue("Expected