View Javadoc
1   /*
2    * Copyright (c) 2002-2017 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package com.gargoylesoftware.htmlunit.html;
16  
17  import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.HTML_ATTRIBUTE_LOWER_CASE;
18  import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.HTML_COMMAND_TAG;
19  import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.HTML_ISINDEX_TAG;
20  import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.HTML_MAIN_TAG;
21  import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.KEYGEN_AS_SELECT;
22  import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.META_X_UA_COMPATIBLE;
23  import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.PAGE_WAIT_LOAD_BEFORE_BODY;
24  
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.io.StringReader;
28  import java.lang.reflect.InvocationTargetException;
29  import java.net.URL;
30  import java.nio.charset.Charset;
31  import java.util.ArrayDeque;
32  import java.util.ArrayList;
33  import java.util.Deque;
34  import java.util.HashMap;
35  import java.util.List;
36  import java.util.Locale;
37  import java.util.Map;
38  
39  import org.apache.commons.lang3.ArrayUtils;
40  import org.apache.commons.lang3.StringUtils;
41  import org.apache.xerces.parsers.AbstractSAXParser;
42  import org.apache.xerces.util.DefaultErrorHandler;
43  import org.apache.xerces.xni.Augmentations;
44  import org.apache.xerces.xni.QName;
45  import org.apache.xerces.xni.XMLAttributes;
46  import org.apache.xerces.xni.XNIException;
47  import org.apache.xerces.xni.parser.XMLInputSource;
48  import org.apache.xerces.xni.parser.XMLParseException;
49  import org.apache.xerces.xni.parser.XMLParserConfiguration;
50  import org.w3c.dom.Element;
51  import org.w3c.dom.Node;
52  import org.xml.sax.Attributes;
53  import org.xml.sax.ContentHandler;
54  import org.xml.sax.Locator;
55  import org.xml.sax.SAXException;
56  import org.xml.sax.ext.LexicalHandler;
57  
58  import com.gargoylesoftware.htmlunit.BrowserVersion;
59  import com.gargoylesoftware.htmlunit.ObjectInstantiationException;
60  import com.gargoylesoftware.htmlunit.Page;
61  import com.gargoylesoftware.htmlunit.SgmlPage;
62  import com.gargoylesoftware.htmlunit.WebAssert;
63  import com.gargoylesoftware.htmlunit.WebClient;
64  import com.gargoylesoftware.htmlunit.WebResponse;
65  import com.gargoylesoftware.htmlunit.WebWindow;
66  import com.gargoylesoftware.htmlunit.javascript.host.html.HTMLBodyElement;
67  import com.gargoylesoftware.htmlunit.javascript.host.html.HTMLDocument;
68  import com.gargoylesoftware.htmlunit.svg.SvgElementFactory;
69  
70  import net.sourceforge.htmlunit.cyberneko.HTMLConfiguration;
71  import net.sourceforge.htmlunit.cyberneko.HTMLElements;
72  import net.sourceforge.htmlunit.cyberneko.HTMLEventInfo;
73  import net.sourceforge.htmlunit.cyberneko.HTMLScanner;
74  import net.sourceforge.htmlunit.cyberneko.HTMLTagBalancer;
75  import net.sourceforge.htmlunit.cyberneko.HTMLTagBalancingListener;
76  
77  /**
78   * <p>SAX parser implementation that uses the NekoHTML {@link net.sourceforge.htmlunit.cyberneko.HTMLConfiguration}
79   * to parse HTML into a HtmlUnit-specific DOM (HU-DOM) tree.</p>
80   *
81   * @author <a href="mailto:cse@dynabean.de">Christian Sell</a>
82   * @author David K. Taylor
83   * @author Chris Erskine
84   * @author Ahmed Ashour
85   * @author Marc Guillemot
86   * @author Ethan Glasser-Camp
87   * @author Sudhan Moghe
88   * @author Ronald Brill
89   * @author Frank Danek
90   * @author Carsten Steul
91   */
92  public final class HTMLParser {
93  
94      /** XHTML namespace. */
95      public static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
96  
97      /** SVG namespace. */
98      public static final String SVG_NAMESPACE = "http://www.w3.org/2000/svg";
99  
100     /**
101      * The SVG factory.
102      */
103     public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
104 
105     private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new HashMap<>();
106 
107     static {
108         ELEMENT_FACTORIES.put(HtmlInput.TAG_NAME, InputElementFactory.instance);
109 
110         final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
111         for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
112             ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
113         }
114     }
115 
116     /**
117      * You should never need to create one of these!
118      */
119     private HTMLParser() {
120         // Empty.
121     }
122 
123     /**
124      * Parses the HTML content from the given string into an object tree representation.
125      *
126      * @param parent the parent for the new nodes
127      * @param source the (X)HTML to be parsed
128      * @throws SAXException if a SAX error occurs
129      * @throws IOException if an IO error occurs
130      */
131     public static void parseFragment(final DomNode parent, final String source) throws SAXException, IOException {
132         parseFragment(parent, parent, source);
133     }
134 
135     /**
136      * Parses the HTML content from the given string into an object tree representation.
137      *
138      * @param parent where the new parsed nodes will be added to
139      * @param context the context to build the fragment context stack
140      * @param source the (X)HTML to be parsed
141      * @throws SAXException if a SAX error occurs
142      * @throws IOException if an IO error occurs
143      */
144     public static void parseFragment(final DomNode parent, final DomNode context, final String source)
145         throws SAXException, IOException {
146         final Page page = parent.getPage();
147         if (!(page instanceof HtmlPage)) {
148             return;
149         }
150         final HtmlPage htmlPage = (HtmlPage) page;
151         final URL url = htmlPage.getUrl();
152 
153         final HtmlUnitDOMBuilder domBuilder = new HtmlUnitDOMBuilder(parent, url, source);
154         domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
155         // build fragment context stack
156         DomNode node = context;
157         final List<QName> ancestors = new ArrayList<>();
158         while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
159             ancestors.add(0, new QName(null, node.getNodeName(), null, null));
160             node = node.getParentNode();
161         }
162         if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).localpart)) {
163             ancestors.add(0, new QName(null, "html", null, null));
164         }
165         if (ancestors.size() == 1 || !"body".equals(ancestors.get(1).localpart)) {
166             ancestors.add(1, new QName(null, "body", null, null));
167         }
168 
169         domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
170         domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[] {}));
171 
172         final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
173 
174         htmlPage.registerParsingStart();
175         htmlPage.registerSnippetParsingStart();
176         try {
177             domBuilder.parse(in);
178         }
179         finally {
180             htmlPage.registerParsingEnd();
181             htmlPage.registerSnippetParsingEnd();
182         }
183     }
184 
185     /**
186      * Parses the HTML content from the specified <tt>WebResponse</tt> into an object tree representation.
187      *
188      * @param webResponse the response data
189      * @param webWindow the web window into which the page is to be loaded
190      * @return the page object which is the root of the DOM tree
191      * @throws IOException if there is an IO error
192      */
193     public static HtmlPage parseHtml(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
194         final HtmlPage page = new HtmlPage(webResponse, webWindow);
195         parse(webResponse, webWindow, page, false);
196         return page;
197     }
198 
199     /**
200      * Parses the XHTML content from the specified <tt>WebResponse</tt> into an object tree representation.
201      *
202      * @param webResponse the response data
203      * @param webWindow the web window into which the page is to be loaded
204      * @return the page object which is the root of the DOM tree
205      * @throws IOException if there is an IO error
206      */
207     public static XHtmlPage parseXHtml(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
208         final XHtmlPage page = new XHtmlPage(webResponse, webWindow);
209         parse(webResponse, webWindow, page, true);
210         return page;
211     }
212 
213     private static void parse(final WebResponse webResponse, final WebWindow webWindow, final HtmlPage page,
214             final boolean xhtml)
215         throws IOException {
216 
217         webWindow.setEnclosedPage(page);
218 
219         final URL url = webResponse.getWebRequest().getUrl();
220         final HtmlUnitDOMBuilder domBuilder = new HtmlUnitDOMBuilder(page, url, null);
221 
222         Charset charset = webResponse.getContentCharsetOrNull();
223         try {
224             // handle charset
225             if (charset == null) {
226                 final Charset specifiedCharset = webResponse.getWebRequest().getCharset();
227                 if (specifiedCharset != null) {
228                     charset = specifiedCharset;
229                 }
230             }
231             else {
232                 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
233             }
234 
235             // xml content is different
236             if (xhtml) {
237                 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
238             }
239         }
240         catch (final Exception e) {
241             throw new ObjectInstantiationException("Error setting HTML parser feature", e);
242         }
243 
244         try (InputStream content = webResponse.getContentAsStream()) {
245             String encoding = null;
246             if (charset != null) {
247                 encoding = charset.name();
248             }
249             final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
250 
251             page.registerParsingStart();
252             try {
253                 domBuilder.parse(in);
254             }
255             catch (final XNIException e) {
256                 // extract enclosed exception
257                 final Throwable origin = extractNestedException(e);
258                 throw new RuntimeException("Failed parsing content from " + url, origin);
259             }
260         }
261         finally {
262             page.registerParsingEnd();
263         }
264 
265         addBodyToPageIfNecessary(page, true, domBuilder.body_ != null);
266     }
267 
268     /**
269      * Adds a body element to the current page, if necessary. Strictly speaking, this should
270      * probably be done by NekoHTML. See the bug linked below. If and when that bug is fixed,
271      * we may be able to get rid of this code.
272      *
273      * http://sourceforge.net/p/nekohtml/bugs/15/
274      * @param page
275      * @param originalCall
276      * @param checkInsideFrameOnly true if the original page had body that was removed by JavaScript
277      */
278     private static void addBodyToPageIfNecessary(
279             final HtmlPage page, final boolean originalCall, final boolean checkInsideFrameOnly) {
280         // IE waits for the whole page to load before initializing bodies for frames.
281         final boolean waitToLoad = page.hasFeature(PAGE_WAIT_LOAD_BEFORE_BODY);
282         if (page.getEnclosingWindow() instanceof FrameWindow && originalCall && waitToLoad) {
283             return;
284         }
285 
286         // Find out if the document already has a body element (or frameset).
287         final Element doc = page.getDocumentElement();
288         boolean hasBody = false;
289         for (Node child = doc.getFirstChild(); child != null; child = child.getNextSibling()) {
290             if (child instanceof HtmlBody || child instanceof HtmlFrameSet) {
291                 hasBody = true;
292                 break;
293             }
294         }
295 
296         // If the document does not have a body, add it.
297         if (!hasBody && !checkInsideFrameOnly) {
298             final HtmlBody body = new HtmlBody("body", page, null, false);
299             doc.appendChild(body);
300         }
301 
302         // If this is IE, we need to initialize the bodies of any frames, as well.
303         // This will already have been done when emulating FF (see above).
304         if (waitToLoad) {
305             for (final FrameWindow frame : page.getFrames()) {
306                 final Page containedPage = frame.getEnclosedPage();
307                 if (containedPage != null && containedPage.isHtmlPage()) {
308                     addBodyToPageIfNecessary((HtmlPage) containedPage, false, false);
309                 }
310             }
311         }
312     }
313 
314     /**
315      * Extract nested exception within an XNIException (Nekohtml uses reflection and generated
316      * exceptions are wrapped many times within XNIException and InvocationTargetException)
317      *
318      * @param e the original XNIException
319      * @return the cause exception
320      */
321     static Throwable extractNestedException(final Throwable e) {
322         Throwable originalException = e;
323         Throwable cause = ((XNIException) e).getException();
324         while (cause != null) {
325             originalException = cause;
326             if (cause instanceof XNIException) {
327                 cause = ((XNIException) cause).getException();
328             }
329             else if (cause instanceof InvocationTargetException) {
330                 cause = cause.getCause();
331             }
332             else {
333                 cause = null;
334             }
335         }
336         return originalException;
337     }
338 
339     /**
340      * @param tagName an HTML element tag name
341      * @return a factory for creating HtmlElements representing the given tag
342      */
343     public static ElementFactory getFactory(final String tagName) {
344         final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
345 
346         if (result != null) {
347             return result;
348         }
349         return UnknownElementFactory.instance;
350     }
351 
352     /**
353      * Returns the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory.
354      * @param page the page
355      * @param namespaceURI the namespace URI
356      * @param qualifiedName the qualified name
357      * @param insideSvg is the node inside an SVG node or not
358      * @param svgSupport true if called from javascript createElementNS
359      * @return the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory
360      */
361     static ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
362             final String qualifiedName, final boolean insideSvg, final boolean svgSupport) {
363         if (insideSvg) {
364             return SVG_FACTORY;
365         }
366 
367         if (namespaceURI == null || namespaceURI.isEmpty()
368             || XHTML_NAMESPACE.equals(namespaceURI)
369             || SVG_NAMESPACE.equals(namespaceURI)
370             || !qualifiedName.contains(":")) {
371 
372             String tagName = qualifiedName;
373             final int index = tagName.indexOf(':');
374             if (index == -1) {
375                 tagName = tagName.toLowerCase(Locale.ROOT);
376             }
377             else {
378                 tagName = tagName.substring(index + 1);
379             }
380             final ElementFactory factory;
381             if (svgSupport && !"svg".equals(tagName) && SVG_NAMESPACE.equals(namespaceURI)) {
382                 factory = SVG_FACTORY;
383             }
384             else {
385                 factory = ELEMENT_FACTORIES.get(tagName);
386             }
387 
388             if (factory != null) {
389                 return factory;
390             }
391         }
392         return UnknownElementFactory.instance;
393     }
394 
395     /**
396      * The parser and DOM builder. This class subclasses Xerces's AbstractSAXParser and implements
397      * the ContentHandler interface. Thus all parser APIs are kept private. The ContentHandler methods
398      * consume SAX events to build the page DOM
399      */
400     static final class HtmlUnitDOMBuilder extends AbstractSAXParser
401             implements ContentHandler, LexicalHandler, HTMLTagBalancingListener {
402 
403         private enum HeadParsed { YES, SYNTHESIZED, NO };
404 
405         private final HtmlPage page_;
406 
407         private Locator locator_;
408         private final Deque<DomNode> stack_ = new ArrayDeque<>();
409 
        /** Did the snippet try to overwrite the start node? */
411         private boolean snippetStartNodeOverwritten_;
412         private final int initialSize_;
413         private DomNode currentNode_;
414         private StringBuilder characters_;
415         private HeadParsed headParsed_ = HeadParsed.NO;
416         private HtmlElement body_;
417         private boolean lastTagWasSynthesized_;
418         private HtmlForm formWaitingForLostChildren_;
419         private boolean insideSvg_;
420 
421         private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
422         private static final String FEATURE_PARSE_NOSCRIPT
423             = "http://cyberneko.org/html/features/parse-noscript-content";
424 
425         /**
426          * Parses and then inserts the specified HTML content into the HTML content currently being parsed.
427          * @param html the HTML content to push
428          */
429         public void pushInputString(final String html) {
430             page_.registerParsingStart();
431             page_.registerInlineSnippetParsingStart();
432             try {
433                 final WebResponse webResponse = page_.getWebResponse();
434                 final Charset charset = webResponse.getContentCharset();
435                 final String url = webResponse.getWebRequest().getUrl().toString();
436                 final XMLInputSource in = new XMLInputSource(null, url, null, new StringReader(html), charset.name());
437                 ((HTMLConfiguration) fConfiguration).evaluateInputSource(in);
438             }
439             finally {
440                 page_.registerParsingEnd();
441                 page_.registerInlineSnippetParsingEnd();
442             }
443         }
444 
445         /**
446          * Creates a new builder for parsing the specified response contents.
447          * @param node the location at which to insert the new content
448          * @param url the page's URL
449          */
450         private HtmlUnitDOMBuilder(final DomNode node, final URL url, final String htmlContent) {
451             super(createConfiguration(node.getPage().getWebClient().getBrowserVersion()));
452 
453             page_ = (HtmlPage) node.getPage();
454 
455             currentNode_ = node;
456             for (final Node ancestor : currentNode_.getAncestors()) {
457                 stack_.push((DomNode) ancestor);
458             }
459 
460             final WebClient webClient = page_.getWebClient();
461             final HTMLParserListener listener = webClient.getHTMLParserListener();
462             final boolean reportErrors = listener != null;
463             if (reportErrors) {
464                 fConfiguration.setErrorHandler(new HTMLErrorHandler(listener, url, htmlContent));
465             }
466 
467             try {
468                 setFeature(FEATURE_AUGMENTATIONS, true);
469                 setProperty("http://cyberneko.org/html/properties/names/elems", "default");
470                 if (!webClient.getBrowserVersion().hasFeature(HTML_ATTRIBUTE_LOWER_CASE)) {
471                     setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
472                 }
473                 setFeature("http://cyberneko.org/html/features/report-errors", reportErrors);
474                 setFeature(FEATURE_PARSE_NOSCRIPT, !webClient.getOptions().isJavaScriptEnabled());
475                 setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME, false);
476 
477                 setContentHandler(this);
478                 setLexicalHandler(this); //comments and CDATA
479             }
480             catch (final SAXException e) {
481                 throw new ObjectInstantiationException("unable to create HTML parser", e);
482             }
483             initialSize_ = stack_.size();
484         }
485 
486         /**
487          * Create the configuration depending on the simulated browser
488          * @param webClient the current WebClient
489          * @return the configuration
490          */
491         private static XMLParserConfiguration createConfiguration(final BrowserVersion browserVersion) {
492             final HTMLConfiguration configuration = new HTMLConfiguration();
493             if (browserVersion.hasFeature(HTML_COMMAND_TAG)) {
494                 configuration.htmlElements_.setElement(new HTMLElements.Element(HTMLElements.COMMAND, "COMMAND",
495                         HTMLElements.Element.EMPTY, HTMLElements.BODY, null));
496             }
497             if (browserVersion.hasFeature(HTML_ISINDEX_TAG)) {
498                 configuration.htmlElements_.setElement(new HTMLElements.Element(HTMLElements.ISINDEX, "ISINDEX",
499                         HTMLElements.Element.INLINE, HTMLElements.BODY, null));
500             }
501             if (browserVersion.hasFeature(HTML_MAIN_TAG)) {
502                 configuration.htmlElements_.setElement(new HTMLElements.Element(HTMLElements.MAIN, "MAIN",
503                         HTMLElements.Element.INLINE, HTMLElements.BODY, null));
504             }
505 
506             return configuration;
507         }
508 
509         /**
510          * @return the document locator
511          */
512         public Locator getLocator() {
513             return locator_;
514         }
515 
516         /** {@inheritDoc ContentHandler#setDocumentLocator} */
517         @Override
518         public void setDocumentLocator(final Locator locator) {
519             locator_ = locator;
520         }
521 
522         /** {@inheritDoc ContentHandler#startDocument()} */
523         @Override
524         public void startDocument() throws SAXException {
525         }
526 
527         /** {@inheritDoc} */
528         @Override
529         public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
530             throws XNIException {
531             // augs might change so we store only the interesting part
532             lastTagWasSynthesized_ = isSynthesized(augs);
533             super.startElement(element, attributes, augs);
534         }
535 
        /**
         * {@inheritDoc}
         *
         * <p>Creates the DOM element for the started tag using the matching {@link ElementFactory},
         * adds it to the tree (see {@link #addNodeToRightParent(DomNode, DomElement)}) and makes it
         * the current node (pushed onto the stack).<br>
         * Some tags are not added at all: html and body (and head) while parsing a snippet
         * and a head tag if a head was already parsed.</p>
         *
         * @param namespaceURI the namespace URI (trimmed if not null)
         * @param localName the local name; the lower case form is used for all the tag checks
         * @param qName the qualified name used to create the element
         * @param atts the attributes for the new element
         * @throws SAXException declared by the interface
         */
        @Override
        public void startElement(String namespaceURI, final String localName, String qName, final Attributes atts)
            throws SAXException {

            // The flag is set somewhere else; if set, this start tag is skipped once and the flag is reset.
            if (snippetStartNodeOverwritten_) {
                snippetStartNodeOverwritten_ = false;
                return;
            }
            // presumably flushes the characters collected so far as text node - see handleCharacters()
            handleCharacters();

            String tagLower = localName.toLowerCase(Locale.ROOT);
            // snippets are parsed into an existing tree; html and body start tags are ignored in this case
            if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
                return;
            }

            if ("head".equals(tagLower)) {
                // ignore the head if we already have seen a real one or if we are parsing a snippet
                if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
                    return;
                }

                headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
            }

            if (namespaceURI != null) {
                namespaceURI = namespaceURI.trim();
            }

            // add a head if none was there
            // NOTE(review): this 'else if' belongs to the null check of the namespaceURI above, therefore
            // the head is only created if namespaceURI is null - looks unintended, please confirm
            else if (headParsed_ == HeadParsed.NO && ("body".equals(tagLower) || "frameset".equals(tagLower))) {
                final ElementFactory factory = getElementFactory(page_, namespaceURI, "head", insideSvg_, false);
                final DomElement newElement = factory.createElement(page_, "head", null);
                currentNode_.appendChild(newElement);
                headParsed_ = HeadParsed.SYNTHESIZED;
            }

            // If we're adding a body element, keep track of any temporary synthetic ones
            // that we may have had to create earlier (for document.write(), for example).
            // NOTE(review): this check uses qName (case sensitive) whereas the other checks
            // use tagLower - please confirm that this is intended
            HtmlBody oldBody = null;
            if ("body".equals(qName) && page_.getBody() instanceof HtmlBody) {
                oldBody = (HtmlBody) page_.getBody();
            }

            // Need to reset this at each starting form tag because it could be set from a synthesized
            // end tag.
            if ("form".equals(tagLower)) {
                formWaitingForLostChildren_ = null;
            }

            // Add the new node.
            // for plain HTML pages (not XHtmlPage) the XHTML namespace is not stored at the elements
            if (!(page_ instanceof XHtmlPage) && XHTML_NAMESPACE.equals(namespaceURI)) {
                namespaceURI = null;
            }

            // browsers having the feature KEYGEN_AS_SELECT get a select element (with two options,
            // see below) instead of the keygen
            final boolean keyGenAsSelect = "keygen".equals(tagLower) && page_.hasFeature(KEYGEN_AS_SELECT);
            if (keyGenAsSelect) {
                tagLower = "select";
                qName = "select";
            }

            final ElementFactory factory = getElementFactory(page_, namespaceURI, qName, insideSvg_, false);
            // elements created by the SVG factory are always placed in the SVG namespace
            if (factory == SVG_FACTORY) {
                namespaceURI = SVG_NAMESPACE;
            }
            final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts, true);
            newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());

            // parse can't replace everything as it does not buffer elements while parsing
            addNodeToRightParent(currentNode_, newElement);

            // from now on the elements are created by the SVG factory (see getElementFactory());
            // the flag is not reset inside this method
            if ("svg".equals(tagLower)) {
                insideSvg_ = true;
            }

            // If we had an old synthetic body and we just added a real body element, quietly
            // remove the old body and move its children to the real body element we just added.
            if (oldBody != null) {
                oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
            }

            if ("body".equals(tagLower)) {
                // remember the body; parse() checks this to find out if the source had a body
                body_ = (HtmlElement) newElement;
            }
            else if ("meta".equals(tagLower) && page_.hasFeature(META_X_UA_COMPATIBLE)) {
                // <meta http-equiv='X-UA-Compatible' content='IE=...'> forces the document mode
                final HtmlMeta meta = (HtmlMeta) newElement;
                if ("X-UA-Compatible".equals(meta.getHttpEquivAttribute())) {
                    final String content = meta.getContentAttribute();
                    if (content.startsWith("IE=")) {
                        final String mode = content.substring(3).trim();
                        final int version = page_.getWebClient().getBrowserVersion().getBrowserVersionNumeric();
                        if ("edge".equals(mode)) {
                            // edge: use the version of the simulated browser
                            ((HTMLDocument) page_.getScriptableObject()).forceDocumentMode(version);
                        }
                        else {
                            try {
                                // the requested mode is limited to the version of the simulated browser
                                int value = Integer.parseInt(mode);
                                if (value > version) {
                                    value = version;
                                }
                                ((HTMLDocument) page_.getScriptableObject()).forceDocumentMode(value);
                            }
                            catch (final Exception e) {
                                // ignore
                                // (e.g. the mode is not a number; the document mode is not changed then)
                            }
                        }
                    }
                }
            }
            if (keyGenAsSelect) {
                // the options are created using the same factory as the select itself
                DomElement option = factory.createElementNS(page_, namespaceURI, "option", null, true);
                option.appendChild(new DomText(page_, "High Grade"));
                newElement.appendChild(option);

                option = factory.createElementNS(page_, namespaceURI, "option", null, true);
                option.appendChild(new DomText(page_, "Medium Grade"));
                newElement.appendChild(option);
            }
            // the new element is the parent for everything that follows until its end tag
            currentNode_ = newElement;
            stack_.push(currentNode_);
        }
656 
657         /**
658          * Adds the new node to the right parent that is not necessary the currentNode in case of
659          * malformed HTML code. The method tries to emulate the behavior of Firefox.
660          */
661         private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
662             final String currentNodeName = currentNode.getNodeName();
663             final String newNodeName = newElement.getNodeName();
664 
665             DomNode parent = currentNode;
666 
667             // If the new node is a table element and the current node isn't one search the stack for the
668             // correct parent.
669             if ("tr".equals(newNodeName) && !isTableChild(currentNodeName)) {
670                 parent = findElementOnStack("tbody", "thead", "tfoot");
671             }
672             else if (isTableChild(newNodeName) && !"table".equals(currentNodeName)) {
673                 parent = findElementOnStack("table");
674             }
675             else if (isTableCell(newNodeName) && !"tr".equals(currentNodeName)) {
676                 parent = findElementOnStack("tr");
677             }
678 
679             // If the parent changed and the old parent was a form it is now waiting for lost children.
680             if (parent != currentNode && "form".equals(currentNodeName)) {
681                 formWaitingForLostChildren_ = (HtmlForm) currentNode;
682             }
683 
684             final String parentNodeName = parent.getNodeName();
685 
686             if (("table".equals(parentNodeName) && !isTableChild(newNodeName))
687                     || (isTableChild(parentNodeName) && !"caption".equals(parentNodeName)
688                             && !"colgroup".equals(parentNodeName) && !"tr".equals(newNodeName))
689                     || ("colgroup".equals(parentNodeName) && !"col".equals(newNodeName))
690                     || ("tr".equals(parentNodeName) && !isTableCell(newNodeName))) {
691                 // If its a form or submittable just add it even though the resulting DOM is incorrect.
692                 // Otherwise insert the element before the table.
693                 if ("form".equals(newNodeName)) {
694                     formWaitingForLostChildren_ = (HtmlForm) newElement;
695                     parent.appendChild(newElement);
696                 }
697                 else if (newElement instanceof SubmittableElement) {
698                     if (formWaitingForLostChildren_ != null) {
699                         formWaitingForLostChildren_.addLostChild((HtmlElement) newElement);
700                     }
701                     parent.appendChild(newElement);
702                 }
703                 else {
704                     parent = findElementOnStack("table");
705                     parent.insertBefore(newElement);
706                 }
707             }
708             else if (formWaitingForLostChildren_ != null && "form".equals(parentNodeName)) {
709                 // Do not append any children to invalid form. Submittable are inserted after the form,
710                 // everything else before the table.
711                 if (newElement instanceof SubmittableElement) {
712                     formWaitingForLostChildren_.addLostChild((HtmlElement) newElement);
713                     parent.getParentNode().appendChild(newElement);
714                 }
715                 else {
716                     parent = findElementOnStack("table");
717                     parent.insertBefore(newElement);
718                 }
719             }
720             else if (formWaitingForLostChildren_ != null && newElement instanceof SubmittableElement) {
721                 formWaitingForLostChildren_.addLostChild((HtmlElement) newElement);
722                 parent.appendChild(newElement);
723             }
724             else {
725                 parent.appendChild(newElement);
726             }
727         }
728 
729         private DomNode findElementOnStack(final String... searchedElementNames) {
730             DomNode searchedNode = null;
731             for (final DomNode node : stack_) {
732                 if (ArrayUtils.contains(searchedElementNames, node.getNodeName())) {
733                     searchedNode = node;
734                     break;
735                 }
736             }
737 
738             if (searchedNode == null) {
739                 searchedNode = stack_.peek(); // this is surely wrong but at least it won't throw a NPE
740             }
741 
742             return searchedNode;
743         }
744 
745         private static boolean isTableChild(final String nodeName) {
746             return "thead".equals(nodeName) || "tbody".equals(nodeName)
747                     || "tfoot".equals(nodeName) || "caption".equals(nodeName)
748                     || "colgroup".equals(nodeName);
749         }
750 
751         private static boolean isTableCell(final String nodeName) {
752             return "td".equals(nodeName) || "th".equals(nodeName);
753         }
754 
755         /** {@inheritDoc} */
756         @Override
757         public void endElement(final QName element, final Augmentations augs)
758             throws XNIException {
759             // augs might change so we store only the interesting part
760             lastTagWasSynthesized_ = isSynthesized(augs);
761             super.endElement(element, augs);
762         }
763 
764         /** {@inheritDoc ContentHandler@endElement(String,String,String)} */
765         @Override
766         public void endElement(final String namespaceURI, final String localName, final String qName)
767             throws SAXException {
768 
769             handleCharacters();
770 
771             final String tagLower = localName.toLowerCase(Locale.ROOT);
772 
773             if (page_.isParsingHtmlSnippet()) {
774                 if ("html".equals(tagLower) || "body".equals(tagLower)) {
775                     return;
776                 }
777                 if (stack_.size() == initialSize_) {
778                     snippetStartNodeOverwritten_ = true;
779                     return;
780                 }
781             }
782 
783             if ("svg".equals(tagLower)) {
784                 insideSvg_ = false;
785             }
786 
787             // Need to reset this at each closing form tag because a valid form could start afterwards.
788             if ("form".equals(tagLower)) {
789                 formWaitingForLostChildren_ = null;
790             }
791 
792             final DomNode previousNode = stack_.pop(); //remove currentElement from stack
793             previousNode.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
794 
795             // special handling for form lost children (malformed HTML code where </form> is synthesized)
796             if (previousNode instanceof HtmlForm && lastTagWasSynthesized_) {
797                 formWaitingForLostChildren_ = (HtmlForm) previousNode;
798             }
799 
800             if (!stack_.isEmpty()) {
801                 currentNode_ = stack_.peek();
802             }
803 
804             final boolean postponed = page_.isParsingInlineHtmlSnippet();
805             previousNode.onAllChildrenAddedToPage(postponed);
806         }
807 
808         /** {@inheritDoc} */
809         @Override
810         public void characters(final char[] ch, final int start, final int length) throws SAXException {
811             if (characters_ == null) {
812                 characters_ = new StringBuilder();
813             }
814             characters_.append(ch, start, length);
815         }
816 
817         /** {@inheritDoc} */
818         @Override
819         public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException {
820             if (characters_ == null) {
821                 characters_ = new StringBuilder();
822             }
823             characters_.append(ch, start, length);
824         }
825 
826         /**
827          * Picks up the character data accumulated so far and add it to the current element as a text node.
828          */
829         private void handleCharacters() {
830             if (characters_ != null && characters_.length() != 0) {
831                 if (currentNode_ instanceof HtmlHtml) {
832                     // In HTML, the <html> node only has two possible children:
833                     // the <head> and the <body>; any text is ignored.
834                     characters_.setLength(0);
835                 }
836                 else {
837                     // Use the normal behavior: append a text node for the accumulated text.
838                     final String textValue = characters_.toString();
839                     final DomText text = new DomText(page_, textValue);
840                     characters_.setLength(0);
841 
842                     if (StringUtils.isNotBlank(textValue)) {
843                         // malformed HTML: </td>some text</tr> => text comes before the table
844                         if (currentNode_ instanceof HtmlTableRow) {
845                             final HtmlTableRow row = (HtmlTableRow) currentNode_;
846                             final HtmlTable enclosingTable = row.getEnclosingTable();
847                             if (enclosingTable != null) { // may be null when called from Range.createContextualFragment
848                                 if (enclosingTable.getPreviousSibling() instanceof DomText) {
849                                     final DomText domText = (DomText) enclosingTable.getPreviousSibling();
850                                     domText.setTextContent(domText + textValue);
851                                 }
852                                 else {
853                                     enclosingTable.insertBefore(text);
854                                 }
855                             }
856                         }
857                         else if (currentNode_ instanceof HtmlTable) {
858                             final HtmlTable enclosingTable = (HtmlTable) currentNode_;
859                             if (enclosingTable.getPreviousSibling() instanceof DomText) {
860                                 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
861                                 domText.setTextContent(domText + textValue);
862                             }
863                             else {
864                                 enclosingTable.insertBefore(text);
865                             }
866                         }
867                         else if (currentNode_ instanceof HtmlImage) {
868                             currentNode_.setNextSibling(text);
869                         }
870                         else {
871                             currentNode_.appendChild(text);
872                         }
873                     }
874                     else {
875                         currentNode_.appendChild(text);
876                     }
877                 }
878             }
879         }
880 
881         /** {@inheritDoc} */
882         @Override
883         public void endDocument() throws SAXException {
884             handleCharacters();
885             final DomNode currentPage = page_;
886             currentPage.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
887         }
888 
889         /** {@inheritDoc} */
890         @Override
891         public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
892         }
893 
894         /** {@inheritDoc} */
895         @Override
896         public void endPrefixMapping(final String prefix) throws SAXException {
897         }
898 
899         /** {@inheritDoc} */
900         @Override
901         public void processingInstruction(final String target, final String data) throws SAXException {
902         }
903 
904         /** {@inheritDoc} */
905         @Override
906         public void skippedEntity(final String name) throws SAXException {
907         }
908 
909         // LexicalHandler methods
910 
911         /** {@inheritDoc} */
912         @Override
913         public void comment(final char[] ch, final int start, final int length) {
914             handleCharacters();
915             final String data = new String(ch, start, length);
916             final DomComment comment = new DomComment(page_, data);
917             currentNode_.appendChild(comment);
918         }
919 
920         /** {@inheritDoc} */
921         @Override
922         public void endCDATA() {
923         }
924 
925         /** {@inheritDoc} */
926         @Override
927         public void endDTD() {
928         }
929 
930         /** {@inheritDoc} */
931         @Override
932         public void endEntity(final String name) {
933         }
934 
935         /** {@inheritDoc} */
936         @Override
937         public void startCDATA() {
938         }
939 
940         /** {@inheritDoc} */
941         @Override
942         public void startDTD(final String name, final String publicId, final String systemId) {
943             final DomDocumentType type = new DomDocumentType(page_, name, publicId, systemId);
944             page_.setDocumentType(type);
945 
946             final Node child;
947             child = type;
948             page_.appendChild(child);
949         }
950 
951         /** {@inheritDoc} */
952         @Override
953         public void startEntity(final String name) {
954         }
955 
956         /**
957          * {@inheritDoc}
958          */
959         @Override
960         public void ignoredEndElement(final QName element, final Augmentations augs) {
961             // if real </form> is reached, don't accept fields anymore as lost children
962             if ("form".equals(element.localpart)) {
963                 formWaitingForLostChildren_ = null;
964             }
965         }
966 
967         /**
968          * {@inheritDoc}
969          */
970         @Override
971         public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
972             // when multiple body elements are encountered, the attributes of the discarded
973             // elements are used when not previously defined
974             if (body_ != null && "body".equalsIgnoreCase(elem.localpart) && attrs != null) {
975                 copyAttributes(body_, attrs);
976             }
977             if (body_ != null && "html".equalsIgnoreCase(elem.localpart) && attrs != null) {
978                 copyAttributes((DomElement) body_.getParentNode(), attrs);
979             }
980         }
981 
982         private static void copyAttributes(final DomElement to, final XMLAttributes attrs) {
983             final int length = attrs.getLength();
984             for (int i = 0; i < length; i++) {
985                 final String attrName = attrs.getLocalName(i).toLowerCase(Locale.ROOT);
986                 if (to.getAttributes().getNamedItem(attrName) == null) {
987                     to.setAttribute(attrName, attrs.getValue(i));
988                     if (attrName.startsWith("on") && to.getScriptableObject() instanceof HTMLBodyElement) {
989                         final HTMLBodyElement jsBody = (HTMLBodyElement) to.getScriptableObject();
990                         jsBody.createEventHandlerFromAttribute(attrName, attrs.getValue(i));
991                     }
992                 }
993             }
994         }
995 
996         /**
997          * {@inheritDoc}
998          */
999         @Override
1000         public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
1001             final HtmlUnitDOMBuilder oldBuilder = page_.getBuilder();
1002             page_.setBuilder(this);
1003             try {
1004                 super.parse(inputSource);
1005             }
1006             finally {
1007                 page_.setBuilder(oldBuilder);
1008             }
1009         }
1010 
1011         private static boolean isSynthesized(final Augmentations augs) {
1012             final HTMLEventInfo info = (augs == null) ? null
1013                     : (HTMLEventInfo) augs.getItem(FEATURE_AUGMENTATIONS);
1014             return info != null && info.isSynthesized();
1015         }
1016     }
1017 }
1018 
1019 /**
1020  * Utility to transmit parsing errors to a {@link HTMLParserListener}.
1021  */
1022 class HTMLErrorHandler extends DefaultErrorHandler {
1023     private final HTMLParserListener listener_;
1024     private final URL url_;
1025     private String html_;
1026 
1027     HTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
1028         WebAssert.notNull("listener", listener);
1029         WebAssert.notNull("url", url);
1030         listener_ = listener;
1031         url_ = url;
1032         html_ = htmlContent;
1033     }
1034 
1035     /** @see DefaultErrorHandler#error(String,String,XMLParseException) */
1036     @Override
1037     public void error(final String domain, final String key,
1038             final XMLParseException exception) throws XNIException {
1039         listener_.error(exception.getMessage(),
1040                 url_,
1041                 html_,
1042                 exception.getLineNumber(),
1043                 exception.getColumnNumber(),
1044                 key);
1045     }
1046 
1047     /** @see DefaultErrorHandler#warning(String,String,XMLParseException) */
1048     @Override
1049     public void warning(final String domain, final String key,
1050             final XMLParseException exception) throws XNIException {
1051         listener_.warning(exception.getMessage(),
1052                 url_,
1053                 html_,
1054                 exception.getLineNumber(),
1055                 exception.getColumnNumber(),
1056                 key);
1057     }
1058 }