View Javadoc
1   /*
2    * Copyright (c) 2002-2017 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package com.gargoylesoftware.htmlunit.html;
16  
17  import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.HTML_ATTRIBUTE_LOWER_CASE;
18  import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.KEYGEN_AS_SELECT;
19  import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.META_X_UA_COMPATIBLE;
20  import static com.gargoylesoftware.htmlunit.BrowserVersionFeatures.PAGE_WAIT_LOAD_BEFORE_BODY;
21  
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.StringReader;
25  import java.lang.reflect.InvocationTargetException;
26  import java.net.URL;
27  import java.nio.charset.Charset;
28  import java.util.ArrayDeque;
29  import java.util.ArrayList;
30  import java.util.Deque;
31  import java.util.HashMap;
32  import java.util.List;
33  import java.util.Locale;
34  import java.util.Map;
35  
36  import org.apache.commons.lang3.ArrayUtils;
37  import org.apache.commons.lang3.StringUtils;
38  import org.apache.xerces.parsers.AbstractSAXParser;
39  import org.apache.xerces.util.DefaultErrorHandler;
40  import org.apache.xerces.xni.Augmentations;
41  import org.apache.xerces.xni.QName;
42  import org.apache.xerces.xni.XMLAttributes;
43  import org.apache.xerces.xni.XNIException;
44  import org.apache.xerces.xni.parser.XMLInputSource;
45  import org.apache.xerces.xni.parser.XMLParseException;
46  import org.apache.xerces.xni.parser.XMLParserConfiguration;
47  import org.w3c.dom.Element;
48  import org.w3c.dom.Node;
49  import org.xml.sax.Attributes;
50  import org.xml.sax.ContentHandler;
51  import org.xml.sax.Locator;
52  import org.xml.sax.SAXException;
53  import org.xml.sax.ext.LexicalHandler;
54  
55  import com.gargoylesoftware.htmlunit.BrowserVersion;
56  import com.gargoylesoftware.htmlunit.ObjectInstantiationException;
57  import com.gargoylesoftware.htmlunit.Page;
58  import com.gargoylesoftware.htmlunit.SgmlPage;
59  import com.gargoylesoftware.htmlunit.WebAssert;
60  import com.gargoylesoftware.htmlunit.WebClient;
61  import com.gargoylesoftware.htmlunit.WebResponse;
62  import com.gargoylesoftware.htmlunit.WebWindow;
63  import com.gargoylesoftware.htmlunit.javascript.host.html.HTMLBodyElement;
64  import com.gargoylesoftware.htmlunit.javascript.host.html.HTMLDocument;
65  import com.gargoylesoftware.htmlunit.svg.SvgElementFactory;
66  
67  import net.sourceforge.htmlunit.cyberneko.HTMLConfiguration;
68  import net.sourceforge.htmlunit.cyberneko.HTMLElements;
69  import net.sourceforge.htmlunit.cyberneko.HTMLEventInfo;
70  import net.sourceforge.htmlunit.cyberneko.HTMLScanner;
71  import net.sourceforge.htmlunit.cyberneko.HTMLTagBalancer;
72  import net.sourceforge.htmlunit.cyberneko.HTMLTagBalancingListener;
73  
74  /**
75   * <p>SAX parser implementation that uses the NekoHTML {@link net.sourceforge.htmlunit.cyberneko.HTMLConfiguration}
76   * to parse HTML into a HtmlUnit-specific DOM (HU-DOM) tree.</p>
77   *
78   * @author <a href="mailto:cse@dynabean.de">Christian Sell</a>
79   * @author David K. Taylor
80   * @author Chris Erskine
81   * @author Ahmed Ashour
82   * @author Marc Guillemot
83   * @author Ethan Glasser-Camp
84   * @author Sudhan Moghe
85   * @author Ronald Brill
86   * @author Frank Danek
87   * @author Carsten Steul
88   */
89  public final class HTMLParser {
90  
91      /** XHTML namespace. */
92      public static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
93  
94      /** SVG namespace. */
95      public static final String SVG_NAMESPACE = "http://www.w3.org/2000/svg";
96  
97      /**
98       * The SVG factory.
99       */
100     public static final SvgElementFactory SVG_FACTORY = new SvgElementFactory();
101 
102     private static final Map<String, ElementFactory> ELEMENT_FACTORIES = new HashMap<>();
103 
104     static {
105         ELEMENT_FACTORIES.put(HtmlInput.TAG_NAME, InputElementFactory.instance);
106 
107         final DefaultElementFactory defaultElementFactory = new DefaultElementFactory();
108         for (final String tagName : DefaultElementFactory.SUPPORTED_TAGS_) {
109             ELEMENT_FACTORIES.put(tagName, defaultElementFactory);
110         }
111     }
112 
113     /**
114      * You should never need to create one of these!
115      */
116     private HTMLParser() {
117         // Empty.
118     }
119 
120     /**
121      * Parses the HTML content from the given string into an object tree representation.
122      *
123      * @param parent the parent for the new nodes
124      * @param source the (X)HTML to be parsed
125      * @throws SAXException if a SAX error occurs
126      * @throws IOException if an IO error occurs
127      */
128     public static void parseFragment(final DomNode parent, final String source) throws SAXException, IOException {
129         parseFragment(parent, parent, source);
130     }
131 
132     /**
133      * Parses the HTML content from the given string into an object tree representation.
134      *
135      * @param parent where the new parsed nodes will be added to
136      * @param context the context to build the fragment context stack
137      * @param source the (X)HTML to be parsed
138      * @throws SAXException if a SAX error occurs
139      * @throws IOException if an IO error occurs
140      */
141     public static void parseFragment(final DomNode parent, final DomNode context, final String source)
142         throws SAXException, IOException {
143         final Page page = parent.getPage();
144         if (!(page instanceof HtmlPage)) {
145             return;
146         }
147         final HtmlPage htmlPage = (HtmlPage) page;
148         final URL url = htmlPage.getUrl();
149 
150         final HtmlUnitDOMBuilder domBuilder = new HtmlUnitDOMBuilder(parent, url, source);
151         domBuilder.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
152         // build fragment context stack
153         DomNode node = context;
154         final List<QName> ancestors = new ArrayList<>();
155         while (node != null && node.getNodeType() != Node.DOCUMENT_NODE) {
156             ancestors.add(0, new QName(null, node.getNodeName(), null, null));
157             node = node.getParentNode();
158         }
159         if (ancestors.isEmpty() || !"html".equals(ancestors.get(0).localpart)) {
160             ancestors.add(0, new QName(null, "html", null, null));
161         }
162         if (ancestors.size() == 1 || !"body".equals(ancestors.get(1).localpart)) {
163             ancestors.add(1, new QName(null, "body", null, null));
164         }
165 
166         domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
167         domBuilder.setProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK, ancestors.toArray(new QName[] {}));
168 
169         final XMLInputSource in = new XMLInputSource(null, url.toString(), null, new StringReader(source), null);
170 
171         htmlPage.registerParsingStart();
172         htmlPage.registerSnippetParsingStart();
173         try {
174             domBuilder.parse(in);
175         }
176         finally {
177             htmlPage.registerParsingEnd();
178             htmlPage.registerSnippetParsingEnd();
179         }
180     }
181 
182     /**
183      * Parses the HTML content from the specified <tt>WebResponse</tt> into an object tree representation.
184      *
185      * @param webResponse the response data
186      * @param webWindow the web window into which the page is to be loaded
187      * @return the page object which is the root of the DOM tree
188      * @throws IOException if there is an IO error
189      */
190     public static HtmlPage parseHtml(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
191         final HtmlPage page = new HtmlPage(webResponse, webWindow);
192         parse(webResponse, webWindow, page, false);
193         return page;
194     }
195 
196     /**
197      * Parses the XHTML content from the specified <tt>WebResponse</tt> into an object tree representation.
198      *
199      * @param webResponse the response data
200      * @param webWindow the web window into which the page is to be loaded
201      * @return the page object which is the root of the DOM tree
202      * @throws IOException if there is an IO error
203      */
204     public static XHtmlPage parseXHtml(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
205         final XHtmlPage page = new XHtmlPage(webResponse, webWindow);
206         parse(webResponse, webWindow, page, true);
207         return page;
208     }
209 
210     private static void parse(final WebResponse webResponse, final WebWindow webWindow, final HtmlPage page,
211             final boolean xhtml)
212         throws IOException {
213 
214         webWindow.setEnclosedPage(page);
215 
216         final URL url = webResponse.getWebRequest().getUrl();
217         final HtmlUnitDOMBuilder domBuilder = new HtmlUnitDOMBuilder(page, url, null);
218 
219         Charset charset = webResponse.getContentCharsetOrNull();
220         try {
221             // handle charset
222             if (charset == null) {
223                 final Charset specifiedCharset = webResponse.getWebRequest().getCharset();
224                 if (specifiedCharset != null) {
225                     charset = specifiedCharset;
226                 }
227             }
228             else {
229                 domBuilder.setFeature(HTMLScanner.IGNORE_SPECIFIED_CHARSET, true);
230             }
231 
232             // xml content is different
233             if (xhtml) {
234                 domBuilder.setFeature(HTMLScanner.ALLOW_SELFCLOSING_TAGS, true);
235             }
236         }
237         catch (final Exception e) {
238             throw new ObjectInstantiationException("Error setting HTML parser feature", e);
239         }
240 
241         try (InputStream content = webResponse.getContentAsStream()) {
242             String encoding = null;
243             if (charset != null) {
244                 encoding = charset.name();
245             }
246             final XMLInputSource in = new XMLInputSource(null, url.toString(), null, content, encoding);
247 
248             page.registerParsingStart();
249             try {
250                 domBuilder.parse(in);
251             }
252             catch (final XNIException e) {
253                 // extract enclosed exception
254                 final Throwable origin = extractNestedException(e);
255                 throw new RuntimeException("Failed parsing content from " + url, origin);
256             }
257         }
258         finally {
259             page.registerParsingEnd();
260         }
261 
262         addBodyToPageIfNecessary(page, true, domBuilder.body_ != null);
263     }
264 
265     /**
266      * Adds a body element to the current page, if necessary. Strictly speaking, this should
267      * probably be done by NekoHTML. See the bug linked below. If and when that bug is fixed,
268      * we may be able to get rid of this code.
269      *
270      * http://sourceforge.net/p/nekohtml/bugs/15/
271      * @param page
272      * @param originalCall
273      * @param checkInsideFrameOnly true if the original page had body that was removed by JavaScript
274      */
275     private static void addBodyToPageIfNecessary(
276             final HtmlPage page, final boolean originalCall, final boolean checkInsideFrameOnly) {
277         // IE waits for the whole page to load before initializing bodies for frames.
278         final boolean waitToLoad = page.hasFeature(PAGE_WAIT_LOAD_BEFORE_BODY);
279         if (page.getEnclosingWindow() instanceof FrameWindow && originalCall && waitToLoad) {
280             return;
281         }
282 
283         // Find out if the document already has a body element (or frameset).
284         final Element doc = page.getDocumentElement();
285         boolean hasBody = false;
286         for (Node child = doc.getFirstChild(); child != null; child = child.getNextSibling()) {
287             if (child instanceof HtmlBody || child instanceof HtmlFrameSet) {
288                 hasBody = true;
289                 break;
290             }
291         }
292 
293         // If the document does not have a body, add it.
294         if (!hasBody && !checkInsideFrameOnly) {
295             final HtmlBody body = new HtmlBody("body", page, null, false);
296             doc.appendChild(body);
297         }
298 
299         // If this is IE, we need to initialize the bodies of any frames, as well.
300         // This will already have been done when emulating FF (see above).
301         if (waitToLoad) {
302             for (final FrameWindow frame : page.getFrames()) {
303                 final Page containedPage = frame.getEnclosedPage();
304                 if (containedPage != null && containedPage.isHtmlPage()) {
305                     addBodyToPageIfNecessary((HtmlPage) containedPage, false, false);
306                 }
307             }
308         }
309     }
310 
311     /**
312      * Extract nested exception within an XNIException (Nekohtml uses reflection and generated
313      * exceptions are wrapped many times within XNIException and InvocationTargetException)
314      *
315      * @param e the original XNIException
316      * @return the cause exception
317      */
318     static Throwable extractNestedException(final Throwable e) {
319         Throwable originalException = e;
320         Throwable cause = ((XNIException) e).getException();
321         while (cause != null) {
322             originalException = cause;
323             if (cause instanceof XNIException) {
324                 cause = ((XNIException) cause).getException();
325             }
326             else if (cause instanceof InvocationTargetException) {
327                 cause = cause.getCause();
328             }
329             else {
330                 cause = null;
331             }
332         }
333         return originalException;
334     }
335 
336     /**
337      * @param tagName an HTML element tag name
338      * @return a factory for creating HtmlElements representing the given tag
339      */
340     public static ElementFactory getFactory(final String tagName) {
341         final ElementFactory result = ELEMENT_FACTORIES.get(tagName);
342 
343         if (result != null) {
344             return result;
345         }
346         return UnknownElementFactory.instance;
347     }
348 
349     /**
350      * Returns the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory.
351      * @param page the page
352      * @param namespaceURI the namespace URI
353      * @param qualifiedName the qualified name
354      * @param insideHtml is the node inside HTML or not
355      * @param insideSvg is the node inside an SVG node or not
356      * @return the pre-registered element factory corresponding to the specified tag, or an UnknownElementFactory
357      */
358     static ElementFactory getElementFactory(final SgmlPage page, final String namespaceURI,
359             final String qualifiedName, final boolean insideHtml, final boolean insideSvg) {
360         if (insideSvg) {
361             return SVG_FACTORY;
362         }
363 
364         if (namespaceURI == null || namespaceURI.isEmpty()
365             || !qualifiedName.contains(":")
366             || namespaceURI.equals(XHTML_NAMESPACE)
367             || namespaceURI.equals(SVG_NAMESPACE)) {
368 
369             String tagName = qualifiedName;
370             final int index = tagName.indexOf(':');
371             if (index == -1) {
372                 tagName = tagName.toLowerCase(Locale.ROOT);
373             }
374             else {
375                 tagName = tagName.substring(index + 1);
376             }
377             final ElementFactory factory = ELEMENT_FACTORIES.get(tagName);
378 
379             if (factory != null) {
380                 return factory;
381             }
382         }
383         return UnknownElementFactory.instance;
384     }
385 
386     /**
387      * The parser and DOM builder. This class subclasses Xerces's AbstractSAXParser and implements
388      * the ContentHandler interface. Thus all parser APIs are kept private. The ContentHandler methods
389      * consume SAX events to build the page DOM
390      */
391     static final class HtmlUnitDOMBuilder extends AbstractSAXParser
392             implements ContentHandler, LexicalHandler, HTMLTagBalancingListener {
393 
394         private enum HeadParsed { YES, SYNTHESIZED, NO };
395 
396         private final HtmlPage page_;
397 
398         private Locator locator_;
399         private final Deque<DomNode> stack_ = new ArrayDeque<>();
400 
401         /** Did the snippet tried to overwrite the start node? */
402         private boolean snippetStartNodeOverwritten_;
403         private final int initialSize_;
404         private DomNode currentNode_;
405         private StringBuilder characters_;
406         private HeadParsed headParsed_ = HeadParsed.NO;
407         private HtmlElement body_;
408         private boolean lastTagWasSynthesized_;
409         private HtmlForm formWaitingForLostChildren_;
410         private boolean insideSvg_;
411 
412         private static final String FEATURE_AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
413         private static final String FEATURE_PARSE_NOSCRIPT
414             = "http://cyberneko.org/html/features/parse-noscript-content";
415 
416         /**
417          * Parses and then inserts the specified HTML content into the HTML content currently being parsed.
418          * @param html the HTML content to push
419          */
420         public void pushInputString(final String html) {
421             page_.registerParsingStart();
422             page_.registerInlineSnippetParsingStart();
423             try {
424                 final WebResponse webResponse = page_.getWebResponse();
425                 final Charset charset = webResponse.getContentCharset();
426                 final String url = webResponse.getWebRequest().getUrl().toString();
427                 final XMLInputSource in = new XMLInputSource(null, url, null, new StringReader(html), charset.name());
428                 ((HTMLConfiguration) fConfiguration).evaluateInputSource(in);
429             }
430             finally {
431                 page_.registerParsingEnd();
432                 page_.registerInlineSnippetParsingEnd();
433             }
434         }
435 
436         /**
437          * Creates a new builder for parsing the specified response contents.
438          * @param node the location at which to insert the new content
439          * @param url the page's URL
440          */
441         private HtmlUnitDOMBuilder(final DomNode node, final URL url, final String htmlContent) {
442             super(createConfiguration(node.getPage().getWebClient()));
443             page_ = (HtmlPage) node.getPage();
444 
445             currentNode_ = node;
446             for (final Node ancestor : currentNode_.getAncestors()) {
447                 stack_.push((DomNode) ancestor);
448             }
449 
450             final WebClient webClient = page_.getWebClient();
451             final HTMLParserListener listener = webClient.getHTMLParserListener();
452             final boolean reportErrors;
453             if (listener != null) {
454                 reportErrors = true;
455                 fConfiguration.setErrorHandler(new HTMLErrorHandler(listener, url, htmlContent));
456             }
457             else {
458                 reportErrors = false;
459             }
460 
461             try {
462                 setFeature(FEATURE_AUGMENTATIONS, true);
463                 setProperty("http://cyberneko.org/html/properties/names/elems", "default");
464                 if (!webClient.getBrowserVersion().hasFeature(HTML_ATTRIBUTE_LOWER_CASE)) {
465                     setProperty("http://cyberneko.org/html/properties/names/attrs", "no-change");
466                 }
467                 setFeature("http://cyberneko.org/html/features/report-errors", reportErrors);
468                 setFeature(FEATURE_PARSE_NOSCRIPT, !webClient.getOptions().isJavaScriptEnabled());
469                 setFeature(HTMLScanner.ALLOW_SELFCLOSING_IFRAME, false);
470 
471                 setContentHandler(this);
472                 setLexicalHandler(this); //comments and CDATA
473             }
474             catch (final SAXException e) {
475                 throw new ObjectInstantiationException("unable to create HTML parser", e);
476             }
477             initialSize_ = stack_.size();
478         }
479 
480         /**
481          * Create the configuration depending on the simulated browser
482          * @param webClient the current WebClient
483          * @return the configuration
484          */
485         private static XMLParserConfiguration createConfiguration(final WebClient webClient) {
486             final HTMLConfiguration configuration = new HTMLConfiguration();
487             configuration.htmlElements_.setElement(new HTMLElements.Element(HTMLElements.AREA, "AREA",
488                         HTMLElements.Element.EMPTY, HTMLElements.HEAD, null));
489             final BrowserVersion browserVersion = webClient.getBrowserVersion();
490             if (browserVersion.isChrome()) {
491                 configuration.htmlElements_.setElement(new HTMLElements.Element(HTMLElements.COMMAND, "COMMAND",
492                         HTMLElements.Element.EMPTY, HTMLElements.HEAD, null));
493                 configuration.htmlElements_.setElement(new HTMLElements.Element(HTMLElements.ISINDEX, "ISINDEX",
494                         HTMLElements.Element.INLINE, HTMLElements.BODY, null));
495             }
496             else if (browserVersion.isIE()) {
497                 configuration.htmlElements_.setElement(new HTMLElements.Element(HTMLElements.COMMAND, "COMMAND",
498                         HTMLElements.Element.EMPTY, HTMLElements.HEAD, null));
499                 configuration.htmlElements_.setElement(new HTMLElements.Element(HTMLElements.MAIN, "MAIN",
500                         HTMLElements.Element.INLINE, HTMLElements.BODY, null));
501             }
502 
503             return configuration;
504         }
505 
506         /**
507          * @return the document locator
508          */
509         public Locator getLocator() {
510             return locator_;
511         }
512 
513         /** {@inheritDoc ContentHandler#setDocumentLocator} */
514         @Override
515         public void setDocumentLocator(final Locator locator) {
516             locator_ = locator;
517         }
518 
519         /** {@inheritDoc ContentHandler#startDocument()} */
520         @Override
521         public void startDocument() throws SAXException {
522         }
523 
524         /** {@inheritDoc} */
525         @Override
526         public void startElement(final QName element, final XMLAttributes attributes, final Augmentations augs)
527             throws XNIException {
528             // augs might change so we store only the interesting part
529             lastTagWasSynthesized_ = isSynthesized(augs);
530             super.startElement(element, attributes, augs);
531         }
532 
533         /** {@inheritDoc ContentHandler#startElement(String,String,String,Attributes)} */
534         @Override
535         public void startElement(String namespaceURI, final String localName, String qName, final Attributes atts)
536             throws SAXException {
537 
538             if (snippetStartNodeOverwritten_) {
539                 snippetStartNodeOverwritten_ = false;
540                 return;
541             }
542             handleCharacters();
543 
544             String tagLower = localName.toLowerCase(Locale.ROOT);
545             if (page_.isParsingHtmlSnippet() && ("html".equals(tagLower) || "body".equals(tagLower))) {
546                 return;
547             }
548 
549             if (namespaceURI != null) {
550                 namespaceURI = namespaceURI.trim();
551             }
552             if ("head".equals(tagLower)) {
553                 if (headParsed_ == HeadParsed.YES || page_.isParsingHtmlSnippet()) {
554                     return;
555                 }
556 
557                 headParsed_ = lastTagWasSynthesized_ ? HeadParsed.SYNTHESIZED : HeadParsed.YES;
558             }
559             // add a head if none was there
560             else if (headParsed_ == HeadParsed.NO && ("body".equals(tagLower) || "frameset".equals(tagLower))) {
561                 final ElementFactory factory = getElementFactory(page_, namespaceURI, "head", true, insideSvg_);
562                 final DomElement newElement = factory.createElement(page_, "head", null);
563                 currentNode_.appendChild(newElement);
564                 headParsed_ = HeadParsed.SYNTHESIZED;
565             }
566 
567             // If we're adding a body element, keep track of any temporary synthetic ones
568             // that we may have had to create earlier (for document.write(), for example).
569             HtmlBody oldBody = null;
570             if ("body".equals(qName) && page_.getBody() instanceof HtmlBody) {
571                 oldBody = (HtmlBody) page_.getBody();
572             }
573 
574             // Need to reset this at each starting form tag because it could be set from a synthesized
575             // end tag.
576             if ("form".equals(tagLower)) {
577                 formWaitingForLostChildren_ = null;
578             }
579 
580             // Add the new node.
581             if (!(page_ instanceof XHtmlPage) && XHTML_NAMESPACE.equals(namespaceURI)) {
582                 namespaceURI = null;
583             }
584 
585             final boolean keyGenAsSelect = "keygen".equals(tagLower) && page_.hasFeature(KEYGEN_AS_SELECT);
586             if (keyGenAsSelect) {
587                 tagLower = "select";
588                 qName = "select";
589             }
590 
591             final ElementFactory factory = getElementFactory(page_, namespaceURI, qName, isInsideHtml(), insideSvg_);
592             if (factory == SVG_FACTORY) {
593                 namespaceURI = SVG_NAMESPACE;
594             }
595             final DomElement newElement = factory.createElementNS(page_, namespaceURI, qName, atts, true);
596             newElement.setStartLocation(locator_.getLineNumber(), locator_.getColumnNumber());
597 
598             // parse can't replace everything as it does not buffer elements while parsing
599             addNodeToRightParent(currentNode_, newElement);
600 
601             if ("svg".equals(tagLower)) {
602                 insideSvg_ = true;
603             }
604 
605             // If we had an old synthetic body and we just added a real body element, quietly
606             // remove the old body and move its children to the real body element we just added.
607             if (oldBody != null) {
608                 oldBody.quietlyRemoveAndMoveChildrenTo(newElement);
609             }
610 
611             if ("body".equals(tagLower)) {
612                 body_ = (HtmlElement) newElement;
613             }
614             else if ("meta".equals(tagLower) && page_.hasFeature(META_X_UA_COMPATIBLE)) {
615                 final HtmlMeta meta = (HtmlMeta) newElement;
616                 if ("X-UA-Compatible".equals(meta.getHttpEquivAttribute())) {
617                     final String content = meta.getContentAttribute();
618                     if (content.startsWith("IE=")) {
619                         final String mode = content.substring(3).trim();
620                         final int version = page_.getWebClient().getBrowserVersion().getBrowserVersionNumeric();
621                         if ("edge".equals(mode)) {
622                             ((HTMLDocument) page_.getScriptableObject()).forceDocumentMode(version);
623                         }
624                         else {
625                             try {
626                                 int value = Integer.parseInt(mode);
627                                 if (value > version) {
628                                     value = version;
629                                 }
630                                 ((HTMLDocument) page_.getScriptableObject()).forceDocumentMode(value);
631                             }
632                             catch (final Exception e) {
633                                 // ignore
634                             }
635                         }
636                     }
637                 }
638             }
639             if (keyGenAsSelect) {
640                 DomElement option = factory.createElementNS(page_, namespaceURI, "option", null, true);
641                 option.appendChild(new DomText(page_, "High Grade"));
642                 newElement.appendChild(option);
643 
644                 option = factory.createElementNS(page_, namespaceURI, "option", null, true);
645                 option.appendChild(new DomText(page_, "Medium Grade"));
646                 newElement.appendChild(option);
647             }
648             currentNode_ = newElement;
649             stack_.push(currentNode_);
650         }
651 
652         /**
653          * Adds the new node to the right parent that is not necessary the currentNode in case of
654          * malformed HTML code. The method tries to emulate the behavior of Firefox.
655          */
656         private void addNodeToRightParent(final DomNode currentNode, final DomElement newElement) {
657             final String currentNodeName = currentNode.getNodeName();
658             final String newNodeName = newElement.getNodeName();
659 
660             DomNode parent = currentNode;
661 
662             // If the new node is a table element and the current node isn't one search the stack for the
663             // correct parent.
664             if ("tr".equals(newNodeName) && !isTableChild(currentNodeName)) {
665                 parent = findElementOnStack("tbody", "thead", "tfoot");
666             }
667             else if (isTableChild(newNodeName) && !"table".equals(currentNodeName)) {
668                 parent = findElementOnStack("table");
669             }
670             else if (isTableCell(newNodeName) && !"tr".equals(currentNodeName)) {
671                 parent = findElementOnStack("tr");
672             }
673 
674             // If the parent changed and the old parent was a form it is now waiting for lost children.
675             if (parent != currentNode && "form".equals(currentNodeName)) {
676                 formWaitingForLostChildren_ = (HtmlForm) currentNode;
677             }
678 
679             final String parentNodeName = parent.getNodeName();
680 
681             if (("table".equals(parentNodeName) && !isTableChild(newNodeName))
682                     || (isTableChild(parentNodeName) && !"caption".equals(parentNodeName)
683                             && !"colgroup".equals(parentNodeName) && !"tr".equals(newNodeName))
684                     || ("colgroup".equals(parentNodeName) && !"col".equals(newNodeName))
685                     || ("tr".equals(parentNodeName) && !isTableCell(newNodeName))) {
686                 // If its a form or submittable just add it even though the resulting DOM is incorrect.
687                 // Otherwise insert the element before the table.
688                 if ("form".equals(newNodeName)) {
689                     formWaitingForLostChildren_ = (HtmlForm) newElement;
690                     parent.appendChild(newElement);
691                 }
692                 else if (newElement instanceof SubmittableElement) {
693                     if (formWaitingForLostChildren_ != null) {
694                         formWaitingForLostChildren_.addLostChild((HtmlElement) newElement);
695                     }
696                     parent.appendChild(newElement);
697                 }
698                 else {
699                     parent = findElementOnStack("table");
700                     parent.insertBefore(newElement);
701                 }
702             }
703             else if (formWaitingForLostChildren_ != null && "form".equals(parentNodeName)) {
704                 // Do not append any children to invalid form. Submittable are inserted after the form,
705                 // everything else before the table.
706                 if (newElement instanceof SubmittableElement) {
707                     formWaitingForLostChildren_.addLostChild((HtmlElement) newElement);
708                     parent.getParentNode().appendChild(newElement);
709                 }
710                 else {
711                     parent = findElementOnStack("table");
712                     parent.insertBefore(newElement);
713                 }
714             }
715             else if (formWaitingForLostChildren_ != null && newElement instanceof SubmittableElement) {
716                 formWaitingForLostChildren_.addLostChild((HtmlElement) newElement);
717                 parent.appendChild(newElement);
718             }
719             else {
720                 parent.appendChild(newElement);
721             }
722         }
723 
724         private DomNode findElementOnStack(final String... searchedElementNames) {
725             DomNode searchedNode = null;
726             for (final DomNode node : stack_) {
727                 if (ArrayUtils.contains(searchedElementNames, node.getNodeName())) {
728                     searchedNode = node;
729                     break;
730                 }
731             }
732 
733             if (searchedNode == null) {
734                 searchedNode = stack_.peek(); // this is surely wrong but at least it won't throw a NPE
735             }
736 
737             return searchedNode;
738         }
739 
740         private static boolean isTableChild(final String nodeName) {
741             return "thead".equals(nodeName) || "tbody".equals(nodeName)
742                     || "tfoot".equals(nodeName) || "caption".equals(nodeName)
743                     || "colgroup".equals(nodeName);
744         }
745 
746         private static boolean isTableCell(final String nodeName) {
747             return "td".equals(nodeName) || "th".equals(nodeName);
748         }
749 
750         /** {@inheritDoc} */
751         @Override
752         public void endElement(final QName element, final Augmentations augs)
753             throws XNIException {
754             // augs might change so we store only the interesting part
755             lastTagWasSynthesized_ = isSynthesized(augs);
756             super.endElement(element, augs);
757         }
758 
759         /** {@inheritDoc ContentHandler@endElement(String,String,String)} */
760         @Override
761         public void endElement(final String namespaceURI, final String localName, final String qName)
762             throws SAXException {
763 
764             handleCharacters();
765 
766             final String tagLower = localName.toLowerCase(Locale.ROOT);
767 
768             if (page_.isParsingHtmlSnippet()) {
769                 if ("html".equals(tagLower) || "body".equals(tagLower)) {
770                     return;
771                 }
772                 if (stack_.size() == initialSize_) {
773                     snippetStartNodeOverwritten_ = true;
774                     return;
775                 }
776             }
777 
778             if ("svg".equals(tagLower)) {
779                 insideSvg_ = false;
780             }
781 
782             // Need to reset this at each closing form tag because a valid form could start afterwards.
783             if ("form".equals(tagLower)) {
784                 formWaitingForLostChildren_ = null;
785             }
786 
787             final DomNode previousNode = stack_.pop(); //remove currentElement from stack
788             previousNode.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
789 
790             // special handling for form lost children (malformed HTML code where </form> is synthesized)
791             if (previousNode instanceof HtmlForm && lastTagWasSynthesized_) {
792                 formWaitingForLostChildren_ = (HtmlForm) previousNode;
793             }
794 
795             if (!stack_.isEmpty()) {
796                 currentNode_ = stack_.peek();
797             }
798 
799             final boolean postponed = page_.isParsingInlineHtmlSnippet();
800             previousNode.onAllChildrenAddedToPage(postponed);
801         }
802 
803         /** {@inheritDoc} */
804         @Override
805         public void characters(final char[] ch, final int start, final int length) throws SAXException {
806             if (characters_ == null) {
807                 characters_ = new StringBuilder();
808             }
809             characters_.append(ch, start, length);
810         }
811 
812         /** {@inheritDoc} */
813         @Override
814         public void ignorableWhitespace(final char[] ch, final int start, final int length) throws SAXException {
815             if (characters_ == null) {
816                 characters_ = new StringBuilder();
817             }
818             characters_.append(ch, start, length);
819         }
820 
821         /**
822          * Picks up the character data accumulated so far and add it to the current element as a text node.
823          */
824         private void handleCharacters() {
825             if (characters_ != null && characters_.length() != 0) {
826                 if (currentNode_ instanceof HtmlHtml) {
827                     // In HTML, the <html> node only has two possible children:
828                     // the <head> and the <body>; any text is ignored.
829                     characters_.setLength(0);
830                 }
831                 else {
832                     // Use the normal behavior: append a text node for the accumulated text.
833                     final String textValue = characters_.toString();
834                     final DomText text = new DomText(page_, textValue);
835                     characters_.setLength(0);
836 
837                     if (StringUtils.isNotBlank(textValue)) {
838                         // malformed HTML: </td>some text</tr> => text comes before the table
839                         if (currentNode_ instanceof HtmlTableRow) {
840                             final HtmlTableRow row = (HtmlTableRow) currentNode_;
841                             final HtmlTable enclosingTable = row.getEnclosingTable();
842                             if (enclosingTable != null) { // may be null when called from Range.createContextualFragment
843                                 if (enclosingTable.getPreviousSibling() instanceof DomText) {
844                                     final DomText domText = (DomText) enclosingTable.getPreviousSibling();
845                                     domText.setTextContent(domText + textValue);
846                                 }
847                                 else {
848                                     enclosingTable.insertBefore(text);
849                                 }
850                             }
851                         }
852                         else if (currentNode_ instanceof HtmlTable) {
853                             final HtmlTable enclosingTable = (HtmlTable) currentNode_;
854                             if (enclosingTable.getPreviousSibling() instanceof DomText) {
855                                 final DomText domText = (DomText) enclosingTable.getPreviousSibling();
856                                 domText.setTextContent(domText + textValue);
857                             }
858                             else {
859                                 enclosingTable.insertBefore(text);
860                             }
861                         }
862                         else if (currentNode_ instanceof HtmlImage) {
863                             currentNode_.setNextSibling(text);
864                         }
865                         else {
866                             currentNode_.appendChild(text);
867                         }
868                     }
869                     else {
870                         currentNode_.appendChild(text);
871                     }
872                 }
873             }
874         }
875 
876         /** {@inheritDoc} */
877         @Override
878         public void endDocument() throws SAXException {
879             handleCharacters();
880             final DomNode currentPage = page_;
881             currentPage.setEndLocation(locator_.getLineNumber(), locator_.getColumnNumber());
882         }
883 
884         /** {@inheritDoc} */
885         @Override
886         public void startPrefixMapping(final String prefix, final String uri) throws SAXException {
887         }
888 
889         /** {@inheritDoc} */
890         @Override
891         public void endPrefixMapping(final String prefix) throws SAXException {
892         }
893 
894         /** {@inheritDoc} */
895         @Override
896         public void processingInstruction(final String target, final String data) throws SAXException {
897         }
898 
899         /** {@inheritDoc} */
900         @Override
901         public void skippedEntity(final String name) throws SAXException {
902         }
903 
904         // LexicalHandler methods
905 
906         /** {@inheritDoc} */
907         @Override
908         public void comment(final char[] ch, final int start, final int length) {
909             handleCharacters();
910             final String data = new String(ch, start, length);
911             final DomComment comment = new DomComment(page_, data);
912             currentNode_.appendChild(comment);
913         }
914 
915         /** {@inheritDoc} */
916         @Override
917         public void endCDATA() {
918         }
919 
920         /** {@inheritDoc} */
921         @Override
922         public void endDTD() {
923         }
924 
925         /** {@inheritDoc} */
926         @Override
927         public void endEntity(final String name) {
928         }
929 
930         /** {@inheritDoc} */
931         @Override
932         public void startCDATA() {
933         }
934 
935         /** {@inheritDoc} */
936         @Override
937         public void startDTD(final String name, final String publicId, final String systemId) {
938             final DomDocumentType type = new DomDocumentType(page_, name, publicId, systemId);
939             page_.setDocumentType(type);
940 
941             final Node child;
942             child = type;
943             page_.appendChild(child);
944         }
945 
946         /** {@inheritDoc} */
947         @Override
948         public void startEntity(final String name) {
949         }
950 
951         /**
952          * {@inheritDoc}
953          */
954         @Override
955         public void ignoredEndElement(final QName element, final Augmentations augs) {
956             // if real </form> is reached, don't accept fields anymore as lost children
957             if ("form".equals(element.localpart)) {
958                 formWaitingForLostChildren_ = null;
959             }
960         }
961 
962         /**
963          * {@inheritDoc}
964          */
965         @Override
966         public void ignoredStartElement(final QName elem, final XMLAttributes attrs, final Augmentations augs) {
967             // when multiple body elements are encountered, the attributes of the discarded
968             // elements are used when not previously defined
969             if (body_ != null && "body".equalsIgnoreCase(elem.localpart) && attrs != null) {
970                 copyAttributes(body_, attrs);
971             }
972             if (body_ != null && "html".equalsIgnoreCase(elem.localpart) && attrs != null) {
973                 copyAttributes((DomElement) body_.getParentNode(), attrs);
974             }
975         }
976 
977         private static void copyAttributes(final DomElement to, final XMLAttributes attrs) {
978             final int length = attrs.getLength();
979             for (int i = 0; i < length; i++) {
980                 final String attrName = attrs.getLocalName(i).toLowerCase(Locale.ROOT);
981                 if (to.getAttributes().getNamedItem(attrName) == null) {
982                     to.setAttribute(attrName, attrs.getValue(i));
983                     if (attrName.startsWith("on") && to.getScriptableObject() instanceof HTMLBodyElement) {
984                         final HTMLBodyElement jsBody = (HTMLBodyElement) to.getScriptableObject();
985                         jsBody.createEventHandlerFromAttribute(attrName, attrs.getValue(i));
986                     }
987                 }
988             }
989         }
990 
991         /**
992          * {@inheritDoc}
993          */
994         @Override
995         public void parse(final XMLInputSource inputSource) throws XNIException, IOException {
996             final HtmlUnitDOMBuilder oldBuilder = page_.getBuilder();
997             page_.setBuilder(this);
998             try {
999                 super.parse(inputSource);
1000             }
1001             finally {
1002                 page_.setBuilder(oldBuilder);
1003             }
1004         }
1005 
1006         private static boolean isSynthesized(final Augmentations augs) {
1007             final HTMLEventInfo info = (augs == null) ? null
1008                     : (HTMLEventInfo) augs.getItem(FEATURE_AUGMENTATIONS);
1009             return info != null && info.isSynthesized();
1010         }
1011 
1012         private boolean isInsideHtml() {
1013             boolean html = true;
1014             for (DomNode node = currentNode_; node != null; node = node.getParentNode()) {
1015                 if (!(node instanceof HtmlElement)) {
1016                     html = false;
1017                 }
1018             }
1019             return html;
1020         }
1021     }
1022 }
1023 
1024 /**
1025  * Utility to transmit parsing errors to a {@link HTMLParserListener}.
1026  */
1027 class HTMLErrorHandler extends DefaultErrorHandler {
1028     private final HTMLParserListener listener_;
1029     private final URL url_;
1030     private String html_;
1031 
1032     HTMLErrorHandler(final HTMLParserListener listener, final URL url, final String htmlContent) {
1033         WebAssert.notNull("listener", listener);
1034         WebAssert.notNull("url", url);
1035         listener_ = listener;
1036         url_ = url;
1037         html_ = htmlContent;
1038     }
1039 
1040     /** @see DefaultErrorHandler#error(String,String,XMLParseException) */
1041     @Override
1042     public void error(final String domain, final String key,
1043             final XMLParseException exception) throws XNIException {
1044         listener_.error(exception.getMessage(),
1045                 url_,
1046                 html_,
1047                 exception.getLineNumber(),
1048                 exception.getColumnNumber(),
1049                 key);
1050     }
1051 
1052     /** @see DefaultErrorHandler#warning(String,String,XMLParseException) */
1053     @Override
1054     public void warning(final String domain, final String key,
1055             final XMLParseException exception) throws XNIException {
1056         listener_.warning(exception.getMessage(),
1057                 url_,
1058                 html_,
1059                 exception.getLineNumber(),
1060                 exception.getColumnNumber(),
1061                 key);
1062     }
1063 }