View Javadoc
1   /*
2    * Copyright (c) 2002-2017 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package com.gargoylesoftware.htmlunit.xml;
16  
17  import static java.nio.charset.StandardCharsets.UTF_8;
18  
19  import java.io.IOException;
20  import java.io.InputStreamReader;
21  import java.io.Reader;
22  import java.io.StringReader;
23  import java.lang.reflect.Field;
24  import java.nio.charset.Charset;
25  import java.util.ArrayList;
26  import java.util.HashMap;
27  import java.util.LinkedHashMap;
28  import java.util.List;
29  import java.util.Locale;
30  import java.util.Map;
31  
32  import javax.xml.parsers.DocumentBuilder;
33  import javax.xml.parsers.DocumentBuilderFactory;
34  import javax.xml.parsers.ParserConfigurationException;
35  
36  import org.apache.commons.io.input.BOMInputStream;
37  import org.apache.commons.logging.Log;
38  import org.apache.commons.logging.LogFactory;
39  import org.apache.xerces.dom.DeferredDocumentImpl;
40  import org.apache.xerces.dom.DeferredNode;
41  import org.w3c.dom.Attr;
42  import org.w3c.dom.Document;
43  import org.w3c.dom.DocumentType;
44  import org.w3c.dom.NamedNodeMap;
45  import org.w3c.dom.Node;
46  import org.w3c.dom.NodeList;
47  import org.xml.sax.Attributes;
48  import org.xml.sax.EntityResolver;
49  import org.xml.sax.ErrorHandler;
50  import org.xml.sax.InputSource;
51  import org.xml.sax.SAXException;
52  import org.xml.sax.SAXParseException;
53  import org.xml.sax.helpers.AttributesImpl;
54  
55  import com.gargoylesoftware.htmlunit.SgmlPage;
56  import com.gargoylesoftware.htmlunit.WebResponse;
57  import com.gargoylesoftware.htmlunit.html.DomAttr;
58  import com.gargoylesoftware.htmlunit.html.DomCDataSection;
59  import com.gargoylesoftware.htmlunit.html.DomComment;
60  import com.gargoylesoftware.htmlunit.html.DomDocumentType;
61  import com.gargoylesoftware.htmlunit.html.DomElement;
62  import com.gargoylesoftware.htmlunit.html.DomNode;
63  import com.gargoylesoftware.htmlunit.html.DomProcessingInstruction;
64  import com.gargoylesoftware.htmlunit.html.DomText;
65  import com.gargoylesoftware.htmlunit.html.ElementFactory;
66  import com.gargoylesoftware.htmlunit.html.HTMLParser;
67  
68  /**
69   * <span style="color:red">INTERNAL API - SUBJECT TO CHANGE AT ANY TIME - USE AT YOUR OWN RISK.</span><br>
70   *
 * Provides utility methods for working with XML responses.
72   *
73   * @author Marc Guillemot
74   * @author Ahmed Ashour
75   * @author Sudhan Moghe
76   * @author Ronald Brill
77   * @author Chuck Dumont
78   * @author Frank Danek
79   */
80  public final class XmlUtil {
81  
82      /**
83       * Default encoding used.
84       *
85       * @deprecated as of 2.26, use {@link java.nio.charset.StandardCharsets#UTF_8}
86       */
87      @Deprecated
88      public static final Charset DEFAULT_CHARSET = UTF_8;
89  
90      private static final Log LOG = LogFactory.getLog(XmlUtil.class);
91  
92      private static final ErrorHandler DISCARD_MESSAGES_HANDLER = new ErrorHandler() {
93          /**
94           * Does nothing as we're not interested in this.
95           */
96          @Override
97          public void error(final SAXParseException exception) {
98              // Does nothing as we're not interested in this.
99          }
100 
101         /**
102          * Does nothing as we're not interested in this.
103          */
104         @Override
105         public void fatalError(final SAXParseException exception) {
106             // Does nothing as we're not interested in this.
107         }
108 
109         /**
110          * Does nothing as we're not interested in this.
111          */
112         @Override
113         public void warning(final SAXParseException exception) {
114             // Does nothing as we're not interested in this.
115         }
116     };
117 
    /**
     * Utility class, hide constructor: the class is final and only offers static members,
     * so no instance is ever needed.
     */
    private XmlUtil() {
        // Empty.
    }
124 
125     /**
126      * Builds a document from the content of the web response.
127      * A warning is logged if an exception is thrown while parsing the XML content
128      * (for instance when the content is not a valid XML and can't be parsed).
129      *
130      * @param webResponse the response from the server
131      * @throws IOException if the page could not be created
132      * @return the parse result
133      * @throws SAXException if the parsing fails
134      * @throws ParserConfigurationException if a DocumentBuilder cannot be created
135      */
136     public static Document buildDocument(final WebResponse webResponse)
137         throws IOException, SAXException, ParserConfigurationException {
138 
139         final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
140 
141         if (webResponse == null) {
142             return factory.newDocumentBuilder().newDocument();
143         }
144 
145         factory.setNamespaceAware(true);
146         final InputStreamReader reader = new InputStreamReader(
147                 new BOMInputStream(webResponse.getContentAsStream()),
148                 webResponse.getContentCharset());
149 
150         // we have to do the blank input check and the parsing in one step
151         final TrackBlankContentReader tracker = new TrackBlankContentReader(reader);
152 
153         final InputSource source = new InputSource(tracker);
154         final DocumentBuilder builder = factory.newDocumentBuilder();
155         builder.setErrorHandler(DISCARD_MESSAGES_HANDLER);
156         builder.setEntityResolver(new EntityResolver() {
157             @Override
158             public InputSource resolveEntity(final String publicId, final String systemId)
159                 throws SAXException, IOException {
160                 return new InputSource(new StringReader(""));
161             }
162         });
163         try {
164             return builder.parse(source);
165         }
166         catch (final SAXException e) {
167             if (tracker.wasBlank()) {
168                 return factory.newDocumentBuilder().newDocument();
169             }
170             throw e;
171         }
172     }
173 
174     /**
175      * Helper for memory and performance optimization.
176      */
177     private static final class TrackBlankContentReader extends Reader {
178         private Reader reader_;
179         private boolean wasBlank_ = true;
180 
181         TrackBlankContentReader(final Reader characterStream) {
182             reader_ = characterStream;
183         }
184 
185         public boolean wasBlank() {
186             return wasBlank_;
187         }
188 
189         @Override
190         public void close() throws IOException {
191             reader_.close();
192         }
193 
194         @Override
195         public int read(final char[] cbuf, final int off, final int len) throws IOException {
196             final int result = reader_.read(cbuf, off, len);
197 
198             if (wasBlank_ && result > -1) {
199                 for (int i = 0; i < result; i++) {
200                     final char ch = cbuf[off + i];
201                     if (!Character.isWhitespace(ch)) {
202                         wasBlank_ = false;
203                         break;
204                     }
205 
206                 }
207             }
208             return result;
209         }
210     }
211 
212     /**
213      * Recursively appends a {@link Node} child to {@link DomNode} parent.
214      *
215      * @param page the owner page of {@link DomElement}s to be created
216      * @param parent the parent DomNode
217      * @param child the child Node
218      * @param handleXHTMLAsHTML if true elements from the XHTML namespace are handled as HTML elements instead of
219      *     DOM elements
220      */
221     public static void appendChild(final SgmlPage page, final DomNode parent, final Node child,
222         final boolean handleXHTMLAsHTML) {
223         appendChild(page, parent, child, handleXHTMLAsHTML, null);
224     }
225 
226     /**
227      * Recursively appends a {@link Node} child to {@link DomNode} parent.
228      *
229      * @param page the owner page of {@link DomElement}s to be created
230      * @param parent the parent DomNode
231      * @param child the child Node
232      * @param handleXHTMLAsHTML if true elements from the XHTML namespace are handled as HTML elements instead of
233      *     DOM elements
234      * @param attributesOrderMap (optional) the one returned by {@link #getAttributesOrderMap(Document)}
235      */
236     public static void appendChild(final SgmlPage page, final DomNode parent, final Node child,
237         final boolean handleXHTMLAsHTML, final Map<Integer, List<String>> attributesOrderMap) {
238         final DocumentType documentType = child.getOwnerDocument().getDoctype();
239         if (documentType != null && page instanceof XmlPage) {
240             final DomDocumentType domDoctype = new DomDocumentType(
241                     page, documentType.getName(), documentType.getPublicId(), documentType.getSystemId());
242             ((XmlPage) page).setDocumentType(domDoctype);
243         }
244         final DomNode childXml = createFrom(page, child, handleXHTMLAsHTML, attributesOrderMap);
245         parent.appendChild(childXml);
246         copy(page, child, childXml, handleXHTMLAsHTML, attributesOrderMap);
247     }
248 
249     private static DomNode createFrom(final SgmlPage page, final Node source, final boolean handleXHTMLAsHTML,
250             final Map<Integer, List<String>> attributesOrderMap) {
251         if (source.getNodeType() == Node.TEXT_NODE) {
252             return new DomText(page, source.getNodeValue());
253         }
254         if (source.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
255             return new DomProcessingInstruction(page, source.getNodeName(), source.getNodeValue());
256         }
257         if (source.getNodeType() == Node.COMMENT_NODE) {
258             return new DomComment(page, source.getNodeValue());
259         }
260         if (source.getNodeType() == Node.DOCUMENT_TYPE_NODE) {
261             final DocumentType documentType = (DocumentType) source;
262             return new DomDocumentType(page, documentType.getName(), documentType.getPublicId(),
263                     documentType.getSystemId());
264         }
265         final String ns = source.getNamespaceURI();
266         String localName = source.getLocalName();
267         if (handleXHTMLAsHTML && HTMLParser.XHTML_NAMESPACE.equals(ns)) {
268             final ElementFactory factory = HTMLParser.getFactory(localName);
269             return factory.createElementNS(page, ns, localName,
270                     namedNodeMapToSaxAttributes(source.getAttributes(), attributesOrderMap, source));
271         }
272         final NamedNodeMap nodeAttributes = source.getAttributes();
273         if (page != null && page.isHtmlPage()) {
274             localName = localName.toUpperCase(Locale.ROOT);
275         }
276         final String qualifiedName;
277         if (source.getPrefix() == null) {
278             qualifiedName = localName;
279         }
280         else {
281             qualifiedName = source.getPrefix() + ':' + localName;
282         }
283 
284         final String namespaceURI = source.getNamespaceURI();
285         if (HTMLParser.SVG_NAMESPACE.equals(namespaceURI)) {
286             return HTMLParser.SVG_FACTORY.createElementNS(page, namespaceURI, qualifiedName,
287                     namedNodeMapToSaxAttributes(nodeAttributes, attributesOrderMap, source));
288         }
289 
290         final Map<String, DomAttr> attributes = new LinkedHashMap<>();
291         for (int i = 0; i < nodeAttributes.getLength(); i++) {
292             final int orderedIndex = getIndex(nodeAttributes, attributesOrderMap, source, i);
293             final Attr attribute = (Attr) nodeAttributes.item(orderedIndex);
294             final String attributeNamespaceURI = attribute.getNamespaceURI();
295             final String attributeQualifiedName;
296             if (attribute.getPrefix() != null) {
297                 attributeQualifiedName = attribute.getPrefix() + ':' + attribute.getLocalName();
298             }
299             else {
300                 attributeQualifiedName = attribute.getLocalName();
301             }
302             final String value = attribute.getNodeValue();
303             final boolean specified = attribute.getSpecified();
304             final DomAttr xmlAttribute =
305                     new DomAttr(page, attributeNamespaceURI, attributeQualifiedName, value, specified);
306             attributes.put(attribute.getNodeName(), xmlAttribute);
307         }
308         return new DomElement(namespaceURI, qualifiedName, page, attributes);
309     }
310 
311     private static Attributes namedNodeMapToSaxAttributes(final NamedNodeMap attributesMap,
312             final Map<Integer, List<String>> attributesOrderMap, final Node element) {
313         final AttributesImpl attributes = new AttributesImpl();
314         final int length = attributesMap.getLength();
315         for (int i = 0; i < length; i++) {
316             final int orderedIndex = getIndex(attributesMap, attributesOrderMap, element, i);
317             final Node attr = attributesMap.item(orderedIndex);
318             attributes.addAttribute(attr.getNamespaceURI(), attr.getLocalName(),
319                 attr.getNodeName(), null, attr.getNodeValue());
320         }
321 
322         return attributes;
323     }
324 
325     private static int getIndex(final NamedNodeMap namedNodeMap, final Map<Integer, List<String>> attributesOrderMap,
326             final Node element, final int requiredIndex) {
327         if (attributesOrderMap != null && element instanceof DeferredNode) {
328             final int elementIndex = ((DeferredNode) element).getNodeIndex();
329             final List<String> attributesOrderList = attributesOrderMap.get(elementIndex);
330             if (attributesOrderList != null) {
331                 final String attributeName = attributesOrderList.get(requiredIndex);
332                 for (int i = 0; i < namedNodeMap.getLength(); i++) {
333                     if (namedNodeMap.item(i).getNodeName().equals(attributeName)) {
334                         return i;
335                     }
336                 }
337             }
338         }
339         return requiredIndex;
340     }
341 
342     /**
343      * Copy all children from 'source' to 'dest', within the context of the specified page.
344      * @param page the page which the nodes belong to
345      * @param source the node to copy from
346      * @param dest the node to copy to
347      * @param handleXHTMLAsHTML if true elements from the XHTML namespace are handled as HTML elements instead of
348      *     DOM elements
349      */
350     private static void copy(final SgmlPage page, final Node source, final DomNode dest,
351         final boolean handleXHTMLAsHTML, final Map<Integer, List<String>> attributesOrderMap) {
352         final NodeList nodeChildren = source.getChildNodes();
353         for (int i = 0; i < nodeChildren.getLength(); i++) {
354             final Node child = nodeChildren.item(i);
355             switch (child.getNodeType()) {
356                 case Node.ELEMENT_NODE:
357                     final DomNode childXml = createFrom(page, child, handleXHTMLAsHTML, attributesOrderMap);
358                     dest.appendChild(childXml);
359                     copy(page, child, childXml, handleXHTMLAsHTML, attributesOrderMap);
360                     break;
361 
362                 case Node.TEXT_NODE:
363                     dest.appendChild(new DomText(page, child.getNodeValue()));
364                     break;
365 
366                 case Node.CDATA_SECTION_NODE:
367                     dest.appendChild(new DomCDataSection(page, child.getNodeValue()));
368                     break;
369 
370                 case Node.COMMENT_NODE:
371                     dest.appendChild(new DomComment(page, child.getNodeValue()));
372                     break;
373 
374                 case Node.PROCESSING_INSTRUCTION_NODE:
375                     dest.appendChild(new DomProcessingInstruction(page, child.getNodeName(), child.getNodeValue()));
376                     break;
377 
378                 default:
379                     LOG.warn("NodeType " + child.getNodeType()
380                         + " (" + child.getNodeName() + ") is not yet supported.");
381             }
382         }
383     }
384 
385     /**
386      * Search for the namespace URI of the given prefix, starting from the specified element.
387      * The default namespace can be searched for by specifying "" as the prefix.
388      * @param element the element to start searching from
389      * @param prefix the namespace prefix
390      * @return the namespace URI bound to the prefix; or null if there is no such namespace
391      */
392     public static String lookupNamespaceURI(final DomElement element, final String prefix) {
393         String uri = DomElement.ATTRIBUTE_NOT_DEFINED;
394         if (prefix.isEmpty()) {
395             uri = element.getAttribute("xmlns");
396         }
397         else {
398             uri = element.getAttribute("xmlns:" + prefix);
399         }
400         if (uri == DomElement.ATTRIBUTE_NOT_DEFINED) {
401             final DomNode parentNode = element.getParentNode();
402             if (parentNode instanceof DomElement) {
403                 uri = lookupNamespaceURI((DomElement) parentNode, prefix);
404             }
405         }
406         return uri;
407     }
408 
409     /**
410      * Search for the prefix associated with specified namespace URI.
411      * @param element the element to start searching from
412      * @param namespace the namespace prefix
413      * @return the prefix bound to the namespace URI; or null if there is no such namespace
414      */
415     public static String lookupPrefix(final DomElement element, final String namespace) {
416         final Map<String, DomAttr> attributes = element.getAttributesMap();
417         for (final Map.Entry<String, DomAttr> entry : attributes.entrySet()) {
418             final String name = entry.getKey();
419             final DomAttr value = entry.getValue();
420             if (name.startsWith("xmlns:") && value.getValue().equals(namespace)) {
421                 return name.substring(6);
422             }
423         }
424         for (final DomNode child : element.getChildren()) {
425             if (child instanceof DomElement) {
426                 final String prefix = lookupPrefix((DomElement) child, namespace);
427                 if (prefix != null) {
428                     return prefix;
429                 }
430             }
431         }
432         return null;
433     }
434 
435     /**
436      * Returns internal Xerces details about all elements in the specified document.
437      * The id of the returned {@link Map} is the {@code nodeIndex} of an element, and the list
438      * is the array of ordered attributes names.
439      * @param document the document
440      * @return the map of an element index with its ordered attribute names
441      */
442     public static Map<Integer, List<String>> getAttributesOrderMap(final Document document) {
443         final Map<Integer, List<String>> map = new HashMap<>();
444         if (document instanceof DeferredDocumentImpl) {
445             final DeferredDocumentImpl deferredDocument = (DeferredDocumentImpl) document;
446             final int fNodeCount = getPrivate(deferredDocument, "fNodeCount");
447             for (int i = 0; i < fNodeCount; i++) {
448                 final int type = deferredDocument.getNodeType(i, false);
449                 if (type == org.w3c.dom.Node.ELEMENT_NODE) {
450                     int attrIndex = deferredDocument.getNodeExtra(i, false);
451                     final List<String> attributes = new ArrayList<>();
452                     map.put(i, attributes);
453                     while (attrIndex != -1) {
454                         attributes.add(deferredDocument.getNodeName(attrIndex, false));
455                         attrIndex = deferredDocument.getPrevSibling(attrIndex, false);
456                     }
457                 }
458             }
459         }
460         return map;
461     }
462 
463     @SuppressWarnings("unchecked")
464     private static <T> T getPrivate(final Object object, final String fieldName) {
465         try {
466             final Field f = object.getClass().getDeclaredField(fieldName);
467             f.setAccessible(true);
468             return (T) f.get(object);
469         }
470         catch (final Exception e) {
471             throw new RuntimeException(e);
472         }
473     }
474 }