View Javadoc
1   /*
2    * Copyright (c) 2002-2017 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package com.gargoylesoftware.htmlunit;
16  
17  import java.io.IOException;
18  import java.io.InputStream;
19  import java.io.Serializable;
20  import java.util.Locale;
21  
22  import org.apache.commons.io.IOUtils;
23  import org.apache.commons.lang3.ArrayUtils;
24  import org.apache.commons.lang3.StringUtils;
25  
26  import com.gargoylesoftware.htmlunit.html.DomElement;
27  import com.gargoylesoftware.htmlunit.html.HTMLParser;
28  import com.gargoylesoftware.htmlunit.html.HtmlPage;
29  import com.gargoylesoftware.htmlunit.html.XHtmlPage;
30  import com.gargoylesoftware.htmlunit.xml.XmlPage;
31  
32  /**
33   * The default implementation of {@link PageCreator}. Designed to be extended for easier handling of new content
34   * types. Just check the content type in <tt>createPage()</tt> and call <tt>super(createPage())</tt> if your custom
35   * type isn't found. There are also protected <tt>createXXXXPage()</tt> methods for creating the {@link Page} types
36   * which HtmlUnit already knows about for your custom content types.
37   *
38   * <p>
39   * The following table shows the type of {@link Page} created depending on the content type:<br>
40   * <br>
41   *  <table border="1" width="50%" summary="Page Types">
42   *    <tr>
43   *      <th>Content type</th>
44   *      <th>Type of page</th>
45   *    </tr>
46   *    <tr>
47   *      <td>text/html<br>
48   *          text/javascript</td>
49   *      <td>{@link HtmlPage}</td>
50   *    </tr>
51   *    <tr>
52   *      <td>text/xml<br>
53   *      application/xml<br>
54   *      text/vnd.wap.wml<br>
55   *      *+xml
56   *      </td>
57   *      <td>{@link XmlPage}, or an {@link XHtmlPage} if an XHTML namespace is used</td>
58   *    </tr>
59   *    <tr>
60   *      <td>text/*</td>
61   *      <td>{@link TextPage}</td>
62   *    </tr>
63   *    <tr>
64   *      <td>Anything else</td>
65   *      <td>{@link UnexpectedPage}</td>
66   *    </tr>
67   *  </table>
68   *
69   * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a>
70   * @author <a href="mailto:cse@dynabean.de">Christian Sell</a>
71   * @author <a href="mailto:yourgod@users.sourceforge.net">Brad Clarke</a>
72   * @author Marc Guillemot
73   * @author Ahmed Ashour
74   * @author Daniel Gredler
75   * @author Ronald Brill
76   */
77  public class DefaultPageCreator implements PageCreator, Serializable {
78  
79      private static final byte[] markerUTF8_ = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
80      private static final byte[] markerUTF16BE_ = {(byte) 0xfe, (byte) 0xff};
81      private static final byte[] markerUTF16LE_ = {(byte) 0xff, (byte) 0xfe};
82  
83      /**
84       * The different supported page types.
85       */
86      public enum PageType {
87          /** html. */
88          HTML,
89          /** javascript. */
90          JAVASCRIPT,
91          /** xml. */
92          XML,
93          /** text. */
94          TEXT,
95          /** unknown. */
96          UNKNOWN
97      }
98  
99      /**
100      * Determines the kind of page to create from the content type.
101      * @param contentType the content type to evaluate
102      * @return "xml", "html", "javascript", "text" or "unknown"
103      */
104     public static PageType determinePageType(final String contentType) {
105         if (null == contentType) {
106             return PageType.UNKNOWN;
107         }
108 
109         switch (contentType) {
110             case "text/html":
111             case "image/svg+xml":
112                 return PageType.HTML;
113 
114             case "text/javascript":
115             case "application/x-javascript":
116             case "application/javascript":
117                 return PageType.JAVASCRIPT;
118 
119             case "text/xml":
120             case "application/xml":
121             case "text/vnd.wap.wml":
122                 return PageType.XML;
123 
124             default:
125                 if (contentType.endsWith("+xml")) {
126                     return PageType.XML;
127                 }
128 
129                 if (contentType.startsWith("text/")) {
130                     return PageType.TEXT;
131                 }
132 
133                 return PageType.UNKNOWN;
134         }
135     }
136 
137     /**
138      * Creates an instance.
139      */
140     public DefaultPageCreator() {
141         // Empty.
142     }
143 
144     /**
145      * Create a Page object for the specified web response.
146      *
147      * @param webResponse the response from the server
148      * @param webWindow the window that this page will be loaded into
149      * @exception IOException if an IO problem occurs
150      * @return the new page object
151      */
152     @Override
153     public Page createPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
154         final String contentType = determineContentType(webResponse.getContentType().toLowerCase(Locale.ROOT),
155             webResponse.getContentAsStream());
156 
157         final PageType pageType = determinePageType(contentType);
158         switch (pageType) {
159             case HTML:
160                 return createHtmlPage(webResponse, webWindow);
161 
162             case JAVASCRIPT:
163                 return createHtmlPage(webResponse, webWindow);
164 
165             case XML:
166                 final SgmlPage sgmlPage = createXmlPage(webResponse, webWindow);
167                 final DomElement doc = sgmlPage.getDocumentElement();
168                 if (doc != null && HTMLParser.XHTML_NAMESPACE.equals(doc.getNamespaceURI())) {
169                     return createXHtmlPage(webResponse, webWindow);
170                 }
171                 return sgmlPage;
172 
173             case TEXT:
174                 return createTextPage(webResponse, webWindow);
175 
176             default:
177                 return createUnexpectedPage(webResponse, webWindow);
178         }
179     }
180 
181     /**
182      * Tries to determine the content type.
183      * TODO: implement a content type sniffer based on the
184      * <a href="http://tools.ietf.org/html/draft-abarth-mime-sniff-05">Content-Type Processing Model</a>
185      * @param contentType the contentType header if any
186      * @param contentAsStream stream allowing to read the downloaded content
187      * @return the sniffed mime type
188      * @exception IOException if an IO problem occurs
189      */
190     protected String determineContentType(final String contentType, final InputStream contentAsStream)
191         throws IOException {
192 
193         try {
194             if (!StringUtils.isEmpty(contentType)) {
195                 return contentType;
196             }
197 
198             final byte[] bytes = read(contentAsStream, 500);
199             if (bytes.length == 0) {
200                 return "text/plain";
201             }
202 
203             final String asAsciiString = new String(bytes, "ASCII").toUpperCase(Locale.ROOT);
204             if (asAsciiString.contains("<HTML")) {
205                 return "text/html";
206             }
207             else if (startsWith(bytes, markerUTF8_) || startsWith(bytes, markerUTF16BE_)
208                     || startsWith(bytes, markerUTF16LE_)) {
209                 return "text/plain";
210             }
211             else if (asAsciiString.trim().startsWith("<SCRIPT>")) {
212                 return "application/javascript";
213             }
214             else if (isBinary(bytes)) {
215                 return "application/octet-stream";
216             }
217         }
218         finally {
219             IOUtils.closeQuietly(contentAsStream);
220         }
221         return "text/plain";
222     }
223 
224     /**
225      * See http://tools.ietf.org/html/draft-abarth-mime-sniff-05#section-4
226      * @param bytes the bytes to check
227      */
228     private static boolean isBinary(final byte[] bytes) {
229         for (byte b : bytes) {
230             if (b < 0x08
231                 || b == 0x0B
232                 || (b >= 0x0E && b <= 0x1A)
233                 || (b >= 0x1C && b <= 0x1F)) {
234                 return true;
235             }
236         }
237         return false;
238     }
239 
240     private static boolean startsWith(final byte[] bytes, final byte[] lookFor) {
241         if (bytes.length < lookFor.length) {
242             return false;
243         }
244 
245         for (int i = 0; i < lookFor.length; i++) {
246             if (bytes[i] != lookFor[i]) {
247                 return false;
248             }
249         }
250 
251         return true;
252     }
253 
254     private static byte[] read(final InputStream stream, final int maxNb) throws IOException {
255         final byte[] buffer = new byte[maxNb];
256         final int nbRead = stream.read(buffer);
257         if (nbRead == buffer.length) {
258             return buffer;
259         }
260         return ArrayUtils.subarray(buffer, 0, nbRead);
261     }
262 
263     /**
264      * Creates an HtmlPage for this WebResponse.
265      *
266      * @param webResponse the page's source
267      * @param webWindow the WebWindow to place the HtmlPage in
268      * @return the newly created HtmlPage
269      * @throws IOException if the page could not be created
270      */
271     protected HtmlPage createHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
272         return HTMLParser.parseHtml(webResponse, webWindow);
273     }
274 
275     /**
276      * Creates an XHtmlPage for this WebResponse.
277      *
278      * @param webResponse the page's source
279      * @param webWindow the WebWindow to place the HtmlPage in
280      * @return the newly created XHtmlPage
281      * @throws IOException if the page could not be created
282      */
283     protected XHtmlPage createXHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
284         return HTMLParser.parseXHtml(webResponse, webWindow);
285     }
286 
287     /**
288      * Creates a TextPage for this WebResponse.
289      *
290      * @param webResponse the page's source
291      * @param webWindow the WebWindow to place the TextPage in
292      * @return the newly created TextPage
293      */
294     protected TextPage createTextPage(final WebResponse webResponse, final WebWindow webWindow) {
295         final TextPage newPage = new TextPage(webResponse, webWindow);
296         webWindow.setEnclosedPage(newPage);
297         return newPage;
298     }
299 
300     /**
301      * Creates an UnexpectedPage for this WebResponse.
302      *
303      * @param webResponse the page's source
304      * @param webWindow the WebWindow to place the UnexpectedPage in
305      * @return the newly created UnexpectedPage
306      */
307     protected UnexpectedPage createUnexpectedPage(final WebResponse webResponse, final WebWindow webWindow) {
308         final UnexpectedPage newPage = new UnexpectedPage(webResponse, webWindow);
309         webWindow.setEnclosedPage(newPage);
310         return newPage;
311     }
312 
313     /**
314      * Creates an SgmlPage for this WebResponse.
315      *
316      * @param webResponse the page's source
317      * @param webWindow the WebWindow to place the TextPage in
318      * @return the newly created TextPage
319      * @throws IOException if the page could not be created
320      */
321     protected SgmlPage createXmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
322         final SgmlPage page = new XmlPage(webResponse, webWindow);
323         webWindow.setEnclosedPage(page);
324         return page;
325     }
326 
327 }