View Javadoc
1   /*
2    * Copyright (c) 2002-2017 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package com.gargoylesoftware.htmlunit;
16  
17  import java.io.IOException;
18  import java.io.InputStream;
19  import java.io.Serializable;
20  import java.util.Locale;
21  
22  import org.apache.commons.lang3.ArrayUtils;
23  import org.apache.commons.lang3.StringUtils;
24  
25  import com.gargoylesoftware.htmlunit.html.DomElement;
26  import com.gargoylesoftware.htmlunit.html.HTMLParser;
27  import com.gargoylesoftware.htmlunit.html.HtmlPage;
28  import com.gargoylesoftware.htmlunit.html.XHtmlPage;
29  import com.gargoylesoftware.htmlunit.xml.XmlPage;
30  
31  /**
32   * The default implementation of {@link PageCreator}. Designed to be extended for easier handling of new content
33   * types. Just check the content type in <tt>createPage()</tt> and call <tt>super(createPage())</tt> if your custom
34   * type isn't found. There are also protected <tt>createXXXXPage()</tt> methods for creating the {@link Page} types
35   * which HtmlUnit already knows about for your custom content types.
36   *
37   * <p>
38   * The following table shows the type of {@link Page} created depending on the content type:<br>
39   * <br>
40   *  <table border="1" width="50%" summary="Page Types">
41   *    <tr>
42   *      <th>Content type</th>
43   *      <th>Type of page</th>
44   *    </tr>
45   *    <tr>
46   *      <td>text/html<br>
47   *          text/javascript</td>
48   *      <td>{@link HtmlPage}</td>
49   *    </tr>
50   *    <tr>
51   *      <td>text/xml<br>
52   *      application/xml<br>
53   *      text/vnd.wap.wml<br>
54   *      *+xml
55   *      </td>
56   *      <td>{@link XmlPage}, or an {@link XHtmlPage} if an XHTML namespace is used</td>
57   *    </tr>
58   *    <tr>
59   *      <td>text/*</td>
60   *      <td>{@link TextPage}</td>
61   *    </tr>
62   *    <tr>
63   *      <td>Anything else</td>
64   *      <td>{@link UnexpectedPage}</td>
65   *    </tr>
66   *  </table>
67   *
68   * @author <a href="mailto:mbowler@GargoyleSoftware.com">Mike Bowler</a>
69   * @author <a href="mailto:cse@dynabean.de">Christian Sell</a>
70   * @author <a href="mailto:yourgod@users.sourceforge.net">Brad Clarke</a>
71   * @author Marc Guillemot
72   * @author Ahmed Ashour
73   * @author Daniel Gredler
74   * @author Ronald Brill
75   */
76  public class DefaultPageCreator implements PageCreator, Serializable {
77  
78      private static final byte[] markerUTF8_ = {(byte) 0xef, (byte) 0xbb, (byte) 0xbf};
79      private static final byte[] markerUTF16BE_ = {(byte) 0xfe, (byte) 0xff};
80      private static final byte[] markerUTF16LE_ = {(byte) 0xff, (byte) 0xfe};
81  
82      /**
83       * The different supported page types.
84       */
85      public enum PageType {
86          /** html. */
87          HTML,
88          /** javascript. */
89          JAVASCRIPT,
90          /** xml. */
91          XML,
92          /** text. */
93          TEXT,
94          /** unknown. */
95          UNKNOWN
96      }
97  
98      /**
99       * Determines the kind of page to create from the content type.
100      * @param contentType the content type to evaluate
101      * @return "xml", "html", "javascript", "text" or "unknown"
102      */
103     public static PageType determinePageType(final String contentType) {
104         if (null == contentType) {
105             return PageType.UNKNOWN;
106         }
107 
108         switch (contentType) {
109             case "text/html":
110             case "image/svg+xml":
111                 return PageType.HTML;
112 
113             case "text/javascript":
114             case "application/x-javascript":
115             case "application/javascript":
116                 return PageType.JAVASCRIPT;
117 
118             case "text/xml":
119             case "application/xml":
120             case "text/vnd.wap.wml":
121                 return PageType.XML;
122 
123             default:
124                 if (contentType.endsWith("+xml")) {
125                     return PageType.XML;
126                 }
127 
128                 if (contentType.startsWith("text/")) {
129                     return PageType.TEXT;
130                 }
131 
132                 return PageType.UNKNOWN;
133         }
134     }
135 
136     /**
137      * Creates an instance.
138      */
139     public DefaultPageCreator() {
140         // Empty.
141     }
142 
143     /**
144      * Create a Page object for the specified web response.
145      *
146      * @param webResponse the response from the server
147      * @param webWindow the window that this page will be loaded into
148      * @exception IOException if an IO problem occurs
149      * @return the new page object
150      */
151     @Override
152     public Page createPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
153         final String contentType = determineContentType(webResponse);
154 
155         final PageType pageType = determinePageType(contentType);
156         switch (pageType) {
157             case HTML:
158                 return createHtmlPage(webResponse, webWindow);
159 
160             case JAVASCRIPT:
161                 return createHtmlPage(webResponse, webWindow);
162 
163             case XML:
164                 final SgmlPage sgmlPage = createXmlPage(webResponse, webWindow);
165                 final DomElement doc = sgmlPage.getDocumentElement();
166                 if (doc != null && HTMLParser.XHTML_NAMESPACE.equals(doc.getNamespaceURI())) {
167                     return createXHtmlPage(webResponse, webWindow);
168                 }
169                 return sgmlPage;
170 
171             case TEXT:
172                 return createTextPage(webResponse, webWindow);
173 
174             default:
175                 return createUnexpectedPage(webResponse, webWindow);
176         }
177     }
178 
179     /**
180      * Tries to determine the content type.
181      * TODO: implement a content type sniffer based on the
182      * <a href="http://tools.ietf.org/html/draft-abarth-mime-sniff-05">Content-Type Processing Model</a>
183      * @param webResponse the response from the server
184      * @return the sniffed mime type
185      * @exception IOException if an IO problem occurs
186      */
187     private String determineContentType(final WebResponse webResponse)
188         throws IOException {
189 
190         final String contentType = webResponse.getContentType();
191         if (!StringUtils.isEmpty(contentType)) {
192             return contentType.toLowerCase(Locale.ROOT);
193         }
194 
195         try (InputStream contentAsStream = webResponse.getContentAsStream()) {
196             final byte[] bytes = read(contentAsStream, 500);
197             if (bytes.length == 0) {
198                 return "text/plain";
199             }
200 
201             final String asAsciiString = new String(bytes, "ASCII").toUpperCase(Locale.ROOT);
202             if (asAsciiString.contains("<HTML")) {
203                 return "text/html";
204             }
205             else if (startsWith(bytes, markerUTF8_) || startsWith(bytes, markerUTF16BE_)
206                     || startsWith(bytes, markerUTF16LE_)) {
207                 return "text/plain";
208             }
209             else if (asAsciiString.trim().startsWith("<SCRIPT>")) {
210                 return "application/javascript";
211             }
212             else if (isBinary(bytes)) {
213                 return "application/octet-stream";
214             }
215         }
216         return "text/plain";
217     }
218 
219     /**
220      * See http://tools.ietf.org/html/draft-abarth-mime-sniff-05#section-4
221      * @param bytes the bytes to check
222      */
223     private static boolean isBinary(final byte[] bytes) {
224         for (byte b : bytes) {
225             if (b < 0x08
226                 || b == 0x0B
227                 || (b >= 0x0E && b <= 0x1A)
228                 || (b >= 0x1C && b <= 0x1F)) {
229                 return true;
230             }
231         }
232         return false;
233     }
234 
235     private static boolean startsWith(final byte[] bytes, final byte[] lookFor) {
236         if (bytes.length < lookFor.length) {
237             return false;
238         }
239 
240         for (int i = 0; i < lookFor.length; i++) {
241             if (bytes[i] != lookFor[i]) {
242                 return false;
243             }
244         }
245 
246         return true;
247     }
248 
249     private static byte[] read(final InputStream stream, final int maxNb) throws IOException {
250         final byte[] buffer = new byte[maxNb];
251         final int nbRead = stream.read(buffer);
252         if (nbRead == buffer.length) {
253             return buffer;
254         }
255         return ArrayUtils.subarray(buffer, 0, nbRead);
256     }
257 
258     /**
259      * Creates an HtmlPage for this WebResponse.
260      *
261      * @param webResponse the page's source
262      * @param webWindow the WebWindow to place the HtmlPage in
263      * @return the newly created HtmlPage
264      * @throws IOException if the page could not be created
265      */
266     protected HtmlPage createHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
267         return HTMLParser.parseHtml(webResponse, webWindow);
268     }
269 
270     /**
271      * Creates an XHtmlPage for this WebResponse.
272      *
273      * @param webResponse the page's source
274      * @param webWindow the WebWindow to place the HtmlPage in
275      * @return the newly created XHtmlPage
276      * @throws IOException if the page could not be created
277      */
278     protected XHtmlPage createXHtmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
279         return HTMLParser.parseXHtml(webResponse, webWindow);
280     }
281 
282     /**
283      * Creates a TextPage for this WebResponse.
284      *
285      * @param webResponse the page's source
286      * @param webWindow the WebWindow to place the TextPage in
287      * @return the newly created TextPage
288      */
289     protected TextPage createTextPage(final WebResponse webResponse, final WebWindow webWindow) {
290         final TextPage newPage = new TextPage(webResponse, webWindow);
291         webWindow.setEnclosedPage(newPage);
292         return newPage;
293     }
294 
295     /**
296      * Creates an UnexpectedPage for this WebResponse.
297      *
298      * @param webResponse the page's source
299      * @param webWindow the WebWindow to place the UnexpectedPage in
300      * @return the newly created UnexpectedPage
301      */
302     protected UnexpectedPage createUnexpectedPage(final WebResponse webResponse, final WebWindow webWindow) {
303         final UnexpectedPage newPage = new UnexpectedPage(webResponse, webWindow);
304         webWindow.setEnclosedPage(newPage);
305         return newPage;
306     }
307 
308     /**
309      * Creates an SgmlPage for this WebResponse.
310      *
311      * @param webResponse the page's source
312      * @param webWindow the WebWindow to place the TextPage in
313      * @return the newly created TextPage
314      * @throws IOException if the page could not be created
315      */
316     protected SgmlPage createXmlPage(final WebResponse webResponse, final WebWindow webWindow) throws IOException {
317         final SgmlPage page = new XmlPage(webResponse, webWindow);
318         webWindow.setEnclosedPage(page);
319         return page;
320     }
321 
322 }