View Javadoc
1   /*
2    * Copyright (c) 2002-2018 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package com.gargoylesoftware.htmlunit.util;
16  
17  import static java.nio.charset.StandardCharsets.US_ASCII;
18  import static java.nio.charset.StandardCharsets.UTF_8;
19  
20  import java.net.MalformedURLException;
21  import java.net.URI;
22  import java.net.URISyntaxException;
23  import java.net.URL;
24  import java.net.URLStreamHandler;
25  import java.nio.charset.Charset;
26  import java.util.BitSet;
27  import java.util.Locale;
28  
29  import org.apache.commons.codec.DecoderException;
30  import org.apache.commons.codec.net.URLCodec;
31  
32  import com.gargoylesoftware.htmlunit.WebAssert;
33  import com.gargoylesoftware.htmlunit.WebClient;
34  import com.gargoylesoftware.htmlunit.protocol.AnyHandler;
35  import com.gargoylesoftware.htmlunit.protocol.javascript.JavaScriptURLConnection;
36  
37  /**
38   * URL utilities class that makes it easy to create new URLs based off of old URLs
39   * without having to assemble or parse them yourself.
40   *
41   * @author Daniel Gredler
42   * @author Martin Tamme
43   * @author Sudhan Moghe
44   * @author Marc Guillemot
45   * @author Ahmed Ashour
46   * @author Ronald Brill
47   * @author Joerg Werner
48   * @author Hartmut Arlt
49   */
50  public final class UrlUtils {
51      private static final URLStreamHandler JS_HANDLER = new com.gargoylesoftware.htmlunit.protocol.javascript.Handler();
52      private static final URLStreamHandler ABOUT_HANDLER = new com.gargoylesoftware.htmlunit.protocol.about.Handler();
53      private static final URLStreamHandler DATA_HANDLER = new com.gargoylesoftware.htmlunit.protocol.data.Handler();
54  
55      private static final BitSet PATH_ALLOWED_CHARS = new BitSet(256);
56      private static final BitSet QUERY_ALLOWED_CHARS = new BitSet(256);
57      private static final BitSet ANCHOR_ALLOWED_CHARS = new BitSet(256);
58      private static final BitSet HASH_ALLOWED_CHARS = new BitSet(256);
59  
60      /**
61       * URI allowed char initialization; based on HttpClient 3.1's URI bit sets.
62       */
63      static {
64          final BitSet reserved = new BitSet(256);
65          reserved.set(';');
66          reserved.set('/');
67          reserved.set('?');
68          reserved.set(':');
69          reserved.set('@');
70          reserved.set('&');
71          reserved.set('=');
72          reserved.set('+');
73          reserved.set('$');
74          reserved.set(',');
75  
76          final BitSet mark = new BitSet(256);
77          mark.set('-');
78          mark.set('_');
79          mark.set('.');
80          mark.set('!');
81          mark.set('~');
82          mark.set('*');
83          mark.set('\'');
84          mark.set('(');
85          mark.set(')');
86  
87          final BitSet alpha = new BitSet(256);
88          for (int i = 'a'; i <= 'z'; i++) {
89              alpha.set(i);
90          }
91          for (int i = 'A'; i <= 'Z'; i++) {
92              alpha.set(i);
93          }
94  
95          final BitSet digit = new BitSet(256);
96          for (int i = '0'; i <= '9'; i++) {
97              digit.set(i);
98          }
99  
100         final BitSet alphanumeric = new BitSet(256);
101         alphanumeric.or(alpha);
102         alphanumeric.or(digit);
103 
104         final BitSet unreserved = new BitSet(256);
105         unreserved.or(alphanumeric);
106         unreserved.or(mark);
107 
108         final BitSet hex = new BitSet(256);
109         hex.or(digit);
110         for (int i = 'a'; i <= 'f'; i++) {
111             hex.set(i);
112         }
113         for (int i = 'A'; i <= 'F'; i++) {
114             hex.set(i);
115         }
116 
117         final BitSet escaped = new BitSet(256);
118         escaped.set('%');
119         escaped.or(hex);
120 
121         final BitSet uric = new BitSet(256);
122         uric.or(reserved);
123         uric.or(unreserved);
124         uric.or(escaped);
125 
126         final BitSet pchar = new BitSet(256);
127         pchar.or(unreserved);
128         pchar.or(escaped);
129         pchar.set(':');
130         pchar.set('@');
131         pchar.set('&');
132         pchar.set('=');
133         pchar.set('+');
134         pchar.set('$');
135         pchar.set(',');
136 
137         final BitSet param = pchar;
138 
139         final BitSet segment = new BitSet(256);
140         segment.or(pchar);
141         segment.set(';');
142         segment.or(param);
143 
144         final BitSet pathSegments = new BitSet(256);
145         pathSegments.set('/');
146         pathSegments.or(segment);
147 
148         final BitSet absPath = new BitSet(256);
149         absPath.set('/');
150         absPath.or(pathSegments);
151 
152         final BitSet allowedAbsPath = new BitSet(256);
153         allowedAbsPath.or(absPath);
154 
155         final BitSet allowedFragment = new BitSet(256);
156         allowedFragment.or(uric);
157 //        allowedFragment.clear('%');
158 
159         final BitSet allowedQuery = new BitSet(256);
160         allowedQuery.or(uric);
161 
162         final BitSet allowedHash = new BitSet(256);
163         allowedHash.or(uric);
164         allowedHash.clear('%');
165 
166         PATH_ALLOWED_CHARS.or(allowedAbsPath);
167         QUERY_ALLOWED_CHARS.or(allowedQuery);
168         ANCHOR_ALLOWED_CHARS.or(allowedFragment);
169         HASH_ALLOWED_CHARS.or(allowedHash);
170     }
171 
172     /**
173      * Disallow instantiation of this class.
174      */
175     private UrlUtils() {
176         // Empty.
177     }
178 
179     /**
180      * <p>Constructs a URL instance based on the specified URL string, taking into account the fact that the
181      * specified URL string may represent an <tt>"about:..."</tt> URL, a <tt>"javascript:..."</tt> URL, or
182      * a <tt>data:...</tt> URL.</p>
183      *
184      * <p>The caller should be sure that URL strings passed to this method will parse correctly as URLs, as
185      * this method never expects to have to handle {@link MalformedURLException}s.</p>
186      *
187      * @param url the URL string to convert into a URL instance
188      * @return the constructed URL instance
189      */
190     public static URL toUrlSafe(final String url) {
191         try {
192             return toUrlUnsafe(url);
193         }
194         catch (final MalformedURLException e) {
195             // Should never happen.
196             throw new RuntimeException(e);
197         }
198     }
199 
200     /**
201      * <p>Constructs a URL instance based on the specified URL string, taking into account the fact that the
202      * specified URL string may represent an <tt>"about:..."</tt> URL, a <tt>"javascript:..."</tt> URL, or
203      * a <tt>data:...</tt> URL.</p>
204      *
205      * <p>Unlike {@link #toUrlSafe(String)}, the caller need not be sure that URL strings passed to this
206      * method will parse correctly as URLs.</p>
207      *
208      * @param url the URL string to convert into a URL instance
209      * @return the constructed URL instance
210      * @throws MalformedURLException if the URL string cannot be converted to a URL instance
211      */
212     public static URL toUrlUnsafe(final String url) throws MalformedURLException {
213         WebAssert.notNull("url", url);
214 
215         final String protocol = org.apache.commons.lang3.StringUtils.substringBefore(url, ":").toLowerCase(Locale.ROOT);
216 
217         if (protocol.isEmpty() || UrlUtils.isNormalUrlProtocol(protocol)) {
218             final URL response = new URL(url);
219             if (response.getProtocol().startsWith("http")
220                     && org.apache.commons.lang3.StringUtils.isEmpty(response.getHost())) {
221                 throw new MalformedURLException("Missing host name in url: " + url);
222             }
223             return response;
224         }
225 
226         if (JavaScriptURLConnection.JAVASCRIPT_PREFIX.equals(protocol + ":")) {
227             return new URL(null, url, JS_HANDLER);
228         }
229 
230         if ("about".equals(protocol)) {
231             if (WebClient.URL_ABOUT_BLANK != null
232                     && org.apache.commons.lang3.StringUtils.
233                         equalsIgnoreCase(WebClient.URL_ABOUT_BLANK.toExternalForm(), url)) {
234                 return WebClient.URL_ABOUT_BLANK;
235             }
236             return new URL(null, url, ABOUT_HANDLER);
237         }
238 
239         if ("data".equals(protocol)) {
240             return new URL(null, url, DATA_HANDLER);
241         }
242 
243         return new URL(null, url, AnyHandler.INSTANCE);
244     }
245 
246     /**
247      * <p>Encodes illegal characters in the specified URL's path, query string and anchor according to the URL
248      * encoding rules observed in real browsers.</p>
249      *
250      * <p>For example, this method changes <tt>"http://first/?a=b c"</tt> to <tt>"http://first/?a=b%20c"</tt>.</p>
251      *
252      * @param url the URL to encode
253      * @param minimalQueryEncoding whether or not to perform minimal query encoding, like IE does
254      * @param charset the charset
255      * @return the encoded URL
256      */
257     public static URL encodeUrl(final URL url, final boolean minimalQueryEncoding, final Charset charset) {
258         if (!isNormalUrlProtocol(url.getProtocol())) {
259             return url; // javascript:, about:, data: and anything not supported like foo:
260         }
261 
262         try {
263             String path = url.getPath();
264             if (path != null) {
265                 path = encode(path, PATH_ALLOWED_CHARS, UTF_8);
266             }
267             String query = url.getQuery();
268             if (query != null) {
269                 if (minimalQueryEncoding) {
270                     query = org.apache.commons.lang3.StringUtils.replace(query, " ", "%20");
271                 }
272                 else {
273                     query = encode(query, QUERY_ALLOWED_CHARS, charset);
274                 }
275             }
276             String anchor = url.getRef();
277             if (anchor != null) {
278                 anchor = encode(anchor, ANCHOR_ALLOWED_CHARS, UTF_8);
279             }
280             return createNewUrl(url.getProtocol(), url.getUserInfo(), url.getHost(),
281                                 url.getPort(), path, anchor, query);
282         }
283         catch (final MalformedURLException e) {
284             // Impossible... I think.
285             throw new RuntimeException(e);
286         }
287     }
288 
289     /**
290      * Encodes and escapes the specified URI anchor string.
291      *
292      * @param anchor the anchor string to encode and escape
293      * @return the encoded and escaped anchor string
294      */
295     public static String encodeAnchor(String anchor) {
296         if (anchor != null) {
297             anchor = encode(anchor, ANCHOR_ALLOWED_CHARS, UTF_8);
298         }
299         return anchor;
300     }
301 
302     /**
303      * Encodes and escapes the specified URI anchor string.
304      *
305      * @param hash the anchor string to encode and escape
306      * @return the encoded and escaped anchor string
307      */
308     public static String encodeHash(String hash) {
309         if (hash != null) {
310             hash = encode(hash, HASH_ALLOWED_CHARS, UTF_8);
311         }
312         return hash;
313     }
314 
315     /**
316      * Unescapes and decodes the specified string.
317      *
318      * @param escaped the string to be unescaped and decoded
319      * @return the unescaped and decoded string
320      */
321     public static String decode(final String escaped) {
322         try {
323             final byte[] bytes = escaped.getBytes(US_ASCII);
324             final byte[] bytes2 = URLCodec.decodeUrl(bytes);
325             return new String(bytes2, UTF_8);
326         }
327         catch (final DecoderException e) {
328             // Should never happen.
329             throw new RuntimeException(e);
330         }
331     }
332 
333     /**
334      * Escapes and encodes the specified string. Based on HttpClient 3.1's <tt>URIUtil.encode()</tt> method.
335      *
336      * @param unescaped the string to encode
337      * @param allowed allowed characters that shouldn't be escaped
338      * @param charset the charset to use
339      * @return the escaped string
340      */
341     private static String encode(final String unescaped, final BitSet allowed, final Charset charset) {
342         final byte[] bytes = unescaped.getBytes(charset);
343         final byte[] bytes2 = URLCodec.encodeUrl(allowed, bytes);
344         return encodePercentSign(bytes2);
345     }
346 
347     /**
348      * Encodes every occurrence of the escape character '%' in the given input
349      * string that is not followed by two hexadecimal characters.
350      * @param str the input string
351      * @return the given input string where every occurrence of <code>%</code> in
352      * invalid escape sequences has been replace by <code>%25</code>
353      */
354     private static String encodePercentSign(final byte[] input) {
355         if (input == null) {
356             return null;
357         }
358 
359         final StringBuilder result = new StringBuilder(new String(input, US_ASCII));
360         int state = -0;
361         int offset = 0;
362         for (int i = 0; i < input.length; i++) {
363             final byte b = input[i];
364             if (state == 0 && b == '%') {
365                 state = 1;
366             }
367             else if (state == 1 || state == 2) {
368                 if (('0' <= b && b <= '9')
369                         || ('A' <= b && b <= 'F')
370                         || ('a' <= b && b <= 'f')) {
371                     state++;
372                     if (state == 3) {
373                         state = 0;
374                     }
375                 }
376                 else {
377                     final int st = i - state + offset;
378                     result.replace(st, st + 1, "%25");
379                     offset = offset + 2;
380                     state = b == '%' ? 1 : 0;
381                 }
382             }
383         }
384         if (state == 1 || state == 2) {
385             final int st = input.length - state + offset;
386             result.replace(st, st + 1, "%25");
387         }
388         return result.toString();
389     }
390 
391     /**
392      * Creates and returns a new URL identical to the specified URL, except using the specified protocol.
393      * @param u the URL on which to base the returned URL
394      * @param newProtocol the new protocol to use in the returned URL
395      * @return a new URL identical to the specified URL, except using the specified protocol
396      * @throws MalformedURLException if there is a problem creating the new URL
397      */
398     public static URL getUrlWithNewProtocol(final URL u, final String newProtocol) throws MalformedURLException {
399         return createNewUrl(newProtocol, u.getAuthority(), u.getPath(), u.getRef(), u.getQuery());
400     }
401 
402     /**
403      * Creates and returns a new URL identical to the specified URL, except using the specified host.
404      * @param u the URL on which to base the returned URL
405      * @param newHost the new host to use in the returned URL
406      * @return a new URL identical to the specified URL, except using the specified host
407      * @throws MalformedURLException if there is a problem creating the new URL
408      */
409     public static URL getUrlWithNewHost(final URL u, final String newHost)
410         throws MalformedURLException {
411         return createNewUrl(u.getProtocol(), u.getUserInfo(), newHost,
412                             u.getPort(), u.getPath(), u.getRef(), u.getQuery());
413     }
414 
415     /**
416      * Creates and returns a new URL identical to the specified URL, except using the specified host.
417      * @param u the URL on which to base the returned URL
418      * @param newHost the new host to use in the returned URL
419      * @param newPort the new port to use in the returned URL
420      * @return a new URL identical to the specified URL, except using the specified host
421      * @throws MalformedURLException if there is a problem creating the new URL
422      */
423     public static URL getUrlWithNewHostAndPort(final URL u, final String newHost, final int newPort)
424         throws MalformedURLException {
425         return createNewUrl(u.getProtocol(), u.getUserInfo(), newHost, newPort, u.getPath(), u.getRef(), u.getQuery());
426     }
427 
428     /**
429      * Creates and returns a new URL identical to the specified URL, except using the specified port.
430      * @param u the URL on which to base the returned URL
431      * @param newPort the new port to use in the returned URL
432      * @return a new URL identical to the specified URL, except using the specified port
433      * @throws MalformedURLException if there is a problem creating the new URL
434      */
435     public static URL getUrlWithNewPort(final URL u, final int newPort) throws MalformedURLException {
436         return createNewUrl(u.getProtocol(), u.getUserInfo(), u.getHost(),
437                             newPort, u.getPath(), u.getRef(), u.getQuery());
438     }
439 
440     /**
441      * Creates and returns a new URL identical to the specified URL, except using the specified path.
442      * @param u the URL on which to base the returned URL
443      * @param newPath the new path to use in the returned URL
444      * @return a new URL identical to the specified URL, except using the specified path
445      * @throws MalformedURLException if there is a problem creating the new URL
446      */
447     public static URL getUrlWithNewPath(final URL u, final String newPath) throws MalformedURLException {
448         return createNewUrl(u.getProtocol(), u.getAuthority(), newPath, u.getRef(), u.getQuery());
449     }
450 
451     /**
452      * Creates and returns a new URL identical to the specified URL, except using the specified reference.
453      * @param u the URL on which to base the returned URL
454      * @param newRef the new reference to use in the returned URL
455      * @return a new URL identical to the specified URL, except using the specified reference
456      * @throws MalformedURLException if there is a problem creating the new URL
457      */
458     public static URL getUrlWithNewRef(final URL u, final String newRef) throws MalformedURLException {
459         return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), newRef, u.getQuery());
460     }
461 
462     /**
463      * Creates and returns a new URL identical to the specified URL, except using the specified query string.
464      * @param u the URL on which to base the returned URL
465      * @param newQuery the new query string to use in the returned URL
466      * @return a new URL identical to the specified URL, except using the specified query string
467      * @throws MalformedURLException if there is a problem creating the new URL
468      */
469     public static URL getUrlWithNewQuery(final URL u, final String newQuery) throws MalformedURLException {
470         return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), u.getRef(), newQuery);
471     }
472 
473     /**
474      * Creates a new URL based on the specified fragments.
475      * @param protocol the protocol to use (may not be {@code null})
476      * @param userInfo the user info to use (may be {@code null})
477      * @param host the host to use (may not be {@code null})
478      * @param port the port to use (may be <tt>-1</tt> if no port is specified)
479      * @param path the path to use (may be {@code null} and may omit the initial <tt>'/'</tt>)
480      * @param ref the reference to use (may be {@code null} and must not include the <tt>'#'</tt>)
481      * @param query the query to use (may be {@code null} and must not include the <tt>'?'</tt>)
482      * @return a new URL based on the specified fragments
483      * @throws MalformedURLException if there is a problem creating the new URL
484      */
485     private static URL createNewUrl(final String protocol, final String userInfo, final String host, final int port,
486             final String path, final String ref, final String query) throws MalformedURLException {
487         final StringBuilder s = new StringBuilder();
488         s.append(protocol);
489         s.append("://");
490         if (userInfo != null) {
491             s.append(userInfo).append("@");
492         }
493         s.append(host);
494         if (port != -1) {
495             s.append(":").append(port);
496         }
497         if (path != null && !path.isEmpty()) {
498             if (!('/' == path.charAt(0))) {
499                 s.append("/");
500             }
501             s.append(path);
502         }
503         if (query != null) {
504             s.append("?").append(query);
505         }
506         if (ref != null) {
507             if (ref.isEmpty() || ref.charAt(0) != '#') {
508                 s.append("#");
509             }
510             s.append(ref);
511         }
512 
513         return new URL(s.toString());
514     }
515 
516     /**
517      * Creates a new URL based on the specified fragments.
518      * @param protocol the protocol to use (may not be {@code null})
519      * @param authority the authority to use (may not be {@code null})
520      * @param path the path to use (may be {@code null} and may omit the initial <tt>'/'</tt>)
521      * @param ref the reference to use (may be {@code null} and must not include the <tt>'#'</tt>)
522      * @param query the query to use (may be {@code null} and must not include the <tt>'?'</tt>)
523      * @return a new URL based on the specified fragments
524      * @throws MalformedURLException if there is a problem creating the new URL
525      */
526     private static URL createNewUrl(final String protocol, final String authority,
527             final String path, final String ref, final String query) throws MalformedURLException {
528 
529         // pre-compute length of StringBuilder
530         int len = protocol.length() + 1;
531         if (authority != null && !authority.isEmpty()) {
532             len += 2 + authority.length();
533         }
534         if (path != null) {
535             len += path.length();
536         }
537         if (query != null) {
538             len += 1 + query.length();
539         }
540         if (ref != null) {
541             len += 1 + ref.length();
542         }
543 
544         final StringBuilder s = new StringBuilder(len);
545         s.append(protocol);
546         s.append(":");
547         if (authority != null && !authority.isEmpty()) {
548             s.append("//");
549             s.append(authority);
550         }
551         if (path != null) {
552             s.append(path);
553         }
554         if (query != null) {
555             s.append('?');
556             s.append(query);
557         }
558         if (ref != null) {
559             if (ref.isEmpty() || ref.charAt(0) != '#') {
560                 s.append("#");
561             }
562             s.append(ref);
563         }
564 
565         return new URL(s.toString());
566     }
567 
568     /**
569      * Resolves a given relative URL against a base URL. See
570      * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
571      * Section 4 for more details.
572      *
573      * @param baseUrl     The base URL in which to resolve the specification.
574      * @param relativeUrl The relative URL to resolve against the base URL.
575      * @return the resolved specification.
576      */
577     public static String resolveUrl(final String baseUrl, final String relativeUrl) {
578         if (baseUrl == null) {
579             throw new IllegalArgumentException("Base URL must not be null");
580         }
581         if (relativeUrl == null) {
582             throw new IllegalArgumentException("Relative URL must not be null");
583         }
584         final Url url = resolveUrl(parseUrl(baseUrl.trim()), relativeUrl.trim());
585 
586         return url.toString();
587     }
588 
589     /**
590      * Resolves a given relative URL against a base URL. See
591      * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
592      * Section 4 for more details.
593      *
594      * @param baseUrl     The base URL in which to resolve the specification.
595      * @param relativeUrl The relative URL to resolve against the base URL.
596      * @return the resolved specification.
597      */
598     public static String resolveUrl(final URL baseUrl, final String relativeUrl) {
599         if (baseUrl == null) {
600             throw new IllegalArgumentException("Base URL must not be null");
601         }
602         return resolveUrl(baseUrl.toExternalForm(), relativeUrl);
603     }
604 
605     /**
606      * Parses a given specification using the algorithm depicted in
607      * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
608      *
609      * Section 2.4: Parsing a URL
610      *
611      *   An accepted method for parsing URLs is useful to clarify the
612      *   generic-RL syntax of Section 2.2 and to describe the algorithm for
613      *   resolving relative URLs presented in Section 4. This section
614      *   describes the parsing rules for breaking down a URL (relative or
615      *   absolute) into the component parts described in Section 2.1.  The
616      *   rules assume that the URL has already been separated from any
617      *   surrounding text and copied to a "parse string". The rules are
618      *   listed in the order in which they would be applied by the parser.
619      *
620      * @param spec The specification to parse.
621      * @return the parsed specification.
622      */
623     private static Url parseUrl(final String spec) {
624         final Url url = new Url();
625         int startIndex = 0;
626         int endIndex = spec.length();
627 
628         // Section 2.4.1: Parsing the Fragment Identifier
629         //
630         //   If the parse string contains a crosshatch "#" character, then the
631         //   substring after the first (left-most) crosshatch "#" and up to the
632         //   end of the parse string is the <fragment> identifier. If the
633         //   crosshatch is the last character, or no crosshatch is present, then
634         //   the fragment identifier is empty. The matched substring, including
635         //   the crosshatch character, is removed from the parse string before
636         //   continuing.
637         //
638         //   Note that the fragment identifier is not considered part of the URL.
639         //   However, since it is often attached to the URL, parsers must be able
640         //   to recognize and set aside fragment identifiers as part of the
641         //   process.
642         final int crosshatchIndex = StringUtils.indexOf(spec, '#', startIndex, endIndex);
643 
644         if (crosshatchIndex >= 0) {
645             url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
646             endIndex = crosshatchIndex;
647         }
648         // Section 2.4.2: Parsing the Scheme
649         //
650         //   If the parse string contains a colon ":" after the first character
651         //   and before any characters not allowed as part of a scheme name (i.e.,
652         //   any not an alphanumeric, plus "+", period ".", or hyphen "-"), the
653         //   <scheme> of the URL is the substring of characters up to but not
654         //   including the first colon. These characters and the colon are then
655         //   removed from the parse string before continuing.
656         final int colonIndex = StringUtils.indexOf(spec, ':', startIndex, endIndex);
657 
658         if (colonIndex > 0) {
659             final String scheme = spec.substring(startIndex, colonIndex);
660             if (isValidScheme(scheme)) {
661                 url.scheme_ = scheme;
662                 startIndex = colonIndex + 1;
663             }
664         }
665         // Section 2.4.3: Parsing the Network Location/Login
666         //
667         //   If the parse string begins with a double-slash "//", then the
668         //   substring of characters after the double-slash and up to, but not
669         //   including, the next slash "/" character is the network location/login
670         //   (<net_loc>) of the URL. If no trailing slash "/" is present, the
671         //   entire remaining parse string is assigned to <net_loc>. The double-
672         //   slash and <net_loc> are removed from the parse string before
673         //   continuing.
674         //
675         // Note: We also accept a question mark "?" or a semicolon ";" character as
676         //       delimiters for the network location/login (<net_loc>) of the URL.
677         final int locationStartIndex;
678         int locationEndIndex;
679 
680         if (spec.startsWith("//", startIndex)) {
681             locationStartIndex = startIndex + 2;
682             locationEndIndex = StringUtils.indexOf(spec, '/', locationStartIndex, endIndex);
683             if (locationEndIndex >= 0) {
684                 startIndex = locationEndIndex;
685             }
686         }
687         else {
688             locationStartIndex = -1;
689             locationEndIndex = -1;
690         }
691         // Section 2.4.4: Parsing the Query Information
692         //
693         //   If the parse string contains a question mark "?" character, then the
694         //   substring after the first (left-most) question mark "?" and up to the
695         //   end of the parse string is the <query> information. If the question
696         //   mark is the last character, or no question mark is present, then the
697         //   query information is empty. The matched substring, including the
698         //   question mark character, is removed from the parse string before
699         //   continuing.
700         final int questionMarkIndex = StringUtils.indexOf(spec, '?', startIndex, endIndex);
701 
702         if (questionMarkIndex >= 0) {
703             if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
704                 // The substring of characters after the double-slash and up to, but not
705                 // including, the question mark "?" character is the network location/login
706                 // (<net_loc>) of the URL.
707                 locationEndIndex = questionMarkIndex;
708                 startIndex = questionMarkIndex;
709             }
710             url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
711             endIndex = questionMarkIndex;
712         }
713         // Section 2.4.5: Parsing the Parameters
714         //
715         //   If the parse string contains a semicolon ";" character, then the
716         //   substring after the first (left-most) semicolon ";" and up to the end
717         //   of the parse string is the parameters (<params>). If the semicolon
718         //   is the last character, or no semicolon is present, then <params> is
719         //   empty. The matched substring, including the semicolon character, is
720         //   removed from the parse string before continuing.
721         final int semicolonIndex = StringUtils.indexOf(spec, ';', startIndex, endIndex);
722 
723         if (semicolonIndex >= 0) {
724             if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
725                 // The substring of characters after the double-slash and up to, but not
726                 // including, the semicolon ";" character is the network location/login
727                 // (<net_loc>) of the URL.
728                 locationEndIndex = semicolonIndex;
729                 startIndex = semicolonIndex;
730             }
731             url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
732             endIndex = semicolonIndex;
733         }
734         // Section 2.4.6: Parsing the Path
735         //
736         //   After the above steps, all that is left of the parse string is the
737         //   URL <path> and the slash "/" that may precede it. Even though the
738         //   initial slash is not part of the URL path, the parser must remember
739         //   whether or not it was present so that later processes can
740         //   differentiate between relative and absolute paths. Often this is
741         //   done by simply storing the preceding slash along with the path.
742         if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
743             // The entire remaining parse string is assigned to the network
744             // location/login (<net_loc>) of the URL.
745             locationEndIndex = endIndex;
746         }
747         else if (startIndex < endIndex) {
748             url.path_ = spec.substring(startIndex, endIndex);
749         }
750         // Set the network location/login (<net_loc>) of the URL.
751         if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
752             url.location_ = spec.substring(locationStartIndex, locationEndIndex);
753         }
754         return url;
755     }
756 
757     /*
758      * Returns true if specified string is a valid scheme name.
759      */
760     private static boolean isValidScheme(final String scheme) {
761         final int length = scheme.length();
762         if (length < 1) {
763             return false;
764         }
765         char c = scheme.charAt(0);
766         if (!Character.isLetter(c)) {
767             return false;
768         }
769         for (int i = 1; i < length; i++) {
770             c = scheme.charAt(i);
771             if (!Character.isLetterOrDigit(c) && c != '.' && c != '+' && c != '-') {
772                 return false;
773             }
774         }
775         return true;
776     }
777 
778     /**
779      * Resolves a given relative URL against a base URL using the algorithm
780      * depicted in <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
781      *
782      * Section 4: Resolving Relative URLs
783      *
784      *   This section describes an example algorithm for resolving URLs within
785      *   a context in which the URLs may be relative, such that the result is
786      *   always a URL in absolute form. Although this algorithm cannot
787      *   guarantee that the resulting URL will equal that intended by the
788      *   original author, it does guarantee that any valid URL (relative or
789      *   absolute) can be consistently transformed to an absolute form given a
790      *   valid base URL.
791      *
792      * @param baseUrl     The base URL in which to resolve the specification.
793      * @param relativeUrl The relative URL to resolve against the base URL.
794      * @return the resolved specification.
795      */
796     private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
797         final Url url = parseUrl(relativeUrl);
798         // Step 1: The base URL is established according to the rules of
799         //         Section 3.  If the base URL is the empty string (unknown),
800         //         the embedded URL is interpreted as an absolute URL and
801         //         we are done.
802         if (baseUrl == null) {
803             return url;
804         }
805         // Step 2: Both the base and embedded URLs are parsed into their
806         //         component parts as described in Section 2.4.
807         //      a) If the embedded URL is entirely empty, it inherits the
808         //         entire base URL (i.e., is set equal to the base URL)
809         //         and we are done.
810         if (relativeUrl.isEmpty()) {
811             return new Url(baseUrl);
812         }
813         //      b) If the embedded URL starts with a scheme name, it is
814         //         interpreted as an absolute URL and we are done.
815         if (url.scheme_ != null) {
816             return url;
817         }
818         //      c) Otherwise, the embedded URL inherits the scheme of
819         //         the base URL.
820         url.scheme_ = baseUrl.scheme_;
821         // Step 3: If the embedded URL's <net_loc> is non-empty, we skip to
822         //         Step 7.  Otherwise, the embedded URL inherits the <net_loc>
823         //         (if any) of the base URL.
824         if (url.location_ != null) {
825             return url;
826         }
827         url.location_ = baseUrl.location_;
828         // Step 4: If the embedded URL path is preceded by a slash "/", the
829         //         path is not relative and we skip to Step 7.
830         if (url.path_ != null && !url.path_.isEmpty() && url.path_.charAt(0) == '/') {
831             url.path_ = removeLeadingSlashPoints(url.path_);
832             return url;
833         }
834         // Step 5: If the embedded URL path is empty (and not preceded by a
835         //         slash), then the embedded URL inherits the base URL path,
836         //         and
837         if (url.path_ == null) {
838             url.path_ = baseUrl.path_;
839             //  a) if the embedded URL's <params> is non-empty, we skip to
840             //     step 7; otherwise, it inherits the <params> of the base
841             //     URL (if any) and
842             if (url.parameters_ != null) {
843                 return url;
844             }
845             url.parameters_ = baseUrl.parameters_;
846             //  b) if the embedded URL's <query> is non-empty, we skip to
847             //     step 7; otherwise, it inherits the <query> of the base
848             //     URL (if any) and we skip to step 7.
849             if (url.query_ != null) {
850                 return url;
851             }
852             url.query_ = baseUrl.query_;
853             return url;
854         }
855         // Step 6: The last segment of the base URL's path (anything
856         //         following the rightmost slash "/", or the entire path if no
857         //         slash is present) is removed and the embedded URL's path is
858         //         appended in its place.  The following operations are
859         //         then applied, in order, to the new path:
860         final String basePath = baseUrl.path_;
861         String path = "";
862 
863         if (basePath != null) {
864             final int lastSlashIndex = basePath.lastIndexOf('/');
865 
866             if (lastSlashIndex >= 0) {
867                 path = basePath.substring(0, lastSlashIndex + 1);
868             }
869         }
870         else {
871             path = "/";
872         }
873         path = path.concat(url.path_);
874         //      a) All occurrences of "./", where "." is a complete path
875         //         segment, are removed.
876         int pathSegmentIndex;
877 
878         while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
879             path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3));
880         }
881         //      b) If the path ends with "." as a complete path segment,
882         //         that "." is removed.
883         if (path.endsWith("/.")) {
884             path = path.substring(0, path.length() - 1);
885         }
886         //      c) All occurrences of "<segment>/../", where <segment> is a
887         //         complete path segment not equal to "..", are removed.
888         //         Removal of these path segments is performed iteratively,
889         //         removing the leftmost matching pattern on each iteration,
890         //         until no matching pattern remains.
891         while ((pathSegmentIndex = path.indexOf("/../")) > 0) {
892             final String pathSegment = path.substring(0, pathSegmentIndex);
893             final int slashIndex = pathSegment.lastIndexOf('/');
894 
895             if (slashIndex >= 0) {
896                 if (!"..".equals(pathSegment.substring(slashIndex))) {
897                     path = path.substring(0, slashIndex + 1).concat(path.substring(pathSegmentIndex + 4));
898                 }
899             }
900             else {
901                 path = path.substring(pathSegmentIndex + 4);
902             }
903         }
904         //      d) If the path ends with "<segment>/..", where <segment> is a
905         //         complete path segment not equal to "..", that
906         //         "<segment>/.." is removed.
907         if (path.endsWith("/..")) {
908             final String pathSegment = path.substring(0, path.length() - 3);
909             final int slashIndex = pathSegment.lastIndexOf('/');
910 
911             if (slashIndex >= 0) {
912                 path = path.substring(0, slashIndex + 1);
913             }
914         }
915 
916         path = removeLeadingSlashPoints(path);
917 
918         url.path_ = path;
919         // Step 7: The resulting URL components, including any inherited from
920         //         the base URL, are recombined to give the absolute form of
921         //         the embedded URL.
922         return url;
923     }
924 
925     /**
926      * "/.." at the beginning should be removed as browsers do (not in RFC)
927      */
928     private static String removeLeadingSlashPoints(String path) {
929         while (path.startsWith("/..")) {
930             path = path.substring(3);
931         }
932 
933         return path;
934     }
935 
936     /**
937      * Class <tt>Url</tt> represents a Uniform Resource Locator.
938      *
939      * @author Martin Tamme
940      */
941     private static class Url {
942 
943         private String scheme_;
944         private String location_;
945         private String path_;
946         private String parameters_;
947         private String query_;
948         private String fragment_;
949 
950         /**
951          * Creates a <tt>Url</tt> object.
952          */
953         Url() {
954         }
955 
956         /**
957          * Creates a <tt>Url</tt> object from the specified
958          * <tt>Url</tt> object.
959          *
960          * @param url a <tt>Url</tt> object.
961          */
962         Url(final Url url) {
963             scheme_ = url.scheme_;
964             location_ = url.location_;
965             path_ = url.path_;
966             parameters_ = url.parameters_;
967             query_ = url.query_;
968             fragment_ = url.fragment_;
969         }
970 
971         /**
972          * Returns a string representation of the <tt>Url</tt> object.
973          *
974          * @return a string representation of the <tt>Url</tt> object.
975          */
976         @Override
977         public String toString() {
978             final StringBuilder sb = new StringBuilder();
979 
980             if (scheme_ != null) {
981                 sb.append(scheme_);
982                 sb.append(':');
983             }
984             if (location_ != null) {
985                 sb.append("//");
986                 sb.append(location_);
987             }
988             if (path_ != null) {
989                 sb.append(path_);
990             }
991             if (parameters_ != null) {
992                 sb.append(';');
993                 sb.append(parameters_);
994             }
995             if (query_ != null) {
996                 sb.append('?');
997                 sb.append(query_);
998             }
999             if (fragment_ != null) {
1000                 sb.append('#');
1001                 sb.append(fragment_);
1002             }
1003             return sb.toString();
1004         }
1005     }
1006 
1007     static boolean isNormalUrlProtocol(final String protocol) {
1008         return "http".equals(protocol) || "https".equals(protocol) || "file".equals(protocol);
1009     }
1010 
1011     /**
1012      * More or less the same as sameFile(URL, URL) but without
1013      * resolving the host to an IP address for comparing.
1014      * Additionally we do some path normalization.
1015      *
1016      * @param u1 a URL object
1017      * @param u2 a URL object
1018      * @return true if u1 and u2 refer to the same file
1019      */
1020     public static boolean sameFile(final URL u1, final URL u2) {
1021         if (u1 == u2) {
1022             return true;
1023         }
1024         if (u1 == null || u2 == null) {
1025             return false;
1026         }
1027 
1028         // Compare the protocols.
1029         final String p1 = u1.getProtocol();
1030         final String p2 = u2.getProtocol();
1031         if (!(p1 == p2 || (p1 != null && p1.equalsIgnoreCase(p2)))) {
1032             return false;
1033         }
1034 
1035         // Compare the ports.
1036         final int port1 = (u1.getPort() != -1) ? u1.getPort() : u1.getDefaultPort();
1037         final int port2 = (u2.getPort() != -1) ? u2.getPort() : u2.getDefaultPort();
1038         if (port1 != port2) {
1039             return false;
1040         }
1041 
1042         // Compare the hosts.
1043         final String h1 = u1.getHost();
1044         final String h2 = u2.getHost();
1045         if (!(h1 == h2 || (h1 != null && h1.equalsIgnoreCase(h2)))) {
1046             return false;
1047         }
1048 
1049         // Compare the files.
1050         String f1 = u1.getFile();
1051         if (f1.isEmpty()) {
1052             f1 = "/";
1053         }
1054         String f2 = u2.getFile();
1055         if (f2.isEmpty()) {
1056             f2 = "/";
1057         }
1058         if (f1.indexOf('.') > 0 || f2.indexOf('.') > 0) {
1059             try {
1060                 f1 = u1.toURI().normalize().toURL().getFile();
1061                 f2 = u2.toURI().normalize().toURL().getFile();
1062             }
1063             catch (final Exception e) {
1064                 // ignore
1065             }
1066         }
1067         if (!(f1 == f2 || (f1 != null && f1.equals(f2)))) {
1068             return false;
1069         }
1070 
1071         return true;
1072     }
1073 
1074     /**
1075      * Helper that constructs a normalized url string
1076      * usable as cache key.
1077      *
1078      * @param url a URL object
1079      * @return the normalized string
1080      */
1081     public static String normalize(final URL url) {
1082         final StringBuilder result = new StringBuilder();
1083 
1084         result.append(url.getProtocol());
1085         result.append("://");
1086         result.append(url.getHost());
1087         result.append(':');
1088         result.append((url.getPort() != -1) ? url.getPort() : url.getDefaultPort());
1089 
1090         // Compare the files.
1091         String f = url.getFile();
1092         if (f.isEmpty()) {
1093             result.append("/");
1094         }
1095         else {
1096             if (f.indexOf('.') > 0) {
1097                 try {
1098                     f = url.toURI().normalize().toURL().getFile();
1099                 }
1100                 catch (final Exception e) {
1101                     // ignore
1102                 }
1103             }
1104             result.append(f);
1105         }
1106 
1107         return result.toString();
1108     }
1109 
1110     /**
1111      * Constructs a {@link URI} using the specified URL.
1112      *
1113      * @param url the URL
1114      * @param query the query
1115      *
1116      * @throws URISyntaxException
1117      *         If both a scheme and a path are given but the path is
1118      *         relative, if the URI string constructed from the given
1119      *         components violates RFC&nbsp;2396, or if the authority
1120      *         component of the string is present but cannot be parsed
1121      *         as a server-based authority
1122      * @return the URI
1123      */
1124     public static URI toURI(final URL url, final String query) throws URISyntaxException {
1125         final String scheme = url.getProtocol();
1126         final String host = url.getHost();
1127         final int port = url.getPort();
1128         final String path = url.getPath();
1129         final StringBuilder buffer = new StringBuilder();
1130         if (host != null) {
1131             if (scheme != null) {
1132                 buffer.append(scheme);
1133                 buffer.append("://");
1134             }
1135             buffer.append(host);
1136             if (port > 0) {
1137                 buffer.append(':');
1138                 buffer.append(port);
1139             }
1140         }
1141         if (path == null || !path.startsWith("/")) {
1142             buffer.append('/');
1143         }
1144         if (path != null) {
1145             buffer.append(path);
1146         }
1147         if (query != null) {
1148             buffer.append('?');
1149             buffer.append(query);
1150         }
1151         return new URI(buffer.toString());
1152     }
1153 }