View Javadoc
1   /*
2    * Copyright (c) 2002-2017 Gargoyle Software Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  package com.gargoylesoftware.htmlunit.util;
16  
17  import static java.nio.charset.StandardCharsets.US_ASCII;
18  import static java.nio.charset.StandardCharsets.UTF_8;
19  
20  import java.net.MalformedURLException;
21  import java.net.URI;
22  import java.net.URISyntaxException;
23  import java.net.URL;
24  import java.nio.charset.Charset;
25  import java.util.BitSet;
26  
27  import org.apache.commons.codec.DecoderException;
28  import org.apache.commons.codec.net.URLCodec;
29  
30  import com.gargoylesoftware.htmlunit.WebAssert;
31  
32  /**
33   * URL utilities class that makes it easy to create new URLs based off of old URLs
34   * without having to assemble or parse them yourself.
35   *
36   * @author Daniel Gredler
37   * @author Martin Tamme
38   * @author Sudhan Moghe
39   * @author Marc Guillemot
40   * @author Ahmed Ashour
41   * @author Ronald Brill
42   * @author Joerg Werner
43   */
44  public final class UrlUtils {
45      private static final BitSet PATH_ALLOWED_CHARS = new BitSet(256);
46      private static final BitSet QUERY_ALLOWED_CHARS = new BitSet(256);
47      private static final BitSet ANCHOR_ALLOWED_CHARS = new BitSet(256);
48      private static final BitSet HASH_ALLOWED_CHARS = new BitSet(256);
49      private static final URLCreator URL_CREATOR = URLCreator.getCreator();
50  
51      /**
52       * URI allowed char initialization; based on HttpClient 3.1's URI bit sets.
53       */
54      static {
55          final BitSet reserved = new BitSet(256);
56          reserved.set(';');
57          reserved.set('/');
58          reserved.set('?');
59          reserved.set(':');
60          reserved.set('@');
61          reserved.set('&');
62          reserved.set('=');
63          reserved.set('+');
64          reserved.set('$');
65          reserved.set(',');
66  
67          final BitSet mark = new BitSet(256);
68          mark.set('-');
69          mark.set('_');
70          mark.set('.');
71          mark.set('!');
72          mark.set('~');
73          mark.set('*');
74          mark.set('\'');
75          mark.set('(');
76          mark.set(')');
77  
78          final BitSet alpha = new BitSet(256);
79          for (int i = 'a'; i <= 'z'; i++) {
80              alpha.set(i);
81          }
82          for (int i = 'A'; i <= 'Z'; i++) {
83              alpha.set(i);
84          }
85  
86          final BitSet digit = new BitSet(256);
87          for (int i = '0'; i <= '9'; i++) {
88              digit.set(i);
89          }
90  
91          final BitSet alphanumeric = new BitSet(256);
92          alphanumeric.or(alpha);
93          alphanumeric.or(digit);
94  
95          final BitSet unreserved = new BitSet(256);
96          unreserved.or(alphanumeric);
97          unreserved.or(mark);
98  
99          final BitSet hex = new BitSet(256);
100         hex.or(digit);
101         for (int i = 'a'; i <= 'f'; i++) {
102             hex.set(i);
103         }
104         for (int i = 'A'; i <= 'F'; i++) {
105             hex.set(i);
106         }
107 
108         final BitSet escaped = new BitSet(256);
109         escaped.set('%');
110         escaped.or(hex);
111 
112         final BitSet uric = new BitSet(256);
113         uric.or(reserved);
114         uric.or(unreserved);
115         uric.or(escaped);
116 
117         final BitSet pchar = new BitSet(256);
118         pchar.or(unreserved);
119         pchar.or(escaped);
120         pchar.set(':');
121         pchar.set('@');
122         pchar.set('&');
123         pchar.set('=');
124         pchar.set('+');
125         pchar.set('$');
126         pchar.set(',');
127 
128         final BitSet param = pchar;
129 
130         final BitSet segment = new BitSet(256);
131         segment.or(pchar);
132         segment.set(';');
133         segment.or(param);
134 
135         final BitSet pathSegments = new BitSet(256);
136         pathSegments.set('/');
137         pathSegments.or(segment);
138 
139         final BitSet absPath = new BitSet(256);
140         absPath.set('/');
141         absPath.or(pathSegments);
142 
143         final BitSet allowedAbsPath = new BitSet(256);
144         allowedAbsPath.or(absPath);
145 
146         final BitSet allowedFragment = new BitSet(256);
147         allowedFragment.or(uric);
148 //        allowedFragment.clear('%');
149 
150         final BitSet allowedQuery = new BitSet(256);
151         allowedQuery.or(uric);
152 
153         final BitSet allowedHash = new BitSet(256);
154         allowedHash.or(uric);
155         allowedHash.clear('%');
156 
157         PATH_ALLOWED_CHARS.or(allowedAbsPath);
158         QUERY_ALLOWED_CHARS.or(allowedQuery);
159         ANCHOR_ALLOWED_CHARS.or(allowedFragment);
160         HASH_ALLOWED_CHARS.or(allowedHash);
161     }
162 
163     /**
164      * Disallow instantiation of this class.
165      */
166     private UrlUtils() {
167         // Empty.
168     }
169 
170     /**
171      * <p>Constructs a URL instance based on the specified URL string, taking into account the fact that the
172      * specified URL string may represent an <tt>"about:..."</tt> URL, a <tt>"javascript:..."</tt> URL, or
173      * a <tt>data:...</tt> URL.</p>
174      *
175      * <p>The caller should be sure that URL strings passed to this method will parse correctly as URLs, as
176      * this method never expects to have to handle {@link MalformedURLException}s.</p>
177      *
178      * @param url the URL string to convert into a URL instance
179      * @return the constructed URL instance
180      */
181     public static URL toUrlSafe(final String url) {
182         try {
183             return toUrlUnsafe(url);
184         }
185         catch (final MalformedURLException e) {
186             // Should never happen.
187             throw new RuntimeException(e);
188         }
189     }
190 
191     /**
192      * <p>Constructs a URL instance based on the specified URL string, taking into account the fact that the
193      * specified URL string may represent an <tt>"about:..."</tt> URL, a <tt>"javascript:..."</tt> URL, or
194      * a <tt>data:...</tt> URL.</p>
195      *
196      * <p>Unlike {@link #toUrlSafe(String)}, the caller need not be sure that URL strings passed to this
197      * method will parse correctly as URLs.</p>
198      *
199      * @param url the URL string to convert into a URL instance
200      * @return the constructed URL instance
201      * @throws MalformedURLException if the URL string cannot be converted to a URL instance
202      */
203     public static URL toUrlUnsafe(final String url) throws MalformedURLException {
204         WebAssert.notNull("url", url);
205         return URL_CREATOR.toUrlUnsafeClassic(url);
206     }
207 
208     /**
209      * <p>Encodes illegal characters in the specified URL's path, query string and anchor according to the URL
210      * encoding rules observed in real browsers.</p>
211      *
212      * <p>For example, this method changes <tt>"http://first/?a=b c"</tt> to <tt>"http://first/?a=b%20c"</tt>.</p>
213      *
214      * @param url the URL to encode
215      * @param minimalQueryEncoding whether or not to perform minimal query encoding, like IE does
216      * @param charset the charset
217      * @return the encoded URL
218      */
219     public static URL encodeUrl(final URL url, final boolean minimalQueryEncoding, final Charset charset) {
220         if (!isNormalUrlProtocol(URL_CREATOR.getProtocol(url))) {
221             return url; // javascript:, about:, data: and anything not supported like foo:
222         }
223 
224         try {
225             String path = url.getPath();
226             if (path != null) {
227                 path = encode(path, PATH_ALLOWED_CHARS, UTF_8);
228             }
229             String query = url.getQuery();
230             if (query != null) {
231                 if (minimalQueryEncoding) {
232                     query = org.apache.commons.lang3.StringUtils.replace(query, " ", "%20");
233                 }
234                 else {
235                     query = encode(query, QUERY_ALLOWED_CHARS, charset);
236                 }
237             }
238             String anchor = url.getRef();
239             if (anchor != null) {
240                 anchor = encode(anchor, ANCHOR_ALLOWED_CHARS, UTF_8);
241             }
242             return createNewUrl(url.getProtocol(), url.getUserInfo(), url.getHost(),
243                                 url.getPort(), path, anchor, query);
244         }
245         catch (final MalformedURLException e) {
246             // Impossible... I think.
247             throw new RuntimeException(e);
248         }
249     }
250 
251     /**
252      * Encodes and escapes the specified URI anchor string.
253      *
254      * @param anchor the anchor string to encode and escape
255      * @return the encoded and escaped anchor string
256      */
257     public static String encodeAnchor(String anchor) {
258         if (anchor != null) {
259             anchor = encode(anchor, ANCHOR_ALLOWED_CHARS, UTF_8);
260         }
261         return anchor;
262     }
263 
264     /**
265      * Encodes and escapes the specified URI anchor string.
266      *
267      * @param hash the anchor string to encode and escape
268      * @return the encoded and escaped anchor string
269      */
270     public static String encodeHash(String hash) {
271         if (hash != null) {
272             hash = encode(hash, HASH_ALLOWED_CHARS, UTF_8);
273         }
274         return hash;
275     }
276 
277     /**
278      * Unescapes and decodes the specified string.
279      *
280      * @param escaped the string to be unescaped and decoded
281      * @return the unescaped and decoded string
282      */
283     public static String decode(final String escaped) {
284         try {
285             final byte[] bytes = escaped.getBytes(US_ASCII);
286             final byte[] bytes2 = URLCodec.decodeUrl(bytes);
287             return new String(bytes2, UTF_8);
288         }
289         catch (final DecoderException e) {
290             // Should never happen.
291             throw new RuntimeException(e);
292         }
293     }
294 
295     /**
296      * Escapes and encodes the specified string. Based on HttpClient 3.1's <tt>URIUtil.encode()</tt> method.
297      *
298      * @param unescaped the string to encode
299      * @param allowed allowed characters that shouldn't be escaped
300      * @param charset the charset to use
301      * @return the escaped string
302      */
303     private static String encode(final String unescaped, final BitSet allowed, final Charset charset) {
304         final byte[] bytes = unescaped.getBytes(charset);
305         final byte[] bytes2 = URLCodec.encodeUrl(allowed, bytes);
306         return encodePercentSign(bytes2);
307     }
308 
309     /**
310      * Encodes every occurrence of the escape character '%' in the given input
311      * string that is not followed by two hexadecimal characters.
312      * @param str the input string
313      * @return the given input string where every occurrence of <code>%</code> in
314      * invalid escape sequences has been replace by <code>%25</code>
315      */
316     private static String encodePercentSign(final byte[] input) {
317         if (input == null) {
318             return null;
319         }
320 
321         final StringBuilder result = new StringBuilder(new String(input, US_ASCII));
322         int state = -0;
323         int offset = 0;
324         for (int i = 0; i < input.length; i++) {
325             final byte b = input[i];
326             if (b == '%') {
327                 state = 1;
328             }
329             else if (state == 1 || state == 2) {
330                 if (('0' <= b && b <= '9')
331                         || ('A' <= b && b <= 'F')
332                         || ('a' <= b && b <= 'f')) {
333                     state++;
334                     if (state == 3) {
335                         state = 0;
336                     }
337                 }
338                 else {
339                     final int st = i - state + offset;
340                     result.replace(st, st + 1, "%25");
341                     offset = offset + 2;
342                     state = 0;
343                 }
344             }
345         }
346         if (state == 1 || state == 2) {
347             final int st = input.length - state + offset;
348             result.replace(st, st + 1, "%25");
349         }
350         return result.toString();
351     }
352 
353     /**
354      * Creates and returns a new URL identical to the specified URL, except using the specified protocol.
355      * @param u the URL on which to base the returned URL
356      * @param newProtocol the new protocol to use in the returned URL
357      * @return a new URL identical to the specified URL, except using the specified protocol
358      * @throws MalformedURLException if there is a problem creating the new URL
359      */
360     public static URL getUrlWithNewProtocol(final URL u, final String newProtocol) throws MalformedURLException {
361         return createNewUrl(newProtocol, u.getAuthority(), u.getPath(), u.getRef(), u.getQuery());
362     }
363 
364     /**
365      * Creates and returns a new URL identical to the specified URL, except using the specified host.
366      * @param u the URL on which to base the returned URL
367      * @param newHost the new host to use in the returned URL
368      * @return a new URL identical to the specified URL, except using the specified host
369      * @throws MalformedURLException if there is a problem creating the new URL
370      */
371     public static URL getUrlWithNewHost(final URL u, final String newHost)
372         throws MalformedURLException {
373         return createNewUrl(u.getProtocol(), u.getUserInfo(), newHost,
374                             u.getPort(), u.getPath(), u.getRef(), u.getQuery());
375     }
376 
377     /**
378      * Creates and returns a new URL identical to the specified URL, except using the specified host.
379      * @param u the URL on which to base the returned URL
380      * @param newHost the new host to use in the returned URL
381      * @param newPort the new port to use in the returned URL
382      * @return a new URL identical to the specified URL, except using the specified host
383      * @throws MalformedURLException if there is a problem creating the new URL
384      */
385     public static URL getUrlWithNewHostAndPort(final URL u, final String newHost, final int newPort)
386         throws MalformedURLException {
387         return createNewUrl(u.getProtocol(), u.getUserInfo(), newHost, newPort, u.getPath(), u.getRef(), u.getQuery());
388     }
389 
390     /**
391      * Creates and returns a new URL identical to the specified URL, except using the specified port.
392      * @param u the URL on which to base the returned URL
393      * @param newPort the new port to use in the returned URL
394      * @return a new URL identical to the specified URL, except using the specified port
395      * @throws MalformedURLException if there is a problem creating the new URL
396      */
397     public static URL getUrlWithNewPort(final URL u, final int newPort) throws MalformedURLException {
398         return createNewUrl(u.getProtocol(), u.getUserInfo(), u.getHost(),
399                             newPort, u.getPath(), u.getRef(), u.getQuery());
400     }
401 
402     /**
403      * Creates and returns a new URL identical to the specified URL, except using the specified path.
404      * @param u the URL on which to base the returned URL
405      * @param newPath the new path to use in the returned URL
406      * @return a new URL identical to the specified URL, except using the specified path
407      * @throws MalformedURLException if there is a problem creating the new URL
408      */
409     public static URL getUrlWithNewPath(final URL u, final String newPath) throws MalformedURLException {
410         return createNewUrl(u.getProtocol(), u.getAuthority(), newPath, u.getRef(), u.getQuery());
411     }
412 
413     /**
414      * Creates and returns a new URL identical to the specified URL, except using the specified reference.
415      * @param u the URL on which to base the returned URL
416      * @param newRef the new reference to use in the returned URL
417      * @return a new URL identical to the specified URL, except using the specified reference
418      * @throws MalformedURLException if there is a problem creating the new URL
419      */
420     public static URL getUrlWithNewRef(final URL u, final String newRef) throws MalformedURLException {
421         return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), newRef, u.getQuery());
422     }
423 
424     /**
425      * Creates and returns a new URL identical to the specified URL, except using the specified query string.
426      * @param u the URL on which to base the returned URL
427      * @param newQuery the new query string to use in the returned URL
428      * @return a new URL identical to the specified URL, except using the specified query string
429      * @throws MalformedURLException if there is a problem creating the new URL
430      */
431     public static URL getUrlWithNewQuery(final URL u, final String newQuery) throws MalformedURLException {
432         return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), u.getRef(), newQuery);
433     }
434 
435     /**
436      * Creates a new URL based on the specified fragments.
437      * @param protocol the protocol to use (may not be {@code null})
438      * @param userInfo the user info to use (may be {@code null})
439      * @param host the host to use (may not be {@code null})
440      * @param port the port to use (may be <tt>-1</tt> if no port is specified)
441      * @param path the path to use (may be {@code null} and may omit the initial <tt>'/'</tt>)
442      * @param ref the reference to use (may be {@code null} and must not include the <tt>'#'</tt>)
443      * @param query the query to use (may be {@code null} and must not include the <tt>'?'</tt>)
444      * @return a new URL based on the specified fragments
445      * @throws MalformedURLException if there is a problem creating the new URL
446      */
447     private static URL createNewUrl(final String protocol, final String userInfo, final String host, final int port,
448             final String path, final String ref, final String query) throws MalformedURLException {
449         final StringBuilder s = new StringBuilder();
450         s.append(protocol);
451         s.append("://");
452         if (userInfo != null) {
453             s.append(userInfo).append("@");
454         }
455         s.append(host);
456         if (port != -1) {
457             s.append(":").append(port);
458         }
459         if (path != null && !path.isEmpty()) {
460             if (!('/' == path.charAt(0))) {
461                 s.append("/");
462             }
463             s.append(path);
464         }
465         if (query != null) {
466             s.append("?").append(query);
467         }
468         if (ref != null) {
469             if (ref.isEmpty() || ref.charAt(0) != '#') {
470                 s.append("#");
471             }
472             s.append(ref);
473         }
474 
475         final URL url = new URL(s.toString());
476         return url;
477     }
478 
479     /**
480      * Creates a new URL based on the specified fragments.
481      * @param protocol the protocol to use (may not be {@code null})
482      * @param authority the authority to use (may not be {@code null})
483      * @param path the path to use (may be {@code null} and may omit the initial <tt>'/'</tt>)
484      * @param ref the reference to use (may be {@code null} and must not include the <tt>'#'</tt>)
485      * @param query the query to use (may be {@code null} and must not include the <tt>'?'</tt>)
486      * @return a new URL based on the specified fragments
487      * @throws MalformedURLException if there is a problem creating the new URL
488      */
489     private static URL createNewUrl(final String protocol, final String authority,
490             final String path, final String ref, final String query) throws MalformedURLException {
491 
492         // pre-compute length of StringBuilder
493         int len = protocol.length() + 1;
494         if (authority != null && !authority.isEmpty()) {
495             len += 2 + authority.length();
496         }
497         if (path != null) {
498             len += path.length();
499         }
500         if (query != null) {
501             len += 1 + query.length();
502         }
503         if (ref != null) {
504             len += 1 + ref.length();
505         }
506 
507         final StringBuilder s = new StringBuilder(len);
508         s.append(protocol);
509         s.append(":");
510         if (authority != null && !authority.isEmpty()) {
511             s.append("//");
512             s.append(authority);
513         }
514         if (path != null) {
515             s.append(path);
516         }
517         if (query != null) {
518             s.append('?');
519             s.append(query);
520         }
521         if (ref != null) {
522             if (ref.isEmpty() || ref.charAt(0) != '#') {
523                 s.append("#");
524             }
525             s.append(ref);
526         }
527 
528         return new URL(s.toString());
529     }
530 
531     /**
532      * Resolves a given relative URL against a base URL. See
533      * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
534      * Section 4 for more details.
535      *
536      * @param baseUrl     The base URL in which to resolve the specification.
537      * @param relativeUrl The relative URL to resolve against the base URL.
538      * @return the resolved specification.
539      */
540     public static String resolveUrl(final String baseUrl, final String relativeUrl) {
541         if (baseUrl == null) {
542             throw new IllegalArgumentException("Base URL must not be null");
543         }
544         if (relativeUrl == null) {
545             throw new IllegalArgumentException("Relative URL must not be null");
546         }
547         final Url url = resolveUrl(parseUrl(baseUrl.trim()), relativeUrl.trim());
548 
549         return url.toString();
550     }
551 
552     /**
553      * Resolves a given relative URL against a base URL. See
554      * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
555      * Section 4 for more details.
556      *
557      * @param baseUrl     The base URL in which to resolve the specification.
558      * @param relativeUrl The relative URL to resolve against the base URL.
559      * @return the resolved specification.
560      */
561     public static String resolveUrl(final URL baseUrl, final String relativeUrl) {
562         if (baseUrl == null) {
563             throw new IllegalArgumentException("Base URL must not be null");
564         }
565         return resolveUrl(baseUrl.toExternalForm(), relativeUrl);
566     }
567 
568     /**
569      * Parses a given specification using the algorithm depicted in
570      * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
571      *
572      * Section 2.4: Parsing a URL
573      *
574      *   An accepted method for parsing URLs is useful to clarify the
575      *   generic-RL syntax of Section 2.2 and to describe the algorithm for
576      *   resolving relative URLs presented in Section 4. This section
577      *   describes the parsing rules for breaking down a URL (relative or
578      *   absolute) into the component parts described in Section 2.1.  The
579      *   rules assume that the URL has already been separated from any
580      *   surrounding text and copied to a "parse string". The rules are
581      *   listed in the order in which they would be applied by the parser.
582      *
583      * @param spec The specification to parse.
584      * @return the parsed specification.
585      */
586     private static Url parseUrl(final String spec) {
587         final Url url = new Url();
588         int startIndex = 0;
589         int endIndex = spec.length();
590 
591         // Section 2.4.1: Parsing the Fragment Identifier
592         //
593         //   If the parse string contains a crosshatch "#" character, then the
594         //   substring after the first (left-most) crosshatch "#" and up to the
595         //   end of the parse string is the <fragment> identifier. If the
596         //   crosshatch is the last character, or no crosshatch is present, then
597         //   the fragment identifier is empty. The matched substring, including
598         //   the crosshatch character, is removed from the parse string before
599         //   continuing.
600         //
601         //   Note that the fragment identifier is not considered part of the URL.
602         //   However, since it is often attached to the URL, parsers must be able
603         //   to recognize and set aside fragment identifiers as part of the
604         //   process.
605         final int crosshatchIndex = StringUtils.indexOf(spec, '#', startIndex, endIndex);
606 
607         if (crosshatchIndex >= 0) {
608             url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
609             endIndex = crosshatchIndex;
610         }
611         // Section 2.4.2: Parsing the Scheme
612         //
613         //   If the parse string contains a colon ":" after the first character
614         //   and before any characters not allowed as part of a scheme name (i.e.,
615         //   any not an alphanumeric, plus "+", period ".", or hyphen "-"), the
616         //   <scheme> of the URL is the substring of characters up to but not
617         //   including the first colon. These characters and the colon are then
618         //   removed from the parse string before continuing.
619         final int colonIndex = StringUtils.indexOf(spec, ':', startIndex, endIndex);
620 
621         if (colonIndex > 0) {
622             final String scheme = spec.substring(startIndex, colonIndex);
623             if (isValidScheme(scheme)) {
624                 url.scheme_ = scheme;
625                 startIndex = colonIndex + 1;
626             }
627         }
628         // Section 2.4.3: Parsing the Network Location/Login
629         //
630         //   If the parse string begins with a double-slash "//", then the
631         //   substring of characters after the double-slash and up to, but not
632         //   including, the next slash "/" character is the network location/login
633         //   (<net_loc>) of the URL. If no trailing slash "/" is present, the
634         //   entire remaining parse string is assigned to <net_loc>. The double-
635         //   slash and <net_loc> are removed from the parse string before
636         //   continuing.
637         //
638         // Note: We also accept a question mark "?" or a semicolon ";" character as
639         //       delimiters for the network location/login (<net_loc>) of the URL.
640         final int locationStartIndex;
641         int locationEndIndex;
642 
643         if (spec.startsWith("//", startIndex)) {
644             locationStartIndex = startIndex + 2;
645             locationEndIndex = StringUtils.indexOf(spec, '/', locationStartIndex, endIndex);
646             if (locationEndIndex >= 0) {
647                 startIndex = locationEndIndex;
648             }
649         }
650         else {
651             locationStartIndex = -1;
652             locationEndIndex = -1;
653         }
654         // Section 2.4.4: Parsing the Query Information
655         //
656         //   If the parse string contains a question mark "?" character, then the
657         //   substring after the first (left-most) question mark "?" and up to the
658         //   end of the parse string is the <query> information. If the question
659         //   mark is the last character, or no question mark is present, then the
660         //   query information is empty. The matched substring, including the
661         //   question mark character, is removed from the parse string before
662         //   continuing.
663         final int questionMarkIndex = StringUtils.indexOf(spec, '?', startIndex, endIndex);
664 
665         if (questionMarkIndex >= 0) {
666             if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
667                 // The substring of characters after the double-slash and up to, but not
668                 // including, the question mark "?" character is the network location/login
669                 // (<net_loc>) of the URL.
670                 locationEndIndex = questionMarkIndex;
671                 startIndex = questionMarkIndex;
672             }
673             url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
674             endIndex = questionMarkIndex;
675         }
676         // Section 2.4.5: Parsing the Parameters
677         //
678         //   If the parse string contains a semicolon ";" character, then the
679         //   substring after the first (left-most) semicolon ";" and up to the end
680         //   of the parse string is the parameters (<params>). If the semicolon
681         //   is the last character, or no semicolon is present, then <params> is
682         //   empty. The matched substring, including the semicolon character, is
683         //   removed from the parse string before continuing.
684         final int semicolonIndex = StringUtils.indexOf(spec, ';', startIndex, endIndex);
685 
686         if (semicolonIndex >= 0) {
687             if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
688                 // The substring of characters after the double-slash and up to, but not
689                 // including, the semicolon ";" character is the network location/login
690                 // (<net_loc>) of the URL.
691                 locationEndIndex = semicolonIndex;
692                 startIndex = semicolonIndex;
693             }
694             url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
695             endIndex = semicolonIndex;
696         }
697         // Section 2.4.6: Parsing the Path
698         //
699         //   After the above steps, all that is left of the parse string is the
700         //   URL <path> and the slash "/" that may precede it. Even though the
701         //   initial slash is not part of the URL path, the parser must remember
702         //   whether or not it was present so that later processes can
703         //   differentiate between relative and absolute paths. Often this is
704         //   done by simply storing the preceding slash along with the path.
705         if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
706             // The entire remaining parse string is assigned to the network
707             // location/login (<net_loc>) of the URL.
708             locationEndIndex = endIndex;
709         }
710         else if (startIndex < endIndex) {
711             url.path_ = spec.substring(startIndex, endIndex);
712         }
713         // Set the network location/login (<net_loc>) of the URL.
714         if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
715             url.location_ = spec.substring(locationStartIndex, locationEndIndex);
716         }
717         return url;
718     }
719 
720     /*
721      * Returns true if specified string is a valid scheme name.
722      */
723     private static boolean isValidScheme(final String scheme) {
724         final int length = scheme.length();
725         if (length < 1) {
726             return false;
727         }
728         char c = scheme.charAt(0);
729         if (!Character.isLetter(c)) {
730             return false;
731         }
732         for (int i = 1; i < length; i++) {
733             c = scheme.charAt(i);
734             if (!Character.isLetterOrDigit(c) && c != '.' && c != '+' && c != '-') {
735                 return false;
736             }
737         }
738         return true;
739     }
740 
741     /**
742      * Resolves a given relative URL against a base URL using the algorithm
743      * depicted in <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
744      *
745      * Section 4: Resolving Relative URLs
746      *
747      *   This section describes an example algorithm for resolving URLs within
748      *   a context in which the URLs may be relative, such that the result is
749      *   always a URL in absolute form. Although this algorithm cannot
750      *   guarantee that the resulting URL will equal that intended by the
751      *   original author, it does guarantee that any valid URL (relative or
752      *   absolute) can be consistently transformed to an absolute form given a
753      *   valid base URL.
754      *
755      * @param baseUrl     The base URL in which to resolve the specification.
756      * @param relativeUrl The relative URL to resolve against the base URL.
757      * @return the resolved specification.
758      */
759     private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
760         final Url url = parseUrl(relativeUrl);
761         // Step 1: The base URL is established according to the rules of
762         //         Section 3.  If the base URL is the empty string (unknown),
763         //         the embedded URL is interpreted as an absolute URL and
764         //         we are done.
765         if (baseUrl == null) {
766             return url;
767         }
768         // Step 2: Both the base and embedded URLs are parsed into their
769         //         component parts as described in Section 2.4.
770         //      a) If the embedded URL is entirely empty, it inherits the
771         //         entire base URL (i.e., is set equal to the base URL)
772         //         and we are done.
773         if (relativeUrl.isEmpty()) {
774             return new Url(baseUrl);
775         }
776         //      b) If the embedded URL starts with a scheme name, it is
777         //         interpreted as an absolute URL and we are done.
778         if (url.scheme_ != null) {
779             return url;
780         }
781         //      c) Otherwise, the embedded URL inherits the scheme of
782         //         the base URL.
783         url.scheme_ = baseUrl.scheme_;
784         // Step 3: If the embedded URL's <net_loc> is non-empty, we skip to
785         //         Step 7.  Otherwise, the embedded URL inherits the <net_loc>
786         //         (if any) of the base URL.
787         if (url.location_ != null) {
788             return url;
789         }
790         url.location_ = baseUrl.location_;
791         // Step 4: If the embedded URL path is preceded by a slash "/", the
792         //         path is not relative and we skip to Step 7.
793         if (url.path_ != null && !url.path_.isEmpty() && url.path_.charAt(0) == '/') {
794             url.path_ = removeLeadingSlashPoints(url.path_);
795             return url;
796         }
797         // Step 5: If the embedded URL path is empty (and not preceded by a
798         //         slash), then the embedded URL inherits the base URL path,
799         //         and
800         if (url.path_ == null) {
801             url.path_ = baseUrl.path_;
802             //  a) if the embedded URL's <params> is non-empty, we skip to
803             //     step 7; otherwise, it inherits the <params> of the base
804             //     URL (if any) and
805             if (url.parameters_ != null) {
806                 return url;
807             }
808             url.parameters_ = baseUrl.parameters_;
809             //  b) if the embedded URL's <query> is non-empty, we skip to
810             //     step 7; otherwise, it inherits the <query> of the base
811             //     URL (if any) and we skip to step 7.
812             if (url.query_ != null) {
813                 return url;
814             }
815             url.query_ = baseUrl.query_;
816             return url;
817         }
818         // Step 6: The last segment of the base URL's path (anything
819         //         following the rightmost slash "/", or the entire path if no
820         //         slash is present) is removed and the embedded URL's path is
821         //         appended in its place.  The following operations are
822         //         then applied, in order, to the new path:
823         final String basePath = baseUrl.path_;
824         String path = "";
825 
826         if (basePath != null) {
827             final int lastSlashIndex = basePath.lastIndexOf('/');
828 
829             if (lastSlashIndex >= 0) {
830                 path = basePath.substring(0, lastSlashIndex + 1);
831             }
832         }
833         else {
834             path = "/";
835         }
836         path = path.concat(url.path_);
837         //      a) All occurrences of "./", where "." is a complete path
838         //         segment, are removed.
839         int pathSegmentIndex;
840 
841         while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
842             path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3));
843         }
844         //      b) If the path ends with "." as a complete path segment,
845         //         that "." is removed.
846         if (path.endsWith("/.")) {
847             path = path.substring(0, path.length() - 1);
848         }
849         //      c) All occurrences of "<segment>/../", where <segment> is a
850         //         complete path segment not equal to "..", are removed.
851         //         Removal of these path segments is performed iteratively,
852         //         removing the leftmost matching pattern on each iteration,
853         //         until no matching pattern remains.
854         while ((pathSegmentIndex = path.indexOf("/../")) > 0) {
855             final String pathSegment = path.substring(0, pathSegmentIndex);
856             final int slashIndex = pathSegment.lastIndexOf('/');
857 
858             if (slashIndex >= 0) {
859                 if (!"..".equals(pathSegment.substring(slashIndex))) {
860                     path = path.substring(0, slashIndex + 1).concat(path.substring(pathSegmentIndex + 4));
861                 }
862             }
863             else {
864                 path = path.substring(pathSegmentIndex + 4);
865             }
866         }
867         //      d) If the path ends with "<segment>/..", where <segment> is a
868         //         complete path segment not equal to "..", that
869         //         "<segment>/.." is removed.
870         if (path.endsWith("/..")) {
871             final String pathSegment = path.substring(0, path.length() - 3);
872             final int slashIndex = pathSegment.lastIndexOf('/');
873 
874             if (slashIndex >= 0) {
875                 path = path.substring(0, slashIndex + 1);
876             }
877         }
878 
879         path = removeLeadingSlashPoints(path);
880 
881         url.path_ = path;
882         // Step 7: The resulting URL components, including any inherited from
883         //         the base URL, are recombined to give the absolute form of
884         //         the embedded URL.
885         return url;
886     }
887 
888     /**
889      * "/.." at the beginning should be removed as browsers do (not in RFC)
890      */
891     private static String removeLeadingSlashPoints(String path) {
892         while (path.startsWith("/..")) {
893             path = path.substring(3);
894         }
895 
896         return path;
897     }
898 
899     /**
900      * Class <tt>Url</tt> represents a Uniform Resource Locator.
901      *
902      * @author Martin Tamme
903      */
904     private static class Url {
905 
906         private String scheme_;
907         private String location_;
908         private String path_;
909         private String parameters_;
910         private String query_;
911         private String fragment_;
912 
913         /**
914          * Creates a <tt>Url</tt> object.
915          */
916         Url() {
917         }
918 
919         /**
920          * Creates a <tt>Url</tt> object from the specified
921          * <tt>Url</tt> object.
922          *
923          * @param url a <tt>Url</tt> object.
924          */
925         Url(final Url url) {
926             scheme_ = url.scheme_;
927             location_ = url.location_;
928             path_ = url.path_;
929             parameters_ = url.parameters_;
930             query_ = url.query_;
931             fragment_ = url.fragment_;
932         }
933 
934         /**
935          * Returns a string representation of the <tt>Url</tt> object.
936          *
937          * @return a string representation of the <tt>Url</tt> object.
938          */
939         @Override
940         public String toString() {
941             final StringBuilder sb = new StringBuilder();
942 
943             if (scheme_ != null) {
944                 sb.append(scheme_);
945                 sb.append(':');
946             }
947             if (location_ != null) {
948                 sb.append("//");
949                 sb.append(location_);
950             }
951             if (path_ != null) {
952                 sb.append(path_);
953             }
954             if (parameters_ != null) {
955                 sb.append(';');
956                 sb.append(parameters_);
957             }
958             if (query_ != null) {
959                 sb.append('?');
960                 sb.append(query_);
961             }
962             if (fragment_ != null) {
963                 sb.append('#');
964                 sb.append(fragment_);
965             }
966             return sb.toString();
967         }
968     }
969 
970     static boolean isNormalUrlProtocol(final String protocol) {
971         if ("http".equals(protocol) || "https".equals(protocol) || "file".equals(protocol)) {
972             return true;
973         }
974         return false;
975     }
976 
977     /**
978      * More or less the same as sameFile(URL, URL) but without
979      * resolving the host to an IP address for comparing.
980      * Additionally we do some path normalization.
981      *
982      * @param u1 a URL object
983      * @param u2 a URL object
984      * @return true if u1 and u2 refer to the same file
985      */
986     public static boolean sameFile(final URL u1, final URL u2) {
987         if (u1 == u2) {
988             return true;
989         }
990         if (u1 == null || u2 == null) {
991             return false;
992         }
993 
994         // Compare the protocols.
995         final String p1 = u1.getProtocol();
996         final String p2 = u2.getProtocol();
997         if (!(p1 == p2 || (p1 != null && p1.equalsIgnoreCase(p2)))) {
998             return false;
999         }
1000 
1001         // Compare the ports.
1002         final int port1 = (u1.getPort() != -1) ? u1.getPort() : u1.getDefaultPort();
1003         final int port2 = (u2.getPort() != -1) ? u2.getPort() : u2.getDefaultPort();
1004         if (port1 != port2) {
1005             return false;
1006         }
1007 
1008         // Compare the hosts.
1009         final String h1 = u1.getHost();
1010         final String h2 = u2.getHost();
1011         if (!(h1 == h2 || (h1 != null && h1.equalsIgnoreCase(h2)))) {
1012             return false;
1013         }
1014 
1015         // Compare the files.
1016         String f1 = u1.getFile();
1017         if (f1.isEmpty()) {
1018             f1 = "/";
1019         }
1020         String f2 = u2.getFile();
1021         if (f2.isEmpty()) {
1022             f2 = "/";
1023         }
1024         if (f1.indexOf('.') > 0 || f2.indexOf('.') > 0) {
1025             try {
1026                 f1 = u1.toURI().normalize().toURL().getFile();
1027                 f2 = u2.toURI().normalize().toURL().getFile();
1028             }
1029             catch (final Exception e) {
1030                 // ignore
1031             }
1032         }
1033         if (!(f1 == f2 || (f1 != null && f1.equals(f2)))) {
1034             return false;
1035         }
1036 
1037         return true;
1038     }
1039 
1040     /**
1041      * Helper that constructs a normalized url string
1042      * usable as cache key.
1043      *
1044      * @param url a URL object
1045      * @return the normalized string
1046      */
1047     public static String normalize(final URL url) {
1048         final StringBuilder result = new StringBuilder();
1049 
1050         result.append(url.getProtocol());
1051         result.append("://");
1052         result.append(url.getHost());
1053         result.append(':');
1054         result.append((url.getPort() != -1) ? url.getPort() : url.getDefaultPort());
1055 
1056         // Compare the files.
1057         String f = url.getFile();
1058         if (f.isEmpty()) {
1059             result.append("/");
1060         }
1061         else {
1062             if (f.indexOf('.') > 0) {
1063                 try {
1064                     f = url.toURI().normalize().toURL().getFile();
1065                 }
1066                 catch (final Exception e) {
1067                     // ignore
1068                 }
1069             }
1070             result.append(f);
1071         }
1072 
1073         return result.toString();
1074     }
1075 
1076     /**
1077      * Constructs a {@link URI} using the specified URL.
1078      *
1079      * @param url the URL
1080      * @param query the query
1081      *
1082      * @throws URISyntaxException
1083      *         If both a scheme and a path are given but the path is
1084      *         relative, if the URI string constructed from the given
1085      *         components violates RFC&nbsp;2396, or if the authority
1086      *         component of the string is present but cannot be parsed
1087      *         as a server-based authority
1088      * @return the URI
1089      */
1090     public static URI toURI(final URL url, final String query) throws URISyntaxException {
1091         final String scheme = url.getProtocol();
1092         final String host = url.getHost();
1093         final int port = url.getPort();
1094         final String path = url.getPath();
1095         final StringBuilder buffer = new StringBuilder();
1096         if (host != null) {
1097             if (scheme != null) {
1098                 buffer.append(scheme);
1099                 buffer.append("://");
1100             }
1101             buffer.append(host);
1102             if (port > 0) {
1103                 buffer.append(':');
1104                 buffer.append(port);
1105             }
1106         }
1107         if (path == null || !path.startsWith("/")) {
1108             buffer.append('/');
1109         }
1110         if (path != null) {
1111             buffer.append(path);
1112         }
1113         if (query != null) {
1114             buffer.append('?');
1115             buffer.append(query);
1116         }
1117         return new URI(buffer.toString());
1118     }
1119 }