001    /* URI.java -- An URI class
002       Copyright (C) 2002, 2004, 2005, 2006, 2008  Free Software Foundation, Inc.
003    
004    This file is part of GNU Classpath.
005    
006    GNU Classpath is free software; you can redistribute it and/or modify
007    it under the terms of the GNU General Public License as published by
008    the Free Software Foundation; either version 2, or (at your option)
009    any later version.
010    
011    GNU Classpath is distributed in the hope that it will be useful, but
012    WITHOUT ANY WARRANTY; without even the implied warranty of
013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014    General Public License for more details.
015    
016    You should have received a copy of the GNU General Public License
017    along with GNU Classpath; see the file COPYING.  If not, write to the
018    Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
019    02110-1301 USA.
020    
021    Linking this library statically or dynamically with other modules is
022    making a combined work based on this library.  Thus, the terms and
023    conditions of the GNU General Public License cover the whole
024    combination.
025    
026    As a special exception, the copyright holders of this library give you
027    permission to link this library with independent modules to produce an
028    executable, regardless of the license terms of these independent
029    modules, and to copy and distribute the resulting executable under
030    terms of your choice, provided that you also meet, for each linked
031    independent module, the terms and conditions of the license of that
032    module.  An independent module is a module which is not derived from
033    or based on this library.  If you modify this library, you may extend
034    this exception to your version of the library, but you are not
035    obligated to do so.  If you do not wish to do so, delete this
036    exception statement from your version. */
037    
038    
039    package java.net;
040    
041    import gnu.java.lang.CPStringBuilder;
042    
043    import java.io.IOException;
044    import java.io.ObjectInputStream;
045    import java.io.ObjectOutputStream;
046    import java.io.Serializable;
047    import java.util.regex.Matcher;
048    import java.util.regex.Pattern;
049    
050    /**
051     * <p>
052     * A URI instance represents that defined by
053     * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC3986</a>,
054     * with some deviations.
055     * </p>
 * <p>
 * At its highest level, a URI consists of:
 * </p>
 * <p>
 * <code>[<em>scheme</em><strong>:</strong>]<em>scheme-specific-part</em>
 * [<strong>#</strong><em>fragment</em>]</code>
 * </p>
062     * <p>
063     * where <strong>#</strong> and <strong>:</strong> are literal characters,
064     * and those parts enclosed in square brackets are optional.
065     * </p>
 * <p>
 * There are two main types of URI.  An <em>opaque</em> URI is one
 * which just consists of the above three parts, and is not further
 * defined.  An example of such a URI would be a <em>mailto:</em> URI.
 * In contrast, <em>hierarchical</em> URIs give further definition
 * to the scheme-specific part, so as to represent some part of a
 * hierarchical structure.  Their scheme-specific part takes the form:
 * </p>
074     * <p>
075     * <code>[<strong>//</strong><em>authority</em>][<em>path</em>]
076     * [<strong>?</strong><em>query</em>]</code>
077     * </p>
078     * <p>
079     * with <strong>/</strong> and <strong>?</strong> being literal characters.
080     * When server-based, the authority section is further subdivided into:
081     * </p>
082     * <p>
083     * <code>[<em>user-info</em><strong>@</strong>]<em>host</em>
084     * [<strong>:</strong><em>port</em>]</code>
085     * </p>
086     * <p>
087     * with <strong>@</strong> and <strong>:</strong> as literal characters.
088     * Authority sections that are not server-based are said to be registry-based.
089     * </p>
 * <p>
 * Hierarchical URIs can be either relative or absolute.  An absolute
 * hierarchical URI specifies a scheme and has a scheme-specific part that
 * starts with a `<strong>/</strong>', while relative URIs don't
 * specify a scheme.  Opaque URIs are always absolute.
 * </p>
095     * <p>
096     * Each part of the URI may have one of three states: undefined, empty
097     * or containing some content.  The former two of these are represented
098     * by <code>null</code> and the empty string in Java, respectively.
099     * The scheme-specific part may never be undefined.  It also follows from
100     * this that the path sub-part may also not be undefined, so as to ensure
101     * the former.
102     * </p>
103     * <h2>Character Escaping and Quoting</h2>
104     * <p>
105     * The characters that can be used within a valid URI are restricted.
106     * There are two main classes of characters which can't be used as is
107     * within the URI:
108     * </p>
109     * <ol>
110     * <li><strong>Characters outside the US-ASCII character set</strong>.
111     * These have to be <strong>escaped</strong> in order to create
112     * an RFC-compliant URI; this means replacing the character with the
113     * appropriate hexadecimal value, preceded by a `%'.</li>
114     * <li><strong>Illegal characters</strong> (e.g. space characters,
115     * control characters) are quoted, which results in them being encoded
116     * in the same way as non-US-ASCII characters.</li>
117     * </ol>
118     * <p>
119     * The set of valid characters differs depending on the section of the URI:
120     * </p>
121     * <ul>
122     * <li><strong>Scheme</strong>: Must be an alphanumeric, `-', `.' or '+'.</li>
123     * <li><strong>Authority</strong>:Composed of the username, host, port, `@'
124     * and `:'.</li>
125     * <li><strong>Username</strong>: Allows unreserved or percent-encoded
126     * characters, sub-delimiters and `:'.</li>
127     * <li><strong>Host</strong>: Allows unreserved or percent-encoded
128     * characters, sub-delimiters and square brackets (`[' and `]') for IPv6
129     * addresses.</li>
130     * <li><strong>Port</strong>: Digits only.</li>
 * <li><strong>Path</strong>: Allows the path characters and `/'.</li>
 * <li><strong>Query</strong>: Allows the path characters, `?' and '/'.</li>
 * <li><strong>Fragment</strong>: Allows the path characters, `?' and '/'.</li>
134     * </ul>
135     * <p>
136     * These definitions reference the following sets of characters:
137     * </p>
138     * <ul>
139     * <li><strong>Unreserved characters</strong>: The alphanumerics plus
140     * `-', `.', `_', and `~'.</li>
141     * <li><strong>Sub-delimiters</strong>: `!', `$', `&', `(', `)', `*',
142     * `+', `,', `;', `=' and the single-quote itself.</li>
143     * <li><strong>Path characters</strong>: Unreserved and percent-encoded
144     * characters and the sub-delimiters along with `@' and `:'.</li>
145     * </ul>
146     * <p>
147     * The constructors and accessor methods allow the use and retrieval of
148     * URI components which contain non-US-ASCII characters directly.
149     * They are only escaped when the <code>toASCIIString()</code> method
150     * is used.  In contrast, illegal characters are always quoted, with the
151     * exception of the return values of the non-raw accessors.
152     * </p>
153     *
154     * @author Ito Kazumitsu (ito.kazumitsu@hitachi-cable.co.jp)
155     * @author Dalibor Topic (robilad@kaffe.org)
156     * @author Michael Koch (konqueror@gmx.de)
157     * @author Andrew John Hughes (gnu_andrew@member.fsf.org)
158     * @since 1.4
159     */
160    public final class URI
161      implements Comparable<URI>, Serializable
162    {
  /**
   * For serialization compatibility.
   */
  static final long serialVersionUID = -6052424284110960213L;
167    
  /**
   * Regular expression for parsing URIs.
   *
   * Taken from RFC 2396, Appendix B.
   * This expression doesn't parse IPv6 addresses.
   *
   * The capture groups of interest are 2 (scheme), 3 (scheme-specific
   * part), 5 (authority), 6 (path), 8 (query) and 10 (fragment).
   * Groups 1, 4, 7 and 9 wrap the scheme, authority, query and fragment
   * together with their delimiter (`:', `//', `?' and `#' respectively).
   */
  private static final String URI_REGEXP =
    "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?";

  /**
   * Regular expression for parsing the authority segment.
   *
   * Group 2 is the user information (group 1 includes the trailing `@'),
   * group 3 is the host and group 5 is the port (group 4 includes the
   * leading `:').  The port group only accepts decimal digits.
   */
  private static final String AUTHORITY_REGEXP =
    "(([^?#]*)@)?([^?#:]*)(:([0-9]*))?";
182    
  /**
   * Valid characters (taken from rfc2396/3986).
   *
   * Each constant is a string used as a set of characters; the sets
   * are built up by concatenation from the smaller ones.
   */
  private static final String RFC2396_DIGIT = "0123456789";
  private static final String RFC2396_LOWALPHA = "abcdefghijklmnopqrstuvwxyz";
  private static final String RFC2396_UPALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  private static final String RFC2396_ALPHA =
    RFC2396_LOWALPHA + RFC2396_UPALPHA;
  private static final String RFC2396_ALPHANUM = RFC2396_DIGIT + RFC2396_ALPHA;
  // Alphanumerics plus `-', `.', `_' and `~'.
  private static final String RFC3986_UNRESERVED = RFC2396_ALPHANUM + "-._~";
  private static final String RFC3986_SUBDELIMS = "!$&'()*+,;=";
  // Unreserved characters, sub-delimiters and the escape introducer `%'.
  private static final String RFC3986_REG_NAME =
    RFC3986_UNRESERVED + RFC3986_SUBDELIMS + "%";
  // As RFC3986_REG_NAME, with `:' and `@' added.
  private static final String RFC3986_PCHAR = RFC3986_UNRESERVED +
    RFC3986_SUBDELIMS + ":@%";
  private static final String RFC3986_SEGMENT = RFC3986_PCHAR;
  private static final String RFC3986_PATH_SEGMENTS = RFC3986_SEGMENT + "/";
  // Path characters plus `?' and `/'; the set used by quote(String).
  private static final String RFC3986_SSP = RFC3986_PCHAR + "?/";
  // Registered-name characters plus the square brackets.
  private static final String RFC3986_HOST = RFC3986_REG_NAME + "[]";
  // Registered-name characters plus `:'.
  private static final String RFC3986_USERINFO = RFC3986_REG_NAME + ":";
203    
  /**
   * Capture-group index of the scheme component in {@link #URI_REGEXP}.
   */
  private static final int SCHEME_GROUP = 2;

  /**
   * Capture-group index of the scheme-specific part in
   * {@link #URI_REGEXP}.
   */
  private static final int SCHEME_SPEC_PART_GROUP = 3;

  /**
   * Capture-group index of the authority component in
   * {@link #URI_REGEXP}.
   */
  private static final int AUTHORITY_GROUP = 5;

  /**
   * Capture-group index of the path component in {@link #URI_REGEXP}.
   */
  private static final int PATH_GROUP = 6;

  /**
   * Capture-group index of the query component in {@link #URI_REGEXP}.
   */
  private static final int QUERY_GROUP = 8;

  /**
   * Capture-group index of the fragment component in
   * {@link #URI_REGEXP}.
   */
  private static final int FRAGMENT_GROUP = 10;

  /**
   * Capture-group index of the userinfo component in
   * {@link #AUTHORITY_REGEXP}.
   */
  private static final int AUTHORITY_USERINFO_GROUP = 2;

  /**
   * Capture-group index of the host component in
   * {@link #AUTHORITY_REGEXP}.
   */
  private static final int AUTHORITY_HOST_GROUP = 3;

  /**
   * Capture-group index of the port component in
   * {@link #AUTHORITY_REGEXP}.
   */
  private static final int AUTHORITY_PORT_GROUP = 5;
248    
  /**
   * The compiled version of the URI regular expression
   * ({@link #URI_REGEXP}); assigned in the static initializer.
   */
  private static final Pattern URI_PATTERN;

  /**
   * The compiled version of the authority regular expression
   * ({@link #AUTHORITY_REGEXP}); assigned in the static initializer.
   */
  private static final Pattern AUTHORITY_PATTERN;

  /**
   * The set of valid hexadecimal characters, indexed by digit value.
   * Used to emit the two digits of a `%' escape when quoting.
   */
  private static final String HEX = "0123456789ABCDEF";
263    
  // Parsed components.  In parseURI() each raw* field receives the text
  // matched in the URI string and its counterpart without the prefix
  // receives the result of unquote() on that text.  All of them are
  // transient: writeObject() stores only 'string', and readObject()
  // re-derives them by parsing it again.
  private transient String scheme;
  private transient String rawSchemeSpecificPart;
  private transient String schemeSpecificPart;
  private transient String rawAuthority;
  private transient String authority;
  private transient String rawUserInfo;
  private transient String userInfo;
  private transient String rawHost;
  private transient String host;
  // Initialised to -1; overwritten by parseServerAuthority() when the
  // authority carries a non-empty port.
  private transient int port = -1;
  private transient String rawPath;
  private transient String path;
  private transient String rawQuery;
  private transient String query;
  private transient String rawFragment;
  private transient String fragment;
  // The URI in string form; the only non-transient field.
  private String string;
281    
  /**
   * Static initializer to pre-compile the regular expressions, so that
   * {@link #URI_PATTERN} and {@link #AUTHORITY_PATTERN} are compiled once
   * per class rather than on every parse.
   */
  static
  {
    URI_PATTERN = Pattern.compile(URI_REGEXP);
    AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEXP);
  }
290    
291      private void readObject(ObjectInputStream is)
292        throws ClassNotFoundException, IOException
293      {
294        this.string = (String) is.readObject();
295        try
296          {
297            parseURI(this.string);
298          }
299        catch (URISyntaxException x)
300          {
301            // Should not happen.
302            throw new RuntimeException(x);
303          }
304      }
305    
306      private void writeObject(ObjectOutputStream os) throws IOException
307      {
308        if (string == null)
309          string = toString();
310        os.writeObject(string);
311      }
312    
313      /**
314       * <p>
315       * Returns the string content of the specified group of the supplied
316       * matcher.  The returned value is modified according to the following:
317       * </p>
318       * <ul>
319       * <li>If the resulting string has a length greater than 0, then
320       * that string is returned.</li>
321       * <li>If a string of zero length, is matched, then the content
322       * of the preceding group is considered.  If this is also an empty
323       * string, then <code>null</code> is returned to indicate an undefined
324       * value.  Otherwise, the value is truly the empty string and this is
325       * the returned value.</li>
326       * </ul>
327       * <p>
328       * This method is used for matching against all parts of the URI
329       * that may be either undefined or empty (i.e. all those but the
330       * scheme-specific part and the path).  In each case, the preceding
331       * group is the content of the original group, along with some
332       * additional distinguishing feature.  For example, the preceding
333       * group for the query includes the preceding question mark,
334       * while that of the fragment includes the hash symbol.  The presence
335       * of these features enables disambiguation between the two cases
336       * of a completely unspecified value and a simple non-existant value.
337       * The scheme differs in that it will never return an empty string;
338       * the delimiter follows the scheme rather than preceding it, so
339       * it becomes part of the following section.  The same is true
340       * of the user information.
341       * </p>
342       *
343       * @param match the matcher, which contains the results of the URI
344       *              matched against the URI regular expression.
345       * @return either the matched content, <code>null</code> for undefined
346       *         values, or an empty string for a URI part with empty content.
347       */
348      private static String getURIGroup(Matcher match, int group)
349      {
350        String matched = match.group(group);
351        if (matched == null || matched.length() == 0)
352          {
353            String prevMatched = match.group(group -1);
354            if (prevMatched == null || prevMatched.length() == 0)
355              return null;
356            else
357              return "";
358          }
359        return matched;
360      }
361    
362      /**
363       * Sets fields of this URI by parsing the given string.
364       *
365       * @param str The string to parse
366       *
367       * @exception URISyntaxException If the given string violates RFC 2396
368       */
369      private void parseURI(String str) throws URISyntaxException
370      {
371        Matcher matcher = URI_PATTERN.matcher(str);
372    
373        if (matcher.matches())
374          {
375            scheme = getURIGroup(matcher, SCHEME_GROUP);
376            rawSchemeSpecificPart = matcher.group(SCHEME_SPEC_PART_GROUP);
377            schemeSpecificPart = unquote(rawSchemeSpecificPart);
378            if (!isOpaque())
379              {
380                rawAuthority = getURIGroup(matcher, AUTHORITY_GROUP);
381                rawPath = matcher.group(PATH_GROUP);
382                rawQuery = getURIGroup(matcher, QUERY_GROUP);
383              }
384            rawFragment = getURIGroup(matcher, FRAGMENT_GROUP);
385          }
386        else
387          throw new URISyntaxException(str,
388                                       "doesn't match URI regular expression");
389        parseServerAuthority();
390    
391        // We must eagerly unquote the parts, because this is the only time
392        // we may throw an exception.
393        authority = unquote(rawAuthority);
394        userInfo = unquote(rawUserInfo);
395        host = unquote(rawHost);
396        path = unquote(rawPath);
397        query = unquote(rawQuery);
398        fragment = unquote(rawFragment);
399      }
400    
401      /**
402       * Unquote "%" + hex quotes characters
403       *
404       * @param str The string to unquote or null.
405       *
406       * @return The unquoted string or null if str was null.
407       *
408       * @exception URISyntaxException If the given string contains invalid
409       * escape sequences.
410       */
411      private static String unquote(String str) throws URISyntaxException
412      {
413        if (str == null)
414          return null;
415        byte[] buf = new byte[str.length()];
416        int pos = 0;
417        for (int i = 0; i < str.length(); i++)
418          {
419            char c = str.charAt(i);
420            if (c == '%')
421              {
422                if (i + 2 >= str.length())
423                  throw new URISyntaxException(str, "Invalid quoted character");
424                int hi = Character.digit(str.charAt(++i), 16);
425                int lo = Character.digit(str.charAt(++i), 16);
426                if (lo < 0 || hi < 0)
427                  throw new URISyntaxException(str, "Invalid quoted character");
428                buf[pos++] = (byte) (hi * 16 + lo);
429              }
430            else
431              buf[pos++] = (byte) c;
432          }
433        try
434          {
435            return new String(buf, 0, pos, "utf-8");
436          }
437        catch (java.io.UnsupportedEncodingException x2)
438          {
439            throw (Error) new InternalError().initCause(x2);
440          }
441      }
442    
443      /**
444       * Quote characters illegal in URIs in given string.
445       *
446       * Replace illegal characters by encoding their UTF-8
447       * representation as "%" + hex code for each resulting
448       * UTF-8 character.
449       *
450       * @param str The string to quote
451       *
452       * @return The quoted string.
453       */
454      private static String quote(String str)
455      {
456        return quote(str, RFC3986_SSP);
457      }
458    
459      /**
460       * Quote characters illegal in URI authorities in given string.
461       *
462       * Replace illegal characters by encoding their UTF-8
463       * representation as "%" + hex code for each resulting
464       * UTF-8 character.
465       *
466       * @param str The string to quote
467       *
468       * @return The quoted string.
469       */
470      private static String quoteAuthority(String str)
471      {
472        // Technically, we should be using RFC2396_AUTHORITY, but
473        // it contains no additional characters.
474        return quote(str, RFC3986_REG_NAME);
475      }
476    
477      /**
478       * Quotes the characters in the supplied string that are not part of
479       * the specified set of legal characters.
480       *
481       * @param str the string to quote
482       * @param legalCharacters the set of legal characters
483       *
484       * @return the quoted string.
485       */
486      private static String quote(String str, String legalCharacters)
487      {
488        CPStringBuilder sb = new CPStringBuilder(str.length());
489        for (int i = 0; i < str.length(); i++)
490          {
491            char c = str.charAt(i);
492            if ((legalCharacters.indexOf(c) == -1)
493                && (c <= 127))
494              {
495                sb.append('%');
496                sb.append(HEX.charAt(c / 16));
497                sb.append(HEX.charAt(c % 16));
498              }
499            else
500              sb.append(c);
501          }
502        return sb.toString();
503      }
504    
505      /**
506       * Quote characters illegal in URI hosts in given string.
507       *
508       * Replace illegal characters by encoding their UTF-8
509       * representation as "%" + hex code for each resulting
510       * UTF-8 character.
511       *
512       * @param str The string to quote
513       *
514       * @return The quoted string.
515       */
516      private static String quoteHost(String str)
517      {
518        return quote(str, RFC3986_HOST);
519      }
520    
521      /**
522       * Quote characters illegal in URI paths in given string.
523       *
524       * Replace illegal characters by encoding their UTF-8
525       * representation as "%" + hex code for each resulting
526       * UTF-8 character.
527       *
528       * @param str The string to quote
529       *
530       * @return The quoted string.
531       */
532      private static String quotePath(String str)
533      {
534        // Technically, we should be using RFC2396_PATH, but
535        // it contains no additional characters.
536        return quote(str, RFC3986_PATH_SEGMENTS);
537      }
538    
539      /**
540       * Quote characters illegal in URI user infos in given string.
541       *
542       * Replace illegal characters by encoding their UTF-8
543       * representation as "%" + hex code for each resulting
544       * UTF-8 character.
545       *
546       * @param str The string to quote
547       *
548       * @return The quoted string.
549       */
550      private static String quoteUserInfo(String str)
551      {
552        return quote(str, RFC3986_USERINFO);
553      }
554    
555      /**
556       * Creates an URI from the given string
557       *
558       * @param str The string to create the URI from
559       *
560       * @exception URISyntaxException If the given string violates RFC 2396
561       * @exception NullPointerException If str is null
562       */
563      public URI(String str) throws URISyntaxException
564      {
565        this.string = str;
566        parseURI(str);
567      }
568    
569      /**
570       * Create an URI from the given components
571       *
572       * @param scheme The scheme name
573       * @param userInfo The username and authorization info
574       * @param host The hostname
575       * @param port The port number
576       * @param path The path
577       * @param query The query
578       * @param fragment The fragment
579       *
580       * @exception URISyntaxException If the given string violates RFC 2396
581       */
582      public URI(String scheme, String userInfo, String host, int port,
583                 String path, String query, String fragment)
584        throws URISyntaxException
585      {
586        this((scheme == null ? "" : scheme + ":")
587             + (userInfo == null && host == null && port == -1 ? "" : "//")
588             + (userInfo == null ? "" : quoteUserInfo(userInfo) + "@")
589             + (host == null ? "" : quoteHost(host))
590             + (port == -1 ? "" : ":" + String.valueOf(port))
591             + (path == null ? "" : quotePath(path))
592             + (query == null ? "" : "?" + quote(query))
593             + (fragment == null ? "" : "#" + quote(fragment)));
594      }
595    
596      /**
597       * Create an URI from the given components
598       *
599       * @param scheme The scheme name
600       * @param authority The authority
601       * @param path The apth
602       * @param query The query
603       * @param fragment The fragment
604       *
605       * @exception URISyntaxException If the given string violates RFC 2396
606       */
607      public URI(String scheme, String authority, String path, String query,
608                 String fragment) throws URISyntaxException
609      {
610        this((scheme == null ? "" : scheme + ":")
611             + (authority == null ? "" : "//" + quoteAuthority(authority))
612             + (path == null ? "" : quotePath(path))
613             + (query == null ? "" : "?" + quote(query))
614             + (fragment == null ? "" : "#" + quote(fragment)));
615      }
616    
  /**
   * Creates a URI from the given components.
   *
   * This is a convenience constructor which delegates to the
   * seven-argument constructor, passing <code>null</code> for the user
   * information and the query, and <code>-1</code> for the port.
   *
   * @param scheme The scheme name
   * @param host The hostname
   * @param path The path
   * @param fragment The fragment
   *
   * @exception URISyntaxException If the resulting string cannot be
   * parsed as a URI
   */
  public URI(String scheme, String host, String path, String fragment)
    throws URISyntaxException
  {
    this(scheme, null, host, -1, path, null, fragment);
  }
632    
633      /**
634       * Create an URI from the given components
635       *
636       * @param scheme The scheme name
637       * @param ssp The scheme specific part
638       * @param fragment The fragment
639       *
640       * @exception URISyntaxException If the given string violates RFC 2396
641       */
642      public URI(String scheme, String ssp, String fragment)
643        throws URISyntaxException
644      {
645        this((scheme == null ? "" : scheme + ":")
646             + (ssp == null ? "" : quote(ssp))
647             + (fragment == null ? "" : "#" + quote(fragment)));
648      }
649    
650      /**
651       * Create an URI from the given string
652       *
653       * @param str The string to create the URI from
654       *
655       * @exception IllegalArgumentException If the given string violates RFC 2396
656       * @exception NullPointerException If str is null
657       */
658      public static URI create(String str)
659      {
660        try
661          {
662            return new URI(str);
663          }
664        catch (URISyntaxException e)
665          {
666            throw (IllegalArgumentException) new IllegalArgumentException()
667                  .initCause(e);
668          }
669      }
670    
671      /**
672       * Attempts to parse this URI's authority component, if defined,
673       * into user-information, host, and port components.  The purpose
674       * of this method was to disambiguate between some authority sections,
675       * which form invalid server-based authories, but valid registry
676       * based authorities.  In the updated RFC 3986, the authority section
677       * is defined differently, with registry-based authorities part of
678       * the host section.  Thus, this method is now simply an explicit
679       * way of parsing any authority section.
680       *
681       * @return the URI, with the authority section parsed into user
682       *         information, host and port components.
683       * @throws URISyntaxException if the given string violates RFC 2396
684       */
685      public URI parseServerAuthority() throws URISyntaxException
686      {
687        if (rawAuthority != null)
688          {
689            Matcher matcher = AUTHORITY_PATTERN.matcher(rawAuthority);
690    
691            if (matcher.matches())
692              {
693                rawUserInfo = getURIGroup(matcher, AUTHORITY_USERINFO_GROUP);
694                rawHost = getURIGroup(matcher, AUTHORITY_HOST_GROUP);
695    
696                String portStr = getURIGroup(matcher, AUTHORITY_PORT_GROUP);
697    
698                if (portStr != null && ! portStr.isEmpty())
699                  try
700                    {
701                      port = Integer.parseInt(portStr);
702                    }
703                  catch (NumberFormatException e)
704                    {
705                      URISyntaxException use =
706                        new URISyntaxException
707                          (string, "doesn't match URI regular expression");
708                      use.initCause(e);
709                      throw use;
710                    }
711              }
712            else
713              throw new URISyntaxException(string,
714                                           "doesn't match URI regular expression");
715          }
716        return this;
717      }
718    
719      /**
720       * <p>
721       * Returns a normalized version of the URI.  If the URI is opaque,
722       * or its path is already in normal form, then this URI is simply
723       * returned.  Otherwise, the following transformation of the path
724       * element takes place:
725       * </p>
726       * <ol>
727       * <li>All `.' segments are removed.</li>
728       * <li>Each `..' segment which can be paired with a prior non-`..' segment
729       * is removed along with the preceding segment.</li>
730       * <li>A `.' segment is added to the front if the first segment contains
731       * a colon (`:').  This is a deviation from the RFC, which prevents
732       * confusion between the path and the scheme.</li>
733       * </ol>
734       * <p>
735       * The resulting URI will be free of `.' and `..' segments, barring those
736       * that were prepended or which couldn't be paired, respectively.
737       * </p>
738       *
739       * @return the normalized URI.
740       */
741      public URI normalize()
742      {
743        if (isOpaque() || path.indexOf("/./") == -1 && path.indexOf("/../") == -1)
744          return this;
745        try
746          {
747            return new URI(scheme, authority, normalizePath(path), query,
748                           fragment);
749          }
750        catch (URISyntaxException e)
751          {
752            throw (Error) new InternalError("Normalized URI variant could not "+
753                                            "be constructed").initCause(e);
754          }
755      }
756    
757      /**
758       * <p>
759       * Normalize the given path.  The following transformation takes place:
760       * </p>
761       * <ol>
762       * <li>All `.' segments are removed.</li>
763       * <li>Each `..' segment which can be paired with a prior non-`..' segment
764       * is removed along with the preceding segment.</li>
765       * <li>A `.' segment is added to the front if the first segment contains
766       * a colon (`:').  This is a deviation from the RFC, which prevents
767       * confusion between the path and the scheme.</li>
768       * </ol>
769       * <p>
770       * The resulting URI will be free of `.' and `..' segments, barring those
771       * that were prepended or which couldn't be paired, respectively.
772       * </p>
773       *
774       * @param relativePath the relative path to be normalized.
775       * @return the normalized path.
776       */
777      private String normalizePath(String relativePath)
778      {
779        /*
780           This follows the algorithm in section 5.2.4. of RFC3986,
781           but doesn't modify the input buffer.
782        */
783        CPStringBuilder input = new CPStringBuilder(relativePath);
784        CPStringBuilder output = new CPStringBuilder();
785        int start = 0;
786        while (start < input.length())
787          {
788            /* A */
789            if (input.indexOf("../",start) == start)
790              {
791                start += 3;
792                continue;
793              }
794            if (input.indexOf("./",start) == start)
795              {
796                start += 2;
797                continue;
798              }
799            /* B */
800            if (input.indexOf("/./",start) == start)
801              {
802                start += 2;
803                continue;
804              }
805            if (input.indexOf("/.",start) == start
806                && input.charAt(start + 2) != '.')
807              {
808                start += 1;
809                input.setCharAt(start,'/');
810                continue;
811              }
812            /* C */
813            if (input.indexOf("/../",start) == start)
814              {
815                start += 3;
816                removeLastSegment(output);
817                continue;
818              }
819            if (input.indexOf("/..",start) == start)
820              {
821                start += 2;
822                input.setCharAt(start,'/');
823                removeLastSegment(output);
824                continue;
825              }
826            /* D */
827            if (start == input.length() - 1 && input.indexOf(".",start) == start)
828              {
829                input.delete(0,1);
830                continue;
831              }
832            if (start == input.length() - 2 && input.indexOf("..",start) == start)
833              {
834                input.delete(0,2);
835                continue;
836              }
837            /* E */
838            int indexOfSlash = input.indexOf("/",start);
839            while (indexOfSlash == start)
840              {
841                output.append("/");
842                ++start;
843                indexOfSlash = input.indexOf("/",start);
844              }
845            if (indexOfSlash == -1)
846              indexOfSlash = input.length();
847            output.append(input.substring(start, indexOfSlash));
848            start = indexOfSlash;
849          }
850        return output.toString();
851      }
852    
853      /**
854       * Removes the last segment of the path from the specified buffer.
855       *
856       * @param buffer the buffer containing the path.
857       */
858      private void removeLastSegment(CPStringBuilder buffer)
859      {
860        int lastSlash = buffer.lastIndexOf("/");
861        if (lastSlash == -1)
862          buffer.setLength(0);
863        else
864          buffer.setLength(lastSlash);
865      }
866    
867      /**
868       * Resolves the given URI against this URI
869       *
870       * @param uri The URI to resolve against this URI
871       *
872       * @return The resulting URI, or null when it couldn't be resolved
873       * for some reason.
874       *
875       * @throws NullPointerException if uri is null
876       */
877      public URI resolve(URI uri)
878      {
879        if (uri.isAbsolute())
880          return uri;
881        if (uri.isOpaque())
882          return uri;
883    
884        String scheme = uri.getScheme();
885        String schemeSpecificPart = uri.getSchemeSpecificPart();
886        String authority = uri.getAuthority();
887        String path = uri.getPath();
888        String query = uri.getQuery();
889        String fragment = uri.getFragment();
890    
891        try
892          {
893            if (fragment != null && path != null && path.equals("")
894                && scheme == null && authority == null && query == null)
895              return new URI(this.scheme, this.schemeSpecificPart, fragment);
896    
897            if (authority == null)
898              {
899                authority = this.authority;
900                if (path == null)
901                  path = "";
902                if (! (path.startsWith("/")))
903                  {
904                    CPStringBuilder basepath = new CPStringBuilder(this.path);
905                    int i = this.path.lastIndexOf('/');
906    
907                    if (i >= 0)
908                      basepath.delete(i + 1, basepath.length());
909    
910                    basepath.append(path);
911                    path = normalizePath(basepath.toString());
912                  }
913              }
914            return new URI(this.scheme, authority, path, query, fragment);
915          }
916        catch (URISyntaxException e)
917          {
918            throw (Error) new InternalError("Resolved URI variant could not "+
919                                            "be constructed").initCause(e);
920          }
921      }
922    
923      /**
924       * Resolves the given URI string against this URI
925       *
926       * @param str The URI as string to resolve against this URI
927       *
928       * @return The resulting URI
929       *
930       * @throws IllegalArgumentException If the given URI string
931       * violates RFC 2396
932       * @throws NullPointerException If uri is null
933       */
934      public URI resolve(String str) throws IllegalArgumentException
935      {
936        return resolve(create(str));
937      }
938    
939      /**
940       * <p>
941       * Relativizes the given URI against this URI.  The following
942       * algorithm is used:
943       * </p>
944       * <ul>
945       * <li>If either URI is opaque, the given URI is returned.</li>
946       * <li>If the schemes of the URIs differ, the given URI is returned.</li>
947       * <li>If the authority components of the URIs differ, then the given
948       * URI is returned.</li>
949       * <li>If the path of this URI is not a prefix of the supplied URI,
950       * then the given URI is returned.</li>
951       * <li>If all the above conditions hold, a new URI is created using the
952       * query and fragment components of the given URI, along with a path
953       * computed by removing the path of this URI from the start of the path
954       * of the supplied URI.</li>
955       * </ul>
956       *
957       * @param uri the URI to relativize agsint this URI
958       * @return the resulting URI
959       * @throws NullPointerException if the uri is null
960       */
961      public URI relativize(URI uri)
962      {
963        if (isOpaque() || uri.isOpaque())
964          return uri;
965        if (scheme == null && uri.getScheme() != null)
966          return uri;
967        if (scheme != null && !(scheme.equals(uri.getScheme())))
968          return uri;
969        if (rawAuthority == null && uri.getRawAuthority() != null)
970          return uri;
971        if (rawAuthority != null && !(rawAuthority.equals(uri.getRawAuthority())))
972          return uri;
973        String basePath = rawPath;
974        if (!(uri.getRawPath().equals(rawPath)))
975          {
976            if (!(basePath.endsWith("/")))
977              basePath = basePath.concat("/");
978            if (!(uri.getRawPath().startsWith(basePath)))
979              return uri;
980          }
981        try
982          {
983            return new URI(null, null,
984                           uri.getRawPath().substring(basePath.length()),
985                           uri.getRawQuery(), uri.getRawFragment());
986          }
987        catch (URISyntaxException e)
988          {
989            throw (Error) new InternalError("Relativized URI variant could not "+
990                                            "be constructed").initCause(e);
991          }
992      }
993    
994      /**
995       * Creates an URL from an URI
996       *
997       * @throws MalformedURLException If a protocol handler for the URL could
998       * not be found, or if some other error occurred while constructing the URL
999       * @throws IllegalArgumentException If the URI is not absolute
1000       */
1001      public URL toURL() throws IllegalArgumentException, MalformedURLException
1002      {
1003        if (isAbsolute())
1004          return new URL(this.toString());
1005    
1006        throw new IllegalArgumentException("not absolute");
1007      }
1008    
1009      /**
1010       * Returns the scheme of the URI
1011       */
1012      public String getScheme()
1013      {
1014        return scheme;
1015      }
1016    
1017      /**
1018       * Tells whether this URI is absolute or not
1019       */
1020      public boolean isAbsolute()
1021      {
1022        return scheme != null;
1023      }
1024    
1025      /**
1026       * Tell whether this URI is opaque or not
1027       */
1028      public boolean isOpaque()
1029      {
1030        return ((scheme != null) && ! (schemeSpecificPart.startsWith("/")));
1031      }
1032    
1033      /**
1034       * Returns the raw scheme specific part of this URI.
1035       * The scheme-specific part is never undefined, though it may be empty
1036       */
1037      public String getRawSchemeSpecificPart()
1038      {
1039        return rawSchemeSpecificPart;
1040      }
1041    
1042      /**
1043       * Returns the decoded scheme specific part of this URI.
1044       */
1045      public String getSchemeSpecificPart()
1046      {
1047        return schemeSpecificPart;
1048      }
1049    
1050      /**
1051       * Returns the raw authority part of this URI
1052       */
1053      public String getRawAuthority()
1054      {
1055        return rawAuthority;
1056      }
1057    
1058      /**
1059       * Returns the decoded authority part of this URI
1060       */
1061      public String getAuthority()
1062      {
1063        return authority;
1064      }
1065    
1066      /**
1067       * Returns the raw user info part of this URI
1068       */
1069      public String getRawUserInfo()
1070      {
1071        return rawUserInfo;
1072      }
1073    
1074      /**
1075       * Returns the decoded user info part of this URI
1076       */
1077      public String getUserInfo()
1078      {
1079        return userInfo;
1080      }
1081    
1082      /**
1083       * Returns the hostname of the URI
1084       */
1085      public String getHost()
1086      {
1087        return host;
1088      }
1089    
1090      /**
1091       * Returns the port number of the URI
1092       */
1093      public int getPort()
1094      {
1095        return port;
1096      }
1097    
1098      /**
1099       * Returns the raw path part of this URI
1100       */
1101      public String getRawPath()
1102      {
1103        return rawPath;
1104      }
1105    
1106      /**
1107       * Returns the path of the URI
1108       */
1109      public String getPath()
1110      {
1111        return path;
1112      }
1113    
1114      /**
1115       * Returns the raw query part of this URI
1116       */
1117      public String getRawQuery()
1118      {
1119        return rawQuery;
1120      }
1121    
1122      /**
1123       * Returns the query of the URI
1124       */
1125      public String getQuery()
1126      {
1127        return query;
1128      }
1129    
1130      /**
1131       * Return the raw fragment part of this URI
1132       */
1133      public String getRawFragment()
1134      {
1135        return rawFragment;
1136      }
1137    
1138      /**
1139       * Returns the fragment of the URI
1140       */
1141      public String getFragment()
1142      {
1143        return fragment;
1144      }
1145    
1146      /**
1147       * <p>
1148       * Compares the URI with the given object for equality.  If the
1149       * object is not a <code>URI</code>, then the method returns false.
1150       * Otherwise, the following criteria are observed:
1151       * </p>
1152       * <ul>
1153       * <li>The scheme of the URIs must either be null (undefined) in both cases,
1154       * or equal, ignorant of case.</li>
1155       * <li>The raw fragment of the URIs must either be null (undefined) in both
1156       * cases, or equal, ignorant of case.</li>
1157       * <li>Both URIs must be of the same type (opaque or hierarchial)</li>
1158       * <li><strong>For opaque URIs:</strong></li>
1159       * <ul>
1160       * <li>The raw scheme-specific parts must be equal.</li>
1161       * </ul>
1162       * <li>For hierarchical URIs:</li>
1163       * <ul>
1164       * <li>The raw paths must be equal, ignorant of case.</li>
1165       * <li>The raw queries are either both undefined or both equal, ignorant
1166       * of case.</li>
1167       * <li>The raw authority sections are either both undefined or:</li>
1168       * <li><strong>For registry-based authorities:</strong></li>
1169       * <ul><li>they are equal.</li></ul>
1170       * <li><strong>For server-based authorities:</strong></li>
1171       * <ul>
1172       * <li>the hosts are equal, ignoring case</li>
1173       * <li>the ports are equal</li>
1174       * <li>the user information components are equal</li>
1175       * </ul>
1176       * </ul>
1177       * </ul>
1178       *
1179       * @param obj the obj to compare the URI with.
1180       * @return <code>true</code> if the objects are equal, according to
1181       *         the specification above.
1182       */
1183      public boolean equals(Object obj)
1184      {
1185        if (!(obj instanceof URI))
1186          return false;
1187        URI uriObj = (URI) obj;
1188        if (scheme == null)
1189          {
1190            if (uriObj.getScheme() != null)
1191              return false;
1192          }
1193        else
1194          if (!(scheme.equalsIgnoreCase(uriObj.getScheme())))
1195            return false;
1196        if (rawFragment == null)
1197          {
1198            if (uriObj.getRawFragment() != null)
1199              return false;
1200          }
1201        else
1202          if (!(rawFragment.equalsIgnoreCase(uriObj.getRawFragment())))
1203            return false;
1204        boolean opaqueThis = isOpaque();
1205        boolean opaqueObj = uriObj.isOpaque();
1206        if (opaqueThis && opaqueObj)
1207          return rawSchemeSpecificPart.equals(uriObj.getRawSchemeSpecificPart());
1208        else if (!opaqueThis && !opaqueObj)
1209          {
1210            boolean common = rawPath.equalsIgnoreCase(uriObj.getRawPath())
1211              && ((rawQuery == null && uriObj.getRawQuery() == null)
1212                  || rawQuery.equalsIgnoreCase(uriObj.getRawQuery()));
1213            if (rawAuthority == null && uriObj.getRawAuthority() == null)
1214              return common;
1215            if (host == null)
1216              return common
1217                && rawAuthority.equalsIgnoreCase(uriObj.getRawAuthority());
1218            return common
1219              && host.equalsIgnoreCase(uriObj.getHost())
1220              && port == uriObj.getPort()
1221              && (rawUserInfo == null ?
1222                  uriObj.getRawUserInfo() == null :
1223                  rawUserInfo.equalsIgnoreCase(uriObj.getRawUserInfo()));
1224          }
1225        else
1226          return false;
1227      }
1228    
1229      /**
1230       * Computes the hashcode of the URI
1231       */
1232      public int hashCode()
1233      {
1234        return (getScheme() == null ? 0 : 13 * getScheme().hashCode())
1235          + 17 * getRawSchemeSpecificPart().hashCode()
1236          + (getRawFragment() == null ? 0 : 21 + getRawFragment().hashCode());
1237      }
1238    
1239      /**
1240       * Compare the URI with another URI.
1241       * Undefined components are taken to be less than any other component.
1242       * The following criteria are observed:
1243       * </p>
1244       * <ul>
1245       * <li>Two URIs with different schemes are compared according to their
1246       * scheme, regardless of case.</li>
1247       * <li>A hierarchical URI is less than an opaque URI with the same
1248       * scheme.</li>
1249       * <li><strong>For opaque URIs:</strong></li>
1250       * <ul>
1251       * <li>URIs with differing scheme-specific parts are ordered according
1252       * to the ordering of the scheme-specific part.</li>
1253       * <li>URIs with the same scheme-specific part are ordered by the
1254       * raw fragment.</li>
1255       * </ul>
1256       * <li>For hierarchical URIs:</li>
1257       * <ul>
1258       * <li>URIs are ordered according to their raw authority sections,
1259       * if they are unequal.</li>
1260       * <li><strong>For registry-based authorities:</strong></li>
1261       * <ul><li>they are ordered according to the ordering of the authority
1262       * component.</li></ul>
1263       * <li><strong>For server-based authorities:</strong></li>
1264       * <ul>
1265       * <li>URIs are ordered according to the raw user information.</li>
1266       * <li>URIs with the same user information are ordered by the host,
1267       * ignoring case.</li>
1268       * <lI>URIs with the same host are ordered by the port.</li>
1269       * </ul>
1270       * <li>URIs with the same authority section are ordered by the raw path.</li>
1271       * <li>URIs with the same path are ordered by their raw query.</li>
1272       * <li>URIs with the same query are ordered by their raw fragments.</li>
1273       * </ul>
1274       * </ul>
1275       *
1276       * @param uri The other URI to compare this URI with
1277       * @return a negative integer, zero or a positive integer depending
1278       *         on whether this URI is less than, equal to or greater
1279       *         than that supplied, respectively.
1280       */
1281      public int compareTo(URI uri)
1282        throws ClassCastException
1283      {
1284        if (scheme == null && uri.getScheme() != null)
1285          return -1;
1286        if (scheme != null)
1287          {
1288            int sCompare = scheme.compareToIgnoreCase(uri.getScheme());
1289            if (sCompare != 0)
1290              return sCompare;
1291          }
1292        boolean opaqueThis = isOpaque();
1293        boolean opaqueObj = uri.isOpaque();
1294        if (opaqueThis && !opaqueObj)
1295          return 1;
1296        if (!opaqueThis && opaqueObj)
1297          return -1;
1298        if (opaqueThis)
1299          {
1300            int ssCompare =
1301              rawSchemeSpecificPart.compareTo(uri.getRawSchemeSpecificPart());
1302            if (ssCompare == 0)
1303              return compareFragments(uri);
1304            else
1305              return ssCompare;
1306          }
1307        if (rawAuthority == null && uri.getRawAuthority() != null)
1308          return -1;
1309        if (rawAuthority != null)
1310          {
1311            int aCompare = rawAuthority.compareTo(uri.getRawAuthority());
1312            if (aCompare != 0)
1313              {
1314                if (host == null)
1315                  return aCompare;
1316                if (rawUserInfo == null && uri.getRawUserInfo() != null)
1317                  return -1;
1318                int uCompare = rawUserInfo.compareTo(uri.getRawUserInfo());
1319                if (uCompare != 0)
1320                  return uCompare;
1321                if (host == null && uri.getHost() != null)
1322                  return -1;
1323                int hCompare = host.compareTo(uri.getHost());
1324                if (hCompare != 0)
1325                  return hCompare;
1326                int uriPort = uri.getPort();
1327                return (uriPort == port) ? 0 : (uriPort > port) ? -1 : 1;
1328              }
1329          }
1330        if (rawPath == null && uri.getRawPath() != null)
1331          return -1;
1332        if (rawPath != null)
1333          {
1334            int pCompare = rawPath.compareTo(uri.getRawPath());
1335            if (pCompare != 0)
1336              return pCompare;
1337          }
1338        if (rawQuery == null && uri.getRawQuery() != null)
1339          return -1;
1340        if (rawQuery != null)
1341          {
1342            int qCompare = rawQuery.compareTo(uri.getRawQuery());
1343            if (qCompare != 0)
1344              return qCompare;
1345          }
1346        return compareFragments(uri);
1347      }
1348    
1349      /**
1350       * Compares the fragment of this URI with that of the supplied URI.
1351       *
1352       * @param uri the URI to compare with this one.
1353       * @return a negative integer, zero or a positive integer depending
1354       *         on whether this uri's fragment is less than, equal to
1355       *         or greater than the fragment of the uri supplied, respectively.
1356       */
1357      private int compareFragments(URI uri)
1358      {
1359        if (rawFragment == null && uri.getRawFragment() != null)
1360          return -1;
1361        else if (rawFragment == null)
1362          return 0;
1363        else
1364          return rawFragment.compareTo(uri.getRawFragment());
1365      }
1366    
1367      /**
1368       * Returns the URI as a String.  If the URI was created using a constructor,
1369       * then this will be the same as the original input string.
1370       *
1371       * @return a string representation of the URI.
1372       */
1373      public String toString()
1374      {
1375        return (scheme == null ? "" : scheme + ":")
1376          + rawSchemeSpecificPart
1377          + (rawFragment == null ? "" : "#" + rawFragment);
1378      }
1379    
1380      /**
1381       * Returns the URI as US-ASCII string.  This is the same as the result
1382       * from <code>toString()</code> for URIs that don't contain any non-US-ASCII
1383       * characters.  Otherwise, the non-US-ASCII characters are replaced
1384       * by their percent-encoded representations.
1385       *
1386       * @return a string representation of the URI, containing only US-ASCII
1387       *         characters.
1388       */
1389      public String toASCIIString()
1390      {
1391        String strRep = toString();
1392        boolean inNonAsciiBlock = false;
1393        CPStringBuilder buffer = new CPStringBuilder();
1394        CPStringBuilder encBuffer = null;
1395        for (int i = 0; i < strRep.length(); i++)
1396          {
1397            char c = strRep.charAt(i);
1398            if (c <= 127)
1399              {
1400                if (inNonAsciiBlock)
1401                  {
1402                    buffer.append(escapeCharacters(encBuffer.toString()));
1403                    inNonAsciiBlock = false;
1404                  }
1405                buffer.append(c);
1406              }
1407            else
1408              {
1409                if (!inNonAsciiBlock)
1410                  {
1411                    encBuffer = new CPStringBuilder();
1412                    inNonAsciiBlock = true;
1413                  }
1414                encBuffer.append(c);
1415              }
1416          }
1417        return buffer.toString();
1418      }
1419    
1420      /**
1421       * Converts the non-ASCII characters in the supplied string
1422       * to their equivalent percent-encoded representations.
1423       * That is, they are replaced by "%" followed by their hexadecimal value.
1424       *
1425       * @param str a string including non-ASCII characters.
1426       * @return the string with the non-ASCII characters converted to their
1427       *         percent-encoded representations.
1428       */
1429      private static String escapeCharacters(String str)
1430      {
1431        try
1432          {
1433            CPStringBuilder sb = new CPStringBuilder();
1434            // this is far from optimal, but it works
1435            byte[] utf8 = str.getBytes("utf-8");
1436            for (int j = 0; j < utf8.length; j++)
1437              {
1438                sb.append('%');
1439                sb.append(HEX.charAt((utf8[j] & 0xff) / 16));
1440                sb.append(HEX.charAt((utf8[j] & 0xff) % 16));
1441              }
1442            return sb.toString();
1443          }
1444        catch (java.io.UnsupportedEncodingException x)
1445          {
1446            throw (Error) new InternalError("Escaping error").initCause(x);
1447          }
1448      }
1449    
1450    }