| Tor Norbye | 3a2425a | 2013-11-04 10:16:08 -0800 | [diff] [blame^] | 1 | # pylint: disable-msg=C0103 |
| 2 | # |
| 3 | # backported code from 4Suite with slight modifications, started from r1.89 of |
| 4 | # Ft/Lib/Uri.py, by syt@logilab.fr on 2005-02-09 |
| 5 | # |
| 6 | # part, if not all, of this code should probably move to urlparse (or be used |
| 7 | # to fix some existing functions in this module) |
| 8 | # |
| 9 | # |
| 10 | # Copyright 2004 Fourthought, Inc. (USA). |
| 11 | # Detailed license and copyright information: http://4suite.org/COPYRIGHT |
| 12 | # Project home, documentation, distributions: http://4suite.org/ |
| 13 | import os.path |
| 14 | import sys |
| 15 | import re |
| 16 | import urlparse, urllib, urllib2 |
| 17 | |
def UnsplitUriRef(uriRefSeq):
    """should replace urlparse.urlunsplit

    Assembles a URI reference string from a 5-item tuple or list laid out
    as (scheme, authority, path, query, fragment), i.e. the structure
    returned by SplitUriRef().

    A component whose value is None is treated as undefined and is left
    out together with its delimiter; an empty string is treated as defined
    and still contributes its delimiter. The path is always appended as-is.

    Raises TypeError if the argument is neither a tuple nor a list.
    """
    if not isinstance(uriRefSeq, (tuple, list)):
        raise TypeError("sequence expected, got %s" % type(uriRefSeq))
    scheme, authority, path, query, fragment = uriRefSeq
    # collect each defined component, already decorated with its delimiter
    pieces = []
    if scheme is not None:
        pieces.append(scheme + ':')
    if authority is not None:
        pieces.append('//' + authority)
    pieces.append(path)
    if query is not None:
        pieces.append('?' + query)
    if fragment is not None:
        pieces.append('#' + fragment)
    return ''.join(pieces)
| 38 | |
# Generic URI-reference splitter in the style of RFC 2396 appendix B, with
# one named group per component. Every group is optional.
SPLIT_URI_REF_PATTERN = re.compile(r"^(?:(?P<scheme>[^:/?#]+):)?(?://(?P<authority>[^/?#]*))?(?P<path>[^?#]*)(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$")

def SplitUriRef(uriref):
    """should replace urlparse.urlsplit

    Splits a URI reference string into its generic components, as per
    RFC 2396 appendix B, and returns them as the tuple
    (scheme, authority, path, query, fragment).

    The path is always a string (possibly empty). Each of the other
    components is a string (possibly empty) when its delimiter is present
    in the reference, or None when the component is undefined.

    Per rfc3986 there is no distinction between a path and an "opaque
    part", as there was in RFC 2396.
    """
    # all components of the pattern are optional, so a match object is
    # expected for any URI reference and is used without a None check.
    matched = SPLIT_URI_REF_PATTERN.match(uriref)
    return matched.group('scheme', 'authority', 'path', 'query', 'fragment')
| 62 | |
| 63 | |
def Absolutize(uriRef, baseUri):
    """
    Resolves a URI reference to absolute form, effecting the result of RFC
    3986 section 5. The URI reference is considered to be relative to the
    given base URI.

    uriRef is the URI reference (string) to resolve; baseUri is the
    absolute URI (string) it is relative to. The return value is the
    resolved URI as a string.

    It is the caller's responsibility to ensure that the base URI matches
    the absolute-URI syntax rule of RFC 3986, and that its path component
    does not contain '.' or '..' segments if the scheme is hierarchical.
    Unexpected results may occur otherwise.

    This function only conducts a minimal sanity check in order to determine
    if relative resolution is possible: it raises a ValueError if the base
    URI is empty or does not have a scheme component. While it is true that
    the base URI is irrelevant if the URI reference has a scheme, an
    exception is raised in order to signal that the given string does not
    even come close to meeting the criteria to be usable as a base URI.

    It is the caller's responsibility to make a determination of whether the
    URI reference constitutes a "same-document reference", as defined in RFC
    2396 or RFC 3986. As per the spec, dereferencing a same-document
    reference "should not" involve retrieval of a new representation of the
    referenced resource. Note that the two specs have different definitions
    of same-document reference: RFC 2396 says it is *only* the cases where the
    reference is the empty string, or "#" followed by a fragment; RFC 3986
    requires making a comparison of the base URI to the absolute form of the
    reference (as is returned by this function), minus its fragment
    component, if any.

    This function is similar to urlparse.urljoin() and urllib.basejoin().
    Those functions, however, are (as of Python 2.3) outdated, buggy, and/or
    designed to produce results acceptable for use with other core Python
    libraries, rather than being earnest implementations of the relevant
    specs. Their problems are most noticeable in their handling of
    same-document references and 'file:' URIs, both being situations that
    come up far too often to consider the functions reliable enough for
    general use.
    """
    # Reasons to avoid using urllib.basejoin() and urlparse.urljoin():
    # - Both are partial implementations of long-obsolete specs.
    # - Both accept relative URLs as the base, which no spec allows.
    # - urllib.basejoin() mishandles the '' and '..' references.
    # - If the base URL uses a non-hierarchical or relative path,
    #   or if the URL scheme is unrecognized, the result is not
    #   always as expected (partly due to issues in RFC 1808).
    # - If the authority component of a 'file' URI is empty,
    #   the authority component is removed altogether. If it was
    #   not present, an empty authority component is in the result.
    # - '.' and '..' segments are not always collapsed as well as they
    #   should be (partly due to issues in RFC 1808).
    # - Effective Python 2.4, urllib.basejoin() *is* urlparse.urljoin(),
    #   but urlparse.urljoin() is still based on RFC 1808.

    # This procedure is based on the pseudocode in RFC 3986 sec. 5.2.
    #
    # ensure base URI is absolute
    if not baseUri:
        raise ValueError('baseUri is required and must be a non empty string')
    if not IsAbsolute(baseUri):
        raise ValueError('%r is not an absolute URI' % baseUri)
    # shortcut for the simplest same-document reference cases
    if uriRef == '' or uriRef[0] == '#':
        return baseUri.split('#')[0] + uriRef
    # ensure a clean slate
    tScheme = tAuth = tPath = tQuery = None
    # parse the reference into its components
    (rScheme, rAuth, rPath, rQuery, rFrag) = SplitUriRef(uriRef)
    # if the reference is absolute, eliminate '.' and '..' path segments
    # and skip to the end
    if rScheme is not None:
        tScheme = rScheme
        tAuth = rAuth
        tPath = RemoveDotSegments(rPath)
        tQuery = rQuery
    else:
        # the base URI's scheme, and possibly more, will be inherited
        # (the base's fragment is parsed but never used)
        (bScheme, bAuth, bPath, bQuery, bFrag) = SplitUriRef(baseUri)
        # if the reference is a net-path, just eliminate '.' and '..' path
        # segments; no other changes needed.
        if rAuth is not None:
            tAuth = rAuth
            tPath = RemoveDotSegments(rPath)
            tQuery = rQuery
        # if it's not a net-path, we need to inherit pieces of the base URI
        else:
            # use base URI's path if the reference's path is empty
            if not rPath:
                tPath = bPath
                # use the reference's query if it is defined, or else the
                # base URI's. A defined-but-empty query (reference "?")
                # must win over the base's query, so test against None
                # rather than relying on truthiness.
                if rQuery is not None:
                    tQuery = rQuery
                else:
                    tQuery = bQuery
            # the reference's path is not empty
            else:
                # just use the reference's path if it's absolute
                if rPath[0] == '/':
                    tPath = RemoveDotSegments(rPath)
                # merge the reference's relative path with the base URI's path
                else:
                    if bAuth is not None and not bPath:
                        tPath = '/' + rPath
                    else:
                        tPath = bPath[:bPath.rfind('/')+1] + rPath
                    tPath = RemoveDotSegments(tPath)
                # use the reference's query
                tQuery = rQuery
            # since the reference isn't a net-path,
            # use the authority from the base URI
            tAuth = bAuth
        # inherit the scheme from the base URI
        tScheme = bScheme
    # always use the reference's fragment (no separate variable needed)

    # now compose the target URI (RFC 3986 sec. 5.3)
    return UnsplitUriRef((tScheme, tAuth, tPath, tQuery, rFrag))
| 178 | |
| 179 | |
# Matches a string made up entirely of zero or more of the listed literal
# characters and/or percent-encoded octets (%HH). MakeUrllibSafe() uses it
# to decide whether a host looks like a reg-name rather than an IP address.
REG_NAME_HOST_PATTERN = re.compile(r"^(?:(?:[0-9A-Za-z\-_\.!~*'();&=+$,]|(?:%[0-9A-Fa-f]{2}))*)$")
| 181 | |
def MakeUrllibSafe(uriRef):
    """
    Makes the given RFC 3986-conformant URI reference safe for passing
    to legacy urllib functions. The result may not be a valid URI.

    As of Python 2.3.3, urllib.urlopen() does not fully support
    internationalized domain names, it does not strip fragment components,
    and on Windows, it expects file URIs to use '|' instead of ':' in the
    path component corresponding to the drivespec. It also relies on
    urllib.unquote(), which mishandles unicode arguments. This function
    produces a URI reference that will work around these issues, although
    the IDN workaround is limited to Python 2.3 and up.

    The fragment component, if any, is dropped from the result.

    Raises ValueError if the URI reference is a unicode object that
    contains non-ASCII characters.
    """
    # IDN support requires decoding any percent-encoded octets in the
    # host part (if it's a reg-name) of the authority component, and when
    # doing DNS lookups, applying IDNA encoding to that string first.
    # As of Python 2.3, there is an IDNA codec, and the socket and httplib
    # modules accept Unicode strings and apply IDNA encoding automatically
    # where necessary. However, urllib.urlopen() has not yet been updated
    # to do the same; it raises an exception if you give it a Unicode
    # string, and does no conversion on non-Unicode strings, meaning you
    # have to give it an IDNA string yourself. We will only support it on
    # Python 2.3 and up.
    if isinstance(uriRef, unicode):
        try:
            uriRef = uriRef.encode('us-ascii') # parts of urllib are not unicode safe
        except UnicodeError:
            raise ValueError("uri %r must consist of ASCII characters." % uriRef)
    (scheme, auth, path, query, frag) = urlparse.urlsplit(uriRef)

    # separate the userinfo from host[:port]. Split on the LAST '@' so that
    # an authority containing more than one '@' no longer breaks tuple
    # unpacking.
    userinfo = None
    hostport = auth
    if auth:
        at = auth.rfind('@')
        if at > -1:
            userinfo = auth[:at]
            hostport = auth[at+1:]

    # separate the port from the host. Only a ':' located after the closing
    # bracket of an IP-literal (if any) delimits the port, so IPv6 hosts
    # such as '[::1]' or '[::1]:8080' are handled instead of raising.
    host = hostport
    port = None
    if hostport:
        colon = hostport.rfind(':')
        if colon > -1 and colon > hostport.rfind(']'):
            host = hostport[:colon]
            port = hostport[colon+1:]

    # see if host is a reg-name, as opposed to IPv4 or IPv6 addr.
    if host and REG_NAME_HOST_PATTERN.match(host):
        # percent-encoded hostnames will always fail DNS lookups
        host = urllib.unquote(host)
        # IDNA-encode if possible.
        # We shouldn't do this for schemes that don't need DNS lookup,
        # but are there any (that you'd be calling urlopen for)?
        if sys.version_info[0:2] >= (2, 3):
            if isinstance(host, str):
                host = host.decode('utf-8')
            host = host.encode('idna')
        # reassemble the authority with the new hostname
        # (percent-decoded, and possibly IDNA-encoded)
        auth = ''
        if userinfo:
            auth += userinfo + '@'
        auth += host
        if port:
            auth += ':' + port

    # On Windows, ensure that '|', not ':', is used in a drivespec.
    if os.name == 'nt' and scheme == 'file':
        path = path.replace(':', '|', 1)

    # Note that we drop fragment, if any. See RFC 3986 sec. 3.5.
    uri = urlparse.urlunsplit((scheme, auth, path, query, None))

    return uri
| 252 | |
| 253 | |
| 254 | |
def BaseJoin(base, uriRef):
    """
    Merges a base URI reference with another URI reference, returning a
    new URI reference.

    Works like Absolutize() with the arguments swapped, except that the
    base may be any URI reference, including a relative one. A base
    without a scheme is resolved under a temporary placeholder scheme;
    that placeholder is stripped from the result again unless uriRef
    carried its own scheme. Consequently, when neither argument has a
    scheme, neither does the result.

    The name comes from urllib.basejoin(), which this resembles, but path
    merging, dot segment elimination, and inheritance of query and
    fragment components follow the current rfc3986 algorithms.

    WARNING: This function exists for 2 reasons: (1) because of a need
    within the 4Suite repository to perform URI reference absolutization
    using base URIs that are stored (inappropriately) as absolute paths
    in the subjects of statements in the RDF model, and (2) because of
    a similar need to interpret relative repo paths in a 4Suite product
    setup.xml file as being relative to a path that can be set outside
    the document. When these needs go away, this function probably will,
    too, so it is not advisable to use it.
    """
    # an absolute base needs no special treatment
    if IsAbsolute(base):
        return Absolutize(uriRef, base)

    # relative base: lend it a placeholder scheme for the resolution
    placeholder = 'basejoin'
    joined = Absolutize(uriRef, '%s:%s' % (placeholder, base))
    if not IsAbsolute(uriRef):
        # no scheme in, no scheme out: drop the placeholder and its ':'
        return joined[len(placeholder)+1:]
    # the result carries uriRef's own scheme
    return joined
| 292 | |
| 293 | |
def RemoveDotSegments(path):
    """
    Supports Absolutize() by implementing the remove_dot_segments function
    described in RFC 3986 sec. 5.2. Most '.' and '..' segments are
    collapsed out of the given path string; empty segments are kept.

    Intended for use during path merging; results may be surprising when
    used on its own. Semi-private, not for general use.

    Returns the processed path as a string.
    """
    # a path that is nothing but "." or ".." collapses to an empty string
    if path in ('.', '..'):
        return path[0:0]  # preserves string type
    # strip every "./" or "../" prefix
    while True:
        if path.startswith('./'):
            path = path[2:]
        elif path.startswith('../'):
            path = path[3:]
        else:
            break
    # Remember a leading slash, then drop it so that split() does not
    # produce an ambiguous empty first segment.
    rooted = path[:1] == '/'
    if rooted:
        path = path[1:]
    # a trailing "/." becomes just "/"
    if path.endswith('/.'):
        path = path[:-1]
    # walk the segments left to right, collecting the ones to keep
    parts = path.split('/')
    final_index = len(parts) - 1
    kept = []
    for index, part in enumerate(parts):
        if part == '..':
            # drop the previously kept segment if there is one; otherwise
            # keep the '..' itself, but only for a relative path
            if kept:
                kept.pop()
            elif not rooted:
                kept.append(part)
            # a final '..' must leave the result ending in '/'
            if index == final_index:
                kept.append('')
        elif part != '.':
            # '.' is ignored; everything else, even empty, is kept
            kept.append(part)
    # put the kept segments back together
    rebuilt = '/'.join(kept)
    if rooted:
        return '/' + rebuilt
    return rebuilt
| 352 | |
| 353 | |
# A scheme name (a letter followed by letters, digits, '+', '-' or '.')
# immediately followed by ':'; group 1 captures the name without the colon.
SCHEME_PATTERN = re.compile(r'([a-zA-Z][a-zA-Z0-9+\-.]*):')
def GetScheme(uriRef):
    """
    Extracts just the scheme from the start of a URI reference.

    Returns the scheme as a string (without the trailing ':'), or None
    when the reference does not begin with a scheme.
    """
    # A regex was chosen for speed. Per the original author's timing of
    # 50,000 calls (1.0-GHz PIII, FreeBSD 4.7, Python 2.2.1): 0.95s for
    # this approach (0.05s when there is no scheme), versus
    #   urllib.splittype()[0]        1.5s
    #   Ft.Lib.Uri.SplitUriRef()[0]  2.5s
    #   urlparse.urlparse()[0]       3.5s
    found = SCHEME_PATTERN.match(uriRef)
    if found is not None:
        return found.group(1)
    return None
| 372 | |
| 373 | |
def IsAbsolute(identifier):
    """
    Tests whether a string believed to be a URI or URI reference is
    absolute (as per RFC 2396) rather than relative -- that is, whether
    it has a scheme. Returns True or False.
    """
    # Reuse GetScheme() rather than compiling another massive regex.
    scheme = GetScheme(identifier)
    return scheme is not None