#!/bin/env python
"""
Uniform Resource Identifier (URI) path manipulation,
above the access layer

The name of this module and the functions are somewhat
arbitrary; they hark to other parts of the python
library; e.g. uripath.join() is somewhat like os.path.join().

REFERENCES

  Uniform Resource Identifiers (URI): Generic Syntax
  http://www.ietf.org/rfc/rfc2396.txt

  The Web Model: Information hiding and URI syntax (Jan 98)
  http://www.w3.org/DesignIssues/Model.html

  URI API design [was: URI Test Suite]
  Dan Connolly (Sun, Aug 12 2001)
  http://lists.w3.org/Archives/Public/uri/2001Aug/0021.html
"""

__version__ = "$Id: uripath.py,v 1.16 2004/03/21 04:24:35 timbl Exp $"

# The functions below use str methods, which behave identically on
# Python 2 and 3.  The historical names are still imported where the
# interpreter provides them, so that code importing them from this
# module keeps working on Python 2.
try:
    from string import find, rfind, index
except ImportError:  # Python 3: these string-module functions are gone
    pass


def splitFrag(uriref):
    """Split a URI reference between the fragment and the rest.

    The '#' punctuation is thrown away.  Returns a pair
    (rest, fragment); fragment is None when there is no '#'.

    e.g.

    >>> splitFrag("abc#def")
    ('abc', 'def')

    >>> splitFrag("abcdef")
    ('abcdef', None)
    """
    i = uriref.rfind("#")
    if i >= 0:
        return uriref[:i], uriref[i+1:]
    return uriref, None


def splitFragP(uriref, punct=0):
    """Split a URI reference before the fragment.

    The '#' punctuation is kept with the fragment.  Returns a pair
    (rest, fragment); fragment is '' when there is no '#'.
    The punct argument is accepted but not used.

    e.g.

    >>> splitFragP("abc#def")
    ('abc', '#def')

    >>> splitFragP("abcdef")
    ('abcdef', '')
    """
    i = uriref.rfind("#")
    if i >= 0:
        return uriref[:i], uriref[i:]
    return uriref, ''


def join(here, there):
    """Join an absolute URI and a URI reference.

    (non-ascii characters are supported/doctested;
    haven't checked the details of the IRI spec though)

    here is assumed to be absolute and must not contain a '#'
    (both are checked with assert).
    there is a URI reference.

    >>> join('http://example/x/y/z', '../abc')
    'http://example/x/abc'

    Raise ValueError if there uses relative path syntax but here has
    no hierarchical path.

    >>> join('mid:foo@example', '../foo')
    Traceback (most recent call last):
        ...
    ValueError: Base <mid:foo@example> has no slash after colon - with relative '../foo'.

    We grok IRIs

    >>> len(u'Andr\\xe9')
    5

    >>> join('http://example.org/', u'#Andr\\xe9') == u'http://example.org/#Andr\\xe9'
    True
    """
    assert here.find("#") < 0, \
        "Base may not contain hash: '%s'" % here  # caller must splitFrag (why?)

    slashl = there.find('/')
    colonl = there.find(':')

    # join(base, 'foo:/') -- absolute: a colon before any slash means
    # there carries its own scheme.
    if colonl >= 0 and (slashl < 0 or colonl < slashl):
        return there

    bcolonl = here.find(':')
    assert bcolonl >= 0, \
        "Base uri '%s' is not absolute" % here  # else it's not absolute

    # join('mid:foo@example', '../foo') bzzt
    if here[bcolonl+1:bcolonl+2] != '/':
        raise ValueError(
            "Base <%s> has no slash after colon - with relative '%s'."
            % (here, there))

    # bpath is the index in here where the path starts.
    if here[bcolonl+1:bcolonl+3] == '//':
        bpath = here.find('/', bcolonl+3)
    else:
        bpath = bcolonl+1

    # join('http://xyz', 'foo'): authority with no path, so add the root
    if bpath < 0:
        bpath = len(here)
        here = here + '/'

    # join('http://xyz/', '//abc') => 'http://abc'
    if there[:2] == '//':
        return here[:bcolonl+1] + there

    # join('http://xyz/', '/abc') => 'http://xyz/abc'
    if there[:1] == '/':
        return here[:bpath] + there

    slashr = here.rfind('/')

    path, frag = splitFragP(there)
    if not path:
        return here + frag

    # Consume leading './' and '../' segments of path, backing up one
    # directory in here for each '../' (never above the path root).
    while True:
        if path[:2] == './':
            path = path[2:]
        if path == '.':
            path = ''
        elif path[:3] == '../' or path == '..':
            path = path[3:]
            i = here.rfind('/', bpath, slashr)
            if i >= 0:
                here = here[:i+1]
                slashr = i
        else:
            break

    return here[:slashr+1] + path + frag


import re
import string

# Matches a prefix consisting of scheme, optional authority, the root
# slash and at most a partial first path segment.
commonHost = re.compile(r'^[-_a-zA-Z0-9.]+:(//[^/]*)?/[^/]*$')


def refTo(base, uri):
    """Figure out a relative URI reference from base to uri.

    >>> refTo('http://example/x/y/z', 'http://example/x/abc')
    '../abc'

    >>> refTo('file:/ex/x/y', 'file:/ex/x/q/r#s')
    'q/r#s'

    >>> refTo(None, 'http://ex/x/y')
    'http://ex/x/y'

    >>> refTo('http://ex/x/y', 'http://ex/x/y')
    ''

    Note the relationship between refTo and join:
    join(x, refTo(x, y)) == y
    which points out certain strings which cannot be URIs. e.g.

    >>> x='http://ex/x/y';y='http://ex/x/q:r';join(x, refTo(x, y)) == y
    False

    So 'http://ex/x/q:r' is not a URI. Use 'http://ex/x/q%3ar' instead:

    >>> x='http://ex/x/y';y='http://ex/x/q%3ar';join(x, refTo(x, y)) == y
    True

    This one checks that it uses a root-relative one where that is
    all they share.  Now uses root-relative where no path is shared.
    This is a matter of taste but tends to give more resilience IMHO
    -- and shorter paths

    Note that base may be None, meaning no base.  In some situations,
    there just ain't a base. Slife. In these cases, refTo returns the
    absolute value.  The axiom abs(,rel(b,x))=x still holds.
    This saves people having to set the base to "bogus:".

    >>> refTo('http://ex/x/y/z', 'http://ex/r')
    '/r'
    """
    # assert base # don't mask bugs -danc # not a bug. -tim
    if not base:
        return uri
    if base == uri:
        return ""

    # Find how many path segments in common
    # NOTE(review): from here down to the "scan for slash" loop the
    # original text was lost from this copy of the file.  The code is a
    # reconstruction constrained by the surviving fragments, the
    # commonHost pattern and the docstring examples -- confirm against
    # an intact copy.
    i = 0
    limit = min(len(uri), len(base))
    while i < limit and uri[i] == base[i]:
        i = i + 1
    # i is now the end of the shorter string or the first difference.

    # Only scheme/authority/root shared: use a root-relative reference.
    if commonHost.match(base[:i]):
        k = uri.find("//")
        if k < 0:
            k = -2  # no host
        root = uri.find("/", k + 2)
        if (uri[root+1:root+2] != "/"
                and base[root+1:root+2] != "/"
                and uri[:root] == base[:root]):
            return uri[root:]

    # uri is base plus a fragment
    if uri[i:i+1] == "#" and len(base) == i:
        return uri[i:]

    while i > 0 and uri[i-1] != '/':
        i = i - 1  # scan for slash

    if i < 3:
        return uri  # No way.
    if base.find("//", i-2) > 0 or uri.find("//", i-2) > 0:
        return uri  # An unshared "//"
    if base.find(":", i) > 0:
        return uri  # An unshared ":"

    # Number of directory levels base has below the shared prefix.
    n = base.count("/", i)
    # NOTE(review): the branches after "n == 0 and i" were lost from this
    # copy of the file; reconstructed so that join(base, refTo(base, uri))
    # == uri holds for these cases -- confirm against an intact copy.
    if n == 0 and i < len(uri) and uri[i] == '#':
        return "./" + uri[i:]
    elif n == 0 and i == len(uri):
        return "./"
    else:
        return ("../" * n) + uri[i:]


# NOTE(review): whatever followed refTo() in the original file, and the
# head of the revision log, are missing from this copy.  The next line is
# the surviving tail of a log entry.
# ... file:/some/dir/#blort
#
# Revision 1.4  2002/08/07 14:32:21  timbl
# uripath changes. passes 51 general tests and 25 loopback tests
#
# Revision 1.3  2002/08/06 01:36:09  connolly
# cleanup: diagnostic interface, relative/absolute uri handling
#
# Revision 1.2  2002/03/15 23:53:02  connolly
# handle no-auth case
#
# Revision 1.1  2002/02/19 22:52:42  connolly
# renamed uritools.py to uripath.py
#
# Revision 1.2  2002/02/18 07:33:51  connolly
# pathTo seems to work
#