blob: 107e222b820ad35feca4450e82ebfb82f658ae60 [file] [log] [blame]
"""PyPI and direct package downloading"""
2
3import sys, os.path, re, urlparse, urllib2, shutil, random, socket
4from pkg_resources import *
5from distutils import log
6from distutils.errors import DistutilsError
7from md5 import md5
8from fnmatch import translate
9
# Matches a complete "egg=<tag>" URL fragment; group 1 is the tag
# (letters, digits, '-', '_' and '.' only).
EGG_FRAGMENT = re.compile(r'^egg=([-A-Za-z0-9_.]+)$')

# Matches an HTML ``href`` attribute (optionally quoted, case-insensitive);
# group 1 is the link target up to the next quote, '>' or space.
HREF = re.compile(r"""href\s*=\s*['"]?([^'"> ]+)""", re.I)
# this is here to fix emacs' cruddy broken syntax highlighting

# Matches a download link followed on the next line by a "show_md5" link.
# Groups: 1 = download URL, 2 = link text, 3 = 32-digit hex md5 digest.
# Raw strings are used so that ``\s``, ``\(`` and ``\?`` reach the regex
# engine as written instead of relying on invalid string escapes.
PYPI_MD5 = re.compile(
    r'<a href="([^"#]+)">([^<]+)</a>\n\s+\(<a href="[^?]+\?:action=show_md5'
    r'&amp;digest=([0-9a-f]{32})">md5</a>\)'
)

# Callable returning a match (group 1 = scheme) when a string starts with a
# URL scheme of at least two characters followed by ':'.
URL_SCHEME = re.compile('([-+.a-z0-9]{2,}):',re.I).match

# Archive suffixes tried, in this order, by distros_for_location().
EXTENSIONS = ".tar.gz .tar.bz2 .tar .zip .tgz".split()
20
# Names exported by ``from <this module> import *``.
__all__ = [
    'PackageIndex',
    'distros_for_url',
    'parse_bdist_wininst',
    'interpret_distro_name',
]
25
26
def parse_bdist_wininst(name):
    """Return (base,pyversion) or (None,None) for possible .exe name

    Two suffix shapes are recognized, compared case-insensitively:

    * ``<base>.win32.exe``        -> ``(base, None)``
    * ``<base>.win32-pyX.Y.exe``  -> ``(base, 'X.Y')``

    Any other name (including other ``.exe`` names) gives ``(None, None)``.
    The returned pieces are sliced from `name` itself, so they keep the
    caller's original casing.
    """
    lowered = name.lower()

    if not lowered.endswith('.exe'):
        return None, None

    if lowered.endswith('.win32.exe'):
        # strip the 10-character ".win32.exe" suffix
        return name[:-10], None

    # ".win32-pyX.Y.exe" is 16 characters; the version is the 3 characters
    # immediately before ".exe".
    if lowered[-16:].startswith('.win32-py'):
        return name[:-16], name[-7:-4]

    return None, None
41
def egg_info_for_url(url):
    """Return ``(basename, fragment)`` for `url`

    The basename is the URL-unquoted last segment of the URL's path.  The
    fragment is the URL's own ``#fragment``, unless unquoting exposes a
    ``#`` inside the basename (i.e. it was written as ``%23``), in which
    case the basename is split there and the trailing part is returned as
    the fragment instead.
    """
    parsed = urlparse.urlparse(url)
    path, fragment = parsed[2], parsed[5]

    last_segment = path.split('/')[-1]
    base = urllib2.unquote(last_segment)

    if '#' in base:
        base, fragment = base.split('#', 1)
    return base, fragment
47
def distros_for_url(url, metadata=None):
    """Yield egg or source distribution objects that might be found at a URL

    The URL's basename is interpreted first.  Only when that produces
    nothing and the URL carries an ``#egg=...`` fragment is the fragment's
    tag interpreted instead, with ``CHECKOUT_DIST`` precedence.
    """
    base, fragment = egg_info_for_url(url)
    dists = distros_for_location(url, base, metadata)

    if dists or not fragment:
        return dists

    egg = EGG_FRAGMENT.match(fragment)
    if not egg:
        return dists

    return interpret_distro_name(
        url, egg.group(1), metadata, precedence=CHECKOUT_DIST
    )
59
def distros_for_location(location, basename, metadata=None):
    """Yield egg or source distribution objects based on basename

    Interpretation order: ``.egg`` (also ``.egg.zip``), then a
    bdist_wininst ``.exe`` name, then the source archive suffixes listed in
    ``EXTENSIONS``.  An empty list is returned when nothing applies.
    """
    if basename.endswith('.egg.zip'):
        basename = basename[:-len('.zip')]      # treat it as a plain .egg

    if basename.endswith('.egg'):
        # an egg filename has exactly one interpretation
        return [Distribution.from_location(location, basename, metadata)]

    if basename.endswith('.exe'):
        win_base, py_ver = parse_bdist_wininst(basename)
        if win_base is not None:
            return interpret_distro_name(
                location, win_base, metadata, py_ver, BINARY_DIST, "win32"
            )

    # Source distro extensions (.zip, .tgz, etc.): the first suffix in
    # EXTENSIONS order that matches wins.
    matching = [ext for ext in EXTENSIONS if basename.endswith(ext)]
    if matching:
        stem = basename[:-len(matching[0])]
        return interpret_distro_name(location, stem, metadata)

    return []   # no extension matched
81
82
def distros_for_filename(filename, metadata=None):
    """Yield possible egg or source distribution objects based on a filename

    The normalized path is used as the distributions' location and the
    file's basename is what gets interpreted.
    """
    location = normalize_path(filename)
    basename = os.path.basename(filename)
    return distros_for_location(location, basename, metadata)
88
89
def interpret_distro_name(location, basename, metadata,
    py_version=None, precedence=SOURCE_DIST, platform=None
):
    """Generate alternative interpretations of a source distro name

    Note: if `location` is a filesystem filename, you should call
    ``pkg_resources.normalize_path()`` on it before passing it to this
    routine!
    """

    # Archive names can be ambiguous about where the project name ends and
    # the version begins (e.g. "adns-python-1.1.0", "egenix-mx-commercial"),
    # so one Distribution is yielded per possible split point: for
    # "adns-python-1.1.0" that is ("adns", "python-1.1.0"),
    # ("adns-python", "1.1.0") and ("adns-python-1.1.0", no version).
    #
    # In practice the spurious interpretations should be ignored: if there
    # is also an "adns" package, the bogus "python-1.1.0" version compares
    # lower than any numeric version and is therefore unlikely to match a
    # request for it.  It is still a potential problem, though; in the long
    # run PyPI and the distutils should use "safe" names and versions in
    # distribution archive names (sdist and bdist).

    pieces = basename.split('-')
    for split_at in range(1, len(pieces)+1):
        project = '-'.join(pieces[:split_at])
        version = '-'.join(pieces[split_at:])
        yield Distribution(
            location, metadata, project, version,
            py_version=py_version, precedence=precedence,
            platform=platform
        )
119
120
121
122
123
class PackageIndex(Environment):
    """A distribution index that scans web pages for download URLs"""

    def __init__(self,index_url="http://www.python.org/pypi",hosts=('*',),*args,**kw):
        """Create an index rooted at `index_url`

        `hosts` is a sequence of fnmatch-style patterns; only URLs whose
        host part matches one of them pass ``url_ok()``.  Remaining
        arguments are forwarded to ``Environment.__init__``.
        """
        Environment.__init__(self,*args,**kw)
        # always store the index URL with a trailing slash
        if not index_url.endswith('/'):
            index_url += '/'
        self.index_url = index_url
        self.scanned_urls = {}      # URLs already seen by process_url()
        self.fetched_urls = {}      # URLs whose page was read (or refused)
        self.package_pages = {}     # lowercased project name -> {page URL: True}
        self.allows = re.compile(
            '|'.join([translate(pattern) for pattern in hosts])
        ).match
        self.to_scan = []           # deferred find-links; None once prescanned

    def process_url(self, url, retrieve=False):
        """Evaluate a URL as a possible download, and maybe retrieve it"""
        url = fix_sf_url(url)
        if url in self.scanned_urls and not retrieve:
            return
        self.scanned_urls[url] = True

        if not URL_SCHEME(url):
            # no scheme: treat it as a local file or directory
            self.process_filename(url)
            return

        dists = list(distros_for_url(url))
        if dists:
            if not self.url_ok(url):
                return
            self.debug("Found link: %s", url)

        if dists or not retrieve or url in self.fetched_urls:
            for dist in dists:
                self.add(dist)
            return  # don't need the actual page

        if not self.url_ok(url):
            self.fetched_urls[url] = True
            return

        self.info("Reading %s", url)
        f = self.open_url(url)
        self.fetched_urls[url] = self.fetched_urls[f.url] = True

        # A response without a Content-Type header is treated as non-HTML
        # instead of failing with a KeyError.
        if 'html' not in f.headers.get('content-type', '').lower():
            f.close()   # not html, we can't process it
            return

        base = f.url     # handle redirects
        page = f.read()
        f.close()
        if url.startswith(self.index_url):
            page = self.process_index(url, page)

        for match in HREF.finditer(page):
            link = urlparse.urljoin(base, match.group(1))
            self.process_url(link)

    def process_filename(self, fn, nested=False):
        """Add distributions found at local path `fn`

        A directory's immediate entries are processed as well, unless
        `nested` is true (i.e. there is no recursion beyond one level).
        """
        if not os.path.exists(fn):
            # report the path that is missing (was the undefined name `url`)
            self.warn("Not found: %s", fn)
            return

        if os.path.isdir(fn) and not nested:
            path = os.path.realpath(fn)
            for item in os.listdir(path):
                self.process_filename(os.path.join(path,item), True)

        dists = distros_for_filename(fn)
        if dists:
            self.debug("Found: %s", fn)
            for dist in dists:
                self.add(dist)

    def url_ok(self, url, fatal=False):
        """Return True if the host of `url` matches the allowed patterns

        Otherwise a ``DistutilsError`` is raised when `fatal` is true, or a
        warning is logged and ``None`` is returned.
        """
        if self.allows(urlparse.urlparse(url)[1]):
            return True
        msg = "\nLink to % s ***BLOCKED*** by --allow-hosts\n"
        if fatal:
            raise DistutilsError(msg % url)
        else:
            self.warn(msg, url)

    def process_index(self,url,page):
        """Process the contents of a PyPI page

        Returns `page` with each link/md5-link pair matched by ``PYPI_MD5``
        rewritten as a single link carrying an ``#md5=<digest>`` fragment.
        """
        def scan(link):
            # Process a URL to see if it's for a package page
            if link.startswith(self.index_url):
                parts = [
                    urllib2.unquote(part)
                    for part in link[len(self.index_url):].split('/')
                ]
                if len(parts)==2:
                    # it's a package page, sanitize and index it
                    pkg = safe_name(parts[0])
                    ver = safe_version(parts[1])
                    self.package_pages.setdefault(pkg.lower(),{})[link] = True
                    return to_filename(pkg), to_filename(ver)
            return None, None

        if url==self.index_url or 'Index of Packages</title>' in page:
            # process an index page into the package-page index
            for match in HREF.finditer(page):
                scan( urlparse.urljoin(url, match.group(1)) )
        else:
            pkg,ver = scan(url)   # ensure this page is in the page index
            # process individual package page
            for tag in ("<th>Home Page", "<th>Download URL"):
                pos = page.find(tag)
                if pos!=-1:
                    match = HREF.search(page,pos)
                    if match:
                        # Process the found URL
                        new_url = urlparse.urljoin(url, match.group(1))
                        base, frag = egg_info_for_url(new_url)
                        if base.endswith('.py') and not frag:
                            # a bare .py link needs an #egg tag to be usable
                            if pkg and ver:
                                new_url+='#egg=%s-%s' % (pkg,ver)
                            else:
                                self.need_version_info(url)
                        self.scan_url(new_url)

        return PYPI_MD5.sub(
            lambda m: '<a href="%s#md5=%s">%s</a>' % m.group(1,3,2), page
        )

    def need_version_info(self, url):
        """Trigger a full index scan because `url` links to unversioned .py files"""
        self.scan_all(
            "Page at %s links to .py file(s) without version info; an index "
            "scan is required.", url
        )

    def scan_all(self, msg=None, *args):
        """Scan the main index page, unless it has already been fetched

        `msg` (with `args`) is logged as a warning first, if given.
        """
        if self.index_url not in self.fetched_urls:
            if msg: self.warn(msg,*args)
            self.warn(
                "Scanning index of all packages (this may take a while)"
            )
        self.scan_url(self.index_url)

    def find_packages(self, requirement):
        """Scan the index pages that may describe `requirement`"""
        self.scan_url(self.index_url + requirement.unsafe_name+'/')

        if not self.package_pages.get(requirement.key):
            # Fall back to safe version of the name
            self.scan_url(self.index_url + requirement.project_name+'/')

        if not self.package_pages.get(requirement.key):
            # We couldn't find the target package, so search the index page too
            self.warn(
                "Couldn't find index page for %r (maybe misspelled?)",
                requirement.unsafe_name
            )
            self.scan_all()

        for url in self.package_pages.get(requirement.key,()):
            # scan each page that might be related to the desired package
            self.scan_url(url)

    def obtain(self, requirement, installer=None):
        """Return a known distribution matching `requirement`

        Deferred find-links and the requirement's index pages are scanned
        first; if nothing known matches, the base class is consulted.
        """
        self.prescan()
        self.find_packages(requirement)
        for dist in self[requirement.key]:
            if dist in requirement:
                return dist
            self.debug("%s does not match %s", requirement, dist)
        return super(PackageIndex, self).obtain(requirement,installer)

    def check_md5(self, cs, info, filename, tfp):
        """Validate a download against an ``md5=<hexdigest>`` fragment

        `cs` is the md5 object fed with the downloaded data and `info` is
        the URL fragment.  On mismatch `tfp` is closed, `filename` is
        deleted and ``DistutilsError`` is raised.  Fragments of any other
        form are ignored.
        """
        if re.match('md5=[0-9a-f]{32}$', info):
            self.debug("Validating md5 checksum for %s", filename)
            if cs.hexdigest() != info[4:]:
                tfp.close()
                os.unlink(filename)
                raise DistutilsError(
                    "MD5 validation failed for "+os.path.basename(filename)+
                    "; possible download problem?"
                )

    def add_find_links(self, urls):
        """Add `urls` to the list that will be prescanned for searches"""
        for url in urls:
            if (
                self.to_scan is None        # if we have already "gone online"
                or not URL_SCHEME(url)      # or it's a local file/directory
                or url.startswith('file:')
                or list(distros_for_url(url))   # or a direct package link
            ):
                # then go ahead and process it now
                self.scan_url(url)
            else:
                # otherwise, defer retrieval till later
                self.to_scan.append(url)

    def prescan(self):
        """Scan urls scheduled for prescanning (e.g. --find-links)"""
        if self.to_scan:
            for url in self.to_scan:
                self.scan_url(url)
        self.to_scan = None     # from now on, go ahead and process immediately

    def download(self, spec, tmpdir):
        """Locate and/or download `spec` to `tmpdir`, returning a local path

        `spec` may be a ``Requirement`` object, or a string containing a URL,
        an existing local filename, or a project/version requirement spec
        (i.e. the string form of a ``Requirement`` object).  If it is the URL
        of a .py file with an unambiguous ``#egg=name-version`` tag (i.e., one
        that escapes ``-`` as ``_`` throughout), a trivial ``setup.py`` is
        automatically created alongside the downloaded file.

        If `spec` is a ``Requirement`` object or a string containing a
        project/version requirement spec, this method returns the location of
        a matching distribution (possibly after downloading it to `tmpdir`).
        If `spec` is a locally existing file or directory name, it is simply
        returned unchanged.  If `spec` is a URL, it is downloaded to a subpath
        of `tmpdir`, and the local filename is returned.  Various errors may be
        raised if a problem occurs during downloading.
        """
        if not isinstance(spec,Requirement):
            scheme = URL_SCHEME(spec)
            if scheme:
                # It's a url, download it to tmpdir
                found = self._download_url(scheme.group(1), spec, tmpdir)
                base, fragment = egg_info_for_url(spec)
                if base.endswith('.py'):
                    found = self.gen_setup(found,fragment,tmpdir)
                return found
            elif os.path.exists(spec):
                # Existing file or directory, just return it
                return spec
            else:
                try:
                    spec = Requirement.parse(spec)
                except ValueError:
                    raise DistutilsError(
                        "Not a URL, existing file, or requirement spec: %r" %
                        (spec,)
                    )
        # fetch_distribution() returns None when nothing matched
        return getattr(self.fetch_distribution(spec, tmpdir),'location',None)

    def fetch_distribution(self,
        requirement, tmpdir, force_scan=False, source=False, develop_ok=False
    ):
        """Obtain a distribution suitable for fulfilling `requirement`

        `requirement` must be a ``pkg_resources.Requirement`` instance.
        If necessary, or if the `force_scan` flag is set, the requirement is
        searched for in the (online) package index as well as the locally
        installed packages.  If a distribution matching `requirement` is found,
        the returned distribution's ``location`` is the value you would have
        gotten from calling the ``download()`` method with the matching
        distribution's URL or filename.  If no matching distribution is found,
        ``None`` is returned.

        If the `source` flag is set, only source distributions and source
        checkout links will be considered.  Unless the `develop_ok` flag is
        set, development and system eggs (i.e., those using the ``.egg-info``
        format) will be ignored.
        """

        # process a Requirement
        self.info("Searching for %s", requirement)
        skipped = {}    # development eggs already warned about

        def find(req):
            # Find a matching distribution; may be called more than once.
            # Falls off the end (returning None) when nothing matches.

            for dist in self[req.key]:

                if dist.precedence==DEVELOP_DIST and not develop_ok:
                    if dist not in skipped:
                        self.warn("Skipping development or system egg: %s",dist)
                        skipped[dist] = 1
                    continue

                if dist in req and (dist.precedence<=SOURCE_DIST or not source):
                    self.info("Best match: %s", dist)
                    return dist.clone(
                        location=self.download(dist.location, tmpdir)
                    )

        if force_scan:
            self.prescan()
            self.find_packages(requirement)

        dist = find(requirement)
        if dist is None and self.to_scan is not None:
            # deferred find-links have not been scanned yet; try them next
            self.prescan()
            dist = find(requirement)

        if dist is None and not force_scan:
            # last resort: consult the package index
            self.find_packages(requirement)
            dist = find(requirement)

        if dist is None:
            self.warn(
                "No local packages or download links found for %s%s",
                (source and "a source distribution of " or ""),
                requirement,
            )
        return dist

    def fetch(self, requirement, tmpdir, force_scan=False, source=False):
        """Obtain a file suitable for fulfilling `requirement`

        DEPRECATED; use the ``fetch_distribution()`` method now instead.  For
        backward compatibility, this routine is identical but returns the
        ``location`` of the downloaded distribution instead of a distribution
        object.
        """
        dist = self.fetch_distribution(requirement,tmpdir,force_scan,source)
        if dist is not None:
            return dist.location
        return None

    def gen_setup(self, filename, fragment, tmpdir):
        """Write a trivial ``setup.py`` into `tmpdir` for a lone .py file

        `fragment` must be an ``egg=name-version`` tag with exactly one
        interpretation that has a version.  The .py file is copied into
        `tmpdir` if it is not already there, and the path of the .py file
        (not of ``setup.py``) is returned.  ``DistutilsError`` is raised
        when the tag is missing or ambiguous.
        """
        match = EGG_FRAGMENT.match(fragment)
        dists = match and [d for d in
            interpret_distro_name(filename, match.group(1), None) if d.version
        ] or []

        if len(dists)==1:   # unambiguous ``#egg`` fragment
            basename = os.path.basename(filename)

            # Make sure the file has been downloaded to the temp dir.
            if os.path.dirname(filename) != tmpdir:
                dst = os.path.join(tmpdir, basename)
                from setuptools.command.easy_install import samefile
                if not samefile(filename, dst):
                    shutil.copy2(filename, dst)
                    filename=dst

            setup_file = open(os.path.join(tmpdir, 'setup.py'), 'w')
            try:
                setup_file.write(
                    "from setuptools import setup\n"
                    "setup(name=%r, version=%r, py_modules=[%r])\n"
                    % (
                        dists[0].project_name, dists[0].version,
                        os.path.splitext(basename)[0]
                    )
                )
            finally:
                # close the handle even if formatting or writing fails
                setup_file.close()
            return filename

        elif match:
            raise DistutilsError(
                "Can't unambiguously interpret project/version identifier %r; "
                "any dashes in the name or version should be escaped using "
                "underscores. %r" % (fragment,dists)
            )
        else:
            raise DistutilsError(
                "Can't process plain .py files without an '#egg=name-version'"
                " suffix to enable automatic setup script generation."
            )

    dl_blocksize = 8192     # bytes read per iteration in _download_to()

    def _download_to(self, url, filename):
        """Download `url` into `filename` and return the response headers

        A ``#fragment`` on `url` is stripped before the request and handed
        to ``check_md5()`` once the download completes.  ``DistutilsError``
        is raised if the host is not allowed or the server answered with an
        HTTP error.
        """
        self.url_ok(url,True)   # raises error if not allowed
        self.info("Downloading %s", url)
        # Download the file
        fp, tfp, info = None, None, None
        try:
            if '#' in url:
                url, info = url.split('#', 1)
            fp = self.open_url(url)
            if isinstance(fp, urllib2.HTTPError):
                raise DistutilsError(
                    "Can't download %s: %s %s" % (url, fp.code,fp.msg)
                )
            cs = md5()
            headers = fp.info()
            blocknum = 0
            bs = self.dl_blocksize
            size = -1   # reported to reporthook() when the length is unknown
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            self.reporthook(url, filename, blocknum, bs, size)
            tfp = open(filename,'wb')
            while True:
                block = fp.read(bs)
                if block:
                    cs.update(block)
                    tfp.write(block)
                    blocknum += 1
                    self.reporthook(url, filename, blocknum, bs, size)
                else:
                    break
            if info: self.check_md5(cs, info, filename, tfp)
            return headers
        finally:
            if fp: fp.close()
            if tfp: tfp.close()

    def reporthook(self, url, filename, blocknum, blksize, size):
        """Progress callback invoked by ``_download_to()``; does nothing here"""
        pass    # no-op

    def retry_sf_download(self, url, filename):
        """Download `url`, retrying SourceForge downloads on other mirrors

        A failure for any host other than ``dl.sourceforge.net`` is
        re-raised at once.  Otherwise the download is retried against
        addresses from ``get_sf_ip()``; each failing address is dropped from
        the shared ``_sf_mirrors`` list, and the last error is re-raised
        once that list is empty.
        """
        try:
            return self._download_to(url, filename)
        except Exception:
            scheme, server, path, param, query, frag = urlparse.urlparse(url)
            if server!='dl.sourceforge.net':
                raise
            error = sys.exc_info()[1]

        mirror = get_sf_ip()

        while True:
            self.warn("Download failed: %s", error)
            url = urlparse.urlunparse((scheme, mirror, path, param, '', frag))
            try:
                return self._download_to(url, filename)
            except Exception:
                error = sys.exc_info()[1]
                _sf_mirrors.remove(mirror)  # don't retry the same mirror
                if not _sf_mirrors:
                    # Every mirror failed.  Stop here: get_sf_ip() would
                    # refill the empty list and the loop would never end.
                    raise
                mirror = get_sf_ip()

    def open_url(self, url):
        """Open `url` with urllib2 and return the response object

        An ``HTTPError`` is *returned* rather than raised, so callers can
        inspect it (``_download_to()`` checks for it with ``isinstance``).
        Any other ``URLError`` becomes a ``DistutilsError``.
        """
        try:
            return urllib2.urlopen(url)
        except urllib2.HTTPError:
            return sys.exc_info()[1]
        except urllib2.URLError:
            raise DistutilsError(
                "Download error: %s" % sys.exc_info()[1].reason
            )

    def _download_url(self, scheme, url, tmpdir):
        """Download `url` into `tmpdir` and return the local path

        ``svn`` and ``svn+*`` schemes are checked out with subversion;
        everything else is downloaded, and an HTML response is passed to
        ``_download_html()``.
        """

        # Determine download filename
        #
        segments = [
            part for part in urlparse.urlparse(url)[2].split('/') if part
        ]
        if segments:
            name = segments[-1]
            # collapse ".." runs (and backslashes) so that the name cannot
            # point outside of tmpdir
            while '..' in name:
                name = name.replace('..','.').replace('\\','_')
        else:
            name = "__downloaded__"    # default if URL has no path contents

        if name.endswith('.egg.zip'):
            name = name[:-4]    # strip the extra .zip before download

        filename = os.path.join(tmpdir,name)

        # Download the file
        #
        if scheme=='svn' or scheme.startswith('svn+'):
            return self._download_svn(url, filename)
        else:
            headers = self.retry_sf_download(url, filename)
            # a missing Content-Type header is treated as "not HTML"
            if 'html' in headers.get('content-type', '').lower():
                return self._download_html(url, headers, filename, tmpdir)
            else:
                return filename

    def scan_url(self, url):
        """Process `url`, retrieving its page if needed"""
        self.process_url(url, True)

    def _download_html(self, url, headers, filename, tmpdir):
        """Handle a download that turned out to be an HTML page

        If the first non-blank line looks like a subversion index page
        title, the downloaded file is deleted and `url` is checked out with
        subversion instead.  Otherwise the file is deleted and
        ``DistutilsError`` is raised.
        """
        is_svn_index = False
        page = open(filename)
        try:
            for line in page:
                if line.strip():
                    # only the first non-blank line is examined
                    if re.search(r'<title>Revision \d+:', line):
                        is_svn_index = True
                    break
        finally:
            page.close()

        os.unlink(filename)
        if is_svn_index:
            return self._download_svn(url, filename)
        raise DistutilsError("Unexpected HTML page found at "+url)

    def _download_svn(self, url, filename):
        """Check out `url` to `filename` with the ``svn`` client

        The checkout's exit status is not examined; `filename` is returned
        either way.
        """
        url = url.split('#',1)[0]   # remove any fragment for svn's sake
        self.info("Doing subversion checkout from %s to %s", url, filename)
        # `url` may come from a scraped web page, so it is passed as a
        # separate argument instead of being spliced into a shell command
        # line whenever the subprocess module is available.
        try:
            import subprocess
        except ImportError:
            # NOTE(security): no subprocess module here; this fallback still
            # runs through the shell with an unquoted URL.
            os.system("svn checkout -q %s %s" % (url, filename))
        else:
            subprocess.call(["svn", "checkout", "-q", url, filename])
        return filename

    def debug(self, msg, *args):
        """Log `msg` at debug level via ``distutils.log``"""
        log.debug(msg, *args)

    def info(self, msg, *args):
        """Log `msg` at info level via ``distutils.log``"""
        log.info(msg, *args)

    def warn(self, msg, *args):
        """Log `msg` at warning level via ``distutils.log``"""
        log.warn(msg, *args)
645
646
647
648
649
650
651
652
653
654
655
656
def fix_sf_url(url):
    """Rewrite a ``prdownloads.sourceforge.net`` URL to ``dl.sourceforge.net``

    The path gains a ``sourceforge`` prefix and the query string is
    dropped; scheme, parameters and fragment are kept.  URLs for any other
    host are returned unchanged.
    """
    parts = urlparse.urlparse(url)
    if parts[1] != 'prdownloads.sourceforge.net':
        return url

    scheme, path, param, frag = parts[0], parts[2], parts[3], parts[5]
    rewritten = (
        scheme, 'dl.sourceforge.net', 'sourceforge'+path, param, '', frag
    )
    return urlparse.urlunparse(rewritten)
664
# Addresses to try for SourceForge downloads.  Filled lazily by get_sf_ip()
# and modified in place, so other code holding a reference sees the updates.
_sf_mirrors = []

def get_sf_ip():
    """Return a randomly chosen entry of ``_sf_mirrors``

    When the list is empty it is first filled with the addresses that
    ``dl.sourceforge.net`` resolves to, or with the hostname itself if the
    lookup raises ``socket.error``.
    """
    if _sf_mirrors:
        return random.choice(_sf_mirrors)

    try:
        addresses = socket.gethostbyname_ex('dl.sourceforge.net')[-1]
    except socket.error:
        # name lookup failed (e.g. DNS blocked by a firewall); fall back to
        # the plain hostname
        addresses = ['dl.sourceforge.net']

    _sf_mirrors[:] = addresses
    return random.choice(_sf_mirrors)