Package: python-urlgrabber Version: 3.1.0-4 Severity: normal Please note that the CVS version is different from (and newer than) the latest released version (3.1.0), and it contains various fixes and improvements to the code. I attach as an example a diff of the file "urlgrabber/grabber.py".
-- System Information: Debian Release: 5.0 APT prefers testing APT policy: (500, 'testing') Architecture: amd64 (x86_64) Kernel: Linux 2.6.26-1-amd64 (SMP w/2 CPU cores) Locale: LANG=es_AR.UTF-8, LC_CTYPE=es_AR.UTF-8 (charmap=UTF-8) Shell: /bin/sh linked to /bin/bash Versions of packages python-urlgrabber depends on: ii python-support 0.8.7 automated rebuilding support for P ii python2.5 2.5.4-1 An interactive high-level object-o python-urlgrabber recommends no packages. python-urlgrabber suggests no packages. -- no debconf information -- JID: lavaram...@jabber.org | http://lusers.com.ar/ 2B82 A38D 1BA5 847A A74D 6C34 6AB7 9ED6 C8FD F9C1
--- /tmp/urlgrabber-3.1.0/urlgrabber/grabber.py 2006-09-21 21:58:05.000000000 -0300 +++ /tmp/urlgrabber-cvs/urlgrabber/grabber.py 2006-12-12 16:08:46.000000000 -0300 @@ -55,8 +55,9 @@ text = None - specifies an alternativ text item in the beginning of the progress - bar line. If not given, the basename of the file is used. + specifies alternative text to be passed to the progress meter + object. If not given, the default progress meter will use the + basename of the file. throttle = 1.0 @@ -167,6 +168,13 @@ chain integrity. You are responsible for ensuring that any extension handlers are present if said features are required. + cache_openers = True + + controls whether urllib2 openers should be cached and reused, or + whether they should be created each time. There's a modest + overhead in recreating them, but it's slightly safer to do so if + you're modifying the handlers between calls. + data = None Only relevant for the HTTP family (and ignored for other @@ -179,6 +187,44 @@ badly and if you do not use the proper case (shown here), your values will be overridden with the defaults. + urlparser = URLParser() + + The URLParser class handles pre-processing of URLs, including + auth-handling for user/pass encoded in http urls, file handing + (that is, filenames not sent as a URL), and URL quoting. If you + want to override any of this behavior, you can pass in a + replacement instance. See also the 'quote' option. + + quote = None + + Whether or not to quote the path portion of a url. + quote = 1 -> quote the URLs (they're not quoted yet) + quote = 0 -> do not quote them (they're already quoted) + quote = None -> guess what to do + + This option only affects proper urls like 'file:///etc/passwd'; it + does not affect 'raw' filenames like '/etc/passwd'. The latter + will always be quoted as they are converted to URLs. Also, only + the path part of a url is quoted. 
If you need more fine-grained + control, you should probably subclass URLParser and pass it in via + the 'urlparser' option. + + ssl_ca_cert = None + + this option can be used if M2Crypto is available and will be + ignored otherwise. If provided, it will be used to create an SSL + context. If both ssl_ca_cert and ssl_context are provided, then + ssl_context will be ignored and a new context will be created from + ssl_ca_cert. + + ssl_context = None + + this option can be used if M2Crypto is available and will be + ignored otherwise. If provided, this SSL context will be used. + If both ssl_ca_cert and ssl_context are provided, then ssl_context + will be ignored and a new context will be created from + ssl_ca_cert. + RETRY RELATED ARGUMENTS @@ -283,28 +329,6 @@ passed the same arguments, so you could use the same function for both. - urlparser = URLParser() - - The URLParser class handles pre-processing of URLs, including - auth-handling for user/pass encoded in http urls, file handing - (that is, filenames not sent as a URL), and URL quoting. If you - want to override any of this behavior, you can pass in a - replacement instance. See also the 'quote' option. - - quote = None - - Whether or not to quote the path portion of a url. - quote = 1 -> quote the URLs (they're not quoted yet) - quote = 0 -> do not quote them (they're already quoted) - quote = None -> guess what to do - - This option only affects proper urls like 'file:///etc/passwd'; it - does not affect 'raw' filenames like '/etc/passwd'. The latter - will always be quoted as they are converted to URLs. Also, only - the path part of a url is quoted. If you need more fine-grained - control, you should probably subclass URLParser and pass it in via - the 'urlparser' option. 
- BANDWIDTH THROTTLING urlgrabber supports throttling via two values: throttle and @@ -364,7 +388,7 @@ """ -# $Id: grabber.py,v 1.48 2006/09/22 00:58:05 mstenner Exp $ +# $Id: grabber.py,v 1.52 2006/12/12 19:08:46 mstenner Exp $ import os import os.path @@ -375,6 +399,7 @@ import string import urllib import urllib2 +import thread from stat import * # S_* and ST_* ######################################################################## @@ -406,8 +431,10 @@ import keepalive from keepalive import HTTPHandler, HTTPSHandler have_keepalive = True + keepalive_http_handler = HTTPHandler() except ImportError, msg: have_keepalive = False + keepalive_http_handler = None try: # add in range support conditionally too @@ -463,7 +490,7 @@ if sslfactory.DEBUG is None: sslfactory.DEBUG = DBOBJ -def _init_default_logger(): +def _init_default_logger(logspec=None): '''Examines the environment variable URLGRABBER_DEBUG and creates a logging object (logging.logger) based on the contents. It takes the form @@ -489,9 +516,12 @@ collect the code into a nice block.''' try: - dbinfo = os.environ['URLGRABBER_DEBUG'].split(',') + if logspec is None: + logspec = os.environ['URLGRABBER_DEBUG'] + dbinfo = logspec.split(',') import logging - level = logging._levelNames.get(dbinfo[0], int(dbinfo[0])) + level = logging._levelNames.get(dbinfo[0], None) + if level is None: level = int(dbinfo[0]) if level < 1: raise ValueError() formatter = logging.Formatter('%(asctime)s %(message)s') @@ -508,7 +538,17 @@ DBOBJ = None set_logger(DBOBJ) +def _log_package_state(): + if not DEBUG: return + DEBUG.info('urlgrabber version = %s' % __version__) + DEBUG.info('have_m2crypto = %s' % sslfactory.have_m2crypto) + DEBUG.info('trans function "_" = %s' % _) + DEBUG.info('have_keepalive = %s' % have_keepalive) + DEBUG.info('have_range = %s' % have_range) + DEBUG.info('have_socket_timeout = %s' % have_socket_timeout) + _init_default_logger() +_log_package_state() 
######################################################################## # END MODULE INITIALIZATION ######################################################################## @@ -536,6 +576,7 @@ 13 - malformed proxy url 14 - HTTPError (includes .code and .exception attributes) 15 - user abort + 16 - error writing to local file MirrorGroup error codes (256 -- 511) 256 - No more mirrors left to try @@ -811,6 +852,24 @@ self.ssl_ca_cert = None self.ssl_context = None + def __repr__(self): + return self.format() + + def format(self, indent=' '): + keys = self.__dict__.keys() + if self.delegate is not None: + keys.remove('delegate') + keys.sort() + s = '{\n' + for k in keys: + s = s + indent + '%-15s: %s,\n' % \ + (repr(k), repr(self.__dict__[k])) + if self.delegate: + df = self.delegate.format(indent + ' ') + s = s + indent + '%-15s: %s\n' % ("'delegate'", df) + s = s + indent + '}' + return s + class URLGrabber: """Provides easy opening of URLs with a variety of options. @@ -878,6 +937,7 @@ like any other file object. """ opts = self.opts.derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) (url,parts) = opts.urlparser.parse(url, opts) def retryfunc(opts, url): return URLGrabberFileObject(url, filename=None, opts=opts) @@ -890,6 +950,7 @@ different from the passed-in filename if copy_local == 0. """ opts = self.opts.derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) (url,parts) = opts.urlparser.parse(url, opts) (scheme, host, path, parm, query, frag) = parts if filename is None: @@ -934,6 +995,7 @@ into memory, but don't use too much' """ opts = self.opts.derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) (url,parts) = opts.urlparser.parse(url, opts) if limit is not None: limit = limit + 1 @@ -1021,7 +1083,7 @@ # it _must_ come before all other handlers in the list or urllib2 # chokes. 
if self.opts.proxies: - handlers.append( CachedProxyHandler(self.opts.proxies) ) + handlers.append( _proxy_handler_cache.get(self.opts.proxies) ) # ------------------------------------------------------- # OK, these next few lines are a serious kludge to get @@ -1044,19 +1106,19 @@ handlers.append( urllib2.FTPHandler() ) # ------------------------------------------------------- - ssl_factory = sslfactory.get_factory(self.opts.ssl_ca_cert, - self.opts.ssl_context) + ssl_factory = _ssl_factory_cache.get( (self.opts.ssl_ca_cert, + self.opts.ssl_context) ) if need_keepalive_handler: - handlers.append(HTTPHandler()) - handlers.append(HTTPSHandler(ssl_factory)) + handlers.append(keepalive_http_handler) + handlers.append(_https_handler_cache.get(ssl_factory)) if need_range_handler: handlers.extend( range_handlers ) handlers.append( auth_handler ) if self.opts.cache_openers: - self._opener = CachedOpenerDirector(ssl_factory, *handlers) + self._opener = _opener_cache.get([ssl_factory,] + handlers) else: - self._opener = ssl_factory.create_opener(*handlers) + self._opener = _opener_cache.create([ssl_factory,] + handlers) # OK, I don't like to do this, but otherwise, we end up with # TWO user-agent headers. self._opener.addheaders = [] @@ -1196,15 +1258,35 @@ def _do_grab(self): """dump the file to self.filename.""" - if self.append: new_fo = open(self.filename, 'ab') - else: new_fo = open(self.filename, 'wb') + if self.append: mode = 'ab' + else: mode = 'wb' + if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \ + (self.filename, mode)) + try: + new_fo = open(self.filename, mode) + except IOError, e: + raise URLGrabError(16, _(\ + 'error opening local file, IOError: %s') % (e, )) + + try: + # if we have a known range, only try to read that much. 
+ (low, high) = self.opts.range + amount = high - low + except TypeError, ValueError: + amount = None bs = 1024*8 size = 0 + if amount is not None: bs = min(bs, amount - size) block = self.read(bs) size = size + len(block) while block: - new_fo.write(block) + try: + new_fo.write(block) + except IOError, e: + raise URLGrabError(16, _(\ + 'error writing to local file, IOError: %s') % (e, )) + if amount is not None: bs = min(bs, amount - size) block = self.read(bs) size = size + len(block) @@ -1300,36 +1382,96 @@ try: self.fo.close_connection() except: pass -_handler_cache = [] -def CachedOpenerDirector(ssl_factory = None, *handlers): - for (cached_handlers, opener) in _handler_cache: - if cached_handlers == handlers: - for handler in opener.handlers: - handler.add_parent(opener) - return opener - if not ssl_factory: - ssl_factory = sslfactory.get_factory() - opener = ssl_factory.create_opener(*handlers) - _handler_cache.append( (handlers, opener) ) - return opener +##################################################################### -_proxy_cache = [] -def CachedProxyHandler(proxies): - for (pdict, handler) in _proxy_cache: - if pdict == proxies: - if DEBUG: DEBUG.debug('re-using proxy settings: %s', proxies) - break - else: +class NoDefault: pass +class ObjectCache: + def __init__(self, name=None): + self.name = name or self.__class__.__name__ + self._lock = thread.allocate_lock() + self._cache = [] + + def lock(self): + self._lock.acquire() + + def unlock(self): + self._lock.release() + + def get(self, key, create=None, found=None): + for (k, v) in self._cache: + if k == key: + if DEBUG: + DEBUG.debug('%s: found key' % self.name) + DEBUG.debug('%s: key = %s' % (self.name, key)) + DEBUG.debug('%s: val = %s' % (self.name, v)) + found = found or getattr(self, 'found', None) + if found: v = found(key, v) + return v + if DEBUG: + DEBUG.debug('%s: no key found' % self.name) + DEBUG.debug('%s: key = %s' % (self.name, key)) + create = create or getattr(self, 'create', 
None) + if create: + value = create(key) + if DEBUG: + DEBUG.info('%s: new value created' % self.name) + DEBUG.debug('%s: val = %s' % (self.name, value)) + self._cache.append( (key, value) ) + return value + else: + raise KeyError('key not found: %s' % key) + + def set(self, key, value): + if DEBUG: + DEBUG.info('%s: inserting key' % self.name) + DEBUG.debug('%s: key = %s' % (self.name, key)) + DEBUG.debug('%s: val = %s' % (self.name, value)) + self._cache.append( (key, value) ) + + def ts_get(self, key, create=None, found=None): + self._lock.acquire() + try: + self.get(key, create, found) + finally: + self._lock.release() + + def ts_set(self, key, value): + self._lock.acquire() + try: + self.set(key, value) + finally: + self._lock.release() + +class OpenerCache(ObjectCache): + def found(self, factory_and_handlers, opener): + for handler in factory_and_handlers[1:]: + handler.add_parent(opener) + return opener + def create(self, factory_and_handlers): + factory = factory_and_handlers[0] + handlers = factory_and_handlers[1:] + return factory.create_opener(*handlers) +_opener_cache = OpenerCache() + +class ProxyHandlerCache(ObjectCache): + def create(self, proxies): for k, v in proxies.items(): utype, url = urllib.splittype(v) host, other = urllib.splithost(url) if (utype is None) or (host is None): raise URLGrabError(13, _('Bad proxy URL: %s') % v) + return urllib2.ProxyHandler(proxies) +_proxy_handler_cache = ProxyHandlerCache() - if DEBUG: DEBUG.info('creating new proxy handler: %s', proxies) - handler = urllib2.ProxyHandler(proxies) - _proxy_cache.append( (proxies, handler) ) - return handler +class HTTPSHandlerCache(ObjectCache): + def create(self, ssl_factory): + return HTTPSHandler(ssl_factory) +_https_handler_cache = HTTPSHandlerCache() + +class SSLFactoryCache(ObjectCache): + def create(self, cert_and_context): + return sslfactory.get_factory(*cert_and_context) +_ssl_factory_cache = SSLFactoryCache() 
##################################################################### # DEPRECATED FUNCTIONS