curl-and-python
Function to retrieve multiple URLs asynchronously
Date: Mon, 7 Mar 2005 21:14:50 -0800 (PST)
Has anyone written a function to retrieve multiple
URLs asynchronously? I tried to hack the
retriever-multi.py example to do so (see below), but
am having some trouble with it.
Anyway, here's my hack. Comments, feedback, and fixes
appreciated. Please cc me on all responses.
import sys
import pycurl
import cStringIO


def harvestURLs(urls, num_conn=10):
    # We should ignore SIGPIPE when using pycurl.NOSIGNAL - see
    # the libcurl tutorial for more info.
    try:
        import signal
        from signal import SIGPIPE, SIG_IGN
        signal.signal(signal.SIGPIPE, signal.SIG_IGN)
    except ImportError:
        pass

    # Make a queue with (url, filename) tuples
    queue = []
    for url in urls:
        url = url.strip()
        if not url or url[0] == "#":
            continue
        filename = "doc_%03d.dat" % (len(queue) + 1)
        queue.append((url, filename))

    # Check args
    assert queue, "no URLs given"
    num_urls = len(queue)
    num_conn = min(num_conn, num_urls)
    assert 1 <= num_conn <= 10000, "invalid number of concurrent connections"
    print "PycURL %s (compiled against 0x%x)" % (pycurl.version, pycurl.COMPILE_LIBCURL_VERSION_NUM)
    print "----- Getting", num_urls, "URLs using", num_conn, "connections -----"

    # Pre-allocate a list of curl objects
    m = pycurl.CurlMulti()
    m.handles = []
    for i in range(num_conn):
        c = pycurl.Curl()
        c.fp = None
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.MAXREDIRS, 5)
        #c.setopt(pycurl.CONNECTTIMEOUT, 30)
        c.setopt(pycurl.CONNECTTIMEOUT, 15)
        #c.setopt(pycurl.TIMEOUT, 300)
        c.setopt(pycurl.TIMEOUT, 25)
        c.setopt(pycurl.NOSIGNAL, 1)
        m.handles.append(c)

    # Main loop
    freelist = m.handles[:]
    num_processed = 0
    r = []
    while num_processed < num_urls:
        # If there is an url to process and a free curl object,
        # add to multi stack
        while queue and freelist:
            url, filename = queue.pop(0)
            c = freelist.pop()
            #c.fp = open(filename, "wb")
            c.setopt(pycurl.URL, url)
            #c.setopt(pycurl.WRITEDATA, c.fp)
            # store some info and buffer the body in memory
            #c.filename = filename
            c.url = url
            c.res = cStringIO.StringIO()
            c.setopt(pycurl.WRITEFUNCTION, c.res.write)
            m.add_handle(c)
        # Run the internal curl state machine for the multi stack
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # Check for curl objects which have terminated,
        # and add them to the freelist
        while 1:
            num_q, ok_list, err_list = m.info_read()
            for c in ok_list:
                #c.fp.close()
                #c.fp = None
                m.remove_handle(c)
                #print "Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL)
                print "Success:", c.url, c.getinfo(pycurl.EFFECTIVE_URL)
                # Grab the result now, before the handle gets reused
                r.append((c.url, c.res.getvalue()))
                freelist.append(c)
            for c, errno, errmsg in err_list:
                #c.fp.close()
                #c.fp = None
                m.remove_handle(c)
                #print "Failed: ", c.filename, c.url, errno, errmsg
                print "Failed: ", c.url, errno, errmsg
                freelist.append(c)
            num_processed = num_processed + len(ok_list) + len(err_list)
            if num_q == 0:
                break
        # Currently no more I/O is pending, could do something in the
        # meantime (display a progress bar, etc.).
        # We just call select() to sleep until some more data is available.
        #m.select()
        # I was getting stuckage, so let's try this
        print "%s URLs total. %s completed." % (num_urls, num_processed)
        m.select(25)

    # Cleanup
    for c in m.handles:
        #if c.fp is not None:
        #    c.fp.close()
        #c.fp = None
        c.close()
    m.close()
    print "Returning %s URLs" % len(r)
    return r
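
A minimal driver sketch, just to show how the function is meant to be
called (the URLs here are placeholders, not anything in particular):

if __name__ == "__main__":
    # Placeholder URLs -- swap in whatever you want to fetch
    urls = [
        "http://curl.haxx.se/",
        "http://pycurl.sourceforge.net/",
    ]
    results = harvestURLs(urls, num_conn=2)
    for url, body in results:
        print "%s -> %d bytes" % (url, len(body))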
Received on 2005-03-08