crawler update

This commit is contained in:
Michael Scholz 2013-05-13 23:16:38 +02:00
parent d11e2030fe
commit e81ddb5c30

View File

@@ -34,15 +34,22 @@ extractor = URLLister()
# Seed the URL queue with the links found on the start page.
extractor.feed(page.read())
i = 1
numberOfSites = 1000
lastHost = ""
# Crawl up to numberOfSites pages, never visiting the same host twice in a
# row. Every page that is fetched feeds its links back into extractor.urls.
while i <= numberOfSites:
    # Only URLs whose host differs from the previously visited one are
    # eligible. Filtering up front draws from the same set of URLs as
    # "pick at random and retry on a host clash", but it cannot spin forever
    # when every queued URL is on the previous host, and it never calls
    # random.choice() on an empty queue.
    lastNetloc = urlparse(lastHost).netloc
    candidates = [u for u in extractor.urls
                  if urlparse(u).netloc != lastNetloc]
    if not candidates:
        print("no crawlable url left in queue, stopping")
        break
    # get random url from queue
    url = random.choice(candidates)
    # remove url from queue (before fetching, so a failing URL is never
    # retried and the loop always makes progress)
    extractor.urls.remove(url)
    print("("+str(i)+"/"+str(numberOfSites)+") currently visited url: "+url)
    try:
        page = urllib.urlopen(url)
        try:
            html = page.read()
        finally:
            # release the connection even if reading fails
            page.close()
    except IOError as e:
        # An unreachable page must not abort the whole crawl; it does not
        # count as a visited site either.
        print("skipping url ("+str(e)+"): "+url)
        continue
    extractor.feed(html)
    i = i + 1
    # remember the last successfully visited URL; its host is excluded from
    # the next draw
    lastHost = url
extractor.close()