crawler update
This commit is contained in:
parent
d11e2030fe
commit
e81ddb5c30
@ -34,15 +34,22 @@ extractor = URLLister()
|
||||
extractor.feed(page.read())
|
||||
|
||||
i = 1
|
||||
numberOfSites = 1000
|
||||
lastHost = ""
|
||||
# crawl up to numberOfSites sites (previously hard-coded to 100)...
|
||||
while(i <= 100):
|
||||
while(i <= numberOfSites):
|
||||
# get random url from queue
|
||||
url = random.choice(extractor.urls)
|
||||
## remove url from queue
|
||||
extractor.urls.remove(url)
|
||||
print "currently visited url: "+url
|
||||
page = urllib.urlopen(url)
|
||||
extractor.feed(page.read())
|
||||
i = i + 1
|
||||
|
||||
# only visit this url if its host differs from the host of the last visited url
|
||||
if urlparse(url).netloc != urlparse(lastHost).netloc:
|
||||
## remove url from queue
|
||||
extractor.urls.remove(url)
|
||||
print "("+str(i)+"/"+str(numberOfSites)+") currently visited url: "+url
|
||||
page = urllib.urlopen(url)
|
||||
extractor.feed(page.read())
|
||||
i = i + 1
|
||||
lastHost = url
|
||||
|
||||
|
||||
extractor.close()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user