crawler: removed old imports, added timeout
This commit is contained in:
parent
a136dc18f5
commit
4935da85eb
@ -1,11 +1,10 @@
|
||||
import urllib
|
||||
import urllib2
|
||||
import random
|
||||
import robotparser
|
||||
from sgmllib import SGMLParser
|
||||
from urlparse import urlparse
|
||||
import time
|
||||
import sys
|
||||
from termcolor import colored, cprint
|
||||
from termcolor import colored
|
||||
|
||||
'''
|
||||
TODO:
|
||||
@ -61,7 +60,11 @@ def checkRobotsTxt(url):
|
||||
def canonicalUrl(url):
    """Return True if *url* looks like a crawlable HTML page.

    A URL qualifies when it uses plain http, its path does not mention
    "pdf", its full URL contains no ".." segment, and the path either
    contains ".html" or has no "." at all (presumed to be a directory
    or extension-less HTML resource).
    """
    o = urlparse(url)
    # Guard clause (De Morgan of the original positive condition):
    # reject non-http schemes, PDFs, and URLs with ".." the crawler
    # cannot safely resolve.
    if o.scheme != 'http' or "pdf" in o.path or ".." in o.geturl():
        return False
    # Accept explicit HTML pages and extension-less paths only.
    return ".html" in o.path or "." not in o.path
|
||||
|
||||
@ -122,7 +125,7 @@ class URLLister(SGMLParser):
|
||||
|
||||
|
||||
startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
|
||||
page = urllib.urlopen(startsite)
|
||||
page = urllib2.urlopen(startsite, timeout = 5)
|
||||
print "currently visited url: "+startsite
|
||||
extractor = URLLister()
|
||||
extractor.feed(page.read())
|
||||
@ -135,7 +138,7 @@ while(i <= numberOfSites):
|
||||
url = getNextUrlToVisit()
|
||||
print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
|
||||
try:
|
||||
page = urllib.urlopen(url)
|
||||
page = urllib2.urlopen(url, timeout = 5)
|
||||
extractor.feed(page.read())
|
||||
global visitedSites
|
||||
visitedSites += 1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user