crawler: removed old imports, added timeout
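
Page fetching now goes through urllib2.urlopen with a 5-second timeout instead of
urllib.urlopen; the unused urllib and cprint imports are dropped, and canonicalUrl
is tightened to accept only .html pages and extension-less paths.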

Michael Scholz 2013-05-14 19:22:27 +02:00
parent a136dc18f5
commit 4935da85eb


@@ -1,11 +1,10 @@
-import urllib
 import urllib2
 import random
 import robotparser
 from sgmllib import SGMLParser
 from urlparse import urlparse
 import time
 import sys
-from termcolor import colored, cprint
+from termcolor import colored
 '''
 TODO:
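
Note on the swap: Python 2's urllib.urlopen() takes no timeout argument, which is
why the crawler moves to urllib2.urlopen(), whose optional timeout parameter has
existed since Python 2.6. cprint is dropped because only colored() is used below.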
@@ -61,7 +60,11 @@ def checkRobotsTxt(url):
 
 def canonicalUrl(url):
 	o = urlparse(url)
 	if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
-		return True
+		if ".html" in o.path:
+			return True
+		if "." not in o.path:
+			return True
+		return False
 	else:
 		return False
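
For reference, the new filter behaves as follows; this is a standalone Python 2
sketch using the function body from this hunk, with made-up example URLs:

	from urlparse import urlparse

	def canonicalUrl(url):
		o = urlparse(url)
		if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
			if ".html" in o.path:
				return True
			if "." not in o.path:
				return True
			return False
		else:
			return False

	print canonicalUrl("http://example.org/thesis.html")     # True: plain-http .html page
	print canonicalUrl("http://example.org/lehre/arbeiten")  # True: path without a file extension
	print canonicalUrl("http://example.org/thesis.pdf")      # False: pdf paths are filtered out
	print canonicalUrl("https://example.org/thesis.html")    # False: only the http scheme is crawled

The net effect of the hunk is that http URLs with any other file extension
(e.g. .php or .jpg), which previously returned True, are now rejected.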
@@ -122,7 +125,7 @@ class URLLister(SGMLParser):
 
 
 startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
-page = urllib.urlopen(startsite)
+page = urllib2.urlopen(startsite, timeout = 5)
 print "currently visited url: "+startsite
 extractor = URLLister()
 extractor.feed(page.read())
@@ -135,7 +138,7 @@ while(i <= numberOfSites):
 	url = getNextUrlToVisit()
 	print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
 	try:
-		page = urllib.urlopen(url)
+		page = urllib2.urlopen(url, timeout = 5)
 		extractor.feed(page.read())
 		global visitedSites
 		visitedSites += 1
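
The except clause that pairs with this try block sits outside the hunk. As a
sketch (hypothetical URL and messages), a urllib2 fetch with the new 5-second
timeout can fail in two ways worth catching:

	import socket
	import urllib2

	try:
		page = urllib2.urlopen("http://example.org/", timeout = 5)
		html = page.read()
	except urllib2.URLError as e:
		# connection errors; a timeout during connect arrives here wrapped as e.reason
		print "fetch failed: " + str(e.reason)
	except socket.timeout:
		# a timeout during read() is raised as a bare socket.timeout
		print "fetch timed out"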