crawler: removed old imports, added timeout

This commit is contained in:
Michael Scholz 2013-05-14 19:22:27 +02:00
parent a136dc18f5
commit 4935da85eb

View File

@@ -1,11 +1,10 @@
-import urllib
+import urllib2
 import random
 import robotparser
 from sgmllib import SGMLParser
 from urlparse import urlparse
 import time
-import sys
-from termcolor import colored, cprint
+from termcolor import colored
 '''
 TODO:
@@ -61,7 +60,11 @@ def checkRobotsTxt(url):
 def canonicalUrl(url):
 	o = urlparse(url)
 	if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
-		return True
+		if ".html" in o.path:
+			return True
+		if "." not in o.path:
+			return True
+		return False
 	else:
 		return False
@@ -122,7 +125,7 @@ class URLLister(SGMLParser):
 startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
-page = urllib.urlopen(startsite)
+page = urllib2.urlopen(startsite, timeout = 5)
 print "currently visited url: "+startsite
 extractor = URLLister()
 extractor.feed(page.read())
@@ -135,7 +138,7 @@ while(i <= numberOfSites):
 url = getNextUrlToVisit()
 print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
 try:
-	page = urllib.urlopen(url)
+	page = urllib2.urlopen(url, timeout = 5)
 	extractor.feed(page.read())
 	global visitedSites
 	visitedSites += 1