crawler: removed old imports, added timeout
This commit is contained in:
parent
a136dc18f5
commit
4935da85eb
@ -1,11 +1,10 @@
|
|||||||
import urllib
|
import urllib2
|
||||||
import random
|
import random
|
||||||
import robotparser
|
import robotparser
|
||||||
from sgmllib import SGMLParser
|
from sgmllib import SGMLParser
|
||||||
from urlparse import urlparse
|
from urlparse import urlparse
|
||||||
import time
|
import time
|
||||||
import sys
|
from termcolor import colored
|
||||||
from termcolor import colored, cprint
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
TODO:
|
TODO:
|
||||||
@ -61,7 +60,11 @@ def checkRobotsTxt(url):
|
|||||||
def canonicalUrl(url):
|
def canonicalUrl(url):
|
||||||
o = urlparse(url)
|
o = urlparse(url)
|
||||||
if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
|
if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
|
||||||
return True
|
if ".html" in o.path:
|
||||||
|
return True
|
||||||
|
if "." not in o.path:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -122,7 +125,7 @@ class URLLister(SGMLParser):
|
|||||||
|
|
||||||
|
|
||||||
startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
|
startsite = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
|
||||||
page = urllib.urlopen(startsite)
|
page = urllib2.urlopen(startsite, timeout = 5)
|
||||||
print "currently visited url: "+startsite
|
print "currently visited url: "+startsite
|
||||||
extractor = URLLister()
|
extractor = URLLister()
|
||||||
extractor.feed(page.read())
|
extractor.feed(page.read())
|
||||||
@ -135,7 +138,7 @@ while(i <= numberOfSites):
|
|||||||
url = getNextUrlToVisit()
|
url = getNextUrlToVisit()
|
||||||
print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
|
print colored("("+str(i)+"/"+str(numberOfSites)+") currently visiting url: "+url, "blue")
|
||||||
try:
|
try:
|
||||||
page = urllib.urlopen(url)
|
page = urllib2.urlopen(url, timeout = 5)
|
||||||
extractor.feed(page.read())
|
extractor.feed(page.read())
|
||||||
global visitedSites
|
global visitedSites
|
||||||
visitedSites += 1
|
visitedSites += 1
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user