crawler: exception handling
This commit is contained in:
parent a64659eb1b
commit b700831f56
@@ -3,6 +3,7 @@ import sys
 import random
 import robotparser
 from sgmllib import SGMLParser
+import sgmllib
 from urlparse import urlparse
 import time
 from termcolor import colored
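The modules here are the Python 2 spellings (robotparser, sgmllib, urlparse, urllib2); the newly imported sgmllib is what makes the sgmllib.SGMLParseError clause further down resolvable. For orientation, a minimal fetch in the style the new except-clauses assume might look like the sketch below; fetchPage is a hypothetical name, and the real crawl loop around extractor.feed() is not part of this hunk.

import urllib2

def fetchPage(url):
    # urlopen raises urllib2.HTTPError (which carries a .code attribute)
    # and urllib2.URLError -- the two exceptions the new handlers catch
    page = urllib2.urlopen(url)
    pageContent = page.read()
    page.close()
    return pageContent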
@@ -19,7 +20,7 @@ TODO:
 # crawler attributes
-entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
+entrypoint = "http://www.spiegel.de"
 #entrypoint = "http://www.buchaktuell.de/"
 entrypoint = "http://www.google.de/"
 numberOfPagesToCrawl = 1000
 timeBetweenSameHost = 5 # 5 sec
 
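timeBetweenSameHost is declared among the crawler attributes, but this hunk does not show how it is enforced. A plausible sketch, assuming a per-host timestamp map (lastVisitByHost and waitForHost are invented names):

import time
from urlparse import urlparse

timeBetweenSameHost = 5  # 5 sec, as above
lastVisitByHost = {}     # hypothetical map: host -> time.time() of last request

def waitForHost(url):
    # block until at least timeBetweenSameHost seconds have passed
    # since the previous request to the same host
    host = urlparse(url).netloc
    elapsed = time.time() - lastVisitByHost.get(host, 0)
    if elapsed < timeBetweenSameHost:
        time.sleep(timeBetweenSameHost - elapsed)
    lastVisitByHost[host] = time.time()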
@@ -92,7 +93,6 @@ def getNextUrlToVisit():
 
     ## check if url is blocked by robots.txt or was already visited ##
     if blockedByRobotsTxt(url) or url in visitedUrls:
-        print str(len(extractor.urls))
         extractor.urls.remove(url)
         return getNextUrlToVisit()
 
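blockedByRobotsTxt is called here but defined elsewhere; given the robotparser import, a sketch of what it may look like (the per-host cache robotParsers is an assumption, not shown in the diff):

import robotparser
from urlparse import urlparse

robotParsers = {}  # hypothetical cache: one parsed robots.txt per host

def blockedByRobotsTxt(url):
    host = urlparse(url).netloc
    if host not in robotParsers:
        rp = robotparser.RobotFileParser()
        rp.set_url("http://" + host + "/robots.txt")
        rp.read()  # fetch and parse robots.txt once per host
        robotParsers[host] = rp
    return not robotParsers[host].can_fetch("*", url)

Note that getNextUrlToVisit() retries by calling itself, so a long run of blocked or already-visited URLs can exhaust Python's default recursion limit (about 1000 frames); a while loop would sidestep that.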
@@ -156,19 +156,22 @@ if __name__ == "__main__":
             extractor.feed(pageContent)
             pages[url] = pageContent
             i += 1
 
+        # exception handling
         except urllib2.HTTPError, err:
             if err.code == 404:
-                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: not found: "+url, "red")
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
             if err.code == 400:
-                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: bad request: "+url, "red")
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
             if err.code == 403:
-                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: forbidden: "+url, "red")
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
         except urllib2.URLError:
-            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
+            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
+        except sgmllib.SGMLParseError:
+            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
+            page.close()
         except:
             print "Unexpected error:", sys.exc_info()[0]
 
+            i = 1010
 
 
     extractor.close()
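The three if err.code checks run independently even though at most one can match; an elif chain or a lookup table would make that explicit. A sketch of the table variant (httpErrorMessages and reportHttpError are invented names):

import urllib2
from termcolor import colored

httpErrorMessages = {400: "bad request", 403: "forbidden", 404: "not found"}

def reportHttpError(err, url, i, numberOfPagesToCrawl):
    # fall back to the raw status code for errors the table does not name
    reason = httpErrorMessages.get(err.code, "status %d" % err.code)
    print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: "+reason+": "+url, "red")

It would be called from the handler as "except urllib2.HTTPError, err: reportHttpError(err, url, i, numberOfPagesToCrawl)". The bare except: that follows swallows every other exception, and i = 1010 appears to abort the crawl by jumping past numberOfPagesToCrawl = 1000; a break would state that intent directly.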