crawler: exception handling

Michael Scholz 2013-05-16 12:09:29 +02:00
parent a64659eb1b
commit b700831f56


@@ -3,6 +3,7 @@ import sys
import random
import robotparser
from sgmllib import SGMLParser
import sgmllib
from urlparse import urlparse
import time
from termcolor import colored
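
The `extractor` object that is fed and closed further down is not part of this diff; as a rough sketch (an assumption, not the project's actual class), an SGMLParser-based link collector that exposes a `urls` list could look like this:

from sgmllib import SGMLParser

class LinkExtractor(SGMLParser):
    # hypothetical stand-in for the crawler's extractor object
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # start_a is called by SGMLParser for every <a> tag;
        # attrs is a list of (name, value) pairs
        for name, value in attrs:
            if name == "href":
                self.urls.append(value)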
@@ -19,7 +20,7 @@ TODO:
# crawler attributes
entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
entrypoint = "http://www.spiegel.de"
#entrypoint = "http://www.buchaktuell.de/"
entrypoint = "http://www.google.de/"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 5 # 5 sec
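
timeBetweenSameHost suggests a per-host politeness delay; the mechanism itself is not shown in this commit, but a minimal sketch under that assumption (lastVisit and politeDelay are made-up names) could be:

import time
from urlparse import urlparse

lastVisit = {}  # hostname -> time.time() of the last request to that host

def politeDelay(url, minDelay=5):  # 5 sec, matching timeBetweenSameHost above
    host = urlparse(url).netloc
    waited = time.time() - lastVisit.get(host, 0)
    if waited < minDelay:
        time.sleep(minDelay - waited)  # wait out the remaining delay
    lastVisit[host] = time.time()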
@@ -92,7 +93,6 @@ def getNextUrlToVisit():
## check if url is blocked by robots.txt or was already visited ##
if blockedByRobotsTxt(url) or url in visitedUrls:
print str(len(extractor.urls))
extractor.urls.remove(url)
return getNextUrlToVisit()
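
blockedByRobotsTxt() itself is not touched by this commit; with the robotparser module imported above, a plausible sketch (the robotCache name is hypothetical) is:

import robotparser
from urlparse import urlparse

robotCache = {}  # hostname -> RobotFileParser, so robots.txt is fetched once per host

def blockedByRobotsTxt(url):
    host = urlparse(url).netloc
    if host not in robotCache:
        rp = robotparser.RobotFileParser()
        rp.set_url("http://" + host + "/robots.txt")
        rp.read()  # download and parse the robots.txt
        robotCache[host] = rp
    return not robotCache[host].can_fetch("*", url)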
@@ -156,19 +156,22 @@ if __name__ == "__main__":
extractor.feed(pageContent)
pages[url] = pageContent
i += 1
# exception handling
except urllib2.HTTPError, err:
if err.code == 404:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: not found: "+url, "red")
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
if err.code == 400:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: bad request: "+url, "red")
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
if err.code == 403:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: forbidden: "+url, "red")
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
except urllib2.URLError:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
except sgmllib.SGMLParseError:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
page.close()
except:
print "Unexpected error:", sys.exc_info()[0]
i = 1010
extractor.close()
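
Condensed, the error handling this commit introduces follows the pattern below; fetchAndParse is a placeholder name, and `extractor` stands for the SGMLParser-based link collector used in the real loop:

import sys
import urllib2
import sgmllib

def fetchAndParse(url, extractor):
    try:
        page = urllib2.urlopen(url)   # may raise HTTPError or URLError
        extractor.feed(page.read())   # malformed markup raises SGMLParseError
        return True
    except urllib2.HTTPError, err:
        # the crawler prints separate messages for 404, 400 and 403
        print "HTTP Error " + str(err.code) + ": " + url
    except urllib2.URLError:
        print "urllib2.URLError: " + url
    except sgmllib.SGMLParseError:
        print "sgmllib.SGMLParseError: " + url
    except:
        # any other failure is reported and ends the crawl (i = 1010 above)
        print "Unexpected error:", sys.exc_info()[0]
    return False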