From b700831f566c24d711be187ffc62988ece0db7c6 Mon Sep 17 00:00:00 2001
From: Michael Scholz
Date: Thu, 16 May 2013 12:09:29 +0200
Subject: [PATCH] crawler: exception handling

---
 .../Uebungen/2_Uebung/crawler/crawler.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
index 9cfc0967..e6cb10b5 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
@@ -3,6 +3,7 @@ import sys
 import random
 import robotparser
 from sgmllib import SGMLParser
+import sgmllib
 from urlparse import urlparse
 import time
 from termcolor import colored
@@ -19,7 +20,7 @@ TODO:
 # crawler attributes
 entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
 entrypoint = "http://www.spiegel.de"
-#entrypoint = "http://www.buchaktuell.de/"
+entrypoint = "http://www.google.de/"
 numberOfPagesToCrawl = 1000
 timeBetweenSameHost = 5 # 5 sec
 
@@ -92,7 +93,6 @@ def getNextUrlToVisit():
     ## check if url is blocked by robots.txt or was already visited
     ##
     if blockedByRobotsTxt(url) or url in visitedUrls:
-        print str(len(extractor.urls))
         extractor.urls.remove(url)
         return getNextUrlToVisit()
 
@@ -156,19 +156,22 @@ if __name__ == "__main__":
             extractor.feed(pageContent)
             pages[url] = pageContent
             i += 1
+
+        # exception handling
         except urllib2.HTTPError, err:
             if err.code == 404:
-                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: not found: "+url, "red")
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
             if err.code == 400:
-                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: bad request: "+url, "red")
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
             if err.code == 403:
-                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: forbidden: "+url, "red")
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
         except urllib2.URLError:
-            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
+            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
+        except sgmllib.SGMLParseError:
+            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
+            page.close()
         except:
             print "Unexpected error:", sys.exc_info()[0]
-
-        i = 1010
     extractor.close()
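
Note: below is a minimal, self-contained Python 2 sketch of the error-handling
pattern this patch introduces, for reference. The fetch() helper and the example
URL are illustrative and do not appear in the crawler; unlike the patched code,
which closes the page handle only in the SGMLParseError branch, the sketch
releases it in a finally clause so the socket is closed on every path.

# Minimal Python 2 sketch (illustrative; fetch() is not part of the crawler).
import sys
import urllib2
import sgmllib
from sgmllib import SGMLParser

def fetch(url):
    page = None
    try:
        page = urllib2.urlopen(url)
        content = page.read()
        SGMLParser().feed(content)    # may raise sgmllib.SGMLParseError
        return content
    except urllib2.HTTPError, err:    # server replied with an HTTP error code
        print "ERROR: HTTP Error %d: %s" % (err.code, url)
    except urllib2.URLError:          # network-level failure (DNS, refused, ...)
        print "ERROR: urllib2.URLError: %s" % url
    except sgmllib.SGMLParseError:    # page content is not parseable markup
        print "ERROR: sgmllib.SGMLParseError: %s" % url
    except:                           # report anything else and keep crawling
        print "Unexpected error:", sys.exc_info()[0]
    finally:
        if page is not None:
            page.close()              # release the connection on every path

if __name__ == "__main__":
    fetch("http://www.example.com/")

Because urllib2.HTTPError is a subclass of urllib2.URLError, the HTTPError
handler must come first; the patched crawler orders its handlers the same way.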