From b700831f566c24d711be187ffc62988ece0db7c6 Mon Sep 17 00:00:00 2001
From: Michael Scholz
Date: Thu, 16 May 2013 12:09:29 +0200
Subject: [PATCH] crawler: exception handling

---
 .../Uebungen/2_Uebung/crawler/crawler.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
index 9cfc0967..e6cb10b5 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
@@ -3,6 +3,7 @@ import sys
 import random
 import robotparser
 from sgmllib import SGMLParser
+import sgmllib
 from urlparse import urlparse
 import time
 from termcolor import colored
@@ -19,7 +20,7 @@ TODO:
 # crawler attributes
 entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
 entrypoint = "http://www.spiegel.de"
-#entrypoint = "http://www.buchaktuell.de/"
+entrypoint = "http://www.google.de/"
 numberOfPagesToCrawl = 1000
 timeBetweenSameHost = 5 # 5 sec
 
@@ -92,7 +93,6 @@ def getNextUrlToVisit():
     ## check if url is blocked by robots.txt or was already visited
     ##
     if blockedByRobotsTxt(url) or url in visitedUrls:
-        print str(len(extractor.urls))
         extractor.urls.remove(url)
         return getNextUrlToVisit()
 
@@ -156,19 +156,22 @@ if __name__ == "__main__":
             extractor.feed(pageContent)
             pages[url] = pageContent
             i += 1
+
+        # exception handling
         except urllib2.HTTPError, err:
             if err.code == 404:
-                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: not found: "+url, "red")
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
             if err.code == 400:
-                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: bad request: "+url, "red")
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
             if err.code == 403:
-                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: forbidden: "+url, "red")
+                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
         except urllib2.URLError:
-            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
+            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
+        except sgmllib.SGMLParseError:
+            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
+            page.close()
         except:
             print "Unexpected error:", sys.exc_info()[0]
-
-        i = 1010
     extractor.close()
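
Note: below is a minimal, self-contained Python 2 sketch of the error-handling
pattern this patch introduces, for reference. The fetch() helper and the example
URL are illustrative and do not appear in the crawler; unlike the patched code,
which closes the page handle only in the SGMLParseError branch, the sketch
releases it in a finally clause so the socket is closed on every path.

# Minimal Python 2 sketch (illustrative; fetch() is not part of the crawler).
import sys
import urllib2
import sgmllib
from sgmllib import SGMLParser

def fetch(url):
    page = None
    try:
        page = urllib2.urlopen(url)
        content = page.read()
        SGMLParser().feed(content)    # may raise sgmllib.SGMLParseError
        return content
    except urllib2.HTTPError, err:    # server replied with an HTTP error code
        print "ERROR: HTTP Error %d: %s" % (err.code, url)
    except urllib2.URLError:          # network-level failure (DNS, refused, ...)
        print "ERROR: urllib2.URLError: %s" % url
    except sgmllib.SGMLParseError:    # page content is not parseable markup
        print "ERROR: sgmllib.SGMLParseError: %s" % url
    except:                           # report anything else and keep crawling
        print "Unexpected error:", sys.exc_info()[0]
    finally:
        if page is not None:
            page.close()              # release the connection on every path

if __name__ == "__main__":
    fetch("http://www.example.com/")

Because urllib2.HTTPError is a subclass of urllib2.URLError, the HTTPError
handler must come first; the patched crawler orders its handlers the same way.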