crawler: exception handling

Michael Scholz 2013-05-16 12:09:29 +02:00
parent a64659eb1b
commit b700831f56


@@ -3,6 +3,7 @@ import sys
import random
import robotparser
from sgmllib import SGMLParser
import sgmllib
from urlparse import urlparse
import time
from termcolor import colored
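
The `extractor` object that is fed and closed further down is not part of this diff; as a rough sketch (an assumption, not the project's actual class), an SGMLParser-based link collector that exposes a `urls` list could look like this:

from sgmllib import SGMLParser

class LinkExtractor(SGMLParser):
    # hypothetical stand-in for the crawler's extractor object
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # start_a is called by SGMLParser for every <a> tag;
        # attrs is a list of (name, value) pairs
        for name, value in attrs:
            if name == "href":
                self.urls.append(value)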
@@ -19,7 +20,7 @@ TODO:
# crawler attributes
entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
entrypoint = "http://www.spiegel.de"
#entrypoint = "http://www.buchaktuell.de/"
entrypoint = "http://www.google.de/"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 5 # 5 sec
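
timeBetweenSameHost suggests a per-host politeness delay; the mechanism itself is not shown in this commit, but a minimal sketch under that assumption (lastVisit and politeDelay are made-up names) could be:

import time
from urlparse import urlparse

lastVisit = {}  # hostname -> time.time() of the last request to that host

def politeDelay(url, minDelay=5):  # 5 sec, matching timeBetweenSameHost above
    host = urlparse(url).netloc
    waited = time.time() - lastVisit.get(host, 0)
    if waited < minDelay:
        time.sleep(minDelay - waited)  # wait out the remaining delay
    lastVisit[host] = time.time()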
@@ -92,7 +93,6 @@ def getNextUrlToVisit():
## check if url is blocked by robots.txt or was already visited ##
if blockedByRobotsTxt(url) or url in visitedUrls:
print str(len(extractor.urls))
extractor.urls.remove(url)
return getNextUrlToVisit()
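
blockedByRobotsTxt() itself is not touched by this commit; with the robotparser module imported above, a plausible sketch (the robotCache name is hypothetical) is:

import robotparser
from urlparse import urlparse

robotCache = {}  # hostname -> RobotFileParser, so robots.txt is fetched once per host

def blockedByRobotsTxt(url):
    host = urlparse(url).netloc
    if host not in robotCache:
        rp = robotparser.RobotFileParser()
        rp.set_url("http://" + host + "/robots.txt")
        rp.read()  # download and parse the robots.txt
        robotCache[host] = rp
    return not robotCache[host].can_fetch("*", url)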
@@ -156,19 +156,22 @@ if __name__ == "__main__":
extractor.feed(pageContent)
pages[url] = pageContent
i += 1
# exception handling
except urllib2.HTTPError, err:
if err.code == 404:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: not found: "+url, "red")
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
if err.code == 400:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: bad request: "+url, "red")
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
if err.code == 403:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: forbidden: "+url, "red")
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
except urllib2.URLError:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
except sgmllib.SGMLParseError:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
page.close()
except:
print "Unexpected error:", sys.exc_info()[0]
i = 1010
extractor.close()
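
Condensed, the error handling this commit introduces follows the pattern below; fetchAndParse is a placeholder name, and `extractor` stands for the SGMLParser-based link collector used in the real loop:

import sys
import urllib2
import sgmllib

def fetchAndParse(url, extractor):
    try:
        page = urllib2.urlopen(url)   # may raise HTTPError or URLError
        extractor.feed(page.read())   # malformed markup raises SGMLParseError
        return True
    except urllib2.HTTPError, err:
        # the crawler prints separate messages for 404, 400 and 403
        print "HTTP Error " + str(err.code) + ": " + url
    except urllib2.URLError:
        print "urllib2.URLError: " + url
    except sgmllib.SGMLParseError:
        print "sgmllib.SGMLParseError: " + url
    except:
        # any other failure is reported and ends the crawl (i = 1010 above)
        print "Unexpected error:", sys.exc_info()[0]
    return False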