From ae01454a1db29292ea58ee4932eca3cd4a00827d Mon Sep 17 00:00:00 2001
From: Michael Scholz
Date: Thu, 16 May 2013 13:46:17 +0200
Subject: [PATCH] crawler: fixed SGMLParseError

---
 .../Uebungen/2_Uebung/crawler/crawler.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
index e6cb10b5..7ae96742 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
@@ -1,4 +1,4 @@
-import urllib, urllib2
+import urllib2
 import sys
 import random
 import robotparser
@@ -20,7 +20,7 @@ TODO:
 # crawler attributes
 entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
 entrypoint = "http://www.spiegel.de"
-entrypoint = "http://www.google.de/"
+#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
 
 numberOfPagesToCrawl = 1000
 timeBetweenSameHost = 5 # 5 sec
@@ -107,7 +107,7 @@ def getNextUrlToVisit():
             return url
         else:
             secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
-            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "red")
+            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "magenta")
             return getNextUrlToVisit()
     else:
         visitedHostsWithTimestamp[host] = int(time.time())
@@ -117,6 +117,10 @@ def getNextUrlToVisit():
 
 class URLLister(SGMLParser):
 
+    ## fix SGMLParseError
+    def resetParser(self):
+        SGMLParser.reset(self)
+
     def reset(self):
         SGMLParser.reset(self)
         self.urls = []
@@ -152,6 +156,7 @@ if __name__ == "__main__":
         try:
             page = urllib2.urlopen(url, timeout = 6)
             pageContent = page.read()
+            pageContent = pageContent.replace('
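
Note (not part of the patch): a minimal sketch of how the new resetParser() hook could be used around the link-extraction step, assuming the truncated remainder of the patch catches sgmllib.SGMLParseError when feeding the downloaded page. URLLister, pageContent and url are names from crawler.py; the except branch and its message are assumptions, not the committed code.

    from sgmllib import SGMLParseError   # raised by sgmllib on malformed markup

    parser = URLLister()
    try:
        parser.feed(pageContent)         # collect links from the downloaded page
        parser.close()
    except SGMLParseError:
        # reset the underlying SGMLParser state and skip the page instead of
        # letting a single malformed document crash the crawler
        parser.resetParser()
        print " -> SGMLParseError while parsing " + url + ", page skipped"

Because resetParser() calls SGMLParser.reset() directly rather than URLLister.reset(), the parser's internal state is cleared without discarding whatever self.urls already holds.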