crawler: fixed SGMLParseError

Michael Scholz 2013-05-16 13:46:17 +02:00
parent b700831f56
commit ae01454a1d


@@ -1,4 +1,4 @@
-import urllib, urllib2
+import urllib2
 import sys
 import random
 import robotparser
@@ -20,7 +20,7 @@ TODO:
 # crawler attributes
 entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
 entrypoint = "http://www.spiegel.de"
 entrypoint = "http://www.google.de/"
-entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
+#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
 numberOfPagesToCrawl = 1000
 timeBetweenSameHost = 5 # 5 sec
@@ -107,7 +107,7 @@ def getNextUrlToVisit():
             return url
         else:
             secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
-            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "red")
+            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "magenta")
             return getNextUrlToVisit()
     else:
         visitedHostsWithTimestamp[host] = int(time.time())
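
Note: the politeness logic this hunk touches keeps one timestamp per host in visitedHostsWithTimestamp and refuses to revisit a host until timeBetweenSameHost seconds have passed. A minimal standalone sketch of that check, reusing the names visible in the diff (the urlparse-based host extraction is an assumption, not shown in this commit):

    import time
    import urlparse

    timeBetweenSameHost = 5          # minimum delay per host, in seconds
    visitedHostsWithTimestamp = {}   # host -> unix timestamp of last visit

    def hostMayBeVisited(url):
        # assumption: hosts are keyed by the URL's network location
        host = urlparse.urlparse(url).netloc
        if host in visitedHostsWithTimestamp:
            secondsSinceLastVisit = int(time.time()) - visitedHostsWithTimestamp[host]
            if secondsSinceLastVisit < timeBetweenSameHost:
                return False   # give the host a break
        visitedHostsWithTimestamp[host] = int(time.time())
        return True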
@@ -117,6 +117,10 @@ def getNextUrlToVisit():
 class URLLister(SGMLParser):
+    ## fix SGMLParseError
+    def resetParser(self):
+        SGMLParser.reset(self)
+
     def reset(self):
         SGMLParser.reset(self)
         self.urls = []
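
The point of the new resetParser() is that URLLister overrides reset() to also clear self.urls, so calling reset() after a parse error would discard every link collected so far; resetParser() restores only the inherited parser state. A sketch of the class under that reading (the start_a handler follows the usual URLLister pattern and is an assumption, it is not part of this diff):

    from sgmllib import SGMLParser

    class URLLister(SGMLParser):
        ## fix SGMLParseError: reset inherited parser state, keep self.urls
        def resetParser(self):
            SGMLParser.reset(self)

        def reset(self):
            SGMLParser.reset(self)
            self.urls = []

        def start_a(self, attrs):
            # collect href attributes from <a> tags (assumed handler)
            self.urls.extend(v for k, v in attrs if k == 'href')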
@@ -152,6 +156,7 @@ if __name__ == "__main__":
         try:
             page = urllib2.urlopen(url, timeout = 6)
             pageContent = page.read()
+            pageContent = pageContent.replace('<![CDATA[', '&lt;![CDATA[') ## bugfix for SGMLParser
             page.close()
             extractor.feed(pageContent)
             pages[url] = pageContent
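
Background on the new line: sgmllib can raise SGMLParseError on pages whose <![CDATA[ ... ]]> marked sections trip its declaration handling (the feedparser issue linked in the commented entrypoint above demonstrates exactly this). Escaping the opening marker makes the parser see plain character data instead. A standalone sketch of the workaround (the demo markup is made up):

    import sgmllib

    page = '<html><head><script>//<![CDATA[\nvar x = 1;\n//]]></script></head></html>'

    ## bugfix for SGMLParser: neutralise the CDATA opener before feeding,
    ## so the section is parsed as ordinary character data
    page = page.replace('<![CDATA[', '&lt;![CDATA[')

    parser = sgmllib.SGMLParser()
    parser.feed(page)
    parser.close()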
@@ -161,18 +166,23 @@ if __name__ == "__main__":
         except urllib2.HTTPError, err:
             if err.code == 404:
                 print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
                 pass
             if err.code == 400:
                 print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
                 pass
             if err.code == 403:
                 print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
                 pass
         except urllib2.URLError:
             print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
             pass
+        except sgmllib.SGMLParseError:
+            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
+            page.close()
+            extractor.resetParser()
+            pass
         except:
             print "Unexpected error:", sys.exc_info()[0]
             pass
         extractor.close()
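
Two details worth noting in this except chain: urllib2.HTTPError is a subclass of urllib2.URLError, so the HTTPError handler has to come first or it would never run, and the new SGMLParseError handler resets only the inherited parser state, so extractor.urls keeps the links gathered from earlier pages. A condensed sketch of the recovery path (the fetchAndParse wrapper and its message strings are illustrative, not the commit's code):

    import sgmllib
    import urllib2

    def fetchAndParse(url, extractor):
        # returns the page content on success, None on any handled error
        try:
            page = urllib2.urlopen(url, timeout = 6)
            pageContent = page.read().replace('<![CDATA[', '&lt;![CDATA[')
            page.close()
            extractor.feed(pageContent)
            return pageContent
        except urllib2.HTTPError, err:
            # must precede URLError: HTTPError is a subclass of it
            print "HTTP error %d: %s" % (err.code, url)
        except urllib2.URLError:
            print "connection failed: %s" % url
        except sgmllib.SGMLParseError:
            # reset only inherited parser state; extractor.urls survives
            extractor.resetParser()
        return None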