crawler: fixed SGMLParseError
This commit is contained in:
parent b700831f56
commit ae01454a1d
@@ -1,4 +1,4 @@
-import urllib, urllib2
+import urllib2
 import sys
 import random
 import robotparser
@@ -20,7 +20,7 @@ TODO:
 # crawler attributes
 entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
 entrypoint = "http://www.spiegel.de"
-entrypoint = "http://www.google.de/"
+#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
 numberOfPagesToCrawl = 1000
 timeBetweenSameHost = 5 # 5 sec
 
@@ -107,7 +107,7 @@ def getNextUrlToVisit():
             return url
         else:
             secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
-            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "red")
+            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "magenta")
             return getNextUrlToVisit()
     else:
         visitedHostsWithTimestamp[host] = int(time.time())
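
The branch above is the crawler's per-host politeness check: a host that was contacted less than timeBetweenSameHost seconds ago gets skipped and another URL is drawn instead. A minimal standalone sketch of that bookkeeping, reusing the dictionary and constant names from the diff (the hostReady helper itself is hypothetical; Python 2):

import time
from urlparse import urlparse

timeBetweenSameHost = 5          # minimum seconds between two hits on one host
visitedHostsWithTimestamp = {}   # host -> unix timestamp of the last visit

def hostReady(url):
    # True if the url's host was not contacted within the last
    # timeBetweenSameHost seconds; records the visit when it is
    host = urlparse(url).netloc
    now = int(time.time())
    last = visitedHostsWithTimestamp.get(host)
    if last is not None and now - last < timeBetweenSameHost:
        return False
    visitedHostsWithTimestamp[host] = now
    return True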
@@ -117,6 +117,10 @@ def getNextUrlToVisit():
 
 
 class URLLister(SGMLParser):
+    ## fix SGMLParseError
+    def resetParser(self):
+        SGMLParser.reset(self)
+
     def reset(self):
         SGMLParser.reset(self)
         self.urls = []
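
This is the core of the fix: URLLister.reset() wipes self.urls along with sgmllib's internal buffers, so calling it after a parse error would throw away every link collected so far. The new resetParser() resets only the SGMLParser base class and leaves self.urls intact. A sketch of how the hook is used, against the standard Python 2 sgmllib API (the start_a handler and the sample input are illustrative, not part of this commit):

import sgmllib
from sgmllib import SGMLParser

class URLLister(SGMLParser):
    ## fix SGMLParseError: clear parser state but keep collected urls
    def resetParser(self):
        SGMLParser.reset(self)

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # remember the href attribute of every <a> tag
        self.urls.extend([v for k, v in attrs if k == 'href'])

lister = URLLister()
try:
    lister.feed('<a href="http://example.com/">example</a>')
except sgmllib.SGMLParseError:
    lister.resetParser()   # recover; lister.urls survives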
@ -152,6 +156,7 @@ if __name__ == "__main__":
|
||||
try:
|
||||
page = urllib2.urlopen(url, timeout = 6)
|
||||
pageContent = page.read()
|
||||
pageContent = pageContent.replace('<![CDATA[', '<![CDATA[') ## bugfix for SGMLParser
|
||||
page.close()
|
||||
extractor.feed(pageContent)
|
||||
pages[url] = pageContent
|
||||
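
The added replace() normalizes the CDATA marker before the page reaches the parser; sgmllib is known to stumble over some CDATA constructs, and the commented-out feedparser entrypoint above appears to be the test case for exactly that. For context, a sketch of the fetch step that produces pageContent (fetchPage is a hypothetical helper; urllib2.urlopen's timeout parameter requires Python 2.6+):

import urllib2

def fetchPage(url, timeout=6):
    # fetch one page, always closing the connection, and return the raw body
    page = urllib2.urlopen(url, timeout=timeout)
    try:
        return page.read()
    finally:
        page.close()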
@ -161,18 +166,23 @@ if __name__ == "__main__":
|
||||
except urllib2.HTTPError, err:
|
||||
if err.code == 404:
|
||||
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
|
||||
pass
|
||||
if err.code == 400:
|
||||
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
|
||||
pass
|
||||
if err.code == 403:
|
||||
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
|
||||
pass
|
||||
except urllib2.URLError:
|
||||
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
|
||||
pass
|
||||
except sgmllib.SGMLParseError:
|
||||
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
|
||||
page.close()
|
||||
extractor.resetParser()
|
||||
pass
|
||||
except:
|
||||
print "Unexpected error:", sys.exc_info()[0]
|
||||
|
||||
pass
|
||||
|
||||
extractor.close()
|
||||
|
||||
|
||||
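
The new except sgmllib.SGMLParseError branch is what the commit message refers to: instead of letting one malformed page kill the whole crawl, the connection is closed, the parser state is cleared via resetParser(), and the loop continues with the next URL. A condensed, hypothetical version of the error ladder (plain prints instead of termcolor's colored; Python 2):

import sys
import urllib2
import sgmllib

def crawlOnce(extractor, url):
    # fetch one page, feed it to the link extractor, and swallow
    # the same errors the main loop swallows
    try:
        page = urllib2.urlopen(url, timeout=6)
        pageContent = page.read()
        page.close()
        extractor.feed(pageContent)
        return pageContent
    except urllib2.HTTPError, err:        # 404, 400, 403, ...
        print "ERROR: HTTP Error %d: %s" % (err.code, url)
    except urllib2.URLError:
        print "ERROR: urllib2.URLError: %s" % url
    except sgmllib.SGMLParseError:
        print "ERROR: sgmllib.SGMLParseError: %s" % url
        extractor.resetParser()           # the fix from this commit
    except:
        print "Unexpected error:", sys.exc_info()[0]
    return None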