2013-05-19 22:19:45 +02:00

510 lines
15 KiB
Python

# -*- coding: utf-8 -*-
import urllib2
import sys
import random
import robotparser
import re
import math
from sgmllib import SGMLParser
import sgmllib
from urlparse import urlparse
from urlparse import urljoin
import matplotlib.pyplot as plt
import time
from termcolor import colored
from collections import Counter
'''
Notes (VN):
- plagiarism checker done
- language detector done
TODO:
- DONE canonize urls -> canonize? slides? -> remember last host -> no magic here -> even using ugly global
- DONE with getNextUrlToVisit():
  server timeout -> save crawled host, set timeout for crawled host
- statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
'''
# crawler configuration
entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
#entrypoint = "http://www.spiegel.de" # german website
#entrypoint = "http://www.cnn.com" # english website
#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website
#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 0 # politeness delay in seconds between two hits on the same host (e.g. 5)
visitOnlyTuSites = True;
# crawler state (shared module-level globals)
prohibitedSites = 0 # number of URLs rejected by robots.txt
visitedUrls = [] # save already visited urls, so no url will be visited more than once
pages = {} # downloaded page contents, keyed by url
numberHyperlink = {} # how often each url was drawn from the frontier in getNextUrlToVisit()
numberHyperlinksPerPage = {} # occurrence count per raw href target seen while parsing
visitedHostsWithTimestamp = {} # last-visit unix timestamp per host
robotsTxtResults = {} # cached robots.txt parsers, keyed by robots.txt url (None = no robots.txt)
lasthost = '' # host of the last absolute url seen (used to resolve relative links)
def normalizeMap(m):
    """Normalize the dict's values in place so they sum to 1.0.

    Bugfix: an empty (or all-zero) map previously raised
    ZeroDivisionError; it is now left unchanged.
    """
    s = sum(m.values())
    if s == 0: # nothing to normalize
        return
    for k in m:
        m[k] = float(m[k]) / float(s)
def subtractDicts(dict1, dict2):
    """Return the per-key difference of two count dicts.

    Keys present in both dicts map to max(0, dict1[k] - dict2[k]);
    keys present in only one dict keep that dict's (int-converted) count.
    """
    result = dict()
    for key in set(dict1) | set(dict2):
        if key in dict1 and key in dict2:
            result[key] = max(0, int(dict1[key]) - int(dict2[key]))
        elif key in dict1:
            result[key] = int(dict1[key])
        else:
            result[key] = int(dict2[key])
    return result
def countWords(words):
    """Return a dict mapping each word to its number of occurrences.

    Uses collections.Counter (already imported by this module) instead
    of the hand-rolled counting loop; returns a plain dict as before.
    """
    return dict(Counter(words))
def blockedByRobotsTxt(url):
o = urlparse(url)
robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"
if url in robotsTxtResults:
rp = robotsTxtResults[robotsUrl]
else:
rp = robotparser.RobotFileParser()
rp.set_url(robotsUrl)
try:
rp.read()
robotsTxtResults[robotsUrl] = rp
except:
robotsTxtResults[robotsUrl] = None # robots.txt doesn't exist
if robotsTxtResults[robotsUrl] == None:
return False # return false if robots.txt doesn't exist
else:
if rp.can_fetch("*", url):
return False
else:
print colored("-> not allowed to visit :( "+url, "red")
global prohibitedSites
prohibitedSites += 1
return True
def canonicalUrl(url):
    """Normalize a href into a list with one crawlable absolute URL,
    or an empty list if the link should not be followed.

    Side effect: remembers the scheme+host of the last absolute URL seen
    (global lasthost) so that later relative links can be resolved.
    """
    global lasthost
    cleaned = url.lower().replace(" ", "")
    parts = urlparse(cleaned)
    if parts.netloc == '':
        # no host: a scheme-less link is relative -- resolve it against
        # the most recently seen host; anything else is dropped
        if parts.scheme == '':
            return [urljoin(lasthost, parts.path)]
        return []
    lasthost = parts.scheme + '://' + parts.netloc
    # NOTE(review): "pdf" is matched as a substring anywhere in the path
    # and ".." anywhere in the URL -- presumably meant to skip PDF files
    # and parent-relative paths; confirm the substring match is intended.
    if parts.scheme == 'http' and not "pdf" in parts.path and not ".." in parts.geturl():
        if ".html" in parts.path:
            return [cleaned]
        if "." not in parts.path:
            return [cleaned]
    return []
def getNextUrlToVisit():
    # Draw a random URL from the frontier (extractor.urls) and return it
    # once it passes all filters; rejected URLs are removed from the
    # frontier and the function recurses to try another candidate.
    # NOTE(review): recursion depth grows with the number of rejected
    # candidates, and random.choice raises IndexError if the frontier
    # ever runs empty -- confirm this cannot happen in practice.
    url = random.choice(extractor.urls)
    if visitOnlyTuSites:
        # restrict the crawl to hosts containing 'tu-darmstadt'
        if 'tu-darmstadt' not in urlparse(url).netloc:
            extractor.urls.remove(url)
            return getNextUrlToVisit()
    # count how often this URL was drawn from the frontier
    if url in numberHyperlink:
        numberHyperlink[url] += 1
    else:
        numberHyperlink[url] = 1
    host = urlparse(url).netloc
    ## check if url is blocked by robots.txt or was already visited ##
    if blockedByRobotsTxt(url) or url in visitedUrls:
        extractor.urls.remove(url)
        return getNextUrlToVisit()
    ## check if host got a timeout
    if host in visitedHostsWithTimestamp:
        timestamp = visitedHostsWithTimestamp[host]
        secondsSinceLastVisit = int(time.time()) - timestamp
        if secondsSinceLastVisit >= timeBetweenSameHost:
            # politeness delay satisfied: accept the URL and mark it visited
            visitedHostsWithTimestamp[host] = int(time.time())
            visitedUrls.append(url)
            extractor.urls.remove(url)
            return url
        else:
            # host hit too recently: leave the URL in the frontier and retry
            secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "magenta")
            return getNextUrlToVisit()
    else:
        # first visit to this host: accept the URL and mark it visited
        visitedHostsWithTimestamp[host] = int(time.time())
        visitedUrls.append(url)
        extractor.urls.remove(url)
        return url
class URLLister(SGMLParser):
    """SGML parser that collects canonicalized hrefs from <a> tags in self.urls."""

    def resetParser(self):
        # recover from an SGMLParseError without clearing the collected URLs
        SGMLParser.reset(self)

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        hrefs = [value for name, value in attrs if name == 'href']
        if hrefs:
            target = hrefs[0]
            self.urls.extend(canonicalUrl(target))
            # count occurrences of each raw href target
            # NOTE(review): despite the global's name, this counts how often
            # each link target occurs overall, not links per page
            numberHyperlinksPerPage[target] = numberHyperlinksPerPage.get(target, 0) + 1
if __name__ == "__main__":
page = urllib2.urlopen(entrypoint, timeout = 5)
print "currently visited url: "+entrypoint
extractor = URLLister()
extractor.feed(page.read())
page.close()
i = 1
while(i <= numberOfPagesToCrawl):
url = getNextUrlToVisit()
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") currently visiting url: "+url, "blue")
try:
page = urllib2.urlopen(url, timeout = 6)
pageContent = page.read()
pageContent = pageContent.replace('<![CDATA[', '&lt;![CDATA[') ## bugfix for SGMLParser
page.close()
extractor.feed(pageContent)
pages[url] = pageContent
i += 1
# exception handling
except urllib2.HTTPError, err:
if err.code == 404:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
pass
if err.code == 400:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
pass
if err.code == 403:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
pass
except urllib2.URLError:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
pass
except sgmllib.SGMLParseError:
print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
extractor.resetParser()
pass
except:
print "Unexpected error:", sys.exc_info()[0]
pass
extractor.close()
print "\n \n ==== robots.txt ===="
print "prohibit by robots.txt: "+str(prohibitedSites)
## print table number hyperlinks per website ##
print "\n \n ==== numberHyperlink ===="
print "#Hyperlinks \t Website"
linkCount1 = {}
for u in numberHyperlink.values():
if u not in linkCount1:
linkCount1[u] = 1
else:
linkCount1[u] += 1
xValues1 = []
yValues1 = []
for u in linkCount1:
xValues1.append(u)
yValues1.append(linkCount1[u])
plt.plot(xValues1, yValues1)
plt.xlabel('Haeufigkeiten des Auftretens')
plt.ylabel('Anzahl der URLs')
plt.show()
## print table number hyperlinks to page ##
print "\n \n ==== Anzahl URLs pro Seite ===="
print "#Anzahl URLs pro Seite"
linkCount2 = {}
for u in numberHyperlinksPerPage.values():
if u not in linkCount2:
linkCount2[u] = 1
else:
linkCount2[u] += 1
xValues2 = []
yValues2 = []
for u in linkCount2:
xValues2.append(u)
yValues2.append(linkCount2[u])
'''plt.plot(xValues2, yValues2)
plt.xlabel('Anzahl der Hyperlinks pro Seite')
plt.ylabel('Anzahl der URLs')
#plt.xscale('log')
#plt.yscale('log')
plt.show()'''
print "\n \n ==== url queue ===="
for u in extractor.urls:
pass
#print u
threshold = 0.9 # how much similar must 2 urls be to be logged
#print "\n \n ==== copied content probability (>= " + str(threshold*100) + " %) ===="
#print "URL1 \t URL2 \t Similarity in %"
# wordcounts per page
wordCountsByPage = {}
charsByPage = {}
## count words in all pages ##
for url in pages:
tmp = re.sub("[\n\r]", "", pages[url]) # remove all scripts
tmp = re.sub("<\s*script.*?>.+?<\s*\/script.*?>", "", tmp) # remove all scripts
tmp = re.sub("<\s*style.*?>.+?<\s*\/style.*?>", "", tmp) # remove all styles
tmp = re.sub("&.+?;", "", tmp) # remove all html entities
tmp = re.sub("<.+?>", "", tmp) # remove all html tags
tmp = re.sub("\d", "", tmp) # remove all numbers
words = re.findall("(\w+)", tmp) # split words
words = [x.lower() for x in words] # all words to lower case
words = [s for s in words if len(s) > 4 and len(s) <= 10]
wordCountsByPage[url] = countWords(words)
chars = re.findall("[A-za-z]", tmp); # find all characters
chars = [x.lower() for x in chars] # all characters to lower case
charsByPage[url] = chars
## calculate wordcount deltas and print double-content sites ##
wordCountDeltas = {}
for url1 in wordCountsByPage:
for url2 in wordCountsByPage:
if url1 == url2:
continue
if url1 not in wordCountDeltas:
wordCountDeltas[url1] = {}
if url2 in wordCountDeltas[url1]: # do it once only
continue
wordCounts1 = wordCountsByPage[url1]
wordCounts2 = wordCountsByPage[url2]
sum1 = sum(wordCounts1.values())
if sum1 == 0:
continue
#print "calculating deltas of url1: " + url1 + " -- url2: " + url2
deltaWordCounts = subtractDicts(wordCounts1, wordCounts2)
wordCountDeltas[url1][url2] = math.fabs(float(sum(deltaWordCounts.values())) / float(sum1))
if 1 - wordCountDeltas[url1][url2] > threshold:
#print url1 + " \t " + url2 + " \t " + str((1 - wordCountDeltas[url1][url2]) * 100)
pass
## determine the sites' languages ##
spanish = 'es'
english = 'en'
german = 'de'
pageLanguages = {}
lettersByLanguage = {}
lettersByLanguage[spanish] = {
'e' : 13.68,
'a' : 12.53,
'o' : 8.68,
's' : 7.98,
'r' : 6.87,
'n' : 6.71,
'i' : 6.25,
'd' : 5.86,
'l' : 4.97,
'c' : 4.68,
't' : 4.63,
'u' : 3.93,
'm' : 3.15,
'p' : 2.51,
'b' : 1.42,
'g' : 1.01,
'v' : 0.90,
'y' : 0.90,
'q' : 0.88,
'h' : 0.70,
'f' : 0.69,
'z' : 0.52,
'j' : 0.44,
'x' : 0.21,
'w' : 0.02,
'k' : 0.01
}
lettersByLanguage[english] = {
'e' : 12.70,
't' : 9.06,
'a' : 8.16,
'o' : 7.50,
'i' : 6.96,
'n' : 6.74,
's' : 6.32,
'h' : 6.09,
'r' : 5.99,
'd' : 4.25,
'l' : 4.03,
'c' : 2.78,
'u' : 2.76,
'm' : 2.41,
'w' : 2.36,
'f' : 2.23,
'g' : 2.02,
'y' : 1.97,
'p' : 1.93,
'b' : 1.49,
'v' : 0.98,
'k' : 0.77,
'j' : 0.15,
'x' : 0.15,
'q' : 0.10,
'z' : 0.07
}
lettersByLanguage[german] = {
'e' : 17.4,
'n' : 9.78,
'i' : 7.55,
's' : 7.27,
'r' : 7.00,
'a' : 6.51,
't' : 6.15,
'd' : 5.08,
'h' : 4.76,
'u' : 4.35,
'l' : 3.44,
'c' : 3.06,
'g' : 3.01,
'o' : 2.59,
'm' : 2.53,
'b' : 1.89,
'w' : 1.89,
'f' : 1.66,
'k' : 1.21,
'z' : 1.13,
'v' : 0.85,
'p' : 0.67,
'j' : 0.27,
'y' : 0.04,
'x' : 0.03,
'q' : 0.02
}
# normalize maps
normalizeMap(lettersByLanguage[spanish])
normalizeMap(lettersByLanguage[english])
normalizeMap(lettersByLanguage[german])
languageCounts = {}
for url in charsByPage:
tokens = charsByPage[url]
tokenCounts = dict(Counter(tokens))
tokenSum = sum(tokenCounts.values())
# Calculating the squared error
rankings = {}
matches = {}
for token in tokenCounts:
for key2 in lettersByLanguage:
if token not in lettersByLanguage[key2]:
continue
p = float(lettersByLanguage[key2][token]) * 100
if p >= 0:
if key2 not in rankings:
rankings[key2] = 0
matches[key2] = 0
# calculate the squared error from observed and reference frequencies
rankings[key2] += math.pow(math.fabs(tokenCounts[token] * 100 / tokenSum - p), 2)
matches[key2] += 1
# Resulting language has the minimal mean squared error
minRanking = -1
language = None
for key in rankings:
rankings[key] /= matches[key]
if minRanking == -1 or rankings[key] < minRanking:
minRanking = rankings[key]
language = key
if language != None:
pageLanguages[url] = language
if language not in languageCounts:
languageCounts[language] = 1
else:
languageCounts[language] += 1
print "\n \n ==== language distribution ===="
print "Language \t Number of occurences"
for lang in languageCounts:
print lang + " \t " + str(languageCounts[lang])