# -*- coding: utf-8 -*-

import urllib2
import sys
import random
import robotparser
import re
import math
from sgmllib import SGMLParser
import sgmllib
from urlparse import urlparse
from urlparse import urljoin
import matplotlib.pyplot as plt
import time
from termcolor import colored
from collections import Counter

'''
VN:
- plagiarism checker done
- language checker done

TODO:
- DONE canonicalize urls -> canonicalize? slides? -> remember last host -> no magic here -> even using an ugly global
- DONE with getNextUrlToVisit():
  on server timeout -> save crawled host, set timeout for crawled host
- statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
'''

# crawler attributes
entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
#entrypoint = "http://www.spiegel.de" # german website
#entrypoint = "http://www.cnn.com" # english website
#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website
#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 0 # seconds between two requests to the same host (e.g. 5)
visitOnlyTuSites = True

# some state variables
prohibitedSites = 0
visitedUrls = [] # store already visited urls, so no url is visited more than once

pages = {} # downloaded pages

numberHyperlink = {} # store the number of hyperlinks per url
numberHyperlinksPerPage = {} # store the number of hyperlinks per page

visitedHostsWithTimestamp = {} # store visited hosts with the timestamp of the last request
robotsTxtResults = {} # cache parsed robots.txt files per host

lasthost = '' # last visited host

def normalizeMap(m):
    s = sum(m.values())

    for k in m:
        m[k] = float(m[k]) / float(s)

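# e.g. normalizeMap({'a': 1, 'b': 3}) rescales the map in place to
# {'a': 0.25, 'b': 0.75}, so the values form a probability distribution
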
def subtractDicts(dict1, dict2):
    dic = dict()
    for key in dict1:
        if key in dict2:
            dic[key] = max(0, int(dict1[key]) - int(dict2[key]))
        else:
            dic[key] = int(dict1[key])

    for key in dict2:
        if key not in dict1:
            dic[key] = int(dict2[key])

    return dic

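# a small illustration with made-up counts: keys present in both dicts are
# clamped at 0, keys present in only one dict are taken over as-is, so
#   subtractDicts({'web': 3, 'mining': 1}, {'web': 1, 'crawl': 2})
#   -> {'web': 2, 'mining': 1, 'crawl': 2}
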
def countWords(words):
    counts = {}
    for word in words:
        if word not in counts:
            counts[word] = 1
        else:
            counts[word] += 1
    return counts

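# note: countWords duplicates the already imported collections.Counter, i.e.
# countWords(['a', 'b', 'a']) == dict(Counter(['a', 'b', 'a'])) == {'a': 2, 'b': 1}
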
def blockedByRobotsTxt(url):
    o = urlparse(url)
    robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"

    if robotsUrl in robotsTxtResults: # bugfix: look up the robots.txt url, not the page url
        rp = robotsTxtResults[robotsUrl]
    else:
        rp = robotparser.RobotFileParser()
        rp.set_url(robotsUrl)

        try:
            rp.read()
            robotsTxtResults[robotsUrl] = rp
        except:
            robotsTxtResults[robotsUrl] = None # robots.txt doesn't exist (or could not be fetched)

    if robotsTxtResults[robotsUrl] is None:
        return False # return False if robots.txt doesn't exist
    else:
        if rp.can_fetch("*", url):
            return False
        else:
            print colored("-> not allowed to visit :( "+url, "red")
            global prohibitedSites
            prohibitedSites += 1
            return True

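# caching sketch: the first url on a host triggers one fetch of that host's
# /robots.txt; later urls on the same host reuse the cached RobotFileParser
# (or the cached None, in which case every url is treated as allowed)
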
def canonicalUrl(url):
    global lasthost
    url = url.lower().replace(" ", "")

    o = urlparse(url)

    if o.netloc != '':
        lasthost = o.scheme + '://' + o.netloc

    if o.scheme == 'http' and "pdf" not in o.path and ".." not in o.geturl():
        if ".html" in o.path:
            return [url]
        if "." not in o.path:
            return [url]
        return []
    else:
        if o.scheme == '':
            return [urljoin(lasthost, o.path)]
        else:
            return []

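# a few illustrative cases, assuming lasthost == 'http://www.example.com':
#   canonicalUrl('http://www.example.com/a.html') -> ['http://www.example.com/a.html']
#   canonicalUrl('/lehre/arbeiten')               -> ['http://www.example.com/lehre/arbeiten']
#   canonicalUrl('http://www.example.com/a.pdf')  -> [] (pdfs are skipped)
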
def getNextUrlToVisit():
    url = random.choice(extractor.urls)

    if visitOnlyTuSites:
        if 'tu-darmstadt' not in urlparse(url).netloc:
            extractor.urls.remove(url)
            return getNextUrlToVisit()

    if url in numberHyperlink:
        numberHyperlink[url] += 1
    else:
        numberHyperlink[url] = 1

    host = urlparse(url).netloc

    ## check if url is blocked by robots.txt or was already visited ##
    if blockedByRobotsTxt(url) or url in visitedUrls:
        extractor.urls.remove(url)
        return getNextUrlToVisit()

    ## check if the host is still in its politeness timeout ##
    if host in visitedHostsWithTimestamp:
        timestamp = visitedHostsWithTimestamp[host]
        secondsSinceLastVisit = int(time.time()) - timestamp
        if secondsSinceLastVisit >= timeBetweenSameHost:
            visitedHostsWithTimestamp[host] = int(time.time())
            visitedUrls.append(url)
            extractor.urls.remove(url)
            return url
        else:
            secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "magenta")
            return getNextUrlToVisit()
    else:
        visitedHostsWithTimestamp[host] = int(time.time())
        visitedUrls.append(url)
        extractor.urls.remove(url)
        return url

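# note: getNextUrlToVisit() recurses until it draws an admissible url, so a
# frontier consisting only of blocked, visited, or throttled urls can exhaust
# the recursion limit
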
class URLLister(SGMLParser):
    ## reset the parser state to recover from an SGMLParseError ##
    def resetParser(self):
        SGMLParser.reset(self)

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            url = canonicalUrl(href[0])
            self.urls.extend(url)

            # count how often each link target occurs
            if href[0] in numberHyperlinksPerPage:
                numberHyperlinksPerPage[href[0]] += 1
            else:
                numberHyperlinksPerPage[href[0]] = 1

if __name__ == "__main__":

    page = urllib2.urlopen(entrypoint, timeout=5)
    print "currently visited url: "+entrypoint
    extractor = URLLister()
    extractor.feed(page.read())
    page.close()

    i = 1
    while i <= numberOfPagesToCrawl:
        url = getNextUrlToVisit()
        print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") currently visiting url: "+url, "blue")
        try:
            page = urllib2.urlopen(url, timeout=6)
            pageContent = page.read()
            pageContent = pageContent.replace('<![CDATA[', '&lt;![CDATA[') ## bugfix for SGMLParser: escape CDATA openers so they are not parsed as marked sections
            page.close()
            extractor.feed(pageContent)
            pages[url] = pageContent
            i += 1

        # exception handling
        except urllib2.HTTPError, err:
            if err.code == 404:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
            elif err.code == 400:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
            elif err.code == 403:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
        except urllib2.URLError:
            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
        except sgmllib.SGMLParseError:
            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
            extractor.resetParser()
        except:
            print "Unexpected error:", sys.exc_info()[0]

    extractor.close()

print "\n \n ==== robots.txt ===="
|
|
print "prohibit by robots.txt: "+str(prohibitedSites)
|
|
|
|
|
|
## print table number hyperlinks per website ##
|
|
print "\n \n ==== numberHyperlink ===="
|
|
print "#Hyperlinks \t Website"
|
|
|
|
linkCount1 = {}
|
|
for u in numberHyperlink.values():
|
|
if u not in linkCount1:
|
|
linkCount1[u] = 1
|
|
else:
|
|
linkCount1[u] += 1
|
|
|
|
|
|
xValues1 = []
|
|
yValues1 = []
|
|
|
|
for u in linkCount1:
|
|
xValues1.append(u)
|
|
yValues1.append(linkCount1[u])
|
|
|
|
plt.plot(xValues1, yValues1)
|
|
plt.xlabel('Haeufigkeiten des Auftretens')
|
|
plt.ylabel('Anzahl der URLs')
|
|
plt.show()
|
|
|
|
|
|
    ## print table: number of hyperlinks per page ##
    print "\n \n ==== number of urls per page ===="
    print "#urls per page"
    linkCount2 = {}
    for u in numberHyperlinksPerPage.values():
        if u not in linkCount2:
            linkCount2[u] = 1
        else:
            linkCount2[u] += 1

    xValues2 = []
    yValues2 = []

    for u in linkCount2:
        xValues2.append(u)
        yValues2.append(linkCount2[u])

    '''plt.plot(xValues2, yValues2)
    plt.xlabel('Number of hyperlinks per page')
    plt.ylabel('Number of URLs')
    #plt.xscale('log')
    #plt.yscale('log')
    plt.show()'''

print "\n \n ==== url queue ===="
|
|
for u in extractor.urls:
|
|
pass
|
|
#print u
|
|
|
|
threshold = 0.9 # how much similar must 2 urls be to be logged
|
|
|
|
#print "\n \n ==== copied content probability (>= " + str(threshold*100) + " %) ===="
|
|
#print "URL1 \t URL2 \t Similarity in %"
|
|
# wordcounts per page
|
|
wordCountsByPage = {}
|
|
charsByPage = {}
|
|
## count words in all pages ##
|
|
for url in pages:
|
|
tmp = re.sub("[\n\r]", "", pages[url]) # remove all scripts
|
|
tmp = re.sub("<\s*script.*?>.+?<\s*\/script.*?>", "", tmp) # remove all scripts
|
|
tmp = re.sub("<\s*style.*?>.+?<\s*\/style.*?>", "", tmp) # remove all styles
|
|
tmp = re.sub("&.+?;", "", tmp) # remove all html entities
|
|
tmp = re.sub("<.+?>", "", tmp) # remove all html tags
|
|
tmp = re.sub("\d", "", tmp) # remove all numbers
|
|
words = re.findall("(\w+)", tmp) # split words
|
|
words = [x.lower() for x in words] # all words to lower case
|
|
words = [s for s in words if len(s) > 4 and len(s) <= 10]
|
|
|
|
wordCountsByPage[url] = countWords(words)
|
|
|
|
chars = re.findall("[A-za-z]", tmp); # find all characters
|
|
chars = [x.lower() for x in chars] # all characters to lower case
|
|
charsByPage[url] = chars
|
|
|
|
    ## calculate wordcount deltas and print duplicate-content candidates ##
    wordCountDeltas = {}
    for url1 in wordCountsByPage:
        for url2 in wordCountsByPage:
            if url1 == url2:
                continue

            if url1 not in wordCountDeltas:
                wordCountDeltas[url1] = {}
            if url2 in wordCountDeltas[url1]: # compute each pair only once
                continue

            wordCounts1 = wordCountsByPage[url1]
            wordCounts2 = wordCountsByPage[url2]

            sum1 = sum(wordCounts1.values())
            if sum1 == 0:
                continue

            #print "calculating deltas of url1: " + url1 + " -- url2: " + url2
            deltaWordCounts = subtractDicts(wordCounts1, wordCounts2)

            wordCountDeltas[url1][url2] = math.fabs(float(sum(deltaWordCounts.values())) / float(sum1))
            if 1 - wordCountDeltas[url1][url2] > threshold:
                #print url1 + " \t " + url2 + " \t " + str((1 - wordCountDeltas[url1][url2]) * 100)
                pass

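    # worked example with made-up counts: for wordCounts1 = {'kurs': 4, 'seminar': 2}
    # (sum1 = 6) and wordCounts2 = {'kurs': 3, 'thema': 5} the delta is
    # {'kurs': 1, 'seminar': 2, 'thema': 5}, giving wordCountDeltas = 8/6, i.e. a
    # similarity well below the threshold; identical pages give a delta sum of 0
    # and hence a similarity of 1 - 0 = 1
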
    ## determine the sites' languages ##
    spanish = 'es'
    english = 'en'
    german = 'de'

    pageLanguages = {}

    # reference letter frequencies (in percent) per language
    lettersByLanguage = {}
    lettersByLanguage[spanish] = {
        'e': 13.68, 'a': 12.53, 'o': 8.68, 's': 7.98, 'r': 6.87,
        'n': 6.71, 'i': 6.25, 'd': 5.86, 'l': 4.97, 'c': 4.68,
        't': 4.63, 'u': 3.93, 'm': 3.15, 'p': 2.51, 'b': 1.42,
        'g': 1.01, 'v': 0.90, 'y': 0.90, 'q': 0.88, 'h': 0.70,
        'f': 0.69, 'z': 0.52, 'j': 0.44, 'x': 0.21, 'w': 0.02,
        'k': 0.01
    }
    lettersByLanguage[english] = {
        'e': 12.70, 't': 9.06, 'a': 8.16, 'o': 7.50, 'i': 6.96,
        'n': 6.74, 's': 6.32, 'h': 6.09, 'r': 5.99, 'd': 4.25,
        'l': 4.03, 'c': 2.78, 'u': 2.76, 'm': 2.41, 'w': 2.36,
        'f': 2.23, 'g': 2.02, 'y': 1.97, 'p': 1.93, 'b': 1.49,
        'v': 0.98, 'k': 0.77, 'j': 0.15, 'x': 0.15, 'q': 0.10,
        'z': 0.07
    }
    lettersByLanguage[german] = {
        'e': 17.40, 'n': 9.78, 'i': 7.55, 's': 7.27, 'r': 7.00,
        'a': 6.51, 't': 6.15, 'd': 5.08, 'h': 4.76, 'u': 4.35,
        'l': 3.44, 'c': 3.06, 'g': 3.01, 'o': 2.59, 'm': 2.53,
        'b': 1.89, 'w': 1.89, 'f': 1.66, 'k': 1.21, 'z': 1.13,
        'v': 0.85, 'p': 0.67, 'j': 0.27, 'y': 0.04, 'x': 0.03,
        'q': 0.02
    }

    # normalize the reference frequencies so they sum to 1
    normalizeMap(lettersByLanguage[spanish])
    normalizeMap(lettersByLanguage[english])
    normalizeMap(lettersByLanguage[german])

    languageCounts = {}

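    # classification sketch with rough numbers: if 'e' makes up 15.0% of a page's
    # letters, german contributes about (15.0 - 17.4)^2 = 5.76 and english about
    # (15.0 - 12.7)^2 = 5.29 to their rankings; summed over all letters and divided
    # by the number of matched letters, the language with the smallest mean squared
    # error wins
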
    for url in charsByPage:
        tokens = charsByPage[url]
        tokenCounts = dict(Counter(tokens))

        tokenSum = sum(tokenCounts.values())
        if tokenSum == 0: # skip pages without any letters
            continue

        # calculating the squared error
        rankings = {}
        matches = {}
        for token in tokenCounts:
            for key2 in lettersByLanguage:
                if token not in lettersByLanguage[key2]:
                    continue
                p = float(lettersByLanguage[key2][token]) * 100
                if p >= 0:
                    if key2 not in rankings:
                        rankings[key2] = 0
                        matches[key2] = 0
                    # squared error of observed vs. reference frequency, both in percent
                    # (bugfix: cast to float to avoid integer division)
                    rankings[key2] += math.pow(math.fabs(float(tokenCounts[token]) * 100 / tokenSum - p), 2)
                    matches[key2] += 1

        # the resulting language has the minimal mean squared error
        minRanking = -1
        language = None
        for key in rankings:
            rankings[key] /= matches[key]

            if minRanking == -1 or rankings[key] < minRanking:
                minRanking = rankings[key]
                language = key

        if language is not None:
            pageLanguages[url] = language

            if language not in languageCounts:
                languageCounts[language] = 1
            else:
                languageCounts[language] += 1

print "\n \n ==== language distribution ===="
|
|
print "Language \t Number of occurences"
|
|
for lang in languageCounts:
|
|
print lang + " \t " + str(languageCounts[lang]) |