diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/Solution.doc b/ss2013/1_Web Mining/Uebungen/2_Uebung/Solution.doc
new file mode 100644
index 00000000..d51a7442
Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/2_Uebung/Solution.doc differ
diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/Solution.docx b/ss2013/1_Web Mining/Uebungen/2_Uebung/Solution.docx
new file mode 100644
index 00000000..762b45d4
Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/2_Uebung/Solution.docx differ
diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/Solution.pdf b/ss2013/1_Web Mining/Uebungen/2_Uebung/Solution.pdf
new file mode 100644
index 00000000..55412fdc
Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/2_Uebung/Solution.pdf differ
diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/challenge.txt b/ss2013/1_Web Mining/Uebungen/2_Uebung/challenge.txt
new file mode 100644
index 00000000..a4e3925f
--- /dev/null
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/challenge.txt
@@ -0,0 +1,10 @@
+01 es
+02 de
+03 en
+04 en
+05 de
+06 es
+07 es
+08 de
+09 en
+10 es
\ No newline at end of file
diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
index 7ae96742..8f36cd28 100644
--- a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
+++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/crawler.py
@@ -1,14 +1,22 @@
+# -*- coding: utf-8 -*-
 import urllib2
 import sys
 import random
 import robotparser
+import re
+import math
 from sgmllib import SGMLParser
 import sgmllib
 from urlparse import urlparse
 import time
 from termcolor import colored
+from collections import Counter
 
 '''
+VN:
+ - plagiarism checker finished
+ - language detector finished
+
 TODO:
 - canonize urls -> canonize? slides?
 - DONE with getNextUrlToVisit():
@@ -19,10 +27,12 @@ TODO:
 
 # crawler attributes
 entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
-entrypoint = "http://www.spiegel.de"
+entrypoint = "http://www.spiegel.de" # German website
+#entrypoint = "http://www.cnn.com" # English website
+#entrypoint = "http://www.red2000.com/spain/1index.html" # Spanish website
 #entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
 numberOfPagesToCrawl = 1000
-timeBetweenSameHost = 5 # 5 sec
+timeBetweenSameHost = 0 # seconds between requests to the same host (politeness delay disabled here)
 
 
 #some variables
@@ -37,6 +47,34 @@ numberHyperlinksPerPage = {} # safe number of hyperlinks per page
 visitedHostsWithTimestamp = {} # safe visited hosts with timestamp
 robotsTxtResults = {} # safe robots.txt
 
+def normalizeMap(m):
+    s = sum(m.values())
+
+    for k in m:
+        m[k] = float(m[k]) / float(s)
+
+def subtractDicts(dict1, dict2):
+    dic = dict()
+    for key in dict1:
+        if key in dict2:
+            dic[key] = max(0, int(dict1[key]) - int(dict2[key]))
+        else:
+            dic[key] = int(dict1[key])
+
+    for key in dict2:
+        if key not in dict1:
+            dic[key] = int(dict2[key])
+
+    return dic
+
+def countWords(words):
+    counts = {}
+    for word in words:
+        if word not in counts:
+            counts[word] = 1
+        else:
+            counts[word] += 1
+    return counts
 
 def blockedByRobotsTxt(url):
     o = urlparse(url)
@@ -217,4 +255,214 @@ if __name__ == "__main__":
     print "\n \n ==== url queue ===="
     for u in extractor.urls:
         pass
-        #print u
\ No newline at end of file
+        #print u
+
+    threshold = 0.9 # minimum similarity between two pages for the pair to be reported
+
+    print "\n \n ==== copied content probability (>= " + str(threshold*100) + " %) ===="
+    print "URL1 \t URL2 \t Similarity in %"
+    # word counts per page
+    wordCountsByPage = {}
+    charsByPage = {}
+    ## count words in all pages ##
+    for url in pages:
+        tmp = re.sub("[\n\r]", "", pages[url]) # remove all line breaks
+        tmp = re.sub("<\s*script.*?>.+?<\s*\/script.*?>", "", tmp) # remove all scripts
+        tmp = re.sub("<\s*style.*?>.+?<\s*\/style.*?>", "", tmp) # remove all styles
+        tmp = re.sub("&.+?;", "", tmp) # remove all html entities
+        tmp = re.sub("<.+?>", "", tmp) # remove all html tags
+        tmp = re.sub("\d", "", tmp) # remove all numbers
+        words = re.findall("(\w+)", tmp) # split words
+        words = [x.lower() for x in words] # all words to lower case
+        words = [s for s in words if len(s) > 4 and len(s) <= 10]
+
+        wordCountsByPage[url] = countWords(words)
+
+        chars = re.findall("[A-Za-z]", tmp) # find all letters
+        chars = [x.lower() for x in chars] # all characters to lower case
+        charsByPage[url] = chars
+
+    ## calculate wordcount deltas and print double-content sites ##
+    wordCountDeltas = {}
+    for url1 in wordCountsByPage:
+        for url2 in wordCountsByPage:
+            if url1 == url2:
+                continue
+
+            if url1 not in wordCountDeltas:
+                wordCountDeltas[url1] = {}
+            if url2 in wordCountDeltas[url1]: # do it once only
+                continue
+
+            wordCounts1 = wordCountsByPage[url1]
+            wordCounts2 = wordCountsByPage[url2]
+
+            sum1 = sum(wordCounts1.values())
+            if sum1 == 0:
+                continue
+
+            #print "calculating deltas of url1: " + url1 + " -- url2: " + url2
+            deltaWordCounts = subtractDicts(wordCounts1, wordCounts2)
+
+            wordCountDeltas[url1][url2] = math.fabs(float(sum(deltaWordCounts.values())) / float(sum1))
+            if 1 - wordCountDeltas[url1][url2] > threshold:
+                print url1 + " \t " + url2 + " \t " + str((1 - wordCountDeltas[url1][url2]) * 100)
+
+
+    ## determine the sites' languages ##
+    spanish = 'es'
+    english = 'en'
+    german = 'de'
+
+    pageLanguages = {}
+    lettersByLanguage = {}
+    lettersByLanguage[spanish] = {
+        'e' : 13.68,
+        'a' : 12.53,
+        'o' : 8.68,
+        's' : 7.98,
+        'r' : 6.87,
+
+        'n' : 6.71,
+        'i' : 6.25,
+        'd' : 5.86,
+        'l' : 4.97,
+        'c' : 4.68,
+
+        't' : 4.63,
+        'u' : 3.93,
+        'm' : 3.15,
+        'p' : 2.51,
+        'b' : 1.42,
+
+        'g' : 1.01,
+        'v' : 0.90,
+        'y' : 0.90,
+        'q' : 0.88,
+        'h' : 0.70,
+
+        'f' : 0.69,
+        'z' : 0.52,
+        'j' : 0.44,
+        'x' : 0.21,
+        'w' : 0.02,
+
+        'k' : 0.01
+    }
+    lettersByLanguage[english] = {
+        'e' : 12.70,
+        't' : 9.06,
+        'a' : 8.16,
+        'o' : 7.50,
+        'i' : 6.96,
+
+        'n' : 6.74,
+        's' : 6.32,
+        'h' : 6.09,
+        'r' : 5.99,
+        'd' : 4.25,
+
+        'l' : 4.03,
+        'c' : 2.78,
+        'u' : 2.76,
+        'm' : 2.41,
+        'w' : 2.36,
+
+        'f' : 2.23,
+        'g' : 2.02,
+        'y' : 1.97,
+        'p' : 1.93,
+        'b' : 1.49,
+
+        'v' : 0.98,
+        'k' : 0.77,
+        'j' : 0.15,
+        'x' : 0.15,
+        'q' : 0.10,
+
+        'z' : 0.07
+    }
+    lettersByLanguage[german] = {
+        'e' : 17.4,
+        'n' : 9.78,
+        'i' : 7.55,
+        's' : 7.27,
+        'r' : 7.00,
+
+        'a' : 6.51,
+        't' : 6.15,
+        'd' : 5.08,
+        'h' : 4.76,
+        'u' : 4.35,
+
+        'l' : 3.44,
+        'c' : 3.06,
+        'g' : 3.01,
+        'o' : 2.59,
+        'm' : 2.53,
+
+        'b' : 1.89,
+        'w' : 1.89,
+        'f' : 1.66,
+        'k' : 1.21,
+        'z' : 1.13,
+
+        'v' : 0.85,
+        'p' : 0.67,
+        'j' : 0.27,
+        'y' : 0.04,
+        'x' : 0.03,
+
+        'q' : 0.02
+    }
+
+    # normalize maps
+    normalizeMap(lettersByLanguage[spanish])
+    normalizeMap(lettersByLanguage[english])
+    normalizeMap(lettersByLanguage[german])
+
+    languageCounts = {}
+    for url in charsByPage:
+        tokens = charsByPage[url]
+        tokenCounts = dict(Counter(tokens))
+
+        tokenSum = sum(tokenCounts.values())
+
+        # Calculating the squared error
+        rankings = {}
+        matches = {}
+        for token in tokenCounts:
+            for key2 in lettersByLanguage:
+                if token not in lettersByLanguage[key2]:
+                    continue
+                p = float(lettersByLanguage[key2][token]) * 100
+                if p >= 0:
+                    if key2 not in rankings:
+                        rankings[key2] = 0
+                        matches[key2] = 0
+                    # calculate the squared error from observed and reference frequencies
+                    rankings[key2] += math.pow(math.fabs(float(tokenCounts[token]) * 100 / tokenSum - p), 2)
+                    matches[key2] += 1
+
+        # Resulting language has the minimal mean squared error
+        minRanking = -1
+        language = None
+        for key in rankings:
+            rankings[key] /= matches[key]
+
+            if minRanking == -1 or rankings[key] < minRanking:
+                minRanking = rankings[key]
+                language = key
+
+        if language != None:
+            pageLanguages[url] = language
+
+            if language not in languageCounts:
+                languageCounts[language] = 1
+            else:
+                languageCounts[language] += 1
+
+    print "\n \n ==== language distribution ===="
+    print "Language \t Number of occurrences"
+    for lang in languageCounts:
+        print lang + " \t " + str(languageCounts[lang])
\ No newline at end of file
diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc
index b27a1e7a..6a83f8f3 100644
Binary files a/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc and b/ss2013/1_Web Mining/Uebungen/2_Uebung/crawler/termcolor.pyc differ
diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/keaddon2/keaddon.xpi b/ss2013/1_Web Mining/Uebungen/2_Uebung/keaddon2/keaddon.xpi
index b887136e..9d7577df 100644
Binary files a/ss2013/1_Web Mining/Uebungen/2_Uebung/keaddon2/keaddon.xpi and b/ss2013/1_Web Mining/Uebungen/2_Uebung/keaddon2/keaddon.xpi differ
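
Reviewer note (not part of the patch): the new code in crawler.py does two things, plagiarism detection by comparing per-page word counts and language identification by scoring observed letter frequencies against reference tables with a mean squared error. The standalone sketch below illustrates both ideas in a simplified form; the names guess_language, similarity, and LETTER_FREQ, the truncated frequency tables, and the example inputs are assumptions for illustration only and do not appear in the patch.

    # Minimal sketch of the two techniques used in crawler.py (simplified, illustrative only).
    from __future__ import division
    import re
    from collections import Counter

    # assumed, truncated reference tables in percent; the patch uses full-alphabet tables
    LETTER_FREQ = {
        'de': {'e': 17.4, 'n': 9.78, 'i': 7.55, 's': 7.27, 'r': 7.00},
        'en': {'e': 12.70, 't': 9.06, 'a': 8.16, 'o': 7.50, 'i': 6.96},
        'es': {'e': 13.68, 'a': 12.53, 'o': 8.68, 's': 7.98, 'r': 6.87},
    }

    def guess_language(text):
        # pick the language whose reference letter frequencies have the
        # smallest mean squared error against the observed frequencies
        letters = [c.lower() for c in re.findall('[A-Za-z]', text)]
        if not letters:
            return None
        observed = Counter(letters)
        total = len(letters)
        best_lang, best_mse = None, None
        for lang, table in LETTER_FREQ.items():
            errors = [((observed.get(ch, 0) * 100 / total) - freq) ** 2
                      for ch, freq in table.items()]
            mse = sum(errors) / len(errors)
            if best_mse is None or mse < best_mse:
                best_lang, best_mse = lang, mse
        return best_lang

    def similarity(words1, words2):
        # fraction of page 1's words covered by page 2, mirroring the
        # subtractDicts()-based overlap check in the patch
        counts1, counts2 = Counter(words1), Counter(words2)
        if not counts1:
            return 0.0
        leftover = sum(max(0, counts1[w] - counts2[w]) for w in counts1)
        return 1.0 - leftover / sum(counts1.values())

    if __name__ == '__main__':
        print(guess_language('Dies ist ein kurzer deutscher Beispieltext.'))  # likely 'de'
        print(similarity('copied page text'.split(), 'copied page text'.split()))  # 1.0

With identical word lists the similarity is 1.0, so a threshold of 0.9 as in the patch would flag the pair as copied content.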