2nd exercise

Victor-Philipp Negoescu 2013-05-18 08:47:41 +02:00
parent ae01454a1d
commit 6511d9a07b
7 changed files with 261 additions and 3 deletions

3 binary files not shown.

@@ -0,0 +1,10 @@
01 es
02 de
03 en
04 en
05 de
06 es
07 es
08 de
09 en
10 es


@@ -1,14 +1,22 @@
# -*- coding: utf-8 -*-
import urllib2
import sys
import random
import robotparser
import re
import math
from sgmllib import SGMLParser
import sgmllib
from urlparse import urlparse
import time
from termcolor import colored
from collections import Counter
'''
VN:
- plagiarism checker done
- language checker done
TODO:
- canonize urls -> canonize? slides?
- DONE with getNextUrlToVisit():
@@ -19,10 +27,12 @@ TODO:
# crawler attributes
entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
entrypoint = "http://www.spiegel.de"
entrypoint = "http://www.spiegel.de" # german website
#entrypoint = "http://www.cnn.com" # english website
#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website
#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 5 # 5 sec
timeBetweenSameHost = 0 # no delay (was 5 sec)
#some variables
@@ -37,6 +47,34 @@ numberHyperlinksPerPage = {} # store number of hyperlinks per page
visitedHostsWithTimestamp = {} # store visited hosts with timestamp
robotsTxtResults = {} # store robots.txt results
def normalizeMap(m):
    # normalize the map's values in place so they sum to 1
    s = sum(m.values())
    for k in m:
        m[k] = float(m[k]) / float(s)
def subtractDicts(dict1, dict2):
    # per-key count difference: counts from dict1 reduced by dict2 (floored at 0),
    # plus the full counts of keys that appear only in dict2
    dic = dict()
    for key in dict1:
        if key in dict2:
            dic[key] = max(0, int(dict1[key]) - int(dict2[key]))
        else:
            dic[key] = int(dict1[key])
    for key in dict2:
        if key not in dict1:
            dic[key] = int(dict2[key])
    return dic
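# Example: subtractDicts({'web': 3, 'crawler': 1}, {'web': 1, 'index': 2})
# returns {'web': 2, 'crawler': 1, 'index': 2} -- surplus counts from the first
# page and words that occur only in the second page both end up in the delta.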
def countWords(words):
    # count the occurrences of every word in the list
    counts = {}
    for word in words:
        if word not in counts:
            counts[word] = 1
        else:
            counts[word] += 1
    return counts
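# Note: collections.Counter (imported above) yields the same mapping,
# i.e. dict(Counter(words)) == countWords(words).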
def blockedByRobotsTxt(url):
    o = urlparse(url)
@@ -217,4 +255,214 @@ if __name__ == "__main__":
print "\n \n ==== url queue ===="
for u in extractor.urls:
pass
#print u
    threshold = 0.9 # how similar two pages must be before they are logged
    print "\n \n ==== copied content probability (>= " + str(threshold*100) + " %) ===="
    print "URL1 \t URL2 \t Similarity in %"
    # wordcounts per page
    wordCountsByPage = {}
    charsByPage = {}
    ## count words in all pages ##
    for url in pages:
        tmp = re.sub("[\n\r]", "", pages[url]) # remove all line breaks
        tmp = re.sub("<\s*script.*?>.+?<\s*\/script.*?>", "", tmp) # remove all scripts
        tmp = re.sub("<\s*style.*?>.+?<\s*\/style.*?>", "", tmp) # remove all styles
        tmp = re.sub("&.+?;", "", tmp) # remove all html entities
        tmp = re.sub("<.+?>", "", tmp) # remove all html tags
        tmp = re.sub("\d", "", tmp) # remove all numbers
        words = re.findall("(\w+)", tmp) # split words
        words = [x.lower() for x in words] # all words to lower case
        words = [s for s in words if len(s) > 4 and len(s) <= 10] # keep words of 5 to 10 characters
        wordCountsByPage[url] = countWords(words)
        chars = re.findall("[A-Za-z]", tmp) # find all latin letters
        chars = [x.lower() for x in chars] # all characters to lower case
        charsByPage[url] = chars
    ## calculate wordcount deltas and print double-content sites ##
    wordCountDeltas = {}
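    # The similarity of two pages is 1 - sum(delta) / total word count of url1,
    # where delta = subtractDicts(wordCounts1, wordCounts2); identical pages give
    # a delta of 0 and therefore 100 % similarity.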
    for url1 in wordCountsByPage:
        for url2 in wordCountsByPage:
            if url1 == url2:
                continue
            if url1 not in wordCountDeltas:
                wordCountDeltas[url1] = {}
            if url2 in wordCountDeltas[url1]: # do it once only
                continue
            wordCounts1 = wordCountsByPage[url1]
            wordCounts2 = wordCountsByPage[url2]
            sum1 = sum(wordCounts1.values())
            if sum1 == 0:
                continue
            #print "calculating deltas of url1: " + url1 + " -- url2: " + url2
            deltaWordCounts = subtractDicts(wordCounts1, wordCounts2)
            wordCountDeltas[url1][url2] = math.fabs(float(sum(deltaWordCounts.values())) / float(sum1))
            if 1 - wordCountDeltas[url1][url2] > threshold:
                print url1 + " \t " + url2 + " \t " + str((1 - wordCountDeltas[url1][url2]) * 100)
    ## determine the sites' languages ##
    spanish = 'es'
    english = 'en'
    german = 'de'
    pageLanguages = {}
    lettersByLanguage = {}
    lettersByLanguage[spanish] = {
        'e' : 13.68,
        'a' : 12.53,
        'o' : 8.68,
        's' : 7.98,
        'r' : 6.87,
        'n' : 6.71,
        'i' : 6.25,
        'd' : 5.86,
        'l' : 4.97,
        'c' : 4.68,
        't' : 4.63,
        'u' : 3.93,
        'm' : 3.15,
        'p' : 2.51,
        'b' : 1.42,
        'g' : 1.01,
        'v' : 0.90,
        'y' : 0.90,
        'q' : 0.88,
        'h' : 0.70,
        'f' : 0.69,
        'z' : 0.52,
        'j' : 0.44,
        'x' : 0.21,
        'w' : 0.02,
        'k' : 0.01
    }
    lettersByLanguage[english] = {
        'e' : 12.70,
        't' : 9.06,
        'a' : 8.16,
        'o' : 7.50,
        'i' : 6.96,
        'n' : 6.74,
        's' : 6.32,
        'h' : 6.09,
        'r' : 5.99,
        'd' : 4.25,
        'l' : 4.03,
        'c' : 2.78,
        'u' : 2.76,
        'm' : 2.41,
        'w' : 2.36,
        'f' : 2.23,
        'g' : 2.02,
        'y' : 1.97,
        'p' : 1.93,
        'b' : 1.49,
        'v' : 0.98,
        'k' : 0.77,
        'j' : 0.15,
        'x' : 0.15,
        'q' : 0.10,
        'z' : 0.07
    }
    lettersByLanguage[german] = {
        'e' : 17.4,
        'n' : 9.78,
        'i' : 7.55,
        's' : 7.27,
        'r' : 7.00,
        'a' : 6.51,
        't' : 6.15,
        'd' : 5.08,
        'h' : 4.76,
        'u' : 4.35,
        'l' : 3.44,
        'c' : 3.06,
        'g' : 3.01,
        'o' : 2.59,
        'm' : 2.53,
        'b' : 1.89,
        'w' : 1.89,
        'f' : 1.66,
        'k' : 1.21,
        'z' : 1.13,
        'v' : 0.85,
        'p' : 0.67,
        'j' : 0.27,
        'y' : 0.04,
        'x' : 0.03,
        'q' : 0.02
    }
    # normalize maps
    normalizeMap(lettersByLanguage[spanish])
    normalizeMap(lettersByLanguage[english])
    normalizeMap(lettersByLanguage[german])
    languageCounts = {}
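    # For every page the observed letter frequencies (scaled to percent) are
    # compared against each language's reference frequencies (normalized above and
    # rescaled to percent via p * 100); the language with the smallest mean squared
    # error over the matched letters is assigned to the page.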
    for url in charsByPage:
        tokens = charsByPage[url]
        tokenCounts = dict(Counter(tokens))
        tokenSum = sum(tokenCounts.values())
        # Calculating the squared error
        rankings = {}
        matches = {}
        for token in tokenCounts:
            for key2 in lettersByLanguage:
                if token not in lettersByLanguage[key2]:
                    continue
                p = float(lettersByLanguage[key2][token]) * 100
                if p >= 0:
                    if key2 not in rankings:
                        rankings[key2] = 0
                        matches[key2] = 0
                    # calculate the squared error from observed and reference frequencies
                    rankings[key2] += math.pow(math.fabs(float(tokenCounts[token]) * 100 / tokenSum - p), 2)
                    matches[key2] += 1
        # Resulting language has the minimal mean squared error
        minRanking = -1
        language = None
        for key in rankings:
            rankings[key] /= matches[key]
            if minRanking == -1 or rankings[key] < minRanking:
                minRanking = rankings[key]
                language = key
        if language != None:
            pageLanguages[url] = language
            if language not in languageCounts:
                languageCounts[language] = 1
            else:
                languageCounts[language] += 1
print "\n \n ==== language distribution ===="
print "Language \t Number of occurences"
for lang in languageCounts:
print lang + " \t " + str(languageCounts[lang])