2nd exercise

Victor-Philipp Negoescu 2013-05-18 08:47:41 +02:00
parent ae01454a1d
commit 6511d9a07b
7 changed files with 261 additions and 3 deletions

3 binary files not shown.

@@ -0,0 +1,10 @@
01 es
02 de
03 en
04 en
05 de
06 es
07 es
08 de
09 en
10 es


@@ -1,14 +1,22 @@
# -*- coding: utf-8 -*-
import urllib2
import sys
import random
import robotparser
import re
import math
from sgmllib import SGMLParser
import sgmllib
from urlparse import urlparse
import time
from termcolor import colored
from collections import Counter
'''
VN:
- plagiarism checker done
- language checker done
TODO:
- canonize urls -> canonize? slides?
- DONE with getNextUrlToVisit():
@@ -19,10 +27,12 @@ TODO:
# crawler attributes
entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
entrypoint = "http://www.spiegel.de"
entrypoint = "http://www.spiegel.de" # german website
#entrypoint = "http://www.cnn.com" # english website
#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website
#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 5 # 5 sec
timeBetweenSameHost = 0 # no delay (was 5 sec)
#some variables
@@ -37,6 +47,34 @@ numberHyperlinksPerPage = {} # store number of hyperlinks per page
visitedHostsWithTimestamp = {} # store visited hosts with timestamp
robotsTxtResults = {} # store robots.txt results
def normalizeMap(m):
    # normalize the map's values in place so they sum to 1
    s = sum(m.values())
    for k in m:
        m[k] = float(m[k]) / float(s)
def subtractDicts(dict1, dict2):
    # per-key count difference: counts from dict1 reduced by dict2 (floored at 0),
    # plus the full counts of keys that appear only in dict2
    dic = dict()
    for key in dict1:
        if key in dict2:
            dic[key] = max(0, int(dict1[key]) - int(dict2[key]))
        else:
            dic[key] = int(dict1[key])
    for key in dict2:
        if key not in dict1:
            dic[key] = int(dict2[key])
    return dic
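# Example: subtractDicts({'web': 3, 'crawler': 1}, {'web': 1, 'index': 2})
# returns {'web': 2, 'crawler': 1, 'index': 2} -- surplus counts from the first
# page and words that occur only in the second page both end up in the delta.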
def countWords(words):
    # count the occurrences of every word in the list
    counts = {}
    for word in words:
        if word not in counts:
            counts[word] = 1
        else:
            counts[word] += 1
    return counts
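# Note: collections.Counter (imported above) yields the same mapping,
# i.e. dict(Counter(words)) == countWords(words).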
def blockedByRobotsTxt(url):
    o = urlparse(url)
@@ -217,4 +255,214 @@ if __name__ == "__main__":
print "\n \n ==== url queue ===="
for u in extractor.urls:
pass
#print u
    threshold = 0.9 # how similar two pages must be before they are logged
    print "\n \n ==== copied content probability (>= " + str(threshold*100) + " %) ===="
    print "URL1 \t URL2 \t Similarity in %"
    # wordcounts per page
    wordCountsByPage = {}
    charsByPage = {}
    ## count words in all pages ##
    for url in pages:
        tmp = re.sub("[\n\r]", "", pages[url]) # remove all line breaks
        tmp = re.sub("<\s*script.*?>.+?<\s*\/script.*?>", "", tmp) # remove all scripts
        tmp = re.sub("<\s*style.*?>.+?<\s*\/style.*?>", "", tmp) # remove all styles
        tmp = re.sub("&.+?;", "", tmp) # remove all html entities
        tmp = re.sub("<.+?>", "", tmp) # remove all html tags
        tmp = re.sub("\d", "", tmp) # remove all numbers
        words = re.findall("(\w+)", tmp) # split words
        words = [x.lower() for x in words] # all words to lower case
        words = [s for s in words if len(s) > 4 and len(s) <= 10] # keep words of 5 to 10 characters
        wordCountsByPage[url] = countWords(words)
        chars = re.findall("[A-Za-z]", tmp) # find all latin letters
        chars = [x.lower() for x in chars] # all characters to lower case
        charsByPage[url] = chars
    ## calculate wordcount deltas and print double-content sites ##
    wordCountDeltas = {}
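    # The similarity of two pages is 1 - sum(delta) / total word count of url1,
    # where delta = subtractDicts(wordCounts1, wordCounts2); identical pages give
    # a delta of 0 and therefore 100 % similarity.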
    for url1 in wordCountsByPage:
        for url2 in wordCountsByPage:
            if url1 == url2:
                continue
            if url1 not in wordCountDeltas:
                wordCountDeltas[url1] = {}
            if url2 in wordCountDeltas[url1]: # do it once only
                continue
            wordCounts1 = wordCountsByPage[url1]
            wordCounts2 = wordCountsByPage[url2]
            sum1 = sum(wordCounts1.values())
            if sum1 == 0:
                continue
            #print "calculating deltas of url1: " + url1 + " -- url2: " + url2
            deltaWordCounts = subtractDicts(wordCounts1, wordCounts2)
            wordCountDeltas[url1][url2] = math.fabs(float(sum(deltaWordCounts.values())) / float(sum1))
            if 1 - wordCountDeltas[url1][url2] > threshold:
                print url1 + " \t " + url2 + " \t " + str((1 - wordCountDeltas[url1][url2]) * 100)
    ## determine the sites' languages ##
    spanish = 'es'
    english = 'en'
    german = 'de'
    pageLanguages = {}
    lettersByLanguage = {}
    lettersByLanguage[spanish] = {
        'e' : 13.68,
        'a' : 12.53,
        'o' : 8.68,
        's' : 7.98,
        'r' : 6.87,
        'n' : 6.71,
        'i' : 6.25,
        'd' : 5.86,
        'l' : 4.97,
        'c' : 4.68,
        't' : 4.63,
        'u' : 3.93,
        'm' : 3.15,
        'p' : 2.51,
        'b' : 1.42,
        'g' : 1.01,
        'v' : 0.90,
        'y' : 0.90,
        'q' : 0.88,
        'h' : 0.70,
        'f' : 0.69,
        'z' : 0.52,
        'j' : 0.44,
        'x' : 0.21,
        'w' : 0.02,
        'k' : 0.01
    }
    lettersByLanguage[english] = {
        'e' : 12.70,
        't' : 9.06,
        'a' : 8.16,
        'o' : 7.50,
        'i' : 6.96,
        'n' : 6.74,
        's' : 6.32,
        'h' : 6.09,
        'r' : 5.99,
        'd' : 4.25,
        'l' : 4.03,
        'c' : 2.78,
        'u' : 2.76,
        'm' : 2.41,
        'w' : 2.36,
        'f' : 2.23,
        'g' : 2.02,
        'y' : 1.97,
        'p' : 1.93,
        'b' : 1.49,
        'v' : 0.98,
        'k' : 0.77,
        'j' : 0.15,
        'x' : 0.15,
        'q' : 0.10,
        'z' : 0.07
    }
    lettersByLanguage[german] = {
        'e' : 17.4,
        'n' : 9.78,
        'i' : 7.55,
        's' : 7.27,
        'r' : 7.00,
        'a' : 6.51,
        't' : 6.15,
        'd' : 5.08,
        'h' : 4.76,
        'u' : 4.35,
        'l' : 3.44,
        'c' : 3.06,
        'g' : 3.01,
        'o' : 2.59,
        'm' : 2.53,
        'b' : 1.89,
        'w' : 1.89,
        'f' : 1.66,
        'k' : 1.21,
        'z' : 1.13,
        'v' : 0.85,
        'p' : 0.67,
        'j' : 0.27,
        'y' : 0.04,
        'x' : 0.03,
        'q' : 0.02
    }
    # normalize maps
    normalizeMap(lettersByLanguage[spanish])
    normalizeMap(lettersByLanguage[english])
    normalizeMap(lettersByLanguage[german])
    languageCounts = {}
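    # For every page the observed letter frequencies (scaled to percent) are
    # compared against each language's reference frequencies (normalized above and
    # rescaled to percent via p * 100); the language with the smallest mean squared
    # error over the matched letters is assigned to the page.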
    for url in charsByPage:
        tokens = charsByPage[url]
        tokenCounts = dict(Counter(tokens))
        tokenSum = sum(tokenCounts.values())
        # Calculating the squared error
        rankings = {}
        matches = {}
        for token in tokenCounts:
            for key2 in lettersByLanguage:
                if token not in lettersByLanguage[key2]:
                    continue
                p = float(lettersByLanguage[key2][token]) * 100
                if p >= 0:
                    if key2 not in rankings:
                        rankings[key2] = 0
                        matches[key2] = 0
                    # calculate the squared error from observed and reference frequencies
                    rankings[key2] += math.pow(math.fabs(float(tokenCounts[token]) * 100 / tokenSum - p), 2)
                    matches[key2] += 1
        # Resulting language has the minimal mean squared error
        minRanking = -1
        language = None
        for key in rankings:
            rankings[key] /= matches[key]
            if minRanking == -1 or rankings[key] < minRanking:
                minRanking = rankings[key]
                language = key
        if language != None:
            pageLanguages[url] = language
            if language not in languageCounts:
                languageCounts[language] = 1
            else:
                languageCounts[language] += 1
print "\n \n ==== language distribution ===="
print "Language \t Number of occurences"
for lang in languageCounts:
print lang + " \t " + str(languageCounts[lang])