diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/challenge.txt b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/challenge.txt new file mode 100644 index 00000000..a4e3925f --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/challenge.txt @@ -0,0 +1,10 @@ +01 es +02 de +03 en +04 en +05 de +06 es +07 es +08 de +09 en +10 es \ No newline at end of file diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/crawler/crawler.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/crawler/crawler.py new file mode 100644 index 00000000..77ab335f --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/crawler/crawler.py @@ -0,0 +1,510 @@ +# -*- coding: utf-8 -*- +import urllib2 +import sys +import random +import robotparser +import re +import math +from sgmllib import SGMLParser +import sgmllib +from urlparse import urlparse +from urlparse import urljoin +import matplotlib.pyplot as plt +import time +from termcolor import colored +from collections import Counter + +''' +VN: + - Plagiats-Checker fertig + - Sprachprüfer fertig + +TODO: + - DONE canonize urls -> canonize? slides? 
-> remember last host -> no magic here -> even using ugly global + - DONE with getNextUrlToVisit(): + server timeout -> safe crawled host, set timeout for crawled host + - statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html + +''' + +# crawler attributes +entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten" +entrypoint = "http://www.spiegel.de" # german website +#entrypoint = "http://www.cnn.com" # english website +#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website +#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154" +numberOfPagesToCrawl = 1000 +timeBetweenSameHost = 0 # 5 sec +visitOnlyTuSites = False; + + +#some variables +prohibitedSites = 0 +visitedUrls = [] # safe already visited urls, so no url will be visited more than once + +pages = {} # downloaded pages + +numberHyperlink = {} # safe number of hyperlinks... 
+numberHyperlinksPerPage = {} # safe number of hyperlinks per page + +visitedHostsWithTimestamp = {} # safe visited hosts with timestamp +robotsTxtResults = {} # safe robots.txt + +lasthost = '' #last host + +def normalizeMap(m): + s = sum(m.values()) + + for k in m: + m[k] = float(m[k]) / float(s) + +def subtractDicts(dict1, dict2): + dic = dict() + for key in dict1: + if key in dict2: + dic[key] = max(0, int(dict1[key]) - int(dict2[key])) + else: + dic[key] = int(dict1[key]) + + for key in dict2: + if key not in dict1: + dic[key] = int(dict2[key]) + + return dic + +def countWords(words): + counts = {} + for word in words: + if word not in counts: + counts[word] = 1 + else: + counts[word] += 1 + return counts + +def blockedByRobotsTxt(url): + o = urlparse(url) + robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt" + + if url in robotsTxtResults: + rp = robotsTxtResults[robotsUrl] + else: + rp = robotparser.RobotFileParser() + rp.set_url(robotsUrl) + + try: + rp.read() + robotsTxtResults[robotsUrl] = rp + except: + robotsTxtResults[robotsUrl] = None # robots.txt doesn't exist + + if robotsTxtResults[robotsUrl] == None: + return False # return false if robots.txt doesn't exist + else: + if rp.can_fetch("*", url): + return False + else: + print colored("-> not allowed to visit :( "+url, "red") + global prohibitedSites + prohibitedSites += 1 + return True + +def canonicalUrl(url): + global lasthost + url = url.lower().replace(" ", "") + + o = urlparse(url) + + if o.netloc != '': + lasthost = o.scheme + '://' + o.netloc + + if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl(): + if ".html" in o.path: + return [url] + if "." 
not in o.path: + return [url] + return [] + else: + if o.scheme=='': + return [urljoin(lasthost,o.path)] + else: + return [] + + +def getNextUrlToVisit(): + url = random.choice(extractor.urls) + + if visitOnlyTuSites: + if 'tu-darmstadt' not in urlparse(url).netloc: + extractor.urls.remove(url) + return getNextUrlToVisit() + if url in numberHyperlink: + numberHyperlink[url] += 1 + else: + numberHyperlink[url] = 1 + + + host = urlparse(url).netloc + + ## check if url is blocked by robots.txt or was already visited ## + if blockedByRobotsTxt(url) or url in visitedUrls: + extractor.urls.remove(url) + return getNextUrlToVisit() + + ## check if host got a timeout + if host in visitedHostsWithTimestamp: + timestamp = visitedHostsWithTimestamp[host] + secondsSinceLastVisit = int(time.time()) - timestamp + if secondsSinceLastVisit >= timeBetweenSameHost: + visitedHostsWithTimestamp[host] = int(time.time()) + visitedUrls.append(url) + extractor.urls.remove(url) + return url + else: + secondsToWait = timeBetweenSameHost - secondsSinceLastVisit + print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "magenta") + return getNextUrlToVisit() + else: + visitedHostsWithTimestamp[host] = int(time.time()) + visitedUrls.append(url) + extractor.urls.remove(url) + return url + + +class URLLister(SGMLParser): + ## fix SGMLParseError + def resetParser(self): + SGMLParser.reset(self) + + def reset(self): + SGMLParser.reset(self) + self.urls = [] + + def start_a(self, attrs): + + href = [v for k, v in attrs if k=='href'] + if href: + url = canonicalUrl(href[0]) + self.urls.extend(url) + + # count number of links on actual site + if href[0] in numberHyperlinksPerPage: + numberHyperlinksPerPage[href[0]] += 1 + else: + numberHyperlinksPerPage[href[0]] = 1 + + + +if __name__ == "__main__": + + page = urllib2.urlopen(entrypoint, timeout = 5) + print "currently visited url: "+entrypoint + extractor = URLLister() + extractor.feed(page.read()) + 
page.close() + + + i = 1 + while(i <= numberOfPagesToCrawl): + url = getNextUrlToVisit() + print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") currently visiting url: "+url, "blue") + try: + page = urllib2.urlopen(url, timeout = 6) + pageContent = page.read() + pageContent = pageContent.replace('= " + str(threshold*100) + " %) ====" + #print "URL1 \t URL2 \t Similarity in %" + # wordcounts per page + wordCountsByPage = {} + charsByPage = {} + ## count words in all pages ## + for url in pages: + tmp = re.sub("[\n\r]", "", pages[url]) # remove all scripts + tmp = re.sub("<\s*script.*?>.+?<\s*\/script.*?>", "", tmp) # remove all scripts + tmp = re.sub("<\s*style.*?>.+?<\s*\/style.*?>", "", tmp) # remove all styles + tmp = re.sub("&.+?;", "", tmp) # remove all html entities + tmp = re.sub("<.+?>", "", tmp) # remove all html tags + tmp = re.sub("\d", "", tmp) # remove all numbers + words = re.findall("(\w+)", tmp) # split words + words = [x.lower() for x in words] # all words to lower case + words = [s for s in words if len(s) > 4 and len(s) <= 10] + + wordCountsByPage[url] = countWords(words) + + chars = re.findall("[A-za-z]", tmp); # find all characters + chars = [x.lower() for x in chars] # all characters to lower case + charsByPage[url] = chars + + ## calculate wordcount deltas and print double-content sites ## + wordCountDeltas = {} + for url1 in wordCountsByPage: + for url2 in wordCountsByPage: + if url1 == url2: + continue + + if url1 not in wordCountDeltas: + wordCountDeltas[url1] = {} + if url2 in wordCountDeltas[url1]: # do it once only + continue + + wordCounts1 = wordCountsByPage[url1] + wordCounts2 = wordCountsByPage[url2] + + sum1 = sum(wordCounts1.values()) + if sum1 == 0: + continue + + #print "calculating deltas of url1: " + url1 + " -- url2: " + url2 + deltaWordCounts = subtractDicts(wordCounts1, wordCounts2) + + wordCountDeltas[url1][url2] = math.fabs(float(sum(deltaWordCounts.values())) / float(sum1)) + if 1 - wordCountDeltas[url1][url2] > 
threshold: + #print url1 + " \t " + url2 + " \t " + str((1 - wordCountDeltas[url1][url2]) * 100) + pass + + ## determine the sites' languages ## + spanish = 'es' + english = 'en' + german = 'de' + + pageLanguages = {} + lettersByLanguage = {} + lettersByLanguage[spanish] = { + 'e' : 13.68, + 'a' : 12.53, + 'o' : 8.68, + 's' : 7.98, + 'r' : 6.87, + + 'n' : 6.71, + 'i' : 6.25, + 'd' : 5.86, + 'l' : 4.97, + 'c' : 4.68, + + 't' : 4.63, + 'u' : 3.93, + 'm' : 3.15, + 'p' : 2.51, + 'b' : 1.42, + + 'g' : 1.01, + 'v' : 0.90, + 'y' : 0.90, + 'q' : 0.88, + 'h' : 0.70, + + 'f' : 0.69, + 'z' : 0.52, + 'j' : 0.44, + 'x' : 0.21, + 'w' : 0.02, + + 'k' : 0.01 + } + lettersByLanguage[english] = { + 'e' : 12.70, + 't' : 9.06, + 'a' : 8.16, + 'o' : 7.50, + 'i' : 6.96, + + 'n' : 6.74, + 's' : 6.32, + 'h' : 6.09, + 'r' : 5.99, + 'd' : 4.25, + + 'l' : 4.03, + 'c' : 2.78, + 'u' : 2.76, + 'm' : 2.41, + 'w' : 2.36, + + 'f' : 2.23, + 'g' : 2.02, + 'y' : 1.97, + 'p' : 1.93, + 'b' : 1.49, + + 'v' : 0.98, + 'k' : 0.77, + 'j' : 0.15, + 'x' : 0.15, + 'q' : 0.10, + + 'z' : 0.07 + } + lettersByLanguage[german] = { + 'e' : 17.4, + 'n' : 9.78, + 'i' : 7.55, + 's' : 7.27, + 'r' : 7.00, + + 'a' : 6.51, + 't' : 6.15, + 'd' : 5.08, + 'h' : 4.76, + 'u' : 4.35, + + 'l' : 3.44, + 'c' : 3.06, + 'g' : 3.01, + 'o' : 2.59, + 'm' : 2.53, + + 'b' : 1.89, + 'w' : 1.89, + 'f' : 1.66, + 'k' : 1.21, + 'z' : 1.13, + + 'v' : 0.85, + 'p' : 0.67, + 'j' : 0.27, + 'y' : 0.04, + 'x' : 0.03, + + 'q' : 0.02 + } + + # normalize maps + normalizeMap(lettersByLanguage[spanish]) + normalizeMap(lettersByLanguage[english]) + normalizeMap(lettersByLanguage[german]) + + languageCounts = {} + for url in charsByPage: + tokens = charsByPage[url] + tokenCounts = dict(Counter(tokens)) + + tokenSum = sum(tokenCounts.values()) + + # Calculating the squared error + rankings = {} + matches = {} + for token in tokenCounts: + for key2 in lettersByLanguage: + if token not in lettersByLanguage[key2]: + continue + p = 
float(lettersByLanguage[key2][token]) * 100 + if p >= 0: + if key2 not in rankings: + rankings[key2] = 0 + matches[key2] = 0 + # calculate the squared error from observed and reference frequencies + rankings[key2] += math.pow(math.fabs(tokenCounts[token] * 100 / tokenSum - p), 2) + matches[key2] += 1 + + # Resulting language has the minimal mean squared error + minRanking = -1 + language = None + for key in rankings: + rankings[key] /= matches[key] + + if minRanking == -1 or rankings[key] < minRanking: + minRanking = rankings[key] + language = key + + if language != None: + pageLanguages[url] = language + + if language not in languageCounts: + languageCounts[language] = 1 + else: + languageCounts[language] += 1 + + print "\n \n ==== language distribution ====" + print "Language \t Number of occurences" + for lang in languageCounts: + print lang + " \t " + str(languageCounts[lang]) \ No newline at end of file diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/crawler/termcolor.py b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/crawler/termcolor.py new file mode 100644 index 00000000..f11b824b --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/crawler/termcolor.py @@ -0,0 +1,168 @@ +# coding: utf-8 +# Copyright (c) 2008-2011 Volvox Development Team +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# Author: Konstantin Lepa + +"""ANSII Color formatting for output in terminal.""" + +from __future__ import print_function +import os + + +__ALL__ = [ 'colored', 'cprint' ] + +VERSION = (1, 1, 0) + +ATTRIBUTES = dict( + list(zip([ + 'bold', + 'dark', + '', + 'underline', + 'blink', + '', + 'reverse', + 'concealed' + ], + list(range(1, 9)) + )) + ) +del ATTRIBUTES[''] + + +HIGHLIGHTS = dict( + list(zip([ + 'on_grey', + 'on_red', + 'on_green', + 'on_yellow', + 'on_blue', + 'on_magenta', + 'on_cyan', + 'on_white' + ], + list(range(40, 48)) + )) + ) + + +COLORS = dict( + list(zip([ + 'grey', + 'red', + 'green', + 'yellow', + 'blue', + 'magenta', + 'cyan', + 'white', + ], + list(range(30, 38)) + )) + ) + + +RESET = '\033[0m' + + +def colored(text, color=None, on_color=None, attrs=None): + """Colorize text. + + Available text colors: + red, green, yellow, blue, magenta, cyan, white. + + Available text highlights: + on_red, on_green, on_yellow, on_blue, on_magenta, on_cyan, on_white. + + Available attributes: + bold, dark, underline, blink, reverse, concealed. 
+ + Example: + colored('Hello, World!', 'red', 'on_grey', ['blue', 'blink']) + colored('Hello, World!', 'green') + """ + if os.getenv('ANSI_COLORS_DISABLED') is None: + fmt_str = '\033[%dm%s' + if color is not None: + text = fmt_str % (COLORS[color], text) + + if on_color is not None: + text = fmt_str % (HIGHLIGHTS[on_color], text) + + if attrs is not None: + for attr in attrs: + text = fmt_str % (ATTRIBUTES[attr], text) + + text += RESET + return text + + +def cprint(text, color=None, on_color=None, attrs=None, **kwargs): + """Print colorize text. + + It accepts arguments of print function. + """ + + print((colored(text, color, on_color, attrs)), **kwargs) + + +if __name__ == '__main__': + print('Current terminal type: %s' % os.getenv('TERM')) + print('Test basic colors:') + cprint('Grey color', 'grey') + cprint('Red color', 'red') + cprint('Green color', 'green') + cprint('Yellow color', 'yellow') + cprint('Blue color', 'blue') + cprint('Magenta color', 'magenta') + cprint('Cyan color', 'cyan') + cprint('White color', 'white') + print(('-' * 78)) + + print('Test highlights:') + cprint('On grey color', on_color='on_grey') + cprint('On red color', on_color='on_red') + cprint('On green color', on_color='on_green') + cprint('On yellow color', on_color='on_yellow') + cprint('On blue color', on_color='on_blue') + cprint('On magenta color', on_color='on_magenta') + cprint('On cyan color', on_color='on_cyan') + cprint('On white color', color='grey', on_color='on_white') + print('-' * 78) + + print('Test attributes:') + cprint('Bold grey color', 'grey', attrs=['bold']) + cprint('Dark red color', 'red', attrs=['dark']) + cprint('Underline green color', 'green', attrs=['underline']) + cprint('Blink yellow color', 'yellow', attrs=['blink']) + cprint('Reversed blue color', 'blue', attrs=['reverse']) + cprint('Concealed Magenta color', 'magenta', attrs=['concealed']) + cprint('Bold underline reverse cyan color', 'cyan', + attrs=['bold', 'underline', 'reverse']) + cprint('Dark 
blink concealed white color', 'white', + attrs=['dark', 'blink', 'concealed']) + print(('-' * 78)) + + print('Test mixing:') + cprint('Underline red on grey color', 'red', 'on_grey', + ['underline']) + cprint('Reversed green on red color', 'green', 'on_red', ['reverse']) + diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/crawler/termcolor.pyc b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/crawler/termcolor.pyc new file mode 100644 index 00000000..79a5a97f Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/crawler/termcolor.pyc differ diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon.xpi b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon.xpi new file mode 100644 index 00000000..9d7577df Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon.xpi differ diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/README.md b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/README.md new file mode 100644 index 00000000..29ca1058 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/README.md @@ -0,0 +1,5 @@ +This is the keaddon add-on. It contains: + +* A program (lib/main.js). +* A few tests. +* Some meager documentation. 
diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/contentScripts/keworker.js b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/contentScripts/keworker.js new file mode 100644 index 00000000..a0d712c8 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/contentScripts/keworker.js @@ -0,0 +1,26 @@ +var text = ""; +var cleantext = ""; +var paragraphs = document.getElementsByTagName('p'); +var open = '<'; +var close = '>'; +for(var i=0; i + + \ No newline at end of file diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/Flag_of_Germany.svg b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/Flag_of_Germany.svg new file mode 100644 index 00000000..d466e44a --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/Flag_of_Germany.svg @@ -0,0 +1,8 @@ + + + + Flag of Germany + + + + \ No newline at end of file diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/Flag_of_Spain.svg b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/Flag_of_Spain.svg new file mode 100644 index 00000000..872f198b --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/Flag_of_Spain.svg @@ -0,0 +1,631 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/Flag_of_the_United_Kingdom.svg b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/Flag_of_the_United_Kingdom.svg new file mode 100644 index 00000000..63b1cb3d --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/Flag_of_the_United_Kingdom.svg @@ -0,0 +1,10 @@ + + + + + + + + + + \ No newline at end of file diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/de.png b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/de.png new file mode 100644 index 00000000..75a3cf30 Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/de.png differ diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/en.png b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/en.png new file mode 100644 index 00000000..ef13c9f6 Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/en.png differ diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/es.png b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/es.png new file mode 100644 index 00000000..66a31de2 Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/es.png differ diff --git a/ss2013/1_Web 
Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/fr.png b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/fr.png new file mode 100644 index 00000000..7ea9c3bb Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/flag/fr.png differ diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/keicon.png b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/keicon.png new file mode 100644 index 00000000..bd813f78 Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/data/keicon.png differ diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/docs/main.md b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/docs/main.md new file mode 100644 index 00000000..4dff0650 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/docs/main.md @@ -0,0 +1,2 @@ +The main module is a program that creates a widget. When a user clicks on +the widget, the program loads the mozilla.org website in a new tab. 
diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/lib/language.js b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/lib/language.js new file mode 100644 index 00000000..7b68cd18 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/lib/language.js @@ -0,0 +1,4 @@ +exports.german = 'de'; +exports.french = 'fr'; +exports.spanish = 'es'; +exports.english = 'en'; diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/lib/main.js b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/lib/main.js new file mode 100644 index 00000000..be2e6e01 --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/abgabe/keaddon2/lib/main.js @@ -0,0 +1,65 @@ +var widgets = require("widget"); +var pageMod = require("page-mod"); +var student = require("student"); +var data = require("self").data; + +var workers = new Array(); +var mod = null; + +exports.main = function(options, callback) { + mod = pageMod.PageMod( + { + include: "*", + contentScriptWhen:"ready", + contentScriptFile: data.url("./contentScripts/keworker.js"), + onAttach: function onAttach(worker) { + worker.on('message', handleMessage); + workers.push(worker); + } + } + ); + + var widget = widgets.Widget( + { + id: "ke", + label: "Knowledge Engineering", + contentURL: data.url("keicon.png") + } + ); + + function handleMessage(message) { + var lang = require("language"); + if(message.length > 0) { + //TODO: Iconswitch + var language = student.student(message); + console.log(language); + switch(language) { + case lang.german: + widget.contentURL = data.url("./flag/de.png"); + break; + case lang.spanish: + widget.contentURL = data.url("./flag/es.png"); + break; + case lang.english: + widget.contentURL = data.url("./flag/en.png"); + break; + case lang.french: + widget.contentURL = data.url("./flag/fr.png"); + break; + default: + widget.contentURL = data.url("./keicon.png"); + + } + //TODO: response + } + } + + console.log("The add-on is running."); +} + +exports.onUnload = 
function(reason) { + if(mod != null) {mod.destroy();} + for(var i=0; i 1) { + for(var i=1; i 0); +}; + +exports.test_url = function(test) { + require("request").Request({ + url: "http://www.mozilla.org/", + onComplete: function(response) { + test.assertEqual(response.statusText, "OK"); + test.done(); + } + }).get(); + test.waitUntilDone(20000); +}; + +exports.test_open_tab = function(test) { + const tabs = require("tabs"); + tabs.open({ + url: "http://www.mozilla.org/", + onReady: function(tab) { + test.assertEqual(tab.url, "http://www.mozilla.org/"); + test.done(); + } + }); + test.waitUntilDone(20000); +}; + +var errormessage = ""; + +exports.test_util_countElements = function(test) { + const util = require("utility"); + test.assert(compareObjects(util.countElements(["du", "du", "hallo", "hallo", "du"]),{"hallo":2, "du":3}),errormessage); +}; + +exports.test_util_toCharArray = function(test) { + const util = require("utility"); + test.assert(compareArrays(util.toCharArray("test"), ["t","e","s","t"]), errormessage); +}; + +exports.test_util_toCharPairs = function(test) { + const util = require("utility"); + test.assert(compareArrays(util.toCharPairs("mainz"),["ma", "ai", "in", "nz"]), errormessage); +}; + +exports.test_util_tokenize = function(test) { + const util = require("utility"); + test.assert(compareArrays(util.tokenize("Dem Igel geht's gut."),["dem","igel","gehts","gut"]), errormessage); +}; + +exports.test_student_student = function(test) { + const student = require("student"); + var text = "blubber"; + test.assertEqual(student.student(text), lang.german, "Geht nicht weil."); +}; + +function compareObjects(a,b) { + for(var key in a) { + if( a[key] != b[key] ) { + return false; + } + } + return true; +}; + +function compareArrays(a,b) { + if (a.length != b.length) { + errormessage = "Arrays of unequal size"; + return false + } + for(var i=0; i +] File: grafiken/a1_abb1.png Graphic file (type png) @@ -1481,7 +1481,7 @@ Underfull \hbox (badness 10000) has 
occurred while \output is active [4 -] +] File: grafiken/a1_abb2.png Graphic file (type png) @@ -1550,20 +1550,172 @@ Underfull \hbox (badness 10000) has occurred while \output is active [12 -] +] +File: grafiken/a2_spiegel_1.png Graphic file (type png) + + +Package pdftex.def Info: grafiken/a2_spiegel_1.png used on input line 171. +(pdftex.def) Requested size: 208.65793pt x 156.49014pt. + +Overfull \vbox (22.25151pt too high) detected at line 171 + [] + + Underfull \hbox (badness 10000) has occurred while \output is active [] [13 -] + <./grafiken/a2_spiegel_1.png>] + +File: grafiken/a2_spiegel_2.png Graphic file (type png) + + +Package pdftex.def Info: grafiken/a2_spiegel_2.png used on input line 180. +(pdftex.def) Requested size: 208.65793pt x 156.49014pt. + +Overfull \vbox (22.25151pt too high) detected at line 180 + [] + + Underfull \hbox (badness 10000) has occurred while \output is active [] [14 + <./grafiken/a2_spiegel_2.png>] +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[15 + +] +File: grafiken/a2_spiegel_3.png Graphic file (type png) + + +Package pdftex.def Info: grafiken/a2_spiegel_3.png used on input line 196. +(pdftex.def) Requested size: 208.65793pt x 156.49014pt. + +Overfull \vbox (22.25151pt too high) detected at line 196 + [] + + +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[16 + + <./grafiken/a2_spiegel_3.png>] + +File: grafiken/a2_spiegel_4.png Graphic file (type png) + + +Package pdftex.def Info: grafiken/a2_spiegel_4.png used on input line 205. +(pdftex.def) Requested size: 208.65793pt x 156.49014pt. 
+ +Overfull \vbox (22.25151pt too high) detected at line 205 + [] + + +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[17 + + <./grafiken/a2_spiegel_4.png>] +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[18 + +] +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[19 + +] +File: grafiken/a2_tu_1.png Graphic file (type png) + + +Package pdftex.def Info: grafiken/a2_tu_1.png used on input line 236. +(pdftex.def) Requested size: 208.65793pt x 156.49014pt. + +Overfull \vbox (22.25151pt too high) detected at line 236 + [] + + +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[20 + + <./grafiken/a2_tu_1.png>] + +File: grafiken/a2_tu_2.png Graphic file (type png) + +Package pdftex.def Info: grafiken/a2_tu_2.png used on input line 245. +(pdftex.def) Requested size: 208.65793pt x 156.49014pt. + +Overfull \vbox (22.25151pt too high) detected at line 245 + [] + + +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[21 + + <./grafiken/a2_tu_2.png>] + +File: grafiken/a2_tu_3.png Graphic file (type png) + +Package pdftex.def Info: grafiken/a2_tu_3.png used on input line 254. +(pdftex.def) Requested size: 208.65793pt x 156.49014pt. + +Overfull \vbox (22.25151pt too high) detected at line 254 + [] + + +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[22 + + <./grafiken/a2_tu_3.png>] + +File: grafiken/a2_tu_4.png Graphic file (type png) + +Package pdftex.def Info: grafiken/a2_tu_4.png used on input line 263. +(pdftex.def) Requested size: 208.65793pt x 156.49014pt. + +Overfull \vbox (22.25151pt too high) detected at line 263 + [] + + +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[23 + + <./grafiken/a2_tu_4.png>] +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[24 + ] \tf@nav=\write7 \openout7 = `solution.nav'. 
@@ -1574,20 +1726,20 @@ Underfull \hbox (badness 10000) has occurred while \output is active \tf@snm=\write9 \openout9 = `solution.snm'. -Package atveryend Info: Empty hook `BeforeClearDocument' on input line 172. -Package atveryend Info: Empty hook `AfterLastShipout' on input line 172. +Package atveryend Info: Empty hook `BeforeClearDocument' on input line 278. +Package atveryend Info: Empty hook `AfterLastShipout' on input line 278. (./solution.aux) -Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 172. -Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 172. +Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 278. +Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 278. Package rerunfilecheck Info: File `solution.out' has not changed. -(rerunfilecheck) Checksum: 88D911AA5795ABD0722131B6C5D24A75;180. -Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 172. +(rerunfilecheck) Checksum: AE5CCE897D490A137427F55C345E5A34;90. +Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 278. ) Here is how much of TeX's memory you used: - 17220 strings out of 493633 - 315045 string characters out of 3143378 - 391807 words of memory out of 3000000 - 20016 multiletter control sequences out of 15000+200000 + 17292 strings out of 493633 + 316299 string characters out of 3143378 + 391806 words of memory out of 3000000 + 20062 multiletter control sequences out of 15000+200000 39628 words of font info for 52 fonts, out of 3000000 for 9000 831 hyphenation exceptions out of 8191 55i,20n,79p,425b,533s stack positions out of 5000i,500n,10000p,200000b,50000s @@ -1601,10 +1753,10 @@ texlive/2011/texmf-dist/fonts/type1/urw/helvetic/uhvb8a.pfb> -Output written on solution.pdf (14 pages, 146011 bytes). +Output written on solution.pdf (24 pages, 351323 bytes). PDF statistics: - 156 PDF objects out of 1000 (max. 
8388607) - 122 compressed objects within 2 object streams - 33 named destinations out of 1000 (max. 500000) - 80 words of extra memory for PDF output out of 10000 (max. 10000000) + 221 PDF objects out of 1000 (max. 8388607) + 161 compressed objects within 2 object streams + 51 named destinations out of 1000 (max. 500000) + 104 words of extra memory for PDF output out of 10000 (max. 10000000) diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.nav b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.nav index 3b1c159c..f6963ccc 100644 --- a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.nav +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.nav @@ -23,24 +23,38 @@ \headcommand {\beamer@subsectionpages {2}{8}} \headcommand {\slideentry {2}{0}{9}{9/9}{}{0}} \headcommand {\beamer@framepages {9}{9}} -\headcommand {\sectionentry {3}{2. Aufgabe}{10}{2. Aufgabe}{0}} -\headcommand {\beamer@sectionpages {9}{9}} -\headcommand {\beamer@subsectionpages {9}{9}} -\headcommand {\slideentry {3}{0}{10}{10/10}{}{0}} +\headcommand {\slideentry {2}{0}{10}{10/10}{}{0}} \headcommand {\beamer@framepages {10}{10}} -\headcommand {\sectionentry {4}{2. Aufgabe}{11}{2. 
Aufgabe}{0}} -\headcommand {\beamer@sectionpages {10}{10}} -\headcommand {\beamer@subsectionpages {10}{10}} -\headcommand {\slideentry {4}{0}{11}{11/11}{}{0}} +\headcommand {\slideentry {2}{0}{11}{11/11}{}{0}} \headcommand {\beamer@framepages {11}{11}} -\headcommand {\slideentry {4}{0}{12}{12/12}{}{0}} +\headcommand {\slideentry {2}{0}{12}{12/12}{}{0}} \headcommand {\beamer@framepages {12}{12}} -\headcommand {\slideentry {4}{0}{13}{13/13}{}{0}} +\headcommand {\slideentry {2}{0}{13}{13/13}{}{0}} \headcommand {\beamer@framepages {13}{13}} -\headcommand {\slideentry {4}{0}{14}{14/14}{}{0}} +\headcommand {\slideentry {2}{0}{14}{14/14}{}{0}} \headcommand {\beamer@framepages {14}{14}} -\headcommand {\beamer@partpages {1}{14}} -\headcommand {\beamer@subsectionpages {11}{14}} -\headcommand {\beamer@sectionpages {11}{14}} -\headcommand {\beamer@documentpages {14}} -\headcommand {\def \inserttotalframenumber {14}} +\headcommand {\slideentry {2}{0}{15}{15/15}{}{0}} +\headcommand {\beamer@framepages {15}{15}} +\headcommand {\slideentry {2}{0}{16}{16/16}{}{0}} +\headcommand {\beamer@framepages {16}{16}} +\headcommand {\slideentry {2}{0}{17}{17/17}{}{0}} +\headcommand {\beamer@framepages {17}{17}} +\headcommand {\slideentry {2}{0}{18}{18/18}{}{0}} +\headcommand {\beamer@framepages {18}{18}} +\headcommand {\slideentry {2}{0}{19}{19/19}{}{0}} +\headcommand {\beamer@framepages {19}{19}} +\headcommand {\slideentry {2}{0}{20}{20/20}{}{0}} +\headcommand {\beamer@framepages {20}{20}} +\headcommand {\slideentry {2}{0}{21}{21/21}{}{0}} +\headcommand {\beamer@framepages {21}{21}} +\headcommand {\slideentry {2}{0}{22}{22/22}{}{0}} +\headcommand {\beamer@framepages {22}{22}} +\headcommand {\slideentry {2}{0}{23}{23/23}{}{0}} +\headcommand {\beamer@framepages {23}{23}} +\headcommand {\slideentry {2}{0}{24}{24/24}{}{0}} +\headcommand {\beamer@framepages {24}{24}} +\headcommand {\beamer@partpages {1}{24}} +\headcommand {\beamer@subsectionpages {9}{24}} +\headcommand {\beamer@sectionpages 
{9}{24}} +\headcommand {\beamer@documentpages {24}} +\headcommand {\def \inserttotalframenumber {24}} diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.out b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.out index 8577d465..76ba5bb0 100644 --- a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.out +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.out @@ -1,4 +1,2 @@ \BOOKMARK [2][]{Outline0.1}{1. Aufgabe}{}% 1 \BOOKMARK [2][]{Outline0.2}{2. Aufgabe}{}% 2 -\BOOKMARK [2][]{Outline0.3}{2. Aufgabe}{}% 3 -\BOOKMARK [2][]{Outline0.4}{2. Aufgabe}{}% 4 diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.pdf b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.pdf index e7becfad..5cfb941d 100644 Binary files a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.pdf and b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.pdf differ diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.synctex.gz b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.synctex.gz index 79a1b97f..9c41e9e8 100644 Binary files a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.synctex.gz and b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.synctex.gz differ diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.tex b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.tex index 26475e24..0db87d4a 100644 --- a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.tex +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.tex @@ -80,7 +80,7 @@ $p_{expected(lp, lang) \approx \frac{1}{i * ln(1,78 * N)}}$ \end{frame} \begin{frame} -\frametitle{1. Aufgabe \\ Firefox-Plugin} +\frametitle{1. Aufgabe \\ Firefox-Plugin} \begin{itemize} \item Häufigkeiten der Buchstaben bzw. 
Buchstabenpaare ($n_{text(l)}$) relativ zur Gesamtanzahl ($n_{text}$): \\ \begin{center} @@ -117,7 +117,7 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{ % % % % % % % % % % % % % % % % % % % % % % %% % % % % % % % % % % % % % % % % % % % % % %% % % % % % % % % % % % % % % % % % % % % % % % \section{2. Aufgabe} \begin{frame} -\frametitle{2. Aufgabe \\ Crawler} +\frametitle{2. und 3. Aufgabe \\ Crawler} \textbf{Verfahren zur Erkennung von Duplikaten:} \\ \textbf{1.} Alle Wörter mit einer Länge von 4 und kleiner 11 werden von der Webseite extrahiert. \begin{itemize} @@ -127,9 +127,9 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{ \end{itemize} \end{frame} -\section{2. Aufgabe} + \begin{frame} -\frametitle{2. Aufgabe \\ Crawler} +\frametitle{2. und 3. Aufgabe \\ Crawler} \textbf{2.} Eine Zuweisung von Wörtern zu deren Auftrittshäufigkeit wird angefertigt \\ \textbf{3.} Für alle paarweise verschiedenen Seiten werden die Auftrittshäufigkeiten subtrahiert, so dass deren Ergebnis minimal null ergibt. Zudem werden alle Wörter, die auf einer, aber nicht auf der anderen Seite vorkommen, ebenfalls der anderen Seite zugewiesen \begin{itemize} @@ -137,29 +137,135 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{ \end{itemize} \end{frame} -\section{2. Aufgabe} + \begin{frame} -\frametitle{2. Aufgabe \\ Crawler} +\frametitle{2. und 3. Aufgabe \\ Crawler} \textbf{4.} Anschließend wird die resultierende Gesamtzahl an Wortvorkommnissen durch die Anzahl der Wortvorkommnisse vor der Subtraktion geteilt. Fällt dieser Wert unter eine definierte Grenze, gilt die Seite als Duplikat. \\ \begin{itemize} \item Im Code ist anstatt einer Untergrenze eine Obergrenze von 90\% angegeben, die Berechnung wurde also umgekehrt, so dass hohe Werte eine hohe Duplikatswahrscheinlichkeit implizieren. \end{itemize} \end{frame} + + \begin{frame} -\frametitle{2. 
Aufgabe \\ Crawler} -Histogramm über die Anzahl der URLs pro Seite (wie beim ersten Übungsblatt mit Worthäufigkeiten, auch logarithmisch) +\frametitle{2. und 3. Aufgabe \\ Crawler} +\textbf{Startseite:} \\ +http://www.spiegel.de/ \\ +Es wurden 1000 Seiten besucht. \\ +\vspace{1cm} +\textbf{Erkannte Sprachen:} +\begin{center} +de $\to$ 623 \\ +en $\to$ 246 \\ +es $\to$ 131 \\ +\end{center} \end{frame} \begin{frame} -\frametitle{2. Aufgabe \\ Crawler} -Histogramm mit den Häufigkeiten des Auftretens von Hyperlinks, d.h. wie viele Links treten 1-mal, 2-mal, ... auftreten ... +\frametitle{2. und 3. Aufgabe \\ Crawler} +\begin{figure} +\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_1.png} +\caption{Anzahl URLs pro Seite} +\end{figure} +\end{frame} + + +\begin{frame} +\frametitle{2. und 3. Aufgabe \\ Crawler} +\begin{figure} +\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_2.png} +\caption{Anzahl URLs pro Seite (logarithmisch)} +\end{figure} +\end{frame} + +\begin{frame} +\frametitle{2. und 3. Aufgabe \\ Crawler} +\begin{itemize} +\item Viele Internetseiten verweisen auf wenige andere Internetseiten. +\end{itemize} +\end{frame} + + +\begin{frame} +\frametitle{2. und 3. Aufgabe \\ Crawler} +\begin{figure} +\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_3.png} +\caption{Häufigkeiten des Auftretens von Hyperlinks} +\end{figure} +\end{frame} + + +\begin{frame} +\frametitle{2. und 3. Aufgabe \\ Crawler} +\begin{figure} +\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_4.png} +\caption{Häufigkeiten des Auftretens von Hyperlinks (logarithmisch)} +\end{figure} +\end{frame} + +\begin{frame} +\frametitle{2. und 3. Aufgabe \\ Crawler} +\begin{itemize} +\item Es gibt nur wenige Internetseiten, die oft referenziert werden. +\end{itemize} +\end{frame} + + +\begin{frame} +\frametitle{2. und 3. 
Aufgabe \\ Crawler} +\textbf{Startseite:} \\ +http://www.ke.tu-darmstadt.de/lehre/arbeiten \\ +Crawler hat nur Seiten innerhalb der TU Darmstadt der Form *.tu-darmstadt.de besucht. +Es wurden 1000 Seiten besucht. \\ +\vspace{1cm} +\textbf{Erkannte Sprachen:} +\begin{center} +de $\to$ 329 \\ +en $\to$ 576 \\ +es $\to$ 95 \\ +\end{center} +\end{frame} + +\begin{frame} +\frametitle{2. und 3. Aufgabe \\ Crawler} +\begin{figure} +\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_1.png} +\caption{Anzahl URLs pro Seite} +\end{figure} +\end{frame} + + +\begin{frame} +\frametitle{2. und 3. Aufgabe \\ Crawler} +\begin{figure} +\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_2.png} +\caption{Anzahl URLs pro Seite (logarithmisch)} +\end{figure} +\end{frame} + + +\begin{frame} +\frametitle{2. und 3. Aufgabe \\ Crawler} +\begin{figure} +\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_3.png} +\caption{Häufigkeiten des Auftretens von Hyperlinks} +\end{figure} +\end{frame} + + +\begin{frame} +\frametitle{2. und 3. Aufgabe \\ Crawler} +\begin{figure} +\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_4.png} +\caption{Häufigkeiten des Auftretens von Hyperlinks (logarithmisch)} +\end{figure} \end{frame} \begin{frame} -\frametitle{2. Aufgabe \\ Crawler} +\frametitle{2. und 3. Aufgabe \\ Crawler} \textbf{Erfahrungen und Probleme:} \begin{itemize} \item Findet man einen Onlineshop, so wird die Queue mit sehr vielen Links dieses Shops gefüllt und der Crawler besucht mit sehr hoher Wahrscheinlichkeit nur noch URLs innerhalb des Shops. diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.toc b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.toc index fdb1c280..95b4a70a 100644 --- a/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.toc +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/latex/solution.toc @@ -2,5 +2,3 @@ \select@language {ngerman} \beamer@sectionintoc {1}{1. 
Aufgabe}{2}{0}{1} \beamer@sectionintoc {2}{2. Aufgabe}{9}{0}{2} -\beamer@sectionintoc {3}{2. Aufgabe}{10}{0}{3} -\beamer@sectionintoc {4}{2. Aufgabe}{11}{0}{4} diff --git a/ss2013/1_Web Mining/Uebungen/2_Uebung/results.rtf b/ss2013/1_Web Mining/Uebungen/2_Uebung/results.rtf new file mode 100644 index 00000000..050d2b3c --- /dev/null +++ b/ss2013/1_Web Mining/Uebungen/2_Uebung/results.rtf @@ -0,0 +1,379 @@ +{\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370 +{\fonttbl\f0\fswiss\fcharset0 Helvetica;} +{\colortbl;\red255\green255\blue255;} +\paperw11900\paperh16840\margl1440\margr1440\vieww16900\viewh8400\viewkind0 +\pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\pardirnatural + +\f0\b\fs24 \cf0 entrypoint: google.de: +\b0 \ +\ + ==== robots.txt ====\ +prohibit by robots.txt: 172\ +\ + \ + ==== numberHyperlink ====\ +#Hyperlinks Website\ +19 http://www.blogger.com/profile/05109496878476775729\ +19 http://www.google.de/history/optout?hl=de\ +18 http://www.google.de/intl/de/options/\ +8 http://www.vovone.com/company/profile/\ +8 http://www.vovone.com/more/solutions/\ +8 http://www.vovone.com/company/partners/\ +7 http://www.google.de/intl/de/policies/privacy/\ +7 http://rocketsandsuch.blogspot.de/2009_08_01_archive.html\ +7 http://www.vovone.com/more/ssl-certificates/ssl-certificate-validation/\ +7 http://www.google.de/webhp?hl=de&tab=iw\ +7 http://www.vovone.com/company/ask-for-more/\ +6 http://www.vovone.com/domain-names/redirect-domain-name/\ +6 http://voice.google.com\ +6 http://www.vovone.com/support/f-a-q/\ +6 http://www.vovone.com/domain-names/domain-termination/\ +6 http://www.vovone.com/support/documentation/\ +6 http://www.vovone.com/company/careers/\ +5 http://www.vovone.com/discounts-offers/\ +5 http://www.google.com/press/blogs/directory.html#tab0\ +5 http://rocketsandsuch.blogspot.de/2008_03_01_archive.html\ +5 http://rocketsandsuch.blogspot.de/2009_09_01_archive.html\ +5 http://www.vovone.com/servers/\ +5 
http://rocketsandsuch.blogspot.de/2009_01_01_archive.html\ +5 http://www.vovone.com\ +5 http://fusion.google.com/add?source=atgs&feedurl=http%3a//feeds.feedburner.com/googleappsupdates\ +5 http://www.vovone.com/more/solutions/service-level-agreements/\ +5 http://www.vovone.com/support/\ +5 http://www.vovone.com/servers/managed-servers/\ +5 http://rocketsandsuch.blogspot.de/2008_10_01_archive.html\ +5 http://rocketsandsuch.blogspot.de/2009_10_01_archive.html\ +5 http://feedburner.google.com/fb/a/mailverify?uri=googleappsupdates&loc=en_us\ +5 http://www.vovone.com/more/reseller-plans/affiliate-plan/\ +5 http://www.vovone.com/more/ssl-certificates/ssl-certificate-type/\ +5 http://blog.chromium.org/\ +5 http://www.vovone.com/company/conditions/notice-and-take-down/\ +5 http://www.vovone.com/more/ssl-certificates/ssl-certificates-brand/\ +4 http://www.vovone.com/more/colocation/private-rackspace/\ +4 http://www.vovone.com/more/ssl-certificates/\ +4 http://www.google.de/setprefdomain?prefdom=us&sig=0_h0pay1e5n4pq04s4m5soth6xqlk%3d\ +4 http://www.vovone.com/company/technology/security/\ +4 http://rocketsandsuch.blogspot.de/search?updated-min=2007-01-01t00:00:00-08:00&updated-max=2008-01-01t00:00:00-08:00&max-results=50\ +4 http://www.google.de/setprefdomain?prefdom=us&sig=0_bbxqe3gzyewbwv2egvfk2cujk3w%3d\ +4 http://www.vovone.com/more/\ +4 http://www.vovone.com/web-hosting/special-plans/special-plans-magento-hosting/\ +4 http://www.vovone.com/more/colocation/shared-rackspace/\ +4 http://www.vovone.com/company/conditions/\ +4 http://www.vovone.com/more/solutions/managed-services/\ +4 http://mail.google.com\ +4 http://rocketsandsuch.blogspot.de/2008/10/hubble-bubble-toil-and-trouble.html\ +4 http://www.vovone.com/servers/vps/vps-plan-8192/\ +\ + \ + ==== numberHyperlinksPerPage ====\ +#HyperlinksToPage Website\ +9088 javascript:void(0)\ +1867 #\ +898 javascript:;\ +522 http://www.blogger.com/profile/05109496878476775729\ +392 http://www.vovone.com\ +348 /\ +347 
http://www.blogger.com/profile/09046869427384152063\ +317 \ +301 http://www.vovone.com/support/\ +298 https://my.vovone.com\ +295 http://www.vovone.com/company/careers/\ +272 http://feedburner.google.com/fb/a/mailverify?uri=GoogleAppsUpdates&loc=en_US\ +270 http://fusion.google.com/add?source=atgs&feedurl=http%3A//feeds.feedburner.com/GoogleAppsUpdates\ +256 the-button-element.html#concept-fe-value\ +242 http://www.blogger.com/profile/06992649719432295652\ +221 http://www.vovone.com/servers/\ +220 the-input-element.html#the-input-element\ +216 http://www.vovone.com/company/\ +206 http://www.vovone.com/web-hosting/\ +206 http://www.vovone.com/more/colocation/\ +205 http://www.vovone.com/more/ssl-certificates/\ +205 http://www.vovone.com/servers/dedicated-servers/\ +204 http://www.vovone.com/more/colocation/private-rackspace/\ +203 http://www.vovone.com/more/solutions/\ +203 http://www.vovone.com/company/technology/\ +203 http://www.vovone.com/servers/managed-servers/\ +203 http://www.vovone.com/domain-names/\ +202 http://www.vovone.com/voip-services/\ +202 http://www.vovone.com/company/conditions/\ +201 http://www.vovone.com/more/reseller-plans/\ +201 http://www.vovone.com/voip-services/cloud-voip/\ +200 http://www.vovone.com/company/promise/\ +200 http://www.vovone.com/voip-services/voip-accounts/\ +200 http://www.vovone.com/domain-names/domain-termination/\ +200 http://www.vovone.com/more/ssl-certificates/ssl-certificate-type/\ +200 http://www.vovone.com/domain-names/transfer-domain-name/\ +200 http://www.vovone.com/company/profile/\ +199 http://www.vovone.com/more/solutions/service-level-agreements/\ +199 http://www.vovone.com/more/solutions/managed-services/\ +199 http://www.vovone.com/support/documentation/\ +199 http://www.vovone.com/voip-services/business-voip/\ +199 http://www.vovone.com/more/ssl-certificates/ssl-certificate-validation/\ +199 http://www.vovone.com/more/ssl-certificates/ssl-certificates-brand/\ +199 
http://www.vovone.com/more/colocation/shared-rackspace/\ +199 http://www.vovone.com/more/reseller-plans/affiliate-plan/\ +199 http://www.vovone.com/support/f-a-q/\ +198 http://www.vovone.com/support/support-desk/\ +198 http://www.vovone.com/voip-services/wholesale-voip/\ +197 http://www.vovone.com/domain-names/redirect-domain-name/\ +197 http://www.vovone.com/company/press/\ +\ +\ +\ + +\b entrypoint http://www.ke.tu-darmstadt.de/lehre/arbeiten: +\b0 \ +\ + ==== robots.txt ====\ +prohibit by robots.txt: 4\ +\ + \ + ==== numberHyperlink ====\ +#Hyperlinks Website\ +405 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1625\ +120 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1677\ +107 http://www.tu-darmstadt.de\ +77 http://www.informatik.tu-darmstadt.de\ +71 http://www.ke.tu-darmstadt.de\ +61 http://www.ke.tu-darmstadt.de/bibtex/authors/show/875\ +46 http://www.ke.tu-darmstadt.de/lehre\ +46 http://www.ke.tu-darmstadt.de/news\ +41 http://www.ke.tu-darmstadt.de/bibtex/authors/show/708\ +41 http://www.ke.tu-darmstadt.de/bibtex/search\ +40 http://www.ke.tu-darmstadt.de/de/studierende/studienbuero/ansprechpartner-studienbuero/\ +40 http://www.ke.tu-darmstadt.de/bibtex/export\ +39 http://www.informatik.tu-darmstadt.de/de/aktuelles/veranstaltungentermine/\ +39 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#proceedings\ +38 http://www.ke.tu-darmstadt.de/de/intern/index/\ +38 http://www.ke.tu-darmstadt.de/de/studierende/studiendekanat/ansprechpartner/\ +37 http://www.ke.tu-darmstadt.de/bibtex/publications\ +37 http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/vortrag-ueber-fitweltweit-programm-des-daad-1/\ +36 http://www.ke.tu-darmstadt.de/resources\ +36 http://www.ke.tu-darmstadt.de/bibtex/topics/single/77\ +36 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1849\ +35 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type\ +34 
http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/smarte-spione/\ +34 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/recent\ +33 http://www.ke.tu-darmstadt.de/de/fachbereich/dekanat/\ +33 http://www.ke.tu-darmstadt.de/de/fachbereich/bilder/absolventenfeier-november-2012/begruessung/\ +33 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#inproceedings\ +33 http://www.ke.tu-darmstadt.de/research\ +31 http://www.ke.tu-darmstadt.de/de/forschung/netzwerkpartner/\ +29 http://www.ke.tu-darmstadt.de/de/aktuelles/newsletter-an-und-abmeldung/\ +29 http://www.ke.tu-darmstadt.de/bibtex/authors/show/702\ +29 http://www.ke.tu-darmstadt.de/projects\ +29 http://www.ke.tu-darmstadt.de/bibtex/topics/single/33\ +29 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#incollection\ +28 http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/eine-kultur-der-privatsphaere-im-internet/\ +28 http://www.ke.tu-darmstadt.de/bibtex/topics\ +28 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#book\ +27 http://www.ke.tu-darmstadt.de/de/aktuelles/neuigkeiten/\ +26 http://www.ke.tu-darmstadt.de/bibtex/authors/show/3036\ +25 http://www.ke.tu-darmstadt.de/bibtex/authors/show/2370\ +24 http://www.ke.tu-darmstadt.de/de/aktuelles/preise-und-auszeichnungen/\ +24 http://www.ke.tu-darmstadt.de/staff\ +24 http://www.ke.tu-darmstadt.de/impressum\ +24 http://www.ke.tu-darmstadt.de/de/studierende/news-fuer-studierende/\ +24 http://www.ke.tu-darmstadt.de/publications\ +23 http://www.ke.tu-darmstadt.de/bibtex/authors/show/2365\ +23 http://www.ke.tu-darmstadt.de/termine\ +23 http://www.ke.tu-darmstadt.de/de/ehemalige/alumni-portal-der-tu-darmstadt/\ +23 http://www.ke.tu-darmstadt.de/de/ehemalige/\ +22 http://www.tu-darmstadt.de/\ +\ + \ + ==== numberHyperlinksPerPage ====\ +#HyperlinksToPage Website\ +3528 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1625\ +915 http://www.tu-darmstadt.de\ +904 
http://www.ke.tu-darmstadt.de/bibtex/authors/show/1677\ +635 de/aktuelles/neuigkeiten/\ +577 de/fachbereich/dekanat/\ +575 de/fachbereich/bilder/absolventenfeier-november-2012/begruessung/\ +528 http://www.informatik.tu-darmstadt.de\ +499 http://www.ke.tu-darmstadt.de\ +490 de/aktuelles/newsletter-an-und-abmeldung/\ +482 de/forschung/netzwerkpartner/\ +481 http://www.ke.tu-darmstadt.de/bibtex/topics/single/33\ +474 de/studierende/studiendekanat/ansprechpartner/\ +468 de/studierende/studienbuero/ansprechpartner-studienbuero/\ +452 de/intern/index/\ +450 http://www.ke.tu-darmstadt.de/bibtex/authors/show/875\ +444 http://www.ke.tu-darmstadt.de/bibtex/topics/single/77\ +434 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/recent\ +434 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type\ +433 javascript:this.print()\ +429 javascript:fontsize('reset')\ +429 javascript:fontsize('inkrement')\ +429 javascript:fontsize('dekrement')\ +424 http://www.ke.tu-darmstadt.de/bibtex/search\ +424 http://www.ke.tu-darmstadt.de/bibtex/topics\ +424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Proceedings\ +424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Book\ +424 http://www.ke.tu-darmstadt.de/bibtex/publications\ +424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Inproceedings\ +424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Incollection\ +424 http://www.ke.tu-darmstadt.de/bibtex/export\ +412 de/aktuelles/neuigkeiten/neuigkeiten/artikel/smarte-spione/\ +408 de/aktuelles/neuigkeiten/neuigkeiten/artikel/eine-kultur-der-privatsphaere-im-internet/\ +408 de/aktuelles/neuigkeiten/neuigkeiten/artikel/vortrag-ueber-fitweltweit-programm-des-daad-1/\ +405 \ +382 http://www.ke.tu-darmstadt.de/bibtex/authors/show/708\ +369 de/fachbereich/\ +352 de/fachbereich/ehrungen-und-auszeichnungen/alwin-walther-medaille/\ +351 de/fachbereich/kontakt-und-anfahrt/\ +351 de/fachbereich/personen/\ +350 
de/fachbereich/professuren-und-gruppenleitungen/\ +350 de/fachbereich/ueber-den-fachbereich/\ +350 de/fachbereich/ausschuesse-gremien-und-kommissionen/\ +349 http://www.informatik.tu-darmstadt.de/index.php?id=40\ +336 http://www.ke.tu-darmstadt.de/bibtex/authors/show/702\ +330 http://www.informatik.tu-darmstadt.de/index.php?id=1894\ +306 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1849\ +302 http://www.ke.tu-darmstadt.de/news\ +298 de/tu/\ +277 http://www.tu-darmstadt.de/\ +264 #top\ +\ + \ + ==== url queue ====\ +\ + \ + ==== language distribution ====\ +Language Number of occurences\ +de 329\ +en 576\ +es 94\ +\ +\ + +\b entrypoint http://www.spiegel.de:\ +\ +\ + +\b0 ==== robots.txt ====\ +prohibit by robots.txt: 115\ +\ + \ + ==== numberHyperlink ====\ +#Hyperlinks Website\ +43 https://www.amazon.de/b\ +38 http://www.amazon.de/spiegel\ +28 http://tv.adobe.com\ +21 http://tvprogramm.spiegel.de/\ +19 http://www.spiegel.de/\ +18 https://service.spiegel.de\ +18 http://www.spiegel.de/spiegel/spiegelgeschichte/index-2013-2.html\ +17 http://www.spiegel.de/spiegel/deinspiegel/index-2013-6.html\ +16 https://www.ebook.de/de/category/61110/unsere_vorteile.html\ +16 http://www.spiegel.de\ +15 http://www.spiegel.de/shop\ +14 http://www.shopbop.com/gp/help/customer/display.html\ +14 http://www.manager-magazin.de/\ +14 http://www.spiegel.de/spiegel/spiegelwissen/index-2013-2.html\ +13 http://www.spiegel.de/spiegel/\ +13 http://www.spiegel.de/wissenschaft/\ +12 http://wetter.spiegel.de/spiegel/\ +10 https://www.ebook.de/de/category/59475/kontakt_impressum.html\ +10 http://abo.spiegel.de/go/place!abosspsc\ +9 https://www.amazon.de/gp/cart/view.html\ +9 https://www.ebook.de/de/category/59424/hilfe.html\ +9 http://www.amazon.de/gp/feature.html\ +9 http://www.spiegel.de/sport/\ +9 https://media.libri.de/de/category/58974/sony_reader.html\ +9 http://www.spiegelgruppe-nachdrucke.de\ +9 https://www.ebook.de/de/category/61132/newsletter.html\ +9 
http://www.spiegelwissen.tv/flashsite/index.html\ +8 http://www.spiegel.de/hilfe/\ +8 http://abo.spiegel.de/?et_cid=7&et_lid=1946&et_sub=heftkasten\ +8 https://www.amazon.es/b\ +8 https://www.ebook.de/de/category/59663/gutscheine_kaufen.html\ +8 https://www.ebook.de/de/category/52122/ebooks.html\ +8 http://www.spiegel.de/politik/\ +8 https://www.ebook.de/de/account/wishlist/add\ +8 https://www.amazon.de/pc-mac-downloads-herunterladen-digital-steam/b\ +8 http://www.spiegel.de/spiegel/unispiegel/\ +8 http://www.spiegel.de/unispiegel/studium/tools-hier-werden-sie-geholfen-a-640620.html\ +8 http://www.harvardbusinessmanager.de/\ +7 http://www.amazon.co.jp/\ +7 https://www.ebook.de/de/category/63461/ebooks_verschenken.html\ +7 https://www.ebook.de/de/category/browse\ +7 http://kdp.amazon.de/\ +7 http://abo.spiegel.de/?et_cid=7&et_lid=1946&et_sub=aboreiter\ +7 http://www.spiegel-qc.de/selbstbuchungstool\ +7 https://media.libri.de/de/category/52124/buecher.html\ +7 http://www.spiegel-qc.de/\ +7 https://www.ebook.de/de/magazine\ +7 https://www.ebook.de\ +7 http://www.spiegel.de/video/\ +7 http://www.libri.de/shop/action/magazine/6/ebooks_reader.html\ +\ + \ + ==== numberHyperlinksPerPage ====\ +#HyperlinksToPage Website\ +6966 #\ +1507 /\ +1027 \ +961 http://www.amazon.de/spiegel\ +671 http://tv.adobe.com/product/photoshop/\ +640 \{\{url\}\}\ +640 /gp/digital/fiona/manage\ +598 javascript:void(0);\ +597 http://tv.adobe.com/product/cs-production-premium/\ +586 http://www.spiegel.de/\ +575 http://www.spiegel.de/spiegel/\ +509 http://wetter.spiegel.de/spiegel/\ +504 <#=item.url #>\ +492 http://www.spiegel.de/shop\ +468 http://www.spiegel.de/spiegel/spiegelwissen/index-2013-2.html\ +468 http://www.spiegel.de/spiegel/deinspiegel/index-2013-6.html\ +468 http://www.spiegel.de/spiegel/spiegelgeschichte/index-2013-2.html\ +462 /gp/site-directory\ +460 /gp/cart/view.html?ie=UTF8&hasWorkingJavascript=1\ +441 /gp/registry/wishlist\ +435 /clouddrive\ +411 
http://www.spiegel.de/sptv/magazin/\ +385 /gp/prime\ +382 /product/photoshop/\ +352 /gp/dmusic/mp3/player\ +323 https://www.ebook.de/de/account/ebookHistory\ +316 https://www.ebook.de/de/account/create/singlestep\ +311 http://tv.adobe.com/product/cs-design-premium/\ +311 http://forum.spiegel.de/\ +310 /product/illustrator/\ +308 http://tv.adobe.com/product/creative-cloud/\ +303 http://www.spiegel.de/video/\ +303 http://www.spiegel-qc.de/\ +297 /video/\ +296 http://www.spiegel.de/schlagzeilen/\ +294 http://www.quality-abo.de/\ +293 http://www.spiegelgruppe.de/\ +293 http://www.buchreport.de/\ +288 http://www.spiegelgruppe-nachdrucke.de\ +277 /de/category/60575/libri_de_ist_jetzt_ebook_de.html\ +276 http://www.manager-magazin.de/\ +274 http://tv.adobe.com/product/premiere-pro/\ +267 http://tv.adobe.com/product/after-effects/\ +264 http://www.harvardbusinessmanager.de/\ +262 /product/premiere-pro/\ +260 /MP3-Musik-Downloads/b?ie=UTF8&node=77195031\ +260 http://tvprogramm.spiegel.de/\ +259 /pc-mac-downloads-herunterladen-digital-steam/b?ie=UTF8&node=1333619031\ +259 /spiegel/\ +256 /Navigationssystems-Car-HiFi-Autoradios/b?ie=UTF8&node=236861011\ +\ + \ + ==== url queue ====\ +\ + \ + ==== language distribution ====\ +Language Number of occurences\ +de 623\ +en 246\ +es 130 +\b \ + +\b0 \ +\ +\ +} \ No newline at end of file