slides update

This commit is contained in:
Michael Scholz 2013-05-19 22:19:45 +02:00
parent a063983767
commit c13047919a
43 changed files with 2398 additions and 89 deletions

View File

@ -0,0 +1,10 @@
01 es
02 de
03 en
04 en
05 de
06 es
07 es
08 de
09 en
10 es

View File

@ -0,0 +1,510 @@
# -*- coding: utf-8 -*-
import urllib2
import sys
import random
import robotparser
import re
import math
from sgmllib import SGMLParser
import sgmllib
from urlparse import urlparse
from urlparse import urljoin
import matplotlib.pyplot as plt
import time
from termcolor import colored
from collections import Counter
'''
VN:
- Plagiats-Checker fertig
- Sprachprüfer fertig
TODO:
- DONE canonize urls -> canonize? slides? -> remember last host -> no magic here -> even using ugly global
- DONE with getNextUrlToVisit():
server timeout -> safe crawled host, set timeout for crawled host
- statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
'''
# crawler attributes
entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
entrypoint = "http://www.spiegel.de" # german website (overrides the line above)
#entrypoint = "http://www.cnn.com" # english website
#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website
#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 0 # minimum seconds between two requests to the same host (politeness delay)
visitOnlyTuSites = False;
#some variables
prohibitedSites = 0 # number of urls rejected by robots.txt
visitedUrls = [] # save already visited urls, so no url will be visited more than once
pages = {} # downloaded pages, keyed by url
numberHyperlink = {} # save number of hyperlinks pointing to each url
numberHyperlinksPerPage = {} # save number of hyperlinks per href (how often it occurred)
visitedHostsWithTimestamp = {} # save visited hosts with timestamp of last visit
robotsTxtResults = {} # cache of parsed robots.txt per robots url (None = not available)
lasthost = '' # last seen host, used to resolve relative links
def normalizeMap(m):
    """Normalize the values of dict *m* in place so that they sum to 1.

    Leaves *m* untouched when the value sum is 0 (e.g. an empty map);
    the original code raised ZeroDivisionError in that case.
    """
    s = sum(m.values())
    if s == 0:
        return  # nothing to normalize
    for k in m:
        m[k] = float(m[k]) / float(s)
def subtractDicts(dict1, dict2):
    """Per-key count difference of two word-count dicts.

    Keys of dict1 map to max(0, dict1[k] - dict2[k]); keys that appear
    only in dict2 keep their dict2 count unchanged.
    """
    result = {}
    for key in set(dict1) | set(dict2):
        if key not in dict1:
            result[key] = int(dict2[key])
        elif key not in dict2:
            result[key] = int(dict1[key])
        else:
            result[key] = max(0, int(dict1[key]) - int(dict2[key]))
    return result
def countWords(words):
    """Return a plain dict mapping each word to its number of occurrences.

    Uses collections.Counter (already imported at module level) instead
    of the hand-rolled counting loop; behavior is unchanged.
    """
    return dict(Counter(words))
def blockedByRobotsTxt(url):
o = urlparse(url)
robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"
if url in robotsTxtResults:
rp = robotsTxtResults[robotsUrl]
else:
rp = robotparser.RobotFileParser()
rp.set_url(robotsUrl)
try:
rp.read()
robotsTxtResults[robotsUrl] = rp
except:
robotsTxtResults[robotsUrl] = None # robots.txt doesn't exist
if robotsTxtResults[robotsUrl] == None:
return False # return false if robots.txt doesn't exist
else:
if rp.can_fetch("*", url):
return False
else:
print colored("-> not allowed to visit :( "+url, "red")
global prohibitedSites
prohibitedSites += 1
return True
def canonicalUrl(url):
    """Canonicalize one href into a list of 0 or 1 crawlable urls.

    Returns [url] when the url passes the filters, else [].  Filters:
    only plain http, no "pdf" in the path, no ".." anywhere, and the
    path must either contain ".html" or have no file extension at all.
    Side effect: remembers the host of every absolute url in the global
    ``lasthost`` so that scheme-less (relative) links can be resolved
    against the most recently seen host.
    """
    global lasthost
    url = url.lower().replace(" ", "")
    o = urlparse(url)
    if o.netloc != '':
        # absolute url: remember its host for later relative links
        lasthost = o.scheme + '://' + o.netloc
        if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
            if ".html" in o.path:
                return [url]
            if "." not in o.path:
                return [url]
        return []
    else:
        if o.scheme=='':
            # relative link: resolve against the last crawled host
            # NOTE(review): query/fragment of the relative link are dropped
            # because only o.path is joined -- confirm this is intended.
            return [urljoin(lasthost,o.path)]
        else:
            return []
def getNextUrlToVisit():
    """Pick a random url from the frontier (extractor.urls) that may be visited now.

    Recursively retries when the candidate is off-site (if
    visitOnlyTuSites is set), blocked by robots.txt, already visited, or
    its host was contacted less than timeBetweenSameHost seconds ago.
    Mutates the module-level bookkeeping dicts/lists as a side effect.
    NOTE(review): recursion can become very deep (or spin) when only
    throttled hosts remain in the queue -- confirm this is acceptable.
    """
    url = random.choice(extractor.urls)
    if visitOnlyTuSites:
        if 'tu-darmstadt' not in urlparse(url).netloc:
            extractor.urls.remove(url)
            return getNextUrlToVisit()
    # count how often this url was drawn / linked
    if url in numberHyperlink:
        numberHyperlink[url] += 1
    else:
        numberHyperlink[url] = 1
    host = urlparse(url).netloc
    ## check if url is blocked by robots.txt or was already visited ##
    if blockedByRobotsTxt(url) or url in visitedUrls:
        extractor.urls.remove(url)
        return getNextUrlToVisit()
    ## check if host got a timeout (politeness delay) ##
    if host in visitedHostsWithTimestamp:
        timestamp = visitedHostsWithTimestamp[host]
        secondsSinceLastVisit = int(time.time()) - timestamp
        if secondsSinceLastVisit >= timeBetweenSameHost:
            visitedHostsWithTimestamp[host] = int(time.time())
            visitedUrls.append(url)
            extractor.urls.remove(url)
            return url
        else:
            secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "magenta")
            return getNextUrlToVisit()
    else:
        # first visit of this host
        visitedHostsWithTimestamp[host] = int(time.time())
        visitedUrls.append(url)
        extractor.urls.remove(url)
        return url
class URLLister(SGMLParser):
    """SGML parser that collects canonicalized hrefs of <a> tags in self.urls."""
    ## fix SGMLParseError: reset the parser state without clearing the url queue
    def resetParser(self):
        SGMLParser.reset(self)
    def reset(self):
        # full reset: parser state AND collected url queue
        SGMLParser.reset(self)
        self.urls = []
    def start_a(self, attrs):
        # called by SGMLParser for every <a ...> tag
        href = [v for k, v in attrs if k=='href']
        if href:
            url = canonicalUrl(href[0])
            self.urls.extend(url)
            # count number of times this href occurred (module-level stats)
            if href[0] in numberHyperlinksPerPage:
                numberHyperlinksPerPage[href[0]] += 1
            else:
                numberHyperlinksPerPage[href[0]] = 1
if __name__ == "__main__":
    ## seed the frontier with the links of the entry page ##
    page = urllib2.urlopen(entrypoint, timeout = 5)
    print "currently visited url: "+entrypoint
    extractor = URLLister()
    extractor.feed(page.read())
    page.close()
    i = 1
    ## crawl until numberOfPagesToCrawl pages were downloaded successfully ##
    while(i <= numberOfPagesToCrawl):
        url = getNextUrlToVisit()
        print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") currently visiting url: "+url, "blue")
        try:
            page = urllib2.urlopen(url, timeout = 6)
            pageContent = page.read()
            pageContent = pageContent.replace('<![CDATA[', '&lt;![CDATA[') ## bugfix for SGMLParser
            page.close()
            extractor.feed(pageContent)
            pages[url] = pageContent
            i += 1
        # exception handling: failed downloads do not count towards i
        except urllib2.HTTPError, err:
            if err.code == 404:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
                pass
            if err.code == 400:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
                pass
            if err.code == 403:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
                pass
        except urllib2.URLError:
            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
            pass
        except sgmllib.SGMLParseError:
            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
            extractor.resetParser()
            pass
        except:
            print "Unexpected error:", sys.exc_info()[0]
            pass
    extractor.close()
    print "\n \n ==== robots.txt ===="
    print "prohibit by robots.txt: "+str(prohibitedSites)
    ## print table number hyperlinks per website ##
    print "\n \n ==== numberHyperlink ===="
    print "#Hyperlinks \t Website"
    # histogram: how many urls were linked u times
    linkCount1 = {}
    for u in numberHyperlink.values():
        if u not in linkCount1:
            linkCount1[u] = 1
        else:
            linkCount1[u] += 1
    xValues1 = []
    yValues1 = []
    for u in linkCount1:
        xValues1.append(u)
        yValues1.append(linkCount1[u])
    plt.plot(xValues1, yValues1)
    plt.xlabel('Haeufigkeiten des Auftretens')
    plt.ylabel('Anzahl der URLs')
    plt.show()
    ## print table number hyperlinks to page ##
    print "\n \n ==== Anzahl URLs pro Seite ===="
    print "#Anzahl URLs pro Seite"
    linkCount2 = {}
    for u in numberHyperlinksPerPage.values():
        if u not in linkCount2:
            linkCount2[u] = 1
        else:
            linkCount2[u] += 1
    xValues2 = []
    yValues2 = []
    for u in linkCount2:
        xValues2.append(u)
        yValues2.append(linkCount2[u])
    '''plt.plot(xValues2, yValues2)
plt.xlabel('Anzahl der Hyperlinks pro Seite')
plt.ylabel('Anzahl der URLs')
#plt.xscale('log')
#plt.yscale('log')
plt.show()'''
    print "\n \n ==== url queue ===="
    for u in extractor.urls:
        pass
        #print u
    threshold = 0.9 # how much similar must 2 urls be to be logged
    #print "\n \n ==== copied content probability (>= " + str(threshold*100) + " %) ===="
    #print "URL1 \t URL2 \t Similarity in %"
    # wordcounts per page
    wordCountsByPage = {}
    charsByPage = {}
    ## count words in all pages ##
    for url in pages:
        tmp = re.sub("[\n\r]", "", pages[url]) # remove all line breaks
        tmp = re.sub("<\s*script.*?>.+?<\s*\/script.*?>", "", tmp) # remove all scripts
        tmp = re.sub("<\s*style.*?>.+?<\s*\/style.*?>", "", tmp) # remove all styles
        tmp = re.sub("&.+?;", "", tmp) # remove all html entities
        tmp = re.sub("<.+?>", "", tmp) # remove all html tags
        tmp = re.sub("\d", "", tmp) # remove all numbers
        words = re.findall("(\w+)", tmp) # split words
        words = [x.lower() for x in words] # all words to lower case
        words = [s for s in words if len(s) > 4 and len(s) <= 10] # keep medium-length words only
        wordCountsByPage[url] = countWords(words)
        # NOTE(review): "[A-za-z]" also matches [ \ ] ^ _ ` -- probably meant "[A-Za-z]"
        chars = re.findall("[A-za-z]", tmp); # find all characters
        chars = [x.lower() for x in chars] # all characters to lower case
        charsByPage[url] = chars
    ## calculate wordcount deltas and print double-content sites ##
    wordCountDeltas = {}
    for url1 in wordCountsByPage:
        for url2 in wordCountsByPage:
            if url1 == url2:
                continue
            if url1 not in wordCountDeltas:
                wordCountDeltas[url1] = {}
            if url2 in wordCountDeltas[url1]: # do it once only
                continue
            wordCounts1 = wordCountsByPage[url1]
            wordCounts2 = wordCountsByPage[url2]
            sum1 = sum(wordCounts1.values())
            if sum1 == 0:
                continue
            #print "calculating deltas of url1: " + url1 + " -- url2: " + url2
            deltaWordCounts = subtractDicts(wordCounts1, wordCounts2)
            # relative amount of words of url1 NOT covered by url2
            wordCountDeltas[url1][url2] = math.fabs(float(sum(deltaWordCounts.values())) / float(sum1))
            if 1 - wordCountDeltas[url1][url2] > threshold:
                #print url1 + " \t " + url2 + " \t " + str((1 - wordCountDeltas[url1][url2]) * 100)
                pass
    ## determine the sites' languages ##
    spanish = 'es'
    english = 'en'
    german = 'de'
    pageLanguages = {}
    # reference letter frequencies (in percent) per language
    lettersByLanguage = {}
    lettersByLanguage[spanish] = {
        'e' : 13.68,
        'a' : 12.53,
        'o' : 8.68,
        's' : 7.98,
        'r' : 6.87,
        'n' : 6.71,
        'i' : 6.25,
        'd' : 5.86,
        'l' : 4.97,
        'c' : 4.68,
        't' : 4.63,
        'u' : 3.93,
        'm' : 3.15,
        'p' : 2.51,
        'b' : 1.42,
        'g' : 1.01,
        'v' : 0.90,
        'y' : 0.90,
        'q' : 0.88,
        'h' : 0.70,
        'f' : 0.69,
        'z' : 0.52,
        'j' : 0.44,
        'x' : 0.21,
        'w' : 0.02,
        'k' : 0.01
    }
    lettersByLanguage[english] = {
        'e' : 12.70,
        't' : 9.06,
        'a' : 8.16,
        'o' : 7.50,
        'i' : 6.96,
        'n' : 6.74,
        's' : 6.32,
        'h' : 6.09,
        'r' : 5.99,
        'd' : 4.25,
        'l' : 4.03,
        'c' : 2.78,
        'u' : 2.76,
        'm' : 2.41,
        'w' : 2.36,
        'f' : 2.23,
        'g' : 2.02,
        'y' : 1.97,
        'p' : 1.93,
        'b' : 1.49,
        'v' : 0.98,
        'k' : 0.77,
        'j' : 0.15,
        'x' : 0.15,
        'q' : 0.10,
        'z' : 0.07
    }
    lettersByLanguage[german] = {
        'e' : 17.4,
        'n' : 9.78,
        'i' : 7.55,
        's' : 7.27,
        'r' : 7.00,
        'a' : 6.51,
        't' : 6.15,
        'd' : 5.08,
        'h' : 4.76,
        'u' : 4.35,
        'l' : 3.44,
        'c' : 3.06,
        'g' : 3.01,
        'o' : 2.59,
        'm' : 2.53,
        'b' : 1.89,
        'w' : 1.89,
        'f' : 1.66,
        'k' : 1.21,
        'z' : 1.13,
        'v' : 0.85,
        'p' : 0.67,
        'j' : 0.27,
        'y' : 0.04,
        'x' : 0.03,
        'q' : 0.02
    }
    # normalize maps so that each language's frequencies sum to 1
    normalizeMap(lettersByLanguage[spanish])
    normalizeMap(lettersByLanguage[english])
    normalizeMap(lettersByLanguage[german])
    languageCounts = {}
    for url in charsByPage:
        tokens = charsByPage[url]
        tokenCounts = dict(Counter(tokens))
        tokenSum = sum(tokenCounts.values())
        # Calculating the squared error
        rankings = {}
        matches = {}
        for token in tokenCounts:
            for key2 in lettersByLanguage:
                if token not in lettersByLanguage[key2]:
                    continue
                p = float(lettersByLanguage[key2][token]) * 100
                if p >= 0:
                    if key2 not in rankings:
                        rankings[key2] = 0
                        matches[key2] = 0
                    # calculate the squared error from observed and reference frequencies
                    rankings[key2] += math.pow(math.fabs(tokenCounts[token] * 100 / tokenSum - p), 2)
                    matches[key2] += 1
        # Resulting language has the minimal mean squared error
        minRanking = -1
        language = None
        for key in rankings:
            rankings[key] /= matches[key]
            if minRanking == -1 or rankings[key] < minRanking:
                minRanking = rankings[key]
                language = key
        if language != None:
            pageLanguages[url] = language
            if language not in languageCounts:
                languageCounts[language] = 1
            else:
                languageCounts[language] += 1
    print "\n \n ==== language distribution ===="
    print "Language \t Number of occurences"
    for lang in languageCounts:
        print lang + " \t " + str(languageCounts[lang])

View File

@ -0,0 +1,168 @@
# coding: utf-8
# Copyright (c) 2008-2011 Volvox Development Team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# Author: Konstantin Lepa <konstantin.lepa@gmail.com>
"""ANSII Color formatting for output in terminal."""
from __future__ import print_function
import os
__ALL__ = [ 'colored', 'cprint' ]
VERSION = (1, 1, 0)
# Attribute name -> ANSI SGR code 1..8.  The '' placeholders occupy the
# unused codes and are deleted right after construction.
ATTRIBUTES = dict(
    list(zip([
        'bold',
        'dark',
        '',
        'underline',
        'blink',
        '',
        'reverse',
        'concealed'
        ],
        list(range(1, 9))
        ))
    )
del ATTRIBUTES['']
# Background color name -> ANSI SGR code 40..47.
HIGHLIGHTS = dict(
    list(zip([
        'on_grey',
        'on_red',
        'on_green',
        'on_yellow',
        'on_blue',
        'on_magenta',
        'on_cyan',
        'on_white'
        ],
        list(range(40, 48))
        ))
    )
# Foreground color name -> ANSI SGR code 30..37.
COLORS = dict(
    list(zip([
        'grey',
        'red',
        'green',
        'yellow',
        'blue',
        'magenta',
        'cyan',
        'white',
        ],
        list(range(30, 38))
        ))
    )
# Resets all colors/attributes back to the terminal default.
RESET = '\033[0m'
def colored(text, color=None, on_color=None, attrs=None):
    """Wrap *text* in ANSI escape sequences.

    Available text colors:
        red, green, yellow, blue, magenta, cyan, white.
    Available text highlights:
        on_red, on_green, on_yellow, on_blue, on_magenta, on_cyan, on_white.
    Available attributes:
        bold, dark, underline, blink, reverse, concealed.
    Coloring is skipped entirely when the ANSI_COLORS_DISABLED
    environment variable is set.

    Example:
        colored('Hello, World!', 'red', 'on_grey', ['blue', 'blink'])
        colored('Hello, World!', 'green')
    """
    if os.getenv('ANSI_COLORS_DISABLED') is not None:
        return text
    escape = '\033[%dm%s'
    if color is not None:
        text = escape % (COLORS[color], text)
    if on_color is not None:
        text = escape % (HIGHLIGHTS[on_color], text)
    for attr in (attrs or []):
        text = escape % (ATTRIBUTES[attr], text)
    return text + RESET
def cprint(text, color=None, on_color=None, attrs=None, **kwargs):
    """Print colorized *text*, forwarding remaining kwargs to print()."""
    print(colored(text, color, on_color, attrs), **kwargs)
if __name__ == '__main__':
    # Visual self-test: print samples of every color, highlight and attribute.
    print('Current terminal type: %s' % os.getenv('TERM'))
    print('Test basic colors:')
    cprint('Grey color', 'grey')
    cprint('Red color', 'red')
    cprint('Green color', 'green')
    cprint('Yellow color', 'yellow')
    cprint('Blue color', 'blue')
    cprint('Magenta color', 'magenta')
    cprint('Cyan color', 'cyan')
    cprint('White color', 'white')
    print(('-' * 78))
    print('Test highlights:')
    cprint('On grey color', on_color='on_grey')
    cprint('On red color', on_color='on_red')
    cprint('On green color', on_color='on_green')
    cprint('On yellow color', on_color='on_yellow')
    cprint('On blue color', on_color='on_blue')
    cprint('On magenta color', on_color='on_magenta')
    cprint('On cyan color', on_color='on_cyan')
    cprint('On white color', color='grey', on_color='on_white')
    print('-' * 78)
    print('Test attributes:')
    cprint('Bold grey color', 'grey', attrs=['bold'])
    cprint('Dark red color', 'red', attrs=['dark'])
    cprint('Underline green color', 'green', attrs=['underline'])
    cprint('Blink yellow color', 'yellow', attrs=['blink'])
    cprint('Reversed blue color', 'blue', attrs=['reverse'])
    cprint('Concealed Magenta color', 'magenta', attrs=['concealed'])
    cprint('Bold underline reverse cyan color', 'cyan',
            attrs=['bold', 'underline', 'reverse'])
    cprint('Dark blink concealed white color', 'white',
            attrs=['dark', 'blink', 'concealed'])
    print(('-' * 78))
    print('Test mixing:')
    cprint('Underline red on grey color', 'red', 'on_grey',
            ['underline'])
    cprint('Reversed green on red color', 'green', 'on_red', ['reverse'])

View File

@ -0,0 +1,5 @@
This is the keaddon add-on. It contains:
* A program (lib/main.js).
* A few tests.
* Some meager documentation.

View File

@ -0,0 +1,26 @@
var text = "";
var cleantext = "";
var paragraphs = document.getElementsByTagName('p');
var open = '<';
var close = '>';
for(var i=0; i<paragraphs.length; i++) {
text += paragraphs[i].innerHTML;
}
var doAppend = true;
var tmp = "";
for(var i=0; i<text.length; i++) {
tmp = text.charAt(i);
if( tmp == open ) {
doAppend = false;
}
if(doAppend) {
cleantext += tmp;
}
if( tmp == close ) {
doAppend = true;
}
}
//cleantext = unescape(cleantext);
postMessage(cleantext);

View File

@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="900" height="600"><rect width="900" height="600" fill="#ED2939"/><rect width="600" height="600" fill="#fff"/><rect width="300" height="600" fill="#002395"/></svg>

After

Width:  |  Height:  |  Size: 378 B

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns="http://www.w3.org/2000/svg" width="1000" height="600" viewBox="0 0 5 3">
<desc>Flag of Germany</desc>
<rect id="black_stripe" width="5" height="3" y="0" x="0" fill="#000"/>
<rect id="red_stripe" width="5" height="2" y="1" x="0" fill="#D00"/>
<rect id="gold_stripe" width="5" height="1" y="2" x="0" fill="#FFCE00"/>
</svg>

After

Width:  |  Height:  |  Size: 491 B

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 230 KiB

View File

@ -0,0 +1,10 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 60 30" width="1200" height="600">
<clipPath id="t">
<path d="M30,15 h30 v15 z v15 h-30 z h-30 v-15 z v-15 h30 z"/>
</clipPath>
<path d="M0,0 v30 h60 v-30 z" fill="#00247d"/>
<path d="M0,0 L60,30 M60,0 L0,30" stroke="#fff" stroke-width="6"/>
<path d="M0,0 L60,30 M60,0 L0,30" clip-path="url(#t)" stroke="#cf142b" stroke-width="4"/>
<path d="M30,0 v30 M0,15 h60" stroke="#fff" stroke-width="10"/>
<path d="M30,0 v30 M0,15 h60" stroke="#cf142b" stroke-width="6"/>
</svg>

After

Width:  |  Height:  |  Size: 521 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 760 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 835 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 674 B

View File

@ -0,0 +1,2 @@
The main module is a program that creates a widget. When a user clicks on
the widget, the program loads the mozilla.org website in a new tab.

View File

@ -0,0 +1,4 @@
// ISO 639-1 language codes shared by the add-on modules.
exports.german = 'de';
exports.french = 'fr';
exports.spanish = 'es';
exports.english = 'en';

View File

@ -0,0 +1,65 @@
// Add-on entry point: installs a page-mod that extracts each page's text,
// asks the student module for its language, and switches the toolbar
// widget icon to the matching flag.
var widgets = require("widget");
var pageMod = require("page-mod");
var student = require("student");
var data = require("self").data;
var workers = new Array(); // all attached content-script workers, for cleanup
var mod = null;            // the active PageMod, for cleanup
exports.main = function(options, callback) {
    // Attach the text-extracting content script to every page.
    mod = pageMod.PageMod(
        {
            include: "*",
            contentScriptWhen:"ready",
            contentScriptFile: data.url("./contentScripts/keworker.js"),
            onAttach: function onAttach(worker) {
                worker.on('message', handleMessage);
                workers.push(worker);
            }
        }
    );
    var widget = widgets.Widget(
        {
            id: "ke",
            label: "Knowledge Engineering",
            contentURL: data.url("keicon.png")
        }
    );
    // Receives the extracted page text and updates the widget icon.
    function handleMessage(message) {
        var lang = require("language");
        if(message.length > 0) {
            //TODO: Iconswitch
            var language = student.student(message);
            console.log(language);
            switch(language) {
                case lang.german:
                    widget.contentURL = data.url("./flag/de.png");
                    break;
                case lang.spanish:
                    widget.contentURL = data.url("./flag/es.png");
                    break;
                case lang.english:
                    widget.contentURL = data.url("./flag/en.png");
                    break;
                case lang.french:
                    widget.contentURL = data.url("./flag/fr.png");
                    break;
                default:
                    widget.contentURL = data.url("./keicon.png");
            }
            //TODO: response
        }
    }
    console.log("The add-on is running.");
}
// Destroy the page-mod and all workers when the add-on is disabled.
exports.onUnload = function(reason) {
    if(mod != null) {mod.destroy();}
    for(var i=0; i<workers.length; i++) {
        workers[i].destroy();
    }
}

View File

@ -0,0 +1,8 @@
var lang = require("language");
var util = require("utility");
// Placeholder classifier: always reports German regardless of the text.
// TODO: implement real language detection (utility helpers are ready).
function student(text) {
    return lang.german;
}
exports.student = student;

View File

@ -0,0 +1,82 @@
/*
Count equal array entries and return an object with one property per
unique entry; the value of each property is the number of occurrences.
Properties are inserted in order of decreasing count, so a construct like
    for(var key in obj) {
        console.log(obj[key]);
    }
visits the most frequent entries first.
*/
function countElements(array) {
    var counts = {};
    for (var i = 0; i < array.length; i++) {
        var entry = array[i];
        counts[entry] = counts.hasOwnProperty(entry) ? counts[entry] + 1 : 1;
    }
    var pairs = [];
    for (var key in counts) {
        pairs.push({key: key, value: counts[key]});
    }
    pairs.sort(comparePairs);
    var sorted = {};
    for (var j = 0; j < pairs.length; j++) {
        sorted[pairs[j].key] = pairs[j].value;
    }
    return sorted;
};
/* Comparator for {key, value} pairs: orders by value, descending. */
function comparePairs(a, b) {
    return -(a.value - b.value);
}
/*
Split a text into lowercase tokens at non-word characters, after first
removing common punctuation (periods, commas, quotes, slashes, pipes...).
*/
function tokenize(text) {
    var lowered = text.toLowerCase().replace(/(\.|,|!|\?|'|"|\\|\/|\|)/g, "");
    return lowered.split(/\W/g);
}
/*
Explode the given text into an array of its single characters.
*/
function toCharArray(text) {
    var chars = [];
    var n = text.length;
    for (var pos = 0; pos < n; pos++) {
        chars.push(text[pos]);
    }
    return chars;
}
/*
Build the list of adjacent character pairs (bigrams) of the given text.
Texts shorter than two characters yield an empty array.
*/
function toCharPairs(text) {
    var pairs = [];
    for (var i = 0; i + 1 < text.length; i++) {
        pairs.push(text[i] + text[i + 1]);
    }
    return pairs;
}
exports.countElements = countElements;
exports.tokenize = tokenize;
exports.toCharArray = toCharArray;
exports.toCharPairs = toCharPairs;

View File

@ -0,0 +1,10 @@
{
"name": "keaddon",
"license": "MPL 1.1/GPL 2.0/LGPL 2.1",
"author": "Clemens Dörrhöfer",
"version": "0.1",
"fullName": "keaddon",
"id": "jid0-GN3ivO79cgfs9k4P3lxdo7TPFa4",
"description": "a basic add-on",
"icon": "data/keicon.png"
}

View File

@ -0,0 +1,83 @@
// Jetpack / Add-on SDK unit tests for the keaddon modules.
const main = require("main");
const lang = require("language");
exports.test_test_run = function(test) {
    test.pass("Unit test running!");
};
exports.test_id = function(test) {
    test.assert(require("self").id.length > 0);
};
// Asynchronous test: a request to mozilla.org must answer with 200 OK.
exports.test_url = function(test) {
    require("request").Request({
        url: "http://www.mozilla.org/",
        onComplete: function(response) {
            test.assertEqual(response.statusText, "OK");
            test.done();
        }
    }).get();
    test.waitUntilDone(20000);
};
exports.test_open_tab = function(test) {
    const tabs = require("tabs");
    tabs.open({
        url: "http://www.mozilla.org/",
        onReady: function(tab) {
            test.assertEqual(tab.url, "http://www.mozilla.org/");
            test.done();
        }
    });
    test.waitUntilDone(20000);
};
// Set by the compare helpers below to describe the last mismatch.
var errormessage = "";
exports.test_util_countElements = function(test) {
    const util = require("utility");
    test.assert(compareObjects(util.countElements(["du", "du", "hallo", "hallo", "du"]),{"hallo":2, "du":3}),errormessage);
};
exports.test_util_toCharArray = function(test) {
    const util = require("utility");
    test.assert(compareArrays(util.toCharArray("test"), ["t","e","s","t"]), errormessage);
};
exports.test_util_toCharPairs = function(test) {
    const util = require("utility");
    test.assert(compareArrays(util.toCharPairs("mainz"),["ma", "ai", "in", "nz"]), errormessage);
};
exports.test_util_tokenize = function(test) {
    const util = require("utility");
    test.assert(compareArrays(util.tokenize("Dem Igel geht's gut."),["dem","igel","gehts","gut"]), errormessage);
};
exports.test_student_student = function(test) {
    const student = require("student");
    var text = "blubber";
    test.assertEqual(student.student(text), lang.german, "Geht nicht weil.");
};
// Shallow comparison: every property of a must equal the same property of b.
// NOTE(review): one-sided -- extra keys present only in b are not detected.
function compareObjects(a,b) {
    for(var key in a) {
        if( a[key] != b[key] ) {
            return false;
        }
    }
    return true;
};
// Element-wise array comparison; records a reason in errormessage on failure.
function compareArrays(a,b) {
    if (a.length != b.length) {
        errormessage = "Arrays of unequal size";
        return false
    }
    for(var i=0; i<a.length; i++) {
        if (a[i] != b[i]) {
            errormessage = a[i] + " != " + b[i];
            return false;
        }
    }
    return true;
};

View File

@ -34,7 +34,7 @@ entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website
#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 2 # 5 sec
timeBetweenSameHost = 0 # 5 sec
visitOnlyTuSites = True;
@ -247,27 +247,52 @@ if __name__ == "__main__":
## print table number hyperlinks per website ##
print "\n \n ==== numberHyperlink ===="
print "#Hyperlinks \t Website"
keys = numberHyperlink.keys()
keys.sort( lambda x,y: cmp(numberHyperlink[y], numberHyperlink[x]) ) # sort keys
i = 0
for u in keys:
pass
if i < 50:
print str(numberHyperlink[u])+"\t \t \t"+u
i += 1
linkCount1 = {}
for u in numberHyperlink.values():
if u not in linkCount1:
linkCount1[u] = 1
else:
linkCount1[u] += 1
xValues1 = []
yValues1 = []
for u in linkCount1:
xValues1.append(u)
yValues1.append(linkCount1[u])
plt.plot(xValues1, yValues1)
plt.xlabel('Haeufigkeiten des Auftretens')
plt.ylabel('Anzahl der URLs')
plt.show()
## print table number hyperlinks to page ##
print "\n \n ==== numberHyperlinksPerPage ===="
print "#HyperlinksToPage \t Website"
keys = numberHyperlinksPerPage.keys()
keys.sort( lambda x,y: cmp(numberHyperlinksPerPage[y], numberHyperlinksPerPage[x]) ) # sort keys
i = 0
for u in keys:
pass
if i < 50:
print str(numberHyperlinksPerPage[u])+"\t \t \t"+u
i += 1
print "\n \n ==== Anzahl URLs pro Seite ===="
print "#Anzahl URLs pro Seite"
linkCount2 = {}
for u in numberHyperlinksPerPage.values():
if u not in linkCount2:
linkCount2[u] = 1
else:
linkCount2[u] += 1
xValues2 = []
yValues2 = []
for u in linkCount2:
xValues2.append(u)
yValues2.append(linkCount2[u])
'''plt.plot(xValues2, yValues2)
plt.xlabel('Anzahl der Hyperlinks pro Seite')
plt.ylabel('Anzahl der URLs')
#plt.xscale('log')
#plt.yscale('log')
plt.show()'''
print "\n \n ==== url queue ===="
for u in extractor.urls:

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

View File

@ -47,26 +47,38 @@
\@writefile{nav}{\headcommand {\beamer@subsectionpages {2}{8}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{9}{9/9}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {9}{9}}}
\@writefile{toc}{\beamer@sectionintoc {3}{2. Aufgabe}{10}{0}{3}}
\@writefile{nav}{\headcommand {\sectionentry {3}{2. Aufgabe}{10}{2. Aufgabe}{0}}}
\@writefile{nav}{\headcommand {\beamer@sectionpages {9}{9}}}
\@writefile{nav}{\headcommand {\beamer@subsectionpages {9}{9}}}
\@writefile{nav}{\headcommand {\slideentry {3}{0}{10}{10/10}{}{0}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{10}{10/10}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {10}{10}}}
\@writefile{toc}{\beamer@sectionintoc {4}{2. Aufgabe}{11}{0}{4}}
\@writefile{nav}{\headcommand {\sectionentry {4}{2. Aufgabe}{11}{2. Aufgabe}{0}}}
\@writefile{nav}{\headcommand {\beamer@sectionpages {10}{10}}}
\@writefile{nav}{\headcommand {\beamer@subsectionpages {10}{10}}}
\@writefile{nav}{\headcommand {\slideentry {4}{0}{11}{11/11}{}{0}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{11}{11/11}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {11}{11}}}
\@writefile{nav}{\headcommand {\slideentry {4}{0}{12}{12/12}{}{0}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{12}{12/12}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {12}{12}}}
\@writefile{nav}{\headcommand {\slideentry {4}{0}{13}{13/13}{}{0}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{13}{13/13}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {13}{13}}}
\@writefile{nav}{\headcommand {\slideentry {4}{0}{14}{14/14}{}{0}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{14}{14/14}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {14}{14}}}
\@writefile{nav}{\headcommand {\beamer@partpages {1}{14}}}
\@writefile{nav}{\headcommand {\beamer@subsectionpages {11}{14}}}
\@writefile{nav}{\headcommand {\beamer@sectionpages {11}{14}}}
\@writefile{nav}{\headcommand {\beamer@documentpages {14}}}
\@writefile{nav}{\headcommand {\def \inserttotalframenumber {14}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{15}{15/15}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {15}{15}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{16}{16/16}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {16}{16}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{17}{17/17}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {17}{17}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{18}{18/18}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {18}{18}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{19}{19/19}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {19}{19}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{20}{20/20}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {20}{20}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{21}{21/21}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {21}{21}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{22}{22/22}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {22}{22}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{23}{23/23}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {23}{23}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{24}{24/24}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {24}{24}}}
\@writefile{nav}{\headcommand {\beamer@partpages {1}{24}}}
\@writefile{nav}{\headcommand {\beamer@subsectionpages {9}{24}}}
\@writefile{nav}{\headcommand {\beamer@sectionpages {9}{24}}}
\@writefile{nav}{\headcommand {\beamer@documentpages {24}}}
\@writefile{nav}{\headcommand {\def \inserttotalframenumber {24}}}

View File

@ -1,4 +1,4 @@
This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) (format=pdflatex 2011.7.3) 19 MAY 2013 18:14
This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) (format=pdflatex 2011.7.3) 19 MAY 2013 22:19
entering extended mode
restricted \write18 enabled.
%&-line parsing enabled.
@ -1455,7 +1455,7 @@ Underfull \hbox (badness 10000) has occurred while \output is active
[2
] <grafiken/a1_abb1.png, id=47, 330.23375pt x 531.9875pt>
] <grafiken/a1_abb1.png, id=39, 330.23375pt x 531.9875pt>
File: grafiken/a1_abb1.png Graphic file (type png)
<use grafiken/a1_abb1.png>
@ -1481,7 +1481,7 @@ Underfull \hbox (badness 10000) has occurred while \output is active
[4
] <grafiken/a1_abb2.png, id=58, 614.295pt x 131.49126pt>
] <grafiken/a1_abb2.png, id=50, 614.295pt x 131.49126pt>
File: grafiken/a1_abb2.png Graphic file (type png)
<use grafiken/a1_abb2.png>
@ -1550,20 +1550,172 @@ Underfull \hbox (badness 10000) has occurred while \output is active
[12
]
] <grafiken/a2_spiegel_1.png, id=96, 578.16pt x 433.62pt>
File: grafiken/a2_spiegel_1.png Graphic file (type png)
<use grafiken/a2_spiegel_1.png>
Package pdftex.def Info: grafiken/a2_spiegel_1.png used on input line 171.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 171
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[13
]
<./grafiken/a2_spiegel_1.png>]
<grafiken/a2_spiegel_2.png, id=105, 578.16pt x 433.62pt>
File: grafiken/a2_spiegel_2.png Graphic file (type png)
<use grafiken/a2_spiegel_2.png>
Package pdftex.def Info: grafiken/a2_spiegel_2.png used on input line 180.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 180
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[14
<./grafiken/a2_spiegel_2.png>]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[15
] <grafiken/a2_spiegel_3.png, id=117, 578.16pt x 433.62pt>
File: grafiken/a2_spiegel_3.png Graphic file (type png)
<use grafiken/a2_spiegel_3.png>
Package pdftex.def Info: grafiken/a2_spiegel_3.png used on input line 196.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 196
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[16
<./grafiken/a2_spiegel_3.png>]
<grafiken/a2_spiegel_4.png, id=124, 578.16pt x 433.62pt>
File: grafiken/a2_spiegel_4.png Graphic file (type png)
<use grafiken/a2_spiegel_4.png>
Package pdftex.def Info: grafiken/a2_spiegel_4.png used on input line 205.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 205
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[17
<./grafiken/a2_spiegel_4.png>]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[18
]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[19
] <grafiken/a2_tu_1.png, id=142, 578.16pt x 433.62pt>
File: grafiken/a2_tu_1.png Graphic file (type png)
<use grafiken/a2_tu_1.png>
Package pdftex.def Info: grafiken/a2_tu_1.png used on input line 236.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 236
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[20
<./grafiken/a2_tu_1.png>]
<grafiken/a2_tu_2.png, id=149, 578.16pt x 433.62pt>
File: grafiken/a2_tu_2.png Graphic file (type png)
<use grafiken/a2_tu_2.png>
Package pdftex.def Info: grafiken/a2_tu_2.png used on input line 245.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 245
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[21
<./grafiken/a2_tu_2.png>]
<grafiken/a2_tu_3.png, id=156, 578.16pt x 433.62pt>
File: grafiken/a2_tu_3.png Graphic file (type png)
<use grafiken/a2_tu_3.png>
Package pdftex.def Info: grafiken/a2_tu_3.png used on input line 254.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 254
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[22
<./grafiken/a2_tu_3.png>]
<grafiken/a2_tu_4.png, id=163, 578.16pt x 433.62pt>
File: grafiken/a2_tu_4.png Graphic file (type png)
<use grafiken/a2_tu_4.png>
Package pdftex.def Info: grafiken/a2_tu_4.png used on input line 263.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 263
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[23
<./grafiken/a2_tu_4.png>]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[24
]
\tf@nav=\write7
\openout7 = `solution.nav'.
@ -1574,20 +1726,20 @@ Underfull \hbox (badness 10000) has occurred while \output is active
\tf@snm=\write9
\openout9 = `solution.snm'.
Package atveryend Info: Empty hook `BeforeClearDocument' on input line 172.
Package atveryend Info: Empty hook `AfterLastShipout' on input line 172.
Package atveryend Info: Empty hook `BeforeClearDocument' on input line 278.
Package atveryend Info: Empty hook `AfterLastShipout' on input line 278.
(./solution.aux)
Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 172.
Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 172.
Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 278.
Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 278.
Package rerunfilecheck Info: File `solution.out' has not changed.
(rerunfilecheck) Checksum: 88D911AA5795ABD0722131B6C5D24A75;180.
Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 172.
(rerunfilecheck) Checksum: AE5CCE897D490A137427F55C345E5A34;90.
Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 278.
)
Here is how much of TeX's memory you used:
17220 strings out of 493633
315045 string characters out of 3143378
391807 words of memory out of 3000000
20016 multiletter control sequences out of 15000+200000
17292 strings out of 493633
316299 string characters out of 3143378
391806 words of memory out of 3000000
20062 multiletter control sequences out of 15000+200000
39628 words of font info for 52 fonts, out of 3000000 for 9000
831 hyphenation exceptions out of 8191
55i,20n,79p,425b,533s stack positions out of 5000i,500n,10000p,200000b,50000s
@ -1601,10 +1753,10 @@ texlive/2011/texmf-dist/fonts/type1/urw/helvetic/uhvb8a.pfb></usr/local/texlive
/2011/texmf-dist/fonts/type1/urw/helvetic/uhvbo8a.pfb></usr/local/texlive/2011/
texmf-dist/fonts/type1/urw/helvetic/uhvr8a.pfb></usr/local/texlive/2011/texmf-d
ist/fonts/type1/urw/helvetic/uhvro8a.pfb>
Output written on solution.pdf (14 pages, 146011 bytes).
Output written on solution.pdf (24 pages, 351323 bytes).
PDF statistics:
156 PDF objects out of 1000 (max. 8388607)
122 compressed objects within 2 object streams
33 named destinations out of 1000 (max. 500000)
80 words of extra memory for PDF output out of 10000 (max. 10000000)
221 PDF objects out of 1000 (max. 8388607)
161 compressed objects within 2 object streams
51 named destinations out of 1000 (max. 500000)
104 words of extra memory for PDF output out of 10000 (max. 10000000)

View File

@ -23,24 +23,38 @@
\headcommand {\beamer@subsectionpages {2}{8}}
\headcommand {\slideentry {2}{0}{9}{9/9}{}{0}}
\headcommand {\beamer@framepages {9}{9}}
\headcommand {\sectionentry {3}{2. Aufgabe}{10}{2. Aufgabe}{0}}
\headcommand {\beamer@sectionpages {9}{9}}
\headcommand {\beamer@subsectionpages {9}{9}}
\headcommand {\slideentry {3}{0}{10}{10/10}{}{0}}
\headcommand {\slideentry {2}{0}{10}{10/10}{}{0}}
\headcommand {\beamer@framepages {10}{10}}
\headcommand {\sectionentry {4}{2. Aufgabe}{11}{2. Aufgabe}{0}}
\headcommand {\beamer@sectionpages {10}{10}}
\headcommand {\beamer@subsectionpages {10}{10}}
\headcommand {\slideentry {4}{0}{11}{11/11}{}{0}}
\headcommand {\slideentry {2}{0}{11}{11/11}{}{0}}
\headcommand {\beamer@framepages {11}{11}}
\headcommand {\slideentry {4}{0}{12}{12/12}{}{0}}
\headcommand {\slideentry {2}{0}{12}{12/12}{}{0}}
\headcommand {\beamer@framepages {12}{12}}
\headcommand {\slideentry {4}{0}{13}{13/13}{}{0}}
\headcommand {\slideentry {2}{0}{13}{13/13}{}{0}}
\headcommand {\beamer@framepages {13}{13}}
\headcommand {\slideentry {4}{0}{14}{14/14}{}{0}}
\headcommand {\slideentry {2}{0}{14}{14/14}{}{0}}
\headcommand {\beamer@framepages {14}{14}}
\headcommand {\beamer@partpages {1}{14}}
\headcommand {\beamer@subsectionpages {11}{14}}
\headcommand {\beamer@sectionpages {11}{14}}
\headcommand {\beamer@documentpages {14}}
\headcommand {\def \inserttotalframenumber {14}}
\headcommand {\slideentry {2}{0}{15}{15/15}{}{0}}
\headcommand {\beamer@framepages {15}{15}}
\headcommand {\slideentry {2}{0}{16}{16/16}{}{0}}
\headcommand {\beamer@framepages {16}{16}}
\headcommand {\slideentry {2}{0}{17}{17/17}{}{0}}
\headcommand {\beamer@framepages {17}{17}}
\headcommand {\slideentry {2}{0}{18}{18/18}{}{0}}
\headcommand {\beamer@framepages {18}{18}}
\headcommand {\slideentry {2}{0}{19}{19/19}{}{0}}
\headcommand {\beamer@framepages {19}{19}}
\headcommand {\slideentry {2}{0}{20}{20/20}{}{0}}
\headcommand {\beamer@framepages {20}{20}}
\headcommand {\slideentry {2}{0}{21}{21/21}{}{0}}
\headcommand {\beamer@framepages {21}{21}}
\headcommand {\slideentry {2}{0}{22}{22/22}{}{0}}
\headcommand {\beamer@framepages {22}{22}}
\headcommand {\slideentry {2}{0}{23}{23/23}{}{0}}
\headcommand {\beamer@framepages {23}{23}}
\headcommand {\slideentry {2}{0}{24}{24/24}{}{0}}
\headcommand {\beamer@framepages {24}{24}}
\headcommand {\beamer@partpages {1}{24}}
\headcommand {\beamer@subsectionpages {9}{24}}
\headcommand {\beamer@sectionpages {9}{24}}
\headcommand {\beamer@documentpages {24}}
\headcommand {\def \inserttotalframenumber {24}}

View File

@ -1,4 +1,2 @@
\BOOKMARK [2][]{Outline0.1}{1. Aufgabe}{}% 1
\BOOKMARK [2][]{Outline0.2}{2. Aufgabe}{}% 2
\BOOKMARK [2][]{Outline0.3}{2. Aufgabe}{}% 3
\BOOKMARK [2][]{Outline0.4}{2. Aufgabe}{}% 4

View File

@ -80,7 +80,7 @@ $p_{expected(lp, lang) \approx \frac{1}{i * ln(1,78 * N)}}$
\end{frame}
\begin{frame}
\frametitle{1. Aufgabe \\ Firefox-Plugin}
\frametitle{1. Aufgabe \\ Firefox-Plugin}
\begin{itemize}
\item Häufigkeiten der Buchstaben bzw. Buchstabenpaare ($n_{text(l)}$) relativ zur Gesamtanzahl ($n_{text}$): \\
\begin{center}
@ -117,7 +117,7 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{
% % % % % % % % % % % % % % % % % % % % % % %% % % % % % % % % % % % % % % % % % % % % % %% % % % % % % % % % % % % % % % % % % % % % % %
\section{2. Aufgabe}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{Verfahren zur Erkennung von Duplikaten:} \\
\textbf{1.} Alle Wörter mit einer Länge von 4 und kleiner 11 werden von der Webseite extrahiert.
\begin{itemize}
@ -127,9 +127,9 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{
\end{itemize}
\end{frame}
\section{2. Aufgabe}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{2.} Eine Zuweisung von Wörtern zu deren Auftrittshäufigkeit wird angefertigt \\
\textbf{3.} Für alle paarweise verschiedenen Seiten werden die Auftrittshäufigkeiten subtrahiert, so dass deren Ergebnis minimal null ergibt. Zudem werden alle Wörter, die auf einer, aber nicht auf der anderen Seite vorkommen, ebenfalls der anderen Seite zugewiesen
\begin{itemize}
@ -137,29 +137,135 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{
\end{itemize}
\end{frame}
\section{2. Aufgabe}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{4.} Anschließend wird die resultierende Gesamtzahl an Wortvorkommnissen durch die Anzahl der Wortvorkommnisse vor der Subtraktion geteilt. Fällt dieser Wert unter eine definierte Grenze, gilt die Seite als Duplikat. \\
\begin{itemize}
\item Im Code ist anstatt einer Untergrenze eine Obergrenze von 90\% angegeben, die Berechnung wurde also umgekehrt, so dass hohe Werte eine hohe Duplikatswahrscheinlichkeit implizieren.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
Histogramm über die Anzahl der URLs pro Seite (wie beim ersten Übungsblatt mit Worthäufigkeiten, auch logarithmisch)
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{Startseite:} \\
http://www.spiegel.de/ \\
Es wurden 1000 Seiten besucht. \\
\vspace{1cm}
\textbf{Erkannte Sprachen:}
\begin{center}
de $\to$ 623 \\
en $\to$ 246 \\
es $\to$ 131 \\
\end{center}
\end{frame}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
Histogramm mit den Häufigkeiten des Auftretens von Hyperlinks, d.h. wie viele Links treten 1-mal, 2-mal, ... auftreten ...
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_1.png}
\caption{Anzahl URLs pro Seite}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_2.png}
\caption{Anzahl URLs pro Seite (logarithmisch)}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{itemize}
\item Viele Internetseiten verweisen auf wenige andere Internetseiten
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_3.png}
\caption{Häufigkeiten des Auftretens von Hyperlinks}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_4.png}
\caption{Häufigkeiten des Auftretens von Hyperlinks (logarithmisch)}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{itemize}
\item Es gibt nur wenige Internetseiten, die oft referenziert werden.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{Startseite:} \\
http://www.ke.tu-darmstadt.de/lehre/arbeiten \\
Der Crawler hat nur Seiten innerhalb der TU Darmstadt der Form *.tu-darmstadt.de besucht.
Es wurden 1000 Seiten besucht. \\
\vspace{1cm}
\textbf{Erkannte Sprachen:}
\begin{center}
de $\to$ 329 \\
en $\to$ 576 \\
es $\to$ 95 \\
\end{center}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_1.png}
\caption{Anzahl URLs pro Seite}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_2.png}
\caption{Anzahl URLs pro Seite (logarithmisch)}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_3.png}
\caption{Häufigkeiten des Auftretens von Hyperlinks}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_4.png}
\caption{Häufigkeiten des Auftretens von Hyperlinks (logarithmisch)}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{Erfahrungen und Probleme:}
\begin{itemize}
\item Findet man einen Onlineshop, so wird die Queue mit sehr vielen Links dieses Shops gefüllt und der Crawler besucht mit sehr hoher Wahrscheinlichkeit nur noch URLs innerhalb des Shops.

View File

@ -2,5 +2,3 @@
\select@language {ngerman}
\beamer@sectionintoc {1}{1. Aufgabe}{2}{0}{1}
\beamer@sectionintoc {2}{2. Aufgabe}{9}{0}{2}
\beamer@sectionintoc {3}{2. Aufgabe}{10}{0}{3}
\beamer@sectionintoc {4}{2. Aufgabe}{11}{0}{4}

View File

@ -0,0 +1,379 @@
{\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
\paperw11900\paperh16840\margl1440\margr1440\vieww16900\viewh8400\viewkind0
\pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\pardirnatural
\f0\b\fs24 \cf0 entrypoint: google.de:
\b0 \
\
==== robots.txt ====\
prohibit by robots.txt: 172\
\
\
==== numberHyperlink ====\
#Hyperlinks Website\
19 http://www.blogger.com/profile/05109496878476775729\
19 http://www.google.de/history/optout?hl=de\
18 http://www.google.de/intl/de/options/\
8 http://www.vovone.com/company/profile/\
8 http://www.vovone.com/more/solutions/\
8 http://www.vovone.com/company/partners/\
7 http://www.google.de/intl/de/policies/privacy/\
7 http://rocketsandsuch.blogspot.de/2009_08_01_archive.html\
7 http://www.vovone.com/more/ssl-certificates/ssl-certificate-validation/\
7 http://www.google.de/webhp?hl=de&tab=iw\
7 http://www.vovone.com/company/ask-for-more/\
6 http://www.vovone.com/domain-names/redirect-domain-name/\
6 http://voice.google.com\
6 http://www.vovone.com/support/f-a-q/\
6 http://www.vovone.com/domain-names/domain-termination/\
6 http://www.vovone.com/support/documentation/\
6 http://www.vovone.com/company/careers/\
5 http://www.vovone.com/discounts-offers/\
5 http://www.google.com/press/blogs/directory.html#tab0\
5 http://rocketsandsuch.blogspot.de/2008_03_01_archive.html\
5 http://rocketsandsuch.blogspot.de/2009_09_01_archive.html\
5 http://www.vovone.com/servers/\
5 http://rocketsandsuch.blogspot.de/2009_01_01_archive.html\
5 http://www.vovone.com\
5 http://fusion.google.com/add?source=atgs&feedurl=http%3a//feeds.feedburner.com/googleappsupdates\
5 http://www.vovone.com/more/solutions/service-level-agreements/\
5 http://www.vovone.com/support/\
5 http://www.vovone.com/servers/managed-servers/\
5 http://rocketsandsuch.blogspot.de/2008_10_01_archive.html\
5 http://rocketsandsuch.blogspot.de/2009_10_01_archive.html\
5 http://feedburner.google.com/fb/a/mailverify?uri=googleappsupdates&loc=en_us\
5 http://www.vovone.com/more/reseller-plans/affiliate-plan/\
5 http://www.vovone.com/more/ssl-certificates/ssl-certificate-type/\
5 http://blog.chromium.org/\
5 http://www.vovone.com/company/conditions/notice-and-take-down/\
5 http://www.vovone.com/more/ssl-certificates/ssl-certificates-brand/\
4 http://www.vovone.com/more/colocation/private-rackspace/\
4 http://www.vovone.com/more/ssl-certificates/\
4 http://www.google.de/setprefdomain?prefdom=us&sig=0_h0pay1e5n4pq04s4m5soth6xqlk%3d\
4 http://www.vovone.com/company/technology/security/\
4 http://rocketsandsuch.blogspot.de/search?updated-min=2007-01-01t00:00:00-08:00&updated-max=2008-01-01t00:00:00-08:00&max-results=50\
4 http://www.google.de/setprefdomain?prefdom=us&sig=0_bbxqe3gzyewbwv2egvfk2cujk3w%3d\
4 http://www.vovone.com/more/\
4 http://www.vovone.com/web-hosting/special-plans/special-plans-magento-hosting/\
4 http://www.vovone.com/more/colocation/shared-rackspace/\
4 http://www.vovone.com/company/conditions/\
4 http://www.vovone.com/more/solutions/managed-services/\
4 http://mail.google.com\
4 http://rocketsandsuch.blogspot.de/2008/10/hubble-bubble-toil-and-trouble.html\
4 http://www.vovone.com/servers/vps/vps-plan-8192/\
\
\
==== numberHyperlinksPerPage ====\
#HyperlinksToPage Website\
9088 javascript:void(0)\
1867 #\
898 javascript:;\
522 http://www.blogger.com/profile/05109496878476775729\
392 http://www.vovone.com\
348 /\
347 http://www.blogger.com/profile/09046869427384152063\
317 \
301 http://www.vovone.com/support/\
298 https://my.vovone.com\
295 http://www.vovone.com/company/careers/\
272 http://feedburner.google.com/fb/a/mailverify?uri=GoogleAppsUpdates&loc=en_US\
270 http://fusion.google.com/add?source=atgs&feedurl=http%3A//feeds.feedburner.com/GoogleAppsUpdates\
256 the-button-element.html#concept-fe-value\
242 http://www.blogger.com/profile/06992649719432295652\
221 http://www.vovone.com/servers/\
220 the-input-element.html#the-input-element\
216 http://www.vovone.com/company/\
206 http://www.vovone.com/web-hosting/\
206 http://www.vovone.com/more/colocation/\
205 http://www.vovone.com/more/ssl-certificates/\
205 http://www.vovone.com/servers/dedicated-servers/\
204 http://www.vovone.com/more/colocation/private-rackspace/\
203 http://www.vovone.com/more/solutions/\
203 http://www.vovone.com/company/technology/\
203 http://www.vovone.com/servers/managed-servers/\
203 http://www.vovone.com/domain-names/\
202 http://www.vovone.com/voip-services/\
202 http://www.vovone.com/company/conditions/\
201 http://www.vovone.com/more/reseller-plans/\
201 http://www.vovone.com/voip-services/cloud-voip/\
200 http://www.vovone.com/company/promise/\
200 http://www.vovone.com/voip-services/voip-accounts/\
200 http://www.vovone.com/domain-names/domain-termination/\
200 http://www.vovone.com/more/ssl-certificates/ssl-certificate-type/\
200 http://www.vovone.com/domain-names/transfer-domain-name/\
200 http://www.vovone.com/company/profile/\
199 http://www.vovone.com/more/solutions/service-level-agreements/\
199 http://www.vovone.com/more/solutions/managed-services/\
199 http://www.vovone.com/support/documentation/\
199 http://www.vovone.com/voip-services/business-voip/\
199 http://www.vovone.com/more/ssl-certificates/ssl-certificate-validation/\
199 http://www.vovone.com/more/ssl-certificates/ssl-certificates-brand/\
199 http://www.vovone.com/more/colocation/shared-rackspace/\
199 http://www.vovone.com/more/reseller-plans/affiliate-plan/\
199 http://www.vovone.com/support/f-a-q/\
198 http://www.vovone.com/support/support-desk/\
198 http://www.vovone.com/voip-services/wholesale-voip/\
197 http://www.vovone.com/domain-names/redirect-domain-name/\
197 http://www.vovone.com/company/press/\
\
\
\
\b entrypoint http://www.ke.tu-darmstadt.de/lehre/arbeiten:
\b0 \
\
==== robots.txt ====\
prohibit by robots.txt: 4\
\
\
==== numberHyperlink ====\
#Hyperlinks Website\
405 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1625\
120 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1677\
107 http://www.tu-darmstadt.de\
77 http://www.informatik.tu-darmstadt.de\
71 http://www.ke.tu-darmstadt.de\
61 http://www.ke.tu-darmstadt.de/bibtex/authors/show/875\
46 http://www.ke.tu-darmstadt.de/lehre\
46 http://www.ke.tu-darmstadt.de/news\
41 http://www.ke.tu-darmstadt.de/bibtex/authors/show/708\
41 http://www.ke.tu-darmstadt.de/bibtex/search\
40 http://www.ke.tu-darmstadt.de/de/studierende/studienbuero/ansprechpartner-studienbuero/\
40 http://www.ke.tu-darmstadt.de/bibtex/export\
39 http://www.informatik.tu-darmstadt.de/de/aktuelles/veranstaltungentermine/\
39 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#proceedings\
38 http://www.ke.tu-darmstadt.de/de/intern/index/\
38 http://www.ke.tu-darmstadt.de/de/studierende/studiendekanat/ansprechpartner/\
37 http://www.ke.tu-darmstadt.de/bibtex/publications\
37 http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/vortrag-ueber-fitweltweit-programm-des-daad-1/\
36 http://www.ke.tu-darmstadt.de/resources\
36 http://www.ke.tu-darmstadt.de/bibtex/topics/single/77\
36 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1849\
35 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type\
34 http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/smarte-spione/\
34 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/recent\
33 http://www.ke.tu-darmstadt.de/de/fachbereich/dekanat/\
33 http://www.ke.tu-darmstadt.de/de/fachbereich/bilder/absolventenfeier-november-2012/begruessung/\
33 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#inproceedings\
33 http://www.ke.tu-darmstadt.de/research\
31 http://www.ke.tu-darmstadt.de/de/forschung/netzwerkpartner/\
29 http://www.ke.tu-darmstadt.de/de/aktuelles/newsletter-an-und-abmeldung/\
29 http://www.ke.tu-darmstadt.de/bibtex/authors/show/702\
29 http://www.ke.tu-darmstadt.de/projects\
29 http://www.ke.tu-darmstadt.de/bibtex/topics/single/33\
29 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#incollection\
28 http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/eine-kultur-der-privatsphaere-im-internet/\
28 http://www.ke.tu-darmstadt.de/bibtex/topics\
28 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#book\
27 http://www.ke.tu-darmstadt.de/de/aktuelles/neuigkeiten/\
26 http://www.ke.tu-darmstadt.de/bibtex/authors/show/3036\
25 http://www.ke.tu-darmstadt.de/bibtex/authors/show/2370\
24 http://www.ke.tu-darmstadt.de/de/aktuelles/preise-und-auszeichnungen/\
24 http://www.ke.tu-darmstadt.de/staff\
24 http://www.ke.tu-darmstadt.de/impressum\
24 http://www.ke.tu-darmstadt.de/de/studierende/news-fuer-studierende/\
24 http://www.ke.tu-darmstadt.de/publications\
23 http://www.ke.tu-darmstadt.de/bibtex/authors/show/2365\
23 http://www.ke.tu-darmstadt.de/termine\
23 http://www.ke.tu-darmstadt.de/de/ehemalige/alumni-portal-der-tu-darmstadt/\
23 http://www.ke.tu-darmstadt.de/de/ehemalige/\
22 http://www.tu-darmstadt.de/\
\
\
==== numberHyperlinksPerPage ====\
#HyperlinksToPage Website\
3528 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1625\
915 http://www.tu-darmstadt.de\
904 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1677\
635 de/aktuelles/neuigkeiten/\
577 de/fachbereich/dekanat/\
575 de/fachbereich/bilder/absolventenfeier-november-2012/begruessung/\
528 http://www.informatik.tu-darmstadt.de\
499 http://www.ke.tu-darmstadt.de\
490 de/aktuelles/newsletter-an-und-abmeldung/\
482 de/forschung/netzwerkpartner/\
481 http://www.ke.tu-darmstadt.de/bibtex/topics/single/33\
474 de/studierende/studiendekanat/ansprechpartner/\
468 de/studierende/studienbuero/ansprechpartner-studienbuero/\
452 de/intern/index/\
450 http://www.ke.tu-darmstadt.de/bibtex/authors/show/875\
444 http://www.ke.tu-darmstadt.de/bibtex/topics/single/77\
434 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/recent\
434 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type\
433 javascript:this.print()\
429 javascript:fontsize('reset')\
429 javascript:fontsize('inkrement')\
429 javascript:fontsize('dekrement')\
424 http://www.ke.tu-darmstadt.de/bibtex/search\
424 http://www.ke.tu-darmstadt.de/bibtex/topics\
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Proceedings\
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Book\
424 http://www.ke.tu-darmstadt.de/bibtex/publications\
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Inproceedings\
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Incollection\
424 http://www.ke.tu-darmstadt.de/bibtex/export\
412 de/aktuelles/neuigkeiten/neuigkeiten/artikel/smarte-spione/\
408 de/aktuelles/neuigkeiten/neuigkeiten/artikel/eine-kultur-der-privatsphaere-im-internet/\
408 de/aktuelles/neuigkeiten/neuigkeiten/artikel/vortrag-ueber-fitweltweit-programm-des-daad-1/\
405 \
382 http://www.ke.tu-darmstadt.de/bibtex/authors/show/708\
369 de/fachbereich/\
352 de/fachbereich/ehrungen-und-auszeichnungen/alwin-walther-medaille/\
351 de/fachbereich/kontakt-und-anfahrt/\
351 de/fachbereich/personen/\
350 de/fachbereich/professuren-und-gruppenleitungen/\
350 de/fachbereich/ueber-den-fachbereich/\
350 de/fachbereich/ausschuesse-gremien-und-kommissionen/\
349 http://www.informatik.tu-darmstadt.de/index.php?id=40\
336 http://www.ke.tu-darmstadt.de/bibtex/authors/show/702\
330 http://www.informatik.tu-darmstadt.de/index.php?id=1894\
306 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1849\
302 http://www.ke.tu-darmstadt.de/news\
298 de/tu/\
277 http://www.tu-darmstadt.de/\
264 #top\
\
\
==== url queue ====\
\
\
==== language distribution ====\
Language	Number of occurrences\
de 329\
en 576\
es 94\
\
\
\b entrypoint http://www.spiegel.de:\
\
\
\b0 ==== robots.txt ====\
prohibit by robots.txt: 115\
\
\
==== numberHyperlink ====\
#Hyperlinks Website\
43 https://www.amazon.de/b\
38 http://www.amazon.de/spiegel\
28 http://tv.adobe.com\
21 http://tvprogramm.spiegel.de/\
19 http://www.spiegel.de/\
18 https://service.spiegel.de\
18 http://www.spiegel.de/spiegel/spiegelgeschichte/index-2013-2.html\
17 http://www.spiegel.de/spiegel/deinspiegel/index-2013-6.html\
16 https://www.ebook.de/de/category/61110/unsere_vorteile.html\
16 http://www.spiegel.de\
15 http://www.spiegel.de/shop\
14 http://www.shopbop.com/gp/help/customer/display.html\
14 http://www.manager-magazin.de/\
14 http://www.spiegel.de/spiegel/spiegelwissen/index-2013-2.html\
13 http://www.spiegel.de/spiegel/\
13 http://www.spiegel.de/wissenschaft/\
12 http://wetter.spiegel.de/spiegel/\
10 https://www.ebook.de/de/category/59475/kontakt_impressum.html\
10 http://abo.spiegel.de/go/place!abosspsc\
9 https://www.amazon.de/gp/cart/view.html\
9 https://www.ebook.de/de/category/59424/hilfe.html\
9 http://www.amazon.de/gp/feature.html\
9 http://www.spiegel.de/sport/\
9 https://media.libri.de/de/category/58974/sony_reader.html\
9 http://www.spiegelgruppe-nachdrucke.de\
9 https://www.ebook.de/de/category/61132/newsletter.html\
9 http://www.spiegelwissen.tv/flashsite/index.html\
8 http://www.spiegel.de/hilfe/\
8 http://abo.spiegel.de/?et_cid=7&et_lid=1946&et_sub=heftkasten\
8 https://www.amazon.es/b\
8 https://www.ebook.de/de/category/59663/gutscheine_kaufen.html\
8 https://www.ebook.de/de/category/52122/ebooks.html\
8 http://www.spiegel.de/politik/\
8 https://www.ebook.de/de/account/wishlist/add\
8 https://www.amazon.de/pc-mac-downloads-herunterladen-digital-steam/b\
8 http://www.spiegel.de/spiegel/unispiegel/\
8 http://www.spiegel.de/unispiegel/studium/tools-hier-werden-sie-geholfen-a-640620.html\
8 http://www.harvardbusinessmanager.de/\
7 http://www.amazon.co.jp/\
7 https://www.ebook.de/de/category/63461/ebooks_verschenken.html\
7 https://www.ebook.de/de/category/browse\
7 http://kdp.amazon.de/\
7 http://abo.spiegel.de/?et_cid=7&et_lid=1946&et_sub=aboreiter\
7 http://www.spiegel-qc.de/selbstbuchungstool\
7 https://media.libri.de/de/category/52124/buecher.html\
7 http://www.spiegel-qc.de/\
7 https://www.ebook.de/de/magazine\
7 https://www.ebook.de\
7 http://www.spiegel.de/video/\
7 http://www.libri.de/shop/action/magazine/6/ebooks_reader.html\
\
\
==== numberHyperlinksPerPage ====\
#HyperlinksToPage Website\
6966 #\
1507 /\
1027 \
961 http://www.amazon.de/spiegel\
671 http://tv.adobe.com/product/photoshop/\
640 \{\{url\}\}\
640 /gp/digital/fiona/manage\
598 javascript:void(0);\
597 http://tv.adobe.com/product/cs-production-premium/\
586 http://www.spiegel.de/\
575 http://www.spiegel.de/spiegel/\
509 http://wetter.spiegel.de/spiegel/\
504 <#=item.url #>\
492 http://www.spiegel.de/shop\
468 http://www.spiegel.de/spiegel/spiegelwissen/index-2013-2.html\
468 http://www.spiegel.de/spiegel/deinspiegel/index-2013-6.html\
468 http://www.spiegel.de/spiegel/spiegelgeschichte/index-2013-2.html\
462 /gp/site-directory\
460 /gp/cart/view.html?ie=UTF8&hasWorkingJavascript=1\
441 /gp/registry/wishlist\
435 /clouddrive\
411 http://www.spiegel.de/sptv/magazin/\
385 /gp/prime\
382 /product/photoshop/\
352 /gp/dmusic/mp3/player\
323 https://www.ebook.de/de/account/ebookHistory\
316 https://www.ebook.de/de/account/create/singlestep\
311 http://tv.adobe.com/product/cs-design-premium/\
311 http://forum.spiegel.de/\
310 /product/illustrator/\
308 http://tv.adobe.com/product/creative-cloud/\
303 http://www.spiegel.de/video/\
303 http://www.spiegel-qc.de/\
297 /video/\
296 http://www.spiegel.de/schlagzeilen/\
294 http://www.quality-abo.de/\
293 http://www.spiegelgruppe.de/\
293 http://www.buchreport.de/\
288 http://www.spiegelgruppe-nachdrucke.de\
277 /de/category/60575/libri_de_ist_jetzt_ebook_de.html\
276 http://www.manager-magazin.de/\
274 http://tv.adobe.com/product/premiere-pro/\
267 http://tv.adobe.com/product/after-effects/\
264 http://www.harvardbusinessmanager.de/\
262 /product/premiere-pro/\
260 /MP3-Musik-Downloads/b?ie=UTF8&node=77195031\
260 http://tvprogramm.spiegel.de/\
259 /pc-mac-downloads-herunterladen-digital-steam/b?ie=UTF8&node=1333619031\
259 /spiegel/\
256 /Navigationssystems-Car-HiFi-Autoradios/b?ie=UTF8&node=236861011\
\
\
==== url queue ====\
\
\
==== language distribution ====\
Language	Number of occurrences\
de 623\
en 246\
es 130
\b \
\b0 \
\
\
}