slides update

This commit is contained in:
Michael Scholz 2013-05-19 22:19:45 +02:00
parent a063983767
commit c13047919a
43 changed files with 2398 additions and 89 deletions

View File

@ -0,0 +1,10 @@
01 es
02 de
03 en
04 en
05 de
06 es
07 es
08 de
09 en
10 es

View File

@ -0,0 +1,510 @@
# -*- coding: utf-8 -*-
import urllib2
import sys
import random
import robotparser
import re
import math
from sgmllib import SGMLParser
import sgmllib
from urlparse import urlparse
from urlparse import urljoin
import matplotlib.pyplot as plt
import time
from termcolor import colored
from collections import Counter
'''
VN:
- Plagiats-Checker fertig
- Sprachprüfer fertig
TODO:
- DONE canonize urls -> canonize? slides? -> remember last host -> no magic here -> even using ugly global
- DONE with getNextUrlToVisit():
server timeout -> safe crawled host, set timeout for crawled host
- statistics -> http://www.ke.tu-darmstadt.de/lehre/ss13/web-mining/uebung2.html
'''
# crawler attributes
entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
entrypoint = "http://www.spiegel.de" # german website (overrides the line above)
#entrypoint = "http://www.cnn.com" # english website
#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website
#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 0 # minimum seconds between two requests to the same host (politeness delay)
visitOnlyTuSites = False;
#some variables
prohibitedSites = 0 # number of urls rejected by robots.txt
visitedUrls = [] # save already visited urls, so no url will be visited more than once
pages = {} # downloaded pages, keyed by url
numberHyperlink = {} # save number of hyperlinks pointing to each url
numberHyperlinksPerPage = {} # save number of hyperlinks per href (how often it occurred)
visitedHostsWithTimestamp = {} # save visited hosts with timestamp of last visit
robotsTxtResults = {} # cache of parsed robots.txt per robots url (None = not available)
lasthost = '' # last seen host, used to resolve relative links
def normalizeMap(m):
    """Normalize the values of dict *m* in place so that they sum to 1.

    Leaves *m* untouched when the value sum is 0 (e.g. an empty map);
    the original code raised ZeroDivisionError in that case.
    """
    s = sum(m.values())
    if s == 0:
        return  # nothing to normalize
    for k in m:
        m[k] = float(m[k]) / float(s)
def subtractDicts(dict1, dict2):
    """Per-key count difference of two word-count dicts.

    Keys of dict1 map to max(0, dict1[k] - dict2[k]); keys that appear
    only in dict2 keep their dict2 count unchanged.
    """
    result = {}
    for key in set(dict1) | set(dict2):
        if key not in dict1:
            result[key] = int(dict2[key])
        elif key not in dict2:
            result[key] = int(dict1[key])
        else:
            result[key] = max(0, int(dict1[key]) - int(dict2[key]))
    return result
def countWords(words):
    """Return a plain dict mapping each word to its number of occurrences.

    Uses collections.Counter (already imported at module level) instead
    of the hand-rolled counting loop; behavior is unchanged.
    """
    return dict(Counter(words))
def blockedByRobotsTxt(url):
o = urlparse(url)
robotsUrl = o.scheme+"://"+o.netloc+"/robots.txt"
if url in robotsTxtResults:
rp = robotsTxtResults[robotsUrl]
else:
rp = robotparser.RobotFileParser()
rp.set_url(robotsUrl)
try:
rp.read()
robotsTxtResults[robotsUrl] = rp
except:
robotsTxtResults[robotsUrl] = None # robots.txt doesn't exist
if robotsTxtResults[robotsUrl] == None:
return False # return false if robots.txt doesn't exist
else:
if rp.can_fetch("*", url):
return False
else:
print colored("-> not allowed to visit :( "+url, "red")
global prohibitedSites
prohibitedSites += 1
return True
def canonicalUrl(url):
    """Canonicalize one href into a list of 0 or 1 crawlable urls.

    Returns [url] when the url passes the filters, else [].  Filters:
    only plain http, no "pdf" in the path, no ".." anywhere, and the
    path must either contain ".html" or have no file extension at all.
    Side effect: remembers the host of every absolute url in the global
    ``lasthost`` so that scheme-less (relative) links can be resolved
    against the most recently seen host.
    """
    global lasthost
    url = url.lower().replace(" ", "")
    o = urlparse(url)
    if o.netloc != '':
        # absolute url: remember its host for later relative links
        lasthost = o.scheme + '://' + o.netloc
        if o.scheme=='http' and not "pdf" in o.path and not ".." in o.geturl():
            if ".html" in o.path:
                return [url]
            if "." not in o.path:
                return [url]
        return []
    else:
        if o.scheme=='':
            # relative link: resolve against the last crawled host
            # NOTE(review): query/fragment of the relative link are dropped
            # because only o.path is joined -- confirm this is intended.
            return [urljoin(lasthost,o.path)]
        else:
            return []
def getNextUrlToVisit():
    """Pick a random url from the frontier (extractor.urls) that may be visited now.

    Recursively retries when the candidate is off-site (if
    visitOnlyTuSites is set), blocked by robots.txt, already visited, or
    its host was contacted less than timeBetweenSameHost seconds ago.
    Mutates the module-level bookkeeping dicts/lists as a side effect.
    NOTE(review): recursion can become very deep (or spin) when only
    throttled hosts remain in the queue -- confirm this is acceptable.
    """
    url = random.choice(extractor.urls)
    if visitOnlyTuSites:
        if 'tu-darmstadt' not in urlparse(url).netloc:
            extractor.urls.remove(url)
            return getNextUrlToVisit()
    # count how often this url was drawn / linked
    if url in numberHyperlink:
        numberHyperlink[url] += 1
    else:
        numberHyperlink[url] = 1
    host = urlparse(url).netloc
    ## check if url is blocked by robots.txt or was already visited ##
    if blockedByRobotsTxt(url) or url in visitedUrls:
        extractor.urls.remove(url)
        return getNextUrlToVisit()
    ## check if host got a timeout (politeness delay) ##
    if host in visitedHostsWithTimestamp:
        timestamp = visitedHostsWithTimestamp[host]
        secondsSinceLastVisit = int(time.time()) - timestamp
        if secondsSinceLastVisit >= timeBetweenSameHost:
            visitedHostsWithTimestamp[host] = int(time.time())
            visitedUrls.append(url)
            extractor.urls.remove(url)
            return url
        else:
            secondsToWait = timeBetweenSameHost - secondsSinceLastVisit
            print colored(" -> give host ("+host+") a break (wait at least: "+str(secondsToWait)+" seconds)", "magenta")
            return getNextUrlToVisit()
    else:
        # first visit of this host
        visitedHostsWithTimestamp[host] = int(time.time())
        visitedUrls.append(url)
        extractor.urls.remove(url)
        return url
class URLLister(SGMLParser):
    """SGML parser that collects canonicalized hrefs of <a> tags in self.urls."""
    ## fix SGMLParseError: reset the parser state without clearing the url queue
    def resetParser(self):
        SGMLParser.reset(self)
    def reset(self):
        # full reset: parser state AND collected url queue
        SGMLParser.reset(self)
        self.urls = []
    def start_a(self, attrs):
        # called by SGMLParser for every <a ...> tag
        href = [v for k, v in attrs if k=='href']
        if href:
            url = canonicalUrl(href[0])
            self.urls.extend(url)
            # count number of times this href occurred (module-level stats)
            if href[0] in numberHyperlinksPerPage:
                numberHyperlinksPerPage[href[0]] += 1
            else:
                numberHyperlinksPerPage[href[0]] = 1
if __name__ == "__main__":
    ## seed the frontier with the links of the entry page ##
    page = urllib2.urlopen(entrypoint, timeout = 5)
    print "currently visited url: "+entrypoint
    extractor = URLLister()
    extractor.feed(page.read())
    page.close()
    i = 1
    ## crawl until numberOfPagesToCrawl pages were downloaded successfully ##
    while(i <= numberOfPagesToCrawl):
        url = getNextUrlToVisit()
        print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") currently visiting url: "+url, "blue")
        try:
            page = urllib2.urlopen(url, timeout = 6)
            pageContent = page.read()
            pageContent = pageContent.replace('<![CDATA[', '&lt;![CDATA[') ## bugfix for SGMLParser
            page.close()
            extractor.feed(pageContent)
            pages[url] = pageContent
            i += 1
        # exception handling: failed downloads do not count towards i
        except urllib2.HTTPError, err:
            if err.code == 404:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: not found: "+url, "red")
                pass
            if err.code == 400:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: bad request: "+url, "red")
                pass
            if err.code == 403:
                print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: HTTP Error: forbidden: "+url, "red")
                pass
        except urllib2.URLError:
            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: urllib2.URLError: "+url, "red")
            pass
        except sgmllib.SGMLParseError:
            print colored("("+str(i)+"/"+str(numberOfPagesToCrawl)+") ERROR: sgmllib.SGMLParseError: "+url, "red")
            extractor.resetParser()
            pass
        except:
            print "Unexpected error:", sys.exc_info()[0]
            pass
    extractor.close()
    print "\n \n ==== robots.txt ===="
    print "prohibit by robots.txt: "+str(prohibitedSites)
    ## print table number hyperlinks per website ##
    print "\n \n ==== numberHyperlink ===="
    print "#Hyperlinks \t Website"
    # histogram: how many urls were linked u times
    linkCount1 = {}
    for u in numberHyperlink.values():
        if u not in linkCount1:
            linkCount1[u] = 1
        else:
            linkCount1[u] += 1
    xValues1 = []
    yValues1 = []
    for u in linkCount1:
        xValues1.append(u)
        yValues1.append(linkCount1[u])
    plt.plot(xValues1, yValues1)
    plt.xlabel('Haeufigkeiten des Auftretens')
    plt.ylabel('Anzahl der URLs')
    plt.show()
    ## print table number hyperlinks to page ##
    print "\n \n ==== Anzahl URLs pro Seite ===="
    print "#Anzahl URLs pro Seite"
    linkCount2 = {}
    for u in numberHyperlinksPerPage.values():
        if u not in linkCount2:
            linkCount2[u] = 1
        else:
            linkCount2[u] += 1
    xValues2 = []
    yValues2 = []
    for u in linkCount2:
        xValues2.append(u)
        yValues2.append(linkCount2[u])
    '''plt.plot(xValues2, yValues2)
plt.xlabel('Anzahl der Hyperlinks pro Seite')
plt.ylabel('Anzahl der URLs')
#plt.xscale('log')
#plt.yscale('log')
plt.show()'''
    print "\n \n ==== url queue ===="
    for u in extractor.urls:
        pass
        #print u
    threshold = 0.9 # how much similar must 2 urls be to be logged
    #print "\n \n ==== copied content probability (>= " + str(threshold*100) + " %) ===="
    #print "URL1 \t URL2 \t Similarity in %"
    # wordcounts per page
    wordCountsByPage = {}
    charsByPage = {}
    ## count words in all pages ##
    for url in pages:
        tmp = re.sub("[\n\r]", "", pages[url]) # remove all line breaks
        tmp = re.sub("<\s*script.*?>.+?<\s*\/script.*?>", "", tmp) # remove all scripts
        tmp = re.sub("<\s*style.*?>.+?<\s*\/style.*?>", "", tmp) # remove all styles
        tmp = re.sub("&.+?;", "", tmp) # remove all html entities
        tmp = re.sub("<.+?>", "", tmp) # remove all html tags
        tmp = re.sub("\d", "", tmp) # remove all numbers
        words = re.findall("(\w+)", tmp) # split words
        words = [x.lower() for x in words] # all words to lower case
        words = [s for s in words if len(s) > 4 and len(s) <= 10] # keep medium-length words only
        wordCountsByPage[url] = countWords(words)
        # NOTE(review): "[A-za-z]" also matches [ \ ] ^ _ ` -- probably meant "[A-Za-z]"
        chars = re.findall("[A-za-z]", tmp); # find all characters
        chars = [x.lower() for x in chars] # all characters to lower case
        charsByPage[url] = chars
    ## calculate wordcount deltas and print double-content sites ##
    wordCountDeltas = {}
    for url1 in wordCountsByPage:
        for url2 in wordCountsByPage:
            if url1 == url2:
                continue
            if url1 not in wordCountDeltas:
                wordCountDeltas[url1] = {}
            if url2 in wordCountDeltas[url1]: # do it once only
                continue
            wordCounts1 = wordCountsByPage[url1]
            wordCounts2 = wordCountsByPage[url2]
            sum1 = sum(wordCounts1.values())
            if sum1 == 0:
                continue
            #print "calculating deltas of url1: " + url1 + " -- url2: " + url2
            deltaWordCounts = subtractDicts(wordCounts1, wordCounts2)
            # relative amount of words of url1 NOT covered by url2
            wordCountDeltas[url1][url2] = math.fabs(float(sum(deltaWordCounts.values())) / float(sum1))
            if 1 - wordCountDeltas[url1][url2] > threshold:
                #print url1 + " \t " + url2 + " \t " + str((1 - wordCountDeltas[url1][url2]) * 100)
                pass
    ## determine the sites' languages ##
    spanish = 'es'
    english = 'en'
    german = 'de'
    pageLanguages = {}
    # reference letter frequencies (in percent) per language
    lettersByLanguage = {}
    lettersByLanguage[spanish] = {
        'e' : 13.68,
        'a' : 12.53,
        'o' : 8.68,
        's' : 7.98,
        'r' : 6.87,
        'n' : 6.71,
        'i' : 6.25,
        'd' : 5.86,
        'l' : 4.97,
        'c' : 4.68,
        't' : 4.63,
        'u' : 3.93,
        'm' : 3.15,
        'p' : 2.51,
        'b' : 1.42,
        'g' : 1.01,
        'v' : 0.90,
        'y' : 0.90,
        'q' : 0.88,
        'h' : 0.70,
        'f' : 0.69,
        'z' : 0.52,
        'j' : 0.44,
        'x' : 0.21,
        'w' : 0.02,
        'k' : 0.01
    }
    lettersByLanguage[english] = {
        'e' : 12.70,
        't' : 9.06,
        'a' : 8.16,
        'o' : 7.50,
        'i' : 6.96,
        'n' : 6.74,
        's' : 6.32,
        'h' : 6.09,
        'r' : 5.99,
        'd' : 4.25,
        'l' : 4.03,
        'c' : 2.78,
        'u' : 2.76,
        'm' : 2.41,
        'w' : 2.36,
        'f' : 2.23,
        'g' : 2.02,
        'y' : 1.97,
        'p' : 1.93,
        'b' : 1.49,
        'v' : 0.98,
        'k' : 0.77,
        'j' : 0.15,
        'x' : 0.15,
        'q' : 0.10,
        'z' : 0.07
    }
    lettersByLanguage[german] = {
        'e' : 17.4,
        'n' : 9.78,
        'i' : 7.55,
        's' : 7.27,
        'r' : 7.00,
        'a' : 6.51,
        't' : 6.15,
        'd' : 5.08,
        'h' : 4.76,
        'u' : 4.35,
        'l' : 3.44,
        'c' : 3.06,
        'g' : 3.01,
        'o' : 2.59,
        'm' : 2.53,
        'b' : 1.89,
        'w' : 1.89,
        'f' : 1.66,
        'k' : 1.21,
        'z' : 1.13,
        'v' : 0.85,
        'p' : 0.67,
        'j' : 0.27,
        'y' : 0.04,
        'x' : 0.03,
        'q' : 0.02
    }
    # normalize maps so that each language's frequencies sum to 1
    normalizeMap(lettersByLanguage[spanish])
    normalizeMap(lettersByLanguage[english])
    normalizeMap(lettersByLanguage[german])
    languageCounts = {}
    for url in charsByPage:
        tokens = charsByPage[url]
        tokenCounts = dict(Counter(tokens))
        tokenSum = sum(tokenCounts.values())
        # Calculating the squared error
        rankings = {}
        matches = {}
        for token in tokenCounts:
            for key2 in lettersByLanguage:
                if token not in lettersByLanguage[key2]:
                    continue
                p = float(lettersByLanguage[key2][token]) * 100
                if p >= 0:
                    if key2 not in rankings:
                        rankings[key2] = 0
                        matches[key2] = 0
                    # calculate the squared error from observed and reference frequencies
                    rankings[key2] += math.pow(math.fabs(tokenCounts[token] * 100 / tokenSum - p), 2)
                    matches[key2] += 1
        # Resulting language has the minimal mean squared error
        minRanking = -1
        language = None
        for key in rankings:
            rankings[key] /= matches[key]
            if minRanking == -1 or rankings[key] < minRanking:
                minRanking = rankings[key]
                language = key
        if language != None:
            pageLanguages[url] = language
            if language not in languageCounts:
                languageCounts[language] = 1
            else:
                languageCounts[language] += 1
    print "\n \n ==== language distribution ===="
    print "Language \t Number of occurences"
    for lang in languageCounts:
        print lang + " \t " + str(languageCounts[lang])

View File

@ -0,0 +1,168 @@
# coding: utf-8
# Copyright (c) 2008-2011 Volvox Development Team
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# Author: Konstantin Lepa <konstantin.lepa@gmail.com>
"""ANSII Color formatting for output in terminal."""
from __future__ import print_function
import os
__ALL__ = [ 'colored', 'cprint' ]
VERSION = (1, 1, 0)
# Attribute name -> ANSI SGR code 1..8.  The '' placeholders occupy the
# unused codes and are deleted right after construction.
ATTRIBUTES = dict(
    list(zip([
        'bold',
        'dark',
        '',
        'underline',
        'blink',
        '',
        'reverse',
        'concealed'
        ],
        list(range(1, 9))
        ))
    )
del ATTRIBUTES['']
# Background color name -> ANSI SGR code 40..47.
HIGHLIGHTS = dict(
    list(zip([
        'on_grey',
        'on_red',
        'on_green',
        'on_yellow',
        'on_blue',
        'on_magenta',
        'on_cyan',
        'on_white'
        ],
        list(range(40, 48))
        ))
    )
# Foreground color name -> ANSI SGR code 30..37.
COLORS = dict(
    list(zip([
        'grey',
        'red',
        'green',
        'yellow',
        'blue',
        'magenta',
        'cyan',
        'white',
        ],
        list(range(30, 38))
        ))
    )
# Resets all colors/attributes back to the terminal default.
RESET = '\033[0m'
def colored(text, color=None, on_color=None, attrs=None):
    """Wrap *text* in ANSI escape sequences.

    Available text colors:
        red, green, yellow, blue, magenta, cyan, white.
    Available text highlights:
        on_red, on_green, on_yellow, on_blue, on_magenta, on_cyan, on_white.
    Available attributes:
        bold, dark, underline, blink, reverse, concealed.
    Coloring is skipped entirely when the ANSI_COLORS_DISABLED
    environment variable is set.

    Example:
        colored('Hello, World!', 'red', 'on_grey', ['blue', 'blink'])
        colored('Hello, World!', 'green')
    """
    if os.getenv('ANSI_COLORS_DISABLED') is not None:
        return text
    escape = '\033[%dm%s'
    if color is not None:
        text = escape % (COLORS[color], text)
    if on_color is not None:
        text = escape % (HIGHLIGHTS[on_color], text)
    for attr in (attrs or []):
        text = escape % (ATTRIBUTES[attr], text)
    return text + RESET
def cprint(text, color=None, on_color=None, attrs=None, **kwargs):
    """Print colorized *text*, forwarding remaining kwargs to print()."""
    print(colored(text, color, on_color, attrs), **kwargs)
if __name__ == '__main__':
    # Visual self-test: print samples of every color, highlight and attribute.
    print('Current terminal type: %s' % os.getenv('TERM'))
    print('Test basic colors:')
    cprint('Grey color', 'grey')
    cprint('Red color', 'red')
    cprint('Green color', 'green')
    cprint('Yellow color', 'yellow')
    cprint('Blue color', 'blue')
    cprint('Magenta color', 'magenta')
    cprint('Cyan color', 'cyan')
    cprint('White color', 'white')
    print(('-' * 78))
    print('Test highlights:')
    cprint('On grey color', on_color='on_grey')
    cprint('On red color', on_color='on_red')
    cprint('On green color', on_color='on_green')
    cprint('On yellow color', on_color='on_yellow')
    cprint('On blue color', on_color='on_blue')
    cprint('On magenta color', on_color='on_magenta')
    cprint('On cyan color', on_color='on_cyan')
    cprint('On white color', color='grey', on_color='on_white')
    print('-' * 78)
    print('Test attributes:')
    cprint('Bold grey color', 'grey', attrs=['bold'])
    cprint('Dark red color', 'red', attrs=['dark'])
    cprint('Underline green color', 'green', attrs=['underline'])
    cprint('Blink yellow color', 'yellow', attrs=['blink'])
    cprint('Reversed blue color', 'blue', attrs=['reverse'])
    cprint('Concealed Magenta color', 'magenta', attrs=['concealed'])
    cprint('Bold underline reverse cyan color', 'cyan',
            attrs=['bold', 'underline', 'reverse'])
    cprint('Dark blink concealed white color', 'white',
            attrs=['dark', 'blink', 'concealed'])
    print(('-' * 78))
    print('Test mixing:')
    cprint('Underline red on grey color', 'red', 'on_grey',
            ['underline'])
    cprint('Reversed green on red color', 'green', 'on_red', ['reverse'])

View File

@ -0,0 +1,5 @@
This is the keaddon add-on. It contains:
* A program (lib/main.js).
* A few tests.
* Some meager documentation.

View File

@ -0,0 +1,26 @@
var text = "";
var cleantext = "";
var paragraphs = document.getElementsByTagName('p');
var open = '<';
var close = '>';
for(var i=0; i<paragraphs.length; i++) {
text += paragraphs[i].innerHTML;
}
var doAppend = true;
var tmp = "";
for(var i=0; i<text.length; i++) {
tmp = text.charAt(i);
if( tmp == open ) {
doAppend = false;
}
if(doAppend) {
cleantext += tmp;
}
if( tmp == close ) {
doAppend = true;
}
}
//cleantext = unescape(cleantext);
postMessage(cleantext);

View File

@ -0,0 +1,3 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" width="900" height="600"><rect width="900" height="600" fill="#ED2939"/><rect width="600" height="600" fill="#fff"/><rect width="300" height="600" fill="#002395"/></svg>

After

Width:  |  Height:  |  Size: 378 B

View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns="http://www.w3.org/2000/svg" width="1000" height="600" viewBox="0 0 5 3">
<desc>Flag of Germany</desc>
<rect id="black_stripe" width="5" height="3" y="0" x="0" fill="#000"/>
<rect id="red_stripe" width="5" height="2" y="1" x="0" fill="#D00"/>
<rect id="gold_stripe" width="5" height="1" y="2" x="0" fill="#FFCE00"/>
</svg>

After

Width:  |  Height:  |  Size: 491 B

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 230 KiB

View File

@ -0,0 +1,10 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 60 30" width="1200" height="600">
<clipPath id="t">
<path d="M30,15 h30 v15 z v15 h-30 z h-30 v-15 z v-15 h30 z"/>
</clipPath>
<path d="M0,0 v30 h60 v-30 z" fill="#00247d"/>
<path d="M0,0 L60,30 M60,0 L0,30" stroke="#fff" stroke-width="6"/>
<path d="M0,0 L60,30 M60,0 L0,30" clip-path="url(#t)" stroke="#cf142b" stroke-width="4"/>
<path d="M30,0 v30 M0,15 h60" stroke="#fff" stroke-width="10"/>
<path d="M30,0 v30 M0,15 h60" stroke="#cf142b" stroke-width="6"/>
</svg>

After

Width:  |  Height:  |  Size: 521 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 760 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 835 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 674 B

View File

@ -0,0 +1,2 @@
The main module is a program that creates a widget. When a user clicks on
the widget, the program loads the mozilla.org website in a new tab.

View File

@ -0,0 +1,4 @@
// ISO 639-1 language codes shared by the add-on modules.
exports.german = 'de';
exports.french = 'fr';
exports.spanish = 'es';
exports.english = 'en';

View File

@ -0,0 +1,65 @@
// Add-on entry point: installs a page-mod that extracts each page's text,
// asks the student module for its language, and switches the toolbar
// widget icon to the matching flag.
var widgets = require("widget");
var pageMod = require("page-mod");
var student = require("student");
var data = require("self").data;
var workers = new Array(); // all attached content-script workers, for cleanup
var mod = null;            // the active PageMod, for cleanup
exports.main = function(options, callback) {
    // Attach the text-extracting content script to every page.
    mod = pageMod.PageMod(
        {
            include: "*",
            contentScriptWhen:"ready",
            contentScriptFile: data.url("./contentScripts/keworker.js"),
            onAttach: function onAttach(worker) {
                worker.on('message', handleMessage);
                workers.push(worker);
            }
        }
    );
    var widget = widgets.Widget(
        {
            id: "ke",
            label: "Knowledge Engineering",
            contentURL: data.url("keicon.png")
        }
    );
    // Receives the extracted page text and updates the widget icon.
    function handleMessage(message) {
        var lang = require("language");
        if(message.length > 0) {
            //TODO: Iconswitch
            var language = student.student(message);
            console.log(language);
            switch(language) {
                case lang.german:
                    widget.contentURL = data.url("./flag/de.png");
                    break;
                case lang.spanish:
                    widget.contentURL = data.url("./flag/es.png");
                    break;
                case lang.english:
                    widget.contentURL = data.url("./flag/en.png");
                    break;
                case lang.french:
                    widget.contentURL = data.url("./flag/fr.png");
                    break;
                default:
                    widget.contentURL = data.url("./keicon.png");
            }
            //TODO: response
        }
    }
    console.log("The add-on is running.");
}
// Destroy the page-mod and all workers when the add-on is disabled.
exports.onUnload = function(reason) {
    if(mod != null) {mod.destroy();}
    for(var i=0; i<workers.length; i++) {
        workers[i].destroy();
    }
}

View File

@ -0,0 +1,8 @@
var lang = require("language");
var util = require("utility");
// Placeholder classifier: always reports German regardless of the text.
// TODO: implement real language detection (utility helpers are ready).
function student(text) {
    return lang.german;
}
exports.student = student;

View File

@ -0,0 +1,82 @@
/*
Count equal array entries and return an object with one property per
unique entry; the value of each property is the number of occurrences.
Properties are inserted in order of decreasing count, so a construct like
    for(var key in obj) {
        console.log(obj[key]);
    }
visits the most frequent entries first.
*/
function countElements(array) {
    var counts = {};
    for (var i = 0; i < array.length; i++) {
        var entry = array[i];
        counts[entry] = counts.hasOwnProperty(entry) ? counts[entry] + 1 : 1;
    }
    var pairs = [];
    for (var key in counts) {
        pairs.push({key: key, value: counts[key]});
    }
    pairs.sort(comparePairs);
    var sorted = {};
    for (var j = 0; j < pairs.length; j++) {
        sorted[pairs[j].key] = pairs[j].value;
    }
    return sorted;
};
/* Comparator for {key, value} pairs: orders by value, descending. */
function comparePairs(a, b) {
    return -(a.value - b.value);
}
/*
Split a text into lowercase tokens at non-word characters, after first
removing common punctuation (periods, commas, quotes, slashes, pipes...).
*/
function tokenize(text) {
    var lowered = text.toLowerCase().replace(/(\.|,|!|\?|'|"|\\|\/|\|)/g, "");
    return lowered.split(/\W/g);
}
/*
Explode the given text into an array of its single characters.
*/
function toCharArray(text) {
    var chars = [];
    var n = text.length;
    for (var pos = 0; pos < n; pos++) {
        chars.push(text[pos]);
    }
    return chars;
}
/*
Build the list of adjacent character pairs (bigrams) of the given text.
Texts shorter than two characters yield an empty array.
*/
function toCharPairs(text) {
    var pairs = [];
    for (var i = 0; i + 1 < text.length; i++) {
        pairs.push(text[i] + text[i + 1]);
    }
    return pairs;
}
exports.countElements = countElements;
exports.tokenize = tokenize;
exports.toCharArray = toCharArray;
exports.toCharPairs = toCharPairs;

View File

@ -0,0 +1,10 @@
{
"name": "keaddon",
"license": "MPL 1.1/GPL 2.0/LGPL 2.1",
"author": "Clemens Dörrhöfer",
"version": "0.1",
"fullName": "keaddon",
"id": "jid0-GN3ivO79cgfs9k4P3lxdo7TPFa4",
"description": "a basic add-on",
"icon": "data/keicon.png"
}

View File

@ -0,0 +1,83 @@
// Jetpack / Add-on SDK unit tests for the keaddon modules.
const main = require("main");
const lang = require("language");
exports.test_test_run = function(test) {
    test.pass("Unit test running!");
};
exports.test_id = function(test) {
    test.assert(require("self").id.length > 0);
};
// Asynchronous test: a request to mozilla.org must answer with 200 OK.
exports.test_url = function(test) {
    require("request").Request({
        url: "http://www.mozilla.org/",
        onComplete: function(response) {
            test.assertEqual(response.statusText, "OK");
            test.done();
        }
    }).get();
    test.waitUntilDone(20000);
};
exports.test_open_tab = function(test) {
    const tabs = require("tabs");
    tabs.open({
        url: "http://www.mozilla.org/",
        onReady: function(tab) {
            test.assertEqual(tab.url, "http://www.mozilla.org/");
            test.done();
        }
    });
    test.waitUntilDone(20000);
};
// Set by the compare helpers below to describe the last mismatch.
var errormessage = "";
exports.test_util_countElements = function(test) {
    const util = require("utility");
    test.assert(compareObjects(util.countElements(["du", "du", "hallo", "hallo", "du"]),{"hallo":2, "du":3}),errormessage);
};
exports.test_util_toCharArray = function(test) {
    const util = require("utility");
    test.assert(compareArrays(util.toCharArray("test"), ["t","e","s","t"]), errormessage);
};
exports.test_util_toCharPairs = function(test) {
    const util = require("utility");
    test.assert(compareArrays(util.toCharPairs("mainz"),["ma", "ai", "in", "nz"]), errormessage);
};
exports.test_util_tokenize = function(test) {
    const util = require("utility");
    test.assert(compareArrays(util.tokenize("Dem Igel geht's gut."),["dem","igel","gehts","gut"]), errormessage);
};
exports.test_student_student = function(test) {
    const student = require("student");
    var text = "blubber";
    test.assertEqual(student.student(text), lang.german, "Geht nicht weil.");
};
// Shallow comparison: every property of a must equal the same property of b.
// NOTE(review): one-sided -- extra keys present only in b are not detected.
function compareObjects(a,b) {
    for(var key in a) {
        if( a[key] != b[key] ) {
            return false;
        }
    }
    return true;
};
// Element-wise array comparison; records a reason in errormessage on failure.
function compareArrays(a,b) {
    if (a.length != b.length) {
        errormessage = "Arrays of unequal size";
        return false
    }
    for(var i=0; i<a.length; i++) {
        if (a[i] != b[i]) {
            errormessage = a[i] + " != " + b[i];
            return false;
        }
    }
    return true;
};

View File

@ -34,7 +34,7 @@ entrypoint = "http://www.ke.tu-darmstadt.de/lehre/arbeiten"
#entrypoint = "http://www.red2000.com/spain/1index.html" # spanish website
#entrypoint = "https://code.google.com/p/feedparser/issues/attachmentText?id=226&aid=-1296926914212963541&name=demonstrate_issue_226.xml&token=CHtgpTsdPmWnNsvScD0yfMuBriU%3A1368702558154"
numberOfPagesToCrawl = 1000
timeBetweenSameHost = 2 # 5 sec
timeBetweenSameHost = 0 # 5 sec
visitOnlyTuSites = True;
@ -247,27 +247,52 @@ if __name__ == "__main__":
## print table number hyperlinks per website ##
print "\n \n ==== numberHyperlink ===="
print "#Hyperlinks \t Website"
keys = numberHyperlink.keys()
keys.sort( lambda x,y: cmp(numberHyperlink[y], numberHyperlink[x]) ) # sort keys
i = 0
for u in keys:
pass
if i < 50:
print str(numberHyperlink[u])+"\t \t \t"+u
i += 1
linkCount1 = {}
for u in numberHyperlink.values():
if u not in linkCount1:
linkCount1[u] = 1
else:
linkCount1[u] += 1
xValues1 = []
yValues1 = []
for u in linkCount1:
xValues1.append(u)
yValues1.append(linkCount1[u])
plt.plot(xValues1, yValues1)
plt.xlabel('Haeufigkeiten des Auftretens')
plt.ylabel('Anzahl der URLs')
plt.show()
## print table number hyperlinks to page ##
print "\n \n ==== numberHyperlinksPerPage ===="
print "#HyperlinksToPage \t Website"
keys = numberHyperlinksPerPage.keys()
keys.sort( lambda x,y: cmp(numberHyperlinksPerPage[y], numberHyperlinksPerPage[x]) ) # sort keys
i = 0
for u in keys:
pass
if i < 50:
print str(numberHyperlinksPerPage[u])+"\t \t \t"+u
i += 1
print "\n \n ==== Anzahl URLs pro Seite ===="
print "#Anzahl URLs pro Seite"
linkCount2 = {}
for u in numberHyperlinksPerPage.values():
if u not in linkCount2:
linkCount2[u] = 1
else:
linkCount2[u] += 1
xValues2 = []
yValues2 = []
for u in linkCount2:
xValues2.append(u)
yValues2.append(linkCount2[u])
'''plt.plot(xValues2, yValues2)
plt.xlabel('Anzahl der Hyperlinks pro Seite')
plt.ylabel('Anzahl der URLs')
#plt.xscale('log')
#plt.yscale('log')
plt.show()'''
print "\n \n ==== url queue ===="
for u in extractor.urls:

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

View File

@ -47,26 +47,38 @@
\@writefile{nav}{\headcommand {\beamer@subsectionpages {2}{8}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{9}{9/9}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {9}{9}}}
\@writefile{toc}{\beamer@sectionintoc {3}{2. Aufgabe}{10}{0}{3}}
\@writefile{nav}{\headcommand {\sectionentry {3}{2. Aufgabe}{10}{2. Aufgabe}{0}}}
\@writefile{nav}{\headcommand {\beamer@sectionpages {9}{9}}}
\@writefile{nav}{\headcommand {\beamer@subsectionpages {9}{9}}}
\@writefile{nav}{\headcommand {\slideentry {3}{0}{10}{10/10}{}{0}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{10}{10/10}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {10}{10}}}
\@writefile{toc}{\beamer@sectionintoc {4}{2. Aufgabe}{11}{0}{4}}
\@writefile{nav}{\headcommand {\sectionentry {4}{2. Aufgabe}{11}{2. Aufgabe}{0}}}
\@writefile{nav}{\headcommand {\beamer@sectionpages {10}{10}}}
\@writefile{nav}{\headcommand {\beamer@subsectionpages {10}{10}}}
\@writefile{nav}{\headcommand {\slideentry {4}{0}{11}{11/11}{}{0}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{11}{11/11}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {11}{11}}}
\@writefile{nav}{\headcommand {\slideentry {4}{0}{12}{12/12}{}{0}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{12}{12/12}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {12}{12}}}
\@writefile{nav}{\headcommand {\slideentry {4}{0}{13}{13/13}{}{0}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{13}{13/13}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {13}{13}}}
\@writefile{nav}{\headcommand {\slideentry {4}{0}{14}{14/14}{}{0}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{14}{14/14}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {14}{14}}}
\@writefile{nav}{\headcommand {\beamer@partpages {1}{14}}}
\@writefile{nav}{\headcommand {\beamer@subsectionpages {11}{14}}}
\@writefile{nav}{\headcommand {\beamer@sectionpages {11}{14}}}
\@writefile{nav}{\headcommand {\beamer@documentpages {14}}}
\@writefile{nav}{\headcommand {\def \inserttotalframenumber {14}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{15}{15/15}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {15}{15}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{16}{16/16}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {16}{16}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{17}{17/17}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {17}{17}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{18}{18/18}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {18}{18}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{19}{19/19}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {19}{19}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{20}{20/20}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {20}{20}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{21}{21/21}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {21}{21}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{22}{22/22}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {22}{22}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{23}{23/23}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {23}{23}}}
\@writefile{nav}{\headcommand {\slideentry {2}{0}{24}{24/24}{}{0}}}
\@writefile{nav}{\headcommand {\beamer@framepages {24}{24}}}
\@writefile{nav}{\headcommand {\beamer@partpages {1}{24}}}
\@writefile{nav}{\headcommand {\beamer@subsectionpages {9}{24}}}
\@writefile{nav}{\headcommand {\beamer@sectionpages {9}{24}}}
\@writefile{nav}{\headcommand {\beamer@documentpages {24}}}
\@writefile{nav}{\headcommand {\def \inserttotalframenumber {24}}}

View File

@ -1,4 +1,4 @@
This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) (format=pdflatex 2011.7.3) 19 MAY 2013 18:14
This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) (format=pdflatex 2011.7.3) 19 MAY 2013 22:19
entering extended mode
restricted \write18 enabled.
%&-line parsing enabled.
@ -1455,7 +1455,7 @@ Underfull \hbox (badness 10000) has occurred while \output is active
[2
] <grafiken/a1_abb1.png, id=47, 330.23375pt x 531.9875pt>
] <grafiken/a1_abb1.png, id=39, 330.23375pt x 531.9875pt>
File: grafiken/a1_abb1.png Graphic file (type png)
<use grafiken/a1_abb1.png>
@ -1481,7 +1481,7 @@ Underfull \hbox (badness 10000) has occurred while \output is active
[4
] <grafiken/a1_abb2.png, id=58, 614.295pt x 131.49126pt>
] <grafiken/a1_abb2.png, id=50, 614.295pt x 131.49126pt>
File: grafiken/a1_abb2.png Graphic file (type png)
<use grafiken/a1_abb2.png>
@ -1550,20 +1550,172 @@ Underfull \hbox (badness 10000) has occurred while \output is active
[12
]
] <grafiken/a2_spiegel_1.png, id=96, 578.16pt x 433.62pt>
File: grafiken/a2_spiegel_1.png Graphic file (type png)
<use grafiken/a2_spiegel_1.png>
Package pdftex.def Info: grafiken/a2_spiegel_1.png used on input line 171.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 171
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[13
]
<./grafiken/a2_spiegel_1.png>]
<grafiken/a2_spiegel_2.png, id=105, 578.16pt x 433.62pt>
File: grafiken/a2_spiegel_2.png Graphic file (type png)
<use grafiken/a2_spiegel_2.png>
Package pdftex.def Info: grafiken/a2_spiegel_2.png used on input line 180.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 180
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[14
<./grafiken/a2_spiegel_2.png>]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[15
] <grafiken/a2_spiegel_3.png, id=117, 578.16pt x 433.62pt>
File: grafiken/a2_spiegel_3.png Graphic file (type png)
<use grafiken/a2_spiegel_3.png>
Package pdftex.def Info: grafiken/a2_spiegel_3.png used on input line 196.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 196
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[16
<./grafiken/a2_spiegel_3.png>]
<grafiken/a2_spiegel_4.png, id=124, 578.16pt x 433.62pt>
File: grafiken/a2_spiegel_4.png Graphic file (type png)
<use grafiken/a2_spiegel_4.png>
Package pdftex.def Info: grafiken/a2_spiegel_4.png used on input line 205.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 205
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[17
<./grafiken/a2_spiegel_4.png>]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[18
]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[19
] <grafiken/a2_tu_1.png, id=142, 578.16pt x 433.62pt>
File: grafiken/a2_tu_1.png Graphic file (type png)
<use grafiken/a2_tu_1.png>
Package pdftex.def Info: grafiken/a2_tu_1.png used on input line 236.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 236
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[20
<./grafiken/a2_tu_1.png>]
<grafiken/a2_tu_2.png, id=149, 578.16pt x 433.62pt>
File: grafiken/a2_tu_2.png Graphic file (type png)
<use grafiken/a2_tu_2.png>
Package pdftex.def Info: grafiken/a2_tu_2.png used on input line 245.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 245
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[21
<./grafiken/a2_tu_2.png>]
<grafiken/a2_tu_3.png, id=156, 578.16pt x 433.62pt>
File: grafiken/a2_tu_3.png Graphic file (type png)
<use grafiken/a2_tu_3.png>
Package pdftex.def Info: grafiken/a2_tu_3.png used on input line 254.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 254
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[22
<./grafiken/a2_tu_3.png>]
<grafiken/a2_tu_4.png, id=163, 578.16pt x 433.62pt>
File: grafiken/a2_tu_4.png Graphic file (type png)
<use grafiken/a2_tu_4.png>
Package pdftex.def Info: grafiken/a2_tu_4.png used on input line 263.
(pdftex.def) Requested size: 208.65793pt x 156.49014pt.
Overfull \vbox (22.25151pt too high) detected at line 263
[]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[23
<./grafiken/a2_tu_4.png>]
Underfull \hbox (badness 10000) has occurred while \output is active
[]
[24
]
\tf@nav=\write7
\openout7 = `solution.nav'.
@ -1574,20 +1726,20 @@ Underfull \hbox (badness 10000) has occurred while \output is active
\tf@snm=\write9
\openout9 = `solution.snm'.
Package atveryend Info: Empty hook `BeforeClearDocument' on input line 172.
Package atveryend Info: Empty hook `AfterLastShipout' on input line 172.
Package atveryend Info: Empty hook `BeforeClearDocument' on input line 278.
Package atveryend Info: Empty hook `AfterLastShipout' on input line 278.
(./solution.aux)
Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 172.
Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 172.
Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 278.
Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 278.
Package rerunfilecheck Info: File `solution.out' has not changed.
(rerunfilecheck) Checksum: 88D911AA5795ABD0722131B6C5D24A75;180.
Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 172.
(rerunfilecheck) Checksum: AE5CCE897D490A137427F55C345E5A34;90.
Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 278.
)
Here is how much of TeX's memory you used:
17220 strings out of 493633
315045 string characters out of 3143378
391807 words of memory out of 3000000
20016 multiletter control sequences out of 15000+200000
17292 strings out of 493633
316299 string characters out of 3143378
391806 words of memory out of 3000000
20062 multiletter control sequences out of 15000+200000
39628 words of font info for 52 fonts, out of 3000000 for 9000
831 hyphenation exceptions out of 8191
55i,20n,79p,425b,533s stack positions out of 5000i,500n,10000p,200000b,50000s
@ -1601,10 +1753,10 @@ texlive/2011/texmf-dist/fonts/type1/urw/helvetic/uhvb8a.pfb></usr/local/texlive
/2011/texmf-dist/fonts/type1/urw/helvetic/uhvbo8a.pfb></usr/local/texlive/2011/
texmf-dist/fonts/type1/urw/helvetic/uhvr8a.pfb></usr/local/texlive/2011/texmf-d
ist/fonts/type1/urw/helvetic/uhvro8a.pfb>
Output written on solution.pdf (14 pages, 146011 bytes).
Output written on solution.pdf (24 pages, 351323 bytes).
PDF statistics:
156 PDF objects out of 1000 (max. 8388607)
122 compressed objects within 2 object streams
33 named destinations out of 1000 (max. 500000)
80 words of extra memory for PDF output out of 10000 (max. 10000000)
221 PDF objects out of 1000 (max. 8388607)
161 compressed objects within 2 object streams
51 named destinations out of 1000 (max. 500000)
104 words of extra memory for PDF output out of 10000 (max. 10000000)

View File

@ -23,24 +23,38 @@
\headcommand {\beamer@subsectionpages {2}{8}}
\headcommand {\slideentry {2}{0}{9}{9/9}{}{0}}
\headcommand {\beamer@framepages {9}{9}}
\headcommand {\sectionentry {3}{2. Aufgabe}{10}{2. Aufgabe}{0}}
\headcommand {\beamer@sectionpages {9}{9}}
\headcommand {\beamer@subsectionpages {9}{9}}
\headcommand {\slideentry {3}{0}{10}{10/10}{}{0}}
\headcommand {\slideentry {2}{0}{10}{10/10}{}{0}}
\headcommand {\beamer@framepages {10}{10}}
\headcommand {\sectionentry {4}{2. Aufgabe}{11}{2. Aufgabe}{0}}
\headcommand {\beamer@sectionpages {10}{10}}
\headcommand {\beamer@subsectionpages {10}{10}}
\headcommand {\slideentry {4}{0}{11}{11/11}{}{0}}
\headcommand {\slideentry {2}{0}{11}{11/11}{}{0}}
\headcommand {\beamer@framepages {11}{11}}
\headcommand {\slideentry {4}{0}{12}{12/12}{}{0}}
\headcommand {\slideentry {2}{0}{12}{12/12}{}{0}}
\headcommand {\beamer@framepages {12}{12}}
\headcommand {\slideentry {4}{0}{13}{13/13}{}{0}}
\headcommand {\slideentry {2}{0}{13}{13/13}{}{0}}
\headcommand {\beamer@framepages {13}{13}}
\headcommand {\slideentry {4}{0}{14}{14/14}{}{0}}
\headcommand {\slideentry {2}{0}{14}{14/14}{}{0}}
\headcommand {\beamer@framepages {14}{14}}
\headcommand {\beamer@partpages {1}{14}}
\headcommand {\beamer@subsectionpages {11}{14}}
\headcommand {\beamer@sectionpages {11}{14}}
\headcommand {\beamer@documentpages {14}}
\headcommand {\def \inserttotalframenumber {14}}
\headcommand {\slideentry {2}{0}{15}{15/15}{}{0}}
\headcommand {\beamer@framepages {15}{15}}
\headcommand {\slideentry {2}{0}{16}{16/16}{}{0}}
\headcommand {\beamer@framepages {16}{16}}
\headcommand {\slideentry {2}{0}{17}{17/17}{}{0}}
\headcommand {\beamer@framepages {17}{17}}
\headcommand {\slideentry {2}{0}{18}{18/18}{}{0}}
\headcommand {\beamer@framepages {18}{18}}
\headcommand {\slideentry {2}{0}{19}{19/19}{}{0}}
\headcommand {\beamer@framepages {19}{19}}
\headcommand {\slideentry {2}{0}{20}{20/20}{}{0}}
\headcommand {\beamer@framepages {20}{20}}
\headcommand {\slideentry {2}{0}{21}{21/21}{}{0}}
\headcommand {\beamer@framepages {21}{21}}
\headcommand {\slideentry {2}{0}{22}{22/22}{}{0}}
\headcommand {\beamer@framepages {22}{22}}
\headcommand {\slideentry {2}{0}{23}{23/23}{}{0}}
\headcommand {\beamer@framepages {23}{23}}
\headcommand {\slideentry {2}{0}{24}{24/24}{}{0}}
\headcommand {\beamer@framepages {24}{24}}
\headcommand {\beamer@partpages {1}{24}}
\headcommand {\beamer@subsectionpages {9}{24}}
\headcommand {\beamer@sectionpages {9}{24}}
\headcommand {\beamer@documentpages {24}}
\headcommand {\def \inserttotalframenumber {24}}

View File

@ -1,4 +1,2 @@
\BOOKMARK [2][]{Outline0.1}{1. Aufgabe}{}% 1
\BOOKMARK [2][]{Outline0.2}{2. Aufgabe}{}% 2
\BOOKMARK [2][]{Outline0.3}{2. Aufgabe}{}% 3
\BOOKMARK [2][]{Outline0.4}{2. Aufgabe}{}% 4

View File

@ -80,7 +80,7 @@ $p_{expected(lp, lang) \approx \frac{1}{i * ln(1,78 * N)}}$
\end{frame}
\begin{frame}
\frametitle{1. Aufgabe \\ Firefox-Plugin}
\frametitle{1. Aufgabe \\ Firefox-Plugin}
\begin{itemize}
\item Häufigkeiten der Buchstaben bzw. Buchstabenpaare ($n_{text(l)}$) relativ zur Gesamtanzahl ($n_{text}$): \\
\begin{center}
@ -117,7 +117,7 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{
% % % % % % % % % % % % % % % % % % % % % % %% % % % % % % % % % % % % % % % % % % % % % %% % % % % % % % % % % % % % % % % % % % % % % %
\section{2. Aufgabe}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{Verfahren zur Erkennung von Duplikaten:} \\
\textbf{1.} Alle Wörter mit einer Länge von 4 und kleiner 11 werden von der Webseite extrahiert.
\begin{itemize}
@ -127,9 +127,9 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{
\end{itemize}
\end{frame}
\section{2. Aufgabe}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{2.} Eine Zuweisung von Wörtern zu deren Auftrittshäufigkeit wird angefertigt \\
\textbf{3.} Für alle paarweise verschiedenen Seiten werden die Auftrittshäufigkeiten subtrahiert, so dass deren Ergebnis minimal null ergibt. Zudem werden alle Wörter, die auf einer, aber nicht auf der anderen Seite vorkommen, ebenfalls der anderen Seite zugewiesen
\begin{itemize}
@ -137,29 +137,135 @@ $MSE(lang) = \frac{\sum_{l}(\tilde{p}_{text}(l) - p_{expected(lp, lang)})^2}{n_{
\end{itemize}
\end{frame}
\section{2. Aufgabe}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{4.} Anschließend wird die resultierende Gesamtzahl an Wortvorkommnissen durch die Anzahl der Wortvorkommnisse vor der Subtraktion geteilt. Fällt dieser Wert unter eine definierte Grenze, gilt die Seite als Duplikat. \\
\begin{itemize}
\item Im Code ist anstatt einer Untergrenze eine Obergrenze von 90\% angegeben, die Berechnung wurde also umgekehrt, so dass hohe Werte eine hohe Duplikatswahrscheinlichkeit implizieren.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
Histogramm über die Anzahl der URLs pro Seite (wie beim ersten Übungsblatt mit Worthäufigkeiten, auch logarithmisch)
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{Startseite:} \\
http://www.spiegel.de/ \\
Es wurden 1000 Seiten besucht. \\
\vspace{1cm}
\textbf{Erkannte Sprachen:}
\begin{center}
de $\to$ 623 \\
en $\to$ 246 \\
es $\to$ 131 \\
\end{center}
\end{frame}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
Histogramm mit den Häufigkeiten des Auftretens von Hyperlinks, d.h. wie viele Links treten 1-mal, 2-mal, ... auftreten ...
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_1.png}
\caption{Anzahl URLs pro Seite}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_2.png}
\caption{Anzahl URLs pro Seite (logarithmisch)}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{itemize}
\item Viele Internetseiten verweisen auf wenige andere Internetseiten
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_3.png}
\caption{Häufigkeiten des Auftretens von Hyperlinks}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_spiegel_4.png}
\caption{Häufigkeiten des Auftretens von Hyperlinks (logarithmisch)}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{itemize}
\item Es gibt nur wenige Internetseiten, die oft referenziert werden.
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{Startseite:} \\
http://www.ke.tu-darmstadt.de/lehre/arbeiten \\
Der Crawler hat nur Seiten innerhalb der TU Darmstadt der Form *.tu-darmstadt.de besucht.
Es wurden 1000 Seiten besucht. \\
\vspace{1cm}
\textbf{Erkannte Sprachen:}
\begin{center}
de $\to$ 329 \\
en $\to$ 576 \\
es $\to$ 95 \\
\end{center}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_1.png}
\caption{Anzahl URLs pro Seite}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_2.png}
\caption{Anzahl URLs pro Seite (logarithmisch)}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_3.png}
\caption{Häufigkeiten des Auftretens von Hyperlinks}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\begin{figure}
\noindent\includegraphics[height=5.5cm,keepaspectratio]{grafiken/a2_tu_4.png}
\caption{Häufigkeiten des Auftretens von Hyperlinks (logarithmisch)}
\end{figure}
\end{frame}
\begin{frame}
\frametitle{2. Aufgabe \\ Crawler}
\frametitle{2. und 3. Aufgabe \\ Crawler}
\textbf{Erfahrungen und Probleme:}
\begin{itemize}
\item Findet man einen Onlineshop, so wird die Queue mit sehr vielen Links dieses Shops gefüllt und der Crawler besucht mit sehr hoher Wahrscheinlichkeit nur noch URLs innerhalb des Shops.

View File

@ -2,5 +2,3 @@
\select@language {ngerman}
\beamer@sectionintoc {1}{1. Aufgabe}{2}{0}{1}
\beamer@sectionintoc {2}{2. Aufgabe}{9}{0}{2}
\beamer@sectionintoc {3}{2. Aufgabe}{10}{0}{3}
\beamer@sectionintoc {4}{2. Aufgabe}{11}{0}{4}

View File

@ -0,0 +1,379 @@
{\rtf1\ansi\ansicpg1252\cocoartf1187\cocoasubrtf370
{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
{\colortbl;\red255\green255\blue255;}
\paperw11900\paperh16840\margl1440\margr1440\vieww16900\viewh8400\viewkind0
\pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401\tx3968\tx4535\tx5102\tx5669\tx6236\tx6803\pardirnatural
\f0\b\fs24 \cf0 entrypoint: google.de:
\b0 \
\
==== robots.txt ====\
prohibit by robots.txt: 172\
\
\
==== numberHyperlink ====\
#Hyperlinks Website\
19 http://www.blogger.com/profile/05109496878476775729\
19 http://www.google.de/history/optout?hl=de\
18 http://www.google.de/intl/de/options/\
8 http://www.vovone.com/company/profile/\
8 http://www.vovone.com/more/solutions/\
8 http://www.vovone.com/company/partners/\
7 http://www.google.de/intl/de/policies/privacy/\
7 http://rocketsandsuch.blogspot.de/2009_08_01_archive.html\
7 http://www.vovone.com/more/ssl-certificates/ssl-certificate-validation/\
7 http://www.google.de/webhp?hl=de&tab=iw\
7 http://www.vovone.com/company/ask-for-more/\
6 http://www.vovone.com/domain-names/redirect-domain-name/\
6 http://voice.google.com\
6 http://www.vovone.com/support/f-a-q/\
6 http://www.vovone.com/domain-names/domain-termination/\
6 http://www.vovone.com/support/documentation/\
6 http://www.vovone.com/company/careers/\
5 http://www.vovone.com/discounts-offers/\
5 http://www.google.com/press/blogs/directory.html#tab0\
5 http://rocketsandsuch.blogspot.de/2008_03_01_archive.html\
5 http://rocketsandsuch.blogspot.de/2009_09_01_archive.html\
5 http://www.vovone.com/servers/\
5 http://rocketsandsuch.blogspot.de/2009_01_01_archive.html\
5 http://www.vovone.com\
5 http://fusion.google.com/add?source=atgs&feedurl=http%3a//feeds.feedburner.com/googleappsupdates\
5 http://www.vovone.com/more/solutions/service-level-agreements/\
5 http://www.vovone.com/support/\
5 http://www.vovone.com/servers/managed-servers/\
5 http://rocketsandsuch.blogspot.de/2008_10_01_archive.html\
5 http://rocketsandsuch.blogspot.de/2009_10_01_archive.html\
5 http://feedburner.google.com/fb/a/mailverify?uri=googleappsupdates&loc=en_us\
5 http://www.vovone.com/more/reseller-plans/affiliate-plan/\
5 http://www.vovone.com/more/ssl-certificates/ssl-certificate-type/\
5 http://blog.chromium.org/\
5 http://www.vovone.com/company/conditions/notice-and-take-down/\
5 http://www.vovone.com/more/ssl-certificates/ssl-certificates-brand/\
4 http://www.vovone.com/more/colocation/private-rackspace/\
4 http://www.vovone.com/more/ssl-certificates/\
4 http://www.google.de/setprefdomain?prefdom=us&sig=0_h0pay1e5n4pq04s4m5soth6xqlk%3d\
4 http://www.vovone.com/company/technology/security/\
4 http://rocketsandsuch.blogspot.de/search?updated-min=2007-01-01t00:00:00-08:00&updated-max=2008-01-01t00:00:00-08:00&max-results=50\
4 http://www.google.de/setprefdomain?prefdom=us&sig=0_bbxqe3gzyewbwv2egvfk2cujk3w%3d\
4 http://www.vovone.com/more/\
4 http://www.vovone.com/web-hosting/special-plans/special-plans-magento-hosting/\
4 http://www.vovone.com/more/colocation/shared-rackspace/\
4 http://www.vovone.com/company/conditions/\
4 http://www.vovone.com/more/solutions/managed-services/\
4 http://mail.google.com\
4 http://rocketsandsuch.blogspot.de/2008/10/hubble-bubble-toil-and-trouble.html\
4 http://www.vovone.com/servers/vps/vps-plan-8192/\
\
\
==== numberHyperlinksPerPage ====\
#HyperlinksToPage Website\
9088 javascript:void(0)\
1867 #\
898 javascript:;\
522 http://www.blogger.com/profile/05109496878476775729\
392 http://www.vovone.com\
348 /\
347 http://www.blogger.com/profile/09046869427384152063\
317 \
301 http://www.vovone.com/support/\
298 https://my.vovone.com\
295 http://www.vovone.com/company/careers/\
272 http://feedburner.google.com/fb/a/mailverify?uri=GoogleAppsUpdates&loc=en_US\
270 http://fusion.google.com/add?source=atgs&feedurl=http%3A//feeds.feedburner.com/GoogleAppsUpdates\
256 the-button-element.html#concept-fe-value\
242 http://www.blogger.com/profile/06992649719432295652\
221 http://www.vovone.com/servers/\
220 the-input-element.html#the-input-element\
216 http://www.vovone.com/company/\
206 http://www.vovone.com/web-hosting/\
206 http://www.vovone.com/more/colocation/\
205 http://www.vovone.com/more/ssl-certificates/\
205 http://www.vovone.com/servers/dedicated-servers/\
204 http://www.vovone.com/more/colocation/private-rackspace/\
203 http://www.vovone.com/more/solutions/\
203 http://www.vovone.com/company/technology/\
203 http://www.vovone.com/servers/managed-servers/\
203 http://www.vovone.com/domain-names/\
202 http://www.vovone.com/voip-services/\
202 http://www.vovone.com/company/conditions/\
201 http://www.vovone.com/more/reseller-plans/\
201 http://www.vovone.com/voip-services/cloud-voip/\
200 http://www.vovone.com/company/promise/\
200 http://www.vovone.com/voip-services/voip-accounts/\
200 http://www.vovone.com/domain-names/domain-termination/\
200 http://www.vovone.com/more/ssl-certificates/ssl-certificate-type/\
200 http://www.vovone.com/domain-names/transfer-domain-name/\
200 http://www.vovone.com/company/profile/\
199 http://www.vovone.com/more/solutions/service-level-agreements/\
199 http://www.vovone.com/more/solutions/managed-services/\
199 http://www.vovone.com/support/documentation/\
199 http://www.vovone.com/voip-services/business-voip/\
199 http://www.vovone.com/more/ssl-certificates/ssl-certificate-validation/\
199 http://www.vovone.com/more/ssl-certificates/ssl-certificates-brand/\
199 http://www.vovone.com/more/colocation/shared-rackspace/\
199 http://www.vovone.com/more/reseller-plans/affiliate-plan/\
199 http://www.vovone.com/support/f-a-q/\
198 http://www.vovone.com/support/support-desk/\
198 http://www.vovone.com/voip-services/wholesale-voip/\
197 http://www.vovone.com/domain-names/redirect-domain-name/\
197 http://www.vovone.com/company/press/\
\
\
\
\b entrypoint http://www.ke.tu-darmstadt.de/lehre/arbeiten:
\b0 \
\
==== robots.txt ====\
prohibit by robots.txt: 4\
\
\
==== numberHyperlink ====\
#Hyperlinks Website\
405 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1625\
120 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1677\
107 http://www.tu-darmstadt.de\
77 http://www.informatik.tu-darmstadt.de\
71 http://www.ke.tu-darmstadt.de\
61 http://www.ke.tu-darmstadt.de/bibtex/authors/show/875\
46 http://www.ke.tu-darmstadt.de/lehre\
46 http://www.ke.tu-darmstadt.de/news\
41 http://www.ke.tu-darmstadt.de/bibtex/authors/show/708\
41 http://www.ke.tu-darmstadt.de/bibtex/search\
40 http://www.ke.tu-darmstadt.de/de/studierende/studienbuero/ansprechpartner-studienbuero/\
40 http://www.ke.tu-darmstadt.de/bibtex/export\
39 http://www.informatik.tu-darmstadt.de/de/aktuelles/veranstaltungentermine/\
39 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#proceedings\
38 http://www.ke.tu-darmstadt.de/de/intern/index/\
38 http://www.ke.tu-darmstadt.de/de/studierende/studiendekanat/ansprechpartner/\
37 http://www.ke.tu-darmstadt.de/bibtex/publications\
37 http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/vortrag-ueber-fitweltweit-programm-des-daad-1/\
36 http://www.ke.tu-darmstadt.de/resources\
36 http://www.ke.tu-darmstadt.de/bibtex/topics/single/77\
36 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1849\
35 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type\
34 http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/smarte-spione/\
34 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/recent\
33 http://www.ke.tu-darmstadt.de/de/fachbereich/dekanat/\
33 http://www.ke.tu-darmstadt.de/de/fachbereich/bilder/absolventenfeier-november-2012/begruessung/\
33 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#inproceedings\
33 http://www.ke.tu-darmstadt.de/research\
31 http://www.ke.tu-darmstadt.de/de/forschung/netzwerkpartner/\
29 http://www.ke.tu-darmstadt.de/de/aktuelles/newsletter-an-und-abmeldung/\
29 http://www.ke.tu-darmstadt.de/bibtex/authors/show/702\
29 http://www.ke.tu-darmstadt.de/projects\
29 http://www.ke.tu-darmstadt.de/bibtex/topics/single/33\
29 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#incollection\
28 http://www.informatik.tu-darmstadt.de/de/aktuelles/neuigkeiten/neuigkeiten/artikel/eine-kultur-der-privatsphaere-im-internet/\
28 http://www.ke.tu-darmstadt.de/bibtex/topics\
28 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#book\
27 http://www.ke.tu-darmstadt.de/de/aktuelles/neuigkeiten/\
26 http://www.ke.tu-darmstadt.de/bibtex/authors/show/3036\
25 http://www.ke.tu-darmstadt.de/bibtex/authors/show/2370\
24 http://www.ke.tu-darmstadt.de/de/aktuelles/preise-und-auszeichnungen/\
24 http://www.ke.tu-darmstadt.de/staff\
24 http://www.ke.tu-darmstadt.de/impressum\
24 http://www.ke.tu-darmstadt.de/de/studierende/news-fuer-studierende/\
24 http://www.ke.tu-darmstadt.de/publications\
23 http://www.ke.tu-darmstadt.de/bibtex/authors/show/2365\
23 http://www.ke.tu-darmstadt.de/termine\
23 http://www.ke.tu-darmstadt.de/de/ehemalige/alumni-portal-der-tu-darmstadt/\
23 http://www.ke.tu-darmstadt.de/de/ehemalige/\
22 http://www.tu-darmstadt.de/\
\
\
==== numberHyperlinksPerPage ====\
#HyperlinksToPage Website\
3528 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1625\
915 http://www.tu-darmstadt.de\
904 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1677\
635 de/aktuelles/neuigkeiten/\
577 de/fachbereich/dekanat/\
575 de/fachbereich/bilder/absolventenfeier-november-2012/begruessung/\
528 http://www.informatik.tu-darmstadt.de\
499 http://www.ke.tu-darmstadt.de\
490 de/aktuelles/newsletter-an-und-abmeldung/\
482 de/forschung/netzwerkpartner/\
481 http://www.ke.tu-darmstadt.de/bibtex/topics/single/33\
474 de/studierende/studiendekanat/ansprechpartner/\
468 de/studierende/studienbuero/ansprechpartner-studienbuero/\
452 de/intern/index/\
450 http://www.ke.tu-darmstadt.de/bibtex/authors/show/875\
444 http://www.ke.tu-darmstadt.de/bibtex/topics/single/77\
434 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/recent\
434 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type\
433 javascript:this.print()\
429 javascript:fontsize('reset')\
429 javascript:fontsize('inkrement')\
429 javascript:fontsize('dekrement')\
424 http://www.ke.tu-darmstadt.de/bibtex/search\
424 http://www.ke.tu-darmstadt.de/bibtex/topics\
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Proceedings\
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Book\
424 http://www.ke.tu-darmstadt.de/bibtex/publications\
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Inproceedings\
424 http://www.ke.tu-darmstadt.de/bibtex/publications/showlist/type#Incollection\
424 http://www.ke.tu-darmstadt.de/bibtex/export\
412 de/aktuelles/neuigkeiten/neuigkeiten/artikel/smarte-spione/\
408 de/aktuelles/neuigkeiten/neuigkeiten/artikel/eine-kultur-der-privatsphaere-im-internet/\
408 de/aktuelles/neuigkeiten/neuigkeiten/artikel/vortrag-ueber-fitweltweit-programm-des-daad-1/\
405 \
382 http://www.ke.tu-darmstadt.de/bibtex/authors/show/708\
369 de/fachbereich/\
352 de/fachbereich/ehrungen-und-auszeichnungen/alwin-walther-medaille/\
351 de/fachbereich/kontakt-und-anfahrt/\
351 de/fachbereich/personen/\
350 de/fachbereich/professuren-und-gruppenleitungen/\
350 de/fachbereich/ueber-den-fachbereich/\
350 de/fachbereich/ausschuesse-gremien-und-kommissionen/\
349 http://www.informatik.tu-darmstadt.de/index.php?id=40\
336 http://www.ke.tu-darmstadt.de/bibtex/authors/show/702\
330 http://www.informatik.tu-darmstadt.de/index.php?id=1894\
306 http://www.ke.tu-darmstadt.de/bibtex/authors/show/1849\
302 http://www.ke.tu-darmstadt.de/news\
298 de/tu/\
277 http://www.tu-darmstadt.de/\
264 #top\
\
\
==== url queue ====\
\
\
==== language distribution ====\
Language	Number of occurrences\
de 329\
en 576\
es 94\
\
\
\b entrypoint http://www.spiegel.de:\
\
\
\b0 ==== robots.txt ====\
prohibit by robots.txt: 115\
\
\
==== numberHyperlink ====\
#Hyperlinks Website\
43 https://www.amazon.de/b\
38 http://www.amazon.de/spiegel\
28 http://tv.adobe.com\
21 http://tvprogramm.spiegel.de/\
19 http://www.spiegel.de/\
18 https://service.spiegel.de\
18 http://www.spiegel.de/spiegel/spiegelgeschichte/index-2013-2.html\
17 http://www.spiegel.de/spiegel/deinspiegel/index-2013-6.html\
16 https://www.ebook.de/de/category/61110/unsere_vorteile.html\
16 http://www.spiegel.de\
15 http://www.spiegel.de/shop\
14 http://www.shopbop.com/gp/help/customer/display.html\
14 http://www.manager-magazin.de/\
14 http://www.spiegel.de/spiegel/spiegelwissen/index-2013-2.html\
13 http://www.spiegel.de/spiegel/\
13 http://www.spiegel.de/wissenschaft/\
12 http://wetter.spiegel.de/spiegel/\
10 https://www.ebook.de/de/category/59475/kontakt_impressum.html\
10 http://abo.spiegel.de/go/place!abosspsc\
9 https://www.amazon.de/gp/cart/view.html\
9 https://www.ebook.de/de/category/59424/hilfe.html\
9 http://www.amazon.de/gp/feature.html\
9 http://www.spiegel.de/sport/\
9 https://media.libri.de/de/category/58974/sony_reader.html\
9 http://www.spiegelgruppe-nachdrucke.de\
9 https://www.ebook.de/de/category/61132/newsletter.html\
9 http://www.spiegelwissen.tv/flashsite/index.html\
8 http://www.spiegel.de/hilfe/\
8 http://abo.spiegel.de/?et_cid=7&et_lid=1946&et_sub=heftkasten\
8 https://www.amazon.es/b\
8 https://www.ebook.de/de/category/59663/gutscheine_kaufen.html\
8 https://www.ebook.de/de/category/52122/ebooks.html\
8 http://www.spiegel.de/politik/\
8 https://www.ebook.de/de/account/wishlist/add\
8 https://www.amazon.de/pc-mac-downloads-herunterladen-digital-steam/b\
8 http://www.spiegel.de/spiegel/unispiegel/\
8 http://www.spiegel.de/unispiegel/studium/tools-hier-werden-sie-geholfen-a-640620.html\
8 http://www.harvardbusinessmanager.de/\
7 http://www.amazon.co.jp/\
7 https://www.ebook.de/de/category/63461/ebooks_verschenken.html\
7 https://www.ebook.de/de/category/browse\
7 http://kdp.amazon.de/\
7 http://abo.spiegel.de/?et_cid=7&et_lid=1946&et_sub=aboreiter\
7 http://www.spiegel-qc.de/selbstbuchungstool\
7 https://media.libri.de/de/category/52124/buecher.html\
7 http://www.spiegel-qc.de/\
7 https://www.ebook.de/de/magazine\
7 https://www.ebook.de\
7 http://www.spiegel.de/video/\
7 http://www.libri.de/shop/action/magazine/6/ebooks_reader.html\
\
\
==== numberHyperlinksPerPage ====\
#HyperlinksToPage Website\
6966 #\
1507 /\
1027 \
961 http://www.amazon.de/spiegel\
671 http://tv.adobe.com/product/photoshop/\
640 \{\{url\}\}\
640 /gp/digital/fiona/manage\
598 javascript:void(0);\
597 http://tv.adobe.com/product/cs-production-premium/\
586 http://www.spiegel.de/\
575 http://www.spiegel.de/spiegel/\
509 http://wetter.spiegel.de/spiegel/\
504 <#=item.url #>\
492 http://www.spiegel.de/shop\
468 http://www.spiegel.de/spiegel/spiegelwissen/index-2013-2.html\
468 http://www.spiegel.de/spiegel/deinspiegel/index-2013-6.html\
468 http://www.spiegel.de/spiegel/spiegelgeschichte/index-2013-2.html\
462 /gp/site-directory\
460 /gp/cart/view.html?ie=UTF8&hasWorkingJavascript=1\
441 /gp/registry/wishlist\
435 /clouddrive\
411 http://www.spiegel.de/sptv/magazin/\
385 /gp/prime\
382 /product/photoshop/\
352 /gp/dmusic/mp3/player\
323 https://www.ebook.de/de/account/ebookHistory\
316 https://www.ebook.de/de/account/create/singlestep\
311 http://tv.adobe.com/product/cs-design-premium/\
311 http://forum.spiegel.de/\
310 /product/illustrator/\
308 http://tv.adobe.com/product/creative-cloud/\
303 http://www.spiegel.de/video/\
303 http://www.spiegel-qc.de/\
297 /video/\
296 http://www.spiegel.de/schlagzeilen/\
294 http://www.quality-abo.de/\
293 http://www.spiegelgruppe.de/\
293 http://www.buchreport.de/\
288 http://www.spiegelgruppe-nachdrucke.de\
277 /de/category/60575/libri_de_ist_jetzt_ebook_de.html\
276 http://www.manager-magazin.de/\
274 http://tv.adobe.com/product/premiere-pro/\
267 http://tv.adobe.com/product/after-effects/\
264 http://www.harvardbusinessmanager.de/\
262 /product/premiere-pro/\
260 /MP3-Musik-Downloads/b?ie=UTF8&node=77195031\
260 http://tvprogramm.spiegel.de/\
259 /pc-mac-downloads-herunterladen-digital-steam/b?ie=UTF8&node=1333619031\
259 /spiegel/\
256 /Navigationssystems-Car-HiFi-Autoradios/b?ie=UTF8&node=236861011\
\
\
==== url queue ====\
\
\
==== language distribution ====\
Language	Number of occurrences\
de 623\
en 246\
es 130
\b \
\b0 \
\
\
}