web mining u1 - u4
This commit is contained in:
parent
5f94d45927
commit
bc50b297ea
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
3786
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/Einfuehrung in die moderne Logik.txt
Executable file
3786
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/Einfuehrung in die moderne Logik.txt
Executable file
File diff suppressed because it is too large
Load Diff
13254
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/Expository Writing by Mervin James Curl.txt
Executable file
13254
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/Expository Writing by Mervin James Curl.txt
Executable file
File diff suppressed because it is too large
Load Diff
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/Solution.doc
Executable file
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/Solution.doc
Executable file
Binary file not shown.
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/Solution.docx
Executable file
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/Solution.docx
Executable file
Binary file not shown.
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/charfreq_over_rank_english.png
Executable file
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/charfreq_over_rank_english.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 25 KiB |
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/charfreq_over_rank_german.png
Executable file
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/charfreq_over_rank_german.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 23 KiB |
128
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/english
Executable file
128
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/english
Executable file
@ -0,0 +1,128 @@
|
||||
i
|
||||
me
|
||||
my
|
||||
myself
|
||||
we
|
||||
our
|
||||
ours
|
||||
ourselves
|
||||
you
|
||||
your
|
||||
yours
|
||||
yourself
|
||||
yourselves
|
||||
he
|
||||
him
|
||||
his
|
||||
himself
|
||||
she
|
||||
her
|
||||
hers
|
||||
herself
|
||||
it
|
||||
its
|
||||
itself
|
||||
they
|
||||
them
|
||||
their
|
||||
theirs
|
||||
themselves
|
||||
what
|
||||
which
|
||||
who
|
||||
whom
|
||||
this
|
||||
that
|
||||
these
|
||||
those
|
||||
am
|
||||
is
|
||||
are
|
||||
was
|
||||
were
|
||||
be
|
||||
been
|
||||
being
|
||||
have
|
||||
has
|
||||
had
|
||||
having
|
||||
do
|
||||
does
|
||||
did
|
||||
doing
|
||||
a
|
||||
an
|
||||
the
|
||||
and
|
||||
but
|
||||
if
|
||||
or
|
||||
because
|
||||
as
|
||||
until
|
||||
while
|
||||
of
|
||||
at
|
||||
by
|
||||
for
|
||||
with
|
||||
about
|
||||
against
|
||||
between
|
||||
into
|
||||
through
|
||||
during
|
||||
before
|
||||
after
|
||||
above
|
||||
below
|
||||
to
|
||||
from
|
||||
up
|
||||
down
|
||||
in
|
||||
out
|
||||
on
|
||||
off
|
||||
over
|
||||
under
|
||||
again
|
||||
further
|
||||
then
|
||||
once
|
||||
here
|
||||
there
|
||||
when
|
||||
where
|
||||
why
|
||||
how
|
||||
all
|
||||
any
|
||||
both
|
||||
each
|
||||
few
|
||||
more
|
||||
most
|
||||
other
|
||||
some
|
||||
such
|
||||
no
|
||||
nor
|
||||
not
|
||||
only
|
||||
own
|
||||
same
|
||||
so
|
||||
than
|
||||
too
|
||||
very
|
||||
s
|
||||
t
|
||||
can
|
||||
will
|
||||
just
|
||||
don
|
||||
should
|
||||
now
|
||||
|
||||
231
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/german
Executable file
231
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/german
Executable file
@ -0,0 +1,231 @@
|
||||
aber
|
||||
alle
|
||||
allem
|
||||
allen
|
||||
aller
|
||||
alles
|
||||
als
|
||||
also
|
||||
am
|
||||
an
|
||||
ander
|
||||
andere
|
||||
anderem
|
||||
anderen
|
||||
anderer
|
||||
anderes
|
||||
anderm
|
||||
andern
|
||||
anderr
|
||||
anders
|
||||
auch
|
||||
auf
|
||||
aus
|
||||
bei
|
||||
bin
|
||||
bis
|
||||
bist
|
||||
da
|
||||
damit
|
||||
dann
|
||||
der
|
||||
den
|
||||
des
|
||||
dem
|
||||
die
|
||||
das
|
||||
daß
|
||||
derselbe
|
||||
derselben
|
||||
denselben
|
||||
desselben
|
||||
demselben
|
||||
dieselbe
|
||||
dieselben
|
||||
dasselbe
|
||||
dazu
|
||||
dein
|
||||
deine
|
||||
deinem
|
||||
deinen
|
||||
deiner
|
||||
deines
|
||||
denn
|
||||
derer
|
||||
dessen
|
||||
dich
|
||||
dir
|
||||
du
|
||||
dies
|
||||
diese
|
||||
diesem
|
||||
diesen
|
||||
dieser
|
||||
dieses
|
||||
doch
|
||||
dort
|
||||
durch
|
||||
ein
|
||||
eine
|
||||
einem
|
||||
einen
|
||||
einer
|
||||
eines
|
||||
einig
|
||||
einige
|
||||
einigem
|
||||
einigen
|
||||
einiger
|
||||
einiges
|
||||
einmal
|
||||
er
|
||||
ihn
|
||||
ihm
|
||||
es
|
||||
etwas
|
||||
euer
|
||||
eure
|
||||
eurem
|
||||
euren
|
||||
eurer
|
||||
eures
|
||||
für
|
||||
gegen
|
||||
gewesen
|
||||
hab
|
||||
habe
|
||||
haben
|
||||
hat
|
||||
hatte
|
||||
hatten
|
||||
hier
|
||||
hin
|
||||
hinter
|
||||
ich
|
||||
mich
|
||||
mir
|
||||
ihr
|
||||
ihre
|
||||
ihrem
|
||||
ihren
|
||||
ihrer
|
||||
ihres
|
||||
euch
|
||||
im
|
||||
in
|
||||
indem
|
||||
ins
|
||||
ist
|
||||
jede
|
||||
jedem
|
||||
jeden
|
||||
jeder
|
||||
jedes
|
||||
jene
|
||||
jenem
|
||||
jenen
|
||||
jener
|
||||
jenes
|
||||
jetzt
|
||||
kann
|
||||
kein
|
||||
keine
|
||||
keinem
|
||||
keinen
|
||||
keiner
|
||||
keines
|
||||
können
|
||||
könnte
|
||||
machen
|
||||
man
|
||||
manche
|
||||
manchem
|
||||
manchen
|
||||
mancher
|
||||
manches
|
||||
mein
|
||||
meine
|
||||
meinem
|
||||
meinen
|
||||
meiner
|
||||
meines
|
||||
mit
|
||||
muss
|
||||
musste
|
||||
nach
|
||||
nicht
|
||||
nichts
|
||||
noch
|
||||
nun
|
||||
nur
|
||||
ob
|
||||
oder
|
||||
ohne
|
||||
sehr
|
||||
sein
|
||||
seine
|
||||
seinem
|
||||
seinen
|
||||
seiner
|
||||
seines
|
||||
selbst
|
||||
sich
|
||||
sie
|
||||
ihnen
|
||||
sind
|
||||
so
|
||||
solche
|
||||
solchem
|
||||
solchen
|
||||
solcher
|
||||
solches
|
||||
soll
|
||||
sollte
|
||||
sondern
|
||||
sonst
|
||||
über
|
||||
um
|
||||
und
|
||||
uns
|
||||
unse
|
||||
unsem
|
||||
unsen
|
||||
unser
|
||||
unses
|
||||
unter
|
||||
viel
|
||||
vom
|
||||
von
|
||||
vor
|
||||
während
|
||||
war
|
||||
waren
|
||||
warst
|
||||
was
|
||||
weg
|
||||
weil
|
||||
weiter
|
||||
welche
|
||||
welchem
|
||||
welchen
|
||||
welcher
|
||||
welches
|
||||
wenn
|
||||
werde
|
||||
werden
|
||||
wie
|
||||
wieder
|
||||
will
|
||||
wir
|
||||
wird
|
||||
wirst
|
||||
wo
|
||||
wollen
|
||||
wollte
|
||||
würde
|
||||
würden
|
||||
zu
|
||||
zum
|
||||
zur
|
||||
zwar
|
||||
zwischen
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 25 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 30 KiB |
168
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/task2.py
Executable file
168
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/task2.py
Executable file
@ -0,0 +1,168 @@
|
||||
import sys
|
||||
import re
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
def split(text, regex):
    """Return every match of ``regex`` in ``text``, lower-cased.

    Parameters:
        text: the string to tokenise.
        regex: pattern handed to ``re.findall``; each match becomes one token.

    Returns:
        A new list holding the matches in order of appearance, each one
        converted with ``lower()``.
    """
    # NOTE(review): lower() assumes findall yields plain strings, i.e. the
    # pattern has at most one capturing group -- confirm for new callers.
    return [match.lower() for match in re.findall(regex, text)]
|
||||
|
||||
def splitAll(files, regex):
    """Tokenise several text files and return all tokens in one list.

    Parameters:
        files: iterable of file names; each file is read completely.
        regex: pattern passed on to ``split`` to extract the tokens.

    Returns:
        A single list with the lower-cased tokens of all files, in the
        order the files were given.

    Progress messages and the total token count are printed to stdout.
    """
    words = []
    for filename in files:
        # A parenthesised single argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Processing " + filename + ".")
        print("\tSplitting words...")
        # The with-block closes the file handle as soon as it has been read
        # instead of leaving it to the garbage collector.
        with open(filename) as textFile:
            words.extend(split(textFile.read(), regex))
    wordcount = len(words)
    print("Absolute word count: " + str(wordcount))
    return words
|
||||
|
||||
def countWords(words):
    """Tally how often each item occurs in ``words``.

    Parameters:
        words: iterable of hashable items (the tokens to count).

    Returns:
        A dict mapping each distinct item to its number of occurrences.
    """
    tally = {}
    for token in words:
        # get() supplies 0 for a token that has not been seen yet.
        tally[token] = tally.get(token, 0) + 1
    return tally
|
||||
|
||||
def reverseDictionary(dictionary):
    """Invert a mapping, grouping keys that share the same value.

    Parameters:
        dictionary: dict whose values are hashable (here: word -> count).

    Returns:
        A dict mapping every value of ``dictionary`` to the list of keys
        that carried this value (here: count -> [words]).
    """
    ret = {}
    for word, count in dictionary.items():
        # setdefault creates the bucket on first use, then we append to it.
        ret.setdefault(count, []).append(word)
    return ret
|
||||
|
||||
def printWordCounts(countToWords, maxCount = -1):
    """Print a frequency table, most frequent entries first.

    Parameters:
        countToWords: dict mapping a count to the list of words that occur
            that many times.
        maxCount: maximum number of table rows to print; any value that is
            not positive (the default) prints every word.

    Each row shows the 1-based rank, the absolute count, the count relative
    to the sum of all counts in ``countToWords`` (in percent) and the word.
    """
    totalcount = 0
    for count in countToWords:
        totalcount += count * len(countToWords[count])

    print("#\t| Abs.\t\t| Rel.\t\t| Words")
    rank = 0
    for count in sorted(countToWords, reverse=True):
        for word in countToWords[count]:
            rank += 1
            share = float(count) / float(totalcount) * 100
            # The word is passed as a format argument rather than glued into
            # the format string, so a '%' inside a word cannot be mistaken
            # for a conversion specifier.
            print("%d\t| %s\t\t| %3.2f %% \t| %s" % (rank, count, share, word))
            if maxCount > 0 and rank >= maxCount:
                # Row limit reached; nothing follows the loops, so returning
                # is equivalent to breaking out of both of them.
                return
|
||||
|
||||
# Input location and file names.
# NOTE: basedir is an absolute path on the original author's machine; adjust
# it before running the script elsewhere.
basedir = "C:/Users/Victor/Dropbox/Uni/Web Mining/Ex 1/"
file0 = "A History of Banks for Savings in Great Britain and Ireland by William Lewins.txt"
file1 = "Expository Writing by Mervin James Curl.txt"
file2 = "A Little Girl in Old San Francisco by Amanda Minnie Douglas.txt"
file3 = "Einfuehrung in die moderne Logik.txt"

# Word tokens of file0..file2 combined; file3 is not part of this word list.
wordSources = [basedir + file0, basedir + file1, basedir + file2]
words = splitAll(wordSources, "\w+")
|
||||
|
||||
# (2.1)
print("\n=== 30 most frequent words ===\n")
# Count every word, then group the words by their count so that the table
# can be listed from the highest count downwards.
wordFrequencies = countWords(words)
countToWords = reverseDictionary(wordFrequencies)

printWordCounts(countToWords, 30)
|
||||
|
||||
# (2.2)
print("\n=== 30 most frequent words w/o stopwords ===\n")
# Read the stop-word list once; the with-block closes the file again and the
# set makes each membership test below constant-time instead of a list scan.
with open(basedir + "english") as stopWordFile:
    stopWords = set(split(stopWordFile.read(), r"\w+"))

# Same count -> words grouping as countToWords, minus the stop words.
# Counts whose words are all stop words are dropped entirely.
countToWordsWithoutStopwords = {}
for count, wordsWithCount in countToWords.items():
    newWords = [word for word in wordsWithCount if word not in stopWords]
    if newWords:
        countToWordsWithoutStopwords[count] = newWords

printWordCounts(countToWordsWithoutStopwords, 30)
|
||||
|
||||
# (3.1)
# Rank/frequency series: one point per distinct word, most frequent first.
xValues = []
yValues = []
sortedKeys = sorted(countToWords, reverse=True)
for count in sortedKeys:
    for word in countToWords[count]:
        # Ranks are 1-based: rank 0 cannot be drawn on the logarithmic
        # x-axis used by the plot below, and the printed tables also start
        # counting at 1.
        xValues.append(len(xValues) + 1)
        yValues.append(count)

# The plot is disabled by keeping it inside a string literal; remove the
# surrounding quotes to show it.
'''plt.plot(xValues, yValues)
plt.xlabel('word rank')
plt.ylabel('word frequency')
plt.xscale('log')
plt.yscale('log')
plt.show()'''
|
||||
|
||||
# (3.2)
# x: every distinct word frequency, highest first;
# y: how many different words occur with exactly that frequency.
sortedKeys = sorted(countToWords, reverse=True)
xValues = list(sortedKeys)
yValues = [len(countToWords[count]) for count in sortedKeys]

# The plot is disabled by keeping it inside a string literal; remove the
# surrounding quotes to show it.
'''plt.plot(xValues, yValues)
plt.xlabel('word frequency')
plt.ylabel('number of words with this frequency')
plt.xscale('log')
plt.yscale('log')
plt.show()'''
|
||||
|
||||
# (4)
# The pattern matches runs of one or two word characters, so the tokens are
# mostly two-character chunks plus leftover single characters.
# NOTE(review): confirm this is the intended unit for the "characters"
# statistic.
characters = splitAll([basedir + file0], "\w{1,2}")

print("\n=== characters by frequency (text 1) ===\n")
# countToWords is rebound here: it now groups character tokens by count.
characterFrequencies = countWords(characters)
countToWords = reverseDictionary(characterFrequencies)

printWordCounts(countToWords, 10)

# The rank/frequency plot for text 1 is disabled by keeping it inside a
# string literal; remove the surrounding quotes to show it.
'''xValues = []
yValues = []
i = 0
sortedKeys = sorted(countToWords.iterkeys(), None, None, True)
for count in sortedKeys:
    for word in countToWords[count]:
        xValues.append(i)
        yValues.append(count)
        i += 1

plt.plot(xValues, yValues)
plt.xlabel('character rank')
plt.ylabel('character frequency')
plt.yscale('log')
plt.show()'''
|
||||
|
||||
# Same character statistic for file3.
# NOTE(review): for Python 2 byte strings, \w without re.UNICODE covers only
# ASCII letters, digits and '_' -- check whether non-ASCII letters in this
# text are counted as intended.
characters = splitAll([basedir + file3], "\w{1,2}")

print("\n=== characters by frequency (text 2) ===\n")
# countToWords is rebound again, now to the character tokens of file3.
characterFrequencies = countWords(characters)
countToWords = reverseDictionary(characterFrequencies)

printWordCounts(countToWords, 10)

# The rank/frequency plot for text 2 is disabled by keeping it inside a
# string literal; remove the surrounding quotes to show it.
'''xValues = []
yValues = []
i = 0
sortedKeys = sorted(countToWords.iterkeys(), None, None, True)
for count in sortedKeys:
    for word in countToWords[count]:
        xValues.append(i)
        yValues.append(count)
        i += 1

plt.plot(xValues, yValues)
plt.xlabel('character rank')
plt.ylabel('character frequency')
plt.yscale('log')
plt.show()'''
|
||||
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/wordfreq_over_wordrank_linear.png
Executable file
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/wordfreq_over_wordrank_linear.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 18 KiB |
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/wordfreq_over_wordrank_log2.png
Executable file
BIN
ss2013/1_Web Mining/Uebungen/1_Uebung/u1-u4/wordfreq_over_wordrank_log2.png
Executable file
Binary file not shown.
|
After Width: | Height: | Size: 23 KiB |
Loading…
x
Reference in New Issue
Block a user