web mining u1 - u4

This commit is contained in:
Michael Scholz 2013-05-05 10:14:38 +02:00
parent 5f94d45927
commit bc50b297ea
15 changed files with 42729 additions and 0 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB

View File

@ -0,0 +1,128 @@
i
me
my
myself
we
our
ours
ourselves
you
your
yours
yourself
yourselves
he
him
his
himself
she
her
hers
herself
it
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
should
now

View File

@ -0,0 +1,231 @@
aber
alle
allem
allen
aller
alles
als
also
am
an
ander
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders
auch
auf
aus
bei
bin
bis
bist
da
damit
dann
der
den
des
dem
die
das
daß
derselbe
derselben
denselben
desselben
demselben
dieselbe
dieselben
dasselbe
dazu
dein
deine
deinem
deinen
deiner
deines
denn
derer
dessen
dich
dir
du
dies
diese
diesem
diesen
dieser
dieses
doch
dort
durch
ein
eine
einem
einen
einer
eines
einig
einige
einigem
einigen
einiger
einiges
einmal
er
ihn
ihm
es
etwas
euer
eure
eurem
euren
eurer
eures
für
gegen
gewesen
hab
habe
haben
hat
hatte
hatten
hier
hin
hinter
ich
mich
mir
ihr
ihre
ihrem
ihren
ihrer
ihres
euch
im
in
indem
ins
ist
jede
jedem
jeden
jeder
jedes
jene
jenem
jenen
jener
jenes
jetzt
kann
kein
keine
keinem
keinen
keiner
keines
können
könnte
machen
man
manche
manchem
manchen
mancher
manches
mein
meine
meinem
meinen
meiner
meines
mit
muss
musste
nach
nicht
nichts
noch
nun
nur
ob
oder
ohne
sehr
sein
seine
seinem
seinen
seiner
seines
selbst
sich
sie
ihnen
sind
so
solche
solchem
solchen
solcher
solches
soll
sollte
sondern
sonst
über
um
und
uns
unse
unsem
unsen
unser
unses
unter
viel
vom
von
vor
während
war
waren
warst
was
weg
weil
weiter
welche
welchem
welchen
welcher
welches
wenn
werde
werden
wie
wieder
will
wir
wird
wirst
wo
wollen
wollte
würde
würden
zu
zum
zur
zwar
zwischen

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

View File

@ -0,0 +1,168 @@
import sys
import re
import matplotlib.pyplot as plt
def split(text, regex):
    """Return every match of ``regex`` in ``text``, lower-cased, in order.

    text  -- the string to tokenize.
    regex -- pattern handed to ``re.findall``. It should not contain
             capturing groups: ``re.findall`` then returns the group
             contents (or tuples of them) instead of the whole match.

    Returns a new list with the lower-cased matches; the list is empty when
    nothing matches.
    """
    return [match.lower() for match in re.findall(regex, text)]
def splitAll(files, regex):
    """Tokenize several text files and return all tokens as one list.

    files -- iterable of file paths; the files are read one after another.
    regex -- token pattern, passed unchanged to ``split``.

    Prints two progress lines per file and, at the end, the total number of
    tokens. Returns the tokens produced by ``split`` for all files,
    concatenated in the order of ``files``.
    """
    words = []
    for filename in files:
        print("Processing " + filename + ".")
        print("\tSplitting words...")
        # Close each file as soon as it has been read instead of leaving the
        # open handle to the garbage collector.
        with open(filename) as handle:
            words += split(handle.read(), regex)
    print("Absolute word count: " + str(len(words)))
    return words
def countWords(words):
    """Count how often each item occurs in ``words``.

    words -- iterable of hashable items (the callers in this file pass
             lists of token strings).

    Returns a plain ``dict`` mapping each distinct item to its number of
    occurrences. An empty input yields an empty dict.
    """
    # Imported locally so this function does not depend on the module's
    # import header.
    from collections import Counter

    # Convert back to a plain dict so callers receive the same type as before.
    return dict(Counter(words))
def reverseDictionary(dictionary):
    """Invert a mapping, grouping the keys that share the same value.

    dictionary -- mapping with hashable values (in this file: word -> count).

    Returns a dict that maps each value of ``dictionary`` to the list of
    keys that carried this value. Keys appear in the iteration order of
    ``dictionary``. An empty input yields an empty dict.
    """
    ret = {}
    for word, count in dictionary.items():
        # setdefault creates the list on the first occurrence of a count.
        ret.setdefault(count, []).append(word)
    return ret
def printWordCounts(countToWords, maxCount=-1):
    """Print a ranked frequency table, highest counts first.

    countToWords -- dict mapping an occurrence count to the list of words
                    that occur exactly that many times.
    maxCount     -- maximum number of rows to print. Zero or a negative
                    value (the default) prints a row for every word.

    Each row shows the 1-based rank, the absolute count, the count as a
    percentage of the sum of all counts in ``countToWords``, and the word.
    Words with the same count are printed in the order of their list.
    Returns None.
    """
    # Total number of occurrences: every word listed under a count
    # contributes that count once.
    totalcount = sum(count * len(wordList)
                     for count, wordList in countToWords.items())
    print("#\t| Abs.\t\t| Rel.\t\t| Words")
    rank = 0
    for count in sorted(countToWords, reverse=True):
        for word in countToWords[count]:
            rank += 1
            # Only the number goes through %-formatting. The word is appended
            # afterwards, so a "%" inside a word cannot be misread as a
            # format directive.
            relative = "%3.2f" % (float(count) / float(totalcount) * 100)
            print(str(rank) + "\t| " + str(count) + "\t\t| " + relative
                  + " % \t| " + word)
            # Returning leaves both loops at once when the row limit is hit.
            if maxCount > 0 and rank >= maxCount:
                return
# Directory that holds the corpus texts and the stop word list. File names
# are appended by plain string concatenation, so the value must end with a
# path separator. The default is the original author's local folder; an
# optional first command line argument overrides it so the script can run on
# other machines.
basedir = "C:/Users/Victor/Dropbox/Uni/Web Mining/Ex 1/"
if len(sys.argv) > 1:
    basedir = sys.argv[1]
    if not basedir.endswith(("/", "\\")):
        basedir += "/"
# file0 to file2 form the corpus for the word statistics below. file3 is only
# read by the second character-frequency run further down.
file0 = "A History of Banks for Savings in Great Britain and Ireland by William Lewins.txt"
file1 = "Expository Writing by Mervin James Curl.txt"
file2 = "A Little Girl in Old San Francisco by Amanda Minnie Douglas.txt"
file3 = "Einfuehrung in die moderne Logik.txt"
# Raw string: the backslash belongs to the regex, not to a Python escape.
words = splitAll([basedir + file0, basedir + file1, basedir + file2], r"\w+")
# (2.1) Table of the 30 most frequent words across the three texts.
print("\n=== 30 most frequent words ===\n")
countToWords = reverseDictionary(countWords(words))
printWordCounts(countToWords, 30)
# (2.2) Same table as before, but with the words of the stop word file
# "english" removed.
print("\n=== 30 most frequent words w/o stopwords ===\n")
countToWordsWithoutStopwords = {}
# Read the stop word list once and close the file right away. A set makes
# each membership test below constant time instead of a scan of the list.
with open(basedir + "english") as stopWordFile:
    stopWords = set(split(stopWordFile.read(), r"\w+"))
for count in countToWords:
    newWords = [word for word in countToWords[count] if word not in stopWords]
    # Drop counts whose words were all stop words.
    if newWords:
        countToWordsWithoutStopwords[count] = newWords
printWordCounts(countToWordsWithoutStopwords, 30)
# (3.1) Data for a rank/frequency plot: xValues holds the rank of each word
# (most frequent word first), yValues the count of that word.
xValues = []
yValues = []
# Ranks start at 1, matching the printed tables. The plot below uses a
# logarithmic x axis, which has no position for a rank of 0.
i = 1
sortedKeys = sorted(countToWords, reverse=True)
for count in sortedKeys:
    for word in countToWords[count]:
        xValues.append(i)
        yValues.append(count)
        i += 1
# The string literal below is neither assigned nor executed: it holds the
# disabled plotting code for the data collected above.
'''plt.plot(xValues, yValues)
plt.xlabel('word rank')
plt.ylabel('word frequency')
plt.xscale('log')
plt.yscale('log')
plt.show()'''
# (3.2) Data for a second plot: xValues holds each distinct count (highest
# first), yValues the number of words that occur exactly that often.
sortedKeys = sorted(countToWords, reverse=True)
xValues = sortedKeys[:]
yValues = []
for count in sortedKeys:
    yValues.append(len(countToWords[count]))
# The string literal below is neither assigned nor executed: it holds the
# disabled plotting code for the data collected above.
'''plt.plot(xValues, yValues)
plt.xlabel('word frequency')
plt.ylabel('number of words with this frequency')
plt.xscale('log')
plt.yscale('log')
plt.show()'''
# (4) Frequency table for the first text (file0), tokenized with \w{1,2}:
# every token is a run of one or two word characters, not a whole word.
# NOTE(review): the heading printed below says "characters", but the pattern
# also produces two-character chunks. Confirm which of the two was intended.
characters = splitAll([basedir + file0], "\w{1,2}")
print "\n=== characters by frequency (text 1) ===\n"
# Rebinds the module-level countToWords, replacing the word-level table that
# was built for the earlier tasks.
countToWords = reverseDictionary(countWords(characters))
# Print only the ten most frequent tokens.
printWordCounts(countToWords, 10)
# The string literal below is neither assigned nor executed: it holds
# disabled code that would plot token rank against token count with a
# logarithmic y axis.
'''xValues = []
yValues = []
i = 0
sortedKeys = sorted(countToWords.iterkeys(), None, None, True)
for count in sortedKeys:
for word in countToWords[count]:
xValues.append(i)
yValues.append(count)
i += 1
plt.plot(xValues, yValues)
plt.xlabel('character rank')
plt.ylabel('character frequency')
plt.yscale('log')
plt.show()'''
# Same analysis for the second text (file3): tokens are runs of one or two
# word characters, as selected by \w{1,2}.
characters = splitAll([basedir + file3], "\w{1,2}")
print "\n=== characters by frequency (text 2) ===\n"
# Rebinds the module-level countToWords once more, now with the counts of
# this text only.
countToWords = reverseDictionary(countWords(characters))
# Print only the ten most frequent tokens.
printWordCounts(countToWords, 10)
# The string literal below is neither assigned nor executed: it holds
# disabled code that would plot token rank against token count with a
# logarithmic y axis.
'''xValues = []
yValues = []
i = 0
sortedKeys = sorted(countToWords.iterkeys(), None, None, True)
for count in sortedKeys:
for word in countToWords[count]:
xValues.append(i)
yValues.append(count)
i += 1
plt.plot(xValues, yValues)
plt.xlabel('character rank')
plt.ylabel('character frequency')
plt.yscale('log')
plt.show()'''

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 23 KiB