diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/PorterStemmer.pyc b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/PorterStemmer.pyc new file mode 100644 index 00000000..452121cd Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/PorterStemmer.pyc differ diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py index 0a131f40..a2bc31a3 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py @@ -261,7 +261,7 @@ class multiclassClassifier: result /= percentage*notwordcount #result /= len(termfrequenciesOfClasses) #return result - print cl +" "+str(result) + #print cl +" "+str(result) return math.log(result) def clean_word(self, word): @@ -321,10 +321,11 @@ if __name__ == '__main__': listing = os.listdir(trainDir) for classes in listing: # classes path = trainDir+'/'+classes+'/testdata' + path = testDir listing = os.listdir(path) for infile in listing: - currentPath = trainDir+'/'+classes+'/testdata/'+infile - print currentPath + currentPath = testDir+'/'+infile + #print currentPath maxRes = sys.maxint * -1 # check all possible classes for cl in mc.percentage.keys(): diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/grafiken/a3_abb1.png b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/grafiken/a3_abb1.png new file mode 100644 index 00000000..7f5ad59f Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/grafiken/a3_abb1.png differ diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/grafiken/a3_abb2.png b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/grafiken/a3_abb2.png new file mode 100644 index 00000000..cd6bcca2 Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/grafiken/a3_abb2.png differ diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.aux b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.aux index 42d5eb33..2026da13 
100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.aux +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.aux @@ -41,8 +41,18 @@ \@writefile{nav}{\headcommand {\beamer@subsectionpages {3}{3}}} \@writefile{nav}{\headcommand {\slideentry {3}{0}{4}{4/4}{}{0}}} \@writefile{nav}{\headcommand {\beamer@framepages {4}{4}}} -\@writefile{nav}{\headcommand {\beamer@partpages {1}{4}}} -\@writefile{nav}{\headcommand {\beamer@subsectionpages {4}{4}}} -\@writefile{nav}{\headcommand {\beamer@sectionpages {4}{4}}} -\@writefile{nav}{\headcommand {\beamer@documentpages {4}}} -\@writefile{nav}{\headcommand {\def \inserttotalframenumber {4}}} +\@writefile{nav}{\headcommand {\slideentry {3}{0}{5}{5/5}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {5}{5}}} +\@writefile{nav}{\headcommand {\slideentry {3}{0}{6}{6/6}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {6}{6}}} +\@writefile{nav}{\headcommand {\slideentry {3}{0}{7}{7/7}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {7}{7}}} +\@writefile{nav}{\headcommand {\slideentry {3}{0}{8}{8/8}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {8}{8}}} +\@writefile{nav}{\headcommand {\slideentry {3}{0}{9}{9/9}{}{0}}} +\@writefile{nav}{\headcommand {\beamer@framepages {9}{9}}} +\@writefile{nav}{\headcommand {\beamer@partpages {1}{9}}} +\@writefile{nav}{\headcommand {\beamer@subsectionpages {4}{9}}} +\@writefile{nav}{\headcommand {\beamer@sectionpages {4}{9}}} +\@writefile{nav}{\headcommand {\beamer@documentpages {9}}} +\@writefile{nav}{\headcommand {\def \inserttotalframenumber {9}}} diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.log b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.log index 66794d1a..13979b67 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.log +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.log @@ -1,4 +1,4 @@ -This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) (format=pdflatex 2011.7.3) 16 JUN 
2013 14:41 +This is pdfTeX, Version 3.1415926-2.3-1.40.12 (TeX Live 2011) (format=pdflatex 2011.7.3) 17 JUN 2013 00:29 entering extended mode restricted \write18 enabled. %&-line parsing enabled. @@ -1463,6 +1463,11 @@ Underfull \hbox (badness 10000) has occurred while \output is active [3 ] +LaTeX Font Info: Font shape `OT1/phv/m/n' will be +(Font) scaled to size 9.40002pt on input line 69. +LaTeX Font Info: Font shape `OT1/phv/m/sl' will be +(Font) scaled to size 9.40002pt on input line 69. + Underfull \hbox (badness 10000) has occurred while \output is active [] @@ -1470,6 +1475,54 @@ Underfull \hbox (badness 10000) has occurred while \output is active [4 ] +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[5 + +] +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[6 + +] +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[7 + +] +File: grafiken/a3_abb1.png Graphic file (type png) + + +Package pdftex.def Info: grafiken/a3_abb1.png used on input line 125. +(pdftex.def) Requested size: 341.4726pt x 84.01309pt. + +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[8 + + <./grafiken/a3_abb1.png>] + +File: grafiken/a3_abb2.png Graphic file (type png) + + +Package pdftex.def Info: grafiken/a3_abb2.png used on input line 134. +(pdftex.def) Requested size: 269.00433pt x 96.86163pt. + +Underfull \hbox (badness 10000) has occurred while \output is active + + [] + +[9 + + <./grafiken/a3_abb2.png>] \tf@nav=\write7 \openout7 = `solution.nav'. @@ -1479,31 +1532,32 @@ Underfull \hbox (badness 10000) has occurred while \output is active \tf@snm=\write9 \openout9 = `solution.snm'. -Package atveryend Info: Empty hook `BeforeClearDocument' on input line 67. -Package atveryend Info: Empty hook `AfterLastShipout' on input line 67. +Package atveryend Info: Empty hook `BeforeClearDocument' on input line 137. 
+Package atveryend Info: Empty hook `AfterLastShipout' on input line 137. (./solution.aux) -Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 67. -Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 67. +Package atveryend Info: Empty hook `AtVeryEndDocument' on input line 137. +Package atveryend Info: Executing hook `AtEndAfterFileList' on input line 137. Package rerunfilecheck Info: File `solution.out' has not changed. (rerunfilecheck) Checksum: 9D67D81423E41833F99DCEF802B7F5DF;135. -Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 67. +Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 137. ) Here is how much of TeX's memory you used: - 17143 strings out of 493633 - 314122 string characters out of 3143378 - 388825 words of memory out of 3000000 - 19970 multiletter control sequences out of 15000+200000 - 28485 words of font info for 41 fonts, out of 3000000 for 9000 + 17188 strings out of 493633 + 314812 string characters out of 3143378 + 389823 words of memory out of 3000000 + 20001 multiletter control sequences out of 15000+200000 + 30168 words of font info for 45 fonts, out of 3000000 for 9000 831 hyphenation exceptions out of 8191 55i,20n,79p,425b,533s stack positions out of 5000i,500n,10000p,200000b,50000s {/usr/local/texlive/2011/texmf-dist/fonts/enc/dvips/base/8r.enc} -Output written on solution.pdf (4 pages, 65241 bytes). +exlive/2011/texmf-dist/fonts/type1/public/amsfonts/cm/cmsy10.pfb> +Output written on solution.pdf (9 pages, 95991 bytes). PDF statistics: - 69 PDF objects out of 1000 (max. 8388607) - 53 compressed objects within 1 object stream - 12 named destinations out of 1000 (max. 500000) - 67 words of extra memory for PDF output out of 10000 (max. 10000000) + 104 PDF objects out of 1000 (max. 8388607) + 80 compressed objects within 1 object stream + 22 named destinations out of 1000 (max. 500000) + 77 words of extra memory for PDF output out of 10000 (max. 
10000000) diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.nav b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.nav index 2fe7decd..ad35e8b6 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.nav +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.nav @@ -16,8 +16,18 @@ \headcommand {\beamer@subsectionpages {3}{3}} \headcommand {\slideentry {3}{0}{4}{4/4}{}{0}} \headcommand {\beamer@framepages {4}{4}} -\headcommand {\beamer@partpages {1}{4}} -\headcommand {\beamer@subsectionpages {4}{4}} -\headcommand {\beamer@sectionpages {4}{4}} -\headcommand {\beamer@documentpages {4}} -\headcommand {\def \inserttotalframenumber {4}} +\headcommand {\slideentry {3}{0}{5}{5/5}{}{0}} +\headcommand {\beamer@framepages {5}{5}} +\headcommand {\slideentry {3}{0}{6}{6/6}{}{0}} +\headcommand {\beamer@framepages {6}{6}} +\headcommand {\slideentry {3}{0}{7}{7/7}{}{0}} +\headcommand {\beamer@framepages {7}{7}} +\headcommand {\slideentry {3}{0}{8}{8/8}{}{0}} +\headcommand {\beamer@framepages {8}{8}} +\headcommand {\slideentry {3}{0}{9}{9/9}{}{0}} +\headcommand {\beamer@framepages {9}{9}} +\headcommand {\beamer@partpages {1}{9}} +\headcommand {\beamer@subsectionpages {4}{9}} +\headcommand {\beamer@sectionpages {4}{9}} +\headcommand {\beamer@documentpages {9}} +\headcommand {\def \inserttotalframenumber {9}} diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.pdf b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.pdf index 512d77fc..897f7265 100644 Binary files a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.pdf and b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.pdf differ diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.synctex.gz b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.synctex.gz index 29f18034..c74cf9da 100644 Binary files a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.synctex.gz and b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.synctex.gz differ diff --git 
a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.tex b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.tex index 39b4eef0..91b528ef 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.tex +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/latex/solution.tex @@ -51,6 +51,7 @@ \item Die gelabelten Trianinsdaten werden in zwei gleich große Mengen aufgeteilt \item Somit erhalten wir gelabelte Trainings- und Testdaten \item Aufteilung erfolgt zufällig +\item Somit werden keine Präferenzen gesetzt \end{itemize} \end{frame} @@ -59,9 +60,78 @@ \section{3. Aufgabe} \begin{frame} \frametitle{3. Aufgabe \\ Experimente und Abgabe} +Probleme im Naive Bayes Klassifizierer: \begin{itemize} -\item blabla +\item Durch das Produkt über jedes Wort in einem Dokument werden Dokumente meistens großen Klassen zugeordnet. +\item Dies gilt es zu verhindern $\to$ Anpassung an Dokumentengröße +\item Trainingsmenge ist sehr klein. Um sie zu verdoppeln, wird im finalen Lauf des Klassifizierers keine Aufteilung in Test- und Trainingsmenge mehr vorgenommen. \end{itemize} \end{frame} + +\begin{frame} +\frametitle{3. Aufgabe \\ Experimente und Abgabe} +Ergebnisse der Validierung: +\begin{itemize} +\item Accuracy: 46.0606\% +\item Precision Macro-Avg: 41.1415\% +\item Precision Micro-Avg: 46.0606\% +\item Recall Micro-Avg: 7.8675\% +\end{itemize} +\end{frame} + + +\begin{frame} +\frametitle{3. Aufgabe \\ Experimente und Abgabe} +Ergebnisse der Validierung: \\ +\textbf{Precision} pro Klasse: \\ +adventure: 25.0\% \\ +belles\_lettres: 35.3846\% \\ +editorial: 0.0\% \\ +fiction: 31.25\% \\ +government: 40.0\% \\ +hobbies: 66.6667\% \\ +learned: 72.0\% \\ +lore: 20.0\% \\ +mystery: 66.6667\% \\ +news: 70.5882\% \\ +romance: 25.0\% \\ +\end{frame} + +\begin{frame} +\frametitle{3. 
Aufgabe \\ Experimente und Abgabe} +Ergebnisse der Validierung: \\ +\textbf{Recall} pro Klasse: \\ +adventure: 20.0\% \\ +belles\_lettres: 85.1852\% \\ +editorial: 0.0\% \\ +fiction: 45.4545\% \\ +government: 36.3636\% \\ +hobbies: 46.1538\% \\ +learned: 60.0\% \\ +lore: 5.5556\% \\ +mystery: 50.0\% \\ +news: 75.0\% \\ +romance: 9.0909\% \\ +\end{frame} + +\begin{frame} +\frametitle{3. Aufgabe \\ Experimente und Abgabe} +Ergebnisse der Validierung: \\ +\begin{figure} +\includegraphics[scale=0.45]{grafiken/a3_abb1.png} +\caption{Konfusionsmatrix Teil 1} +\end{figure} +\end{frame} + +\begin{frame} +\frametitle{3. Aufgabe \\ Experimente und Abgabe} +Ergebnisse der Validierung: \\ +\begin{figure} +\includegraphics[scale=0.5]{grafiken/a3_abb2.png} +\caption{Konfusionsmatrix Teil 2} +\end{figure} +\end{frame} + + \end{document} \ No newline at end of file