From 49d6eef95d73d7c521e33301aff98043154290a9 Mon Sep 17 00:00:00 2001 From: Michael Scholz Date: Mon, 10 Jun 2013 10:53:44 +0200 Subject: [PATCH] web mining update --- .../Uebungen/4_Uebung/code/naive_bayes.py | 64 ++++++++++++------ .../Uebungen/4_Uebung/solution.odt | Bin 0 -> 9576 bytes 2 files changed, 44 insertions(+), 20 deletions(-) create mode 100644 ss2013/1_Web Mining/Uebungen/4_Uebung/solution.odt diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py index ecce396d..eae3e971 100644 --- a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py +++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py @@ -6,6 +6,7 @@ import random + # config variables actualDir = os.path.dirname(os.path.realpath(__file__)) dataDir = os.path.join(actualDir, '../data') @@ -23,18 +24,10 @@ class trainingsset: #def __init__(self): - classes = ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'learned', 'lore','mystery','news','romantic'] def createTrainingsset(self): self.splitTrainingsdataRandomly(self) # first split our data into trainings- and testdata - - - def getClassesToTrain(self): - for dirpath, dirnames, filenames in os.walk(trainDir): - # TODO: implement - pass - - + # copies files randomly to new directories. Each directory will contain fileCount / 2 numbers of files # If fileCount is uneven /trainingsdata will contain one file more than /testdata @@ -82,8 +75,9 @@ class trainingsset: ''' class multiclassClassifier: - filesToPrediction = {}; - testFiles = {} + + termfrequenciesOfClasses = {}; + countClass = {} def writePredictionFile(self): @@ -93,13 +87,43 @@ class multiclassClassifier: f.closed return - # reads all testData from /u4_train to the list - def getTestData(self): - listing = os.listdir(testDir) - for infile in listing: - f = open(testDir+'/'+infile, 'r') - self.testFiles[infile] = f.readline() - f.close() + # calculates all necessary stuff for multiclass classifier + def getTermfrequenciesOfClasses(self): + listing = os.listdir(trainDir) + for classes in listing: # classes + self.termfrequenciesOfClasses[classes] = {} + for classes in self.termfrequenciesOfClasses.keys(): + currentPath = trainDir+'/'+classes+'/trainingsdata' + listing = os.listdir(currentPath) + for infile in listing: + if self.countClass.has_key(classes): + self.countClass[classes] += 1 + else: + self.countClass[classes] = 1 + currentPath = trainDir+'/'+classes+'/trainingsdata/'+infile + # update termfrequency for specific class: + self.termfrequenciesOfClasses[classes] = self.updateDictonary(currentPath, self.termfrequenciesOfClasses[classes]) + + + + # "incudludes" a file into the termfrequency dictonary + def updateDictonary(self, pathToFile, dictonary): + f = open(pathToFile, 'r') + lines = f.readlines(); + for line in lines: + thisline = line.split(" "); + for word in thisline: + word = word.lower() + word = word.replace(".", " ") + word = word.replace(",", " ") + if dictonary.has_key(word): + dictonary[str(word)] += 1 + else: + dictonary[str(word)] = 1 + f.close() + return dictonary + + def bayes(self, listOfFiles): @@ -119,10 +143,10 @@ class multiclassClassifier: if __name__ == '__main__': ts = trainingsset() #ts.splitTrainingsdataRandomly(); - ts.getClassesToTrain(); + #ts.getClassesToTrain(); mc = multiclassClassifier() - #mc.getTestData() + mc.getTermfrequenciesOfClasses() #mc.writePredictionFile() diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/solution.odt b/ss2013/1_Web Mining/Uebungen/4_Uebung/solution.odt new file mode 100644 index 0000000000000000000000000000000000000000..58721b24978113a2bb7ab2c18b042ac39343638f GIT binary patch literal 9576 zcma)C1z1$g*Ixw$0fQ7!QbKZBngx~y=@3OiU|E)2cGqR;k`hHh=~O~OKnX<}krI$n z5a}-I?)ny$S6@GUpa0?6``kNoYR(LE&;8BOR>Q-m1OSKtfJ~(iawhKuNI3uiz|jxa z1hBWUhhtsQa3~s$uz^9bHYg;}32DKHf;!kZ@S)Ifqy-A*Xb(qX`LJ*otoFZ@;pG1z zI^3N+3TbI$<%s#F#z6q+fQ4cm9n7H^;CDG9BBF0j{3C?BIdbH8IVcPUN5FAvP#7T0 z5ra9h?qZL?18A!e9ytv-d33?i-`%0X_5QBb5`ltZ;m6J#N93ELAD2H30*yjDqHzIm z{4t0>$zA$`H8cigg@HT#GWdU1g+idANI2p@EdFVHM^63>_Mfc%N$SYje@ij` zVRSb3P%F3tP|gNx4@Eot$NZAugkdNo_QxDf7-}HLL?{P0DPIs%@@lI@Avg*pTA36D z-T_oNVp0{I@eEVo=2vvtGOsjdIe$G-It?m~k6&Z;{G14{$5^<`P%p9H-+H^bD+OuC zm{fS(qb=po)v3yenp)#gaAA`B@GNt^W8k8@MFnJcG%}{mMut6^s67U4Pv~p66)Y2< zt-}1oFms&3hw3^-2rPvi9COcqN!6}hK?`GA7u5df#cQ{-NL#y=q%y9Hb3uGh9XAvF z;(L>84fNrD0VyJ#Qz_Xie&!p-GM!`icBRyHc+d96J{wv;Q8yNC65yDXkWfJOVCP!T&$@AD zKJcl*hN|qfjhY4AFCF=CJtZJK)cs8LUj2IKs|yzWozYb}5nPXjrV}iEkTWFAGupv& z`Le6%eG}9$b=w(iA=Oirtow*}elM2*uak0$?}<{KK|duSitmpXd3kY&Qtr)_EhKYJ z6&FHyziGYGK931x;yc}B7LQ2-nch&MBR3SJFAZarnxl({@g7*nCFzAx%>A0eeEGpnL6Kq8wk1=^kioPd_0o{vOcSM zEhhOB?_U*IGpUU!DmAHbL<7~_)fD6$0e-U#YH_dklzog#Ows8UmDUD%P7zK~ zf$p%7N_{Y!zs?)t;*|sM*o|{a#&4p?=y-?Ge$2E^TCYB4)y6yO+vUlYof_X6cp$XZ z7F0XZn=i8GbmdyS)23V`b!oKiR&G?*8)p;iN65V?#;;LjPV4^s?#`#p(C!xSv#%pS zDLTwTv)cOEE(rE*`Lnyjn-EG-{Ojh#j(r~HF^1lU8}7IK?>mz*>9ur!`4J>(IWi2kQWO%C8EN{#!Q}3)& z^E8ND5bO4Az=c!ztWwigF&pu9r+#4MhpKzDjF&+in%>GLs=@Jh+gvB$ahCsxvQcWBj^FE(9fJniP=DL-x(O*u zo7hjt(`&(?T%+_05lQ#P%k(Hh7Fg3S6>Bq#JfaQFn?67BmGd&C;=_+-J}jJ&ec6w~ zBV-@?zpy0Z>AQ1MEGk8tcOcSDz8E zO5^2fP49IilD-e>;7v245pi8QPZ;uvt@(^U)XL8rexL(Cn9jAfG*K3zsBC~$tG|rC zld*H!D?ltAEUNNcXIzXRow787EcSNR$A=!2Afb_2poC({OIpmqUdT(g&4tkc@aKno zs}nQ+ck7C!<{v5JZ54nSb~#|EVfTadbG&vaiA?s}Vpd_;HmpuglTwy;>0sfR(kmBAv_E4ma zCENkahq17132#CA@t^l!lnj8kcFO<>2rySZl-MS|6TW+s0JsuIKKM2y{Dw|y)=c8( zu^c7Ap^$?3dFZ8v2d|!2=?TF&**;vgelyJtF5@dn>jU?87jiSk(J$Njr6qEeIwQHn zO``&MMM)8DuT!qL#H~tdm9tcJirv0S`o189=rlvHoXz^U2c3B9=xIxR9qeqL2{EewRu$)#?PJR&~UylIKV3%_=(lQY6imxI@~y z2k()lPnZl>uAF<6K^1ITw-cNI0Q~U&D}2Pb@Ht>z5paj2+-tzl=uuxGxtDZhwf3jC zp9;&%dfvoqeCm@WVdeao+D5>uLw!+|^6Ce}sV?WMLF8;|g<-cgs|=X#h}Fc7h>Du* z?F42#Y}rt_`eC)6DFNt5+jyCw!()?XZ+-djEA!S!%c03CYcg3+tt6LV7&mLD51F`M zV@dzbQJ8j+M@@bzCVcE+LvWBtIa5h|!E%YHQY1a7I$cw=`2eB#^z-M&xFW#|s0=%i z=7YJL!?rUPEhI5J9TX79m2itIBUeTZe5mFPmTRpKra@ODxkMf)UcbINW7%4VB+KcV z4M-L_rF_OzhuE@s)LNG2(}!`Egg}z|c(QlDfdjrq`+vW1HK8D*R78&j4y(=J0?Z6zXvtw8VxgOa_zA{J8<>r)Ax1XNzIpr~OQtW)sEWcyF9MDZ~ zWkBw+FG(j&m^x7VPHUD*t{xmASs~P!zo2mG+$&#kQIe~|MA(9i)SdKe-kM{TCN$}0 zH@Y-(Z{+%l=J0b|3c~na_BAI5ZBM?@cy8%pS-Jj-(N9{YF;Kp6!c3g?(b$J864-kW zI?M;7$by&W@LJ(wOkX98(c2aE7KFYA%ITAe5@8gw%$gR8X_6V6mkEYi2E%PPINH#V ztkQZ>*^1{AZ!QsBCu3bX{;>BBm2DdZ++>xUTDVvF^KI+Yxa-qXgZLd2I)`@zH~#5pMgwA-5=t1fBb@0=O? zIQ-W2=1iqodfzU4@E6kgXxA@hz(ogE#yYL~^zBTk9C!>L~zFf}iB`dL%)TyRz zjfgYk=cH16odS}|C|F4@RSlj@Gyd2Hz_*NZTt|nw=3bzpe=R4Tz_otQQO5F&fS;qD zURuM#7xL!R3sMBs-NOU2eUF8ttJ6je3H>R9PZzz2q2hERen@L1%RlG;PEklVB&9F3 zkarBEZgn!#p5wVS1H}N|TFw{Khl1@rMAvCX?_SVV9w_cPQ&zZZEu6**kk6^Gjp(XM zQBYA&jgeK6_DdVd(zbj!;-NoEPg)nnISS9g_pbp~iZpQ$yYcNPC?M5~PF7sEjU@sz zwmunmGw1F91jz`|wY%+~IKDUTte8ZGY3_TGAA3_ z4-th_2r*C*;8Do}jv3`dKxAYW&*%#`njz>= zNIIY;6{NvoqBfgqz4eVgGqTTrdz6(iVK)7}9XM*l@3AB)xZ2pZysKZf zRv^Iv+~i%0%r&ULzhq%JUK>NZ1{WLHOU&=#tW4r~XW~=IU)D=W*Tm-#ZH+2zey=0a zIi%fu>fJ4W-g-e~>SixU3U7lcZ}V<=V?<$(ro(5SMvii!mnv?CbZ#%LUd4L{@@{sJ zfMe-gIiBpvF}RPoc%y{m+CG(7p-jfnk+x!1QibW8Pi>IWk)#`TR_iwl!&TWgAGO}< zK-Y-XwVU--M9r|+mDj7mAEA`94{CIM87^OTueNoX^_Bwo46ZE-$nWG($qyl@UvNXE z-Y^}_gc<$-#@c0}oWP}p-P)d#D?DL&x4Qwpj2`TC0&#|XC>%$I|h|YI$72^Pt?krMp(dRd}+FjTYl%@0E#~y)h^#l&KI(l zoh|1kY&u9iCDT_37N%2>th?=t>Npm;{dZ$j5B)<5%u37YWd^yGk+bzRka=G1* zKh<*0unuAxe{pub)N`CpE`z!~CCSsM!?R=rblZT#O z4Ns6doK-u^O&&aZk#2tVzP;}xLJHy@Lwg#m3p)J_+Z#8zhb)0|6?Yczo01+1ZMIY% zB?0f1ia<0Ro2P&iDBqu#-#4oAOl5E2{$afcW@X znBYhl%EAU|CB<@GPl;EIMH)gZiL$h`fx*GYxDqc8fPymNM3D~Q?_E+Xju<5P8v+GJ zLha!WU@Q#$D-;F(l^guc(DxP>gbmV8ip3g>MT3DrXJ=t*?2 z$^!NeAKDRv_{L=c1LAO}Be2T>2;u_)k9m(g{9m0q;{DYn6bki^p`$T;AL2K20{r|! z!0*q;maH%q7KnfNe;ELdR}g3gw18rvyiPW7=gTZiKjQdHzy(+!$I(4<10wm&85^Vx z)&`2;g<;@Fh#Ex024)R~BbctitWgLzN#GxuzKebn!ObQw&Mq7xzz-7S^h?)wN8lE?F(9pYEq;KD5N$ny zs{%qAf@0TxxbXit{w)Y|h2b(;$g#u6+&`MX^IE}?-$6%+7H*=nzAe*l6D7ze#0O%% zj~X>A&@rUFAU{~#7eQ?{%yWb$q~f%dn3yH$lmX*=wCQNKg7{c4Acq(MOz;;gM@zcIHNEY z$Bm-L?6@_7fx>X`=y8vbm^cd%BKc#1f{$GK`Si~~zBT_H-8}*v^{gH3&5=+WgaZ(J ze9MPMTFsI!IZRE2jhdU84X>}EPoHczqNSms6^ps^f~%fDijW#r@SHO^E1b*ZsHare&=QeUNRJ%DJ(i?tA;9Ry`&2i(7|ISziwf zR=qFmgI^7L_P!0QTy-CeQt38zmw2s=zvcm*p$RX z2q*`Ao;y|PXg*Vko(+?xf+y@}Z8W`PtneO8zr}5(|5kAdUwH&P0o&^#ocZFp-;COC zEFVtnP$~;al_tSB?eAKPC%^Bicv*QEC%Z3YS(Bw)La?o@x+j9Nsc^9$k}NBmdi(il z-{z8~>zn+x%DpCO(nG}B?!jijrn!CN*5Q{T&2rGZ_mxR%de7PZk;RJ&biBRIt-aFV zqS2JVch1VU`c@e2xTqn<%OgQOds*|lxxBkuE`|HcMUy8Aogss{v8B#KtBYi!k1jM& z=}8NgtqV*xio_o@WaV@R(V9w|n%IW20}Y~S9h22nszO^=k3@D;?IH2jn&p1_a8t)iuY(;bj{#wxjH1n6=b8%lji`j3 z_4V3|b-R-1H0_P74mZX_ffUgKi$yo1x7X_HOENX!(rMf|2cy9LnuA^jI>L-C69;|v zN=U2eR^a@t17F(p6MbLyqxt(^S*6=o9y}x7wQ5p?tR}7;2*ZGO$i{;v(^O}kn1ecW zkxpZiJ>nV+31&z<*wrp`PQ_}BtbN_NrUq9(Xe-vgwN_TX*%{BRJX+c8S~NVhTy&x5 zjJ^5BllJj=Bk%F$H=Vbm<-0naC7*VRmK}t|#Hly8RJZNFzWit<+GD4qzhd`u;v^!w z)|+tqfytgtl=MM{_%tJC)T1=*pfYw~w36H1VW!h4-OIVHqH_J9Mc!>kxi)2fYDxq6 zhIh~HLJaZ?Ln6bjJtS~&TKXD;XJOa7&;un}J@_7U!pM%pcu-(bDlf@=3A^rKFct9; z{~K&l2hH5I$Gx=M@Q+{Ve}hdN;Mk+>_S={L8^a-oQ87}l%F2{*XVvLP6;U=lr?EY7 zH4;3mL=lReY`7+0pzzvorL?$@$pyJ+jU|eX+9)2yoi&w(`6)Q7rirXg$VzMWlc%m6 zOn9Uv34^X}&zvTDcl)UUj(_k^bnW4xwX9^RkroH5TAyJeh^5_iK+R8+KcZQj!64yHOIFzdsO zO8tO)sGzyKbNrdFR$sqvvfU_hegzqq=t^)84rUaY6h(Fsze*$)J#R3S@Pe`L*??>6 ztAh{F6YADy;>!X|LiHp!V`@xv&>ZoENld*EA+nz=PRG8vnK6Uk{9J20`DI)EH8$hsva!FN!XUy1TvY8OA3k2x6pDifVN zLMGqM;aTkux#_`XemkJ)eAM}c9WLp&?NSl6wvI9N5+h%9-sn)4o7CK0aa{(M0GIfX zIjXhwPbD)ub8?9judn9cc{ZiO)hTP3fg#)#@NZw7Nssu-PUokSGxvHnyUF~OH;dBc zumXBpLA{B*=S??aVyJ^y!}@4Fd8RdLw?FB+bph?EY4o0x@#}Kj*ngB-7m*vs$&o z3iQ0N+M0;HM{^72uc9n{C+zK{`UVAU4D0%RJjG0LygUR^;cPW%-1+{9R6Vnxt1yOQ zwN7DZj3SQufkw)(NxmX#FL$945?|WT$Lt=!NDH59jFOp`Q`rH%Rme9m;5H^$e2h_e z=X`1Lx^^|bDUori?N!+ajey$amLv}?*&NZ>^Qy)Cm*2b-z>j8R3Ea06DwNLM_MRlrEX7#QPE5vedNV(vgklwtv1L7Z?*OM86%O0MllS(Hkv$T)zgu z{}ir2<0%lFL;AYZgI0tf+(F~3r9A>f##+rOv67|`ZriN|B@Ww2$Xjjsc>W#3P+)|l z6j{!vMY1c6co*)+^$NVJ!@EObUJg0zKt6B~L!=4xA`Da;UK{HYom!%x^fDmVH%pEu zZF+u^8DW|Su)IQqrhl%|v$m;l5n|9&+Yan+W{6dfvF0*!u&M~8M~ImlL}h8`o(Uqp z^+LvI!)q-Wel8cf|RfS4T zazzlzy!0W^AEJS06I`zn0WbHtvGND7lT2OnZQ=2@Msi4|(QURvZut0|pRWGigYM9G`aZCYsBow5krx*S#BBLzz}m}q&8c_2 zYwk43sMF}?` zd8h3nBH!?GW9Z-Nsv2ZquGD13k^;X{Inqh9JqnWxX#v1Q8~R6l`3C zI$=>>07Ql6*~jBuUam|}vPgW`?5OKvyJ$(^PtT2&^vT-ksf+=Huhp0Z--yVlJ8R^n zJsfSJnU`VoV%0JF1uvmuU)*VSOL1c@1NS-g#Of+uzJPW*p7D0Vzie*4 z_rFH0b$3>6cqqZLIzuCv~>` zNXa?9_e!$$ml^ZC=UDDsIo$8y;#wY?6-_*f?ZxUf5n3FEE&T0WcH$%@;O{jKxB&k` z#}yCSYX6q{g#~bQD*ux-aJRqL68u`)@LkWZ@6zM?grnkypA>+*{aw#rN*(^oxucx= zC-viQe^>gm(BXd_A~`OUziaum@Zs-zj!GYXlIoxI{HqY+@0$J{bpJnS`n53P?|S|n zpYlKG`B^COziL_egO;C#6MxroR66mKuHe?i?@|7Lg%sbF9j~TekGhUZDSi?hrwsP9 zu;O^d91H$>$a8cO_LGKipv^z~4DMX)cgbIopQA&dpTvNB{COmFEcZ8D=%<`KInE;P z`|$`RI+i;2^Vg*G-!liJ{`zXe<6ncX5;V$BkKA WjR1G}2mp}b{?NGnXO!va>Hh!_DDlex literal 0 HcmV?d00001