web mining update
This commit is contained in:
parent
49d6eef95d
commit
f0161831c9
@ -0,0 +1,165 @@
|
||||
00f0316054ddf9504f87ea28e73683b6.txt learned
|
||||
025338a8f0b21608e843df13d54c8c70.txt belles_lettres
|
||||
05aac70e552dd51430af3c194ad0fdec.txt learned
|
||||
083ae11a870e96d1f5c9835eaf48118c.txt learned
|
||||
08932387850eea34daae545225dcf8a2.txt learned
|
||||
08bfc610c065764b9dfec4eed039fa69.txt government
|
||||
096fddf36cb7d1de3f236d85bdefb938.txt learned
|
||||
0bd46492f5ad09df3b80cb62a335e689.txt learned
|
||||
0c16f57f865c4e0e072546097b1d4adb.txt learned
|
||||
0c267d9b541a7d3e07eb0e841609b307.txt learned
|
||||
0f0c1baf6f76f3e16cac7855405029a6.txt learned
|
||||
100ec74fe0d1dd74956246df46d9b845.txt learned
|
||||
10609aabf1b727d9728d96ffe9064f11.txt government
|
||||
1361da4407b0db04d4fd9b0dc51331b8.txt belles_lettres
|
||||
13a8cf112cb61bb237b91bffac75f506.txt learned
|
||||
14c10df92c36df39ec7d541654884a6f.txt learned
|
||||
15d5e6a3a826d0471c4dec0215169c94.txt learned
|
||||
17543675c21f3d1961df70e4bc05b677.txt government
|
||||
1af257d94c2c7c31650edd49fe5c3888.txt learned
|
||||
1da789efbc92bb26ab551c749a498714.txt learned
|
||||
21ee90d49d229072cdfe3373f386fbc6.txt learned
|
||||
2269c21867d5c492b2e223bc5589897e.txt learned
|
||||
233809d44fe0b4625aba2b21e2a090c3.txt learned
|
||||
2448920f7507a90ae5de9895518a4256.txt belles_lettres
|
||||
25544eed4ec559512c188e8d73c61576.txt government
|
||||
280024d9ca375fe894e0c3852153d91c.txt learned
|
||||
2ae543a13eb502dfb34efcd691af4c19.txt learned
|
||||
2bfe3d3546118761639b703dc042174b.txt learned
|
||||
3068c168367e3ed5cac6af3bde2e566e.txt learned
|
||||
31307aa6842b932e7f3073b253b687d4.txt learned
|
||||
32e8d2431fed46743b954c35de544335.txt learned
|
||||
332614e6d84d25bcb0724247debae9a6.txt learned
|
||||
3455cd8f4a88bbe179c3d16ff2d08aed.txt learned
|
||||
3463870779e91a0b3ef42dcb5614c417.txt learned
|
||||
347c4ff3005261de62a1350cf3552db1.txt learned
|
||||
371e9a205208a5fe2d058b1373246b06.txt romance
|
||||
3a0e840d849fb693fb0350a9bca049a7.txt learned
|
||||
3cfe918d71f0216d698a656bb261754a.txt learned
|
||||
42f560bacaae1ac7960efdafc40c9957.txt learned
|
||||
444c4cd32cbc3f38551a7cddc23c65bb.txt learned
|
||||
461965dbfcd3a75d610b913fd51b93e5.txt learned
|
||||
46795cf89bb03979cf64942c96be6fa1.txt learned
|
||||
46ace5b2774edd552502d72d113a2537.txt learned
|
||||
48ffdc34faa528fe84ba1575ad6cf022.txt learned
|
||||
4a8178c328135fabac148a10a7dbd795.txt learned
|
||||
4ad03bf39d4b20405d92877d8a2d620c.txt learned
|
||||
4b1042b36a2e8d19883107213a55d4fd.txt learned
|
||||
4cd9f5cf912b67d8d541cf805e35ec9d.txt learned
|
||||
4d1f93581f8df325a0a8fd9df3a60f49.txt government
|
||||
4e7a71284825f9b8302c914b3bf65c41.txt government
|
||||
4f8b6422ab5ad965d2925bb93f1a5ad1.txt learned
|
||||
4feeaa056745eaa93855a6d05cc21d20.txt learned
|
||||
5122f89d4fff6ec6e26062ded7c5387e.txt learned
|
||||
5185857492e797eb189d39ded8a8b64f.txt learned
|
||||
53115e407b6ae7d1d6b90edd4ac7f2b7.txt learned
|
||||
541e21b0a2ab6b31a44b787ffef004d7.txt learned
|
||||
5759e663a1214223b2068cf85e891953.txt learned
|
||||
5a17378f15a3eaac38b1245f842cd0d6.txt learned
|
||||
5a3733909b787420f2ae4a84095d90b6.txt learned
|
||||
5a6fe4735711b757130334f30a5c0d8e.txt learned
|
||||
5cfb1bed9bb97b6a0aabd93ea65d677b.txt learned
|
||||
5db5250d2936c795389841699a64b1dc.txt belles_lettres
|
||||
5e9a239de5aeb08b0713d0245fc914c7.txt learned
|
||||
5f606972d66ed49044f3eadaf4eb2a54.txt learned
|
||||
60e338de63774c5ef4e7beba18bc6577.txt learned
|
||||
61f7508fa32ee25eb9ee4cf982eb6d27.txt learned
|
||||
625237d5189df7054c13e62318cd9819.txt learned
|
||||
63167efcd7a7bdbd4b742f6e482312f4.txt learned
|
||||
635e2c48ef4a37462fd8a4cd17375c5c.txt learned
|
||||
64812690c6155fba3f1aba0514496dd9.txt learned
|
||||
65f1d037cb5f92da6605cea6d0d703d0.txt belles_lettres
|
||||
66abce82b770b4368691f2926f87089e.txt belles_lettres
|
||||
691c1e5e341a19e59b27dfb4f71fc0e0.txt learned
|
||||
712b9c9622c73dbb0e6dc5ba2c231cf0.txt learned
|
||||
71aa03bfef20157578b6b613174d3fe6.txt learned
|
||||
71f153ecdef94026a97b635a40b375c8.txt learned
|
||||
7341b4fda4d972adfbf854a0d6be3400.txt learned
|
||||
74486d71097c34544195b52bdd844839.txt learned
|
||||
745df40e8d2ba4bf6abfcb197c65359c.txt government
|
||||
74a1421e246c3ffc08398609f75e292c.txt belles_lettres
|
||||
784346fad149c3736d309036e925526c.txt learned
|
||||
787d5f0883aa5fa768a624c226fc7294.txt learned
|
||||
791f3304bbd155e0211904d1d002b081.txt learned
|
||||
7a297cedd35c3ffb12ab6011d34f1244.txt belles_lettres
|
||||
7c809ae6732c39ea9a020a307ff35b3a.txt learned
|
||||
7f8b847188c77b75a2b00e906e0ae693.txt learned
|
||||
805ea08c406a72dbff755a3627aeb677.txt learned
|
||||
8459fa5551ec11ae82c5fc404f2b3988.txt learned
|
||||
853f9d4b400a22d2abbf0f2e17d6ae33.txt learned
|
||||
864ff44244fb6229ba79ce3df93df701.txt belles_lettres
|
||||
8758b603d3ce23de68cbd13665a128d4.txt learned
|
||||
87d7774f30d9221f856bab02a3f5ffc4.txt learned
|
||||
8b2d2ff3e27f2d56f5c51f85c2754cf9.txt learned
|
||||
8babd57d7cbd695d8c04d698626593e8.txt belles_lettres
|
||||
8ce16ec688419c614801d5c29cec6153.txt learned
|
||||
8d2066cd72a448eb69348dbb68f754d8.txt learned
|
||||
8fb3df3b7d96dc4383c84447a4fdd1a3.txt learned
|
||||
9101cbf87bfd4ef26e71f5b8c1e61d18.txt learned
|
||||
93c4b35148e7dcb767ea607fe7edf2c3.txt learned
|
||||
990e5a79b032e5cb9ab3e56cab71a6ef.txt learned
|
||||
9b9ed2005178bb6098ae874260128fc6.txt learned
|
||||
9c97ea8f2d4dea9c31ebe73765f2396b.txt belles_lettres
|
||||
9f08d188f8174081f5b02a7f07668846.txt learned
|
||||
9f9b19682a8401fd40bce446f33d508b.txt learned
|
||||
9fe0cd0d62c294ed1bc7b29e7e65c18a.txt government
|
||||
9febf62c0e6509f3e1ad065a5a6aef8d.txt learned
|
||||
a03db0b1e3bb05fc0f961d2a655e8dad.txt learned
|
||||
a716803991f9713e7986d252e26e7382.txt belles_lettres
|
||||
a98e64947521853ff24f52e12b77c789.txt learned
|
||||
aa5156a64316e6836b14c61879d80712.txt learned
|
||||
ac848bdeda712352e09e5fa392be4574.txt learned
|
||||
ad12792f75798b70a59b37178798e145.txt learned
|
||||
ad3b98d2d08faf751ccfd7f8d0b4f045.txt learned
|
||||
af3d510667a872139daf2df8c2a17c1e.txt learned
|
||||
b07fc0f7edd49dcd538372888095d3d6.txt learned
|
||||
b303c034152030a3594d72626d1f784d.txt learned
|
||||
b31afca8898a09c9087b272701d61c89.txt learned
|
||||
b3346fa7bed6f5b9ad06bc831c59ad6c.txt learned
|
||||
b3681b289f0dd87a5c1f9573cd825866.txt learned
|
||||
b4d65c8e57797e496834f5f6d9d3e49e.txt learned
|
||||
b65707c01e68cc6d4d59e18d9f98f423.txt learned
|
||||
b8a039ba1694ce7ce87737ce5c7480d8.txt government
|
||||
b998ac20277e09a1c3fecbdfb028b33a.txt government
|
||||
ba6843edc446617d1e6e5ec53246d849.txt belles_lettres
|
||||
bb6d375a8b847c7c10f9bdbf7324eb03.txt belles_lettres
|
||||
bbbda4cef7aeb20352c9f1d9b453a9e5.txt learned
|
||||
be6f1bd428b9933bedbc6bd401868415.txt learned
|
||||
bf8ce15b10cb746bb1181645a42012db.txt learned
|
||||
bfd0a578b0ec650d83963ddcf443f7a1.txt learned
|
||||
c1bdfb06016223b3b2c5e03e02af81f3.txt learned
|
||||
c22274385e9d77bbb900ef9db6ef66ff.txt learned
|
||||
c39fda6fbf81d87bb6508b1bbe7faf93.txt learned
|
||||
c5a19f446f960c849d67b25238a08397.txt learned
|
||||
c65f6ecdb1ba01da0e6525dd525621e1.txt learned
|
||||
c942ba590a82fd0827b79e3d6bfb25d3.txt learned
|
||||
c9497d141930518b8005ba352b4d1637.txt government
|
||||
cb24d378b3966cf4f3f663f8b13430f2.txt belles_lettres
|
||||
ce39b27592fc593d0ee117651b072cc1.txt learned
|
||||
ceacd82d3757974d93538f67b74bc25e.txt learned
|
||||
cfdd298764ed82fa2304e427dcb53db9.txt learned
|
||||
d027a28847a6228383dd9594f0984bdf.txt learned
|
||||
d1f9469856a51f6007f0f785aadf8c1f.txt learned
|
||||
d59cd5ad1285a9094a1f82a67fe4ba7b.txt learned
|
||||
d5aa7d7a519c1600db10ad01a00a7e3a.txt learned
|
||||
d86c9cee65263cdfddbfaaffab1aeeb7.txt learned
|
||||
dc713f9e699e9e610b458b5c991ce514.txt learned
|
||||
dc89c7bfd3f0eefd385f0a81c1a59981.txt learned
|
||||
dc9a7b20833ff389ae573597095f253d.txt learned
|
||||
dcacb995ec95ede56ba389128922603c.txt learned
|
||||
dd1a33aada4ffb0564f709c10b95cedc.txt government
|
||||
e058a15d26f17f7193a032eed51bbbfc.txt learned
|
||||
e2daacfa9c33ea659beaa1a7763bfe57.txt learned
|
||||
e43c7ff67adf6fdd0710c0ec91776481.txt learned
|
||||
e852750e57424cf3e5968b6a3f642553.txt learned
|
||||
e88e97dfcade103cef59919bf49f46d3.txt learned
|
||||
eb6bf7af7572cc1fa1a9aa36c0d0feb3.txt learned
|
||||
ecf327ee7344767f939a3e7695607be5.txt learned
|
||||
ef98917ffbb5b1f6e3ce0428d47f2f23.txt learned
|
||||
f083fda6715b3b3860162e8367ea1209.txt learned
|
||||
f2b173d5ffa6eda874a71aea5ba076d2.txt learned
|
||||
f3b16a0072a6afc3a64e592f6c8ab78b.txt learned
|
||||
f433e3a3fdf6455b68183790d72f7fd8.txt learned
|
||||
f7099ffdcda8a3e231652cdfbdfe1d26.txt learned
|
||||
fc97d173fc6d18448bd334ccdbf36e4c.txt learned
|
||||
fdcc797bb8b504885a2ce07017555f33.txt learned
|
||||
@ -3,6 +3,8 @@
|
||||
import os
|
||||
import shutil
|
||||
import random
|
||||
import sys
|
||||
import math
|
||||
|
||||
|
||||
|
||||
@ -14,6 +16,7 @@ trainDir = os.path.join(dataDir, 'u4_train')
|
||||
testDir = os.path.join(dataDir, 'u4_test')
|
||||
|
||||
|
||||
|
||||
'''
|
||||
################################################################################################################################
|
||||
--> CLASS Trainingsset <--
|
||||
@ -75,10 +78,10 @@ class trainingsset:
|
||||
'''
|
||||
class multiclassClassifier:
|
||||
|
||||
|
||||
filesToPrediction = {}
|
||||
termfrequenciesOfClasses = {};
|
||||
countClass = {}
|
||||
|
||||
countClasses = {}
|
||||
percentage = {}
|
||||
|
||||
def writePredictionFile(self):
|
||||
with open(actualDir+'/../G22_predictions.txt', 'w') as f:
|
||||
@ -96,10 +99,10 @@ class multiclassClassifier:
|
||||
currentPath = trainDir+'/'+classes+'/trainingsdata'
|
||||
listing = os.listdir(currentPath)
|
||||
for infile in listing:
|
||||
if self.countClass.has_key(classes):
|
||||
self.countClass[classes] += 1
|
||||
if self.countClasses.has_key(classes):
|
||||
self.countClasses[classes] += 1
|
||||
else:
|
||||
self.countClass[classes] = 1
|
||||
self.countClasses[classes] = 1
|
||||
currentPath = trainDir+'/'+classes+'/trainingsdata/'+infile
|
||||
# update termfrequency for specific class:
|
||||
self.termfrequenciesOfClasses[classes] = self.updateDictonary(currentPath, self.termfrequenciesOfClasses[classes])
|
||||
@ -121,15 +124,25 @@ class multiclassClassifier:
|
||||
else:
|
||||
dictonary[str(word)] = 1
|
||||
f.close()
|
||||
return dictonary
|
||||
|
||||
|
||||
return dictonary
|
||||
|
||||
|
||||
def bayes(self, text, termfrequenciesOfClasses, termCount, percentage):
    """Return the naive-Bayes log score of a document for a single class.

    Arguments:
    text -- iterable of text lines (e.g. the result of ``readlines()``)
    termfrequenciesOfClasses -- dict mapping word -> count for the class
        that is being scored
    termCount -- count used in the smoothed denominator ``termCount + 1``
    percentage -- prior probability P(C) of the class; must be > 0

    Returns the accumulated log score as a float (higher is more likely).
    Raises ValueError (from math.log) if ``percentage`` is not positive.
    """
    # Tokenise and normalise every line into words. The normalised words
    # are collected here so that the scoring loop below looks up words,
    # not whole lines.
    # NOTE(review): the normalisation (split on ' ', lower-case, ',' and '.'
    # replaced by a blank) presumably has to mirror the one used when the
    # term frequencies were built -- confirm against updateDictonary.
    words = []
    for line in text:
        for word in line.split(" "):
            word = word.lower()
            word = word.replace(',', ' ')
            word = word.replace(".", " ")
            words.append(word)

    denominator = termCount + 1

    # Kept at 1.0 as before: the offset is the same for every class, so it
    # does not influence which class gets the highest score.
    result = 1.0
    for word in words:
        # 'in' instead of dict.has_key(): works on Python 2 and Python 3.
        if word in termfrequenciesOfClasses:
            result += math.log((termfrequenciesOfClasses[word] + 1.) / denominator)
        else:
            # Unseen word: add-one smoothing keeps the probability non-zero.
            result += math.log(1. / denominator)
    result += math.log(percentage)
    return result
|
||||
|
||||
def bayes(self, listOfFiles):
|
||||
#TODO : implement
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
||||
@ -142,12 +155,45 @@ class multiclassClassifier:
|
||||
# main method
if __name__ == '__main__':
    # The random split of the training data is already done and is reflected
    # in the folder structure, so the split is not triggered again here.
    ts = trainingsset()

    mc = multiclassClassifier()

    # Builds the term frequencies from the 'trainingsdata' folders in the form:
    # dictionary[CLASSNAME][WORD] = Integer
    mc.getTermfrequenciesOfClasses()

    # Prior P(C) of every class: documents of the class / all documents.
    sumOfClasses = float(sum(mc.countClasses.values()))
    for classes in mc.countClasses:
        mc.percentage[classes] = mc.countClasses[classes] / sumOfClasses

    # Classify every file found in the 'testdata' folder of each class.
    for classes in os.listdir(trainDir):
        testPath = trainDir + '/' + classes + '/testdata'
        for infile in os.listdir(testPath):
            currentPath = testPath + '/' + infile
            # Parenthesised form prints the same text on Python 2 and 3.
            print(currentPath)

            # Read the file once instead of once per candidate class; the
            # 'with' block also closes it if reading fails.
            with open(currentPath, 'r') as f:
                lines = f.readlines()

            # float('-inf') replaces sys.maxint * -1, which does not exist
            # on Python 3; any real score compares greater or equal.
            maxRes = float('-inf')
            # check all possible classes and keep the best scoring one
            for cl in mc.percentage:
                temp = mc.bayes(lines, mc.termfrequenciesOfClasses[cl],
                                sumOfClasses, mc.percentage[cl])
                if temp >= maxRes:
                    maxRes = temp
                    mc.filesToPrediction[infile] = cl

    mc.writePredictionFile()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user