# Scrape metadata: pasted 2013-06-17 01:00:59 +02:00 — 348 lines, 17 KiB, Python

# -*- coding: utf-8 -*-
# imports
import os
import shutil
import random
import sys
import math
import re
from PorterStemmer import PorterStemmer
# config variables
actualDir = os.path.dirname(os.path.realpath(__file__))  # absolute directory containing this script
dataDir = os.path.join(actualDir, '../data')  # root of the data directory, sibling of the script dir
trainDir = os.path.join(dataDir, 'u4_train')  # training corpus: one subdirectory per class
testDir = os.path.join(dataDir, 'u4_test')   # test corpus: flat directory of files to classify
stopwords = os.path.join(dataDir, 'stopwords/english')  # path to the English stopword list (one word per line)
'''
################################################################################################################################
--> CLASS Trainingsset <--
################################################################################################################################
'''
class trainingsset:
    """Randomly splits each class directory of the corpus into a training
    half and a test half."""

    def createTrainingsset(self):
        """Build the training/test split for the whole corpus.

        BUG FIX: the original called self.splitTrainingsdataRandomly(self),
        passing ``self`` twice, which raised a TypeError on every call.
        """
        self.splitTrainingsdataRandomly()

    def splitTrainingsdataRandomly(self):
        """Copy files randomly into per-directory split folders.

        For every directory below ``trainDir`` that contains files, create
        (fresh) subdirectories ``trainingsdata`` and ``testdata`` and copy
        each file into one of them by coin flip, subject to quotas: each
        side holds roughly fileCount / 2 files; if fileCount is odd,
        ``trainingsdata`` receives one file more than ``testdata``.
        """
        for dirpath, dirnames, filenames in os.walk(trainDir, topdown=False):
            newTrainDir = dirpath + '/trainingsdata'
            newTestDir = dirpath + '/testdata'
            fileCount = len(filenames)
            if fileCount > 0:
                # remove stale split directories from a previous run
                if os.path.isdir(newTrainDir):
                    shutil.rmtree(newTrainDir)
                if os.path.isdir(newTestDir):
                    shutil.rmtree(newTestDir)
                # create fresh target directories
                os.mkdir(newTestDir)
                os.mkdir(newTrainDir)
                numberOfFilesInTraining = 0
                numberOfFilesInTest = 0
                for actualFile in filenames:
                    fileCopied = False
                    # retry the coin flip until the file lands in a side
                    # whose quota is not yet exhausted (terminates with
                    # probability 1; combined quotas exceed fileCount)
                    while not fileCopied:
                        randomBool = bool(random.getrandbits(1))
                        # // makes the Python-2 floor division explicit
                        if randomBool and numberOfFilesInTraining <= fileCount // 2:
                            numberOfFilesInTraining += 1
                            shutil.copy(dirpath + '/' + actualFile,
                                        dirpath + '/trainingsdata/' + actualFile)
                            fileCopied = True
                        elif numberOfFilesInTest < fileCount // 2:
                            numberOfFilesInTest += 1
                            fileCopied = True
                            shutil.copy(dirpath + '/' + actualFile,
                                        dirpath + '/testdata/' + actualFile)
'''
################################################################################################################################
--> CLASS MulticlassClassifier <--
################################################################################################################################
'''
class multiclassClassifier:
    """Naive-Bayes-style multiclass text classifier over the corpus in
    ``trainDir``.

    BUG FIX: the per-instance dictionaries were previously mutable CLASS
    attributes, silently shared between all instances; they are now
    created per instance in ``__init__`` (the constructor signature is
    unchanged, so existing callers are unaffected).
    """

    def __init__(self):
        # filename -> predicted class name
        self.filesToPrediction = {}
        # class name -> {term: frequency in that class's training files}
        self.termfrequenciesOfClasses = {}
        # class name -> number of training documents seen for that class
        self.countClasses = {}
        # class name -> prior probability P(C)
        self.percentage = {}
        # lazily-loaded stopword cache (set of words); see isStopWord
        self._stopwordSet = None

    def writePredictionFile(self):
        """Write 'filename<TAB>predictedClass' lines, sorted by filename,
        to ../G22_predictions.txt next to the data directory.

        (The original ended with a no-op ``f.closed`` attribute access;
        the ``with`` block already closes the file.)
        """
        with open(actualDir + '/../G22_predictions.txt', 'w') as f:
            for k in sorted(self.filesToPrediction):
                f.write(str(k) + '\t' + str(self.filesToPrediction[k]) + '\n')
        return

    def getTermfrequenciesOfClasses(self):
        """Scan trainDir (one subdirectory per class) and populate
        ``termfrequenciesOfClasses`` and ``countClasses``.

        NOTE(review): every directory entry under a class folder is
        treated as a training file — presumably the class folders contain
        only plain files; verify against the corpus layout.
        """
        for cls in os.listdir(trainDir):  # one entry per class
            self.termfrequenciesOfClasses[cls] = {}
        for cls in self.termfrequenciesOfClasses.keys():
            classDir = trainDir + '/' + cls
            for infile in os.listdir(classDir):
                # count documents per class (used later for the prior)
                self.countClasses[cls] = self.countClasses.get(cls, 0) + 1
                filePath = classDir + '/' + infile
                # merge this file's term frequencies into the class dict
                self.termfrequenciesOfClasses[cls] = self.updateDictonary(
                    filePath, self.termfrequenciesOfClasses[cls])

    def updateDictonary(self, pathToFile, dictonary):
        """Add the term frequencies of the file at ``pathToFile`` to
        ``dictonary`` (in place) and return it.

        Tokens are space-split per line and normalized via clean_word;
        empty results (short words, stopwords) are skipped.
        """
        # ``with`` guarantees the file is closed even if clean_word raises
        with open(pathToFile, 'r') as f:
            for line in f:
                for word in line.split(" "):
                    word = self.clean_word(word)
                    if word != "":
                        dictonary[word] = dictonary.get(word, 0) + 1
        return dictonary

    def bayes(self, text, termfrequenciesOfClasses, termCount, percentage, cl):
        """Score ``text`` (a list of lines) against one class.

        :param text: lines of the document to classify
        :param termfrequenciesOfClasses: term->frequency dict of the class
        :param termCount: smoothing denominator (the caller passes the
            total number of training documents)
        :param percentage: prior P(C) of the class
        :param cl: class name (kept for interface compatibility; only
            used for debugging output in earlier revisions)
        :returns: log of the accumulated score (higher is better)

        NOTE(review): raises ZeroDivisionError when no unknown word occurs
        (notwordcount == 0) and ValueError (math domain error) when no
        known word occurs (result == 0) — same as the original; confirm
        the corpus guarantees both counts are nonzero.

        (A ~90-line block of historical experiment logs and dead variants
        that sat here as a no-op string literal has been removed.)
        """
        result = 1.0
        wordcount = 0.0     # tokens known to the class model
        notwordcount = 0.0  # tokens unseen in the class model
        for line in text:
            for word in line.split(" "):
                word = self.clean_word(word)
                if word != "":
                    if word in termfrequenciesOfClasses:
                        wordcount += 1
                        # reward known words proportionally to frequency
                        result += termfrequenciesOfClasses[word] / (termCount + 1)
                    else:
                        notwordcount += 1
                        # smoothed contribution for unseen words
                        result += 1. / (termCount + 1)
        # weight by known-word count and the inverse prior, penalize by
        # unknown-word count and the prior (empirically best variant)
        result *= (1 - percentage) * wordcount
        result /= percentage * notwordcount
        return math.log(result)

    def clean_word(self, word):
        """Normalize one token: lowercase, strip, keep [a-z] runs only,
        drop tokens of length <= 4 and stopwords, then Porter-stem.
        Returns "" for tokens that should be ignored."""
        word = word.lower()
        word = word.strip()  # remove line endings etc.
        word = "".join(re.findall("[a-z]+", word))  # letters only
        if len(word) <= 4:  # keep only words longer than 4 characters
            return ""
        if self.isStopWord(word):
            return ""
        p = PorterStemmer()
        return p.stem(word, 0, len(word) - 1)

    def isStopWord(self, word):
        """Return True if ``word`` appears in the stopword file.

        PERFORMANCE FIX: the original re-read the stopword file for every
        single token; the list is now loaded once into a set and cached.
        """
        if self._stopwordSet is None:
            with open(stopwords, 'r') as f:
                self._stopwordSet = set(line.strip() for line in f)
        return word in self._stopwordSet
'''
################################################################################################################################
--> Main method <--
################################################################################################################################
'''
# main method
if __name__ == '__main__':
ts = trainingsset()
#ts.splitTrainingsdataRandomly(); already done -> specific folder structure
mc = multiclassClassifier()
# calculates a dictonary depending on all testdata with the form:
# dictonary[CLASSNAME][WORD] = Integer
mc.getTermfrequenciesOfClasses()
# calculates the percentage of P(C) for all given classes
sumOfClasses = 0.0
for v in mc.countClasses.values():
sumOfClasses += v
for classes in mc.countClasses.keys():
mc.percentage[classes] = mc.countClasses[classes]/sumOfClasses
#class_matches = []
#listing = os.listdir(trainDir)
#for classes in listing: # classes
path = trainDir+'/'+classes+'/testdata'
path = testDir
listing = os.listdir(path)
for infile in listing:
currentPath = testDir+'/'+infile
#print currentPath
maxRes = sys.maxint * -1
# check all possible classes
for cl in mc.percentage.keys():
f = open(currentPath, 'r')
temp = mc.bayes(f.readlines(), mc.termfrequenciesOfClasses[cl], sumOfClasses, mc.percentage[cl], cl)
#class_matches.append([infile,cl,temp])
#print class_matches
if (temp >= maxRes):
maxRes = temp
mc.filesToPrediction[infile] = cl
f.close()
print currentPath + " " + mc.filesToPrediction[infile]
mc.writePredictionFile()