# Naive-Bayes multiclass text classifier (Python 2)
# -*- coding: utf-8 -*-
|
|
# imports
|
|
|
|
import os
|
|
import shutil
|
|
import random
|
|
import sys
|
|
import math
|
|
import re
|
|
from PorterStemmer import PorterStemmer
|
|
|
|
|
|
# config variables
|
|
actualDir = os.path.dirname(os.path.realpath(__file__))
|
|
dataDir = os.path.join(actualDir, '../data')
|
|
trainDir = os.path.join(dataDir, 'u4_train')
|
|
testDir = os.path.join(dataDir, 'u4_test')
|
|
stopwords = os.path.join(dataDir, 'stopwords/english')
|
|
|
|
|
|
|
|
'''
|
|
################################################################################################################################
|
|
--> CLASS Trainingsset <--
|
|
################################################################################################################################
|
|
'''
|
|
|
|
class trainingsset:
    """Splits the raw training corpus into per-class training/test subsets."""

    def createTrainingsset(self):
        """Build the trainingsset by randomly splitting the data.

        BUG FIX: the original called self.splitTrainingsdataRandomly(self),
        passing self twice -- a bound method already supplies self, so that
        raised TypeError on every call.
        """
        self.splitTrainingsdataRandomly()

    # Copies files randomly into two new sub-directories. Each directory will
    # hold roughly fileCount / 2 files; if fileCount is odd, /trainingsdata
    # receives one file more than /testdata (the training bucket uses <=,
    # the test bucket uses <).
    def splitTrainingsdataRandomly(self):
        """Randomly distribute the files of every directory under trainDir
        into fresh 'trainingsdata' and 'testdata' sub-directories."""
        for dirpath, dirnames, filenames in os.walk(trainDir, topdown=False):
            newTrainDir = os.path.join(dirpath, 'trainingsdata')
            newTestDir = os.path.join(dirpath, 'testdata')
            fileCount = len(filenames)

            if fileCount > 0:
                # remove old split directories if they already exist
                if os.path.isdir(newTrainDir):
                    shutil.rmtree(newTrainDir)
                if os.path.isdir(newTestDir):
                    shutil.rmtree(newTestDir)

                # create new directories
                os.mkdir(newTestDir)
                os.mkdir(newTrainDir)
                numberOfFilesInTraining = 0
                numberOfFilesInTest = 0

                for actualFile in filenames:
                    fileCopied = False
                    # flip a coin until the file lands in a bucket with room;
                    # one bucket always has capacity left, so this terminates
                    while not fileCopied:
                        randomBool = bool(random.getrandbits(1))
                        if randomBool and numberOfFilesInTraining <= fileCount / 2:
                            numberOfFilesInTraining += 1
                            shutil.copy(os.path.join(dirpath, actualFile),
                                        os.path.join(newTrainDir, actualFile))
                            fileCopied = True
                        elif numberOfFilesInTest < fileCount / 2:
                            numberOfFilesInTest += 1
                            fileCopied = True
                            shutil.copy(os.path.join(dirpath, actualFile),
                                        os.path.join(newTestDir, actualFile))
|
|
|
|
|
|
|
|
|
|
'''
|
|
################################################################################################################################
|
|
--> CLASS MulticlassClassifier <--
|
|
################################################################################################################################
|
|
'''
|
|
class multiclassClassifier:
    """Naive-Bayes-style multiclass text classifier.

    State (filled by getTermfrequenciesOfClasses() and the main script):
      filesToPrediction        -- test filename -> predicted class name
      termfrequenciesOfClasses -- class name -> {term: frequency}
      countClasses             -- class name -> number of training documents
      percentage               -- class name -> prior P(C)

    NOTE(review): these dicts were originally mutable CLASS attributes and
    therefore shared between all instances; they are now per-instance.
    """

    def __init__(self):
        self.filesToPrediction = {}          # filename -> predicted class
        self.termfrequenciesOfClasses = {}   # class -> {term: count}
        self.countClasses = {}               # class -> number of documents
        self.percentage = {}                 # class -> prior P(C)
        self._stopwordSet = None             # lazy cache for isStopWord()

    def writePredictionFile(self):
        """Write one '<filename>\\t<predicted class>' line per test file,
        sorted by filename, to ../G22_predictions.txt."""
        with open(actualDir+'/../G22_predictions.txt', 'w') as f:
            for k in sorted(self.filesToPrediction):
                f.write(str(k)+'\t'+str(self.filesToPrediction[k])+'\n')
        return

    # calculates all necessary stuff for multiclass classifier
    def getTermfrequenciesOfClasses(self):
        """Scan trainDir and fill termfrequenciesOfClasses (per-class term
        counts) and countClasses (documents per class)."""
        for classname in os.listdir(trainDir):          # one subdir per class
            self.termfrequenciesOfClasses[classname] = {}
        for classname in self.termfrequenciesOfClasses.keys():
            classDir = trainDir+'/'+classname
            for infile in os.listdir(classDir):
                # one document counted per file in the class directory
                self.countClasses[classname] = self.countClasses.get(classname, 0) + 1
                currentPath = classDir+'/'+infile
                # update the term frequencies of this class with this file
                self.termfrequenciesOfClasses[classname] = self.updateDictonary(
                    currentPath, self.termfrequenciesOfClasses[classname])

    # "includes" a file into the termfrequency dictionary
    def updateDictonary(self, pathToFile, dictonary):
        """Add the (cleaned) term frequencies of one file to 'dictonary'
        and return it. The file handle is now closed even on error."""
        with open(pathToFile, 'r') as f:
            for line in f:
                for word in line.split(" "):
                    word = self.clean_word(word)
                    if word != "":
                        dictonary[word] = dictonary.get(word, 0) + 1
        return dictonary

    def bayes(self, text, termfrequenciesOfClasses, termCount, percentage, cl):
        """Score 'text' (an iterable of lines) against one class; higher is
        better. Known terms add their relative frequency, unknown terms add
        a smoothing constant; the total is then reweighted by the class
        prior 'percentage' and log-transformed.

        'cl' (the class name) is kept for interface compatibility; it was
        only used by debug prints in earlier experiments.

        NOTE(review): the original kept ~300 lines of alternative scoring
        formulas and their accuracy logs (plain log-likelihood naive Bayes
        variants, 21%-40% accuracy) as a dead triple-quoted string here; the
        formula below was the best performer (~46% accuracy) and is the only
        live code. Also note two unguarded hazards, preserved as-is so the
        scoring does not change: notwordcount == 0 raises ZeroDivisionError,
        and wordcount == 0 makes result 0 -> math.log domain error.
        """
        result = 1.0
        wordcount = 0.0
        notwordcount = 0.0
        for line in text:
            for word in line.split(" "):
                word = self.clean_word(word)
                if word != "":
                    if word in termfrequenciesOfClasses:
                        wordcount += 1
                        result += termfrequenciesOfClasses[word] / (termCount + 1)
                    else:
                        notwordcount += 1
                        result += 1. / (termCount + 1)
        # weight by the share of known words and the inverse class prior
        result *= (1 - percentage) * wordcount
        result /= percentage * notwordcount
        return math.log(result)

    def clean_word(self, word):
        """Normalise a raw token; return "" when it should be ignored.

        Pipeline: lowercase -> strip -> keep [a-z] runs only -> drop words
        of length <= 4 -> drop stopwords -> Porter-stem the rest.
        """
        word = word.lower()                          # lowercase
        word = word.strip()                          # remove line endings etc.
        word = "".join(re.findall("[a-z]+", word))   # characters only
        if len(word) <= 4:                           # only words longer than 4
            return ""
        if self.isStopWord(word):                    # stopword filter
            return ""
        p = PorterStemmer()                          # stemming
        word = p.stem(word, 0, len(word)-1)
        return word

    def isStopWord(self, word):
        """Return True if 'word' is in the stopword list.

        The original re-opened and re-read the stopword file for EVERY token
        and never closed the handle; the list is now read once and cached as
        a set (same result, O(1) lookup).
        """
        if self._stopwordSet is None:
            with open(stopwords, 'r') as f:
                self._stopwordSet = set(line.strip() for line in f)
        return word in self._stopwordSet
|
|
|
|
|
|
|
|
|
|
'''
|
|
################################################################################################################################
|
|
--> Main method <--
|
|
################################################################################################################################
|
|
'''
|
|
|
|
# main method
|
|
# main method
if __name__ == '__main__':
    ts = trainingsset()
    #ts.splitTrainingsdataRandomly(); already done -> specific folder structure

    mc = multiclassClassifier()

    # calculates a dictonary depending on all testdata with the form:
    # dictonary[CLASSNAME][WORD] = Integer
    mc.getTermfrequenciesOfClasses()

    # calculates the prior P(C) for all given classes as the fraction of
    # training documents that belong to each class
    sumOfClasses = 0.0
    for v in mc.countClasses.values():
        sumOfClasses += v

    for classname in mc.countClasses.keys():
        mc.percentage[classname] = mc.countClasses[classname] / sumOfClasses

    # classify every test document with the class of maximal bayes() score.
    # (A dead assignment using the stale loop variable 'classes' was removed
    # here -- it was immediately overwritten and would raise NameError when
    # countClasses is empty.)
    for infile in os.listdir(testDir):
        currentPath = testDir + '/' + infile
        # read the document ONCE instead of re-opening it per candidate class
        f = open(currentPath, 'r')
        documentLines = f.readlines()
        f.close()
        maxRes = -float('inf')   # replaces py2-only sys.maxint * -1
        # check all possible classes, keep the best-scoring one
        for cl in mc.percentage.keys():
            temp = mc.bayes(documentLines, mc.termfrequenciesOfClasses[cl],
                            sumOfClasses, mc.percentage[cl], cl)
            if temp >= maxRes:
                maxRes = temp
                mc.filesToPrediction[infile] = cl
        print(currentPath + " " + mc.filesToPrediction[infile])

    mc.writePredictionFile()
|
|
|
|
|
|
|
|
|
|
|