diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py
index ecce396d..eae3e971 100644
--- a/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py
+++ b/ss2013/1_Web Mining/Uebungen/4_Uebung/code/naive_bayes.py
@@ -6,6 +6,7 @@ import random
 
 
 
+
 # config variables
 actualDir = os.path.dirname(os.path.realpath(__file__))
 dataDir = os.path.join(actualDir, '../data')
@@ -23,18 +24,10 @@ class trainingsset:
 
     #def __init__(self):
 
-    classes = ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'learned', 'lore','mystery','news','romantic']
 
     def createTrainingsset(self):
         self.splitTrainingsdataRandomly(self) # first split our data into trainings- and testdata
-
-
-    def getClassesToTrain(self):
-        for dirpath, dirnames, filenames in os.walk(trainDir):
-            # TODO: implement
-            pass
-
-
+    
 
     # copies files randomly to new directories. Each directory will contain fileCount / 2 numbers of files
     # If fileCount is uneven /trainingsdata will contain one file more than /testdata
@@ -82,8 +75,9 @@ class trainingsset:
 '''
 
 class multiclassClassifier:
-    filesToPrediction = {};
-    testFiles = {}
+    # Class-level (shared by all instances); filled by getTermfrequenciesOfClasses.
+    termfrequenciesOfClasses = {}  # class name -> {word: number of occurrences}
+    countClass = {}  # class name -> number of training files
 
 
     def writePredictionFile(self):
@@ -93,13 +87,43 @@ class multiclassClassifier:
         f.closed
         return
 
-    # reads all testData from /u4_train to the list
-    def getTestData(self):
-        listing = os.listdir(testDir)
-        for infile in listing:
-            f = open(testDir+'/'+infile, 'r')
-            self.testFiles[infile] = f.readline()
-            f.close()
+    # Calculates the per-class statistics the multiclass classifier needs.
+    def getTermfrequenciesOfClasses(self):
+        """Rebuild termfrequenciesOfClasses and countClass from trainDir.
+
+        Each directory <trainDir>/<class>/trainingsdata is scanned; every file
+        in it counts as one document of <class> and its words are added to
+        that class's term-frequency dictionary.  Entries of trainDir without
+        such a directory are skipped.
+        """
+        for classes in os.listdir(trainDir):
+            classPath = os.path.join(trainDir, classes, 'trainingsdata')
+            if not os.path.isdir(classPath):
+                continue
+            # Start both statistics from scratch so a repeated call does not count twice.
+            frequencies = self.termfrequenciesOfClasses[classes] = {}
+            self.countClass[classes] = 0
+            for infile in os.listdir(classPath):
+                self.countClass[classes] += 1
+                self.updateDictonary(os.path.join(classPath, infile), frequencies)
+
+    # "Includes" a file into the term-frequency dictionary.
+    def updateDictonary(self, pathToFile, dictonary):
+        """Add the word counts of the file at pathToFile to dictonary.
+
+        Words are lower-cased, '.' and ',' act as separators, and any run of
+        whitespace splits words.  dictonary is updated in place and returned.
+        """
+        # 'with' closes the file even if reading raises.
+        with open(pathToFile, 'r') as f:
+            for line in f:
+                # Replace punctuation before splitting so "a,b" gives two
+                # words and no key carries stray spaces or newlines.
+                cleaned = line.lower().replace(".", " ").replace(",", " ")
+                for word in cleaned.split():
+                    dictonary[word] = dictonary.get(word, 0) + 1
+        return dictonary
+
+
 
 
     def bayes(self, listOfFiles):
@@ -119,10 +143,10 @@ class multiclassClassifier:
 if __name__ == '__main__':
     ts = trainingsset()
     #ts.splitTrainingsdataRandomly();
-    ts.getClassesToTrain();
+    #ts.getClassesToTrain();
 
     mc = multiclassClassifier()
-    #mc.getTestData()
+    mc.getTermfrequenciesOfClasses()
     #mc.writePredictionFile()
 
 
diff --git a/ss2013/1_Web Mining/Uebungen/4_Uebung/solution.odt b/ss2013/1_Web Mining/Uebungen/4_Uebung/solution.odt
new file mode 100644
index 00000000..58721b24
Binary files /dev/null and b/ss2013/1_Web Mining/Uebungen/4_Uebung/solution.odt differ