web mining update

This commit is contained in:
Michael Scholz 2013-06-10 10:53:44 +02:00
parent 80e36cf86e
commit 49d6eef95d
2 changed files with 44 additions and 20 deletions

View File

@ -6,6 +6,7 @@ import random
# config variables
# Directory containing this script; realpath() resolves symlinks first.
actualDir = os.path.dirname(os.path.realpath(__file__))
# Data directory, addressed relative to the script directory ('../data').
dataDir = os.path.join(actualDir, '../data')
@ -23,18 +24,10 @@ class trainingsset:
#def __init__(self):
# Fixed list of class labels.
# NOTE(review): presumably these match the per-class directory names of the
# training corpus - confirm against the data layout.
classes = ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'learned', 'lore','mystery','news','romantic']
def createTrainingsset(self):
    """Create the training set.

    Currently this only delegates to ``splitTrainingsdataRandomly`` to split
    the data into training and test data.
    """
    # ``self`` is supplied implicitly by the bound-method call. Passing it a
    # second time adds an extra positional argument, which raises TypeError
    # for a method declared as ``splitTrainingsdataRandomly(self)``.
    # NOTE(review): assumes the method takes no argument besides ``self``,
    # as the zero-argument call in the ``__main__`` block suggests - confirm.
    self.splitTrainingsdataRandomly()  # first split our data into trainings- and testdata
def getClassesToTrain(self):
    """Walk the directory tree below ``trainDir``.

    Placeholder: every entry produced by the walk is visited but nothing is
    done with it yet, and nothing is returned.
    """
    walker = os.walk(trainDir)
    for _entry in walker:
        # TODO: implement
        continue
# Copies files randomly to new directories. Each directory will contain fileCount / 2 files.
# If fileCount is odd, /trainingsdata will contain one more file than /testdata.
@ -82,8 +75,9 @@ class trainingsset:
'''
class multiclassClassifier:
# NOTE(review): these dictionaries appear to be assigned in the class body
# (they are accessed through ``self`` elsewhere), which would make them
# shared by all instances - confirm.
filesToPrediction = {};
# file name -> first line of that file; filled by getTestData
testFiles = {}
# class name -> {term: count}; filled by getTermfrequenciesOfClasses
termfrequenciesOfClasses = {};
# class name -> number of training files counted for that class
countClass = {}
def writePredictionFile(self):
@ -93,13 +87,43 @@ class multiclassClassifier:
f.closed
return
# reads the first line of every file in testDir into the testFiles dictionary
def getTestData(self):
    """Load the test documents into ``self.testFiles``.

    For every entry of ``testDir`` the first line of the file is stored
    under the file's name. Only the first line is read; an empty file is
    stored as the empty string.

    Raises:
        OSError/IOError: if ``testDir`` cannot be listed or an entry cannot
            be opened for reading.
    """
    for infile in os.listdir(testDir):
        # ``with`` closes the handle even when reading raises, so no file
        # descriptor leaks while iterating over many files.
        with open(os.path.join(testDir, infile), 'r') as f:
            self.testFiles[infile] = f.readline()
# counts the training files and collects the term frequencies of every class for the multiclass classifier
def getTermfrequenciesOfClasses(self):
    """Build per-class term frequencies and file counts from the training data.

    Every entry of ``trainDir`` is treated as a class name. For each class,
    all files in ``<trainDir>/<class>/trainingsdata`` are folded into
    ``self.termfrequenciesOfClasses[<class>]`` via ``updateDictonary``, and
    ``self.countClass[<class>]`` is incremented once per file.

    The term-frequency table of each listed class is reset on every call;
    ``countClass`` is not reset and keeps accumulating across calls.

    Raises:
        OSError: if ``trainDir`` or a class's ``trainingsdata`` directory
            cannot be listed.
    """
    # Start every class found in trainDir with an empty term-frequency table.
    for classes in os.listdir(trainDir):
        self.termfrequenciesOfClasses[classes] = {}

    # Iterate over a snapshot of the keys (as ``.keys()`` did in Python 2)
    # so the dictionary can be updated safely inside the loop.
    for classes in list(self.termfrequenciesOfClasses):
        classDir = os.path.join(trainDir, classes, 'trainingsdata')
        for infile in os.listdir(classDir):
            # dict.get() replaces dict.has_key(), which no longer exists in
            # Python 3.
            self.countClass[classes] = self.countClass.get(classes, 0) + 1
            # update termfrequency for specific class:
            self.termfrequenciesOfClasses[classes] = self.updateDictonary(
                os.path.join(classDir, infile),
                self.termfrequenciesOfClasses[classes])
# "includes" a file into the term-frequency dictionary
def updateDictonary(self, pathToFile, dictonary):
    """Add the term counts of one file to ``dictonary``.

    Each line is split on single spaces; every resulting token is
    lower-cased and has '.' and ',' replaced by a space before it is
    counted.

    Args:
        pathToFile: path of the text file to read.
        dictonary: mapping term -> count; updated in place.

    Returns:
        The same ``dictonary`` object, after the update.

    Raises:
        IOError/OSError: if the file cannot be opened.
    """
    # ``with`` closes the file even if reading fails part-way through.
    with open(pathToFile, 'r') as f:
        for line in f:
            for word in line.split(" "):
                # NOTE(review): punctuation is replaced by a space (not
                # removed) and line endings stay attached to the last token,
                # so keys such as "word " or "word \n" occur. Kept unchanged
                # because other code may rely on these exact keys.
                word = word.lower().replace(".", " ").replace(",", " ")
                # dict.get() replaces dict.has_key(), which no longer exists
                # in Python 3.
                dictonary[word] = dictonary.get(word, 0) + 1
    return dictonary
def bayes(self, listOfFiles):
@ -119,10 +143,10 @@ class multiclassClassifier:
# Script entry point: runs only when the file is executed directly.
# Commented-out calls are steps that are currently disabled.
if __name__ == '__main__':
ts = trainingsset()
#ts.splitTrainingsdataRandomly();
ts.getClassesToTrain();
#ts.getClassesToTrain();
mc = multiclassClassifier()
#mc.getTestData()
# Fills the per-class term-frequency tables and the per-class file counts.
mc.getTermfrequenciesOfClasses()
#mc.writePredictionFile()

Binary file not shown.