u4 update

This commit is contained in:
Michael Scholz 2013-06-05 14:26:45 +02:00
parent f80de8e68d
commit 80e36cf86e
148 changed files with 78 additions and 12 deletions

View File

@ -7,26 +7,44 @@ import random
# config variables
# Every data location is derived from the directory that contains this script,
# so the script can be started from any working directory.
actualDir = os.path.split(os.path.realpath(__file__))[0]
dataDir = os.path.join(actualDir, '../data')
trainDir, testDir = (
    os.path.join(dataDir, subdir) for subdir in ('u4_train', 'u4_test')
)
'''
################################################################################################################################
--> CLASS Trainingsset <--
################################################################################################################################
'''
class trainingsset:
actualDir = os.path.dirname(os.path.realpath(__file__))
dataDir = os.path.join(actualDir, '../data')
trainDir = os.path.join(dataDir, 'u4_train')
#def __init__(self):
classes = ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'learned', 'lore','mystery','news','romantic']
def createTrainingsset(self):
    """Create the training set.

    Currently this only splits the available data into a training part and
    a test part by delegating to ``splitTrainingsdataRandomly``.
    """
    # The bound-method call already supplies ``self``; passing it a second
    # time raised a TypeError because the method takes no other argument.
    self.splitTrainingsdataRandomly()
def getClassesToTrain(self):
    """Walk the training directory tree (class discovery not implemented).

    At the moment every entry yielded by ``os.walk(trainDir)`` is discarded
    and the method returns ``None``.
    """
    for _root, _subdirs, _files in os.walk(trainDir):
        # TODO: implement
        continue
# Copies files randomly into new directories. Each directory will contain fileCount / 2 files.
# If fileCount is odd, /trainingsdata will contain one more file than /testdata.
def splitTrainingsdataRandomly(self):
for dirpath, dirnames, filenames in os.walk(self.trainDir, topdown=False):
for dirpath, dirnames, filenames in os.walk(trainDir, topdown=False):
newTrainDir = dirpath+'/trainingsdata'
newTestDir = dirpath+'/testdata'
fileCount = len(filenames)
if(fileCount > 0):
if(fileCount > 0):
#remove old dirs if they already exist
if os.path.isdir(newTrainDir):
shutil.rmtree(newTrainDir)
@ -39,8 +57,6 @@ class trainingsset:
numberOfFilesInTraining = 0
numberOfFilesInTest = 0
# Copies files randomly into new directories. Each directory will contain fileCount / 2 files.
# If fileCount is odd, /trainingsdata will contain one more file than /testdata.
for actualFile in filenames:
fileCopied = False
@ -57,7 +73,57 @@ class trainingsset:
shutil.copy(dirpath+'/'+actualFile, dirpath+'/testdata/'+actualFile)
'''
################################################################################################################################
--> CLASS MulticlassClassifier <--
################################################################################################################################
'''
class multiclassClassifier:
    """Holds test documents and their predictions and writes the result file.

    Attributes:
        filesToPrediction: maps a test file name to its predicted class.
        testFiles: maps a test file name to the first line of that file.
    """

    def __init__(self):
        # Per-instance dicts: as class-level attributes they were shared by
        # every instance, so one classifier's data leaked into the next.
        self.filesToPrediction = {}
        self.testFiles = {}

    def writePredictionFile(self):
        """Write one tab-separated "file name, prediction" line per entry.

        Lines are ordered by file name. The output file is
        ``G22_predictions.txt`` in the parent directory of ``actualDir``;
        it is opened in 'w' mode, so existing content is replaced.
        """
        with open(actualDir+'/../G22_predictions.txt', 'w') as f:
            # sorted(dict) iterates the keys on Python 2 and 3 alike,
            # whereas iterkeys() only exists on Python 2.
            for k in sorted(self.filesToPrediction):
                f.write(str(k)+'\t'+str(self.filesToPrediction[k])+'\n')

    def getTestData(self):
        """Read every file in ``testDir`` into ``self.testFiles``.

        Only the first line of each file is stored, keyed by file name.
        NOTE(review): presumably each test document consists of a single
        line -- confirm against the data set.
        """
        for infile in os.listdir(testDir):
            # The with-block closes the file even if reading raises.
            with open(os.path.join(testDir, infile), 'r') as f:
                self.testFiles[infile] = f.readline()

    def bayes(self, listOfFiles):
        """Placeholder for the classifier; does nothing and returns None."""
        # TODO: implement
        pass
'''
################################################################################################################################
--> Main method <--
################################################################################################################################
'''
# Script entry point: split the data set, then run the (still stubbed)
# class discovery and create the classifier.
if __name__ == '__main__':
    bla = trainingsset()
    bla.splitTrainingsdataRandomly()

    ts = trainingsset()
    # ts.splitTrainingsdataRandomly()
    ts.getClassesToTrain()

    mc = multiclassClassifier()
    # mc.getTestData()
    # mc.writePredictionFile()

Some files were not shown because too many files have changed in this diff Show More