diff --git a/evaluate_data_with_stop.py b/evaluate_data_with_stop.py
new file mode 100755
index 0000000000000000000000000000000000000000..8490311e3c2f4841298d9c2a72c171823ee2e818
--- /dev/null
+++ b/evaluate_data_with_stop.py
@@ -0,0 +1,303 @@
+#!/usr/bin/env python
+
+import fnmatch
+import os
+import sys
+from operator import itemgetter
+import time
+import matplotlib.pyplot as plt
+import matplotlib.patches as mpatches
+import numpy as np
+import matplotlib.mlab as mlab
+import csv
+
+DO_PRINT_RECORDS = False
+PATH_TO_RECORDS = '' # gets set from command line parameter
+
+# data/record filenames
+INFO_EXTENSION = '.info'
+FULL_RECORD_FILENAME = 'fullRecord' + INFO_EXTENSION
+COMMIT_INFO_FILENAME = 'buildInfo_musl_with_stop' + INFO_EXTENSION
+BUILD_TIME_DATA_FILENAME = 'totalBuildTimes.csv'
+BUILD_TIME_FILENAME = 'totalBuildTimes.pdf'
+BUILD_TIME_DATA_HEADER = ['totalParseTimes', 'totalHashTimes', 'totalCompileTimes', 'diffToBuildTimes', 'totalBuildTimes']
+
+
+def abs_path(filename):
+    """Prepends the absolute path to the filename.
+    """
+    return PATH_TO_RECORDS + '/../' + filename
+
+
+def getListOfFiles(directory):
+    for root, dirnames, filenames in os.walk(directory):
+        for filename in fnmatch.filter(filenames, '*' + INFO_EXTENSION):
+            yield os.path.join(root, filename)
+
+
+errorCount = 0
+astDifferObjSameCount = 0
+missingCount = 0
+
+################################################################################
+#
+#
+################################################################################
+
+
+keyTranslationToNr = {
+    'start-time': 0,
+    'hash-start-time': 1,
+    'object-hash': 2,
+    'return-code': 3,
+    'parse-duration': 4,
+    'object-file-size': 5,
+    'processed-bytes': 6,
+    'hash-duration': 7,
+    'filename': 8,
+    'project': 9,
+    'compile-duration': 10, # time the compiler was running (incl. parse-duration)
+    'ast-hash': 11,
+    'commit-hash': 12,
+    'element-hashes': 13,
+    'commit-time': 14,
+    'build-time': 15, # time the 'make -jx' command took, times x
+    'files': 16,
+    'files-changed': 17,
+    'insertions': 18,
+    'deletions': 19,
+    'run_id': 20
+}
+keyTranslationFromNr = {v: k for k, v in keyTranslationToNr.items()}
+
+keyTranslation = keyTranslationToNr.copy()
+keyTranslation.update(keyTranslationFromNr)
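+# the merged dict resolves both directions, e.g. tr('build-time') -> 15 and tr(15) -> 'build-time'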
+
+
+def tr(key):
+    """lookup key translation (both directions)"""
+    return keyTranslation[key]
+
+
+def buildFullRecordTo(pathToFullRecordFile):
+    """structure of full record:
+    {commitID: {'build-time': time, files: {filename: {record}, filename: {record}}}}
+    """
+    fullRecord = buildFullRecord()
+    if DO_PRINT_RECORDS:
+        f = open(pathToFullRecordFile, 'w')
+        try:
+            f.write(repr(fullRecord) + "\n")
+        except MemoryError as me:
+            print me
+            raise
+        finally:
+            print time.ctime()
+            f.close()
+        print "built full record, wrote to " + pathToFullRecordFile
+
+    return fullRecord
+
+
+def buildFullRecord():
+    """Builds a complete record from all the single hash records.
+    The records are grouped by their commitIDs.
+    """
+    fullRecord = {}
+    with open(abs_path(COMMIT_INFO_FILENAME), 'r') as commitInfoFile:
+        commitInfo = eval(commitInfoFile.read())
+        for run_id in commitInfo:
+            if not isinstance(run_id, int): # dict also contains key 'commit-hash'
+                continue
+            currentRecord = {}
+            currentRecord[tr('filename')] = commitInfo[run_id]['filename']
+            currentRecord[tr('build-time')] = commitInfo[run_id]['build-time']
+            currentRecord[tr('files')] = {}
+            fullRecord[run_id] = currentRecord
+
+    for recordFilename in getListOfFiles(PATH_TO_RECORDS):
+        for line in open(recordFilename):
+            data = eval(line)
+#            commitID = data['commit-hash']
+#            del data['commit-hash']
+
+            objFilename = data['obj-file']
+            del data['obj-file']
+
+            # delete everything that is not needed
+            del data['return-code']
+            del data['element-hashes']
+            del data['project']
+            del data['processed-bytes']
+            del data['object-file-size']
+
+            run_id = data['run_id']
+
+            dataNewKeys = {tr(k): v for k, v in data.items()}
+            fullRecord[run_id][tr('files')][objFilename] = dataNewKeys
+
+    return fullRecord
+
+
+################################################################################
+
+def write_to_csv(data, columnNames, filename):
+    with open(filename, "w") as csvFile:
+        writer = csv.writer(csvFile)
+        writer.writerow(columnNames)
+        for line in data:
+            writer.writerow(line)
+
+
+def printAvg(data, name):
+    print 'avg %s: %f' % (name, sum(data)/float(len(data)))
+
+################################################################################
+
+parseColor, hashColor, compileColor, remainColor = ('#FFFF66','#FF0000','#3399FF','#008800')
+
+def plot_build_time_graph1(data):
+    plotBuildTimeCompositionGraph(data[0], data[1], data[2], data[3])
+
+
+def plotBuildTimeCompositionGraph(parseTimes, hashTimes, compileTimes, diffToBuildTime): # times in ms
+    fig, ax = plt.subplots()
+
+    ax.stackplot(np.arange(1, len(parseTimes)+1), # x axis
+                 [parseTimes, hashTimes, compileTimes,
+                  #diffToBuildTime
+                 ], colors=[parseColor,hashColor,compileColor,
+                            # remainColor
+                 ], edgecolor='none')
+    plt.xlim(1,len(parseTimes))
+    plt.xlabel('commits')
+    plt.ylabel('time [ms]')
+    ax.set_yscale('log')
+    lgd = ax.legend([#mpatches.Patch(color=remainColor),
+                     mpatches.Patch(color=compileColor),
+                     mpatches.Patch(color=hashColor),
+                     mpatches.Patch(color=parseColor)],
+                    [#'remaining build time',
+                     'compile time', 'hash time', 'parse time'],
+                    loc='center left', bbox_to_anchor=(1, 0.5))
+    fig.savefig(abs_path(BUILD_TIME_FILENAME), bbox_extra_artists=(lgd,), bbox_inches='tight')
+
+    printAvg(parseTimes, 'parse')
+    printAvg(hashTimes, 'hash')
+    printAvg(compileTimes, 'compile')
+    printAvg(diffToBuildTime, 'remainder')
+
+
+################################################################################
+
+def makeGraphs(fullRecord):
+    # data for build time graphs
+    totalParseTimes = []
+    totalHashTimes = []
+    totalCompileTimes = []
+    totalBuildTimes = []
+    diffToBuildTimes = []
+
+#    freshBuildRecord = fullRecord[0]
+    for run_id in sorted(fullRecord): # iterate runs in ascending order
+        if run_id < 2: # skip fresh build (and also 1st run, seems to be buggy...)
+            continue
+
+        currentRecord = fullRecord[run_id]
+        currentFiles = currentRecord[tr('files')]
+        filesChanged = len(currentFiles) # count changed files per run #TODO!
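+        # NOTE: every entry in currentFiles stems from one record (i.e. one compiler
+        # invocation) of this run, so this presumably counts only the recompiled files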
+
+        print currentRecord[tr('filename')]
+
+        totalParseDuration = 0
+        totalHashDuration = 0
+        totalCompileDuration = 0
+        for filename in currentFiles: # deal with first commit
+#            if tr('ast-hash') not in currentFiles[filename].keys():
+#                print "error: missing AST hash for file %s" % filename
+#                continue
+            currentFileRecord = currentFiles[filename]
+            totalParseDuration += currentFileRecord[tr('parse-duration')]
+            totalHashDuration += currentFileRecord[tr('hash-duration')]
+            totalCompileDuration += currentFileRecord[tr('compile-duration')]
+
+#        if totalParseDuration == 0: # or (totalCompileDuration/1e6) > 500000:
+#            continue
+
+        totalParseTimes.append(totalParseDuration / 1e6) # nano to milli
+        totalHashTimes.append(totalHashDuration / 1e6)
+        totalCompileTimes.append(totalCompileDuration / 1e6)
+        buildTime = currentRecord[tr('build-time')]
+        totalBuildTimes.append(buildTime / 1e6)
+        diffToBuildTimes.append((buildTime - totalParseDuration - totalHashDuration - totalCompileDuration) / 1e6)
+
+        print 'run_id %d, #filesChanged: %d' % (run_id, filesChanged)
+
+    printAvg(totalBuildTimes, 'total')
+
+    # save data to csv files
+    buildTimeData = np.column_stack((totalParseTimes, totalHashTimes, totalCompileTimes, diffToBuildTimes, totalBuildTimes))
+    write_to_csv(buildTimeData, BUILD_TIME_DATA_HEADER, abs_path(BUILD_TIME_DATA_FILENAME))
+
+    plotBuildTimeCompositionGraph(totalParseTimes, totalHashTimes, totalCompileTimes, diffToBuildTimes)
+
+
+################################################################################
+"""functions for reading data from the csv files to skip full record building"""
+
+def csv_files_are_existing():
+    return os.path.isfile(abs_path(BUILD_TIME_DATA_FILENAME))
+
+def read_from_csv(filename, columnNames):
+    data = []
+    with open(filename) as csv_file:
+        reader = csv.reader(csv_file)
+        is_header_row = True
+        for row in reader:
+            if is_header_row:
+                for col in row:
+                    data.append([])
+                is_header_row = False
+            else:
+                colnum = 0
+                for col in row:
+                    data[colnum].append(float(col))
+                    colnum += 1
+    return data
+
+
+def read_csv_data_and_plot_graphs():
+    """Build the graphs from the csv data of previous runs,
+    which saves the time of rebuilding the whole "full record".
+ """ + build_time_data = read_from_csv(abs_path(BUILD_TIME_DATA_FILENAME), BUILD_TIME_DATA_HEADER) + plot_build_time_graph1(build_time_data) + + +################################################################################ + + +# main: +if (len(sys.argv) > 1): + PATH_TO_RECORDS = sys.argv[1] + path_to_full_record_file = abs_path(FULL_RECORD_FILENAME) + print "Starting at %s" % time.ctime() + +# if csv_files_are_existing(): +# # skip building record, read csv data from files and work with that +# print "reading from csv files" +# read_csv_data_and_plot_graphs() +# print "finished graphs at %s" % time.ctime() +# else: + full_record = buildFullRecordTo(path_to_full_record_file) + print "finished building/loading full record at %s" % time.ctime() + + makeGraphs(full_record) + print "finished graphs at %s" % time.ctime() + + print "Finished at %s" % time.ctime() +else: + print "Missing path to record files.\nUsage:\n\t%s PATH_TO_RECORDS" % sys.argv[0] + diff --git a/run_test_with_stopping.py b/run_test_with_stopping.py index d69ec1f20b6c08853e792c845c8b9615feeca8e3..d4d348589e9e1e2dafdd98d208a76f5e3d47552a 100755 --- a/run_test_with_stopping.py +++ b/run_test_with_stopping.py @@ -7,72 +7,133 @@ from subprocess import check_output import datetime import time import sys +import fnmatch +# remove unused imports + make_j = 16 #TODO: make paths independent => make project path command line argument! -projectName = "musl" -pathToProject = os.path.abspath("../hash_projects/musl") -clanghashWrapper = os.path.abspath("build/wrappers/clang") -buildInfoFilePath = os.path.abspath("build/muslHashes/buildInfo_%s.info" % projectName) - - -def checkout(commitID): - """checkout commit with commitID""" - os.chdir(pathToProject) - subprocess.call(["git", "checkout", "-f", "-q", commitID]) - +project_name = "musl" +path_to_project = os.path.abspath("../hash_projects/musl") +clanghash_wrapper = os.path.abspath("build/wrappers/clang") +build_info_file_path = os.path.abspath("build/muslHashes/buildInfo_%s_with_stop.info" % project_name) + + +def get_source_files(directory): + for root, dirnames, filenames in os.walk(directory): + for filename in filenames: + if filename.endswith(('.h','.c')): + yield os.path.join(root, filename) + +def touch(filename): + with open(filename, 'a'): + os.utime(filename, None)#TODO: evtl None als 2. param + +def git_checkout(commit_id): + """checkout commit with commit_id""" + os.chdir(path_to_project) + subprocess.call(["git", "checkout", "-f", "-q", commit_id]) + +def make_clean(): + subprocess.call(["make", "clean"]) + +def git_clean(): + subprocess.call(["git", "clean", "-f", "-q", "-x"]) + +def call_configure(): + """ + TODO: rename this function to "init_build_environment" or sth like that and replace it for other projects. also think about calling call_make here (for first build). + """ + subprocess.call(["./configure"]) + +def call_make(): + """this function builds the project (e.g. calls make) + TODO: replace this method for other projects; or move to class and then replace that class. 
+ """ + print "calling make -j%d" % make_j + p = subprocess.Popen(["make", "-j%d" % make_j], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + out, err = p.communicate() + retcode = p.wait() + print ">>>>>>>>>>" + print out + print err + print "<<<<<<<<<<" -def getCommitTime(commitID): - os.chdir(pathToProject) - commitTime = check_output(["git", "show", "-s", "--format=%ct", commitID]) - return commitTime.replace('\n', '') +################################################################################ -DEBUG = 1 -def log(message): - if (DEBUG == 1): - print message +print "Starting at %s" % datetime.datetime.now() -################################################################################ +filecount = 0 +for root, dirnames, filenames in os.walk(path_to_project): + for filename in fnmatch.filter(filenames, '*.c'): + filecount += 1 +print "#files: %d" % filecount -log("Starting at %s" % datetime.datetime.now()) -os.environ['CC'] = clanghashWrapper -os.environ['PROJECT'] = projectName +os.environ['CC'] = clanghash_wrapper +os.environ['PROJECT'] = project_name os.environ['STOP_IF_SAME_HASH'] = '1' #reset to latest version -checkout("master") -log("calling make clean and git clean") -subprocess.call(["make", "clean"]) -subprocess.call(["git", "clean", "-f", "-q", "-x"]) -subprocess.call(["./configure"]) - -commitInfo = {} -commitID = check_output(["git", "log", "-1", "--pretty='%H'"]) # get current commit hash -commitInfo['commit-hash'] = commitID -os.environ['COMMIT_HASH'] = commitID +git_checkout("master") +print "calling make clean and git clean" +make_clean() +git_clean() +call_configure() + +build_info = {} +commit_id = check_output(["git", "log", "-1", "--pretty='%H'"]) # get current commit hash +build_info['commit-hash'] = commit_id +os.environ['COMMIT_HASH'] = commit_id + +print "starting initial build at %s" % datetime.datetime.now() #TODO: evtl. refactor duplicate code +run_id = 0 # for collecting the data later +os.environ['RUN_ID'] = "%d" % run_id +build_info[run_id] = {} +current_build_info = build_info[run_id] +current_build_info['filename'] = 'FRESH_BUILD' + +start_time = time.time() +call_make() +build_time = int((time.time() - start_time) * 1e9) * make_j # nano * nr of threads +current_build_info['build-time'] = int((time.time() - start_time) * 1e9) * make_j # nano * nr of threads +print "finished initial build at %s, duration: %s" % (datetime.datetime.now(), build_time) + +""" +TODO: +for file in dir: + touch file/insert NL + make + collect data (this has to be done in clang.in?): + times: parse, compilation, hashing + #files changed/#files w/ same hashes +""" +for filename in get_source_files(path_to_project): + run_id += 1 + os.environ['RUN_ID'] = "%d" % run_id + print "current id: %d, current file: %s" % (run_id, filename) -log("calling make -j%d" % make_j) -startTime = time.time() -p = subprocess.Popen(["make", "-j%d" % make_j], stdout=subprocess.PIPE, stderr=subprocess.PIPE) -out, err = p.communicate() -retcode = p.wait() + build_info[run_id] = {} + current_build_info = build_info[run_id] + current_build_info['filename'] = filename + + touch(filename) #TODO: use abs filepath? -log(">>>>>>>>>>") -log(out) -log(err) -log("<<<<<<<<<<") + print "starting build at %s" % datetime.datetime.now() #TODO: evtl. 
+print "starting initial build at %s" % datetime.datetime.now() #TODO: maybe refactor duplicate code
+run_id = 0 # for collecting the data later
+os.environ['RUN_ID'] = "%d" % run_id
+build_info[run_id] = {}
+current_build_info = build_info[run_id]
+current_build_info['filename'] = 'FRESH_BUILD'
+
+start_time = time.time()
+call_make()
+build_time = int((time.time() - start_time) * 1e9) * make_j # nano * nr of threads
+current_build_info['build-time'] = build_time
+print "finished initial build at %s, duration: %s" % (datetime.datetime.now(), build_time)
+
+"""
+TODO:
+for file in dir:
+    touch file/insert NL
+    make
+    collect data (this has to be done in clang.in?):
+        times: parse, compilation, hashing
+        #files changed/#files w/ same hashes
+"""
+for filename in get_source_files(path_to_project):
+    run_id += 1
+    os.environ['RUN_ID'] = "%d" % run_id
+    print "current id: %d, current file: %s" % (run_id, filename)
 
+    build_info[run_id] = {}
+    current_build_info = build_info[run_id]
+    current_build_info['filename'] = filename
+
+    touch(filename) #TODO: use absolute file path?
 
-log("calling make -j%d" % make_j)
-startTime = time.time()
-p = subprocess.Popen(["make", "-j%d" % make_j], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-out, err = p.communicate()
-retcode = p.wait()
+    print "starting build at %s" % datetime.datetime.now() #TODO: maybe refactor duplicate code
+    start_time = time.time()
+    call_make()
+    current_build_info['build-time'] = int((time.time() - start_time) * 1e9) * make_j # nano * nr of threads
+    print "finished file %s at %s" % (filename, datetime.datetime.now())
 
-log(">>>>>>>>>>")
-log(out)
-log(err)
-log("<<<<<<<<<<")
 
-commitInfo['build-time'] = int((time.time() - startTime) * 1e9) * make_j # nano * nr of threads
-log("finished commit %s at %s" % (commitID, datetime.datetime.now()))
 
-f = open(buildInfoFilePath, 'a')
-f.write(repr(commitInfo) + "\n")
+f = open(build_info_file_path, 'a')
+f.write(repr(build_info) + "\n")
 f.close()
 
-log("Finished at %s" % datetime.datetime.now())
+print "Finished at %s" % datetime.datetime.now()
diff --git a/validate_hashes.py b/validate_hashes.py
index 4d9ccc82c1578eb1ad059eae591607b935367cc8..2c504d1fb7eff51b328be0e55bcaa8af8fe3ebbe 100755
--- a/validate_hashes.py
+++ b/validate_hashes.py
@@ -22,7 +22,7 @@ FULL_RECORD_FILENAME = 'fullRecord' + INFO_EXTENSION
 COMMIT_INFO_FILENAME = 'commitInfo_musl' + INFO_EXTENSION
 
 # graph filenames
-PNG_EXTENSION = '.png'
+PNG_EXTENSION = '.pdf' #TODO: rename to sth. like GRAPH_FILE_EXTENSION
 PARSE_TIME_HISTOGRAM_FILENAME = 'parseTimeHistogram' + PNG_EXTENSION
 HASH_TIME_HISTOGRAM_FILENAME = 'hashTimeHistogram' + PNG_EXTENSION
 COMPILE_TIME_HISTOGRAM_FILENAME = 'compileTimeHistogram' + PNG_EXTENSION
@@ -186,7 +186,11 @@ def buildFullRecord():
     for commitID in commitInfo:
         fullRecord[commitID] = {}
         fullRecord[commitID][tr('commit-time')] = commitInfo[commitID]['commit-time']
-        fullRecord[commitID][tr('build-time')] = commitInfo[commitID]['build-time']
+        print commitID
+        if 'build-time' in commitInfo[commitID]:
+            fullRecord[commitID][tr('build-time')] = commitInfo[commitID]['build-time']
+        else:
+            fullRecord[commitID][tr('build-time')] = 0
         fullRecord[commitID][tr('files')] = {}
         fullRecord[commitID][tr('files-changed')] = commitInfo[commitID]['files-changed']
 
@@ -246,22 +250,32 @@ def plotBuildTimeGraph(measuredBuildTimes, realClangHashBuildTimes, optimalClang
 def plot_build_time_composition_graph1(data):
     plotBuildTimeCompositionGraph(data[0], data[1], data[2], data[3])
 
+def printAvg(data, name):
+    print 'avg %s: %f' % (name, sum(data)/float(len(data)))
+
+parseColor, hashColor, compileColor, remainColor = ('#FFFF66','#FF0000','#3399FF','#008800')
+
 def plotBuildTimeCompositionGraph(parseTimes, hashTimes, compileTimes, diffToBuildTime): # times in s
     fig, ax = plt.subplots()
-    
+
     ax.stackplot(np.arange(1, len(parseTimes)+1), # x axis
-                 [parseTimes, hashTimes, compileTimes, diffToBuildTime],
-                 colors=['#008800','#FF0000','#0000FF', '#000000'])
+#                 [parseTimes, hashTimes, compileTimes, diffToBuildTime],
+                 [[i/60 for i in parseTimes], [i/60 for i in hashTimes], [i/60 for i in compileTimes], [i/60 for i in diffToBuildTime]],
+                 colors=[parseColor,hashColor,compileColor,remainColor], edgecolor='none')
     plt.xlim(1,len(parseTimes))
    plt.xlabel('commits')
-    plt.ylabel('time [s]')
-    lgd = ax.legend([mpatches.Patch(color='#000000'),
-                     mpatches.Patch(color='#0000FF'),
-                     mpatches.Patch(color='#FF0000'),
-                     mpatches.Patch(color='#008800')],
+    plt.ylabel('time [min]')
+    lgd = ax.legend([mpatches.Patch(color=remainColor),
+                     mpatches.Patch(color=compileColor),
+                     mpatches.Patch(color=hashColor),
+                     mpatches.Patch(color=parseColor)],
                     ['remaining build time','compile time', 'hash time', 'parse time'],
                     loc='center left', bbox_to_anchor=(1, 0.5))
     fig.savefig(abs_path(BUILD_TIME_COMPOSITION_FILENAME), bbox_extra_artists=(lgd,), bbox_inches='tight')
 
+    printAvg(parseTimes, 'parse')
+    printAvg(hashTimes, 'hash')
+    printAvg(compileTimes, 'compile')
+    printAvg(diffToBuildTime, 'remainder')
 
 
 def plotTimeHistogram(times, filename):
@@ -280,14 +294,19 @@ def plotTimeMultiHistogram(parseTimes, hashTimes, compileTimes, filename):
     bins = np.linspace(0, 5000, 50)
     data = np.vstack([parseTimes, hashTimes, compileTimes]).T
     fig, ax = plt.subplots()
-    plt.hist(data, bins, alpha=0.7, label=['parsing', 'hashing', 'compiling'])
+    plt.hist(data, bins, alpha=0.7, label=['parsing', 'hashing', 'compiling'], color=[parseColor, hashColor, compileColor])
     plt.legend(loc='upper right')
+    plt.xlabel('time [ms]')
+    plt.ylabel('#files')
     fig.savefig(filename)
 
     fig, ax = plt.subplots()
     data = [parseTimes, hashTimes, compileTimes]
-    plt.boxplot(data, 0, 'rs', 0)#, [5, 95])
-    fig.savefig(filename[:-4] + '_boxplots.png')
+    plt.boxplot(data, 0, 'rs', 0, [5, 95])
+    plt.xlabel('time [ms]')
+    plt.yticks([1, 2, 3], ['parsing', 'hashing', 'compiling'])
+    #lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) # legend on the right
+    fig.savefig(filename[:-4] + '_boxplots' + PNG_EXTENSION)
 
 
@@ -306,11 +325,12 @@ def plot_changes_graph1(data):
 
 def plotChangesGraph(fileCounts, sameHashes, differentAstHashes, differentObjHashes):
     fig, ax = plt.subplots()
-    
-    ax.plot(fileCounts, label='#objfiles')
-    ax.plot(sameHashes, label='unchanged')
-    ax.plot(differentAstHashes, label='astHash differs')
-    ax.plot(differentObjHashes, label='objHash differs')
+
+    #('#FFFF66','#FF0000','#3399FF','#008800')
+    ax.plot(fileCounts, label='#objfiles', color='#EEAD0E')#'black')
+    #ax.plot(sameHashes, label='unchanged')#, color='blue')
+    ax.plot(differentAstHashes, label='astHash differs', color=compileColor)#'#000088')
+    ax.plot(differentObjHashes, label='objHash differs', color=hashColor)#'#FFFF00') #'#0099FF')
 
     box = ax.get_position()
     lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) # legend on the right
@@ -397,6 +417,7 @@ def makeGraphs(fullRecord):
         totalFilesChanged += currentCommit[tr('files-changed')]
 
         for filename in currentFiles:
+            fileCount += 1
             if tr('ast-hash') not in currentFiles[filename].keys():
                 print "error: missing AST hash for file %s" % filename
                 continue
@@ -451,7 +472,6 @@ def makeGraphs(fullRecord):
             else:
                 same += 1
 
-            fileCount += 1
 
         if missingFiles > currentCommit[tr('files-changed')]:
             print "!!!!FAIL!!!!"
@@ -483,7 +503,8 @@ def makeGraphs(fullRecord):
 
         prevCommit = currentCommit
         prevCommitID = commitID
-
+        if fileCount == 0:
+            print "no filecount at %s" % commitID
 
     print "missingFilesTotal: %d, missingFileErrors: %d" % (missingFilesTotal, missingFileErrors)
     print "totalFilesChanged: %d, sizes: parseTimes(%d), hashTimes(%d), compileTimes(%d)" % (totalFilesChanged, len(parseTimes), len(hashTimes), len(compileTimes))