Commit ba37f93b authored by Ludwig Fueracker's avatar Ludwig Fueracker
Browse files

updated/new scripts

parent 3c9d08db
#!/usr/bin/env python
import fnmatch
import os
import sys
from operator import itemgetter
import time
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import matplotlib.mlab as mlab
import csv
# Set to True to additionally dump the assembled full record to a file.
DO_PRINT_RECORDS = False
PATH_TO_RECORDS = '' # gets set from command line parameter
# data/record filenames
INFO_EXTENSION = '.info'
FULL_RECORD_FILENAME = 'fullRecord' + INFO_EXTENSION
COMMIT_INFO_FILENAME = 'buildInfo_musl_with_stop' + INFO_EXTENSION
# csv data file and pdf graph for the build-time composition
BUILD_TIME_DATA_FILENAME = 'totalBuildTimes.csv'
BUILD_TIME_FILENAME = 'totalBuildTimes.pdf'
# column order used when writing and reading the build-time csv
BUILD_TIME_DATA_HEADER = ['totalParseTimes', 'totalHashTimes', 'totalCompileTimes', 'diffToBuildTimes', 'totalBuildTimes']
def abs_path(filename):
    """Return *filename* prefixed with the parent directory of the records path."""
    return '%s/../%s' % (PATH_TO_RECORDS, filename)
def getListOfFiles(directory):
    """Yield the full path of every record file (*.info) below *directory*."""
    pattern = '*' + INFO_EXTENSION
    for root, _dirs, names in os.walk(directory):
        for name in fnmatch.filter(names, pattern):
            yield os.path.join(root, name)
# Global counters for anomalies seen while processing records.
# NOTE(review): none of these are updated in the visible part of this file;
# presumably maintained by code outside this view -- confirm before removing.
errorCount = 0
astDifferObjSameCount = 0
missingCount = 0
################################################################################
#
#
################################################################################
# Record keys in the order of their numeric codes (index == code).
_ORDERED_KEYS = (
    'start-time',
    'hash-start-time',
    'object-hash',
    'return-code',
    'parse-duration',
    'object-file-size',
    'processed-bytes',
    'hash-duration',
    'filename',
    'project',
    'compile-duration',  # time the compiler was running (incl. parse-duration)
    'ast-hash',
    'commit-hash',
    'element-hashes',
    'commit-time',
    'build-time',  # time the 'make -jx' command took, times x
    'files',
    'files-changed',
    'insertions',
    'deletions',
    'run_id',
)
keyTranslationToNr = dict(zip(_ORDERED_KEYS, range(len(_ORDERED_KEYS))))
keyTranslationFromNr = dict(zip(range(len(_ORDERED_KEYS)), _ORDERED_KEYS))
# Combined table: maps names to numbers and numbers back to names.
keyTranslation = dict(keyTranslationToNr)
keyTranslation.update(keyTranslationFromNr)
def tr(key):
    """Look up a key translation in either direction (name->nr or nr->name)."""
    return keyTranslation[key]
def buildFullRecordTo(pathToFullRecordFile):
"""structure of full record:
{commitID: {'build-time': time, files: {filename: {record}, filename: {record}}}}
"""
fullRecord = buildFullRecord()
if DO_PRINT_RECORDS:
f = open(pathToFullRecordFile, 'w')
try:
f.write(repr(fullRecord) + "\n")
except MemoryError as me:
print me
raise
finally:
print time.ctime()
f.close()
print "built full record, wrote to " + pathToFullRecordFile
return fullRecord
def buildFullRecord():
    """Builds a complete record from all the single hash records.
    The records are grouped by the commitIDs
    """
    fullRecord = {}
    # NOTE(review): eval() executes arbitrary code from the info files --
    # only acceptable because these files are produced by our own tooling.
    with open(abs_path(COMMIT_INFO_FILENAME), 'r') as commitInfoFile:
        commitInfo = eval(commitInfoFile.read())
        for run_id in commitInfo:
            if not isinstance(run_id, int): # dict also contains key 'commit-hash'
                continue;
            currentRecord = {}
            currentRecord[tr('filename')] = commitInfo[run_id]['filename']
            currentRecord[tr('build-time')] = commitInfo[run_id]['build-time']
            currentRecord[tr('files')] = {}
            fullRecord[run_id] = currentRecord
    # Merge each per-file hash record into the run it belongs to.
    for recordFilename in getListOfFiles(PATH_TO_RECORDS):
        for line in open(recordFilename):
            data = eval(line)
            # commitID = data['commit-hash']
            # del data['commit-hash']
            objFilename = data['obj-file']
            del data['obj-file']
            # del everything I don't need
            del data['return-code']
            del data['element-hashes']
            del data['project']
            del data['processed-bytes']
            del data['object-file-size']
            run_id = data['run_id']
            # replace string keys with their numeric translations (see tr())
            dataNewKeys = {tr(k): v for k, v in data.items()}
            fullRecord[run_id][tr('files')][objFilename] = dataNewKeys
    return fullRecord
################################################################################
def write_to_csv(data, columnNames, filename):
    """Write *columnNames* as a header row, then every row of *data*, to *filename*."""
    with open(filename, "w") as out_file:
        writer = csv.writer(out_file)
        writer.writerow(columnNames)
        writer.writerows(data)
def printAvg(data, name):
print 'avg %s: %f' % (name, sum(data)/float(len(data)))
################################################################################
# Shared plot colors: parse (yellow), hash (red), compile (blue), remainder (green).
parseColor, hashColor, compileColor, remainColor = ('#FFFF66','#FF0000','#3399FF','#008800')
def plot_build_time_graph1(data):
    """Unpack the four per-run time series from *data* and plot them."""
    parse_series, hash_series, compile_series, remainder_series = data[0], data[1], data[2], data[3]
    plotBuildTimeCompositionGraph(parse_series, hash_series, compile_series, remainder_series)
def plotBuildTimeCompositionGraph(parseTimes, hashTimes, compileTimes, diffToBuildTime): # times in ms
    """Plot a stacked-area graph of the per-run build time composition
    (parse/hash/compile, log scale) and save it as a pdf.

    The remaining build time (diffToBuildTime) is currently commented out of
    the plot and legend, but its average is still printed below.
    """
    fig, ax = plt.subplots()
    ax.stackplot(np.arange(1, len(parseTimes)+1), # x axis
                 [parseTimes, hashTimes, compileTimes,
                  #diffToBuildTime
                 ], colors=[parseColor,hashColor,compileColor,
                  # remainColor
                 ], edgecolor='none')
    plt.xlim(1,len(parseTimes))
    plt.xlabel('commits')
    plt.ylabel('time [s]')
    ax.set_yscale('log')
    # legend placed outside the axes, to the right
    lgd = ax.legend([#mpatches.Patch(color=remainColor),
                     mpatches.Patch(color=compileColor),
                     mpatches.Patch(color=hashColor),
                     mpatches.Patch(color=parseColor)],
                    [#'remaining build time',
                     'compile time', 'hash time', 'parse time'],
                    loc='center left', bbox_to_anchor=(1, 0.5))
    fig.savefig(abs_path(BUILD_TIME_FILENAME), bbox_extra_artists=(lgd,), bbox_inches='tight')
    printAvg(parseTimes, 'parse')
    printAvg(hashTimes, 'hash')
    printAvg(compileTimes, 'compile')
    printAvg(diffToBuildTime, 'remainder')
################################################################################
def makeGraphs(fullRecord):
    """Aggregate per-file durations into per-run totals, write them to the
    build-time csv and plot the build time composition graph."""
    # data for build time graphs
    totalParseTimes = []
    totalHashTimes = []
    totalCompileTimes = []
    totalBuildTimes = []
    diffToBuildTimes = []
    # freshBuildRecord = fullRecord[0]
    for run_id in fullRecord:
        if run_id < 2: # skip fresh build (and also 1st, seems to be buggy...)
            continue
        currentRecord = fullRecord[run_id]
        currentFiles = currentRecord[tr('files')]
        filesChanged = len(currentFiles) # count changed files per run #TODO!
        print currentRecord[tr('filename')]
        totalParseDuration = 0
        totalHashDuration = 0
        totalCompileDuration = 0
        # sum durations over all files touched in this run (in nanoseconds)
        for filename in currentFiles: # deal with first commit
            # if tr('ast-hash') not in currentFiles[filename].keys():
            #     print "error: missing AST hash for file %s" % filename
            #     continue
            currentFileRecord = currentFiles[filename]
            totalParseDuration += currentFileRecord[tr('parse-duration')]
            totalHashDuration += currentFileRecord[tr('hash-duration')]
            totalCompileDuration += currentFileRecord[tr('compile-duration')]
        # if totalParseDuration == 0:# or (totalCompileDuration/1e6) > 500000:
        #     continue
        totalParseTimes.append(totalParseDuration / 1e6) # nano to milli
        totalHashTimes.append(totalHashDuration / 1e6)
        totalCompileTimes.append(totalCompileDuration / 1e6)
        buildTime = currentRecord[tr('build-time')]
        totalBuildTimes.append(buildTime / 1e6)
        # remainder: total build time minus the three accounted-for phases
        diffToBuildTimes.append((buildTime - totalParseDuration - totalHashDuration - totalCompileDuration) / 1e6)
        print 'run_id %d, #filesChanged: %d' % (run_id, filesChanged)
    printAvg(totalBuildTimes, 'total')
    # save data to csv files
    buildTimeData = np.column_stack((totalParseTimes, totalHashTimes, totalCompileTimes, diffToBuildTimes, totalBuildTimes))
    write_to_csv(buildTimeData, BUILD_TIME_DATA_HEADER, abs_path(BUILD_TIME_DATA_FILENAME))
    plotBuildTimeCompositionGraph(totalParseTimes, totalHashTimes, totalCompileTimes, diffToBuildTimes)
################################################################################
"""functions for reading data from the csv files to skip full record building"""
def csv_files_are_existing():
    """Check whether the build-time csv from a previous run is already present."""
    csv_path = abs_path(BUILD_TIME_DATA_FILENAME)
    return os.path.isfile(csv_path)
def read_from_csv(filename, columnNames):
    """Read a csv file written by write_to_csv back into per-column lists.

    Returns one list of floats per column; the header row only determines
    how many columns there are. *columnNames* is kept for interface
    compatibility with callers but is not used for lookup.
    """
    with open(filename) as csv_file:
        reader = csv.reader(csv_file)
        try:
            header = next(reader)
        except StopIteration:
            return []  # empty file: no header, no columns
        # one accumulator list per column
        data = [[] for _ in header]
        for row in reader:
            for colnum, col in enumerate(row):
                data[colnum].append(float(col))
    return data
def read_csv_data_and_plot_graphs():
    """Rebuild the graphs from the csv data of previous runs
    to save time by skipping the whole "full record" building.
    """
    csv_path = abs_path(BUILD_TIME_DATA_FILENAME)
    plot_build_time_graph1(read_from_csv(csv_path, BUILD_TIME_DATA_HEADER))
################################################################################
# main:
# Expects the path to the record files as the single command line argument.
if (len(sys.argv) > 1):
    PATH_TO_RECORDS = sys.argv[1]
    path_to_full_record_file = abs_path(FULL_RECORD_FILENAME)
    print "Starting at %s" % time.ctime()
    # NOTE(review): the csv fast path is currently disabled; the full record
    # is always rebuilt from the raw record files.
    # if csv_files_are_existing():
    #     # skip building record, read csv data from files and work with that
    #     print "reading from csv files"
    #     read_csv_data_and_plot_graphs()
    #     print "finished graphs at %s" % time.ctime()
    # else:
    full_record = buildFullRecordTo(path_to_full_record_file)
    print "finished building/loading full record at %s" % time.ctime()
    makeGraphs(full_record)
    print "finished graphs at %s" % time.ctime()
    print "Finished at %s" % time.ctime()
else:
    print "Missing path to record files.\nUsage:\n\t%s PATH_TO_RECORDS" % sys.argv[0]
......@@ -7,72 +7,133 @@ from subprocess import check_output
import datetime
import time
import sys
import fnmatch
# remove unused imports
make_j = 16
#TODO: make paths independent => make project path command line argument!
projectName = "musl"
pathToProject = os.path.abspath("../hash_projects/musl")
clanghashWrapper = os.path.abspath("build/wrappers/clang")
buildInfoFilePath = os.path.abspath("build/muslHashes/buildInfo_%s.info" % projectName)
def checkout(commitID):
    """checkout commit with commitID"""
    # Side effect: changes the process working directory to the project dir.
    os.chdir(pathToProject)
    subprocess.call(["git", "checkout", "-f", "-q", commitID])
project_name = "musl"
path_to_project = os.path.abspath("../hash_projects/musl")
clanghash_wrapper = os.path.abspath("build/wrappers/clang")
build_info_file_path = os.path.abspath("build/muslHashes/buildInfo_%s_with_stop.info" % project_name)
def get_source_files(directory):
    """Yield the path of every C source (.c) and header (.h) file below *directory*."""
    for root, _dirs, names in os.walk(directory):
        for name in names:
            if name.endswith(('.h','.c')):
                yield os.path.join(root, name)
def touch(filename):
    """Update the modification time of *filename*, creating it if missing."""
    with open(filename, 'a'):
        os.utime(filename, None) #TODO: check whether None is right as 2nd param
def git_checkout(commit_id):
    """checkout commit with commit_id"""
    # Side effect: changes the process working directory to the project dir.
    os.chdir(path_to_project)
    subprocess.call(["git", "checkout", "-f", "-q", commit_id])
def make_clean():
    """Run 'make clean' in the current working directory."""
    subprocess.call(["make", "clean"])
def git_clean():
    """Remove untracked (incl. ignored, -x) files from the working tree."""
    subprocess.call(["git", "clean", "-f", "-q", "-x"])
def call_configure():
    """Run ./configure in the current working directory.

    TODO: rename this function to "init_build_environment" or sth like that and replace it for other projects. also think about calling call_make here (for first build).
    """
    subprocess.call(["./configure"])
def call_make():
    """this function builds the project (e.g. calls make)
    TODO: replace this method for other projects; or move to class and then replace that class.
    """
    print "calling make -j%d" % make_j
    # capture stdout/stderr so the build output can be echoed as one chunk
    p = subprocess.Popen(["make", "-j%d" % make_j], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = p.communicate()
    retcode = p.wait()  # NOTE(review): return code is captured but not checked
    print ">>>>>>>>>>"
    print out
    print err
    print "<<<<<<<<<<"
def getCommitTime(commitID):
    """Return the committer timestamp (git %ct, unix epoch) of commitID as a string."""
    # Side effect: changes the process working directory to the project dir.
    os.chdir(pathToProject)
    commitTime = check_output(["git", "show", "-s", "--format=%ct", commitID])
    return commitTime.replace('\n', '')
################################################################################
# Simple debug logging: set DEBUG to 0 to silence log() output.
DEBUG = 1
def log(message):
    """Print *message* only when debug logging is enabled."""
    if (DEBUG == 1):
        print message
print "Starting at %s" % datetime.datetime.now()
################################################################################
filecount = 0
for root, dirnames, filenames in os.walk(path_to_project):
for filename in fnmatch.filter(filenames, '*.c'):
filecount += 1
print "#files: %d" % filecount
log("Starting at %s" % datetime.datetime.now())
os.environ['CC'] = clanghashWrapper
os.environ['PROJECT'] = projectName
os.environ['CC'] = clanghash_wrapper
os.environ['PROJECT'] = project_name
os.environ['STOP_IF_SAME_HASH'] = '1'
#reset to latest version
checkout("master")
log("calling make clean and git clean")
subprocess.call(["make", "clean"])
subprocess.call(["git", "clean", "-f", "-q", "-x"])
subprocess.call(["./configure"])
commitInfo = {}
commitID = check_output(["git", "log", "-1", "--pretty='%H'"]) # get current commit hash
commitInfo['commit-hash'] = commitID
os.environ['COMMIT_HASH'] = commitID
git_checkout("master")
print "calling make clean and git clean"
make_clean()
git_clean()
call_configure()
build_info = {}
commit_id = check_output(["git", "log", "-1", "--pretty='%H'"]) # get current commit hash
build_info['commit-hash'] = commit_id
os.environ['COMMIT_HASH'] = commit_id
print "starting initial build at %s" % datetime.datetime.now() #TODO: evtl. refactor duplicate code
run_id = 0 # for collecting the data later
os.environ['RUN_ID'] = "%d" % run_id
build_info[run_id] = {}
current_build_info = build_info[run_id]
current_build_info['filename'] = 'FRESH_BUILD'
start_time = time.time()
call_make()
build_time = int((time.time() - start_time) * 1e9) * make_j # nano * nr of threads
current_build_info['build-time'] = int((time.time() - start_time) * 1e9) * make_j # nano * nr of threads
print "finished initial build at %s, duration: %s" % (datetime.datetime.now(), build_time)
"""
TODO:
for file in dir:
touch file/insert NL
make
collect data (this has to be done in clang.in?):
times: parse, compilation, hashing
#files changed/#files w/ same hashes
"""
for filename in get_source_files(path_to_project):
run_id += 1
os.environ['RUN_ID'] = "%d" % run_id
print "current id: %d, current file: %s" % (run_id, filename)
log("calling make -j%d" % make_j)
startTime = time.time()
p = subprocess.Popen(["make", "-j%d" % make_j], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = p.communicate()
retcode = p.wait()
build_info[run_id] = {}
current_build_info = build_info[run_id]
current_build_info['filename'] = filename
touch(filename) #TODO: use abs filepath?
log(">>>>>>>>>>")
log(out)
log(err)
log("<<<<<<<<<<")
print "starting build at %s" % datetime.datetime.now() #TODO: evtl. refactor duplicate code
start_time = time.time()
call_make()
current_build_info['build-time'] = int((time.time() - start_time) * 1e9) * make_j # nano * nr of threads
print "finished commit %s at %s" % (commit_id, datetime.datetime.now())
commitInfo['build-time'] = int((time.time() - startTime) * 1e9) * make_j # nano * nr of threads
log("finished commit %s at %s" % (commitID, datetime.datetime.now()))
f = open(buildInfoFilePath, 'a')
f.write(repr(commitInfo) + "\n")
f = open(build_info_file_path, 'a')
f.write(repr(build_info) + "\n")
f.close()
log("Finished at %s" % datetime.datetime.now())
print "Finished at %s" % datetime.datetime.now()
......@@ -22,7 +22,7 @@ FULL_RECORD_FILENAME = 'fullRecord' + INFO_EXTENSION
COMMIT_INFO_FILENAME = 'commitInfo_musl' + INFO_EXTENSION
# graph filenames
PNG_EXTENSION = '.png'
PNG_EXTENSION = '.pdf' #TODO: rename to sth. like GRAPH_FILE_EXTENSION
PARSE_TIME_HISTOGRAM_FILENAME = 'parseTimeHistogram' + PNG_EXTENSION
HASH_TIME_HISTOGRAM_FILENAME = 'hashTimeHistogram' + PNG_EXTENSION
COMPILE_TIME_HISTOGRAM_FILENAME = 'compileTimeHistogram' + PNG_EXTENSION
......@@ -186,7 +186,11 @@ def buildFullRecord():
for commitID in commitInfo:
fullRecord[commitID] = {}
fullRecord[commitID][tr('commit-time')] = commitInfo[commitID]['commit-time']
fullRecord[commitID][tr('build-time')] = commitInfo[commitID]['build-time']
print commitID
if 'build-time' in commitInfo[commitID]:
fullRecord[commitID][tr('build-time')] = commitInfo[commitID]['build-time']
else:
fullRecord[commitID][tr('build-time')] = 0
fullRecord[commitID][tr('files')] = {}
fullRecord[commitID][tr('files-changed')] = commitInfo[commitID]['files-changed']
......@@ -246,22 +250,32 @@ def plotBuildTimeGraph(measuredBuildTimes, realClangHashBuildTimes, optimalClang
def plot_build_time_composition_graph1(data):
plotBuildTimeCompositionGraph(data[0], data[1], data[2], data[3])
def printAvg(data, name):
print 'avg %s: %f' % (name, sum(data)/float(len(data)))
parseColor, hashColor, compileColor, remainColor = ('#FFFF66','#FF0000','#3399FF','#008800')
def plotBuildTimeCompositionGraph(parseTimes, hashTimes, compileTimes, diffToBuildTime): # times in s
fig, ax = plt.subplots()
ax.stackplot(np.arange(1, len(parseTimes)+1), # x axis
[parseTimes, hashTimes, compileTimes, diffToBuildTime],
colors=['#008800','#FF0000','#0000FF', '#000000'])
# [parseTimes, hashTimes, compileTimes, diffToBuildTime],
[[i/60 for i in parseTimes], [i/60 for i in hashTimes], [i/60 for i in compileTimes], [i/60 for i in diffToBuildTime]],
colors=[parseColor,hashColor,compileColor,remainColor], edgecolor='none')
plt.xlim(1,len(parseTimes))
plt.xlabel('commits')
plt.ylabel('time [s]')
lgd = ax.legend([mpatches.Patch(color='#000000'),
mpatches.Patch(color='#0000FF'),
mpatches.Patch(color='#FF0000'),
mpatches.Patch(color='#008800')],
plt.ylabel('time [min]')
lgd = ax.legend([mpatches.Patch(color=remainColor),
mpatches.Patch(color=compileColor),
mpatches.Patch(color=hashColor),
mpatches.Patch(color=parseColor)],
['remaining build time','compile time', 'hash time', 'parse time'],
loc='center left', bbox_to_anchor=(1, 0.5))
fig.savefig(abs_path(BUILD_TIME_COMPOSITION_FILENAME), bbox_extra_artists=(lgd,), bbox_inches='tight')
printAvg(parseTimes, 'parse')
printAvg(hashTimes, 'hash')
printAvg(compileTimes, 'compile')
printAvg(diffToBuildTime, 'remainder')
def plotTimeHistogram(times, filename):
......@@ -280,14 +294,19 @@ def plotTimeMultiHistogram(parseTimes, hashTimes, compileTimes, filename):
bins = np.linspace(0, 5000, 50)
data = np.vstack([parseTimes, hashTimes, compileTimes]).T
fig, ax = plt.subplots()
plt.hist(data, bins, alpha=0.7, label=['parsing', 'hashing', 'compiling'])
plt.hist(data, bins, alpha=0.7, label=['parsing', 'hashing', 'compiling'], color=[parseColor, hashColor, compileColor])
plt.legend(loc='upper right')
plt.xlabel('time [ms]')
plt.ylabel('#files')
fig.savefig(filename)
fig, ax = plt.subplots()
data = [parseTimes, hashTimes, compileTimes]
plt.boxplot(data, 0, 'rs', 0)#, [5, 95])
fig.savefig(filename[:-4] + '_boxplots.png')
plt.boxplot(data, 0, 'rs', 0, [5, 95])
plt.xlabel('time [ms]')
plt.yticks([1, 2, 3], ['parsing', 'hashing', 'compiling'])
#lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) # legend on the right
fig.savefig(filename[:-4] + '_boxplots' + PNG_EXTENSION)
......@@ -306,11 +325,12 @@ def plot_changes_graph1(data):
def plotChangesGraph(fileCounts, sameHashes, differentAstHashes, differentObjHashes):
fig, ax = plt.subplots()
ax.plot(fileCounts, label='#objfiles')
ax.plot(sameHashes, label='unchanged')
ax.plot(differentAstHashes, label='astHash differs')
ax.plot(differentObjHashes, label='objHash differs')
#('#FFFF66','#FF0000','#3399FF','#008800')
ax.plot(fileCounts, label='#objfiles', color='#EEAD0E')#'black')
#ax.plot(sameHashes, label='unchanged')#, color='blue')
ax.plot(differentAstHashes, label='astHash differs', color=compileColor)#'#000088')
ax.plot(differentObjHashes, label='objHash differs', color=hashColor)#'#FFFF00') #'#0099FF')
box = ax.get_position()
lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) # legend on the right
......@@ -397,6 +417,7 @@ def makeGraphs(fullRecord):
totalFilesChanged += currentCommit[tr('files-changed')]
for filename in currentFiles:
fileCount += 1
if tr('ast-hash') not in currentFiles[filename].keys():
print "error: missing AST hash for file %s" % filename
continue
......@@ -451,7 +472,6 @@ def makeGraphs(fullRecord):
else:
same += 1
fileCount += 1
if missingFiles > currentCommit[tr('files-changed')]:
print "!!!!FAIL!!!!"
......@@ -483,7 +503,8 @@ def makeGraphs(fullRecord):
prevCommit = currentCommit
prevCommitID = commitID
if fileCount == 0:
print "no filecount at %s" % commitID