#!/usr/bin/env python

import matplotlib
matplotlib.use('Agg')

import fnmatch
import os
import sys
import time
import csv
from operator import itemgetter

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.mlab as mlab
import numpy as np

PATH_TO_RECORDS = '' # gets set from command line parameter

# data/record filenames
INFO_EXTENSION = '.info'
FULL_RECORD_FILENAME = 'fullRecord' + INFO_EXTENSION
COMMIT_INFO_FILENAME = 'commitInfo_musl' + INFO_EXTENSION

# graph filenames
GRAPH_EXTENSION = '.pdf' #TODO: rename to something like GRAPH_FILE_EXTENSION
PARSE_TIME_HISTOGRAM_FILENAME = 'parseTimeHistogram' + GRAPH_EXTENSION
HASH_TIME_HISTOGRAM_FILENAME = 'hashTimeHistogram' + GRAPH_EXTENSION
COMPILE_TIME_HISTOGRAM_FILENAME = 'compileTimeHistogram' + GRAPH_EXTENSION
BUILD_TIME_HISTOGRAM_FILENAME = 'buildTimeHistogram' + GRAPH_EXTENSION
CHANGES_GRAPH_FILENAME = 'changes' + GRAPH_EXTENSION
BUILD_TIMES_GRAPH_FILENAME = 'buildTimes' + GRAPH_EXTENSION
BUILD_TIME_COMPOSITION_FILENAME = 'buildTimeComposition' + GRAPH_EXTENSION

# CSV filenames
#TODO: put these in a dict for an easier existence check
CSV_EXTENSION = '.csv'
BUILD_TIME_COMPOSITION_DATA_FILENAME = 'buildTimeCompositionData' + CSV_EXTENSION
BUILD_TIME_DATA_FILENAME = 'buildTimeData' + CSV_EXTENSION
CHANGES_DATA_FILENAME = 'changesData' + CSV_EXTENSION
SINGLE_TIMES_DATA_FILENAME = 'singleTimesData' + CSV_EXTENSION

# CSV headers
BUILD_TIME_DATA_HEADER = ['measuredBuildTimes', 'realClangHashBuildTimes', 'optimalClangHashBuildTimes', 'optimalBuildTimes']
BUILD_TIME_COMPOSITION_DATA_HEADER = ['totalParseTimes', 'totalHashTimes', 'totalCompileTimes', 'diffToBuildTime']
CHANGES_DATA_HEADER = ['fileCount', 'sameHashes', 'differentAstHashes', 'differentObjHashes']
SINGLE_TIMES_DATA_HEADER = ['parsing', 'hashing', 'compiling']


def abs_path(filename):
    """Prepends the absolute path to the filename."""
    return PATH_TO_RECORDS + '/../' + filename

def get_list_of_files(directory):
    """Yields all record (*.info) files below the given directory."""
    for root, dirnames, filenames in os.walk(directory):
        for filename in fnmatch.filter(filenames, '*' + INFO_EXTENSION):
            yield os.path.join(root, filename)

def write_to_csv(data, column_names, filename):
    """Writes the rows in data to a CSV file, preceded by a header row."""
    with open(filename, "w") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(column_names)
        for line in data:
            writer.writerow(line)

def write_to_file(data, filename):
    """Writes the repr of data to a file below the records directory."""
    with open(abs_path(filename), 'w') as f:
        try:
            f.write(repr(data))
        except MemoryError as me:
            print me
            raise


def plot_hash_count_histogram(hash_values, filename):
    fig, ax = plt.subplots()
    plt.xlabel('nr of different hashes')
    plt.ylabel('nr of files')
    ax.bar(hash_values.keys(), hash_values.values(), align='center')
    fig.savefig(filename)


false_negatives = 0
false_positives = 0
ast_hash_missing = 0

source_files = set() # all filenames of the hashed files
ast_hashes_dict = {} # maps filename -> set(ast hashes)
obj_hashes_dict = {} # maps filename -> set(obj hashes)

nr_of_records = 0 # for calculating avg
sum_of_times = {'parsing': 0,
                'hashing': 0,
                'compiling': 0}
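
# For reference: each line of a record file is a Python dict literal. An
# illustrative (not verbatim) example of the keys validate_hashes() relies on:
#   {'filename': ..., 'commit-hash': ..., 'start-time': ...,
#    'ast-hash': ..., 'object-hash': ...,
#    'parse-duration': ..., 'hash-duration': ..., 'compile-duration': ...}
# (all durations are in nanoseconds)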


def validate_records():
    """Validates the hashes of all record files and prints summary statistics."""
    for filename in get_list_of_files(PATH_TO_RECORDS):
        records = [eval(line) for line in open(filename)]
        validate_hashes(records)

    different_ast_hashes = sum(len(v) for v in ast_hashes_dict.values()) # number of different ast hashes (in total)
    different_obj_hashes = sum(len(v) for v in obj_hashes_dict.values()) # number of different obj hashes (in total)

    print "\n---- Results ----"
    print "false negatives (errors): %d" % false_negatives
    print "false positives: %d" % false_positives
    print "missing ast hashes: %d" % ast_hash_missing
    print ""
    print "source files: %d" % len(source_files)
    print "different AST hashes: %d" % different_ast_hashes
    print "different obj hashes: %d" % different_obj_hashes
    print ""
    print "avg times:"
    for k, v in sum_of_times.items():
        print "%s: %d ns" % (k, v / nr_of_records)
    print "-----------------\n"

    write_to_csv([[k, len(v)] for k, v in ast_hashes_dict.items()], ['filename', 'nr of different hashes'], abs_path('different_ast_hashes_per_file.csv'))
    write_to_csv([[k, len(v)] for k, v in obj_hashes_dict.items()], ['filename', 'nr of different hashes'], abs_path('different_obj_hashes_per_file.csv'))


def print_hash_info(message, prev_record, record, is_error=True):
    print "%s: file %s, commits %s to %s : %s" % ("ERROR" if is_error else "INFO", record['filename'], prev_record['commit-hash'], record['commit-hash'], message)


def validate_hashes(record_list):
    """All records in the list must come from the same object file."""
    #TODO: collect data from all files before validating (paths may change to src/ or crt/), then sort

    global false_negatives, false_positives, ast_hash_missing
    global source_files
    global ast_hashes_dict, obj_hashes_dict
    global sum_of_times, nr_of_records

    iter_records = iter(record_list)
    prev_record = next(iter_records)
    filename = prev_record['filename']
    source_files.add(filename)
    if 'ast-hash' not in prev_record:
        #print "MISSING: no ast-hash in records for file " + filename
        ast_hash_missing += 1
        return

    # the different hashes of the current file
    ast_hashes = set()
    obj_hashes = set()

    ast_hashes.add(prev_record['ast-hash'])
    obj_hashes.add(prev_record['object-hash'])

    nr_of_records += 1
    sum_of_times['parsing'] += prev_record['parse-duration']
    sum_of_times['hashing'] += prev_record['hash-duration']
    sum_of_times['compiling'] += prev_record['compile-duration'] - (prev_record['parse-duration'] + prev_record['hash-duration'])


    for record in iter_records:
        if prev_record['start-time'] > record['start-time']:
            print "ERROR: records are in the wrong order" #TODO: sort the records, then remove this check
        if 'ast-hash' not in record or 'object-hash' not in record:
            print "ERROR: stopping validation for file %s; no ast-hash or object-hash available for commit %s" % (filename, record['commit-hash'])
            break

        if prev_record['object-hash'] != record['object-hash']:
            if prev_record['ast-hash'] == record['ast-hash']:
                print_hash_info("object hashes differ, ast hashes same", prev_record, record)
                false_negatives += 1
        elif prev_record['ast-hash'] != record['ast-hash']:
            #print_hash_info("ast hashes differ, object hashes same", prev_record, record, False) #TODO: include this and look at the changes
            false_positives += 1

        if prev_record['ast-hash'] != record['ast-hash']:
            ast_hashes.add(record['ast-hash'])
        if prev_record['object-hash'] != record['object-hash']:
            obj_hashes.add(record['object-hash'])

        nr_of_records += 1
        sum_of_times['parsing'] += record['parse-duration']
        sum_of_times['hashing'] += record['hash-duration']
        sum_of_times['compiling'] += record['compile-duration'] - (record['parse-duration'] + record['hash-duration'])

        prev_record = record


    if filename in ast_hashes_dict:
        ast_hashes_dict[filename] |= ast_hashes # merge sets
    else:
        ast_hashes_dict[filename] = ast_hashes

    if filename in obj_hashes_dict:
        obj_hashes_dict[filename] |= obj_hashes # merge sets
    else:
        obj_hashes_dict[filename] = obj_hashes




################################################################################

def build_key_translation_dict():
    """Builds a dict that translates record keys to short numbers and back."""
    key_translation_to_nr = {
        'start-time':       0,
        'hash-start-time':  1,
        'object-hash':      2,
        'return-code':      3,
        'parse-duration':   4,
        'object-file-size': 5,
        'processed-bytes':  6,
        'hash-duration':    7,
        'filename':         8,
        'project':          9,
        'compile-duration': 10, # time the compiler was running (incl. parse-duration) #TODO: also incl. hash-duration?
        'ast-hash':         11,
        'commit-hash':      12,
        'element-hashes':   13,
        'commit-time':      14,
        'build-time':       15, # time the 'make -jx' command took, multiplied by x
        'files':            16,
        'files-changed':    17,
        'insertions':       18,
        'deletions':        19,
        'other':            20 #TODO: remove, just for testing
    }
    key_translation_from_nr = {v: k for k, v in key_translation_to_nr.items()}

    key_translation_dict = key_translation_to_nr.copy()
    key_translation_dict.update(key_translation_from_nr)

    return key_translation_dict


key_translation = build_key_translation_dict()

def tr(key):
    """lookup key translation (both directions)"""
    return key_translation[key]
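
# Illustrative use of tr(): tr('object-hash') == 2 and tr(2) == 'object-hash'.
# build_full_record() stores records under these short numeric keys instead of
# the full strings, presumably to keep the huge full record smaller in memory.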


def build_full_record_to(pathToFullRecordFile):
    """structure of full record:
    {commitID: {'build-time': time, files: {filename: {record}, filename: {record}}}}
    """
    full_record = {}
    # this leads to being killed by the OS due to tremendous memory consumption...
    #if os.path.isfile(pathToFullRecordFile):
    #    with open(pathToFullRecordFile, 'r') as fullRecordFile:
    #        print "loading full record from " + pathToFullRecordFile
    #        full_record = eval(fullRecordFile.read())
    #        print "read full record from " + pathToFullRecordFile
    #else:
    full_record = build_full_record()
#    f = open(pathToFullRecordFile, 'w')
#    try:
#        f.write(repr(full_record) + "\n")
#    except MemoryError as me:
#        print me
#        raise
#    finally:
#        print time.ctime()
#        f.close()
#    print "built full record, wrote to " + pathToFullRecordFile
    return full_record


def build_full_record():
    """Builds a complete record from all the single hash records.
    The records are grouped by their commit IDs.
    """
    full_record = {}
    with open(abs_path(COMMIT_INFO_FILENAME), 'r') as commitInfoFile:
        commitInfo = eval(commitInfoFile.read())
        for commitID in commitInfo:
            full_record[commitID] = {}
            full_record[commitID][tr('commit-time')] = commitInfo[commitID]['commit-time']
            print commitID
            if 'build-time' in commitInfo[commitID]:
                full_record[commitID][tr('build-time')] = commitInfo[commitID]['build-time']
            else:
                full_record[commitID][tr('build-time')] = 0
            full_record[commitID][tr('files')] = {}
            full_record[commitID][tr('files-changed')] = commitInfo[commitID]['files-changed']

            if 'insertions' in commitInfo[commitID]:
                full_record[commitID][tr('insertions')] = commitInfo[commitID]['insertions']
            if 'deletions' in commitInfo[commitID]:
                full_record[commitID][tr('deletions')] = commitInfo[commitID]['deletions']

    for recordFilename in get_list_of_files(PATH_TO_RECORDS):
        for line in open(recordFilename):
            data = eval(line)
            commitID = data['commit-hash']
            del data['commit-hash']

            objFilename = data['obj-file']
            del data['obj-file']

            # delete everything that is not needed
            del data['return-code']
            del data['element-hashes']
            del data['project']
            del data['processed-bytes']
            del data['object-file-size']

            dataNewKeys = {tr(k): v for k, v in data.items()}
            full_record[commitID][tr('files')][objFilename] = dataNewKeys

    return full_record

################################################################################

def get_sorted_commit_id_list(full_record):
    """Returns the commit IDs sorted by commit time."""
    return sorted(full_record, key=lambda x: full_record[x][tr('commit-time')])

################################################################################


def plot_build_time_graph1(data):
    plot_build_time_graph(data[0], data[1], data[2], data[3])

def plot_build_time_graph(measuredBuildTimes, realClangHashBuildTimes, optimalClangHashBuildTimes, optimalBuildTimes): # times in s
    fig, ax = plt.subplots()

    ax.plot([i/60 for i in measuredBuildTimes], label='measured build time')
    ax.plot([i/60 for i in realClangHashBuildTimes], label='real clang-hash build time')
    ax.plot([i/60 for i in optimalClangHashBuildTimes], label='optimal clang-hash build time')
    ax.plot([i/60 for i in optimalBuildTimes], label='optimal build time')

    lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) # legend on the right
    ax.set_ylim([0, 5])
    plt.xlabel('commits')
    plt.ylabel('time [min]')
    fig.savefig(abs_path(BUILD_TIMES_GRAPH_FILENAME), bbox_extra_artists=(lgd,), bbox_inches='tight')


def plot_build_time_composition_graph1(data):
    plot_build_time_composition_graph(data[0], data[1], data[2], data[3])

def print_avg(data, name):
    print 'avg %s: %f' % (name, sum(data) / float(len(data)))

parseColor, hashColor, compileColor, remainColor = ('#FFFF66', '#FF0000', '#3399FF', '#008800')

def plot_build_time_composition_graph(parseTimes, hashTimes, compileTimes, diffToBuildTime): # times in s
    fig, ax = plt.subplots()

    ax.stackplot(np.arange(1, len(parseTimes)+1), # x axis
#                [parseTimes, hashTimes, compileTimes, diffToBuildTime],
                 [[i/60 for i in parseTimes], [i/60 for i in hashTimes], [i/60 for i in compileTimes], [i/60 for i in diffToBuildTime]],
                 colors=[parseColor, hashColor, compileColor, remainColor], edgecolor='none')
    plt.xlim(1, len(parseTimes))
    plt.xlabel('commits')
    plt.ylabel('time [min]')
    lgd = ax.legend([mpatches.Patch(color=remainColor),
                     mpatches.Patch(color=compileColor),
                     mpatches.Patch(color=hashColor),
                     mpatches.Patch(color=parseColor)],
                    ['remaining build time', 'compile time', 'hash time', 'parse time'],
                    loc='center left', bbox_to_anchor=(1, 0.5))
    fig.savefig(abs_path(BUILD_TIME_COMPOSITION_FILENAME), bbox_extra_artists=(lgd,), bbox_inches='tight')
    print_avg(parseTimes, 'parse')
    print_avg(hashTimes, 'hash')
    print_avg(compileTimes, 'compile')
    print_avg(diffToBuildTime, 'remainder')


def plotTimeHistogram(times, filename): # times in ms
    #TODO: understand params and vars
    hist, bins = np.histogram([i/1000 for i in times], bins=50) # times to s
    width = 0.7 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    fig, ax = plt.subplots()
    plt.xlabel('time [s]')
    plt.ylabel('#files')
    ax.bar(center, hist, align='center', width=width)
    fig.savefig(filename)


def plotTimeMultiHistogram(parseTimes, hashTimes, compileTimes, filename): # times in ms
    bins = np.linspace(0, 5000, 50)
    data = np.vstack([parseTimes, hashTimes, compileTimes]).T
    fig, ax = plt.subplots()
    plt.hist(data, bins, alpha=0.7, label=['parsing', 'hashing', 'compiling'], color=[parseColor, hashColor, compileColor])
    plt.legend(loc='upper right')
    plt.xlabel('time [ms]')
    plt.ylabel('#files')
    fig.savefig(filename)

    fig, ax = plt.subplots()
    boxplot_data = [[i/1000 for i in parseTimes], [i/1000 for i in hashTimes], [i/1000 for i in compileTimes]] # times to s
    plt.boxplot(boxplot_data, 0, 'rs', 0, [5, 95])
    plt.xlabel('time [s]')
    plt.yticks([1, 2, 3], ['parsing', 'hashing', 'compiling'])
    #lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) # legend on the right
    fig.savefig(filename[:-4] + '_boxplots' + GRAPH_EXTENSION)



def plot_time_histograms1(data):
    plotTimeHistograms(data[0], data[1], data[2])

def plotTimeHistograms(parseTimes, hashTimes, compileTimes): # times in ms
    plotTimeHistogram(parseTimes, abs_path(PARSE_TIME_HISTOGRAM_FILENAME))
    plotTimeHistogram(hashTimes, abs_path(HASH_TIME_HISTOGRAM_FILENAME))
    plotTimeHistogram(compileTimes, abs_path(COMPILE_TIME_HISTOGRAM_FILENAME))
    plotTimeMultiHistogram(parseTimes, hashTimes, compileTimes, abs_path(BUILD_TIME_HISTOGRAM_FILENAME))


def plot_changes_graph1(data):
    plotChangesGraph(data[0], data[1], data[2], data[3])

def plotChangesGraph(fileCounts, sameHashes, differentAstHashes, differentObjHashes):
    fig, ax = plt.subplots()

    #('#FFFF66','#FF0000','#3399FF','#008800')
    ax.plot(fileCounts, label='#objfiles', color='#EEAD0E') #'black'
    #ax.plot(sameHashes, label='unchanged') #, color='blue'
    ax.plot(differentAstHashes, label='astHash differs', color=compileColor) #'#000088'
    ax.plot(differentObjHashes, label='objHash differs', color=hashColor) #'#FFFF00' '#0099FF'

    box = ax.get_position()
    lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) # legend on the right

    plt.xlabel('commits')
    plt.ylabel('#files')
    fig.savefig(abs_path(CHANGES_GRAPH_FILENAME), bbox_extra_artists=(lgd,), bbox_inches='tight')


################################################################################


def make_graphs(full_record):
    sortedCommitIDs = get_sorted_commit_id_list(full_record)
    iterCommits = iter(sortedCommitIDs)
    prevCommitID = next(iterCommits)
    prevCommit = full_record[prevCommitID]

    # data for build time graphs
    measuredBuildTimes = []
    optimalBuildTimes = []
    optimalClangHashBuildTimes = []
    realClangHashBuildTimes = []

    totalParseTimes = []
    totalHashTimes = []
    totalCompileTimes = []
    diffToBuildTime = []

    # data for histograms
    parseTimes = []
    hashTimes = []
    compileTimes = []

    # data for changes graph
    differentAstHashes = []
    differentObjHashes = []
    sameHashes = []
    fileCounts = []

    missingFilesTotal = 0 # count how many files are added in commits, just for checking
    missingFileErrors = 0
    totalFilesChanged = 0

    currentFiles = prevCommit[tr('files')]
    totalFilesChanged += len(currentFiles)
    for filename in currentFiles: # deal with first commit
        if tr('ast-hash') not in currentFiles[filename]:
            print "error: missing AST hash for file %s" % filename
            continue
        currentRecord = currentFiles[filename]
        parseTimes.append(currentRecord[tr('parse-duration')] / 1e6) # ns to ms
        hashTimes.append(currentRecord[tr('hash-duration')] / 1e6)
        compileTimes.append(currentRecord[tr('compile-duration')] / 1e6)

    for commitID in iterCommits:
        currentCommit = full_record[commitID]
        currentFiles = currentCommit[tr('files')]
        prevFiles = prevCommit[tr('files')]

        totalOptimalRedundantTime = 0 # ns
        totalOptimalRedundantCompileTime = 0 # ns
        totalASTHashRedundantCompileTime = 0 # ns
        totalCompileDuration = 0 # ns, incl. parsing time
        totalParseDuration = 0 # ns
        totalHashDuration = 0 # ns

        same = 0 #TODO: rename to ...Count?
        differentAstHash = 0
        differentObjHash = 0
        fileCount = 0

        missingFiles = 0

        totalFilesChanged += currentCommit[tr('files-changed')]

        for filename in currentFiles:
            fileCount += 1
            if tr('ast-hash') not in currentFiles[filename]:
                print "error: missing AST hash for file %s" % filename
                continue
            prevFilename = filename
            if filename not in prevFiles:
                if 'src/' + filename in prevFiles:
                    print "file %s changed place to src/" % filename #TODO: is this actually necessary?
                    prevFilename = 'src/' + filename
                elif 'crt/' + filename in prevFiles:
                    print "file %s changed place to crt/" % filename
                    prevFilename = 'crt/' + filename
                else:
                    print "MISSING: %s not in prev (%s), current is (%s)" % (filename, prevCommitID, commitID)
                    missingFilesTotal += 1
                    missingFiles += 1
                    continue

            currentRecord = currentFiles[filename]
            prev_record = prevFiles[prevFilename]

            parseDuration = currentRecord[tr('parse-duration')] # ns
            hashDuration = currentRecord[tr('hash-duration')] # ns
            compileDuration = currentRecord[tr('compile-duration')] - parseDuration # ns

            totalParseDuration += parseDuration
            totalHashDuration += hashDuration
            totalCompileDuration += compileDuration

            if prev_record[tr('ast-hash')] == currentRecord[tr('ast-hash')]:
                totalASTHashRedundantCompileTime += compileDuration # ns

            if prev_record[tr('object-hash')] == currentRecord[tr('object-hash')]:
                totalOptimalRedundantTime += compileDuration + hashDuration + parseDuration # ns
                totalOptimalRedundantCompileTime += compileDuration
            else:
                # for the histograms, only take a file into account if it changed, to prevent counting the same file multiple times
                parseTimes.append(currentRecord[tr('parse-duration')] / 1e6) # ns to ms
                hashTimes.append(currentRecord[tr('hash-duration')] / 1e6)
                compileTimes.append(currentRecord[tr('compile-duration')] / 1e6)


            # for changes graph
            if prev_record[tr('object-hash')] != currentRecord[tr('object-hash')]:
                differentObjHash += 1
                differentAstHash += 1
            elif prev_record[tr('ast-hash')] != currentRecord[tr('ast-hash')]:
                differentAstHash += 1
            else:
                same += 1

        if missingFiles > currentCommit[tr('files-changed')]:
            print "!!!!FAIL!!!!"
            missingFileErrors += 1
            print "%s: missingFiles: %d, filesChanged: %d" % (commitID, missingFiles, currentCommit[tr('files-changed')])

        buildTime = currentCommit[tr('build-time')] # ns
        optimalBuildTime = buildTime - totalOptimalRedundantTime # = buildTime - sum((clangTime(file) + hashTime) if objhash(file) unchanged)
        realAstHashBuildTime = buildTime - totalASTHashRedundantCompileTime # = buildTime - sum(compileTime(file) if asthash(file) unchanged)
        optimalAstHashBuildTime = buildTime - totalOptimalRedundantCompileTime


        #TODO: remove broken commits; ok?
        if buildTime > 3e12 and totalParseDuration/1e9 > 300:
            measuredBuildTimes.append(buildTime / 16e9) # nano to seconds; also /16 to account for make -j16
            optimalBuildTimes.append(optimalBuildTime / 16e9)
            optimalClangHashBuildTimes.append(optimalAstHashBuildTime / 16e9)
            realClangHashBuildTimes.append(realAstHashBuildTime / 16e9)

            totalParseTimes.append(totalParseDuration / 16e9) # nano to seconds
            totalHashTimes.append(totalHashDuration / 16e9)
            totalCompileTimes.append(totalCompileDuration / 16e9)
            diffToBuildTime.append((buildTime - totalParseDuration - totalHashDuration - totalCompileDuration) / 16e9)

            # changes graph
            differentAstHashes.append(differentAstHash)
            differentObjHashes.append(differentObjHash)
            sameHashes.append(same)
            fileCounts.append(fileCount)

        prevCommit = currentCommit
        prevCommitID = commitID
        if fileCount == 0:
            print "no filecount at %s" % commitID

    print "missingFilesTotal: %d, missingFileErrors: %d" % (missingFilesTotal, missingFileErrors)
    print "totalFilesChanged: %d, sizes: parseTimes(%d), hashTimes(%d), compileTimes(%d)" % (totalFilesChanged, len(parseTimes), len(hashTimes), len(compileTimes))

    plot_build_time_graph(measuredBuildTimes, realClangHashBuildTimes, optimalClangHashBuildTimes, optimalBuildTimes)
    plot_build_time_composition_graph(totalParseTimes, totalHashTimes, totalCompileTimes, diffToBuildTime)
    plotTimeHistograms(parseTimes, hashTimes, compileTimes)
    plotChangesGraph(fileCounts, sameHashes, differentAstHashes, differentObjHashes)

    # save data to csv files
    buildTimeData = np.column_stack((measuredBuildTimes, realClangHashBuildTimes, optimalClangHashBuildTimes, optimalBuildTimes))
    write_to_csv(buildTimeData, BUILD_TIME_DATA_HEADER, abs_path(BUILD_TIME_DATA_FILENAME))

    buildTimeCompositionData = np.column_stack((totalParseTimes, totalHashTimes, totalCompileTimes, diffToBuildTime))
    write_to_csv(buildTimeCompositionData, BUILD_TIME_COMPOSITION_DATA_HEADER, abs_path(BUILD_TIME_COMPOSITION_DATA_FILENAME))

    singleTimesData = np.column_stack((parseTimes, hashTimes, compileTimes))
    write_to_csv(singleTimesData, SINGLE_TIMES_DATA_HEADER, abs_path(SINGLE_TIMES_DATA_FILENAME))

    changesData = np.column_stack((fileCounts, sameHashes, differentAstHashes, differentObjHashes))
    write_to_csv(changesData, CHANGES_DATA_HEADER, abs_path(CHANGES_DATA_FILENAME))

################################################################################
"""functions for reading data from the csv files to skip full record building"""

def csv_files_are_existing():
    return (os.path.isfile(abs_path(BUILD_TIME_COMPOSITION_DATA_FILENAME))
            and os.path.isfile(abs_path(BUILD_TIME_DATA_FILENAME))
            and os.path.isfile(abs_path(CHANGES_DATA_FILENAME))
            and os.path.isfile(abs_path(SINGLE_TIMES_DATA_FILENAME)))

def read_from_csv(filename, column_names):
    """Reads the CSV columns back as lists of floats (column_names is currently unused)."""
    data = []
    with open(filename) as csv_file:
        reader = csv.reader(csv_file)
        is_header_row = True
        for row in reader:
            if is_header_row:
                for col in row:
                    data.append([])
                is_header_row = False
            else:
                for colnum, col in enumerate(row):
                    data[colnum].append(float(col))
    return data
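
# read_from_csv returns the CSV columns as lists of floats (column-major), e.g.
# for BUILD_TIME_DATA_HEADER data[0] holds all measuredBuildTimes and data[3]
# all optimalBuildTimes, which is the layout the plot_*1() wrappers expect.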


def read_csv_data_and_plot_graphs():
    """Builds the graphs from the CSV data of previous runs,
    saving time by skipping the whole "full record" building.
    """

    build_time_data = read_from_csv(abs_path(BUILD_TIME_DATA_FILENAME), BUILD_TIME_DATA_HEADER)
    plot_build_time_graph1(build_time_data)

    build_time_composition_data = read_from_csv(abs_path(BUILD_TIME_COMPOSITION_DATA_FILENAME), BUILD_TIME_COMPOSITION_DATA_HEADER)
    plot_build_time_composition_graph1(build_time_composition_data)

    changes_data = read_from_csv(abs_path(CHANGES_DATA_FILENAME), CHANGES_DATA_HEADER)
    plot_changes_graph1(changes_data)

    single_times_data = read_from_csv(abs_path(SINGLE_TIMES_DATA_FILENAME), SINGLE_TIMES_DATA_HEADER)
    plot_time_histograms1(single_times_data)


################################################################################

# main:
if len(sys.argv) > 1:
    PATH_TO_RECORDS = sys.argv[1]
    path_to_full_record_file = abs_path(FULL_RECORD_FILENAME)
    print "Starting at %s" % time.ctime()

    if '--skip-validating' not in sys.argv:
        print "validating..."
        validate_records()
        print "finished validating at %s" % time.ctime()

    if csv_files_are_existing():
        # skip building record, read csv data from files and work with that
        print "reading from csv files"
        read_csv_data_and_plot_graphs()
        print "finished graphs at %s" % time.ctime()
    else:
        full_record = build_full_record_to(path_to_full_record_file)
        print "finished building/loading full record at %s" % time.ctime()

        make_graphs(full_record)
        print "finished graphs at %s" % time.ctime()

    print "Finished at %s" % time.ctime()
else:
    print "Missing path to record files.\nUsage:\n\t%s PATH_TO_RECORDS" % sys.argv[0]
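
# Illustrative invocations (the records path is an example, not a fixed layout):
#   python validate_hashes.py path/to/records
#   python validate_hashes.py path/to/records --skip-validating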