Commit 1e89fc34 authored by Ludwig Fueracker
Browse files

fixed order of validation (now from old to new commit)

parent 5a5e607a
...@@ -88,6 +88,11 @@ source_files = set() # all filenames of the hashed files ...@@ -88,6 +88,11 @@ source_files = set() # all filenames of the hashed files
ast_hashes_dict = {} # maps filename -> set(ast hashes) ast_hashes_dict = {} # maps filename -> set(ast hashes)
obj_hashes_dict = {} # maps filename -> set(obj hashes) obj_hashes_dict = {} # maps filename -> set(obj hashes)
unchanged_counter = 0
ast_hashes_set = set()
obj_hashes_set = set()
nr_of_records = 0 # for calculating avg nr_of_records = 0 # for calculating avg
sum_of_times = {'parsing': 0, sum_of_times = {'parsing': 0,
'hashing': 0, 'hashing': 0,
...@@ -100,11 +105,12 @@ commit_ids = set() ...@@ -100,11 +105,12 @@ commit_ids = set()
def validate_records(): def validate_records():
total_number_of_records = 0 total_number_of_records = 0
for filename in get_list_of_files(PATH_TO_RECORDS): for filename in get_list_of_files(PATH_TO_RECORDS):
records = [eval(line) for line in open(filename)] with open(filename) as f:
total_number_of_records += len(records) records = list(reversed([eval(line) for line in f]))
validate_hashes(records) total_number_of_records += len(records)
validate_hashes(records)
different_ast_hashes = 0 # number of different ast hashes (in total) different_ast_hashes = 0 # number of different ast hashes (in total)
for v in ast_hashes_dict.values(): for v in ast_hashes_dict.values():
different_ast_hashes += len(v) different_ast_hashes += len(v)
...@@ -127,6 +133,9 @@ def validate_records(): ...@@ -127,6 +133,9 @@ def validate_records():
print "source files: %d" % len(source_files) print "source files: %d" % len(source_files)
print "different AST hashes: %d" % different_ast_hashes print "different AST hashes: %d" % different_ast_hashes
print "different obj hashes: %d" % different_obj_hashes print "different obj hashes: %d" % different_obj_hashes
print "size of ast_hashes_dict: %d, obj_hashes_dict: %d" % (len(ast_hashes_dict), len(obj_hashes_dict))
print "len(ast_hashes_set): %d, len(obj_hashes_set): %d" % (len(ast_hashes_set), len(obj_hashes_set))
print "unchanged_counter: %d" % unchanged_counter
print "" print ""
print "avg times:" print "avg times:"
for k,v in sum_of_times.items(): for k,v in sum_of_times.items():
...@@ -134,9 +143,9 @@ def validate_records(): ...@@ -134,9 +143,9 @@ def validate_records():
print "-----------------" print "-----------------"
print "false positive commits: %d" % len(false_positive_records) print "false positive commits: %d" % len(false_positive_records)
for k in sorted(false_positive_records, key=lambda k: len(false_positive_records[k]), reverse=True): for k in sorted(false_positive_records, key=lambda k: len(false_positive_records[k]), reverse=True):
print "%s: %d" % (k, len(false_positive_records[k])) print "\t%s: %d" % (k, len(false_positive_records[k]))
print "-----------------\n" print "-----------------\n"
write_to_csv([ [k,len(v)] for k,v in ast_hashes_dict.items() ], ['filename', 'nr of different hashes'], abs_path('different_ast_hashes_per_file.csv')) write_to_csv([ [k,len(v)] for k,v in ast_hashes_dict.items() ], ['filename', 'nr of different hashes'], abs_path('different_ast_hashes_per_file.csv'))
write_to_csv([ [k,len(v)] for k,v in obj_hashes_dict.items() ], ['filename', 'nr of different hashes'], abs_path('different_obj_hashes_per_file.csv')) write_to_csv([ [k,len(v)] for k,v in obj_hashes_dict.items() ], ['filename', 'nr of different hashes'], abs_path('different_obj_hashes_per_file.csv'))
...@@ -145,6 +154,7 @@ def print_hash_info(message, prev_record, record, is_error=True): ...@@ -145,6 +154,7 @@ def print_hash_info(message, prev_record, record, is_error=True):
print "%s: file %s, commits %s to %s : %s" % ("ERROR" if is_error else "INFO", record['filename'], prev_record['commit-hash'], record['commit-hash'], message) print "%s: file %s, commits %s to %s : %s" % ("ERROR" if is_error else "INFO", record['filename'], prev_record['commit-hash'], record['commit-hash'], message)
def validate_hashes(record_list): def validate_hashes(record_list):
''' All records in the list must be from the same object file''' ''' All records in the list must be from the same object file'''
#TODO: collect data from all files before validating (changing paths src and crt) and sort #TODO: collect data from all files before validating (changing paths src and crt) and sort
...@@ -152,8 +162,9 @@ def validate_hashes(record_list): ...@@ -152,8 +162,9 @@ def validate_hashes(record_list):
global false_negatives, ast_hash_missing global false_negatives, ast_hash_missing
global source_files global source_files
global ast_hashes_dict, obj_hashes_dict global ast_hashes_dict, obj_hashes_dict
global ast_hashes_set, obj_hashes_set
global sum_of_times, nr_of_records global sum_of_times, nr_of_records
global unchanged_counter
iter_records = iter(record_list) iter_records = iter(record_list)
prev_record = next(iter_records) prev_record = next(iter_records)
filename = prev_record['filename'] filename = prev_record['filename']
...@@ -170,6 +181,8 @@ def validate_hashes(record_list): ...@@ -170,6 +181,8 @@ def validate_hashes(record_list):
ast_hashes.add(prev_record['ast-hash']) ast_hashes.add(prev_record['ast-hash'])
obj_hashes.add(prev_record['object-hash']) obj_hashes.add(prev_record['object-hash'])
ast_hashes_set.add(prev_record['ast-hash'])
obj_hashes_set.add(prev_record['object-hash'])
commit_ids.add(prev_record['commit-hash']) commit_ids.add(prev_record['commit-hash'])
...@@ -181,8 +194,8 @@ def validate_hashes(record_list): ...@@ -181,8 +194,8 @@ def validate_hashes(record_list):
for record in iter_records: for record in iter_records:
if prev_record['start-time'] > record['start-time']: #if prev_record['start-time'] > record['start-time']:
print "Error: wrong order of records" #TODO: fix, then remove this # print "Error: wrong order of records" #TODO: fix, then remove this
if 'ast-hash' not in record.keys() or 'object-hash' not in record.keys(): if 'ast-hash' not in record.keys() or 'object-hash' not in record.keys():
print "ERROR: stopping validating for file %s; no ast-hash available for commit %s" % (filename, record['commit-hash']) print "ERROR: stopping validating for file %s; no ast-hash available for commit %s" % (filename, record['commit-hash'])
break break
...@@ -197,11 +210,16 @@ def validate_hashes(record_list): ...@@ -197,11 +210,16 @@ def validate_hashes(record_list):
if commit_hash not in false_positive_records: if commit_hash not in false_positive_records:
false_positive_records[commit_hash] = set() false_positive_records[commit_hash] = set()
false_positive_records[commit_hash].add(record['filename']) false_positive_records[commit_hash].add(record['filename'])
else:
unchanged_counter += 1
if prev_record['ast-hash'] != record['ast-hash']: # if prev_record['ast-hash'] != record['ast-hash']:
ast_hashes.add(record['ast-hash']) ast_hashes.add(record['ast-hash'])
if prev_record['object-hash'] != record['object-hash']: # if prev_record['object-hash'] != record['object-hash']:
obj_hashes.add(record['object-hash']) obj_hashes.add(record['object-hash'])
ast_hashes_set.add(record['ast-hash'])
obj_hashes_set.add(record['object-hash'])
nr_of_records += 1 nr_of_records += 1
sum_of_times['parsing'] += record['parse-duration'] sum_of_times['parsing'] += record['parse-duration']
...@@ -462,12 +480,10 @@ def plotChangesGraph(fileCounts, sameHashes, differentAstHashes, differentObjHas ...@@ -462,12 +480,10 @@ def plotChangesGraph(fileCounts, sameHashes, differentAstHashes, differentObjHas
################################################################################ ################################################################################
broken_commits = set() broken_commits = set()
broken_commit_count = 0
working_commit_count = 0
def make_graphs(full_record): def make_graphs(full_record):
global broken_commit_count, working_commit_count global broken_commits
sortedCommitIDs = get_sorted_commit_id_list(full_record) sortedCommitIDs = get_sorted_commit_id_list(full_record)
iterCommits = iter(sortedCommitIDs) iterCommits = iter(sortedCommitIDs)
...@@ -616,9 +632,7 @@ def make_graphs(full_record): ...@@ -616,9 +632,7 @@ def make_graphs(full_record):
differentObjHashes.append(differentObjHash) differentObjHashes.append(differentObjHash)
sameHashes.append(same) sameHashes.append(same)
fileCounts.append(fileCount) fileCounts.append(fileCount)
working_commit_count += 1
else: else:
broken_commit_count += 1
broken_commits.add(commitID) broken_commits.add(commitID)
prevCommit = currentCommit prevCommit = currentCommit
...@@ -719,9 +733,9 @@ if (len(sys.argv) > 1): ...@@ -719,9 +733,9 @@ if (len(sys.argv) > 1):
make_graphs(full_record) make_graphs(full_record)
print "finished graphs at %s" % time.ctime() print "finished graphs at %s" % time.ctime()
print "broken commits: %d, %d" % (broken_commit_count, len(broken_commits)) print "broken commits: %d" % len(broken_commits)
print broken_commits for commit in broken_commits:
# print "working commits: %d" % working_commit_count #TODO: not actually correct. shoult be totalNr - brokenCount print "\t" + commit
print "Finished at %s" % time.ctime() print "Finished at %s" % time.ctime()
else: else:
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment