Christian Dietrich / clang-hash · Commits · 91b5e033

Commit 91b5e033, authored Jan 29, 2017 by Ludwig Fueracker
parent f4144178

    cleaned up, added evaluation output

Changes (1 file): validate_hashes.py
 #!/usr/bin/env python
+import matplotlib
+matplotlib.use('Agg')
 import fnmatch
 import os
 import sys
...
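A note on the new backend call: matplotlib.use('Agg') must run before pyplot is imported anywhere, which is why it sits at the very top of the file. The Agg backend renders to files only, so the later fig.savefig(...) calls work on headless machines without a display.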
@@ -11,9 +14,6 @@ import numpy as np
 import matplotlib.mlab as mlab
 import csv
 
-SKIP_VALIDATING = False #True #TODO: make command line arg
 PATH_TO_RECORDS = '' # gets set from command line parameter
 
 # data/record filenames
...
@@ -46,11 +46,8 @@ CHANGES_DATA_HEADER = ['fileCount','sameHashes', 'differentAstHashes', 'differen
 SINGLE_TIMES_DATA_HEADER = ['parsing', 'hashing', 'compiling']
 
 def abs_path(filename):
-    """Prepends the absolute path to the filename.
-    """
+    """Prepends the absolute path to the filename."""
     return PATH_TO_RECORDS + '/../' + filename
...
@@ -59,7 +56,6 @@ def get_list_of_files(directory):
     for filename in fnmatch.filter(filenames, '*' + INFO_EXTENSION):
         yield os.path.join(root, filename)
 
 def write_to_csv(data, column_names, filename):
     with open(filename, "w") as csv_file:
         writer = csv.writer(csv_file)
...
@@ -67,113 +63,149 @@ def write_to_csv(data, column_names, filename):
         for line in data:
             writer.writerow(line)
 
-def print_hash_info(message, prevRecord, record, isError=True):
-    print "%s: file %s, commits %s to %s : %s" % ("ERROR" if isError else "INFO", record['filename'], prevRecord['commit-hash'], record['commit-hash'], message)
-
-errorCount = 0
-astDifferObjSameCount = 0
-missingCount = 0
-different_source_files = []
-different_source_ast_hashes = 0
-different_sources_dict = {}
-different_hashes_dict_count = {}
+def write_to_file(data, filename):
+    with open(abs_path(filename), 'w') as f:
+        try:
+            f.write(repr(data))
+        except MemoryError as me:
+            print me
+            raise
 
 def plot_hash_count_histogram(hash_values, filename):
-    dictionary = plt.figure()
-    D = hash_values
     # plt.xticks(range(len(D)), D.keys())
     fig, ax = plt.subplots()
     plt.xlabel('nr of different hashes')
     plt.ylabel('nr of files')
-    ax.bar(D.keys(), D.values(), align='center')
+    ax.bar(hash_values.keys(), hash_values.values(), align='center')
     fig.savefig(filename)
 
+false_negatives = 0
+false_positives = 0
+ast_hash_missing = 0
+source_files = set() # all filenames of the hashed files
+ast_hashes_dict = {} # maps filename -> set(ast hashes)
+obj_hashes_dict = {} # maps filename -> set(obj hashes)
+nr_of_records = 0 # for calculating avg
+sum_of_times = {'parsing': 0, 'hashing': 0, 'compiling': 0}
 
 def validate_records():
     for filename in get_list_of_files(PATH_TO_RECORDS):
         records = [eval(line) for line in open(filename)]
         validate_hashes(records)
-    print "Errors: %d, Infos: %d, Missing: %d" % (errorCount, astDifferObjSameCount, missingCount)
-    print "different source files: %d, different source AST hashes: %d" % (len(different_source_files), different_source_ast_hashes)
-    with open(abs_path('different_hashes.info'), 'w') as f:
-        try:
-            f.write(repr(different_sources_dict))
-        except MemoryError as me:
-            print me
-            raise
-    with open(abs_path('different_hashes_counts.info'), 'w') as f:
-        try:
-            f.write(repr(different_hashes_dict_count))
-        except MemoryError as me:
-            print me
-            raise
-    plot_hash_count_histogram(different_hashes_dict_count, abs_path('hash_count_histogram.pdf'))
-    write_to_csv([[k, v] for k, v in different_hashes_dict_count.items()], ['nr of different hashes', 'nr of files'], abs_path('different_hashes_counts.csv'))
-
-def validate_hashes(recordList):
-    global errorCount, astDifferObjSameCount, missingCount
-    global different_source_files, different_source_ast_hashes
-    global different_sources_dict, different_hashes_dict
-    #TODO: also sort this, perhaps execute on fullRecords or check against sorted commitIDs
-    #also TODO: collect data from all files before validating (changing paths src and crt)
-    #TODO: validate return-code!
-    #TODO: this method assumes that all records are from the same object file
-    iterRecords = iter(recordList)
-    prevRecord = next(iterRecords)
-    filename = prevRecord['filename']
-    if filename not in different_source_files:
-        different_source_files.append(filename)
+    different_ast_hashes = 0 # number of different ast hashes (in total)
+    for v in ast_hashes_dict.values():
+        different_ast_hashes += len(v)
+    different_obj_hashes = 0 # number of different obj hashes (in total)
+    for v in obj_hashes_dict.values():
+        different_obj_hashes += len(v)
+    print "\n---- Results ----"
+    print "false negatives (errors): %d" % false_negatives
+    print "false positives: %d" % false_positives
+    print "missing ast hashes: %d" % ast_hash_missing
+    print ""
+    print "source files: %d" % len(source_files)
+    print "different AST hashes: %d" % different_ast_hashes
+    print "different obj hashes: %d" % different_obj_hashes
+    print ""
+    print "avg times:"
+    for k, v in sum_of_times.items():
+        print "%s: %d" % (k, v / nr_of_records)
+    print "-----------------\n"
+    write_to_csv([[k, len(v)] for k, v in ast_hashes_dict.items()], ['filename', 'nr of different hashes'], abs_path('different_ast_hashes_per_file.csv'))
+    write_to_csv([[k, len(v)] for k, v in obj_hashes_dict.items()], ['filename', 'nr of different hashes'], abs_path('different_obj_hashes_per_file.csv'))
+
+def print_hash_info(message, prev_record, record, is_error=True):
+    print "%s: file %s, commits %s to %s : %s" % ("ERROR" if is_error else "INFO", record['filename'], prev_record['commit-hash'], record['commit-hash'], message)
+
+def validate_hashes(record_list):
+    ''' All records in the list must be from the same object file'''
+    #TODO: collect data from all files before validating (changing paths src and crt) and sort
+    global false_negatives, false_positives, ast_hash_missing
+    global source_files
+    global ast_hashes_dict, obj_hashes_dict
+    global sum_of_times, nr_of_records
+    iter_records = iter(record_list)
+    prev_record = next(iter_records)
+    filename = prev_record['filename']
+    source_files.add(filename)
 
-    if 'ast-hash' not in prevRecord.keys():
+    if 'ast-hash' not in prev_record.keys():
         #print "MISSING: no ast-hash in records for file " + filename
-        missingCount += 1
+        ast_hash_missing += 1
         return
 
-    different_hash_count = 1 # the different hashes of the current file
-    for record in iterRecords:
-        if prevRecord['start-time'] > record['start-time']:
+    ast_hashes = set()
+    obj_hashes = set()
+    ast_hashes.add(prev_record['ast-hash'])
+    obj_hashes.add(prev_record['object-hash'])
+    nr_of_records += 1
+    sum_of_times['parsing'] += prev_record['parse-duration']
+    sum_of_times['hashing'] += prev_record['hash-duration']
+    sum_of_times['compiling'] += prev_record['compile-duration'] - (prev_record['parse-duration'] + prev_record['hash-duration'])
+    for record in iter_records:
+        if prev_record['start-time'] > record['start-time']:
             print "Error: wrong order of records" #TODO: fix, then remove this
         if 'ast-hash' not in record.keys() or 'object-hash' not in record.keys():
             print "ERROR: stopping validating for file %s; no ast-hash available for commit %s" % (filename, record['commit-hash'])
             break
-        if prevRecord['object-hash'] != record['object-hash']:
-            if prevRecord['ast-hash'] == record['ast-hash']:
-                print_hash_info("object hashes differ, ast hashes same", prevRecord, record)
-                errorCount += 1
-        elif prevRecord['ast-hash'] != record['ast-hash']:
-            #print_hash_info("ast hashes differ, object hashes same", prevRecord, record, False)
-            astDifferObjSameCount += 1
-            different_source_ast_hashes += 1
-            different_hash_count += 1
-        prevRecord = record
-    if filename in different_sources_dict:
-        different_sources_dict[filename] += different_hash_count
+        if prev_record['object-hash'] != record['object-hash']:
+            if prev_record['ast-hash'] == record['ast-hash']:
+                print_hash_info("object hashes differ, ast hashes same", prev_record, record)
+                false_negatives += 1
+        elif prev_record['ast-hash'] != record['ast-hash']:
+            #print_hash_info("ast hashes differ, object hashes same", prev_record, record, False) #TODO: include this and look at the changes
+            false_positives += 1
+        if prev_record['ast-hash'] != record['ast-hash']:
+            ast_hashes.add(record['ast-hash'])
+        if prev_record['object-hash'] != record['object-hash']:
+            obj_hashes.add(record['object-hash'])
+        nr_of_records += 1
+        sum_of_times['parsing'] += record['parse-duration']
+        sum_of_times['hashing'] += record['hash-duration']
+        sum_of_times['compiling'] += record['compile-duration'] - (record['parse-duration'] + record['hash-duration'])
+        prev_record = record
+    if filename in ast_hashes_dict:
+        ast_hashes_dict[filename] |= ast_hashes # merge sets
     else:
-        different_sources_dict[filename] = different_hash_count
-    if different_hash_count not in different_hashes_dict_count:
-        different_hashes_dict_count[different_hash_count] = 1
+        ast_hashes_dict[filename] = ast_hashes
+    if filename in obj_hashes_dict:
+        obj_hashes_dict[filename] |= obj_hashes # merge sets
     else:
-        different_hashes_dict_count[different_hash_count] += 1
+        obj_hashes_dict[filename] = obj_hashes
+    different_ast_hash_count = len(ast_hashes)
+    different_obj_hash_count = len(obj_hashes)
 
 ################################################################################
 #
 #
 ################################################################################
 def build_key_translation_dict():
...
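For readers skimming this diff: validate_records() parses each record file with one eval() per line, so every line must be a Python dict literal. Below is a sketch of one plausible record line using the key names the code reads; every value is invented for illustration, and in the real files the whole dict sits on a single line. Since the record files are produced by the tool itself, eval() works, though ast.literal_eval would be the safer drop-in.

# One hypothetical record line from a *.info file (wrapped here for readability).
# All values are invented; 'compile-duration' includes 'parse-duration',
# per the comment in build_key_translation_dict.
{'filename': 'src/main.c',
 'commit-hash': 'f4144178deadbeef',
 'start-time': 1485698000000000000,
 'parse-duration': 120000000,
 'hash-duration': 15000000,
 'compile-duration': 900000000,
 'ast-hash': '9b1e4c33',
 'object-hash': '77af02a1'}

The validation pass then compares consecutive records per file: equal AST hashes with differing object hashes count as false negatives (the AST hash would wrongly report "nothing changed"), while differing AST hashes with equal object hashes count as false positives (a redundant rebuild would be triggered).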
@@ -188,7 +220,7 @@ def build_key_translation_dict():
         'hash-duration': 7,
         'filename': 8,
         'project': 9,
-        'compile-duration': 10, # time the compiler was running (incl. parse-duration)
+        'compile-duration': 10, # time the compiler was running (incl. parse-duration) #TODO: AND hash-duration?
         'ast-hash': 11,
         'commit-hash': 12,
         'element-hashes': 13,
...
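Orientation for the make_graphs hunks below: there the records apparently come from the full-record file in flattened, positional form, and build_key_translation_dict (partially shown above) maps each key name to an index, so prev_record[tr('ast-hash')] reads slot 11 of such a record. This is an inference from the code, not something the commit states.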
@@ -480,13 +512,11 @@ def make_graphs(full_record):
             prevFilename = filename
             if filename not in prevFiles:
                 if 'src/' + filename in prevFiles:
-                    print "file %s changed place to src/" % filename
+                    print "file %s changed place to src/" % filename #TODO: is this actually necessary?
                     prevFilename = 'src/' + filename
-                    prevRecord = prevFiles[prevFilename]
                 elif 'crt/' + filename in prevFiles:
                     print "file %s changed place to crt/" % filename
                     prevFilename = 'crt/' + filename
-                    prevRecord = prevFiles[prevFilename]
                 else:
                     print "MISSING: %s not in prev (%s), current is (%s)" % (filename, prevCommitID, commitID)
                     missingFilesTotal += 1
...
@@ -495,7 +525,7 @@ def make_graphs(full_record):
             currentRecord = currentFiles[filename]
-            prevRecord = prevFiles[prevFilename]
+            prev_record = prevFiles[prevFilename]
 
             parseDuration = currentRecord[tr('parse-duration')] # ns
             hashDuration = currentRecord[tr('hash-duration')] # ns
...
@@ -506,10 +536,10 @@ def make_graphs(full_record):
             totalCompileDuration += compileDuration
-            if prevRecord[tr('ast-hash')] == currentRecord[tr('ast-hash')]:
+            if prev_record[tr('ast-hash')] == currentRecord[tr('ast-hash')]:
                 totalASTHashRedundantCompileTime += compileDuration # ns
-            if prevRecord[tr('object-hash')] == currentRecord[tr('object-hash')]:
+            if prev_record[tr('object-hash')] == currentRecord[tr('object-hash')]:
                 totalOptimalRedundantTime += compileDuration + hashDuration + parseDuration #ns
                 totalOptimalRedundantCompileTime += compileDuration
             else:
...
@@ -520,10 +550,10 @@ def make_graphs(full_record):
             # for changes graph
-            if prevRecord[tr('object-hash')] != currentRecord[tr('object-hash')]:
+            if prev_record[tr('object-hash')] != currentRecord[tr('object-hash')]:
                 differentObjHash += 1
                 differentAstHash += 1
-            elif prevRecord[tr('ast-hash')] != currentRecord[tr('ast-hash')]:
+            elif prev_record[tr('ast-hash')] != currentRecord[tr('ast-hash')]:
                 differentAstHash += 1
             else:
                 same += 1
...
@@ -592,7 +622,7 @@ def csv_files_are_existing():
             and os.path.isfile(abs_path(CHANGES_DATA_FILENAME))
             and os.path.isfile(abs_path(SINGLE_TIMES_DATA_FILENAME)))
 
-def read_from_csv(filename, column_names): #TODO: unused?
+def read_from_csv(filename, column_names):
     data = []
     with open(filename) as csv_file:
         reader = csv.reader(csv_file)
...
@@ -618,35 +648,31 @@ def read_csv_data_and_plot_graphs():
     build_time_data = read_from_csv(abs_path(BUILD_TIME_DATA_FILENAME), BUILD_TIME_DATA_HEADER)
     plot_build_time_graph1(build_time_data)
     build_time_composition_data = read_from_csv(abs_path(BUILD_TIME_COMPOSITION_DATA_FILENAME), BUILD_TIME_COMPOSITION_DATA_HEADER)
     plot_build_time_composition_graph1(build_time_composition_data)
     changes_data = read_from_csv(abs_path(CHANGES_DATA_FILENAME), CHANGES_DATA_HEADER)
     plot_changes_graph1(changes_data)
     single_times_data = read_from_csv(abs_path(SINGLE_TIMES_DATA_FILENAME), SINGLE_TIMES_DATA_HEADER)
     plot_time_histograms1(single_times_data)
 
 ################################################################################
 # main:
 if (len(sys.argv) > 1):
     PATH_TO_RECORDS = sys.argv[1]
     path_to_full_record_file = abs_path(FULL_RECORD_FILENAME)
 
     print "Starting at %s" % time.ctime()
 
-    if not SKIP_VALIDATING:
+    if '--skip-validating' not in sys.argv:
         print "validating..."
         validate_records()
         print "finished validating at %s" % time.ctime()
     '''
     if csv_files_are_existing():
         # skip building record, read csv data from files and work with that
         print "reading from csv files"
...
@@ -658,7 +684,7 @@ if (len(sys.argv) > 1):
         make_graphs(full_record)
         print "finished graphs at %s" % time.ctime()
     '''
     print "Finished at %s" % time.ctime()
 else:
     print "Missing path to record files.\nUsage:\n\t%s PATH_TO_RECORDS" % sys.argv[0]
...
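Invocation is unchanged apart from the new flag; a quick sketch with a hypothetical records path:

python validate_hashes.py /path/to/records                     # validate and write the per-file hash CSVs
python validate_hashes.py /path/to/records --skip-validating   # skip the validation pass

Note that the script is Python 2 (bare print statements), so it must run under a Python 2 interpreter; v / nr_of_records in the averages is integer division there.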