Skip to content
Snippets Groups Projects
Select Git revision
  • 8640cbc496ff3f9103ac27cae79a8b1f6fb83cb8
  • main default protected
2 results

README.md

Blame
  • summarize.py 5.59 KiB
    #!/usr/bin/env python3
    """Collect and summarize evaluation results"""
    
    import argparse
    import os
    import sys
    import typing as T
    from pathlib import Path
    
    import numpy
    
    # Per-key conversion applied by collect() to every successfully parsed value.
    # 'duration_time:u' is divided by 1000 three times (i.e. by 1e9);
    # presumably this turns nanoseconds into seconds -- TODO confirm the unit.
    VALUE_CONVERSIONS = {'duration_time:u': lambda x: x / 1000 / 1000 / 1000}

    # Measurement key -> list of values (collect() appends one per result file).
    Measurements = dict[str, list[float]]
    # Target name (result file name up to its first '.') -> its measurements.
    Data = dict[str, Measurements]
    
    
    def collect(result_dir, verbose=False) -> T.Optional[Data]:
        """Collect the raw measurements of every ``*.stats`` file in result_dir.

        The part of a file name before its first ``.`` names the target; every
        file of a target contributes exactly one value per key to that target's
        lists.  The first two lines of each file are skipped, the remaining
        lines are split at ``;`` with the value taken from field 0 and the key
        from field 2 (presumably ``perf stat -x ';'`` output -- TODO confirm).

        Values that cannot be parsed as a float are recorded as NaN.  If a key
        is missing from some files of a target, the gaps are filled with NaN so
        that all lists of a target have one entry per file.

        Args:
            result_dir: path of the directory containing the ``*.stats`` files.
            verbose: report values that could not be converted to a float.

        Returns:
            The collected data, or None (after printing a message) if
            result_dir is not an existing directory.
        """
        if not result_dir or not os.path.isdir(result_dir):
            print(f'{result_dir} is not a directory')
            return None

        data: Data = {}
        # Number of files already merged per target; needed to pad keys that
        # show up for the first time in a later file.
        runs_seen: dict[str, int] = {}

        # iterdir() yields entries in arbitrary order; sort them so the order
        # of the collected values is reproducible.
        for result_file_path in sorted(Path(result_dir).iterdir()):
            if result_file_path.suffix != '.stats':
                continue
            target = result_file_path.name.split('.')[0]
            results = {}
            with open(result_file_path, 'r', encoding='utf-8') as result_file:
                for line_no, line in enumerate(result_file):
                    if line_no < 2:
                        # the first two lines carry no measurements
                        continue
                    fields = line.split(';')
                    if len(fields) < 3:
                        # blank or truncated line: nothing to parse
                        continue
                    # strip so a key in the last field loses its newline
                    key, _value = fields[2].strip(), fields[0]

                    try:
                        value = float(_value)
                    except ValueError as val_err:
                        if verbose:
                            print(
                                f'{val_err} occured during value conversion of {key}'
                            )
                        results[key] = numpy.nan
                        continue

                    if key in VALUE_CONVERSIONS:
                        value = VALUE_CONVERSIONS[key](value)
                    results[key] = value

            target_data = data.setdefault(target, {})
            previous_runs = runs_seen.get(target, 0)
            # Keys unknown so far get NaN for every file merged before.
            for key in results:
                if key not in target_data:
                    target_data[key] = [numpy.nan] * previous_runs
            # Keys missing from this file get NaN for this file.
            for key, values in target_data.items():
                values.append(results.get(key, numpy.nan))
            runs_seen[target] = previous_runs + 1

        return data
    
    
    def calc_avgs(data):
        """Return the NaN-ignoring mean of every measurement of every target"""
        averages = {}
        for target, measurements in data.items():
            target_avgs = {}
            for key, values in measurements.items():
                # nanmean skips NaN entries when averaging
                target_avgs[key] = numpy.nanmean(values)
            averages[target] = target_avgs
        return averages
    
    
    # Values lying outside the whiskers computed by calc_stats().
    Outliers = list[float]
    # Statistic name ('mean', 'std', ..., 'outliers') -> its value.
    DescriptiveStats = dict[str, T.Union[float, Outliers]]
    # Measurement key -> descriptive statistics of that measurement.
    TargetStats = dict[str, DescriptiveStats]
    # Target name -> statistics of all its measurements.
    Stats = dict[str, TargetStats]
    
    
    def calc_stats(data: Data) -> Stats:
        """Calculate and return descriptive stats of all measurements in data.

        For every measurement of every target the result contains, in this
        order: 'mean', 'std', 'min', 'max', 'median', 'upper_quartile',
        'lower_quartile', 'lower_whisker', 'upper_whisker' (all plain floats)
        and 'outliers' (a list of floats).  The whiskers are the smallest and
        largest value within 1.5 times the interquartile range of the lower and
        upper quartile; values beyond them are the outliers.

        NaN values are ignored, like calc_avgs does.  A measurement without any
        valid value yields NaN for every statistic and no outliers.  The lists
        in data are not modified.
        """
        stats: Stats = {}
        for target, measurements in data.items():
            target_stats: TargetStats = {}
            stats[target] = target_stats
            for measure, raw_values in measurements.items():
                # Work on a NaN-free copy: NaN would poison every statistic and
                # make the sort order undefined.
                valid = [float(v) for v in raw_values if not numpy.isnan(v)]

                if not valid:
                    nan = float('nan')
                    empty_stats: DescriptiveStats = {
                        name: nan
                        for name in ('mean', 'std', 'min', 'max', 'median',
                                     'upper_quartile', 'lower_quartile',
                                     'lower_whisker', 'upper_whisker')
                    }
                    empty_stats['outliers'] = []
                    target_stats[measure] = empty_stats
                    continue

                # sorted() copies, so the caller's list keeps its order
                values = sorted(valid)
                upper_quartile = float(numpy.percentile(values, 75))
                lower_quartile = float(numpy.percentile(values, 25))
                iqr = upper_quartile - lower_quartile
                lower_fence = lower_quartile - 1.5 * iqr
                upper_fence = upper_quartile + 1.5 * iqr

                # values is sorted, so the outliers form a prefix and a suffix
                # and the whiskers are the values right next to them
                low_outliers = [v for v in values if v < lower_fence]
                high_outliers = [v for v in values if v > upper_fence]

                # plain floats and lists only, to easily dump it using pyyaml
                target_stats[measure] = {
                    'mean': float(numpy.mean(valid)),
                    'std': float(numpy.std(valid)),
                    'min': values[0],
                    'max': values[-1],
                    'median': float(numpy.median(values)),
                    'upper_quartile': upper_quartile,
                    'lower_quartile': lower_quartile,
                    'lower_whisker': values[len(low_outliers)],
                    'upper_whisker':
                    values[len(values) - len(high_outliers) - 1],
                    'outliers': low_outliers + high_outliers,
                }
        return stats
    
    
    def summarize(avgs=None, stats=None, keys=None, desc_stats=None):
        """Print the averages or descriptive stats of each selected key

        Averages take precedence over stats if both are given. Without keys
        only 'duration_time:u' is printed; without desc_stats every available
        statistic is printed. Returns False if there is nothing to summarize
        and True otherwise.
        """
        if not (avgs or stats):
            print('no data to summarize')
            return False

        selected_keys = keys if keys else ['duration_time:u']

        for key in selected_keys:
            print(f'{key}:')

            if avgs:
                for target in avgs:
                    print(f'\t{target}: {avgs[target][key]}')
                continue

            for target in stats:
                measure_stats = stats[target][key]
                # fall back to all statistics known for this measurement
                for stat in desc_stats or measure_stats.keys():
                    print(f'\t{target}-{stat}: {measure_stats[stat]}')

        return True
    
    
    def collect_and_summarize(result_dir=None,
                              keys=None,
                              desc_stats=None,
                              verbose=False):
        """Collect data and print a summary of the collected data

        Returns 0 on success and 1 if result_dir is not a directory, contains
        no results or nothing could be summarized, so the result can be used
        as the process exit code.
        """
        data = collect(result_dir, verbose=verbose)
        # collect returns None for an invalid directory and an empty dict if
        # there are no result files; calc_stats can not handle None
        if not data:
            return 1

        stats = calc_stats(data)
        if not stats:
            return 1

        if not summarize(stats=stats, keys=keys, desc_stats=desc_stats):
            return 1

        return 0
    
    
    if __name__ == '__main__':
        # Command line entry point: parse the options, print the summary and
        # use the result of collect_and_summarize as exit code.
        parser = argparse.ArgumentParser()
        parser.add_argument(
            '-v',
            '--verbose',
            help='report values that could not be converted to a number',
            action='store_true')
        parser.add_argument('-k',
                            '--keys',
                            help='keys to summarize (default: duration_time:u)',
                            nargs='*')
        parser.add_argument(
            '-s',
            '--desc-stats',
            help='descriptive statistics to print (default: all)',
            nargs='*')
        parser.add_argument('result_dir',
                            help='directory containing the results to summarize')

        _args = parser.parse_args()

        print('### Summary ###')
        # the argument names match the parameters of collect_and_summarize
        sys.exit(collect_and_summarize(**vars(_args)))