diff --git a/config.yaml b/config.yaml
index 796c3f3..87fc0d2 100644
--- a/config.yaml
+++ b/config.yaml
@@ -27,19 +27,19 @@ ledgers:
   - tezos
   - zcash
 
-# Execution flags
-execution_flags:
-  force_map: false
+# Flag that controls whether to force mapping or use already processed data
+# when available.
+force_map: false
 
-# Analyze flags
-analyze_flags:
-  clustering: true
+# Flag that determines whether to cluster block producers into entities that
+# control them.
+clustering: true
 
 # The timeframe for which an analysis should be performed.
 # Each date is a string of the form YYYY-MM-DD.
 timeframe:
   start_date: 2018-01-01
-  end_date: 2025-03-01
+  end_date: 2026-02-01
 
 # The number of days to use for the estimation window, i.e. how many days of blocks to use for each data point.
 # If left empty, then the entire time frame will be used (only valid when combined with empty frequency).
diff --git a/consensus_decentralization/analyze.py b/consensus_decentralization/analyze.py
index 044dc28..93e7201 100644
--- a/consensus_decentralization/analyze.py
+++ b/consensus_decentralization/analyze.py
@@ -82,18 +82,15 @@ def analyze(projects, aggregated_data_filename, input_dir, output_dir, populatio
                 csv_contents[metric_name][row_index + 1].append(result)
                 aggregate_output[project][date][metric_name] = result
 
-    for metric in metric_names:
-        with open(output_dir / f'{metric}.csv', 'w') as f:
-            csv_writer = csv.writer(f)
-            csv_writer.writerows(csv_contents[metric])
-
     aggregate_csv_output = [['ledger', 'date', 'clustering'] + metric_names]
     for project, timeframes in aggregate_output.items():
         for date, results in timeframes.items():
            metric_values = [results[metric] for metric in metric_names]
            if any(metric_values):
                aggregate_csv_output.append([project, date, clustering_flag] + metric_values)
-    with open(output_dir / 'output.csv', 'w') as f:
+
+    output_filename = hlp.get_output_filename(clustering_flag)
+    with open(output_dir / output_filename, 'w') as f:
         csv_writer = csv.writer(f)
         csv_writer.writerows(aggregate_csv_output)
 
diff --git a/consensus_decentralization/helper.py b/consensus_decentralization/helper.py
index 00d215f..68dbf24 100644
--- a/consensus_decentralization/helper.py
+++ b/consensus_decentralization/helper.py
@@ -428,7 +428,7 @@ def get_force_map_flag():
     """
     config = get_config_data()
     try:
-        return config['execution_flags']['force_map']
+        return config['force_map']
     except KeyError:
         raise ValueError('Flag "force_map" missing from config file')
 
@@ -441,7 +441,7 @@ def get_clustering_flag():
     """
     config = get_config_data()
     try:
-        return config['analyze_flags']['clustering']
+        return config['clustering']
     except KeyError:
         raise ValueError('Flag "clustering" missing from config file')
 
@@ -465,6 +465,15 @@ def get_mapped_data_filename(clustering_flag):
     return 'mapped_data_' + ('clustered' if clustering_flag else 'non_clustered') + '.json'
 
 
+def get_output_filename(clustering_flag):
+    """
+    Retrieves the filename of the output file
+    :param clustering_flag: boolean that determines whether clustering was performed
+    :returns: str with the filename of the output file, which depends on whether clustering was performed
+    """
+    return 'output_' + ('clustered' if clustering_flag else 'non_clustered') + '.csv'
+
+
 def get_input_directories():
     """
     Reads the config file and retrieves the directories to look for raw block data
diff --git a/run.py b/run.py
index 84cef8b..b88d398 100644
--- a/run.py
+++ b/run.py
@@ -19,8 +19,8 @@ def process_data(force_map, ledger_dir, ledger, output_dir):
     return None
 
 
-def main(ledgers, timeframe, estimation_window, frequency, population_windows, interim_dir=hlp.INTERIM_DIR,
-         results_dir=hlp.RESULTS_DIR):
+def main(ledgers, timeframe, estimation_window, frequency, population_windows,
+         force_map, interim_dir=hlp.INTERIM_DIR, results_dir=hlp.RESULTS_DIR):
     """
     Executes the entire pipeline (parsing, mapping, analyzing) for some projects and timeframes.
     :param ledgers: list of strings that correspond to the ledgers whose data should be analyzed
@@ -30,13 +30,18 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
         timeframe will be considered.
     :param frequency: int or None. The number of days to consider for the frequency of the analysis (i.e. the number
         of days between each data point considered in the analysis). If None, only one data point will be considered,
-        spanning the entire timeframe (i.e. it needs to be combined with None estimation_window).
-    :param interim_dir: pathlib.PosixPath object of the directory where the output data will be saved
+        spanning the entire timeframe (i.e. it needs to be combined with None
+        estimation_window).
+    :param population_windows: int. The number of windows to look backwards and forwards to determine the population of
+        active block producers for a given time period.
+    :param force_map: bool. If True, then the mapping will be performed,
+        regardless of whether mapped data for the project already exist.
+    :param interim_dir: pathlib.PosixPath object of the directory where the
+        interim data will be saved
+    :param results_dir: pathlib.PosixPath object of the directory where the results will be saved
     """
     logging.info(f"The ledgers that will be analyzed are: {','.join(ledgers)}")
 
-    force_map = hlp.get_force_map_flag()
-
     for ledger in list(ledgers):
         ledger_dir = interim_dir / ledger
         ledger_dir.mkdir(parents=True, exist_ok=True)  # create ledger output directory if it doesn't already exist
@@ -89,6 +94,7 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
     estimation_window, frequency = hlp.get_estimation_window_and_frequency()
     population_windows = hlp.get_population_windows()
+    force_map_flag = hlp.get_force_map_flag()
 
     results_dir = hlp.get_results_dir(estimation_window, frequency, population_windows)
     results_dir.mkdir(parents=True, exist_ok=True)
@@ -101,6 +107,7 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
                          'the first date.')
     timeframe = (timeframe_start, timeframe_end)
 
-    main(ledgers, timeframe, estimation_window, frequency, population_windows, results_dir=results_dir)
+    main(ledgers, timeframe, estimation_window, frequency, population_windows,
+         force_map_flag, results_dir=results_dir)
 
     logging.info('Done. Please check the output directory for results.')
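With both flags promoted to top-level keys in config.yaml and force_map threaded through main() as a parameter, the entry point reduces to two single-key helper lookups. A minimal sketch of the resulting call pattern, using only the helpers and signature from this diff (the ledger name and dates below are placeholder values):

    import datetime

    import consensus_decentralization.helper as hlp
    from run import main

    # Each flag is now a single top-level key; the helpers raise ValueError
    # if it is missing from the config file.
    force_map = hlp.get_force_map_flag()    # reads config['force_map']
    clustering = hlp.get_clustering_flag()  # reads config['clustering']

    # force_map is passed to main() explicitly instead of being read inside
    # it, so tests can toggle it without mutating the shared config.
    main(['bitcoin'], (datetime.date(2018, 1, 1), datetime.date(2018, 12, 31)),
         estimation_window=30, frequency=30, population_windows=0,
         force_map=force_map)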
diff --git a/tests/test_analyze.py b/tests/test_analyze.py
index 5164125..ba854d6 100644
--- a/tests/test_analyze.py
+++ b/tests/test_analyze.py
@@ -1,5 +1,6 @@
 import shutil
 import pytest
+import csv
 
 from consensus_decentralization.helper import INTERIM_DIR, get_clustering_flag
 from consensus_decentralization.analyze import analyze
@@ -59,62 +60,49 @@ def test_analyze(setup_and_cleanup):
         population_windows=0
     )
 
-    metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
-    for metric in metrics:
-        output_file = test_output_dir / 'metrics' / f'{metric}.csv'
-        assert output_file.is_file()
+    output_file = test_output_dir / 'metrics' / 'output_clustered.csv'
+    assert output_file.is_file()
 
-        with open(output_file) as f:
-            lines = f.readlines()
-            assert lines[0] == 'timeframe,sample_bitcoin\n'
-            if metric == 'gini':
-                assert lines[1] == '2018,0.25\n'
-            elif metric == 'nakamoto_coefficient':
-                assert lines[1] == '2018,2\n'
-            elif metric == 'entropy=1':
-                assert lines[1] == '2018,1.836591668108979\n'
+    with open(output_file) as f:
+        reader = list(csv.reader(f))
+        header = reader[0]
+        # find metric column indices
+        gini_idx = header.index('gini')
+        nc_idx = header.index('nakamoto_coefficient')
+        ent_idx = header.index('entropy=1')
 
-    analyze(
-        projects=projects,
-        aggregated_data_filename='month_from_2018-02-01_to_2018-03-31.csv',
-        input_dir=test_output_dir,
-        output_dir=test_output_dir / 'metrics',
-        population_windows=0
-    )
-
-    metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
-    for metric in metrics:
-        output_file = test_output_dir / 'metrics' / f'{metric}.csv'
-        assert output_file.is_file()
-
-        with open(output_file) as f:
-            lines = f.readlines()
-            assert lines[0] == 'timeframe,sample_bitcoin\n'
-            if metric == 'gini':
-                assert lines[1] == 'Feb-2018,0.16666666666666666\n'
-                assert lines[2] == 'Mar-2018,0.0\n'
-            elif metric == 'nakamoto_coefficient':
-                assert lines[1] == 'Feb-2018,1\n'
-                assert lines[2] == 'Mar-2018,1\n'
-            elif metric == 'entropy=1':
-                assert lines[1] == 'Feb-2018,1.5\n'
-                assert lines[2] == 'Mar-2018,0.0\n'
+        # find the row for sample_bitcoin and 2018
+        data_row = None
+        for row in reader[1:]:
+            if row[0] == 'sample_bitcoin' and row[1] == '2018':
+                data_row = row
+                break
+        assert data_row is not None
+        assert data_row[gini_idx] == '0.25'
+        assert data_row[nc_idx] == '2'
+        assert data_row[ent_idx] == '1.836591668108979'
 
     analyze(
         projects=projects,
-        aggregated_data_filename='year_from_2010-01-01_to_2010-12-31.csv',
+        aggregated_data_filename='month_from_2018-02-01_to_2018-03-31.csv',
         input_dir=test_output_dir,
         output_dir=test_output_dir / 'metrics',
         population_windows=0
     )
 
-    metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
-    for metric in metrics:
-        output_file = test_output_dir / 'metrics' / f'{metric}.csv'
-        assert output_file.is_file()
+    output_file = test_output_dir / 'metrics' / 'output_clustered.csv'
+    assert output_file.is_file()
+    with open(output_file) as f:
+        reader = list(csv.reader(f))
+        header = reader[0]
+        gini_idx = header.index('gini')
+        nc_idx = header.index('nakamoto_coefficient')
+        ent_idx = header.index('entropy=1')
 
-        with open(output_file) as f:
-            lines = f.readlines()
-            assert len(lines) == 2
-            assert lines[0] == 'timeframe,sample_bitcoin\n'
-            assert lines[1] == '2010,\n'
+        rows_for_project = {row[1]: row for row in reader[1:] if row[0] == 'sample_bitcoin'}
+        assert rows_for_project['Feb-2018'][gini_idx] == '0.16666666666666666'
+        assert rows_for_project['Mar-2018'][gini_idx] == '0.0'
+        assert rows_for_project['Feb-2018'][nc_idx] == '1'
+        assert rows_for_project['Mar-2018'][nc_idx] == '1'
+        assert rows_for_project['Feb-2018'][ent_idx] == '1.5'
+        assert rows_for_project['Mar-2018'][ent_idx] == '0.0'
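Since the consolidated file now carries a header row ('ledger', 'date', 'clustering', followed by one column per metric), the manual header.index bookkeeping above could equally be written with csv.DictReader. A sketch of the equivalent lookup, assuming the same output_clustered.csv produced by analyze() (the bare filename stands in for the full test path):

    import csv

    # DictReader keys every row by the header line, so metric columns can be
    # addressed by name instead of by position.
    with open('output_clustered.csv') as f:
        rows = {(row['ledger'], row['date']): row for row in csv.DictReader(f)}

    # Values come back as strings, exactly as the assertions above expect.
    assert rows[('sample_bitcoin', 'Feb-2018')]['gini'] == '0.16666666666666666'
    assert rows[('sample_bitcoin', 'Mar-2018')]['nakamoto_coefficient'] == '1'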
diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py
index 9cad933..e3aa485 100644
--- a/tests/test_end_to_end.py
+++ b/tests/test_end_to_end.py
@@ -11,6 +11,7 @@ from consensus_decentralization.mappings.cardano_mapping import CardanoMapping
 from consensus_decentralization.helper import INTERIM_DIR, config
 
 import pytest
+import csv
 
 
 @pytest.fixture
@@ -28,9 +29,7 @@ def setup_and_cleanup():
     ledger_mapping['sample_cardano'] = CardanoMapping
     ledger_parser['sample_cardano'] = DummyParser
 
-    force_map_flag = config['execution_flags']['force_map']
-    config['execution_flags']['force_map'] = True
-    config['analyze_flags']['clustering'] = True
+    config['clustering'] = True
 
     mapping_info_dir = pathlib.Path(__file__).resolve().parent.parent / 'mapping_information'
     for project in ['bitcoin', 'cardano']:
@@ -72,49 +71,10 @@ def setup_and_cleanup():
         except FileNotFoundError:
             pass
 
-    config['execution_flags']['force_map'] = force_map_flag
-
 
 def test_end_to_end(setup_and_cleanup):
     test_output_dir, test_metrics_dir = setup_and_cleanup
 
-    main(
-        ['sample_bitcoin', 'sample_cardano'],
-        (datetime.date(2010, 1, 1), datetime.date(2010, 12, 31)),
-        estimation_window=None,
-        frequency=None,
-        interim_dir=test_output_dir,
-        results_dir=test_output_dir,
-        population_windows=0
-    )
-
-    expected_entropy = [
-        'timeframe,sample_bitcoin,sample_cardano\n',
-        '2010-07-02,,\n'
-    ]
-    with open(test_metrics_dir / 'entropy=1.csv') as f:
-        lines = f.readlines()
-        for idx, line in enumerate(lines):
-            assert line == expected_entropy[idx]
-
-    expected_gini = [
-        'timeframe,sample_bitcoin,sample_cardano\n',
-        '2010-07-02,,\n'
-    ]
-    with open(test_metrics_dir / 'gini.csv') as f:
-        lines = f.readlines()
-        for idx, line in enumerate(lines):
-            assert line == expected_gini[idx]
-
-    expected_nc = [
-        'timeframe,sample_bitcoin,sample_cardano\n',
-        '2010-07-02,,\n'
-    ]
-    with open(test_metrics_dir / 'nakamoto_coefficient.csv') as f:
-        lines = f.readlines()
-        for idx, line in enumerate(lines):
-            assert line == expected_nc[idx]
-
     main(
         ['sample_bitcoin', 'sample_cardano'],
         (datetime.date(2018, 2, 1), datetime.date(2018, 3, 31)),
@@ -122,19 +82,10 @@ def test_end_to_end(setup_and_cleanup):
         frequency=30,
         interim_dir=test_output_dir,
         results_dir=test_output_dir,
-        population_windows=0
+        population_windows=0,
+        force_map=True
     )
 
-    expected_entropy = [
-        'timeframe,sample_bitcoin,sample_cardano\n',
-        '2018-02-15,1.5,\n',
-        '2018-03-17,0.0,\n',
-    ]
-    with open(test_metrics_dir / 'entropy=1.csv') as f:
-        lines = f.readlines()
-        for idx, line in enumerate(lines):
-            assert line == expected_entropy[idx]
-
     # todo fix test (remake calculations from sample files given the new window/frequency)
     # expected_gini = [
     #     'timeframe,sample_bitcoin,sample_cardano\n',
     #     '2018-02-15,0.0,\n',
     #     '2018-03-17,0.0,\n',
     # ]
     # with open(test_metrics_dir / 'gini.csv') as f:
     #     lines = f.readlines()
     #     for idx, line in enumerate(lines):
     #         assert line == expected_gini[idx]
 
-    expected_nc = [
-        'timeframe,sample_bitcoin,sample_cardano\n',
-        '2018-02-15,1,\n', '2018-03-17,1,\n'
-    ]
-    with open(test_metrics_dir / 'nakamoto_coefficient.csv') as f:
-        lines = f.readlines()
-        for idx, line in enumerate(lines):
-            assert line == expected_nc[idx]
+    output_file = test_metrics_dir / 'output_clustered.csv'
+    assert output_file.is_file()
+    with open(output_file) as f:
+        rows = list(csv.reader(f))
+        header = rows[0]
+        ent_idx = header.index('entropy=1')
+        nc_idx = header.index('nakamoto_coefficient')
+
+        # build mapping ledger+date -> row
+        row_map = {(r[0], r[1]): r for r in rows[1:]}
+        assert row_map[('sample_bitcoin', '2018-02-15')][ent_idx] == '1.5'
+        assert row_map[('sample_bitcoin', '2018-02-15')][nc_idx] == '1'
 
     main(
         ['sample_bitcoin', 'sample_cardano'],
         (datetime.date(2020, 1, 1), datetime.date(2020, 12, 31)),
@@ -162,32 +117,19 @@ def test_end_to_end(setup_and_cleanup):
         frequency=31,
         interim_dir=test_output_dir,
         results_dir=test_output_dir,
-        population_windows=0
+        population_windows=0,
+        force_map=True
     )
 
-    expected_entropy = [
-        'timeframe,sample_bitcoin,sample_cardano\n',
-        '2020-12-16,,1.9219280948873623\n'
-    ]
-    with open(test_metrics_dir / 'entropy=1.csv') as f:
-        lines = f.readlines()
-        for idx, line in enumerate(lines):
-            assert line == expected_entropy[idx]
-
-    expected_gini = [
-        'timeframe,sample_bitcoin,sample_cardano\n',
-        '2020-12-16,,0.15\n'
-    ]
-    with open(test_metrics_dir / 'gini.csv') as f:
-        lines = f.readlines()
-        for idx, line in enumerate(lines):
-            assert line == expected_gini[idx]
-
-    expected_nc = [
-        'timeframe,sample_bitcoin,sample_cardano\n',
-        '2020-12-16,,2\n'
-    ]
-    with open(test_metrics_dir / 'nakamoto_coefficient.csv') as f:
-        lines = f.readlines()
-        for idx, line in enumerate(lines):
-            assert line == expected_nc[idx]
+    output_file = test_metrics_dir / 'output_clustered.csv'
+    assert output_file.is_file()
+    with open(output_file) as f:
+        rows = list(csv.reader(f))
+        header = rows[0]
+        ent_idx = header.index('entropy=1')
+        gini_idx = header.index('gini')
+        nc_idx = header.index('nakamoto_coefficient')
+        row_map = {(r[0], r[1]): r for r in rows[1:]}
+        assert row_map[('sample_cardano', '2020-12-16')][ent_idx] == '1.9219280948873623'
+        assert row_map[('sample_cardano', '2020-12-16')][gini_idx] == '0.15'
+        assert row_map[('sample_cardano', '2020-12-16')][nc_idx] == '2'
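The new get_output_filename helper mirrors the existing get_mapped_data_filename naming scheme, so both artifacts encode the clustering mode the same way. An illustrative sanity check of the pairing, using only the helpers shown in this diff:

    from consensus_decentralization.helper import (
        get_mapped_data_filename,
        get_output_filename,
    )

    # Both filenames branch on the same clustering flag, so a clustered run
    # always pairs mapped_data_clustered.json with output_clustered.csv.
    assert get_mapped_data_filename(True) == 'mapped_data_clustered.json'
    assert get_output_filename(True) == 'output_clustered.csv'
    assert get_output_filename(False) == 'output_non_clustered.csv'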