14 changes: 7 additions & 7 deletions config.yaml
@@ -27,19 +27,19 @@ ledgers:
   - tezos
   - zcash
 
-# Execution flags
-execution_flags:
-  force_map: false
+# Flag that controls whether to force mapping or use already processed data
+# when available.
+force_map: false
 
-# Analyze flags
-analyze_flags:
-  clustering: true
+# Flag that determines whether to cluster block producers into entities that
+# control them.
+clustering: true
 
 # The timeframe for which an analysis should be performed.
 # Each date is a string of the form YYYY-MM-DD.
 timeframe:
   start_date: 2018-01-01
-  end_date: 2025-03-01
+  end_date: 2026-02-01
 
 # The number of days to use for the estimation window, i.e. how many days of blocks to use for each data point.
 # If left empty, then the entire time frame will be used (only valid when combined with empty frequency).
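The net effect of this hunk is a flatter config: force_map and clustering move out of the execution_flags and analyze_flags sections and become top-level keys, each keeping its explanatory comment. A minimal sketch of reading the flattened flags, assuming PyYAML and a config.yaml in the working directory (both assumptions; the key names come from the diff above):

# Minimal sketch, assuming PyYAML is installed and config.yaml is in the
# working directory. Key names match the new flattened layout above.
import yaml

with open('config.yaml') as f:
    config = yaml.safe_load(f)

force_map = config['force_map']    # was config['execution_flags']['force_map']
clustering = config['clustering']  # was config['analyze_flags']['clustering']
print(force_map, clustering)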
9 changes: 3 additions & 6 deletions consensus_decentralization/analyze.py
@@ -82,18 +82,15 @@ def analyze(projects, aggregated_data_filename, input_dir, output_dir, populatio
             csv_contents[metric_name][row_index + 1].append(result)
             aggregate_output[project][date][metric_name] = result
 
-    for metric in metric_names:
-        with open(output_dir / f'{metric}.csv', 'w') as f:
-            csv_writer = csv.writer(f)
-            csv_writer.writerows(csv_contents[metric])
-
     aggregate_csv_output = [['ledger', 'date', 'clustering'] + metric_names]
     for project, timeframes in aggregate_output.items():
         for date, results in timeframes.items():
             metric_values = [results[metric] for metric in metric_names]
             if any(metric_values):
                 aggregate_csv_output.append([project, date, clustering_flag] + metric_values)
-    with open(output_dir / 'output.csv', 'w') as f:
+
+    output_filename = hlp.get_output_filename(clustering_flag)
+    with open(output_dir / output_filename, 'w') as f:
         csv_writer = csv.writer(f)
         csv_writer.writerows(aggregate_csv_output)
 
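With the per-metric CSVs gone, analyze writes a single aggregate file whose rows combine all metrics for each (ledger, date) pair, and whose filename now comes from hlp.get_output_filename (added in helper.py below). A self-contained sketch of the row-building and filtering logic, with illustrative data:

# Sketch of the aggregate-row logic above; the data here is made up for
# illustration, only the shape matches the real output.
import csv

metric_names = ['gini', 'nakamoto_coefficient', 'entropy=1']
aggregate_output = {
    'bitcoin': {
        '2018-01-01': {'gini': 0.25, 'nakamoto_coefficient': 2, 'entropy=1': 1.84},
        '2010-01-01': {'gini': None, 'nakamoto_coefficient': None, 'entropy=1': None},
    },
}
rows = [['ledger', 'date', 'clustering'] + metric_names]
for project, timeframes in aggregate_output.items():
    for date, results in timeframes.items():
        metric_values = [results[m] for m in metric_names]
        if any(metric_values):  # drops rows where every metric is empty (the 2010 row)
            rows.append([project, date, True] + metric_values)

with open('output_clustered.csv', 'w', newline='') as f:
    csv.writer(f).writerows(rows)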
13 changes: 11 additions & 2 deletions consensus_decentralization/helper.py
@@ -428,7 +428,7 @@ def get_force_map_flag():
     """
     config = get_config_data()
     try:
-        return config['execution_flags']['force_map']
+        return config['force_map']
     except KeyError:
         raise ValueError('Flag "force_map" missing from config file')
 
@@ -441,7 +441,7 @@ def get_clustering_flag():
     """
     config = get_config_data()
     try:
-        return config['analyze_flags']['clustering']
+        return config['clustering']
     except KeyError:
         raise ValueError('Flag "clustering" missing from config file')
 
@@ -465,6 +465,15 @@ def get_mapped_data_filename(clustering_flag):
     return 'mapped_data_' + ('clustered' if clustering_flag else 'non_clustered') + '.json'
 
 
+def get_output_filename(clustering_flag):
+    """
+    Retrieves the filename of the output file
+    :param clustering_flag: boolean that determines whether clustering was performed
+    :returns: str with the filename of the output file, which depends on whether clustering was performed
+    """
+    return 'output_' + ('clustered' if clustering_flag else 'non_clustered') + '.csv'
+
+
 def get_input_directories():
     """
     Reads the config file and retrieves the directories to look for raw block data
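The new get_output_filename mirrors the existing get_mapped_data_filename naming scheme, so clustered and non-clustered runs write to different files instead of overwriting a shared output.csv. Expected behavior, read directly off the diff (the import path is an assumption based on the package layout seen in the tests):

# Usage sketch; return values follow the string templates in the diff above.
from consensus_decentralization import helper as hlp

assert hlp.get_output_filename(True) == 'output_clustered.csv'
assert hlp.get_output_filename(False) == 'output_non_clustered.csv'
assert hlp.get_mapped_data_filename(True) == 'mapped_data_clustered.json'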
21 changes: 14 additions & 7 deletions run.py
@@ -19,8 +19,8 @@ def process_data(force_map, ledger_dir, ledger, output_dir):
     return None
 
 
-def main(ledgers, timeframe, estimation_window, frequency, population_windows, interim_dir=hlp.INTERIM_DIR,
-         results_dir=hlp.RESULTS_DIR):
+def main(ledgers, timeframe, estimation_window, frequency, population_windows,
+         force_map, interim_dir=hlp.INTERIM_DIR, results_dir=hlp.RESULTS_DIR):
     """
     Executes the entire pipeline (parsing, mapping, analyzing) for some projects and timeframes.
     :param ledgers: list of strings that correspond to the ledgers whose data should be analyzed
@@ -30,13 +30,18 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
         timeframe will be considered.
     :param frequency: int or None. The number of days to consider for the frequency of the analysis (i.e. the number
         of days between each data point considered in the analysis). If None, only one data point will be considered,
-        spanning the entire timeframe (i.e. it needs to be combined with None estimation_window).
-    :param interim_dir: pathlib.PosixPath object of the directory where the output data will be saved
+        spanning the entire timeframe (i.e. it needs to be combined with None
+        estimation_window).
+    :param population_windows: int. The number of windows to look backwards and forwards to determine the population of
+        active block producers for a given time period.
+    :param force_map: bool. If True, then the mapping will be performed,
+        regardless of whether mapped data for the project already exist.
+    :param interim_dir: pathlib.PosixPath object of the directory where the
+        output data will be saved
     :param results_dir: pathlib.PosixPath object of the directory where the results will be saved
     """
     logging.info(f"The ledgers that will be analyzed are: {','.join(ledgers)}")
 
-    force_map = hlp.get_force_map_flag()
-
     for ledger in list(ledgers):
         ledger_dir = interim_dir / ledger
         ledger_dir.mkdir(parents=True, exist_ok=True)  # create ledger output directory if it doesn't already exist
@@ -89,6 +94,7 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
 
     estimation_window, frequency = hlp.get_estimation_window_and_frequency()
     population_windows = hlp.get_population_windows()
+    force_map_flag = hlp.get_force_map_flag()
 
     results_dir = hlp.get_results_dir(estimation_window, frequency, population_windows)
     results_dir.mkdir(parents=True, exist_ok=True)
@@ -101,6 +107,7 @@ def main(ledgers, timeframe, estimation_window, frequency, population_windows, i
             'the first date.')
     timeframe = (timeframe_start, timeframe_end)
 
-    main(ledgers, timeframe, estimation_window, frequency, population_windows, results_dir=results_dir)
+    main(ledgers, timeframe, estimation_window, frequency, population_windows,
+         force_map_flag, results_dir=results_dir)
 
     logging.info('Done. Please check the output directory for results.')
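The force_map flag is now read from the config once at the entry point and injected into main as an explicit parameter, so callers can override it without editing the config file. A hypothetical direct call under that new signature (importing run as a module and all argument values are illustrative assumptions; run.py itself builds the timeframe tuple from the config dates):

# Hypothetical call of the refactored main(); values are illustrative only.
from run import main

main(
    ledgers=['bitcoin'],
    timeframe=('2018-01-01', '2026-02-01'),  # run.py derives this from config.yaml
    estimation_window=30,
    frequency=30,
    population_windows=1,
    force_map=True,  # force re-mapping even if mapped data already exist
)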
84 changes: 36 additions & 48 deletions tests/test_analyze.py
@@ -1,5 +1,6 @@
 import shutil
 import pytest
+import csv
 from consensus_decentralization.helper import INTERIM_DIR, get_clustering_flag
 from consensus_decentralization.analyze import analyze
 
@@ -59,62 +60,49 @@ def test_analyze(setup_and_cleanup):
         population_windows=0
     )
 
-    metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
-    for metric in metrics:
-        output_file = test_output_dir / 'metrics' / f'{metric}.csv'
-        assert output_file.is_file()
+    output_file = test_output_dir / 'metrics' / 'output_clustered.csv'
+    assert output_file.is_file()
 
-        with open(output_file) as f:
-            lines = f.readlines()
-            assert lines[0] == 'timeframe,sample_bitcoin\n'
-            if metric == 'gini':
-                assert lines[1] == '2018,0.25\n'
-            elif metric == 'nakamoto_coefficient':
-                assert lines[1] == '2018,2\n'
-            elif metric == 'entropy=1':
-                assert lines[1] == '2018,1.836591668108979\n'
+    with open(output_file) as f:
+        reader = list(csv.reader(f))
+    header = reader[0]
+    # find metric column indices
+    gini_idx = header.index('gini')
+    nc_idx = header.index('nakamoto_coefficient')
+    ent_idx = header.index('entropy=1')
 
-    analyze(
-        projects=projects,
-        aggregated_data_filename='month_from_2018-02-01_to_2018-03-31.csv',
-        input_dir=test_output_dir,
-        output_dir=test_output_dir / 'metrics',
-        population_windows=0
-    )
-
-    metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
-    for metric in metrics:
-        output_file = test_output_dir / 'metrics' / f'{metric}.csv'
-        assert output_file.is_file()
-
-        with open(output_file) as f:
-            lines = f.readlines()
-            assert lines[0] == 'timeframe,sample_bitcoin\n'
-            if metric == 'gini':
-                assert lines[1] == 'Feb-2018,0.16666666666666666\n'
-                assert lines[2] == 'Mar-2018,0.0\n'
-            elif metric == 'nakamoto_coefficient':
-                assert lines[1] == 'Feb-2018,1\n'
-                assert lines[2] == 'Mar-2018,1\n'
-            elif metric == 'entropy=1':
-                assert lines[1] == 'Feb-2018,1.5\n'
-                assert lines[2] == 'Mar-2018,0.0\n'
+    # find the row for sample_bitcoin and 2018
+    data_row = None
+    for row in reader[1:]:
+        if row[0] == 'sample_bitcoin' and row[1] == '2018':
+            data_row = row
+            break
+    assert data_row is not None
+    assert data_row[gini_idx] == '0.25'
+    assert data_row[nc_idx] == '2'
+    assert data_row[ent_idx] == '1.836591668108979'
 
     analyze(
         projects=projects,
-        aggregated_data_filename='year_from_2010-01-01_to_2010-12-31.csv',
+        aggregated_data_filename='month_from_2018-02-01_to_2018-03-31.csv',
         input_dir=test_output_dir,
         output_dir=test_output_dir / 'metrics',
         population_windows=0
     )
 
-    metrics = ['gini', 'nakamoto_coefficient', 'entropy=1']
-    for metric in metrics:
-        output_file = test_output_dir / 'metrics' / f'{metric}.csv'
-        assert output_file.is_file()
+    output_file = test_output_dir / 'metrics' / 'output_clustered.csv'
+    assert output_file.is_file()
+    with open(output_file) as f:
+        reader = list(csv.reader(f))
+    header = reader[0]
+    gini_idx = header.index('gini')
+    nc_idx = header.index('nakamoto_coefficient')
+    ent_idx = header.index('entropy=1')
 
-    with open(output_file) as f:
-        lines = f.readlines()
-        assert len(lines) == 2
-        assert lines[0] == 'timeframe,sample_bitcoin\n'
-        assert lines[1] == '2010,\n'
+    rows_for_project = {row[1]: row for row in reader[1:] if row[0] == 'sample_bitcoin'}
+    assert rows_for_project['Feb-2018'][gini_idx] == '0.16666666666666666'
+    assert rows_for_project['Mar-2018'][gini_idx] == '0.0'
+    assert rows_for_project['Feb-2018'][nc_idx] == '1'
+    assert rows_for_project['Mar-2018'][nc_idx] == '1'
+    assert rows_for_project['Feb-2018'][ent_idx] == '1.5'
+    assert rows_for_project['Mar-2018'][ent_idx] == '0.0'
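The rewritten assertions parse the combined CSV and locate columns by header name rather than assuming one file per metric, so they survive column reordering. The lookup pattern in isolation, on made-up CSV content:

# Standalone sketch of the header-indexed lookup used in the test above;
# the CSV content here is fabricated for illustration.
import csv
import io

csv_text = (
    'ledger,date,clustering,gini,nakamoto_coefficient,entropy=1\n'
    'sample_bitcoin,2018,True,0.25,2,1.836591668108979\n'
)
reader = list(csv.reader(io.StringIO(csv_text)))
header = reader[0]
gini_idx = header.index('gini')  # position looked up by name, not hard-coded
rows = {row[1]: row for row in reader[1:] if row[0] == 'sample_bitcoin'}
assert rows['2018'][gini_idx] == '0.25'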