Skip to content

Commit 38fab52

Browse files
Merge pull request #77 from Blockchain-Technology-Lab/output_directory
Restructure output directory
2 parents 0ddc4e0 + a50edda commit 38fab52

File tree

3 files changed

+78
-84
lines changed

3 files changed

+78
-84
lines changed

tests/test_helper.py

+29-47
Original file line numberDiff line numberDiff line change
@@ -68,10 +68,9 @@ def test_input_directories():
6868
assert len(input_dirs) > 0
6969

7070

71-
def test_output_directories():
72-
output_dirs = hlp.get_output_directories()
73-
assert isinstance(output_dirs, list)
74-
assert len(output_dirs) > 0
71+
def test_output_directory():
72+
output_dir = hlp.get_output_directory()
73+
assert isinstance(output_dir, pathlib.Path)
7574

7675

7776
def test_tau_thresholds():
@@ -301,13 +300,6 @@ def test_get_circulation_from_entries():
301300
assert circulation == 21
302301

303302

304-
def test_get_output_files(mocker):
305-
get_config_mock = mocker.patch("tokenomics_decentralization.helper.get_output_directories")
306-
get_config_mock.return_value = [pathlib.Path(__file__).resolve().parent]
307-
output_files = hlp.get_output_files()
308-
assert isinstance(output_files, list)
309-
310-
311303
def test_get_special_addresses():
312304
ethereum_special_addresses = hlp.get_special_addresses('ethereum')
313305
assert isinstance(ethereum_special_addresses, list)
@@ -395,63 +387,53 @@ def test_get_output_row(mocker):
395387
assert csv_row == ['bitcoin', '2010-01-01', False, True, 'absolute', 1, False, True, 1, 0]
396388

397389

398-
def test_write_csv_output(mocker):
399-
get_metrics_mock = mocker.patch('tokenomics_decentralization.helper.get_metrics')
400-
get_metrics_mock.return_value = ['hhi']
401-
402-
get_output_directories_mock = mocker.patch('tokenomics_decentralization.helper.get_output_directories')
403-
get_output_directories_mock.return_value = [pathlib.Path(__file__).resolve().parent]
404-
405-
get_clustering_mock = mocker.patch('tokenomics_decentralization.helper.get_clustering_flag')
390+
def test_get_output_filename(mocker):
391+
get_output_directory_mock = mocker.patch('tokenomics_decentralization.helper.get_output_directory')
392+
get_output_directory_mock.return_value = pathlib.Path(__file__).resolve().parent
406393
get_exclude_contracts_mock = mocker.patch('tokenomics_decentralization.helper.get_exclude_contracts_flag')
407-
get_exclude_below_fees_mock = mocker.patch('tokenomics_decentralization.helper.get_exclude_below_fees_flag')
408-
get_exclude_below_usd_cent_mock = mocker.patch('tokenomics_decentralization.helper.get_exclude_below_usd_cent_flag')
409-
get_top_limit_type_mock = mocker.patch('tokenomics_decentralization.helper.get_top_limit_type')
410-
get_top_limit_value_mock = mocker.patch('tokenomics_decentralization.helper.get_top_limit_value')
411-
412-
get_clustering_mock.return_value = True
413394
get_exclude_contracts_mock.return_value = False
395+
get_exclude_below_fees_mock = mocker.patch('tokenomics_decentralization.helper.get_exclude_below_fees_flag')
414396
get_exclude_below_fees_mock.return_value = False
397+
get_exclude_below_usd_cent_mock = mocker.patch('tokenomics_decentralization.helper.get_exclude_below_usd_cent_flag')
415398
get_exclude_below_usd_cent_mock.return_value = False
399+
get_top_limit_type_mock = mocker.patch('tokenomics_decentralization.helper.get_top_limit_type')
416400
get_top_limit_type_mock.return_value = 'absolute'
401+
get_top_limit_value_mock = mocker.patch('tokenomics_decentralization.helper.get_top_limit_value')
417402
get_top_limit_value_mock.return_value = 0
418403

419-
hlp.write_csv_output([
420-
['bitcoin', '2010-01-01', True, False, 'absolute', 0, False, False, 100],
421-
['ethereum', '2010-01-01', True, False, 'absolute', 0, False, False, 200],
422-
])
423-
with open(pathlib.Path(__file__).resolve().parent / 'output.csv') as f:
424-
lines = f.readlines()
425-
assert lines[0] == ','.join(['ledger', 'snapshot_date', 'clustering', 'exclude_contract_addresses',
426-
'top_limit_type', 'top_limit_value', 'exclude_below_fees',
427-
'exclude_below_usd_cent', 'hhi']) + '\n'
428-
assert lines[1] == ','.join(['bitcoin', '2010-01-01', 'True', 'False', 'absolute', '0', 'False', 'False',
429-
'100']) + '\n'
430-
assert lines[2] == ','.join(['ethereum', '2010-01-01', 'True', 'False', 'absolute', '0', 'False', 'False',
431-
'200']) + '\n'
432-
os.remove(pathlib.Path(__file__).resolve().parent / 'output.csv')
404+
output_filename = hlp.get_output_filename()
405+
assert output_filename == pathlib.Path(__file__).resolve().parent / 'output.csv'
433406

434-
get_clustering_mock.return_value = False
435407
get_exclude_contracts_mock.return_value = True
436408
get_exclude_below_fees_mock.return_value = True
437409
get_exclude_below_usd_cent_mock.return_value = True
438-
get_top_limit_type_mock.return_value = 'absolute'
439410
get_top_limit_value_mock.return_value = 10
440411

412+
output_filename = hlp.get_output_filename()
413+
assert output_filename == pathlib.Path(__file__).resolve().parent / 'output-exclude_contract_addresses-absolute_10-exclude_below_fees-exclude_below_usd_cent.csv'
414+
415+
416+
def test_write_csv_output(mocker):
417+
get_metrics_mock = mocker.patch('tokenomics_decentralization.helper.get_metrics')
418+
get_metrics_mock.return_value = ['hhi']
419+
420+
get_output_filename_mock = mocker.patch('tokenomics_decentralization.helper.get_output_filename')
421+
get_output_filename_mock.return_value = pathlib.Path(__file__).resolve().parent / 'output.csv'
422+
441423
hlp.write_csv_output([
442-
['bitcoin', '2010-01-01', False, False, 'absolute', 0, False, False, 100],
443-
['ethereum', '2010-01-01', False, False, 'absolute', 0, False, False, 200],
424+
['bitcoin', '2010-01-01', True, False, 'absolute', 0, False, False, 100],
425+
['ethereum', '2010-01-01', True, False, 'absolute', 0, False, False, 200],
444426
])
445-
with open(pathlib.Path(__file__).resolve().parent / 'output-no_clustering-exclude_contract_addresses-absolute_10-exclude_below_fees-exclude_below_usd_cent.csv') as f:
427+
with open(pathlib.Path(__file__).resolve().parent / 'output.csv') as f:
446428
lines = f.readlines()
447429
assert lines[0] == ','.join(['ledger', 'snapshot_date', 'clustering', 'exclude_contract_addresses',
448430
'top_limit_type', 'top_limit_value', 'exclude_below_fees',
449431
'exclude_below_usd_cent', 'hhi']) + '\n'
450-
assert lines[1] == ','.join(['bitcoin', '2010-01-01', 'False', 'False', 'absolute', '0', 'False', 'False',
432+
assert lines[1] == ','.join(['bitcoin', '2010-01-01', 'True', 'False', 'absolute', '0', 'False', 'False',
451433
'100']) + '\n'
452-
assert lines[2] == ','.join(['ethereum', '2010-01-01', 'False', 'False', 'absolute', '0', 'False', 'False',
434+
assert lines[2] == ','.join(['ethereum', '2010-01-01', 'True', 'False', 'absolute', '0', 'False', 'False',
453435
'200']) + '\n'
454-
os.remove(pathlib.Path(__file__).resolve().parent / 'output-no_clustering-exclude_contract_addresses-absolute_10-exclude_below_fees-exclude_below_usd_cent.csv')
436+
os.remove(pathlib.Path(__file__).resolve().parent / 'output.csv')
455437

456438

457439
def test_get_active_source_keywords(mocker):

tokenomics_decentralization/analyze.py

+29-14
Original file line numberDiff line numberDiff line change
@@ -122,20 +122,35 @@ def analyze_ledger_snapshot(ledger, date, output_rows, sema):
122122
:param output_rows: a list of strings in the form of csv output rows
123123
:param sema: a multiprocessing semaphore
124124
"""
125-
input_filename = None
126-
input_paths = [input_dir / f'{ledger}_{date}_raw_data.csv' for input_dir in hlp.get_input_directories()]
127-
for filename in input_paths:
128-
if os.path.isfile(filename):
129-
input_filename = filename
130-
break
131-
if input_filename:
132-
logging.info(f'[*] {ledger} - {date}')
133-
134-
entries = get_entries(ledger, date, filename)
135-
metrics_values = analyze_snapshot(entries)
136-
del entries
137-
138-
row = hlp.get_output_row(ledger, date, metrics_values)
125+
row = None
126+
127+
try:
128+
with open(hlp.get_output_filename()) as f:
129+
csv_reader = csv.reader(f)
130+
for line in csv_reader:
131+
if line[0] == ledger and line[1] == date:
132+
row = line
133+
break
134+
except FileNotFoundError:
135+
pass
136+
137+
if not row:
138+
input_filename = None
139+
input_paths = [input_dir / f'{ledger}_{date}_raw_data.csv' for input_dir in hlp.get_input_directories()]
140+
for filename in input_paths:
141+
if os.path.isfile(filename):
142+
input_filename = filename
143+
break
144+
if input_filename:
145+
logging.info(f'[*] {ledger} - {date}')
146+
147+
entries = get_entries(ledger, date, filename)
148+
metrics_values = analyze_snapshot(entries)
149+
del entries
150+
151+
row = hlp.get_output_row(ledger, date, metrics_values)
152+
153+
if row:
139154
output_rows.append(row)
140155

141156
sema.release() # Release the semaphore s.t. the loop in analyze() can continue

tokenomics_decentralization/helper.py

+20-23
Original file line numberDiff line numberDiff line change
@@ -141,13 +141,16 @@ def increment_date(date, by):
141141
raise ValueError(f'Invalid granularity: {by}')
142142

143143

144-
def get_output_directories():
144+
def get_output_directory():
145145
"""
146146
Reads the config file and retrieves the output directory
147147
:returns: a pathlib path of the directory where output files are written (the configured base output directory joined with a subdirectory named after the active clustering sources, or 'No clustering')
148148
"""
149149
config = get_config_data()
150-
return [pathlib.Path(db_dir).resolve() for db_dir in config['output_directories']]
150+
sources = ' - '.join(get_active_source_keywords())
151+
if not sources:
152+
sources = 'No clustering'
153+
return [pathlib.Path(db_dir).resolve() for db_dir in config['output_directories']][0] / sources
151154

152155

153156
def get_input_directories():
@@ -348,15 +351,6 @@ def get_plot_config_data():
348351
return get_config_data()['plot_parameters']
349352

350353

351-
def get_output_files():
352-
"""
353-
Retrieves all output files produced by some run
354-
:returns: a list of filenames
355-
"""
356-
output_dir = str(get_output_directories()[0])
357-
return [filename for filename in os.listdir(output_dir) if filename.startswith('output') and filename.endswith('.csv')]
358-
359-
360354
def get_special_addresses(ledger):
361355
"""
362356
Retrieves the ledger's special addresses that should be excluded from the analysis
@@ -485,24 +479,17 @@ def get_output_row(ledger, date, metrics):
485479
return csv_row
486480

487481

488-
def write_csv_output(output_rows):
482+
def get_output_filename():
489483
"""
490-
Produces the output csv file for the given data.
491-
:param output_rows: a list of lists, where each list corresponds to a line in the output csv file
484+
Produces the name (full path) of the output file.
485+
:returns output_filename: a pathlib path of the output file
492486
"""
493-
header = ['ledger', 'snapshot_date', 'clustering', 'exclude_contract_addresses', 'top_limit_type',
494-
'top_limit_value', 'exclude_below_fees', 'exclude_below_usd_cent']
495-
header += get_metrics()
496-
497-
clustering = get_clustering_flag()
498487
exclude_contract_addresses_flag = get_exclude_contracts_flag()
499488
top_limit_type = get_top_limit_type()
500489
top_limit_value = get_top_limit_value()
501490
exclude_below_fees_flag = get_exclude_below_fees_flag()
502491
exclude_below_usd_cent_flag = get_exclude_below_usd_cent_flag()
503492
output_filename = 'output'
504-
if not clustering:
505-
output_filename += '-no_clustering'
506493
if exclude_contract_addresses_flag:
507494
output_filename += '-exclude_contract_addresses'
508495
if top_limit_value:
@@ -512,9 +499,19 @@ def write_csv_output(output_rows):
512499
if exclude_below_usd_cent_flag:
513500
output_filename += '-exclude_below_usd_cent'
514501
output_filename += '.csv'
502+
return get_output_directory() / output_filename
503+
504+
505+
def write_csv_output(output_rows):
506+
"""
507+
Produces the output csv file for the given data.
508+
:param output_rows: a list of lists, where each list corresponds to a line in the output csv file
509+
"""
510+
header = ['ledger', 'snapshot_date', 'clustering', 'exclude_contract_addresses', 'top_limit_type',
511+
'top_limit_value', 'exclude_below_fees', 'exclude_below_usd_cent']
512+
header += get_metrics()
515513

516-
output_dir = get_output_directories()[0]
517-
with open(output_dir / output_filename, 'w') as f:
514+
with open(get_output_filename(), 'w') as f:
518515
csv_writer = csv.writer(f)
519516
csv_writer.writerow(header)
520517
csv_writer.writerows(output_rows)

0 commit comments

Comments
 (0)