77import logging
88import secrets
99from dataclasses import dataclass
10- import tempfile
10+ import shutil
1111
1212# allows specifying explicit variable types
1313from typing import Dict , List , Tuple , Sequence , Union
4343from schematic_db .rdb .synapse_database import SynapseDatabase
4444
4545
46- from schematic .utils .df_utils import update_df , load_df , col_in_dataframe , populate_df_col_with_another_col
46+ from schematic .utils .df_utils import update_df , load_df , col_in_dataframe
4747from schematic .utils .validate_utils import comma_separated_list_regex , rule_in_rule_list
48- from schematic .utils .general import entity_type_mapping , get_dir_size , convert_size , convert_gb_to_bytes , create_temp_folder
48+ from schematic .utils .general import entity_type_mapping , get_dir_size , convert_gb_to_bytes , create_temp_folder , check_synapse_cache_size , clear_synapse_cache
4949from schematic .schemas .explorer import SchemaExplorer
5050from schematic .schemas .generator import SchemaGenerator
5151from schematic .store .base import BaseStorage
5252from schematic .exceptions import MissingConfigValueError , AccessCredentialsError
5353
5454from schematic .configuration .configuration import CONFIG
5555
56- from schematic .utils .general import profile
56+ from schematic .utils .general import profile , calculate_datetime
5757
5858logger = logging .getLogger ("Synapse storage" )
5959
@@ -75,12 +75,16 @@ def _download_manifest_to_folder(self) -> File:
7575 """
7676 if "SECRETS_MANAGER_SECRETS" in os .environ :
7777 temporary_manifest_storage = "/var/tmp/temp_manifest_download"
78+ # clear out all the existing manifests
79+ if os .path .exists (temporary_manifest_storage ):
80+ shutil .rmtree (temporary_manifest_storage )
81+ # create a new directory to store manifest
7882 if not os .path .exists (temporary_manifest_storage ):
79- os .mkdir ("/var/tmp/temp_manifest_download" )
83+ os .mkdir (temporary_manifest_storage )
84+ # create temporary folders for storing manifests
8085 download_location = create_temp_folder (temporary_manifest_storage )
8186 else :
8287 download_location = CONFIG .manifest_folder
83-
8488 manifest_data = self .syn .get (
8589 self .manifest_id ,
8690 downloadLocation = download_location ,
@@ -177,41 +181,34 @@ def __init__(
177181 Typical usage example:
178182 syn_store = SynapseStorage()
179183 """
180-
184+ # TODO: turn root_synapse_cache to a parameter in init
181185 self .syn = self .login (token , access_token )
182186 self .project_scope = project_scope
183187 self .storageFileview = CONFIG .synapse_master_fileview_id
184188 self .manifest = CONFIG .synapse_manifest_basename
189+ self .root_synapse_cache = "/root/.synapseCache"
185190 self ._query_fileview ()
186191
def _purge_synapse_cache(self, maximum_storage_allowed_cache_gb: float = 1) -> None:
    """
    Purge synapse cache if it exceeds a certain size. Default to 1GB.

    Nothing happens when ``self.root_synapse_cache`` does not exist on disk.

    Args:
        maximum_storage_allowed_cache_gb: the maximum storage (in GB) allowed
            before purging cache. Default is 1 GB.
    """
    # nothing to measure or purge if the cache directory is absent
    if not os.path.exists(self.root_synapse_cache):
        return

    maximum_storage_allowed_cache_bytes = convert_gb_to_bytes(maximum_storage_allowed_cache_gb)
    dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)

    # once the configured limit has been reached, clear the cache
    # NOTE(review): minutes=15 presumably limits the purge to entries older
    # than 15 minutes -- confirm against clear_synapse_cache
    if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
        num_of_deleted_files = clear_synapse_cache(self.syn.cache, minutes=15)
        logger.info(f'{num_of_deleted_files} files have been deleted from {self.root_synapse_cache} ')
        return

    # on AWS, OS takes around 14-17% of our ephemeral storage (20GiB)
    # instead of guessing how much space is left, report the size of .synapseCache
    # the directory walk is only done here, where its result is actually logged
    nbytes = get_dir_size(self.root_synapse_cache)
    logger.info(f'the total size of .synapseCache is: {nbytes} bytes ')
215212
216213 def _query_fileview (self ):
217214 self ._purge_synapse_cache ()
0 commit comments