Skip to content

Commit d861295

Browse files
authored
Merge pull request #1273 from Sage-Bionetworks/develop
Schematic 23.8.1
2 parents f382831 + 388584f commit d861295

File tree

11 files changed

+232
-79
lines changed

11 files changed

+232
-79
lines changed

.github/workflows/docker_build.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ jobs:
2525
- name: Checkout repository
2626
uses: actions/checkout@v2
2727

28+
- name: Set env variable for version tag
29+
run: echo "RELEASE_VERSION=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
30+
2831
- name: Log in to the Container registry
2932
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
3033
with:
@@ -41,10 +44,14 @@ jobs:
4144
type=ref,event=branch
4245
type=ref,event=pr
4346
type=semver,pattern={{raw}}
47+
4448
- name: Build and push Docker image
4549
uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
4650
with:
4751
file: schematic_api/Dockerfile
4852
push: true
4953
tags: ${{ steps.meta.outputs.tags }}
50-
labels: ${{ steps.meta.outputs.labels }}
54+
labels: ${{ steps.meta.outputs.labels }}
55+
build-args: |
56+
TAG=${{ env.RELEASE_VERSION }}
57+

.github/workflows/publish.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,8 @@ jobs:
113113
# See also: https://api.slack.com/methods/chat.postMessage#channels
114114
# You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
115115
# ibc-fair-data channel and data-curator-schematic channel
116-
channel-id: 'C01HSSMPQBG,C01ANC02U59'
116+
channel-id: 'C050YD75QRL,C01ANC02U59'
117117
# For posting a simple plain text message
118118
slack-message: "Schematic has just been released. Check out new version: ${{ github.ref_name }}"
119119
env:
120-
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
120+
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ This command will install the dependencies based on what we specify in poetry.lo
7575
*Note*: If you won't interact with Synapse, please ignore this section.
7676

7777
There are two main configuration files that need to be edited:
78-
[config.yml](https://github.com/Sage-Bionetworks/schematic/blob/develop/config.yml)
78+
config.yml
7979
and [synapseConfig](https://raw.githubusercontent.com/Sage-Bionetworks/synapsePythonClient/v2.3.0-rc/synapseclient/.synapseConfig)
8080

8181
<strong>Configure .synapseConfig File</strong>

pyproject.toml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,9 @@ markers = [
126126
Google credentials (skipped on GitHub CI) \
127127
""",
128128
"""\
129+
not_windows: tests that don't work on windows machines
130+
""",
131+
"""\
129132
schematic_api: marks tests covering \
130133
API functionality (skipped on regular GitHub CI test suite)
131134
""",
@@ -143,4 +146,4 @@ markers = [
143146
rule_benchmark: marks tests covering \
144147
validation rule benchmarking
145148
"""
146-
]
149+
]

schematic/manifest/generator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1600,7 +1600,7 @@ def _get_end_columns(self, current_schema_headers, existing_manifest_headers, ou
16001600
end_columns.append(id_name)
16011601

16021602
# Add entity_id to the end columns if it should be there but isn't
1603-
if 'entityId' in (current_schema_headers or existing_manfiest_headers) and 'entityId' not in end_columns:
1603+
if 'entityId' in (current_schema_headers or existing_manifest_headers) and 'entityId' not in end_columns:
16041604
end_columns.append('entityId')
16051605
return end_columns
16061606

@@ -1621,7 +1621,7 @@ def _update_dataframe_with_existing_df(self, empty_manifest_url: str, existing_d
16211621

16221622
# Get headers for the current schema and existing manifest df.
16231623
current_schema_headers = list(self.get_dataframe_by_url(manifest_url=empty_manifest_url).columns)
1624-
existing_manfiest_headers = list(existing_df.columns)
1624+
existing_manifest_headers = list(existing_df.columns)
16251625

16261626
# Find columns that exist in the current schema, but are not in the manifest being downloaded.
16271627
new_columns = self._get_missing_columns(current_schema_headers, existing_manifest_headers)

schematic/store/synapse.py

Lines changed: 25 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import logging
88
import secrets
99
from dataclasses import dataclass
10-
import tempfile
10+
import shutil
1111

1212
# allows specifying explicit variable types
1313
from typing import Dict, List, Tuple, Sequence, Union
@@ -43,17 +43,17 @@
4343
from schematic_db.rdb.synapse_database import SynapseDatabase
4444

4545

46-
from schematic.utils.df_utils import update_df, load_df, col_in_dataframe, populate_df_col_with_another_col
46+
from schematic.utils.df_utils import update_df, load_df, col_in_dataframe
4747
from schematic.utils.validate_utils import comma_separated_list_regex, rule_in_rule_list
48-
from schematic.utils.general import entity_type_mapping, get_dir_size, convert_size, convert_gb_to_bytes, create_temp_folder
48+
from schematic.utils.general import entity_type_mapping, get_dir_size, convert_gb_to_bytes, create_temp_folder, check_synapse_cache_size, clear_synapse_cache
4949
from schematic.schemas.explorer import SchemaExplorer
5050
from schematic.schemas.generator import SchemaGenerator
5151
from schematic.store.base import BaseStorage
5252
from schematic.exceptions import MissingConfigValueError, AccessCredentialsError
5353

5454
from schematic.configuration.configuration import CONFIG
5555

56-
from schematic.utils.general import profile
56+
from schematic.utils.general import profile, calculate_datetime
5757

5858
logger = logging.getLogger("Synapse storage")
5959

@@ -75,12 +75,16 @@ def _download_manifest_to_folder(self) -> File:
7575
"""
7676
if "SECRETS_MANAGER_SECRETS" in os.environ:
7777
temporary_manifest_storage = "/var/tmp/temp_manifest_download"
78+
# clear out all the existing manifests
79+
if os.path.exists(temporary_manifest_storage):
80+
shutil.rmtree(temporary_manifest_storage)
81+
# create a new directory to store manifest
7882
if not os.path.exists(temporary_manifest_storage):
79-
os.mkdir("/var/tmp/temp_manifest_download")
83+
os.mkdir(temporary_manifest_storage)
84+
# create temporary folders for storing manifests
8085
download_location = create_temp_folder(temporary_manifest_storage)
8186
else:
8287
download_location=CONFIG.manifest_folder
83-
8488
manifest_data = self.syn.get(
8589
self.manifest_id,
8690
downloadLocation=download_location,
@@ -177,41 +181,34 @@ def __init__(
177181
Typical usage example:
178182
syn_store = SynapseStorage()
179183
"""
180-
184+
# TODO: turn root_synapse_cache to a parameter in init
181185
self.syn = self.login(token, access_token)
182186
self.project_scope = project_scope
183187
self.storageFileview = CONFIG.synapse_master_fileview_id
184188
self.manifest = CONFIG.synapse_manifest_basename
189+
self.root_synapse_cache = "/root/.synapseCache"
185190
self._query_fileview()
186191

187-
def _purge_synapse_cache(self, root_dir: str = "/var/www/.synapseCache/", maximum_storage_allowed_cache_gb=7):
192+
def _purge_synapse_cache(self, maximum_storage_allowed_cache_gb=1):
188193
"""
189-
Purge synapse cache if it exceeds 7GB
194+
Purge synapse cache if it exceeds a certain size. Default to 1GB.
190195
Args:
191-
root_dir: directory of the .synapseCache function
192-
maximum_storage_allowed_cache_gb: the maximum storage allowed before purging cache. Default is 7 GB.
193-
194-
Returns:
195-
if size of cache reaches a certain threshold (default is 7GB), return the number of files that get deleted
196-
otherwise, return the total remaining space (assuming total ephemeral storage is 20GB on AWS )
196+
maximum_storage_allowed_cache_gb: the maximum storage allowed before purging cache. Default is 1 GB.
197197
"""
198198
# try clearing the cache
199199
# scan a directory and check size of files
200-
cache = self.syn.cache
201-
if os.path.exists(root_dir):
200+
if os.path.exists(self.root_synapse_cache):
202201
maximum_storage_allowed_cache_bytes = convert_gb_to_bytes(maximum_storage_allowed_cache_gb)
203-
total_ephemeral_storag_gb = 20
204-
total_ephemeral_storage_bytes = convert_gb_to_bytes(total_ephemeral_storag_gb)
205-
nbytes = get_dir_size(root_dir)
206-
# if 7 GB has already been taken, purge cache before 15 min
207-
if nbytes >= maximum_storage_allowed_cache_bytes:
208-
minutes_earlier = datetime.strftime(datetime.utcnow()- timedelta(minutes = 15), '%s')
209-
num_of_deleted_files = cache.purge(before_date = int(minutes_earlier))
210-
logger.info(f'{num_of_deleted_files} number of files have been deleted from {root_dir}')
202+
nbytes = get_dir_size(self.root_synapse_cache)
203+
dir_size_bytes = check_synapse_cache_size(directory=self.root_synapse_cache)
204+
# if the cache has reached the maximum allowed size (1 GB by default), purge files cached more than 15 min ago
205+
if dir_size_bytes >= maximum_storage_allowed_cache_bytes:
206+
num_of_deleted_files = clear_synapse_cache(self.syn.cache, minutes=15)
207+
logger.info(f'{num_of_deleted_files} files have been deleted from {self.root_synapse_cache}')
211208
else:
212-
remaining_space = total_ephemeral_storage_bytes - nbytes
213-
converted_space = convert_size(remaining_space)
214-
logger.info(f'Estimated {remaining_space} bytes (which is approximately {converted_space}) remained in ephemeral storage after calculating size of .synapseCache excluding OS')
209+
# on AWS, OS takes around 14-17% of our ephemeral storage (20GiB)
210+
# instead of guessing how much space we have left, log the size of .synapseCache here
211+
logger.info(f'the total size of .synapseCache is: {nbytes} bytes')
215212

216213
def _query_fileview(self):
217214
self._purge_synapse_cache()

schematic/utils/general.py

Lines changed: 69 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,20 @@
11
# allows specifying explicit variable types
2-
from typing import Any, Dict, Optional, Text
3-
import os
4-
import math
52
import logging
3+
import math
4+
import os
65
import pstats
6+
import subprocess
7+
import tempfile
78
from cProfile import Profile
9+
from datetime import datetime, timedelta
810
from functools import wraps
9-
10-
import tempfile
11+
from typing import Union
1112

1213
from synapseclient.core.exceptions import SynapseHTTPError
13-
from synapseclient.table import EntityViewSchema
1414
from synapseclient.entity import File, Folder, Project
15+
from synapseclient.table import EntityViewSchema
16+
17+
import synapseclient.core.cache as cache
1518

1619
logger = logging.getLogger(__name__)
1720

@@ -57,24 +60,69 @@ def get_dir_size(path: str):
5760
total += get_dir_size(entry.path)
5861
return total
5962

63+
def calculate_datetime(minutes: int, input_date: datetime, before_or_after: str = "before") -> datetime:
    """Shift a datetime backwards or forwards by a number of minutes.

    Args:
        minutes (int): size of the shift, in minutes
        input_date (datetime): the datetime to shift
        before_or_after (str): direction of the shift. "before" (the default)
            subtracts the minutes from input_date; "after" adds them.

    Returns:
        datetime: input_date moved by the requested number of minutes

    Raises:
        ValueError: if before_or_after is neither "before" nor "after"
    """
    # reject an unknown direction up front so the arithmetic below only has two cases
    if before_or_after not in ("before", "after"):
        raise ValueError("Invalid value. Use either 'before' or 'after'.")

    offset = timedelta(minutes=minutes)
    return input_date - offset if before_or_after == "before" else input_date + offset
81+
82+
83+
def check_synapse_cache_size(directory='/root/.synapseCache')-> Union[float, int]:
    """Use the ``du -sh`` command to calculate the size of .synapseCache.

    Args:
        directory (str, optional): .synapseCache directory. Defaults to '/root/.synapseCache'

    Returns:
        float or integer: size of the directory in bytes, converted from the
            human-readable figure printed by ``du`` (so it is only as precise
            as that rounded figure)

    Raises:
        ValueError: if ``du`` printed nothing on stdout (e.g. the directory does
            not exist) or its size field could not be parsed
    """
    # Note: this command might fail for Windows users. But since it is primarily meant to run on AWS, it is fine.
    command = ['du', '-sh', directory]
    output = subprocess.run(command, capture_output=True).stdout.decode('utf-8')

    # du prints "<size>\t<path>"; keep only the size field
    size = output.split('\t')[0].strip()
    if not size:
        raise ValueError(f"Cannot determine the size of {directory}: du returned no output")

    # du -h uses binary units (powers of 1024), so every suffix is scaled the same way
    unit_multipliers = {
        "B": 1,
        "K": 1024,
        "M": 1024 ** 2,
        "G": 1024 ** 3,
        "T": 1024 ** 4,
        "P": 1024 ** 5,
    }
    suffix = size[-1].upper()
    if suffix in unit_multipliers:
        number, multiplier = size[:-1], unit_multipliers[suffix]
    else:
        # no unit suffix: du prints a bare number for very small sizes
        number, multiplier = size, 1

    try:
        # some locales print a decimal comma instead of a decimal point
        return float(number.replace(',', '.')) * multiplier
    except ValueError:
        raise ValueError(f"Cannot recognize the file size unit in du output: {size!r}") from None
112+
113+
def clear_synapse_cache(cache: cache.Cache, minutes: int) -> int:
    """Purge entries older than a cutoff from a synapseclient cache.

    Args:
        cache: an object of synapseclient Cache.
        minutes (int): age threshold in minutes. The cutoff handed to
            ``cache.purge`` is the current UTC time minus this many minutes.
    Returns:
        int: the value returned by ``cache.purge`` (the number of files that get deleted)
    """
    # the cutoff is a naive UTC datetime, passed to purge() as-is
    cutoff = calculate_datetime(minutes, datetime.utcnow(), "before")
    return cache.purge(before_date=cutoff)
78126

79127
def convert_gb_to_bytes(gb: int):
80128
"""convert gb to bytes
@@ -84,6 +132,7 @@ def convert_gb_to_bytes(gb: int):
84132
"""
85133
return gb * 1024 * 1024 * 1024
86134

135+
87136
def entity_type_mapping(syn, entity_id):
88137
"""
89138
Return the entity type of manifest

schematic_api/Dockerfile

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
FROM tiangolo/uwsgi-nginx-flask:python3.10
22

3+
# add version tag as a build argument
4+
ARG TAG
5+
36
# the environment variables defined here are the default
47
# and can be overwritten by docker run -e VARIABLE = XX
58
# or can be overwritten by .env when using docker compose
@@ -15,7 +18,8 @@ ENV PYTHONFAULTHANDLER=1 \
1518
APP_DIR=/app/app \
1619
ROOT=/ \
1720
UWSGI_INI=/app/uwsgi.ini \
18-
NGINX_WORKER_PROCESSES=1
21+
NGINX_WORKER_PROCESSES=1 \
22+
VERSION=$TAG
1923

2024
# Note:
2125
# The starting number of uWSGI processes is controlled by the variable UWSGI_CHEAPER, by default set to 2.
@@ -73,7 +77,9 @@ WORKDIR ${APP_DIR}
7377

7478
# copy other files to app/app
7579
# Note: run_api.py is not needed
76-
COPY ./pyproject.toml ./poetry.lock ./config.yml ./main.py ./
80+
81+
COPY ./pyproject.toml ./poetry.lock ./main.py ./
82+
COPY ./config_example.yml ./config.yml
7783
RUN poetry config virtualenvs.create false
7884
RUN poetry install --no-interaction --no-ansi --no-root --with aws
7985

schematic_api/api/openapi/api.yaml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1262,4 +1262,22 @@ paths:
12621262
Family History,FamilyHistory,TBD,False,True,," If Diagnosis is ""Cancer"" then ""Family History"" is required",Patient
12631263
tags:
12641264
- Visualization Operations
1265+
1266+
/version:
1267+
get:
1268+
summary: Get the version of schematic currently being used
1269+
description: >-
1270+
Get the version of schematic that is currently deployed and being used
1271+
operationId: schematic_api.api.routes.get_schematic_version
1272+
responses:
1273+
"200":
1274+
description: Returns a plain-text string containing the version of schematic.
1275+
content:
1276+
text/plain:
1277+
schema:
1278+
type: string
1279+
"500":
1280+
description: Schematic version was not able to be identified.
1281+
tags:
1282+
- Version
12651283

schematic_api/api/routes.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -560,10 +560,8 @@ def download_manifest(access_token, manifest_id, new_manifest_name='', as_json=T
560560
# call config_handler()
561561
config_handler()
562562

563-
# use Synapse Storage
564-
store = SynapseStorage(access_token=access_token)
565-
# try logging in to asset store
566-
syn = store.login(access_token=access_token)
563+
# use login method in synapse storage
564+
syn = SynapseStorage.login(access_token=access_token)
567565
try:
568566
md = ManifestDownload(syn, manifest_id)
569567
manifest_data = ManifestDownload.download_manifest(md, new_manifest_name)
@@ -817,3 +815,14 @@ def get_nodes_display_names(schema_url: str, node_list: list[str]) -> list:
817815
node_display_names = gen.get_nodes_display_names(node_list, mm_graph)
818816
return node_display_names
819817

818+
def get_schematic_version() -> str:
    """
    Return the current version of schematic

    The version is read from the VERSION environment variable, which the
    Dockerfile populates from the TAG build argument.

    Returns:
        str: the non-empty value of the VERSION environment variable

    Raises:
        NotImplementedError: if VERSION is not set, or is set but empty
            (which is what happens when the image is built without a TAG
            build argument)
    """
    version = os.environ.get("VERSION")
    # an empty VERSION carries no version information, so treat it like a missing one
    if not version:
        raise NotImplementedError(
            "Using this endpoint to check the version of schematic is only supported when the API is running in a docker container."
        )
    return version

0 commit comments

Comments
 (0)