From 917fbe28db90199f16ca102e1091b8a79cced0eb Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Tue, 23 Jul 2024 09:32:17 -0400 Subject: [PATCH 01/20] doc: Document and refactor config module Signed-off-by: Dheshan Mohandass --- lochness/config/__init__.py | 101 ++++++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 39 deletions(-) diff --git a/lochness/config/__init__.py b/lochness/config/__init__.py index b39bea71..5097a3d7 100644 --- a/lochness/config/__init__.py +++ b/lochness/config/__init__.py @@ -1,52 +1,70 @@ +""" +Module to read Lochness configuration file and keyring file. +""" + +import getpass as gp +import logging import os +import string +from typing import Any, Dict + +import cryptease as crypt import yaml -import logging import yaml.reader -import getpass as gp -import cryptease as crypt -import string logger = logging.getLogger(__name__) -def load(f: 'location', archive_base=None): - '''load configuration file and keyring''' - logger.debug('loading configuration') +def load(path: str, archive_base=None) -> Dict[str, Any]: + """ + Load configuration file and keyring + + Uses passphrase from environment variable NRG_KEYRING_PASS if available. + Otherwise, prompts user for passphrase. 
+ + Args: + path (str): path to configuration file (yaml) + archive_base (str): path to the root of the archive - with open(os.path.expanduser(f), 'rb') as fp: - Lochness = _read_config_file(fp) + Returns: + Dict[str, Any]: configuration dictionary + """ + logger.debug("loading configuration") + Lochness = _read_config_file(path) if archive_base: - Lochness['phoenix_root'] = archive_base - if 'phoenix_root' not in Lochness: - raise ConfigError('need either --archive-base or ' - '\'phoenix_root\' in config file') - Lochness['phoenix_root'] = os.path.expanduser(Lochness['phoenix_root']) - Lochness['keyring_file'] = os.path.expanduser(Lochness['keyring_file']) + Lochness["phoenix_root"] = archive_base + if "phoenix_root" not in Lochness: + raise ConfigError( + "need either --archive-base or 'phoenix_root' in config file" + ) + Lochness["phoenix_root"] = os.path.expanduser(Lochness["phoenix_root"]) + Lochness["keyring_file"] = os.path.expanduser(Lochness["keyring_file"]) # box file pattern strings from the config to string template # regardless of the selected study in the args - if 'box' in Lochness: - for _, study_dict in Lochness['box'].items(): - for _, modality_values in study_dict['file_patterns'].items(): + if "box" in Lochness: + for _, study_dict in Lochness["box"].items(): + for _, modality_values in study_dict["file_patterns"].items(): for modality_dict in modality_values: - modality_dict['pattern'] = \ - string.Template(modality_dict['pattern']) + modality_dict["pattern"] = string.Template(modality_dict["pattern"]) - with open(Lochness['keyring_file'], 'rb') as fp: - logger.info('reading keyring file {0}'.format(Lochness['keyring_file'])) - if 'NRG_KEYRING_PASS' in os.environ: - load.passphrase = os.environ['NRG_KEYRING_PASS'] + with open(Lochness["keyring_file"], "rb") as fp: + logger.info(f"reading keyring file {Lochness["keyring_file"]}") + if "NRG_KEYRING_PASS" in os.environ: + load.passphrase = os.environ["NRG_KEYRING_PASS"] if load.passphrase is None: 
- load.passphrase = gp.getpass('enter passphrase: ') + load.passphrase = gp.getpass("enter passphrase: ") key = crypt.key_from_file(fp, load.passphrase) - content = b'' + content = b"" for chunk in crypt.decrypt(fp, key): content += chunk try: - Lochness['keyring'] = yaml.load(content, Loader=yaml.FullLoader) + Lochness["keyring"] = yaml.load(content, Loader=yaml.FullLoader) except yaml.reader.ReaderError: - raise KeyringError('could not decrypt keyring {0} (wrong passphrase?)'.format(Lochness['keyring_file'])) + raise KeyringError( + f"could not decrypt keyring {Lochness["keyring_file"]} (wrong passphrase?)" + ) return Lochness @@ -55,19 +73,24 @@ def load(f: 'location', archive_base=None): class KeyringError(Exception): - pass + """ + Generic keyring error. + """ -def _read_config_file(fp): - '''helper to read lochness configuration file''' - try: - cfg = yaml.load(fp.read(), Loader=yaml.FullLoader) - except Exception as e: - raise ConfigError('failed to parse {0} with error: {1}'.format(fp.name, e)) - return cfg +def _read_config_file(path: str) -> Dict[str, Any]: + """helper to read lochness configuration file""" - -class ConfigError(Exception): - pass + expanded_path = os.path.expanduser(path) + with open(expanded_path, "rb") as fp: + try: + cfg = yaml.load(fp.read(), Loader=yaml.FullLoader) + except Exception as e: + raise ConfigError(f"failed to parse {expanded_path} with error: {e}") + return cfg +class ConfigError(Exception): + """ + Malformed configuration file. 
+ """ From 7b4dafd6614f1321b3a6b1e346129708ba95f33d Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Tue, 23 Jul 2024 11:28:33 -0400 Subject: [PATCH 02/20] feat.: Add DB models Signed-off-by: Dheshan Mohandass --- lochness/db/__init__.py | 290 ++++++++++++++++++++++++++++ lochness/db/models/__init__.py | 69 +++++++ lochness/db/models/file_mappings.py | 99 ++++++++++ lochness/db/models/phoenix_files.py | 142 ++++++++++++++ lochness/db/models/remote_files.py | 112 +++++++++++ lochness/db/models/study.py | 54 ++++++ lochness/db/models/subject.py | 89 +++++++++ 7 files changed, 855 insertions(+) create mode 100644 lochness/db/__init__.py create mode 100644 lochness/db/models/__init__.py create mode 100644 lochness/db/models/file_mappings.py create mode 100644 lochness/db/models/phoenix_files.py create mode 100644 lochness/db/models/remote_files.py create mode 100644 lochness/db/models/study.py create mode 100644 lochness/db/models/subject.py diff --git a/lochness/db/__init__.py b/lochness/db/__init__.py new file mode 100644 index 00000000..846608d1 --- /dev/null +++ b/lochness/db/__init__.py @@ -0,0 +1,290 @@ +""" +Helper functions for interacting with a PostgreSQL database. +""" + +import json +import logging +import sys +from pathlib import Path +from typing import Callable, Dict, Literal, Optional, Any +import hashlib + +import pandas as pd +import psycopg2 +import sqlalchemy + +logger = logging.getLogger(__name__) + + +def compute_hash(file_path: Path, hash_type: str = "md5") -> str: + """ + Compute the hash digest of a file. + + Args: + file_path (Path): The path to the file. + hash_type (str, optional): The type of hash algorithm to use. Defaults to 'md5'. + + Returns: + str: The computed hash digest of the file. 
+ """ + with open(file_path, "rb") as file: + file_hash = hashlib.file_digest(file, hash_type) + hash_str = file_hash.hexdigest() + + return hash_str + + +def handle_null(query: str) -> str: + """ + Replaces all occurrences of the string 'NULL' with the SQL NULL keyword in the given query. + + Args: + query (str): The SQL query to modify. + + Returns: + str: The modified SQL query with 'NULL' replaced with NULL. + """ + query = query.replace("'NULL'", "NULL") + + return query + + +def handle_nan(query: str) -> str: + """ + Replaces all occurrences of the string 'nan' with the SQL NULL keyword in the given query. + + Args: + query (str): The SQL query to modify. + + Returns: + str: The modified SQL query with 'nan' replaced with NULL. + """ + query = query.replace("'nan'", "NULL") + + return query + + +def santize_string(string: str) -> str: + """ + Sanitizes a string by escaping single quotes. + + Args: + string (str): The string to sanitize. + + Returns: + str: The sanitized string. + """ + return string.replace("'", "''") + + +def sanitize_json(json_dict: dict) -> str: + """ + Sanitizes a JSON object by replacing single quotes with double quotes. + + Args: + json_dict (dict): The JSON object to sanitize. + + Returns: + str: The sanitized JSON object. + """ + for key, value in json_dict.items(): + if isinstance(value, str): + json_dict[key] = santize_string(value) + + json_str = json.dumps(json_dict, default=str) + + # Replace NaN with NULL + json_str = json_str.replace("NaN", "null") + + return json_str + + +def on_failure(): + """ + Exits the program with exit code 1. + """ + sys.exit(1) + + +def get_db_credentials(lochness_config: Dict[str, Any]) -> Dict[str, str]: + """ + Retrieves the database credentials from the configuration file. + + Args: + lochness_config: Dict[str, Any]: The Lochness configuration dictionary. + db (str, optional): The section of the configuration file to use. + Defaults to "postgresql". 
def execute_queries(
    lochness_config: Dict[str, Any],
    queries: list,
    show_commands=True,
    silent=False,
    on_failure: Optional[Callable] = on_failure,
) -> list:
    """
    Execute a list of SQL queries against the configured PostgreSQL database.

    Args:
        lochness_config (Dict[str, Any]): The Lochness configuration dictionary.
        queries (list): SQL statements to execute, in order.
        show_commands (bool, optional): Log each statement before running it.
            Defaults to True.
        silent (bool, optional): Suppress the summary log line. Defaults to False.
        on_failure (Optional[Callable]): Called on any database error; the
            default exits the process. Pass None to re-raise instead.

    Returns:
        list: fetchall() results for each statement that produced rows.
    """
    command = None
    output = []
    # BUG FIX: bind 'conn' before the try block; previously, if
    # psycopg2.connect() raised, the finally block referenced an
    # unbound name and raised NameError instead of the real error.
    conn = None

    try:
        credentials = get_db_credentials(lochness_config=lochness_config)
        conn = psycopg2.connect(**credentials)  # type: ignore
        cur = conn.cursor()

        def execute_query(query: str):
            # Log, run, and collect results for a single statement.
            if show_commands:
                logger.debug("Executing query:")
                logger.debug(f"[bold blue]{query}", extra={"markup": True})
            cur.execute(query)
            try:
                output.append(cur.fetchall())
            except psycopg2.ProgrammingError:
                # Statement produced no result set (DDL / INSERT / UPDATE).
                pass

        for command in queries:
            execute_query(command)

        cur.close()

        conn.commit()

        if not silent:
            logger.debug(
                f"[grey]Executed {len(queries)} SQL query(ies).", extra={"markup": True}
            )
    except (Exception, psycopg2.DatabaseError) as e:
        logger.error("[bold red]Error executing queries.", extra={"markup": True})
        if command is not None:
            logger.error(f"[red]For query: {command}", extra={"markup": True})
        logger.error(e)
        if on_failure is not None:
            on_failure()
        else:
            raise e
    finally:
        if conn is not None:
            conn.close()

    return output


def get_db_connection(lochness_config: Dict[str, Any]) -> sqlalchemy.engine.base.Engine:
    """
    Create a SQLAlchemy engine for the configured PostgreSQL database.

    Args:
        lochness_config (Dict[str, Any]): The Lochness configuration dictionary.

    Returns:
        sqlalchemy.engine.base.Engine: The database connection engine.
    """
    credentials = get_db_credentials(lochness_config=lochness_config)
    # BUG FIX: f-string interpolation tolerates non-string values (e.g. an int
    # 'port' from YAML), which the previous '+' concatenation did not.
    # NOTE(review): values are not URL-escaped; a password containing '@' or
    # ':' would break this URL — consider sqlalchemy.engine.URL.create.
    url = (
        f"postgresql+psycopg2://{credentials['user']}:{credentials['password']}"
        f"@{credentials['host']}:{credentials['port']}/{credentials['database']}"
    )
    engine = sqlalchemy.create_engine(url)

    return engine


def execute_sql(
    lochness_config: Dict[str, Any], query: str, debug: bool = False
) -> pd.DataFrame:
    """
    Execute a SQL query and return the result as a pandas DataFrame.

    Args:
        lochness_config (Dict[str, Any]): The Lochness configuration dictionary.
        query (str): The SQL query to execute.
        debug (bool, optional): Log the query before executing. Defaults to False.

    Returns:
        pd.DataFrame: The result of the SQL query.
    """
    engine = get_db_connection(lochness_config=lochness_config)

    if debug:
        logger.debug(f"Executing query: {query}")

    df = pd.read_sql(query, engine)

    engine.dispose()

    return df


def fetch_record(lochness_config: Dict[str, Any], query: str) -> Optional[str]:
    """
    Fetch a single scalar value from the database.

    BUG FIX: the parameter annotation previously said Path; this function
    takes the parsed Lochness configuration dictionary, matching execute_sql.

    Args:
        lochness_config (Dict[str, Any]): The Lochness configuration dictionary.
        query (str): The SQL query to execute.

    Returns:
        Optional[str]: The first column of the first row of the result set,
        or None if the result set is empty.
    """
    df = execute_sql(lochness_config=lochness_config, query=query)

    # Check if there is a row
    if df.shape[0] == 0:
        return None

    value = df.iloc[0, 0]

    return str(value)


def df_to_table(
    lochness_config: Dict[str, Any],
    df: pd.DataFrame,
    table_name: str,
    if_exists: Literal["fail", "replace", "append"] = "replace",
) -> None:
    """
    Write a pandas DataFrame to a table in the PostgreSQL database.

    Args:
        lochness_config (Dict[str, Any]): The Lochness configuration dictionary.
        df (pd.DataFrame): The DataFrame to write to the database.
        table_name (str): The name of the table to write to.
        if_exists (Literal["fail", "replace", "append"], optional): What to do
            if the table already exists. Defaults to "replace".
    """
    engine = get_db_connection(lochness_config=lochness_config)
    df.to_sql(table_name, engine, if_exists=if_exists, index=False)
    engine.dispose()
+""" + +from pathlib import Path +from typing import List, Union + +from lochness import db +from lochness.db.models.study import Study +from lochness.db.models.subject import Subject +from lochness.db.models.phoenix_files import PhoenixFiles +from lochness.db.models.remote_files import RemoteFile +from lochness.db.models.file_mappings import FileMapping + + +def flatten_list(coll: list) -> list: + """ + Flattens a list of lists into a single list. + + Args: + coll (list): List of lists. + + Returns: + list: Flattened list. + """ + flat_list = [] + for i in coll: + if isinstance(i, list): + flat_list += flatten_list(i) + else: + flat_list.append(i) + return flat_list + + +def init_db(config_file: Path): + """ + Initializes the database. + + WARNING: This will drop all tables and recreate them. + DO NOT RUN THIS IN PRODUCTION. + + Args: + config_file (Path): Path to the config file. + """ + drop_queries_l: List[Union[str, List[str]]] = [ + FileMapping.drop_table_query(), + RemoteFile.drop_table_query(), + PhoenixFiles.drop_table_query(), + Subject.drop_table_query(), + Study.drop_table_query(), + ] + + create_queries_l: List[Union[str, List[str]]] = [ + Study.init_table_query(), + Subject.init_table_query(), + PhoenixFiles.init_table_query(), + RemoteFile.init_table_query(), + FileMapping.init_table_query(), + ] + + drop_queries = flatten_list(drop_queries_l) + create_queries = flatten_list(create_queries_l) + + sql_queries: List[str] = drop_queries + create_queries + + db.execute_queries( + config_file=config_file, + queries=sql_queries, + ) diff --git a/lochness/db/models/file_mappings.py b/lochness/db/models/file_mappings.py new file mode 100644 index 00000000..830be13f --- /dev/null +++ b/lochness/db/models/file_mappings.py @@ -0,0 +1,99 @@ +""" +File mappings are used to map files from one source to another. + +This module contains the FileMapping class, which represents a mapping between +a file on the local file system and a file on a remote file system. 
+""" + +from pathlib import Path + +from lochness import db + + +class FileMapping: + """ + Maps a file on a remote file system to a file on the local file system. + + Attributes: + remote_file_path (Path): The path to the file on the remote file system. + local_file_path (Path): The path to the file on the local file system. + remote_name (str): The name of the remote system. + subject_id (str): The subject ID assciated with this asset. + """ + + def __init__( + self, + remote_file_path: Path, + local_file_path: Path, + remote_name: str, + subject_id: str, + ): + """ + Initialize a FileMapping object. + + Args: + remote_file_path (Path): The path to the file on the remote file system. + local_file_path (Path): The path to the file on the local file system. + remote_name (str): The name of the remote system. + subject_id (str): The subject ID associated with this asset. + """ + self.remote_file_path = remote_file_path + self.local_file_path = local_file_path + self.remote_name = remote_name + self.subject_id = subject_id + + def __str__(self): + """ + Return a string representation of the FileMapping object. + """ + return f"FileMapping({self.remote_file_path},\ + {self.local_file_path}, {self.remote_name}, {self.subject_id})" + + def __repr__(self): + """ + Return a string representation of the FileMapping object. + """ + return self.__str__() + + @staticmethod + def init_table_query() -> str: + """ + Return the SQL query to create the 'file_mappings' table. + """ + sql_query = """ + CREATE TABLE file_mappings ( + remote_file_path TEXT NOT NULL, + remote_name TEXT NOT NULL, + local_file_path TEXT NOT NULL, + subject_id TEXT NOT NULL, + PRIMARY KEY (remote_file_path, local_file_path, remote_name, subject_id) + ); + """ + + return sql_query + + @staticmethod + def drop_table_query() -> str: + """ + Return the SQL query to drop the 'file_mappings' table. 
+ """ + sql_query = """ + DROP TABLE IF EXISTS file_mappings; + """ + + return sql_query + + def to_sql(self) -> str: + """ + Return the SQL query to insert the object into the 'file_mappings' table. + """ + remote_file_path = db.santize_string(self.remote_file_path) + local_file_path = db.santize_string(self.local_file_path) + remote_name = db.santize_string(self.remote_name) + subject_id = db.santize_string(self.subject_id) + + return f""" + INSERT INTO file_mappings (remote_file_path, local_file_path, remote_name, subject_id) + VALUES ('{remote_file_path}', '{local_file_path}', '{remote_name}', '{subject_id}') + ON CONFLICT DO NOTHING; + """ diff --git a/lochness/db/models/phoenix_files.py b/lochness/db/models/phoenix_files.py new file mode 100644 index 00000000..4fe95e8c --- /dev/null +++ b/lochness/db/models/phoenix_files.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +""" +File Model +""" + +from pathlib import Path +from datetime import datetime + +from lochness import db + + +class PhoenixFiles: + """ + Represents a file on the PHOENIX file system. + + Attributes: + file_path (Path): The path to the file. + """ + + def __init__(self, file_path: Path, with_hash: bool = True): + """ + Initialize a File object. + + Automatically computes the file size and modification time. + + Args: + file_path (Path): The path to the file. + """ + self.file_path = file_path + + if not file_path.exists(): + raise FileNotFoundError(f"File not found: {file_path}") + + self.file_name = file_path.name + self.file_type = file_path.suffix + + self.file_size_mb = file_path.stat().st_size / 1024 / 1024 + self.m_time = datetime.fromtimestamp(file_path.stat().st_mtime) + if with_hash: + self.md5 = db.compute_hash(file_path=file_path, hash_type="md5") + else: + self.md5 = None + + def __str__(self): + """ + Return a string representation of the File object. 
+ """ + return f"PhoenixFile({self.file_name}, {self.file_type}, {self.file_size_mb}, \ + {self.file_path}, {self.m_time}, {self.md5})" + + def __repr__(self): + """ + Return a string representation of the File object. + """ + return self.__str__() + + @staticmethod + def init_table_query() -> str: + """ + Return the SQL query to create the 'phoenix_files' table. + """ + sql_query = """ + CREATE TABLE phoenix_files ( + p_file_name TEXT NOT NULL, + p_file_type TEXT NOT NULL, + p_file_size_mb FLOAT NOT NULL, + p_file_path TEXT PRIMARY KEY, + m_time TIMESTAMP NOT NULL, + md5 TEXT + ); + """ + + return sql_query + + @staticmethod + def drop_table_query() -> str: + """ + Return the SQL query to drop the 'phoenix_files' table if it exists. + """ + sql_query = """ + DROP TABLE IF EXISTS phoenix_files CASCADE; + """ + + return sql_query + + @staticmethod + def find_matches_by_hash_query(hash_val: str) -> str: + """ + Return the SQL query to find matching files by hash. + """ + sql_query = f""" + SELECT p_file_name, p_file_type, p_file_size_mb, p_file_path, m_time, md5 + FROM phoenix_files + WHERE md5 = '{hash_val}'; + """ + + return sql_query + + @staticmethod + def update_file_query(orig_path: Path, new_path: Path) -> str: + """ + Return the SQL query to update the p_file_path of a File object. + """ + orig_path = db.santize_string(str(orig_path)) + new_path = db.santize_string(str(new_path)) + + sql_query = f""" + UPDATE files + SET p_file_path = '{new_path}' + WHERE p_file_path = '{orig_path}'; + """ + + return sql_query + + def to_sql(self): + """ + Return the SQL query to insert the File object into the 'phoenix_files' table. 
+ """ + f_name = db.santize_string(self.file_name) + f_path = db.santize_string(str(self.file_path)) + + if self.md5 is None: + hash_val = "NULL" + else: + hash_val = self.md5 + + sql_query = f""" + INSERT INTO phoenix_files (p_file_name, p_file_type, p_file_size_mb, + p_file_path, m_time, md5) + VALUES ('{f_name}', '{self.file_type}', '{self.file_size_mb}', + '{f_path}', '{self.m_time}', '{hash_val}') + ON CONFLICT (p_file_path) DO UPDATE SET + p_file_name = excluded.p_file_name, + p_file_type = excluded.p_file_type, + p_file_size_mb = excluded.p_file_size_mb, + m_time = excluded.m_time, + md5 = excluded.md5; + """ + + sql_query = db.handle_null(sql_query) + + return sql_query diff --git a/lochness/db/models/remote_files.py b/lochness/db/models/remote_files.py new file mode 100644 index 00000000..be015520 --- /dev/null +++ b/lochness/db/models/remote_files.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python +""" +RemoteFile Model +""" + +from datetime import datetime + +from lochness import db + + +class RemoteFile: + """ + Represents a file on some remote file system. + + Attributes: + file_path (Path): The path to the file. + """ + + def __init__( + self, file_path: str, remote_name: str, hash_val: str, last_checked: datetime + ): + """ + Initialize a RemoteFile object. + + Args: + file_path (Path): The path to the file. + remote_name (str): The name of the remote system. + hash_val (str): The hash value of the file, + as provided by the remote system. + """ + self.file_path = file_path + self.remote_name = remote_name + self.hash_val = hash_val + self.last_checked = last_checked + + def __str__(self): + """ + Return a string representation of the RemoteFile object. + """ + return f"RemoteFile({self.file_path}, {self.remote_name}, {self.last_checked})" + + def __repr__(self): + """ + Return a string representation of the File object. + """ + return self.__str__() + + @staticmethod + def init_table_query() -> str: + """ + Return the SQL query to create the 'files' table. 
+ """ + sql_query = """ + CREATE TABLE remote_files ( + r_file_path TEXT NOT NULL, + r_remote_name TEXT NOT NULL, + r_hash_val TEXT NOT NULL, + r_last_checked TIMESTAMP NOT NULL, + PRIMARY KEY (file_path, remote_name) + ); + """ + + return sql_query + + @staticmethod + def drop_table_query() -> str: + """ + Return the SQL query to drop the 'remote_files' table if it exists. + """ + sql_query = """ + DROP TABLE IF EXISTS remote_files CASCADE; + """ + + return sql_query + + @staticmethod + def find_matches_by_hash_query(hash_val: str) -> str: + """ + Return the SQL query to find matching remote_files by hash. + """ + sql_query = f""" + SELECT r_file_path, r_remote_name, r_hash_val, r_last_checked + FROM remote_files + WHERE r_hash_val = '{hash_val}'; + """ + + return sql_query + + def to_sql(self): + """ + Return the SQL query to insert the RemoteFile object into the 'remote_files' table. + + Returns: + str: The SQL query. + """ + + file_path = db.santize_string(str(self.file_path)) + remote_name = db.santize_string(self.remote_name) + hash_val = db.santize_string(self.hash_val) + last_checked = db.santize_string(self.last_checked) + + sql_query = f""" + INSERT INTO remote_files ( + r_file_path, r_remote_name, r_hash_val, r_last_checked + ) VALUES ( + '{file_path}', '{remote_name}', '{hash_val}', '{last_checked}' + ) ON CONFLICT (file_path, remote_name) UPDATE SET + r_hash_val = excluded.r_hash_val, + r_last_checked = excluded.r_last_checked; + """ + + return sql_query diff --git a/lochness/db/models/study.py b/lochness/db/models/study.py new file mode 100644 index 00000000..6b6af8c2 --- /dev/null +++ b/lochness/db/models/study.py @@ -0,0 +1,54 @@ +""" +Study Model +""" + +from lochness import db + + +class Study: + """ + Represents a study. + + Attributes: + study_id (str): The study ID. 
+ """ + + def __init__(self, study_id: str): + self.study_id = study_id + + def __str__(self): + return f"Study({self.study_id})" + + def __repr__(self): + return self.__str__() + + @staticmethod + def init_table_query() -> str: + """ + Return the SQL query to create the 'study' table. + """ + return """ + CREATE TABLE IF NOT EXISTS study ( + study_id TEXT PRIMARY KEY + ); + """ + + @staticmethod + def drop_table_query() -> str: + """ + Return the SQL query to drop the 'study' table. + """ + return """ + DROP TABLE IF EXISTS study; + """ + + def to_sql(self): + """ + Return the SQL query to insert the object into the 'study' table. + """ + study_id = db.santize_string(self.study_id) + + return f""" + INSERT INTO study (study_id) + VALUES ('{study_id}') ON CONFLICT DO NOTHING; + """ diff --git a/lochness/db/models/subject.py b/lochness/db/models/subject.py new file mode 100644 index 00000000..21ac09fb --- /dev/null +++ b/lochness/db/models/subject.py @@ -0,0 +1,89 @@ +""" +Subject Model +""" + +from datetime import datetime + +from lochness import db + + +class Subject: + """ + Represents a subject / study participant. + + Attributes: + study_id (str): The study ID. + subject_id (str): The subject ID. + is_active (bool): Whether or not the subject is active. + consent_date (datetime): The date the subject consented to the study. + optional_notes (dict): Optional notes about the subject. 
+ """ + + def __init__( + self, + study_id: str, + subject_id: str, + is_active: bool, + consent_date: datetime, + optional_notes: dict, + ): + self.study_id = study_id + self.subject_id = subject_id + self.is_active = is_active + self.consent_date = consent_date + self.optional_notes = optional_notes + + def __str__(self): + return f"Subject({self.study_id}, {self.subject_id}, {self.is_active}, \ + {self.consent_date}, {self.optional_notes})" + + def __repr__(self): + return self.__str__() + + @staticmethod + def init_table_query() -> str: + """ + Return the SQL query to create the 'subjects' table. + """ + sql_query = """ + CREATE TABLE subjects ( + study_id TEXT NOT NULL REFERENCES study (study_id), + subject_id TEXT NOT NULL, + is_active BOOLEAN NOT NULL, + consent_date DATE NOT NULL, + optional_notes JSON, + PRIMARY KEY (study_id, subject_id) + ); + """ + + return sql_query + + @staticmethod + def drop_table_query() -> str: + """ + Return the SQL query to drop the 'subjects' table. + """ + sql_query = """ + DROP TABLE IF EXISTS subjects; + """ + + return sql_query + + def to_sql(self) -> str: + """ + Return the SQL query to insert the subject into the 'subjects' table. 
+ """ + + consent_date = self.consent_date.strftime("%Y-%m-%d") + optional_notes = db.sanitize_json(self.optional_notes) + + sql_query = f""" + INSERT INTO subjects (study_id, subject_id, is_active, consent_date, optional_notes) + VALUES ('{self.study_id}', '{self.subject_id}', {self.is_active}, '{consent_date}', '{optional_notes}') + ON CONFLICT(study_id, subject_id) DO UPDATE SET + is_active = excluded.is_active, + consent_date = excluded.consent_date, + optional_notes = excluded.optional_notes; + """ + + return sql_query From 400616a200343c72eb87224cc30e442ec1d0649b Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Fri, 26 Jul 2024 14:56:17 -0400 Subject: [PATCH 03/20] feat.: Add modality to file_mappings table Signed-off-by: Dheshan Mohandass --- lochness/db/models/file_mappings.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/lochness/db/models/file_mappings.py b/lochness/db/models/file_mappings.py index 830be13f..1a9a1c7d 100644 --- a/lochness/db/models/file_mappings.py +++ b/lochness/db/models/file_mappings.py @@ -19,6 +19,7 @@ class FileMapping: local_file_path (Path): The path to the file on the local file system. remote_name (str): The name of the remote system. subject_id (str): The subject ID assciated with this asset. + modality (str): The modality associated with this asset. """ def __init__( @@ -27,6 +28,7 @@ def __init__( local_file_path: Path, remote_name: str, subject_id: str, + modality: str, ): """ Initialize a FileMapping object. @@ -36,18 +38,21 @@ def __init__( local_file_path (Path): The path to the file on the local file system. remote_name (str): The name of the remote system. subject_id (str): The subject ID associated with this asset. + modality (str): The modality associated with this asset. 
""" self.remote_file_path = remote_file_path self.local_file_path = local_file_path self.remote_name = remote_name self.subject_id = subject_id + self.modality = modality def __str__(self): """ Return a string representation of the FileMapping object. """ return f"FileMapping({self.remote_file_path},\ - {self.local_file_path}, {self.remote_name}, {self.subject_id})" + {self.local_file_path}, {self.remote_name}, {self.subject_id}, \ + {self.modality})" def __repr__(self): """ @@ -66,6 +71,7 @@ def init_table_query() -> str: remote_name TEXT NOT NULL, local_file_path TEXT NOT NULL, subject_id TEXT NOT NULL, + modality TEXT NOT NULL, PRIMARY KEY (remote_file_path, local_file_path, remote_name, subject_id) ); """ @@ -91,9 +97,14 @@ def to_sql(self) -> str: local_file_path = db.santize_string(self.local_file_path) remote_name = db.santize_string(self.remote_name) subject_id = db.santize_string(self.subject_id) + modality = db.santize_string(self.modality) return f""" - INSERT INTO file_mappings (remote_file_path, local_file_path, remote_name, subject_id) - VALUES ('{remote_file_path}', '{local_file_path}', '{remote_name}', '{subject_id}') + INSERT INTO file_mappings ( + remote_file_path, local_file_path, remote_name, + subject_id, modality + ) VALUES ( + '{remote_file_path}', '{local_file_path}', '{remote_name}', + '{subject_id}', '{modality}' ON CONFLICT DO NOTHING; """ From 41e717e402d44ff5735aa764b491bd65dc7f96f7 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Fri, 26 Jul 2024 14:56:53 -0400 Subject: [PATCH 04/20] feat.: Add remote metadata to track fileds like owner, modifier, last_modified Signed-off-by: Dheshan Mohandass --- lochness/db/models/remote_files.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/lochness/db/models/remote_files.py b/lochness/db/models/remote_files.py index be015520..7abc1281 100644 --- a/lochness/db/models/remote_files.py +++ b/lochness/db/models/remote_files.py @@ -4,6 +4,7 @@ """ from 
class RemoteFile:
    """
    Represents a file on some remote file system
    (patch-04 revision: adds the 'remote_metadata' attribute and JSONB column).

    Attributes:
        file_path (str): The path to the file on the remote system.
        remote_name (str): The name of the remote system.
        hash_val (str): The hash of the file as reported by the remote.
        last_checked (datetime): When the remote entry was last checked.
        remote_metadata (Dict[str, str]): Metadata about the file
            (e.g. owner, modifier, last_modified).
    """

    def __init__(
        self,
        file_path: str,
        remote_name: str,
        hash_val: str,
        last_checked: datetime,
        remote_metadata: Dict[str, str],
    ):
        """
        Initialize a RemoteFile object.

        Args:
            file_path (str): The path to the file.
            remote_name (str): The name of the remote system.
            hash_val (str): The hash value of the file,
                as provided by the remote system.
            last_checked (datetime): The last time the file was checked.
            remote_metadata (Dict[str, str]): Metadata about the file.
        """
        self.file_path = file_path
        self.remote_name = remote_name
        self.hash_val = hash_val
        self.last_checked = last_checked
        self.remote_metadata = remote_metadata

    def __str__(self):
        """
        Return a string representation of the RemoteFile object.
        """
        return f"RemoteFile({self.file_path}, {self.remote_name}, {self.last_checked})"

    def __repr__(self):
        """
        Return a string representation of the RemoteFile object.
        """
        return self.__str__()

    @staticmethod
    def init_table_query() -> str:
        """
        Return the SQL query to create the 'remote_files' table.
        """
        # BUG FIX: the primary key referenced nonexistent columns
        # (file_path, remote_name); the actual columns carry an 'r_' prefix.
        sql_query = """
        CREATE TABLE remote_files (
            r_file_path TEXT NOT NULL,
            r_remote_name TEXT NOT NULL,
            r_hash_val TEXT NOT NULL,
            r_last_checked TIMESTAMP NOT NULL,
            r_remote_metadata JSONB,
            PRIMARY KEY (r_file_path, r_remote_name)
        );
        """

        return sql_query

    @staticmethod
    def drop_table_query() -> str:
        """
        Return the SQL query to drop the 'remote_files' table if it exists.
        """
        sql_query = """
        DROP TABLE IF EXISTS remote_files CASCADE;
        """

        return sql_query

    @staticmethod
    def find_matches_by_hash_query(hash_val: str) -> str:
        """
        Return the SQL query to find matching remote_files by hash.
        """
        sql_query = f"""
        SELECT r_file_path, r_remote_name, r_hash_val, r_last_checked, r_remote_metadata
        FROM remote_files
        WHERE r_hash_val = '{hash_val}';
        """

        return sql_query

    def to_sql(self):
        """
        Return the SQL query to upsert this object into 'remote_files'.

        Returns:
            str: The SQL query.
        """
        file_path = db.santize_string(str(self.file_path))
        remote_name = db.santize_string(self.remote_name)
        hash_val = db.santize_string(self.hash_val)
        # BUG FIX: last_checked is a datetime; convert to str before escaping
        # (datetime.replace() has a different signature than str.replace()).
        last_checked = db.santize_string(str(self.last_checked))

        metadata = db.sanitize_json(self.remote_metadata)

        # BUG FIX: PostgreSQL requires 'ON CONFLICT ... DO UPDATE' (the 'DO'
        # was missing), and the conflict target must name the real r_*-prefixed
        # columns.
        sql_query = f"""
        INSERT INTO remote_files (
            r_file_path, r_remote_name, r_hash_val, r_last_checked, r_remote_metadata
        ) VALUES (
            '{file_path}', '{remote_name}', '{hash_val}', '{last_checked}', '{metadata}'
        ) ON CONFLICT (r_file_path, r_remote_name) DO UPDATE SET
            r_hash_val = excluded.r_hash_val,
            r_last_checked = excluded.r_last_checked,
            r_remote_metadata = excluded.r_remote_metadata;
        """

        return sql_query
+ timestamp (datetime): The time the action was taken. + """ + + def __init__( + self, + source_file: str, + destination_file: str, + system: str, + action: str, + metadata: Dict[str, str], + timestamp: str, + ): + """ + Initialize an AuditLog object. + + Args: + source_file (str): The source file. + destination_file (str): The destination file. + system (str): The system the action was taken on. e.g. 'local', 'dropbox', etc. + action (str): The action taken. e.g. 'move', 'delete', etc. + metadata (Dict[str, str]): Metadata about the action. + timestamp (str): The time the action was taken. + """ + + self.source_file = source_file + self.destination_file = destination_file + self.system = system + self.action = action + self.metadata = metadata + self.timestamp = timestamp + + def __str__(self): + """ + Return a string representation of the AuditLog object. + """ + return f""" +AuditLog( + {self.source_file}, + {self.destination_file}, + {self.system}, + {self.action}, + {self.metadata}, + {self.timestamp} +) +""" + + def __repr__(self): + """ + Return a string representation of the AuditLog object. + """ + return self.__str__() + + @staticmethod + def init_table_query() -> str: + """ + Return the SQL query to create the 'audit_log' table. + """ + sql_query = """ + CREATE TABLE IF audit_log ( + source_file TEXT NOT NULL, + destination_file TEXT, + system TEXT NOT NULL, + action TEXT NOT NULL, + metadata JSONB, + timestamp TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP + ); + """ + + return sql_query + + @staticmethod + def drop_table_query() -> str: + """ + Return the SQL query to drop the 'audit_log' table if it exists. + """ + sql_query = """ + DROP TABLE IF EXISTS audit_log CASCADE; + """ + + return sql_query + + def to_sql(self) -> str: + """ + Return the SQL query to insert the object into the 'audit_log' table. 
+ """ + source_file = db.santize_string(self.source_file) + destination_file = db.santize_string(self.destination_file) + system = db.santize_string(self.system) + action = db.santize_string(self.action) + metadata = db.santize_string(self.metadata) + timestamp = db.santize_string(self.timestamp) + + sql_query = f""" + INSERT INTO audit_log ( + source_file, destination_file, system, + action, metadata, timestamp + ) VALUES ( + '{source_file}', '{destination_file}', '{system}', + '{action}', '{metadata}', '{timestamp}' + ); + """ + + return sql_query From a00b9e99855446efe7011767aa27bf0201986b9a Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Tue, 30 Jul 2024 01:16:27 +1000 Subject: [PATCH 06/20] fix: execute_queries funtion --- lochness/db/__init__.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lochness/db/__init__.py b/lochness/db/__init__.py index 846608d1..6583d21d 100644 --- a/lochness/db/__init__.py +++ b/lochness/db/__init__.py @@ -118,8 +118,6 @@ def get_db_credentials(lochness_config: Dict[str, Any]) -> Dict[str, str]: Returns: Dict[str, str]: A dictionary containing the database credentials. 
""" - # TODO: Support reading from Lochness Dict - credentials = lochness_config["database"] return credentials @@ -166,8 +164,8 @@ def execute_query(query: str): output.append(cur.fetchall()) except psycopg2.ProgrammingError: pass - for command in queries: - execute_query(command) + for command in queries: + execute_query(command) cur.close() From ebb8702e3a4bcc85bb3f30c16b2d0a0e7cf2aaf2 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Tue, 30 Jul 2024 01:23:55 +1000 Subject: [PATCH 07/20] fix: models Signed-off-by: Dheshan Mohandass --- lochness/db/models/__init__.py | 8 ++++---- lochness/db/models/remote_files.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lochness/db/models/__init__.py b/lochness/db/models/__init__.py index dad34758..c2c0b9d9 100644 --- a/lochness/db/models/__init__.py +++ b/lochness/db/models/__init__.py @@ -2,8 +2,7 @@ Contains DB models for Lochness. """ -from pathlib import Path -from typing import List, Union +from typing import List, Union, Dict, Any from lochness import db from lochness.db.models.study import Study @@ -32,7 +31,7 @@ def flatten_list(coll: list) -> list: return flat_list -def init_db(config_file: Path): +def init_db(lochness_config: Dict[str, Any]) -> None: """ Initializes the database. 
@@ -64,6 +63,7 @@ def init_db(config_file: Path): sql_queries: List[str] = drop_queries + create_queries db.execute_queries( - config_file=config_file, + lochness_config=lochness_config, queries=sql_queries, + show_commands=True, ) diff --git a/lochness/db/models/remote_files.py b/lochness/db/models/remote_files.py index 7abc1281..ea77a675 100644 --- a/lochness/db/models/remote_files.py +++ b/lochness/db/models/remote_files.py @@ -66,7 +66,7 @@ def init_table_query() -> str: r_hash_val TEXT NOT NULL, r_last_checked TIMESTAMP NOT NULL, r_remote_metadata JSONB, - PRIMARY KEY (file_path, remote_name) + PRIMARY KEY (r_file_path, r_remote_name) ); """ From 9419856fa7eae8e861ab0aff94f1d01faccf2ae2 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Tue, 30 Jul 2024 01:24:34 +1000 Subject: [PATCH 08/20] feat.: Add Foreign Keys to mappings table Signed-off-by: Dheshan Mohandass --- lochness/db/models/file_mappings.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lochness/db/models/file_mappings.py b/lochness/db/models/file_mappings.py index 1a9a1c7d..6a021ac7 100644 --- a/lochness/db/models/file_mappings.py +++ b/lochness/db/models/file_mappings.py @@ -72,7 +72,10 @@ def init_table_query() -> str: local_file_path TEXT NOT NULL, subject_id TEXT NOT NULL, modality TEXT NOT NULL, - PRIMARY KEY (remote_file_path, local_file_path, remote_name, subject_id) + PRIMARY KEY (remote_file_path, local_file_path, remote_name, subject_id), + FOREIGN KEY (remote_file_path, remote_name) REFERENCES remote_files(r_file_path, r_remote_name), + FOREIGN KEY (local_file_path) REFERENCES phoenix_files(p_file_path), + FOREIGN KEY (subject_id) REFERENCES subjects(subject_id) ); """ From 468637e4609f837956103714c5937466cabcb801 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Tue, 30 Jul 2024 01:25:07 +1000 Subject: [PATCH 09/20] feat.: Impl. 
init db script Signed-off-by: Dheshan Mohandass --- scripts/init_db.py | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100755 scripts/init_db.py diff --git a/scripts/init_db.py b/scripts/init_db.py new file mode 100755 index 00000000..190f0fe0 --- /dev/null +++ b/scripts/init_db.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +""" +Initialize the database with the schema defined in lochness.db.models +""" + +import sys +from pathlib import Path + +file = Path(__file__).resolve() +parent = file.parent +ROOT = None +for parent in file.parents: + if parent.name == "lochness-dev": + ROOT = parent +sys.path.append(str(ROOT)) + +# remove current directory from path +try: + sys.path.remove(str(parent)) +except ValueError: + pass + +import logging + +import lochness.config as config +from lochness.db import models + +logger = logging.getLogger(__name__) +logargs = { + "level": logging.DEBUG, + "format": "%(asctime)s - %(process)d - %(name)s - %(levelname)s - %(message)s", +} +logging.basicConfig(**logargs) + + +if __name__ == "__main__": + logger.info("Initializing database...") + logger.debug( + "This will drop all tables and recreate them. DO NOT RUN THIS IN PRODUCTION." 
+ ) + + config_file = "/var/lib/prescient/soft/lochness-dev/scratch/config.yml" + logger.info(f"Loading config file: {config_file}") + lochness_config = config.load(path=config_file) + + logger.info("Initializing database...") + models.init_db(lochness_config) + + logger.info("Done!") From fa0c45229bb9fbe1320ddd90b86e1ee81568e07c Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Mon, 29 Jul 2024 12:53:15 -0400 Subject: [PATCH 10/20] feat.: Add AutditLog to init_db --- lochness/db/models/__init__.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/lochness/db/models/__init__.py b/lochness/db/models/__init__.py index c2c0b9d9..702557c4 100644 --- a/lochness/db/models/__init__.py +++ b/lochness/db/models/__init__.py @@ -2,14 +2,15 @@ Contains DB models for Lochness. """ -from typing import List, Union, Dict, Any +from typing import Any, Dict, List, Union from lochness import db -from lochness.db.models.study import Study -from lochness.db.models.subject import Subject +from lochness.db.models.file_mappings import FileMapping +from lochness.db.models.files_audit_log import AuditLog from lochness.db.models.phoenix_files import PhoenixFiles from lochness.db.models.remote_files import RemoteFile -from lochness.db.models.file_mappings import FileMapping +from lochness.db.models.study import Study +from lochness.db.models.subject import Subject def flatten_list(coll: list) -> list: @@ -31,7 +32,7 @@ def flatten_list(coll: list) -> list: return flat_list -def init_db(lochness_config: Dict[str, Any]) -> None: +def init_db(lochness_config: Dict[str, Any]): """ Initializes the database. @@ -39,9 +40,10 @@ def init_db(lochness_config: Dict[str, Any]) -> None: DO NOT RUN THIS IN PRODUCTION. Args: - config_file (Path): Path to the config file. + lochness_config (Path): Path to the config file. 
""" drop_queries_l: List[Union[str, List[str]]] = [ + AuditLog.drop_table_query(), FileMapping.drop_table_query(), RemoteFile.drop_table_query(), PhoenixFiles.drop_table_query(), @@ -55,6 +57,7 @@ def init_db(lochness_config: Dict[str, Any]) -> None: PhoenixFiles.init_table_query(), RemoteFile.init_table_query(), FileMapping.init_table_query(), + AuditLog.init_table_query(), ] drop_queries = flatten_list(drop_queries_l) From 91dd0850d50eb42482335cf618f9784eb74439cf Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Tue, 30 Jul 2024 03:04:13 +1000 Subject: [PATCH 11/20] fix: CREATE query Signed-off-by: Dheshan Mohandass --- lochness/db/models/files_audit_log.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lochness/db/models/files_audit_log.py b/lochness/db/models/files_audit_log.py index 0fe0932f..4362586d 100644 --- a/lochness/db/models/files_audit_log.py +++ b/lochness/db/models/files_audit_log.py @@ -75,7 +75,7 @@ def init_table_query() -> str: Return the SQL query to create the 'audit_log' table. 
""" sql_query = """ - CREATE TABLE IF audit_log ( + CREATE TABLE audit_log ( source_file TEXT NOT NULL, destination_file TEXT, system TEXT NOT NULL, @@ -111,7 +111,7 @@ def to_sql(self) -> str: sql_query = f""" INSERT INTO audit_log ( - source_file, destination_file, system, + source_file, destination_file, system, action, metadata, timestamp ) VALUES ( '{source_file}', '{destination_file}', '{system}', From 1116a53e58b8d911df36a7e5d7c417406c0f53c1 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Tue, 30 Jul 2024 03:04:35 +1000 Subject: [PATCH 12/20] fix: Add study_id to satisfy unique constraint Signed-off-by: Dheshan Mohandass --- lochness/db/models/file_mappings.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/lochness/db/models/file_mappings.py b/lochness/db/models/file_mappings.py index 6a021ac7..d219d060 100644 --- a/lochness/db/models/file_mappings.py +++ b/lochness/db/models/file_mappings.py @@ -19,6 +19,7 @@ class FileMapping: local_file_path (Path): The path to the file on the local file system. remote_name (str): The name of the remote system. subject_id (str): The subject ID assciated with this asset. + study_id (str): The study ID associated with this asset. modality (str): The modality associated with this asset. """ @@ -28,6 +29,7 @@ def __init__( local_file_path: Path, remote_name: str, subject_id: str, + study_id: str, modality: str, ): """ @@ -38,12 +40,14 @@ def __init__( local_file_path (Path): The path to the file on the local file system. remote_name (str): The name of the remote system. subject_id (str): The subject ID associated with this asset. + study_id (str): The study ID associated with this asset. modality (str): The modality associated with this asset. 
""" self.remote_file_path = remote_file_path self.local_file_path = local_file_path self.remote_name = remote_name self.subject_id = subject_id + self.study_id = study_id self.modality = modality def __str__(self): @@ -71,11 +75,12 @@ def init_table_query() -> str: remote_name TEXT NOT NULL, local_file_path TEXT NOT NULL, subject_id TEXT NOT NULL, + study_id TEXT NOT NULL, modality TEXT NOT NULL, - PRIMARY KEY (remote_file_path, local_file_path, remote_name, subject_id), + PRIMARY KEY (remote_file_path, local_file_path, remote_name, subject_id, study_id), FOREIGN KEY (remote_file_path, remote_name) REFERENCES remote_files(r_file_path, r_remote_name), FOREIGN KEY (local_file_path) REFERENCES phoenix_files(p_file_path), - FOREIGN KEY (subject_id) REFERENCES subjects(subject_id) + FOREIGN KEY (study_id, subject_id) REFERENCES subjects(study_id, subject_id) ); """ @@ -100,14 +105,15 @@ def to_sql(self) -> str: local_file_path = db.santize_string(self.local_file_path) remote_name = db.santize_string(self.remote_name) subject_id = db.santize_string(self.subject_id) + study_id = db.santize_string(self.study_id) modality = db.santize_string(self.modality) return f""" INSERT INTO file_mappings ( remote_file_path, local_file_path, remote_name, - subject_id, modality + subject_id, study_id, modality ) VALUES ( '{remote_file_path}', '{local_file_path}', '{remote_name}', - '{subject_id}', '{modality}' + '{subject_id}', '{study_id}', '{modality}' ON CONFLICT DO NOTHING; """ From 35382f6684363a6b5884eda5cb63ee8f741a12d1 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Mon, 29 Jul 2024 14:11:36 -0400 Subject: [PATCH 13/20] feat.: Impl. 
DB crawler for metadata Signed-off-by: Dheshan Mohandass --- lochness/db/crawlers/__init__.py | 3 ++ lochness/db/crawlers/metadata.py | 48 ++++++++++++++++++++++++++++++++ lochness/db/models/subject.py | 6 ++-- lochness/redcap/__init__.py | 33 ++++++++++++++-------- lochness/rpms/__init__.py | 34 ++++++++++++++-------- 5 files changed, 98 insertions(+), 26 deletions(-) create mode 100644 lochness/db/crawlers/__init__.py create mode 100644 lochness/db/crawlers/metadata.py diff --git a/lochness/db/crawlers/__init__.py b/lochness/db/crawlers/__init__.py new file mode 100644 index 00000000..20c6921c --- /dev/null +++ b/lochness/db/crawlers/__init__.py @@ -0,0 +1,3 @@ +""" +Contains crawlers for Lochness. +""" diff --git a/lochness/db/crawlers/metadata.py b/lochness/db/crawlers/metadata.py new file mode 100644 index 00000000..6822d1cf --- /dev/null +++ b/lochness/db/crawlers/metadata.py @@ -0,0 +1,48 @@ +""" +Imports metadata information into DB. +""" + +from typing import Any, Dict + +import pandas as pd + +from lochness import db +from lochness.db.models.subject import Subject +from lochness.db.models.study import Study + + +def import_metadata_df( + lochness_config: Dict[str, Any], metadata_df: pd.DataFrame, study_id: str +) -> None: + """ + Import subject metadata from a DataFrame into the database. + + Args: + metadata_df (pd.DataFrame): The DataFrame containing the columns: + 'Subject ID', 'Active', 'Consent', '...'. + study_id (str): The study ID. 
+ """ + queries = [] + + study = Study(study_id=study_id) + insert_study_sql = study.to_sql() + queries.append(insert_study_sql) + + for _, row in metadata_df.iterrows(): + optional_notes = {} + for column in metadata_df.columns: + if column not in ["Subject ID", "Active", "Consent"]: + optional_notes[column] = row[column] + + subject = Subject( + study_id=study_id, + subject_id=row["Subject ID"], + is_active=row["Active"], + consent_date=row["Consent"], + optional_notes=optional_notes, + ) + + subject_sql = subject.to_sql() + queries.append(subject_sql) + + db.execute_queries(lochness_config=lochness_config, queries=queries) diff --git a/lochness/db/models/subject.py b/lochness/db/models/subject.py index 21ac09fb..adc64d5f 100644 --- a/lochness/db/models/subject.py +++ b/lochness/db/models/subject.py @@ -78,8 +78,10 @@ def to_sql(self) -> str: optional_notes = db.sanitize_json(self.optional_notes) sql_query = f""" - INSERT INTO subjects (study_id, subject_id, is_active, consent_date, optional_notes) - VALUES ('{self.study_id}', '{self.subject_id}', {self.is_active}, '{consent_date}', '{optional_notes}') + INSERT INTO subjects (study_id, subject_id, is_active, + consent_date, optional_notes) + VALUES ('{self.study_id}', '{self.subject_id}', {self.is_active}, + '{consent_date}', '{optional_notes}') ON CONFLICT(study_id, subject_id) DO UPDATE SET is_active = excluded.is_active, consent_date = excluded.consent_date, diff --git a/lochness/redcap/__init__.py b/lochness/redcap/__init__.py index 3729be6a..3d7d55fe 100644 --- a/lochness/redcap/__init__.py +++ b/lochness/redcap/__init__.py @@ -1,21 +1,23 @@ -import os -import sys -import re +import collections as col +import datetime import json -import lochness import logging -import requests -import lochness.net as net -import collections as col -import lochness.tree as tree +import os +import re +import sys +import tempfile as tf from pathlib import Path -import pandas as pd -import datetime from typing import List, 
Union -import tempfile as tf -from lochness.redcap.process_piis import process_and_copy_db +import pandas as pd +import requests from requests.packages.urllib3.exceptions import InsecureRequestWarning + +import lochness +import lochness.net as net +import lochness.tree as tree +from lochness.db.crawlers import metadata as metadata_crawler + requests.packages.urllib3.disable_warnings(InsecureRequestWarning) @@ -211,6 +213,13 @@ def initialize_metadata(Lochness: 'Lochness object', same_df = df.reset_index(drop=True).equals(target_df) + # import into DB + metadata_crawler.import_metadata_df( + lochness_config=Lochness, + metadata_df=df, + study_id=study_name + ) + if same_df: pass else: diff --git a/lochness/rpms/__init__.py b/lochness/rpms/__init__.py index 7b38c0a3..225bf327 100644 --- a/lochness/rpms/__init__.py +++ b/lochness/rpms/__init__.py @@ -1,20 +1,23 @@ -import os -import yaml -import lochness +""" +Module to import data from RPMS +""" +import collections as col import logging -import zipfile +import os +import re import shutil +from datetime import datetime from pathlib import Path -import tempfile as tf -import collections as col +from time import sleep +from typing import Dict, List, Union + +import pandas as pd +import yaml + import lochness.net as net import lochness.tree as tree -from typing import List, Dict, Union -import pandas as pd -from datetime import datetime -from time import sleep -import re -from lochness.redcap.process_piis import process_and_copy_db +from lochness.db.crawlers import metadata as metadata_crawler + pd.set_option('mode.chained_assignment', None) @@ -353,6 +356,13 @@ def initialize_metadata(Lochness: 'Lochness object', df_final = df_final[main_cols + \ [x for x in df_final.columns if x not in main_cols]] + # import into DB + metadata_crawler.import_metadata_df( + lochness_config=Lochness, + metadata_df=df_final, + study_id=study_name + ) + general_path = Path(Lochness['phoenix_root']) / 'PROTECTED' metadata_study = 
general_path / study_name / f"{study_name}_metadata.csv" From 270036c7c360ad92798c2b5a3582944ff7456a49 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Mon, 29 Jul 2024 14:32:14 -0400 Subject: [PATCH 14/20] feat.: Impl. logging for crawler Signed-off-by: Dheshan Mohandass --- lochness/db/crawlers/metadata.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lochness/db/crawlers/metadata.py b/lochness/db/crawlers/metadata.py index 6822d1cf..4ad916f5 100644 --- a/lochness/db/crawlers/metadata.py +++ b/lochness/db/crawlers/metadata.py @@ -3,6 +3,7 @@ """ from typing import Any, Dict +import logging import pandas as pd @@ -10,6 +11,8 @@ from lochness.db.models.subject import Subject from lochness.db.models.study import Study +logger = logging.getLogger("lochness.crawlers.metadata") + def import_metadata_df( lochness_config: Dict[str, Any], metadata_df: pd.DataFrame, study_id: str @@ -22,12 +25,14 @@ def import_metadata_df( 'Subject ID', 'Active', 'Consent', '...'. study_id (str): The study ID. 
""" + logger.info(f"Importing metadata for study {study_id}") queries = [] study = Study(study_id=study_id) insert_study_sql = study.to_sql() queries.append(insert_study_sql) + logger.debug(f"Found {metadata_df.shape[0]} subjects for study {study_id}") for _, row in metadata_df.iterrows(): optional_notes = {} for column in metadata_df.columns: @@ -46,3 +51,4 @@ def import_metadata_df( queries.append(subject_sql) db.execute_queries(lochness_config=lochness_config, queries=queries) + logger.info(f"Successfully imported metadata for study {study_id}") From b5ff6df5600c6a5b362ba12fcecf4aa0703eec45 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Mon, 29 Jul 2024 14:36:10 -0400 Subject: [PATCH 15/20] fix: cast consent date to datetime Signed-off-by: Dheshan Mohandass --- lochness/db/crawlers/metadata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lochness/db/crawlers/metadata.py b/lochness/db/crawlers/metadata.py index 4ad916f5..426069b0 100644 --- a/lochness/db/crawlers/metadata.py +++ b/lochness/db/crawlers/metadata.py @@ -39,11 +39,12 @@ def import_metadata_df( if column not in ["Subject ID", "Active", "Consent"]: optional_notes[column] = row[column] + consent_date = pd.to_datetime(row["Consent"]).to_pydatetime() subject = Subject( study_id=study_id, subject_id=row["Subject ID"], is_active=row["Active"], - consent_date=row["Consent"], + consent_date=consent_date, optional_notes=optional_notes, ) From 0e244d9722e3823b0db74eaba85bb13b7780eba3 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Mon, 29 Jul 2024 14:38:57 -0400 Subject: [PATCH 16/20] fix: cast is_active to bool Signed-off-by: Dheshan Mohandass --- lochness/db/crawlers/metadata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lochness/db/crawlers/metadata.py b/lochness/db/crawlers/metadata.py index 426069b0..e1c699bd 100644 --- a/lochness/db/crawlers/metadata.py +++ b/lochness/db/crawlers/metadata.py @@ -40,10 +40,11 @@ def import_metadata_df( 
optional_notes[column] = row[column] consent_date = pd.to_datetime(row["Consent"]).to_pydatetime() + is_active = row["Active"] == '1' subject = Subject( study_id=study_id, subject_id=row["Subject ID"], - is_active=row["Active"], + is_active=is_active, consent_date=consent_date, optional_notes=optional_notes, ) From a361ba7368f711d2c460465f0871729d2365fee2 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Mon, 29 Jul 2024 14:44:10 -0400 Subject: [PATCH 17/20] fix: is_active type Signed-off-by: Dheshan Mohandass --- lochness/db/crawlers/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lochness/db/crawlers/metadata.py b/lochness/db/crawlers/metadata.py index e1c699bd..d230124f 100644 --- a/lochness/db/crawlers/metadata.py +++ b/lochness/db/crawlers/metadata.py @@ -40,7 +40,7 @@ def import_metadata_df( optional_notes[column] = row[column] consent_date = pd.to_datetime(row["Consent"]).to_pydatetime() - is_active = row["Active"] == '1' + is_active = row["Active"] == 1 subject = Subject( study_id=study_id, subject_id=row["Subject ID"], From 7643b810350d7fb7f3a05a38a687aa66e5bb9f12 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Mon, 29 Jul 2024 15:40:28 -0400 Subject: [PATCH 18/20] fix: Minor DB schema adjustments Signed-off-by: Dheshan Mohandass --- lochness/db/crawlers/metadata.py | 1 + lochness/db/models/files_audit_log.py | 6 +++--- lochness/db/models/remote_files.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lochness/db/crawlers/metadata.py b/lochness/db/crawlers/metadata.py index d230124f..e98c40c4 100644 --- a/lochness/db/crawlers/metadata.py +++ b/lochness/db/crawlers/metadata.py @@ -23,6 +23,7 @@ def import_metadata_df( Args: metadata_df (pd.DataFrame): The DataFrame containing the columns: 'Subject ID', 'Active', 'Consent', '...'. + Note.: This DataFrame is generally obtained from RPMS or REDCap modules. study_id (str): The study ID. 
""" logger.info(f"Importing metadata for study {study_id}") diff --git a/lochness/db/models/files_audit_log.py b/lochness/db/models/files_audit_log.py index 4362586d..fff7a779 100644 --- a/lochness/db/models/files_audit_log.py +++ b/lochness/db/models/files_audit_log.py @@ -3,6 +3,7 @@ AuditLog Model """ from typing import Dict +from datetime import datetime from lochness import db @@ -27,7 +28,7 @@ def __init__( system: str, action: str, metadata: Dict[str, str], - timestamp: str, + timestamp: datetime, ): """ Initialize an AuditLog object. @@ -107,7 +108,6 @@ def to_sql(self) -> str: system = db.santize_string(self.system) action = db.santize_string(self.action) metadata = db.santize_string(self.metadata) - timestamp = db.santize_string(self.timestamp) sql_query = f""" INSERT INTO audit_log ( @@ -115,7 +115,7 @@ def to_sql(self) -> str: action, metadata, timestamp ) VALUES ( '{source_file}', '{destination_file}', '{system}', - '{action}', '{metadata}', '{timestamp}' + '{action}', '{metadata}', '{self.timestamp}' ); """ diff --git a/lochness/db/models/remote_files.py b/lochness/db/models/remote_files.py index ea77a675..7ca53968 100644 --- a/lochness/db/models/remote_files.py +++ b/lochness/db/models/remote_files.py @@ -63,7 +63,7 @@ def init_table_query() -> str: CREATE TABLE remote_files ( r_file_path TEXT NOT NULL, r_remote_name TEXT NOT NULL, - r_hash_val TEXT NOT NULL, + r_hash_val TEXT, r_last_checked TIMESTAMP NOT NULL, r_remote_metadata JSONB, PRIMARY KEY (r_file_path, r_remote_name) From 7b72d3e4ca23144494bdc86aecdaf946ad3c6984 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Mon, 29 Jul 2024 15:42:48 -0400 Subject: [PATCH 19/20] feat.: Impl. 
Logging mediaflux downloads to DB Signed-off-by: Dheshan Mohandass --- lochness/db/log/__init__.py | 80 ++++++++++++++++++++++++++++++++++ lochness/mediaflux/__init__.py | 42 +++++++++++------- 2 files changed, 107 insertions(+), 15 deletions(-) create mode 100644 lochness/db/log/__init__.py diff --git a/lochness/db/log/__init__.py b/lochness/db/log/__init__.py new file mode 100644 index 00000000..18d0e313 --- /dev/null +++ b/lochness/db/log/__init__.py @@ -0,0 +1,80 @@ +""" +Contains helper functions to log information to DB. +""" + +import logging +from datetime import datetime +from pathlib import Path +from typing import Any, Dict, List, Optional + +from lochness import db +from lochness.db.models.file_mappings import FileMapping +from lochness.db.models.files_audit_log import AuditLog +from lochness.db.models.phoenix_files import PhoenixFiles +from lochness.db.models.remote_files import RemoteFile + +logger = logging.getLogger("lochness.db.log") + + +def log_download( + lochness_config: Dict[str, Any], + remote_file_path: Path, + remote_name: str, + remote_hash: Optional[str], + remote_metadata: Dict[str, Any], + local_file_path: Path, + subject_id: str, + study_id: str, + modality: str, +) -> None: + """ + Logs a download operation to the audit_log table. 
+ + Adds entries in: + - audit_log + - file_mappings + - phoenix_files + - remote_files + """ + + logger.info(f"Logging download of {remote_file_path} to {local_file_path}") + + # Log download to audit_log + audit_log_entry = AuditLog( + source_file=remote_file_path, + destination_file=local_file_path, + action="download", + system=remote_name, + metadata={}, + timestamp=datetime.now(), + ) + + # Log to remote_files + remote_file = RemoteFile( + file_path=remote_file_path, + remote_name=remote_name, + hash_val=remote_hash, + last_checked=datetime.now(), + remote_metadata=remote_metadata, + ) + + # log to phoenix_files + phoenix_file = PhoenixFiles(file_path=local_file_path, with_hash=True) + + # log to file_mappings + file_mapping = FileMapping( + remote_file_path=remote_file_path, + local_file_path=local_file_path, + remote_name=remote_name, + subject_id=subject_id, + study_id=study_id, + modality=modality, + ) + + queries: List[str] = [] + queries.append(audit_log_entry.to_sql()) + queries.append(remote_file.to_sql()) + queries.append(phoenix_file.to_sql()) + queries.append(file_mapping.to_sql()) + + db.execute_queries(lochness_config=lochness_config, queries=queries) diff --git a/lochness/mediaflux/__init__.py b/lochness/mediaflux/__init__.py index 1452f8fb..de040458 100644 --- a/lochness/mediaflux/__init__.py +++ b/lochness/mediaflux/__init__.py @@ -1,25 +1,24 @@ -import os, sys -import gzip +""" +Module to sync data from Mediaflux. 
+""" import logging -import importlib -import lochness +import os +import re import tempfile as tf -import cryptease as crypt -import lochness.net as net -from typing import Generator, Tuple +from distutils.spawn import find_executable +from os.path import basename, dirname, isfile +from os.path import join as pjoin from pathlib import Path -import hashlib -from io import BytesIO -import lochness.keyring as keyring -from os.path import join as pjoin, basename, dirname, isfile +from subprocess import DEVNULL, STDOUT, Popen + import cryptease as enc -import re -from subprocess import Popen, DEVNULL, STDOUT import pandas as pd -from numpy import nan -from distutils.spawn import find_executable + +import lochness +import lochness.keyring as keyring import lochness.tree as tree from lochness.cleaner import is_transferred_and_removed +from lochness.db import log as db_log logger = logging.getLogger(__name__) Module = lochness.lchop(__name__, 'lochness.') @@ -206,6 +205,19 @@ def sync_module(Lochness: 'lochness.config', stdout=DEVNULL, stderr=STDOUT) p.wait() + # log download to DB + db_log.log_download( + lochness_config=Lochness, + remote_file_path=Path(remote), + remote_name='mediaflux', + remote_hash=checksum, + remote_metadata={}, + local_file_path=Path(mf_local) / subpath.name, + subject_id=subject.id, + study_id=study_name, + modality=datatype + ) + # write checksum to local with open(prev_checksum_file, 'w') as fp: fp.write(checksum) From e7b74df6c6b2419e517979bede36d284312c26d9 Mon Sep 17 00:00:00 2001 From: Dheshan Mohandass Date: Mon, 29 Jul 2024 15:47:06 -0400 Subject: [PATCH 20/20] feat.: Avoid spamming SQL to log Signed-off-by: Dheshan Mohandass --- lochness/db/crawlers/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lochness/db/crawlers/metadata.py b/lochness/db/crawlers/metadata.py index e98c40c4..1cc2f425 100644 --- a/lochness/db/crawlers/metadata.py +++ b/lochness/db/crawlers/metadata.py @@ -53,5 +53,5 @@ def 
import_metadata_df( subject_sql = subject.to_sql() queries.append(subject_sql) - db.execute_queries(lochness_config=lochness_config, queries=queries) + db.execute_queries(lochness_config=lochness_config, queries=queries, show_commands=False) logger.info(f"Successfully imported metadata for study {study_id}")