remade the db and db_utils functions

miloswrath · miloswrath · commit 45eb18391923 · 2025-02-24T10:02:48.000-06:00
diff --git a/app/db.py b/app/db.py
@@ -13,38 +13,58 @@ def initialize_schema(connection):
     try:
         with connection.cursor() as cursor:
             cursor.execute("""
-            CREATE TABLE IF NOT EXISTS study (
-                id SERIAL PRIMARY KEY,
-                name VARCHAR(50) UNIQUE NOT NULL
-            );
-
-            CREATE TABLE IF NOT EXISTS site (
-                id SERIAL PRIMARY KEY,
-                name VARCHAR(50) NOT NULL,
-                study_id INT REFERENCES study(id) ON DELETE CASCADE
-            );
-
-            CREATE TABLE IF NOT EXISTS subject (
-                id SERIAL PRIMARY KEY,
-                name VARCHAR(50) NOT NULL,
-                site_id INT REFERENCES site(id) ON DELETE CASCADE
-            );
-
-            CREATE TABLE IF NOT EXISTS task (
-                id SERIAL PRIMARY KEY,
-                name VARCHAR(50) NOT NULL,
-                subject_id INT REFERENCES subject(id) ON DELETE CASCADE
-            );
-
-            CREATE TABLE IF NOT EXISTS session (
-                id SERIAL PRIMARY KEY,
-                session_name VARCHAR(50) NOT NULL,
-                category INT NOT NULL,
-                csv_path TEXT,
-                plot_paths TEXT[],
-                task_id INT REFERENCES task(id) ON DELETE CASCADE,
-                date TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL
-            );
+                -- Drop existing tables in reverse dependency order.
+                DROP TABLE IF EXISTS session CASCADE;
+                DROP TABLE IF EXISTS task CASCADE;
+                DROP TABLE IF EXISTS subject CASCADE;
+                DROP TABLE IF EXISTS site CASCADE;
+                DROP TABLE IF EXISTS study CASCADE;
+
+                -- Create table "study"
+                CREATE TABLE study (
+                    id SERIAL PRIMARY KEY,
+                    name TEXT NOT NULL UNIQUE
+                );
+
+                -- Create table "site"
+                CREATE TABLE site (
+                    id SERIAL PRIMARY KEY,
+                    name TEXT NOT NULL,
+                    study_id INTEGER NOT NULL,
+                    UNIQUE (name, study_id),
+                    FOREIGN KEY (study_id) REFERENCES study(id) ON DELETE CASCADE
+                );
+
+                -- Create table "subject"
+                CREATE TABLE subject (
+                    id SERIAL PRIMARY KEY,
+                    name TEXT NOT NULL,
+                    site_id INTEGER NOT NULL,
+                    UNIQUE (name, site_id),
+                    FOREIGN KEY (site_id) REFERENCES site(id) ON DELETE CASCADE
+                );
+
+                -- Create table "task"
+                CREATE TABLE task (
+                    id SERIAL PRIMARY KEY,
+                    name TEXT NOT NULL,
+                    subject_id INTEGER NOT NULL,
+                    UNIQUE (name, subject_id),
+                    FOREIGN KEY (subject_id) REFERENCES subject(id) ON DELETE CASCADE
+                );
+
+                -- Create table "session"
+                CREATE TABLE session (
+                    id SERIAL PRIMARY KEY,
+                    session_name TEXT NOT NULL,
+                    category INTEGER NOT NULL,
+                    csv_path TEXT NOT NULL,
+                    task_id INTEGER NOT NULL,
+                    date TIMESTAMP,
+                    plot_paths TEXT[],
+                    FOREIGN KEY (task_id) REFERENCES task(id) ON DELETE CASCADE,
+                    UNIQUE (session_name, category, csv_path, task_id)
+                );
             """)
             connection.commit()
     except Exception as e:
@@ -131,18 +151,17 @@ def populate_database(connection, data_folder):
 
 # Main entry point
 if __name__ == "__main__":
-    db_name = "boostbeh"
+    db_name = "boost-beh"
     user = "zakg04"
     password = "*mIloisfAT23*123*"
     data_folder = "../data"
     connection = connect_to_db(db_name, user, password)
+    try:
+        initialize_schema(connection)  # NOTE(review): this version drops and recreates every table, so existing rows are lost
+    finally:
+        connection.close()  # runs even if schema creation raises, so the connection is always released
+'''
     util_instance = DatabaseUtils(connection, data_folder)
     util_instance.update_database()
 
-    """conn = connect_to_db(db_name, user, password)
-    try:
-        initialize_schema(conn)
-        populate_database(conn, data_folder)
-        print("Database initialized and populated successfully.")
-    finally:
-        conn.close()"""
+'''
diff --git a/app/main/__pycache__/update_db.cpython-39.pyc b/app/main/__pycache__/update_db.cpython-39.pyc
diff --git a/app/main/update_db.py b/app/main/update_db.py
@@ -1,62 +1,100 @@
 import os
 import logging
-import psycopg
+import psycopg  # PostgreSQL database adapter
 import pandas as pd
 from datetime import datetime
 
 class DatabaseUtils:
     def __init__(self, connection, data_folder):
+        """
+        Store the database connection and the data folder used by the update methods.
+
+        :param connection: Open database connection; the methods call cursor() and commit() on it.
+        :param data_folder: Root directory scanned by update_database(); its subdirectories are treated as studies.
+        """
         self.connection = connection
         self.data_folder = data_folder
 
     def update_database(self):
+        """
+        Walk data_folder as study/site/subject/task directories, registering each level through
+        _add_or_get_id and processing each task's data and plot folders. Commits once per study.
+        """
         logging.info("Starting database update.")
 
+        # Each directory directly under data_folder is treated as a study; other entries are skipped
         for study_name in os.listdir(self.data_folder):
             study_path = os.path.join(self.data_folder, study_name)
             if not os.path.isdir(study_path):
                 logging.warning(f"Skipping non-directory: {study_path}")
                 continue
 
+            # Studies are keyed by name alone
             study_id = self._add_or_get_id("study", {"name": study_name})
 
+            # Each directory directly under the study folder is treated as a site
             for site_name in os.listdir(study_path):
                 site_path = os.path.join(study_path, site_name)
                 if not os.path.isdir(site_path):
                     logging.warning(f"Skipping non-directory: {site_path}")
                     continue
 
+                # Sites are keyed by name together with the parent study ID
                 site_id = self._add_or_get_id("site", {"name": site_name, "study_id": study_id})
 
+                # Each directory directly under the site folder is treated as a subject
                 for subject_name in os.listdir(site_path):
                     subject_path = os.path.join(site_path, subject_name)
                     if not os.path.isdir(subject_path):
                         logging.warning(f"Skipping non-directory: {subject_path}")
                         continue
 
+                    # Subject names are expected to be four-digit numbers, so numeric names are zero-padded.
+                    try:
+                        # int() raises ValueError for non-numeric names; those are logged and kept unchanged.
+                        int(subject_name)
+                        subject_name = subject_name.zfill(4)
+                    except ValueError:
+                        logging.warning(f"Subject name {subject_name} is not numeric; saving as-is.")
+
+                    # Subjects are keyed by the (possibly padded) name together with the parent site ID
                     subject_id = self._add_or_get_id("subject", {"name": subject_name, "site_id": site_id})
 
+                    # Each directory directly under the subject folder is treated as a task
                     for task_name in os.listdir(subject_path):
                         task_path = os.path.join(subject_path, task_name)
                         if not os.path.isdir(task_path):
                             logging.warning(f"Skipping non-directory: {task_path}")
                             continue
 
+                        # Tasks are keyed by name together with the parent subject ID
                         task_id = self._add_or_get_id("task", {"name": task_name, "subject_id": subject_id})
 
+                        # Process the CSV files in the task's "data" subfolder
                         self._process_data_folder(task_path, task_id)
+                        # Process the PNG files in the task's "plot" subfolder
                         self._process_plot_folder(task_path, task_id)
 
+            # Commit once per study, after all of its sites, subjects and tasks have been processed
             self.connection.commit()
             logging.info("Database committed.")
 
         logging.info("Database update complete.")
 
     def _add_or_get_id(self, table, values):
-        placeholders = ', '.join([f"{key} = %s" for key in values.keys()])
+        """
+        Adds a new entry to the specified table or retrieves the existing entry's ID.
+
+        :param table: The name of the database table.
+        :param values: A dictionary containing column names and their values.
+        :return: The ID of the existing or newly inserted row.
+        """
+        # Build a WHERE clause for checking existing rows
+        placeholders = ' AND '.join([f"{key} = %s" for key in values.keys()])
         columns = ', '.join(values.keys())
         values_list = list(values.values())
 
+        # SQL query to insert a new record, avoiding conflicts
         query = f"""
             INSERT INTO {table} ({columns}) 
             VALUES ({', '.join(['%s'] * len(values))}) 
@@ -69,33 +107,42 @@ def _add_or_get_id(self, table, values):
             if result:
                 return int(result[0])
 
+            # If no ID is returned, retrieve the existing record's ID.
             select_query = f"SELECT id FROM {table} WHERE {placeholders};"
             cursor.execute(select_query, values_list)
             return int(cursor.fetchone()[0])
 
     def _process_data_folder(self, task_path, task_id):
+        """
+        Processes CSV files in the "data" folder within a task directory and inserts session records into the database.
+
+        :param task_path: Path to the task directory.
+        :param task_id: ID of the corresponding task in the database.
+        """
         data_folder_path = os.path.join(task_path, "data")
         if os.path.exists(data_folder_path):
             for file in os.listdir(data_folder_path):
                 if file.endswith(".csv"):
                     logging.debug(f"Processing file: {file}")
                     try:
+                        # Extract session and category information from the filename
                         parts = file.split("_")
                         if len(parts) < 3:
                             raise ValueError(f"Unexpected file format: {file}")
 
-                        session_name = parts[1].split("-")[1]  # Ensure split works correctly
-                        category = int(parts[2].split("-")[1].split(".")[0])
+                        session_name = parts[1].split("-")[1]  # Extract session name
+                        category = int(parts[2].split("-")[1].split(".")[0])  # Extract category
                         csv_path = os.path.join(data_folder_path, file)
 
-                        # Extract and clean date from CSV if column 'datetime' exists
+                        # Extract and clean the date from the CSV if it contains a 'datetime' column
                         date = None
                         df = pd.read_csv(csv_path)
                         if 'datetime' in df.columns:
                             raw_date = str(df['datetime'].iloc[0])
                             date = self._clean_date(raw_date)
-                        del df
+                        del df  # Free up memory
 
+                        # Insert session data into the database
                         with self.connection.cursor() as cursor:
                             cursor.execute(
                                 """
@@ -110,6 +157,12 @@ def _process_data_folder(self, task_path, task_id):
                         logging.error(f"Error processing file {file}: {e}")
 
     def _process_plot_folder(self, task_path, task_id):
+        """
+        Processes PNG image files in the "plot" folder and updates the session record with plot file paths.
+
+        :param task_path: Path to the task directory.
+        :param task_id: ID of the corresponding task in the database.
+        """
         plot_folder_path = os.path.join(task_path, "plot")
         if os.path.exists(plot_folder_path):
             plots = [os.path.join(plot_folder_path, f) for f in os.listdir(plot_folder_path) if f.endswith(".png")]
@@ -126,14 +179,19 @@ def _process_plot_folder(self, task_path, task_id):
                 logging.debug(f"Plots updated for task {task_id}: {plots}")
 
     def _clean_date(self, raw_date):
+        """
+        Converts a raw date string into a standardized format.
+
+        :param raw_date: Date string extracted from a CSV file.
+        :return: Standardized date string or None if parsing fails.
+        """
         import re
-        """Converts raw date strings into a standardized format."""
         try:
-            # Remove timezone information in parentheses, if any
+            # Remove timezone information enclosed in parentheses
             cleaned_raw_date = re.sub(r"\s\(.*?\)", "", raw_date)
-            # Parse the cleaned date string
+            # Parse the cleaned date string into a datetime object
             clean_date = datetime.strptime(cleaned_raw_date, "%a %b %d %Y %H:%M:%S %Z%z")
-            # Standardize to SQL-compatible format
+            # Convert to SQL-compatible format
             return clean_date.strftime("%Y-%m-%d %H:%M:%S")
         except ValueError as e:
             logging.error(f"Error parsing date: {raw_date} - {e}")