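Fix API problems: reorder cond_block_not_reported arguments, filter 'test' blocks case-insensitively in plots, flatten and normalize records in convert_to_csv, and return (df_all, CATEGORY) from wl_qc/dwl_qc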

miloswrath · miloswrath · commit d77d86dc9fe0 · 2025-11-05T12:12:20.000-06:00
diff --git a/code/data_processing/cc_qc.py b/code/data_processing/cc_qc.py
@@ -86,7 +86,12 @@ def cc_qc(self, df, threshold, TS=False):
             CATEGORY = 2
             print(f"FOR TASK SWITCHING -> Average accuracy at or below 0.5 across conditions and CATEGORY set to 2")
 
-        problematic_conditions = QC_UTILS.cond_block_not_reported(raw, self.ACC_COLUMN_NAME, self.COND_COLUMN_NAME, self.INCORRECT_SYMBOL)
+        problematic_conditions = QC_UTILS.cond_block_not_reported(
+            raw,
+            self.COND_COLUMN_NAME,
+            self.ACC_COLUMN_NAME,
+            self.INCORRECT_SYMBOL,
+        )
 
         if len(problematic_conditions) != 0:
             CATEGORY = 3
diff --git a/code/data_processing/plot_utils.py b/code/data_processing/plot_utils.py
@@ -27,7 +27,25 @@ def af_nf_plot(self, df):
             tuple: A tuple containing two Axes objects (count_plot, response_time_plot).
         """
         # Filter to drop practice data
-        test = df[df['block'] == 'test'].copy()
+        block_series = df['block'].astype(str).str.strip().str.lower()
+        test_mask = block_series == 'test'
+        test = df[test_mask].copy()
+
+        if test.empty:
+            subject_series = df.get('subject_id')
+            if subject_series is not None:
+                subjects = sorted(
+                    {str(value).strip() for value in subject_series.dropna() if str(value).strip()}
+                )
+            else:
+                subjects = []
+
+            unique_blocks = sorted({value for value in block_series.unique() if value and value != 'nan'})
+            raise ValueError(
+                "No 'test' block rows available for plotting. "
+                f"Observed block labels: {unique_blocks or '<none>'}. "
+                f"Subjects in frame: {subjects or '<unknown>'}"
+            )
 
         # Generate count plot
         plt.figure(figsize=(10, 6))
@@ -402,7 +420,27 @@ def fn_plot(self, df):
         Returns:
             tuple: The scatter/box plot and bar chart plot objects.
         """
-        test = df[df['block'] == 'test']
+        block_series = df['block'].astype(str).str.strip().str.lower()
+        test_mask = block_series == 'test'
+        test = df[test_mask].copy()
+
+        if test.empty:
+            subject_series = df.get('subject_id')
+            if subject_series is not None:
+                subjects = sorted(
+                    {str(value).strip() for value in subject_series.dropna() if str(value).strip()}
+                )
+            else:
+                subjects = []
+
+            unique_blocks = sorted({value for value in block_series.unique() if value and value != 'nan'})
+            raise ValueError(
+                "No 'test' block rows available for MEM plotting. "
+                f"Observed block labels: {unique_blocks or '<none>'}. "
+                f"Subjects in frame: {subjects or '<unknown>'}"
+            )
+
+        test['block'] = test['block'].astype(str).str.strip()
         test['correct_label'] = test['correct'].map({0: 'Incorrect', 1: 'Correct'})
 
         # Scatter and box plot
@@ -459,7 +497,27 @@ def sm_plot(self, df):
         Returns:
             The scatter and box plot object.
         """
-        test = df[df['block'] == 'test']
+        block_series = df['block'].astype(str).str.strip().str.lower()
+        test_mask = block_series == 'test'
+        test = df[test_mask].copy()
+
+        if test.empty:
+            subject_series = df.get('subject_id')
+            if subject_series is not None:
+                subjects = sorted(
+                    {str(value).strip() for value in subject_series.dropna() if str(value).strip()}
+                )
+            else:
+                subjects = []
+
+            unique_blocks = sorted({value for value in block_series.unique() if value and value != 'nan'})
+            raise ValueError(
+                "No 'test' block rows available for MEM plotting. "
+                f"Observed block labels: {unique_blocks or '<none>'}. "
+                f"Subjects in frame: {subjects or '<unknown>'}"
+            )
+
+        test['block'] = test['block'].astype(str).str.strip()
         mapping = {'no': 'Incongruent', 'yes': 'Congruent'}
         test['target_congruent'] = test['target_congruent'].map(mapping)
         test['correct_label'] = test['correct'].map({0: 'Incorrect', 1: 'Correct'})
@@ -595,9 +653,6 @@ def dwl_plot(self, df):
 
 
 
-
-
-
 
 
 
diff --git a/code/data_processing/utils.py b/code/data_processing/utils.py
@@ -21,8 +21,15 @@ def convert_to_csv(self, txt_dfs):
                 # If file is empty or has no valid records, skip
                 continue
 
-            # Normalize nested dicts into flat columns so downstream QC stays unchanged
-            flattened_df = pd.json_normalize(records)
+            normalized_records = [
+                self._flatten_record(record) for record in records if isinstance(record, dict)
+            ]
+            if not normalized_records:
+                continue
+
+            flattened_df = pd.json_normalize(normalized_records, sep="_")
+            flattened_df = self._harmonize_columns(flattened_df)
+            flattened_df = self._normalize_semantics(flattened_df)
             new_dfs.append(flattened_df)
 
         return new_dfs
@@ -82,6 +89,117 @@ def _collect_records(self, payload):
 
         return []
 
+    def _flatten_record(self, record):
+        """
+        Collapse one record into a single-level dict. Dicts stored under ``data`` or
+        ``trialdata`` (any letter case) are merged in place; other dicts become ``parent_child`` keys.
+        """
+        flat = {}
+        for key, value in record.items():
+            if not isinstance(value, dict):
+                flat[key] = value
+                continue
+            is_wrapper = key.lower() in {"data", "trialdata"}
+            inner = self._flatten_record(value)
+            if is_wrapper:
+                flat.update(inner)
+            else:
+                flat.update({f"{key}_{name}": item for name, item in inner.items()})
+        return flat
+
+    def _harmonize_columns(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Strip known wrapper prefixes, map aliases to their canonical column name, and
+        merge columns that end up sharing a name (first non-null value per row wins,
+        scanning left to right). The input frame is not modified.
+        """
+        prefixes = ("trialdata_", "data_", "payload_", "TrialData_", "trialData_")
+        canonical_map = {
+            "Block": "block",
+            "BlockName": "block",
+            "blockName": "block",
+            "Block_Type": "block",
+            "block_type": "block",
+            "Condition": "condition",
+            "Cond": "condition",
+            "stim_condition": "condition",
+            "Correct": "correct",
+            "isCorrect": "correct",
+            "Session": "session_number",
+            "session": "session_number",
+            "SessionID": "session_number",
+            "Subject": "subject_id",
+            "subject": "subject_id",
+        }
+
+        def canonical_name(name):
+            # Repeat until stable so stacked prefixes go away in any order (one pass left
+            # "data_trialdata_x" as "trialdata_x"); a bare prefix is never stripped to "".
+            changed = True
+            while changed:
+                changed = False
+                for prefix in prefixes:
+                    if name.startswith(prefix) and len(name) > len(prefix):
+                        name, changed = name[len(prefix):], True
+            return canonical_map.get(name, name)
+
+        harmonized = df.rename(columns=canonical_name)
+        if harmonized.columns.duplicated().any():
+            merged = {}
+            for col in harmonized.columns.unique():
+                # Boolean mask picks each same-named column once; a repeated-label list picks k*k.
+                same_name = harmonized.loc[:, harmonized.columns == col]
+                if same_name.shape[1] > 1:
+                    same_name = same_name.bfill(axis=1)
+                merged[col] = same_name.iloc[:, 0]
+            harmonized = pd.DataFrame(merged)
+        return harmonized
+
+    def _normalize_semantics(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Return a copy of ``df`` with the key columns coerced to a uniform representation.
+
+        ``block`` goes through ``_standardize_block``; ``condition`` strings and non-null
+        ``subject_id`` values are whitespace-stripped (subject ids are also cast to ``str``);
+        ``correct`` and ``session_number`` become numeric, with unparseable values set to NaN.
+        Columns that are absent are skipped, and the input frame is never mutated.
+        """
+        def strip_if_str(val):
+            return val.strip() if isinstance(val, str) else val
+
+        def clean_subject(val):
+            return str(val).strip() if pd.notna(val) else val
+
+        result = df.copy()
+        present = set(result.columns)
+
+        if "block" in present:
+            result["block"] = result["block"].map(self._standardize_block)
+        if "condition" in present:
+            result["condition"] = result["condition"].apply(strip_if_str)
+        # errors="coerce" turns anything non-numeric into NaN instead of raising.
+        for numeric_col in ("correct", "session_number"):
+            if numeric_col in present:
+                result[numeric_col] = pd.to_numeric(result[numeric_col], errors="coerce")
+        if "subject_id" in present:
+            result["subject_id"] = result["subject_id"].apply(clean_subject)
+        return result
+
+    @staticmethod
+    def _standardize_block(value):
+        """Map a raw block label to ``test``/``prac``/NaN; non-strings pass through unchanged."""
+        if not isinstance(value, str):
+            return value
+        label = value.strip().lower()
+        if not label:
+            return np.nan  # blank or whitespace-only label is treated as missing
+        if label.startswith("test"):
+            return "test"
+        # "practice..." already starts with "prac", so one prefix covers both spellings.
+        if label.startswith("prac") or label in {"training", "train"}:
+            return "prac"
+        return label
+
     def save_csv(self):
         return None
 
diff --git a/code/data_processing/wl_qc.py b/code/data_processing/wl_qc.py
@@ -23,7 +23,7 @@ def wl_qc(self, submission, version):
 
         if self.CATEGORY == 3:
             print("One or more conditions are empty, status finalized at 3")
-            return self.CATEGORY
+            return df_all, self.CATEGORY
         # Assuming df_all is the DataFrame and self.CATEGORY exists in the class context
 
         if (df_all['block'] == 'immediate').any():
@@ -48,7 +48,7 @@ def dwl_qc(self, submission, version):
 
         if self.CATEGORY == 3:
             print("One or more conditions are empty, status finalized at 3")
-            return self.CATEGORY
+            return df_all, self.CATEGORY
         # Assuming df_all is the DataFrame and self.CATEGORY exists in the class context
 
         if (df_all['block'] == 'delay').any():
@@ -90,4 +90,3 @@ def dwl_count_correct(df_all):
                .reindex(['delay'], fill_value=0)
                .to_frame().T)  # one row: column 'delay'
         return out
-
diff --git a/code/main_handler.py b/code/main_handler.py
diff --git a/tests/data_processing/test_convert_to_csv.py b/tests/data_processing/test_convert_to_csv.py