
Commit 385792b

Expand/pivot PsychoPy lists-in-cells + various fixes
1 parent: 773da1f

6 files changed: +47 -40 lines

bidscoin/bidseditor.py

Lines changed: 3 additions & 2 deletions
@@ -1507,12 +1507,12 @@ def run2data(self) -> tuple:
 
         # Set up the data for the events table
         df = self.events.logtable
-        events_data['log_table'] = [[{'value': name, 'editable': False} for name in df.columns]] if len(df) else []
+        events_data['log_table'] = [[{'value': name, 'editable': False} for name in df]] if len(df) else []
         for i in range(len(df)):
             events_data['log_table'].append([{'value': value, 'editable': False} for value in df.iloc[i]])
 
         df = self.events.eventstable
-        events_data['table'] = [[{'value': name, 'editable': False} for name in df.columns]] if len(df) else []
+        events_data['table'] = [[{'value': name, 'editable': False} for name in df]] if len(df) else []
         for i in range(len(df)):
             events_data['table'].append([{'value': value, 'editable': False} for value in df.iloc[i]])
 
@@ -1915,6 +1915,7 @@ def reset(self, refresh: bool=False):
         self.fill_table(self.meta_table, meta_data)
         if events_data:
             self.fill_table(self.events_parsing, events_data['parsing'])
+            self.fill_table(self.log_table, events_data['log_table'])
             self.fill_table(self.events_time, events_data['time'])
             self.fill_table(self.events_rows, events_data['rows'])
             self.fill_table(self.events_columns, events_data['columns'])

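Aside: the recurring `.columns` removals throughout this commit all lean on the same pandas idiom: iterating a DataFrame yields its column labels, and `in` tests membership against those labels, so `for name in df` and `sourcecol in df` behave exactly like the `.columns` spellings they replace. A minimal standalone sketch (toy data, not from the repo):

    import pandas as pd

    df = pd.DataFrame({'onset': [0.0, 1.5], 'duration': [0.5, 0.5]})
    print([name for name in df])    # ['onset', 'duration'] -- same as list(df.columns)
    print('onset' in df)            # True -- membership is tested against column labels, not values
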
bidscoin/heuristics/bidsmap_dccn.yaml

Lines changed: 2 additions & 4 deletions
@@ -1161,13 +1161,14 @@ Psychopy:
     events: &psychopy_events
       parsing:                          # The settings to parse the source table from the log file
         table: [long-wide, pivot, 1]    # The raw source table or a pivoted 'onset', 'duration', 'event_type' version
+        expand: scannerPulse.rt         # Expands lists into columns for each array item
       columns:                          # Columns that are included in the output table, i.e. {output column: input column}
       - onset: onset                    # The mapping for the first required column 'onset'
       - duration: duration              # The mapping for the second required column 'duration'
       - event_type: event_type
       rows:
       - condition:                      # Dict(s): key = column name of the log input table, value = fullmatch regular expression to select the rows of interest
-          onset: '\d.*'                 # Select rows with numerical / non-empty onsets
+          event_type: '.*'
       time:
         cols: ['(?i).*time.*', '(?i).*duration.*', '(?i).*onset.*', '(?i).*start.*', '(?i).*stop.*', '.*\.rt']
 
@@ -1247,9 +1248,6 @@ Logdata:
     meta: &free_func_meta
       TaskName:
     events: &free_events
-      rows:                             # Rows that are included in the output table
-      - condition:                      # Dict(s): key = column name of the log input table, value = fullmatch regular expression to select the rows of interest
-          onset: '\d.*'                 # Select rows with numerical / non-empty onsets
       time:
         cols: ['(?i).*time.*', '(?i).*duration.*', '(?i).*onset.*', '(?i).*start.*', '(?i).*stop.*']

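The new `expand` setting targets PsychoPy columns such as `scannerPulse.rt`, where a single cell holds the string representation of a list (e.g. one reaction time per scanner pulse). The plugin parses such cells and spreads the items over numbered columns; in pivot mode it additionally rewrites names like `scannerPulse.rt.started_0` to `scannerPulse.rt_0.started` so every item surfaces as its own event onset. A rough sketch of the core transformation on toy data (the real implementation lives in events2bids.py below):

    import ast
    import pandas as pd

    df = pd.DataFrame({'scannerPulse.rt': ['[0.1, 2.1]', None]})

    # Parse the string representation of each list into an actual Python list
    ds  = df['scannerPulse.rt'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) and x.startswith('[') else [])
    df_ = ds.apply(pd.Series).add_prefix('scannerPulse.rt_')    # One numbered column per array item

    print(df_.columns.tolist())     # ['scannerPulse.rt_0', 'scannerPulse.rt_1']
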
bidscoin/heuristics/bidsmap_sst.yaml

Lines changed: 2 additions & 4 deletions
@@ -1163,13 +1163,14 @@ Psychopy:
     events: &psychopy_events
       parsing:                          # The settings to parse the source table from the log file
         table: [long-wide, pivot, 1]    # The raw source table or a pivoted 'onset', 'duration', 'event_type' version
+        expand: scannerPulse.rt         # Expands lists into columns for each array item
       columns:                          # Columns that are included in the output table, i.e. {output column: input column}
       - onset: onset                    # The mapping for the first required column 'onset'
       - duration: duration              # The mapping for the second required column 'duration'
       - event_type: event_type
       rows:
       - condition:                      # Dict(s): key = column name of the log input table, value = fullmatch regular expression to select the rows of interest
-          onset: '\d.*'                 # Select rows with numerical / non-empty onsets
+          event_type: '.*'
       time:
         cols: ['(?i).*time.*', '(?i).*duration.*', '(?i).*onset.*', '(?i).*start.*', '(?i).*stop.*', '.*\.rt']
 
@@ -1249,9 +1250,6 @@ Logdata:
     meta: &free_func_meta
       TaskName:
     events: &free_events
-      rows:                             # Rows that are included in the output table
-      - condition:                      # Dict(s): key = column name of the log input table, value = fullmatch regular expression to select the rows of interest
-          onset: '\d.*'                 # Select rows with numerical / non-empty onsets
       time:
         cols: ['(?i).*time.*', '(?i).*duration.*', '(?i).*onset.*', '(?i).*start.*', '(?i).*stop.*']

bidscoin/plugins/__init__.py

Lines changed: 10 additions & 10 deletions
@@ -245,33 +245,33 @@ def eventstable(self) -> pd.DataFrame:
         df = self.logtable.copy()       # Ensure we do not change the source data
 
         # Convert the timing values to seconds (with maximally 4 digits after the decimal point)
-        timecols = list(set([col for col in df.columns for pattern in self.time.cols if re.fullmatch(pattern, col)]))
+        timecols = list(set(col for col in df for pattern in self.time.cols if re.fullmatch(pattern, col)))
         df[timecols] = (df[timecols].apply(pd.to_numeric, errors='coerce') / self.time.unit).round(4)
 
         # Take the logtable columns of interest and from now on use the BIDS column names
-        df = df.loc[:, [sourcecol for item in self.columns for sourcecol in item.values() if sourcecol in df.columns]]
-        df.columns = [eventscol for item in self.columns for eventscol, sourcecol in item.items() if sourcecol in df.columns]
-        if 'onset' not in df.columns: df.insert(0, 'onset', None)
-        if 'duration' not in df.columns: df.insert(1, 'duration', None)
+        df = df.loc[:, [sourcecol for item in self.columns for sourcecol in item.values() if sourcecol in df]]
+        df.columns = [eventscol for item in self.columns for eventscol, sourcecol in item.items() if sourcecol in df]
+        if 'onset' not in df: df.insert(0, 'onset', None)
+        if 'duration' not in df: df.insert(1, 'duration', None)
 
         # Set the clock at zero at the start of the experiment
         if self.time.start:
-            start = pd.Series([True] * len(df))
+            start = pd.Series([True] * len(df), index=df.index)
             for column, value in self.time.start.items():
-                if column in self.logtable.columns:
+                if column in self.logtable:
                     start &= (self.logtable[column].astype(str) == str(value))
             if start.any():
                 LOGGER.bcdebug(f"Resetting clock offset: {df['onset'][start].iloc[0]}")
                 df['onset'] -= df['onset'][start].iloc[0]   # Take the time of the first occurrence as zero
 
         # Loop over the row groups to filter/edit the rows
-        rows = pd.Series([len(self.rows) == 0] * len(df))   # All rows are True if no row expressions were specified
+        rows = pd.Series([len(self.rows) == 0] * len(df), index=df.index)   # All rows are True if no row expressions were specified
         for group in self.rows:         # Within a group the expressions are AND'ed, between groups they are OR'ed
 
-            rowgroup = pd.Series([True] * len(df))
+            rowgroup = pd.Series([True] * len(df), index=df.index)
             for column, pattern in (group.get('condition') or {}).items():
 
-                if column not in self.logtable.columns:
+                if column not in self.logtable:
                     LOGGER.bcdebug(f"Unknown condition column: {column}")
                     continue

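The `index=df.index` additions are more than cosmetic: after the new expand/pivot steps the log table no longer carries a default 0..n-1 RangeIndex, and a boolean mask built without the matching index cannot be used to select or combine rows. A small sketch of the failure mode (toy data, not from the repo):

    import pandas as pd

    df = pd.DataFrame({'onset': [1.0, 2.0, 3.0]}, index=[2, 5, 7])   # Non-default index, as after pivoting

    bad  = pd.Series([True] * len(df))                    # Index 0, 1, 2 -- does not align with df
    good = pd.Series([True] * len(df), index=df.index)    # Index 2, 5, 7 -- aligns with df

    print(df[good])    # Selects all three rows; df[bad] raises an unalignable-indexer IndexingError
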
bidscoin/plugins/events2bids.py

Lines changed: 29 additions & 19 deletions
@@ -1,5 +1,5 @@
 """The events2bids plugin converts neurobs Presentation logfiles to event.tsv files"""
-
+import ast
 import logging
 import json
 import dateutil.parser
 
@@ -310,46 +310,56 @@ def __init__(self, sourcefile: Path, data: dict, options: dict):
         else:
             LOGGER.debug(f"Cannot read/parse {sourcefile}")
             self._sourcetable = pd.DataFrame()
-        self._sourcecols = self._sourcetable.columns
-        """Store the original column names"""
 
     @property
     def logtable(self) -> pd.DataFrame:
         """Returns the Psychopy log-table"""
 
+        table = self.parsing.get('table', ['long-wide', 'pivot', 1])
+        table = table[table[-1]]
+
         # Start with a fresh data frame
-        df = self._sourcetable
-        df.columns = self._sourcecols
+        df = self._sourcetable.copy()
         if not len(df):
             return df
 
+        # Expand the array items
+        try:
+            for expand in set(col for col in df if re.fullmatch(self.parsing.get('expand') or '', col)):
+                ds  = df[expand].apply(lambda x: ast.literal_eval(x) if isinstance(x,str) and x.startswith('[') else [])    # Convert string representation of lists into actual Python lists
+                df_ = ds.apply(pd.Series).add_prefix(f"{expand}{'.started' if '.' in expand and table=='pivot' else ''}_")  # Append `.started` to pivot the data into the onset column
+                if '.' in expand:                                               # Time columns should have a `.` in their name
+                    df_ = df_.rename(columns=lambda col: re.sub(r'(.*)\.(\w+)_(\d+)', r'\1_\3.\2', col))                    # Put e.g. `.rt` or `.started` back at the end
+                if not df_.empty:
+                    df = pd.concat([df.drop(columns=[expand]), df_], axis=1)
+        except re.error as pattern_error:
+            LOGGER.warning(f"The expand pattern {self.parsing.get('expand')} is invalid\n{pattern_error}")
+
         # Use the raw source data
-        table = self.parsing.get('table', ['long-wide', 'pivot', 1])
-        table = table[table[-1]]
         if table == 'long-wide':
             pass
 
         # Create a pivoted dataframe with 'onset', 'duration' and 'event_type' columns
         elif table == 'pivot':
 
-            df_piv = pd.DataFrame(columns=['onset', 'duration', 'event_type'])
-
             # Extract event column names without '.started' suffixes
-            events = sorted(set(col.split('.')[0] for col in df.columns if '.started' in col))
+            events = set(col.rsplit('.',1)[0] for col in df if col.endswith('.started'))
 
             # Create new DataFrame with 'onset', 'duration', and 'event_type'
+            df_piv = pd.DataFrame(columns=['onset', 'duration', 'event_type'])      # Collects all pivoted event data
             for event in events:
-                onset = df[(started := f"{event}.started")]                         # Get the onset times
-                if (stopped := f"{event}.stopped") in df.columns:
+                onset = df[(started := f"{event}.started")]                         # Get the onset times
+                if (stopped := f"{event}.stopped") in df:
                     duration = df[stopped] - df[started]
                 else:
-                    duration = pd.Series([float('nan')] * len(df))                  # Use NaN for missing `.stopped`
-                event_type = [event] * len(df)                                      # Store the event name
-                timecols = list(set([col for col in df.columns for pattern in self.time.cols if re.fullmatch(pattern, col)
-                                     and col not in df_piv.columns and not col.endswith(('.started', '.stopped'))]))
-                df_piv = pd.concat([df_piv.dropna(axis=1, how='all'),
-                                    pd.DataFrame({'onset': onset, 'duration': duration, 'event_type': event_type}).dropna(axis=1, how='all'),
-                                    df[timecols].dropna(axis=1, how='all')], ignore_index=True)
+                    duration = pd.Series([float('nan')] * len(df), index=df.index)
+                df_piv_ = pd.DataFrame({'onset': onset, 'duration': duration, 'event_type': [event]*len(df)}, index=df.index).dropna(subset=['onset'])
+                df_misc = df.filter(regex=r'^(?!.*\.(started|stopped)$)').loc[df_piv_.index,:]  # Drop all columns that end with '.started', '.stopped'
+                if not df_piv_.empty:                                               # Only concatenate if df_piv_ has data
+                    if df_piv.empty:
+                        df_piv = pd.concat([df_piv_, df_misc], axis=1)              # Re-initialize df_piv / avoid future warnings below about concatenating empty frames
+                    else:
+                        df_piv = pd.concat([df_piv, pd.concat([df_piv_, df_misc], axis=1)])
             df = df_piv.sort_values(by='onset')
 
         else:

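The subtlest line in the expansion code is the `re.sub` rename. The expanded columns are first prefixed as e.g. `scannerPulse.rt.started_0` (pivot mode) or `scannerPulse.rt_0` (long-wide mode), and the substitution then moves the dotted suffix back behind the item number so that the `.started` endings feed the pivot logic and `.rt` endings still match the `.*\.rt` time-column patterns. Checking the pattern in isolation:

    import re

    for col in ('scannerPulse.rt.started_0', 'scannerPulse.rt_3'):
        print(re.sub(r'(.*)\.(\w+)_(\d+)', r'\1_\3.\2', col))
    # scannerPulse.rt_0.started   (pivot mode: each pulse becomes its own event onset)
    # scannerPulse_3.rt           (long-wide mode: `.rt` stays at the end for the time columns)
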
bidscoin/utilities/physio.py

Lines changed: 1 addition & 1 deletion
@@ -371,7 +371,7 @@ def physio2tsv(physio: dict, tsvfile: Union[str, Path]):
 
     # Add each trace to a data table and save the table as a BIDS-compliant gzipped tsv file
     physiotable = pd.DataFrame(columns=[key for key in physio if key not in ('UUID','ScanDate','Freq','SliceMap','ACQ','Meta')])
-    for key in physiotable.columns:
+    for key in physiotable:
         physiotable[key] = physio[key]
     LOGGER.verbose(f"Writing physiological traces to: '{tsvfile}'")
     physiotable.to_csv(tsvfile, header=False, index=False, sep='\t', compression='infer')
