
Commit 407b957

Bugfixes to deal with duplicate column names
1 parent b1f92cc commit 407b957

File tree

2 files changed: +55 −33 lines changed

bidscoin/bids.py

Lines changed: 24 additions & 20 deletions
@@ -102,15 +102,14 @@ def eventstable(self) -> pd.DataFrame:
         if not self.isvalid:
             return pd.DataFrame()
 
-        # Take the columns of interest from the logtable and rename them
         df = copy.deepcopy(self.logtable)
 
         # Convert the timing values to seconds (with maximally 4 digits after the decimal point)
         df[self.time['cols']] = (df[self.time['cols']].apply(pd.to_numeric, errors='coerce') / self.time['unit']).round(4)
 
-        # Take the columns of interest and from now on use the BIDS column names
-        df = df.loc[:, [name for item in self.columns for name in item.values()]]
-        df.columns = [name for item in self.columns for name in item.keys()]
+        # Take the logtable columns of interest and from now on use the BIDS column names
+        df = df.loc[:, [sourcecol for item in self.columns for sourcecol in item.values() if sourcecol]]
+        df.columns = [eventscol for item in self.columns for eventscol, sourcecol in item.items() if sourcecol]
 
         # Set the clock at zero at the start of the experiment
         if self.time.get('start'):
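The reworked comprehensions skip mappings whose source column is empty, so unmapped BIDS columns no longer pull missing (or duplicate) names into the selection. A minimal sketch of the selection logic, using a hypothetical self.columns-style mapping:

    # Hypothetical bidsmap-style mapping of BIDS events columns to source columns
    columns = [{'onset': 'Time'}, {'duration': 'Duration'}, {'response_time': ''}]

    sourcecols = [sourcecol for item in columns for sourcecol in item.values() if sourcecol]
    eventscols = [eventscol for item in columns for eventscol, sourcecol in item.items() if sourcecol]

    print(sourcecols)   # ['Time', 'Duration']  -> taken from the logtable
    print(eventscols)   # ['onset', 'duration'] -> 'response_time' is dropped, not left dangling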
@@ -121,22 +120,22 @@ def eventstable(self) -> pd.DataFrame:
             df['onset'] = df['onset'] - df['onset'][start].iloc[0]      # Take the time of the first occurrence as zero
 
         # Loop over the row groups to filter/edit the rows
-        rows = pd.Series([len(self.rows) == 0] * len(df)).astype(bool)  # Series with True values if no row expressions were specified
+        rows = pd.Series([len(self.rows) == 0] * len(df)).astype(bool)  # Boolean series with True values if no row expressions were specified
         for group in self.rows:
 
             for column, regex in group['include'].items():
 
-                # Get the rows that match the expression
+                # Get the rows that match the expression, i.e. make them True
                 rowgroup = self.logtable[column].astype(str).str.fullmatch(str(regex))
 
                 # Add the matching rows to the grand rows group
-                rows |= rowgroup
+                rows |= rowgroup.values
 
                 # Write the value(s) of the matching rows
-                for newcolumn, newvalue in (group.get('cast') or {}).items():
-                    df.loc[rowgroup, newcolumn] = newvalue
+                for colname, values in (group.get('cast') or {}).items():
+                    df.loc[rowgroup, colname] = values
 
-        return df.loc[rows].sort_values(by='onset')
+        return df.loc[rows.values].sort_values(by='onset')
 
     @property
     def columns(self) -> List[dict]:
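The .values additions are the heart of this hunk: pandas aligns Series operations and boolean .loc indexers on index labels, so a mask built with a fresh RangeIndex silently mismatches a logtable whose index no longer starts at zero (e.g. after the header slicing in events2bids.py). Converting to a bare numpy array makes the combination positional instead. A small sketch of the failure mode, with hypothetical data:

    import pandas as pd

    # A logtable whose index does not start at 0, as happens after slicing off header rows
    logtable = pd.DataFrame({'Event Type': ['Pulse', 'Response', 'Pulse']}, index=[5, 6, 7])

    rows     = pd.Series([False] * len(logtable))                         # Fresh RangeIndex 0..2
    rowgroup = logtable['Event Type'].astype(str).str.fullmatch('Pulse')  # Index 5..7

    print((rows | rowgroup).tolist())         # Label-aligned: 6 all-False entries -> matches are lost
    print((rows | rowgroup.values).tolist())  # Positional: [True, False, True] as intended

    # The same applies to boolean .loc indexing: logtable.loc[rows] raises an
    # "Unalignable boolean Series" error, whereas a plain numpy mask works positionally
    print(logtable.loc[(rows | rowgroup.values).values])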
@@ -177,30 +176,35 @@ def is_float(s):
                 return False
 
         if not (valid := len(self.columns) >= 2):
-            LOGGER.warning(f"Events table must have at least two columns, got {len(self.columns)} instead")
+            LOGGER.warning(f"Events table must have at least two columns, got {len(self.columns)} instead\n{self}")
             return False
 
         if (key := [*self.columns[0].keys()][0]) != 'onset':
-            LOGGER.warning(f"First events column must be named 'onset', got '{key}' instead")
+            LOGGER.warning(f"First events column must be named 'onset', got '{key}' instead\n{self}")
            valid = False
 
         if (key := [*self.columns[1].keys()][0]) != 'duration':
-            LOGGER.warning(f"Second events column must be named 'duration', got '{key}' instead")
+            LOGGER.warning(f"Second events column must be named 'duration', got '{key}' instead\n{self}")
             valid = False
 
         if len(self.time.get('cols',[])) < 2:
-            LOGGER.warning(f"Events table must have at least two timecol items, got {len(self.time.get('cols',[]))} instead")
+            LOGGER.warning(f"Events table must have at least two timecol items, got {len(self.time.get('cols',[]))} instead\n{self}")
             return False
 
         elif not is_float(self.time.get('unit')):
-            LOGGER.warning(f"Time conversion factor must be a float, got '{self.time.get('unit')}' instead")
+            LOGGER.warning(f"Time conversion factor must be a float, got '{self.time.get('unit')}' instead\n{self}")
             valid = False
 
+        # Check if the logtable has existing and unique column names
+        df = self.logtable
         for name in set([name for item in self.columns for name in item.values()] + [name for item in self.rows for name in item['include'].keys()] +
                         [*self.time.get('start',{}).keys()] + self.time.get('cols',[])):
-            if name not in self.logtable:
-                LOGGER.warning(f"Column '{name}' not found in the event table of {self.sourcefile}")
+            if name and name not in df:
+                LOGGER.warning(f"Column '{name}' not found in the event table of {self}")
                 valid = False
+        if not df.columns[df.columns.duplicated()].empty:
+            LOGGER.warning(f"Duplicate columns found in: {df.columns}\n{self}")
+            valid = False
 
         return valid
 
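The new validity check relies on pandas.Index.duplicated(), which flags the second and later occurrences of a repeated label. A minimal sketch with a hypothetical header:

    import pandas as pd

    df = pd.DataFrame([[1, 2, 3]], columns=['Time', 'Code', 'Time'])

    duplicates = df.columns[df.columns.duplicated()]
    print(duplicates.tolist())      # ['Time'] -> the second 'Time' is flagged
    if not duplicates.empty:
        print(f"Duplicate columns found in: {df.columns}")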
@@ -677,7 +681,7 @@ def check(self, checks: Tuple[bool, bool, bool]=(False, False, False)) -> Tuple[
         for ext in ('.tsv', '.tsv.gz'):     # NB: `ext` used to be '.json', which is more generic (but see https://github.com/bids-standard/bids-validator/issues/2113)
             if run_suffixok := bids_validator.BIDSValidator().is_bids(f"/sub-unknown/{datatype}/{bidsname}{ext}"): break   # NB: Using the BIDSValidator sounds nice but doesn't give any control over the BIDS-version
         run_valsok = run_suffixok
-        LOGGER.bcdebug(f"bidsname={run_suffixok}: /sub-unknown/{datatype}/{bidsname}.*")
+        LOGGER.bcdebug(f"bidsname (suffixok={run_suffixok}): /sub-unknown/{datatype}/{bidsname}.*")
 
         if checks[0] and run_keysok in (None, False):
             LOGGER.bcdebug(f'Invalid "{run_keysok}" key-checks in run-item: "{bids["suffix"]}" ({datatype} -> {provenance})\nRun["bids"]:\t{bids}')
@@ -1100,8 +1104,8 @@ def __init__(self, yamlfile: Path, folder: Path=templatefolder, plugins: Iterabl
            module = bcoin.import_plugin(plugin)
            if not self.plugins.get(plugin):
                LOGGER.info(f"Adding default bidsmap options from the {plugin} plugin")
-                self.plugins[plugin] = module.OPTIONS if 'OPTIONS' in dir(module) else {}
-            if 'BIDSMAP' in dir(module) and yamlfile.parent == templatefolder:
+                self.plugins[plugin] = module.OPTIONS if hasattr(module, 'OPTIONS') else {}
+            if hasattr(module, 'BIDSMAP') and yamlfile.parent == templatefolder:
                for dataformat, datasection in module.BIDSMAP.items():
                    if dataformat not in bidsmap_data:
                        LOGGER.info(f"Adding default bidsmappings from the {plugin} plugin")
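As a side cleanup, the 'NAME' in dir(module) idiom is swapped for the equivalent but idiomatic hasattr(), which avoids building and scanning the full attribute list. A quick sketch with a hypothetical stand-in module:

    import types

    module = types.ModuleType('demo_plugin')    # Hypothetical stand-in for an imported plugin
    module.OPTIONS = {'table': 'event'}

    assert ('OPTIONS' in dir(module)) == hasattr(module, 'OPTIONS')     # Same outcome
    options = module.OPTIONS if hasattr(module, 'OPTIONS') else {}      # The new form
    print(options)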

bidscoin/plugins/events2bids.py

Lines changed: 31 additions & 13 deletions
@@ -249,33 +249,51 @@ def __init__(self, sourcefile: Path, _data: dict, options: dict):
     def logtable(self) -> pd.DataFrame:
         """Returns a Presentation log-table"""
 
-        nrows           = len(self._sourcetable)
-        stimulus_header = (self._sourcetable.iloc[:, 0] == 'Event Type').idxmax() or nrows
-        video_header    = (self._sourcetable.iloc[:, 0] == 'filename').idxmax() or nrows
-        survey_header   = (self._sourcetable.iloc[:, 0] == 'Time').idxmax() or nrows
-
-        # Keep only the event, stimulus, video or survey table
-        self._sourcetable.columns = self._columns
+        df              = self._sourcetable
+        nrows           = len(df)
+        stimulus_header = (df.iloc[:, 0] == 'Event Type').idxmax() or nrows
+        video_header    = (df.iloc[:, 0] == 'filename').idxmax() or nrows
+        survey_header   = (df.iloc[:, 0] == 'Time').idxmax() or nrows
+
+        # Get the row indices to slice the event, stimulus, video or survey table
+        df.columns = self._columns
         if self.options['table'] == 'event':
             begin = 0
             end   = min(stimulus_header, video_header, survey_header)
         elif self.options['table'] == 'stimulus':
-            self._sourcetable.columns = self._sourcetable.iloc[stimulus_header]
+            df.columns = df.iloc[stimulus_header]
             begin = stimulus_header + 1
             end   = min(video_header, survey_header)
         elif self.options['table'] == 'video':
-            self._sourcetable.columns = self._sourcetable.iloc[video_header]
+            df.columns = df.iloc[video_header]
             begin = video_header + 1
             end   = survey_header
         elif self.options['table'] == 'survey':
-            self._sourcetable.columns = self._sourcetable.iloc[survey_header]
+            df.columns = df.iloc[survey_header]
             begin = survey_header + 1
             end   = nrows
         else:
             begin = 0
             end   = nrows
             LOGGER.error(f"NOT IMPLEMENTED TABLE: {self.options['table']}")
 
-        LOGGER.bcdebug(f"Slicing '{self.options['table']}' sourcetable[{begin}:{end}]")
-
-        return self._sourcetable.iloc[begin:end]
+        LOGGER.bcdebug(f"Slicing '{self.options['table']}{df.shape}' sourcetable[{begin}:{end}]")
+
+        # Ensure unique column names by renaming columns with NaN or empty names and by appending suffixes to duplicate names
+        cols = []   # The new column names
+        dupl = {}   # The duplicate index number
+        for i, col in enumerate(df.columns):
+            if pd.isna(col) or col == '':   # Check if the column name is NaN or an empty string
+                cols.append(new_col := f"unknown_{i}")
+                LOGGER.info(f"Renaming empty column name at index {i}: {col} -> {new_col}")
+            elif col in dupl:               # If duplicate, append the index number
+                dupl[col] += 1
+                cols.append(new_col := f"{col}_{dupl[col]}")
+                LOGGER.info(f"Renaming duplicate column name: {col} -> {new_col}")
+            else:                           # First occurrence of the column name, add it to dupl
+                dupl[col] = 0
+                cols.append(col)
+        df.columns = cols
+
+        # Return the sliced table
+        return df.iloc[begin:end]
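The renaming loop guarantees unique, non-empty column names before the table reaches eventstable(). A standalone sketch of the scheme on a hypothetical Presentation header:

    import pandas as pd

    header = ['Time', '', 'Time', float('nan'), 'Code']     # Hypothetical raw column names

    cols, dupl = [], {}
    for i, col in enumerate(header):
        if pd.isna(col) or col == '':       # NaN or empty -> 'unknown_<index>'
            cols.append(f"unknown_{i}")
        elif col in dupl:                   # Repeat -> append a running suffix
            dupl[col] += 1
            cols.append(f"{col}_{dupl[col]}")
        else:                               # First occurrence is kept as-is
            dupl[col] = 0
            cols.append(col)

    print(cols)     # ['Time', 'unknown_1', 'Time_1', 'unknown_3', 'Code']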
