Skip to content

Commit fecfb26

Browse files
committed
Count only the first hit for subject or tissue sample within a filename
Also added an assertion so we do not count incorrectly. But maybe it should be just a warning? Closes #172
1 parent b310e3e commit fecfb26

File tree

1 file changed

+11
-5
lines changed

1 file changed

+11
-5
lines changed

dandischema/metadata.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -311,13 +311,17 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None:
311311
stats = _get_samples(value, stats, hierarchy)
312312
break
313313

314+
# which components were already found, so we do not count more than
315+
# once in some incorrectly named datasets
316+
found = {}
314317
for part in Path(assetmeta["path"]).name.split(".")[0].split("_"):
315-
if part.startswith("sub-"):
316-
subject = part.replace("sub-", "")
318+
if found.get("subject") and part.startswith("sub-"):
319+
found["subject"] = subject = part.split("sub-", 1)[1]
317320
if subject not in stats["subjects"]:
318321
stats["subjects"].append(subject)
319-
if part.startswith("sample-"):
320-
sample = part.replace("sample-", "")
322+
found.add("subject")
323+
if not found.get("sample") and part.startswith("sample-"):
324+
found["sample"] = sample = part.replace("sample-", "")
321325
if sample not in stats["tissuesample"]:
322326
stats["tissuesample"].append(sample)
323327

@@ -338,10 +342,12 @@ def aggregate_assets_summary(metadata: Iterable[Dict[str, Any]]) -> dict:
338342
stats: _stats_type = {}
339343
for meta in metadata:
340344
_add_asset_to_stats(meta, stats)
341-
342345
stats["numberOfBytes"] = stats.get("numberOfBytes", 0)
343346
stats["numberOfFiles"] = stats.get("numberOfFiles", 0)
344347
stats["numberOfSubjects"] = len(stats.pop("subjects", [])) or None
348+
if stats["numberOfFiles"]:
349+
# Must not happen. If it does -- it is a bug in the software
350+
assert stats["numberOfSubjects"] <= stats["numberOfFiles"]
345351
stats["numberOfSamples"] = (
346352
len(stats.pop("tissuesample", [])) + len(stats.pop("slice", []))
347353
) or None

0 commit comments

Comments
 (0)