Skip to content

Commit 7b9a460

Browse files
committed
Count only the first hit for subject or tissue within the filename
Also added an assertion so we do not count incorrectly. But maybe it should be just a warning? Closes #172
1 parent b310e3e commit 7b9a460

File tree

1 file changed

+12
-5
lines changed

1 file changed

+12
-5
lines changed

dandischema/metadata.py

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -311,13 +311,17 @@ def _add_asset_to_stats(assetmeta: Dict[str, Any], stats: _stats_type) -> None:
311311
stats = _get_samples(value, stats, hierarchy)
312312
break
313313

314+
# which components already found, so we do not count more than
315+
# once in some incorrectly named datasets
316+
found = {}
314317
for part in Path(assetmeta["path"]).name.split(".")[0].split("_"):
315-
if part.startswith("sub-"):
316-
subject = part.replace("sub-", "")
318+
if found.get("subject") and part.startswith("sub-"):
319+
found["subject"] = subject = part.split("sub-", 1)[1]
317320
if subject not in stats["subjects"]:
318321
stats["subjects"].append(subject)
319-
if part.startswith("sample-"):
320-
sample = part.replace("sample-", "")
322+
found.add("subject")
323+
if not found.get("sample") and part.startswith("sample-"):
324+
found["sample"] = sample = part.replace("sample-", "")
321325
if sample not in stats["tissuesample"]:
322326
stats["tissuesample"].append(sample)
323327

@@ -338,10 +342,13 @@ def aggregate_assets_summary(metadata: Iterable[Dict[str, Any]]) -> dict:
338342
stats: _stats_type = {}
339343
for meta in metadata:
340344
_add_asset_to_stats(meta, stats)
341-
342345
stats["numberOfBytes"] = stats.get("numberOfBytes", 0)
343346
stats["numberOfFiles"] = stats.get("numberOfFiles", 0)
344347
stats["numberOfSubjects"] = len(stats.pop("subjects", [])) or None
348+
if stats["numberOfSubjects"]:
349+
# Must not happen. If does -- a bug in software
350+
assert stats["numberOfFiles"]
351+
assert stats["numberOfSubjects"] <= stats["numberOfFiles"]
345352
stats["numberOfSamples"] = (
346353
len(stats.pop("tissuesample", [])) + len(stats.pop("slice", []))
347354
) or None

0 commit comments

Comments
 (0)