-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsummarise_series_metadata.py
More file actions
76 lines (63 loc) · 2.75 KB
/
summarise_series_metadata.py
File metadata and controls
76 lines (63 loc) · 2.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""Summarise values in DICOM series metadata file."""
import argparse
import pandas as pd
import numpy as np
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
# Columns that are each tallied on their own (one value-count table per column).
fields = [
    'Modality',
    'StudyDescription',
    'SeriesDescription',
    'BodyPartExamined',
]
# Column combinations that are tallied jointly, keyed by the label given to
# each combination.
groups = {
    'ModDescPart': [
        'Modality',
        'StudyDescription',
        'SeriesDescription',
        'BodyPartExamined',
    ],
}
def _default_excel_filename(di_path):
    """Return the default summary workbook name for the index at *di_path*.

    Only a trailing ``.parquet`` extension is removed before ``_summary.xlsx``
    is appended, so ``index.parquet`` becomes ``index_summary.xlsx``.  A path
    without that extension simply gets the suffix appended, which guarantees
    the result never equals the input path.
    """
    extension = '.parquet'
    if di_path.endswith(extension):
        di_path = di_path[:-len(extension)]
    return di_path + '_summary.xlsx'


def _is_empty(value):
    """Return True for a zero-size numpy array or any other falsy value.

    Note that falsy scalars such as ``0``, ``False`` and ``''`` are all
    reported as empty.
    """
    if isinstance(value, np.ndarray):
        return value.size == 0
    return not bool(value)


def _missingness(di):
    """Build the per-column missingness table for the DataFrame *di*.

    The result is indexed by column name and has the columns ``isna``
    (missing values), ``empty`` (see ``_is_empty``), ``null_string`` (text
    that case-insensitively equals a common null marker) and ``total`` (the
    sum of the other three).
    """
    # Missing values are replaced by a placeholder so they are counted only
    # in the ``isna`` column and not again as empty or null-like text.
    filled = di.fillna('ignore')
    isna = di.isna().sum(axis=0).rename('isna')
    empty = filled.map(_is_empty).sum(axis=0).rename('empty')
    null_strings = ['none', 'null', 'nan', 'n/a', 'na']
    lowered = filled.astype(str).apply(lambda x: x.str.lower())
    null_string = lowered.isin(null_strings).sum(axis=0).rename('null_string')
    missing = pd.concat([isna, empty, null_string], axis=1)
    missing['total'] = missing.sum(axis=1)
    return missing


def _value_counts(data):
    """Count distinct values of a Series, or distinct rows of a DataFrame.

    Missing values are labelled ``NaN`` so that they are counted too, and
    empty strings as well as the text ``[]`` are labelled ``_EMPTY_``.
    """
    parsed = data.fillna('NaN').astype(str).replace({'': '_EMPTY_', '[]': '_EMPTY_'})
    return parsed.value_counts().reset_index()


def main(args):
    """Summarise a DICOM index parquet file into an Excel workbook.

    Args:
        args: Parsed command-line arguments with the attributes
            ``di`` (path of the parquet file to read),
            ``ctmr`` (if true, keep only rows whose Modality is CT or MR) and
            ``out`` (output workbook path; when not given, a name is derived
            from ``di``).

    The workbook gets a ``missing`` sheet, one value-count sheet for every
    entry of ``fields`` that is a column of the index, and one sheet for
    every entry of ``groups`` whose columns are all present.
    """
    print("Loading DICOM index.")
    di = pd.read_parquet(args.di)
    if args.ctmr:
        di = di.loc[(di['Modality'] == 'CT') | (di['Modality'] == 'MR'), :]

    excel_filename = args.out if args.out else _default_excel_filename(args.di)

    with pd.ExcelWriter(excel_filename) as xl:
        print("Writing summary to Excel.")

        print("Calculating missingness.")
        _missingness(di).reset_index(names='attribute').to_excel(
            xl, sheet_name='missing', index=False)

        for col in fields:
            print(f"Counting column {col}.")
            # Columns absent from this index are skipped silently.
            if col in di.columns:
                _value_counts(di[col]).to_excel(xl, sheet_name=col, index=False)

        for desc, cols in groups.items():
            print(f"Counting group {desc} with columns {cols}.")
            # A group is only counted when every one of its columns exists.
            if all(x in di.columns for x in cols):
                _value_counts(di[cols]).to_excel(xl, sheet_name=desc, index=False)
    print("Done.")
if __name__ == '__main__':
    # Command-line interface: the index path is mandatory, the CT/MR filter
    # and the output filename are optional.
    parser = argparse.ArgumentParser(description="Summarise values in DICOM metadata file.")
    parser.add_argument('--di', required=True, help='DICOM index parquet file.')
    parser.add_argument('--ctmr', action='store_true', help='Only include Modality of CT or MR.')
    parser.add_argument('--out', help='Output Excel filename.')
    main(parser.parse_args())