|
| 1 | +from datetime import datetime |
| 2 | +from functools import cached_property |
| 3 | +from typing import Dict, NamedTuple, Sequence |
| 4 | + |
| 5 | +import numpy as np |
| 6 | +import pandas as pd |
| 7 | +import SimpleITK as sitk |
| 8 | +from dpipe.im.box import limit_box |
| 9 | +from dpipe.itertools import collect |
| 10 | + |
| 11 | +from .internals import Dataset, field, licenses, register |
| 12 | + |
| 13 | + |
class NoduleBlock(NamedTuple):
    """A nodule block: an image array together with its metadata dictionary."""

    # Array holding the block's image data.
    image: np.ndarray
    # Free-form metadata describing the block.
    # NOTE(review): presumably the dict stored next to the block on disk — confirm against callers.
    metadata: Dict
| 17 | + |
| 18 | + |
class LUNA25Nodule(NamedTuple):
    """One annotated nodule of a LUNA25 series, as produced by ``LUNA25.nodules``."""

    # Annotation coordinates taken from the CoordX, CoordY, CoordZ columns (in that order).
    coords: Sequence[float]
    # Value of the LesionID column.
    lesion_id: int
    # AnnotationID column, converted to a string.
    annotation_id: str
    # NoduleID column, converted to a string.
    nodule_id: str
    # Value of the `label` column.
    malignancy: bool
    # Nodule center as rounded integer voxel indices, in the reversed (Z, Y, X) axis order.
    center_voxel: Sequence[float]
    # Rounded integer (start, stop) box of the nodule block, limited to the image shape.
    bbox: np.ndarray
| 27 | + |
| 28 | + |
@register(
    body_region='Chest',
    license=licenses.CC_BY_40,
    link='https://luna25.grand-challenge.org/',
    modality='CT',
    prep_data_size=None,
    raw_data_size=None,
    task='Lung nodule malignancy risk estimation',
)
class LUNA25(Dataset):
    """
    The LUNA25 Challenge dataset is a comprehensive collection designed to support
    the development and validation of AI algorithms for lung nodule malignancy risk
    estimation using low-dose chest CT scans. In total, it contains 2120 patients
    and 4069 low-dose chest CT scans, with 555 annotated malignant nodules and
    5608 benign nodules (3762 unique nodules, 348 of them are malignant).
    The dataset was acquired in participants who enrolled in the
    National Lung Screening Trial (NLST) between 2002 and 2004 in
    one of the 33 centers in the United States.

    Parameters
    ----------
    root : str, Path, optional
        path to the folder containing the raw downloaded archives.
        If not provided, the cache is assumed to be already populated.

    """

    @property
    def ids(self):
        """Sorted identifiers of all scans: the ``.mha`` file names in ``luna25_images`` without extension."""
        suffix = '.mha'
        # Sorting makes the order deterministic (directory iteration order is arbitrary);
        # the suffix filter keeps stray files from producing truncated, bogus ids.
        return sorted(
            file.name[: -len(suffix)]
            for file in (self.root / 'luna25_images').iterdir()
            if file.name.endswith(suffix)
        )

    def _image(self, i):
        """Read the scan ``i`` from ``luna25_images/{i}.mha`` with SimpleITK."""
        # `str` keeps this working with SimpleITK releases that only accept string file names.
        return sitk.ReadImage(str(self.root / f'luna25_images/{i}.mha'))

    @field
    def image(self, i):
        """The scan ``i`` converted to a numpy array."""
        return sitk.GetArrayFromImage(self._image(i))

    @field
    def spacing(self, i):
        """The SimpleITK spacing of the scan ``i``, reversed to match the numpy axis order."""
        return self._image(i).GetSpacing()[::-1]

    def _image_origin(self, i):
        """The SimpleITK origin of the scan ``i``, reversed to match the numpy axis order."""
        return self._image(i).GetOrigin()[::-1]

    def _direction(self, i):
        """The flattened SimpleITK direction of the scan ``i``, with its entries reversed."""
        return self._image(i).GetDirection()[::-1]

    @cached_property
    def _data(self):
        """The annotations table read from ``LUNA25_Public_Training_Development_Data.csv`` (loaded once)."""
        return pd.read_csv(self.root / 'LUNA25_Public_Training_Development_Data.csv')

    def _data_rows(self, i):
        """All rows of the annotations table whose ``SeriesInstanceUID`` equals ``i``."""
        return self._data[self._data['SeriesInstanceUID'] == i]

    def _data_column_value(self, i, column_name):
        """
        Return the single value that ``column_name`` takes across all rows of the scan ``i``.

        Raises
        ------
        KeyError
            if the table has no such column.
        AssertionError
            if the rows of ``i`` don't share exactly one value, or that value is null.
        """
        # Direct indexing (instead of `.get`) reports a missing column as a KeyError
        # rather than an AttributeError on None.
        values = self._data_rows(i)[column_name].unique()
        assert len(values) == 1, f'expected a single {column_name!r} value for {i!r}, got {len(values)}'
        value = values[0]
        assert not pd.isnull(value), f'{column_name!r} is null for {i!r}'
        return value

    @field
    def patient_id(self, i):
        """The ``PatientID`` of the scan ``i`` as a string."""
        return str(self._data_column_value(i, 'PatientID'))

    @field
    def study_date(self, i):
        """The ``StudyDate`` of the scan ``i`` parsed from the ``YYYYMMDD`` format into a ``date``."""
        study_date = str(self._data_column_value(i, 'StudyDate'))
        return datetime.strptime(study_date, "%Y%m%d").date()

    @field
    def age(self, i):
        """The ``Age_at_StudyDate`` value of the scan ``i``."""
        return self._data_column_value(i, 'Age_at_StudyDate')

    @field
    def sex(self, i):
        """The ``Gender`` value of the scan ``i``."""
        return self._data_column_value(i, 'Gender')

    @field
    @collect
    def nodules(self, i):
        """
        All annotated nodules of the scan ``i``, one ``LUNA25Nodule`` per row of the annotations table.

        The world coordinates of the nodule center and of the nodule block origin are
        converted to voxel indices in the numpy axis order of ``image``.
        """
        rows = self._data_rows(i)
        if rows.empty:
            # nothing to yield, so don't read the image at all
            return

        # Per-scan values don't depend on the row: compute them once instead of once per nodule.
        spacing = np.asarray(self.spacing(i))
        image_origin = np.asarray(self._image_origin(i))
        # Every 4th entry of the flattened 3x3 direction is its diagonal.
        # NOTE(review): assumes a 3D image with an axis-aligned (diagonal) direction matrix — confirm.
        direction = np.array(self._direction(i)[::4])
        image_shape = self.image(i).shape
        # NOTE(review): presumably the shape of the stored nodule blocks — confirm against the data.
        block_shape = np.array([64, 128, 128])

        for row in rows.itertuples():
            coords = np.array([row.CoordX, row.CoordY, row.CoordZ])
            nodule_block_metadata = self.nodule_block_metadata(row.AnnotationID)
            assert np.all(np.asarray(nodule_block_metadata['spacing']) == spacing), (
                f'the spacing of the nodule block {row.AnnotationID} differs from the spacing of the image {i}'
            )
            # coords are (X, Y, Z), while the origin and spacing are already reversed
            center_voxel = ((coords[::-1] - image_origin) / spacing) * direction
            bbox_start_point = ((np.asarray(nodule_block_metadata['origin']) - image_origin) / spacing) * direction
            bbox = limit_box([bbox_start_point, bbox_start_point + block_shape], image_shape)
            yield LUNA25Nodule(
                coords=coords,
                lesion_id=row.LesionID,
                annotation_id=str(row.AnnotationID),
                nodule_id=str(row.NoduleID),
                # the field is declared as bool, while the table stores a raw label value
                malignancy=bool(row.label),
                center_voxel=np.round(center_voxel).astype(int),
                bbox=np.round(bbox).astype(int),
            )

    def nodule_block_image(self, annotation_id):
        """Load the array stored in ``luna25_nodule_blocks/image/{annotation_id}.npy``."""
        return np.load(self.root / f'luna25_nodule_blocks/image/{annotation_id}.npy')

    def nodule_block_metadata(self, annotation_id):
        """
        Load the object stored in ``luna25_nodule_blocks/metadata/{annotation_id}.npy``.

        The file must contain a 0-dimensional array; the wrapped object is returned.
        """
        # SECURITY: `allow_pickle=True` unpickles the file content, which can execute arbitrary code.
        # Only use this with dataset files obtained from a trusted source.
        metadata = np.load(self.root / f'luna25_nodule_blocks/metadata/{annotation_id}.npy', allow_pickle=True)
        assert metadata.shape == (), f'expected a 0-dimensional metadata array, got shape {metadata.shape}'
        return metadata.item()
0 commit comments