Commit 8187d71

Merge pull request #8 from jpata/jp_20250829_datapaths
consolidate paths in notebooks, add data download script
2 parents 7e03a54 + 0e02ac4

File tree: 5 files changed, +83 -37 lines

README.md (+7, -0)

````diff
@@ -1,6 +1,13 @@
 # particlemind
 Self-supervised learning on HEP events.
 
+## Datasets
+
+A small testing dataset (about 20GB) can be downloaded from zenodo:
+```
+./scripts/download_data.sh
+```
+
 ### Approximate repo structure
 ```
 ├── README.md
````
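
As a quick smoke test of the downloaded sample, the file the notebooks point at can be opened directly with uproot. A sketch, not part of the commit, assuming `scripts/download_data.sh` has been run from the repo root (the filename is the one used in the updated notebooks):

```python
# Sketch: open one downloaded ROOT file and peek at the events tree.
from pathlib import Path

import uproot

root_file = Path("data/p8_ee_tt_ecm365/root") / "reco_p8_ee_tt_ecm365_60000.root"
with uproot.open(root_file) as fi:
    ev = fi["events"]
    print(f"{ev.num_entries} events in {root_file.name}")
```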

notebooks/cld-visualize.ipynb (+16, -6)

````diff
@@ -8,6 +8,16 @@
     "# Visualize CLD events from Key4HEP full simulation + reconstruction"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9c7184b-eb96-4247-806f-12ae84b96f20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "root_file_path = \"../data/p8_ee_tt_ecm365/root\""
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "57214593-71db-4bbc-82f6-1f66224a32b7",
@@ -285,13 +295,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "root_files_dir = Path(\"/mnt/ceph/users/ewulff/data/cld/Dec3/subfolder_0/\")\n",
-    "root_file = root_files_dir / \"reco_p8_ee_tt_ecm365_10000.root\"\n",
+    "root_files_dir = Path(root_file_path)\n",
+    "root_file = root_files_dir / \"reco_p8_ee_tt_ecm365_60000.root\"\n",
     "fi = uproot.open(root_file)\n",
     "ev = fi[\"events\"]\n",
     "\n",
     "# which event to pick from the file\n",
-    "iev = 2"
+    "iev = 4"
    ]
   },
   {
@@ -590,9 +600,9 @@
     "for subdetector in [0, 1, 2, 3]:\n",
     "\n",
     "    trace = go.Scatter3d(\n",
-    "        x=np.clip(df[\"px\"][df[\"subdetector\"] == subdetector], -4000, 4000),\n",
-    "        y=np.clip(df[\"py\"][df[\"subdetector\"] == subdetector], -4000, 4000),\n",
-    "        z=np.clip(df[\"pz\"][df[\"subdetector\"] == subdetector], -4000, 4000),\n",
+    "        x=np.clip(df[\"px\"][df[\"subdetector\"] == subdetector], -8000, 8000),\n",
+    "        y=np.clip(df[\"py\"][df[\"subdetector\"] == subdetector], -8000, 8000),\n",
+    "        z=np.clip(df[\"pz\"][df[\"subdetector\"] == subdetector], -8000, 8000),\n",
     "        mode=\"markers\",\n",
     "        marker=dict(\n",
     "            size=np.clip(2 + 2 * np.log(df[\"plotsize\"]), 1, 15),\n",
````

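The widened `np.clip` bounds above keep a handful of far-forward hits from stretching the 3D scene while preserving every marker. The same idiom on synthetic points, as a self-contained sketch (the data is invented; only the clip-then-scatter pattern comes from the notebook):

```python
# Sketch: clip outlying 3D positions to a fixed box before plotting.
import numpy as np
import plotly.graph_objects as go

rng = np.random.default_rng(0)
xyz = rng.normal(scale=5000.0, size=(1000, 3))  # some points fall outside +/-8000

trace = go.Scatter3d(
    x=np.clip(xyz[:, 0], -8000, 8000),
    y=np.clip(xyz[:, 1], -8000, 8000),
    z=np.clip(xyz[:, 2], -8000, 8000),
    mode="markers",
    marker=dict(size=2),
)
go.Figure(data=[trace]).show()
```
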
notebooks/debug-cld-processing.ipynb (+47, -31)

````diff
@@ -7,6 +7,17 @@
     "# Data processing for CLD"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "root_file_path = \"../data/p8_ee_tt_ecm365/root\"\n",
+    "parquet_file_path = \"../data/p8_ee_tt_ecm365/parquet\"\n",
+    "module_path = \"../\""
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -53,7 +64,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sys.path.append(\"/mnt/ceph/users/ewulff/particlemind/\")\n",
+    "sys.path.append(module_path)\n",
     "from src.datasets.utils import Collater"
    ]
   },
@@ -77,11 +88,6 @@
     "c = 3e8 # speed of light in m/s\n",
     "scale = 1000\n",
     "\n",
-    "# append path\n",
-    "import sys\n",
-    "\n",
-    "sys.path.append(str(Path(\"/mnt/ceph/users/ewulff/particlemind\")))\n",
-    "\n",
     "from data_processing.cld_processing import (\n",
     "    get_event_data,\n",
     "    gen_to_features,\n",
@@ -128,8 +134,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "root_files_dir = Path(\"/mnt/ceph/users/ewulff/data/cld/Dec3/subfolder_0/\")\n",
-    "root_file = root_files_dir / \"reco_p8_ee_tt_ecm365_10000.root\"\n",
+    "root_files_dir = Path(root_file_path)\n",
+    "root_file = root_files_dir / \"reco_p8_ee_tt_ecm365_60000.root\"\n",
     "fi = uproot.open(root_file)\n",
     "ev = fi[\"events\"]\n",
     "\n",
@@ -200,8 +206,9 @@
     ")\n",
     "# Check if the two dense gp_to_gp matrices are equal\n",
     "assert (coo_matrix_gp_to_gp.todense() == coo_matrix_gp_to_gp2.todense()).all()\n",
+    "\n",
     "# Define the output file path\n",
-    "output_file = Path(\"extracted_features.hdf5\")"
+    "# output_file = Path(\"extracted_features.hdf5\")"
    ]
   },
   {
@@ -835,13 +842,10 @@
     "    output_dir.mkdir(parents=True, exist_ok=True)\n",
     "\n",
     "    root_counter = 0\n",
-    "    root_file_list = list(Path(input_dir).rglob(\"*.root\"))\n",
-    "    total_files_to_porcess = max_root_files or len(root_file_list)\n",
+    "    root_file_list = sorted(list(Path(input_dir).rglob(\"*.root\")))[:max_root_files]\n",
+    "    total_files_to_process = len(root_file_list)\n",
     "\n",
-    "    for root_file in tqdm(sorted(root_file_list), desc=\"Processing ROOT files\", total=total_files_to_porcess):\n",
-    "        if max_root_files is not None and root_counter >= max_root_files:\n",
-    "            print(f\"Reached max_root_files limit: {max_root_files}. Stopping processing.\")\n",
-    "            break\n",
+    "    for root_file in tqdm(root_file_list, desc=\"Processing ROOT files\", total=total_files_to_process):\n",
     "        try:\n",
     "            output_file = output_dir / f\"{root_file.stem}.parquet\"\n",
     "            if output_file.exists():\n",
@@ -929,7 +933,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# process_root_files_to_parquet(\"/mnt/ceph/users/ewulff/data/cld/\", \"/mnt/ceph/users/ewulff/data/cld/processed/parquet\", max_root_files=2)"
+    "process_root_files_to_parquet(root_file_path, parquet_file_path, max_root_files=5)"
    ]
   },
@@ -938,7 +942,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "event_data1 = ak.from_parquet(next(Path(\"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\").glob(\"*.parquet\")))\n",
+    "event_data1 = ak.from_parquet(next(Path(parquet_file_path).glob(\"*.parquet\")))\n",
     "event_data1.fields"
    ]
   },
@@ -1100,13 +1104,6 @@
     "# lmdb_data = read_full_lmdb_database(\"/mnt/ceph/users/ewulff/data/cld/processed/lmdb\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1120,7 +1117,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "event_data1 = ak.from_parquet(next(Path(\"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\").glob(\"*.parquet\")))\n",
+    "event_data1 = ak.from_parquet(next(Path(parquet_file_path).glob(\"*.parquet\")))\n",
     "\n",
     "# Extract genparticle_to_calo_hit_matrix from event_data1\n",
     "# This matrix contains the mapping of genparticles to calorimeter hits in a COO format\n",
@@ -1286,6 +1283,7 @@
     "        \"\"\"\n",
     "        self.folder_path = Path(folder_path)\n",
     "        self.parquet_files = list(self.folder_path.glob(\"*.parquet\"))\n",
+    "        print(self.parquet_files)\n",
     "        self.shuffle_files = shuffle_files\n",
     "\n",
     "        self.split = split\n",
@@ -1295,6 +1293,7 @@
     "            self.parquet_files = self.parquet_files[:split_index]\n",
     "        elif self.split == \"val\":\n",
     "            self.parquet_files = self.parquet_files[split_index:]\n",
+    "        print(split_index)\n",
     "\n",
     "        if self.shuffle_files:\n",
     "            self.shuffle_shards()\n",
@@ -1353,11 +1352,8 @@
     "    }\n",
     "\n",
     "\n",
-    "# Define the folder containing parquet files\n",
-    "folder_path = \"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\"\n",
-    "\n",
     "# Create the dataset and dataloader\n",
-    "dataset = CLDHits(folder_path, \"train\")\n",
+    "dataset = CLDHits(parquet_file_path, \"train\")\n",
     "# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)\n",
     "\n",
     "# Create the dataset and dataloader with the custom collate function\n",
@@ -1501,13 +1497,33 @@
     "\n",
     "\n",
     "train_dl, val_dl = get_dataloaders(\n",
-    "    \"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\",\n",
+    "    parquet_file_path,\n",
     "    batch_size=1,\n",
     "    ntrain=1000, # Number of training samples\n",
     "    nvalid=200, # Number of validation samples\n",
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(train_dl), len(val_dl)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for elem in train_dl:\n",
+    "    print(elem)\n",
+    "    break"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1558,5 +1574,5 @@
   }
  },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
````
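
One detail in the `process_root_files_to_parquet` cleanup above is worth calling out: Python slices accept `None` as a bound and then return the whole list, so `sorted(...)[:max_root_files]` covers both a numeric limit and the no-limit default without the old counter-and-break logic. A minimal sketch of the idiom (the helper name is mine, not the notebook's):

```python
# Sketch: the sorted-slice idiom from the refactored loop.
# A None upper bound keeps the whole list, so no special-casing is needed.
from pathlib import Path
from typing import Optional

def select_root_files(input_dir: str, max_files: Optional[int] = None) -> list:
    # sorted() makes the selection deterministic across filesystems and runs
    return sorted(Path(input_dir).rglob("*.root"))[:max_files]

assert [1, 2, 3][:None] == [1, 2, 3]  # no limit
assert [1, 2, 3][:2] == [1, 2]        # truncated
```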

requirements.txt (+1, -0)

````diff
@@ -8,6 +8,7 @@ ipykernel
 jupyterlab
 kaleido
 lightning
+lmdb
 matplotlib
 nbdev
 nbformat
````
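
The new `lmdb` requirement matches the debug notebook, which carries a (commented-out) `read_full_lmdb_database` call for reading processed events back from LMDB. For reference, a minimal read-only iteration sketch with the `lmdb` package (the database path is illustrative, not one created by this commit):

```python
# Sketch: iterate all key/value pairs of an LMDB database, read-only.
import lmdb

env = lmdb.open("data/p8_ee_tt_ecm365/lmdb", readonly=True, lock=False)
with env.begin() as txn:
    for key, value in txn.cursor():
        # values are raw bytes; decoding them is up to whatever wrote the DB
        print(key, len(value), "bytes")
env.close()
```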

scripts/download_data.sh (+12, -0)

````diff
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+mkdir -p data
+cd data
+
+wget https://zenodo.org/records/14930758/files/p8_ee_tt_ecm365_rootfiles.tgz?download=1 -O p8_ee_tt_ecm365_rootfiles.tgz
+tar xf p8_ee_tt_ecm365_rootfiles.tgz
+mkdir -p p8_ee_tt_ecm365/root
+mv p8_ee_tt_ecm365_rootfiles/*.root p8_ee_tt_ecm365/root/
+
+rm -f p8_ee_tt_ecm365_rootfiles.tgz
````
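
The script is intended to be run from the repository root, which is what makes the notebooks' relative `../data/p8_ee_tt_ecm365/...` paths resolve. A quick post-download sanity check, as a sketch:

```python
# Sketch: verify the layout the notebooks expect after download_data.sh.
from pathlib import Path

root_dir = Path("data/p8_ee_tt_ecm365/root")
files = sorted(root_dir.glob("*.root"))
print(f"{len(files)} ROOT files under {root_dir}")
assert files, "run ./scripts/download_data.sh from the repo root first"
```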
