Commit 8187d71

Merge pull request #8 from jpata/jp_20250829_datapaths
consolidate paths in notebooks, add data download script
2 parents 7e03a54 + 0e02ac4

File tree: 5 files changed, +83 -37 lines

README.md (+7, -0)

````diff
@@ -1,6 +1,13 @@
 # particlemind
 Self-supervised learning on HEP events.
 
+## Datasets
+
+A small testing dataset (about 20GB) can be downloaded from zenodo:
+```
+./scripts/download_data.sh
+```
+
 ### Approximate repo structure
 ```
 ├── README.md
````
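
As a quick smoke test of the downloaded sample, the file the notebooks point at can be opened directly with uproot. A sketch, not part of the commit, assuming `scripts/download_data.sh` has been run from the repo root (the filename is the one used in the updated notebooks):

```python
# Sketch: open one downloaded ROOT file and peek at the events tree.
from pathlib import Path

import uproot

root_file = Path("data/p8_ee_tt_ecm365/root") / "reco_p8_ee_tt_ecm365_60000.root"
with uproot.open(root_file) as fi:
    ev = fi["events"]
    print(f"{ev.num_entries} events in {root_file.name}")
```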

notebooks/cld-visualize.ipynb (+16, -6)

````diff
@@ -8,6 +8,16 @@
     "# Visualize CLD events from Key4HEP full simulation + reconstruction"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9c7184b-eb96-4247-806f-12ae84b96f20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "root_file_path = \"../data/p8_ee_tt_ecm365/root\""
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "57214593-71db-4bbc-82f6-1f66224a32b7",
@@ -285,13 +295,13 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "root_files_dir = Path(\"/mnt/ceph/users/ewulff/data/cld/Dec3/subfolder_0/\")\n",
-    "root_file = root_files_dir / \"reco_p8_ee_tt_ecm365_10000.root\"\n",
+    "root_files_dir = Path(root_file_path)\n",
+    "root_file = root_files_dir / \"reco_p8_ee_tt_ecm365_60000.root\"\n",
     "fi = uproot.open(root_file)\n",
     "ev = fi[\"events\"]\n",
     "\n",
     "# which event to pick from the file\n",
-    "iev = 2"
+    "iev = 4"
    ]
   },
   {
@@ -590,9 +600,9 @@
     "for subdetector in [0, 1, 2, 3]:\n",
     "\n",
     "    trace = go.Scatter3d(\n",
-    "        x=np.clip(df[\"px\"][df[\"subdetector\"] == subdetector], -4000, 4000),\n",
-    "        y=np.clip(df[\"py\"][df[\"subdetector\"] == subdetector], -4000, 4000),\n",
-    "        z=np.clip(df[\"pz\"][df[\"subdetector\"] == subdetector], -4000, 4000),\n",
+    "        x=np.clip(df[\"px\"][df[\"subdetector\"] == subdetector], -8000, 8000),\n",
+    "        y=np.clip(df[\"py\"][df[\"subdetector\"] == subdetector], -8000, 8000),\n",
+    "        z=np.clip(df[\"pz\"][df[\"subdetector\"] == subdetector], -8000, 8000),\n",
     "        mode=\"markers\",\n",
     "        marker=dict(\n",
     "            size=np.clip(2 + 2 * np.log(df[\"plotsize\"]), 1, 15),\n",
````

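The widened `np.clip` bounds above keep a handful of far-forward hits from stretching the 3D scene while preserving every marker. The same idiom on synthetic points, as a self-contained sketch (the data is invented; only the clip-then-scatter pattern comes from the notebook):

```python
# Sketch: clip outlying 3D positions to a fixed box before plotting.
import numpy as np
import plotly.graph_objects as go

rng = np.random.default_rng(0)
xyz = rng.normal(scale=5000.0, size=(1000, 3))  # some points fall outside +/-8000

trace = go.Scatter3d(
    x=np.clip(xyz[:, 0], -8000, 8000),
    y=np.clip(xyz[:, 1], -8000, 8000),
    z=np.clip(xyz[:, 2], -8000, 8000),
    mode="markers",
    marker=dict(size=2),
)
go.Figure(data=[trace]).show()
```
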
notebooks/debug-cld-processing.ipynb (+47, -31)

````diff
@@ -7,6 +7,17 @@
     "# Data processing for CLD"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "root_file_path = \"../data/p8_ee_tt_ecm365/root\"\n",
+    "parquet_file_path = \"../data/p8_ee_tt_ecm365/parquet\"\n",
+    "module_path = \"../\""
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -53,7 +64,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sys.path.append(\"/mnt/ceph/users/ewulff/particlemind/\")\n",
+    "sys.path.append(module_path)\n",
     "from src.datasets.utils import Collater"
    ]
   },
@@ -77,11 +88,6 @@
     "c = 3e8 # speed of light in m/s\n",
     "scale = 1000\n",
     "\n",
-    "# append path\n",
-    "import sys\n",
-    "\n",
-    "sys.path.append(str(Path(\"/mnt/ceph/users/ewulff/particlemind\")))\n",
-    "\n",
     "from data_processing.cld_processing import (\n",
     "    get_event_data,\n",
     "    gen_to_features,\n",
@@ -128,8 +134,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "root_files_dir = Path(\"/mnt/ceph/users/ewulff/data/cld/Dec3/subfolder_0/\")\n",
-    "root_file = root_files_dir / \"reco_p8_ee_tt_ecm365_10000.root\"\n",
+    "root_files_dir = Path(root_file_path)\n",
+    "root_file = root_files_dir / \"reco_p8_ee_tt_ecm365_60000.root\"\n",
     "fi = uproot.open(root_file)\n",
     "ev = fi[\"events\"]\n",
     "\n",
@@ -200,8 +206,9 @@
     ")\n",
     "# Check if the two dense gp_to_gp matrices are equal\n",
     "assert (coo_matrix_gp_to_gp.todense() == coo_matrix_gp_to_gp2.todense()).all()\n",
+    "\n",
     "# Define the output file path\n",
-    "output_file = Path(\"extracted_features.hdf5\")"
+    "# output_file = Path(\"extracted_features.hdf5\")"
    ]
   },
   {
@@ -835,13 +842,10 @@
     "    output_dir.mkdir(parents=True, exist_ok=True)\n",
     "\n",
     "    root_counter = 0\n",
-    "    root_file_list = list(Path(input_dir).rglob(\"*.root\"))\n",
-    "    total_files_to_porcess = max_root_files or len(root_file_list)\n",
+    "    root_file_list = sorted(list(Path(input_dir).rglob(\"*.root\")))[:max_root_files]\n",
+    "    total_files_to_process = len(root_file_list)\n",
     "\n",
-    "    for root_file in tqdm(sorted(root_file_list), desc=\"Processing ROOT files\", total=total_files_to_porcess):\n",
-    "        if max_root_files is not None and root_counter >= max_root_files:\n",
-    "            print(f\"Reached max_root_files limit: {max_root_files}. Stopping processing.\")\n",
-    "            break\n",
+    "    for root_file in tqdm(root_file_list, desc=\"Processing ROOT files\", total=total_files_to_process):\n",
     "        try:\n",
     "            output_file = output_dir / f\"{root_file.stem}.parquet\"\n",
     "            if output_file.exists():\n",
@@ -929,7 +933,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# process_root_files_to_parquet(\"/mnt/ceph/users/ewulff/data/cld/\", \"/mnt/ceph/users/ewulff/data/cld/processed/parquet\", max_root_files=2)"
+    "process_root_files_to_parquet(root_file_path, parquet_file_path, max_root_files=5)"
    ]
   },
@@ -938,7 +942,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "event_data1 = ak.from_parquet(next(Path(\"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\").glob(\"*.parquet\")))\n",
+    "event_data1 = ak.from_parquet(next(Path(parquet_file_path).glob(\"*.parquet\")))\n",
     "event_data1.fields"
    ]
   },
@@ -1100,13 +1104,6 @@
     "# lmdb_data = read_full_lmdb_database(\"/mnt/ceph/users/ewulff/data/cld/processed/lmdb\")"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1120,7 +1117,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "event_data1 = ak.from_parquet(next(Path(\"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\").glob(\"*.parquet\")))\n",
+    "event_data1 = ak.from_parquet(next(Path(parquet_file_path).glob(\"*.parquet\")))\n",
     "\n",
     "# Extract genparticle_to_calo_hit_matrix from event_data1\n",
     "# This matrix contains the mapping of genparticles to calorimeter hits in a COO format\n",
@@ -1286,6 +1283,7 @@
     "        \"\"\"\n",
     "        self.folder_path = Path(folder_path)\n",
     "        self.parquet_files = list(self.folder_path.glob(\"*.parquet\"))\n",
+    "        print(self.parquet_files)\n",
     "        self.shuffle_files = shuffle_files\n",
     "\n",
     "        self.split = split\n",
@@ -1295,6 +1293,7 @@
     "            self.parquet_files = self.parquet_files[:split_index]\n",
     "        elif self.split == \"val\":\n",
     "            self.parquet_files = self.parquet_files[split_index:]\n",
+    "        print(split_index)\n",
     "\n",
     "        if self.shuffle_files:\n",
     "            self.shuffle_shards()\n",
@@ -1353,11 +1352,8 @@
     "    }\n",
     "\n",
     "\n",
-    "# Define the folder containing parquet files\n",
-    "folder_path = \"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\"\n",
-    "\n",
     "# Create the dataset and dataloader\n",
-    "dataset = CLDHits(folder_path, \"train\")\n",
+    "dataset = CLDHits(parquet_file_path, \"train\")\n",
     "# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)\n",
     "\n",
     "# Create the dataset and dataloader with the custom collate function\n",
@@ -1501,13 +1497,33 @@
     "\n",
     "\n",
     "train_dl, val_dl = get_dataloaders(\n",
-    "    \"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\",\n",
+    "    parquet_file_path,\n",
     "    batch_size=1,\n",
     "    ntrain=1000, # Number of training samples\n",
     "    nvalid=200, # Number of validation samples\n",
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(train_dl), len(val_dl)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for elem in train_dl:\n",
+    "    print(elem)\n",
+    "    break"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1558,5 +1574,5 @@
   }
  },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
````
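
One detail in the `process_root_files_to_parquet` cleanup above is worth calling out: Python slices accept `None` as a bound and then return the whole list, so `sorted(...)[:max_root_files]` covers both a numeric limit and the no-limit default without the old counter-and-break logic. A minimal sketch of the idiom (the helper name is mine, not the notebook's):

```python
# Sketch: the sorted-slice idiom from the refactored loop.
# A None upper bound keeps the whole list, so no special-casing is needed.
from pathlib import Path
from typing import Optional

def select_root_files(input_dir: str, max_files: Optional[int] = None) -> list:
    # sorted() makes the selection deterministic across filesystems and runs
    return sorted(Path(input_dir).rglob("*.root"))[:max_files]

assert [1, 2, 3][:None] == [1, 2, 3]  # no limit
assert [1, 2, 3][:2] == [1, 2]        # truncated
```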

requirements.txt (+1, -0)

````diff
@@ -8,6 +8,7 @@ ipykernel
 jupyterlab
 kaleido
 lightning
+lmdb
 matplotlib
 nbdev
 nbformat
````
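
The new `lmdb` requirement matches the debug notebook, which carries a (commented-out) `read_full_lmdb_database` call for reading processed events back from LMDB. For reference, a minimal read-only iteration sketch with the `lmdb` package (the database path is illustrative, not one created by this commit):

```python
# Sketch: iterate all key/value pairs of an LMDB database, read-only.
import lmdb

env = lmdb.open("data/p8_ee_tt_ecm365/lmdb", readonly=True, lock=False)
with env.begin() as txn:
    for key, value in txn.cursor():
        # values are raw bytes; decoding them is up to whatever wrote the DB
        print(key, len(value), "bytes")
env.close()
```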

scripts/download_data.sh (+12, -0)

````diff
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+
+mkdir -p data
+cd data
+
+wget https://zenodo.org/records/14930758/files/p8_ee_tt_ecm365_rootfiles.tgz?download=1 -O p8_ee_tt_ecm365_rootfiles.tgz
+tar xf p8_ee_tt_ecm365_rootfiles.tgz
+mkdir -p p8_ee_tt_ecm365/root
+mv p8_ee_tt_ecm365_rootfiles/*.root p8_ee_tt_ecm365/root/
+
+rm -f p8_ee_tt_ecm365_rootfiles.tgz
````
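
The script is intended to be run from the repository root, which is what makes the notebooks' relative `../data/p8_ee_tt_ecm365/...` paths resolve. A quick post-download sanity check, as a sketch:

```python
# Sketch: verify the layout the notebooks expect after download_data.sh.
from pathlib import Path

root_dir = Path("data/p8_ee_tt_ecm365/root")
files = sorted(root_dir.glob("*.root"))
print(f"{len(files)} ROOT files under {root_dir}")
assert files, "run ./scripts/download_data.sh from the repo root first"
```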
