Merge pull request #10 from jpata/jp_20251107_clustering

erwulff · web-flow · commit 7faee7655eef · 2025-12-02T20:05:02.000+01:00
Clustering studies, baseline reco clustering

- add notebook to visualize cluster information
- in CLDHits retrieve Pandora cluster indices as a baseline clustering to compare our ML against
- use dvc to keep track of datasets
diff --git a/.dvc/.gitignore b/.dvc/.gitignore
@@ -0,0 +1,3 @@
+/config.local
+/tmp
+/cache
diff --git a/.dvc/config b/.dvc/config
@@ -0,0 +1,5 @@
+[core]
+    remote = cern-jpata
+
+['remote "cern-jpata"']
+    url = https://jpata.web.cern.ch/dvc/particlemind
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,2 @@
-data/
 *.ipynb_checkpoints
 *.pyc
diff --git a/README_Tallinn.md b/README_Tallinn.md
@@ -0,0 +1,7 @@
+```
+./scripts/run_ee.sh dvc config --local cache.dir /scratch/persistent/$USER/dvc-cache
+./scripts/run_ee.sh dvc config --local cache.type symlink
+./scripts/run_ee.sh dvc fetch
+./scripts/run_ee.sh dvc pull
+./scripts/run_ee.sh jupyter notebook
+```
diff --git a/data/.gitignore b/data/.gitignore
@@ -0,0 +1 @@
+/p8_ee_tt_ecm365
diff --git a/data/p8_ee_tt_ecm365.dvc b/data/p8_ee_tt_ecm365.dvc
@@ -0,0 +1,6 @@
+outs:
+- md5: 1e061a23483ff95a45c620fdd23db85d.dir
+  size: 39000735175
+  nfiles: 1000
+  hash: md5
+  path: p8_ee_tt_ecm365
diff --git a/notebooks/clustering_studies.ipynb b/notebooks/clustering_studies.ipynb
@@ -0,0 +1,322 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7084fc8e-60bb-4341-af6f-92eb97ebc42c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "root_file_path = \"../data/p8_ee_tt_ecm365/root\"\n",
+    "parquet_file_path = \"../data/p8_ee_tt_ecm365/parquet\"\n",
+    "module_path = \"../\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6f153459-a73f-4126-8029-5ebf597ba1ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import awkward as ak"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7480d773-180e-4709-9229-7e91facbdcb3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "\n",
+    "sys.path.append(module_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c392fc8-50fe-4429-8f73-a20d6bf98717",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from src.datasets.CLDHits import CLDHits"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a06ee555-0b52-49ce-97c8-3586fb8081be",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset_train = CLDHits(parquet_file_path, \"train\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4eb29375-8ca1-455c-bb09-e5c2d1d5c748",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "elems = []\n",
+    "for elem in dataset_train:\n",
+    "    unique_labels, contiguous_labels = np.unique(elem[\"hit_labels\"], return_inverse=True)\n",
+    "    elem[\"hit_labels_contiguous\"] = contiguous_labels\n",
+    "    elems.append(elem)\n",
+    "    if len(elems) >= 100:\n",
+    "        break\n",
+    "\n",
+    "elems = [[ak.from_iter(elem)] for elem in elems]\n",
+    "elems = ak.concatenate(elems, axis=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "44a55aa6-20a6-4aa4-b91a-44731dcd971d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Contiguous labels run from 0 to K-1 within an event, so the number of clusters is max + 1.\n",
+    "n_clusters_per_event = ak.max(elems[\"hit_labels_contiguous\"], axis=1) + 1\n",
+    "plt.hist(n_clusters_per_event, bins=np.linspace(0, 400, 41))\n",
+    "plt.xlabel(\"Clusters per event\")\n",
+    "plt.ylabel(\"Event count\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7c2fbe8-a243-4a07-aab0-3f8b7576cc66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hit_labels_c_f = ak.flatten(elems[\"hit_labels_contiguous\"])\n",
+    "calo_hit_features_f = ak.flatten(elems[\"calo_hit_features\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f52c7c6-1dd1-436a-bdeb-6a8754862512",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.hist(calo_hit_features_f[:, 0], np.linspace(-5000, 5000, 100), histtype=\"step\", lw=2, label=\"x\")\n",
+    "plt.hist(calo_hit_features_f[:, 1], np.linspace(-5000, 5000, 100), histtype=\"step\", lw=2, label=\"y\")\n",
+    "plt.hist(calo_hit_features_f[:, 2], np.linspace(-5000, 5000, 100), histtype=\"step\", lw=2, label=\"z\")\n",
+    "plt.xlabel(\"Hit position (mm)\")\n",
+    "plt.ylabel(\"Hit count\")\n",
+    "plt.legend()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c3cd37c-3626-4884-8712-f5e0e8e02111",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.hist(10 * calo_hit_features_f[:, 3], np.logspace(-3, 1, 100))\n",
+    "plt.xscale(\"log\")\n",
+    "plt.xlabel(\"Hit energy (GeV)\")\n",
+    "plt.ylabel(\"Hit count\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3cbc7257-df54-400a-a0ca-34b2c3ecb295",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(elems)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5927be66-b729-427c-9e01-5df7e83b7487",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Per-event lists of per-cluster summaries; each becomes a jagged awkward array at the end of the cell.\n",
+    "all_cluster_std_x = []\n",
+    "all_cluster_std_y = []\n",
+    "all_cluster_std_z = []\n",
+    "all_cluster_sum_e = []\n",
+    "all_cluster_hit_count = []\n",
+    "all_cluster_id = []\n",
+    "\n",
+    "# Summarize at most the first 5 events; never index past the end of elems.\n",
+    "for ielem in range(min(5, len(elems))):\n",
+    "    print(ielem)\n",
+    "    elem = elems[ielem]\n",
+    "    hit_cluster = elem[\"hit_labels_contiguous\"]\n",
+    "    # Slice the feature columns once per event instead of once per cluster.\n",
+    "    # Columns 0-2 are the ones plotted as hit x/y/z earlier in this notebook, column 3 as hit energy.\n",
+    "    hit_x = elem[\"calo_hit_features\"][:, 0]\n",
+    "    hit_y = elem[\"calo_hit_features\"][:, 1]\n",
+    "    hit_z = elem[\"calo_hit_features\"][:, 2]\n",
+    "    hit_e = elem[\"calo_hit_features\"][:, 3]\n",
+    "\n",
+    "    cluster_ids = np.unique(hit_cluster)\n",
+    "    cluster_std_x = []\n",
+    "    cluster_std_y = []\n",
+    "    cluster_std_z = []\n",
+    "    cluster_sum_e = []\n",
+    "    cluster_hit_count = []\n",
+    "    cluster_id = []\n",
+    "    for clid in cluster_ids:\n",
+    "        # Boolean mask selecting the hits that belong to this cluster.\n",
+    "        cl_mask = hit_cluster == clid\n",
+    "\n",
+    "        cluster_std_x.append(np.std(hit_x[cl_mask]))\n",
+    "        cluster_std_y.append(np.std(hit_y[cl_mask]))\n",
+    "        cluster_std_z.append(np.std(hit_z[cl_mask]))\n",
+    "        cluster_sum_e.append(np.sum(hit_e[cl_mask]))\n",
+    "        # Summing the boolean mask counts the hits in the cluster.\n",
+    "        cluster_hit_count.append(np.sum(cl_mask))\n",
+    "        cluster_id.append(clid)\n",
+    "\n",
+    "    all_cluster_std_x.append(cluster_std_x)\n",
+    "    all_cluster_std_y.append(cluster_std_y)\n",
+    "    all_cluster_std_z.append(cluster_std_z)\n",
+    "    all_cluster_sum_e.append(cluster_sum_e)\n",
+    "    all_cluster_hit_count.append(cluster_hit_count)\n",
+    "    all_cluster_id.append(cluster_id)\n",
+    "\n",
+    "\n",
+    "all_cluster_std_x = ak.Array(all_cluster_std_x)\n",
+    "all_cluster_std_y = ak.Array(all_cluster_std_y)\n",
+    "all_cluster_std_z = ak.Array(all_cluster_std_z)\n",
+    "all_cluster_sum_e = ak.Array(all_cluster_sum_e)\n",
+    "all_cluster_hit_count = ak.Array(all_cluster_hit_count)\n",
+    "all_cluster_id = ak.Array(all_cluster_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "005eadee-b7b1-4f55-a7f0-0c8965d3ab7c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.hist2d(\n",
+    "    ak.to_numpy(ak.flatten(all_cluster_hit_count[all_cluster_hit_count > 5])),\n",
+    "    ak.to_numpy(ak.flatten(all_cluster_std_x[all_cluster_hit_count > 5])),\n",
+    "    bins=(np.logspace(0, 3, 100), np.logspace(-2, 4, 100)),\n",
+    ")\n",
+    "plt.xscale(\"log\")\n",
+    "plt.yscale(\"log\")\n",
+    "plt.xlabel(\"Hits per cluster\")\n",
+    "plt.ylabel(\"Hit pos x stddev\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7b088f67-c4e4-454c-9244-0bbbc3c6500f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Keep only clusters with more than 5 hits, the same cut used for the x and z stddev plots.\n",
+    "y_cluster_sel = all_cluster_hit_count > 5\n",
+    "plt.hist2d(\n",
+    "    ak.to_numpy(ak.flatten(all_cluster_hit_count[y_cluster_sel])),\n",
+    "    ak.to_numpy(ak.flatten(all_cluster_std_y[y_cluster_sel])),\n",
+    "    bins=(np.logspace(0, 3, 100), np.logspace(-2, 4, 100)),\n",
+    ")\n",
+    "plt.xscale(\"log\")\n",
+    "plt.yscale(\"log\")\n",
+    "plt.xlabel(\"Hits per cluster\")\n",
+    "plt.ylabel(\"Hit pos y stddev\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5004224b-ef89-4183-b6d5-f3774dbef0bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.hist2d(\n",
+    "    ak.to_numpy(ak.flatten(ak.Array(all_cluster_hit_count[all_cluster_hit_count > 5]))),\n",
+    "    ak.to_numpy(ak.flatten(ak.Array(all_cluster_std_z[all_cluster_hit_count > 5]))),\n",
+    "    bins=(np.logspace(0, 3, 100), np.logspace(-2, 4, 100)),\n",
+    ")\n",
+    "plt.xscale(\"log\")\n",
+    "plt.yscale(\"log\")\n",
+    "plt.xlabel(\"Hits per cluster\")\n",
+    "plt.ylabel(\"Hit pos z stddev\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b314c040-84e7-4970-9b3d-42144ca5ef4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize=(5, 5))\n",
+    "plt.hist2d(\n",
+    "    ak.to_numpy(ak.flatten(ak.Array(all_cluster_hit_count))),\n",
+    "    ak.to_numpy(ak.flatten(ak.Array(all_cluster_sum_e))),\n",
+    "    bins=(np.logspace(0, 3, 100), np.logspace(-2, 3, 100)),\n",
+    ")\n",
+    "plt.xscale(\"log\")\n",
+    "plt.yscale(\"log\")\n",
+    "plt.xlabel(\"Hits per cluster\")\n",
+    "plt.ylabel(\"Sum energy per cluster\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2410866d-9c17-4c3b-b182-c67c5692506d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.hist(ak.flatten(all_cluster_hit_count), bins=np.linspace(0, 1500, 100))\n",
+    "plt.yscale(\"log\")\n",
+    "plt.xlabel(\"Number of hits per cluster\")\n",
+    "plt.ylabel(\"Cluster count\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11acb17c-255a-4d01-9371-f01b6a378520",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig, axs = plt.subplots(3, 3, figsize=(10, 10))\n",
+    "axs = axs.flatten()\n",
+    "# The colormap does not depend on the event, so look it up once.\n",
+    "cmap = plt.get_cmap(\"viridis\")\n",
+    "# One panel per event; stop early if fewer events than panels were loaded.\n",
+    "for ielem in range(min(len(axs), len(elems))):\n",
+    "    plt.sca(axs[ielem])\n",
+    "    elem = elems[ielem]\n",
+    "\n",
+    "    # contiguous_labels indexes into unique_labels, giving each hit a 0..K-1 cluster index.\n",
+    "    unique_labels, contiguous_labels = np.unique(elem[\"hit_labels\"], return_inverse=True)\n",
+    "    # One evenly spaced colormap entry per distinct label in this event.\n",
+    "    distinct_colors = cmap(np.linspace(0, 1, len(unique_labels)))\n",
+    "\n",
+    "    plt.scatter(\n",
+    "        elem[\"calo_hit_features\"][:, 0],\n",
+    "        elem[\"calo_hit_features\"][:, 1],\n",
+    "        # Marker size scales with feature column 3 and is clipped to the range [0.1, 10].\n",
+    "        s=np.clip(100 * elem[\"calo_hit_features\"][:, 3], 0.1, 10),\n",
+    "        c=distinct_colors[contiguous_labels],\n",
+    "    )\n",
+    "    plt.xlim(-6000, 6000)\n",
+    "    plt.ylim(-6000, 6000)\n",
+    "    # Reuse unique_labels for the cluster count instead of calling np.unique a second time.\n",
+    "    plt.title(\n",
+    "        \"$N_{{hit}}$={}, $N_{{cl}}$={}\".format(len(elem[\"calo_hit_features\"]), len(unique_labels))\n",
+    "    )\n",
+    "    plt.xticks([])\n",
+    "    plt.yticks([])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "python3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/debug-cld-processing.ipynb b/notebooks/debug-cld-processing.ipynb
diff --git a/scripts/run_ee.sh b/scripts/run_ee.sh
diff --git a/src/datasets/CLDHits.py b/src/datasets/CLDHits.py

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,2 @@`
`1`		`-data/`
`2`	`1`	`*.ipynb_checkpoints`
`3`	`2`	`*.pyc`