|
7 | 7 | "# Data processing for CLD" |
8 | 8 | ] |
9 | 9 | }, |
| 10 | + { |
| 11 | + "cell_type": "code", |
| 12 | + "execution_count": null, |
| 13 | + "metadata": {}, |
| 14 | + "outputs": [], |
| 15 | + "source": [ |
| 16 | + "root_file_path = \"../data/p8_ee_tt_ecm365/root\"\n", |
| 17 | + "parquet_file_path = \"../data/p8_ee_tt_ecm365/parquet\"\n", |
| 18 | + "module_path = \"../\"" |
| 19 | + ] |
| 20 | + }, |
10 | 21 | { |
11 | 22 | "cell_type": "markdown", |
12 | 23 | "metadata": {}, |
|
53 | 64 | "metadata": {}, |
54 | 65 | "outputs": [], |
55 | 66 | "source": [ |
56 | | - "sys.path.append(\"/mnt/ceph/users/ewulff/particlemind/\")\n", |
| 67 | + "sys.path.append(module_path)\n", |
57 | 68 | "from src.datasets.utils import Collater" |
58 | 69 | ] |
59 | 70 | }, |
|
77 | 88 | "c = 3e8 # speed of light in m/s\n", |
78 | 89 | "scale = 1000\n", |
79 | 90 | "\n", |
80 | | - "# append path\n", |
81 | | - "import sys\n", |
82 | | - "\n", |
83 | | - "sys.path.append(str(Path(\"/mnt/ceph/users/ewulff/particlemind\")))\n", |
84 | | - "\n", |
85 | 91 | "from data_processing.cld_processing import (\n", |
86 | 92 | " get_event_data,\n", |
87 | 93 | " gen_to_features,\n", |
|
128 | 134 | "metadata": {}, |
129 | 135 | "outputs": [], |
130 | 136 | "source": [ |
131 | | - "root_files_dir = Path(\"/mnt/ceph/users/ewulff/data/cld/Dec3/subfolder_0/\")\n", |
132 | | - "root_file = root_files_dir / \"reco_p8_ee_tt_ecm365_10000.root\"\n", |
| 137 | + "root_files_dir = Path(root_file_path)\n", |
| 138 | + "root_file = root_files_dir / \"reco_p8_ee_tt_ecm365_60000.root\"\n", |
133 | 139 | "fi = uproot.open(root_file)\n", |
134 | 140 | "ev = fi[\"events\"]\n", |
135 | 141 | "\n", |
|
200 | 206 | ")\n", |
201 | 207 | "# Check if the two dense gp_to_gp matrices are equal\n", |
202 | 208 | "assert (coo_matrix_gp_to_gp.todense() == coo_matrix_gp_to_gp2.todense()).all()\n", |
| 209 | + "\n", |
203 | 210 | "# Define the output file path\n", |
204 | | - "output_file = Path(\"extracted_features.hdf5\")" |
| 211 | + "# output_file = Path(\"extracted_features.hdf5\")" |
205 | 212 | ] |
206 | 213 | }, |
207 | 214 | { |
|
835 | 842 | " output_dir.mkdir(parents=True, exist_ok=True)\n", |
836 | 843 | "\n", |
837 | 844 | " root_counter = 0\n", |
838 | | - " root_file_list = list(Path(input_dir).rglob(\"*.root\"))\n", |
839 | | - " total_files_to_porcess = max_root_files or len(root_file_list)\n", |
| 845 | + "    root_file_list = sorted(Path(input_dir).rglob(\"*.root\"))[:max_root_files]\n", |
| 846 | + " total_files_to_process = len(root_file_list)\n", |
840 | 847 | "\n", |
841 | | - " for root_file in tqdm(sorted(root_file_list), desc=\"Processing ROOT files\", total=total_files_to_porcess):\n", |
842 | | - " if max_root_files is not None and root_counter >= max_root_files:\n", |
843 | | - " print(f\"Reached max_root_files limit: {max_root_files}. Stopping processing.\")\n", |
844 | | - " break\n", |
| 848 | + " for root_file in tqdm(root_file_list, desc=\"Processing ROOT files\", total=total_files_to_process):\n", |
845 | 849 | " try:\n", |
846 | 850 | " output_file = output_dir / f\"{root_file.stem}.parquet\"\n", |
847 | 851 | " if output_file.exists():\n", |
|
929 | 933 | "metadata": {}, |
930 | 934 | "outputs": [], |
931 | 935 | "source": [ |
932 | | - "# process_root_files_to_parquet(\"/mnt/ceph/users/ewulff/data/cld/\", \"/mnt/ceph/users/ewulff/data/cld/processed/parquet\", max_root_files=2)" |
| 936 | + "process_root_files_to_parquet(root_file_path, parquet_file_path, max_root_files=5)" |
933 | 937 | ] |
934 | 938 | }, |
935 | 939 | { |
|
938 | 942 | "metadata": {}, |
939 | 943 | "outputs": [], |
940 | 944 | "source": [ |
941 | | - "event_data1 = ak.from_parquet(next(Path(\"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\").glob(\"*.parquet\")))\n", |
| 945 | + "event_data1 = ak.from_parquet(next(Path(parquet_file_path).glob(\"*.parquet\")))\n", |
942 | 946 | "event_data1.fields" |
943 | 947 | ] |
944 | 948 | }, |
|
1100 | 1104 | "# lmdb_data = read_full_lmdb_database(\"/mnt/ceph/users/ewulff/data/cld/processed/lmdb\")" |
1101 | 1105 | ] |
1102 | 1106 | }, |
1103 | | - { |
1104 | | - "cell_type": "code", |
1105 | | - "execution_count": null, |
1106 | | - "metadata": {}, |
1107 | | - "outputs": [], |
1108 | | - "source": [] |
1109 | | - }, |
1110 | 1107 | { |
1111 | 1108 | "cell_type": "markdown", |
1112 | 1109 | "metadata": {}, |
|
1120 | 1117 | "metadata": {}, |
1121 | 1118 | "outputs": [], |
1122 | 1119 | "source": [ |
1123 | | - "event_data1 = ak.from_parquet(next(Path(\"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\").glob(\"*.parquet\")))\n", |
| 1120 | + "event_data1 = ak.from_parquet(next(Path(parquet_file_path).glob(\"*.parquet\")))\n", |
1124 | 1121 | "\n", |
1125 | 1122 | "# Extract genparticle_to_calo_hit_matrix from event_data1\n", |
1126 | 1123 | "# This matrix contains the mapping of genparticles to calorimeter hits in a COO format\n", |
|
1286 | 1283 | " \"\"\"\n", |
1287 | 1284 | " self.folder_path = Path(folder_path)\n", |
1288 | 1285 | " self.parquet_files = list(self.folder_path.glob(\"*.parquet\"))\n", |
| 1286 | + " print(self.parquet_files)\n", |
1289 | 1287 | " self.shuffle_files = shuffle_files\n", |
1290 | 1288 | "\n", |
1291 | 1289 | " self.split = split\n", |
|
1295 | 1293 | " self.parquet_files = self.parquet_files[:split_index]\n", |
1296 | 1294 | " elif self.split == \"val\":\n", |
1297 | 1295 | " self.parquet_files = self.parquet_files[split_index:]\n", |
| 1296 | + " print(split_index)\n", |
1298 | 1297 | "\n", |
1299 | 1298 | " if self.shuffle_files:\n", |
1300 | 1299 | " self.shuffle_shards()\n", |
|
1353 | 1352 | " }\n", |
1354 | 1353 | "\n", |
1355 | 1354 | "\n", |
1356 | | - "# Define the folder containing parquet files\n", |
1357 | | - "folder_path = \"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\"\n", |
1358 | | - "\n", |
1359 | 1355 | "# Create the dataset and dataloader\n", |
1360 | | - "dataset = CLDHits(folder_path, \"train\")\n", |
| 1356 | + "dataset = CLDHits(parquet_file_path, \"train\")\n", |
1361 | 1357 | "# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)\n", |
1362 | 1358 | "\n", |
1363 | 1359 | "# Create the dataset and dataloader with the custom collate function\n", |
|
1501 | 1497 | "\n", |
1502 | 1498 | "\n", |
1503 | 1499 | "train_dl, val_dl = get_dataloaders(\n", |
1504 | | - " \"/mnt/ceph/users/ewulff/data/cld/processed/parquet/\",\n", |
| 1500 | + " parquet_file_path,\n", |
1505 | 1501 | " batch_size=1,\n", |
1506 | 1502 | " ntrain=1000, # Number of training samples\n", |
1507 | 1503 | " nvalid=200, # Number of validation samples\n", |
1508 | 1504 | ")" |
1509 | 1505 | ] |
1510 | 1506 | }, |
| 1507 | + { |
| 1508 | + "cell_type": "code", |
| 1509 | + "execution_count": null, |
| 1510 | + "metadata": {}, |
| 1511 | + "outputs": [], |
| 1512 | + "source": [ |
| 1513 | + "len(train_dl), len(val_dl)" |
| 1514 | + ] |
| 1515 | + }, |
| 1516 | + { |
| 1517 | + "cell_type": "code", |
| 1518 | + "execution_count": null, |
| 1519 | + "metadata": {}, |
| 1520 | + "outputs": [], |
| 1521 | + "source": [ |
| 1522 | + "for elem in train_dl:\n", |
| 1523 | + " print(elem)\n", |
| 1524 | + " break" |
| 1525 | + ] |
| 1526 | + }, |
1511 | 1527 | { |
1512 | 1528 | "cell_type": "code", |
1513 | 1529 | "execution_count": null, |
|
1558 | 1574 | } |
1559 | 1575 | }, |
1560 | 1576 | "nbformat": 4, |
1561 | | - "nbformat_minor": 2 |
| 1577 | + "nbformat_minor": 4 |
1562 | 1578 | } |