Fix scaffold split: use rarest-first ordering, fix normalization, add visualizations

whitead · claude · whitead · commit 4c327661e657 · 2026-02-19T21:06:32.000-08:00
- Iterate from rarest to most common scaffolds so test set contains novel chemistry
- Fix normalization bug where in-place loop corrupted train stats before normalizing test
- Add cells showing train vs test scaffolds and example molecules
- Add rdkit.Chem.Draw import for molecule grid visualization
- Drop the explicit ValueError that was raised when no SMILES column is found

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/ml/regression.ipynb b/ml/regression.ipynb
@@ -60,7 +60,7 @@
     "import jax.numpy as jnp\n",
     "from jax.example_libraries import optimizers\n",
     "import jax\n",
-    "import rdkit, rdkit.Chem\n",
+    "import rdkit, rdkit.Chem, rdkit.Chem.Draw\n",
     "from rdkit.Chem.Scaffolds import MurckoScaffold\n",
     "import dmol"
    ]
@@ -1643,7 +1643,7 @@
     "\n",
     "For molecular problems, a random train/test split can still leak structural similarity between train and test molecules. A **scaffold split** groups molecules by their core scaffold (often the Bemis-Murcko scaffold{cite}`bemis1996properties`) and puts whole scaffold groups into either train or test. This usually gives a harder but more realistic estimate of performance on novel chemistry. Scaffold splits were popularized as a standard benchmark by the MoleculeNet suite{cite}`wu2018moleculenet`.\n",
     "\n",
-    "The quick demo below builds Murcko scaffolds from SMILES, places the most common scaffolds into the test set until we reach about 20% of molecules, and compares that error with a random split of the same size.\n"
+    "The standard approach places the **rarest** scaffolds into the test set, since those represent the most novel chemistry. The demo below builds Murcko scaffolds from SMILES, accumulates the least-common scaffolds into the test set until we reach about 20% of molecules, and compares that error with a random split of the same size.\n"
    ]
   },
   {
@@ -1655,8 +1655,6 @@
     "smiles_col = next(\n",
     "    (c for c in [\"SMILES\", \"smiles\", \"CanonicalSMILES\"] if c in soldata.columns), None\n",
     ")\n",
-    "if smiles_col is None:\n",
-    "    raise ValueError(\"No SMILES column found for scaffold split demo.\")\n",
     "\n",
     "\n",
     "def murcko_scaffold(smiles):\n",
@@ -1674,7 +1672,8 @@
     "test_target = int(0.2 * len(scaffold_data))\n",
     "test_scaffolds = set()\n",
     "running = 0\n",
-    "for scaffold, count in scaffold_counts.items():\n",
+    "# iterate from rarest to most common\n",
+    "for scaffold, count in scaffold_counts.iloc[::-1].items():\n",
     "    if running >= test_target:\n",
     "        break\n",
     "    test_scaffolds.add(scaffold)\n",
@@ -1683,10 +1682,10 @@
     "test = scaffold_data[scaffold_data[\"Scaffold\"].isin(test_scaffolds)].copy()\n",
     "train = scaffold_data[~scaffold_data[\"Scaffold\"].isin(test_scaffolds)].copy()\n",
     "\n",
-    "for frame in [train, test]:\n",
-    "    frame[feature_names] = (frame[feature_names] - train[feature_names].mean()) / train[\n",
-    "        feature_names\n",
-    "    ].std()\n",
+    "train_mean = train[feature_names].mean()\n",
+    "train_std = train[feature_names].std()\n",
+    "train[feature_names] = (train[feature_names] - train_mean) / train_std\n",
+    "test[feature_names] = (test[feature_names] - train_mean) / train_std\n",
     "\n",
     "x, y = train[feature_names].values, train[\"Solubility\"].values\n",
     "test_x, test_y = test[feature_names].values, test[\"Solubility\"].values\n",
@@ -1701,10 +1700,10 @@
     "\n",
     "random_train = scaffold_data.iloc[rand_train_idx].copy()\n",
     "random_test = scaffold_data.iloc[rand_test_idx].copy()\n",
-    "for frame in [random_train, random_test]:\n",
-    "    frame[feature_names] = (\n",
-    "        frame[feature_names] - random_train[feature_names].mean()\n",
-    "    ) / random_train[feature_names].std()\n",
+    "rand_mean = random_train[feature_names].mean()\n",
+    "rand_std = random_train[feature_names].std()\n",
+    "random_train[feature_names] = (random_train[feature_names] - rand_mean) / rand_std\n",
+    "random_test[feature_names] = (random_test[feature_names] - rand_mean) / rand_std\n",
     "\n",
     "x, y = random_train[feature_names].values, random_train[\"Solubility\"].values\n",
     "test_x, test_y = random_test[feature_names].values, random_test[\"Solubility\"].values\n",
@@ -1714,7 +1713,63 @@
     "\n",
     "print(f\"Scaffold split test MSE: {scaffold_mse:.2f}\")\n",
     "print(f\"Random split test MSE:   {random_mse:.2f}\")\n",
-    "print(f\"Unique scaffolds used for test: {len(test_scaffolds)}\")"
+    "print(f\"Unique scaffolds in test: {len(test_scaffolds)}\")\n",
+    "print(f\"Test molecules: {len(test)}, Train molecules: {len(train)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# show some common train scaffolds and rare test scaffolds\n",
+    "train_scaffolds = set(scaffold_counts.index) - test_scaffolds\n",
+    "train_scaffold_counts = scaffold_counts[scaffold_counts.index.isin(train_scaffolds)]\n",
+    "test_scaffold_counts = scaffold_counts[scaffold_counts.index.isin(test_scaffolds)]\n",
+    "\n",
+    "common_train = train_scaffold_counts.head(3)\n",
+    "rare_test = test_scaffold_counts.sort_values(ascending=False).head(3)\n",
+    "\n",
+    "scaffold_smiles = list(common_train.index) + list(rare_test.index)\n",
+    "scaffold_mols = [rdkit.Chem.MolFromSmiles(s) for s in scaffold_smiles]\n",
+    "legends = [f\"Train (n={c})\" for c in common_train.values] + [\n",
+    "    f\"Test (n={c})\" for c in rare_test.values\n",
+    "]\n",
+    "\n",
+    "# filter out any scaffolds that failed to parse\n",
+    "valid = [(m, l) for m, l in zip(scaffold_mols, legends) if m is not None]\n",
+    "scaffold_mols, legends = zip(*valid) if valid else ([], [])\n",
+    "\n",
+    "print(\"Top train scaffolds (most common) vs test scaffolds:\")\n",
+    "rdkit.Chem.Draw.MolsToGridImage(\n",
+    "    scaffold_mols, molsPerRow=3, subImgSize=(250, 250), legends=list(legends)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# show example molecules from train and test splits\n",
+    "train_examples = train.sample(3, random_state=42)\n",
+    "test_examples = test.sample(3, random_state=42)\n",
+    "\n",
+    "example_smiles = list(train_examples[smiles_col]) + list(test_examples[smiles_col])\n",
+    "example_labels = [\"Train\"] * len(train_examples) + [\"Test\"] * len(test_examples)\n",
+    "example_pairs = [\n",
+    "    (rdkit.Chem.MolFromSmiles(s), l)\n",
+    "    for s, l in zip(example_smiles, example_labels)\n",
+    "    if rdkit.Chem.MolFromSmiles(s) is not None\n",
+    "]\n",
+    "example_mols, example_legends = zip(*example_pairs) if example_pairs else ([], [])\n",
+    "\n",
+    "print(\"Example molecules from each split:\")\n",
+    "rdkit.Chem.Draw.MolsToGridImage(\n",
+    "    example_mols, molsPerRow=3, subImgSize=(250, 250), legends=list(example_legends)\n",
+    ")"
    ]
   },
   {