maziyarpanahi · maziyarpanahi · Jun 20, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/examples/notebooks/Deidentification_Cookbook.ipynb b/examples/notebooks/Deidentification_Cookbook.ipynb
@@ -0,0 +1,263 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "2cfc8ca0",
+   "metadata": {},
+   "source": [
+    "# De-identification Cookbook\n",
+    "\n",
+    "Short, copy-paste **recipes** for the most common de-identification workflows with OpenMed:\n",
+    "\n",
+    "1. De-identify a list or CSV of clinical strings\n",
+    "2. Batch-redact a directory of text files\n",
+    "3. Reversible replace + re-identify round-trip\n",
+    "4. Per-language model selection with `DEFAULT_PII_MODELS`\n",
+    "\n",
+    "> **Synthetic data only.** Every name, date, phone number, email, and MRN in this notebook is fabricated. Never run these cells on real PHI you are not authorized to process, and never commit real patient data.\n",
+    "\n",
+    "**Prerequisites:** `pip install \"openmed[hf]\"` (pulls in `transformers`). The recipes use a lightweight 44M English PII model so they stay CPU-friendly. The first call downloads model weights if they are not already cached."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4a962b86",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "Import the public API and pick a lightweight English model. Every recipe reuses these."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d17f8d94",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import csv\n",
+    "import tempfile\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from openmed import (\n",
+    "    deidentify,\n",
+    "    reidentify,\n",
+    "    BatchProcessor,\n",
+    "    DEFAULT_PII_MODELS,\n",
+    ")\n",
+    "\n",
+    "# Default English PII model: the 44M checkpoint is the lightest default, so CPU-only runs stay quick.\n",
+    "MODEL = DEFAULT_PII_MODELS[\"en\"]\n",
+    "print(f\"Using model: {MODEL}\")\n",
+    "\n",
+    "# Fabricated notes reused by the recipes below; none of this PHI is real.\n",
+    "NOTES = [\n",
+    "    \"Patient John Doe (DOB 01/15/1970) called from 555-123-4567.\",\n",
+    "    \"Contact Jane Roe at jane.roe@example.com regarding MRN 00123456.\",\n",
+    "    \"Dr. Alan Grant reviewed the chart for Maria Lopez on 2025-03-02.\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "38ae77c6",
+   "metadata": {},
+   "source": [
+    "## Recipe 1 — De-identify a list or CSV of clinical strings\n",
+    "\n",
+    "Call `deidentify()` on each string of an in-memory list, then apply the same idea to a CSV column using the stdlib `csv` module (no pandas required)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d383f799",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run each in-memory note through deidentify() using the default \"mask\" method.\n",
+    "for text in NOTES:\n",
+    "    masked = deidentify(text, method=\"mask\", model_name=MODEL)\n",
+    "    print(masked.deidentified_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8445c3ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# De-identify one column of a CSV and write a redacted copy — stdlib csv only.\n",
+    "workdir = Path(tempfile.mkdtemp(prefix=\"openmed_cookbook_\"))\n",
+    "\n",
+    "# 1) Build a small synthetic input file with an \"id\" column and a free-text \"note\" column.\n",
+    "src_csv = workdir / \"patients.csv\"\n",
+    "with src_csv.open(\"w\", newline=\"\", encoding=\"utf-8\") as fh:\n",
+    "    writer = csv.writer(fh)\n",
+    "    writer.writerow([\"id\", \"note\"])\n",
+    "    for i, note in enumerate(NOTES, start=1):\n",
+    "        writer.writerow([i, note])\n",
+    "\n",
+    "# 2) Copy it row by row, masking only the \"note\" column. The input is opened with\n",
+    "#    newline=\"\" as well (the csv docs ask for this on readers and writers alike) so a\n",
+    "#    note with line breaks inside a quoted field is parsed as one value, not split.\n",
+    "out_csv = workdir / \"patients_deidentified.csv\"\n",
+    "with src_csv.open(newline=\"\", encoding=\"utf-8\") as fin, out_csv.open(\"w\", newline=\"\", encoding=\"utf-8\") as fout:\n",
+    "    reader = csv.DictReader(fin)\n",
+    "    # Reuse the input header so every other column passes through unchanged.\n",
+    "    writer = csv.DictWriter(fout, fieldnames=reader.fieldnames)\n",
+    "    writer.writeheader()\n",
+    "    for row in reader:\n",
+    "        row[\"note\"] = deidentify(row[\"note\"], method=\"mask\", model_name=MODEL).deidentified_text\n",
+    "        writer.writerow(row)\n",
+    "\n",
+    "print(out_csv.read_text(encoding=\"utf-8\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e2a750f5",
+   "metadata": {},
+   "source": [
+    "## Recipe 2 — Batch-redact a directory with BatchProcessor\n",
+    "\n",
+    "`BatchProcessor` with `operation=\"deidentify\"` streams a whole folder of `.txt` files through the model in batches. Use `process_directory()` and write a redacted copy for each successful item."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "84ee75f7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Give this recipe its own scratch directory so it also runs when copied on its own\n",
+    "# (it only needs the names defined in the Setup cell).\n",
+    "batch_workdir = Path(tempfile.mkdtemp(prefix=\"openmed_cookbook_batch_\"))\n",
+    "\n",
+    "# Create a folder of synthetic .txt notes.\n",
+    "notes_dir = batch_workdir / \"notes\"\n",
+    "notes_dir.mkdir(exist_ok=True)\n",
+    "for i, note in enumerate(NOTES, start=1):\n",
+    "    (notes_dir / f\"note_{i}.txt\").write_text(note, encoding=\"utf-8\")\n",
+    "\n",
+    "processor = BatchProcessor(\n",
+    "    model_name=MODEL,\n",
+    "    operation=\"deidentify\",\n",
+    "    batch_size=8,\n",
+    "    method=\"mask\",\n",
+    "    confidence_threshold=0.7,\n",
+    ")\n",
+    "batch = processor.process_directory(notes_dir, pattern=\"*.txt\")\n",
+    "\n",
+    "redacted_dir = batch_workdir / \"notes_deidentified\"\n",
+    "redacted_dir.mkdir(exist_ok=True)\n",
+    "\n",
+    "print(batch.summary())\n",
+    "for item in batch.items:\n",
+    "    if not item.success:\n",
+    "        # Report the failure and move on; no redacted file is written for this item.\n",
+    "        print(f\"Skipped {item.id}: {item.error}\")\n",
+    "        continue\n",
+    "    # NOTE(review): item.id is presumably the source file name — confirm against BatchProcessor.\n",
+    "    # Keep only the final path component: if the id ever carries a directory or an absolute\n",
+    "    # path, joining it as-is would write outside redacted_dir (pathlib drops the left side\n",
+    "    # when the right side is absolute) and could overwrite the source note.\n",
+    "    redacted_path = redacted_dir / Path(item.id).name\n",
+    "    redacted_path.write_text(item.result.deidentified_text, encoding=\"utf-8\")\n",
+    "    print(f\"{redacted_path.name}: {redacted_path.read_text(encoding='utf-8')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "95de93bc",
+   "metadata": {},
+   "source": [
+    "## Recipe 3 — Reversible replace + re-identify round-trip\n",
+    "\n",
+    "`method=\"replace\"` swaps PHI for realistic synthetic values. With `keep_mapping=True` you also get a mapping you can feed to `reidentify()` to restore the original — a reversible pipeline for workflows that must re-link later. Because that mapping can restore the original text, treat it as PHI and store it as securely as the source data. `consistent=True` keeps the same replacement for repeated entities."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a46710c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "record = \"Patient Sarah Connor, DOB 05/13/1985, phone 555-867-5309, MRN 99887766.\"\n",
+    "\n",
+    "# Swap detected PHI for synthetic stand-ins (instead of [TAGS]) and keep the mapping,\n",
+    "# which is the piece that makes the operation reversible.\n",
+    "replaced = deidentify(\n",
+    "    record,\n",
+    "    method=\"replace\",\n",
+    "    model_name=MODEL,\n",
+    "    keep_mapping=True,   # required to reverse later\n",
+    "    consistent=True,     # same entity -> same replacement\n",
+    ")\n",
+    "surrogate_text, mapping = replaced.deidentified_text, replaced.mapping\n",
+    "print(\"De-identified:\", surrogate_text)\n",
+    "print(\"Mapping:\", mapping)\n",
+    "\n",
+    "# Feed the surrogate text and its mapping back in to recover the source text.\n",
+    "restored = reidentify(surrogate_text, mapping)\n",
+    "print(\"Re-identified:\", restored)\n",
+    "\n",
+    "# The comparison below is a demonstration, not a guarantee for every input: only\n",
+    "# entities the model detected (and therefore replaced) can be restored.\n",
+    "print(\"Round-trip matches original:\", restored == record)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "25022aab",
+   "metadata": {},
+   "source": [
+    "## Recipe 4 — Per-language model selection with DEFAULT_PII_MODELS\n",
+    "\n",
+    "`DEFAULT_PII_MODELS` maps each supported language code to a sensible default PII model. Pass `lang=\"fr\"` (etc.) to `deidentify()`/`extract_pii()` and OpenMed picks the right model and locale-aware patterns automatically. Only the lightweight English model is executed below to avoid heavyweight downloads."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2696b03e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Inspect the per-language defaults. Iterating the mapping itself (rather than a\n",
+    "# hand-written list of codes) lists every configured language and can never ask\n",
+    "# for a code that is not there.\n",
+    "for lang in sorted(DEFAULT_PII_MODELS):\n",
+    "    print(f\"{lang}: {DEFAULT_PII_MODELS[lang]}\")\n",
+    "\n",
+    "def model_for(lang: str) -> str:\n",
+    "    \"\"\"Return the default PII model id for a language code.\n",
+    "\n",
+    "    The lookup is a plain ``DEFAULT_PII_MODELS[lang]``, so a code with no entry\n",
+    "    propagates the mapping's own lookup error (``KeyError`` for a regular dict).\n",
+    "    \"\"\"\n",
+    "    return DEFAULT_PII_MODELS[lang]\n",
+    "\n",
+    "# Run the lightweight English model live:\n",
+    "sample_en = \"Patient John Doe was discharged on 2025-01-05.\"\n",
+    "print(deidentify(sample_en, method=\"mask\", model_name=model_for(\"en\"), lang=\"en\").deidentified_text)\n",
+    "\n",
+    "# For other languages, just pass lang=<code>; OpenMed selects the model + locale rules.\n",
+    "# Not executed here to avoid downloading a larger model:\n",
+    "#   deidentify(\"Né le 15/01/1970 à Paris.\", method=\"mask\", lang=\"fr\").deidentified_text"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3d9552bf",
+   "metadata": {},
+   "source": [
+    "## Summary\n",
+    "\n",
+    "You now have four reusable de-identification recipes: list/CSV, directory batch, reversible round-trip, and per-language selection. For the full reference — every method, smart merging, confidence thresholds, custom patterns, and HIPAA notes — see [PII_Detection_Complete_Guide.ipynb](./PII_Detection_Complete_Guide.ipynb) and [Multilingual_PII_Detection_Guide.ipynb](./Multilingual_PII_Detection_Guide.ipynb).\n",
+    "\n",
+    "**Remember:** synthetic data only — never commit real PHI."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "openmed",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/notebooks/README.md b/examples/notebooks/README.md
@@ -38,6 +38,23 @@ A comprehensive tutorial covering **everything** about PII functionality:
 
 ---
 
+### [Deidentification_Cookbook.ipynb](./Deidentification_Cookbook.ipynb)
+
+**Task-oriented de-identification recipes (copy-paste)**
+
+Short, self-contained recipes for the most common real-world workflows:
+
+- De-identify a list or CSV of clinical strings
+- Batch-redact a directory of text files with `BatchProcessor`
+- Reversible replace + re-identify round-trip
+- Per-language model selection with `DEFAULT_PII_MODELS`
+
+Uses **synthetic PHI only** and a lightweight 44M English model for CPU-friendly runs.
+
+**Recommended for:** Engineers wiring de-identification into a pipeline who want a quick, runnable starting point.
+
+---
+
 ### 🌍 [Multilingual_PII_Detection_Guide.ipynb](./Multilingual_PII_Detection_Guide.ipynb)
 
 **Multilingual PII detection across 9 supported languages**

diff --git a/tests/unit/test_deidentification_cookbook.py b/tests/unit/test_deidentification_cookbook.py
@@ -0,0 +1,101 @@
+"""Structural validation for the de-identification cookbook notebook.
+
+The notebook's model-calling cells are not executed in CI (out of scope); these
+tests confirm the notebook is valid JSON and has the expected recipe structure.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[2]
+NOTEBOOK = ROOT / "examples" / "notebooks" / "Deidentification_Cookbook.ipynb"
+NB_README = ROOT / "examples" / "notebooks" / "README.md"
+
+REQUIRED_SECTIONS = [
+    "Recipe 1 — De-identify a list or CSV of clinical strings",
+    "Recipe 2 — Batch-redact a directory with BatchProcessor",
+    "Recipe 3 — Reversible replace + re-identify round-trip",
+    "Recipe 4 — Per-language model selection with DEFAULT_PII_MODELS",
+]
+
+
+def _load():
+    return json.loads(NOTEBOOK.read_text(encoding="utf-8"))
+
+
+def _cell_text(cell):
+    source = cell.get("source", "")
+    if isinstance(source, list):
+        return "".join(source)
+    return source
+
+
+def _notebook_text():
+    return "\n".join(_cell_text(cell) for cell in _load()["cells"])
+
+
+def test_notebook_exists():
+    assert NOTEBOOK.exists()
+
+
+def test_notebook_is_valid_structure():
+    nb = _load()
+    assert nb["nbformat"] == 4
+    assert isinstance(nb.get("nbformat_minor"), int)
+    assert isinstance(nb.get("metadata"), dict)
+    assert isinstance(nb.get("cells"), list)
+    assert nb["cells"]
+
+    for cell in nb["cells"]:
+        assert cell["cell_type"] in {"markdown", "code"}
+        assert isinstance(cell.get("metadata"), dict)
+        assert isinstance(cell.get("source"), list)
+        assert all(isinstance(line, str) for line in cell["source"])
+
+
+def test_notebook_has_four_recipes():
+    nb = _load()
+    markdown = "\n".join(
+        _cell_text(cell) for cell in nb["cells"] if cell["cell_type"] == "markdown"
+    )
+    for header in REQUIRED_SECTIONS:
+        assert header in markdown, f"missing recipe section: {header}"
+
+
+def test_notebook_uses_synthetic_data_disclaimer():
+    assert "Synthetic data only" in _notebook_text()
+
+
+def test_notebook_uses_expected_public_apis():
+    source = _notebook_text()
+    for snippet in [
+        "deidentify(",
+        "reidentify(",
+        "BatchProcessor(",
+        "process_directory(",
+        "DEFAULT_PII_MODELS",
+        'method="replace"',
+        "keep_mapping=True",
+        "consistent=True",
+    ]:
+        assert snippet in source
+
+
+def test_batch_recipe_writes_redacted_files():
+    source = _notebook_text()
+    assert "notes_deidentified" in source
+    assert "write_text(item.result.deidentified_text" in source
+
+
+def test_notebook_outputs_are_cleared():
+    nb = _load()
+    for cell in nb["cells"]:
+        if cell["cell_type"] == "code":
+            assert cell.get("outputs") == []
+            assert cell.get("execution_count") is None
+
+
+def test_readme_links_cookbook():
+    assert "Deidentification_Cookbook.ipynb" in NB_README.read_text(encoding="utf-8")