
Commit 6259b06 ("Update files"), 1 parent: 47918dc

26 files changed: +1092, −334 lines

README.md

Lines changed: 22 additions & 10 deletions

````diff
@@ -59,16 +59,26 @@ Create a **Dedicated** cluster with the Neo4j Spark Connector:
 
 ### 2. Import the Workshop
 
-1. Clone or download this repository
-2. In Databricks, go to **Workspace**
-3. Click **Import** and upload the `labs/` folder
+In Databricks, go to **Workspace** > right-click your user folder > **Import** > **URL** and paste:
+
+```
+<DBC_URL>
+```
+
+This imports all lab notebooks into your workspace. Data files (CSV, HTML, embeddings) are downloaded automatically from GitHub when you run the setup notebook.
+
+> **Alternative:** If you prefer to import manually, clone the repo and use the Databricks CLI:
+> ```bash
+> git clone https://github.com/neo4j-partners/graph-enrichment.git
+> databricks workspace import-dir graph-enrichment/labs /Users/<your-email>/graph-enrichment
+> ```
 
 ### 3. Run Required Setup
 
-Open and run **labs/0 - Required Setup**. It will:
+Open and run **0 - Required Setup**. It will:
 
 - Create a catalog, schema, and volume based on your username
-- Copy all data files (CSV, HTML, and pre-computed embeddings) to the volume
+- Download all data files (CSV, HTML, and pre-computed embeddings) from GitHub into your volume
 - Prompt you for Neo4j connection details and store them as Databricks secrets
 - Verify the Neo4j connection
 
@@ -99,15 +109,19 @@ graph-enrichment/
 ├── labs/
 │   ├── 0 - Required Setup.py          # Environment setup notebook
 │   ├── 1 - Neo4j Import.py            # Single-step Neo4j data import
+│   ├── 4 - Neo4j to Lakehouse.py      # Export graph to Delta tables
+│   ├── 5 - AI Agents.py               # Genie + Knowledge Assistant
+│   ├── 6 - Supervisor Agent.py        # Multi-agent coordinator
 │   └── Includes/
-│       ├── config.yaml                # Workshop configuration
+│       ├── config.py                  # Workshop configuration (imported via %run)
 │       ├── _lib/
-│       │   ├── setup_orchestrator.py  # Setup logic
+│       │   ├── setup_orchestrator.py  # Setup + GitHub data download
 │       │   └── neo4j_import.py        # Import logic
 │       └── data/
 │           ├── csv/                   # Source CSV files (7 files)
 │           ├── html/                  # Source HTML documents (14 files)
 │           └── embeddings/            # Pre-computed embedding vectors
+├── build_dbc.py                       # Script to package labs/ as a .dbc archive
 ├── lab_7_augmentation_agent/          # Lab 7: Graph Augmentation
 ├── full_demo/                         # Reference implementation, validation scripts, and admin tools
 ├── docs/                              # Reference documentation
@@ -124,11 +138,10 @@ graph-enrichment/
 | Runtime | 13.3 LTS ML or higher (Spark 3.x) |
 | Maven Library | `org.neo4j:neo4j-connector-apache-spark_2.12:5.3.1_for_spark_3` |
 
-The **ML Runtime** is recommended because it includes `pyyaml`, `neo4j`, and `beautifulsoup4`. If using a standard (non-ML) runtime, install these Python packages as cluster libraries:
+The **ML Runtime** is recommended because it includes `neo4j` and `beautifulsoup4`. If using a standard (non-ML) runtime, install these Python packages as cluster libraries:
 
 | Package | Used By |
 |---------|---------|
-| `pyyaml` | Setup notebook (reads config.yaml) |
 | `neo4j` | Import notebook (Neo4j Python driver for document graph) |
 | `beautifulsoup4` | Embedding generation (`generate_embeddings.py`, not student-facing) |
 | `databricks-langchain` | Embedding generation (`generate_embeddings.py`, not student-facing) |
@@ -172,7 +185,6 @@ Any issues discovered through the use of this project should be filed as GitHub
 | pydantic | Data validation | MIT | https://github.com/pydantic/pydantic |
 | mlflow | ML experiment tracking | Apache 2.0 | https://github.com/mlflow/mlflow |
 | beautifulsoup4 | HTML parsing | MIT | https://www.crummy.com/software/BeautifulSoup/ |
-| pyyaml | YAML parsing | MIT | https://github.com/yaml/pyyaml |
 | sentence-transformers | Embedding models | Apache 2.0 | https://github.com/UKPLab/sentence-transformers |
 
 &copy; 2026 Databricks, Inc. All rights reserved. The source in this notebook is provided subject to the [Databricks License](https://databricks.com/db-license-source). All included or referenced third party libraries are subject to the licenses set forth above.
````

build_dbc.py

Lines changed: 88 additions & 0 deletions

New file (@@ -0,0 +1,88 @@):

```python
#!/usr/bin/env python3
"""
Build a .dbc archive from the labs/ directory.

A .dbc file is a ZIP archive containing Databricks notebooks. Each notebook
is stored as a JSON entry with its source code, language, and relative path.

Usage:
    python build_dbc.py                     # outputs labs.dbc
    python build_dbc.py -o my_workshop.dbc  # custom output name
"""

import argparse
import base64
import json
import os
import zipfile

LABS_DIR = os.path.join(os.path.dirname(__file__), "labs")

# Map file extensions to Databricks language identifiers
LANG_MAP = {
    ".py": "PYTHON",
    ".sql": "SQL",
    ".scala": "SCALA",
    ".r": "R",
}


def build_dbc(labs_dir: str, output_path: str):
    """Package all notebook files into a .dbc archive."""
    notebooks = []

    for root, _dirs, files in os.walk(labs_dir):
        for filename in sorted(files):
            ext = os.path.splitext(filename)[1].lower()
            if ext not in LANG_MAP:
                continue

            filepath = os.path.join(root, filename)
            rel_path = os.path.relpath(filepath, labs_dir)

            # Remove the file extension for the notebook path
            notebook_path = os.path.splitext(rel_path)[0]

            with open(filepath, "r") as f:
                source = f.read()

            notebooks.append({
                "path": notebook_path,
                "language": LANG_MAP[ext],
                "source": source,
            })

    with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for nb in notebooks:
            # Each entry is a JSON file at the notebook's path
            entry_path = nb["path"]
            entry = json.dumps({
                "version": "NotebookV1",
                "origId": 0,
                "name": os.path.basename(nb["path"]),
                "language": nb["language"],
                "commands": [
                    {
                        "version": "CommandV1",
                        "origId": 0,
                        "guid": "",
                        "subtype": "command",
                        "commandType": "auto",
                        "position": 1.0,
                        "command": nb["source"],
                    }
                ],
            })
            zf.writestr(entry_path, entry)

    print(f"Built {output_path}")
    print(f"  {len(notebooks)} notebooks:")
    for nb in notebooks:
        print(f"    {nb['path']} ({nb['language']})")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build a .dbc archive from labs/")
    parser.add_argument("-o", "--output", default="labs.dbc", help="Output .dbc filename")
    args = parser.parse_args()
    build_dbc(LABS_DIR, args.output)
```
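Since a .dbc is just a ZIP of JSON entries, the structure the script emits can be round-tripped with the standard library alone. A minimal sketch (the entry contents and the `example.dbc` filename are illustrative, not taken from the repo):

```python
import json
import os
import tempfile
import zipfile

# One entry, mirroring the JSON shape that build_dbc.py writes per notebook.
entry = {
    "version": "NotebookV1",
    "origId": 0,
    "name": "0 - Required Setup",
    "language": "PYTHON",
    "commands": [{
        "version": "CommandV1",
        "origId": 0,
        "guid": "",
        "subtype": "command",
        "commandType": "auto",
        "position": 1.0,
        "command": "print('hello')",
    }],
}

dbc_path = os.path.join(tempfile.mkdtemp(), "example.dbc")
with zipfile.ZipFile(dbc_path, "w", zipfile.ZIP_DEFLATED) as zf:
    zf.writestr("0 - Required Setup", json.dumps(entry))

# Read it back the way an importer would unpack it.
with zipfile.ZipFile(dbc_path) as zf:
    names = zf.namelist()
    notebook = json.loads(zf.read(names[0]))

print(notebook["language"], len(notebook["commands"]))
```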

full_demo/README.md

Lines changed: 2 additions & 2 deletions

```diff
@@ -107,7 +107,7 @@ run_lab7.py # 7. DSPy augmentation agent (requires Supervisor Agent from
 ## How It Works
 
 - `python -m cli upload` pushes Python files to the Databricks workspace via the Databricks SDK
-- `python -m cli submit` checks that the cluster is RUNNING (errors if not), injects Neo4j credentials from `.env` as command-line arguments, and submits a one-shot job via the SDK Jobs API
-- Each script uses `argparse` to receive credentials and prints PASS/FAIL for each verification check
+- `python -m cli submit` checks that the cluster is RUNNING (auto-starts if terminated), passes all non-core `.env` keys as `KEY=VALUE` parameters, and submits a one-shot job via the SDK Jobs API
+- Each script parses `KEY=VALUE` parameters from `sys.argv` into `os.environ` at startup, then reads configuration via `os.environ` / `os.getenv()`
 - Scripts exit with code 0 on success, code 1 on any failure
 - `python -m cli clean` removes the remote workspace directory and deletes job runs matching the `graph_validation:` prefix
```
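The `KEY=VALUE` startup convention is small enough to sketch end to end. This standalone version mirrors the loop the validation scripts inline (the parameter values here are placeholders, not real credentials):

```python
import os
import sys

def parse_params(argv):
    """Copy KEY=VALUE job parameters into os.environ, as the validation
    scripts do at startup. Flag-style arguments starting with '-' are
    skipped, and setdefault means pre-existing environment variables win."""
    for arg in argv:
        if "=" in arg and not arg.startswith("-"):
            key, _, value = arg.partition("=")
            os.environ.setdefault(key, value)

# Simulate what a submitted job would see on sys.argv[1:].
parse_params([
    "NEO4J_URI=neo4j+s://example.databases.neo4j.io",  # placeholder URI
    "--verbose",                                       # flag: ignored
    "NEO4J_USERNAME=neo4j",
])
print(os.environ["NEO4J_URI"])
```

In the real scripts the same loop runs at module import time over `sys.argv[1:]`, before any configuration is read.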

full_demo/agent_modules/check_neo4j.py

Lines changed: 12 additions & 13 deletions

```diff
@@ -7,29 +7,28 @@
     python -m cli upload check_neo4j.py && python -m cli submit check_neo4j.py
 """
 
-import argparse
+import os
 import sys
 import time
 
+# Parse KEY=VALUE parameters from cli.submit into environment variables.
+for _arg in sys.argv[1:]:
+    if "=" in _arg and not _arg.startswith("-"):
+        _key, _, _value = _arg.partition("=")
+        os.environ.setdefault(_key, _value)
+
 
 def main():
-    parser = argparse.ArgumentParser(description="Neo4j Connectivity Check")
-    parser.add_argument("--neo4j-uri", required=True, help="Neo4j Aura URI")
-    parser.add_argument("--neo4j-username", default="neo4j", help="Neo4j username")
-    parser.add_argument("--neo4j-password", required=True, help="Neo4j password")
-    parser.add_argument(
-        "--volume-path",
-        default="",
-        help="(unused, accepted for cli.submit compatibility)",
-    )
-    args = parser.parse_args()
+    neo4j_uri = os.environ["NEO4J_URI"]
+    neo4j_username = os.getenv("NEO4J_USERNAME", "neo4j")
+    neo4j_password = os.environ["NEO4J_PASSWORD"]
 
     from neo4j import GraphDatabase
 
     print("=" * 60)
     print("Neo4j Connectivity Check")
     print("=" * 60)
-    print(f"Neo4j URI: {args.neo4j_uri}")
+    print(f"Neo4j URI: {neo4j_uri}")
     print()
 
     results = []  # (name, passed, detail)
@@ -44,7 +43,7 @@ def record(name, passed, detail=""):
     try:
         t0 = time.time()
         driver = GraphDatabase.driver(
-            args.neo4j_uri, auth=(args.neo4j_username, args.neo4j_password)
+            neo4j_uri, auth=(neo4j_username, neo4j_password)
         )
         driver.verify_connectivity()
         elapsed = time.time() - t0
```

full_demo/agent_modules/generate_embeddings.py

Lines changed: 18 additions & 22 deletions

```diff
@@ -12,24 +12,25 @@
     3. Neo4j credentials configured (for document type classification)
 
 Usage:
-    Run as a Databricks job via submit.sh, or directly on a cluster:
-
-    python generate_embeddings.py \
-        --volume-path /Volumes/catalog/schema/volume \
-        --output-path /Volumes/catalog/schema/volume/embeddings/document_chunks_embedded.json
-
-    Download the output JSON and commit it to Includes/data/embeddings/.
+    python -m cli upload generate_embeddings.py && python -m cli submit generate_embeddings.py
 """
 
-import argparse
 import json
+import os
 import re
+import sys
 import time
 import uuid
 from datetime import datetime, timezone
 from enum import Enum
 from typing import Optional
 
+# Parse KEY=VALUE parameters from cli.submit into environment variables.
+for _arg in sys.argv[1:]:
+    if "=" in _arg and not _arg.startswith("-"):
+        _key, _, _value = _arg.partition("=")
+        os.environ.setdefault(_key, _value)
+
 from bs4 import BeautifulSoup
 
 
@@ -206,28 +207,23 @@ def generate_embeddings_databricks(texts: list[str], endpoint: str = "databricks
 # =============================================================================
 
 def main():
-    parser = argparse.ArgumentParser(description="Generate pre-computed embeddings for workshop HTML files")
-    parser.add_argument("--volume-path", required=True, help="Unity Catalog Volume path containing HTML files")
-    parser.add_argument("--output-path", default=None, help="Output path for JSON file (defaults to volume-path/embeddings/document_chunks_embedded.json)")
-    parser.add_argument("--endpoint", default="databricks-gte-large-en", help="Databricks embedding model endpoint")
-    args = parser.parse_args()
-
-    output_path = args.output_path or f"{args.volume_path}/embeddings/document_chunks_embedded.json"
+    volume_path = os.environ["DATABRICKS_VOLUME_PATH"]
+    endpoint = os.getenv("EMBEDDING_ENDPOINT", "databricks-gte-large-en")
+    output_path = os.getenv("EMBEDDING_OUTPUT_PATH") or f"{volume_path}/embeddings/document_chunks_embedded.json"
 
     print("=" * 70)
     print("EMBEDDING GENERATION - Pre-computing document embeddings")
     print("=" * 70)
-    print(f"Volume path: {args.volume_path}")
+    print(f"Volume path: {volume_path}")
     print(f"Output path: {output_path}")
-    print(f"Endpoint: {args.endpoint}")
+    print(f"Endpoint: {endpoint}")
     print("")
 
     # Step 1: List HTML files
     print("[1/4] Listing HTML files...")
-    html_path = f"{args.volume_path}/html"
+    html_path = f"{volume_path}/html"
 
     # Volumes are regular filesystem paths on Databricks clusters
-    import os
     html_files = sorted(
         f for f in os.listdir(html_path) if f.endswith(".html")
     )
@@ -282,7 +278,7 @@ def main():
     start_time = time.time()
 
     texts = [chunk["text"] for chunk in all_chunks]
-    embeddings = generate_embeddings_databricks(texts, endpoint=args.endpoint)
+    embeddings = generate_embeddings_databricks(texts, endpoint=endpoint)
 
     for i, chunk in enumerate(all_chunks):
         chunk["embedding"] = embeddings[i]
@@ -298,7 +294,7 @@ def main():
     output = {
         "metadata": {
             "generated_at": datetime.now(timezone.utc).isoformat(),
-            "embedding_model": args.endpoint,
+            "embedding_model": endpoint,
             "embedding_dimensions": dimensions,
             "chunk_size": 4000,
             "chunk_overlap": 200,
@@ -324,7 +320,7 @@ def main():
     print("=" * 70)
     print(f"  Documents: {len(documents)}")
    print(f"  Chunks: {len(all_chunks)}")
-    print(f"  Model: {args.endpoint}")
+    print(f"  Model: {endpoint}")
     print(f"  Dims: {dimensions}")
     print("")
     print("Next steps:")
```
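The output metadata above records `chunk_size: 4000` and `chunk_overlap: 200`, but the chunking function itself is outside this hunk. A hypothetical sketch of fixed-size chunking with overlap, consistent with those parameters but not the repo's actual implementation:

```python
def chunk_text(text: str, chunk_size: int = 4000, overlap: int = 200) -> list[str]:
    """Hypothetical sketch: split text into chunk_size-character windows,
    each overlapping the previous one by `overlap` characters. Defaults
    match the chunk_size/chunk_overlap recorded in the output metadata."""
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        if start + chunk_size >= len(text):
            break  # last window reached the end of the text
        start += chunk_size - overlap
    return chunks

# Small sizes for illustration: 10-char windows with a 3-char overlap.
print(chunk_text("abcdefghijklmnopqrst", chunk_size=10, overlap=3))
```

Each chunk then gets its embedding attached (`chunk["embedding"] = embeddings[i]`) before being written to the output JSON.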

full_demo/agent_modules/run_augmentation_agent.py

Lines changed: 10 additions & 0 deletions

```diff
@@ -7,6 +7,16 @@
     job JSON (cli.submit handles this automatically).
 """
 
+import os
+import sys
+
+# Parse KEY=VALUE parameters from cli.submit into environment variables.
+# databricks_job_runner is not available on the cluster, so we inline the logic.
+for _arg in sys.argv[1:]:
+    if "=" in _arg and not _arg.startswith("-"):
+        _key, _, _value = _arg.partition("=")
+        os.environ.setdefault(_key, _value)
+
 from augmentation_agent.__main__ import main
 
 if __name__ == "__main__":
```
