cleanup

retroryan · retroryan · commit 4b2d98d8f31c · 2026-04-10T18:51:04.000-06:00
diff --git a/.env.sample b/.env.sample
@@ -3,5 +3,5 @@
 # by ./setup_secrets.sh
 
 NEO4J_URI=neo4j+s://xxxxxxxx.databases.neo4j.io
-NEO4J_USER=neo4j
+NEO4J_USERNAME=neo4j
 NEO4J_PASSWORD=replace-with-aura-password
diff --git a/00_setup_and_data.py b/00_setup_and_data.py
@@ -24,8 +24,8 @@
 CATALOG = "graph_feature_engineering_demo"
 SCHEMA  = "neo4j_webinar"
 
-NEO4J_URI      = dbutils.secrets.get("neo4j-graph-engineering", "uri")       # neo4j+s://xxx.databases.neo4j.io
-NEO4J_USER     = dbutils.secrets.get("neo4j-graph-engineering", "user")      # neo4j
+NEO4J_URI      = dbutils.secrets.get("neo4j-graph-engineering", "uri")        # neo4j+s://xxx.databases.neo4j.io
+NEO4J_USER     = dbutils.secrets.get("neo4j-graph-engineering", "username")   # neo4j
 NEO4J_PASSWORD = dbutils.secrets.get("neo4j-graph-engineering", "password")   # from Aura credentials file
 
 # COMMAND ----------
diff --git a/01_neo4j_ingest.py b/01_neo4j_ingest.py
@@ -26,7 +26,7 @@
 SCHEMA  = "neo4j_webinar"
 
 NEO4J_URI      = dbutils.secrets.get("neo4j-graph-engineering", "uri")
-NEO4J_USER     = dbutils.secrets.get("neo4j-graph-engineering", "user")
+NEO4J_USER     = dbutils.secrets.get("neo4j-graph-engineering", "username")
 NEO4J_PASSWORD = dbutils.secrets.get("neo4j-graph-engineering", "password")
 
 # Common Spark Connector options
diff --git a/03_pull_and_model.py b/03_pull_and_model.py
@@ -33,7 +33,7 @@
 SCHEMA  = "neo4j_webinar"
 
 NEO4J_URI      = dbutils.secrets.get("neo4j-graph-engineering", "uri")
-NEO4J_USER     = dbutils.secrets.get("neo4j-graph-engineering", "user")
+NEO4J_USER     = dbutils.secrets.get("neo4j-graph-engineering", "username")
 NEO4J_PASSWORD = dbutils.secrets.get("neo4j-graph-engineering", "password")
 
 NEO4J_OPTS = {
diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ The notebooks read credentials via:
 
 ```python
 dbutils.secrets.get("neo4j-graph-engineering", "uri")
-dbutils.secrets.get("neo4j-graph-engineering", "user")
+dbutils.secrets.get("neo4j-graph-engineering", "username")
 dbutils.secrets.get("neo4j-graph-engineering", "password")
 ```
 
diff --git a/aura_gds_guide.md b/aura_gds_guide.md
@@ -18,10 +18,14 @@ When finished, return to Databricks and run `03_pull_and_model`.
 
 ---
 
-## Step 1: Verify the Graph Loaded Correctly
+## Step 1: Verify and Explore the Graph
+
+Before projecting anything, make sure the ingest landed and get a feel for the
+shape of the data.
+
+### 1a. Node and relationship counts
 
 ```cypher
-// Check node and relationship counts
 MATCH (a:Account) WITH count(a) AS accounts
 MATCH (m:Merchant) WITH accounts, count(m) AS merchants
 MATCH ()-[t:TRANSACTED_WITH]->() WITH accounts, merchants, count(t) AS txns
@@ -31,6 +35,50 @@ RETURN accounts, merchants, txns, p2p
 
 **Expected:** ~5,000 accounts, ~500 merchants, ~50,000 transactions, ~8,000 transfers.
 
+### 1b. Fraud vs legitimate account breakdown
+
+```cypher
+MATCH (a:Account)
+RETURN a.is_fraud             AS is_fraud,
+       count(a)                AS account_count,
+       round(avg(a.balance), 2) AS avg_balance,
+       min(a.holder_age)        AS min_age,
+       max(a.holder_age)        AS max_age
+ORDER BY is_fraud DESC
+```
+
+**What to look for:** ~200 fraud accounts (4%) vs ~4,800 legitimate accounts.
+Balances and holder-age ranges should overlap heavily — that's the whole point
+of the dataset. The graph is where the separation lives.
+
+### 1c. Merchant risk-tier distribution
+
+```cypher
+MATCH (m:Merchant)
+RETURN m.risk_tier     AS risk_tier,
+       m.category       AS category,
+       count(m)         AS merchant_count
+ORDER BY risk_tier, merchant_count DESC
+```
+
+**What to look for:** `crypto` and `gaming` categories skew heavily toward
+`risk_tier = high` — these are the merchants fraud accounts preferentially
+transact with.
+
+### 1d. Sample the subgraph around a fraud account
+
+```cypher
+MATCH (a:Account {is_fraud: true})
+WITH a LIMIT 1
+OPTIONAL MATCH (a)-[t:TRANSACTED_WITH]->(m:Merchant)
+OPTIONAL MATCH (a)-[p:TRANSFERRED_TO]->(b:Account)
+RETURN a, t, m, p, b
+```
+
+**What to look for:** the fraud account should connect to several `risk_tier = high`
+merchants and have at least one outgoing `TRANSFERRED_TO` edge to another
+account. Good visual primer before running the algorithms.
+
 ---
 
 ## Step 2: Project the Account Transfer Graph
@@ -248,6 +296,79 @@ across all three features.
 
 ---
 
+## Step 11: Fraud Detection Queries in Pure Cypher
+
+Before handing the features back to Databricks, it is worth seeing the payoff
+in Cypher alone. These two queries combine the GDS-written properties with
+the raw graph to surface fraud patterns directly.
+
+### 11a. Identify Ring Members
+
+A fraud ring is a Louvain community where multiple accounts both send *and*
+receive money within the same community. Accounts that only send or only
+receive are peripheral; accounts on both sides of a transfer are core ring
+participants. The query collects senders and receivers per community, then
+intersects them — any account in both lists is a confirmed bidirectional
+participant. Communities with three or more such accounts are coordinated
+rings, not coincidence.
+
+```cypher
+MATCH (s:Account)-[:TRANSFERRED_TO]->(r:Account)
+WHERE s.community_id IS NOT NULL
+  AND s.community_id = r.community_id
+WITH s.community_id AS community,
+     collect(DISTINCT s.account_id) AS senders,
+     collect(DISTINCT r.account_id) AS receivers
+WITH community,
+     [x IN senders WHERE x IN receivers] AS ring_members
+WHERE size(ring_members) >= 3
+RETURN community,
+       ring_members,
+       size(ring_members) AS ring_size
+ORDER BY ring_size DESC
+```
+
+**What to look for:** small communities (tight clusters) with `ring_size >= 3`.
+Cross-reference the `ring_members` account IDs against the `is_fraud` ground
+truth and you should see high precision — the Louvain + bidirectional
+intersection combo finds rings without needing labels.
+
+### 11b. Off-Hours Transaction Detection
+
+Fraud accounts in this dataset skew slightly toward off-hours activity.
+Flagging accounts with three or more transactions between midnight and 6am
+(`txn_hour` 0 through 5), then joining the already-written `risk_score` and
+`community_id`, gives a single ranked list that combines structural (graph)
+and behavioural (time-of-day) signal.
+
+```cypher
+MATCH (a:Account)-[t:TRANSACTED_WITH]->(m:Merchant)
+WHERE t.txn_hour >= 0 AND t.txn_hour < 6
+WITH a,
+     count(t)                        AS off_hours_count,
+     round(avg(t.amount), 2)         AS avg_amount,
+     round(sum(t.amount), 2)         AS total_amount,
+     collect(DISTINCT m.merchant_id) AS merchants_used
+WHERE off_hours_count >= 3
+RETURN a.account_id         AS account_id,
+       a.is_fraud            AS is_fraud,
+       a.risk_score          AS risk_score,
+       a.community_id        AS community_id,
+       off_hours_count,
+       avg_amount,
+       total_amount,
+       size(merchants_used)  AS distinct_merchants
+ORDER BY off_hours_count DESC
+LIMIT 25
+```
+
+**What to look for:** accounts with high `off_hours_count` that *also* have
+a high `risk_score` and share a `community_id` with other flagged accounts.
+Those are the strongest fraud candidates — three independent signals pointing
+at the same account.
+
+---
+
 ## Done in Aura
 
 The graph now has three GDS-computed properties on every Account node:
diff --git a/setup_secrets.sh b/setup_secrets.sh
@@ -3,11 +3,39 @@
 # scope "neo4j-graph-engineering". Requires the Databricks CLI to be
 # installed and authenticated (databricks auth login or DATABRICKS_HOST/
 # DATABRICKS_TOKEN env vars).
+#
+# Usage:
+#   ./setup_secrets.sh [--profile NAME] [ENV_FILE]
+#
+# The Databricks profile is resolved once and exported as
+# DATABRICKS_CONFIG_PROFILE so every subsequent CLI call in this script
+# reuses it without re-prompting. Resolution order:
+#   1. --profile / -p flag
+#   2. DATABRICKS_CONFIG_PROFILE environment variable
+#   3. Interactive prompt (lists available profiles)
 
 set -euo pipefail
 
 SCOPE="neo4j-graph-engineering"
-ENV_FILE="${1:-.env}"
+ENV_FILE=".env"
+PROFILE="${DATABRICKS_CONFIG_PROFILE:-}"
+
+while [[ $# -gt 0 ]]; do  # consume arguments left to right; flags and ENV_FILE may come in any order
+  case "$1" in
+    -p|--profile)
+      PROFILE="${2?Error: $1 requires a profile NAME}"  # explicit error instead of set -u's bare "unbound variable"
+      shift 2
+      ;;
+    -h|--help)
+      echo "Usage: $0 [--profile NAME] [ENV_FILE]"
+      exit 0
+      ;;
+    *)  # any other argument is taken as the env file path; the last one given wins
+      ENV_FILE="$1"
+      shift
+      ;;
+  esac
+done
 
 if [ ! -f "$ENV_FILE" ]; then
   echo "Error: $ENV_FILE not found."
@@ -20,22 +48,43 @@ if ! command -v databricks >/dev/null 2>&1; then
   exit 1
 fi
 
+# Resolve the Databricks profile once — every CLI call below inherits it via
+# the exported DATABRICKS_CONFIG_PROFILE, so the user is never re-prompted.
+if [ -z "$PROFILE" ]; then
+  echo "Available Databricks profiles:"
+  databricks auth profiles 2>/dev/null || echo "  (could not list profiles — check your ~/.databrickscfg)"
+  echo
+  read -r -p "Profile name [DEFAULT]: " PROFILE || true  # read fails on EOF (non-interactive stdin); do not let set -e abort before the default applies
+  PROFILE="${PROFILE:-DEFAULT}"
+fi
+
+export DATABRICKS_CONFIG_PROFILE="$PROFILE"
+echo "Using Databricks profile: $DATABRICKS_CONFIG_PROFILE"
+echo
+
 # Load .env
 set -a
 # shellcheck disable=SC1090
 source "$ENV_FILE"
 set +a
 
 : "${NEO4J_URI:?NEO4J_URI is not set in $ENV_FILE}"
-: "${NEO4J_USER:?NEO4J_USER is not set in $ENV_FILE}"
+: "${NEO4J_USERNAME:?NEO4J_USERNAME is not set in $ENV_FILE}"
 : "${NEO4J_PASSWORD:?NEO4J_PASSWORD is not set in $ENV_FILE}"
 
-# Create the scope if it does not already exist
-if databricks secrets list-scopes --output json | grep -q "\"name\":\"$SCOPE\""; then
+# Create the scope — if it already exists, that is fine.
+set +e
+create_out=$(databricks secrets create-scope "$SCOPE" 2>&1)
+create_rc=$?
+set -e
+
+if [ "$create_rc" -eq 0 ]; then
+  echo "Created secret scope: $SCOPE"
+elif [[ "$create_out" == *"already exists"* ]]; then
   echo "Secret scope already exists: $SCOPE"
 else
-  echo "Creating secret scope: $SCOPE"
-  databricks secrets create-scope "$SCOPE"
+  echo "Error creating scope: $create_out" >&2
+  exit 1
 fi
 
 put_secret() {
@@ -47,7 +96,7 @@ put_secret() {
 
 echo "Writing secrets into $SCOPE:"
 put_secret "uri"      "$NEO4J_URI"
-put_secret "user"     "$NEO4J_USER"
+put_secret "username" "$NEO4J_USERNAME"
 put_secret "password" "$NEO4J_PASSWORD"
 
 echo