yaooqinn
diff --git a/‎setup.py‎
Lines changed: 5 additions & 1 deletion b/‎setup.py‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎spark_history_cli/cli.py‎
Lines changed: 12 additions & 9 deletions b/‎spark_history_cli/cli.py‎
Lines changed: 12 additions & 9 deletions
diff --git a/‎spark_history_cli/skills/spark-advisor/SKILL.md‎
Lines changed: 143 additions & 0 deletions b/‎spark_history_cli/skills/spark-advisor/SKILL.md‎
Lines changed: 143 additions & 0 deletions
diff --git a/‎spark_history_cli/skills/spark-advisor/references/comparison.md‎
Lines changed: 147 additions & 0 deletions b/‎spark_history_cli/skills/spark-advisor/references/comparison.md‎
Lines changed: 147 additions & 0 deletions
@@ -15,7 +15,11 @@
     },
     packages=find_packages(),
     package_data={
-        "spark_history_cli": ["skills/*.md"],
+        "spark_history_cli": [
+            "skills/*.md",
+            "skills/spark-advisor/*.md",
+            "skills/spark-advisor/references/*.md",
+        ],
     },
     install_requires=[
         "click>=8.0.0",
 
@@ -18,7 +18,7 @@
 from spark_history_cli.core.client import SparkHistoryClient, HistoryServerError
 from spark_history_cli.core.session import Session
 from spark_history_cli.core import formatters as fmt
-from spark_history_cli.utils.skill_install import default_skill_target, install_copilot_skill
+from spark_history_cli.utils.skill_install import default_skill_target, install_all_skills
 
 
 # ── Shared state via Click context ────────────────────────────────────
@@ -1100,27 +1100,30 @@ def cmd_install_skill(
     target_dir: Path | None,
     force: bool,
 ):
-    """Install the bundled Copilot skill."""
-    destination = target_dir or default_skill_target(scope)
+    """Install the bundled Copilot skills (spark-history-cli + spark-advisor)."""
+    base = target_dir or default_skill_target(scope)
     try:
-        installed_path = install_copilot_skill(destination, force=force)
+        installed = install_all_skills(base, force=force)
     except FileExistsError as exc:
         raise click.ClickException(str(exc)) from exc
 
+    names = [p.name for p in installed]
     result = {
-        "name": "spark-history-cli",
-        "installed_to": str(installed_path),
+        "skills": names,
+        "installed_to": str(base),
         "scope": scope,
         "next_steps": [
             "Run /skills reload in Copilot CLI if it is already open.",
-            "Verify with /skills list or /skills info spark-history-cli.",
-            "Use it with prompts like 'Use /spark-history-cli to inspect my latest SHS app'.",
+            "Verify with /skills list.",
+            "Use spark-history-cli skill for SHS queries.",
+            "Use spark-advisor skill for diagnosis and comparison.",
         ],
     }
     if state.json_mode:
         output_json(result)
     else:
-        click.echo(f"Installed Copilot skill to {installed_path}")
+        for path in installed:
+            click.echo(f"Installed skill: {path.name} -> {path}")
         click.echo("Next steps:")
         for step in result["next_steps"]:
             click.echo(f"  - {step}")
 
@@ -0,0 +1,143 @@
+---
+name: "spark-advisor"
+description: "Diagnose, compare, and optimize Apache Spark applications and SQL queries using Spark History Server data. Use this skill whenever the user wants to understand why a Spark app is slow, compare two benchmark runs or TPC-DS results, find performance bottlenecks (skew, GC pressure, shuffle spill, straggler tasks), get tuning recommendations, or optimize Spark/Gluten configurations. Also trigger when the user mentions 'diagnose', 'compare runs', 'why is this query slow', 'tune my Spark job', 'benchmark comparison', 'performance regression', or asks about executor skew, shuffle overhead, AQE effectiveness, or Gluten offloading issues."
+---
+
+# Spark Advisor
+
+You are a Spark performance engineer. Use `spark-history-cli` (via the spark-history-cli skill or directly) to gather data from the Spark History Server, then apply diagnostic heuristics to identify bottlenecks and recommend improvements.
+
+## When to use this skill
+
+- User asks why a Spark application or SQL query is slow
+- User wants to compare two benchmark runs (especially TPC-DS)
+- User asks for tuning advice based on actual execution data
+- User mentions performance regressions between runs
+- User wants to understand executor skew, GC pressure, shuffle overhead, or spill
+- User asks about Gluten/Velox offloading effectiveness
+
+## Prerequisites
+
+- A running Spark History Server accessible via `spark-history-cli`
+- If the CLI is not installed: `pip install spark-history-cli`
+- Default server: `http://localhost:18080` (override with `--server`)
+
+## Core Workflow
+
+### 1. Gather Context
+
+Always start by understanding what the user has and what they want to know:
+- Which application(s)? Get app IDs.
+- Single app diagnosis or comparison between two apps?
+- Specific query concern or overall app performance?
+- What changed between runs (config, data, Spark version, Gluten version)?
+
+### 2. Collect Data
+
+Use `--json` for all data collection so you can reason over structured data.
+
+**For single-app diagnosis**, collect in this order:
+```bash
+# Overview first
+spark-history-cli --json -a <app> summary
+spark-history-cli --json -a <app> env
+
+# Then drill into workload
+spark-history-cli --json -a <app> sql                    # all SQL executions
+spark-history-cli --json -a <app> stages                 # all stages
+spark-history-cli --json -a <app> executors --all         # executor metrics
+```
+
+**For app comparison**, collect the same data for both apps.
+
+**For specific query diagnosis**, also fetch:
+```bash
+spark-history-cli --json -a <app> sql <exec-id>          # SQL detail with nodes/edges
+spark-history-cli -a <app> sql-plan <exec-id> --view final   # post-AQE plan
+spark-history-cli -a <app> sql-plan <exec-id> --view initial # pre-AQE plan
+spark-history-cli --json -a <app> sql-jobs <exec-id>     # linked jobs
+spark-history-cli --json -a <app> stage-summary <stage>  # task quantiles for slow stages
+spark-history-cli --json -a <app> stage-tasks <stage> --sort-by -runtime --length 10  # stragglers
+```
+
+### 3. Analyze
+
+Apply the diagnostic rules from `references/diagnostics.md` to identify issues.
+Key areas to check:
+- **Duration breakdown**: Where is time spent? (stages, tasks, shuffle, GC)
+- **Skew detection**: Compare p50 vs p95 task duration in stage-summary; a p95/p50 ratio above 3x suggests skew
+- **GC pressure**: Total GC time vs executor run time; >10% is concerning
+- **Shuffle overhead**: Large shuffle read/write relative to input size
+- **Spill**: Any memory or disk spill indicates memory pressure
+- **Straggler tasks**: Tasks much slower than peers (check stage-tasks sorted by runtime)
+- **Config issues**: Suboptimal shuffle partitions, executor sizing, serializer choice
+
+### 4. Compare (when applicable)
+
+For TPC-DS benchmark comparisons, see `references/comparison.md` for the structured approach:
+- Match queries by name (q1, q2, ..., q99)
+- Calculate speedup/regression per query
+- Identify top-N improved and regressed queries
+- Drill into regressed queries to find root cause
+- Compare configurations side-by-side
+
+### 5. Report
+
+Produce two outputs:
+1. **Conversation summary**: Key findings and top recommendations (concise, actionable)
+2. **Detailed report file**: Full analysis saved to disk as Markdown
+
+Report structure:
+```markdown
+# Spark Performance Report
+
+## Executive Summary
+<2-3 sentence overview of findings>
+
+## Application Overview
+<summary data for each app>
+
+## Findings
+### Finding 1: <title>
+- **Severity**: High/Medium/Low
+- **Evidence**: <specific metrics>
+- **Recommendation**: <what to change>
+
+## Configuration Comparison (if comparing)
+<side-by-side diff of key Spark properties>
+
+## Query-Level Analysis (if TPC-DS)
+<table of query durations with speedup/regression>
+
+## Recommendations
+<prioritized list of actionable changes>
+```
+
+## Diagnostic Quick Reference
+
+These are the most impactful things to check. For the full diagnostic ruleset, see `references/diagnostics.md`.
+
+| Symptom | What to Check | CLI Command |
+|---------|--------------|-------------|
+| Slow overall | Duration breakdown by stage | `summary`, `stages` |
+| Task skew | p50 vs p95 duration | `stage-summary <id>` |
+| GC pressure | GC time vs run time per executor | `executors --all` |
+| Shuffle heavy | Shuffle bytes vs input bytes | `stages`, `stage <id>` |
+| Memory spill | Spill bytes > 0 | `stage <id>`, `stage-summary <id>` |
+| Straggler tasks | Top tasks by runtime | `stage-tasks <id> --sort-by -runtime` |
+| Bad config | Partition count, executor sizing | `env`, `summary` |
+| AQE ineffective | Initial vs final plan difference | `sql-plan <id> --view initial/final` |
+| Gluten fallback | Non-Transformer nodes in final plan | `sql-plan <id> --view final` |
+
+## Gluten/Velox Awareness
+
+When analyzing Gluten-accelerated applications:
+- **Plan nodes**: `*Transformer` and `*ExecTransformer` nodes indicate Gluten-offloaded operators
+- **Fallback detection**: Non-Transformer nodes in the final plan (e.g., `SortMergeJoin` instead of `ShuffledHashJoinExecTransformer`) indicate Gluten fallback — these are performance-critical to investigate
+- **Columnar exchanges**: `ColumnarExchange` and `ColumnarBroadcastExchange` are Gluten's native shuffle and broadcast exchanges — look for `VeloxColumnarToRow` transitions, which indicate fallback boundaries
+- **Native metrics**: Gluten stages may show different metric patterns (lower GC, different memory profiles) than vanilla Spark stages
+
+## References
+
+- `references/diagnostics.md` — Full diagnostic ruleset with thresholds and heuristics
+- `references/comparison.md` — TPC-DS benchmark comparison methodology
@@ -0,0 +1,147 @@
+# TPC-DS Benchmark Comparison
+
+Methodology for comparing two TPC-DS benchmark runs using `spark-history-cli`.
+
+## Step 1: Identify the Two Runs
+
+Ask the user for two application IDs, or find them automatically:
+```bash
+spark-history-cli --json apps --status completed --limit 10
+```
+
+Label them as **baseline** (older/reference run) and **candidate** (newer/test run).
+
+## Step 2: Collect Summaries
+
+For each app:
+```bash
+spark-history-cli --json -a <app> summary
+spark-history-cli --json -a <app> env
+spark-history-cli --json -a <app> sql
+```
+
+## Step 3: Match Queries
+
+TPC-DS queries are identified by their SQL execution `description` field, which typically contains the query name (e.g., "Query - q1", "q1 [i:1]").
+
+Parse the description to extract query names and match them across the two runs.
+
+**Matching rules**:
+- Strip prefixes like "Query - ", "Delta: Query - ", etc.
+- Match on the base query name (q1, q2, ..., q99, plus the split variants q14a, q14b, q23a, q23b, q24a, q24b, q39a, q39b)
+- TPC-DS has 99 queries but some are split (q14a/b, q23a/b, q24a/b, q39a/b) = ~103 total
+- If a query ran multiple iterations, pick one rule (the first successful run, or the fastest run) and apply the same rule to both apps so the comparison stays like-for-like
+
+## Step 4: Calculate Metrics
+
+For each matched query pair:
+```
+duration_baseline = baseline SQL execution duration (ms)
+duration_candidate = candidate SQL execution duration (ms)
+speedup = duration_baseline / duration_candidate
+regression = duration_candidate / duration_baseline (if > 1.0)
+delta_seconds = (duration_candidate - duration_baseline) / 1000
+```
+
+Aggregate metrics:
+```
+total_baseline = sum of all baseline query durations
+total_candidate = sum of all candidate query durations
+overall_speedup = total_baseline / total_candidate
+geomean_speedup = geometric mean of per-query speedups
+```
+
+## Step 5: Produce Comparison Table
+
+Sort queries by signed time delta in descending order (largest regression first, largest improvement last):
+
+```markdown
+| Query | Baseline | Candidate | Delta | Speedup | Status |
+|-------|----------|-----------|-------|---------|--------|
+| q67   | 72s      | 85s       | +13s  | 0.85x   | ⚠ REGRESSED |
+| q1    | 61s      | 45s       | -16s  | 1.36x   | ✓ IMPROVED |
+| ...   | ...      | ...       | ...   | ...     | ... |
+```
+
+**Status labels**:
+- `✓ IMPROVED`: speedup > 1.05x (>5% faster)
+- `≈ NEUTRAL`: speedup between 0.95x and 1.05x, inclusive (within ±5%)
+- `⚠ REGRESSED`: speedup < 0.95x (>5% slower)
+
+## Step 6: Drill into Regressions
+
+For the top-3 regressed queries, investigate root cause:
+
+1. **Compare plans**: Fetch `sql-plan --view final` for both apps
+   - Did the plan change? (different join strategies, missing Gluten offloading)
+   - Did AQE make different decisions?
+
+2. **Compare stage metrics**: For the slowest stages in each
+   - Check task skew (`stage-summary`)
+   - Check shuffle size changes
+   - Check GC time differences
+
+3. **Compare configurations**: Diff the `env` output
+   - Focus on: shuffle partitions, memory, broadcast threshold, AQE settings
+   - For Gluten: check native engine version, offload settings
+
+## Step 7: Configuration Diff
+
+Extract key Spark properties from both apps' `env` and show differences:
+
+```markdown
+| Property | Baseline | Candidate |
+|----------|----------|-----------|
+| spark.executor.memory | 56g | 64g |
+| spark.sql.shuffle.partitions | 200 | 512 |
+| spark.sql.adaptive.enabled | true | true |
+```
+
+Focus on properties that differ and are performance-relevant:
+- `spark.executor.memory`, `spark.executor.cores`, `spark.executor.instances`
+- `spark.sql.shuffle.partitions`
+- `spark.sql.adaptive.*`
+- `spark.sql.autoBroadcastJoinThreshold`
+- `spark.serializer`
+- Any `spark.gluten.*` or `spark.plugins` changes
+
+## Step 8: Recommendations
+
+Based on the comparison findings, prioritize recommendations:
+
+1. **If overall regression**: Focus on the top regressed queries and their root causes
+2. **If overall improvement but some regressions**: Note the wins, investigate the regressions
+3. **If config change caused issues**: Recommend reverting specific settings
+4. **If plan changes caused regression**: Recommend query hints or optimizer settings
+
+## Report Template
+
+```markdown
+# TPC-DS Benchmark Comparison
+
+## Overview
+- **Baseline**: <app-name> (<app-id>) — <duration>
+- **Candidate**: <app-name> (<app-id>) — <duration>
+- **Overall**: <speedup>x (total <baseline-total>s → <candidate-total>s)
+- **Improved**: N queries | **Regressed**: N queries | **Neutral**: N queries
+
+## Configuration Changes
+<config diff table>
+
+## Query Results
+<full comparison table sorted by delta>
+
+## Top Regressions
+### q67: +13s (0.85x)
+- **Root cause**: <explanation>
+- **Evidence**: <specific metrics>
+
+## Top Improvements
+### q1: -16s (1.36x)
+- **Likely cause**: <explanation>
+
+## Recommendations
+1. <highest impact recommendation>
+2. <second recommendation>
+3. ...
+```