Add summary command for concise application overview

yaooqinn · Copilot · yaooqinn · commit c4edcac820da · 2026-03-20T00:17:04.000+08:00
Aggregates app details, resource config (driver/executor/shuffle),
and workload stats (jobs/stages/tasks/SQL) into a single view.
Uses 6 API calls: app, env, jobs, stages, executors, sql.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -71,6 +71,7 @@ spark-history-cli --app-id <id> stages
 spark-history-cli --app-id <id> executors --all
 spark-history-cli --app-id <id> sql
 spark-history-cli --app-id <id> env
+spark-history-cli --app-id <id> summary
 
 # SQL execution plans
 spark-history-cli --app-id <id> sql-plan <exec-id>                # full plan
@@ -106,6 +107,7 @@ executors [--all]       List executors
 sql [id]                List or show SQL executions
 sql-plan <id> [opts]    Show SQL plan (--view, --dot, -o)
 sql-jobs <id>           Show jobs for a SQL execution
+summary                 Application overview (config + workload)
 rdds                    List cached RDDs
 env                     Show environment/config
 logs [path]             Download event logs
diff --git a/spark_history_cli/cli.py b/spark_history_cli/cli.py
@@ -241,6 +241,18 @@ def repl(state: CliState):
                     output_status_block(skin, info, title="Application")
                     skin.hint(f"Context set to {app_id}")
 
+            elif cmd == "summary":
+                app_id = state.resolve_app_id(None)
+                app = client.get_application(app_id)
+                env = client.get_environment(app_id)
+                jobs = client.list_jobs(app_id)
+                stages = client.list_stages(app_id)
+                executors = client.list_all_executors(app_id)
+                sqls = client.list_sql(app_id, length=100000)
+                sections = fmt.format_summary(app, env, jobs, stages, executors, sqls)
+                for title, info in sections.items():
+                    output_status_block(skin, info, title=title)
+
             elif cmd == "jobs":
                 app_id = state.resolve_app_id(args[0] if args else None)
                 status_filter = None
@@ -442,6 +454,39 @@ def cmd_app(state: CliState, app_id: str):
         output_status_block(skin, info, title="Application")
 
 
+@cli.command("summary")
+@pass_state
+def cmd_summary(state: CliState):
+    """Show a concise summary of an application.
+
+    Aggregates application details, resource config, and workload stats
+    into a single overview.
+
+    Examples:
+
+      spark-history-cli -a <app> summary
+
+      spark-history-cli -a <app> --json summary
+    """
+    client = state.ensure_client()
+    app_id = state.resolve_app_id(None)
+    app = client.get_application(app_id)
+    env = client.get_environment(app_id)
+    jobs = client.list_jobs(app_id)
+    stages = client.list_stages(app_id)
+    executors = client.list_all_executors(app_id)
+    sqls = client.list_sql(app_id, length=100000)
+    if state.json_mode:
+        sections = fmt.format_summary(app, env, jobs, stages, executors, sqls)
+        output_json(sections)
+    else:
+        from spark_history_cli.utils.repl_skin import ReplSkin
+        skin = ReplSkin("spark_history", version=__version__)
+        sections = fmt.format_summary(app, env, jobs, stages, executors, sqls)
+        for title, info in sections.items():
+            output_status_block(skin, info, title=title)
+
+
 @cli.command("jobs")
 @click.option("--status", type=click.Choice(
     ["running", "succeeded", "failed", "unknown"], case_sensitive=False))
diff --git a/spark_history_cli/core/formatters.py b/spark_history_cli/core/formatters.py
@@ -115,6 +115,91 @@ def format_app_detail(app: dict) -> dict[str, str]:
     return info
 
 
+def format_summary(
+    app: dict,
+    env: dict,
+    jobs: list[dict],
+    stages: list[dict],
+    executors: list[dict],
+    sqls: list[dict],
+) -> dict[str, dict[str, str]]:
+    """Build a multi-section summary from several API responses.
+
+    Returns an ordered dict of {section_title: {key: value}} pairs.
+    """
+    from collections import Counter
+
+    attempts = app.get("attempts", [])
+    latest = attempts[0] if attempts else {}
+    status = "RUNNING" if not latest.get("completed", True) else "COMPLETED"
+    runtime = env.get("runtime", {})
+    sp = dict(env.get("sparkProperties", []))
+
+    # ── Application ──
+    application = {
+        "App ID": app.get("id", ""),
+        "Name": app.get("name", ""),
+        "Status": f"{_status_icon(status)} {status}",
+        "Duration": _duration(latest.get("duration")),
+        "Spark Version": (
+            f"{latest.get('appSparkVersion', 'N/A')}  "
+            f"(Scala {runtime.get('scalaVersion', 'N/A').replace('version ', '')}, "
+            f"Java {runtime.get('javaVersion', 'N/A')})"
+        ),
+        "Master": sp.get("spark.master", "N/A"),
+        "User": latest.get("sparkUser", ""),
+        "Started": _ts(latest.get("startTimeEpoch")),
+        "Ended": _ts(latest.get("endTimeEpoch")),
+    }
+
+    # ── Resources ──
+    driver_mem = sp.get("spark.driver.memory", "N/A")
+    driver_cores = sp.get("spark.driver.cores", "N/A")
+    exec_mem = sp.get("spark.executor.memory", "N/A")
+    exec_cores = sp.get("spark.executor.cores", "N/A")
+    exec_instances = sp.get("spark.executor.instances", "N/A")
+    active_execs = sum(1 for e in executors if e.get("isActive"))
+    total_execs = len(executors)
+    dyn_alloc = sp.get("spark.dynamicAllocation.enabled", "false")
+
+    resources = {
+        "Driver": f"{driver_mem} / {driver_cores} cores",
+        "Executors": f"{exec_instances} × {exec_mem} / {exec_cores} cores ({total_execs} total, {active_execs} active)",
+        "Dynamic Allocation": dyn_alloc,
+        "Shuffle Partitions": sp.get("spark.sql.shuffle.partitions", "200"),
+        "Serializer": sp.get("spark.serializer", "JavaSerializer").rsplit(".", 1)[-1],
+    }
+
+    # ── Workload ──
+    job_statuses = Counter(j.get("status", "UNKNOWN") for j in jobs)
+    stage_statuses = Counter(s.get("status", "UNKNOWN") for s in stages)
+    sql_statuses = Counter(s.get("status", "UNKNOWN") for s in sqls)
+
+    total_tasks = sum(j.get("numTasks", 0) for j in jobs)
+    completed_tasks = sum(j.get("numCompletedTasks", 0) for j in jobs)
+
+    def _status_summary(counts: Counter) -> str:
+        total = sum(counts.values())
+        parts = []
+        for s in ["SUCCEEDED", "COMPLETED", "COMPLETE", "RUNNING", "FAILED", "SKIPPED", "KILLED", "PENDING", "UNKNOWN"]:
+            if counts.get(s):
+                parts.append(f"{counts[s]} {s.lower()}")
+        return f"{total} ({', '.join(parts)})" if parts else str(total)
+
+    workload = {
+        "Jobs": _status_summary(job_statuses),
+        "Stages": _status_summary(stage_statuses),
+        "Tasks": f"{completed_tasks:,}/{total_tasks:,} completed",
+        "SQL Executions": _status_summary(sql_statuses),
+    }
+
+    return {
+        "Application": application,
+        "Resources": resources,
+        "Workload": workload,
+    }
+
+
 # ── Job Formatters ────────────────────────────────────────────────────
 
 
diff --git a/spark_history_cli/skills/SKILL.md b/spark_history_cli/skills/SKILL.md
@@ -38,6 +38,7 @@ spark-history-cli --json --server http://localhost:18080 --app-id <app-id> sql
 spark-history-cli --json --server http://localhost:18080 --app-id <app-id> sql-plan <exec-id> --view final
 spark-history-cli --server http://localhost:18080 --app-id <app-id> sql-plan <exec-id> --dot -o plan.dot
 spark-history-cli --json --server http://localhost:18080 --app-id <app-id> sql-jobs <exec-id>
+spark-history-cli --json --server http://localhost:18080 --app-id <app-id> summary
 spark-history-cli --json --server http://localhost:18080 --app-id <app-id> env
 spark-history-cli --server http://localhost:18080 --app-id <app-id> logs output.zip
 ```
@@ -64,6 +65,7 @@ python -m spark_history_cli --json apps
   - `--json` + `--view`: structured JSON with `isAdaptive`, `sectionCount`, `plan`, and `sections`
   - `-o <file>`: write output to file instead of stdout
 - `sql-jobs <id>` for jobs associated with a SQL execution (fetches all linked jobs by ID)
+- `summary` for a concise application overview: app info, resource config (driver/executor/shuffle), and workload stats (jobs/stages/tasks/SQL)
 - `env` for Spark config/runtime context
 - `logs` only when the user explicitly wants the event log archive saved locally