RyanAlberts
diff --git a/‎.secrets.baseline‎
Lines changed: 2 additions & 2 deletions b/‎.secrets.baseline‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 3 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎examples/README.md‎
Lines changed: 2 additions & 1 deletion b/‎examples/README.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎examples/output/deck-w26-pr14-2026-05-01.pptx‎
271 KB b/‎examples/output/deck-w26-pr14-2026-05-01.pptx‎
271 KB
diff --git a/‎pyproject.toml‎
Lines changed: 18 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎src/ycai/analytics.py‎
Lines changed: 214 additions & 0 deletions b/‎src/ycai/analytics.py‎
Lines changed: 214 additions & 0 deletions
diff --git a/‎src/ycai/cli.py‎
Lines changed: 53 additions & 0 deletions b/‎src/ycai/cli.py‎
Lines changed: 53 additions & 0 deletions
@@ -142,9 +142,9 @@
         "filename": "src/ycai/dashboard.py",
         "hashed_secret": "3c09e03744a49c6020501c9b7ef6218ad440976e",
         "is_verified": false,
-        "line_number": 46
+        "line_number": 47
       }
     ]
   },
-  "generated_at": "2026-05-01T21:56:30Z"
+  "generated_at": "2026-05-02T02:15:53Z"
 }
@@ -15,6 +15,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed
 - **PR #12 — Apache ECharts replaces static CSS bars in the dashboard.** Heatmap is now a real 2D heatmap with hover tooltips. Pie charts (confidence, OSS posture) render with proper labeling and click-to-isolate. Bar charts (industry, tech stack, YC tags, regions) get axis pointers and value tooltips. Loaded from CDN with SRI-pinned integrity hash; falls back to a `<noscript>` table if JS is disabled or the CDN is blocked. Each canvas carries `role="img"` + descriptive `aria-label`. All chart options ship as pure JSON in a `<script type="application/json">` block — no JS function strings, no client-side `eval`. 7 chart canvases, 121 tests passing.
 
+### Phase 2 — reports
+- **PR #14 — VC-style `.pptx` deck with anti-hallucination Layer 2.** New `src/ycai/analytics.py` is the single source of chart math, consumed by both the dashboard (ECharts JSON) and the deck (matplotlib PNG). New `src/ycai/reports/ppt.py` builds a 16-slide deck (cream/orange palette, sans/serif typography). Each chart is a matplotlib PNG anchored to the same Counter the dashboard used. `ycai report <run-dir>` produces `deck.pptx` from existing artifacts at zero LLM cost. New `src/ycai/reports/anti_hallucination.py`: forbidden-phrase scan + numerical-drift check + date-pattern stripping. Two prose streams audited separately — aggregate commentary gets full drift check, per-company taglines/rationales get forbidden-phrase only (Layer 1 already gated their source URLs). 24 new tests (145 total).
+
 ## [0.1.0] — 2026-05-01
 
 First publishable release. End-to-end pipeline that pulls the latest YC batch, classifies it with a Sonnet-class model under strict anti-hallucination guards, and renders a single-file HTML dashboard with row-level drill-downs.
 
@@ -4,7 +4,8 @@ Sanitized sample artifacts. Every commit goes through `make publish-check` so PI
 
 | File | What |
 |---|---|
-| [`output/dashboard-w26-pr12-2026-05-01.html`](output/dashboard-w26-pr12-2026-05-01.html) | **PR #12 dashboard — current best.** Same W26 data as PR #11 (depth=1 crawl, 113/124 high-confidence) but charts upgraded to Apache ECharts: real heatmap, pies with click-to-isolate, bars with axis tooltips. Loads via CDN with SRI; `<noscript>` fallback if JS disabled. |
+| [`output/deck-w26-pr14-2026-05-01.pptx`](output/deck-w26-pr14-2026-05-01.pptx) | **PR #14 VC-style deck — current best.** 16 slides, a16z-feel palette, matplotlib chart PNGs anchored to the same data the dashboard uses. Anti-hallucination Layer 2 ran before write (no forbidden phrases, no numerical drift). |
+| [`output/dashboard-w26-pr12-2026-05-01.html`](output/dashboard-w26-pr12-2026-05-01.html) | **PR #12 dashboard — current best HTML.** Same W26 data, ECharts canvases (real heatmap, pies, bars). |
 | [`output/dashboard-w26-pr11-2026-05-01.html`](output/dashboard-w26-pr11-2026-05-01.html) | PR #11 dashboard with the depth=1 crawl but static CSS bars. Useful for comparing visual fidelity vs. PR #12. |
 | [`output/analyses-w26-pr11-2026-05-01.json`](output/analyses-w26-pr11-2026-05-01.json) | Source data for both PR #11 and PR #12 dashboards. 113/124 high-confidence. |
 | [`output/dashboard-w26-pr4-2026-05-01.html`](output/dashboard-w26-pr4-2026-05-01.html) | PR #4 / v0.1.0 dashboard. Useful baseline (no crawl, 65 OSS-unknown rows; static CSS bars). |
 
@@ -23,6 +23,8 @@ dependencies = [
   "rich>=13",
   "anthropic>=0.40",
   "claude-agent-sdk>=0.1",
+  "python-pptx>=1.0",
+  "matplotlib>=3.7",
 ]
 
 [project.optional-dependencies]
@@ -81,6 +83,22 @@ warn_return_any = true
 warn_unreachable = true
 files = ["src/ycai"]
 
+# Third-party libraries without published type stubs. We pin to the public
+# constructor surface and otherwise treat their return types as Any.
+[[tool.mypy.overrides]]
+module = ["pptx.*", "matplotlib.*"]
+ignore_missing_imports = true
+
+# The deck builder is a thin wrapper over python-pptx whose API surface is
+# untyped (Presentation is a factory function, not a class). Keep strict
+# mode for everything else; relax just this module.
+[[tool.mypy.overrides]]
+module = "ycai.reports.ppt"
+disallow_untyped_calls = false
+disallow_untyped_defs = false
+warn_return_any = false
+disable_error_code = ["valid-type", "attr-defined", "no-any-return", "type-arg", "misc", "no-untyped-def"]
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 addopts = "-ra --strict-markers --strict-config"
 
@@ -0,0 +1,214 @@
+"""Pure-Python chart math, shared by every renderer.
+
+Each function takes the validated CompanyAnalysis cohort plus optional context
+and returns plain Python data structures (Counters, dicts, lists). No HTML, no
+ECharts JSON, no matplotlib. The rendering layers (``dashboard.py`` for HTML,
+``reports/ppt.py`` for the deck, and the planned ``reports/docx.py`` for the
+memo) consume the output of this module.
+
+This separation is what lets the deck and memo cite the *same* numbers as the
+dashboard. The numerical-drift check in ``reports/anti_hallucination.py`` walks
+the output of these functions and compares it to the prose in the deck/memo.
+"""
+
+from __future__ import annotations
+
+from collections import Counter, defaultdict
+from collections.abc import Iterable
+from dataclasses import dataclass
+
+from ycai.schemas import (
+    CompanyAnalysis,
+    Industry,
+    RawCompany,
+)
+
+
+@dataclass(frozen=True)
+class CapabilityHeatmap:
+    """Capability x industry matrix for the headline chart.
+
+    Immutable container for the result of ``capability_heatmap``. The matrix
+    is sparse: a (capability, industry) pair that never occurred has no key,
+    so renderers should read cells with a default of 0.
+    """
+
+    # Row labels (Y axis), as selected by ``capability_heatmap``.
+    capabilities: list[str]  # row labels (Y axis)
+    # Column labels (X axis), as selected by ``capability_heatmap``.
+    industries: list[str]  # col labels (X axis)
+    # Cell counts keyed by (capability label, industry label).
+    matrix: dict[tuple[str, str], int]
+    # Size of the filtered cohort, including companies that list no capability.
+    total_keep: int  # rows fed into the heatmap (high+medium confidence)
+
+
+def keep_for_charts(analyses: Iterable[CompanyAnalysis]) -> list[CompanyAnalysis]:
+    """Return the chart cohort: analyses whose confidence is high or medium.
+
+    Low-confidence rows are dropped on purpose. Charts built from the
+    LLM-derived analyses are expected to go through this filter; charts built
+    from other inputs (such as the YC tag distribution, which is computed from
+    raw companies) are outside its scope and use their own math.
+    """
+    accepted = ("high", "medium")
+    cohort: list[CompanyAnalysis] = []
+    for analysis in analyses:
+        if analysis.confidence in accepted:
+            cohort.append(analysis)
+    return cohort
+
+
+def confidence_breakdown(analyses: Iterable[CompanyAnalysis]) -> Counter[str]:
+    """Count analyses per confidence label; no cohort filter is applied."""
+    tally: Counter[str] = Counter()
+    for analysis in analyses:
+        tally[analysis.confidence] += 1
+    return tally
+
+
+def industry_distribution(analyses: Iterable[CompanyAnalysis]) -> Counter[str]:
+    """Count LLM-classified primary industries over the high+medium cohort."""
+    tally: Counter[str] = Counter()
+    for analysis in keep_for_charts(analyses):
+        tally[analysis.industry_primary.value] += 1
+    return tally
+
+
+def capability_heatmap(analyses: Iterable[CompanyAnalysis]) -> CapabilityHeatmap:
+    """Build the capability x industry matrix from the high+medium cohort.
+
+    Each capability a company lists adds one to the cell for that capability
+    and the company's primary industry. Rows are then limited to the 8
+    capabilities with the largest totals and columns to the 6 industries with
+    the largest totals; cells outside that selection are dropped.
+    """
+    cohort = keep_for_charts(analyses)
+    cells: dict[tuple[str, str], int] = defaultdict(int)
+    row_totals: Counter[str] = Counter()
+    col_totals: Counter[str] = Counter()
+    for analysis in cohort:
+        for capability in analysis.ai_capability:
+            row = capability.value
+            col = analysis.industry_primary.value
+            cells[(row, col)] += 1
+            # Totals are accumulated alongside the cells; labels are first
+            # seen in the same order as when walking the finished matrix.
+            row_totals[row] += 1
+            col_totals[col] += 1
+
+    row_labels = [label for label, _ in row_totals.most_common(8)]
+    col_labels = [label for label, _ in col_totals.most_common(6)]
+    shown_rows = set(row_labels)
+    shown_cols = set(col_labels)
+
+    visible: dict[tuple[str, str], int] = {}
+    for (row, col), count in cells.items():
+        if row in shown_rows and col in shown_cols:
+            visible[(row, col)] = count
+
+    return CapabilityHeatmap(
+        capabilities=row_labels,
+        industries=col_labels,
+        matrix=visible,
+        total_keep=len(cohort),
+    )
+
+
+def capability_totals(analyses: Iterable[CompanyAnalysis]) -> Counter[str]:
+    """Count capabilities over the high+medium cohort.
+
+    The counter is flat: a company adds one for every capability entry it
+    lists, so a company with several capabilities appears under each of them.
+    """
+    return Counter(
+        capability.value
+        for analysis in keep_for_charts(analyses)
+        for capability in analysis.ai_capability
+    )
+
+
+def tech_stack_distribution(analyses: Iterable[CompanyAnalysis]) -> Counter[str]:
+    """Count tech-stack labels over the high+medium cohort.
+
+    An analysis whose ``tech_stack`` is empty adds one to the ``"unknown"``
+    bucket, so it still shows up in the chart instead of vanishing.
+    """
+    tally: Counter[str] = Counter()
+    for analysis in keep_for_charts(analyses):
+        if analysis.tech_stack:
+            tally.update(entry.value for entry in analysis.tech_stack)
+        else:
+            tally["unknown"] += 1
+    return tally
+
+
+def oss_posture_distribution(analyses: Iterable[CompanyAnalysis]) -> Counter[str]:
+    """Count OSS-posture labels over the high+medium confidence cohort."""
+    tally: Counter[str] = Counter()
+    for analysis in keep_for_charts(analyses):
+        tally[analysis.oss_posture.value] += 1
+    return tally
+
+
+def yc_tag_distribution(companies: Iterable[RawCompany]) -> Counter[str]:
+    """Count tags over the raw companies; no confidence filter is involved."""
+    return Counter(tag for company in companies for tag in company.tags)
+
+
+def region_distribution(companies: Iterable[RawCompany]) -> Counter[str]:
+    """Count regions over the raw companies; no confidence filter is involved."""
+    return Counter(region for company in companies for region in company.regions)
+
+
+def headline_numbers(
+    analyses: Iterable[CompanyAnalysis],
+    coverage: object | None = None,
+) -> dict[str, int]:
+    """Numbers any prose layer is allowed to cite. The drift checker compares
+    every number in deck/memo prose against this dict.
+
+    The base dict holds the confidence counts (``total_analyses``,
+    ``high_confidence``, ``medium_confidence``, ``low_confidence``), the chart
+    cohort size (``cohort_size``) and the ``agents`` / ``no-ai`` / ``rag``
+    capability counts.
+
+    When ``coverage`` is provided, also includes the coverage stats (tier
+    counts, upstream/official counts) so deck/memo methodology slides can
+    cite them. Coverage attributes are read with ``getattr``: a missing count
+    falls back to 0, ``yc_official_count`` is only included when truthy, and
+    each available percentage is stored rounded to an int.
+
+    ``analyses`` may be a one-shot iterator; it is materialized once here.
+    """
+    # The rows are walked three times below. Without this copy a generator
+    # would be exhausted by the first pass and the later counts would all
+    # silently come out as zero.
+    rows = list(analyses)
+    keep = keep_for_charts(rows)
+    by_conf = confidence_breakdown(rows)
+    cap_counts = capability_totals(rows)
+    out: dict[str, int] = {
+        "total_analyses": sum(by_conf.values()),
+        "high_confidence": by_conf["high"],
+        "medium_confidence": by_conf["medium"],
+        "low_confidence": by_conf["low"],
+        "cohort_size": len(keep),
+        "agents_count": cap_counts.get("agents", 0),
+        "no_ai_count": cap_counts.get("no-ai", 0),
+        "rag_count": cap_counts.get("rag", 0),
+    }
+    if coverage is not None:
+        out["upstream_count"] = getattr(coverage, "upstream_company_count", 0)
+        official = getattr(coverage, "yc_official_count", None)
+        # Truthiness check: both a missing value and 0 leave the key out.
+        if official:
+            out["yc_official_count"] = official
+        out["tier_a_count"] = getattr(coverage, "tier_a_count", 0)
+        out["tier_b_count"] = getattr(coverage, "tier_b_count", 0)
+        out["tier_c_count"] = getattr(coverage, "tier_c_count", 0)
+        out["analyzable_count"] = getattr(coverage, "analyzable_count", 0)
+        # Percentages the deck/memo headlines reference.
+        for pct_attr in ("coverage_pct_of_official", "coverage_pct_of_upstream"):
+            value = getattr(coverage, pct_attr, None)
+            if value is not None:
+                out[pct_attr] = round(value)
+    return out
+
+
+def spotlight_companies(analyses: Iterable[CompanyAnalysis], top_n: int = 6) -> list[CompanyAnalysis]:
+    """Return up to ``top_n`` high-confidence companies, most differentiated first.
+
+    Ranking:
+      - only analyses with ``confidence == "high"`` are considered
+      - primary key: number of distinct AI capabilities, plus 2 when the
+        primary industry is anything other than B2B SaaS (less common
+        industries are treated as more interesting)
+      - secondary key: length of ``tagline_rewrite`` (longer ranks higher)
+    """
+
+    def rank(analysis: CompanyAnalysis) -> tuple[int, int]:
+        distinct_caps = {cap.value for cap in analysis.ai_capability}
+        if analysis.industry_primary == Industry.B2B_SAAS:
+            bonus = 0
+        else:
+            bonus = 2
+        return (len(distinct_caps) + bonus, len(analysis.tagline_rewrite))
+
+    shortlist = [analysis for analysis in analyses if analysis.confidence == "high"]
+    shortlist.sort(key=rank, reverse=True)
+    return shortlist[:top_n]
+
+
+def quote_candidates(analyses: Iterable[CompanyAnalysis]) -> list[CompanyAnalysis]:
+    """Return pull-quote candidates, longest tagline first.
+
+    Only high-confidence analyses whose ``tagline_rewrite`` has at least 30
+    characters are kept. The rewrite is short by design (<=140 chars; that cap
+    is not checked here). The deck renders ``tagline_rewrite`` verbatim on the
+    pull-quote slide, so there is no second pass through the LLM.
+
+    Ordering is by descending tagline length, with ``slug`` ascending as the
+    tie-break.
+    """
+    eligible: list[CompanyAnalysis] = []
+    for analysis in analyses:
+        if analysis.confidence != "high":
+            continue
+        if len(analysis.tagline_rewrite) < 30:
+            continue
+        eligible.append(analysis)
+    # Longer taglines are treated as the ones that actually say something.
+    eligible.sort(key=lambda item: (-len(item.tagline_rewrite), item.slug))
+    return eligible
+
+
+# Public names exported by this module, listed alphabetically.
+__all__ = [
+    "CapabilityHeatmap",
+    "capability_heatmap",
+    "capability_totals",
+    "confidence_breakdown",
+    "headline_numbers",
+    "industry_distribution",
+    "keep_for_charts",
+    "oss_posture_distribution",
+    "quote_candidates",
+    "region_distribution",
+    "spotlight_companies",
+    "tech_stack_distribution",
+    "yc_tag_distribution",
+]
@@ -115,6 +115,59 @@ def dashboard_cmd(
     console.print(f"[green]✓[/green] wrote dashboard.html → {out}")
 
 
+@app.command("report")
+def report_cmd(
+    run_dir: Path = typer.Argument(..., help="Run directory with coverage.json + analyses.json(l)."),
+    deck_only: bool = typer.Option(False, "--deck-only", help="Skip the .docx memo (Phase 2 PR #15)."),
+) -> None:
+    """Generate the .pptx deck (and .docx memo when shipped) from existing artifacts.
+
+    Anti-hallucination Layer 2 runs before any file is written:
+    forbidden-phrase scan + numerical-drift check. Any violation aborts the
+    build with the offending span so you can fix the prose.
+    """
+    # NOTE: the docstring above doubles as the CLI help text, so maintainer
+    # notes live in comments instead.
+    #
+    # Exit codes: 2 when a required input file is missing, 5 when the Layer 2
+    # audit rejects the deck.
+    coverage_path = run_dir / "coverage.json"
+    raw_path = run_dir / "raw" / "yc_companies.json"
+    if not coverage_path.exists() or not raw_path.exists():
+        console.print(f"[red]✗ {run_dir} doesn't look like a valid run directory[/red]")
+        raise typer.Exit(2)
+
+    # JSON artifacts are decoded as UTF-8 explicitly. Relying on the platform
+    # default (e.g. cp1252 on Windows) would mis-decode non-ASCII text.
+    coverage = BatchCoverage.model_validate_json(coverage_path.read_text(encoding="utf-8"))
+    raw_payload = json.loads(raw_path.read_text(encoding="utf-8"))
+    companies = [RawCompany.model_validate(c) for c in raw_payload]
+
+    # analyses.jsonl (one JSON document per non-blank line) takes precedence
+    # over analyses.json (a single JSON array) when both exist.
+    analyses_jsonl = run_dir / "analyses.jsonl"
+    analyses_json = run_dir / "analyses.json"
+    if analyses_jsonl.exists():
+        analyses = [
+            CompanyAnalysis.model_validate_json(line)
+            for line in analyses_jsonl.read_text(encoding="utf-8").splitlines()
+            if line.strip()
+        ]
+    elif analyses_json.exists():
+        analyses_payload = json.loads(analyses_json.read_text(encoding="utf-8"))
+        analyses = [CompanyAnalysis.model_validate(a) for a in analyses_payload]
+    else:
+        console.print(f"[red]✗ no analyses found in {run_dir}. Run with --enrich first.[/red]")
+        raise typer.Exit(2)
+
+    # Imported here rather than at module top; presumably so other commands
+    # do not pay for loading the deck dependencies — confirm before moving.
+    from ycai.reports.ppt import Layer2Failure, build_deck
+
+    deck_path = run_dir / "deck.pptx"
+    console.print("[cyan]→[/cyan] building deck.pptx (Layer 2 audit before write)…")
+    try:
+        build_deck(coverage, companies, analyses, output_path=deck_path)
+    except Layer2Failure as exc:
+        console.print(f"[red]✗ Layer 2 audit failed:[/red] {exc}")
+        # Show at most five findings of each kind to keep the output readable.
+        for hit in exc.forbidden[:5]:
+            console.print(f"  [red]forbidden phrase '{hit.phrase}':[/red] {hit.excerpt}")
+        for drift in exc.drifts[:5]:
+            console.print(f"  [red]numerical drift '{drift.number}':[/red] {drift.excerpt}")
+        raise typer.Exit(5) from exc
+    console.print(f"[green]✓[/green] wrote {deck_path}")
+
+    if not deck_only:
+        console.print("[yellow]⚠ .docx memo lands in PR #15.[/yellow]")
+
+
 @app.command("resume")
 def resume_cmd(
     run_dir: Path = typer.Argument(..., help="Run directory from a previous (partial) enrichment."),
Original file line number	Diff line number	Diff line change
`@@ -142,9 +142,9 @@`
`142`	`142`	`"filename": "src/ycai/dashboard.py",`
`143`	`143`	`"hashed_secret": "3c09e03744a49c6020501c9b7ef6218ad440976e",`
`144`	`144`	`"is_verified": false,`
`145`		`- "line_number": 46`
	`145`	`+ "line_number": 47`
`146`	`146`	`}`
`147`	`147`	`]`
`148`	`148`	`},`
`149`		`- "generated_at": "2026-05-01T21:56:30Z"`
	`149`	`+ "generated_at": "2026-05-02T02:15:53Z"`
`150`	`150`	`}`