|
| 1 | +"""Pure-Python chart math, shared by every renderer. |
| 2 | +
|
| 3 | +Each function takes the validated CompanyAnalysis cohort plus optional context |
| 4 | +and returns plain Python data structures (Counters, dicts, lists). No HTML, no |
| 5 | +ECharts JSON, no matplotlib. The rendering layers (``dashboard.py`` for HTML, |
| 6 | +``reports/ppt.py`` for the deck, ``reports/docx.py`` for the memo) consume the |
| 7 | +output of this module. |
| 8 | +
|
| 9 | +This separation is what lets the deck and memo cite the *same* numbers as the |
| 10 | +dashboard. The numerical-drift check in ``reports/anti_hallucination.py`` walks |
| 11 | +the output of these functions and compares it to the prose in the deck/memo. |
| 12 | +""" |
| 13 | + |
| 14 | +from __future__ import annotations |
| 15 | + |
| 16 | +from collections import Counter, defaultdict |
| 17 | +from collections.abc import Iterable |
| 18 | +from dataclasses import dataclass |
| 19 | + |
| 20 | +from ycai.schemas import ( |
| 21 | + CompanyAnalysis, |
| 22 | + Industry, |
| 23 | + RawCompany, |
| 24 | +) |
| 25 | + |
| 26 | + |
@dataclass(frozen=True)
class CapabilityHeatmap:
    """Capability x industry matrix for the headline chart.

    ``frozen=True`` only blocks attribute reassignment; the list and dict
    held by the fields remain mutable objects, so consumers should treat
    them as read-only.
    """

    capabilities: list[str]  # row labels (Y axis)
    industries: list[str]  # col labels (X axis)
    # Cell counts; ``capability_heatmap`` keys them by
    # (capability label, industry label).
    matrix: dict[tuple[str, str], int]
    total_keep: int  # rows fed into the heatmap (high+medium confidence)
| 35 | + |
| 36 | + |
def keep_for_charts(analyses: Iterable[CompanyAnalysis]) -> list[CompanyAnalysis]:
    """Return the standard chart cohort: high- and medium-confidence rows.

    Low-confidence rows are dropped on purpose. Every renderer is expected to
    go through this gate; a chart built from some other source (e.g. the YC
    tag distribution, which never goes through enrichment) sits outside the
    LLM-derived analytics and uses its own math.

    The input order is preserved in the returned list.
    """
    cohort: list[CompanyAnalysis] = []
    for analysis in analyses:
        if analysis.confidence in ("high", "medium"):
            cohort.append(analysis)
    return cohort
| 45 | + |
| 46 | + |
def confidence_breakdown(analyses: Iterable[CompanyAnalysis]) -> Counter[str]:
    """Count analyses per confidence label.

    No cohort filter is applied here: every row passed in is counted under
    whatever its ``confidence`` attribute holds.
    """
    tally: Counter[str] = Counter()
    for analysis in analyses:
        tally[analysis.confidence] += 1
    return tally
| 49 | + |
| 50 | + |
def industry_distribution(analyses: Iterable[CompanyAnalysis]) -> Counter[str]:
    """Count companies per LLM-classified primary industry.

    Only the high+medium confidence cohort (see ``keep_for_charts``) is
    counted; keys are the ``.value`` of each row's ``industry_primary``.
    """
    counts: Counter[str] = Counter()
    for analysis in keep_for_charts(analyses):
        counts[analysis.industry_primary.value] += 1
    return counts
| 55 | + |
| 56 | + |
def capability_heatmap(
    analyses: Iterable[CompanyAnalysis],
    *,
    max_capabilities: int | None = 8,
    max_industries: int | None = 6,
) -> CapabilityHeatmap:
    """Build the capability x industry matrix for the headline chart.

    Only the high+medium confidence cohort (``keep_for_charts``) is counted.
    A company contributes one count to every (capability, primary industry)
    pair it lists, so a company with several capabilities appears in several
    cells.

    Args:
        analyses: Rows to aggregate; filtered through ``keep_for_charts``.
        max_capabilities: How many of the most frequent capabilities become
            rows. Defaults to 8; ``None`` keeps all of them.
        max_industries: How many of the most frequent industries become
            columns. Defaults to 6; ``None`` keeps all of them.

    Returns:
        A ``CapabilityHeatmap`` whose ``matrix`` only holds non-zero cells
        that fall inside the selected rows and columns, and whose
        ``total_keep`` is the size of the filtered cohort.
    """
    keep = keep_for_charts(analyses)
    cells: Counter[tuple[str, str]] = Counter()
    cap_totals: Counter[str] = Counter()
    ind_totals: Counter[str] = Counter()
    for a in keep:
        for cap in a.ai_capability:
            # Totals are accumulated alongside the cells; an industry is only
            # counted when the company lists at least one capability, which
            # matches summing the matrix along each axis.
            cap_label = cap.value
            ind_label = a.industry_primary.value
            cells[(cap_label, ind_label)] += 1
            cap_totals[cap_label] += 1
            ind_totals[ind_label] += 1
    # most_common() breaks ties by first-seen order, so the axis ordering is
    # deterministic for a given input order.
    top_caps = [name for name, _ in cap_totals.most_common(max_capabilities)]
    top_inds = [name for name, _ in ind_totals.most_common(max_industries)]
    cap_set = set(top_caps)
    ind_set = set(top_inds)
    restricted = {
        (c, i): v for (c, i), v in cells.items() if c in cap_set and i in ind_set
    }
    return CapabilityHeatmap(
        capabilities=top_caps,
        industries=top_inds,
        matrix=restricted,
        total_keep=len(keep),
    )
| 81 | + |
| 82 | + |
def capability_totals(analyses: Iterable[CompanyAnalysis]) -> Counter[str]:
    """Flat per-capability counter over the high+medium confidence cohort.

    A company adds one count to every capability it lists, so the totals can
    sum to more than the number of companies.
    """
    return Counter(
        cap.value
        for analysis in keep_for_charts(analyses)
        for cap in analysis.ai_capability
    )
| 91 | + |
| 92 | + |
def tech_stack_distribution(analyses: Iterable[CompanyAnalysis]) -> Counter[str]:
    """Count tech-stack labels over the high+medium confidence cohort.

    A company with an empty stack list is counted once under ``'unknown'``
    so it stays visible in the chart; otherwise it adds one count per stack
    entry it lists.
    """
    tally: Counter[str] = Counter()
    for analysis in keep_for_charts(analyses):
        stacks = analysis.tech_stack
        if stacks:
            tally.update(stack.value for stack in stacks)
            continue
        tally["unknown"] += 1
    return tally
| 104 | + |
| 105 | + |
def oss_posture_distribution(analyses: Iterable[CompanyAnalysis]) -> Counter[str]:
    """Count companies per ``oss_posture`` value (high+medium cohort only)."""
    postures: Counter[str] = Counter()
    for analysis in keep_for_charts(analyses):
        postures[analysis.oss_posture.value] += 1
    return postures
| 109 | + |
| 110 | + |
def yc_tag_distribution(companies: Iterable[RawCompany]) -> Counter[str]:
    """Count how often each tag appears across the raw companies.

    Works on raw records, so no confidence filtering is applied. A company
    adds one count for every entry in its ``tags``.
    """
    return Counter(tag for company in companies for tag in company.tags)
| 117 | + |
| 118 | + |
def region_distribution(companies: Iterable[RawCompany]) -> Counter[str]:
    """Count how often each region appears across the raw companies.

    Works on raw records, so no confidence filtering is applied. A company
    adds one count for every entry in its ``regions``.
    """
    return Counter(region for company in companies for region in company.regions)
| 125 | + |
| 126 | + |
def headline_numbers(
    analyses: Iterable[CompanyAnalysis],
    coverage: object | None = None,
) -> dict[str, int]:
    """Numbers any prose layer is allowed to cite.

    The drift checker compares every number in deck/memo prose against this
    dict.

    Args:
        analyses: All analyses, any confidence. May be a one-shot iterator;
            it is materialized once before being aggregated.
        coverage: Optional object carrying coverage stats. Attributes are
            read with ``getattr`` so any object exposing the expected names
            works. When provided, the tier counts, upstream/official counts
            and rounded coverage percentages are added so deck/memo
            methodology slides can cite them.

    Returns:
        A flat ``name -> int`` mapping. ``yc_official_count`` is only present
        when the coverage object reports a truthy value for it, and each
        percentage key is only present when its attribute is not ``None``.
    """
    # ``analyses`` is walked by three helpers below. Without this copy a
    # generator argument would be exhausted by the first helper and the
    # remaining counts would silently come out as zero.
    rows = list(analyses)
    keep = keep_for_charts(rows)
    by_conf = confidence_breakdown(rows)
    cap_counts = capability_totals(rows)
    out: dict[str, int] = {
        "total_analyses": sum(by_conf.values()),
        "high_confidence": by_conf["high"],
        "medium_confidence": by_conf["medium"],
        "low_confidence": by_conf["low"],
        "cohort_size": len(keep),
        "agents_count": cap_counts.get("agents", 0),
        "no_ai_count": cap_counts.get("no-ai", 0),
        "rag_count": cap_counts.get("rag", 0),
    }
    if coverage is not None:
        out["upstream_count"] = getattr(coverage, "upstream_company_count", 0)
        official = getattr(coverage, "yc_official_count", None)
        # A missing or zero official count is left out rather than cited as 0.
        if official:
            out["yc_official_count"] = official
        out["tier_a_count"] = getattr(coverage, "tier_a_count", 0)
        out["tier_b_count"] = getattr(coverage, "tier_b_count", 0)
        out["tier_c_count"] = getattr(coverage, "tier_c_count", 0)
        out["analyzable_count"] = getattr(coverage, "analyzable_count", 0)
        # Percentages the deck/memo headlines reference, rounded to whole
        # numbers so they can be matched against prose.
        for pct_attr in ("coverage_pct_of_official", "coverage_pct_of_upstream"):
            value = getattr(coverage, pct_attr, None)
            if value is not None:
                out[pct_attr] = round(value)
    return out
| 166 | + |
| 167 | + |
def spotlight_companies(analyses: Iterable[CompanyAnalysis], top_n: int = 6) -> list[CompanyAnalysis]:
    """Pick the ``top_n`` most differentiated high-confidence companies.

    Ranking heuristic:
      - only high-confidence rows are eligible
      - primary score = number of distinct AI capabilities, plus 2 when the
        primary industry is not B2B SaaS (off-the-beaten-path industries are
        inherently more interesting)
      - ties are broken by tagline length (longer = more substantive)

    Rows that tie on both components keep their input order.
    """

    def rank_key(analysis: CompanyAnalysis) -> tuple[int, int]:
        distinct_caps = {cap.value for cap in analysis.ai_capability}
        if analysis.industry_primary == Industry.B2B_SAAS:
            bonus = 0
        else:
            bonus = 2
        return (len(distinct_caps) + bonus, len(analysis.tagline_rewrite))

    shortlist = [row for row in analyses if row.confidence == "high"]
    shortlist.sort(key=rank_key, reverse=True)
    return shortlist[:top_n]
| 185 | + |
| 186 | + |
def quote_candidates(analyses: Iterable[CompanyAnalysis]) -> list[CompanyAnalysis]:
    """Companies whose ``tagline_rewrite`` is suitable for a pull quote.

    The rewrite is short by design (<=140 chars), so selection is simple:
    high-confidence rows whose rewrite is at least 30 characters long. The
    deck consumes this list and renders the pull-quote slide using
    ``tagline_rewrite`` verbatim — no second pass through the LLM.

    Results are ordered longest rewrite first, then by ``slug`` ascending.
    """

    def order_key(analysis: CompanyAnalysis) -> tuple[int, str]:
        # Negated length puts longer taglines first under an ascending sort;
        # the slug makes the order deterministic for equal lengths.
        return (-len(analysis.tagline_rewrite), analysis.slug)

    eligible = []
    for analysis in analyses:
        if analysis.confidence == "high" and len(analysis.tagline_rewrite) >= 30:
            eligible.append(analysis)
    eligible.sort(key=order_key)
    return eligible
| 198 | + |
| 199 | + |
# Public API of this module: the names exported by a star-import.
# Kept in case-insensitive alphabetical order.
__all__ = [
    "CapabilityHeatmap",
    "capability_heatmap",
    "capability_totals",
    "confidence_breakdown",
    "headline_numbers",
    "industry_distribution",
    "keep_for_charts",
    "oss_posture_distribution",
    "quote_candidates",
    "region_distribution",
    "spotlight_companies",
    "tech_stack_distribution",
    "yc_tag_distribution",
]
0 commit comments