|
"""Taxonomies + deterministic prefilling.

Where we can answer a classification question from yc-oss/api fields alone
(without an LLM), we do. This:
  1. saves Sonnet calls,
  2. produces a deterministic answer auditors can re-derive,
  3. reduces the surface area where the model can hallucinate.

The LLM still classifies AI capability, tech stack, OSS posture, and the
tagline — fields that can't be derived from YC's tag list.
"""
| 12 | + |
| 13 | +from __future__ import annotations |
| 14 | + |
| 15 | +from ycai.schemas import Industry |
| 16 | + |
# Lowercase substrings searched for in the yc-oss industry / subindustry / tag
# text, grouped by the enum member they map to. Group order is match priority:
# most-specific first, and the first needle that hits wins.
_INDUSTRY_RULES: tuple[tuple[str, Industry], ...] = tuple(
    (needle, industry)
    for industry, needles in (
        (Industry.AI_INFRASTRUCTURE, ("ai infrastructure",)),
        (Industry.DEVELOPER_TOOLS, ("developer tools", "dev tools")),
        (Industry.SECURITY, ("security",)),
        (Industry.BIOTECH, ("biotech",)),
        (Industry.HEALTHCARE, ("healthcare", "medical")),
        (Industry.FINTECH, ("fintech", "financial")),
        (Industry.LEGAL, ("legal",)),
        (Industry.EDUCATION, ("education",)),
        (Industry.REAL_ESTATE_CONSTRUCTION, ("real estate", "construction")),
        (Industry.SUPPLY_CHAIN_LOGISTICS, ("logistics", "supply chain")),
        (Industry.CLIMATE_ENERGY, ("climate", "energy")),
        (Industry.ROBOTICS, ("robotics",)),
        (Industry.HARDWARE, ("hardware",)),
        (Industry.INDUSTRIALS, ("industrials",)),
        (Industry.GOVERNMENT_DEFENSE, ("government", "defense")),
        (Industry.MEDIA_CONTENT, ("media", "content")),
        (Industry.CONSUMER, ("consumer",)),
        (Industry.B2B_SAAS, ("b2b", "saas")),
    )
    # Flatten each group back into (needle, industry) pairs, keeping order.
    for needle in needles
)
| 48 | + |
| 49 | + |
def map_industry(yc_industry: str, yc_subindustry: str = "", yc_tags: list[str] | None = None) -> Industry:
    """Pick the primary ``Industry`` for a company from its yc-oss hints.

    The industry, subindustry, and tags are joined with spaces into one
    lowercased string; the rules in ``_INDUSTRY_RULES`` are then tried in
    order and the first needle found as a substring decides the result.

    Args:
        yc_industry: yc-oss industry string; a falsy value counts as empty.
        yc_subindustry: yc-oss subindustry string; a falsy value counts as empty.
        yc_tags: yc-oss tags; ``None`` or an empty list contributes nothing.

    Returns:
        The industry of the first matching rule, or ``Industry.UNKNOWN`` when
        no rule matches. This is only a prefill — the LLM can override it if
        it has a stronger signal from the website.
    """
    tag_text = " ".join(yc_tags or [])
    fields = (yc_industry or "", yc_subindustry or "", tag_text)
    searchable = " ".join(fields).lower()
    # The generator is lazy, so scanning stops at the first rule that hits.
    return next(
        (target for keyword, target in _INDUSTRY_RULES if keyword in searchable),
        Industry.UNKNOWN,
    )
| 63 | + |
| 64 | + |
def industry_secondaries(
    yc_industry: str,
    yc_subindustry: str = "",
    yc_tags: list[str] | None = None,
) -> list[Industry]:
    """Return extra industry hits beyond the primary, from the same haystack.

    The haystack is built exactly as in ``map_industry`` (industry,
    subindustry, and tags joined with spaces and lowercased) and scanned
    against ``_INDUSTRY_RULES`` in order. The first distinct industry found is
    the primary and is skipped; the next distinct ones are returned.

    Args:
        yc_industry: yc-oss industry string; a falsy value counts as empty.
        yc_subindustry: yc-oss subindustry string; a falsy value counts as
            empty. Defaults to ``""`` to mirror ``map_industry``.
        yc_tags: yc-oss tags; ``None`` or an empty list contributes nothing.
            Defaults to ``None`` to mirror ``map_industry``.

    Returns:
        Up to three distinct industries in rule order, excluding the primary.
        Empty when zero or one industry matches. Capped at 3 to keep the
        chart legible.
    """
    max_secondaries = 3
    haystack = " ".join([yc_industry or "", yc_subindustry or "", " ".join(yc_tags or [])]).lower()
    matched: list[Industry] = []
    for needle, industry in _INDUSTRY_RULES:
        if needle in haystack and industry not in matched:
            matched.append(industry)
            # Primary + cap collected: later rules cannot change the result.
            if len(matched) > max_secondaries:
                break
    return matched[1:]  # drop the primary (index 0)
| 76 | + |
| 77 | + |
# Public API of this module; everything else is an implementation detail.
__all__ = [
    "industry_secondaries",
    "map_industry",
]
0 commit comments