Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gitnexus/bench/emit-persistence/baselines.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"fingerprint": "4cc418ea87b6d20a68b5c1139f35d81820b715c63de0ec812e73e2135f5b00b1",
"fingerprint": "b169463b7d02185d757b6d8601db6215ac6e7b2a20e52fb0f1276cc153836bd4",
"scaling_budget": 1.8,
"max_ms_large": 1000,
"_note": "fingerprint = sha256 over per-file digests (filename + sha256(file bytes)), entry list sorted — binds each emitted line to its file so a row routed to the WRONG pair file changes the hash, AND catches within-file row reordering (file bytes hashed as-written). Byte-identity gate for #2203 U2/U3. NOTE: a future change that legitimately reorders emit (without changing the node/edge SET) will trip --check; regenerate then. scaling_budget bounds (t_large/t_small)/(LARGE/SMALL): observed ~0.95-1.05 (linear); 1.8 tolerates disk-I/O timing noise on CI while still catching an O(n^2) re-regression (~4x). max_ms_large=1000ms is a coarse absolute backstop (observed ~200ms) that catches a gross uniform slowdown the ratio gate misses; generous so CI host noise won't flake it. Regenerate via `node --import tsx bench/emit-persistence/measure.mjs`."
Expand Down
20 changes: 20 additions & 0 deletions gitnexus/src/core/group/extractors/http-patterns/java.ts
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,26 @@ function scanSpringProject(files: readonly HttpScanInput[]): HttpFileDetections[
export const JAVA_HTTP_PLUGIN: HttpLanguagePlugin = {
name: 'java-http',
language: Java,
// routeCoverage intentionally LEFT at the default 'partial' (#2138 Part 2).
// The graph provider set is a strict *subset* of this scan()'s provider set —
// ingestion does NOT emit a Route node for (1) array-form `@GetMapping({...})`,
// (2) interface-inherited Spring routes, or (3) the 2nd verb of a same-URL
// GET+POST pair (Route nodes are URL-keyed). Declaring 'complete' here would
// let the parse-skip drop those group-only providers. Java flips to 'complete'
// only once ingestion provider extraction matches this scan (a follow-up:
// array-form query branch + interface-inheritance emission + per-verb Route
// identity). `hasConsumerSignals` below is kept ready for that flip.
// Consumer signals this plugin's scan() can detect: RestTemplate / WebClient /
// OkHttp / Java-HttpClient / Apache-HttpClient call sites, OpenFeign
// (`@FeignClient` + `@RequestLine`) interfaces, and Spring 6 HTTP Interface
// `@(Get|...)Exchange` / `@HttpExchange`. A provider-covered file containing
// any of these must still be parsed so its consumer contracts are not dropped
// (ingestion emits no FETCHES for Java). Conservative by design.
hasConsumerSignals(content) {
  // Parse-free textual pre-check (case-sensitive). Reports true as soon as the
  // raw source contains any one of these markers: the identifiers
  // `restTemplate` / `webClient`, `Request.Builder`, `HttpRequest`,
  // `HttpMethod.`, `new HttpGet|HttpPost|HttpPut|HttpDelete|HttpPatch`,
  // `@RequestLine`, `@FeignClient`, or the substring `Exchange` anywhere.
  const consumerMarkers =
    /\brestTemplate\b|\bwebClient\b|Request\.Builder|HttpRequest|HttpMethod\.|new\s+Http(Get|Post|Put|Delete|Patch)\b|@RequestLine|@FeignClient|Exchange/;
  return consumerMarkers.test(content);
},
scan(tree) {
const out: HttpDetection[] = [];

Expand Down
11 changes: 11 additions & 0 deletions gitnexus/src/core/group/extractors/http-patterns/php.ts
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,17 @@ function isHttpUrlLiteral(path: string): boolean {
export const PHP_HTTP_PLUGIN: HttpLanguagePlugin = {
name: 'php-http',
language: PHP.php_only,
// Laravel `Route::<verb>(...)` definitions are emitted as Route nodes by
// ingestion, so the graph is authoritative for PHP providers (#2138 Part 2).
routeCoverage: 'complete',
// Consumer signals scan() can detect: Laravel `Http::<verb>`, Guzzle client
// `->get/post/.../request(...)`, and `file_get_contents` of an HTTP URL. A
// provider-covered file with any of these must still be parsed (ingestion
// emits no FETCHES for PHP). Conservative — the `->verb(` shape over-matches
// ordinary method calls, which only costs a parse, never data.
hasConsumerSignals(content) {
  // Parse-free textual pre-check (case-insensitive). Reports true when the raw
  // source contains `Http::`, `file_get_contents`, or an arrow call whose
  // method name is get/post/put/delete/patch/request followed by `(`.
  const outboundCallShape =
    /Http::|file_get_contents|->\s*(get|post|put|delete|patch|request)\s*\(/i;
  return outboundCallShape.test(content);
},
scan(tree) {
const out: HttpDetection[] = [];

Expand Down
16 changes: 16 additions & 0 deletions gitnexus/src/core/group/extractors/http-patterns/python.ts
Original file line number Diff line number Diff line change
Expand Up @@ -920,6 +920,22 @@ function joinPrefix(prefix: string, route: string): string {
export const PYTHON_HTTP_PLUGIN: HttpLanguagePlugin = {
name: 'python-http',
language: Python,
// routeCoverage intentionally LEFT at the default 'partial' (#2138 Part 2).
// It would be a no-op even if set to 'complete': FastAPI decorator routes set
// no handlerName (generic worker path) and Django sets methodName: null, so no
// Python file ever resolves a handlerSymbolId and none would be parse-skipped.
// Declaring 'complete' now is only a latent trap for the moment a follow-up
// gives FastAPI routes a handlerName. `hasConsumerSignals` is kept (and is a
// true superset of scan()'s consumer shapes) so the precondition already holds
// when Python is later flipped to 'complete'.
// Consumer signals scan() can detect: `requests.<verb>`/`requests.request`,
// `httpx` (sync/async client), the `uri=`/`url=` keyword/variable wrapper
// calls, plus aiohttp/urllib. Conservative — over-matching only costs a parse.
hasConsumerSignals(content) {
  // Parse-free textual pre-check (case-sensitive). Reports true when the raw
  // source contains `requests.` (whitespace allowed before the dot), one of the
  // whole words `httpx` / `aiohttp` / `urllib` / `urlopen`, or an assignment /
  // keyword of the form `uri =` or `url =`.
  const outboundHints =
    /\brequests\s*\.|\bhttpx\b|\baiohttp\b|\burllib\b|\burlopen\b|\buri\s*=|\burl\s*=/;
  return outboundHints.test(content);
},
prepareRepo(repoInput): RepoContext {
  // Repo-level pre-pass: forward the file list plus the parser / read / parse
  // helpers to buildPythonRepoContext and return the context it produces.
  const { files, parser, readFile, parseSource } = repoInput;
  return buildPythonRepoContext(files, parser, readFile, parseSource);
},
Expand Down
37 changes: 37 additions & 0 deletions gitnexus/src/core/group/extractors/http-patterns/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,43 @@ export interface HttpLanguagePlugin {
name: string;
/** tree-sitter grammar object (passed to the shared parser). */
language: unknown;
/**
* Whether ingestion is known to emit a `Route` graph node for EVERY
* provider route in this language (Spring/FastAPI/Laravel annotations are
* extracted into Route nodes during parse). When `'complete'`, the
* orchestrator may skip the source-scan + tree-sitter parse for a file whose
* graph provider routes all resolved a handler symbol (#2138 Part 2) — the
* graph is authoritative, the scan would only re-discover the same routes.
*
* Defaults to `'partial'` (the safe assumption): the source scan always runs,
* so a language whose ingestion coverage is incomplete never loses routes.
* This is a deliberate, per-language trust assertion — set it only for
* languages whose route ingestion is provably complete.
*/
routeCoverage?: 'complete' | 'partial';
/**
* Cheap, parse-free pre-check used by the parse-skip optimization (#2138
* Part 2). Given a file's raw source text, return `false` ONLY when the file
* provably contains no outbound-HTTP (consumer) call that this plugin's
* `scan()` would detect; return `true` on any doubt.
*
* Why it exists: `routeCoverage: 'complete'` asserts *provider* Route-node
* completeness only. A provider-covered file may ALSO be a consumer (e.g. a
* Spring `@RestController` that calls `restTemplate`/`webClient`, a Laravel
* controller using Guzzle, a FastAPI handler calling `requests`/`httpx`).
* Ingestion's `FETCHES` edges are JS/TS-only, so the graph cannot back up
* those server-side consumers — they come solely from the source scan. The
* orchestrator may therefore skip a provider-covered file's parse only when
* this returns `false`; otherwise the file is still scanned so its consumer
* contracts are not dropped.
*
* MUST be implemented by any plugin whose `scan()` can emit `'consumer'`
* detections AND that declares `routeCoverage: 'complete'`; otherwise that
* language's provider-covered files are never parse-skipped (safe, no win).
* The check is intentionally conservative — over-matching only costs a parse
* that could have been skipped; it never drops data.
*/
hasConsumerSignals?(content: string): boolean;
/**
* Optional pre-pass: walk the relevant files in the repo and produce
* an opaque context that `scan` can use to resolve cross-file facts.
Expand Down
193 changes: 135 additions & 58 deletions gitnexus/src/core/group/extractors/http-route-extractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ MATCH (handlerFile:File)-[r:CodeRelation {type: 'HANDLES_ROUTE'}]->(route:Route)
RETURN handlerFile.id AS fileId, handlerFile.filePath AS filePath,
route.name AS routePath, route.id AS routeId,
route.method AS routeMethod,
route.handlerSymbolId AS handlerSymbolId,
route.responseKeys AS responseKeys,
r.reason AS routeSource`;
const FETCHES_QUERY = `
Expand Down Expand Up @@ -282,22 +283,57 @@ export class HttpRouteExtractor implements ContractExtractor {
};

const files = await getScannedFiles();
await collectProjectDetections(files);

// Run the graph provider pass FIRST. After #2138 Part 2 it reads handler
// symbols from the graph (no source parse for resolved routes), so it can
// report which files are fully graph-covered BEFORE we decide what to
// parse. Files fully covered by a `routeCoverage: 'complete'` language are
// candidates to skip the source scan + tree-sitter parse — but only their
// *providers* are graph-authoritative; the consumer-safety gate below
// removes any candidate that still needs scanning for outbound calls.
const coveredFiles = new Set<string>();
const graphProviders =
dbExecutor != null ? await this.extractProvidersGraph(dbExecutor, getDetections) : [];
// Source scan always runs to capture routes in languages/files not covered
// by graph edges; the glob and per-file parse results are cached above.
dbExecutor != null
? await this.extractProvidersGraph(dbExecutor, getDetections, coveredFiles)
: [];

// Consumer-safety gate (#2138 Part 2): `extractProvidersGraph` marks a file
// covered on *provider* grounds (all HANDLES_ROUTE rows resolved + a
// `routeCoverage: 'complete'` language). But a provider-covered file may also
// be a *consumer* (a controller that calls RestTemplate/WebClient/Guzzle/
// requests/...), and ingestion emits no FETCHES edges for those server-side
// languages — the graph can't back them up. So a covered file is only truly
// safe to parse-skip when its plugin can PROVE, from a cheap parse-free
// text scan, that it holds no such consumer call. Anything else (a positive
// signal, no `hasConsumerSignals` hook, or an unreadable file) stays in the
// scan set so its consumer contracts are preserved.
for (const f of [...coveredFiles]) {
const plugin = getPluginForFile(f);
const content = readSafe(repoPath, f);
const provenNoConsumer =
content != null && typeof plugin?.hasConsumerSignals === 'function'
? plugin.hasConsumerSignals(content) === false
: false;
if (!provenNoConsumer) coveredFiles.delete(f);
}

// Everything the graph did not fully cover still gets a full source scan
// (fail-open: partial-coverage languages, unresolved routes, and graph-less
// runs all land here).
const scanFiles = files.filter((f) => !coveredFiles.has(f));

await collectProjectDetections(scanFiles);

const providers = this.mergeGraphAndSourceContracts(
graphProviders,
await this.extractProvidersSourceScan(files, getDetections),
await this.extractProvidersSourceScan(scanFiles, getDetections),
);

const graphConsumers =
dbExecutor != null ? await this.extractConsumersGraph(dbExecutor, getDetections) : [];
const consumers = this.mergeGraphAndSourceContracts(
graphConsumers,
await this.extractConsumersSourceScan(files, getDetections),
await this.extractConsumersSourceScan(scanFiles, getDetections),
);

return [...providers, ...consumers];
Expand All @@ -323,8 +359,14 @@ export class HttpRouteExtractor implements ContractExtractor {
private async extractProvidersGraph(
db: CypherExecutor,
getDetections: (rel: string) => Promise<HttpDetection[]>,
coveredFiles?: Set<string>,
): Promise<ExtractedContract[]> {
const out: ExtractedContract[] = [];
// Per-file coverage tracking (#2138 Part 2): a file is "fully graph-covered"
// when every one of its HANDLES_ROUTE rows resolved a handlerSymbolId AND its
// language plugin declares `routeCoverage: 'complete'`. Such files can skip
// the source scan + parse entirely — the graph is authoritative for them.
const fileAllResolved = new Map<string, boolean>();
let rows: Record<string, unknown>[];
try {
rows = await db(HANDLES_ROUTE_QUERY);
Expand Down Expand Up @@ -354,67 +396,90 @@ export class HttpRouteExtractor implements ContractExtractor {
.toUpperCase();
let method = (graphMethod || null) ?? methodFromRouteReason(routeSource);

// Look up handler name (and backfill method if missing) from the
// plugin's scan of the handler file. This replaces the old
// regex-based `inferMethodFromFileScan` and `pickJavaHandlerName`
// helpers — tree-sitter gives both pieces of information
// structurally. Always run the lookup: even when method is set by
// `methodFromRouteReason`, we still need the handler name.
const detections = filePath ? await getDetections(filePath) : [];
const providerDetections = detections.filter((d) => d.role === 'provider');
let handlerName: string | null = null;
const normalizedRoute = normalizeHttpPath(routePath);
// Candidates share the same normalized path. When multiple
// detections at the same path exist (e.g. GET + POST /api/orders
// in one router), a blind `.find()` silently returned the first
// verb — attaching the wrong handler and, when method was not
// already pinned by the route reason, the wrong method too.
// Disambiguate by method when we know it; refuse to guess when
// we don't.
const candidates = providerDetections.filter(
(d) => normalizeHttpPath(d.path) === normalizedRoute,
);
let match: (typeof candidates)[number] | undefined;
const ambiguousCandidates = !method && candidates.length > 1;
if (method) {
match = candidates.find((d) => d.method === method);
} else if (candidates.length === 1) {
match = candidates[0];
}
// else: multiple candidates + unknown method → leave match
// undefined so handlerName stays null and skip symbol
// enrichment below, keeping the file-basename fallback instead
// of letting pickSymbolUid silently pick the first Function /
// Method in the file (which reintroduces the mis-attribution
// we were trying to avoid). Method stays at the conservative
// 'GET' default set below.
if (match) {
if (!method) method = match.method;
handlerName = match.name;
const handlerSymbolId = String(row.handlerSymbolId ?? '').trim();
const fileId = row.fileId ?? row[0];
// Track per-file resolution for the parse-skip coverage set: a file stays
// "all resolved" only while every one of its rows carries a handlerSymbolId.
if (filePath) {
const prev = fileAllResolved.get(filePath);
fileAllResolved.set(filePath, (prev ?? true) && handlerSymbolId.length > 0);
}
if (!method) method = 'GET';

const pathNorm = normalizeHttpPath(routePath);
const cid = contractIdFor(method, pathNorm);
const pathNormEarly = normalizeHttpPath(routePath);

let symbolUid = '';
let symbolName = path.basename(filePath) || 'handler';
let symPath = filePath;
const fileId = row.fileId ?? row[0];
if (fileId && !ambiguousCandidates) {
try {
const syms = await db(CONTAINS_QUERY, { fileId });
if (syms.length > 0) {
const picked = pickSymbolUid(syms, handlerName);
symbolUid = picked.uid;
symbolName = picked.name;
symPath = picked.filePath || filePath;

if (handlerSymbolId) {
// Fast path (Part 2, #2138): the handler symbol was resolved during
// ingestion and persisted on the Route node, so we read it straight
// from the graph and SKIP the `getDetections()` source-scan/parse the
// legacy path needed just to recover the handler name. CONTAINS is a
// cheap graph query (no tree-sitter parse) used only to surface the
// handler's display name/path; the uid is authoritative regardless.
if (!method) method = 'GET';
symbolUid = handlerSymbolId;
if (fileId) {
try {
const syms = await db(CONTAINS_QUERY, { fileId });
const hit = syms.find((s) => String(s.uid ?? s[0]) === handlerSymbolId);
if (hit) {
symbolName = String(hit.name ?? hit[1]) || symbolName;
symPath = String(hit.filePath ?? hit[2]) || filePath;
}
} catch {
/* keep the authoritative uid + basename fallback */
}
}
} else {
// Legacy fallback (old index / unresolved handler): recover the handler
// name from the plugin's scan of the handler file (this parses source).
// Always run the lookup: even when method is set, we still need the name.
const detections = filePath ? await getDetections(filePath) : [];
const providerDetections = detections.filter((d) => d.role === 'provider');
let handlerName: string | null = null;
// Candidates share the same normalized path. When multiple detections at
// the same path exist (GET + POST /api/orders in one router), a blind
// `.find()` silently returned the first verb — attaching the wrong
// handler/method. Disambiguate by method when known; refuse to guess.
const candidates = providerDetections.filter(
(d) => normalizeHttpPath(d.path) === pathNormEarly,
);
let match: (typeof candidates)[number] | undefined;
const ambiguousCandidates = !method && candidates.length > 1;
if (method) {
match = candidates.find((d) => d.method === method);
} else if (candidates.length === 1) {
match = candidates[0];
}
// else: multiple candidates + unknown method → leave match undefined so
// handlerName stays null and we skip symbol enrichment, keeping the
// file-basename fallback rather than letting pickSymbolUid pick the
// first Function/Method (which reintroduces mis-attribution).
if (match) {
if (!method) method = match.method;
handlerName = match.name;
}
if (!method) method = 'GET';

if (fileId && !ambiguousCandidates) {
try {
const syms = await db(CONTAINS_QUERY, { fileId });
if (syms.length > 0) {
const picked = pickSymbolUid(syms, handlerName);
symbolUid = picked.uid;
symbolName = picked.name;
symPath = picked.filePath || filePath;
}
} catch {
/* ignore */
}
} catch {
/* ignore */
}
}

const pathNorm = pathNormEarly;
const cid = contractIdFor(method, pathNorm);

out.push({
contractId: cid,
type: 'http',
Expand All @@ -432,6 +497,18 @@ export class HttpRouteExtractor implements ContractExtractor {
},
});
}

// Populate the parse-skip coverage set: files whose every provider route
// resolved a handler symbol AND whose language declares complete ingestion
// route coverage. Fail-open — any unresolved row or a 'partial' language
// leaves the file out, so it still gets a full source scan.
if (coveredFiles) {
for (const [fp, allResolved] of fileAllResolved) {
if (allResolved && getPluginForFile(fp)?.routeCoverage === 'complete') {
coveredFiles.add(fp);
}
}
}
return out;
}

Expand Down
Loading
Loading