Skip to content

Commit 83f8806

Browse files
rawwerks and claude committed
feat: content extractors for manifests and entry points
Adds an optional `extract` field to priority rules that transforms file content before budget accounting. Instead of including a full Cargo.toml or truncating package.json to its first two lines, extractors produce a focused summary (just the dependencies, just the pub mod declarations) that fits in the content budget and actually orients an agent.

Four built-in extractors:
- json_keys: extract named top-level keys from JSON
- toml_sections: extract named sections from TOML
- lines_matching: extract lines starting with given prefixes
- api_surface: first 5 lines + all pub/export declarations

Default rules auto-extract for:
- Cargo.toml -> dependencies, dev-dependencies
- package.json -> dependencies, devDependencies, scripts
- go.mod -> require and module lines
- src/lib.*, **/mod.rs -> public API surface

The extract field is optional and defaults to None, so existing configs work unchanged. Users can add extract specs to any priority rule in dirpack.toml.

New file: src/packer/extractors.rs (~330 lines incl. 14 unit tests)
Modified: src/config.rs (extract field + defaults), src/packer/mod.rs (3-line integration), src/priority.rs (test fixes), CHANGELOG.md, default_config.toml

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 5ebffb3 commit 83f8806

6 files changed

Lines changed: 531 additions & 0 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@ Semantic Versioning.
1717
- `content_weight` per file category controls how aggressively each category's files are included in the content phase. Defaults: code/docs 1.0, config 0.2, build/data 0.0. Configurable in dirpack.toml via `[categories.<name>].content_weight`.
1818
- `test_content_weight` and `fixture_content_weight` in `[priority]` (both default 0.0). Test and fixture files get spine + signatures but never content body. Set to 1.0 to restore old behavior.
1919
- Import-only snippet detection: truncated snippets where >80% of lines are import/require/use statements are skipped automatically.
20+
- Content extractors: optional `extract` field on priority rules transforms file content before budget accounting. Four built-in extractors: `json_keys` (package.json deps/scripts), `toml_sections` (Cargo.toml dependencies), `lines_matching` (go.mod `require`/`module` lines), `api_surface` (first 5 lines plus pub/export declarations such as pub mod/use/fn, from entry points). Default rules auto-extract for `Cargo.toml`, `package.json`, `go.mod`, `src/lib.*`, and `**/mod.rs`. Configurable per-pattern in dirpack.toml via the `extract` key on `[[priority_rules]]`.
2021

2122
### Changed
2223
- Content phase skips files under 50 bytes (fully represented by spine) and snippets truncated to fewer than 3 lines (eliminates single-import/shebang junk). Among equal-priority files, larger files now sort first for content inclusion instead of smaller ones.

default_config.toml

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,12 @@ priority = 30
4545
content_weight = 0.0 # never include raw data content
4646

4747
# Priority rules - higher = more important, included first in budget
48+
# Optional "extract" field transforms content before budget accounting.
49+
# Built-in extractors:
50+
# json_keys:key1,key2 - include only named top-level JSON keys
51+
# toml_sections:sec1,sec2 - include only named TOML sections
52+
# lines_matching:prefix1|prefix2 - include lines starting with any of the `|`-separated prefixes
53+
# api_surface - first 5 lines + all pub/export declarations
4854
[[priority_rules]]
4955
pattern = "README*"
5056
priority = 200
@@ -60,14 +66,17 @@ priority = 200
6066
[[priority_rules]]
6167
pattern = "Cargo.toml"
6268
priority = 150
69+
extract = "toml_sections:dependencies,dev-dependencies"
6370

6471
[[priority_rules]]
6572
pattern = "package.json"
6673
priority = 150
74+
extract = "json_keys:dependencies,devDependencies,scripts"
6775

6876
[[priority_rules]]
6977
pattern = "go.mod"
7078
priority = 150
79+
extract = "lines_matching:require |module "
7180

7281
[[priority_rules]]
7382
pattern = "src/main.*"
@@ -76,10 +85,12 @@ priority = 140
7685
[[priority_rules]]
7786
pattern = "src/lib.*"
7887
priority = 140
88+
extract = "api_surface"
7989

8090
[[priority_rules]]
8191
pattern = "**/mod.rs"
8292
priority = 130
93+
extract = "api_surface"
8394

8495
[[priority_rules]]
8596
pattern = "**/*_test.*"

src/config.rs

Lines changed: 23 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -253,6 +253,17 @@ impl Category {
253253
pub struct PriorityRule {
254254
pub pattern: String,
255255
pub priority: i32,
256+
/// Optional content extractor spec. When set, the content phase
257+
/// transforms this file's content through the named extractor before
258+
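/// budget accounting. Format: `"extractor_name"` or `"extractor_name:args"`; args are comma-separated, except for `lines_matching`, which separates prefixes with `|`.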
259+
///
260+
/// Built-in extractors:
261+
/// - `json_keys:key1,key2` — include only named top-level JSON keys
262+
/// - `toml_sections:sec1,sec2` — include only named TOML sections
263+
/// - `lines_matching:prefix1|prefix2` — include lines starting with any prefix
264+
/// - `api_surface` — first 5 lines + all pub/export declarations
265+
#[serde(default)]
266+
pub extract: Option<String>,
256267
}
257268

258269
/// Configurable priority weight adjustments.
@@ -400,50 +411,62 @@ fn default_priority_rules() -> Vec<PriorityRule> {
400411
PriorityRule {
401412
pattern: "README*".to_string(),
402413
priority: 200,
414+
extract: None,
403415
},
404416
PriorityRule {
405417
pattern: "AGENTS.md".to_string(),
406418
priority: 200,
419+
extract: None,
407420
},
408421
PriorityRule {
409422
pattern: "CLAUDE.md".to_string(),
410423
priority: 200,
424+
extract: None,
411425
},
412426
PriorityRule {
413427
pattern: "Cargo.toml".to_string(),
414428
priority: 150,
429+
extract: Some("toml_sections:dependencies,dev-dependencies".to_string()),
415430
},
416431
PriorityRule {
417432
pattern: "package.json".to_string(),
418433
priority: 150,
434+
extract: Some("json_keys:dependencies,devDependencies,scripts".to_string()),
419435
},
420436
PriorityRule {
421437
pattern: "go.mod".to_string(),
422438
priority: 150,
439+
extract: Some("lines_matching:require |module ".to_string()),
423440
},
424441
PriorityRule {
425442
pattern: "src/main.*".to_string(),
426443
priority: 140,
444+
extract: None,
427445
},
428446
PriorityRule {
429447
pattern: "src/lib.*".to_string(),
430448
priority: 140,
449+
extract: Some("api_surface".to_string()),
431450
},
432451
PriorityRule {
433452
pattern: "**/mod.rs".to_string(),
434453
priority: 130,
454+
extract: Some("api_surface".to_string()),
435455
},
436456
PriorityRule {
437457
pattern: "**/*_test.*".to_string(),
438458
priority: 50,
459+
extract: None,
439460
},
440461
PriorityRule {
441462
pattern: "**/test_*".to_string(),
442463
priority: 50,
464+
extract: None,
443465
},
444466
PriorityRule {
445467
pattern: "**/*.lock".to_string(),
446468
priority: 10,
469+
extract: None,
447470
},
448471
]
449472
}

0 commit comments

Comments
 (0)