Skip to content

Commit 9fb2668

Browse files
committed
feat: pipeline improvements for medium files + cJSON migration to 97 tests
Lower the chunking threshold from 2000 to 800 LOC so that files in the 800-2000 LOC range get multi-pass translation instead of single-shot translation (which caused FallbackUnsafe on cjson_full). Add structural chunking that puts the data model (structs + constructors) in chunk 0, a quality gate that re-translates when the initial translation has >5 unsafe blocks, and configurable repair abbreviation limits. Extend the cJSON migration with Phases A-F: convenience add functions, reference (clone) wrappers, case-sensitive ops, float/string array creators, parse variants with ParseError, print variants, setters, and version API. Result: 1696 LOC C → 1439 LOC Rust, 97/97 C + 96/96 Rust tests pass, diff test byte-exact.
1 parent f2053a1 commit 9fb2668

6 files changed

Lines changed: 976 additions & 18 deletions

File tree

crates/noricum-agents/src/repair.rs

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ pub async fn repair_function(
4545
.await
4646
}
4747

48-
/// Repair with an optional base temperature override.
48+
/// Repair with an optional base temperature override, using the default C source abbreviation limit.
4949
#[allow(clippy::too_many_arguments)]
5050
pub async fn repair_function_with_temperature(
5151
client: &LlmClient,
@@ -57,6 +57,35 @@ pub async fn repair_function_with_temperature(
5757
iteration: u32,
5858
max_iterations: u32,
5959
base_temperature: Option<f64>,
60+
) -> Result<String, AgentError> {
61+
repair_function_full(
62+
client,
63+
model,
64+
rust_source,
65+
compiler_errors,
66+
diff_feedback,
67+
c_source,
68+
iteration,
69+
max_iterations,
70+
base_temperature,
71+
None,
72+
)
73+
.await
74+
}
75+
76+
/// Repair with full configuration: temperature and C source abbreviation limit.
77+
#[allow(clippy::too_many_arguments)]
78+
pub async fn repair_function_full(
79+
client: &LlmClient,
80+
model: &str,
81+
rust_source: &str,
82+
compiler_errors: &[String],
83+
diff_feedback: &[String],
84+
c_source: &str,
85+
iteration: u32,
86+
max_iterations: u32,
87+
base_temperature: Option<f64>,
88+
c_abbreviation_limit: Option<usize>,
6089
) -> Result<String, AgentError> {
6190
let error_count = compiler_errors.len();
6291
let diff_count = diff_feedback.len();
@@ -119,7 +148,8 @@ pub async fn repair_function_with_temperature(
119148

120149
// For large C sources, abbreviate to save context tokens.
121150
// The repair agent primarily needs the Rust code + errors; the C source is just reference.
122-
let c_context = abbreviate_c_source(c_source, 300);
151+
let abbrev_limit = c_abbreviation_limit.unwrap_or(300);
152+
let c_context = abbreviate_c_source(c_source, abbrev_limit);
123153

124154
user_message.push_str(&format!(
125155
"## Original C source (for reference)\n<c_source>\n{c_context}\n</c_source>\n\n\

crates/noricum-agents/src/translation.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,10 +74,10 @@ pub async fn translate_function_with_patterns_and_temperature(
7474

7575
let mut user_message = String::new();
7676

77-
// For very large files (>2000 LOC), prepend a structural summary
77+
// For medium+ files (>800 LOC), prepend a structural summary
7878
// to help the LLM understand the codebase before translating.
7979
let c_lines = c_source.lines().count();
80-
if c_lines > 2000 {
80+
if c_lines > 800 {
8181
let summary = build_structural_summary(c_source);
8282
user_message.push_str(&format!(
8383
"## Structural summary ({c_lines} lines)\n{summary}\n\n"
@@ -160,6 +160,14 @@ pub async fn translate_chunked(
160160

161161
// Build a synthetic C source: shared context + this chunk's functions
162162
let mut chunk_source = chunk.shared_context.clone();
163+
// For chunk 0 with type definitions, add data model translation guidance
164+
if i == 0 && chunk.shared_context.contains("struct ") {
165+
chunk_source.push_str(
166+
"\n\n// === INSTRUCTION: Translate data model to idiomatic Rust types FIRST. ===\n\
167+
// Use enum variants instead of type tags. Convert linked-lists to Vec.\n\
168+
// Use String instead of *char. Replace malloc/free with RAII.\n"
169+
);
170+
}
163171
chunk_source.push_str("\n\n// === Functions to translate ===\n");
164172
chunk_source.push_str(&chunk.functions_source);
165173

crates/noricum-core/src/orchestrator.rs

Lines changed: 111 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,11 @@ use tracing::{debug, info, warn};
1919
/// Prevents OOM on extremely large C files.
2020
const MAX_C_SOURCE_SIZE: usize = 5 * 1024 * 1024;
2121

22+
/// LOC threshold above which chunked translation is used.
23+
const MEDIUM_FILE_LOC: usize = 800;
2224
/// LOC threshold above which repair iterations are reduced to save tokens.
2325
const LARGE_FILE_LOC: usize = 1000;
24-
/// LOC threshold above which repair iterations are further reduced.
26+
/// LOC threshold above which repair iterations are further reduced and larger chunk targets apply.
2527
const VERY_LARGE_FILE_LOC: usize = 2000;
2628

2729
use crate::CoreError;
@@ -163,7 +165,7 @@ fn preflight_budget_check(config: &MigrationConfig, c_source: &str, name: &str)
163165
// Repair (per iter): Rust source + C source + errors in, Rust out
164166
let repair_per_iter = c_tokens * 4;
165167
let c_lines = c_source.lines().count();
166-
let effective_iters = effective_repair_iterations(config.max_repair_iterations, c_lines);
168+
let effective_iters = effective_repair_iterations(config.max_repair_iterations, c_lines, false);
167169
let repair_cost = repair_per_iter * effective_iters as u64;
168170
// Test gen: C + Rust in, tests out
169171
let test_gen_cost = c_tokens * 3;
@@ -191,11 +193,15 @@ fn preflight_budget_check(config: &MigrationConfig, c_source: &str, name: &str)
191193
}
192194
}
193195

194-
/// Compute effective max repair iterations based on file size.
196+
/// Compute effective max repair iterations based on file size and chunking.
195197
///
196198
/// Large files use fewer iterations to conserve tokens — each repair
197199
/// iteration sends the full Rust + C source, which is expensive.
198-
fn effective_repair_iterations(configured_max: u32, c_lines: usize) -> u32 {
200+
/// Chunked translations get full iterations since each chunk is small.
201+
fn effective_repair_iterations(configured_max: u32, c_lines: usize, was_chunked: bool) -> u32 {
202+
if was_chunked {
203+
return configured_max;
204+
}
199205
if c_lines > VERY_LARGE_FILE_LOC {
200206
configured_max.min(2)
201207
} else if c_lines > LARGE_FILE_LOC {
@@ -618,12 +624,32 @@ pub async fn migrate_file(
618624
"calling translation agent"
619625
);
620626
let c_lines_for_chunk = unit.c_source.lines().count();
621-
let rust_code = if c_lines_for_chunk > VERY_LARGE_FILE_LOC {
622-
let chunks = noricum_tools::ast::chunk_c_source(&unit.c_source, 500);
627+
let use_chunked = c_lines_for_chunk > MEDIUM_FILE_LOC;
628+
let rust_code = if use_chunked {
629+
let chunk_target = if c_lines_for_chunk > VERY_LARGE_FILE_LOC { 500 } else { 400 };
630+
// Use structural chunking when data model patterns are detected
631+
let has_data_model = analysis.patterns.iter().any(|p| {
632+
p.contains("struct") || p.contains("linked_list") || p.contains("recursive")
633+
});
634+
let chunks = if has_data_model {
635+
let structural = noricum_tools::ast::chunk_c_source_structural(&unit.c_source, chunk_target);
636+
if structural.len() > 1 {
637+
info!(
638+
function = %name,
639+
"using structural chunking (data model first)"
640+
);
641+
structural
642+
} else {
643+
noricum_tools::ast::chunk_c_source(&unit.c_source, chunk_target)
644+
}
645+
} else {
646+
noricum_tools::ast::chunk_c_source(&unit.c_source, chunk_target)
647+
};
623648
info!(
624649
function = %name,
625650
chunks = chunks.len(),
626651
c_lines = c_lines_for_chunk,
652+
chunk_target,
627653
"using multi-pass chunked translation"
628654
);
629655
match noricum_agents::translation::translate_chunked(
@@ -662,6 +688,54 @@ pub async fn migrate_file(
662688
}
663689
}
664690
};
691+
692+
// Quality gate: if initial translation has >5 unsafe blocks, re-translate
693+
let unsafe_count = noricum_tools::ast::count_unsafe_blocks_ast(&rust_code);
694+
let rust_code = if unsafe_count > 5 {
695+
warn!(
696+
function = %name,
697+
unsafe_count,
698+
"initial translation has too many unsafe blocks, re-translating with temperature 0.5"
699+
);
700+
unit.metrics.llm_calls += 1;
701+
match noricum_agents::translation::translate_function_with_patterns_and_temperature(
702+
&client,
703+
&translation_model_sel.model,
704+
&unit.c_source,
705+
unit.c2rust_output.as_deref(),
706+
&analysis,
707+
&relevant_patterns,
708+
Some(0.5),
709+
)
710+
.await
711+
{
712+
Ok(retranslated) => {
713+
let new_unsafe = noricum_tools::ast::count_unsafe_blocks_ast(&retranslated);
714+
if new_unsafe < unsafe_count {
715+
info!(
716+
function = %name,
717+
old_unsafe = unsafe_count,
718+
new_unsafe,
719+
"re-translation reduced unsafe blocks"
720+
);
721+
retranslated
722+
} else {
723+
info!(
724+
function = %name,
725+
"re-translation did not improve, keeping original"
726+
);
727+
rust_code
728+
}
729+
}
730+
Err(e) => {
731+
warn!(function = %name, error = %e, "re-translation failed, keeping original");
732+
rust_code
733+
}
734+
}
735+
} else {
736+
rust_code
737+
};
738+
665739
unit.rust_output = Some(rust_code);
666740
unit.state = MigrationState::Refined;
667741
unit.metrics.translation_ms = translation_start.elapsed().as_millis() as u64;
@@ -677,7 +751,8 @@ pub async fn migrate_file(
677751

678752
// --- Stage 6: Validate ---
679753
let c_lines = unit.c_source.lines().count();
680-
let max_iters = effective_repair_iterations(config.max_repair_iterations, c_lines);
754+
let was_chunked = c_lines > MEDIUM_FILE_LOC;
755+
let max_iters = effective_repair_iterations(config.max_repair_iterations, c_lines, was_chunked);
681756

682757
let validation =
683758
noricum_validation::validate_with_threshold(&unit, config.min_idiomatic_score)?;
@@ -755,7 +830,8 @@ pub async fn migrate_file(
755830
);
756831
}
757832

758-
let repaired = noricum_agents::repair::repair_function_with_temperature(
833+
let c_abbrev_limit = if c_lines > 500 { Some(500) } else { None };
834+
let repaired = noricum_agents::repair::repair_function_full(
759835
&client,
760836
&repair_model_sel.model,
761837
current_rust,
@@ -765,6 +841,7 @@ pub async fn migrate_file(
765841
iteration,
766842
max_iters,
767843
config.repair_base_temperature,
844+
c_abbrev_limit,
768845
)
769846
.await?;
770847

@@ -1233,34 +1310,54 @@ fn main() {
12331310

12341311
#[test]
12351312
fn test_effective_small() {
1236-
assert_eq!(effective_repair_iterations(5, 500), 5);
1313+
assert_eq!(effective_repair_iterations(5, 500, false), 5);
12371314
}
12381315

12391316
#[test]
12401317
fn test_effective_large() {
1241-
assert_eq!(effective_repair_iterations(5, 1500), 3);
1318+
assert_eq!(effective_repair_iterations(5, 1500, false), 3);
12421319
}
12431320

12441321
#[test]
12451322
fn test_effective_very_large() {
1246-
assert_eq!(effective_repair_iterations(5, 3000), 2);
1323+
assert_eq!(effective_repair_iterations(5, 3000, false), 2);
12471324
}
12481325

12491326
#[test]
12501327
fn test_effective_already_low() {
1251-
assert_eq!(effective_repair_iterations(1, 3000), 1);
1328+
assert_eq!(effective_repair_iterations(1, 3000, false), 1);
12521329
}
12531330

12541331
#[test]
12551332
fn test_effective_boundary_1000() {
12561333
// 1000 is NOT > LARGE_FILE_LOC (1000), so no reduction
1257-
assert_eq!(effective_repair_iterations(5, 1000), 5);
1334+
assert_eq!(effective_repair_iterations(5, 1000, false), 5);
12581335
}
12591336

12601337
#[test]
12611338
fn test_effective_boundary_2001() {
12621339
// 2001 > VERY_LARGE_FILE_LOC (2000), so min(5, 2) = 2
1263-
assert_eq!(effective_repair_iterations(5, 2001), 2);
1340+
assert_eq!(effective_repair_iterations(5, 2001, false), 2);
1341+
}
1342+
1343+
#[test]
1344+
fn test_effective_chunked_gets_full_iterations() {
1345+
// Chunked translations always get full configured iterations
1346+
assert_eq!(effective_repair_iterations(5, 1500, true), 5);
1347+
assert_eq!(effective_repair_iterations(5, 3000, true), 5);
1348+
assert_eq!(effective_repair_iterations(3, 5000, true), 3);
1349+
}
1350+
1351+
#[test]
1352+
fn test_medium_file_triggers_chunking() {
1353+
// Files >800 LOC should trigger chunked translation
1354+
assert!(800 < LARGE_FILE_LOC);
1355+
assert!(MEDIUM_FILE_LOC == 800);
1356+
// 900 LOC > MEDIUM_FILE_LOC, so chunking is used
1357+
let c_lines = 900;
1358+
assert!(c_lines > MEDIUM_FILE_LOC);
1359+
// But with was_chunked=true, full iterations are restored
1360+
assert_eq!(effective_repair_iterations(5, c_lines, true), 5);
12641361
}
12651362

12661363
#[test]

0 commit comments

Comments
 (0)