Skip to content

Commit edb8bc4

Browse files
committed
refactor(pipeline): source law model from regelrecht-law-model in pipeline and harvester
Advies 11: the two remaining consumers that kept a partial mirror of the canonical law-YAML model now source it from the leaf crate regelrecht-law-model, so the executable model has a single definition.

pipeline:
- Replace the untyped serde_yaml_ng::Value treewalk in count_article_stats (map.get("articles") / contains_key("machine_readable")) with a typed parse into ArticleBasedLaw. The read/count side is now type-checked against the single source of truth instead of stringly-typed field access.
- The surgical write path is intentionally left untouched: production enrichment is performed by an external LLM subprocess that edits the file directly, so there is no Rust-side model write to unify.
- Behavior change: a structurally-invalid law now surfaces as a parse error in the count rather than being silently undercounted — desirable, since this only runs on real harvested/enriched corpus files. Test fixtures updated to realistic minimal laws; an empty-machine_readable edge case is added.

harvester:
- The writer stays a deliberate write-projection (it carries preamble, organisation, cvdr_id and per-article references that the executable ArticleBasedLaw does not, with bespoke quoting/indent post-processing), but the internal YamlReference struct was a byte-identical duplicate of types::Reference, so it is dropped and types::Reference is serialized directly.
- Add regelrecht-law-model as a dev-dependency and two regression tests: golden byte-identity for a BWB and a CVDR law (the repo previously only asserted substrings), and a conformance test proving the harvested output deserializes cleanly into ArticleBasedLaw.
1 parent 2fc2131 commit edb8bc4

5 files changed

Lines changed: 215 additions & 113 deletions

File tree

packages/Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/harvester/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ textwrap = "0.16"
3838
unicode-normalization = "0.1"
3939

4040
[dev-dependencies]
41+
regelrecht-law-model = { path = "../law-model" }
4142
tempfile.workspace = true
4243
serde_json.workspace = true
4344

packages/harvester/src/yaml/writer.rs

Lines changed: 129 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -37,47 +37,18 @@ struct YamlPreamble {
3737
}
3838

3939
/// Article representation for YAML serialization.
40+
///
41+
/// References are serialized straight from the canonical [`Reference`] model
42+
/// (`crate::types`) — it already carries the exact field set, order and
43+
/// `skip_serializing_if` behavior, so there is no separate write mirror to keep
44+
/// in sync.
4045
#[derive(Debug, Serialize)]
4146
struct YamlArticle {
4247
number: String,
4348
text: String,
4449
url: String,
4550
#[serde(skip_serializing_if = "Vec::is_empty")]
46-
references: Vec<YamlReference>,
47-
}
48-
49-
/// Reference representation for YAML serialization.
50-
#[derive(Debug, Serialize)]
51-
struct YamlReference {
52-
id: String,
53-
bwb_id: String,
54-
#[serde(skip_serializing_if = "Option::is_none")]
55-
artikel: Option<String>,
56-
#[serde(skip_serializing_if = "Option::is_none")]
57-
lid: Option<String>,
58-
#[serde(skip_serializing_if = "Option::is_none")]
59-
onderdeel: Option<String>,
60-
#[serde(skip_serializing_if = "Option::is_none")]
61-
hoofdstuk: Option<String>,
62-
#[serde(skip_serializing_if = "Option::is_none")]
63-
paragraaf: Option<String>,
64-
#[serde(skip_serializing_if = "Option::is_none")]
65-
afdeling: Option<String>,
66-
}
67-
68-
impl From<&Reference> for YamlReference {
69-
fn from(r: &Reference) -> Self {
70-
Self {
71-
id: r.id.clone(),
72-
bwb_id: r.bwb_id.clone(),
73-
artikel: r.artikel.clone(),
74-
lid: r.lid.clone(),
75-
onderdeel: r.onderdeel.clone(),
76-
hoofdstuk: r.hoofdstuk.clone(),
77-
paragraaf: r.paragraaf.clone(),
78-
afdeling: r.afdeling.clone(),
79-
}
80-
}
51+
references: Vec<Reference>,
8152
}
8253

8354
/// Full law representation for YAML serialization.
@@ -147,7 +118,7 @@ fn generate_yaml_struct(law: &Law, effective_date: &str) -> YamlLaw {
147118
number: article.number.clone(),
148119
text,
149120
url: article.url.clone(),
150-
references: article.references.iter().map(YamlReference::from).collect(),
121+
references: article.references.clone(),
151122
}
152123
})
153124
.collect();
@@ -553,22 +524,129 @@ mod tests {
553524
);
554525
}
555526

556-
#[test]
557-
fn test_yaml_reference_serialization() {
558-
let reference = Reference {
559-
id: "ref1".to_string(),
560-
bwb_id: "BWBR0018451".to_string(),
561-
artikel: Some("4".to_string()),
562-
lid: None,
563-
onderdeel: None,
564-
hoofdstuk: None,
565-
paragraaf: None,
566-
afdeling: None,
527+
/// A CVDR (local-regulation) law that exercises the writer-only projection
528+
/// fields — `preamble`, `organisation`, `cvdr_id`, `officiele_titel` and a
529+
/// per-article `references` block — which the canonical executable model
530+
/// (`ArticleBasedLaw`) does not carry. Keeping these in a golden test guards
531+
/// against accidental loss when the writer changes.
532+
fn create_test_cvdr_law() -> Law {
533+
let metadata = LawMetadata {
534+
bwb_id: String::new(),
535+
cvdr_id: Some("CVDR123456".to_string()),
536+
title: "Voorbeeldverordening".to_string(),
537+
regulatory_layer: RegulatoryLayer::GemeentelijkeVerordening,
538+
publication_date: Some("2024-01-01".to_string()),
539+
effective_date: None,
540+
valid_to: None,
541+
creator: Some("Gemeente Voorbeeld".to_string()),
542+
scope_code: Some("GM0000".to_string()),
567543
};
568544

569-
let yaml_ref = YamlReference::from(&reference);
570-
assert_eq!(yaml_ref.id, "ref1");
571-
assert_eq!(yaml_ref.artikel, Some("4".to_string()));
572-
assert!(yaml_ref.lid.is_none());
545+
let mut law = Law::new(metadata);
546+
law.preamble = Some(crate::types::Preamble {
547+
text: "De raad van de gemeente Voorbeeld besluit.".to_string(),
548+
url: "https://lokaleregelgeving.overheid.nl/CVDR123456".to_string(),
549+
});
550+
law.add_article(
551+
Article::new(
552+
"1",
553+
"Begripsbepalingen.",
554+
"https://lokaleregelgeving.overheid.nl/CVDR123456#Artikel1",
555+
)
556+
.with_references(vec![Reference {
557+
id: "ref1".to_string(),
558+
bwb_id: "BWBR0018451".to_string(),
559+
artikel: Some("4".to_string()),
560+
lid: None,
561+
onderdeel: None,
562+
hoofdstuk: None,
563+
paragraaf: None,
564+
afdeling: None,
565+
}]),
566+
);
567+
law
568+
}
569+
570+
/// Golden byte-identity test for a national (BWB) law. Pins the *exact*
571+
/// serialized output so any change to the writer (formatting, field order,
572+
/// the de-duplicated `references` projection) is caught, not just substrings.
573+
#[test]
574+
fn test_generate_yaml_golden_bwb() {
575+
let law = create_test_law();
576+
let yaml = generate_yaml(&law, "2025-01-01").unwrap();
577+
// Indentation below is significant — it is the exact emitted YAML.
578+
// SCHEMA_URL is interpolated so a schema-version bump updates this golden
579+
// automatically while every other byte stays pinned.
580+
let expected = format!(
581+
"---
582+
$schema: {SCHEMA_URL}
583+
$id: wet_op_de_zorgtoeslag
584+
regulatory_layer: WET
585+
publication_date: '2005-12-29'
586+
valid_from: '2025-01-01'
587+
bwb_id: BWBR0018451
588+
url: https://wetten.overheid.nl/BWBR0018451/2025-01-01
589+
articles:
590+
- number: '1'
591+
text: 'In deze wet wordt verstaan onder toeslagpartner: partner.'
592+
url: https://wetten.overheid.nl/BWBR0018451/2025-01-01#Artikel1
593+
"
594+
);
595+
assert_eq!(yaml, expected);
596+
}
597+
598+
/// Golden byte-identity test for a CVDR (local) law, covering the
599+
/// writer-only projection fields and a per-article `references` block.
600+
#[test]
601+
fn test_generate_yaml_golden_cvdr() {
602+
let law = create_test_cvdr_law();
603+
let yaml = generate_yaml(&law, "2024-01-01").unwrap();
604+
// Indentation below is significant — it is the exact emitted YAML,
605+
// including the writer-only `preamble`/`organisation`/`references` fields.
606+
let expected = format!(
607+
"---
608+
$schema: {SCHEMA_URL}
609+
$id: voorbeeldverordening
610+
regulatory_layer: GEMEENTELIJKE_VERORDENING
611+
publication_date: '2024-01-01'
612+
valid_from: '2024-01-01'
613+
cvdr_id: CVDR123456
614+
officiele_titel: Voorbeeldverordening
615+
organisation: Gemeente Voorbeeld
616+
url: https://lokaleregelgeving.overheid.nl/CVDR123456
617+
preamble:
618+
text: De raad van de gemeente Voorbeeld besluit.
619+
url: https://lokaleregelgeving.overheid.nl/CVDR123456
620+
articles:
621+
- number: '1'
622+
text: Begripsbepalingen.
623+
url: https://lokaleregelgeving.overheid.nl/CVDR123456#Artikel1
624+
references:
625+
- id: ref1
626+
bwb_id: BWBR0018451
627+
artikel: '4'
628+
"
629+
);
630+
assert_eq!(yaml, expected);
631+
}
632+
633+
/// Conformance: the harvested YAML deserializes cleanly into the canonical
634+
/// executable model (`regelrecht-law-model::ArticleBasedLaw`). The harvester
635+
/// writer stays a separate write-projection, but this proves its output
636+
/// remains faithful to the single source of truth for the law format.
637+
#[test]
638+
fn test_generated_yaml_conforms_to_law_model() {
639+
for (law, date) in [
640+
(create_test_law(), "2025-01-01"),
641+
(create_test_cvdr_law(), "2024-01-01"),
642+
] {
643+
let yaml = generate_yaml(&law, date).unwrap();
644+
let parsed: regelrecht_law_model::ArticleBasedLaw = serde_yaml_ng::from_str(&yaml)
645+
.unwrap_or_else(|e| {
646+
panic!("harvested YAML must parse as ArticleBasedLaw: {e}\n{yaml}")
647+
});
648+
assert_eq!(parsed.articles.len(), law.articles.len());
649+
assert_eq!(parsed.regulatory_layer, law.metadata.regulatory_layer);
650+
}
573651
}
574652
}

packages/pipeline/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ path = "src/bin/pipeline_api.rs"
2525
[dependencies]
2626
regelrecht-corpus = { path = "../corpus" }
2727
regelrecht-harvester = { path = "../harvester" }
28+
regelrecht-law-model = { path = "../law-model" }
2829
regelrecht-shared = { path = "../shared", features = ["telemetry"] }
2930
sqlx = { workspace = true, features = ["runtime-tokio", "tls-rustls", "postgres", "uuid", "chrono", "json", "migrate"] }
3031
tokio = { workspace = true, features = ["macros", "rt-multi-thread", "signal", "process", "time", "fs", "net", "io-util"] }

0 commit comments

Comments
 (0)