Skip to content

Commit 5ef2e50

Browse files
committed
IMDI export should always have 3 letter codes
1 parent 84e64f4 commit 5ef2e50

7 files changed

Lines changed: 106 additions & 32 deletions

File tree

src/export/Imdi-static-fns.ts

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import * as XmlBuilder from "xmlbuilder";
22
import { Field } from "../model/field/Field";
3+
import { staticLanguageFinder } from "../languageFinder/LanguageFinder";
34

45
// public for testing
56
// imdiSupportsMultipleElements: when true, creates separate elements with LanguageId attributes for each language.
@@ -80,9 +81,12 @@ function addElementsFromFieldContent(
8081
// Multiple languages and IMDI supports LanguageId attributes
8182
axes.forEach((language) => {
8283
element = tail.element(elementName, field.getTextAxis(language));
83-
// 2 letter is ISO639-1, 3 letter is taken to be the ISO639-3 (Ethnologue code)
84-
const kind = language.length === 2 ? "ISO639-1" : "ISO639-3";
85-
element.attribute("LanguageId", kind + ":" + language);
84+
// Always use ISO639-3 (3-letter) codes for IMDI export
85+
// Archives require 3-letter codes; they can't handle 2-letter ISO639-1 codes
86+
const iso639_3 = staticLanguageFinder
87+
? staticLanguageFinder.getIso639_3Code(language)
88+
: language;
89+
element.attribute("LanguageId", "ISO639-3:" + iso639_3);
8690
});
8791
}
8892
}

src/export/ImdiGenerator-field.spec.ts

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@ import {
99
isTildeBirthYear,
1010
resetTildeBirthYearWarning
1111
} from "./ImdiGenerator.ts";
12+
import { setupLanguageFinderForTests } from "../languageFinder/LanguageFinder";
13+
14+
// Set up staticLanguageFinder for ISO639-3 code lookups
15+
setupLanguageFinderForTests();
1216

1317
describe("imdi monolingual field export", () => {
1418
it("exports default if element is required", () => {
@@ -118,7 +122,7 @@ describe("imdi multilingual field export", () => {
118122
});
119123
expect("//description[@LanguageId]").toHaveCount(2);
120124
//printResultXml();
121-
expect("//description[@LanguageId='ISO639-1:en']").toHaveText("a house");
125+
expect("//description[@LanguageId='ISO639-3:eng']").toHaveText("a house");
122126
expect("//description[@LanguageId='ISO639-3:etr']").toHaveText(
123127
"house in edolo"
124128
);
@@ -172,11 +176,11 @@ describe("imdi monolingual field export", () => {
172176
});
173177
// Should have multiple Title elements with LanguageId attributes
174178
expect("//title[@LanguageId]").toHaveCount(3);
175-
expect("//title[@LanguageId='ISO639-1:en']").toHaveText(
179+
expect("//title[@LanguageId='ISO639-3:eng']").toHaveText(
176180
"idea for tomorrow"
177181
);
178-
expect("//title[@LanguageId='ISO639-1:es']").toHaveText("idea para mañana");
179-
expect("//title[@LanguageId='ISO639-1:fr']").toHaveText("idée pour demain");
182+
expect("//title[@LanguageId='ISO639-3:spa']").toHaveText("idea para mañana");
183+
expect("//title[@LanguageId='ISO639-3:fra']").toHaveText("idée pour demain");
180184
});
181185

182186
it("creates multiple elements with LanguageId when imdiSupportsMultipleElements is true", () => {

src/export/ImdiGenerator-vocab.spec.ts

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -94,17 +94,17 @@ describe("ImdiGenerator multilingual vocabulary export", () => {
9494

9595
// English should be present
9696
xexpect(
97-
"//Session/MDGroup/Content/Genre[@LanguageId='ISO639-1:en']"
97+
"//Session/MDGroup/Content/Genre[@LanguageId='ISO639-3:eng']"
9898
).toHaveText("Narrative");
9999

100100
// Spanish should be present (from genres.csv)
101101
xexpect(
102-
"//Session/MDGroup/Content/Genre[@LanguageId='ISO639-1:es']"
102+
"//Session/MDGroup/Content/Genre[@LanguageId='ISO639-3:spa']"
103103
).toHaveText("Narrativa");
104104

105105
// Portuguese should be present (from genres.csv - pt-BR column)
106106
xexpect(
107-
"//Session/MDGroup/Content/Genre[@LanguageId='ISO639-1:pt']"
107+
"//Session/MDGroup/Content/Genre[@LanguageId='ISO639-3:por']"
108108
).toHaveText("Narrativa");
109109

110110
// All should have the correct vocabulary link and type
@@ -135,7 +135,7 @@ describe("ImdiGenerator multilingual vocabulary export", () => {
135135

136136
// Check English - should be sentence case
137137
xexpect(
138-
"//Session/MDGroup/Content/SubGenre[@LanguageId='ISO639-1:en']"
138+
"//Session/MDGroup/Content/SubGenre[@LanguageId='ISO639-3:eng']"
139139
).toHaveText("Myth");
140140
});
141141

@@ -162,12 +162,12 @@ describe("ImdiGenerator multilingual vocabulary export", () => {
162162

163163
// English should have "Speaker" (sentence case)
164164
xexpect(
165-
"//Session/MDGroup/Actors/Actor/Role[@LanguageId='ISO639-1:en']"
165+
"//Session/MDGroup/Actors/Actor/Role[@LanguageId='ISO639-3:eng']"
166166
).toHaveText("Speaker");
167167

168168
// Spanish should have the translation
169169
xexpect(
170-
"//Session/MDGroup/Actors/Actor/Role[@LanguageId='ISO639-1:es']"
170+
"//Session/MDGroup/Actors/Actor/Role[@LanguageId='ISO639-3:spa']"
171171
).toHaveText("Hablante");
172172

173173
// All Role elements should have vocabulary attributes
@@ -223,9 +223,9 @@ describe("ImdiGenerator multilingual vocabulary export", () => {
223223
);
224224
setResultXml(imdi);
225225

226-
// Should have English (2-letter code uses ISO639-1)
226+
// Should have English (using ISO639-3 code)
227227
xexpect(
228-
"//Session/MDGroup/Content/Genre[@LanguageId='ISO639-1:en']"
228+
"//Session/MDGroup/Content/Genre[@LanguageId='ISO639-3:eng']"
229229
).toHaveText("Narrative");
230230

231231
// Should NOT have Tok Pisin since there's no translation
@@ -431,7 +431,7 @@ describe("ImdiGenerator ELAR schema multilingual vocabulary export", () => {
431431

432432
// English should be present
433433
xexpect(
434-
"//Session/MDGroup/Content/Genre[@LanguageId='ISO639-1:en']"
434+
"//Session/MDGroup/Content/Genre[@LanguageId='ISO639-3:eng']"
435435
).toHaveText("Narrative");
436436
});
437437

@@ -451,16 +451,16 @@ describe("ImdiGenerator ELAR schema multilingual vocabulary export", () => {
451451
);
452452
expect(metadataLangCount).toBe(3);
453453

454-
// Check the format: "ISO639-1:en: English" or "ISO639-3:xxx: Name"
454+
// Check the format: "ISO639-3:eng: English"
455455
const keys = select("//Session/MDGroup/Keys/Key[@Name='MetadataLanguage']");
456456
const values = keys.map((node) => (node as Element).textContent);
457457

458-
// English - 2-letter code gets ISO639-1
459-
expect(values).toContain("ISO639-1:en: English");
460-
// Spanish - 2-letter code gets ISO639-1
461-
expect(values).toContain("ISO639-1:es: Spanish");
462-
// Portuguese - 2-letter code gets ISO639-1
463-
expect(values).toContain("ISO639-1:pt: Portuguese");
458+
// English - always use ISO639-3
459+
expect(values).toContain("ISO639-3:eng: English");
460+
// Spanish - always use ISO639-3
461+
expect(values).toContain("ISO639-3:spa: Spanish");
462+
// Portuguese - always use ISO639-3
463+
expect(values).toContain("ISO639-3:por: Portuguese");
464464
});
465465

466466
it("should not output MetadataLanguage keys when only default language is set", () => {

src/export/ImdiGenerator.ts

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import { Field } from "../model/field/Field";
3030
import { fieldElement } from "./Imdi-static-fns";
3131
import { GetOtherConfigurationSettings } from "../model/Project/OtherConfigurationSettings";
3232
import { ImdiVocabularyTranslator } from "./ImdiVocabularyTranslator";
33+
import { staticLanguageFinder } from "../languageFinder/LanguageFinder";
3334

3435
// IMDI Date_Value_Type pattern from IMDI_3.0.xsd
3536
// Valid: YYYY, YYYY-MM, YYYY-MM-DD, date ranges with /, "Unknown", "Unspecified", or empty
@@ -641,9 +642,11 @@ export default class ImdiGenerator {
641642

642643
this.group("Keys", () => {
643644
for (const slot of metadataSlots) {
644-
// Use ISO639-3 for 3-letter codes, ISO639-1 for 2-letter codes
645-
const kind = slot.tag.length === 2 ? "ISO639-1" : "ISO639-3";
646-
const value = `${kind}:${slot.tag}: ${slot.name}`;
645+
// Always use ISO639-3 (3-letter) codes - archives can't handle 2-letter ISO639-1 codes
646+
const iso639_3 = staticLanguageFinder
647+
? staticLanguageFinder.getIso639_3Code(slot.tag)
648+
: slot.tag;
649+
const value = `ISO639-3:${iso639_3}: ${slot.name}`;
647650
this.keyElement("MetadataLanguage", value);
648651
}
649652
});
@@ -1507,9 +1510,11 @@ export default class ImdiGenerator {
15071510
// (ELAR prefers "Careful speech speaker" not "Careful Speech Speaker")
15081511
const normalizedTranslation = sentenceCase(translation);
15091512
const newElement = this.tail.element(elementName, normalizedTranslation);
1510-
// LanguageId uses ISO639-3 for 3-letter codes, ISO639-1 for 2-letter codes
1511-
const kind = slot.tag.length === 2 ? "ISO639-1" : "ISO639-3";
1512-
newElement.attribute("LanguageId", kind + ":" + slot.tag);
1513+
// Always use ISO639-3 (3-letter) codes - archives can't handle 2-letter ISO639-1 codes
1514+
const iso639_3 = staticLanguageFinder
1515+
? staticLanguageFinder.getIso639_3Code(slot.tag)
1516+
: slot.tag;
1517+
newElement.attribute("LanguageId", "ISO639-3:" + iso639_3);
15131518
newElement.attribute("Link", vocabularyUrl);
15141519
newElement.attribute("Type", "OpenVocabulary");
15151520
this.mostRecentElement = newElement;

src/languageFinder/LanguageFinder.spec.ts

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,4 +180,39 @@ describe("LanguageFinder", () => {
180180
expect(languageFinder.normalizeToBcp47("tpi-PG")).toBe("tpi-PG");
181181
});
182182
});
183+
184+
describe("getIso639_3Code", () => {
185+
it("should preserve 3-letter codes", () => {
186+
expect(languageFinder.getIso639_3Code("eng")).toBe("eng");
187+
expect(languageFinder.getIso639_3Code("spa")).toBe("spa");
188+
expect(languageFinder.getIso639_3Code("fra")).toBe("fra");
189+
expect(languageFinder.getIso639_3Code("etr")).toBe("etr");
190+
expect(languageFinder.getIso639_3Code("tpi")).toBe("tpi");
191+
});
192+
193+
it("should convert 2-letter codes to 3-letter", () => {
194+
expect(languageFinder.getIso639_3Code("en")).toBe("eng");
195+
expect(languageFinder.getIso639_3Code("es")).toBe("spa");
196+
expect(languageFinder.getIso639_3Code("fr")).toBe("fra");
197+
expect(languageFinder.getIso639_3Code("pt")).toBe("por");
198+
expect(languageFinder.getIso639_3Code("de")).toBe("deu");
199+
expect(languageFinder.getIso639_3Code("id")).toBe("ind");
200+
});
201+
202+
it("should handle case-insensitively", () => {
203+
expect(languageFinder.getIso639_3Code("EN")).toBe("eng");
204+
expect(languageFinder.getIso639_3Code("En")).toBe("eng");
205+
expect(languageFinder.getIso639_3Code("ENG")).toBe("eng");
206+
});
207+
208+
it("should trim whitespace", () => {
209+
expect(languageFinder.getIso639_3Code(" en ")).toBe("eng");
210+
expect(languageFinder.getIso639_3Code(" eng ")).toBe("eng");
211+
});
212+
213+
it("should handle empty input", () => {
214+
expect(languageFinder.getIso639_3Code("")).toBe("");
215+
expect(languageFinder.getIso639_3Code(" ")).toBe("");
216+
});
217+
});
183218
});

src/languageFinder/LanguageFinder.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -473,6 +473,32 @@ export class LanguageFinder {
473473
return lowered;
474474
}
475475

476+
/**
477+
* Convert a language code to its ISO 639-3 (3-letter) form.
478+
* A 3-letter input is returned trimmed and lowercased (it is not validated). For a 2-letter ISO 639-1 code,
479+
* looks up the corresponding 3-letter code.
480+
* E.g., "en" -> "eng", "es" -> "spa", "etr" -> "etr"
481+
*/
482+
public getIso639_3Code(code: string): string {
483+
const lowered = code.toLowerCase().trim();
484+
if (lowered.length === 0) return lowered;
485+
486+
// Already a 3-letter code? Return it (trimmed and lowercased) without a lookup
487+
if (lowered.length === 3) return lowered;
488+
489+
// If 2-letter code, look up the 3-letter equivalent
490+
if (lowered.length === 2) {
491+
const matches = this.lookupInIndexAndCustomLanguages(lowered);
492+
const exactMatch = matches.find((m) => m.iso639_1 === lowered);
493+
if (exactMatch?.iso639_3) {
494+
return exactMatch.iso639_3;
495+
}
496+
}
497+
498+
// Fall back to the trimmed, lowercased input if the lookup fails or the code is not 2 or 3 letters long
499+
return lowered;
500+
}
501+
476502
/**
477503
* Get the localized language name for display.
478504
* Currently returns the English name, but could be extended for localization.

src/mainProcess/validateImdi.spec.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,9 +141,9 @@ describe("validateImdiAsyncInternal with ELAR schema", () => {
141141
</Project>
142142
<Keys/>
143143
<Content>
144-
<Genre LanguageId="ISO639-1:en" Link="http://www.mpi.nl/IMDI/Schema/Content-Genre.xml" Type="OpenVocabulary">Narrative</Genre>
145-
<Genre LanguageId="ISO639-1:es" Link="http://www.mpi.nl/IMDI/Schema/Content-Genre.xml" Type="OpenVocabulary">Narrativa</Genre>
146-
<Genre LanguageId="ISO639-1:pt" Link="http://www.mpi.nl/IMDI/Schema/Content-Genre.xml" Type="OpenVocabulary">Narrativa</Genre>
144+
<Genre LanguageId="ISO639-3:eng" Link="http://www.mpi.nl/IMDI/Schema/Content-Genre.xml" Type="OpenVocabulary">Narrative</Genre>
145+
<Genre LanguageId="ISO639-3:spa" Link="http://www.mpi.nl/IMDI/Schema/Content-Genre.xml" Type="OpenVocabulary">Narrativa</Genre>
146+
<Genre LanguageId="ISO639-3:por" Link="http://www.mpi.nl/IMDI/Schema/Content-Genre.xml" Type="OpenVocabulary">Narrativa</Genre>
147147
<CommunicationContext/>
148148
<Languages/>
149149
<Keys/>

0 commit comments

Comments
 (0)