Add function to build augmented LOINC data files for testing (#91)

m-goggins · web-flow · commit 54741fbef472 · 2025-09-30T12:02:07.000-04:00
## Description This function creates three data files of augmented LOINC test data: one for short names, one for long common names, and one for display names. ## Related Issues Closes #62 ## Additional Notes
diff --git a/data_curation/augmentation.py b/data_curation/augmentation.py
@@ -1,3 +1,4 @@
+import csv
 import random
 import re
 import typing
@@ -9,7 +10,6 @@
 from utils import path as path
 
 enhancements = path.load_loinc_enhancements()
-
 LOINC_ENHANCEMENTS = normalize.merge_enhancements(enhancements)
 
 
@@ -450,3 +450,61 @@ def generate_augmented_examples(
         augmented_examples.append(ex_code)
 
     return augmented_examples
+
+
+def build_augmented_loinc_files(
+    input_path: str,
+    config: schemas.LoincFileGenerationConfig,
+    num_lcn: int = 5,
+    num_sn: int = 5,
+    num_dn: int = 5,
+    output_path_base: str = "./data/training_files/augmented_loinc",
+) -> None:
+    """
+    Generates augmented LOINC data files for the long common names, short
+    names, and display names based on the provided configurations.
+
+    For every record in the input file, one row is appended to
+    ``{output_path_base}_{key}.csv`` for each ``key`` in ``short_name``,
+    ``long_common_name`` and ``display_name``. Rows are ":"-delimited and hold
+    the LOINC code, the base name, and the augmented names joined with "|".
+
+    :param input_path: The path to the pipe-delimited base LOINC name file.
+    :param config: The augmentation configuration for each name type, given
+        either as a dict keyed by name type or as an object exposing the name
+        types as attributes.
+    :param num_lcn: The number of augmented long common names to generate.
+    :param num_sn: The number of augmented short names to generate.
+    :param num_dn: The number of augmented display names to generate.
+    :param output_path_base: The base path for the output files.
+    :return: None
+    """
+
+    num_map = {"short_name": num_sn, "long_common_name": num_lcn, "display_name": num_dn}
+
+    # Callers pass plain dicts, but the annotated schema type only offers
+    # attribute access; resolve each per-key config once so both forms work.
+    key_configs = {
+        key: config[key] if isinstance(config, dict) else getattr(config, key)
+        for key in num_map
+    }
+    output_rows = {key: [] for key in num_map}
+
+    # Read in data/loinc_lab_names_XXXX.csv
+    with open(input_path, encoding="utf-8") as fp:
+        for row in fp:
+            # Drop the line terminator so it cannot leak into the last field
+            r = row.rstrip("\r\n").split("|")
+            # skip any malformed rows, and the column header row
+            if len(r) < 6 or r[0] == "code":
+                continue
+
+            loinc_code = r[0]
+            values = {
+                "short_name": r[1],
+                "long_common_name": r[2],
+                "display_name": r[3],
+            }
+            # Related names are ";"-separated; trim padding and drop blanks
+            related_names = [name.strip() for name in r[5].split(";") if name.strip()]
+
+            for key, base_value in values.items():
+                augmented_examples = generate_augmented_examples(
+                    base_value, related_names, num_map[key], key_configs[key]
+                )
+                output_rows[key].append(
+                    [loinc_code, base_value, "|".join(augmented_examples)]
+                )
+
+    # Append data to respective files, opening each file only once
+    # Note: these files should be opened using a CSV reader
+    for key, rows in output_rows.items():
+        # Nothing to write means no file is created
+        if not rows:
+            continue
+        with open(f"{output_path_base}_{key}.csv", "a", encoding="utf-8", newline="") as fp:
+            writer = csv.writer(fp, delimiter=":")  # use ":" instead of default ","
+            writer.writerows(rows)
diff --git a/data_curation/configs.py b/data_curation/configs.py
@@ -3,7 +3,7 @@
 """
 A default augmentation configuration meant as a "representative" synthetic
 baseline for most cases. Slightly favors applying enhancement, but if 
-enhancement isn't performed, heavily skews towards insertion. This promotes
+enhancement isn't performed, heavily skews towards insertion. This promotes 
 variance in semantic meaning in the input, which then undergoes a moderate
 level of skewing / corruption.
 """
@@ -72,3 +72,13 @@
         "deletion_prob": 0.2,
     },
 }
+
+"""
+A configuration intended for the generation of augmented LOINC files, with
+granular control over the levels of enhancement at the individual type level. 
+"""
+# All three name types currently share DEFAULT_AUGMENTATION; point a key at a
+# different configuration to tune that name type on its own.
+LOINC_FILE_GENERATION_AUGMENTATION: schemas.LoincFileGenerationConfig = {
+    "long_common_name": DEFAULT_AUGMENTATION,
+    "short_name": DEFAULT_AUGMENTATION,
+    "display_name": DEFAULT_AUGMENTATION,
+}
diff --git a/data_curation/schemas/__init__.py b/data_curation/schemas/__init__.py
@@ -1,7 +1,9 @@
 from .augmentation import AugmentationConfig
 from .augmentation import EnhancementType
+from .augmentation import LoincFileGenerationConfig
 
 __all__ = [
     "AugmentationConfig",
     "EnhancementType",
+    "LoincFileGenerationConfig",
 ]
diff --git a/data_curation/schemas/augmentation.py b/data_curation/schemas/augmentation.py
@@ -203,3 +203,15 @@ class AugmentationConfig(pydantic.BaseModel):
     insertion: InsertionOptions = InsertionOptions()
     permutation: PermutationOptions = PermutationOptions()
     deletion: DeletionOptions = DeletionOptions()
+
+
+class LoincFileGenerationConfig(pydantic.BaseModel):
+    """
+    The schema for a dictionary of configuration options governing how to generate
+    synthetic data specifically for LOINC short names, long common names, and display
+    names.
+
+    Each field holds its own ``AugmentationConfig`` and falls back to a
+    default-constructed ``AugmentationConfig`` when it is not supplied.
+    """
+
+    # Augmentation settings applied to LOINC short names.
+    short_name: AugmentationConfig = AugmentationConfig()
+    # Augmentation settings applied to LOINC long common names.
+    long_common_name: AugmentationConfig = AugmentationConfig()
+    # Augmentation settings applied to LOINC display names.
+    display_name: AugmentationConfig = AugmentationConfig()
diff --git a/tests/unit/assets/loinc_lab_names_20250930.csv b/tests/unit/assets/loinc_lab_names_20250930.csv
@@ -0,0 +1,10 @@
+code|short_name|long_name|display_name|definition_desc|related_names
+109224-6|Weed Allerg Mix3 IgE Msmt Ser|Weed Allergen Mix 3 (Mugwort+Goosefoot or Lambs quarters+English plantain+Goldenrod+Nettle) IgE Ab [Measurement] in Serum|Mugwort+Goosefoot or Lambs quarters+English plantain+Goldenrod+Nettle IgE (S) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|ABS; Aby; Antby; Anti; Antibodies; Antibody; Autoantibodies; Autoantibody; Buckhorn plantain; Engl plantain; English plantain; Golden rod; Goldenrod; Goosefoot; Immune globulin E; Immunoglobulin E; Lab orders; Lamb's quarter; Lambs quarters; Lamb's quarters; Msmt; Mugwort; Nettle; Plantain Buckhorn; Quinoa; Ribwort; Serum; SR; w006; w009; w010; w012; w10; w12; w20; w6; w9; Weed Allerg Mix3; Weed mix; wx3
+102115-3|Epid Allerg Mix IgE pl SerPl|Epidermal Allergen Mix (Dog dander+Cat epithelium+Horse dander) Ab.IgE panel - Serum or Plasma|(Dog dander+Cat epithelium+Horse dander) IgE pl||Allergen; Allergens; ALLERGY TESTING; Canine; Dander; Dog hair; e003; e005; e3; e5; Epid Allerg Mix; Epid Allerg Mix IgE pl; Epithelia; Equus spp; Feline; Felis domesticus; Horse dander IgE pnl; Immune globulin E; Immunoglobulin E; Pan; PANEL.ALLERGY; Panl; Pl; Plasma; Plsm; Pnl; Point in time; Random; SerP; SerPl; SerPlas; Serum; Serum or plasma; SR
+87549-2|DHVD+DHVD2+DHVD3 Pnl SerPl|1,25-Dihydroxyvitamin D and 1,25-Dihydroxyvitamin D2 and 1,25-Dihydroxyvitamin D3 panel - Serum or Plasma|1,25-dihydroxyvitamin D and 1,25-dihydroxyvitamin D2 and 1,25-dihydroxyvitamin D3 panel|This panel contains quantitative results for 1,25-Dihydroxyvitamin D2 (DHVD2) and 1,25-Dihydroxyvitamin D3 (DHVD3, also known as calcitriol) as well as the total DHVD. This test may be ordered to assess vitamin D status, especially in patients with renal disease, and for patients with clinical evidence of a vitamin D deficiency, such as in patients with vitamin D-dependent rickets due to hereditary deficiency of renal 1-alpha hydroxylase.|1,25(OH)2D; 1,25(OH)2D2; Chemistry; Dextro; DHVD+DHVD2+DHVD3 Pnl; i; Pan; PANEL.CHEMISTRY; Panl; Pl; Plasma; Plsm; Pnl; Point in time; QNT; Quan; Quant; Quantitative; Random; SerP; SerPl; SerPlas; Serum; Serum or plasma; SR
+108566-1|THA Msmt Ur|11-Dehydrotetrahydrocorticosterone [Measurement] in Urine|11-Dehydrotetrahydrocorticosterone (U) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|Lab orders; Msmt; THA; UA; UR; Urn
+109096-8|11DOC Msmt SerPl|11-Deoxycorticosterone [Measurement] in Serum or Plasma|11-Deoxycorticosterone [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|11DOC; 11-DOC; 21-Hydroxyprogesterone; DOC; Lab orders; Msmt; Pl; Plasma; Plsm; SerP; SerPl; SerPlas; Serum; Serum or plasma; SR
+110334-0|11DOC Msmt Ur|11-Deoxycorticosterone [Measurement] in Urine|11-Deoxycorticosterone (U) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|11DOC; 11-DOC; 21-Hydroxyprogesterone; DOC; Lab orders; Msmt; UA; UR; Urn
+110615-2|11DC Msmt Ur|11-Deoxycortisol [Measurement] in Urine|11-Deoxycortisol (U) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|11DC; Compound S; Cortexolone; Cortodoxone; Deoxycortisol; Desoxycortisol; Lab orders; Metopirone; Msmt; UA; UR; Urn
+110335-7|TH-DOC Msmt Ur|11-Deoxytetrahydrocorticosterone [Measurement] in Urine|11-Deoxytetrahydrocorticosterone (U) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|3 Alpha, 21-dihydroxy-5-beta-pregnan-20-one; Lab orders; Msmt; Tetrahydro-11-deoxycorticosterone; TH DOC; TH-DOC; UA; UR; Urn
+110614-5|11OH-Androst Msmt Ur|11-Hydroxyandrosterone [Measurement] in Urine|11-Hydroxyandrosterone (U) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|11-beta-hydroxyandrosterone; 11-Hydroxy androsterone; 11OH-Androst; Lab orders; Msmt; UA; UR; Urn
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
@@ -32,3 +32,17 @@ def moto_setup(monkeypatch):
 @pytest.fixture(autouse=True)
 def fixed_random_seed():
     random.seed(42)
+
+
+@pytest.fixture(scope="function")
+def cleanup_tmp_files():
+    """Provide a scratch ``./tmp`` directory and clear it out once the test ends."""
+    scratch_dir = "./tmp"
+
+    # Setup: make sure the scratch directory is present before the test runs
+    os.makedirs(scratch_dir, exist_ok=True)
+    yield
+    # Teardown: delete every entry left behind in the scratch directory
+    for entry in os.listdir(scratch_dir):
+        os.remove(os.path.join(scratch_dir, entry))
+
+    # Drop the directory itself once nothing remains inside it
+    leftovers = os.listdir(scratch_dir)
+    if not leftovers:
+        os.rmdir(scratch_dir)
diff --git a/tests/unit/test_augmentation.py b/tests/unit/test_augmentation.py
@@ -1,3 +1,6 @@
+import csv
+import os
+
 import pytest
 
 from data_curation import augmentation
@@ -223,3 +226,47 @@ class TestGenerateAugmentedTrainingSamples:
     def test_generate_augmented_examples(self, text, related_names, num_examples, config, expected):
         result = augmentation.generate_augmented_examples(text, related_names, num_examples, config)
         assert result == expected
+
+
+class TestBuildAugmentedLoincFiles:
+    def test_build_augmented_loinc_files(self, cleanup_tmp_files):
+        """Each output file exists, is non-empty, and has the requested examples per row."""
+        output_base_path = "./tmp/augmented_loinc"
+        # Requested number of augmented examples for each name type
+        expected_counts = {"long_common_name": 2, "short_name": 2, "display_name": 2}
+        config = {
+            "long_common_name": AUGMENTATION_WITHOUT_ENHANCEMENT,
+            "short_name": AUGMENTATION_WITHOUT_ENHANCEMENT,
+            "display_name": AUGMENTATION_WITHOUT_ENHANCEMENT,
+        }
+
+        augmentation.build_augmented_loinc_files(
+            input_path="./tests/unit/assets/loinc_lab_names_20250930.csv",
+            num_sn=expected_counts["short_name"],
+            num_lcn=expected_counts["long_common_name"],
+            num_dn=expected_counts["display_name"],
+            config=config,
+            output_path_base=output_base_path,
+        )
+
+        for key in config:
+            file_path = f"{output_base_path}_{key}.csv"
+
+            # The file must have been created and must not be empty
+            assert os.path.exists(file_path)
+            assert os.path.getsize(file_path) > 0
+
+            # Every row carries exactly the requested number of augmented examples
+            with open(file_path, "r", encoding="utf-8", newline="") as fp:
+                for loinc_code, base_value, joined_examples in csv.reader(fp, delimiter=":"):
+                    assert len(joined_examples.split("|")) == expected_counts[key]

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,9 @@`
`1`	`1`	`from .augmentation import AugmentationConfig`
`2`	`2`	`from .augmentation import EnhancementType`
	`3`	`+from .augmentation import LoincFileGenerationConfig`
`3`	`4`
`4`	`5`	`__all__ = [`
`5`	`6`	`"AugmentationConfig",`
`6`	`7`	`"EnhancementType",`
	`8`	`+ "LoincFileGenerationConfig",`
`7`	`9`	`]`