Skip to content

Commit 54741fb

Browse files
authored
Add function to build augmented LOINC data files for testing (#91)
## Description This function creates 3 data files with augmented LOINC test data; 1 file for short names, 1 for long common names, and 1 for display names. ## Related Issues Closes #62 ## Additional Notes
1 parent dc77dc1 commit 54741fb

File tree

7 files changed

+155
-2
lines changed

7 files changed

+155
-2
lines changed

data_curation/augmentation.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import csv
12
import random
23
import re
34
import typing
@@ -9,7 +10,6 @@
910
from utils import path as path
1011

1112
enhancements = path.load_loinc_enhancements()
12-
1313
LOINC_ENHANCEMENTS = normalize.merge_enhancements(enhancements)
1414

1515

@@ -450,3 +450,61 @@ def generate_augmented_examples(
450450
augmented_examples.append(ex_code)
451451

452452
return augmented_examples
453+
454+
455+
def build_augmented_loinc_files(
    input_path: str,
    config: schemas.LoincFileGenerationConfig,
    num_lcn: int = 5,
    num_sn: int = 5,
    num_dn: int = 5,
    output_path_base: str = "./data/training_files/augmented_loinc",
) -> None:
    """
    Generates augmented LOINC data files for the short names, long common
    names, and display names found in a pipe-delimited LOINC name file.

    One output file is produced per name type, named
    ``{output_path_base}_{short_name|long_common_name|display_name}.csv``.
    Each output row is ``loinc_code:base_value:examples`` where ``examples``
    is the list of augmented strings joined with ``|``. The files use ``:``
    as the CSV delimiter and must be read back with a CSV reader so quoted
    fields are handled. Rows are appended, so re-running against the same
    ``output_path_base`` adds to any existing files. The directory that
    contains ``output_path_base`` must already exist.

    :param input_path: The path to the base LOINC name file. Columns are
        expected in the order code, short name, long common name, display
        name, definition, related names (``;``-separated).
    :param config: Per-name-type augmentation configuration, either a mapping
        keyed by ``short_name`` / ``long_common_name`` / ``display_name`` or
        an object exposing attributes with those names.
    :param num_lcn: The number of augmented long common names to generate.
    :param num_sn: The number of augmented short names to generate.
    :param num_dn: The number of augmented display names to generate.
    :param output_path_base: The base path (without suffix) for the output
        files.
    :return: None
    """
    # Key order here fixes the order in which name types are augmented per row.
    num_map = {"short_name": num_sn, "long_common_name": num_lcn, "display_name": num_dn}

    # Buffer the rows per name type so each output file is opened exactly once
    # instead of once per input row.
    rows_by_key: dict = {key: [] for key in num_map}

    with open(input_path, encoding="utf-8") as fp:
        for line_number, line in enumerate(fp):
            # Drop the line terminator so it does not leak into the last column.
            fields = line.rstrip("\r\n").split("|")

            # skip any malformed rows
            if len(fields) < 6:
                continue

            # The source file starts with a column-header row; it is not data.
            if line_number == 0 and fields[0].strip().lower() == "code":
                continue

            loinc_code = fields[0]
            values = {
                "short_name": fields[1],
                "long_common_name": fields[2],
                "display_name": fields[3],
            }
            # Related names are "; "-separated; trim the padding and drop blanks.
            related_names = [
                name.strip() for name in fields[5].split(";") if name.strip()
            ]

            for key, count in num_map.items():
                base_value = values[key]
                # Plain dicts are subscripted; model-style objects (e.g. a
                # pydantic BaseModel) expose the same keys as attributes.
                key_config = config[key] if isinstance(config, dict) else getattr(config, key)
                augmented_examples = generate_augmented_examples(
                    base_value, related_names, count, key_config
                )
                rows_by_key[key].append(
                    [loinc_code, base_value, "|".join(augmented_examples)]
                )

    # Append data to respective files
    # Note: these files should be opened using a CSV reader
    for key, rows in rows_by_key.items():
        if not rows:
            # Nothing to write; do not create an empty file.
            continue
        with open(f"{output_path_base}_{key}.csv", "a", encoding="utf-8", newline="") as fp:
            writer = csv.writer(fp, delimiter=":")  # use ":" instead of default ","
            writer.writerows(rows)

data_curation/configs.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44
A default augmentation configuration meant as a "representative" synthetic
55
baseline for most cases. Slightly favors applying enhancement, but if
6-
enhancement isn't performed, heavily skews towards insertion. This promotes
6+
enhancement isn't performed, heavily skews towards insertion. This promotes
77
variance in semantic meaning in the input, which then undergoes a moderate
88
level of skewing / corruption.
99
"""
@@ -72,3 +72,13 @@
7272
"deletion_prob": 0.2,
7373
},
7474
}
75+
76+
"""
77+
A configuration intended for the generation of augmented LOINC files, with
78+
granular control over the levels of enhancement at the individual type level.
79+
"""
80+
# Maps each LOINC name type to the augmentation settings used for it. All
# three entries currently reference the same DEFAULT_AUGMENTATION object;
# replace an individual entry to tune that name type on its own.
# NOTE(review): the annotation names schemas.LoincFileGenerationConfig while
# the value is a plain dict literal — confirm which form consumers expect.
LOINC_FILE_GENERATION_AUGMENTATION: schemas.LoincFileGenerationConfig = {
    "long_common_name": DEFAULT_AUGMENTATION,
    "short_name": DEFAULT_AUGMENTATION,
    "display_name": DEFAULT_AUGMENTATION,
}

data_curation/schemas/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
from .augmentation import AugmentationConfig
22
from .augmentation import EnhancementType
3+
from .augmentation import LoincFileGenerationConfig
34

45
__all__ = [
56
"AugmentationConfig",
67
"EnhancementType",
8+
"LoincFileGenerationConfig",
79
]

data_curation/schemas/augmentation.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,3 +203,15 @@ class AugmentationConfig(pydantic.BaseModel):
203203
insertion: InsertionOptions = InsertionOptions()
204204
permutation: PermutationOptions = PermutationOptions()
205205
deletion: DeletionOptions = DeletionOptions()
206+
207+
208+
class LoincFileGenerationConfig(pydantic.BaseModel):
    """
    The schema for the configuration options governing how to generate
    synthetic data specifically for LOINC short names, long common names, and
    display names.

    Each field holds an ``AugmentationConfig`` for one name type and defaults
    to ``AugmentationConfig()`` when it is not supplied.
    """

    # Augmentation settings for LOINC short names.
    short_name: AugmentationConfig = AugmentationConfig()
    # Augmentation settings for LOINC long common names.
    long_common_name: AugmentationConfig = AugmentationConfig()
    # Augmentation settings for LOINC display names.
    display_name: AugmentationConfig = AugmentationConfig()
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
code|short_name|long_name|display_name|definition_desc|related_names
2+
109224-6|Weed Allerg Mix3 IgE Msmt Ser|Weed Allergen Mix 3 (Mugwort+Goosefoot or Lambs quarters+English plantain+Goldenrod+Nettle) IgE Ab [Measurement] in Serum|Mugwort+Goosefoot or Lambs quarters+English plantain+Goldenrod+Nettle IgE (S) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|ABS; Aby; Antby; Anti; Antibodies; Antibody; Autoantibodies; Autoantibody; Buckhorn plantain; Engl plantain; English plantain; Golden rod; Goldenrod; Goosefoot; Immune globulin E; Immunoglobulin E; Lab orders; Lamb's quarter; Lambs quarters; Lamb's quarters; Msmt; Mugwort; Nettle; Plantain Buckhorn; Quinoa; Ribwort; Serum; SR; w006; w009; w010; w012; w10; w12; w20; w6; w9; Weed Allerg Mix3; Weed mix; wx3
3+
102115-3|Epid Allerg Mix IgE pl SerPl|Epidermal Allergen Mix (Dog dander+Cat epithelium+Horse dander) Ab.IgE panel - Serum or Plasma|(Dog dander+Cat epithelium+Horse dander) IgE pl||Allergen; Allergens; ALLERGY TESTING; Canine; Dander; Dog hair; e003; e005; e3; e5; Epid Allerg Mix; Epid Allerg Mix IgE pl; Epithelia; Equus spp; Feline; Felis domesticus; Horse dander IgE pnl; Immune globulin E; Immunoglobulin E; Pan; PANEL.ALLERGY; Panl; Pl; Plasma; Plsm; Pnl; Point in time; Random; SerP; SerPl; SerPlas; Serum; Serum or plasma; SR
4+
87549-2|DHVD+DHVD2+DHVD3 Pnl SerPl|1,25-Dihydroxyvitamin D and 1,25-Dihydroxyvitamin D2 and 1,25-Dihydroxyvitamin D3 panel - Serum or Plasma|1,25-dihydroxyvitamin D and 1,25-dihydroxyvitamin D2 and 1,25-dihydroxyvitamin D3 panel|This panel contains quantitative results for 1,25-Dihydroxyvitamin D2 (DHVD2) and 1,25-Dihydroxyvitamin D3 (DHVD3, also known as calcitriol) as well as the total DHVD. This test may be ordered to assess vitamin D status, especially in patients with renal disease, and for patients with clinical evidence of a vitamin D deficiency, such as in patients with vitamin D-dependent rickets due to hereditary deficiency of renal 1-alpha hydroxylase.|1,25(OH)2D; 1,25(OH)2D2; Chemistry; Dextro; DHVD+DHVD2+DHVD3 Pnl; i; Pan; PANEL.CHEMISTRY; Panl; Pl; Plasma; Plsm; Pnl; Point in time; QNT; Quan; Quant; Quantitative; Random; SerP; SerPl; SerPlas; Serum; Serum or plasma; SR
5+
108566-1|THA Msmt Ur|11-Dehydrotetrahydrocorticosterone [Measurement] in Urine|11-Dehydrotetrahydrocorticosterone (U) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|Lab orders; Msmt; THA; UA; UR; Urn
6+
109096-8|11DOC Msmt SerPl|11-Deoxycorticosterone [Measurement] in Serum or Plasma|11-Deoxycorticosterone [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|11DOC; 11-DOC; 21-Hydroxyprogesterone; DOC; Lab orders; Msmt; Pl; Plasma; Plsm; SerP; SerPl; SerPlas; Serum; Serum or plasma; SR
7+
110334-0|11DOC Msmt Ur|11-Deoxycorticosterone [Measurement] in Urine|11-Deoxycorticosterone (U) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|11DOC; 11-DOC; 21-Hydroxyprogesterone; DOC; Lab orders; Msmt; UA; UR; Urn
8+
110615-2|11DC Msmt Ur|11-Deoxycortisol [Measurement] in Urine|11-Deoxycortisol (U) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|11DC; Compound S; Cortexolone; Cortodoxone; Deoxycortisol; Desoxycortisol; Lab orders; Metopirone; Msmt; UA; UR; Urn
9+
110335-7|TH-DOC Msmt Ur|11-Deoxytetrahydrocorticosterone [Measurement] in Urine|11-Deoxytetrahydrocorticosterone (U) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|3 Alpha, 21-dihydroxy-5-beta-pregnan-20-one; Lab orders; Msmt; Tetrahydro-11-deoxycorticosterone; TH DOC; TH-DOC; UA; UR; Urn
10+
110614-5|11OH-Androst Msmt Ur|11-Hydroxyandrosterone [Measurement] in Urine|11-Hydroxyandrosterone (U) [Measurement]|This term is intended to collate similar measurements for the LOINC SNOMED CT Collaboration in an ontological view. Additionally, it can be used to communicate a laboratory order, either alone or in combination with specimen or other information in the order. It may NOT be used to report back the measured patient value. [https://lo.inc/ce2mb6]|11-beta-hydroxyandrosterone; 11-Hydroxy androsterone; 11OH-Androst; Lab orders; Msmt; UA; UR; Urn

tests/unit/conftest.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,3 +32,17 @@ def moto_setup(monkeypatch):
3232
@pytest.fixture(autouse=True)
3333
def fixed_random_seed():
3434
random.seed(42)
35+
36+
37+
@pytest.fixture(scope="function")
def cleanup_tmp_files():
    """
    Provide a scratch ``./tmp`` directory for a test and empty it afterwards.

    The directory is created before the test runs. After the test, every
    entry inside it is removed with ``os.remove`` and the directory itself is
    deleted once it is empty.
    """
    tmp_dir = "./tmp"

    # Setup: make sure the scratch directory is there before the test body runs
    os.makedirs(tmp_dir, exist_ok=True)

    yield

    # Teardown: delete whatever the test left behind
    for leftover in os.listdir(tmp_dir):
        os.remove(os.path.join(tmp_dir, leftover))

    # Remove the scratch directory itself once nothing remains in it
    if len(os.listdir(tmp_dir)) == 0:
        os.rmdir(tmp_dir)

tests/unit/test_augmentation.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import csv
2+
import os
3+
14
import pytest
25

36
from data_curation import augmentation
@@ -223,3 +226,47 @@ class TestGenerateAugmentedTrainingSamples:
223226
def test_generate_augmented_examples(self, text, related_names, num_examples, config, expected):
224227
result = augmentation.generate_augmented_examples(text, related_names, num_examples, config)
225228
assert result == expected
229+
230+
231+
class TestBuildAugmentedLoincFiles:
    def test_build_augmented_loinc_files(self, cleanup_tmp_files):
        """
        Build the three augmented LOINC files from the unit-test asset and
        verify each one exists, is non-empty, and carries the requested number
        of augmented examples on every row.
        """
        # Requested number of augmented examples per name type
        expected_counts = {
            "long_common_name": 2,
            "short_name": 2,
            "display_name": 2,
        }
        config = {
            "long_common_name": AUGMENTATION_WITHOUT_ENHANCEMENT,
            "short_name": AUGMENTATION_WITHOUT_ENHANCEMENT,
            "display_name": AUGMENTATION_WITHOUT_ENHANCEMENT,
        }
        output_base_path = "./tmp/augmented_loinc"

        augmentation.build_augmented_loinc_files(
            input_path="./tests/unit/assets/loinc_lab_names_20250930.csv",
            num_sn=expected_counts["short_name"],
            num_lcn=expected_counts["long_common_name"],
            num_dn=expected_counts["display_name"],
            config=config,
            output_path_base=output_base_path,
        )

        for key, expected in expected_counts.items():
            file_path = f"{output_base_path}_{key}.csv"

            # The file for this name type was created and has content
            assert os.path.exists(file_path)
            assert os.path.getsize(file_path) > 0

            # Every row holds exactly the requested number of augmented examples
            with open(file_path, "r", encoding="utf-8", newline="") as fp:
                for _loinc_code, _base_value, joined_examples in csv.reader(fp, delimiter=":"):
                    assert len(joined_examples.split("|")) == expected

0 commit comments

Comments
 (0)