CDCgov
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎data_curation/augmentation.py‎
Lines changed: 138 additions & 2 deletions b/‎data_curation/augmentation.py‎
Lines changed: 138 additions & 2 deletions
diff --git a/‎data_curation/schemas/__init__.py‎
Lines changed: 5 additions & 0 deletions b/‎data_curation/schemas/__init__.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎data_curation/schemas/augmentation.py‎
Lines changed: 14 additions & 0 deletions b/‎data_curation/schemas/augmentation.py‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎tests/unit/test_augmentation.py‎
Lines changed: 105 additions & 0 deletions b/‎tests/unit/test_augmentation.py‎
Lines changed: 105 additions & 0 deletions
@@ -38,3 +38,6 @@ coverage.xml
 # macOS
 **/.DS_Store
 /.vscode
+
+# Training files
+data/training_files/
@@ -1,7 +1,16 @@
 import random
 import re
 import typing
-from typing import Tuple
+
+import pydantic
+
+import data_curation.schemas.augmentation as schemas
+from utils import normalize as normalize
+from utils import path as path
+
+enhancements = path.load_loinc_enhancements()
+
+LOINC_ENHANCEMENTS = normalize.merge_enhancements(enhancements)
 
 
 def scramble_word_order(
@@ -67,7 +76,7 @@ def _word_deletion(
     return delete_indices
 
 
-def _get_word_detail_by_char_range(word_details: dict, char_idx: int) -> Tuple[int, dict]:
+def _get_word_detail_by_char_range(word_details: dict, char_idx: int) -> typing.Tuple[int, dict]:
     for key, word_deets in word_details.items():
         if char_idx in range(int(word_deets["start"]), int(word_deets["end"])):
             return int(key), word_deets
@@ -212,3 +221,130 @@ def insert_loinc_related_names(
         words.insert(idx_to_insert, name_to_insert)
 
     return " ".join(words)
+
+
@pydantic.validate_call
def enhance_loinc_str(
    text: str,
    enhancement_type: schemas.EnhancementType,
    max_enhancements: int,
    min_enhancements: int = 1,
) -> str:
    """
    Enhances the input text by applying specified enhancement techniques.

    Individual words are tried first. Only when no individual word can be enhanced
    are contiguous multi-word phrases considered; a matching phrase is then replaced
    as a whole, in place, so the surrounding words are preserved.

    :param text: The input text to enhance.
    :param enhancement_type: The type of enhancement to apply. Options are:
        - "abbrv": Replace words with their abbreviations.
        - "synonyms": Replace words with semantically related terms.
        - "all": Apply all of the above techniques.
    :param max_enhancements: The maximum number of enhancements to apply.
    :param min_enhancements: The minimum number of enhancements to apply. Fewer are
        attempted when the text contains fewer enhanceable candidates.
    :return: The enhanced text (input words are lowercased and re-joined with single
        spaces), or the original text unchanged when nothing can be enhanced.
    :raises ValueError: If max_enhancements is not greater than min_enhancements.
    """
    if max_enhancements <= min_enhancements:
        raise ValueError("max_enhancements must be greater than min_enhancements")

    words = [[word.lower().strip(), [i]] for i, word in enumerate(text.split())]

    # Prefer single-word candidates; fall back to multi-word phrases only when
    # no single word can be enhanced.
    candidates = _check_for_enhancements(words)
    use_phrases = not candidates
    if use_phrases:
        candidates = _check_for_enhancements(_generate_substrings(words))

    if not candidates:
        return text

    # Determine number of enhancements to apply
    if len(candidates) < min_enhancements:
        num_enhancements = len(candidates)
    else:
        num_enhancements = random.randint(
            min_enhancements, min(max_enhancements, len(candidates))
        )

    if not use_phrases:
        words = _apply_enhancements(words, candidates, enhancement_type, num_enhancements)
        return " ".join(w[0] for w in words)

    # Phrase candidates map to [start, end) spans over `words`, not to a single
    # position. Give every phrase its own scratch slot so _apply_enhancements can
    # pick and rewrite phrases, then splice the rewritten ones back over their spans.
    phrases = list(candidates)
    slots = [[phrase, [slot]] for slot, phrase in enumerate(phrases)]
    slot_lookup = {phrase: [slot] for slot, phrase in enumerate(phrases)}
    slots = _apply_enhancements(slots, slot_lookup, enhancement_type, num_enhancements)

    replacements = sorted(
        (candidates[phrase], new_text)
        for phrase, (new_text, _) in zip(phrases, slots)
        if new_text != phrase
    )

    tokens = [word for word, _ in words]
    enhanced = []
    cursor = 0
    for (start, end), new_text in replacements:
        if start < cursor:
            # Overlaps a phrase that has already been replaced; keep the earlier one.
            continue
        enhanced.extend(tokens[cursor:start])
        enhanced.append(new_text)
        cursor = end
    enhanced.extend(tokens[cursor:])

    return " ".join(enhanced)
+
+
def _apply_enhancements(
    words: list[list],
    possible_words_to_enhance: dict[str, list[int]],
    enhancement_type: typing.Annotated[schemas.EnhancementType, pydantic.Field()],
    num_enhancements: int,
) -> list[list]:
    """
    Replaces randomly chosen enhanceable words with one of their enhancements.

    :param words: The list of ``[word, [index]]`` entries for the input text. It is
        modified in place.
    :param possible_words_to_enhance: A dictionary mapping each enhanceable word to
        its index entry; the first index is the position in ``words`` to overwrite.
    :param enhancement_type: The type of enhancement to apply ("abbrv", "synonyms"
        or "all").
    :param num_enhancements: The number of words to attempt to enhance. A chosen word
        that has no enhancement of the requested type is skipped, so fewer
        replacements than this may be made.
    :return: The same ``words`` list, with the chosen entries replaced.
    """
    # Work on a copy so the caller's candidate mapping is not consumed.
    remaining = dict(possible_words_to_enhance)

    # Never try to draw more words than there are candidates.
    for _ in range(min(num_enhancements, len(remaining))):
        word_to_enhance = random.choice(list(remaining.keys()))
        word_to_enhance_idx = remaining.pop(word_to_enhance)

        possible_enhancements = LOINC_ENHANCEMENTS[word_to_enhance]

        # Resolve the concrete type per word. The requested type must not be
        # overwritten, otherwise "all" would be locked to the first random pick
        # for every later word.
        if enhancement_type == "all":
            # Randomly choose between abbreviation and synonyms
            chosen_type = random.choice(["abbrv", "synonyms"])
            # If there are no enhancements of the chosen type, switch to the other type
            if not possible_enhancements.get(chosen_type):
                chosen_type = "abbrv" if chosen_type == "synonyms" else "synonyms"
        else:
            chosen_type = enhancement_type

        options = possible_enhancements.get(chosen_type)
        if not options:
            continue

        words[word_to_enhance_idx[0]][0] = random.choice(options)

    return words
+
+
def _check_for_enhancements(words: list[list]) -> dict[str, list[int]]:
    """
    Checks the list of words for possible enhancements based on the LOINC_ENHANCEMENTS dictionary.

    :param words: List of ``(word, index entry)`` pairs to check for enhancements.
    :return: A dictionary with the words that can be enhanced as keys and their index
        entries as values. When a word occurs more than once, the index entry of its
        last occurrence is kept.
    """
    possible_words_to_enhance = {}

    for word, idx in words:
        if word not in LOINC_ENHANCEMENTS:
            continue

        # Only add if there are enhancements available
        available = LOINC_ENHANCEMENTS[word]
        if available.get("abbrv") or available.get("synonyms"):
            possible_words_to_enhance[word] = idx

    return possible_words_to_enhance
+
+
+def _generate_substrings(words: list[str, list[int]]) -> list[str, list[int]]:
+    """
+    Generates all possible substrings of the input list of words with at least 2 words
+    per substring.
+
+    :param words: List of words, including their indices, to generate substrings from.
+    :return: List of substrings, including their indices.
+    """
+    substrings = []
+    for start_idx in range(len(words)):
+        for end_idx in range(start_idx + 2, len(words) + 1):  # ensures at least 2 words
+            substring = " ".join(word for word, _ in words[start_idx:end_idx])
+            substrings.append([substring, [start_idx, end_idx]])
+
+    return substrings
@@ -0,0 +1,5 @@
+from .augmentation import EnhancementType
+
+__all__ = [
+    "EnhancementType",
+]
@@ -0,0 +1,14 @@
+"""
+data_curation.schemas.augmentation
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This module contains the schema definitions for the augmented data.
+"""
+
+import enum
+
+
@enum.unique
class EnhancementType(str, enum.Enum):
    """String-valued choices for the kind of text enhancement to apply."""

    # Replace words with their abbreviations.
    ABBRV = "abbrv"
    # Replace words with semantically related terms.
    SYNONYMS = "synonyms"
    # Allow either of the techniques above.
    ALL = "all"
@@ -85,3 +85,108 @@ def test_insert_loinc_related_names(self, text, loinc_names, max_inserts, expect
             text, loinc_names, min_inserts=2, max_inserts=max_inserts
         )
         assert result == expected
+
+
class TestCheckForEnhancements:
    """Checks which words ``_check_for_enhancements`` reports as enhanceable."""

    @pytest.mark.parametrize(
        "words, expected",
        [
            # Test case 1: Only one of the words has enhancements
            ([("blood", [0]), ("glucose", [1]), ("measurement", [2])], {"glucose": [1]}),
            # Test case 2: No words with enhancements
            ([("this", [0]), ("term", [1]), ("has", [2]), ("no", [3]), ("enhancements", [4])], {}),
            # Test case 3: More than one word with enhancements
            ([("blood", [0]), ("glucose", [1]), ("zscore", [2])], {"glucose": [1], "zscore": [2]}),
        ],
    )
    def test_check_for_enhancements(self, words, expected):
        enhanceable = augmentation._check_for_enhancements(words)
        assert enhanceable == expected
+
+
class TestGenerateSubstrings:
    """Checks the multi-word substrings produced by ``_generate_substrings``."""

    @pytest.mark.parametrize(
        "words, expected",
        [
            # Test case 1: Typical case with multiple words
            (
                [["blood", [0]], ["glucose", [1]], ["measurement", [2]]],
                [
                    ["blood glucose", [0, 2]],
                    ["blood glucose measurement", [0, 3]],
                    ["glucose measurement", [1, 3]],
                ],
            ),
            # Test case 2: Single word (no substrings possible)
            ([["blood", [0]]], []),
            # Test case 3: Two words
            ([["blood", [0]], ["glucose", [1]]], [["blood glucose", [0, 2]]]),
        ],
    )
    def test_generate_substrings(self, words, expected):
        substrings = augmentation._generate_substrings(words)
        assert substrings == expected
+
+
class TestEnhanceLoinc:
    """End-to-end checks of ``enhance_loinc_str`` on short LOINC-style strings."""

    # NOTE(review): the expected strings depend on random choices made inside
    # enhance_loinc_str; presumably the random state is fixed elsewhere - confirm.
    @pytest.mark.parametrize(
        "text, enhancement_type, max_enhancements, min_enhancements, expected",
        [
            # Test case 1: Basic enhancement with a single synonym
            (
                "Blood Glucose Measurement",
                "synonyms",
                2,
                1,
                "blood glucoseur measurement",
            ),
            # Test case 2: Enhancement with "all" replacements
            (
                "Blood Glucose Measurement",
                "all",
                2,
                1,
                "blood gluc measurement",
            ),
            # Test case 3: No possible enhancements
            (
                "This term has no enhancements",
                "abbrv",
                2,
                1,
                "This term has no enhancements",
            ),
            # Test case 4: "all" replacements with a larger min/max range
            (
                "Blood Glucose Measurement",
                "all",
                4,
                2,
                "blood glucoseur measurement",
            ),
        ],
    )
    def test_enhance_loinc(
        self, text, enhancement_type, max_enhancements, min_enhancements, expected
    ):
        enhanced = augmentation.enhance_loinc_str(
            text,
            enhancement_type=enhancement_type,
            max_enhancements=max_enhancements,
            min_enhancements=min_enhancements,
        )

        assert enhanced == expected
+
+
class TestEnhanceLoincError:
    """Checks that ``enhance_loinc_str`` rejects an invalid min/max range."""

    def test_enhance_loinc_str_raise_error(self):
        # max_enhancements (1) is below min_enhancements (3), so the call must fail.
        with pytest.raises(ValueError):
            augmentation.enhance_loinc_str(
                text="Blood Glucose Measurement",
                enhancement_type="abbrv",
                max_enhancements=1,
                min_enhancements=3,
            )