Skip to content

Commit 13606f6

Browse files
bamaderm-goggins
and authored
Write generate_augmented_examples function (#84)
## Description This PR introduces an "orchestrator" function that takes as input a standard LOINC code string and generates a list of augmented training examples by probabilistically using other augmentation and scrambling functions we have thus far developed. ## Related Issues Closes #63 ## Additional Notes Merging this PR is blocked until we get the enhancement function done (so that I can refactor to add it, as well as test it), but all other functionality and logic is as it should be. Probably most relevant for the time being are the schemas for augmentation configurations and the initial default config objects I'm proposing. --------- Co-authored-by: m-goggins <marcelle.goggins@gmail.com>
1 parent a1962e7 commit 13606f6

File tree

6 files changed

+437
-31
lines changed

6 files changed

+437
-31
lines changed

data_curation/augmentation.py

Lines changed: 104 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,11 @@ def random_char_deletion(
178178
if deletion_count > len(char_indices):
179179
deletion_count = len(char_indices - 1)
180180

181-
####### word method ######
181+
# word method
182182
if method == "word":
183183
delete_indices = _word_deletion(deletion_count, words, words_details, max_per_word)
184184

185-
####### char method ######
185+
# char method
186186
elif method == "char":
187187
delete_indices = _char_deletion(deletion_count, char_indices, words_details, max_per_word)
188188

@@ -348,3 +348,105 @@ def _generate_substrings(words: list[str, list[int]]) -> list[str, list[int]]:
348348
substrings.append([substring, [start_idx, end_idx]])
349349

350350
return substrings
351+
352+
353+
def generate_augmented_examples(
    input_code: str,
    related_names: typing.List[str],
    num_examples: int,
    config: schemas.AugmentationConfig,
) -> typing.List[str]:
    """
    Given a LOINC code string, generate a specified number of augmented
    training examples, which are returned as a list. Each augmented example
    is probabilistically operated on by the scrambling or enhancement
    functions in this module to create a semantically and syntactically
    variant instance. The order of augmentation operations is always
    enhancement, insertion, permutation, then deletion.

    :param input_code: The LOINC code string to generate augmented copies of.
    :param related_names: A list of strings consisting of the LOINC "Related
      Names" field pulled from the LOINC extracts.
    :param num_examples: The number of augmented examples to generate.
    :param config: An Augmentation Configuration object indicating the
      thresholds, options, and probabilities used to modify the example.
    :returns: A list of augmented training examples.
    """
    augmented_examples = []
    for _ in range(num_examples):
        # Working copy of the input; each augmentation step below rewrites
        # ex_code so that the operations compose on one another.
        ex_code = input_code
        performed_enhancement = False

        # "enhancement_all" subsumes the individual enhancement types, so it
        # takes priority when present in the config.
        if "enhancement_all" in config:
            prob = random.uniform(0.0, 1.0)
            if prob <= config["enhancement_all"]["enhancement_prob"]:
                performed_enhancement = True
                # BUGFIX: enhance the working copy (ex_code) rather than the
                # raw input so the pipeline composes consistently.
                ex_code = enhance_loinc_str(
                    text=ex_code,
                    enhancement_type="all",
                    max_enhancements=config["enhancement_all"]["max_enhancements"],
                )
        else:
            if "enhancement_synonyms" in config:
                prob = random.uniform(0.0, 1.0)
                if prob <= config["enhancement_synonyms"]["enhancement_prob"]:
                    performed_enhancement = True
                    ex_code = enhance_loinc_str(
                        text=ex_code,
                        enhancement_type="synonyms",
                        max_enhancements=config["enhancement_synonyms"]["max_enhancements"],
                    )
            if "enhancement_abbreviation" in config:
                prob = random.uniform(0.0, 1.0)
                if prob <= config["enhancement_abbreviation"]["enhancement_prob"]:
                    performed_enhancement = True
                    # BUGFIX: previously passed text=input_code here, which
                    # silently discarded any synonym enhancement applied just
                    # above; operate on ex_code instead.
                    ex_code = enhance_loinc_str(
                        text=ex_code,
                        enhancement_type="abbrv",
                        max_enhancements=config["enhancement_abbreviation"]["max_enhancements"],
                    )

        # Use the right insertion probability threshold.
        # Inserts come after enhancements so that the random index any related
        # names are inserted at doesn't interfere with substring searching
        # for acronyms or abbreviations.
        if performed_enhancement:
            t = config["insertion"]["insert_prob_after_enhance"]
        else:
            t = config["insertion"]["insert_prob_without_enhance"]
        prob = random.uniform(0.0, 1.0)
        if prob <= t:
            ex_code = insert_loinc_related_names(
                ex_code,
                related_names,
                config["insertion"]["max_inserts"],
                config["insertion"]["min_inserts"],
            )

        # Next come permutations, if applicable; no risk of interference
        # with deletions, but they have to come after enhancements for the
        # same reasons as insertions, and insertions have priority as the
        # only other mechanism to insert new semantic meaning.
        prob = random.uniform(0.0, 1.0)
        if prob <= config["permutation"]["swap_prob"]:
            ex_code = scramble_word_order(
                ex_code, config["permutation"]["max_swaps"], config["permutation"]["min_swaps"]
            )

        # Last come the deletions: must be the final operation because
        # they're syntactically destructive, and other operations depend on
        # the full syntax of each token.
        prob = random.uniform(0.0, 1.0)
        if prob <= config["deletion"]["deletion_prob"]:
            ex_code = random_char_deletion(
                ex_code,
                config["deletion"]["min_deletes"],
                config["deletion"]["max_deletes"],
                config["deletion"]["max_deletes_per_word"],
                config["deletion"]["deletion_mode"],
            )

        augmented_examples.append(ex_code)

    return augmented_examples

data_curation/configs.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import data_curation.schemas.augmentation as schemas

# NOTE(review): the enhancement sub-config keys below were renamed from
# "min_enhances"/"max_enhances" to "min_enhancements"/"max_enhancements"
# because generate_augmented_examples looks up
# config[...]["max_enhancements"]; the old spelling raised KeyError at
# runtime. Confirm the spelling against schemas.AugmentationConfig.

# A default augmentation configuration meant as a "representative" synthetic
# baseline for most cases. Slightly favors applying enhancement, but if
# enhancement isn't performed, heavily skews towards insertion. This promotes
# variance in semantic meaning in the input, which then undergoes a moderate
# level of skewing / corruption.
DEFAULT_AUGMENTATION: schemas.AugmentationConfig = {
    "enhancement_all": {"min_enhancements": 1, "max_enhancements": 4, "enhancement_prob": 0.6},
    "insertion": {
        "min_inserts": 1,
        "max_inserts": 3,
        "insert_prob_after_enhance": 0.3,
        "insert_prob_without_enhance": 0.7,
    },
    "permutation": {"min_swaps": 1, "max_swaps": 3, "swap_prob": 0.3},
    "deletion": {
        "deletion_mode": "char",
        "min_deletes": 1,
        "max_deletes": 6,
        "max_deletes_per_word": 2,
        "deletion_prob": 0.4,
    },
}


# An augmentation configuration meant for cases in which applying enhancement
# is undesirable or enhancement data is too sparse. Insertions are all but
# guaranteed with this configuration as a means to encourage semantic and
# syntactic variety. Since insertion is index-random, permutation swaps are
# less important than might otherwise seem, but sporadic deletion becomes
# more important to break up character-cluster memoization.
AUGMENTATION_WITHOUT_ENHANCEMENT: schemas.AugmentationConfig = {
    "insertion": {"min_inserts": 1, "max_inserts": 3, "insert_prob_without_enhance": 1.0},
    "permutation": {"min_swaps": 1, "max_swaps": 3, "swap_prob": 0.3},
    "deletion": {
        "deletion_mode": "char",
        "min_deletes": 1,
        "max_deletes": 8,
        "max_deletes_per_word": 2,
        "deletion_prob": 0.6,
    },
}


# A configuration intended for granular control over the levels of enhancement
# and semantic variance performed. This config is a good candidate for
# simulating higher levels of "human shorthand" by preferencing synthetic data
# towards syntactic rather than semantic substitution. Deletion is
# down-weighted here to not interfere with synonym and abbreviation usage.
AUGMENTATION_INDIVIDUALLY_SPECIFIED: schemas.AugmentationConfig = {
    "enhancement_abbreviation": {
        "min_enhancements": 1,
        "max_enhancements": 3,
        "enhancement_prob": 0.8,
    },
    "enhancement_synonyms": {
        "min_enhancements": 1,
        "max_enhancements": 2,
        "enhancement_prob": 0.4,
    },
    "insertion": {
        "min_inserts": 1,
        "max_inserts": 2,
        "insert_prob_after_enhance": 0.4,
        "insert_prob_without_enhance": 0.8,
    },
    "permutation": {"min_swaps": 1, "max_swaps": 3, "swap_prob": 0.4},
    "deletion": {
        "deletion_mode": "word",
        "min_deletes": 1,
        "max_deletes": 5,
        "max_deletes_per_word": 1,
        "deletion_prob": 0.2,
    },
}

data_curation/schemas/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
# Public re-exports for the schemas subpackage.
from .augmentation import AugmentationConfig, EnhancementType

__all__ = [
    "AugmentationConfig",
    "EnhancementType",
]

0 commit comments

Comments
 (0)