Skip to content

Commit 13606f6

Browse files
bamaderm-goggins
and authored
Write generate_augmented_examples function (#84)
## Description This PR introduces an "orchestrator" function that takes as input a standard LOINC code string and generates a list of augmented training examples by probabilistically using other augmentation and scrambling functions we have thus far developed. ## Related Issues Closes #63 ## Additional Notes Merging this PR is blocked until we get the enhancement function done (so that I can refactor to add it, as well as test it), but all other functionality and logic is as it should be. Probably most relevant for the time being are the schemas for augmentation configurations and the initial default config objects I'm proposing. --------- Co-authored-by: m-goggins <marcelle.goggins@gmail.com>
1 parent a1962e7 commit 13606f6

File tree

6 files changed

+437
-31
lines changed

6 files changed

+437
-31
lines changed

data_curation/augmentation.py

Lines changed: 104 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,11 @@ def random_char_deletion(
178178
if deletion_count > len(char_indices):
179179
deletion_count = len(char_indices - 1)
180180

181-
####### word method ######
181+
# word method
182182
if method == "word":
183183
delete_indices = _word_deletion(deletion_count, words, words_details, max_per_word)
184184

185-
####### char method ######
185+
# char method
186186
elif method == "char":
187187
delete_indices = _char_deletion(deletion_count, char_indices, words_details, max_per_word)
188188

@@ -348,3 +348,105 @@ def _generate_substrings(words: list[str, list[int]]) -> list[str, list[int]]:
348348
substrings.append([substring, [start_idx, end_idx]])
349349

350350
return substrings
351+
352+
353+
def generate_augmented_examples(
    input_code: str,
    related_names: typing.List[str],
    num_examples: int,
    config: schemas.AugmentationConfig,
) -> typing.List[str]:
    """
    Given a LOINC code string, generate a specified number of augmented
    training examples, which are returned as a list. Each augmented example
    is probabilistically operated on by the scrambling or enhancement
    functions in this module to create a semantically and syntactically
    variant instance. The order of augmentation operations is always
    enhancement, insertion, permutation, then deletion.

    :param input_code: The LOINC code string to generate augmented copies of.
    :param related_names: A list of strings consisting of the LOINC "Related
      Names" field pulled from the LOINC extracts.
    :param num_examples: The number of augmented examples to generate.
    :param config: An Augmentation Configuration object indicating the
      thresholds, options, and probabilities used to modify the example.
    :returns: A list of augmented training examples.
    """
    augmented_examples = []
    for _ in range(num_examples):
        # Working copy of the input; each augmentation step below rewrites
        # ex_code so that the operations compose on one another.
        ex_code = input_code
        performed_enhancement = False

        # "enhancement_all" subsumes the individual enhancement types, so it
        # takes priority when present in the config.
        if "enhancement_all" in config:
            prob = random.uniform(0.0, 1.0)
            if prob <= config["enhancement_all"]["enhancement_prob"]:
                performed_enhancement = True
                # BUGFIX: enhance the working copy (ex_code) rather than the
                # raw input so the pipeline composes consistently.
                ex_code = enhance_loinc_str(
                    text=ex_code,
                    enhancement_type="all",
                    max_enhancements=config["enhancement_all"]["max_enhancements"],
                )
        else:
            if "enhancement_synonyms" in config:
                prob = random.uniform(0.0, 1.0)
                if prob <= config["enhancement_synonyms"]["enhancement_prob"]:
                    performed_enhancement = True
                    ex_code = enhance_loinc_str(
                        text=ex_code,
                        enhancement_type="synonyms",
                        max_enhancements=config["enhancement_synonyms"]["max_enhancements"],
                    )
            if "enhancement_abbreviation" in config:
                prob = random.uniform(0.0, 1.0)
                if prob <= config["enhancement_abbreviation"]["enhancement_prob"]:
                    performed_enhancement = True
                    # BUGFIX: previously passed text=input_code here, which
                    # silently discarded any synonym enhancement applied just
                    # above; operate on ex_code instead.
                    ex_code = enhance_loinc_str(
                        text=ex_code,
                        enhancement_type="abbrv",
                        max_enhancements=config["enhancement_abbreviation"]["max_enhancements"],
                    )

        # Use the right insertion probability threshold.
        # Inserts come after enhancements so that the random index any related
        # names are inserted at doesn't interfere with substring searching
        # for acronyms or abbreviations.
        if performed_enhancement:
            t = config["insertion"]["insert_prob_after_enhance"]
        else:
            t = config["insertion"]["insert_prob_without_enhance"]
        prob = random.uniform(0.0, 1.0)
        if prob <= t:
            ex_code = insert_loinc_related_names(
                ex_code,
                related_names,
                config["insertion"]["max_inserts"],
                config["insertion"]["min_inserts"],
            )

        # Next come permutations, if applicable; no risk of interference
        # with deletions, but they have to come after enhancements for the
        # same reasons as insertions, and insertions have priority as the
        # only other mechanism to insert new semantic meaning.
        prob = random.uniform(0.0, 1.0)
        if prob <= config["permutation"]["swap_prob"]:
            ex_code = scramble_word_order(
                ex_code, config["permutation"]["max_swaps"], config["permutation"]["min_swaps"]
            )

        # Last come the deletions: must be the final operation because
        # they're syntactically destructive, and other operations depend on
        # the full syntax of each token.
        prob = random.uniform(0.0, 1.0)
        if prob <= config["deletion"]["deletion_prob"]:
            ex_code = random_char_deletion(
                ex_code,
                config["deletion"]["min_deletes"],
                config["deletion"]["max_deletes"],
                config["deletion"]["max_deletes_per_word"],
                config["deletion"]["deletion_mode"],
            )

        augmented_examples.append(ex_code)

    return augmented_examples

data_curation/configs.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import data_curation.schemas.augmentation as schemas

# NOTE(review): the enhancement sub-config keys below were renamed from
# "min_enhances"/"max_enhances" to "min_enhancements"/"max_enhancements"
# because generate_augmented_examples looks up
# config[...]["max_enhancements"]; the old spelling raised KeyError at
# runtime. Confirm the spelling against schemas.AugmentationConfig.

# A default augmentation configuration meant as a "representative" synthetic
# baseline for most cases. Slightly favors applying enhancement, but if
# enhancement isn't performed, heavily skews towards insertion. This promotes
# variance in semantic meaning in the input, which then undergoes a moderate
# level of skewing / corruption.
DEFAULT_AUGMENTATION: schemas.AugmentationConfig = {
    "enhancement_all": {"min_enhancements": 1, "max_enhancements": 4, "enhancement_prob": 0.6},
    "insertion": {
        "min_inserts": 1,
        "max_inserts": 3,
        "insert_prob_after_enhance": 0.3,
        "insert_prob_without_enhance": 0.7,
    },
    "permutation": {"min_swaps": 1, "max_swaps": 3, "swap_prob": 0.3},
    "deletion": {
        "deletion_mode": "char",
        "min_deletes": 1,
        "max_deletes": 6,
        "max_deletes_per_word": 2,
        "deletion_prob": 0.4,
    },
}


# An augmentation configuration meant for cases in which applying enhancement
# is undesirable or enhancement data is too sparse. Insertions are all but
# guaranteed with this configuration as a means to encourage semantic and
# syntactic variety. Since insertion is index-random, permutation swaps are
# less important than might otherwise seem, but sporadic deletion becomes
# more important to break up character-cluster memoization.
AUGMENTATION_WITHOUT_ENHANCEMENT: schemas.AugmentationConfig = {
    "insertion": {"min_inserts": 1, "max_inserts": 3, "insert_prob_without_enhance": 1.0},
    "permutation": {"min_swaps": 1, "max_swaps": 3, "swap_prob": 0.3},
    "deletion": {
        "deletion_mode": "char",
        "min_deletes": 1,
        "max_deletes": 8,
        "max_deletes_per_word": 2,
        "deletion_prob": 0.6,
    },
}


# A configuration intended for granular control over the levels of enhancement
# and semantic variance performed. This config is a good candidate for
# simulating higher levels of "human shorthand" by preferencing synthetic data
# towards syntactic rather than semantic substitution. Deletion is
# down-weighted here to not interfere with synonym and abbreviation usage.
AUGMENTATION_INDIVIDUALLY_SPECIFIED: schemas.AugmentationConfig = {
    "enhancement_abbreviation": {
        "min_enhancements": 1,
        "max_enhancements": 3,
        "enhancement_prob": 0.8,
    },
    "enhancement_synonyms": {
        "min_enhancements": 1,
        "max_enhancements": 2,
        "enhancement_prob": 0.4,
    },
    "insertion": {
        "min_inserts": 1,
        "max_inserts": 2,
        "insert_prob_after_enhance": 0.4,
        "insert_prob_without_enhance": 0.8,
    },
    "permutation": {"min_swaps": 1, "max_swaps": 3, "swap_prob": 0.4},
    "deletion": {
        "deletion_mode": "word",
        "min_deletes": 1,
        "max_deletes": 5,
        "max_deletes_per_word": 1,
        "deletion_prob": 0.2,
    },
}

data_curation/schemas/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
# Public re-exports for the schemas subpackage.
from .augmentation import AugmentationConfig, EnhancementType

__all__ = [
    "AugmentationConfig",
    "EnhancementType",
]

0 commit comments

Comments
 (0)