@@ -178,11 +178,11 @@ def random_char_deletion(
178178 if deletion_count > len (char_indices ):
179179 deletion_count = len (char_indices - 1 )
180180
181- ####### word method ######
181+ # word method
182182 if method == "word" :
183183 delete_indices = _word_deletion (deletion_count , words , words_details , max_per_word )
184184
185- ####### char method ######
185+ # char method
186186 elif method == "char" :
187187 delete_indices = _char_deletion (deletion_count , char_indices , words_details , max_per_word )
188188
@@ -348,3 +348,105 @@ def _generate_substrings(words: list[str, list[int]]) -> list[str, list[int]]:
348348 substrings .append ([substring , [start_idx , end_idx ]])
349349
350350 return substrings
351+
352+
def generate_augmented_examples(
    input_code: str,
    related_names: typing.List[str],
    num_examples: int,
    config: schemas.AugmentationConfig,
):
    """
    Produce a list of randomly augmented variants of a LOINC code string.

    Every variant starts from ``input_code`` and passes through up to four
    independent, probability-gated stages, always in this order:

    1. enhancement (``enhance_loinc_str``),
    2. related-name insertion (``insert_loinc_related_names``),
    3. word-order permutation (``scramble_word_order``),
    4. character deletion (``random_char_deletion``).

    Each stage draws its own uniform random number in [0, 1] and runs only
    when that draw is less than or equal to the probability configured for
    the stage.

    :param input_code: The LOINC code string to generate augmented copies of.
    :param related_names: A list of strings consisting of the LOINC "Related
      Names" field pulled from the SNOINC extracts; forwarded to the
      insertion stage.
    :param num_examples: The number of augmented examples to generate.
    :param config: An Augmentation Configuration object, accessed like a
      mapping. The ``"insertion"``, ``"permutation"`` and ``"deletion"``
      sections are read unconditionally for every example; the
      ``"enhancement_all"``, ``"enhancement_synonyms"`` and
      ``"enhancement_abbreviation"`` sections are optional.
    :returns: A list of augmented training examples, one per requested
      example.
    """
    # Decide once which enhancement sections are in play. When the combined
    # "enhancement_all" section is configured it is the only one consulted;
    # otherwise the individual synonym / abbreviation sections (if present)
    # are each rolled separately, in that order.
    if "enhancement_all" in config:
        enhancement_stages = [("enhancement_all", "all")]
    else:
        enhancement_stages = [
            (section, kind)
            for section, kind in (
                ("enhancement_synonyms", "synonyms"),
                ("enhancement_abbreviation", "abbrv"),
            )
            if section in config
        ]

    results = []
    for _ in range(num_examples):
        variant = input_code
        was_enhanced = False

        # Stage 1: enhancement.
        # NOTE(review): every enhancement call is fed the untouched
        # ``input_code`` rather than the running variant, so when both the
        # synonym and abbreviation rolls succeed only the abbreviation
        # output is kept. Confirm whether that is intended.
        for section, kind in enhancement_stages:
            roll = random.uniform(0.0, 1.0)
            settings = config[section]
            if roll <= settings["enhancement_prob"]:
                was_enhanced = True
                variant = enhance_loinc_str(
                    text=input_code,
                    enhancement_type=kind,
                    max_enhancements=settings["max_enhancements"],
                )

        # Stage 2: insertion. The probability threshold depends on whether
        # an enhancement was applied. Insertion runs after enhancement so
        # that the random index at which related names are inserted doesn't
        # interfere with substring searching for acronyms or abbreviations.
        insertion = config["insertion"]
        threshold_key = (
            "insert_prob_after_enhance" if was_enhanced else "insert_prob_without_enhance"
        )
        insert_threshold = insertion[threshold_key]
        roll = random.uniform(0.0, 1.0)
        if roll <= insert_threshold:
            variant = insert_loinc_related_names(
                variant,
                related_names,
                insertion["max_inserts"],
                insertion["min_inserts"],
            )

        # Stage 3: permutation. It has to follow enhancement for the same
        # reason insertion does, and it yields to insertion because
        # insertion is the only other mechanism that adds new semantic
        # meaning. It cannot interfere with deletions.
        roll = random.uniform(0.0, 1.0)
        permutation = config["permutation"]
        if roll <= permutation["swap_prob"]:
            variant = scramble_word_order(
                variant,
                permutation["max_swaps"],
                permutation["min_swaps"],
            )

        # Stage 4: deletion. Always last, because it is syntactically
        # destructive and the earlier stages rely on the full syntax of
        # each token.
        roll = random.uniform(0.0, 1.0)
        deletion = config["deletion"]
        if roll <= deletion["deletion_prob"]:
            variant = random_char_deletion(
                variant,
                deletion["min_deletes"],
                deletion["max_deletes"],
                deletion["max_deletes_per_word"],
                deletion["deletion_mode"],
            )

        results.append(variant)

    return results
0 commit comments