Skip to content

Commit ad49c05

Browse files
sharanvamsi authored and facebook-github-bot committed
Updated method and encoder params to accept strings
Summary: Changed functional.py to accept strings for the method and encoder params. This addresses the comments from D76369496. Added enum type checking to be done in functional for encoder checking, and method checking in encode_text_strategy. Updated all dependent files as well. Reviewed By: bclarkson-code. Differential Revision: D76706716. fbshipit-source-id: ed7431a6b335466858ef454490e9aa7c1ac0573f
1 parent ec9d07e commit ad49c05

File tree

10 files changed

+71
-111
lines changed

10 files changed

+71
-111
lines changed

augly/tests/assets/expected_metadata/text_tests/expected_metadata.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -95,7 +95,7 @@
9595
"n": 1,
9696
"p": 1.0,
9797
"encoder": "base64",
98-
"method": "sentence"
98+
"granularity": "all"
9999
}
100100
],
101101
"get_baseline": [

augly/tests/text_tests/functional_unit_test.py

Lines changed: 4 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -10,9 +10,7 @@
1010
import unittest
1111

1212
from augly import text as txtaugs
13-
from augly.text.augmenters.utils import Encoding
1413
from augly.utils import FUN_FONTS_GREEK_PATH
15-
from nlpaug.util import Method
1614

1715

1816
class FunctionalTextUnitTest(unittest.TestCase):
@@ -40,22 +38,16 @@ def test_apply_lambda(self) -> None:
4038

4139
def test_base64_sentence(self) -> None:
4240
augmented_words = txtaugs.encode_text(
43-
"Hello, world!", 1, 1, 1.0, Method.SENTENCE, Encoding.BASE64
41+
"Hello, world!", 1, 1, 1.0, "all", "base64"
4442
)
4543
self.assertEqual(augmented_words[0], "SGVsbG8sIHdvcmxkIQ==")
4644

4745
def test_base64_word(self) -> None:
4846
augmented_words_word = txtaugs.encode_text(
49-
"Hello, world!", 1, 1, 1.0, Method.WORD, Encoding.BASE64
47+
"Hello, world!", 1, 1, 1.0, "word", "base64"
5048
)
5149
self.assertEqual(augmented_words_word[0], "SGVsbG8=, world!")
5250

53-
def test_base64_char(self) -> None:
54-
augmented_words_char = txtaugs.encode_text(
55-
"Hello, world!", 1, 1, 1.0, Method.CHAR, Encoding.BASE64
56-
)
57-
self.assertEqual(augmented_words_char[0], "SA==ello LA== dw==orld IQ==")
58-
5951
def test_change_case(self) -> None:
6052
augmented_words = txtaugs.change_case(self.texts[0], cadence=3.0, case="upper")
6153
self.assertTrue(
@@ -274,13 +266,13 @@ def test_insert_zero_width_chars(self) -> None:
274266

275267
def test_leetspeak_sentence(self) -> None:
276268
augmented_words = txtaugs.encode_text(
277-
"Hello, world!", 1, 1, 1.0, Method.SENTENCE, Encoding.LEETSPEAK
269+
"Hello, world!", 1, 1, 1.0, "all", "leetspeak"
278270
)
279271
self.assertEqual(augmented_words[0], "h3110, w0r1d!")
280272

281273
def test_leetspeak_word(self) -> None:
282274
augmented_words = txtaugs.encode_text(
283-
"Hello, world!", 1, 1, 1.0, Method.WORD, Encoding.LEETSPEAK
275+
"Hello, world!", 1, 1, 1.0, "word", "leetspeak"
284276
)
285277
self.assertEqual(augmented_words[0], "h3110, world!")
286278

augly/tests/text_tests/transforms_unit_test.py

Lines changed: 12 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,7 @@
1414
from typing import Any, Dict, List
1515

1616
from augly import text as txtaugs
17-
from augly.text.augmenters.utils import Encoding
1817
from augly.utils import TEXT_METADATA_PATH
19-
from nlpaug.util import Method
2018

2119

2220
def are_equal_metadata(
@@ -143,17 +141,15 @@ def test_Base64_Sentence(self) -> None:
143141
aug_min=1,
144142
aug_max=1,
145143
aug_p=1.0,
146-
method=Method.SENTENCE,
147-
encoder=Encoding.BASE64,
144+
granularity="all",
145+
encoder="base64",
148146
n=1,
149147
p=1.0,
150148
)(
151149
["Hello, world!"],
152150
metadata=self.metadata,
153151
)
154-
155152
self.assertTrue(augmented_text[0] == "SGVsbG8sIHdvcmxkIQ==")
156-
self.expected_metadata["encode_text"][0]["encoder"] = Encoding.BASE64
157153
self.assertTrue(
158154
are_equal_metadata(self.metadata, self.expected_metadata["encode_text"])
159155
)
@@ -165,8 +161,8 @@ def test_Base64_Word(self) -> None:
165161
aug_min=1,
166162
aug_max=1,
167163
aug_p=1.0,
168-
method=Method.WORD,
169-
encoder=Encoding.BASE64,
164+
granularity="word",
165+
encoder="base64",
170166
n=1,
171167
p=1.0,
172168
)(
@@ -176,32 +172,9 @@ def test_Base64_Word(self) -> None:
176172
self.assertEqual(augmented_text[0], "SGVsbG8=, world!")
177173

178174
metadata_expected = deepcopy(self.expected_metadata["encode_text"])
179-
metadata_expected[0]["method"] = "word"
180-
metadata_expected[0]["encoder"] = Encoding.BASE64
175+
metadata_expected[0]["granularity"] = "word"
181176
self.assertTrue(are_equal_metadata(self.metadata, metadata_expected))
182177

183-
def test_Base64_Char(self) -> None:
184-
self.metadata = []
185-
186-
augmented_text = txtaugs.EncodeTextTransform(
187-
aug_min=1,
188-
aug_max=1,
189-
aug_p=1.0,
190-
method=Method.CHAR,
191-
encoder=Encoding.BASE64,
192-
n=1,
193-
p=1.0,
194-
)(
195-
["Hello, world!"],
196-
metadata=self.metadata,
197-
)
198-
self.assertEqual(augmented_text[0], "SA==ello LA== wocg==ld IQ==")
199-
200-
expected_metadata = deepcopy(self.expected_metadata["encode_text"])
201-
expected_metadata[0]["method"] = "char"
202-
expected_metadata[0]["encoder"] = Encoding.BASE64
203-
self.assertTrue(are_equal_metadata(self.metadata, expected_metadata))
204-
205178
def test_GetBaseline(self) -> None:
206179
augmented_baseline = txtaugs.GetBaseline()(self.texts, metadata=self.metadata)
207180

@@ -296,8 +269,8 @@ def test_LeetSpeak_Sentence(self) -> None:
296269
aug_min=1,
297270
aug_max=1,
298271
aug_p=1.0,
299-
method=Method.SENTENCE,
300-
encoder=Encoding.LEETSPEAK,
272+
granularity="all",
273+
encoder="leetspeak",
301274
n=1,
302275
p=1.0,
303276
)(
@@ -306,7 +279,7 @@ def test_LeetSpeak_Sentence(self) -> None:
306279
)
307280

308281
self.assertTrue(augmented_text[0] == "h3110, w0r1d!")
309-
self.expected_metadata["encode_text"][0]["encoder"] = Encoding.LEETSPEAK
282+
self.expected_metadata["encode_text"][0]["encoder"] = "leetspeak"
310283
self.assertTrue(
311284
are_equal_metadata(self.metadata, self.expected_metadata["encode_text"])
312285
)
@@ -318,8 +291,8 @@ def test_Leetspeak_Word(self) -> None:
318291
aug_min=1,
319292
aug_max=1,
320293
aug_p=1.0,
321-
method=Method.WORD,
322-
encoder=Encoding.LEETSPEAK,
294+
granularity="word",
295+
encoder="leetspeak",
323296
n=1,
324297
p=1.0,
325298
)(
@@ -329,8 +302,8 @@ def test_Leetspeak_Word(self) -> None:
329302
self.assertEqual(augmented_text[0], "h3110, world!")
330303

331304
metadata_expected = deepcopy(self.expected_metadata["encode_text"])
332-
metadata_expected[0]["method"] = "word"
333-
metadata_expected[0]["encoder"] = Encoding.LEETSPEAK
305+
metadata_expected[0]["granularity"] = "word"
306+
metadata_expected[0]["encoder"] = "leetspeak"
334307
self.assertTrue(are_equal_metadata(self.metadata, metadata_expected))
335308

336309
def test_MergeWords(self) -> None:

augly/text/augmenters/base64.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,10 +6,9 @@
66
# LICENSE file in the root directory of this source tree.
77

88
import codecs
9+
from typing import Literal
910

1011
from augly.text.augmenters.encode_text_strategy import EncodeTextAugmentation
11-
from augly.text.augmenters.utils import Encoding
12-
from nlpaug.util import Method
1312

1413

1514
class Base64(EncodeTextAugmentation):
@@ -18,16 +17,20 @@ def __init__(
1817
aug_min: int,
1918
aug_max: int,
2019
aug_p: float,
21-
method: Method,
20+
granularity: Literal["all", "word", "char"],
2221
):
2322
super().__init__(
2423
name="Base64",
2524
aug_min=aug_min,
2625
aug_max=aug_max,
2726
aug_p=aug_p,
28-
encoder=Encoding.BASE64,
29-
method=str(method),
27+
encoder="base64",
28+
granularity=granularity,
3029
)
30+
assert granularity in {
31+
"all",
32+
"word",
33+
}, f"Base64 only supports granularity type 'all' or 'word', found type {granularity}"
3134
assert 0 <= aug_min <= aug_max
3235
assert 0 <= aug_p <= 1
3336

augly/text/augmenters/encode_text_strategy.py

Lines changed: 16 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,9 @@
88
# pyre-unsafe
99

1010
from abc import abstractmethod
11-
from typing import List, Union
11+
from typing import List, Literal, Union
1212

13-
from augly.text.augmenters.utils import detokenize, Encoding, get_aug_idxes, tokenize
13+
from augly.text.augmenters.utils import detokenize, get_aug_idxes, tokenize
1414
from nlpaug.augmenter.word import Augmenter
1515
from nlpaug.util import Action, Method
1616

@@ -22,20 +22,25 @@ def __init__(
2222
aug_min: int,
2323
aug_max: int,
2424
aug_p: float,
25-
encoder: Encoding = Encoding.BASE64,
26-
method: str = Method.SENTENCE,
25+
granularity: Literal["all", "word", "char"],
26+
encoder: Literal["base64", "leetspeak"],
2727
):
28+
assert granularity in {
29+
"all",
30+
"word",
31+
"char",
32+
}, f"Granularity type must be either 'all', 'word', 'char', found type {granularity}"
2833
super().__init__(
2934
name=name,
3035
aug_min=aug_min,
3136
aug_max=aug_max,
3237
aug_p=aug_p,
3338
action=Action.SUBSTITUTE,
34-
method=method,
39+
method=Method.SENTENCE,
3540
)
3641

3742
self.encoder = encoder
38-
self.method = method
43+
self.granularity = granularity
3944

4045
@classmethod
4146
def clean(cls, data: Union[str, List[str], None]) -> Union[str, List[str]]:
@@ -61,35 +66,35 @@ def encode(self, input_string: str) -> str:
6166
raise NotImplementedError
6267

6368
def substitute(self, data: str) -> str:
64-
if self.method == Method.SENTENCE:
69+
if self.granularity == "all":
6570
return self.encode(data)
6671

6772
tokens = tokenize(data)
6873
if not tokens:
6974
return ""
7075

71-
if self.method == Method.WORD:
76+
if self.granularity == "word":
7277
augment_count = self._generate_aug_cnt(
7378
len(tokens), self.aug_min, self.aug_max, self.aug_p
7479
)
7580
to_augment = set(
7681
get_aug_idxes(
77-
self, tokens, list(range(len(tokens))), augment_count, Method.WORD
82+
self, tokens, list(range(len(tokens))), augment_count, "word"
7883
)
7984
)
8085
for i, token in enumerate(tokens):
8186
if i in to_augment:
8287
tokens[i] = self.encode(token)
8388

84-
elif self.method == Method.CHAR:
89+
elif self.granularity == "char":
8590
for token_idx, token in enumerate(tokens):
8691
chars = list(token)
8792
augment_count = self._generate_aug_cnt(
8893
len(chars), self.aug_min, self.aug_max, self.aug_p
8994
)
9095
to_augment = set(
9196
get_aug_idxes(
92-
self, chars, list(range(len(chars))), augment_count, Method.CHAR
97+
self, chars, list(range(len(chars))), augment_count, "char"
9398
)
9499
)
95100
for char_idx, char in enumerate(chars):

augly/text/augmenters/leetspeak.py

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -6,10 +6,9 @@
66
# LICENSE file in the root directory of this source tree.
77

88
import random
9+
from typing import Literal
910

1011
from augly.text.augmenters.encode_text_strategy import EncodeTextAugmentation
11-
from augly.text.augmenters.utils import Encoding
12-
from nlpaug.util import Method
1312

1413

1514
class LeetSpeak(EncodeTextAugmentation):
@@ -18,15 +17,15 @@ def __init__(
1817
aug_min: int,
1918
aug_max: int,
2019
aug_p: float,
21-
method: Method,
20+
granularity: Literal["all", "word", "char"],
2221
):
2322
super().__init__(
2423
name="LeetSpeak",
2524
aug_min=aug_min,
2625
aug_max=aug_max,
2726
aug_p=aug_p,
28-
encoder=Encoding.LEETSPEAK,
29-
method=str(method),
27+
encoder="leetspeak",
28+
granularity=granularity,
3029
)
3130
assert 0 <= aug_min <= aug_max
3231
assert 0 <= aug_p <= 1

augly/text/augmenters/utils.py

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
88
# pyre-unsafe
99

1010
import re
11-
from enum import Enum
1211
from typing import List, Optional, Tuple
1312

1413
import regex
@@ -270,8 +269,3 @@ def get_aug_idxes(
270269
aug_idxes = augmenter.sample(priority_idxes, aug_cnt)
271270

272271
return aug_idxes
273-
274-
275-
class Encoding(Enum):
276-
BASE64 = "base64"
277-
LEETSPEAK = "leetspeak"

augly/text/functional.py

Lines changed: 11 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -8,18 +8,16 @@
88
# pyre-unsafe
99

1010
from copy import deepcopy
11-
from typing import Any, Callable, Dict, List, Optional, Union
11+
from typing import Any, Callable, Dict, List, Literal, Optional, Union
1212

1313
from augly.text import augmenters as a, utils as txtutils
14-
from augly.text.augmenters.utils import Encoding
1514
from augly.utils import (
1615
CONTRACTIONS_MAPPING,
1716
FUN_FONTS_PATH,
1817
GENDERED_WORDS_MAPPING,
1918
MISSPELLING_DICTIONARY_PATH,
2019
UNICODE_MAPPING_PATH,
2120
)
22-
from nlpaug.util import Method
2321

2422

2523
def apply_lambda(
@@ -174,8 +172,8 @@ def encode_text(
174172
aug_min: int,
175173
aug_max: int,
176174
aug_p: float,
177-
method: Method,
178-
encoder: Encoding,
175+
granularity: Literal["all", "word", "char"],
176+
encoder: Literal["base64", "leetspeak"],
179177
n: int = 1,
180178
p: float = 1.0,
181179
metadata: Optional[List[Dict[str, Any]]] = None,
@@ -206,14 +204,19 @@ def encode_text(
206204
207205
@returns: the list of augmented(now in base 64) text documents
208206
"""
207+
assert encoder in {
208+
"base64",
209+
"leetspeak",
210+
}, f"Encode text only supports encoder type 'base64' or 'leetspeak', found type {encoder}"
211+
209212
func_kwargs = txtutils.get_func_kwargs(metadata, locals())
210213

211214
if not isinstance(texts, list):
212215
texts = [texts]
213-
if encoder == Encoding.BASE64:
214-
encoder_strategy = a.Base64(aug_min, aug_max, aug_p, method)
216+
if encoder == "base64":
217+
encoder_strategy = a.Base64(aug_min, aug_max, aug_p, granularity)
215218
else:
216-
encoder_strategy = a.LeetSpeak(aug_min, aug_max, aug_p, method)
219+
encoder_strategy = a.LeetSpeak(aug_min, aug_max, aug_p, granularity)
217220
encoder_context = a.EncodeText(encoder_strategy)
218221
aug_texts = encoder_context.augmenter(texts)
219222

0 commit comments

Comments (0)