Skip to content

Commit dffa7b0

Browse files
SwapCase (#24)
Fixing pipeline documentation bug, adding SwapCase generator, fixing LeetSpeak bug, adding SwapCase documentation + pipelines
2 parents 14e84db + 38e872a commit dffa7b0

File tree

7 files changed

+159
-27
lines changed

7 files changed

+159
-27
lines changed

badgers/generators/text/typos.py

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,7 @@ class SwapLettersGenerator(TyposGenerator):
3232

3333
def __init__(self, random_generator=default_rng(seed=0)):
3434
"""
35-
3635
:param random_generator: A random generator
37-
38-
3936
"""
4037
super().__init__(random_generator)
4138

@@ -110,10 +107,11 @@ def randomly_replace_letter(self, letter, replacement_proba):
110107
:param replacement_proba: the probability of replacing a letter with its leet counterpart
111108
:return:
112109
"""
113-
if self.random_generator.random() < replacement_proba:
114-
return self.random_generator.choice(self.leet_speak_mapping[letter.upper()])
115-
else:
116-
return letter
110+
if letter.upper() in self.leet_speak_mapping:
111+
if self.random_generator.random() < replacement_proba:
112+
letter = self.random_generator.choice(self.leet_speak_mapping[letter.upper()])
113+
114+
return letter
117115

118116
def generate(self, X, y, replacement_proba: float = 0.1) -> Tuple:
119117
"""
@@ -130,3 +128,32 @@ def generate(self, X, y, replacement_proba: float = 0.1) -> Tuple:
130128
]
131129

132130
return Xt, y
131+
132+
class SwapCaseGenerator(TyposGenerator):
    """
    Typos generator that randomly flips the case of individual characters.
    """

    def __init__(self, random_generator=default_rng(seed=0)):
        """
        :param random_generator: A random generator
            NOTE: the default value is built once, when the class body is
            executed, so every instance created without an explicit generator
            shares the same generator object.
        """
        super().__init__(random_generator)

    def randomly_swapcase_letter(self, letter, swapcase_proba):
        """
        Swap the case of a letter with a given probability.

        :param letter: the string whose case may be swapped
        :param swapcase_proba: the probability of swapping case
        :return: ``letter.swapcase()`` when the random draw is strictly below
            ``swapcase_proba``, otherwise ``letter`` unchanged
        """
        # Exactly one draw per call, whatever the outcome.
        draw = self.random_generator.random()
        return letter.swapcase() if draw < swapcase_proba else letter

    def generate(self, X, y, swapcase_proba: float = 0.1) -> Tuple:
        """
        Randomly swap the case of the characters of every word in X.

        :param X: an iterable of words (strings)
        :param y: returned as is
        :param swapcase_proba: the probability of swapping the case of each
            character, must lie in [0, 1]
        :return: a tuple (Xt, y) where Xt is the list of transformed words
        """
        assert 0 <= swapcase_proba <= 1
        Xt = []
        for word in X:
            # Characters are processed left to right, one random draw each.
            transformed = (
                self.randomly_swapcase_letter(char, swapcase_proba=swapcase_proba)
                for char in word
            )
            Xt.append(''.join(transformed))

        return Xt, y

docs/tutorials/text/Typos-Text.ipynb

Lines changed: 96 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"metadata": {},
1616
"outputs": [],
1717
"source": [
18-
"from badgers.generators.text.typos import SwapLettersGenerator, LeetSpeakGenerator"
18+
"from badgers.generators.text.typos import SwapLettersGenerator, LeetSpeakGenerator, SwapCaseGenerator"
1919
]
2020
},
2121
{
@@ -152,13 +152,104 @@
152152
"print('Transformed:\\t'+' '.join(Xt))"
153153
]
154154
},
155+
{
156+
"cell_type": "markdown",
157+
"id": "180fe290-ed49-4027-9e59-c18cf89e5eb4",
158+
"metadata": {},
159+
"source": [
160+
"## Swap case"
161+
]
162+
},
163+
{
164+
"cell_type": "code",
165+
"execution_count": 10,
166+
"id": "9bcc0865-d9dd-4b95-ab4a-57feaad7c330",
167+
"metadata": {},
168+
"outputs": [],
169+
"source": [
170+
"swap_case = SwapCaseGenerator()"
171+
]
172+
},
155173
{
156174
"cell_type": "code",
157-
"execution_count": null,
158-
"id": "04439b35-c84f-4dcf-ad73-0e6ba11518a7",
175+
"execution_count": 11,
176+
"id": "d947f6af-71fe-4913-90fa-4b37d929ea21",
159177
"metadata": {},
160178
"outputs": [],
161-
"source": []
179+
"source": [
180+
"Xt, _ = swap_case.generate(X.copy(), y=None, swapcase_proba=0.25)"
181+
]
182+
},
183+
{
184+
"cell_type": "code",
185+
"execution_count": 12,
186+
"id": "ded6a17c-4e2c-483a-8894-99857a2132f2",
187+
"metadata": {},
188+
"outputs": [
189+
{
190+
"name": "stdout",
191+
"output_type": "stream",
192+
"text": [
193+
"Original:\tthe quick brown fox jumps over the lazy dog\n",
194+
"Transformed:\tthE Quick broWn FoX jumpS Over the lazy Dog\n"
195+
]
196+
}
197+
],
198+
"source": [
199+
"print('Original:\\t'+' '.join(X))\n",
200+
"print('Transformed:\\t'+' '.join(Xt))"
201+
]
202+
},
203+
{
204+
"cell_type": "markdown",
205+
"id": "d0fdd30b-a223-4a35-9dab-4696238915a5",
206+
"metadata": {},
207+
"source": [
208+
"## Using pipelines"
209+
]
210+
},
211+
{
212+
"cell_type": "code",
213+
"execution_count": 13,
214+
"id": "90e5ca7b-c959-44de-aa8b-ebd4d09d24aa",
215+
"metadata": {},
216+
"outputs": [],
217+
"source": [
218+
"from badgers.core.pipeline import Pipeline"
219+
]
220+
},
221+
{
222+
"cell_type": "code",
223+
"execution_count": 14,
224+
"id": "87f4d09e-8d89-4c3f-9640-6646f2082fa7",
225+
"metadata": {},
226+
"outputs": [],
227+
"source": [
228+
"generators = {'swap_letters': swap_letters, 'leet_speak': leet_speak, 'swap_case': swap_case}\n",
229+
"params = {'swap_letters': {'swap_proba':0.5}, 'leet_speak': {'replacement_proba':0.25}, 'swap_case': {'swapcase_proba':0.25}}\n",
230+
"pipeline = Pipeline(generators=generators)\n",
231+
"Xt, _ = pipeline.generate(X.copy(), y=None, params=params)"
232+
]
233+
},
234+
{
235+
"cell_type": "code",
236+
"execution_count": 15,
237+
"id": "2b5fe819-9133-4038-bd25-189c3ef6a4a4",
238+
"metadata": {},
239+
"outputs": [
240+
{
241+
"name": "stdout",
242+
"output_type": "stream",
243+
"text": [
244+
"Original:\tthe quick brown fox jumps over the lazy dog\n",
245+
"Transformed:\tthe quick /3®Ow^ vo)( Jv(v)|^eHS ov€R t|~|e lzay dog\n"
246+
]
247+
}
248+
],
249+
"source": [
250+
"print('Original:\\t'+' '.join(X))\n",
251+
"print('Transformed:\\t'+' '.join(Xt))"
252+
]
162253
}
163254
],
164255
"metadata": {
@@ -177,7 +268,7 @@
177268
"name": "python",
178269
"nbconvert_exporter": "python",
179270
"pygments_lexer": "ipython3",
180-
"version": "3.11.2"
271+
"version": "3.12.2"
181272
}
182273
},
183274
"nbformat": 4,

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
44

55
[project]
66
name = "badgers"
7-
version = "0.0.9"
7+
version = "0.0.10"
88
keywords = ["data quality", "bad data", "data science"]
99
authors = [
1010
{ name = "Julien Siebert", email = "[email protected]" },

requirements.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
numpy~=2.2.0
2+
scikit-learn~=1.6.0
3+
networkx~=3.4.2
4+
pandas~=2.2.3
5+
scipy~=1.14.1
6+
tox~=4.23.2
7+
mkdocs~=1.4.3
8+
mkdocstrings~=0.25.2
9+
mkdocstrings-python~=1.10.9
10+
mkdocs-gen-files~=0.5.0
11+
mkdocs-material~=9.1.12
12+
mkdocs-literate-nav~=0.6.0
13+
mkdocs-jupyter~=0.24.1
14+
jupyterlab~=4.3.4
15+
matplotlib~=3.10.0

requirements_dev.txt

Lines changed: 0 additions & 12 deletions
This file was deleted.

tests/generators/text/test_typos.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from numpy.random import default_rng
55

6-
from badgers.generators.text.typos import SwapLettersGenerator, LeetSpeakGenerator
6+
from badgers.generators.text.typos import SwapLettersGenerator, LeetSpeakGenerator, SwapCaseGenerator
77

88

99
class TestSwapLettersGenerator(unittest.TestCase):
@@ -43,11 +43,22 @@ def test_transform(self):
4343
class TestLeetSpeakGenerator(unittest.TestCase):
4444

4545
def test_generate(self):
46-
X = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'fox']
46+
X = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'fox', ' <> ']
4747
trf = LeetSpeakGenerator()
4848
Xt, _ = trf.generate(X, None)
4949
self.assertEqual(len(X), len(Xt))
5050

5151

52+
class TestSwapCaseGenerator(unittest.TestCase):
    """Unit tests for SwapCaseGenerator."""

    def test_generate(self):
        """
        With swapcase_proba=1 every character must have its case swapped.

        The generator draws from ``random_generator.random()`` and swaps when
        the draw is strictly below the probability; numpy's ``Generator.random``
        samples from [0, 1), so a probability of 1 makes the outcome
        deterministic.
        """
        # 'MiXeD' guards against the generator being mistaken for upper():
        # for all-lowercase input upper() and swapcase() cannot be told apart.
        X = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'fox', ' <> ', 'MiXeD']
        trf = SwapCaseGenerator()
        Xt, _ = trf.generate(X, None, swapcase_proba=1.)
        self.assertEqual(len(X), len(Xt))
        for w1, w2 in zip(X, Xt):
            # Expected value mirrors what the generator applies (str.swapcase).
            self.assertEqual(w1.swapcase(), w2)
61+
62+
5263
if __name__ == '__main__':
5364
unittest.main()

0 commit comments

Comments
 (0)