From 9f35f8b4d5109b2a5cbde3b9922f743cf587ccc5 Mon Sep 17 00:00:00 2001 From: keshprad <32313895+keshprad@users.noreply.github.com> Date: Sat, 3 Jul 2021 10:24:50 -0700 Subject: [PATCH 1/5] create out_of_vocab_handler function --- truecase/TrueCaser.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/truecase/TrueCaser.py b/truecase/TrueCaser.py index 29253fb..8173e64 100644 --- a/truecase/TrueCaser.py +++ b/truecase/TrueCaser.py @@ -91,6 +91,17 @@ def get_score(self, prev_token, possible_token, next_token): def first_token_case(self, raw): return raw.capitalize() + def out_of_vocabulary_handler(self, token, out_of_vocabulary_token_option="title"): + if out_of_vocabulary_token_option == "title": + return token.title() + elif out_of_vocabulary_token_option == "capitalize": + return token.capitalize() + elif out_of_vocabulary_token_option == "lower": + return token.lower() + else: + # Return original casing + return token + def get_true_case(self, sentence, out_of_vocabulary_token_option="title"): """ Wrapper function for handling untokenized input. 
@@ -121,7 +132,7 @@ def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="titl """ tokens_true_case = [] for token_idx, token in enumerate(tokens): - + token_og_case = token if token in string.punctuation or token.isdigit(): tokens_true_case.append(token) else: @@ -154,14 +165,7 @@ def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="titl tokens_true_case[0]) else: # Token out of vocabulary - if out_of_vocabulary_token_option == "title": - tokens_true_case.append(token.title()) - elif out_of_vocabulary_token_option == "capitalize": - tokens_true_case.append(token.capitalize()) - elif out_of_vocabulary_token_option == "lower": - tokens_true_case.append(token.lower()) - else: - tokens_true_case.append(token) + tokens_true_case.append(self.out_of_vocabulary_handler(token_og_case, out_of_vocabulary_token_option)) return tokens_true_case From f74c804f9212d8decf6f45c59d95f565c0d39108 Mon Sep 17 00:00:00 2001 From: keshprad <32313895+keshprad@users.noreply.github.com> Date: Sat, 3 Jul 2021 10:42:14 -0700 Subject: [PATCH 2/5] Implement lambda func for out_of_vocab option --- truecase/TrueCaser.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/truecase/TrueCaser.py b/truecase/TrueCaser.py index 8173e64..e949cf1 100644 --- a/truecase/TrueCaser.py +++ b/truecase/TrueCaser.py @@ -2,6 +2,7 @@ import os import pickle import string +from typing import Callable import nltk from nltk.tokenize import word_tokenize @@ -92,7 +93,9 @@ def first_token_case(self, raw): return raw.capitalize() def out_of_vocabulary_handler(self, token, out_of_vocabulary_token_option="title"): - if out_of_vocabulary_token_option == "title": + if isinstance(out_of_vocabulary_token_option, Callable): + return out_of_vocabulary_token_option(token) + elif out_of_vocabulary_token_option == "title": return token.title() elif out_of_vocabulary_token_option == "capitalize": return token.capitalize() From e0546c686ecc441face59bf37d2cb07a8a8abf59 
Mon Sep 17 00:00:00 2001 From: keshprad <32313895+keshprad@users.noreply.github.com> Date: Sat, 3 Jul 2021 10:45:41 -0700 Subject: [PATCH 3/5] add lambda func test case & update readme for out_of_vocab --- README.md | 19 ++++++++++++++++++- tests/test_truecase.py | 5 +++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4118689..27bf2e5 100644 --- a/README.md +++ b/README.md @@ -33,9 +33,26 @@ Simple usecase: ```python >>> import truecase >>> truecase.get_true_case('hey, what is the weather in new york?') -'Hey, what is the weather in New York?'' +'Hey, what is the weather in New York?' ``` +You can also pass an `out_of_vocabulary_token_option`, which will be used if a word is not found in the model's vocabulary +```python +>>> import truecase +>>> truecase.get_true_case('my favorite music genre is hip-hop.', "capitalize") +'My favorite music genre is Hip-Hop.' +``` +`out_of_vocabulary_token_option`: +- "capitalize" < DEFAULT +- "title" +- "lower" +- Or, pass if your own lambda function + +```python +>>> import truecase +>>> truecase.get_true_case('i work in the nsa.', lambda token: token.upper()) +'I work in the NSA.' +``` ## Training your own model TODO. For now refer to Trainer.py diff --git a/tests/test_truecase.py b/tests/test_truecase.py index d6a5edf..016c511 100644 --- a/tests/test_truecase.py +++ b/tests/test_truecase.py @@ -47,3 +47,8 @@ def test_get_true_case(self): expected = "Testing $bug" result = self.tc.get_true_case(sentence) assert result == expected + + sentence = "i work in the nsa." + expected = "I work in the NSA." 
+ result = self.tc.get_true_case(sentence, lambda token: token.upper()) + assert result == expected From afaa96cb5d517bddba0ebf74a844156e57792468 Mon Sep 17 00:00:00 2001 From: keshprad <32313895+keshprad@users.noreply.github.com> Date: Sat, 3 Jul 2021 10:58:27 -0700 Subject: [PATCH 4/5] fixed error in readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 27bf2e5..4a8b0bb 100644 --- a/README.md +++ b/README.md @@ -39,12 +39,12 @@ Simple usecase: You can also pass an `out_of_vocabulary_token_option`, which will be used if a word is not found in the model's vocabulary ```python >>> import truecase ->>> truecase.get_true_case('my favorite music genre is hip-hop.', "capitalize") +>>> truecase.get_true_case('my favorite music genre is hip-hop.', "title") 'My favorite music genre is Hip-Hop.' ``` `out_of_vocabulary_token_option`: -- "capitalize" < DEFAULT -- "title" +- "title" < DEFAULT +- "capitalize" - "lower" - Or, pass if your own lambda function From bbd9c42ed57c2042fe53cdf0deddb694d462e86b Mon Sep 17 00:00:00 2001 From: keshprad <32313895+keshprad@users.noreply.github.com> Date: Sat, 3 Jul 2021 11:20:15 -0700 Subject: [PATCH 5/5] Use title if invalid out of vocab option passed --- README.md | 7 +++++-- truecase/TrueCaser.py | 14 +++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 4a8b0bb..2112e6f 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Simple usecase: 'Hey, what is the weather in New York?' 
``` -You can also pass an `out_of_vocabulary_token_option`, which will be used if a word is not found in the model's vocabulary +You can also pass an `out_of_vocabulary_token_option`, which will be used if a word is not found in the model's vocabulary: ```python >>> import truecase >>> truecase.get_true_case('my favorite music genre is hip-hop.', "title") @@ -46,8 +46,11 @@ You can also pass an `out_of_vocabulary_token_option`, which will be used if a w - "title" < DEFAULT - "capitalize" - "lower" -- Or, pass if your own lambda function +- Or, pass your own lambda function (takes the token with original casing as a single parameter) +*If an invalid option is passed, title is used* + +Lambda function example: ```python >>> import truecase >>> truecase.get_true_case('i work in the nsa.', lambda token: token.upper()) diff --git a/truecase/TrueCaser.py b/truecase/TrueCaser.py index e949cf1..fed333d 100644 --- a/truecase/TrueCaser.py +++ b/truecase/TrueCaser.py @@ -92,18 +92,18 @@ def get_score(self, prev_token, possible_token, next_token): def first_token_case(self, raw): return raw.capitalize() - def out_of_vocabulary_handler(self, token, out_of_vocabulary_token_option="title"): + def out_of_vocabulary_handler(self, token_og_case, out_of_vocabulary_token_option="title"): if isinstance(out_of_vocabulary_token_option, Callable): - return out_of_vocabulary_token_option(token) + return out_of_vocabulary_token_option(token_og_case) elif out_of_vocabulary_token_option == "title": - return token.title() + return token_og_case.title() elif out_of_vocabulary_token_option == "capitalize": - return token.capitalize() + return token_og_case.capitalize() elif out_of_vocabulary_token_option == "lower": - return token.lower() + return token_og_case.lower() else: - # Return original casing - return token + # If value passed is invalid, use .title() + return token_og_case.title() def get_true_case(self, sentence, out_of_vocabulary_token_option="title"): """ Wrapper function for 
handling untokenized input.