Skip to content

Lambda function for out_of_vocabulary_token_option #24

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,29 @@ Simple usecase:
```python
>>> import truecase
>>> truecase.get_true_case('hey, what is the weather in new york?')
'Hey, what is the weather in New York?''
'Hey, what is the weather in New York?'
```

You can also pass an `out_of_vocabulary_token_option`, which will be used if a word is not found in the model's vocabulary:
```python
>>> import truecase
>>> truecase.get_true_case('my favorite music genre is hip-hop.', "title")
'My favorite music genre is Hip-Hop.'
```
`out_of_vocabulary_token_option`:
- "title" (default)
- "capitalize"
- "lower"
- Or pass in your own function, such as a lambda (it takes the token with its original casing as its single parameter)

*If an invalid option is passed, "title" is used.*

Lambda function example:
```python
>>> import truecase
>>> truecase.get_true_case('i work in the nsa.', lambda token: token.upper())
'I work in the NSA.'
```
## Training your own model

TODO. For now refer to Trainer.py
Expand Down
5 changes: 5 additions & 0 deletions tests/test_truecase.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,8 @@ def test_get_true_case(self):
expected = "Testing $bug"
result = self.tc.get_true_case(sentence)
assert result == expected

sentence = "i work in the nsa."
expected = "I work in the NSA."
result = self.tc.get_true_case(sentence, lambda token: token.upper())
assert result == expected
25 changes: 16 additions & 9 deletions truecase/TrueCaser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import pickle
import string
from typing import Callable

import nltk
from nltk.tokenize import word_tokenize
Expand Down Expand Up @@ -91,6 +92,19 @@ def get_score(self, prev_token, possible_token, next_token):
def first_token_case(self, raw):
    """Return the casing to use for the first token of a sentence.

    Args:
        raw: The token string to re-case.

    Returns:
        The result of ``raw.capitalize()``: first character upper-cased,
        every remaining character lower-cased.
    """
    capitalized = raw.capitalize()
    return capitalized

def out_of_vocabulary_handler(self, token_og_case, out_of_vocabulary_token_option="title"):
    """Choose the casing for a token that is not in the model's vocabulary.

    Args:
        token_og_case: The token exactly as it appeared in the input.
        out_of_vocabulary_token_option: Either a callable that takes the
            token as its single argument and returns the re-cased token,
            or one of the strings ``"title"``, ``"capitalize"`` or
            ``"lower"``. Any other value falls back to ``"title"``.

    Returns:
        The re-cased token.
    """
    # The builtin callable() is the idiomatic check and keeps this method
    # independent of any typing import.
    if callable(out_of_vocabulary_token_option):
        return out_of_vocabulary_token_option(token_og_case)
    if out_of_vocabulary_token_option == "capitalize":
        return token_og_case.capitalize()
    if out_of_vocabulary_token_option == "lower":
        return token_og_case.lower()
    # "title" and every unrecognized option share the same default.
    return token_og_case.title()

def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
""" Wrapper function for handling untokenized input.

Expand Down Expand Up @@ -121,7 +135,7 @@ def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="titl
"""
tokens_true_case = []
for token_idx, token in enumerate(tokens):

token_og_case = token
if token in string.punctuation or token.isdigit():
tokens_true_case.append(token)
else:
Expand Down Expand Up @@ -154,14 +168,7 @@ def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="titl
tokens_true_case[0])

else: # Token out of vocabulary
if out_of_vocabulary_token_option == "title":
tokens_true_case.append(token.title())
elif out_of_vocabulary_token_option == "capitalize":
tokens_true_case.append(token.capitalize())
elif out_of_vocabulary_token_option == "lower":
tokens_true_case.append(token.lower())
else:
tokens_true_case.append(token)
tokens_true_case.append(self.out_of_vocabulary_handler(token_og_case, out_of_vocabulary_token_option))

return tokens_true_case

Expand Down