From 9f35f8b4d5109b2a5cbde3b9922f743cf587ccc5 Mon Sep 17 00:00:00 2001
From: keshprad <32313895+keshprad@users.noreply.github.com>
Date: Sat, 3 Jul 2021 10:24:50 -0700
Subject: [PATCH 1/5] create out_of_vocab_handler function

---
 truecase/TrueCaser.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/truecase/TrueCaser.py b/truecase/TrueCaser.py
index 29253fb..8173e64 100644
--- a/truecase/TrueCaser.py
+++ b/truecase/TrueCaser.py
@@ -91,6 +91,17 @@ def get_score(self, prev_token, possible_token, next_token):
     def first_token_case(self, raw):
         return raw.capitalize()
 
+    def out_of_vocabulary_handler(self, token, out_of_vocabulary_token_option="title"):
+        if out_of_vocabulary_token_option == "title":
+            return token.title()
+        elif out_of_vocabulary_token_option == "capitalize":
+            return token.capitalize()
+        elif out_of_vocabulary_token_option == "lower":
+            return token.lower()
+        else:
+            # Return original casing
+            return token
+
     def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
         """ Wrapper function for handling untokenized input.
         
@@ -121,7 +132,7 @@ def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="titl
         """
         tokens_true_case = []
         for token_idx, token in enumerate(tokens):
-
+            token_og_case = token
             if token in string.punctuation or token.isdigit():
                 tokens_true_case.append(token)
             else:
@@ -154,14 +165,7 @@ def get_true_case_from_tokens(self, tokens, out_of_vocabulary_token_option="titl
                             tokens_true_case[0])
 
                 else:  # Token out of vocabulary
-                    if out_of_vocabulary_token_option == "title":
-                        tokens_true_case.append(token.title())
-                    elif out_of_vocabulary_token_option == "capitalize":
-                        tokens_true_case.append(token.capitalize())
-                    elif out_of_vocabulary_token_option == "lower":
-                        tokens_true_case.append(token.lower())
-                    else:
-                        tokens_true_case.append(token)
+                    tokens_true_case.append(self.out_of_vocabulary_handler(token_og_case, out_of_vocabulary_token_option))
 
         return tokens_true_case
 

From f74c804f9212d8decf6f45c59d95f565c0d39108 Mon Sep 17 00:00:00 2001
From: keshprad <32313895+keshprad@users.noreply.github.com>
Date: Sat, 3 Jul 2021 10:42:14 -0700
Subject: [PATCH 2/5] Implement lambda func for out_of_vocab option

---
 truecase/TrueCaser.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/truecase/TrueCaser.py b/truecase/TrueCaser.py
index 8173e64..e949cf1 100644
--- a/truecase/TrueCaser.py
+++ b/truecase/TrueCaser.py
@@ -2,6 +2,7 @@
 import os
 import pickle
 import string
+from typing import Callable
 
 import nltk
 from nltk.tokenize import word_tokenize
@@ -92,7 +93,9 @@ def first_token_case(self, raw):
         return raw.capitalize()
 
     def out_of_vocabulary_handler(self, token, out_of_vocabulary_token_option="title"):
-        if out_of_vocabulary_token_option == "title":
+        if isinstance(out_of_vocabulary_token_option, Callable):
+            return out_of_vocabulary_token_option(token)
+        elif out_of_vocabulary_token_option == "title":
             return token.title()
         elif out_of_vocabulary_token_option == "capitalize":
             return token.capitalize()

From e0546c686ecc441face59bf37d2cb07a8a8abf59 Mon Sep 17 00:00:00 2001
From: keshprad <32313895+keshprad@users.noreply.github.com>
Date: Sat, 3 Jul 2021 10:45:41 -0700
Subject: [PATCH 3/5] add lambda func test case & update readme for
 out_of_vocab

---
 README.md              | 19 ++++++++++++++++++-
 tests/test_truecase.py |  5 +++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4118689..27bf2e5 100644
--- a/README.md
+++ b/README.md
@@ -33,9 +33,26 @@ Simple usecase:
 ```python
 >>> import truecase
 >>> truecase.get_true_case('hey, what is the weather in new york?')
-'Hey, what is the weather in New York?''
+'Hey, what is the weather in New York?'
 ```
 
+You can also pass an `out_of_vocabulary_token_option`, which will be used if a word is not found in the model's vocabulary
+```python
+>>> import truecase
+>>> truecase.get_true_case('my favorite music genre is hip-hop.', "capitalize")
+'My favorite music genre is Hip-Hop.'
+```
+`out_of_vocabulary_token_option`:
+- "capitalize" < DEFAULT
+- "title"
+- "lower"
+- Or, pass if your own lambda function
+
+```python
+>>> import truecase
+>>> truecase.get_true_case('i work in the nsa.', lambda token: token.upper())
+'I work in the NSA.'
+```
 ## Training your own model
 
 TODO. For now refer to Trainer.py
diff --git a/tests/test_truecase.py b/tests/test_truecase.py
index d6a5edf..016c511 100644
--- a/tests/test_truecase.py
+++ b/tests/test_truecase.py
@@ -47,3 +47,8 @@ def test_get_true_case(self):
         expected = "Testing $bug"
         result = self.tc.get_true_case(sentence)
         assert result == expected
+
+        sentence = "i work in the nsa."
+        expected = "I work in the NSA."
+        result = self.tc.get_true_case(sentence, lambda token: token.upper())
+        assert result == expected

From afaa96cb5d517bddba0ebf74a844156e57792468 Mon Sep 17 00:00:00 2001
From: keshprad <32313895+keshprad@users.noreply.github.com>
Date: Sat, 3 Jul 2021 10:58:27 -0700
Subject: [PATCH 4/5] fixed error in readme

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 27bf2e5..4a8b0bb 100644
--- a/README.md
+++ b/README.md
@@ -39,12 +39,12 @@ Simple usecase:
 You can also pass an `out_of_vocabulary_token_option`, which will be used if a word is not found in the model's vocabulary
 ```python
 >>> import truecase
->>> truecase.get_true_case('my favorite music genre is hip-hop.', "capitalize")
+>>> truecase.get_true_case('my favorite music genre is hip-hop.', "title")
 'My favorite music genre is Hip-Hop.'
 ```
 `out_of_vocabulary_token_option`:
-- "capitalize" < DEFAULT
-- "title"
+- "title" < DEFAULT
+- "capitalize"
 - "lower"
 - Or, pass if your own lambda function
 

From bbd9c42ed57c2042fe53cdf0deddb694d462e86b Mon Sep 17 00:00:00 2001
From: keshprad <32313895+keshprad@users.noreply.github.com>
Date: Sat, 3 Jul 2021 11:20:15 -0700
Subject: [PATCH 5/5] Use title if invalid out of vocab option passed

---
 README.md             |  7 +++++--
 truecase/TrueCaser.py | 14 +++++++-------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index 4a8b0bb..2112e6f 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,7 @@ Simple usecase:
 'Hey, what is the weather in New York?'
 ```
 
-You can also pass an `out_of_vocabulary_token_option`, which will be used if a word is not found in the model's vocabulary
+You can also pass an `out_of_vocabulary_token_option`, which will be used if a word is not found in the model's vocabulary:
 ```python
 >>> import truecase
 >>> truecase.get_true_case('my favorite music genre is hip-hop.', "title")
@@ -46,8 +46,11 @@ You can also pass an `out_of_vocabulary_token_option`, which will be used if a w
 - "title" < DEFAULT
 - "capitalize"
 - "lower"
-- Or, pass if your own lambda function
+- Or, pass in your own lambda function (takes the token with original casing as a single parameter)
 
+*If an invalid option is passed, the `"title"` option is used.*
+
+Lambda function example:
 ```python
 >>> import truecase
 >>> truecase.get_true_case('i work in the nsa.', lambda token: token.upper())
diff --git a/truecase/TrueCaser.py b/truecase/TrueCaser.py
index e949cf1..fed333d 100644
--- a/truecase/TrueCaser.py
+++ b/truecase/TrueCaser.py
@@ -92,18 +92,18 @@ def get_score(self, prev_token, possible_token, next_token):
     def first_token_case(self, raw):
         return raw.capitalize()
 
-    def out_of_vocabulary_handler(self, token, out_of_vocabulary_token_option="title"):
+    def out_of_vocabulary_handler(self, token_og_case, out_of_vocabulary_token_option="title"):
         if isinstance(out_of_vocabulary_token_option, Callable):
-            return out_of_vocabulary_token_option(token)
+            return out_of_vocabulary_token_option(token_og_case)
         elif out_of_vocabulary_token_option == "title":
-            return token.title()
+            return token_og_case.title()
         elif out_of_vocabulary_token_option == "capitalize":
-            return token.capitalize()
+            return token_og_case.capitalize()
         elif out_of_vocabulary_token_option == "lower":
-            return token.lower()
+            return token_og_case.lower()
         else:
-            # Return original casing
-            return token
+            # If value passed is invalid, use .title()
+            return token_og_case.title()
 
     def get_true_case(self, sentence, out_of_vocabulary_token_option="title"):
         """ Wrapper function for handling untokenized input.