Workaround to skip utf-8 characters in plaintext tokenizer (#11)

bmcutler · web-flow · commit e93dfb6847a8 · 2018-07-29T21:30:19.000-04:00
diff --git a/tokenizer/plaintext/plaintext_tokenizer.cpp b/tokenizer/plaintext/plaintext_tokenizer.cpp
@@ -53,6 +53,11 @@ int main(int argc, char* argv[]) {
   while (std::cin >> std::noskipws >> c) {
     bool is_punctuation = !isspace(c) && !std::isdigit(c) && !std::isalpha(c);
 
+    if ((unsigned int)(c) > 127) {
+      // FIXME: for now, just skip non-ASCII bytes (value > 127, i.e. the bytes of multi-byte UTF-8 characters) since nlohmann dump gets stuck on them
+      continue;
+    }
+
     // ------------------------------
     // decide when to break the current string
     // break on spaces, punctuation (any symbol), or if we switch between letters and numbers