
Commit 47a4e2d

[Refactor:Plagiarism] Refactor data.json (#76)
* progress
* Make the changes backwards-compatible
1 parent ed20eb9 commit 47a4e2d

File tree

2 files changed: +85 −49 lines changed


bin/tokenize_all.py

+24 −13
@@ -6,6 +6,7 @@
 import argparse
 import os
 import json
+import subprocess
 import humanize
 import datetime

@@ -19,28 +20,38 @@ def parse_args():
 def tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file):
     language = lichen_config_data["language"]

+    cli_args = list()
     language_token_data = dict()

     data_json_path = "./data.json"  # data.json is in the Lichen/bin directory after install
     with open(data_json_path, 'r') as token_data_file:
-        token_data = json.load(token_data_file)
-        language_token_data = token_data[language]
+        data_file = json.load(token_data_file)
+        language_token_data = data_file[language]
+        if "arguments" in lichen_config_data.keys():  # For backwards compatibility - TODO: Remove
+            for argument in lichen_config_data["arguments"]:
+                if argument in language_token_data["command_args"]:
+                    cli_args.append(language_token_data["command_args"][argument]["argument"])
+                else:
+                    print(f"Error: Unknown tokenization argument {argument}")
+        else:  # Use the default arguments
+            for argument in language_token_data["command_args"]:
+                if "default" in language_token_data["command_args"][argument].keys() and\
+                   language_token_data["command_args"][argument]["default"]:
+                    cli_args.append(language_token_data["command_args"][argument]["argument"])

     tokenizer = f"./{language_token_data['tokenizer']}"

-    if language_token_data.get('input_as_argument') is not None and \
-       language_token_data['input_as_argument'] is not False:
-        my_concatenated_file = f'< {my_concatenated_file}'
+    result = subprocess.run([language_token_data['command_executable'],
+                             tokenizer, my_concatenated_file] + cli_args,
+                            stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE)

-    if "command_args" in language_token_data:
-        cli_args = " ".join(language_token_data["command_args"])
-    else:
-        cli_args = ""
+    stderr = result.stderr.decode('utf-8')
+    if not stderr.isspace() and stderr is not None and stderr != '':
+        print(result.stderr.decode("utf-8"))

-    command = f"{language_token_data['command_executable']} {tokenizer} "\
-              f"{cli_args} {my_concatenated_file} > {my_tokenized_file}".strip()
-
-    os.system(command)
+    with open(my_tokenized_file, 'w') as file:
+        file.write(result.stdout.decode('utf-8'))


 def main():
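The refactor replaces the old string-built os.system command with subprocess.run, so the tokenizer's stdout is captured and written to the output file directly instead of being shell-redirected. A minimal sketch of the new argument-selection logic, runnable on its own (sample_entry below is a hypothetical stand-in for one language entry from data.json, not code from this commit):

    # Sketch of the default-argument selection used above; `sample_entry`
    # is an illustrative stand-in for one entry in tokenizer/data.json.
    sample_entry = {
        "command_args": {
            "ignore_newlines": {"argument": "--ignore_newlines", "default": True},
            "to_lower": {"argument": "--to_lower"}
        }
    }

    # With no explicit "arguments" list in the lichen config, only args
    # whose spec sets "default": true are passed to the tokenizer.
    cli_args = [spec["argument"]
                for spec in sample_entry["command_args"].values()
                if spec.get("default")]
    print(cli_args)  # ['--ignore_newlines']

Passing the argument list to subprocess.run (rather than interpolating it into a shell string) also sidesteps quoting problems if a concatenated-file path contains spaces.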

tokenizer/data.json

+61 −36
@@ -1,38 +1,63 @@
 {
-    "plaintext": {
-        "tokenizer": "plaintext_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": false,
-        "command_args": [
-            "--ignore_newlines"
-        ],
-        "token_value": "value"
-    },
-    "python": {
-        "tokenizer": "python_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": true,
-        "token_value": "type"
-    },
-    "cpp": {
-        "tokenizer": "c_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": false,
-        "command_args": [
-            "--ignore_comments"
-        ],
-        "token_value": "type"
-    },
-    "java": {
-        "tokenizer": "java_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": true,
-        "token_value": "type"
-    },
-    "mips": {
-        "tokenizer": "mips_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": true,
-        "token_value": "type"
-    }
+    "plaintext": {
+        "name": "Plain Text",
+        "tokenizer": "plaintext_tokenizer.py",
+        "command_executable": "python3",
+        "command_args": {
+            "ignore_punctuation": {
+                "name": "Ignore punctuation",
+                "argument": "--ignore_punctuation"
+            },
+            "to_lower": {
+                "name": "Convert to lower case",
+                "argument": "--to_lower"
+            },
+            "ignore_numbers": {
+                "name": "Ignore numbers",
+                "argument": "--ignore_numbers"
+            },
+            "ignore_newlines": {
+                "name": "Ignore newlines",
+                "argument": "--ignore_newlines",
+                "default": true
+            }
+        },
+        "token_value": "value",
+        "default_hash_size": 14
+    },
+    "python": {
+        "name": "Python",
+        "tokenizer": "python_tokenizer.py",
+        "command_executable": "python3",
+        "token_value": "type",
+        "default_hash_size": 14
+    },
+    "cpp": {
+        "name": "C/C++",
+        "tokenizer": "c_tokenizer.py",
+        "command_executable": "python3",
+        "command_args": {
+            "ignore_comments": {
+                "name": "Ignore comments",
+                "argument": "--ignore_comments",
+                "default": true
+            }
+        },
+        "token_value": "type",
+        "default_hash_size": 14
+    },
+    "java": {
+        "name": "Java",
+        "tokenizer": "java_tokenizer.py",
+        "command_executable": "python3",
+        "token_value": "type",
+        "default_hash_size": 14
+    },
+    "mips": {
+        "name": "MIPS Assembly",
+        "tokenizer": "mips_tokenizer.py",
+        "command_executable": "python3",
+        "token_value": "type",
+        "default_hash_size": 5
+    }
 }
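Each entry in the refactored schema carries a display "name", an optional "command_args" map keyed by argument id, and a per-language "default_hash_size", replacing the old flat list of flags and the "input_as_argument" switch. A quick sanity-check sketch against that shape (the path and the exact assertions are illustrative, not part of the commit):

    import json

    # Validate the refactored schema: every entry needs these keys; the
    # "command_args" map and per-argument "default" flag are optional.
    with open("tokenizer/data.json") as f:
        data = json.load(f)

    for lang, entry in data.items():
        for key in ("name", "tokenizer", "command_executable",
                    "token_value", "default_hash_size"):
            assert key in entry, f"{lang} is missing {key}"
        for arg, spec in entry.get("command_args", {}).items():
            assert "name" in spec and "argument" in spec, f"{lang}/{arg} is malformed"

    print("all", len(data), "language entries look well-formed")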
