
Commit 47a4e2d

[Refactor:Plagiarism] Refactor data.json (#76)
* progress
* Make the changes backwards-compatible
1 parent ed20eb9 commit 47a4e2d

File tree

2 files changed: +85 −49 lines changed


bin/tokenize_all.py

+24 −13
@@ -6,6 +6,7 @@
 import argparse
 import os
 import json
+import subprocess
 import humanize
 import datetime

@@ -19,28 +20,38 @@ def parse_args():
 def tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file):
     language = lichen_config_data["language"]

+    cli_args = list()
     language_token_data = dict()

     data_json_path = "./data.json"  # data.json is in the Lichen/bin directory after install
     with open(data_json_path, 'r') as token_data_file:
-        token_data = json.load(token_data_file)
-        language_token_data = token_data[language]
+        data_file = json.load(token_data_file)
+        language_token_data = data_file[language]
+        if "arguments" in lichen_config_data.keys():  # For backwards compatibility - TODO: Remove
+            for argument in lichen_config_data["arguments"]:
+                if argument in language_token_data["command_args"]:
+                    cli_args.append(language_token_data["command_args"][argument]["argument"])
+                else:
+                    print(f"Error: Unknown tokenization argument {argument}")
+        else:  # Use the default arguments
+            for argument in language_token_data["command_args"]:
+                if "default" in language_token_data["command_args"][argument].keys() and\
+                   language_token_data["command_args"][argument]["default"]:
+                    cli_args.append(language_token_data["command_args"][argument]["argument"])

     tokenizer = f"./{language_token_data['tokenizer']}"

-    if language_token_data.get('input_as_argument') is not None and \
-       language_token_data['input_as_argument'] is not False:
-        my_concatenated_file = f'< {my_concatenated_file}'
+    result = subprocess.run([language_token_data['command_executable'],
+                             tokenizer, my_concatenated_file] + cli_args,
+                            stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE)

-    if "command_args" in language_token_data:
-        cli_args = " ".join(language_token_data["command_args"])
-    else:
-        cli_args = ""
+    stderr = result.stderr.decode('utf-8')
+    if not stderr.isspace() and stderr is not None and stderr != '':
+        print(result.stderr.decode("utf-8"))

-    command = f"{language_token_data['command_executable']} {tokenizer} "\
-              f"{cli_args} {my_concatenated_file} > {my_tokenized_file}".strip()
-
-    os.system(command)
+    with open(my_tokenized_file, 'w') as file:
+        file.write(result.stdout.decode('utf-8'))


 def main():
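The refactor replaces the old string-built os.system command with subprocess.run, so the tokenizer's stdout is captured and written to the output file directly instead of being shell-redirected. A minimal sketch of the new argument-selection logic, runnable on its own (sample_entry below is a hypothetical stand-in for one language entry from data.json, not code from this commit):

    # Sketch of the default-argument selection used above; `sample_entry`
    # is an illustrative stand-in for one entry in tokenizer/data.json.
    sample_entry = {
        "command_args": {
            "ignore_newlines": {"argument": "--ignore_newlines", "default": True},
            "to_lower": {"argument": "--to_lower"}
        }
    }

    # With no explicit "arguments" list in the lichen config, only args
    # whose spec sets "default": true are passed to the tokenizer.
    cli_args = [spec["argument"]
                for spec in sample_entry["command_args"].values()
                if spec.get("default")]
    print(cli_args)  # ['--ignore_newlines']

Passing the argument list to subprocess.run (rather than interpolating it into a shell string) also sidesteps quoting problems if a concatenated-file path contains spaces.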

tokenizer/data.json

+61 −36
@@ -1,38 +1,63 @@
 {
-    "plaintext": {
-        "tokenizer": "plaintext_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": false,
-        "command_args": [
-            "--ignore_newlines"
-        ],
-        "token_value": "value"
-    },
-    "python": {
-        "tokenizer": "python_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": true,
-        "token_value": "type"
-    },
-    "cpp": {
-        "tokenizer": "c_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": false,
-        "command_args": [
-            "--ignore_comments"
-        ],
-        "token_value": "type"
-    },
-    "java": {
-        "tokenizer": "java_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": true,
-        "token_value": "type"
-    },
-    "mips": {
-        "tokenizer": "mips_tokenizer.py",
-        "command_executable": "python3",
-        "input_as_argument": true,
-        "token_value": "type"
-    }
+    "plaintext": {
+        "name": "Plain Text",
+        "tokenizer": "plaintext_tokenizer.py",
+        "command_executable": "python3",
+        "command_args": {
+            "ignore_punctuation": {
+                "name": "Ignore punctuation",
+                "argument": "--ignore_punctuation"
+            },
+            "to_lower": {
+                "name": "Convert to lower case",
+                "argument": "--to_lower"
+            },
+            "ignore_numbers": {
+                "name": "Ignore numbers",
+                "argument": "--ignore_numbers"
+            },
+            "ignore_newlines": {
+                "name": "Ignore newlines",
+                "argument": "--ignore_newlines",
+                "default": true
+            }
+        },
+        "token_value": "value",
+        "default_hash_size": 14
+    },
+    "python": {
+        "name": "Python",
+        "tokenizer": "python_tokenizer.py",
+        "command_executable": "python3",
+        "token_value": "type",
+        "default_hash_size": 14
+    },
+    "cpp": {
+        "name": "C/C++",
+        "tokenizer": "c_tokenizer.py",
+        "command_executable": "python3",
+        "command_args": {
+            "ignore_comments": {
+                "name": "Ignore comments",
+                "argument": "--ignore_comments",
+                "default": true
+            }
+        },
+        "token_value": "type",
+        "default_hash_size": 14
+    },
+    "java": {
+        "name": "Java",
+        "tokenizer": "java_tokenizer.py",
+        "command_executable": "python3",
+        "token_value": "type",
+        "default_hash_size": 14
+    },
+    "mips": {
+        "name": "MIPS Assembly",
+        "tokenizer": "mips_tokenizer.py",
+        "command_executable": "python3",
+        "token_value": "type",
+        "default_hash_size": 5
+    }
 }
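Each entry in the refactored schema carries a display "name", an optional "command_args" map keyed by argument id, and a per-language "default_hash_size", replacing the old flat list of flags and the "input_as_argument" switch. A quick sanity-check sketch against that shape (the path and the exact assertions are illustrative, not part of the commit):

    import json

    # Validate the refactored schema: every entry needs these keys; the
    # "command_args" map and per-argument "default" flag are optional.
    with open("tokenizer/data.json") as f:
        data = json.load(f)

    for lang, entry in data.items():
        for key in ("name", "tokenizer", "command_executable",
                    "token_value", "default_hash_size"):
            assert key in entry, f"{lang} is missing {key}"
        for arg, spec in entry.get("command_args", {}).items():
            assert "name" in spec and "argument" in spec, f"{lang}/{arg} is malformed"

    print("all", len(data), "language entries look well-formed")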
