10
10
import json
11
11
import time
12
12
import hashlib
13
+ from pathlib import Path
13
14
14
15
15
16
def parse_args ():
@@ -18,9 +19,9 @@ def parse_args():
18
19
return parser .parse_args ()
19
20
20
21
21
- def hasher (lichen_config_data , my_tokenized_file , my_hashes_file ):
22
- language = lichen_config_data ["language" ]
23
- sequence_length = int (lichen_config_data ["sequence_length" ])
22
+ def hasher (lichen_config , lichen_run_config , my_tokenized_file , my_hashes_file ):
23
+ language = lichen_run_config ["language" ]
24
+ sequence_length = int (lichen_run_config ["sequence_length" ])
24
25
25
26
data_json_path = "./data.json" # data.json is in the Lichen/bin directory after install
26
27
with open (data_json_path ) as token_data_file :
@@ -39,69 +40,76 @@ def hasher(lichen_config_data, my_tokenized_file, my_hashes_file):
39
40
token_values [x :x + sequence_length ]).encode ())
40
41
.hexdigest ())[0 :8 ] for x in range (0 , num - sequence_length + 1 )]
41
42
43
+ if len (token_hashed_values ) > lichen_config ["max_sequences_per_file" ]:
44
+ token_hashed_values = token_hashed_values [slice (0 , lichen_config ["max_sequences_per_file" ])] # noqa E501
45
+ print (f"File { my_hashes_file } truncated after exceeding max sequence limit" )
46
+
42
47
my_hf .write ('\n ' .join (token_hashed_values ))
43
48
44
49
45
50
def main():
    """Hash every tokenized submission for this Lichen run.

    Walks the run's `users` tree, the `other_gradeables` tree, and the
    `provided_code` directory, and converts each `tokens.json` into a
    `hashes.txt` via hasher().
    """
    start_time = time.time()
    args = parse_args()

    # Per-run settings (language, sequence_length, ...) come from the run's
    # own config.json under basepath.
    with open(Path(args.basepath, "config.json")) as run_config_fp:
        lichen_run_config = json.load(run_config_fp)

    # Tool-wide settings (e.g. max_sequences_per_file) sit beside this script.
    with open(Path(__file__).resolve().parent / "lichen_config.json") as tool_config_fp:
        lichen_config = json.load(tool_config_fp)

    print("HASH ALL...", end="")

    def subdirs(parent):
        # Yield the subdirectories of *parent* in sorted order, skipping
        # plain files (mirrors the listdir + isdir + continue pattern).
        for entry in sorted(os.listdir(parent)):
            candidate = Path(parent, entry)
            if os.path.isdir(candidate):
                yield candidate

    def hash_dir(directory):
        # Hash one submission directory: tokens.json -> hashes.txt.
        hasher(lichen_config, lichen_run_config,
               Path(directory, "tokens.json"),
               Path(directory, "hashes.txt"))

    # ==========================================================================
    # walk the subdirectories of this gradeable
    users_dir = Path(args.basepath, "users")
    if not os.path.isdir(users_dir):
        raise SystemExit("ERROR! Unable to find users directory")

    for user_dir in subdirs(users_dir):
        for version_dir in subdirs(user_dir):
            hash_dir(version_dir)

    # ==========================================================================
    # walk the subdirectories of the other gradeables
    other_gradeables_dir = Path(args.basepath, "other_gradeables")
    if not os.path.isdir(other_gradeables_dir):
        raise SystemExit("ERROR! Unable to find other gradeables directory")

    for other_gradeable_dir in subdirs(other_gradeables_dir):
        for other_user_dir in subdirs(other_gradeable_dir):
            for other_version_dir in subdirs(other_user_dir):
                hash_dir(other_version_dir)

    # ==========================================================================
    # hash the provided code (no isdir guard here, matching the other walks'
    # treatment of a fixed, expected path)
    hash_dir(Path(args.basepath, "provided_code"))

    # ==========================================================================
    end_time = time.time()
0 commit comments