File tree 9 files changed +1167
-0
lines changed
9 files changed +1167
-0
lines changed Original file line number Diff line number Diff line change
1
+ *~
Original file line number Diff line number Diff line change
#!/usr/bin/env bash
#
# Build the plaintext tokenizer and regenerate its sample JSON outputs.
#
# Every path below is relative to the current working directory, so run this
# from the directory that contains tokenizer/ (the same directory the
# GIT_NLOHMANN_JSON checkout and bin/ are created in).
#
# Needs on PATH: git (only when the JSON library checkout is missing), clang++.
#
# Writes: output.json, output_ignore_newlines.json, output_to_lower.json,
#         output_ignore_punctuation.json, output_ignore_everything.json

# Stop at the first failing command, unset variable, or failed pipeline stage,
# so a failed clone or compile never falls through to running a missing or
# stale binary and overwriting the outputs.
set -euo pipefail

readonly src_location="."
# Not referenced anywhere below; kept so the script's settings stay unchanged.
readonly build_location="."
readonly bin_location="./bin"

# Checkout directory for nlohmann/json; only its include/ directory is used.
readonly nlohmann_dir="${src_location}/GIT_NLOHMANN_JSON/"

readonly tokenizer_src="tokenizer/plaintext/plaintext_tokenizer.cpp"
readonly tokenizer_input="tokenizer/plaintext/input.txt"
readonly tokenizer_bin="${bin_location}/plaintext_tokenizer.out"

#######################################
# Run the compiled tokenizer on the sample input.
# Globals:   tokenizer_bin (read), tokenizer_input (read)
# Arguments: $1  - file that receives the tokenizer's stdout
#            $2+ - flags passed through to the tokenizer (may be none)
# Outputs:   overwrites the file named by $1
# Returns:   the tokenizer's exit status
#######################################
run_tokenizer() {
  local output_file=$1
  shift
  "${tokenizer_bin}" "$@" < "${tokenizer_input}" > "${output_file}"
}

# Clone the JSON library only when its directory does not exist yet.
if [[ ! -d "${nlohmann_dir}" ]]; then
  echo 'should install'
  git clone --depth 1 https://github.com/nlohmann/json.git "${nlohmann_dir}"
fi

mkdir -p -- "${bin_location}"
clang++ -I "${nlohmann_dir}/include/" -std=c++11 -Wall \
  "${tokenizer_src}" -o "${tokenizer_bin}"

# One output file per flag combination.
run_tokenizer output.json
run_tokenizer output_ignore_newlines.json --ignore_newlines
run_tokenizer output_to_lower.json --to_lower
run_tokenizer output_ignore_punctuation.json --ignore_punctuation
run_tokenizer output_ignore_everything.json \
  --ignore_punctuation --ignore_numbers --ignore_newlines --to_lower
Original file line number Diff line number Diff line change
1
+ [
2
+ {
3
+ "char" : 5 ,
4
+ "line" : 1 ,
5
+ "type" : " string" ,
6
+ "value" : " A"
7
+ },
8
+ {
9
+ "char" : 7 ,
10
+ "line" : 1 ,
11
+ "type" : " string" ,
12
+ "value" : " Sample"
13
+ },
14
+ {
15
+ "char" : 14 ,
16
+ "line" : 1 ,
17
+ "type" : " string" ,
18
+ "value" : " File"
19
+ },
20
+ {
21
+ "char" : 18 ,
22
+ "line" : 1 ,
23
+ "type" : " newline" ,
24
+ "value" : " \n "
25
+ },
26
+ {
27
+ "char" : 1 ,
28
+ "line" : 2 ,
29
+ "type" : " newline" ,
30
+ "value" : " \n "
31
+ },
32
+ {
33
+ "char" : 1 ,
34
+ "line" : 3 ,
35
+ "type" : " string" ,
36
+ "value" : " This"
37
+ },
38
+ {
39
+ "char" : 6 ,
40
+ "line" : 3 ,
41
+ "type" : " string" ,
42
+ "value" : " file"
43
+ },
44
+ {
45
+ "char" : 11 ,
46
+ "line" : 3 ,
47
+ "type" : " string" ,
48
+ "value" : " contains"
49
+ },
50
+ {
51
+ "char" : 20 ,
52
+ "line" : 3 ,
53
+ "type" : " number" ,
54
+ "value" : " 1"
55
+ },
56
+ {
57
+ "char" : 22 ,
58
+ "line" : 3 ,
59
+ "type" : " punctuation" ,
60
+ "value" : " \" "
61
+ },
62
+ {
63
+ "char" : 23 ,
64
+ "line" : 3 ,
65
+ "type" : " string" ,
66
+ "value" : " sample"
67
+ },
68
+ {
69
+ "char" : 30 ,
70
+ "line" : 3 ,
71
+ "type" : " string" ,
72
+ "value" : " of"
73
+ },
74
+ {
75
+ "char" : 33 ,
76
+ "line" : 3 ,
77
+ "type" : " string" ,
78
+ "value" : " plaintext"
79
+ },
80
+ {
81
+ "char" : 42 ,
82
+ "line" : 3 ,
83
+ "type" : " punctuation" ,
84
+ "value" : " \" "
85
+ },
86
+ {
87
+ "char" : 43 ,
88
+ "line" : 3 ,
89
+ "type" : " punctuation" ,
90
+ "value" : " ."
91
+ },
92
+ {
93
+ "char" : 46 ,
94
+ "line" : 3 ,
95
+ "type" : " string" ,
96
+ "value" : " We"
97
+ },
98
+ {
99
+ "char" : 48 ,
100
+ "line" : 3 ,
101
+ "type" : " newline" ,
102
+ "value" : " \n "
103
+ },
104
+ {
105
+ "char" : 1 ,
106
+ "line" : 4 ,
107
+ "type" : " string" ,
108
+ "value" : " can"
109
+ },
110
+ {
111
+ "char" : 5 ,
112
+ "line" : 4 ,
113
+ "type" : " string" ,
114
+ "value" : " tokenize"
115
+ },
116
+ {
117
+ "char" : 14 ,
118
+ "line" : 4 ,
119
+ "type" : " string" ,
120
+ "value" : " THIS"
121
+ },
122
+ {
123
+ "char" : 18 ,
124
+ "line" : 4 ,
125
+ "type" : " punctuation" ,
126
+ "value" : " ."
127
+ },
128
+ {
129
+ "char" : 21 ,
130
+ "line" : 4 ,
131
+ "type" : " string" ,
132
+ "value" : " a"
133
+ },
134
+ {
135
+ "char" : 22 ,
136
+ "line" : 4 ,
137
+ "type" : " punctuation" ,
138
+ "value" : " ."
139
+ },
140
+ {
141
+ "char" : 23 ,
142
+ "line" : 4 ,
143
+ "type" : " string" ,
144
+ "value" : " b"
145
+ },
146
+ {
147
+ "char" : 24 ,
148
+ "line" : 4 ,
149
+ "type" : " punctuation" ,
150
+ "value" : " ."
151
+ },
152
+ {
153
+ "char" : 25 ,
154
+ "line" : 4 ,
155
+ "type" : " string" ,
156
+ "value" : " c"
157
+ },
158
+ {
159
+ "char" : 26 ,
160
+ "line" : 4 ,
161
+ "type" : " number" ,
162
+ "value" : " 1"
163
+ },
164
+ {
165
+ "char" : 27 ,
166
+ "line" : 4 ,
167
+ "type" : " string" ,
168
+ "value" : " d"
169
+ },
170
+ {
171
+ "char" : 28 ,
172
+ "line" : 4 ,
173
+ "type" : " number" ,
174
+ "value" : " 2"
175
+ },
176
+ {
177
+ "char" : 29 ,
178
+ "line" : 4 ,
179
+ "type" : " string" ,
180
+ "value" : " e"
181
+ },
182
+ {
183
+ "char" : 30 ,
184
+ "line" : 4 ,
185
+ "type" : " punctuation" ,
186
+ "value" : " !"
187
+ },
188
+ {
189
+ "char" : 31 ,
190
+ "line" : 4 ,
191
+ "type" : " newline" ,
192
+ "value" : " \n "
193
+ },
194
+ {
195
+ "char" : 1 ,
196
+ "line" : 5 ,
197
+ "type" : " string" ,
198
+ "value" : " Good"
199
+ },
200
+ {
201
+ "char" : 5 ,
202
+ "line" : 5 ,
203
+ "type" : " punctuation" ,
204
+ "value" : " -"
205
+ },
206
+ {
207
+ "char" : 6 ,
208
+ "line" : 5 ,
209
+ "type" : " string" ,
210
+ "value" : " bye"
211
+ },
212
+ {
213
+ "char" : 9 ,
214
+ "line" : 5 ,
215
+ "type" : " punctuation" ,
216
+ "value" : " ."
217
+ },
218
+ {
219
+ "char" : 10 ,
220
+ "line" : 5 ,
221
+ "type" : " newline" ,
222
+ "value" : " \n "
223
+ },
224
+ {
225
+ "char" : 1 ,
226
+ "line" : 6 ,
227
+ "type" : " newline" ,
228
+ "value" : " \n "
229
+ }
230
+ ]
Original file line number Diff line number Diff line change
1
+ [
2
+ {
3
+ "char" : 5 ,
4
+ "line" : 1 ,
5
+ "type" : " string" ,
6
+ "value" : " a"
7
+ },
8
+ {
9
+ "char" : 7 ,
10
+ "line" : 1 ,
11
+ "type" : " string" ,
12
+ "value" : " sample"
13
+ },
14
+ {
15
+ "char" : 14 ,
16
+ "line" : 1 ,
17
+ "type" : " string" ,
18
+ "value" : " file"
19
+ },
20
+ {
21
+ "char" : 1 ,
22
+ "line" : 3 ,
23
+ "type" : " string" ,
24
+ "value" : " this"
25
+ },
26
+ {
27
+ "char" : 6 ,
28
+ "line" : 3 ,
29
+ "type" : " string" ,
30
+ "value" : " file"
31
+ },
32
+ {
33
+ "char" : 11 ,
34
+ "line" : 3 ,
35
+ "type" : " string" ,
36
+ "value" : " contains"
37
+ },
38
+ {
39
+ "char" : 23 ,
40
+ "line" : 3 ,
41
+ "type" : " string" ,
42
+ "value" : " sample"
43
+ },
44
+ {
45
+ "char" : 30 ,
46
+ "line" : 3 ,
47
+ "type" : " string" ,
48
+ "value" : " of"
49
+ },
50
+ {
51
+ "char" : 33 ,
52
+ "line" : 3 ,
53
+ "type" : " string" ,
54
+ "value" : " plaintext"
55
+ },
56
+ {
57
+ "char" : 46 ,
58
+ "line" : 3 ,
59
+ "type" : " string" ,
60
+ "value" : " we"
61
+ },
62
+ {
63
+ "char" : 1 ,
64
+ "line" : 4 ,
65
+ "type" : " string" ,
66
+ "value" : " can"
67
+ },
68
+ {
69
+ "char" : 5 ,
70
+ "line" : 4 ,
71
+ "type" : " string" ,
72
+ "value" : " tokenize"
73
+ },
74
+ {
75
+ "char" : 14 ,
76
+ "line" : 4 ,
77
+ "type" : " string" ,
78
+ "value" : " this"
79
+ },
80
+ {
81
+ "char" : 21 ,
82
+ "line" : 4 ,
83
+ "type" : " string" ,
84
+ "value" : " a"
85
+ },
86
+ {
87
+ "char" : 23 ,
88
+ "line" : 4 ,
89
+ "type" : " string" ,
90
+ "value" : " b"
91
+ },
92
+ {
93
+ "char" : 25 ,
94
+ "line" : 4 ,
95
+ "type" : " string" ,
96
+ "value" : " c"
97
+ },
98
+ {
99
+ "char" : 27 ,
100
+ "line" : 4 ,
101
+ "type" : " string" ,
102
+ "value" : " d"
103
+ },
104
+ {
105
+ "char" : 29 ,
106
+ "line" : 4 ,
107
+ "type" : " string" ,
108
+ "value" : " e"
109
+ },
110
+ {
111
+ "char" : 1 ,
112
+ "line" : 5 ,
113
+ "type" : " string" ,
114
+ "value" : " good"
115
+ },
116
+ {
117
+ "char" : 6 ,
118
+ "line" : 5 ,
119
+ "type" : " string" ,
120
+ "value" : " bye"
121
+ }
122
+ ]
You can’t perform that action at this time.
0 commit comments