Skip to content

Commit 76bfc13

Browse files
authored
Initial Plaintext Tokenizer (#1)
1 parent a60eb1d commit 76bfc13

9 files changed

+1167
-0
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
*~

install.sh

+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env bash
#
# Build the plaintext tokenizer and regenerate its sample JSON outputs.
#
# Steps:
#   1. Shallow-clone nlohmann/json into ${nlohmann_dir} if that directory
#      does not exist yet (header-only dependency of the tokenizer).
#   2. Compile tokenizer/plaintext/plaintext_tokenizer.cpp with clang++
#      into ${bin_location}/plaintext_tokenizer.out.
#   3. Run the tokenizer over tokenizer/plaintext/input.txt once per flag
#      combination, writing each result to an output*.json file in the
#      current directory.
#
# All paths are relative, so run this from the repository root.
# Requires: git, clang++.

# Abort on the first failing command, on use of an unset variable, and on
# a failure anywhere in a pipeline. Without this, a failed clone or compile
# would fall through and overwrite the sample outputs using a missing or
# stale binary.
set -euo pipefail

src_location="."
# NOTE: build_location is not used by any step below; it is kept so the
# script's set of configuration variables stays unchanged.
build_location="."
bin_location="./bin"

# No trailing slash here: the include path below appends "/include".
nlohmann_dir="${src_location}/GIT_NLOHMANN_JSON"

# Fetch the JSON library only when it is not already checked out.
if [ ! -d "${nlohmann_dir}" ]; then
  echo 'should install'
  git clone --depth 1 https://github.com/nlohmann/json.git "${nlohmann_dir}"
fi

mkdir -p "${bin_location}"

tokenizer_bin="${bin_location}/plaintext_tokenizer.out"
sample_input="tokenizer/plaintext/input.txt"

clang++ -I "${nlohmann_dir}/include" -std=c++11 -Wall \
  tokenizer/plaintext/plaintext_tokenizer.cpp -o "${tokenizer_bin}"

# Regenerate one sample output per tokenizer option, plus one with every
# filtering option enabled at once.
"${tokenizer_bin}" < "${sample_input}" > output.json
"${tokenizer_bin}" --ignore_newlines < "${sample_input}" > output_ignore_newlines.json
"${tokenizer_bin}" --to_lower < "${sample_input}" > output_to_lower.json
"${tokenizer_bin}" --ignore_punctuation < "${sample_input}" > output_ignore_punctuation.json
"${tokenizer_bin}" --ignore_punctuation --ignore_numbers --ignore_newlines --to_lower \
  < "${sample_input}" > output_ignore_everything.json
23+
24+
25+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
[
2+
{
3+
"char": 5,
4+
"line": 1,
5+
"type": "string",
6+
"value": "A"
7+
},
8+
{
9+
"char": 7,
10+
"line": 1,
11+
"type": "string",
12+
"value": "Sample"
13+
},
14+
{
15+
"char": 14,
16+
"line": 1,
17+
"type": "string",
18+
"value": "File"
19+
},
20+
{
21+
"char": 18,
22+
"line": 1,
23+
"type": "newline",
24+
"value": "\n"
25+
},
26+
{
27+
"char": 1,
28+
"line": 2,
29+
"type": "newline",
30+
"value": "\n"
31+
},
32+
{
33+
"char": 1,
34+
"line": 3,
35+
"type": "string",
36+
"value": "This"
37+
},
38+
{
39+
"char": 6,
40+
"line": 3,
41+
"type": "string",
42+
"value": "file"
43+
},
44+
{
45+
"char": 11,
46+
"line": 3,
47+
"type": "string",
48+
"value": "contains"
49+
},
50+
{
51+
"char": 20,
52+
"line": 3,
53+
"type": "number",
54+
"value": "1"
55+
},
56+
{
57+
"char": 22,
58+
"line": 3,
59+
"type": "punctuation",
60+
"value": "\""
61+
},
62+
{
63+
"char": 23,
64+
"line": 3,
65+
"type": "string",
66+
"value": "sample"
67+
},
68+
{
69+
"char": 30,
70+
"line": 3,
71+
"type": "string",
72+
"value": "of"
73+
},
74+
{
75+
"char": 33,
76+
"line": 3,
77+
"type": "string",
78+
"value": "plaintext"
79+
},
80+
{
81+
"char": 42,
82+
"line": 3,
83+
"type": "punctuation",
84+
"value": "\""
85+
},
86+
{
87+
"char": 43,
88+
"line": 3,
89+
"type": "punctuation",
90+
"value": "."
91+
},
92+
{
93+
"char": 46,
94+
"line": 3,
95+
"type": "string",
96+
"value": "We"
97+
},
98+
{
99+
"char": 48,
100+
"line": 3,
101+
"type": "newline",
102+
"value": "\n"
103+
},
104+
{
105+
"char": 1,
106+
"line": 4,
107+
"type": "string",
108+
"value": "can"
109+
},
110+
{
111+
"char": 5,
112+
"line": 4,
113+
"type": "string",
114+
"value": "tokenize"
115+
},
116+
{
117+
"char": 14,
118+
"line": 4,
119+
"type": "string",
120+
"value": "THIS"
121+
},
122+
{
123+
"char": 18,
124+
"line": 4,
125+
"type": "punctuation",
126+
"value": "."
127+
},
128+
{
129+
"char": 21,
130+
"line": 4,
131+
"type": "string",
132+
"value": "a"
133+
},
134+
{
135+
"char": 22,
136+
"line": 4,
137+
"type": "punctuation",
138+
"value": "."
139+
},
140+
{
141+
"char": 23,
142+
"line": 4,
143+
"type": "string",
144+
"value": "b"
145+
},
146+
{
147+
"char": 24,
148+
"line": 4,
149+
"type": "punctuation",
150+
"value": "."
151+
},
152+
{
153+
"char": 25,
154+
"line": 4,
155+
"type": "string",
156+
"value": "c"
157+
},
158+
{
159+
"char": 26,
160+
"line": 4,
161+
"type": "number",
162+
"value": "1"
163+
},
164+
{
165+
"char": 27,
166+
"line": 4,
167+
"type": "string",
168+
"value": "d"
169+
},
170+
{
171+
"char": 28,
172+
"line": 4,
173+
"type": "number",
174+
"value": "2"
175+
},
176+
{
177+
"char": 29,
178+
"line": 4,
179+
"type": "string",
180+
"value": "e"
181+
},
182+
{
183+
"char": 30,
184+
"line": 4,
185+
"type": "punctuation",
186+
"value": "!"
187+
},
188+
{
189+
"char": 31,
190+
"line": 4,
191+
"type": "newline",
192+
"value": "\n"
193+
},
194+
{
195+
"char": 1,
196+
"line": 5,
197+
"type": "string",
198+
"value": "Good"
199+
},
200+
{
201+
"char": 5,
202+
"line": 5,
203+
"type": "punctuation",
204+
"value": "-"
205+
},
206+
{
207+
"char": 6,
208+
"line": 5,
209+
"type": "string",
210+
"value": "bye"
211+
},
212+
{
213+
"char": 9,
214+
"line": 5,
215+
"type": "punctuation",
216+
"value": "."
217+
},
218+
{
219+
"char": 10,
220+
"line": 5,
221+
"type": "newline",
222+
"value": "\n"
223+
},
224+
{
225+
"char": 1,
226+
"line": 6,
227+
"type": "newline",
228+
"value": "\n"
229+
}
230+
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
[
2+
{
3+
"char": 5,
4+
"line": 1,
5+
"type": "string",
6+
"value": "a"
7+
},
8+
{
9+
"char": 7,
10+
"line": 1,
11+
"type": "string",
12+
"value": "sample"
13+
},
14+
{
15+
"char": 14,
16+
"line": 1,
17+
"type": "string",
18+
"value": "file"
19+
},
20+
{
21+
"char": 1,
22+
"line": 3,
23+
"type": "string",
24+
"value": "this"
25+
},
26+
{
27+
"char": 6,
28+
"line": 3,
29+
"type": "string",
30+
"value": "file"
31+
},
32+
{
33+
"char": 11,
34+
"line": 3,
35+
"type": "string",
36+
"value": "contains"
37+
},
38+
{
39+
"char": 23,
40+
"line": 3,
41+
"type": "string",
42+
"value": "sample"
43+
},
44+
{
45+
"char": 30,
46+
"line": 3,
47+
"type": "string",
48+
"value": "of"
49+
},
50+
{
51+
"char": 33,
52+
"line": 3,
53+
"type": "string",
54+
"value": "plaintext"
55+
},
56+
{
57+
"char": 46,
58+
"line": 3,
59+
"type": "string",
60+
"value": "we"
61+
},
62+
{
63+
"char": 1,
64+
"line": 4,
65+
"type": "string",
66+
"value": "can"
67+
},
68+
{
69+
"char": 5,
70+
"line": 4,
71+
"type": "string",
72+
"value": "tokenize"
73+
},
74+
{
75+
"char": 14,
76+
"line": 4,
77+
"type": "string",
78+
"value": "this"
79+
},
80+
{
81+
"char": 21,
82+
"line": 4,
83+
"type": "string",
84+
"value": "a"
85+
},
86+
{
87+
"char": 23,
88+
"line": 4,
89+
"type": "string",
90+
"value": "b"
91+
},
92+
{
93+
"char": 25,
94+
"line": 4,
95+
"type": "string",
96+
"value": "c"
97+
},
98+
{
99+
"char": 27,
100+
"line": 4,
101+
"type": "string",
102+
"value": "d"
103+
},
104+
{
105+
"char": 29,
106+
"line": 4,
107+
"type": "string",
108+
"value": "e"
109+
},
110+
{
111+
"char": 1,
112+
"line": 5,
113+
"type": "string",
114+
"value": "good"
115+
},
116+
{
117+
"char": 6,
118+
"line": 5,
119+
"type": "string",
120+
"value": "bye"
121+
}
122+
]

0 commit comments

Comments
 (0)