Skip to content

Commit 7ba0d8d

Browse files
[Feature:Plagiarism] Add flag to ignore C++ comments (#69)
1 parent fd22c02 commit 7ba0d8d

File tree

7 files changed

+560
-68
lines changed

7 files changed

+560
-68
lines changed

bin/tokenize_all.py

+2 -1
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,8 @@ def tokenize(lichen_config_data, my_concatenated_file, my_tokenized_file):
2727

2828
tokenizer = f"./{language_token_data['tokenizer']}"
2929

30-
if not language_token_data.get("input_as_argument"):
30+
if language_token_data.get('input_as_argument') is not None and \
31+
language_token_data['input_as_argument'] is not False:
3132
my_concatenated_file = f'< {my_concatenated_file}'
3233

3334
if "command_args" in language_token_data:

tests/data/tokenizer/c/expected_output/output.json

+69 -33
Original file line number | Diff line number | Diff line change
@@ -107,6 +107,12 @@
107107
"type": "PUNCTUATION-;",
108108
"value": ";"
109109
},
110+
{
111+
"char": 21,
112+
"line": 6,
113+
"type": "COMMENT",
114+
"value": "// define a variable"
115+
},
110116
{
111117
"char": 5,
112118
"line": 7,
@@ -149,6 +155,12 @@
149155
"type": "PUNCTUATION-;",
150156
"value": ";"
151157
},
158+
{
159+
"char": 39,
160+
"line": 7,
161+
"type": "COMMENT",
162+
"value": "// define a variable and set it equal to 1"
163+
},
152164
{
153165
"char": 5,
154166
"line": 9,
@@ -173,6 +185,12 @@
173185
"type": "PUNCTUATION-;",
174186
"value": ";"
175187
},
188+
{
189+
"char": 43,
190+
"line": 9,
191+
"type": "COMMENT",
192+
"value": "// print something"
193+
},
176194
{
177195
"char": 5,
178196
"line": 10,
@@ -200,204 +218,222 @@
200218
{
201219
"char": 5,
202220
"line": 12,
221+
"type": "COMMENT",
222+
"value": "// loop from 1 to n and multiply the previous result by i"
223+
},
224+
{
225+
"char": 5,
226+
"line": 13,
203227
"type": "KEYWORD",
204228
"value": "for"
205229
},
206230
{
207231
"char": 8,
208-
"line": 12,
232+
"line": 13,
209233
"type": "PUNCTUATION-(",
210234
"value": "("
211235
},
212236
{
213237
"char": 9,
214-
"line": 12,
238+
"line": 13,
215239
"type": "KEYWORD",
216240
"value": "int"
217241
},
218242
{
219243
"char": 13,
220-
"line": 12,
244+
"line": 13,
221245
"type": "IDENTIFIER",
222246
"value": "i"
223247
},
224248
{
225249
"char": 15,
226-
"line": 12,
250+
"line": 13,
227251
"type": "PUNCTUATION-=",
228252
"value": "="
229253
},
230254
{
231255
"char": 17,
232-
"line": 12,
256+
"line": 13,
233257
"type": "LITERAL",
234258
"value": "1"
235259
},
236260
{
237261
"char": 18,
238-
"line": 12,
262+
"line": 13,
239263
"type": "PUNCTUATION-;",
240264
"value": ";"
241265
},
242266
{
243267
"char": 20,
244-
"line": 12,
268+
"line": 13,
245269
"type": "IDENTIFIER",
246270
"value": "i"
247271
},
248272
{
249273
"char": 22,
250-
"line": 12,
274+
"line": 13,
251275
"type": "PUNCTUATION-<=",
252276
"value": "<="
253277
},
254278
{
255279
"char": 24,
256-
"line": 12,
280+
"line": 13,
257281
"type": "IDENTIFIER",
258282
"value": "n"
259283
},
260284
{
261285
"char": 25,
262-
"line": 12,
286+
"line": 13,
263287
"type": "PUNCTUATION-;",
264288
"value": ";"
265289
},
266290
{
267291
"char": 27,
268-
"line": 12,
292+
"line": 13,
269293
"type": "PUNCTUATION-++",
270294
"value": "++"
271295
},
272296
{
273297
"char": 29,
274-
"line": 12,
298+
"line": 13,
275299
"type": "IDENTIFIER",
276300
"value": "i"
277301
},
278302
{
279303
"char": 30,
280-
"line": 12,
304+
"line": 13,
281305
"type": "PUNCTUATION-)",
282306
"value": ")"
283307
},
284308
{
285309
"char": 5,
286-
"line": 13,
310+
"line": 14,
287311
"type": "PUNCTUATION-{",
288312
"value": "{"
289313
},
290314
{
291315
"char": 9,
292-
"line": 14,
316+
"line": 15,
293317
"type": "IDENTIFIER",
294318
"value": "factorial"
295319
},
296320
{
297321
"char": 19,
298-
"line": 14,
322+
"line": 15,
299323
"type": "PUNCTUATION-*=",
300324
"value": "*="
301325
},
302326
{
303327
"char": 22,
304-
"line": 14,
328+
"line": 15,
305329
"type": "IDENTIFIER",
306330
"value": "i"
307331
},
308332
{
309333
"char": 23,
310-
"line": 14,
334+
"line": 15,
311335
"type": "PUNCTUATION-;",
312336
"value": ";"
313337
},
338+
{
339+
"char": 9,
340+
"line": 16,
341+
"type": "COMMENT",
342+
"value": "/*\n factorial += i; // this doesn't work\n factorial -= i; // this doesn't work either\n */"
343+
},
314344
{
315345
"char": 5,
316-
"line": 15,
346+
"line": 20,
317347
"type": "PUNCTUATION-}",
318348
"value": "}"
319349
},
320350
{
321351
"char": 5,
322-
"line": 17,
352+
"line": 22,
323353
"type": "IDENTIFIER",
324354
"value": "cout"
325355
},
326356
{
327357
"char": 10,
328-
"line": 17,
358+
"line": 22,
329359
"type": "PUNCTUATION-<<",
330360
"value": "<<"
331361
},
332362
{
333363
"char": 13,
334-
"line": 17,
364+
"line": 22,
335365
"type": "LITERAL",
336366
"value": "\"Factorial of \""
337367
},
338368
{
339369
"char": 29,
340-
"line": 17,
370+
"line": 22,
341371
"type": "PUNCTUATION-<<",
342372
"value": "<<"
343373
},
344374
{
345375
"char": 32,
346-
"line": 17,
376+
"line": 22,
347377
"type": "IDENTIFIER",
348378
"value": "n"
349379
},
350380
{
351381
"char": 34,
352-
"line": 17,
382+
"line": 22,
353383
"type": "PUNCTUATION-<<",
354384
"value": "<<"
355385
},
356386
{
357387
"char": 37,
358-
"line": 17,
388+
"line": 22,
359389
"type": "LITERAL",
360390
"value": "\" = \""
361391
},
362392
{
363393
"char": 43,
364-
"line": 17,
394+
"line": 22,
365395
"type": "PUNCTUATION-<<",
366396
"value": "<<"
367397
},
368398
{
369399
"char": 46,
370-
"line": 17,
400+
"line": 22,
371401
"type": "IDENTIFIER",
372402
"value": "factorial"
373403
},
374404
{
375405
"char": 55,
376-
"line": 17,
406+
"line": 22,
377407
"type": "PUNCTUATION-;",
378408
"value": ";"
379409
},
410+
{
411+
"char": 57,
412+
"line": 22,
413+
"type": "COMMENT",
414+
"value": "// print the result"
415+
},
380416
{
381417
"char": 5,
382-
"line": 18,
418+
"line": 23,
383419
"type": "KEYWORD",
384420
"value": "return"
385421
},
386422
{
387423
"char": 12,
388-
"line": 18,
424+
"line": 23,
389425
"type": "LITERAL",
390426
"value": "0"
391427
},
392428
{
393429
"char": 13,
394-
"line": 18,
430+
"line": 23,
395431
"type": "PUNCTUATION-;",
396432
"value": ";"
397433
},
398434
{
399435
"char": 1,
400-
"line": 19,
436+
"line": 24,
401437
"type": "PUNCTUATION-}",
402438
"value": "}"
403439
}

0 commit comments

Comments
 (0)