Skip to content

Commit 743b622

Browse files
committed
Fix hashtag tagging
Comment lines must start with a '# ' sequence.

Resolves #2
Signed-off-by: Marc Kupietz <[email protected]>
Change-Id: I9b45957be067645f1361788b446a13a82b0d7551
1 parent c33c0ef commit 743b622

File tree

1 file changed

+58
-1
lines changed
  • korap-treetagger-processor/src

1 file changed

+58
-1
lines changed

korap-treetagger-processor/src/main.rs

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,9 @@ fn postprocess(mut input: impl BufRead, writer: &mut impl Write) -> anyhow::Resu
104104

105105
// $id++; $id=0 if(/^(#|\s*$)/);
106106
id += 1;
107-
if line.starts_with('#') || line.trim().is_empty() {
107+
// Only reset id for actual comment lines (which start with "# ")
108+
// Tokens starting with # (like hashtags) don't have a space after #
109+
if line.starts_with("# ") || line.trim().is_empty() {
108110
id = 0;
109111
}
110112

@@ -357,4 +359,59 @@ mod tests {
357359
let output_str = String::from_utf8(output).unwrap();
358360
assert!(output_str.contains("invalid ? utf8"));
359361
}
362+
363+
/// Regression test for GitHub issue #2: hashtags incorrectly always get index 0
364+
/// https://github.com/KorAP/conllu-treetagger-docker/issues/2
365+
///
366+
/// The bug was that tokens starting with # (like #MeToo) were incorrectly
367+
/// identified as comment lines, causing their index to reset to 0.
368+
#[test]
369+
fn test_postprocess_hashtag_index() {
370+
// Simulated tree-tagger output for tokens including hashtags
371+
// Format: word\tTAG\tlemma
372+
let input = b"#MeToo\tNN\t<unknown>\ntest\tVVFIN\ttesten\nfoo\tNN\tfoo\nbar\tNN\tbar\n#MeToo\tNN\t<unknown>\nend\tNN\tend\n";
373+
let mut output = Vec::new();
374+
375+
postprocess(&input[..], &mut output).unwrap();
376+
377+
let output_str = String::from_utf8(output).unwrap();
378+
let lines: Vec<&str> = output_str.lines().collect();
379+
380+
// #MeToo at position 1 should get index 1, not 0
381+
assert!(lines[0].starts_with("1\t#MeToo\t"), "First #MeToo should have index 1, got: {}", lines[0]);
382+
383+
// test at position 2 should get index 2
384+
assert!(lines[1].starts_with("2\ttest\t"), "test should have index 2, got: {}", lines[1]);
385+
386+
// foo at position 3 should get index 3
387+
assert!(lines[2].starts_with("3\tfoo\t"), "foo should have index 3, got: {}", lines[2]);
388+
389+
// bar at position 4 should get index 4
390+
assert!(lines[3].starts_with("4\tbar\t"), "bar should have index 4, got: {}", lines[3]);
391+
392+
// Second #MeToo at position 5 should get index 5, not 0
393+
assert!(lines[4].starts_with("5\t#MeToo\t"), "Second #MeToo should have index 5, got: {}", lines[4]);
394+
395+
// end at position 6 should get index 6
396+
assert!(lines[5].starts_with("6\tend\t"), "end should have index 6, got: {}", lines[5]);
397+
}
398+
399+
/// Test that actual comment lines still reset the index correctly
400+
#[test]
401+
fn test_postprocess_comment_line_resets_index() {
402+
// Comment line (wrapped in '<' ... '>') followed by a single token
403+
let input = b"<# This is a comment>\nword\tNN\tword\n";
404+
let mut output = Vec::new();
405+
406+
postprocess(&input[..], &mut output).unwrap();
407+
408+
let output_str = String::from_utf8(output).unwrap();
409+
let lines: Vec<&str> = output_str.lines().collect();
410+
411+
// Comment line should be emitted unwrapped, starting with "# " and without a token index
412+
assert!(lines[0].starts_with("# This is a comment"), "Comment line not preserved correctly: {}", lines[0]);
413+
414+
// Word after comment should have index 1
415+
assert!(lines[1].starts_with("1\tword\t"), "Word after comment should have index 1, got: {}", lines[1]);
416+
}
360417
}

0 commit comments

Comments
 (0)