@@ -104,7 +104,9 @@ fn postprocess(mut input: impl BufRead, writer: &mut impl Write) -> anyhow::Resu
104104
105105 // $id++; $id=0 if(/^(#|\s*$)/);
106106 id += 1 ;
107- if line. starts_with ( '#' ) || line. trim ( ) . is_empty ( ) {
107+ // Only reset id for actual comment lines (which start with "# ")
108+ // Tokens starting with # (like hashtags) don't have a space after #
109+ if line. starts_with ( "# " ) || line. trim ( ) . is_empty ( ) {
108110 id = 0 ;
109111 }
110112
@@ -357,4 +359,59 @@ mod tests {
357359 let output_str = String :: from_utf8 ( output) . unwrap ( ) ;
358360 assert ! ( output_str. contains( "invalid ? utf8" ) ) ;
359361 }
362+
/// Regression test for GitHub issue #2: tokens that begin with `#` (hashtags)
/// must be numbered like any other token.
/// https://github.com/KorAP/conllu-treetagger-docker/issues/2
///
/// Tokens such as `#MeToo` used to be mistaken for comment lines, which reset
/// their index to 0 instead of continuing the running count.
#[test]
fn test_postprocess_hashtag_index() {
    // Simulated tree-tagger output, one token per line: word\tTAG\tlemma.
    // The hashtag appears both as the first token and in the middle of the
    // sequence, so a wrongly reset counter is visible in either position.
    let input = b"#MeToo\tNN\t<unknown>\ntest\tVVFIN\ttesten\nfoo\tNN\tfoo\nbar\tNN\tbar\n#MeToo\tNN\t<unknown>\nend\tNN\tend\n";
    let mut output = Vec::new();

    postprocess(&input[..], &mut output).unwrap();

    let output_str = String::from_utf8(output).unwrap();
    let lines: Vec<&str> = output_str.lines().collect();

    // Expected "index\tword\t" prefix of each output line, in input order.
    // Both hashtags must continue the count (1 and 5) rather than show 0.
    let expected_prefixes = [
        "1\t#MeToo\t",
        "2\ttest\t",
        "3\tfoo\t",
        "4\tbar\t",
        "5\t#MeToo\t",
        "6\tend\t",
    ];

    // Check the line count up front so that truncated output fails with the
    // whole output in the message instead of an index-out-of-bounds panic.
    assert!(
        lines.len() >= expected_prefixes.len(),
        "expected at least {} output lines, got {}: {:?}",
        expected_prefixes.len(),
        lines.len(),
        output_str
    );

    for (position, (line, prefix)) in lines.iter().zip(expected_prefixes.iter()).enumerate() {
        assert!(
            line.starts_with(prefix),
            "output line {} should start with {:?}, got: {}",
            position + 1,
            prefix,
            line
        );
    }
}
398+
/// Checks that a genuine comment line still restarts token numbering.
#[test]
fn test_postprocess_comment_line_resets_index() {
    // A comment in its wrapped `<# ...>` form, followed by a single token
    // in word\tTAG\tlemma format.
    let input = b"<# This is a comment>\nword\tNN\tword\n";
    let mut output = Vec::new();

    postprocess(&input[..], &mut output).unwrap();

    let output_str = String::from_utf8(output).unwrap();
    let lines: Vec<&str> = output_str.lines().collect();

    // Check the line count up front so that truncated output fails with the
    // whole output in the message instead of an index-out-of-bounds panic.
    assert!(
        lines.len() >= 2,
        "expected at least 2 output lines, got {}: {:?}",
        lines.len(),
        output_str
    );

    // The comment must be emitted as a plain `# ...` line with no index
    // column in front of it.
    assert!(
        lines[0].starts_with("# This is a comment"),
        "Comment line not preserved correctly: {}",
        lines[0]
    );

    // Numbering restarts at the comment, so the token after it gets index 1.
    assert!(
        lines[1].starts_with("1\tword\t"),
        "Word after comment should have index 1, got: {}",
        lines[1]
    );
}
360417}
0 commit comments