@@ -108,10 +108,6 @@ fn postprocess() -> anyhow::Result<()> {
108108
109109 // s/^(# *foundry *= *)base/$1 tree_tagger/
110110 if line. starts_with ( "#" ) && line. contains ( "foundry" ) && line. contains ( "base" ) {
111- // Simple replacement for now, regex if needed
112- // Perl: s/^(# *foundry *= *)base/$1 tree_tagger/
113- // This keeps the prefix and changes base to tree_tagger
114- // We can use regex for this to be safe
115111 let re = regex:: Regex :: new ( r"^(# *foundry *= *)base" ) . unwrap ( ) ;
116112 line = re. replace ( & line, "${1}tree_tagger" ) . to_string ( ) ;
117113 }
@@ -122,92 +118,93 @@ fn postprocess() -> anyhow::Result<()> {
122118 id = 0 ;
123119 }
124120
125- // my @cols = split("\t");
121+ // Split by tabs
126122 let cols: Vec < & str > = line. split ( '\t' ) . collect ( ) ;
127123
128- if cols. len ( ) == 3 {
129- // print "$id\t$cols[0]\t$cols[2]\t_\t$cols[1]\t_\t_\t_\t_\t_"
130- writeln ! ( writer, "{}\t {}\t {}\t _\t {}\t _\t _\t _\t _\t _" , id, cols[ 0 ] , cols[ 2 ] , cols[ 1 ] ) ?;
131- } else if cols. len ( ) > 3 {
132- // my $extra = join(" ", @cols[3..$#cols]);
133- let extra_parts = & cols[ 3 ..] ;
134- let mut extra = extra_parts. join ( " " ) ;
135-
136- // $extra =~ s/^[fsc]\s+//;
137- if extra. starts_with ( "f " ) || extra. starts_with ( "s " ) || extra. starts_with ( "c " ) {
138- extra = extra[ 2 ..] . to_string ( ) ;
139- }
140-
141- // my @tags; my @probs; my @probs_cols = split(/\s+/, $extra);
142- let probs_cols: Vec < & str > = extra. split_whitespace ( ) . collect ( ) ;
124+ // Check if this is the new format with probabilities
125+ // New format: columns after the first contain spaces (e.g., "TAG lemma prob")
126+ // Old format: columns are just single values without spaces
127+ let has_prob_format = cols. len ( ) >= 2 && cols[ 1 ..] . iter ( ) . any ( |col| col. contains ( ' ' ) ) ;
143128
144- // Parse lemmas
145- let lemmas: Vec < & str > = cols[ 2 ] . split ( '|' ) . collect ( ) ;
146-
147- // for (my $i=0; $i < @probs_cols; $i+=2)
148- struct TagLemmaProb < ' a > {
149- tag : & ' a str ,
150- lemma : & ' a str ,
151- prob_str : & ' a str ,
129+ if !has_prob_format && cols. len ( ) == 3 {
130+ // Handle simple 3-column format (word, tag, lemma) - no probabilities
131+ writeln ! ( writer, "{}\t {}\t {}\t _\t {}\t _\t _\t _\t _\t _" , id, cols[ 0 ] , cols[ 2 ] , cols[ 1 ] ) ?;
132+ }
133+ else if has_prob_format {
134+ // Handle new format: word \t TAG1 lemma1 prob1 \t TAG2 lemma2 prob2 \t ...
// One candidate reading for the current token, built from a single
// whitespace-separated "TAG lemma prob" column of the input line.
// Fields are owned Strings, copied out of the column's parts.
135+ struct TagLemmaProb {
// First whitespace-separated field of the column.
136+ tag : String ,
// Second whitespace-separated field of the column.
137+ lemma : String ,
// Probability text exactly as it appeared in the third field; a fixed
// default string is stored when the column has only two fields.
138+ prob_str : String ,
// Parsed probability used as the descending sort key; falls back to 0.0
// when the text does not parse as f64, and is 1.0 when the column
// carried no probability field.
152139 prob_val : f64 ,
153140 }
154141
155142 let mut triples: Vec < TagLemmaProb > = Vec :: new ( ) ;
156143
157- for ( i, chunk) in probs_cols. chunks ( 2 ) . enumerate ( ) {
158- let lemma = if i < lemmas. len ( ) { lemmas[ i] } else { lemmas. last ( ) . unwrap_or ( & "" ) } ;
144+ // First column is the word, remaining columns are "TAG lemma prob" triplets
145+ for col in & cols[ 1 ..] {
146+ let parts: Vec < & str > = col. split_whitespace ( ) . collect ( ) ;
159147
160- if chunk. len ( ) >= 2 {
161- let p_val = chunk[ 1 ] . parse :: < f64 > ( ) . unwrap_or ( 0.0 ) ;
148+ if parts. len ( ) >= 3 {
149+ // Format: TAG lemma prob
150+ let tag = parts[ 0 ] ;
151+ let lemma = parts[ 1 ] ;
152+ let prob_str = parts[ 2 ] ;
153+ let prob_val = prob_str. parse :: < f64 > ( ) . unwrap_or ( 0.0 ) ;
154+
162155 triples. push ( TagLemmaProb {
163- tag : chunk [ 0 ] ,
164- lemma,
165- prob_str : chunk [ 1 ] ,
166- prob_val : p_val ,
156+ tag : tag . to_string ( ) ,
157+ lemma : lemma . to_string ( ) ,
158+ prob_str : prob_str . to_string ( ) ,
159+ prob_val,
167160 } ) ;
168- } else if chunk. len ( ) == 1 {
161+ } else if parts. len ( ) == 2 {
162+ // Fallback: TAG lemma (no prob)
163+ let tag = parts[ 0 ] ;
164+ let lemma = parts[ 1 ] ;
165+
169166 triples. push ( TagLemmaProb {
170- tag : chunk [ 0 ] ,
171- lemma,
172- prob_str : "0 .0" ,
173- prob_val : 0 .0,
167+ tag : tag . to_string ( ) ,
168+ lemma : lemma . to_string ( ) ,
169+ prob_str : "1 .0" . to_string ( ) ,
170+ prob_val : 1 .0,
174171 } ) ;
175172 }
176173 }
177174
178- // Sort descending by prob_val
179- triples. sort_by ( |a, b| b. prob_val . partial_cmp ( & a. prob_val ) . unwrap_or ( std:: cmp:: Ordering :: Equal ) ) ;
180-
181- let tags: Vec < & str > = triples. iter ( ) . map ( |t| t. tag ) . collect ( ) ;
182- let lemmas_sorted: Vec < & str > = triples. iter ( ) . map ( |t| t. lemma ) . collect ( ) ;
183- let probs: Vec < & str > = triples. iter ( ) . map ( |t| t. prob_str ) . collect ( ) ;
184-
185- // my $xpos = join("|", @tags);
186- let xpos = tags. join ( "|" ) ;
187-
188- // Deduplicate lemmas if all are the same
189- let unique_lemmas: Vec < & str > = lemmas_sorted. iter ( )
190- . copied ( )
191- . collect :: < std:: collections:: HashSet < _ > > ( )
192- . into_iter ( )
193- . collect ( ) ;
194-
195- let lemma_str = if unique_lemmas. len ( ) == 1 {
196- unique_lemmas[ 0 ] . to_string ( )
175+ if triples. is_empty ( ) {
176+ // Fallback to just printing the line as-is
177+ writeln ! ( writer, "{}" , line) ?;
197178 } else {
198- lemmas_sorted. join ( "|" )
199- } ;
200-
201- // my $misc = (scalar(@tags) == 1) ? "_" : join("|", @probs);
202- let misc = if tags. len ( ) == 1 {
203- "_" . to_string ( )
204- } else {
205- probs. join ( "|" )
206- } ;
179+ // Sort descending by prob_val
180+ triples. sort_by ( |a, b| b. prob_val . partial_cmp ( & a. prob_val ) . unwrap_or ( std:: cmp:: Ordering :: Equal ) ) ;
207181
208- // print "$id\t$cols[0]\t$cols[2]\t_\t$xpos\t_\t_\t_\t_\t$misc"
209- writeln ! ( writer, "{}\t {}\t {}\t _\t {}\t _\t _\t _\t _\t {}" , id, cols[ 0 ] , lemma_str, xpos, misc) ?;
182+ let tags: Vec < String > = triples. iter ( ) . map ( |t| t. tag . clone ( ) ) . collect ( ) ;
183+ let lemmas_sorted: Vec < String > = triples. iter ( ) . map ( |t| t. lemma . clone ( ) ) . collect ( ) ;
184+ let probs: Vec < String > = triples. iter ( ) . map ( |t| t. prob_str . clone ( ) ) . collect ( ) ;
210185
186+ // Join tags with |
187+ let xpos = tags. join ( "|" ) ;
188+
189+ // Deduplicate lemmas if all are the same
190+ let unique_lemmas: std:: collections:: HashSet < String > = lemmas_sorted. iter ( ) . cloned ( ) . collect ( ) ;
191+
192+ let lemma_str = if unique_lemmas. len ( ) == 1 {
193+ lemmas_sorted[ 0 ] . clone ( )
194+ } else {
195+ lemmas_sorted. join ( "|" )
196+ } ;
197+
198+ // If only one tag, use "_" for misc, otherwise join probabilities
199+ let misc = if tags. len ( ) == 1 {
200+ "_" . to_string ( )
201+ } else {
202+ probs. join ( "|" )
203+ } ;
204+
205+ // Output: id \t word \t lemma \t _ \t xpos \t _ \t _ \t _ \t _ \t misc
206+ writeln ! ( writer, "{}\t {}\t {}\t _\t {}\t _\t _\t _\t _\t {}" , id, cols[ 0 ] , lemma_str, xpos, misc) ?;
207+ }
211208 } else {
212209 writeln ! ( writer, "{}" , line) ?;
213210 }
0 commit comments