Skip to content

Commit 330e7e1

Browse files
committed
Fix multiple interpretations and add -t <threshold>
Change-Id: I759f0156ac7a63a2b167be7f8c6692f1af0a9f02
1 parent 5595268 commit 330e7e1

File tree

4 files changed

+89
-76
lines changed

4 files changed

+89
-76
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased] - 3.2.5-5
99

10+
### Fixed
11+
12+
- Use the proper `-prob` flag so that tree-tagger outputs the different interpretations together with their probabilities
13+
1014
### Added
15+
- **Threshold Option**: Added `-t` option to set the probability threshold (default: 0.1)
1116
- **Probability-based Sorting**: Annotations now sorted in descending order by probability value
1217
- **Smart Lemma Handling**:
1318
- Lemmas are paired with their corresponding POS tags and sorted together

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,10 +50,10 @@ $ docker run --rm -i korap/conllu-treetagger < goe.conllu | head -8
5050
3 Frankreich Frankreich _ NE _ _ _ _ _
5151
```
5252

53-
To output different pos/lemma interpretations with their probabilities, use the `-p` option:
53+
To output different POS/lemma interpretations with their probabilities, use the `-p` option. You can optionally specify a probability threshold with `-t` (default: 0.1):
5454

5555
``` shell
56-
$ docker run --rm -i korap/conllu-treetagger -p < goe.conllu | head -8
56+
$ docker run --rm -i korap/conllu-treetagger -p -t 0.01 < goe.conllu | head -8
5757

5858
# foundry = tree_tagger
5959
# filename = GOE/AGA/00000/base/tokens.xml

docker-entrypoint.sh

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,20 @@ set -o pipefail
44

55
# Default values
66
lang="german"
7+
threshold="0.1"
78

89
usage() {
9-
echo "Usage: $0 [-h] [-l LANG] [-L]"
10+
echo "Usage: $0 [-h] [-l LANG] [-L] [-p] [-t THRESHOLD]"
1011
echo " -h Display this help message"
1112
echo " -l LANG Specify a language (default: $lang)"
1213
echo " -L List available languages/models"
14+
echo " -p Output probabilities for different interpretations"
15+
echo " -t THRESHOLD Set probability threshold (default: $threshold, requires -p)"
1316
exit 1
1417
}
1518

1619
# Parse command line options
17-
while getopts "hl:Lp" opt; do
20+
while getopts "hl:Lpt:" opt; do
1821
case $opt in
1922
h)
2023
usage
@@ -27,7 +30,10 @@ while getopts "hl:Lp" opt; do
2730
exit 0
2831
;;
2932
p)
30-
PROB="-proto-with-prob"
33+
USE_PROB=1
34+
;;
35+
t)
36+
threshold="$OPTARG"
3137
;;
3238
\?)
3339
echo "Invalid option: -$OPTARG" >&2
@@ -40,6 +46,11 @@ while getopts "hl:Lp" opt; do
4046
esac
4147
done
4248

49+
# Set PROB variable if -p was specified
50+
if [ -n "$USE_PROB" ]; then
51+
PROB="-threshold $threshold -prob"
52+
fi
53+
4354
if [ $OPTIND -le $# ]; then
4455
usage
4556
fi

korap-treetagger-processor/src/main.rs

Lines changed: 68 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -108,10 +108,6 @@ fn postprocess() -> anyhow::Result<()> {
108108

109109
// s/^(# *foundry *= *)base/$1 tree_tagger/
110110
if line.starts_with("#") && line.contains("foundry") && line.contains("base") {
111-
// Simple replacement for now, regex if needed
112-
// Perl: s/^(# *foundry *= *)base/$1 tree_tagger/
113-
// This keeps the prefix and changes base to tree_tagger
114-
// We can use regex for this to be safe
115111
let re = regex::Regex::new(r"^(# *foundry *= *)base").unwrap();
116112
line = re.replace(&line, "${1}tree_tagger").to_string();
117113
}
@@ -122,92 +118,93 @@ fn postprocess() -> anyhow::Result<()> {
122118
id = 0;
123119
}
124120

125-
// my @cols = split("\t");
121+
// Split by tabs
126122
let cols: Vec<&str> = line.split('\t').collect();
127123

128-
if cols.len() == 3 {
129-
// print "$id\t$cols[0]\t$cols[2]\t_\t$cols[1]\t_\t_\t_\t_\t_"
130-
writeln!(writer, "{}\t{}\t{}\t_\t{}\t_\t_\t_\t_\t_", id, cols[0], cols[2], cols[1])?;
131-
} else if cols.len() > 3 {
132-
// my $extra = join(" ", @cols[3..$#cols]);
133-
let extra_parts = &cols[3..];
134-
let mut extra = extra_parts.join(" ");
135-
136-
// $extra =~ s/^[fsc]\s+//;
137-
if extra.starts_with("f ") || extra.starts_with("s ") || extra.starts_with("c ") {
138-
extra = extra[2..].to_string();
139-
}
140-
141-
// my @tags; my @probs; my @probs_cols = split(/\s+/, $extra);
142-
let probs_cols: Vec<&str> = extra.split_whitespace().collect();
124+
// Check if this is the new format with probabilities
125+
// New format: columns after the first contain spaces (e.g., "TAG lemma prob")
126+
// Old format: columns are just single values without spaces
127+
let has_prob_format = cols.len() >= 2 && cols[1..].iter().any(|col| col.contains(' '));
143128

144-
// Parse lemmas
145-
let lemmas: Vec<&str> = cols[2].split('|').collect();
146-
147-
// for (my $i=0; $i < @probs_cols; $i+=2)
148-
struct TagLemmaProb<'a> {
149-
tag: &'a str,
150-
lemma: &'a str,
151-
prob_str: &'a str,
129+
if !has_prob_format && cols.len() == 3 {
130+
// Handle simple 3-column format (word, tag, lemma) - no probabilities
131+
writeln!(writer, "{}\t{}\t{}\t_\t{}\t_\t_\t_\t_\t_", id, cols[0], cols[2], cols[1])?;
132+
}
133+
else if has_prob_format {
134+
// Handle new format: word \t TAG1 lemma1 prob1 \t TAG2 lemma2 prob2 \t ...
135+
struct TagLemmaProb {
136+
tag: String,
137+
lemma: String,
138+
prob_str: String,
152139
prob_val: f64,
153140
}
154141

155142
let mut triples: Vec<TagLemmaProb> = Vec::new();
156143

157-
for (i, chunk) in probs_cols.chunks(2).enumerate() {
158-
let lemma = if i < lemmas.len() { lemmas[i] } else { lemmas.last().unwrap_or(&"") };
144+
// First column is the word, remaining columns are "TAG lemma prob" triplets
145+
for col in &cols[1..] {
146+
let parts: Vec<&str> = col.split_whitespace().collect();
159147

160-
if chunk.len() >= 2 {
161-
let p_val = chunk[1].parse::<f64>().unwrap_or(0.0);
148+
if parts.len() >= 3 {
149+
// Format: TAG lemma prob
150+
let tag = parts[0];
151+
let lemma = parts[1];
152+
let prob_str = parts[2];
153+
let prob_val = prob_str.parse::<f64>().unwrap_or(0.0);
154+
162155
triples.push(TagLemmaProb {
163-
tag: chunk[0],
164-
lemma,
165-
prob_str: chunk[1],
166-
prob_val: p_val,
156+
tag: tag.to_string(),
157+
lemma: lemma.to_string(),
158+
prob_str: prob_str.to_string(),
159+
prob_val,
167160
});
168-
} else if chunk.len() == 1 {
161+
} else if parts.len() == 2 {
162+
// Fallback: TAG lemma (no prob)
163+
let tag = parts[0];
164+
let lemma = parts[1];
165+
169166
triples.push(TagLemmaProb {
170-
tag: chunk[0],
171-
lemma,
172-
prob_str: "0.0",
173-
prob_val: 0.0,
167+
tag: tag.to_string(),
168+
lemma: lemma.to_string(),
169+
prob_str: "1.0".to_string(),
170+
prob_val: 1.0,
174171
});
175172
}
176173
}
177174

178-
// Sort descending by prob_val
179-
triples.sort_by(|a, b| b.prob_val.partial_cmp(&a.prob_val).unwrap_or(std::cmp::Ordering::Equal));
180-
181-
let tags: Vec<&str> = triples.iter().map(|t| t.tag).collect();
182-
let lemmas_sorted: Vec<&str> = triples.iter().map(|t| t.lemma).collect();
183-
let probs: Vec<&str> = triples.iter().map(|t| t.prob_str).collect();
184-
185-
// my $xpos = join("|", @tags);
186-
let xpos = tags.join("|");
187-
188-
// Deduplicate lemmas if all are the same
189-
let unique_lemmas: Vec<&str> = lemmas_sorted.iter()
190-
.copied()
191-
.collect::<std::collections::HashSet<_>>()
192-
.into_iter()
193-
.collect();
194-
195-
let lemma_str = if unique_lemmas.len() == 1 {
196-
unique_lemmas[0].to_string()
175+
if triples.is_empty() {
176+
// Fallback to just printing the line as-is
177+
writeln!(writer, "{}", line)?;
197178
} else {
198-
lemmas_sorted.join("|")
199-
};
200-
201-
// my $misc = (scalar(@tags) == 1) ? "_" : join("|", @probs);
202-
let misc = if tags.len() == 1 {
203-
"_".to_string()
204-
} else {
205-
probs.join("|")
206-
};
179+
// Sort descending by prob_val
180+
triples.sort_by(|a, b| b.prob_val.partial_cmp(&a.prob_val).unwrap_or(std::cmp::Ordering::Equal));
207181

208-
// print "$id\t$cols[0]\t$cols[2]\t_\t$xpos\t_\t_\t_\t_\t$misc"
209-
writeln!(writer, "{}\t{}\t{}\t_\t{}\t_\t_\t_\t_\t{}", id, cols[0], lemma_str, xpos, misc)?;
182+
let tags: Vec<String> = triples.iter().map(|t| t.tag.clone()).collect();
183+
let lemmas_sorted: Vec<String> = triples.iter().map(|t| t.lemma.clone()).collect();
184+
let probs: Vec<String> = triples.iter().map(|t| t.prob_str.clone()).collect();
210185

186+
// Join tags with |
187+
let xpos = tags.join("|");
188+
189+
// Deduplicate lemmas if all are the same
190+
let unique_lemmas: std::collections::HashSet<String> = lemmas_sorted.iter().cloned().collect();
191+
192+
let lemma_str = if unique_lemmas.len() == 1 {
193+
lemmas_sorted[0].clone()
194+
} else {
195+
lemmas_sorted.join("|")
196+
};
197+
198+
// If only one tag, use "_" for misc, otherwise join probabilities
199+
let misc = if tags.len() == 1 {
200+
"_".to_string()
201+
} else {
202+
probs.join("|")
203+
};
204+
205+
// Output: id \t word \t lemma \t _ \t xpos \t _ \t _ \t _ \t _ \t misc
206+
writeln!(writer, "{}\t{}\t{}\t_\t{}\t_\t_\t_\t_\t{}", id, cols[0], lemma_str, xpos, misc)?;
207+
}
211208
} else {
212209
writeln!(writer, "{}", line)?;
213210
}

0 commit comments

Comments
 (0)