Skip to content

Commit dd42f0d

Browse files
committed
Merge branch 'print_counts'
2 parents 343c4e5 + a2b4833 commit dd42f0d

File tree

8 files changed

+3097
-3
lines changed

8 files changed

+3097
-3
lines changed

example/counts/corpus.a

+1,000
Large diffs are not rendered by default.

example/counts/corpus.en

+1,000
Large diffs are not rendered by default.

example/counts/corpus.es

+1,000
Large diffs are not rendered by default.

example/counts/thrax-phrase.conf

+77
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
# this is an example Thrax configuration file
2+
# <- this symbol indicates a comment
3+
# each line should be a key-value pair separated by whitespace
4+
5+
###
6+
### GRAMMAR OPTIONS
7+
###
8+
9+
grammar hiero # or samt
10+
reverse false
11+
source-is-parsed false
12+
target-is-parsed false
13+
# default-nt X # X is the default anyway
14+
15+
min-rule-count 1
16+
17+
# the number of reducers
18+
reducers 16
19+
20+
# Maximum length of initial phrase pairs. These are set to be shorter than
21+
# those used by Hiero.
22+
initial-phrase-length 5
23+
lex-source-words 5
24+
lex-target-words 5
25+
26+
# maximum number of NTs in a rule
27+
arity 0
28+
29+
# minimum number of aligned terminals in a rule
30+
lexicality 1
31+
32+
# allow adjacent nonterminals on source side
33+
adjacent-nts false
34+
35+
# allow unaligned words at boundaries of phrases
36+
loose true
37+
38+
allow-abstract-rules false
39+
allow-nonlexical-x false
40+
allow-full-sentence-rules false
41+
42+
nonlex-source-length 5
43+
nonlex-target-length 5
44+
nonlex-source-words 5
45+
nonlex-target-words 5
46+
47+
allow-double-plus false
48+
49+
rule-span-limit 12
50+
51+
phrase-penalty 2.718
52+
53+
# a whitespace-separated list of features
54+
# in this example, the features are phrase translation probability,
55+
# lexical probability, rarity, phrase penalty, alignment, and count
56+
# features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count
57+
features e_given_f_phrase f_given_e_phrase e_given_f_lex f_given_e_lex rarity phrase-penalty alignment count
58+
59+
# "joshua" is the only option and the default; later we will want to add formats for other decoders such as moses and
60+
# cdec, if they use other formats
61+
output-format joshua
62+
63+
# label feature scores? each score will be output as name=score
64+
label-feature-scores false
65+
66+
amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
67+
amazon-jar s3://edu.jhu.cs.jonny/thrax.jar
68+
amazon-num-instances 15
69+
70+
max-split-size 8388608
71+
72+
# the format should be:
73+
# foreign sentence ||| english sentence ||| alignment
74+
# where the english is either parsed or not depending on whether you want
75+
# SAMT or you want Hiero.
76+
#input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en
77+
input-file pipeline-es-en-phrase-_export_projects_mpost_language-packs_es-en_1.3/input-file

src/edu/jhu/thrax/Thrax.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ private synchronized void scheduleJobs() throws SchedulerException {
100100
String features = BackwardsCompatibility.equivalent(conf.get("thrax.features", ""));
101101

102102
System.err.println("Running in mode: " + type);
103+
System.err.println("Features: " + features);
103104

104105
scheduler.schedule(VocabularyJob.class);
105106

@@ -174,7 +175,7 @@ public static void main(String[] argv) throws Exception {
174175
int returnCode = ToolRunner.run(null, new Thrax(), argv);
175176
System.exit(returnCode);
176177
}
177-
178+
178179
protected synchronized void workerDone(Class<? extends ThraxJob> theClass, boolean success) {
179180
try {
180181
scheduler.setState(theClass, success ? JobState.SUCCESS : JobState.FAILED);

src/edu/jhu/thrax/hadoop/features/annotation/AnnotationFeatureFactory.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ else if (name.equals(SourceGivenTargetLexicalProbabilityFeature.NAME))
2222
return new SourceGivenTargetLexicalProbabilityFeature();
2323
else if (name.equals(TargetGivenSourceLexicalProbabilityFeature.NAME))
2424
return new TargetGivenSourceLexicalProbabilityFeature();
25-
else if (name.equals(AlignmentFeature.NAME)) return new AlignmentFeature();
25+
else if (name.equals(AlignmentFeature.NAME))
26+
return new AlignmentFeature();
2627

2728
return null;
2829
}

src/edu/jhu/thrax/hadoop/features/mapred/MapReduceFeatureFactory.java

+1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import java.util.ArrayList;
44
import java.util.List;
55

6+
import edu.jhu.thrax.hadoop.features.annotation.CountFeature;
67
import edu.jhu.thrax.util.FormatUtils;
78

89
public class MapReduceFeatureFactory {

src/edu/jhu/thrax/util/FormatUtils.java

+15-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,10 @@ public static int[] applyIndices(int[] input, boolean monotonic) {
9999
public static Text ruleToText(RuleWritable r, Map<String, Writable> fs, boolean label,
100100
boolean sparse) {
101101
if (r == null) throw new IllegalArgumentException("Cannot convert a null rule to text.");
102+
103+
// Both alignment and count are features that are handled specially
102104
String alignment = null;
105+
String ruleCount = "";
103106
StringBuilder sb = new StringBuilder();
104107
sb.append(Vocabulary.word(r.lhs));
105108
sb.append(DELIM);
@@ -125,7 +128,10 @@ public static Text ruleToText(RuleWritable r, Map<String, Writable> fs, boolean
125128
for (String t : fs.keySet()) {
126129
String score;
127130
Writable val = fs.get(t);
128-
if (val instanceof FloatWritable) {
131+
if (t.equals("Count")) {
132+
ruleCount = String.format("%d", ((IntWritable) fs.get(t)).get());
133+
continue;
134+
} else if (val instanceof FloatWritable) {
129135
float value = ((FloatWritable) fs.get(t)).get();
130136
if (value == -0.0 || Math.abs(value) < 0.000005)
131137
score = "0";
@@ -150,6 +156,14 @@ public static Text ruleToText(RuleWritable r, Map<String, Writable> fs, boolean
150156
}
151157
if (alignment != null)
152158
sb.append(DELIMITER + " ").append(alignment + " ");
159+
160+
if (! ruleCount.equals("")) {
161+
// If there was no alignment, output a blank
162+
if (alignment == null)
163+
sb.append(DELIMITER + " ");
164+
165+
sb.append(DELIMITER + " ").append(ruleCount).append(" ");
166+
}
153167
return new Text(sb.substring(0, sb.length() - 1));
154168
}
155169

0 commit comments

Comments
 (0)