Skip to content

Commit 4c67520

Browse files
committed
rewriting determinize to work for FSAs and FSTs, including adding gallic, union, and generic semirings
1 parent a9381d9 commit 4c67520

File tree

19 files changed

+2356
-196
lines changed

19 files changed

+2356
-196
lines changed

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
<groupId>com.github.steveash.jopenfst</groupId>
2323
<artifactId>jopenfst</artifactId>
2424
<name>jopenfst</name>
25-
<version>0.2.0</version>
25+
<version>0.3.0</version>
2626
<description>Partial Java port of the OpenFST library; forked from the CMU Sphinx project</description>
2727
<packaging>jar</packaging>
2828

src/main/java/com/github/steveash/jopenfst/io/Convert.java

Lines changed: 66 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,18 @@
11
/*
2-
* Copyright 2014 Steve Ash
2+
* Copyright 2018 Steve Ash
33
*
4-
* Licensed under the Apache License, Version 2.0 (the "License");
5-
* you may not use this file except in compliance with the License.
6-
* You may obtain a copy of the License at
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
77
*
8-
* http://www.apache.org/licenses/LICENSE-2.0
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
915
*
10-
* Unless required by applicable law or agreed to in writing, software
11-
* distributed under the License is distributed on an "AS IS" BASIS,
12-
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
* See the License for the specific language governing permissions and
14-
* limitations under the License.
1516
*/
1617

1718
package com.github.steveash.jopenfst.io;
@@ -49,7 +50,7 @@
4950

5051
/**
5152
* Provides the required functionality in order to convert from/to openfst's text format
52-
*
53+
* <p>
5354
* NOTE that the original CMU implementation of this assumed that the symbols themselves were in the fst text format
5455
* and NOT the symbol ids (as described in the AT&T spec). There is a static flag (yuck, I know) to control whether
5556
* you expect symbols or symbol ids in the input/output text files (defaulting to expecting the symbols themselves
@@ -65,10 +66,13 @@ public class Convert {
6566

6667
// if true, then expect the tokens in the text format to be integer symbol ids and not the symbols themselves
6768
private static boolean useSymbolIdsInText = false;
69+
private static String regexToSplitOn = "\\t";
70+
private static boolean omitZeroStates = true;
6871

6972
/**
70-
* if true, then expects that the tokens in the input and output symbols are the integer ids of the token and not
73+
* If true, then expects that the tokens in the input and output symbols are the integer ids of the token and not
7174
* the token itself
75+
*
7276
* @return
7377
*/
7478
public static boolean isUseSymbolIdsInText() {
@@ -78,12 +82,54 @@ public static boolean isUseSymbolIdsInText() {
7882
/**
7983
* If true, then when importing an FST text file, it interprets the input/output symbol tokens as ids from the isymb/osymb tables
8084
* instead of the symbol values themselves (the strings)
85+
*
8186
* @param useSymbolIdsInText
8287
*/
8388
public static void setUseSymbolIdsInText(boolean useSymbolIdsInText) {
8489
Convert.useSymbolIdsInText = useSymbolIdsInText;
8590
}
8691

92+
/**
93+
* the regex to use to split the FST file; defaults to \\t to split on tabs
94+
*
95+
* @return
96+
*/
97+
public static String getRegexToSplitOn() {
98+
return regexToSplitOn;
99+
}
100+
101+
/**
102+
* sets the regex to use to split the FST file; defaults to \\t but can be set to \\s+ to relax
103+
* the whitespace requirements a little (which can be convenient)
104+
*
105+
* @param regexToSplitOn
106+
*/
107+
public static void setRegexToSplitOn(String regexToSplitOn) {
108+
Convert.regexToSplitOn = regexToSplitOn;
109+
}
110+
111+
/**
112+
* If true (default) then states with a zero weight (i.e. non-final states) aren't printed at the start of the
113+
* file (except the start state; that's always printed first); omitting zero states is also the behavior of
114+
* openfst
115+
*
116+
* @return
117+
*/
118+
public static boolean isOmitZeroStates() {
119+
return omitZeroStates;
120+
}
121+
122+
/**
123+
* If true (default) then states with a zero weight (i.e. non-final states) aren't printed at the start of the
124+
* file (except the start state; that's always printed first); omitting zero states is also the behavior of
125+
* openfst
126+
*
127+
* @param omitZeroStates
128+
*/
129+
public static void setOmitZeroStates(boolean omitZeroStates) {
130+
Convert.omitZeroStates = omitZeroStates;
131+
}
132+
87133
/**
88134
* Exports an fst to the openfst text format. Several files are created as follows: - basename.input.syms -
89135
* basename.output.syms - basename.fst.txt See <a href="http://www.openfst.org/twiki/bin/view/FST/FstQuickTour">OpenFst
@@ -121,7 +167,10 @@ private static void exportFst(Fst fst, String filename) {
121167
int numStates = fst.getStateCount();
122168
for (int i = 0; i < numStates; i++) {
123169
State s = fst.getState(i);
124-
if (s.getId() != fst.getStartState().getId()) {
170+
if (s.getId() == fst.getStartState().getId()) {
171+
continue;
172+
}
173+
if (fst.getSemiring().isNotZero(s.getFinalWeight()) || !omitZeroStates) {
125174
out.println(s.getId() + "\t" + s.getFinalWeight());
126175
}
127176
}
@@ -145,8 +194,8 @@ private static void exportFst(Fst fst, String filename) {
145194
}
146195

147196
out.println(s.getId() + "\t" + arc.getNextState().getId()
148-
+ "\t" + isym + "\t" + osym + "\t"
149-
+ arc.getWeight());
197+
+ "\t" + isym + "\t" + osym + "\t"
198+
+ arc.getWeight());
150199
}
151200
}
152201

@@ -312,7 +361,7 @@ private static MutableFst convertFrom(CharSource fstSource, Optional<MutableSymb
312361
continue;
313362
}
314363
try {
315-
String[] tokens = line.split("\\t");
364+
String[] tokens = line.split(regexToSplitOn);
316365
Integer inputStateId;
317366
if (ssyms == null) {
318367
inputStateId = Integer.parseInt(tokens[0]);
@@ -374,7 +423,7 @@ private static MutableFst convertFrom(CharSource fstSource, Optional<MutableSymb
374423
}
375424
} catch (RuntimeException e) {
376425
throw new RuntimeException("Problem converting and parsing line " + lineNo + " from FST input file. Line: " +
377-
line, e);
426+
line, e);
378427
}
379428
}
380429
} catch (IOException e) {

0 commit comments

Comments
 (0)