11/*
2- * Copyright 2014 Steve Ash
2+ * Copyright 2018 Steve Ash
33 *
4- * Licensed under the Apache License, Version 2.0 (the "License");
5- * you may not use this file except in compliance with the License.
6- * You may obtain a copy of the License at
4+ * Licensed under the Apache License, Version 2.0 (the "License");
5+ * you may not use this file except in compliance with the License.
6+ * You may obtain a copy of the License at
77 *
8- * http://www.apache.org/licenses/LICENSE-2.0
8+ * http://www.apache.org/licenses/LICENSE-2.0
9+ *
10+ * Unless required by applicable law or agreed to in writing, software
11+ * distributed under the License is distributed on an "AS IS" BASIS,
12+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+ * See the License for the specific language governing permissions and
14+ * limitations under the License.
915 *
10- * Unless required by applicable law or agreed to in writing, software
11- * distributed under the License is distributed on an "AS IS" BASIS,
12- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13- * See the License for the specific language governing permissions and
14- * limitations under the License.
1516 */
1617
1718package com .github .steveash .jopenfst .io ;
4950
5051/**
5152 * Provides the required functionality in order to convert from/to openfst's text format
52- *
53+ * <p>
5354 * NOTE that the original CMU implementation of this assumed that the symbols themselves were in the fst text format
5455 * and NOT the symbol ids (as described in the AT&T spec). There is a static flag (yuck, I know) to control whether
5556 * you expect symbols or symbol ids in the input/output text files (defaulting to expecting the symbols themselves
@@ -65,10 +66,13 @@ public class Convert {
6566
6667 // if true, then expect the tokens in the text format to be integer symbol ids and not the symbols themselves
6768 private static boolean useSymbolIdsInText = false ;
69+ private static String regexToSplitOn = "\\t" ; // regex handed to line.split(...) while parsing; default splits on tabs
70+ private static boolean omitZeroStates = true ; // if true, export skips non-start states whose final weight is zero
6871
6972 /**
70- * if true, then expects that the tokens in the input and output symbols are the integer ids of the token and not
73+ * If true, then expects that the tokens in the input and output symbols are the integer ids of the token and not
7174 * the token itself
75+ *
7276 * @return true if the tokens are expected to be integer symbol ids, false if they are the symbols themselves
7377 */
7478 public static boolean isUseSymbolIdsInText () {
@@ -78,12 +82,54 @@ public static boolean isUseSymbolIdsInText() {
7882 /**
7983 * Sets the static flag controlling whether tokens in the FST text format are expected to be integer symbol ids
8084 * from the isymb/osymb tables instead of the symbol values themselves (the strings).
85+ *
8186 * @param useSymbolIdsInText true to expect integer symbol ids, false to expect the symbols themselves
8287 */
8388 public static void setUseSymbolIdsInText (boolean useSymbolIdsInText ) {
8489 Convert .useSymbolIdsInText = useSymbolIdsInText ;
8590 }
8691
92+ /**
93+ * Returns the regex used to split each line of the FST file into tokens; defaults to \\t to split on tabs.
94+ *
95+ * @return the current value of the static {@code regexToSplitOn} setting
96+ */
97+ public static String getRegexToSplitOn () {
98+ return regexToSplitOn ;
99+ }
100+
101+ /**
102+ * Sets the regex to use to split the FST file; defaults to \\t but can be set to \\s+ to relax
103+ * the whitespace requirements a little (which can be convenient).
104+ *
105+ * @param regexToSplitOn the regex to store in the static setting; it is stored as given, without validation
106+ */
107+ public static void setRegexToSplitOn (String regexToSplitOn ) {
108+ Convert .regexToSplitOn = regexToSplitOn ;
109+ }
110+
111+ /**
112+ * If true (default) then states with a zero weight (i.e. non-final states) aren't printed at the start of the
113+ * file (except the start state; that's always printed first); omitting zero states is also the behavior of
114+ * openfst.
115+ *
116+ * @return the current value of the static {@code omitZeroStates} flag
117+ */
118+ public static boolean isOmitZeroStates () {
119+ return omitZeroStates ;
120+ }
121+
122+ /**
123+ * Sets the flag that, when true (default), keeps states with a zero weight (i.e. non-final states) from being
124+ * printed at the start of the file (except the start state; that's always printed first); omitting zero states
125+ * is also the behavior of openfst.
126+ *
127+ * @param omitZeroStates true to omit zero-weight states from the export, false to print every state
128+ */
129+ public static void setOmitZeroStates (boolean omitZeroStates ) {
130+ Convert .omitZeroStates = omitZeroStates ;
131+ }
132+
87133 /**
88134 * Exports an fst to the openfst text format. Several files are created as follows: - basename.input.syms -
89135 * basename.output.syms - basename.fst.txt See <a href="http://www.openfst.org/twiki/bin/view/FST/FstQuickTour">OpenFst
@@ -121,7 +167,10 @@ private static void exportFst(Fst fst, String filename) {
121167 int numStates = fst .getStateCount ();
122168 for (int i = 0 ; i < numStates ; i ++) {
123169 State s = fst .getState (i );
124- if (s .getId () != fst .getStartState ().getId ()) {
170+ if (s .getId () == fst .getStartState ().getId ()) {
171+ continue ;
172+ }
173+ if (fst .getSemiring ().isNotZero (s .getFinalWeight ()) || !omitZeroStates ) {
125174 out .println (s .getId () + "\t " + s .getFinalWeight ());
126175 }
127176 }
@@ -145,8 +194,8 @@ private static void exportFst(Fst fst, String filename) {
145194 }
146195
147196 out .println (s .getId () + "\t " + arc .getNextState ().getId ()
148- + "\t " + isym + "\t " + osym + "\t "
149- + arc .getWeight ());
197+ + "\t " + isym + "\t " + osym + "\t "
198+ + arc .getWeight ());
150199 }
151200 }
152201
@@ -312,7 +361,7 @@ private static MutableFst convertFrom(CharSource fstSource, Optional<MutableSymb
312361 continue ;
313362 }
314363 try {
315- String [] tokens = line .split (" \\ t" );
364+ String [] tokens = line .split (regexToSplitOn );
316365 Integer inputStateId ;
317366 if (ssyms == null ) {
318367 inputStateId = Integer .parseInt (tokens [0 ]);
@@ -374,7 +423,7 @@ private static MutableFst convertFrom(CharSource fstSource, Optional<MutableSymb
374423 }
375424 } catch (RuntimeException e ) {
376425 throw new RuntimeException ("Problem converting and parsing line " + lineNo + " from FST input file. Line: " +
377- line , e );
426+ line , e );
378427 }
379428 }
380429 } catch (IOException e ) {
0 commit comments