
Commit a9b0850

Authored Dec 6, 2020

Merge pull request #63 from TensorSpeech/dev/decoding

Supported New RNN Transducer Beam Search

2 parents b7cb5d3 + d3cfb4d, commit a9b0850

25 files changed (+495, -463 lines)

‎examples/conformer/README.md

Lines changed: 7 additions & 3 deletions

@@ -108,6 +108,10 @@ TFLite Conversion, see `python examples/conformer/tflite_conformer.py --help`
 
 **Error Rates**
 
-| Test-clean | WER (%) | CER (%) |
-| :--------: | :-------: | :--------: |
-| _Greedy_ | 6.4476862 | 2.51828337 |
+| **Test-clean** | WER (%) | CER (%) |
+| :------------: | :-------: | :--------: |
+| _Greedy_ | 6.4476862 | 2.51828337 |
+
+| **Test-other** | WER (%) | CER (%) |
+| :------------: | :--------: | :--------: |
+| _Greedy_ | 15.7308521 | 7.67273521 |

‎examples/conformer/config.yml

Lines changed: 18 additions & 17 deletions

@@ -33,27 +33,28 @@ decoder_config:
 
 model_config:
   name: conformer
-  subsampling:
+  encoder_subsampling:
     type: conv2d
     filters: 144
     kernel_size: 3
     strides: 2
-  positional_encoding: sinusoid_concat
-  dmodel: 144
-  num_blocks: 16
-  head_size: 36
-  num_heads: 4
-  mha_type: relmha
-  kernel_size: 32
-  fc_factor: 0.5
-  dropout: 0.1
-  embed_dim: 320
-  embed_dropout: 0.1
-  num_rnns: 1
-  rnn_units: 320
-  rnn_type: lstm
-  layer_norm: True
-  projection_units: 0
+  encoder_positional_encoding: sinusoid_concat
+  encoder_dmodel: 144
+  encoder_num_blocks: 16
+  encoder_head_size: 36
+  encoder_num_heads: 4
+  encoder_mha_type: relmha
+  encoder_kernel_size: 32
+  encoder_fc_factor: 0.5
+  encoder_dropout: 0.1
+  prediction_embed_dim: 320
+  prediction_embed_dropout: 0.1
+  prediction_num_rnns: 1
+  prediction_rnn_units: 320
+  prediction_rnn_type: lstm
+  prediction_rnn_implementation: 1
+  prediction_layer_norm: True
+  prediction_projection_units: 0
   joint_dim: 320
 
 learning_config:

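All `model_config` keys are now prefixed by the sub-network they belong to (`encoder_*`, `prediction_*`). A minimal sketch of why the exact spelling matters, using a hand-written dict as a stand-in for the parsed `config.yml`: the example scripts expand this flat mapping directly into the `Conformer` constructor, so each YAML key must match a keyword argument.

```python
# Sketch only: a trimmed, hand-written stand-in for the model_config section
# above; the real scripts load it via tensorflow_asr.configs.config.Config.
model_config = {
    "name": "conformer",
    "encoder_subsampling": {"type": "conv2d", "filters": 144, "kernel_size": 3, "strides": 2},
    "encoder_dmodel": 144,
    "encoder_num_blocks": 16,
    "prediction_embed_dim": 320,
    "prediction_rnn_units": 320,
    "prediction_rnn_type": "lstm",
    "joint_dim": 320,
    # ... remaining encoder_* / prediction_* keys exactly as listed in the diff
}

# As in examples/demonstration/conformer.py later in this commit:
# conformer = Conformer(**model_config, vocabulary_size=text_featurizer.num_classes)
```
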
‎examples/conformer/test_conformer.py

Lines changed: 3 additions & 3 deletions

@@ -53,7 +53,7 @@
 setup_devices([args.device], cpu=args.cpu)
 
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
 from tensorflow_asr.runners.base_runners import BaseTester

@@ -67,15 +67,15 @@
 assert args.saved
 
 if args.tfrecords:
-    test_dataset = ASRTFRecordDataset(
+    test_dataset = ASRTFRecordTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,
         stage="test", shuffle=False
     )
 else:
-    test_dataset = ASRSliceDataset(
+    test_dataset = ASRSliceTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,

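The same two renames (`ASRTFRecordDataset` to `ASRTFRecordTestDataset`, `ASRSliceDataset` to `ASRSliceTestDataset`) appear in every test script touched by this commit (conformer, deepspeech2, jasper, streaming transducer, and their subword variants). A hedged sketch of the shared pattern, assuming `args`, `config`, `speech_featurizer`, and `text_featurizer` are prepared beforehand exactly as in those scripts:

```python
# Shared evaluation-dataset pattern across the test scripts in this commit.
# args, config, speech_featurizer and text_featurizer are assumed to be set
# up earlier, as in the scripts themselves.
from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset

if args.tfrecords:
    # evaluation data read from pre-generated TFRecords
    test_dataset = ASRTFRecordTestDataset(
        data_paths=config.learning_config.dataset_config.test_paths,
        tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="test", shuffle=False,
    )
else:
    # evaluation data built directly from the audio files listed in test_paths
    test_dataset = ASRSliceTestDataset(
        data_paths=config.learning_config.dataset_config.test_paths,
        speech_featurizer=speech_featurizer,
        text_featurizer=text_featurizer,
        stage="test", shuffle=False,  # assumed to mirror the TFRecord branch (cut off in the diff)
    )
```
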
‎examples/conformer/test_subword_conformer.py

Lines changed: 3 additions & 3 deletions

@@ -56,7 +56,7 @@
 setup_devices([args.device], cpu=args.cpu)
 
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
 from tensorflow_asr.runners.base_runners import BaseTester

@@ -75,15 +75,15 @@
 assert args.saved
 
 if args.tfrecords:
-    test_dataset = ASRTFRecordDataset(
+    test_dataset = ASRTFRecordTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,
         stage="test", shuffle=False
     )
 else:
-    test_dataset = ASRSliceDataset(
+    test_dataset = ASRSliceTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,

‎examples/conformer/train_ga_conformer.py

Lines changed: 6 additions & 7 deletions

@@ -113,16 +113,15 @@
 conformer._build(speech_featurizer.shape)
 conformer.summary(line_length=120)
 
-optimizer_config = config.learning_config.optimizer_config
 optimizer = tf.keras.optimizers.Adam(
     TransformerSchedule(
-        d_model=config.model_config["dmodel"],
-        warmup_steps=optimizer_config["warmup_steps"],
-        max_lr=(0.05 / math.sqrt(config.model_config["dmodel"]))
+        d_model=config.model_config["encoder_dmodel"],
+        warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
+        max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
     ),
-    beta_1=optimizer_config["beta1"],
-    beta_2=optimizer_config["beta2"],
-    epsilon=optimizer_config["epsilon"]
+    beta_1=config.learning_config.optimizer_config["beta1"],
+    beta_2=config.learning_config.optimizer_config["beta2"],
+    epsilon=config.learning_config.optimizer_config["epsilon"]
 )
 
 conformer_trainer.compile(model=conformer, optimizer=optimizer,

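The three training scripts in this commit build the optimizer identically, so here is one standalone, hedged sketch of the Adam-plus-TransformerSchedule construction using the renamed `encoder_dmodel` key. The import path and the numeric optimizer settings are assumptions standing in for the real `learning_config.optimizer_config` entries in `config.yml`.

```python
import math

import tensorflow as tf

# Import path as used by the example training scripts (assumption, not shown in this diff).
from tensorflow_asr.optimizers.schedules import TransformerSchedule

encoder_dmodel = 144  # model_config["encoder_dmodel"] after the rename
# Illustrative stand-ins for learning_config.optimizer_config in config.yml:
optimizer_config = {"warmup_steps": 40000, "beta1": 0.9, "beta2": 0.98, "epsilon": 1e-9}

optimizer = tf.keras.optimizers.Adam(
    TransformerSchedule(
        d_model=encoder_dmodel,
        warmup_steps=optimizer_config["warmup_steps"],
        max_lr=(0.05 / math.sqrt(encoder_dmodel)),
    ),
    beta_1=optimizer_config["beta1"],
    beta_2=optimizer_config["beta2"],
    epsilon=optimizer_config["epsilon"],
)
```
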
‎examples/conformer/train_ga_subword_conformer.py

Lines changed: 6 additions & 7 deletions

@@ -129,16 +129,15 @@
 conformer._build(speech_featurizer.shape)
 conformer.summary(line_length=120)
 
-optimizer_config = config.learning_config.optimizer_config
 optimizer = tf.keras.optimizers.Adam(
     TransformerSchedule(
-        d_model=config.model_config["dmodel"],
-        warmup_steps=optimizer_config["warmup_steps"],
-        max_lr=(0.05 / math.sqrt(config.model_config["dmodel"]))
+        d_model=config.model_config["encoder_dmodel"],
+        warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
+        max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
     ),
-    beta_1=optimizer_config["beta1"],
-    beta_2=optimizer_config["beta2"],
-    epsilon=optimizer_config["epsilon"]
+    beta_1=config.learning_config.optimizer_config["beta1"],
+    beta_2=config.learning_config.optimizer_config["beta2"],
+    epsilon=config.learning_config.optimizer_config["epsilon"]
 )
 
 conformer_trainer.compile(model=conformer, optimizer=optimizer,

‎examples/conformer/train_subword_conformer.py

Lines changed: 6 additions & 7 deletions

@@ -126,16 +126,15 @@
 conformer._build(speech_featurizer.shape)
 conformer.summary(line_length=120)
 
-optimizer_config = config.learning_config.optimizer_config
 optimizer = tf.keras.optimizers.Adam(
     TransformerSchedule(
-        d_model=config.model_config["dmodel"],
-        warmup_steps=optimizer_config["warmup_steps"],
-        max_lr=(0.05 / math.sqrt(config.model_config["dmodel"]))
+        d_model=config.model_config["encoder_dmodel"],
+        warmup_steps=config.learning_config.optimizer_config["warmup_steps"],
+        max_lr=(0.05 / math.sqrt(config.model_config["encoder_dmodel"]))
     ),
-    beta_1=optimizer_config["beta1"],
-    beta_2=optimizer_config["beta2"],
-    epsilon=optimizer_config["epsilon"]
+    beta_1=config.learning_config.optimizer_config["beta1"],
+    beta_2=config.learning_config.optimizer_config["beta2"],
+    epsilon=config.learning_config.optimizer_config["epsilon"]
 )
 
 conformer_trainer.compile(model=conformer, optimizer=optimizer,

‎examples/deepspeech2/test_ds2.py

Lines changed: 3 additions & 3 deletions

@@ -50,7 +50,7 @@
 setup_devices([args.device])
 
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
 from tensorflow_asr.runners.base_runners import BaseTester

@@ -70,15 +70,15 @@
 ds2_model.add_featurizers(speech_featurizer, text_featurizer)
 
 if args.tfrecords:
-    test_dataset = ASRTFRecordDataset(
+    test_dataset = ASRTFRecordTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,
         stage="test", shuffle=False
     )
 else:
-    test_dataset = ASRSliceDataset(
+    test_dataset = ASRSliceTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,

‎examples/demonstration/conformer.py

Lines changed: 21 additions & 6 deletions

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import argparse
 from tensorflow_asr.utils import setup_environment, setup_devices
 

@@ -32,6 +33,8 @@
 parser.add_argument("--blank", type=int, default=0,
                     help="Path to conformer tflite")
 
+parser.add_argument("--beam_width", type=int, default=0, help="Beam width")
+
 parser.add_argument("--num_rnns", type=int, default=1,
                     help="Number of RNN layers in prediction network")
 

@@ -47,19 +50,30 @@
 parser.add_argument("--cpu", default=False, action="store_true",
                     help="Whether to only use cpu")
 
+parser.add_argument("--subwords", type=str, default=None,
+                    help="Path to file that stores generated subwords")
+
+parser.add_argument("--output_name", type=str, default="test",
+                    help="Result filename name prefix")
+
 args = parser.parse_args()
 
 setup_devices([args.device], cpu=args.cpu)
 
 from tensorflow_asr.configs.config import Config
 from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
-from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
+from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer, SubwordFeaturizer
 from tensorflow_asr.models.conformer import Conformer
 
 config = Config(args.config, learning=False)
 speech_featurizer = TFSpeechFeaturizer(config.speech_config)
-text_featurizer = CharFeaturizer(config.decoder_config)
+if args.subwords and os.path.exists(args.subwords):
+    print("Loading subwords ...")
+    text_featurizer = SubwordFeaturizer.load_from_file(config.decoder_config, args.subwords)
+else:
+    text_featurizer = CharFeaturizer(config.decoder_config)
+text_featurizer.decoder_config.beam_width = args.beam_width
 
 # build model
 conformer = Conformer(**config.model_config, vocabulary_size=text_featurizer.num_classes)

@@ -69,9 +83,10 @@
 conformer.add_featurizers(speech_featurizer, text_featurizer)
 
 signal = read_raw_audio(args.filename)
-predicted = tf.constant(args.blank, dtype=tf.int32)
-states = tf.zeros([args.num_rnns, args.nstates, 1, args.statesize], dtype=tf.float32)
 
-hyp, _, _ = conformer.recognize_tflite(signal, predicted, states)
+if (args.beam_width):
+    transcript = conformer.recognize_beam(signal[None, ...])
+else:
+    transcript = conformer.recognize(signal[None, ...])
 
-print("".join([chr(u) for u in hyp]))
+tf.print("Transcript:", transcript[0])

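This file carries the headline change of the pull request: the demonstration now switches between greedy decoding and the new RNN Transducer beam search based on `--beam_width`. A condensed, hedged sketch of that call pattern; the `conformer` model and `text_featurizer` are assumed to be built exactly as earlier in the script, and the beam width and audio path below are placeholder values.

```python
import tensorflow as tf

from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio

# Placeholder values; in the script these come from the --beam_width and
# filename command-line arguments.
beam_width = 4
text_featurizer.decoder_config.beam_width = beam_width  # text_featurizer built earlier in the script

signal = read_raw_audio("test.wav")  # placeholder audio path

if beam_width:
    # the new RNN Transducer beam search added by this PR
    transcript = conformer.recognize_beam(signal[None, ...])
else:
    # greedy decoding (the previous demo used recognize_tflite instead)
    transcript = conformer.recognize(signal[None, ...])

tf.print("Transcript:", transcript[0])
```
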
‎examples/jasper/test_jasper.py

Lines changed: 3 additions & 3 deletions

@@ -50,7 +50,7 @@
 setup_devices([args.device])
 
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
 from tensorflow_asr.runners.base_runners import BaseTester

@@ -70,15 +70,15 @@
 jasper.add_featurizers(speech_featurizer, text_featurizer)
 
 if args.tfrecords:
-    test_dataset = ASRTFRecordDataset(
+    test_dataset = ASRTFRecordTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,
         stage="test", shuffle=False
     )
 else:
-    test_dataset = ASRSliceDataset(
+    test_dataset = ASRSliceTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,

‎examples/streaming_transducer/test_streaming_transducer.py

Lines changed: 3 additions & 3 deletions

@@ -53,7 +53,7 @@
 setup_devices([args.device], cpu=args.cpu)
 
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import CharFeaturizer
 from tensorflow_asr.runners.base_runners import BaseTester

@@ -67,15 +67,15 @@
 assert args.saved
 
 if args.tfrecords:
-    test_dataset = ASRTFRecordDataset(
+    test_dataset = ASRTFRecordTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,
         stage="test", shuffle=False
     )
 else:
-    test_dataset = ASRSliceDataset(
+    test_dataset = ASRSliceTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,

‎examples/streaming_transducer/test_subword_streaming_transducer.py

Lines changed: 3 additions & 3 deletions

@@ -56,7 +56,7 @@
 setup_devices([args.device], cpu=args.cpu)
 
 from tensorflow_asr.configs.config import Config
-from tensorflow_asr.datasets.asr_dataset import ASRTFRecordDataset, ASRSliceDataset
+from tensorflow_asr.datasets.asr_dataset import ASRTFRecordTestDataset, ASRSliceTestDataset
 from tensorflow_asr.featurizers.speech_featurizers import TFSpeechFeaturizer
 from tensorflow_asr.featurizers.text_featurizers import SubwordFeaturizer
 from tensorflow_asr.runners.base_runners import BaseTester

@@ -75,15 +75,15 @@
 assert args.saved
 
 if args.tfrecords:
-    test_dataset = ASRTFRecordDataset(
+    test_dataset = ASRTFRecordTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         tfrecords_dir=config.learning_config.dataset_config.tfrecords_dir,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,
         stage="test", shuffle=False
     )
 else:
-    test_dataset = ASRSliceDataset(
+    test_dataset = ASRSliceTestDataset(
         data_paths=config.learning_config.dataset_config.test_paths,
         speech_featurizer=speech_featurizer,
         text_featurizer=text_featurizer,

‎setup.cfg

Lines changed: 2 additions & 2 deletions

@@ -1,8 +1,8 @@
 [flake8]
 ignore = E402,E701,E702,E704,E251
-max-line-length = 100
+max-line-length = 150
 
 [pep8]
 ignore = E402,E701,E702,E704,E251
-max-line-length = 100
+max-line-length = 150
 indent-size = 4

‎setup.py

Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@
 
 setuptools.setup(
     name="TensorFlowASR",
-    version="0.3.2",
+    version="0.4.0",
     author="Huy Le Nguyen",
     author_email="nlhuy.cs.16@gmail.com",
     description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2",
