diff --git a/README.md b/README.md index e07cc6c..04b0f56 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,15 @@ We plan to support a variaty of hardwares and platforms, including: * Android * Raspberry Pi +## Discussion + +For Chinese users, you can scan the QR code on the left to follow our official account of WeNet. +We also created a WeChat group for better discussion and quicker response. +Please scan the QR code on the right to join the chat group. + +| | | +| ---- | ---- | + ## Reference * Mining Effective Negative Training Samples for Keyword Spotting diff --git a/examples/hi_xiaowen/s0/README.md b/examples/hi_xiaowen/s0/README.md index b88ad49..87ff5c3 100644 --- a/examples/hi_xiaowen/s0/README.md +++ b/examples/hi_xiaowen/s0/README.md @@ -4,9 +4,7 @@ FRRs with FAR fixed at once per hour: |------------------|-----------|-----------|------------|--------------| | GRU | 203 | 80(avg30) | 0.088901 | 0.083827 | | TCN | 134 | 80(avg30) | 0.023494 | 0.029884 | -| DS_TCN | 21 | 60 | 0.011559 | 0.014190 | -| DS_TCN | 21 | 80 | 0.010807 | 0.014754 | -| DS_TCN | 21 | 80(avg30) | 0.009867 | 0.014472 | -| DS_TCN(spec_aug) | 21 | 80(avg30) | 0.029039 | 0.022648 | +| DS_TCN | 21 | 80(avg30) | 0.019641 | 0.018325 | +| DS_TCN(spec_aug) | 21 | 80(avg30) | 0.029509 | 0.008928 | | MDTC | 156 | 80(avg10) | 0.007142 | 0.005920 | | MDTC_Small | 31 | 80(avg10) | 0.005357 | 0.005920 | diff --git a/examples/hi_xiaowen/s0/run.sh b/examples/hi_xiaowen/s0/run.sh index 7c97999..3f7967b 100755 --- a/examples/hi_xiaowen/s0/run.sh +++ b/examples/hi_xiaowen/s0/run.sh @@ -9,15 +9,15 @@ stage=0 stop_stage=4 num_keywords=2 -config=conf/mdtc_small.yaml -norm_mean=false -norm_var=false +config=conf/ds_tcn.yaml +norm_mean=true +norm_var=true gpu_id=0 checkpoint= -dir=exp/mdtc_small +dir=exp/ds_tcn -num_average=10 +num_average=30 score_checkpoint=$dir/avg_${num_average}.pt download_dir=./data/local # your data dir @@ -82,6 +82,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 
--num_workers 8 \ --num_keywords $num_keywords \ --min_duration 50 \ + --seed 666 \ $cmvn_opts \ ${checkpoint:+--checkpoint $checkpoint} fi @@ -97,7 +98,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # Compute posterior score result_dir=$dir/test_$(basename $score_checkpoint) mkdir -p $result_dir - python kws/bin/score.py --gpu 1 \ + python kws/bin/score.py --gpu $gpu_id \ --config $dir/config.yaml \ --test_data data/test/data.list \ --batch_size 256 \ diff --git a/examples/speechcommand_v1/s0/run.sh b/examples/speechcommand_v1/s0/run.sh index d843209..b18a856 100755 --- a/examples/speechcommand_v1/s0/run.sh +++ b/examples/speechcommand_v1/s0/run.sh @@ -13,7 +13,7 @@ num_keywords=11 config=conf/mdtc.yaml norm_mean=false norm_var=false -gpu_id=4 +gpu_id=0 checkpoint= dir=exp/mdtc @@ -79,3 +79,30 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then $cmvn_opts \ ${checkpoint:+--checkpoint $checkpoint} fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # Do model average + python kws/bin/average_model.py \ + --dst_model $score_checkpoint \ + --src_path $dir \ + --num ${num_average} \ + --val_best + + # Testing + result_dir=$dir/test_$(basename $score_checkpoint) + mkdir -p $result_dir + python kws/bin/compute_accuracy.py --gpu $gpu_id \ + --config $dir/config.yaml \ + --test_data data/test/data.list \ + --batch_size 256 \ + --num_workers 8 \ + --checkpoint $score_checkpoint +fi + + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + python kws/bin/export_jit.py --config $dir/config.yaml \ + --checkpoint $score_checkpoint \ + --output_file $dir/final.zip \ + --output_quant_file $dir/final.quant.zip +fi diff --git a/kws/bin/compute_accuracy.py b/kws/bin/compute_accuracy.py new file mode 100644 index 0000000..a6cde27 --- /dev/null +++ b/kws/bin/compute_accuracy.py @@ -0,0 +1,102 @@ +# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import argparse +import copy +import logging +import os + +import torch +import yaml +from torch.utils.data import DataLoader + +from kws.dataset.dataset import Dataset +from kws.model.kws_model import init_model +from kws.utils.checkpoint import load_checkpoint +from kws.utils.executor import Executor + + +def get_args(): + parser = argparse.ArgumentParser(description='recognize with your model') + parser.add_argument('--config', required=True, help='config file') + parser.add_argument('--test_data', required=True, help='test data file') + parser.add_argument('--gpu', + type=int, + default=-1, + help='gpu id for this rank, -1 for cpu') + parser.add_argument('--checkpoint', required=True, help='checkpoint model') + parser.add_argument('--batch_size', + default=16, + type=int, + help='batch size for inference') + parser.add_argument('--num_workers', + default=0, + type=int, + help='num of subprocess workers for reading') + parser.add_argument('--pin_memory', + action='store_true', + default=False, + help='Use pinned memory buffers used for reading') + parser.add_argument('--prefetch', + default=100, + type=int, + help='prefetch number') + args = parser.parse_args() + return args + + +def main(): + args = get_args() + logging.basicConfig(level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(message)s') + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + + with open(args.config, 'r') as fin: + configs = yaml.load(fin, Loader=yaml.FullLoader) + + test_conf = 
copy.deepcopy(configs['dataset_conf']) + test_conf['filter_conf']['max_length'] = 102400 + test_conf['filter_conf']['min_length'] = 0 + test_conf['speed_perturb'] = False + test_conf['spec_aug'] = False + test_conf['shuffle'] = False + test_conf['feature_extraction_conf']['dither'] = 0.0 + test_conf['batch_conf']['batch_size'] = args.batch_size + + test_dataset = Dataset(args.test_data, test_conf) + test_data_loader = DataLoader(test_dataset, + batch_size=None, + pin_memory=args.pin_memory, + num_workers=args.num_workers) + + # Init KWS model from configs + model = init_model(configs['model']) + + load_checkpoint(model, args.checkpoint) + use_cuda = args.gpu >= 0 and torch.cuda.is_available() + device = torch.device('cuda' if use_cuda else 'cpu') + model = model.to(device) + executor = Executor() + model.eval() + training_config = configs['training_config'] + with torch.no_grad(): + test_loss, test_acc = executor.test(model, test_data_loader, device, + training_config) + logging.info('Test Loss {} Acc {}'.format(test_loss, test_acc)) + + +if __name__ == '__main__': + main() diff --git a/kws/model/tcn.py b/kws/model/tcn.py index 6a002b1..958a951 100644 --- a/kws/model/tcn.py +++ b/kws/model/tcn.py @@ -20,21 +20,14 @@ import torch.nn as nn import torch.nn.functional as F -class CnnBlock(nn.Module): +class Block(nn.Module): def __init__(self, channel: int, kernel_size: int, dilation: int, dropout: float = 0.1): super().__init__() - # The CNN used here is causal convolution self.padding = (kernel_size - 1) * dilation - self.cnn = nn.Conv1d(channel, - channel, - kernel_size, - stride=1, - dilation=dilation) - self.dropout = nn.Dropout(dropout) def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): """ @@ -43,6 +36,7 @@ class CnnBlock(nn.Module): Returns: torch.Tensor(B, D, T) """ + # Causal convolution: pad only on the left (past) side when no cache is given if cache is None: y = F.pad(x, (self.padding, 0), value=0.0) else: @@ -50,14 +44,32 @@ class CnnBlock(nn.Module): assert y.size(2) 
> self.padding new_cache = y[:, :, -self.padding:] + # self.cnn is not created here; each subclass of Block (CnnBlock, DsCnnBlock) must define it y = self.cnn(y) - y = F.relu(y) - y = self.dropout(y) y = y + x # residual connection return y, new_cache -class DsCnnBlock(nn.Module): +class CnnBlock(Block): + def __init__(self, + channel: int, + kernel_size: int, + dilation: int, + dropout: float = 0.1): + super().__init__(channel, kernel_size, dilation, dropout) + self.cnn = nn.Sequential( + nn.Conv1d(channel, + channel, + kernel_size, + stride=1, + dilation=dilation), + nn.BatchNorm1d(channel), + nn.ReLU(), + nn.Dropout(dropout), + ) + + +class DsCnnBlock(Block): """ Depthwise Separable Convolution """ def __init__(self, @@ -65,41 +77,21 @@ class DsCnnBlock(nn.Module): kernel_size: int, dilation: int, dropout: float = 0.1): - super().__init__() - # The CNN used here is causal convolution - self.padding = (kernel_size - 1) * dilation - self.depthwise_cnn = nn.Conv1d(channel, - channel, - kernel_size, - stride=1, - dilation=dilation, - groups=channel) - self.pointwise_cnn = nn.Conv1d(channel, - channel, - kernel_size=1, - stride=1) - self.dropout = nn.Dropout(dropout) - - def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): - """ - Args: - x(torch.Tensor): Input tensor (B, D, T) - Returns: - torch.Tensor(B, D, T) - """ - if cache is None: - y = F.pad(x, (self.padding, 0), value=0.0) - else: - y = torch.cat((cache, x), dim=2) - assert y.size(2) > self.padding - new_cache = y[:, :, -self.padding:] - - y = self.depthwise_cnn(y) - y = self.pointwise_cnn(y) - y = F.relu(y) - y = self.dropout(y) - y = y + x # residual connection - return y, new_cache + super().__init__(channel, kernel_size, dilation, dropout) + self.cnn = nn.Sequential( + nn.Conv1d(channel, + channel, + kernel_size, + stride=1, + dilation=dilation, + groups=channel), + nn.BatchNorm1d(channel), + nn.ReLU(), + nn.Conv1d(channel, channel, kernel_size=1, stride=1), + nn.BatchNorm1d(channel), + nn.ReLU(), + nn.Dropout(dropout), + ) 
class TCN(nn.Module):