From dbebee86fdf65d69c88c92081ed4043c5b8f2f37 Mon Sep 17 00:00:00 2001
From: Binbin Zhang
Date: Wed, 10 Nov 2021 18:57:52 +0800
Subject: [PATCH] [examples] support hi xiaowen dataset

---
 examples/hi_xiaowen/s0/README.md             |  10 ++
 examples/hi_xiaowen/s0/conf/ds_tcn.yaml      |  44 ++++++
 examples/hi_xiaowen/s0/conf/gru.yaml         |  41 ++++++
 examples/hi_xiaowen/s0/conf/tcn.yaml         |  44 ++++++
 examples/hi_xiaowen/s0/kws                   |   1 +
 .../s0/local/mobvoi_data_download.sh         |  65 +++++++++
 examples/hi_xiaowen/s0/local/prepare_data.py |  43 ++++++
 examples/hi_xiaowen/s0/path.sh               |   5 +
 examples/hi_xiaowen/s0/run.sh                | 126 ++++++++++++++++++
 examples/hi_xiaowen/s0/tools                 |   1 +
 10 files changed, 380 insertions(+)
 create mode 100644 examples/hi_xiaowen/s0/README.md
 create mode 100644 examples/hi_xiaowen/s0/conf/ds_tcn.yaml
 create mode 100644 examples/hi_xiaowen/s0/conf/gru.yaml
 create mode 100644 examples/hi_xiaowen/s0/conf/tcn.yaml
 create mode 120000 examples/hi_xiaowen/s0/kws
 create mode 100755 examples/hi_xiaowen/s0/local/mobvoi_data_download.sh
 create mode 100755 examples/hi_xiaowen/s0/local/prepare_data.py
 create mode 100755 examples/hi_xiaowen/s0/path.sh
 create mode 100755 examples/hi_xiaowen/s0/run.sh
 create mode 120000 examples/hi_xiaowen/s0/tools

diff --git a/examples/hi_xiaowen/s0/README.md b/examples/hi_xiaowen/s0/README.md
new file mode 100644
index 0000000..15084d5
--- /dev/null
+++ b/examples/hi_xiaowen/s0/README.md
@@ -0,0 +1,10 @@
+FRRs with FAR fixed at once per hour:
+
+| model            | params(K) | epoch     | hi_xiaowen | nihao_wenwen |
+|------------------|-----------|-----------|------------|--------------|
+| GRU              | 203       | 80(avg30) | 0.088901   | 0.083827     |
+| TCN              | 134       | 80(avg30) | 0.023494   | 0.029884     |
+| DS_TCN           | 21        | 60        | 0.011559   | 0.014190     |
+| DS_TCN           | 21        | 80        | 0.010807   | 0.014754     |
+| DS_TCN           | 21        | 80(avg30) | 0.009867   | 0.014472     |
+| DS_TCN(spec_aug) | 21        | 80(avg30) | 0.029039   | 0.022648     |
diff --git a/examples/hi_xiaowen/s0/conf/ds_tcn.yaml b/examples/hi_xiaowen/s0/conf/ds_tcn.yaml
new file mode 100644
index 0000000..7da0181
--- /dev/null
+++ b/examples/hi_xiaowen/s0/conf/ds_tcn.yaml
@@ -0,0 +1,44 @@
+dataset_conf:
+    filter_conf:
+        max_length: 2048
+        min_length: 0
+    resample_conf:
+        resample_rate: 16000
+    speed_perturb: false
+    fbank_conf:
+        num_mel_bins: 40
+        frame_shift: 10
+        frame_length: 25
+        dither: 0.1
+    spec_aug: true
+    spec_aug_conf:
+        num_t_mask: 1
+        num_f_mask: 1
+        max_t: 50
+        max_f: 30
+    shuffle: true
+    shuffle_conf:
+        shuffle_size: 1500
+    batch_conf:
+        batch_size: 256
+
+model:
+    hidden_dim: 64
+    subsampling:
+        type: linear
+    body:
+        type: tcn
+        ds: true
+        num_layers: 4
+        kernel_size: 8
+        dropout: 0.1
+
+optim: adam
+optim_conf:
+    lr: 0.001
+
+training_config:
+    grad_clip: 5
+    max_epoch: 80
+    log_interval: 10
+
diff --git a/examples/hi_xiaowen/s0/conf/gru.yaml b/examples/hi_xiaowen/s0/conf/gru.yaml
new file mode 100644
index 0000000..e664319
--- /dev/null
+++ b/examples/hi_xiaowen/s0/conf/gru.yaml
@@ -0,0 +1,41 @@
+dataset_conf:
+    filter_conf:
+        max_length: 2048
+        min_length: 0
+    resample_conf:
+        resample_rate: 16000
+    speed_perturb: false
+    fbank_conf:
+        num_mel_bins: 40
+        frame_shift: 10
+        frame_length: 25
+        dither: 0.1
+    spec_aug: false
+    spec_aug_conf:
+        num_t_mask: 2
+        num_f_mask: 2
+        max_t: 50
+        max_f: 30
+    shuffle: true
+    shuffle_conf:
+        shuffle_size: 1500
+    batch_conf:
+        batch_size: 256
+
+model:
+    hidden_dim: 128
+    subsampling:
+        type: linear
+    body:
+        type: gru
+        num_layers: 2
+
+optim: adam
+optim_conf:
+    lr: 0.001
+
+training_config:
+    grad_clip: 5
+    max_epoch: 80
+    log_interval: 10
+
diff --git a/examples/hi_xiaowen/s0/conf/tcn.yaml b/examples/hi_xiaowen/s0/conf/tcn.yaml
new file mode 100644
index 0000000..0612634
--- /dev/null
+++ b/examples/hi_xiaowen/s0/conf/tcn.yaml
@@ -0,0 +1,44 @@
+dataset_conf:
+    filter_conf:
+        max_length: 2048
+        min_length: 0
+    resample_conf:
+        resample_rate: 16000
+    speed_perturb: false
+    fbank_conf:
+        num_mel_bins: 40
+        frame_shift: 10
+        frame_length: 25
+        dither: 0.1
+    spec_aug: false
+    spec_aug_conf:
+        num_t_mask: 2
+        num_f_mask: 2
+        max_t: 50
+        max_f: 30
+    shuffle: true
+    shuffle_conf:
+        shuffle_size: 1500
+    batch_conf:
+        batch_size: 256
+
+model:
+    hidden_dim: 64
+    subsampling:
+        type: linear
+    body:
+        type: tcn
+        ds: false
+        num_layers: 4
+        kernel_size: 8
+        dropout: 0.1
+
+optim: adam
+optim_conf:
+    lr: 0.001
+
+training_config:
+    grad_clip: 5
+    max_epoch: 80
+    log_interval: 10
+
diff --git a/examples/hi_xiaowen/s0/kws b/examples/hi_xiaowen/s0/kws
new file mode 120000
index 0000000..7a3e8e1
--- /dev/null
+++ b/examples/hi_xiaowen/s0/kws
@@ -0,0 +1 @@
+../../../kws
\ No newline at end of file
diff --git a/examples/hi_xiaowen/s0/local/mobvoi_data_download.sh b/examples/hi_xiaowen/s0/local/mobvoi_data_download.sh
new file mode 100755
index 0000000..7ba3709
--- /dev/null
+++ b/examples/hi_xiaowen/s0/local/mobvoi_data_download.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+# Copyright 2018-2020 Yiming Wang
+#           2018-2020 Daniel Povey
+#                2021 Binbin Zhang
+
+[ -f ./path.sh ] && . ./path.sh
+
+dl_dir=data/download
+
+. tools/parse_options.sh || exit 1;
+
+mkdir -p $dl_dir
+
+dataset=mobvoi_hotword_dataset.tgz
+resources=mobvoi_hotword_dataset_resources.tgz
+
+# base url for downloads.
+data_url=http://www.openslr.org/resources/87
+
+if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then
+  src_path=/export/fs04/a07/ywang/mobvoihotwords
+else
+  src_path=$dl_dir
+fi
+
+if [ ! -f $src_path/$dataset ] || [ ! -f $src_path/$resources ]; then
+  if ! which wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+
+  if [ ! -f $src_path/$dataset ]; then
+    echo "$0: downloading data from $data_url/$dataset. This may take some time, please be patient."
+    if ! wget --no-check-certificate -O $dl_dir/$dataset $data_url/$dataset; then
+      echo "$0: error executing wget $data_url/$dataset"
+      exit 1;
+    fi
+  fi
+
+  if [ ! -f $src_path/$resources ]; then
+    if ! wget --no-check-certificate -O $dl_dir/$resources $data_url/$resources; then
+      echo "$0: error executing wget $data_url/$resources"
+      exit 1;
+    fi
+  fi
+fi
+
+if [ -d $dl_dir/$(basename "$dataset" .tgz) ]; then
+  echo "Not extracting $(basename "$dataset" .tgz) as it is already there."
+else
+  echo "Extracting $dataset..."
+  tar -xvzf $src_path/$dataset -C $dl_dir || exit 1;
+  echo "Done extracting $dataset."
+fi
+
+if [ -d $dl_dir/$(basename "$resources" .tgz) ]; then
+  echo "Not extracting $(basename "$resources" .tgz) as it is already there."  # report the archive actually tested above
+else
+  echo "Extracting $resources..."
+  tar -xvzf $src_path/$resources -C $dl_dir || exit 1;
+  echo "Done extracting $resources."
+fi
+
+exit 0
diff --git a/examples/hi_xiaowen/s0/local/prepare_data.py b/examples/hi_xiaowen/s0/local/prepare_data.py
new file mode 100755
index 0000000..ad75a7f
--- /dev/null
+++ b/examples/hi_xiaowen/s0/local/prepare_data.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+# Copyright 2018-2020 Yiming Wang
+#           2018-2020 Daniel Povey
+#                2021 Binbin Zhang
+# Apache 2.0
+""" This script prepares the Mobvoi data into kaldi format.
+"""
+
+import argparse
+import os
+import json
+
+
+def main():
+    parser = argparse.ArgumentParser(description="""Prepare data.""")
+    parser.add_argument('wav_dir',
+                        type=str,
+                        help='dir containing all the wav files')
+    parser.add_argument('path', type=str, help='path to the json file')
+    parser.add_argument('out_dir', type=str, help='out dir')
+    args = parser.parse_args()
+
+    with open(args.path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+        utt_id, label = [], []
+        for entry in data:
+            utt_id.append(entry['utt_id'])
+            label.append(int(entry['keyword_id']))
+
+    abs_dir = os.path.abspath(args.wav_dir)
+    wav_path = os.path.join(args.out_dir, 'wav.scp')
+    text_path = os.path.join(args.out_dir, 'text')
+    with open(wav_path, 'w', encoding='utf-8') as f_wav, \
+            open(text_path, 'w', encoding='utf-8') as f_text:
+        for utt, l in zip(utt_id, label):
+            f_wav.write('{} {}\n'.format(utt,
+                                         os.path.join(abs_dir, utt + ".wav")))
+            f_text.write('{} {}\n'.format(utt, l))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/hi_xiaowen/s0/path.sh b/examples/hi_xiaowen/s0/path.sh
new file mode 100755
index 0000000..cf09584
--- /dev/null
+++ b/examples/hi_xiaowen/s0/path.sh
@@ -0,0 +1,5 @@
+export PATH=$PWD:$PATH
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=../../../:$PYTHONPATH  # parent of the kws package (./kws -> ../../../kws)
diff --git a/examples/hi_xiaowen/s0/run.sh b/examples/hi_xiaowen/s0/run.sh
new file mode 100755
index 0000000..650532f
--- /dev/null
+++ b/examples/hi_xiaowen/s0/run.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+# Copyright 2021 Binbin Zhang
+
+. ./path.sh
+
+export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+
+stage=0
+stop_stage=4
+num_keywords=2
+
+config=conf/ds_tcn.yaml
+norm_mean=true
+norm_var=true
+gpu_id=0
+
+checkpoint=
+dir=exp/ds_tcn
+
+num_average=30
+score_checkpoint=$dir/avg_${num_average}.pt
+
+download_dir=/export/expts6/binbinzhang/data/
+
+. tools/parse_options.sh || exit 1;
+
+set -euo pipefail
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+  echo "Download and extract all datasets"
+  local/mobvoi_data_download.sh --dl_dir $download_dir
+fi
+
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+  echo "Preparing datasets..."
+  mkdir -p dict  # -p: re-running this stage must not abort under set -e
+  echo "<filler> -1" > dict/words.txt
+  echo "Hi_Xiaowen 0" >> dict/words.txt
+  echo "Nihao_Wenwen 1" >> dict/words.txt
+
+  for folder in train dev eval; do
+    mkdir -p data/$folder
+    for prefix in p n; do
+      mkdir -p data/${prefix}_$folder
+      json_path=$download_dir/mobvoi_hotword_dataset_resources/${prefix}_$folder.json
+      if [ $folder = "eval" ]; then
+        json_path=$download_dir/mobvoi_hotword_dataset_resources/${prefix}_test.json
+      fi
+      local/prepare_data.py $download_dir/mobvoi_hotword_dataset $json_path \
+        data/${prefix}_$folder
+    done
+    cat data/p_$folder/wav.scp data/n_$folder/wav.scp > data/$folder/wav.scp
+    cat data/p_$folder/text data/n_$folder/text > data/$folder/text
+    rm -rf data/p_$folder data/n_$folder
+  done
+fi
+
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  echo "Compute CMVN and Format datasets"
+  tools/compute_cmvn_stats.py --num_workers 16 --train_config $config \
+    --in_scp data/train/wav.scp \
+    --out_cmvn data/train/global_cmvn
+
+  for x in train dev eval; do
+    tools/wav_to_duration.sh --nj 8 data/$x/wav.scp data/$x/wav.dur
+    tools/make_list.py data/$x/wav.scp data/$x/text \
+      data/$x/wav.dur data/$x/data.list
+  done
+fi
+
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+  echo "Start training ..."
+  mkdir -p $dir
+  cmvn_opts=
+  $norm_mean && cmvn_opts="--cmvn_file data/train/global_cmvn"
+  $norm_var && cmvn_opts="$cmvn_opts --norm_var"
+  python kws/bin/train.py --gpu $gpu_id \
+    --config $config \
+    --train_data data/train/data.list \
+    --cv_data data/dev/data.list \
+    --model_dir $dir \
+    --num_workers 8 \
+    --num_keywords $num_keywords \
+    --min_duration 50 \
+    $cmvn_opts \
+    ${checkpoint:+--checkpoint $checkpoint}
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+  # Do model average
+  python kws/bin/average_model.py \
+    --dst_model $score_checkpoint \
+    --src_path $dir \
+    --num ${num_average} \
+    --val_best
+
+  # Compute posterior score
+  result_dir=$dir/test_$(basename $score_checkpoint)
+  mkdir -p $result_dir
+  python kws/bin/score.py --gpu -1 \
+    --config $dir/config.yaml \
+    --test_data data/eval/data.list \
+    --batch_size 256 \
+    --checkpoint $score_checkpoint \
+    --score_file $result_dir/score.txt
+
+  # Compute detection error tradeoff
+  for keyword in 0 1; do
+    python kws/bin/compute_det.py \
+      --keyword $keyword \
+      --test_data data/eval/data.list \
+      --score_file $result_dir/score.txt \
+      --stats_file $result_dir/stats.${keyword}.txt
+  done
+fi
+
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+  python kws/bin/export_jit.py --config $dir/config.yaml \
+    --checkpoint $score_checkpoint \
+    --output_file $dir/final.zip \
+    --output_quant_file $dir/final.quant.zip
+fi
diff --git a/examples/hi_xiaowen/s0/tools b/examples/hi_xiaowen/s0/tools
new file mode 120000
index 0000000..c92f417
--- /dev/null
+++ b/examples/hi_xiaowen/s0/tools
@@ -0,0 +1 @@
+../../../tools
\ No newline at end of file