diff --git a/tools/compute_cmvn_stats.py b/tools/compute_cmvn_stats.py
new file mode 100755
index 0000000..25ae546
--- /dev/null
+++ b/tools/compute_cmvn_stats.py
@@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+
+import sys
+import argparse
+import json
+import codecs
+import yaml
+
+import torch
+import torchaudio
+import torchaudio.compliance.kaldi as kaldi
+from torch.utils.data import Dataset, DataLoader
+
+torchaudio.set_audio_backend("sox_io")
+
+
+class CollateFunc(object):
+    ''' Collate function for AudioDataset: returns (frame count, fbank sum, fbank square sum)
+    '''
+    def __init__(self, feat_dim, resample_rate):
+        self.feat_dim = feat_dim
+        self.resample_rate = resample_rate
+        pass
+
+    def __call__(self, batch):
+        mean_stat = torch.zeros(self.feat_dim)
+        var_stat = torch.zeros(self.feat_dim)
+        number = 0
+        for item in batch:
+            value = item[1].strip().split(",")
+            assert len(value) == 3 or len(value) == 1
+            wav_path = value[0]
+            sample_rate = torchaudio.backend.sox_io_backend.info(wav_path).sample_rate
+            resample_rate = sample_rate
+            # len(value) == 3 means segmented wav.scp,
+            # len(value) == 1 means original wav.scp
+            if len(value) == 3:
+                start_frame = int(float(value[1]) * sample_rate)
+                end_frame = int(float(value[2]) * sample_rate)
+                waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
+                    filepath=wav_path,
+                    num_frames=end_frame - start_frame,
+                    frame_offset=start_frame)
+            else:
+                waveform, sample_rate = torchaudio.load(item[1])
+
+            waveform = waveform * (1 << 15)
+            if self.resample_rate != 0 and self.resample_rate != sample_rate:
+                resample_rate = self.resample_rate
+                waveform = torchaudio.transforms.Resample(
+                    orig_freq=sample_rate, new_freq=resample_rate)(waveform)
+
+            mat = kaldi.fbank(waveform,
+                              num_mel_bins=self.feat_dim,
+                              dither=0.0,
+                              energy_floor=0.0,
+                              sample_frequency=resample_rate)
+            mean_stat += torch.sum(mat, axis=0)
+            var_stat += torch.sum(torch.square(mat), axis=0)
+            number += mat.shape[0]
+        return number, mean_stat, var_stat
+
+
+class AudioDataset(Dataset):
+    def __init__(self, data_file):
+        self.items = []
+        with codecs.open(data_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                arr = line.strip().split()
+                self.items.append((arr[0], arr[1]))
+
+    def __len__(self):
+        return len(self.items)
+
+    def __getitem__(self, idx):
+        return self.items[idx]
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='extract CMVN stats')
+    parser.add_argument('--num_workers',
+                        default=0,
+                        type=int,
+                        help='num of subprocess workers for processing')
+    parser.add_argument('--train_config',
+                        default='',
+                        help='training yaml conf')
+    parser.add_argument('--in_scp', default=None, help='wav scp file')
+    parser.add_argument('--out_cmvn',
+                        default='global_cmvn',
+                        help='global cmvn file')
+
+    args = parser.parse_args()
+
+    with open(args.train_config, 'r') as fin:
+        configs = yaml.load(fin, Loader=yaml.FullLoader)
+    feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins']
+    resample_rate = 0
+    if 'resample_conf' in configs['dataset_conf']:
+        resample_rate = configs['dataset_conf']['resample_conf']['resample_rate']
+        print('using resample and new sample rate is {}'.format(resample_rate))
+
+    collate_func = CollateFunc(feat_dim, resample_rate)
+    dataset = AudioDataset(args.in_scp)
+    batch_size = 20
+    data_loader = DataLoader(dataset,
+                             batch_size=batch_size,
+                             shuffle=True,
+                             sampler=None,
+                             num_workers=args.num_workers,
+                             collate_fn=collate_func)
+
+    with torch.no_grad():
+        all_number = 0
+        all_mean_stat = torch.zeros(feat_dim)
+        all_var_stat = torch.zeros(feat_dim)
+        wav_number = 0
+        for i, batch in enumerate(data_loader):
+            number, mean_stat, var_stat = batch
+            all_mean_stat += mean_stat
+            all_var_stat += var_stat
+            all_number += number
+            wav_number += batch_size
+            if wav_number % 1000 == 0:
+                print(f'processed {wav_number} wavs, {all_number} frames',
+                      file=sys.stderr,
+                      flush=True)
+
+    cmvn_info = {
+        'mean_stat': list(all_mean_stat.tolist()),
+        'var_stat': list(all_var_stat.tolist()),
+        'frame_num': all_number
+    }
+
+    with open(args.out_cmvn, 'w') as fout:
+        fout.write(json.dumps(cmvn_info))
diff --git a/tools/make_list.py b/tools/make_list.py
new file mode 100755
index 0000000..5825761
--- /dev/null
+++ b/tools/make_list.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument('wav_file', help='wav file')
+    parser.add_argument('text_file', help='text file')
+    parser.add_argument('duration_file', help='duration file')
+    parser.add_argument('output_file', help='output list file')
+    args = parser.parse_args()
+
+    wav_table = {}
+    with open(args.wav_file, 'r', encoding='utf8') as fin:
+        for line in fin:
+            arr = line.strip().split()
+            assert len(arr) == 2
+            wav_table[arr[0]] = arr[1]
+
+    duration_table = {}
+    with open(args.duration_file, 'r', encoding='utf8') as fin:
+        for line in fin:
+            arr = line.strip().split()
+            assert len(arr) == 2
+            duration_table[arr[0]] = float(arr[1])
+
+    with open(args.text_file, 'r', encoding='utf8') as fin, \
+         open(args.output_file, 'w', encoding='utf8') as fout:
+        for line in fin:
+            arr = line.strip().split(maxsplit=1)
+            key = arr[0]
+            txt = arr[1] if len(arr) > 1 else ''  # transcript stays text; empty when the line has only a key
+            assert key in wav_table
+            wav = wav_table[key]
+            assert key in duration_table
+            duration = duration_table[key]
+            line = dict(key=key, txt=txt, duration=duration, wav=wav)
+            json_line = json.dumps(line, ensure_ascii=False)
+            fout.write(json_line + '\n')
diff --git a/tools/parse_options.sh b/tools/parse_options.sh
new file mode 100755
index 0000000..34476fd
--- /dev/null
+++ b/tools/parse_options.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
+#                 Arnab Ghoshal, Karel Vesely
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+
+
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+  if [ "${!argpos}" == "--config" ]; then
+    argpos_plus1=$((argpos+1))
+    config=${!argpos_plus1}
+    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
+    . $config  # source the config file.
+  fi
+done
+
+
+###
+### Now we process the command line options
+###
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    # If the enclosing script is called with --help option, print the help
+    # message and exit.  Scripts should put help messages in $help_message
+    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+      else printf "$help_message\n" 1>&2 ; fi;
+      exit 0 ;;
+    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+      exit 1 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar),
+    # then work out the variable name as $name, which will equal "foo_bar".
+    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+      # Next we test whether the variable in question is undefined-- if so it's
+      # an invalid option and we die.  Note: $0 evaluates to the name of the
+      # enclosing script.
+      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+      # is undefined.  We then have to wrap this test inside "eval" because
+      # foo_bar is itself inside a variable ($name).
+      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+
+      oldval="`eval echo \\$$name`";
+      # Work out whether we seem to be expecting a Boolean argument.
+      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval $name=\"$2\";
+
+      # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      shift 2;
+      ;;
+  *) break;
+  esac
+done
+
+
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+
+
+true; # so this script returns exit code 0.
diff --git a/tools/wav2dur.py b/tools/wav2dur.py
new file mode 100755
index 0000000..1bcc1b6
--- /dev/null
+++ b/tools/wav2dur.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+
+import sys
+
+import torchaudio
+torchaudio.set_audio_backend("sox_io")
+
+scp = sys.argv[1]
+dur_scp = sys.argv[2]
+
+with open(scp, 'r') as f, open(dur_scp, 'w') as fout:
+    cnt = 0
+    total_duration = 0
+    for l in f:
+        items = l.strip().split()
+        wav_id = items[0]
+        fname = items[1]
+        cnt += 1
+        waveform, rate = torchaudio.load(fname)
+        frames = len(waveform[0])
+        duration = frames / float(rate)
+        total_duration += duration
+        fout.write('{} {}\n'.format(wav_id, duration))
+    print('process {} utts'.format(cnt))
+    print('total {} s'.format(total_duration))
diff --git a/tools/wav_to_duration.sh b/tools/wav_to_duration.sh
new file mode 100755
index 0000000..51b055c
--- /dev/null
+++ b/tools/wav_to_duration.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# split the wav scp, calculate duration and merge
+nj=4
+. tools/parse_options.sh || exit 1;
+
+inscp=$1
+outscp=$2
+data=$(dirname ${inscp})
+if [ $# -eq 3 ]; then
+  logdir=$3
+else
+  logdir=${data}/log
+fi
+mkdir -p ${logdir}
+
+rm -f $logdir/wav_*.slice
+rm -f $logdir/wav_*.shape
+split --additional-suffix .slice -d -n l/$nj $inscp $logdir/wav_
+
+for slice in `ls $logdir/wav_*.slice`; do
+{
+  name=`basename -s .slice $slice`
+  tools/wav2dur.py $slice $logdir/$name.shape 1>$logdir/$name.log
+} &
+done
+wait
+cat $logdir/wav_*.shape > $outscp