[tools] add tools

This commit is contained in:
Binbin Zhang 2021-11-10 18:43:28 +08:00
parent 679ed2e98f
commit f629c0fa54
5 changed files with 341 additions and 0 deletions

137
tools/compute_cmvn_stats.py Executable file
View File

@ -0,0 +1,137 @@
#!/usr/bin/env python3
# encoding: utf-8
import sys
import argparse
import json
import codecs
import yaml
import torch
import torchaudio
import torchaudio.compliance.kaldi as kaldi
from torch.utils.data import Dataset, DataLoader
# Select torchaudio's "sox_io" audio backend before any audio is read.
torchaudio.set_audio_backend("sox_io")
class CollateFunc(object):
    """Collate function for AudioDataset that reduces a batch to CMVN sums.

    Rather than returning features, each call returns the number of fbank
    frames in the batch together with the per-dimension sum and sum of
    squares of those frames, so the caller only has to add batches up.
    """

    def __init__(self, feat_dim, resample_rate):
        """Store the feature configuration.

        Args:
            feat_dim: number of mel bins of the fbank features (also the
                length of the returned statistic vectors).
            resample_rate: target sample rate; 0 disables resampling.
        """
        self.feat_dim = feat_dim
        self.resample_rate = resample_rate

    def __call__(self, batch):
        """Accumulate fbank statistics over one batch.

        Args:
            batch: iterable of (key, value) pairs where value is either
                "wav_path" (original wav.scp) or "wav_path,start,end"
                (segmented wav.scp, start/end in seconds).

        Returns:
            Tuple (number, mean_stat, var_stat): total frame count, the
            per-dimension sum of features and the per-dimension sum of
            squared features.

        Raises:
            ValueError: if a value has neither 1 nor 3 comma-separated
                fields.
        """
        mean_stat = torch.zeros(self.feat_dim)
        var_stat = torch.zeros(self.feat_dim)
        number = 0
        for item in batch:
            value = item[1].strip().split(",")
            # len(value) == 3 means segmented wav.scp,
            # len(value) == 1 means original wav.scp
            if len(value) not in (1, 3):
                raise ValueError(
                    'expected "wav" or "wav,start,end", got {!r}'.format(
                        item[1]))
            wav_path = value[0]
            if len(value) == 3:
                # The header is only needed to turn the segment bounds
                # (seconds) into sample offsets.
                sample_rate = torchaudio.backend.sox_io_backend.info(
                    wav_path).sample_rate
                start_frame = int(float(value[1]) * sample_rate)
                end_frame = int(float(value[2]) * sample_rate)
                waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
                    filepath=wav_path,
                    num_frames=end_frame - start_frame,
                    frame_offset=start_frame)
            else:
                # load() already reports the sample rate, so the file is
                # opened only once for unsegmented entries.
                waveform, sample_rate = torchaudio.load(wav_path)

            # Rescale by 2**15 (presumably to the 16-bit PCM range that the
            # Kaldi-compatible fbank expects -- confirm against training).
            waveform = waveform * (1 << 15)

            resample_rate = sample_rate
            if self.resample_rate != 0 and self.resample_rate != sample_rate:
                resample_rate = self.resample_rate
                waveform = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=resample_rate)(waveform)

            mat = kaldi.fbank(waveform,
                              num_mel_bins=self.feat_dim,
                              dither=0.0,
                              energy_floor=0.0,
                              sample_frequency=resample_rate)
            mean_stat += torch.sum(mat, dim=0)
            var_stat += torch.sum(torch.square(mat), dim=0)
            number += mat.shape[0]
        return number, mean_stat, var_stat
class AudioDataset(Dataset):
    """In-memory dataset of (key, value) pairs read from a wav.scp-style file.

    Every non-blank line contributes its first two whitespace-separated
    fields as one item; any further fields on the line are ignored.
    """

    def __init__(self, data_file):
        """Load all entries of `data_file` (UTF-8 text) into memory.

        Args:
            data_file: path of the scp file to read.

        Raises:
            IndexError: if a non-blank line has fewer than two fields.
        """
        self.items = []
        with codecs.open(data_file, 'r', encoding='utf-8') as f:
            for line in f:
                arr = line.strip().split()
                # Blank lines (e.g. a trailing newline at end of file)
                # carry no entry; skip them instead of failing on arr[0].
                if not arr:
                    continue
                self.items.append((arr[0], arr[1]))

    def __len__(self):
        """Return the number of loaded entries."""
        return len(self.items)

    def __getitem__(self, idx):
        """Return the (key, value) tuple stored at position `idx`."""
        return self.items[idx]
if __name__ == '__main__':
    # Entry point: accumulate global CMVN statistics (per-dimension sum, sum
    # of squares and frame count of fbank features) over every entry of
    # --in_scp and write them as a single JSON object to --out_cmvn.
    parser = argparse.ArgumentParser(description='extract CMVN stats')
    parser.add_argument('--num_workers',
                        default=0,
                        type=int,
                        help='num of subprocess workers for processing')
    # Both inputs are mandatory: the old defaults ('' and None) only led to
    # an obscure failure when the file was opened.
    parser.add_argument('--train_config',
                        required=True,
                        help='training yaml conf')
    parser.add_argument('--in_scp', required=True, help='wav scp file')
    parser.add_argument('--out_cmvn',
                        default='global_cmvn',
                        help='global cmvn file')
    args = parser.parse_args()

    # NOTE(review): yaml.FullLoader is kept as in the original; if the config
    # can ever come from an untrusted source, switch to yaml.safe_load.
    with open(args.train_config, 'r') as fin:
        configs = yaml.load(fin, Loader=yaml.FullLoader)
    feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins']
    resample_rate = 0  # 0 means "keep the original sample rate"
    if 'resample_conf' in configs['dataset_conf']:
        resample_rate = configs['dataset_conf']['resample_conf'][
            'resample_rate']
        print('using resample and new sample rate is {}'.format(
            resample_rate))

    collate_func = CollateFunc(feat_dim, resample_rate)
    dataset = AudioDataset(args.in_scp)
    total_wavs = len(dataset)
    batch_size = 20
    # The statistics are plain sums, so shuffling buys nothing; a fixed order
    # keeps the floating-point accumulation (and the output) reproducible.
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             sampler=None,
                             num_workers=args.num_workers,
                             collate_fn=collate_func)

    with torch.no_grad():
        all_number = 0
        all_mean_stat = torch.zeros(feat_dim)
        all_var_stat = torch.zeros(feat_dim)
        wav_number = 0
        for number, mean_stat, var_stat in data_loader:
            all_mean_stat += mean_stat
            all_var_stat += var_stat
            all_number += number
            # The last batch may be smaller than batch_size; clamp so the
            # reported count never exceeds the real number of wavs.
            wav_number = min(wav_number + batch_size, total_wavs)
            if wav_number % 1000 == 0 or wav_number == total_wavs:
                print(f'processed {wav_number} wavs, {all_number} frames',
                      file=sys.stderr,
                      flush=True)

    cmvn_info = {
        'mean_stat': all_mean_stat.tolist(),
        'var_stat': all_var_stat.tolist(),
        'frame_num': all_number
    }
    with open(args.out_cmvn, 'w') as fout:
        fout.write(json.dumps(cmvn_info))

54
tools/make_list.py Executable file
View File

@ -0,0 +1,54 @@
#!/usr/bin/env python3
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
if __name__ == '__main__':
    # Join a wav table, a text (label) table and a duration table on their
    # utterance key and emit one JSON object per line of the text file.
    parser = argparse.ArgumentParser(description='')
    for positional, description in (
        ('wav_file', 'wav file'),
        ('text_file', 'text file'),
        ('duration_file', 'duration file'),
        ('output_file', 'output list file'),
    ):
        parser.add_argument(positional, help=description)
    args = parser.parse_args()

    # key -> wav path
    wav_table = {}
    with open(args.wav_file, 'r', encoding='utf8') as wav_in:
        for entry in wav_in:
            fields = entry.strip().split()
            assert len(fields) == 2
            utt, path = fields
            wav_table[utt] = path

    # key -> duration, stored as float
    duration_table = {}
    with open(args.duration_file, 'r', encoding='utf8') as dur_in:
        for entry in dur_in:
            fields = entry.strip().split()
            assert len(fields) == 2
            utt, seconds = fields
            duration_table[utt] = float(seconds)

    with open(args.text_file, 'r', encoding='utf8') as text_in, \
            open(args.output_file, 'w', encoding='utf8') as list_out:
        for entry in text_in:
            fields = entry.strip().split(maxsplit=1)
            key = fields[0]
            # The text column is parsed as an integer.
            txt = int(fields[1])
            assert key in wav_table
            wav = wav_table[key]
            assert key in duration_table
            duration = duration_table[key]
            record = {
                'key': key,
                'txt': txt,
                'duration': duration,
                'wav': wav,
            }
            list_out.write(json.dumps(record, ensure_ascii=False) + '\n')

97
tools/parse_options.sh Executable file
View File

@ -0,0 +1,97 @@
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order.
# Every "--config <file>" pair found among the positional parameters has its
# file sourced here, before the options themselves are processed below.
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    # Quote the path so that names containing spaces work and an empty value
    # is reported as a missing config instead of slipping past the test.
    [ ! -r "$config" ] && echo "$0: missing config '$config'" 1>&2 && exit 1
    . "$config"  # source the config file.
  fi
done
###
### Now we process the command line options
###
# Consume leading "--name value" pairs from the positional parameters,
# assigning each value to the shell variable derived from the option name.
# The loop ends at the first argument that is empty or is not an option.
while true; do
[ -z "${1:-}" ] && break; # stop when no argument is left (or the next one is empty)
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
# The "--name=value" spelling is rejected; only "--name value" is accepted.
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
# Remember the variable's current value so its type can be inferred below.
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
# Drop the option name and its value, then look at the next argument.
shift 2;
;;
# Anything that does not start with "--" ends option processing.
*) break;
esac
done
# Guard against --cmd having been given an empty value, which is an easy
# mistake to make when the value comes from another script's variable:
# $cmd is set (the ${cmd+xxx} expansion is non-empty) but its value is "".
if [ -n "${cmd+xxx}" ] && [ -z "$cmd" ]; then
  echo "$0: empty argument to --cmd option" 1>&2 && exit 1
fi
true  # so this script returns exit code 0.

26
tools/wav2dur.py Executable file
View File

@ -0,0 +1,26 @@
#!/usr/bin/env python3
# encoding: utf-8
import sys
import torchaudio
# Use the sox_io backend for the torchaudio.load calls below.
torchaudio.set_audio_backend("sox_io")

# argv[1]: input scp with "<wav_id> <wav_path>" per line.
# argv[2]: output file receiving "<wav_id> <duration>" per line.
scp, dur_scp = sys.argv[1], sys.argv[2]

with open(scp, 'r') as scp_in, open(dur_scp, 'w') as dur_out:
    cnt = 0
    total_duration = 0
    for entry in scp_in:
        fields = entry.strip().split()
        wav_id, fname = fields[0], fields[1]
        cnt += 1
        waveform, rate = torchaudio.load(fname)
        # Duration = samples in the first channel / sample rate.
        num_samples = len(waveform[0])
        duration = num_samples / float(rate)
        total_duration += duration
        dur_out.write('{} {}\n'.format(wav_id, duration))
    # Summary printed once after every entry has been processed.
    print('process {} utts'.format(cnt))
    print('total {} s'.format(total_duration))

27
tools/wav_to_duration.sh Executable file
View File

@ -0,0 +1,27 @@
#!/bin/bash
# split the wav scp, calculate duration and merge
# Number of slices the input scp is split into (one background job each);
# can be overridden on the command line with "--nj <n>".
nj=4
. tools/parse_options.sh || exit 1;

# Two positional arguments are mandatory; the log directory is optional.
if [ $# -lt 2 ]; then
  echo "Usage: $0 [--nj <num-jobs>] <in-wav-scp> <out-duration-scp> [<log-dir>]" 1>&2
  exit 1
fi

inscp=$1
outscp=$2
data=$(dirname "${inscp}")
if [ $# -eq 3 ]; then
  logdir=$3
else
  logdir=${data}/log
fi
mkdir -p "${logdir}"

# Remove leftovers of a previous run so that stale slices or results are
# not picked up by the globs below.
rm -f "$logdir"/wav_*.slice
rm -f "$logdir"/wav_*.shape

# Split by lines into $nj pieces named wav_NN.slice.
split --additional-suffix .slice -d -n "l/$nj" "$inscp" "$logdir/wav_"

# Compute the durations of every slice in parallel. Iterate over the glob
# directly (not over `ls` output) so paths with spaces stay intact.
for slice in "$logdir"/wav_*.slice; do
{
  name=$(basename -s .slice "$slice")
  tools/wav2dur.py "$slice" "$logdir/$name.shape" 1>"$logdir/$name.log"
} &
done
wait

# Merge the per-slice results into the final output.
cat "$logdir"/wav_*.shape > "$outscp"