[tools] add tools
This commit is contained in:
parent
679ed2e98f
commit
f629c0fa54
137
tools/compute_cmvn_stats.py
Executable file
137
tools/compute_cmvn_stats.py
Executable file
@ -0,0 +1,137 @@
|
||||
#!/usr/bin/env python3
|
||||
# encoding: utf-8
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import json
|
||||
import codecs
|
||||
import yaml
|
||||
|
||||
import torch
|
||||
import torchaudio
|
||||
import torchaudio.compliance.kaldi as kaldi
|
||||
from torch.utils.data import Dataset, DataLoader
|
||||
|
||||
# Select torchaudio's "sox_io" backend when this module is executed, before
# any audio file is opened by the code below.
torchaudio.set_audio_backend("sox_io")
|
||||
|
||||
|
||||
class CollateFunc(object):
    """Collate function for AudioDataset.

    Instead of returning features, each call reduces one batch of
    ``(key, wav_entry)`` items to the partial sums needed for global CMVN.

    Args:
        feat_dim: number of mel bins passed to ``kaldi.fbank``; also the
            length of the returned statistic vectors.
        resample_rate: target sample rate; ``0`` disables resampling.
    """

    def __init__(self, feat_dim, resample_rate):
        self.feat_dim = feat_dim
        self.resample_rate = resample_rate

    def __call__(self, batch):
        """Accumulate fbank statistics over one batch.

        Args:
            batch: iterable of ``(key, wav_entry)`` pairs. ``wav_entry`` is
                either ``path`` or ``path,start,end``. ``start``/``end`` are
                multiplied by the file's sample rate to get sample offsets
                (so presumably seconds).

        Returns:
            Tuple ``(number, mean_stat, var_stat)``: the total number of
            fbank frames, the per-bin sum of the features and the per-bin
            sum of their squares.
        """
        mean_stat = torch.zeros(self.feat_dim)
        var_stat = torch.zeros(self.feat_dim)
        number = 0
        for item in batch:
            value = item[1].strip().split(",")
            assert len(value) == 3 or len(value) == 1, \
                'expected "path" or "path,start,end", got: {}'.format(item[1])
            wav_path = value[0]
            sample_rate = torchaudio.backend.sox_io_backend.info(
                wav_path).sample_rate
            resample_rate = sample_rate
            # len(value) == 3 means segmented wav.scp,
            # len(value) == 1 means original wav.scp
            if len(value) == 3:
                start_frame = int(float(value[1]) * sample_rate)
                end_frame = int(float(value[2]) * sample_rate)
                waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
                    filepath=wav_path,
                    num_frames=end_frame - start_frame,
                    frame_offset=start_frame)
            else:
                # Load through the same parsed (stripped) path that info()
                # was called with, rather than the raw entry string.
                waveform, sample_rate = torchaudio.load(wav_path)

            # Scale by 2**15; presumably maps normalized float samples to
            # the 16-bit integer range before feature extraction.
            waveform = waveform * (1 << 15)
            if self.resample_rate != 0 and self.resample_rate != sample_rate:
                resample_rate = self.resample_rate
                waveform = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=resample_rate)(waveform)

            mat = kaldi.fbank(waveform,
                              num_mel_bins=self.feat_dim,
                              dither=0.0,
                              energy_floor=0.0,
                              sample_frequency=resample_rate)
            # Reduce over the frame axis so each statistic has feat_dim
            # entries.
            mean_stat += torch.sum(mat, dim=0)
            var_stat += torch.sum(torch.square(mat), dim=0)
            number += mat.shape[0]
        return number, mean_stat, var_stat
|
||||
|
||||
|
||||
class AudioDataset(Dataset):
    """In-memory dataset over the entries of a whitespace-separated scp file.

    Each non-blank line of ``data_file`` is split on whitespace and its
    first two fields are stored as a ``(key, wav_entry)`` tuple.

    Args:
        data_file: path to a UTF-8 text file with at least two fields per
            line.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
    """

    def __init__(self, data_file):
        self.items = []
        with open(data_file, 'r', encoding='utf-8') as f:
            for line in f:
                arr = line.split()
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing on arr[0].
                if not arr:
                    continue
                self.items.append((arr[0], arr[1]))

    def __len__(self):
        """Return the number of stored entries."""
        return len(self.items)

    def __getitem__(self, idx):
        """Return the ``(key, wav_entry)`` tuple at position ``idx``."""
        return self.items[idx]
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry point: read the training config, accumulate fbank
    # statistics over every entry of --in_scp and write them as JSON to
    # --out_cmvn.
    parser = argparse.ArgumentParser(description='extract CMVN stats')
    parser.add_argument('--num_workers',
                        default=0,
                        type=int,
                        help='num of subprocess workers for processing')
    # Both inputs are mandatory: without them the script previously failed
    # later while trying to open '' or None.
    parser.add_argument('--train_config',
                        required=True,
                        help='training yaml conf')
    parser.add_argument('--in_scp', required=True, help='wav scp file')
    parser.add_argument('--out_cmvn',
                        default='global_cmvn',
                        help='global cmvn file')

    args = parser.parse_args()

    with open(args.train_config, 'r') as fin:
        # NOTE(review): FullLoader can build more than plain data types;
        # if configs only hold scalars/lists/dicts, yaml.safe_load would be
        # the safer choice - confirm before switching.
        configs = yaml.load(fin, Loader=yaml.FullLoader)
    feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins']
    resample_rate = 0
    if 'resample_conf' in configs['dataset_conf']:
        resample_rate = configs['dataset_conf']['resample_conf']['resample_rate']
        print('using resample and new sample rate is {}'.format(resample_rate))

    collate_func = CollateFunc(feat_dim, resample_rate)
    dataset = AudioDataset(args.in_scp)
    batch_size = 20
    # The statistics are plain sums, so the visiting order does not matter;
    # keeping the file order makes the floating-point result reproducible.
    data_loader = DataLoader(dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             sampler=None,
                             num_workers=args.num_workers,
                             collate_fn=collate_func)

    with torch.no_grad():
        all_number = 0
        all_mean_stat = torch.zeros(feat_dim)
        all_var_stat = torch.zeros(feat_dim)
        wav_number = 0
        total_wavs = len(dataset)
        for number, mean_stat, var_stat in data_loader:
            all_mean_stat += mean_stat
            all_var_stat += var_stat
            all_number += number
            # The last batch may hold fewer than batch_size entries, so cap
            # the running count at the dataset size.
            wav_number = min(wav_number + batch_size, total_wavs)

            # Report every 1000 wavs and once more when everything is done.
            if wav_number % 1000 == 0 or wav_number == total_wavs:
                print(f'processed {wav_number} wavs, {all_number} frames',
                      file=sys.stderr,
                      flush=True)

    cmvn_info = {
        'mean_stat': all_mean_stat.tolist(),
        'var_stat': all_var_stat.tolist(),
        'frame_num': all_number
    }

    with open(args.out_cmvn, 'w') as fout:
        fout.write(json.dumps(cmvn_info))
|
||||
54
tools/make_list.py
Executable file
54
tools/make_list.py
Executable file
@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import json
|
||||
|
||||
def _read_table(path, convert=str):
    """Read a two-column ``key value`` text file into a dict.

    Args:
        path: UTF-8 text file with exactly two whitespace-separated fields
            per line.
        convert: callable applied to the second field before storing it.

    Returns:
        Dict mapping the first field of each line to ``convert(second)``.

    Raises:
        ValueError: if a line does not have exactly two fields.
    """
    table = {}
    with open(path, 'r', encoding='utf8') as fin:
        for lineno, line in enumerate(fin, 1):
            arr = line.strip().split()
            # Explicit check instead of assert: it survives "python -O" and
            # tells the user which line is malformed.
            if len(arr) != 2:
                raise ValueError(
                    '{}:{}: expected "<key> <value>", got {!r}'.format(
                        path, lineno, line))
            table[arr[0]] = convert(arr[1])
    return table


if __name__ == '__main__':
    # Script entry point: join the wav, text and duration tables on their
    # key and write one JSON object per text line to the output list.
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('wav_file', help='wav file')
    parser.add_argument('text_file', help='text file')
    parser.add_argument('duration_file', help='duration file')
    parser.add_argument('output_file', help='output list file')
    args = parser.parse_args()

    wav_table = _read_table(args.wav_file)
    duration_table = _read_table(args.duration_file, float)

    with open(args.text_file, 'r', encoding='utf8') as fin, \
         open(args.output_file, 'w', encoding='utf8') as fout:
        for lineno, line in enumerate(fin, 1):
            arr = line.strip().split(maxsplit=1)
            if len(arr) != 2:
                raise ValueError(
                    '{}:{}: expected "<key> <label>", got {!r}'.format(
                        args.text_file, lineno, line))
            key = arr[0]
            # The second field is parsed as an integer; int() raises
            # ValueError for anything else.
            txt = int(arr[1])
            if key not in wav_table:
                raise KeyError('{}: key {!r} not found in {}'.format(
                    args.text_file, key, args.wav_file))
            wav = wav_table[key]
            if key not in duration_table:
                raise KeyError('{}: key {!r} not found in {}'.format(
                    args.text_file, key, args.duration_file))
            duration = duration_table[key]
            entry = dict(key=key, txt=txt, duration=duration, wav=wav)
            json_line = json.dumps(entry, ensure_ascii=False)
            fout.write(json_line + '\n')
|
||||
97
tools/parse_options.sh
Executable file
97
tools/parse_options.sh
Executable file
@ -0,0 +1,97 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
|
||||
# Arnab Ghoshal, Karel Vesely
|
||||
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
|
||||
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
|
||||
# MERCHANTABLITY OR NON-INFRINGEMENT.
|
||||
# See the Apache 2 License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
# Parse command-line options.
|
||||
# To be sourced by another script (as in ". parse_options.sh").
|
||||
# Option format is: --option-name arg
|
||||
# and shell variable "option_name" gets set to value "arg."
|
||||
# The exception is --help, which takes no arguments, but prints the
|
||||
# $help_message variable (if defined).
|
||||
|
||||
|
||||
###
|
||||
### The --config file options have lower priority to command line
|
||||
### options, so we need to import them first...
|
||||
###
|
||||
|
||||
# Now import all the configs specified by command-line, in left-to-right order
|
||||
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    # Quote the path so config files whose names contain spaces still work.
    [ ! -r "$config" ] && echo "$0: missing config '$config'" 1>&2 && exit 1
    . "$config"  # source the config file.
  fi
done


###
### Now we process the command line options
###
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit.  Scripts should put help messages in $help_message
    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
      else printf "$help_message\n" 1>&2 ; fi;
      exit 0 ;;
    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'" 1>&2
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die.  Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined.  We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # An option given as the last word has no value.  Without this check
      # "shift 2" below would fail without shifting anything and the loop
      # would never terminate.
      [ $# -lt 2 ] && echo "$0: option $1 requires an argument" 1>&2 && exit 1;

      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
    *) break;
  esac
done


# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;


true; # so this script returns exit code 0.
|
||||
26
tools/wav2dur.py
Executable file
26
tools/wav2dur.py
Executable file
@ -0,0 +1,26 @@
|
||||
#!/usr/bin/env python3
|
||||
# encoding: utf-8
|
||||
|
||||
import sys
|
||||
|
||||
import torchaudio
|
||||
# Select the sox_io backend before any file is loaded.
torchaudio.set_audio_backend("sox_io")

# Usage: wav2dur.py <in_scp> <out_dur_scp>
#   in_scp:      lines of "<wav_id> <wav_path>"
#   out_dur_scp: receives lines of "<wav_id> <duration>", where duration is
#                the number of samples divided by the sample rate.
scp = sys.argv[1]
dur_scp = sys.argv[2]

# Use UTF-8 explicitly so ids and paths do not depend on the locale's
# default encoding.
with open(scp, 'r', encoding='utf-8') as f, \
        open(dur_scp, 'w', encoding='utf-8') as fout:
    cnt = 0
    total_duration = 0
    for line in f:
        items = line.strip().split()
        # Skip blank lines instead of failing on items[0].
        if not items:
            continue
        wav_id = items[0]
        fname = items[1]
        cnt += 1
        waveform, rate = torchaudio.load(fname)
        # Length of the first channel = number of samples per channel.
        frames = len(waveform[0])
        duration = frames / float(rate)
        total_duration += duration
        fout.write('{} {}\n'.format(wav_id, duration))
    print('process {} utts'.format(cnt))
    print('total {} s'.format(total_duration))
|
||||
27
tools/wav_to_duration.sh
Executable file
27
tools/wav_to_duration.sh
Executable file
@ -0,0 +1,27 @@
|
||||
#!/bin/bash
|
||||
# split the wav scp, calculate duration and merge
|
||||
# Number of parallel jobs; can be overridden with "--nj <n>".
nj=4
. tools/parse_options.sh || exit 1;

if [ $# -lt 2 ]; then
  echo "Usage: $0 [--nj <n>] <in-wav-scp> <out-duration-scp> [<log-dir>]" 1>&2
  exit 1
fi

inscp=$1
outscp=$2
data=$(dirname "${inscp}")
# The optional third argument overrides the default log directory, which
# sits next to the input scp.
if [ $# -eq 3 ]; then
  logdir=$3
else
  logdir=${data}/log
fi
mkdir -p "${logdir}"

# Remove leftovers of a previous run so stale pieces are not merged below.
rm -f "$logdir"/wav_*.slice
rm -f "$logdir"/wav_*.shape
split --additional-suffix .slice -d -n "l/$nj" "$inscp" "$logdir/wav_"

# Iterate over the glob directly instead of parsing "ls" output, so paths
# containing whitespace are handled correctly.
for slice in "$logdir"/wav_*.slice; do
  # With no match the pattern stays literal; skip it.
  [ -e "$slice" ] || continue
{
  # This group runs in the background (a subshell), so "name" is private to
  # each job.
  name=$(basename -s .slice "$slice")
  tools/wav2dur.py "$slice" "$logdir/$name.shape" 1>"$logdir/$name.log"
} &
done
wait
cat "$logdir"/wav_*.shape > "$outscp"
|
||||
Loading…
x
Reference in New Issue
Block a user