[tools] add tools

2021-11-10 18:43:28 +08:00 · 2021-11-10 18:43:28 +08:00 · f629c0fa54
commit f629c0fa54
parent 679ed2e98f
5 changed files with 341 additions and 0 deletions
--- a/tools/compute_cmvn_stats.py
+++ b/tools/compute_cmvn_stats.py
@ -0,0 +1,137 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+
+import sys
+import argparse
+import json
+import codecs
+import yaml
+
+import torch
+import torchaudio
+import torchaudio.compliance.kaldi as kaldi
+from torch.utils.data import Dataset, DataLoader
+
+torchaudio.set_audio_backend("sox_io")
+
+
+class CollateFunc(object):
+    ''' Collate function for AudioDataset
+
+    Reduces a batch of ``(key, wav_spec)`` items to partial CMVN statistics.
+    ``wav_spec`` is either ``<path>`` or ``<path>,<start>,<end>``; in the
+    latter form ``start``/``end`` are multiplied by the sample rate to get
+    sample offsets (i.e. they are treated as seconds).
+
+    Args:
+        feat_dim: number of mel bins of the fbank features.
+        resample_rate: target sample rate, 0 disables resampling.
+    '''
+    def __init__(self, feat_dim, resample_rate):
+        self.feat_dim = feat_dim
+        self.resample_rate = resample_rate
+
+    def __call__(self, batch):
+        ''' Accumulate fbank statistics over one batch
+
+        Returns:
+            tuple ``(number, mean_stat, var_stat)``: the frame count, the
+            per-bin sum of the features and the per-bin sum of squares.
+        '''
+        mean_stat = torch.zeros(self.feat_dim)
+        var_stat = torch.zeros(self.feat_dim)
+        number = 0
+        for item in batch:
+            value = item[1].strip().split(",")
+            # len(value) == 3 means segmented wav.scp,
+            # len(value) == 1 means original wav.scp
+            assert len(value) == 3 or len(value) == 1, \
+                'bad wav entry for {}: {}'.format(item[0], item[1])
+            wav_path = value[0]
+            if len(value) == 3:
+                # Only segmented entries need the rate before loading, to
+                # convert the segment bounds into sample offsets.
+                sample_rate = torchaudio.backend.sox_io_backend.info(
+                    wav_path).sample_rate
+                start_frame = int(float(value[1]) * sample_rate)
+                end_frame = int(float(value[2]) * sample_rate)
+                waveform, sample_rate = torchaudio.backend.sox_io_backend.load(
+                    filepath=wav_path,
+                    num_frames=end_frame - start_frame,
+                    frame_offset=start_frame)
+            else:
+                waveform, sample_rate = torchaudio.load(wav_path)
+
+            # Scale by 2**15 (assumes load() yields normalized float samples,
+            # so this maps them to the 16-bit integer range -- TODO confirm).
+            waveform = waveform * (1 << 15)
+            resample_rate = sample_rate
+            if self.resample_rate != 0 and self.resample_rate != sample_rate:
+                resample_rate = self.resample_rate
+                waveform = torchaudio.transforms.Resample(
+                    orig_freq=sample_rate, new_freq=resample_rate)(waveform)
+
+            mat = kaldi.fbank(waveform,
+                              num_mel_bins=self.feat_dim,
+                              dither=0.0,
+                              energy_floor=0.0,
+                              sample_frequency=resample_rate)
+            mean_stat += torch.sum(mat, dim=0)
+            var_stat += torch.sum(torch.square(mat), dim=0)
+            number += mat.shape[0]
+        return number, mean_stat, var_stat
+
+
+class AudioDataset(Dataset):
+    ''' Dataset over the entries of a wav.scp style file
+
+    Every non-empty line must hold at least two whitespace separated
+    columns; the first two are stored as a ``(key, wav_spec)`` tuple.
+
+    Args:
+        data_file: path of the utf-8 encoded list file.
+
+    Raises:
+        ValueError: if a non-empty line has fewer than two columns.
+    '''
+    def __init__(self, data_file):
+        self.items = []
+        with codecs.open(data_file, 'r', encoding='utf-8') as f:
+            for line_no, line in enumerate(f, 1):
+                arr = line.strip().split()
+                if not arr:
+                    # Tolerate blank lines instead of dying on an IndexError.
+                    continue
+                if len(arr) < 2:
+                    raise ValueError('{}:{}: expected "<key> <wav>", '
+                                     'got {!r}'.format(data_file, line_no,
+                                                       line.strip()))
+                self.items.append((arr[0], arr[1]))
+
+    def __len__(self):
+        return len(self.items)
+
+    def __getitem__(self, idx):
+        return self.items[idx]
+
+
+if __name__ == '__main__':
+    # Entry point: accumulate global CMVN statistics (per-bin feature sum,
+    # sum of squares and frame count) over all wavs listed in --in_scp and
+    # dump them as JSON to --out_cmvn.
+    parser = argparse.ArgumentParser(description='extract CMVN stats')
+    parser.add_argument('--num_workers',
+                        default=0,
+                        type=int,
+                        help='num of subprocess workers for processing')
+    # Both inputs are mandatory; without them the script used to fail later
+    # with an unrelated open() error.
+    parser.add_argument('--train_config',
+                        required=True,
+                        help='training yaml conf')
+    parser.add_argument('--in_scp', required=True, help='wav scp file')
+    parser.add_argument('--out_cmvn',
+                        default='global_cmvn',
+                        help='global cmvn file')
+
+    args = parser.parse_args()
+
+    with open(args.train_config, 'r') as fin:
+        configs = yaml.load(fin, Loader=yaml.FullLoader)
+    feat_dim = configs['dataset_conf']['fbank_conf']['num_mel_bins']
+    # 0 means "keep the native sample rate of every file".
+    resample_rate = 0
+    if 'resample_conf' in configs['dataset_conf']:
+        resample_rate = configs['dataset_conf']['resample_conf']['resample_rate']
+        print('using resample and new sample rate is {}'.format(resample_rate))
+
+    collate_func = CollateFunc(feat_dim, resample_rate)
+    dataset = AudioDataset(args.in_scp)
+    batch_size = 20
+    # The statistics are plain sums, so the visiting order is irrelevant;
+    # not shuffling keeps the floating point result reproducible.
+    data_loader = DataLoader(dataset,
+                             batch_size=batch_size,
+                             shuffle=False,
+                             sampler=None,
+                             num_workers=args.num_workers,
+                             collate_fn=collate_func)
+
+    with torch.no_grad():
+        all_number = 0
+        all_mean_stat = torch.zeros(feat_dim)
+        all_var_stat = torch.zeros(feat_dim)
+        wav_number = 0
+        for number, mean_stat, var_stat in data_loader:
+            all_mean_stat += mean_stat
+            all_var_stat += var_stat
+            all_number += number
+            # The last batch may be smaller than batch_size; clamp so the
+            # reported count never exceeds the dataset size.
+            wav_number = min(wav_number + batch_size, len(dataset))
+            if wav_number % 1000 == 0:
+                print(f'processed {wav_number} wavs, {all_number} frames',
+                      file=sys.stderr,
+                      flush=True)
+
+    cmvn_info = {
+        'mean_stat': all_mean_stat.tolist(),
+        'var_stat': all_var_stat.tolist(),
+        'frame_num': all_number
+    }
+
+    with open(args.out_cmvn, 'w') as fout:
+        fout.write(json.dumps(cmvn_info))
--- a/tools/make_list.py
+++ b/tools/make_list.py
@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Mobvoi Inc. (authors: Binbin Zhang)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+
+def _read_table(path, convert=str):
+    ''' Read a two-column "<key> <value>" file into a dict
+
+    Blank lines are skipped; any other line must have exactly two
+    whitespace separated columns, the second one is passed to `convert`.
+    '''
+    table = {}
+    with open(path, 'r', encoding='utf8') as fin:
+        for line_no, line in enumerate(fin, 1):
+            arr = line.strip().split()
+            if not arr:
+                continue
+            if len(arr) != 2:
+                raise ValueError('{}:{}: expected 2 columns, got {!r}'.format(
+                    path, line_no, line.strip()))
+            table[arr[0]] = convert(arr[1])
+    return table
+
+
+if __name__ == '__main__':
+    # Join the wav, text and duration tables on the utterance key and write
+    # one JSON object per line to the output list file.
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument('wav_file', help='wav file')
+    parser.add_argument('text_file', help='text file')
+    parser.add_argument('duration_file', help='duration file')
+    parser.add_argument('output_file', help='output list file')
+    args = parser.parse_args()
+
+    wav_table = _read_table(args.wav_file)
+    duration_table = _read_table(args.duration_file, float)
+
+    with open(args.text_file, 'r', encoding='utf8') as fin, \
+         open(args.output_file, 'w', encoding='utf8') as fout:
+        for line_no, line in enumerate(fin, 1):
+            arr = line.strip().split(maxsplit=1)
+            if not arr:
+                continue
+            if len(arr) != 2:
+                raise ValueError('{}:{}: expected "<key> <label>", '
+                                 'got {!r}'.format(args.text_file, line_no,
+                                                   line.strip()))
+            key = arr[0]
+            # The second column is parsed as an integer.
+            txt = int(arr[1])
+            # Explicit checks instead of assert, which is stripped under -O.
+            if key not in wav_table:
+                raise KeyError('{} not found in {}'.format(key, args.wav_file))
+            if key not in duration_table:
+                raise KeyError('{} not found in {}'.format(
+                    key, args.duration_file))
+            entry = dict(key=key,
+                         txt=txt,
+                         duration=duration_table[key],
+                         wav=wav_table[key])
+            json_line = json.dumps(entry, ensure_ascii=False)
+            fout.write(json_line + '\n')
--- a/tools/parse_options.sh
+++ b/tools/parse_options.sh
@ -0,0 +1,97 @@
+#!/bin/bash
+
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
+#                 Arnab Ghoshal, Karel Vesely
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+
+
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+  if [ "${!argpos}" == "--config" ]; then
+    # The config path is the argument right after "--config".
+    argpos_plus1=$((argpos+1))
+    config=${!argpos_plus1}
+    # Quote the path: unquoted, an empty value turns the test into
+    # "[ ! -r ]" (which is false, so the check is skipped) and a path
+    # containing spaces is split into several words.
+    [ ! -r "$config" ] && echo "$0: missing config '$config'" && exit 1
+    . "$config"  # source the config file.
+  fi
+done
+
+
+###
+### Now we process the command line options
+###
+# Consume leading "--name value" pairs from the positional parameters and
+# assign each value to the (already defined) shell variable "name"; stop at
+# the first argument that is not an option.
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    # If the enclosing script is called with --help option, print the help
+    # message and exit.  Scripts should put help messages in $help_message
+    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+      else printf "$help_message\n" 1>&2 ; fi;
+      exit 0 ;;
+    # The "--name=value" form is rejected; only "--name value" is accepted.
+    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+      exit 1 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar),
+    # then work out the variable name as $name, which will equal "foo_bar".
+    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+      # Next we test whether the variable in question is undefined-- if so it's
+      # an invalid option and we die.  Note: $0 evaluates to the name of the
+      # enclosing script.
+      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+      # is undefined.  We then have to wrap this test inside "eval" because
+      # foo_bar is itself inside a variable ($name).
+      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+
+      # Remember the current value so we can tell whether the option is
+      # Boolean-valued.
+      oldval="`eval echo \\$$name`";
+      # Work out whether we seem to be expecting a Boolean argument.
+      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval $name=\"$2\";
+
+      # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      # Drop the option name and its value.
+      shift 2;
+      ;;
+  # Anything else ends option parsing; remaining arguments are left in place.
+  *) break;
+  esac
+done
+
+
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+# ${cmd+xxx} is non-empty only when cmd is set, so this fires when cmd is
+# set to the empty string.
+if [ -n "${cmd+xxx}" ] && [ -z "$cmd" ]; then
+  echo "$0: empty argument to --cmd option" 1>&2
+  exit 1
+fi
+
+
+true; # so this script returns exit code 0.
--- a/tools/wav2dur.py
+++ b/tools/wav2dur.py
@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+
+import sys
+
+import torchaudio
+torchaudio.set_audio_backend("sox_io")
+
+# Usage: wav2dur.py <wav.scp> <dur.scp>
+# Reads "<wav_id> <wav_path>" lines and writes "<wav_id> <duration>" lines,
+# where duration is frame count divided by sample rate.
+if len(sys.argv) < 3:
+    print('Usage: {} <wav.scp> <dur.scp>'.format(sys.argv[0]),
+          file=sys.stderr)
+    sys.exit(1)
+
+scp = sys.argv[1]
+dur_scp = sys.argv[2]
+
+# Explicit encoding so utterance ids and paths do not depend on the locale.
+with open(scp, 'r', encoding='utf8') as f, \
+     open(dur_scp, 'w', encoding='utf8') as fout:
+    cnt = 0
+    total_duration = 0
+    for line in f:
+        items = line.strip().split()
+        if not items:
+            # Skip blank lines instead of failing with an IndexError.
+            continue
+        wav_id = items[0]
+        fname = items[1]
+        cnt += 1
+        waveform, rate = torchaudio.load(fname)
+        # Number of samples of the first channel.
+        frames = len(waveform[0])
+        duration = frames / float(rate)
+        total_duration += duration
+        fout.write('{} {}\n'.format(wav_id, duration))
+    print('process {} utts'.format(cnt))
+    print('total {} s'.format(total_duration))
--- a/tools/wav_to_duration.sh
+++ b/tools/wav_to_duration.sh
@ -0,0 +1,27 @@
+#!/bin/bash
+# split the wav scp, calculate duration and merge
+nj=4
+. tools/parse_options.sh || exit 1;
+
+if [ $# -lt 2 ]; then
+  echo "Usage: $0 [--nj <n>] <in-wav-scp> <out-dur-scp> [<log-dir>]" 1>&2
+  exit 1
+fi
+
+inscp=$1
+outscp=$2
+data=$(dirname "${inscp}")
+if [ $# -eq 3 ]; then
+  logdir=$3
+else
+  logdir=${data}/log
+fi
+mkdir -p "${logdir}"
+
+# Remove leftovers of a previous run so they are not merged into the output.
+rm -f "$logdir"/wav_*.slice
+rm -f "$logdir"/wav_*.shape
+split --additional-suffix .slice -d -n l/$nj "$inscp" "$logdir/wav_"
+
+# Process every slice in the background; iterate over the glob directly
+# rather than parsing the output of ls, which breaks on unusual paths.
+pids=()
+for slice in "$logdir"/wav_*.slice; do
+  [ -e "$slice" ] || continue
+  name=$(basename -s .slice "$slice")
+  tools/wav2dur.py "$slice" "$logdir/$name.shape" 1>"$logdir/$name.log" &
+  pids+=($!)
+done
+
+# Wait for each job individually so that a failing job is noticed instead
+# of silently producing a truncated duration file.
+failed=0
+for pid in "${pids[@]}"; do
+  wait "$pid" || failed=1
+done
+if [ $failed -ne 0 ]; then
+  echo "$0: at least one wav2dur.py job failed, see $logdir/wav_*.log" 1>&2
+  exit 1
+fi
+cat "$logdir"/wav_*.shape > "$outscp"