[wekws] add online noise and rir argumentation (#115)

* [wekws] add online noise and rir argumentation * format * format * update copyright Co-authored-by: menglong.xu <menglong.xu>
2022-11-28 21:12:26 +08:00 · 2022-11-28 21:12:26 +08:00 · 6da85d4662
commit 6da85d4662
parent 5c6088f947
8 changed files with 205 additions and 2 deletions
--- a/examples/hey_snips/s0/conf/ds_tcn.yaml
+++ b/examples/hey_snips/s0/conf/ds_tcn.yaml
@ -5,6 +5,8 @@ dataset_conf:
    resample_conf:
        resample_rate: 16000
    speed_perturb: false
    reverb_prob: 0.2
    noise_prob: 0.3
    feature_extraction_conf:
        feature_type: 'fbank'
        num_mel_bins: 40
--- a/examples/hey_snips/s0/run.sh
+++ b/examples/hey_snips/s0/run.sh
@ -20,6 +20,8 @@ num_average=30
 score_checkpoint=$dir/avg_${num_average}.pt
 download_dir=./data/local # your data dir
 noise_lmdb=
 reverb_lmdb=
 . tools/parse_options.sh || exit 1;
@ -78,6 +80,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    --min_duration 50 \
    --seed 777 \
    $cmvn_opts \
    ${reverb_lmdb:+--reverb_lmdb $reverb_lmdb} \
    ${noise_lmdb:+--noise_lmdb $noise_lmdb} \
    ${checkpoint:+--checkpoint $checkpoint}
 fi
--- a/requirements.txt
+++ b/requirements.txt
@ -11,3 +11,6 @@ flake8-pyi==20.5.0
 mccabe
 pycodestyle==2.6.0
 pyflakes==2.2.0
 lmdb
 scipy
 tqdm
--- a/tools/make_lmdb.py
+++ b/tools/make_lmdb.py
@ -0,0 +1,59 @@
 # Copyright (c) 2022 Binbin Zhang(binbzha@qq.com)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import argparse
 import math
 import pickle
 import lmdb
 from tqdm import tqdm
 def get_args():
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('in_scp_file', help='input scp file')
    parser.add_argument('out_lmdb', help='output lmdb')
    args = parser.parse_args()
    return args
 def main():
    args = get_args()
    db = lmdb.open(args.out_lmdb, map_size=int(math.pow(1024, 4)))  # 1TB
    # txn is for Transaciton
    txn = db.begin(write=True)
    keys = []
    with open(args.in_scp_file, 'r', encoding='utf8') as fin:
        lines = fin.readlines()
        for i, line in enumerate(tqdm(lines)):
            arr = line.strip().split()
            assert len(arr) == 2
            key, wav = arr[0], arr[1]
            keys.append(key)
            with open(wav, 'rb') as fin:
                data = fin.read()
            txn.put(key.encode(), data)
            # Write flush to disk
            if i % 100 == 0:
                txn.commit()
                txn = db.begin(write=True)
    txn.commit()
    with db.begin(write=True) as txn:
        txn.put(b'__keys__', pickle.dumps(keys))
    db.sync()
    db.close()
 if __name__ == '__main__':
    main()
--- a/wekws/bin/train.py
+++ b/wekws/bin/train.py
@ -77,6 +77,12 @@ def get_args():
                        default=100,
                        type=int,
                        help='prefetch number')
    parser.add_argument('--reverb_lmdb',
                        default=None,
                        help='reverb lmdb file')
    parser.add_argument('--noise_lmdb',
                        default=None,
                        help='noise lmdb file')
    args = parser.parse_args()
    return args
@ -106,7 +112,10 @@ def main():
    cv_conf['spec_aug'] = False
    cv_conf['shuffle'] = False
-    train_dataset = Dataset(args.train_data, train_conf)
+    train_dataset = Dataset(args.train_data,
                            train_conf,
                            reverb_lmdb=args.reverb_lmdb,
                            noise_lmdb=args.noise_lmdb)
    cv_dataset = Dataset(args.cv_data, cv_conf)
    train_data_loader = DataLoader(train_dataset,
--- a/wekws/dataset/dataset.py
+++ b/wekws/dataset/dataset.py
@ -20,6 +20,7 @@ from torch.utils.data import IterableDataset
 import wekws.dataset.processor as processor
 from wekws.utils.file_utils import read_lists
 from wekws.dataset.lmdb_data import LmdbData
 class Processor(IterableDataset):
@ -112,7 +113,10 @@ class DataList(IterableDataset):
            yield data
-def Dataset(data_list_file, conf, partition=True):
+def Dataset(data_list_file, conf,
            partition=True,
            reverb_lmdb=None,
            noise_lmdb=None):
    """ Construct dataset from arguments
        We have two shuffle stage in the Dataset. The first is global
@ -122,6 +126,8 @@ def Dataset(data_list_file, conf, partition=True):
        Args:
            data_type(str): raw/shard
            partition(bool): whether to do data partition in terms of rank
            reverb_lmdb: reverb data source lmdb file
            noise_lmdb: noise data source lmdb file
    """
    lists = read_lists(data_list_file)
    shuffle = conf.get('shuffle', True)
@ -136,6 +142,14 @@ def Dataset(data_list_file, conf, partition=True):
    speed_perturb = conf.get('speed_perturb', False)
    if speed_perturb:
        dataset = Processor(dataset, processor.speed_perturb)
    if reverb_lmdb and conf.get('reverb_prob', 0) > 0:
        reverb_data = LmdbData(reverb_lmdb)
        dataset = Processor(dataset, processor.add_reverb,
                            reverb_data, conf['reverb_prob'])
    if noise_lmdb and conf.get('noise_prob', 0) > 0:
        noise_data = LmdbData(noise_lmdb)
        dataset = Processor(dataset, processor.add_noise,
                            noise_data, conf['noise_prob'])
    feature_extraction_conf = conf.get('feature_extraction_conf', {})
    if feature_extraction_conf['feature_type'] == 'mfcc':
        dataset = Processor(dataset, processor.compute_mfcc,
--- a/wekws/dataset/lmdb_data.py
+++ b/wekws/dataset/lmdb_data.py
@ -0,0 +1,53 @@
 # Copyright (c) 2022 Binbin Zhang(binbzha@qq.com)
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import random
 import pickle
 import lmdb
 class LmdbData:
    def __init__(self, lmdb_file):
        self.db = lmdb.open(lmdb_file,
                            readonly=True,
                            lock=False,
                            readahead=False)
        with self.db.begin(write=False) as txn:
            obj = txn.get(b'__keys__')
            assert obj is not None
            self.keys = pickle.loads(obj)
            assert isinstance(self.keys, list)
    def random_one(self):
        assert len(self.keys) > 0
        index = random.randint(0, len(self.keys) - 1)
        key = self.keys[index]
        with self.db.begin(write=False) as txn:
            value = txn.get(key.encode())
            assert value is not None
        return key, value
    def __del__(self):
        self.db.close()
 if __name__ == '__main__':
    import sys
    db = LmdbData(sys.argv[1])
    key, _ = db.random_one()
    print(key)
    key, _ = db.random_one()
    print(key)
--- a/wekws/dataset/processor.py
+++ b/wekws/dataset/processor.py
@ -12,10 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import io
 import logging
 import json
 import random
 import numpy as np
 from scipy import signal
 from scipy.io import wavfile
 import torch
 import torchaudio
 import torchaudio.compliance.kaldi as kaldi
@ -304,3 +308,58 @@ def padding(data):
                                    batch_first=True,
                                    padding_value=0)
        yield (sorted_keys, padded_feats, sorted_labels, feats_lengths)
 def add_reverb(data, reverb_source, aug_prob):
    for sample in data:
        assert 'wav' in sample
        if aug_prob > random.random():
            audio = sample['wav'].numpy()[0]
            audio_len = audio.shape[0]
            _, rir_data = reverb_source.random_one()
            rir_io = io.BytesIO(rir_data)
            _, rir_audio = wavfile.read(rir_io)
            rir_audio = rir_audio.astype(np.float32)
            rir_audio = rir_audio / np.sqrt(np.sum(rir_audio**2))
            out_audio = signal.convolve(audio, rir_audio,
                                        mode='full')[:audio_len]
            out_audio = torch.from_numpy(out_audio)
            out_audio = torch.unsqueeze(out_audio, 0)
            sample['wav'] = out_audio
        yield sample
 def add_noise(data, noise_source, aug_prob):
    for sample in data:
        assert 'wav' in sample
        assert 'key' in sample
        if aug_prob > random.random():
            audio = sample['wav'].numpy()[0]
            audio_len = audio.shape[0]
            audio_db = 10 * np.log10(np.mean(audio**2) + 1e-4)
            key, noise_data = noise_source.random_one()
            if key.startswith('noise'):
                snr_range = [0, 15]
            elif key.startswith('speech'):
                snr_range = [5, 30]
            elif key.startswith('music'):
                snr_range = [5, 15]
            else:
                snr_range = [0, 15]
            _, noise_audio = wavfile.read(io.BytesIO(noise_data))
            noise_audio = noise_audio.astype(np.float32)
            if noise_audio.shape[0] > audio_len:
                start = random.randint(0, noise_audio.shape[0] - audio_len)
                noise_audio = noise_audio[start:start + audio_len]
            else:
                # Resize will repeat copy
                noise_audio = np.resize(noise_audio, (audio_len, ))
            noise_snr = random.uniform(snr_range[0], snr_range[1])
            noise_db = 10 * np.log10(np.mean(noise_audio**2) + 1e-4)
            noise_audio = np.sqrt(10**(
                (audio_db - noise_db - noise_snr) / 10)) * noise_audio
            out_audio = audio + noise_audio
            out_audio = torch.from_numpy(out_audio)
            out_audio = torch.unsqueeze(out_audio, 0)
            sample['wav'] = out_audio
        yield sample