[examples] speech command data prepare (#27)

* [examples] added speech command data preparation code * update * update path.sh
2021-12-06 12:00:25 +08:00 · 2021-12-06 12:00:25 +08:00 · 8be4bef405
commit 8be4bef405
parent 5241491e95
7 changed files with 191 additions and 0 deletions
--- a/examples/speechcommand_v1/s0/kws
+++ b/examples/speechcommand_v1/s0/kws
@ -0,0 +1 @@
+../../../kws
--- a/examples/speechcommand_v1/s0/local/data_download.sh
+++ b/examples/speechcommand_v1/s0/local/data_download.sh
@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Jingyong Hou (houjingyong@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[ -f ./path.sh ] && . ./path.sh
+
+# Directory that receives the downloaded archive and the extracted data;
+# override with --dl_dir <path>.
+dl_dir=./data/local
+
+. tools/parse_options.sh || exit 1;
+
+# Strict mode is enabled only after option parsing, the same order run.sh uses.
+# From here on any failing command aborts the script, so a failed download or
+# extraction can no longer reach the final `exit 0`.
+set -euo pipefail
+
+data_dir=$dl_dir
+file_name=speech_commands_v0.01.tar.gz
+speech_command_dir=$data_dir/speech_commands_v1
+audio_dir=$speech_command_dir/audio
+url=http://download.tensorflow.org/data/$file_name
+mkdir -p "$data_dir"
+if [ ! -f "$data_dir/$file_name" ]; then
+    echo "downloading $url..."
+    # wget -O creates its output file even when the transfer fails. Download to
+    # a temporary name and rename only on success, so a truncated archive is
+    # never mistaken for a finished download on the next run.
+    wget -O "$data_dir/$file_name.part" "$url"
+    mv "$data_dir/$file_name.part" "$data_dir/$file_name"
+else
+    echo "$file_name exist in $data_dir, skip download it"
+fi
+
+# .extracted is a marker file: it is created only after tar succeeded.
+if [ ! -f "$speech_command_dir/.extracted" ]; then
+    mkdir -p "$audio_dir"
+    tar -xzvf "$data_dir/$file_name" -C "$audio_dir"
+    touch "$speech_command_dir/.extracted"
+else
+    echo "$speech_command_dir/.extracted exist in $speech_command_dir, skip extraction"
+fi
+
+exit 0
--- a/examples/speechcommand_v1/s0/local/prepare_speech_command.py
+++ b/examples/speechcommand_v1/s0/local/prepare_speech_command.py
@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+# Copyright (c) 2021 Jingyong Hou (houjingyong@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import argparse
+
+CLASSES = 'unknown, yes, no, up, down, left, right, on, off, stop, go'.split(
+    ', ')
+CLASS_TO_IDX = {CLASSES[i]: str(i) for i in range(len(CLASSES))}
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='prepare kaldi format file for google speech command')
+    parser.add_argument(
+        '--wav_list',
+        required=True,
+        help='full path of a wav file in google speech command dataset')
+    parser.add_argument('--data_dir',
+                        required=True,
+                        help='folder to write kaldi format files')
+    args = parser.parse_args()
+
+    data_dir = args.data_dir
+    f_wav_scp = open(os.path.join(data_dir, 'wav.scp'), 'w')
+    f_text = open(os.path.join(data_dir, 'text'), 'w')
+    with open(args.wav_list) as f:
+        for line in f.readlines():
+            keyword, file_name = line.strip().split('/')[-2:]
+            file_name_new = file_name.split('.')[0]
+            wav_id = '_'.join([keyword, file_name_new])
+            file_dir = line.strip()
+            f_wav_scp.writelines(wav_id + ' ' + file_dir + '\n')
+            label = CLASS_TO_IDX[
+                keyword] if keyword in CLASS_TO_IDX else CLASS_TO_IDX["unknown"]
+            f_text.writelines(wav_id + ' ' + str(label) + '\n')
+    f_wav_scp.close()
+    f_text.close()
--- a/examples/speechcommand_v1/s0/local/split_dataset.py
+++ b/examples/speechcommand_v1/s0/local/split_dataset.py
@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+# Copyright (c) 2021 Jingyong Hou (houjingyong@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import shutil
+import argparse
+
+
+def move_files(src_folder, to_folder, list_file):
+    with open(list_file) as f:
+        for line in f.readlines():
+            line = line.rstrip()
+            dirname = os.path.dirname(line)
+            dest = os.path.join(to_folder, dirname)
+            if not os.path.exists(dest):
+                os.mkdir(dest)
+            shutil.move(os.path.join(src_folder, line), dest)
+
+
+if __name__ == '__main__':
+    '''Splits the google speech commands into train, validation and test set'''
+    parser = argparse.ArgumentParser(
+        description='Split google command dataset.')
+    parser.add_argument(
+        'root',
+        type=str,
+        help='the path to the root folder of the google commands dataset')
+    args = parser.parse_args()
+
+    audio_folder = os.path.join(args.root, 'audio')
+    validation_path = os.path.join(audio_folder, 'validation_list.txt')
+    test_path = os.path.join(audio_folder, 'testing_list.txt')
+
+    valid_folder = os.path.join(args.root, 'valid')
+    test_folder = os.path.join(args.root, 'test')
+    train_folder = os.path.join(args.root, 'train')
+
+    os.mkdir(valid_folder)
+    os.mkdir(test_folder)
+
+    move_files(audio_folder, test_folder, test_path)
+    move_files(audio_folder, valid_folder, validation_path)
+    os.rename(audio_folder, train_folder)
--- a/examples/speechcommand_v1/s0/path.sh
+++ b/examples/speechcommand_v1/s0/path.sh
@ -0,0 +1,5 @@
+# Environment setup sourced by run.sh and local/data_download.sh.
+export PATH="$PWD:$PATH"
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+# Prepend the repository root (three levels above this recipe directory).
+# ${PYTHONPATH:+...} appends the previous value only when it is set and
+# non-empty, which avoids a trailing ':' (an empty search-path entry) and an
+# unbound-variable error if this file is sourced under `set -u`.
+export PYTHONPATH="../../../${PYTHONPATH:+:${PYTHONPATH}}"
--- a/examples/speechcommand_v1/s0/run.sh
+++ b/examples/speechcommand_v1/s0/run.sh
@ -0,0 +1,37 @@
+#!/bin/bash
+# Copyright 2021  Binbin Zhang
+#                 Jingyong Hou
+
+. ./path.sh
+
+export CUDA_VISIBLE_DEVICES="0"
+
+stage=-1
+stop_stage=0
+
+# your data dir
+download_dir=/mnt/mnt-data-3/jingyong.hou/data
+speech_command_dir=$download_dir/speech_commands_v1
+. tools/parse_options.sh || exit 1;
+
+set -euo pipefail
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+  echo "Download and extract all datasets"
+  local/data_download.sh --dl_dir $download_dir
+  python local/split_dataset.py $download_dir/speech_commands_v1
+fi
+
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+  echo "Start preparing Kaldi format files"
+  for x in train test valid;
+  do
+    data=data/$x
+    mkdir -p $data
+    # make wav.scp utt2spk text file
+    find $speech_command_dir/$x -name *.wav | grep -v "_background_noise_" > $data/wav.list
+    python local/prepare_speech_command.py --wav_list=$data/wav.list --data_dir=$data
+  done
+fi
+
--- a/examples/speechcommand_v1/s0/tools
+++ b/examples/speechcommand_v1/s0/tools
@ -0,0 +1 @@
+../../../tools