diff --git a/examples/speechcommand_v1/s0/kws b/examples/speechcommand_v1/s0/kws
new file mode 120000
index 0000000..7a3e8e1
--- /dev/null
+++ b/examples/speechcommand_v1/s0/kws
@@ -0,0 +1 @@
+../../../kws
\ No newline at end of file
diff --git a/examples/speechcommand_v1/s0/local/data_download.sh b/examples/speechcommand_v1/s0/local/data_download.sh
new file mode 100755
index 0000000..1622cc7
--- /dev/null
+++ b/examples/speechcommand_v1/s0/local/data_download.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Jingyong Hou (houjingyong@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Download the Google Speech Commands v0.01 archive into --dl_dir and
+# unpack it under <dl_dir>/speech_commands_v1/audio. Both steps are skipped
+# when their result is already present, so the script can be re-run safely.
+
+[ -f ./path.sh ] && . ./path.sh
+
+dl_dir=./data/local
+
+. tools/parse_options.sh || exit 1;
+
+# Abort on the first failing command so that a broken download or a failed
+# extraction is reported to the caller instead of being masked by "exit 0".
+set -euo pipefail
+
+data_dir=$dl_dir
+file_name=speech_commands_v0.01.tar.gz
+speech_command_dir=$data_dir/speech_commands_v1
+audio_dir=$speech_command_dir/audio
+url=http://download.tensorflow.org/data/$file_name
+mkdir -p "$data_dir"
+if [ ! -f "$data_dir/$file_name" ]; then
+  echo "downloading $url..."
+  # Fetch into a temporary name and rename only on success: wget -O creates
+  # the output file even when the transfer fails, which would otherwise make
+  # the next run skip the download and work on a truncated archive.
+  wget -O "$data_dir/$file_name.part" "$url"
+  mv "$data_dir/$file_name.part" "$data_dir/$file_name"
+else
+  echo "$file_name exist in $data_dir, skip download it"
+fi
+
+if [ ! -f "$speech_command_dir/.extracted" ]; then
+  mkdir -p "$audio_dir"
+  tar -xzvf "$data_dir/$file_name" -C "$audio_dir"
+  # The marker is written only after tar succeeded (set -e above).
+  touch "$speech_command_dir/.extracted"
+else
+  echo "$speech_command_dir/.extracted exist in $speech_command_dir, skip extraction"
+fi
+
+exit 0
diff --git a/examples/speechcommand_v1/s0/local/prepare_speech_command.py b/examples/speechcommand_v1/s0/local/prepare_speech_command.py
new file mode 100755
index 0000000..e10372e
--- /dev/null
+++ b/examples/speechcommand_v1/s0/local/prepare_speech_command.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+# Copyright (c) 2021 Jingyong Hou (houjingyong@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Prepare Kaldi format files for the google speech command dataset."""
+
+import os
+import argparse
+
+CLASSES = 'unknown, yes, no, up, down, left, right, on, off, stop, go'.split(
+    ', ')
+CLASS_TO_IDX = {name: str(idx) for idx, name in enumerate(CLASSES)}
+
+
+def parse_wav_path(wav_path):
+    """Derive the utterance id and label index from a wav file path.
+
+    The keyword is the name of the directory that directly contains the wav
+    file; keywords that are not listed in ``CLASSES`` are labelled as
+    ``unknown``.
+
+    Args:
+        wav_path: path of the form ``.../<keyword>/<name>.wav``.
+
+    Returns:
+        A ``(wav_id, label)`` tuple of strings, where ``wav_id`` is
+        ``<keyword>_<name>``.
+
+    Raises:
+        ValueError: if ``wav_path`` has no parent directory component.
+    """
+    parts = wav_path.split('/')
+    if len(parts) < 2:
+        raise ValueError(
+            'expected .../<keyword>/<name>.wav, got: ' + repr(wav_path))
+    keyword, file_name = parts[-2:]
+    wav_id = '_'.join([keyword, file_name.split('.')[0]])
+    label = CLASS_TO_IDX.get(keyword, CLASS_TO_IDX['unknown'])
+    return wav_id, label
+
+
+def main():
+    """Parse the command line and write ``wav.scp`` and ``text``."""
+    parser = argparse.ArgumentParser(
+        description='prepare kaldi format file for google speech command')
+    parser.add_argument(
+        '--wav_list',
+        required=True,
+        help='file listing the full path of each wav file in google speech '
+        'command dataset, one path per line')
+    parser.add_argument('--data_dir',
+                        required=True,
+                        help='folder to write kaldi format files')
+    args = parser.parse_args()
+
+    data_dir = args.data_dir
+    # Context managers close all three files even if a line fails to parse.
+    with open(args.wav_list) as f_list, \
+            open(os.path.join(data_dir, 'wav.scp'), 'w') as f_wav_scp, \
+            open(os.path.join(data_dir, 'text'), 'w') as f_text:
+        for line in f_list:
+            wav_path = line.strip()
+            if not wav_path:
+                # Tolerate blank lines (e.g. a trailing newline) in the list.
+                continue
+            wav_id, label = parse_wav_path(wav_path)
+            f_wav_scp.write(wav_id + ' ' + wav_path + '\n')
+            f_text.write(wav_id + ' ' + label + '\n')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/speechcommand_v1/s0/local/split_dataset.py b/examples/speechcommand_v1/s0/local/split_dataset.py
new file mode 100755
index 0000000..b1ac3cf
--- /dev/null
+++ b/examples/speechcommand_v1/s0/local/split_dataset.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+# Copyright (c) 2021 Jingyong Hou (houjingyong@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Split the google speech commands into train, validation and test set."""
+
+import os
+import shutil
+import argparse
+
+
+def move_files(src_folder, to_folder, list_file):
+    """Move the files named in ``list_file`` from one folder to another.
+
+    Each non-empty line of ``list_file`` is a path relative to
+    ``src_folder``. The file is moved to the same relative directory under
+    ``to_folder``, which is created on demand.
+
+    Args:
+        src_folder: folder that currently holds the listed files.
+        to_folder: folder that receives the listed files.
+        list_file: text file with one relative path per line.
+    """
+    with open(list_file) as f:
+        for line in f:
+            rel_path = line.rstrip()
+            if not rel_path:
+                # An empty entry would resolve to src_folder itself, and
+                # shutil.move would then relocate the whole source tree.
+                continue
+            dest = os.path.join(to_folder, os.path.dirname(rel_path))
+            # makedirs also covers nested or not-yet-existing parents.
+            os.makedirs(dest, exist_ok=True)
+            shutil.move(os.path.join(src_folder, rel_path), dest)
+
+
+def main():
+    """Move the validation and test files out of ``<root>/audio``.
+
+    What is left in ``<root>/audio`` afterwards becomes ``<root>/train``.
+    """
+    parser = argparse.ArgumentParser(
+        description='Split google command dataset.')
+    parser.add_argument(
+        'root',
+        type=str,
+        help='the path to the root folder of the google commands dataset')
+    args = parser.parse_args()
+
+    audio_folder = os.path.join(args.root, 'audio')
+    validation_path = os.path.join(audio_folder, 'validation_list.txt')
+    test_path = os.path.join(audio_folder, 'testing_list.txt')
+
+    valid_folder = os.path.join(args.root, 'valid')
+    test_folder = os.path.join(args.root, 'test')
+    train_folder = os.path.join(args.root, 'train')
+
+    if not os.path.isdir(audio_folder) and os.path.isdir(train_folder):
+        # audio/ is renamed to train/ as the last step, so this state means
+        # a previous run already finished; re-running must not fail.
+        print('{} already exists, skip splitting'.format(train_folder))
+        return
+
+    os.makedirs(valid_folder, exist_ok=True)
+    os.makedirs(test_folder, exist_ok=True)
+
+    move_files(audio_folder, test_folder, test_path)
+    move_files(audio_folder, valid_folder, validation_path)
+    os.rename(audio_folder, train_folder)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/speechcommand_v1/s0/path.sh b/examples/speechcommand_v1/s0/path.sh
new file mode 100755
index 0000000..b90a515
--- /dev/null
+++ b/examples/speechcommand_v1/s0/path.sh
@@ -0,0 +1,5 @@
+export PATH=$PWD:$PATH
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=../../../:$PYTHONPATH
diff --git a/examples/speechcommand_v1/s0/run.sh b/examples/speechcommand_v1/s0/run.sh
new file mode 100755
index 0000000..0149e54
--- /dev/null
+++ b/examples/speechcommand_v1/s0/run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright 2021  Binbin Zhang
+#                 Jingyong Hou
+
+. ./path.sh
+
+export CUDA_VISIBLE_DEVICES="0"
+
+stage=-1
+stop_stage=0
+
+# your data dir
+download_dir=/mnt/mnt-data-3/jingyong.hou/data
+# Left empty here and derived from download_dir after option parsing, so
+# that overriding --download_dir alone is honoured by stage 0 as well.
+speech_command_dir=
+. tools/parse_options.sh || exit 1;
+
+if [ -z "$speech_command_dir" ]; then
+  speech_command_dir=$download_dir/speech_commands_v1
+fi
+
+set -euo pipefail
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+  echo "Download and extract all datasets"
+  local/data_download.sh --dl_dir "$download_dir"
+  python local/split_dataset.py "$download_dir/speech_commands_v1"
+fi
+
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+  echo "Start preparing Kaldi format files"
+  for x in train test valid;
+  do
+    data=data/$x
+    mkdir -p "$data"
+    # make wav.scp and text file
+    # "*.wav" is quoted so that find, not the shell, expands the pattern.
+    find "$speech_command_dir/$x" -name "*.wav" | grep -v "_background_noise_" > "$data/wav.list"
+    python local/prepare_speech_command.py --wav_list="$data/wav.list" --data_dir="$data"
+  done
+fi
+
diff --git a/examples/speechcommand_v1/s0/tools b/examples/speechcommand_v1/s0/tools
new file mode 120000
index 0000000..c92f417
--- /dev/null
+++ b/examples/speechcommand_v1/s0/tools
@@ -0,0 +1 @@
+../../../tools
\ No newline at end of file