[bin] use torchrun to launch ddp training (#42)

Author: Binbin Zhang, 2021-12-13 19:45:55 +08:00 (committed by GitHub)
parent 28652a766f
commit 171309bd9e
2 changed files with 41 additions and 67 deletions

run.sh

@@ -1,10 +1,8 @@
#!/bin/bash
# Copyright 2021 Binbin Zhang
# Copyright 2021 Binbin Zhang(binbzha@qq.com)
. ./path.sh
export CUDA_VISIBLE_DEVICES="0"
stage=0
stop_stage=4
num_keywords=2
@@ -12,7 +10,7 @@ num_keywords=2
config=conf/ds_tcn.yaml
norm_mean=true
norm_var=true
gpu_id=0
gpus="0,1"
checkpoint=
dir=exp/ds_tcn
@@ -24,7 +22,6 @@ download_dir=./data/local # your data dir
. tools/parse_options.sh || exit 1;
set -euo pipefail
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Download and extracte all datasets"
@@ -74,42 +71,37 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
cmvn_opts=
$norm_mean && cmvn_opts="--cmvn_file data/train/global_cmvn"
$norm_var && cmvn_opts="$cmvn_opts --norm_var"
python kws/bin/train.py --gpu $gpu_id \
--config $config \
--train_data data/train/data.list \
--cv_data data/dev/data.list \
--model_dir $dir \
--num_workers 8 \
--num_keywords $num_keywords \
--min_duration 50 \
--seed 666 \
$cmvn_opts \
${checkpoint:+--checkpoint $checkpoint}
num_gpus=$(echo $gpus | awk -F ',' '{print NF}')
torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus \
kws/bin/train.py --gpus $gpus \
--config $config \
--train_data data/train/data.list \
--cv_data data/dev/data.list \
--model_dir $dir \
--num_workers 8 \
--num_keywords $num_keywords \
--min_duration 50 \
--seed 666 \
$cmvn_opts \
${checkpoint:+--checkpoint $checkpoint}
fi
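
Note on the new launch: `num_gpus` is simply the number of comma-separated entries in `$gpus`, and `torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus` starts that many copies of kws/bin/train.py on this machine, exporting LOCAL_RANK, RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT to each of them. A minimal Python sketch of what one launched worker sees (the `0,1` value only stands in for whatever is passed via `--gpus`):

    import os

    # Values exported by torchrun; the defaults let the sketch also run standalone.
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))   # 0 .. nproc_per_node-1
    world_size = int(os.environ.get("WORLD_SIZE", "1"))   # == nproc_per_node here
    gpus = "0,1".split(",")                                # illustrative --gpus value
    print(f"worker {local_rank}/{world_size} will pin GPU {gpus[local_rank]}")
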
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# Do model average
echo "Do model average, Compute FRR/FAR ..."
python kws/bin/average_model.py \
--dst_model $score_checkpoint \
--src_path $dir \
--num ${num_average} \
--val_best
# Compute posterior score
result_dir=$dir/test_$(basename $score_checkpoint)
mkdir -p $result_dir
python kws/bin/score.py --gpu $gpu_id \
python kws/bin/score.py \
--config $dir/config.yaml \
--test_data data/test/data.list \
--batch_size 256 \
--checkpoint $score_checkpoint \
--score_file $result_dir/score.txt \
--num_workers 8
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Compute detection error tradeoff
result_dir=$dir/test_$(basename $score_checkpoint)
for keyword in 0 1; do
python kws/bin/compute_det.py \
--keyword $keyword \
@@ -120,7 +112,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
python kws/bin/export_jit.py --config $dir/config.yaml \
--checkpoint $score_checkpoint \
--output_file $dir/final.zip \

kws/bin/train.py

@@ -38,36 +38,20 @@ def get_args():
parser.add_argument('--config', required=True, help='config file')
parser.add_argument('--train_data', required=True, help='train data file')
parser.add_argument('--cv_data', required=True, help='cv data file')
parser.add_argument('--gpu',
type=int,
default=-1,
help='gpu id for this local rank, -1 for cpu')
parser.add_argument('--gpus',
default='-1',
help='gpu list, separated with `,`, -1 for cpu')
parser.add_argument('--model_dir', required=True, help='save model dir')
parser.add_argument('--seed', type=int, default=777, help='random seed')
parser.add_argument('--checkpoint', help='checkpoint model')
parser.add_argument('--tensorboard_dir',
default='tensorboard',
help='tensorboard log dir')
parser.add_argument('--ddp.rank',
dest='rank',
default=0,
type=int,
help='global rank for distributed training')
parser.add_argument('--ddp.world_size',
dest='world_size',
default=-1,
type=int,
help='''number of total processes/gpus for
distributed training''')
parser.add_argument('--ddp.dist_backend',
dest='dist_backend',
default='nccl',
choices=['nccl', 'gloo'],
help='distributed backend')
parser.add_argument('--ddp.init_method',
dest='init_method',
default=None,
help='ddp init method')
parser.add_argument('--num_workers',
default=0,
type=int,
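
With torchrun owning process creation, the `--ddp.rank`, `--ddp.world_size` and `--ddp.init_method` plumbing is no longer needed: rank, world size and the rendezvous all arrive through the environment. The backend is still read from an argument, as the `args.dist_backend` reference in the next hunk shows. A small sketch, not the repo's literal code, of how the surviving `--gpus` list is combined with torchrun's LOCAL_RANK:

    import argparse
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument('--gpus', default='-1',
                        help='gpu list, separated with `,`, -1 for cpu')
    args = parser.parse_args(['--gpus', '0,1'])           # example value
    local_rank = int(os.environ.get('LOCAL_RANK', '0'))   # set by torchrun
    gpu = int(args.gpus.split(',')[local_rank])            # this worker's device id
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)          # pin before any CUDA call
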
@@ -102,21 +86,19 @@ def main():
args = get_args()
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
# Set random seed
set_mannul_seed(args.seed)
print(args)
with open(args.config, 'r') as fin:
configs = yaml.load(fin, Loader=yaml.FullLoader)
distributed = args.world_size > 1
if distributed:
logging.info('training on multiple gpus, this gpu {}'.format(args.gpu))
dist.init_process_group(args.dist_backend,
init_method=args.init_method,
world_size=args.world_size,
rank=args.rank)
rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
gpu = int(args.gpus.split(',')[rank])
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
if world_size > 1:
logging.info('training on multiple gpus, this gpu {}'.format(gpu))
dist.init_process_group(backend=args.dist_backend)
train_conf = configs['dataset_conf']
cv_conf = copy.deepcopy(train_conf)
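
The manual `dist.init_process_group(..., init_method=..., world_size=..., rank=...)` call becomes a one-liner: under torchrun, the default env:// rendezvous reads RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT from the environment, so only the backend has to be named. With `--standalone --nnodes=1` the global rank equals LOCAL_RANK, which is why the script can reuse it for the rank-0 checks below. A minimal sketch of this initialization, assuming it runs inside a torchrun-launched process:

    import os
    import torch.distributed as dist

    rank = int(os.environ.get('LOCAL_RANK', '0'))
    world_size = int(os.environ.get('WORLD_SIZE', '1'))
    if world_size > 1:
        # Addresses and ranks are taken from the environment torchrun populated.
        dist.init_process_group(backend='nccl')   # 'nccl' matches the default backend flag
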
@@ -149,7 +131,7 @@ def main():
configs['model']['cmvn'] = {}
configs['model']['cmvn']['norm_var'] = args.norm_var
configs['model']['cmvn']['cmvn_file'] = args.cmvn_file
if args.rank == 0:
if rank == 0:
saved_config_path = os.path.join(args.model_dir, 'config.yaml')
with open(saved_config_path, 'w') as fout:
data = yaml.dump(configs)
@@ -164,7 +146,7 @@ def main():
# !!!IMPORTANT!!!
# Try to export the model by script, if fails, we should refine
# the code to satisfy the script export requirements
if args.rank == 0:
if rank == 0:
script_model = torch.jit.script(model)
script_model.save(os.path.join(args.model_dir, 'init.zip'))
executor = Executor()
@@ -178,20 +160,19 @@ def main():
model_dir = args.model_dir
writer = None
if args.rank == 0:
if rank == 0:
os.makedirs(model_dir, exist_ok=True)
exp_id = os.path.basename(model_dir)
writer = SummaryWriter(os.path.join(args.tensorboard_dir, exp_id))
if distributed:
if world_size > 1:
assert (torch.cuda.is_available())
# cuda model is required for nn.parallel.DistributedDataParallel
model.cuda()
model = torch.nn.parallel.DistributedDataParallel(
model, find_unused_parameters=True)
model = torch.nn.parallel.DistributedDataParallel(model)
device = torch.device("cuda")
else:
use_cuda = args.gpu >= 0 and torch.cuda.is_available()
use_cuda = gpu >= 0 and torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
model = model.to(device)
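
Device placement also simplifies: each process has already narrowed CUDA_VISIBLE_DEVICES to a single id, so `model.cuda()` lands on this worker's GPU and the plain `DistributedDataParallel(model)` wrap suffices; the old `find_unused_parameters=True` workaround is dropped. A self-contained sketch of that flow with a stand-in model, assuming the process group from the previous step is already initialized when world_size > 1:

    import os
    import torch

    model = torch.nn.Linear(40, 2)               # stand-in for the KWS model
    gpu = 0                                      # this worker's id from --gpus; -1 means CPU
    world_size = int(os.environ.get('WORLD_SIZE', '1'))
    if world_size > 1:
        assert torch.cuda.is_available()         # the DDP path requires CUDA
        model.cuda()                             # i.e. the single visible GPU
        model = torch.nn.parallel.DistributedDataParallel(model)
        device = torch.device('cuda')
    else:
        use_cuda = gpu >= 0 and torch.cuda.is_available()
        device = torch.device('cuda' if use_cuda else 'cpu')
    model = model.to(device)
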
@@ -209,7 +190,7 @@ def main():
training_config['min_duration'] = args.min_duration
num_epochs = training_config.get('max_epoch', 100)
final_epoch = None
if start_epoch == 0 and args.rank == 0:
if start_epoch == 0 and rank == 0:
save_model_path = os.path.join(model_dir, 'init.pt')
save_checkpoint(model, save_model_path)
@@ -221,11 +202,12 @@ def main():
logging.info('Epoch {} TRAIN info lr {}'.format(epoch, lr))
executor.train(model, optimizer, train_data_loader, device, writer,
training_config)
cv_loss, cv_acc = executor.cv(model, cv_data_loader, device, training_config)
logging.info('Epoch {} CV info cv_loss {} cv_acc {}'
.format(epoch, cv_loss, cv_acc))
cv_loss, cv_acc = executor.cv(model, cv_data_loader, device,
training_config)
logging.info('Epoch {} CV info cv_loss {} cv_acc {}'.format(
epoch, cv_loss, cv_acc))
if args.rank == 0:
if rank == 0:
save_model_path = os.path.join(model_dir, '{}.pt'.format(epoch))
save_checkpoint(model, save_model_path, {
'epoch': epoch,
@@ -238,7 +220,7 @@ def main():
final_epoch = epoch
scheduler.step(cv_loss)
if final_epoch is not None and args.rank == 0:
if final_epoch is not None and rank == 0:
final_model_path = os.path.join(model_dir, 'final.pt')
os.symlink('{}.pt'.format(final_epoch), final_model_path)
writer.close()
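
Finally, every `args.rank == 0` guard becomes `rank == 0`. All torchrun workers execute the same script, so only the rank-0 process writes config.yaml, the scripted init.zip, TensorBoard logs, per-epoch checkpoints and the final.pt symlink, which keeps the workers from racing on the filesystem. A compact sketch of the pattern, with an illustrative path and a placeholder for the repo's save_checkpoint helper:

    import os

    def save_checkpoint_stub(path):
        # Placeholder for the repo's save_checkpoint helper.
        open(path, 'w').close()

    rank = int(os.environ.get('LOCAL_RANK', '0'))
    model_dir = 'exp/ds_tcn'                     # illustrative
    if rank == 0:                                # single writer per run
        os.makedirs(model_dir, exist_ok=True)
        save_checkpoint_stub(os.path.join(model_dir, 'init.pt'))
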