diff --git a/examples/speechcommand_v1/s0/run.sh b/examples/speechcommand_v1/s0/run.sh index 2092651..44bf258 100755 --- a/examples/speechcommand_v1/s0/run.sh +++ b/examples/speechcommand_v1/s0/run.sh @@ -6,7 +6,7 @@ export CUDA_VISIBLE_DEVICES="0" -stage=2 +stage=-1 stop_stage=2 num_keywords=11 diff --git a/kws/model/kws_model.py b/kws/model/kws_model.py index 2f2b2e2..effc889 100644 --- a/kws/model/kws_model.py +++ b/kws/model/kws_model.py @@ -123,6 +123,8 @@ def init_model(configs): print('Unknown body type {}'.format(backbone_type)) sys.exit(1) if 'classifier' in configs: + # For the speech command dataset, we use 2 FC layers as the classifier, + # and add dropout after the first FC layer to prevent overfitting classifier_type = configs['classifier']['type'] dropout = configs['classifier']['dropout'] @@ -131,8 +133,11 @@ def init_model(configs): nn.Dropout(dropout), nn.Linear(64, output_dim)) if classifier_type == 'global': + # 'global' means we add global average pooling before the classifier classifier = GlobalClassifier(classifier_base) elif classifier_type == 'last': + # 'last' means we use the last frame to do backpropagation, so the + # model can be inferred in a streaming fashion classifier = LastClassifier(classifier_base) else: print('Unknown classifier type {}'.format(classifier_type))