liuyenting
5/19/2016 - 5:51 PM

Task 2 - DNN

#!/bin/bash

# two variables you need to set
pdnndir=/data/ASR5/babel/ymiao/tools/pdnn  # pointer to PDNN
device=gpu0  # the device to be used. set it to "cpu" if you don't have GPUs

# export environment variables
export PYTHONPATH=$PYTHONPATH:$pdnndir
export THEANO_FLAGS=mode=FAST_RUN,device=$device,floatX=float32

# download the MNIST dataset
wget http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

# split the dataset into training, validation, and testing sets
# this produces train.pickle.gz, valid.pickle.gz, and test.pickle.gz
echo "Preparing datasets ..."
python data_prep.py

# train DNN model
echo "Training the DNN model ..."
python $pdnndir/cmds/run_DNN.py --train-data "train.pickle.gz" \
                                --valid-data "valid.pickle.gz" \
                                --nnet-spec "784:1024:1024:10" --wdir ./ \
                                --l2-reg 0.0001 --lrate "C:0.1:200" --model-save-step 20 \
                                --param-output-file dnn.param --cfg-output-file dnn.cfg  >& dnn.training.log

# classify the testing data; --layer-index -1 selects the final layer, i.e. the classification softmax layer
echo "Classifying with the DNN model ..."
python $pdnndir/cmds/run_Extract_Feats.py --data "test.pickle.gz" \
                                          --nnet-param dnn.param --nnet-cfg dnn.cfg \
                                          --output-file "dnn.classify.pickle.gz" --layer-index -1 \
                                          --batch-size 100 >& dnn.testing.log

python show_results.py dnn.classify.pickle.gz


# train CNN model
echo "Training the CNN model ..."
python $pdnndir/cmds/run_CNN.py --train-data "train.pickle.gz" \
                                --valid-data "valid.pickle.gz" \
                                --conv-nnet-spec "1x28x28:20,5x5,p2x2:50,5x5,p2x2,f" --nnet-spec "512:10" --wdir ./ \
                                --l2-reg 0.0001 --lrate "C:0.1:200" --model-save-step 20 \
                                --param-output-file cnn.param --cfg-output-file cnn.cfg  >& cnn.training.log

echo "Classifying with the CNN model ..."
python $pdnndir/cmds/run_Extract_Feats.py --data "test.pickle.gz" \
                                          --nnet-param cnn.param --nnet-cfg cnn.cfg \
                                          --output-file "cnn.classify.pickle.gz" --layer-index -1 \
                                          --batch-size 100 >& cnn.testing.log

python show_results.py cnn.classify.pickle.gz
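
The script above calls two local helpers that are not part of PDNN and are not reproduced here. As a rough sketch of what data_prep.py might look like, assuming mnist.pkl.gz holds the usual (train, valid, test) tuples of (features, labels) and that the run_*.py commands accept gzip-compressed pickles of the same shape:

#!/usr/bin/env python
# data_prep.py - hypothetical sketch, not the original helper.
# Splits mnist.pkl.gz into the three pickles the PDNN commands expect,
# one (features, labels) tuple per file.
import gzip
import pickle

with gzip.open('mnist.pkl.gz', 'rb') as in_file:
    # under Python 3, add encoding='latin1' to pickle.load, since
    # mnist.pkl.gz was written by Python 2
    train_set, valid_set, test_set = pickle.load(in_file)

for name, dataset in [('train', train_set),
                      ('valid', valid_set),
                      ('test', test_set)]:
    with gzip.open(name + '.pickle.gz', 'wb') as out_file:
        # protocol 0 keeps the pickles readable from the Python 2
        # interpreter that the Theano-era PDNN tools run on
        pickle.dump(dataset, out_file, protocol=0)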
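
show_results.py is likewise assumed rather than shown. A minimal sketch, assuming the classification pickle holds a single 2-D array of per-example softmax activations (the actual PDNN output layout may differ) and that test.pickle.gz still carries the reference labels:

#!/usr/bin/env python
# show_results.py - hypothetical sketch. Compares the argmax of the
# stored softmax outputs against the labels in test.pickle.gz.
import gzip
import pickle
import sys

import numpy

with gzip.open(sys.argv[1], 'rb') as in_file:
    activations = pickle.load(in_file)

with gzip.open('test.pickle.gz', 'rb') as in_file:
    _, labels = pickle.load(in_file)

predictions = numpy.argmax(activations, axis=1)
accuracy = numpy.mean(predictions == numpy.asarray(labels))
print('classification accuracy: {:.2%}'.format(accuracy))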
#!/usr/bin/env python3

import os, argparse, logging
# gensim's doc2vec implementation
from gensim.models import Doc2Vec
# for writing the gzip-compressed pickle output
import pickle, numpy, gzip

logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s')

def extract_vec(model, sent_cnt, vec_dim, logger=None) :
    # ignore the first element, since it contains the test data
    total_sent_cnt = sum(sent_cnt[1:])

    vec_array = numpy.zeros((total_sent_cnt, vec_dim))
    vec_label = numpy.zeros(total_sent_cnt)

    curr_sent_idx = 0
    for emot, index_limit in enumerate(sent_cnt) :
        # ignore the first element in the counter list
        if emot == 0 :
            continue

        if logger :
            logger.info('... processing emoticon {:d}'.format(emot))

        for i in range(1,index_limit+1) :
            prefix = 'EMOTICON_{:d}_{:d}'.format(emot, i)
            vec_array[curr_sent_idx] = model.docvecs[prefix]
            vec_label[curr_sent_idx] = emot

            # increment the overall counter
            curr_sent_idx += 1

    return (vec_array, vec_label)

def get_args() :
    parser = argparse.ArgumentParser(description='Generate compressed pickle file of doc vectors.')
    parser.add_argument('--outdir', '-o', dest='out_dir',
                        default='/tmp2/b03902036',
                        help='destination directory for the model file')
    parser.add_argument('--verbose', '-v', dest='verbose',
                        action='count', default=0,
                        help='control the display level of output logs')
    parser.add_argument('mod_file', nargs='+',
                        help='Model file from doc2vec training')

    return parser.parse_args()

if __name__ == '__main__' :
    # parse the command line arguments
    args = get_args()
    # get the logger object
    logger = logging.getLogger()
    # set the log level
    if args.verbose >= 2 :
        logger.setLevel(logging.DEBUG)
    elif args.verbose >= 1 :
        logger.setLevel(logging.INFO)
    else :
        logger.setLevel(logging.WARNING)

    if len(args.mod_file) > 1 :
        logger.warning('only the first model file is used; the rest are ignored')
    args.mod_file = args.mod_file[0]
    logger.info('loading model from "{:s}"'.format(args.mod_file))
    model = Doc2Vec.load(args.mod_file)

    logger.info('loading relevant data about the model')
    mif_base = os.path.splitext(args.mod_file)[0]
    with open(mif_base + '.mif', 'rb') as in_file :
        # the .mif file holds three pickled objects in sequence; dat_file
        # is read only to advance the stream and is not used below
        dat_file = pickle.load(in_file)
        sent_cnt = pickle.load(in_file)
        dim = pickle.load(in_file)
    logger.info('... model of {:d} features with {:d} emoticons is loaded'.format(dim, len(sent_cnt)-1))

    logger.info('extracting vectors from the model')
    feature, label = extract_vec(model, sent_cnt, dim, logger=logger)

    # create the .pkl.gz file path and make sure the output directory exists
    basename = os.path.basename(args.mod_file)
    new_filename = os.path.splitext(basename)[0] + '.pkl.gz'
    new_filepath = os.path.join(args.out_dir, new_filename)
    os.makedirs(args.out_dir, exist_ok=True)

    # save the (feature, label) tuple as a gzip-compressed pickle;
    # protocol 0 keeps the file readable from Python 2 as well
    with gzip.open(new_filepath, 'wb') as out_file :
        pickle.dump((feature, label), out_file, protocol=0)

    logger.info('numpy array saved to {:s}'.format(new_filepath))
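
The resulting file can be read back the same way it was written; a quick sanity check (the file name below is illustrative):

import gzip
import pickle

# load the (feature, label) tuple written by the script above;
# 'model.pkl.gz' stands in for whatever mod_file produced
with gzip.open('model.pkl.gz', 'rb') as in_file:
    feature, label = pickle.load(in_file)

print(feature.shape)  # (total_sent_cnt, vec_dim)
print(label.shape)    # (total_sent_cnt,)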