Task 2 - Voting - Cacher Snippet

5/19/2016 - 5:49 PM

Task 2 - Voting

#!/usr/bin/env python3

import os, sys, argparse, logging
# search for files
import glob

# length of each per-sentence vote tally built in load_file (one slot per
# 1-based emoticon id)
TOTAL_EMOTICON_TYPES = 40
# how many of the highest-scoring emoticon ids are kept for each sentence
N_TOP = 3

# every log record is printed as "<timestamp> [<LEVEL>] <message>"
logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s')

def load_file(file_path, sent_dict, weight=(1, 1, 1), logger=None):
    """Add the weighted emoticon votes of one answer file to *sent_dict*.

    The file must start with a header line (which is skipped) followed by
    lines of the form ``<id>,<e1> <e2> ...``, where every ``e`` is a 1-based
    emoticon id and the ids are listed in rank order.  The emoticon at rank
    ``i`` adds ``weight[i]`` to its slot in the tally of sentence ``<id>``;
    emoticons listed beyond ``len(weight)`` are ignored.

    Args:
        file_path: path of the answer file to read.
        sent_dict: dict mapping sentence id (int) to its tally list.  It is
            updated in place; unseen ids get a zero tally of
            TOTAL_EMOTICON_TYPES slots.
        weight: per-rank vote weights.
        logger: optional logger; receives a debug record per line and a
            warning for every emoticon id that does not fit the tally.

    Returns:
        The same *sent_dict* object that was passed in.

    Raises:
        ValueError: if a non-blank line has no comma or contains a field
            that is not an integer.
    """
    with open(file_path, 'r') as in_file:
        # skip the header; the default keeps an empty file from raising
        # StopIteration
        next(in_file, None)

        for line in in_file:
            line = line.strip()
            if not line:
                # blank lines (e.g. a trailing newline) carry no votes
                continue

            sid, emot = line.split(',', maxsplit=1)
            sid = int(sid)
            ranked = [int(x) for x in emot.split()]

            tally = sent_dict.setdefault(sid, [0] * TOTAL_EMOTICON_TYPES)

            # zip() stops at the shorter sequence, so only the first
            # len(weight) ranks cast a vote
            for rank_weight, emot_id in zip(weight, ranked):
                if not 1 <= emot_id <= len(tally):
                    # an id of 0 or below would otherwise wrap around to the
                    # end of the tally through negative indexing
                    if logger:
                        logger.warning(
                            'ignoring emoticon id {:d} of sentence {:d} in "{:s}"'
                            .format(emot_id, sid, file_path))
                    continue
                tally[emot_id - 1] += rank_weight

            if logger:
                logger.debug('{:d} {:s}'.format(sid, str(tally)))

    return sent_dict

def get_args(argv=None):
    """Parse the command line options of the voting script.

    Args:
        argv: list of argument strings to parse; when None (the default)
            argparse reads them from sys.argv[1:].

    Returns:
        argparse.Namespace with ``verbose`` (int, number of -v flags),
        ``out_dir`` (str) and ``ans_dir`` (list of one or more str).
    """
    parser = argparse.ArgumentParser(
        description='Combine the predictions of several answer files by weighted voting.')
    parser.add_argument('--verbose', '-v', dest='verbose',
                        action='count', default=0,
                        help='control the display level of output logs')
    parser.add_argument('--outdir', '-o', dest='out_dir',
                        default='/tmp2/b03902036',
                        help='destination directory for the combined result file')
    parser.add_argument('ans_dir', nargs='+',
                        help='The directory that stores the answer files')

    return parser.parse_args(argv)

if __name__ == '__main__':
    # read the command line options first; everything below depends on them
    args = get_args()

    # translate the number of -v flags into a threshold for the root logger
    logger = logging.getLogger()
    if args.verbose >= 2:
        log_level = logging.DEBUG
    elif args.verbose >= 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logger.setLevel(log_level)

    # only the first directory is scanned; extra ones are reported and dropped
    if len(args.ans_dir) > 1:
        logger.warning('currently only 1 directory is supported')
    args.ans_dir = args.ans_dir[0]
    logger.info('scanning in {:s}'.format(args.ans_dir))

    # look for .ans files in the directory and, recursively, below it
    ans_pattern = os.path.join(args.ans_dir, '**/*.ans')
    file_list = glob.glob(ans_pattern, recursive=True)

    # fold every answer file into one vote tally per sentence id
    sent_dict = dict()
    for ans_path in file_list:
        logger.info('processing "{:s}"...'.format(ans_path))
        sent_dict = load_file(ans_path, sent_dict, logger=logger)

    # replace each tally by the ids of its N_TOP best emoticons, best first
    logger.info('voting in progress')
    for sid, tally in sent_dict.items():
        # sorted() is stable, so among equal scores the larger id ends up
        # first once the tail of the ascending order is reversed
        ascending = sorted(range(1, len(tally) + 1),
                           key=lambda emot_id: tally[emot_id - 1])
        best = ascending[-N_TOP:]
        best.reverse()
        sent_dict[sid] = best

        logger.debug('{:d} {:s}'.format(sid, str(best)))

    # write one "<id>,<e1> <e2> ..." line per sentence below a header line
    new_filepath = os.path.join(args.out_dir, 'result.cmb')
    with open(new_filepath, 'w') as out_file:
        out_file.write('Id,Emoticon\n')
        for sid, best in sent_dict.items():
            out_file.write('{:d},{:s}\n'.format(sid, ' '.join(map(str, best))))

    logger.info('saved to {:s}'.format(new_filepath))

cmb-test.py

#!/usr/bin/env python3

import os, sys, argparse, logging

# input files: the labelled baseline and the predictions to be scored
baseline = '/tmp2/b03902036/train-punc.pro'
result = '/tmp2/b03902036/result.cmb'

# sentence id -> correct emoticon id, read from the tab-separated baseline
# (columns as parsed below: id, emoticon, text)
ground_truth = dict()
with open(baseline, 'r') as in_file:
    for line in in_file:
        line = line.strip()
        if not line:
            continue
        sid, content = line.split('\t', maxsplit=1)
        sid = int(sid)
        try:
            emot, txt = content.strip().split('\t', maxsplit=1)
        except ValueError:
            # ignore lines with empty text after the filtering process;
            # without the continue the label of the previous line would be
            # stored under this id
            print('{:d} has value error'.format(sid))
            continue
        ground_truth[sid] = int(emot)

# score awarded when the correct emoticon is the 1st, 2nd or 3rd candidate
eval_weight = [1, 0.5, 0.333]

total_score = 0
total_trial = 0

with open(result, 'r') as in_file:
    # skip the header; the default keeps an empty file from raising
    # StopIteration
    next(in_file, None)
    for line in in_file:
        line = line.strip()
        if not line:
            continue
        sid, emot = line.split(',', maxsplit=1)
        sid = int(sid)
        if sid not in ground_truth:
            # a prediction without a label cannot be scored, so it is not
            # counted as a trial either
            print('{:d} has no ground truth'.format(sid))
            continue
        answer = ground_truth[sid]
        emot_cand = emot.split()
        # zip() stops early when fewer candidates than weights are listed
        for cand, weight in zip(emot_cand, eval_weight):
            if int(cand) == answer:
                print('{:d} -> {:s}'.format(answer, str(emot_cand)))
                total_score += weight
                break
        total_trial += 1
        print('{:f} / {:f} = {:f}'.format(total_score, total_trial, (total_score/total_trial)))

if total_trial:
    print('accuracy = {:f}'.format(total_score/total_trial))
else:
    # nothing was evaluated; avoid dividing by zero
    print('accuracy = n/a (no predictions were evaluated)')

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

Task 2 - Voting