jinxuan
4/26/2016 - 1:51 AM

job title normalization version 0.1

job title normalization version 0.1

import argparse
import re
import nltk
import sys
import inflect

#Remove punctuation that definitely won't contribute
def remove_punctuation(s):
    """Replace separator punctuation with spaces and drop symbol-only tokens.

    Phase 1: each separator character in the list below is replaced by a
    space, so "Java/J2EE" becomes two words.
    Phase 2: the string is split on whitespace and every token made up
    entirely of digits and the remaining punctuation characters is dropped.

    Returns the surviving tokens joined by single spaces (so leading,
    trailing and repeated whitespace never appears in the result).
    """
    # '//' from the earlier list is gone: replacing '/' already covers it.
    for char in ['/', '|', '?', '(', ')', '{', '}', '[', ']', ',', '.', '-']:
        s = s.replace(char, " ")
    # The trailing '$' forces the WHOLE token to match.  Without it re.match
    # only anchors at the start, so tokens that merely begin with a digit or
    # symbol (e.g. "3D", "2nd") were thrown away as well.
    junk = re.compile(r"[0-9~!@#$%^&*()_\-+{}\":;']+$")
    return ' '.join(token for token in s.split() if not junk.match(token))

#Read in a file and create a list of strings, one per line
def readfile(filelocation):
    """Return the lines of the file at `filelocation`, each one stripped.

    Leading and trailing whitespace (including the newline) is removed from
    every line; blank lines are kept as empty strings.
    """
    with open(filelocation) as handle:
        return [line.strip() for line in handle]
    
#Remove Punctuation
#takes in a list of job titles, remove punctuation
def removePunctuation(jobtitle_list):
    """Apply remove_punctuation to every job title and return the new list."""
    return list(map(remove_punctuation, jobtitle_list))

#Word Count
def wordCount(jobtitle_list):
    """Count how often each whitespace-separated word occurs across all titles.

    Returns a plain dict mapping word -> number of occurrences.
    """
    tally = {}
    for title in jobtitle_list:
        for token in title.split():
            tally[token] = tally.get(token, 0) + 1
    return tally

#Sort Word Count
#return a sorted word count list
def sortWordCount(wordCountDict):
    """Return the dict's words ordered from most to least frequent."""
    ranked = list(wordCountDict)
    ranked.sort(key=wordCountDict.get, reverse=True)
    return ranked

#Based on the given percentage, output the list of top matching words
def getHighFrequencyWords(wordCountDict, sortedWordCountList, percentage=0.6):
    """Select the most frequent words, then keep only the noun-tagged ones.

    Walks `sortedWordCountList` (expected to be ordered by descending count)
    accumulating counts from `wordCountDict`, and stops at the first word
    that pushes the running total above `percentage` of all occurrences.
    The words BEFORE that one are the candidates; the word that crosses the
    threshold is not included.

    Candidates are then filtered:
      * words made of one repeated character ("I", "II", "aaa") are dropped,
      * only words whose nltk POS tag contains "NN" are kept,
      * the abbreviation "Sr" is dropped.

    `percentage` may be a float or a numeric string (as delivered by
    argparse when no type is configured).
    """
    total = sum(wordCountDict.values())
    # int * str would silently build a repeated string instead of a numeric
    # threshold, so coerce explicitly.
    expected_count = total * float(percentage)

    # If the running total never exceeds the threshold (e.g. percentage >= 1)
    # every word is a candidate.  The cutoff used to stay at 0 in that case,
    # which returned nothing at all.
    cutoff = len(sortedWordCountList)
    count = 0
    for position, word in enumerate(sortedWordCountList):
        count += wordCountDict[word]
        if count > expected_count:
            cutoff = position
            break

    result = [s for s in sortedWordCountList[0:cutoff] if s != len(s) * s[0]]
    # Each word is tagged on its own, without sentence context.
    result = [s for s in result if "NN" in nltk.pos_tag([s])[0][1]]
    result = [s for s in result if s != "Sr"]
    return result

#Filter words from raw job titles using the high frequency words we have built
def filterWordsFormRawJobTitle(jobtitle_list, high_frequency_list):
    """Reduce each title to its high-frequency words.

    Every title is split on whitespace and only the words present in
    `high_frequency_list` are kept, in their original order.  Titles left
    with fewer than two words are discarded.

    Returns the list of reduced titles (words joined by single spaces).
    """
    # Build the lookup once: set membership is O(1), whereas testing against
    # the list rescanned it for every word of every title.
    keep = set(high_frequency_list)
    filtered = []
    for title in jobtitle_list:
        words = [word for word in title.split() if word in keep]
        if len(words) >= 2:
            filtered.append(" ".join(words))
    return filtered

#Remove duplicate words within one job title and convert plurals to singular
def unique_list(l):
    """Singularize each word and drop repeats, keeping first-seen order.

    Words are converted to singular first and de-duplicated afterwards, so
    "Developer" and "Developers" collapse into a single "Developer".
    (De-duplicating before singularizing let such pairs survive as two
    identical words.)

    Returns a new list; `l` is not modified.
    """
    p = inflect.engine()
    seen = set()
    ulist = []
    for word in l:
        # singular_noun returns a falsy value when it has no singular form
        # to offer; in that case the word is kept unchanged.
        singular = p.singular_noun(word) or word
        if singular not in seen:
            seen.add(singular)
            ulist.append(singular)
    return ulist

#A bunch of string normalization 
def stringNormalization(jobtitle_list):
    """Capitalize, de-duplicate words within, and de-duplicate across titles.

    For each title the first letter of every word is upper-cased (the rest
    of the word is left as is), the words are passed through unique_list,
    and the result is re-joined with single spaces.  Identical titles are
    then merged via a set, so the order of the returned list is arbitrary.
    """
    normalized = []
    for job in jobtitle_list:
        capitalized = [word[0].upper() + word[1:] for word in job.split()]
        normalized.append(" ".join(unique_list(capitalized)))
    return list(set(normalized))

# Script entry point: read raw job titles, build the high-frequency word
# list, reduce every title to those words, normalize, and write the unique
# results to result.txt in the current directory.
if __name__=="__main__":
    parser = argparse.ArgumentParser(description='provide master ip, input file location and output file location, may be google search api later')
    parser.add_argument('-i', action="store", dest="input_file_location", default="/Users/jinxuanwu/data/tmp/javajobtitles.txt",
                        help="path of the text file holding one raw job title per line")
    # type=float is required: without it a value given on the command line
    # arrives as a str and the frequency threshold cannot be computed.
    parser.add_argument('-p', action="store", dest="percentage", type=float, default=0.6,
                        help="fraction of all word occurrences the high-frequency list may cover")
    argv = parser.parse_args(sys.argv[1:])
    inputfile_location=argv.input_file_location
    percentage = argv.percentage
    jobtitle_list = readfile(inputfile_location)
    jobtitle_list_after_remove_Punctuation = removePunctuation(jobtitle_list)
    wordCountDict = wordCount(jobtitle_list_after_remove_Punctuation)
    sortedWordList = sortWordCount(wordCountDict)
    HighFrequencyList = getHighFrequencyWords(wordCountDict, sortedWordList, percentage)
    FilteredList = filterWordsFormRawJobTitle(jobtitle_list_after_remove_Punctuation, HighFrequencyList)
    res =  stringNormalization(FilteredList)
    # Parenthesized form prints the same text under Python 2 and is valid
    # syntax under Python 3.
    print(len(res))
    with open("result.txt", "w") as f:
        f.writelines([i + "\n" for i in res])