# Job title normalization, version 0.1
import argparse
import re
import nltk
import sys
import inflect
# Remove punctuation that definitely won't contribute to a job title.
def remove_punctuation(s):
    """Replace separator punctuation with spaces, then drop any token made
    up entirely of digits and punctuation.

    Returns the cleaned, single-space-joined string.
    """
    # Phase 1: replace directly-removable separator characters with spaces.
    # NOTE: the original list also contained '//', which is redundant once
    # '/' is replaced.
    for char in ['/', '|', '?', '(', ')', '{', '}', '[', ']', ',', '.', '-']:
        s = s.replace(char, " ")
    # Phase 2: drop tokens consisting ONLY of digits/punctuation.  The
    # pattern is anchored with '$'; the original unanchored re.match removed
    # any token that merely *started* with a digit or punctuation mark
    # (e.g. "3D" was lost).  The dead no-op `s.strip()` was removed as well.
    junk = re.compile(r"[0-9~!@#$%^&*()_\-+{}\":;']+$")
    tokens = [t for t in s.split() if not junk.match(t)]
    return ' '.join(tokens)
# Read in a file and produce a list of stripped lines.
def readfile(filelocation):
    """Load *filelocation* and return its lines with surrounding
    whitespace (including the trailing newline) removed."""
    with open(filelocation) as f:
        return [line.strip() for line in f]
# Remove punctuation from a whole batch of job titles.
def removePunctuation(jobtitle_list):
    """Return a new list with remove_punctuation applied to each title."""
    cleaned = []
    for title in jobtitle_list:
        cleaned.append(remove_punctuation(title))
    return cleaned
# Word count across all job titles.
def wordCount(jobtitle_list):
    """Return a dict mapping each whitespace-separated word to the total
    number of times it appears across *jobtitle_list*."""
    counts = {}
    for title in jobtitle_list:
        for token in title.split():
            counts[token] = counts.get(token, 0) + 1
    return counts
# Sort the word-count dict into a ranked word list.
def sortWordCount(wordCountDict):
    """Return the dict's keys ordered from most to least frequent."""
    words = list(wordCountDict)
    words.sort(key=lambda w: wordCountDict[w], reverse=True)
    return words
# Based on a given coverage percentage, output the list of top-frequency words.
def getHighFrequencyWords(wordCountDict, sortedWordCountList, percentage=0.6):
    """Return the most frequent words whose cumulative count covers
    *percentage* of all word occurrences.

    Filters applied afterwards: single-repeated-character words (e.g.
    "aaa"), words nltk does not tag as a noun ("NN" tags), and the
    abbreviation "Sr".
    """
    expected_count = sum(wordCountDict.values()) * percentage
    # Walk the frequency-sorted words until we have covered enough mass.
    # enumerate() replaces the original redundant O(n) .index() lookup.
    # If the threshold is never exceeded (percentage >= 1), keep every
    # word; the original left index at 0 and wrongly returned nothing.
    index = len(sortedWordCountList)
    count = 0
    for i, word in enumerate(sortedWordCountList):
        count += wordCountDict[word]
        if count > expected_count:
            index = i
            break
    result = [s for s in sortedWordCountList[:index] if s != len(s) * s[0]]
    # Keep only nouns according to nltk's POS tagger (tags containing "NN").
    result = [s for s in result if "NN" in nltk.pos_tag([s])[0][1]]
    return [s for s in result if s != "Sr"]
# Filter words from the raw job titles using the high-frequency words we built.
def filterWordsFormRawJobTitle(jobtitle_list, high_frequency_list):
    """Drop every word not present in *high_frequency_list* from each
    title, then drop titles left with fewer than two words.

    Returns the surviving titles as space-joined strings.
    """
    # Build the lookup set once: O(1) membership per word instead of the
    # original O(len(high_frequency_list)) list scan on every word.
    allowed = set(high_frequency_list)
    filtered = []
    for title in jobtitle_list:
        kept = [w for w in title.split() if w in allowed]
        if len(kept) >= 2:
            filtered.append(" ".join(kept))
    return filtered
# Remove duplicates within one job title's words and strip plural forms.
def unique_list(l):
    """Return *l* with duplicates removed (first occurrence wins, order
    preserved) and each word reduced to its singular form when inflect
    recognizes it as a plural noun.
    """
    engine = inflect.engine()
    # Deduplicate with a plain loop; the original abused a list
    # comprehension purely for its append side effect.
    deduped = []
    for word in l:
        if word not in deduped:
            deduped.append(word)
    result = []
    for word in deduped:
        # singular_noun() returns False when the word is not a plural noun;
        # call it once per word instead of twice as the original did.
        singular = engine.singular_noun(word)
        result.append(singular if singular else word)
    return result
# A bunch of string normalization steps.
def stringNormalization(jobtitle_list):
    """Capitalize the first letter of every word, deduplicate/singularize
    words within each title, and return the distinct titles."""
    normalized = []
    for job in jobtitle_list:
        # Uppercase only the first character of each word (str.title()
        # would lowercase inner capitals, so it is not used).
        capitalized = [w[0].upper() + w[1:] for w in job.split()]
        # Remove duplicate words / plural forms within the title.
        normalized.append(" ".join(unique_list(capitalized)))
    # Aggregate to a set to drop duplicate titles.
    return list(set(normalized))
if __name__ == "__main__":
    # CLI: -i input file of raw job titles, -p coverage percentage.
    parser = argparse.ArgumentParser(description='provide master ip, input file location and output file location, may be google search api later')
    parser.add_argument('-i', action="store", dest="input_file_location", default="/Users/jinxuanwu/data/tmp/javajobtitles.txt")
    # type=float is required: without it argparse stores the flag as a
    # string, and downstream "total * percentage" silently misbehaves
    # (string replication + always-false int/str comparison).
    parser.add_argument('-p', action="store", dest="percentage", type=float, default=0.6)
    argv = parser.parse_args(sys.argv[1:])
    inputfile_location = argv.input_file_location
    percentage = argv.percentage
    # Pipeline: read -> strip punctuation -> count -> rank -> pick top words
    # -> filter titles -> normalize.
    jobtitle_list = readfile(inputfile_location)
    jobtitle_list_after_remove_Punctuation = removePunctuation(jobtitle_list)
    wordCountDict = wordCount(jobtitle_list_after_remove_Punctuation)
    sortedWordList = sortWordCount(wordCountDict)
    HighFrequencyList = getHighFrequencyWords(wordCountDict, sortedWordList, percentage)
    FilteredList = filterWordsFormRawJobTitle(jobtitle_list_after_remove_Punctuation, HighFrequencyList)
    res = stringNormalization(FilteredList)
    # print() works in both Python 2 and 3 (the original used the
    # Python-2-only statement form).
    print(len(res))
    with open("result.txt", "w") as f:
        f.writelines([i + "\n" for i in res])