vgrabovets
3/28/2017 - 8:25 AM

get most common words

get most common words

def _count_words(names):
    """Count whitespace-separated words over the string entries of *names*."""
    counts = Counter()
    for name in names:
        # Missing values arrive as NaN/None; only real strings can be split.
        if isinstance(name, str):
            counts.update(name.split())
    return counts


def _category_share_table(counts_by_category, top_n):
    """Build a word x category table of each word's share per category."""
    columns = {
        # An explicit float dtype keeps empty categories numeric instead of
        # producing object-typed columns.
        category: pd.Series(dict(counts.most_common(top_n)), dtype='float64')
        for category, counts in counts_by_category.items()
    }
    # Words missing from a category's top list count as 0 for that category.
    table = pd.DataFrame(columns).fillna(0)
    if table.empty:
        return table
    totals = table.sum(axis=1)
    # Stable sort so that words with equal totals keep a deterministic order.
    order = totals.sort_values(ascending=False, kind='mergesort').index
    # Every word in the table occurs at least once, so totals are never zero.
    return table.loc[order].div(totals.loc[order], axis=0).round(3)


def get_most_common_words(df, column_name, column_category, output_file,
                          top_n=100):
    """Export the most common words of a text column, overall and by category.

    Two sheets are written to ``output_file``:

    * ``all`` -- the ``top_n`` most frequent words over the whole column with
      their counts (columns ``word`` and ``count``).
    * ``by_type`` -- one row per word that is among the ``top_n`` of at least
      one category, one column per category.  Each cell is the word's count
      in that category divided by the sum of its counts across the row,
      rounded to 3 decimals.  Rows are ordered by that sum, descending.

    Args:
        df: DataFrame holding the data.
        column_name: Column with the text to split into words.  Non-string
            entries (e.g. NaN) are ignored.
        column_category: Column with the category label of each row.  Rows
            with a missing category are left out of the ``by_type`` sheet.
            String labels are stripped of surrounding whitespace, and labels
            that become equal after stripping are counted together.
        output_file: Target passed to ``pd.ExcelWriter``.  Pass ``None`` to
            skip the export and only get the tables back.
        top_n: Number of most common words kept overall and per category.

    Returns:
        A tuple ``(overall, by_type)`` with the two DataFrames described
        above.
    """
    print('Calculating common words for all company types')
    overall = pd.DataFrame(
        _count_words(df[column_name]).most_common(top_n),
        columns=['word', 'count'],
    )

    # by category
    print('Calculating common words per company type')
    # One grouping pass instead of re-filtering the frame for every category.
    # sort=False keeps categories in order of first appearance; groupby
    # leaves out rows whose category is missing.
    grouped = df.groupby(column_category, sort=False)[column_name]
    counts_by_category = {}
    for done, (label, names) in enumerate(grouped, start=1):
        key = label.strip() if isinstance(label, str) else label
        # Merge rather than overwrite when two raw labels strip to one key.
        counts_by_category.setdefault(key, Counter()).update(
            _count_words(names))
        print('Done %0.2f ' % (round(done / grouped.ngroups, 2)),
              end="\r", flush=True)
    print('\n')

    by_type = _category_share_table(counts_by_category, top_n)

    if output_file is not None:
        # The context manager saves and closes the workbook even on errors;
        # ExcelWriter.save() no longer exists in pandas >= 2.0.
        with pd.ExcelWriter(output_file) as writer:
            overall.to_excel(writer, sheet_name='all', index=False)
            by_type.to_excel(writer, sheet_name='by_type')
    print('Finished')
    return overall, by_type