# Get the most common words
def get_most_common_words(df, column_name, column_category, output_file):
print('Calculating common words for all company types')
names_list = [word for name in df[column_name] if type(name) == str
for word in name.split()]
common_words = Counter(names_list).most_common(100)
writer = pd.ExcelWriter(output_file)
pd.DataFrame(common_words, columns=['word', 'count']).to_excel(writer, sheet_name='all', index=False)
#by category
print('Calculating common words per company type')
legal_nature = df[column_category].unique()
common_w_dict = dict()
cnt = 0
for l in legal_nature:
names_list = [w for n in df[df[column_category] == l][column_name] if type(n) == str
for w in n.split()]
common_words = Counter(names_list).most_common(100)
common_w_dict[l.strip()] = common_words
cnt += 1
print('Done %0.2f ' % (round(cnt/len(legal_nature), 2)), end="\r", flush=True)
print('\n')
common_words_df = pd.DataFrame()
for c in common_w_dict:
temp_df = pd.DataFrame(common_w_dict[c]).set_index(0)
temp_df.columns = [c]
common_words_df = common_words_df.join(temp_df, how='outer')
common_words_df = common_words_df.fillna(0)
common_words_df['total'] = common_words_df.apply(lambda x: sum(x), axis=1)
common_words_df = common_words_df.sort_values(by='total', ascending=False)
del common_words_df['total']
common_words_df = round(common_words_df.div(common_words_df.sum(axis=1), axis=0), 3)
common_words_df.to_excel(writer, sheet_name='by_type')
writer.save()
print('Finished')