# Source code for advertools.word_frequency
from collections import defaultdict
import advertools as adv
import pandas as pd
def word_frequency(text_list, num_list, sep=None, rm_words=None):
    """Count words in ``text_list``, both plainly and weighted by ``num_list``.

    Each text is split on ``sep``, every token is lower-cased, and tokens
    found in ``rm_words`` are skipped. For every remaining word two totals
    are kept: how many times it occurred (``abs_freq``) and the sum of the
    numbers paired with the texts it occurred in (``wtd_freq``). A word that
    appears twice in one text contributes that text's number twice.

    Parameters
    ----------
    text_list : iterable of str
        Texts to tokenize.
    num_list : iterable of numbers
        One weight per text. Texts and numbers are paired with ``zip``, so
        any surplus items in the longer iterable are ignored.
    sep : str, optional
        Separator passed to ``str.split``; ``None`` splits on runs of
        whitespace.
    rm_words : container of str, optional
        Lower-case words to exclude (tested with ``in``). When omitted or
        ``None``, ``adv.stopwords['english']`` is used; it is looked up at
        call time rather than when the function is defined.

    Returns
    -------
    pandas.DataFrame
        One row per word, ordered by descending weighted frequency, with the
        columns ``word``, ``abs_freq``, ``abs_perc``, ``abs_perc_cum``,
        ``wtd_freq``, ``wtd_freq_perc``, ``wtd_freq_perc_cum`` and
        ``rel_value`` (``wtd_freq / abs_freq``). ``abs_freq``, ``wtd_freq``
        and ``rel_value`` are rounded to whole numbers before the percentage
        columns are derived from them. If no word survives filtering, an
        empty frame with these columns is returned.
    """
    if rm_words is None:
        rm_words = adv.stopwords['english']

    # word -> [absolute count, weighted count]
    word_freq = defaultdict(lambda: [0, 0])
    for text, num in zip(text_list, num_list):
        for word in text.split(sep=sep):
            word = word.lower()
            if word in rm_words:
                continue
            counts = word_freq[word]
            counts[0] += 1
            counts[1] += num

    if not word_freq:
        # Nothing to tabulate: without this guard the frame below would have
        # no 'wtd_freq' column and sort_values() would raise KeyError.
        return pd.DataFrame(columns=[
            'word', 'abs_freq', 'abs_perc', 'abs_perc_cum', 'wtd_freq',
            'wtd_freq_perc', 'wtd_freq_perc_cum', 'rel_value',
        ])

    columns = {0: 'abs_freq', 1: 'wtd_freq'}
    # Sorting happens before rounding, so row order reflects the exact
    # (unrounded) weighted totals.
    abs_wtd_df = (pd.DataFrame.from_dict(word_freq, orient='index')
                  .rename(columns=columns)
                  .sort_values('wtd_freq', ascending=False)
                  .assign(rel_value=lambda df: df['wtd_freq'] / df['abs_freq'])
                  .round())

    # Share-of-total columns are slotted in right after the frequency column
    # they are derived from; the insert positions account for earlier inserts.
    abs_wtd_df.insert(1, 'abs_perc',
                      value=abs_wtd_df['abs_freq'] / abs_wtd_df['abs_freq'].sum())
    abs_wtd_df.insert(2, 'abs_perc_cum', abs_wtd_df['abs_perc'].cumsum())
    abs_wtd_df.insert(4, 'wtd_freq_perc',
                      abs_wtd_df['wtd_freq'] / abs_wtd_df['wtd_freq'].sum())
    abs_wtd_df.insert(5, 'wtd_freq_perc_cum',
                      abs_wtd_df['wtd_freq_perc'].cumsum())

    # Move the words out of the index into a regular 'word' column.
    abs_wtd_df = abs_wtd_df.reset_index().rename(columns={'index': 'word'})
    return abs_wtd_df