Source code for advertools.word_frequency

from collections import defaultdict

import advertools as adv
import pandas as pd


def word_frequency(text_list, num_list, sep=None,
                   rm_words=adv.stopwords['english']):
    """Count words in ``text_list``, both absolutely and weighted by ``num_list``.

    Each text is split into words, every word is lower-cased, and words found
    in ``rm_words`` are skipped. For each remaining word two totals are kept:
    how many times it occurred, and the sum of the numbers paired with the
    texts it occurred in (added once per occurrence).

    Parameters
    ----------
    text_list : iterable of str
        Texts to split into words.
    num_list : iterable of numbers
        One number per text, paired positionally with ``text_list`` via
        ``zip`` (so surplus items in the longer iterable are ignored).
    sep : str, optional
        Separator handed to ``str.split``; ``None`` splits on runs of
        whitespace.
    rm_words : container of str
        Lower-case words to exclude from the count.

    Returns
    -------
    pandas.DataFrame
        One row per word, ordered by descending weighted frequency, with
        columns ``word``, ``abs_freq``, ``abs_perc``, ``abs_perc_cum``,
        ``wtd_freq``, ``wtd_freq_perc``, ``wtd_freq_perc_cum`` and
        ``rel_value`` (``wtd_freq / abs_freq``). ``wtd_freq`` and
        ``rel_value`` are rounded to whole numbers, and the weighted
        percentage columns are derived from the rounded ``wtd_freq``.
        When no word survives filtering, an empty frame with these same
        columns is returned.
    """
    # word -> [absolute count, weighted count]
    word_freq = defaultdict(lambda: [0, 0])
    for text, num in zip(text_list, num_list):
        for word in text.split(sep=sep):
            key = word.lower()
            if key in rm_words:
                continue
            counts = word_freq[key]
            counts[0] += 1
            counts[1] += num

    if not word_freq:
        # An empty mapping would yield a frame without a 'wtd_freq' column
        # and make sort_values() raise KeyError; return the normal layout
        # with zero rows instead.
        return pd.DataFrame(columns=[
            'word', 'abs_freq', 'abs_perc', 'abs_perc_cum',
            'wtd_freq', 'wtd_freq_perc', 'wtd_freq_perc_cum', 'rel_value',
        ])

    columns = {0: 'abs_freq', 1: 'wtd_freq'}
    abs_wtd_df = (
        pd.DataFrame.from_dict(word_freq, orient='index')
        .rename(columns=columns)
        .sort_values('wtd_freq', ascending=False)
        .assign(rel_value=lambda df: df['wtd_freq'] / df['abs_freq'])
        # Rounds every column present at this point to 0 decimals.
        .round()
    )

    # The share and cumulative-share columns are slotted in next to the
    # frequency column they describe; positions refer to the frame as it
    # grows with each insert.
    abs_wtd_df.insert(1, 'abs_perc',
                      value=abs_wtd_df['abs_freq'] /
                      abs_wtd_df['abs_freq'].sum())
    abs_wtd_df.insert(2, 'abs_perc_cum', abs_wtd_df['abs_perc'].cumsum())
    abs_wtd_df.insert(4, 'wtd_freq_perc',
                      abs_wtd_df['wtd_freq'] /
                      abs_wtd_df['wtd_freq'].sum())
    abs_wtd_df.insert(5, 'wtd_freq_perc_cum',
                      abs_wtd_df['wtd_freq_perc'].cumsum())

    # Move the words out of the index into a regular 'word' column.
    abs_wtd_df = abs_wtd_df.reset_index().rename(columns={'index': 'word'})
    return abs_wtd_df