src.utils.utils
import numpy as np
import pandas as pd
import random
import os
def create_dataset(N: int, dist_type: str) -> tuple[list, pd.DataFrame, list]:
    """
    Creates a dataset for frequency estimation.

    Args:
        N (int): Number of elements in the dataset.
        dist_type (str): Distribution type: 'exp' (exponential), 'norm' (normal),
            or 'small' (values within a reduced domain).

    Returns:
        values (list): Generated dataset in list format.
        df (DataFrame): Generated dataset in Pandas DataFrame format.
        unique_values (list): Unique values (domain) in the dataset.

    Examples:
        >>> create_dataset(10**6, 'exp')
        >>> create_dataset(1000, 'small')
    """
    if dist_type == 'exp':
        values = np.random.exponential(scale=2.0, size=N).astype(int).tolist()
    elif dist_type == 'norm':
        values = np.random.normal(loc=12, scale=2, size=N).astype(int).tolist()
    elif dist_type == 'small':
        elements = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
        frequencies = [0.29, 0.19, 0.15, 0.12, 0.1, 0.08, 0.05, 0.02]
        values = np.random.choice(elements, size=N, p=frequencies).tolist()
        np.random.shuffle(values)
    else:
        raise ValueError(f"Unknown dist_type: {dist_type!r}")

    df = pd.DataFrame({'value': values})
    unique_values = df['value'].unique().tolist()
    unique_values.sort()
    return values, df, unique_values
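A minimal usage sketch (an illustrative addition, not part of the original module): generate a small synthetic stream and inspect its empirical frequencies.

    values, df, domain = create_dataset(1000, 'small')
    print(len(values), domain)  # 1000 samples over the domain ['A', ..., 'H']
    print(df['value'].value_counts(normalize=True).sort_index())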
def load_dataset(csv_filename):
    """
    Loads a dataset from a CSV file and returns the values, the DataFrame, and unique 'value' entries.

    Args:
        csv_filename (str): Name of the CSV file (without extension) located in the 'data/filtered' folder.

    Returns:
        values (list): Dataset in list format.
        df (DataFrame): Dataset in Pandas DataFrame format.
        unique_values (list): Unique values (domain) of the dataset.
    """
    dataset_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', '..', 'data', 'filtered', csv_filename + '.csv')
    )
    df = pd.read_csv(dataset_path)
    df = df[['value']]  # keep only the 'value' column
    values = df['value'].tolist()
    unique_values = df['value'].unique().tolist()
    return values, df, unique_values
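A usage sketch, assuming a file data/filtered/my_dataset.csv with a 'value' column exists (the file name here is hypothetical):

    values, df, domain = load_dataset('my_dataset')
    print(f"loaded {len(values)} rows over {len(domain)} distinct values")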
def generate_hash_functions(k, p, c, m):
    """
    Generates a set of k c-independent hash functions (D -> [0, m)).

    Args:
        k (int): Number of hash functions.
        p (int): Large prime number for hash function construction.
        c (int): Number of polynomial coefficients (degree of independence).
        m (int): Size of the range the hash functions map into.

    Returns:
        hash_functions (list): Set of k hash functions.
    """
    hash_functions = []
    for _ in range(k):
        # Random polynomial of degree c - 1 over the field Z_p.
        coefficients = [random.randint(1, p - 1) for _ in range(c)]
        hash_func = lambda x, coeffs=coefficients, p=p, c=c, m=m: (
            sum(coeffs[i] * pow(hash(x) % p, i, p) for i in range(c)) % p
        ) % m
        hash_functions.append(hash_func)
    return hash_functions
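A sketch of how these hash functions might back a Count-Min sketch (the table shape and the update/query loop below are illustrative assumptions, not taken from this module):

    k, p, m = 5, 2**31 - 1, 2048
    hashes = generate_hash_functions(k, p, c=4, m=m)
    table = np.zeros((k, m), dtype=int)
    stream = ['A', 'B', 'A', 'C', 'A']  # toy stream
    for x in stream:
        for row, h in enumerate(hashes):
            table[row, h(x)] += 1
    # Count-Min estimate for 'A': minimum over the k rows
    est = min(table[row, h('A')] for row, h in enumerate(hashes))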
def generate_hash_function_G(k, p):
    """
    Generates a set of k sign hash functions (D -> {-1, +1}), each derived from
    a random degree-3 polynomial over Z_p whose value is reduced to a sign by
    parity.

    Args:
        k (int): Number of hash functions.
        p (int): Large prime number for hash function construction.

    Returns:
        hash_functions (list): Set of k sign hash functions.
    """
    hash_functions = []
    for _ in range(k):
        a = random.randint(1, p - 1)
        b = random.randint(0, p - 1)
        c = random.randint(1, p - 1)
        d = random.randint(0, p - 1)

        def hash_func(x, a=a, b=b, c=c, d=d, p=p):
            # Keys of the form "AOI <n>" are mapped to their numeric id.
            if isinstance(x, str) and x.startswith("AOI "):
                x = int(x.split()[1])
            x_mod = x % p
            # Degree-3 polynomial over Z_p, reduced to a sign via parity.
            h = (a + b * x_mod + c * pow(x_mod, 2, p) + d * pow(x_mod, 3, p)) % p
            return 1 if (h % 2) == 0 else -1

        hash_functions.append(hash_func)
    return hash_functions
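These ±1 hashes have the shape used by Count Sketch estimators; a minimal sketch of that pairing (an illustrative assumption about intended use, not code from this module):

    k, p, m = 5, 2**31 - 1, 2048
    bucket_hashes = generate_hash_functions(k, p, c=2, m=m)
    sign_hashes = generate_hash_function_G(k, p)
    table = np.zeros((k, m))
    for x in [3, 7, 3, 3, 9]:  # toy integer stream
        for row in range(k):
            table[row, bucket_hashes[row](x)] += sign_hashes[row](x)
    # Count Sketch estimate for item 3: median of the signed row counters
    est = np.median([sign_hashes[row](3) * table[row, bucket_hashes[row](3)]
                     for row in range(k)])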
def generate_error_table(real_freq: pd.DataFrame, estimated_freq: dict):
    # Real counts per item, in domain order.
    f = real_freq['value'].value_counts()
    real_num_freq = f.sort_index().to_dict()

    error_data = []
    for element in real_num_freq:
        real_count = real_num_freq[element]
        estimated_count = estimated_freq.get(element, 0)
        if real_count > 0:
            percent_error = abs(real_count - estimated_count) / real_count * 100
        else:
            percent_error = 0.0
        error_data.append({
            "Item": element,
            "Percentage Error": f"{percent_error:.2f}%"
        })
    return error_data
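A quick sketch of the per-item table (the estimates here are made-up inflated counts, purely illustrative):

    _, df, _ = create_dataset(1000, 'small')
    fake_estimates = {v: c * 1.05 for v, c in df['value'].value_counts().items()}
    for row in generate_error_table(df, fake_estimates):
        print(row["Item"], row["Percentage Error"])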
def display_results(real_freq: pd.DataFrame, estimated_freq: dict):
    """
    Builds a per-item comparison table and a summary error table from the real
    frequencies and a dict of estimated frequencies.
    """
    N = real_freq.shape[0]
    f = real_freq['value'].value_counts()
    real_num_freq = f.sort_index().to_dict()
    real_percent_freq = ((f * 100 / N).sort_index()).to_dict()

    data_table = []
    for element in real_num_freq:
        if element in estimated_freq:
            real_count = real_num_freq[element]
            real_percent = real_percent_freq[element]
            estimated_count = estimated_freq[element]
            estimated_percent = (estimated_count / N) * 100
            diff = abs(real_count - estimated_count)

            if real_count > 0:
                percent_error = abs(real_count - estimated_count) / real_count * 100
            else:
                percent_error = 0.0

            data_table.append([
                element,
                real_count,
                f"{real_percent:.3f}%",
                f"{estimated_count:.2f}",
                f"{estimated_percent:.3f}%",
                f"{diff:.2f}",
                f"{percent_error:.2f}%"
            ])

    # Summary statistics; items estimated but absent from the real data count
    # as real frequency 0.
    errors = [abs(real_num_freq.get(key, 0) - estimated_freq[key]) for key in estimated_freq]
    mean_error = np.mean(errors)
    total_errors = np.sum(errors)
    max_freq = max(real_num_freq.values())
    min_freq = min(real_num_freq.values())
    mse = np.sum([(real_num_freq.get(key, 0) - estimated_freq[key]) ** 2 for key in estimated_freq]) / len(estimated_freq)
    # MSE normalized by the range of real frequencies (assumes max_freq > min_freq).
    normalized_mse = mse / (max_freq - min_freq)

    error_table = [
        ['Total Errors', f"{total_errors:.2f}"],
        ['Mean Error', f"{mean_error:.2f}"],
        ['Percentage Error', f"{(mean_error / N) * 100:.2f}%"],
        ['MSE', f"{mse:.2f}"],
        ['RMSE', f"{np.sqrt(mse):.2f}"],
        ['Normalized MSE', f"{normalized_mse:.4f}"],
        ['Normalized RMSE', f"{np.sqrt(normalized_mse):.2f}"]
    ]

    return data_table, error_table
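An end-to-end sketch tying the helpers together with a Count-Min estimator (the sketch construction itself is an illustrative assumption; only the helper calls come from this module):

    values, df, domain = create_dataset(10_000, 'small')
    k, p, m = 5, 2**31 - 1, 64
    hashes = generate_hash_functions(k, p, c=2, m=m)
    table = np.zeros((k, m))
    for x in values:
        for row, h in enumerate(hashes):
            table[row, h(x)] += 1
    estimates = {v: min(table[row, hashes[row](v)] for row in range(k)) for v in domain}
    data_table, error_table = display_results(df, estimates)
    for name, val in error_table:
        print(name, val)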