src.test.test_distributions
"""Distribution tests for the Private Count Mean Sketch client.

Generates synthetic integer datasets from several distributions, runs
`run_private_cms_client` on each with several (k, m) settings, and prints the
reported error metrics as a table.
"""
import numpy as np
import pandas as pd
import random
import string
import sys
import os
from tabulate import tabulate
import matplotlib.pyplot as plt

# Make the parent directory importable so the project packages below resolve
# when this file is executed directly as a script.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from private_count_mean.private_cms_client import run_private_cms_client
from scripts.preprocess import run_data_processor


def generate_user_id(length=10):
    """
    Generates a random user ID of the specified length.

    Args:
        length (int): The length of the generated user ID (default is 10).

    Returns:
        str: A randomly generated user ID made of uppercase ASCII letters and digits.
    """
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=length))


def generate_dataset(distribution, n):
    """
    Generates a synthetic dataset whose values follow the requested distribution.

    Samples are drawn with NumPy's global random generator and cast to
    integers (truncation toward zero). Every row carries the same constant
    user ID ("S01post"). Nothing is written to disk.

    Args:
        distribution (str): The type of distribution to generate
            ('normal', 'laplace', 'uniform', 'exp').
        n (int): The number of data points to generate.

    Returns:
        pandas.DataFrame: A frame with the columns 'user_id' and 'value'.

    Raises:
        ValueError: If `distribution` is not one of the supported names.
    """
    if distribution == 'normal':
        samples = np.random.normal(loc=12, scale=2, size=n)
    elif distribution == 'laplace':
        samples = np.random.laplace(loc=12, scale=2, size=n)
    elif distribution == 'uniform':
        samples = np.random.uniform(low=0, high=4, size=n)
    elif distribution == "exp":
        samples = np.random.exponential(scale=2.0, size=n)
    else:
        # Previously an unknown name fell through and crashed later with an
        # UnboundLocalError; fail early with an explicit message instead.
        raise ValueError(
            f"Unsupported distribution {distribution!r}; "
            "expected 'normal', 'laplace', 'uniform' or 'exp'."
        )

    return pd.DataFrame({
        'user_id': ["S01post"] * n,
        'value': samples.astype(int),
    })


def run_distribution_test():
    """
    Runs a distribution test by generating datasets for different distributions and
    evaluating their error metrics using the Private Count Mean Sketch (PrivateCMS).

    For each distribution a dataset is generated once and passed to
    `run_private_cms_client` for every (k, m) pair. The estimated frequencies of
    the last (k, m) pair are shown as a bar chart, and the collected metrics are
    printed as a grid table.

    Results include error metrics such as mean error, percentage error, MSE, RMSE,
    and Pearson Correlation Coefficient.
    """
    N = 50000
    k = [16, 128, 128, 1024, 32768]
    m = [16, 16, 1024, 256, 256]
    e = 2  # third argument of run_private_cms_client (presumably epsilon; confirm)

    # Define distributions
    distributions = ['laplace', 'uniform', 'normal', 'exp']

    # Keys looked up in the error table, in the order of the table columns.
    metric_keys = [
        "Mean Error",
        "Percentage Error",
        "MSE",
        "RMSE",
        "Normalized MSE",
        "Normalized RMSE",
        "Pearson Correlation Coefficient",
    ]
    headers = [
        "k", "m", "Mean Error", "Percentage Error",
        "MSE", "RMSE", "Normalized MSE", "Normalized RMSE", "Pearson Corr"
    ]

    configs = list(zip(k, m))
    last_index = len(configs) - 1

    for distribution in distributions:
        print(f"\n================== {distribution} ==================")

        # Generate the dataset
        df = generate_dataset(distribution, N)

        general_table = []

        for j, (k_j, m_j) in enumerate(configs):
            print(f"\nk={k_j}, m={m_j} ==================")
            _, error_table, estimated_freq = run_private_cms_client(k_j, m_j, e, df)

            error_dict = {key: value for key, value in error_table}

            # Missing metrics are rendered as empty cells.
            general_table.append(
                [k_j, m_j, *(error_dict.get(name, "") for name in metric_keys)]
            )

            # Only the last configuration is plotted.
            if j == last_index:
                keys = list(estimated_freq.keys())
                values = list(estimated_freq.values())

                plt.figure(figsize=(10, 6))
                plt.bar(keys, values, color='skyblue')
                plt.xlabel("Element")
                plt.ylabel("Estimated Frequency")
                plt.title(f"Estimated Frequencies\nDistribution: {distribution} (k={k_j}, m={m_j})")
                plt.xticks(rotation=45)
                plt.tight_layout()
                plt.show()

        print(tabulate(general_table, headers=headers, tablefmt="grid"))


if __name__ == '__main__':
    run_distribution_test()
def generate_user_id(length=10):
    """
    Builds a random user ID of the requested length.

    Args:
        length (int): Number of characters in the ID (default is 10).

    Returns:
        str: An ID made of uppercase ASCII letters and digits, drawn with
            replacement from the `random` module's global generator.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)
Generates a random user ID of the specified length.
Args: length (int): The length of the generated user ID (default is 10).
Returns: str: A randomly generated user ID.
def generate_dataset(distribution, n):
    """
    Generates a synthetic dataset whose values follow the requested distribution.

    Samples are drawn with NumPy's global random generator and cast to
    integers (truncation toward zero). Every row carries the same constant
    user ID ("S01post"). Nothing is written to disk.

    Args:
        distribution (str): The type of distribution to generate
            ('normal', 'laplace', 'uniform', 'exp').
        n (int): The number of data points to generate.

    Returns:
        pandas.DataFrame: A frame with the columns 'user_id' and 'value'.

    Raises:
        ValueError: If `distribution` is not one of the supported names.
    """
    if distribution == 'normal':
        samples = np.random.normal(loc=12, scale=2, size=n)
    elif distribution == 'laplace':
        samples = np.random.laplace(loc=12, scale=2, size=n)
    elif distribution == 'uniform':
        samples = np.random.uniform(low=0, high=4, size=n)
    elif distribution == "exp":
        samples = np.random.exponential(scale=2.0, size=n)
    else:
        # Previously an unknown name fell through and crashed later with an
        # UnboundLocalError; fail early with an explicit message instead.
        raise ValueError(
            f"Unsupported distribution {distribution!r}; "
            "expected 'normal', 'laplace', 'uniform' or 'exp'."
        )

    return pd.DataFrame({
        'user_id': ["S01post"] * n,
        'value': samples.astype(int),
    })
Generates a dataset with a specified distribution and returns it as a pandas DataFrame.
Args: distribution (str): The type of distribution to generate ('normal', 'laplace', 'uniform', 'exp'). n (int): The number of data points to generate.
Returns: pandas.DataFrame: A frame with a constant 'user_id' column ("S01post") and a 'value' column holding the sampled integers. No CSV file is written by this function.
def run_distribution_test():
    """
    Runs a distribution test by generating datasets for different distributions and
    evaluating their error metrics using the Private Count Mean Sketch (PrivateCMS).

    For each distribution a dataset is generated once and passed to
    `run_private_cms_client` for every (k, m) pair. The estimated frequencies of
    the last (k, m) pair are shown as a bar chart, and the collected metrics are
    printed as a grid table.

    Results include error metrics such as mean error, percentage error, MSE, RMSE,
    and Pearson Correlation Coefficient.
    """
    N = 50000
    k = [16, 128, 128, 1024, 32768]
    m = [16, 16, 1024, 256, 256]
    e = 2  # third argument of run_private_cms_client (presumably epsilon; confirm)

    # Define distributions
    distributions = ['laplace', 'uniform', 'normal', 'exp']

    # Keys looked up in the error table, in the order of the table columns.
    metric_keys = [
        "Mean Error",
        "Percentage Error",
        "MSE",
        "RMSE",
        "Normalized MSE",
        "Normalized RMSE",
        "Pearson Correlation Coefficient",
    ]
    headers = [
        "k", "m", "Mean Error", "Percentage Error",
        "MSE", "RMSE", "Normalized MSE", "Normalized RMSE", "Pearson Corr"
    ]

    # Pair the parameters once so the loop bound and the "last configuration"
    # check follow the lists instead of hard-coded indices.
    configs = list(zip(k, m))
    last_index = len(configs) - 1

    for distribution in distributions:
        print(f"\n================== {distribution} ==================")

        # Generate the dataset
        df = generate_dataset(distribution, N)

        general_table = []

        for j, (k_j, m_j) in enumerate(configs):
            print(f"\nk={k_j}, m={m_j} ==================")
            _, error_table, estimated_freq = run_private_cms_client(k_j, m_j, e, df)

            error_dict = {key: value for key, value in error_table}

            # Missing metrics are rendered as empty cells.
            general_table.append(
                [k_j, m_j, *(error_dict.get(name, "") for name in metric_keys)]
            )

            # Only the last configuration is plotted.
            if j == last_index:
                keys = list(estimated_freq.keys())
                values = list(estimated_freq.values())

                plt.figure(figsize=(10, 6))
                plt.bar(keys, values, color='skyblue')
                plt.xlabel("Element")
                plt.ylabel("Estimated Frequency")
                plt.title(f"Estimated Frequencies\nDistribution: {distribution} (k={k_j}, m={m_j})")
                plt.xticks(rotation=45)
                plt.tight_layout()
                plt.show()

        print(tabulate(general_table, headers=headers, tablefmt="grid"))
Runs a distribution test by generating datasets for different distributions and evaluating their error metrics using the Private Count Mean Sketch (PrivateCMS).
Generates datasets for different distributions and calculates error metrics for each distribution using various values for the 'k' and 'm' parameters. It visualizes the estimated frequency distribution and displays the results as a table.
Results include error metrics such as mean error, percentage error, MSE, RMSE, and Pearson Correlation Coefficient.