src.test.test_distributions

  1import numpy as np
  2import pandas as pd
  3import random
  4import string
  5import sys
  6import os
  7from tabulate import tabulate
  8import matplotlib.pyplot as plt
  9
 10sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 11
 12from private_count_mean.private_cms_client import run_private_cms_client
 13from scripts.preprocess import run_data_processor
 14
 15
 16def generate_user_id(length=10):
 17    """
 18    Generates a random user ID of the specified length.
 19
 20    Args:
 21        length (int): The length of the generated user ID (default is 10).
 22
 23    Returns:
 24        str: A randomly generated user ID.
 25    """
 26    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=length))
 27
 28def generate_dataset(distribution, n):
 29    """
 30    Generates a dataset with a specified distribution and saves it as a CSV file.
 31
 32    Args:
 33        distribution (str): The type of distribution to generate ('normal', 'laplace', 'uniform', 'exp').
 34        n (int): The number of data points to generate.
 35
 36    Creates a CSV file with the dataset and stores it in the '../../data/filtered' directory.
 37    """
 38    if distribution == 'normal':
 39        valores = np.random.normal(loc=12, scale=2, size=n).astype(int)
 40    elif distribution == 'laplace':
 41        valores = np.random.laplace(loc=12, scale=2, size=n).astype(int)
 42    elif distribution == 'uniform':
 43        valores = np.random.uniform(low=0, high=4, size=n).astype(int)
 44    elif distribution == "exp":
 45        valores = np.random.exponential(scale=2.0, size=n).astype(int)
 46
 47    user_ids = ["S01post"] * n
 48
 49    
 50    user_ids = list(user_ids)
 51
 52    data = {'user_id': user_ids, 'value': valores}
 53    df = pd.DataFrame(data)
 54
 55    return df
 56
 57def run_distribution_test():
 58    """
 59    Runs a distribution test by generating datasets for different distributions and evaluating 
 60    their error metrics using the Private Count Mean Sketch (PrivateCMS).
 61
 62    Generates datasets for different distributions and calculates error metrics for 
 63    each distribution using various values for the 'k' and 'm' parameters. 
 64    It visualizes the estimated frequency distribution and displays the results as a table.
 65
 66    Results include error metrics such as mean error, percentage error, MSE, RMSE, 
 67    and Pearson Correlation Coefficient.
 68    """
 69    N = 50000
 70    k = [16, 128, 128, 1024, 32768]
 71    m = [16, 16, 1024, 256, 256]
 72    e = 2
 73
 74    # Define distributions
 75    distributions = ['laplace', 'uniform', 'normal', 'exp']
 76
 77    for i in range(len(distributions)):
 78        print(f"\n================== {distributions[i]} ==================")
 79        
 80        # Generate the dataset
 81        df = generate_dataset(distributions[i], N)
 82
 83        filename = f"{distributions[i]}_{N}"
 84
 85        general_table = []
 86
 87        for j in range(5):
 88            print(f"\nk={k[j]}, m={m[j]} ==================")
 89            _, error_table, estimated_freq = run_private_cms_client(k[j], m[j], e, df)
 90
 91            error_dict = { key: value for key, value in error_table }
 92
 93            row = [
 94                k[j],
 95                m[j],
 96                error_dict.get("Mean Error", ""),
 97                error_dict.get("Percentage Error", ""),
 98                error_dict.get("MSE", ""),
 99                error_dict.get("RMSE", ""),
100                error_dict.get("Normalized MSE", ""),
101                error_dict.get("Normalized RMSE", ""),
102                error_dict.get("Pearson Correlation Coefficient", "")
103            ]
104            general_table.append(row)
105
106            if j == 4:
107                keys = list(estimated_freq.keys())
108                values = list(estimated_freq.values())
109                
110                plt.figure(figsize=(10, 6))
111                plt.bar(keys, values, color='skyblue')
112                plt.xlabel("Element")
113                plt.ylabel("Estimated Frequency")
114                plt.title(f"Estimated Frequencies\nDistribution: {distributions[i]} (k={k[j]}, m={m[j]})")
115                plt.xticks(rotation=45)
116                plt.tight_layout()
117                plt.show()
118
119        headers = [
120            "k", "m", "Mean Error", "Percentage Error", 
121            "MSE", "RMSE", "Normalized MSE", "Normalized RMSE", "Pearson Corr"
122        ]
123
124        print(tabulate(general_table, headers=headers, tablefmt="grid"))
125
126
# Run the full distribution experiment only when this file is executed as a
# script; importing the module does not trigger it.
if __name__ == '__main__':
    run_distribution_test()
def generate_user_id(length=10):
def generate_user_id(length=10):
    """
    Builds a random identifier out of uppercase ASCII letters and digits.

    Args:
        length (int): Number of characters in the identifier (default is 10).

    Returns:
        str: The randomly generated identifier.
    """
    # Candidate characters: 'A'-'Z' followed by '0'-'9'.
    alphabet = string.ascii_uppercase + string.digits
    # Sampling is with replacement, so characters may repeat.
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

Generates a random user ID of the specified length.

Args: length (int): The length of the generated user ID (default is 10).

Returns: str: A randomly generated user ID.

def generate_dataset(distribution, n):
def generate_dataset(distribution, n):
    """
    Generates an in-memory dataset whose values follow the given distribution.

    Args:
        distribution (str): The type of distribution to sample
            ('normal', 'laplace', 'uniform', 'exp').
        n (int): The number of data points to generate.

    Returns:
        pandas.DataFrame: A frame with n rows and two columns: 'user_id'
        (the constant string "S01post" on every row) and 'value' (the
        samples converted to integers). Nothing is written to disk.

    Raises:
        ValueError: If distribution is not one of the supported names.
    """
    if distribution == 'normal':
        samples = np.random.normal(loc=12, scale=2, size=n)
    elif distribution == 'laplace':
        samples = np.random.laplace(loc=12, scale=2, size=n)
    elif distribution == 'uniform':
        samples = np.random.uniform(low=0, high=4, size=n)
    elif distribution == "exp":
        samples = np.random.exponential(scale=2.0, size=n)
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unsupported distribution {distribution!r}; "
            "expected one of 'normal', 'laplace', 'uniform', 'exp'"
        )

    # astype(int) drops the fractional part, turning the continuous samples
    # into a small set of discrete integer values.
    valores = samples.astype(int)

    # Every row carries the same user id.
    user_ids = ["S01post"] * n

    return pd.DataFrame({'user_id': user_ids, 'value': valores})

Generates a dataset with the specified distribution and returns it as a pandas DataFrame.

Args: distribution (str): The type of distribution to generate ('normal', 'laplace', 'uniform', 'exp'). n (int): The number of data points to generate.

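Returns: a DataFrame with n rows and two columns, 'user_id' (the constant "S01post") and 'value' (the samples converted to integers). The function itself does not create a CSV file or write anything to the '../../data/filtered' directory.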

def run_distribution_test():
 58def run_distribution_test():
 59    """
 60    Runs a distribution test by generating datasets for different distributions and evaluating 
 61    their error metrics using the Private Count Mean Sketch (PrivateCMS).
 62
 63    Generates datasets for different distributions and calculates error metrics for 
 64    each distribution using various values for the 'k' and 'm' parameters. 
 65    It visualizes the estimated frequency distribution and displays the results as a table.
 66
 67    Results include error metrics such as mean error, percentage error, MSE, RMSE, 
 68    and Pearson Correlation Coefficient.
 69    """
 70    N = 50000
 71    k = [16, 128, 128, 1024, 32768]
 72    m = [16, 16, 1024, 256, 256]
 73    e = 2
 74
 75    # Define distributions
 76    distributions = ['laplace', 'uniform', 'normal', 'exp']
 77
 78    for i in range(len(distributions)):
 79        print(f"\n================== {distributions[i]} ==================")
 80        
 81        # Generate the dataset
 82        df = generate_dataset(distributions[i], N)
 83
 84        filename = f"{distributions[i]}_{N}"
 85
 86        general_table = []
 87
 88        for j in range(5):
 89            print(f"\nk={k[j]}, m={m[j]} ==================")
 90            _, error_table, estimated_freq = run_private_cms_client(k[j], m[j], e, df)
 91
 92            error_dict = { key: value for key, value in error_table }
 93
 94            row = [
 95                k[j],
 96                m[j],
 97                error_dict.get("Mean Error", ""),
 98                error_dict.get("Percentage Error", ""),
 99                error_dict.get("MSE", ""),
100                error_dict.get("RMSE", ""),
101                error_dict.get("Normalized MSE", ""),
102                error_dict.get("Normalized RMSE", ""),
103                error_dict.get("Pearson Correlation Coefficient", "")
104            ]
105            general_table.append(row)
106
107            if j == 4:
108                keys = list(estimated_freq.keys())
109                values = list(estimated_freq.values())
110                
111                plt.figure(figsize=(10, 6))
112                plt.bar(keys, values, color='skyblue')
113                plt.xlabel("Element")
114                plt.ylabel("Estimated Frequency")
115                plt.title(f"Estimated Frequencies\nDistribution: {distributions[i]} (k={k[j]}, m={m[j]})")
116                plt.xticks(rotation=45)
117                plt.tight_layout()
118                plt.show()
119
120        headers = [
121            "k", "m", "Mean Error", "Percentage Error", 
122            "MSE", "RMSE", "Normalized MSE", "Normalized RMSE", "Pearson Corr"
123        ]
124
125        print(tabulate(general_table, headers=headers, tablefmt="grid"))

Runs a distribution test by generating datasets for different distributions and evaluating their error metrics using the Private Count Mean Sketch (PrivateCMS).

Generates datasets for different distributions and calculates error metrics for each distribution using various values for the 'k' and 'm' parameters. It visualizes the estimated frequency distribution and displays the results as a table.

Results include error metrics such as mean error, percentage error, MSE, RMSE, and Pearson Correlation Coefficient.