src.scripts.parameter_fitting
import optuna
from colorama import Fore, Style
from tabulate import tabulate

from count_mean.private_cms_client import run_private_cms_client
from hadamard_count_mean.private_hcms_client import run_private_hcms_client


class PrivacyUtilityOptimizer:
    """
    Optimizes the privacy-utility tradeoff by tuning the privacy parameter `e`.

    Attributes:
        df (pd.DataFrame): Input dataset; its 'value' column holds the values to be privatized.
        algorithm (str): Selected privacy algorithm ('1' for CMS, '2' for HCMS).
        k (int): Parameter k for the selected algorithm.
        m (int): Parameter m for the selected algorithm.
        real_frequency (pd.DataFrame): True frequency distribution of elements in `df`.
        N (int): Total count of elements in `df`.
        headers (list): Column headers for displaying tabular results.
        frequency_estimation: Estimated frequencies from the latest `run_command`
            call; None until the first run.
    """

    def __init__(self, df, k, m, algorithm):
        """
        Initializes the optimizer with the dataset and algorithm parameters.

        Args:
            df (pd.DataFrame): Input dataset.
            k (int): Parameter k for the selected algorithm.
            m (int): Parameter m for the selected algorithm.
            algorithm (str): Algorithm choice ('1' for CMS, '2' for HCMS).
        """
        self.df = df
        self.algorithm = algorithm
        self.k = k
        self.m = m
        # Defined up front so `frequencies()` does not raise AttributeError
        # when it is called before the first `run_command`.
        self.frequency_estimation = None

        self.real_frequency = self.get_real_frequency()
        self.N = self.real_frequency['Frequency'].sum()
        self.headers = ["Element", "Real Frequency", "Real Percentage", "Estimated Frequency", "Estimated Percentage", "Estimation Difference", "Percentage Error"]

    def function_LP(self, f_estimated, f_real, p):
        """
        Computes the mean p-th power error between estimated and real frequencies.

        Both tables are inner-joined on "Element", so only elements present in
        both contribute. The value is sum(|estimated - real| ** p) / N; no p-th
        root is taken.

        Args:
            f_estimated (pd.DataFrame): Estimated frequencies ("Element", "Frequency").
            f_real (pd.DataFrame): Real frequencies ("Element", "Frequency").
            p (float): Exponent applied to each absolute difference.

        Returns:
            float: Computed error.
        """
        merged = f_estimated.merge(f_real, on="Element", suffixes=("_estimated", "_real"))
        deviation = (merged["Frequency_estimated"] - merged["Frequency_real"]).abs()
        return (1 / self.N) * float((deviation ** p).sum())

    def run_command(self, e):
        """
        Runs the selected privacy algorithm with a given privacy budget `e`.

        The estimated frequencies returned by the client are stored in
        `self.frequency_estimation` instead of being returned.

        Args:
            e (float): Privacy parameter.

        Returns:
            tuple: (result, data_table, error_table, privatized_data).

        Raises:
            ValueError: If `self.algorithm` is neither '1' nor '2'.
        """
        algorithm = str(self.algorithm)
        if algorithm == '1':
            runner = run_private_cms_client
        elif algorithm == '2':
            runner = run_private_hcms_client
        else:
            raise ValueError(f"Unknown algorithm {self.algorithm!r}: expected '1' (CMS) or '2' (HCMS)")

        result, data_table, error_table, privatized_data, df_estimated = runner(self.k, self.m, e, self.df)
        self.frequency_estimation = df_estimated
        return result, data_table, error_table, privatized_data

    def get_real_frequency(self):
        """
        Computes the real frequency distribution from the dataset.

        Returns:
            pd.DataFrame: Columns "Element" and "Frequency", one row per
            distinct entry of `df['value']`.
        """
        # Name the index and the counts explicitly: the default column names
        # produced by value_counts().reset_index() differ between pandas 1.x
        # and pandas 2.x.
        counts = self.df['value'].value_counts()
        return counts.rename_axis('Element').reset_index(name='Frequency')

    def frequencies(self):
        """
        Returns both the estimated and real frequency distributions.

        Returns:
            tuple: Estimated frequency and real frequency DataFrames.
        """
        return self.frequency_estimation, self.get_real_frequency()

    def optimize_e_with_optuna(self, target_error, p, metric, n_trials):
        """
        Optimizes the privacy parameter `ϵ` using Optuna to reach a target error.

        Args:
            target_error (float): Desired error value.
            p (float): Exponent passed to `function_LP`.
            metric (str): Metric type ('1' = MSE, '2' = Lp norm, '3' = Percentage Error).
            n_trials (int): Number of Optuna trials to run.

        Returns:
            tuple: Best `ϵ`, privatized data, error table, result, and data table.

        Raises:
            ValueError: If `metric` is not '1', '2' or '3'.
        """
        if metric not in ("1", "2", "3"):
            raise ValueError(f"Unknown metric {metric!r}: expected '1', '2' or '3'")

        # The dataset does not change between trials, so compute this once.
        real_frequency = self.get_real_frequency()

        def objective(trial):
            e = trial.suggest_float('e', 0.01, 20, step=0.01)
            result, data_table, error_table, privatized_data = self.run_command(e)

            # Keep the artefacts of each trial so the best one can be returned.
            trial.set_user_attr('result', result)
            trial.set_user_attr('privatized_data', privatized_data)
            trial.set_user_attr('error_table', error_table)
            trial.set_user_attr('data_table', data_table)

            print(tabulate(data_table, headers=self.headers, tablefmt="grid"))

            error = self.function_LP(self.frequency_estimation, real_frequency, p)
            if metric == "3":
                # NOTE(review): function_LP already divides by N, so this
                # divides by N twice - confirm this is the intended percentage.
                error = (error / self.N) * 100

            # Minimize the distance between the achieved and the target error.
            return abs(error - target_error)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=n_trials)

        best_e = study.best_params['e']
        best_attrs = study.best_trial.user_attrs
        privatized_data = best_attrs['privatized_data']
        error_table = best_attrs['error_table']
        result = best_attrs['result']
        data_table = best_attrs['data_table']

        print("\n================ e Optimization finished ====================")
        print(f"Best value of ϵ: {best_e}")
        print(f"Closest error (LP - target_error): {study.best_value}")

        return best_e, privatized_data, error_table, result, data_table

    def utility_error(self, Lp, p, metric, n_trials=20):
        """
        Optimizes the privacy parameter `ϵ` for utility preservation.

        Repeats the Optuna search until the user accepts the outcome and
        returns the values of the accepted search.

        Args:
            Lp (float): Target error value.
            p (float): Exponent passed to `function_LP`.
            metric (str): Metric type ('1' = MSE, '2' = Lp norm, '3' = Percentage Error).
            n_trials (int): Number of Optuna trials per search.

        Returns:
            tuple: Optimized `ϵ`, result, privatized data, and data table.
        """
        while True:
            e, privatized_data, error_table, result, data_table = self.optimize_e_with_optuna(Lp, p, metric, n_trials)

            print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid"))

            option = input("Are you satisfied with the results? (yes/no): ")
            if option != "no":
                break

        print(f"\nError metrics for parameters k={self.k}, m={self.m} and ϵ={e}")
        print(tabulate(error_table, tablefmt="fancy_grid"))

        return e, result, privatized_data, data_table

    def privacy_error(self):
        """
        Interactively selects the privacy parameter `e` for privacy preservation.

        Sweeps an integer range of `e` values entered by the user, lets the
        user save one candidate, and repeats the whole dialogue until the user
        accepts the outcome. If nothing was saved, the user types the value.

        Returns:
            tuple: Chosen `e`, result, and privatized data.
        """
        from individual_method import main

        while True:
            p = float(input("\n→ Enter the type of error ρ: "))

            # None (rather than 0) marks "nothing saved", so a saved ϵ = 0 is
            # not mistaken for an empty selection.
            saved_e = None
            H_fav = None
            error_table_fav = []
            privatized_fav = None

            while True:
                e_min = input(f"→ Enter the {Style.BRIGHT}minimum{Style.RESET_ALL} value of ϵ: ")
                e_max = input(f"→ Enter the {Style.BRIGHT}maximum{Style.RESET_ALL} value of ϵ: ")
                step = input(f"→ Enter the {Style.BRIGHT}step{Style.RESET_ALL} value: ")

                saved_e = None  # every sweep starts without a saved candidate

                for e in range(int(e_min), int(e_max), int(step)):
                    result, data_table, error_table, privatized_data = self.run_command(e)
                    f_estimated, f_real = self.frequencies()
                    error = self.function_LP(f_estimated, f_real, p)

                    print(f"\nError for ϵ = {e}: {error}")
                    print(tabulate(data_table, headers=self.headers, tablefmt="grid"))

                    save = input("Do you want to save this privatized values? (yes/no): ")
                    if save == "yes":
                        saved_e = e
                        H_fav = result
                        error_table_fav = error_table
                        privatized_fav = privatized_data

                print(f"\nOptimization finished:{Fore.RED} What do you want to do?{Style.RESET_ALL}")
                choice = input("\n1. Change e\n2. Change k or m\n3. Continue\nSelect: ")
                if choice == "2":
                    # Presumably re-enters the k/m selection flow - confirm
                    # against individual_method.main.
                    main(2)
                    break
                elif choice == "3":
                    break

            if saved_e is None:
                # Convert to a number: the algorithms receive `e` as a value,
                # and the chosen value must also be the one returned.
                saved_e = float(input("Enter the value of ϵ to use: "))

                H_fav, data_table, error_table_fav, privatized_fav = self.run_command(saved_e)
                print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid"))
            else:
                print(f"Using the saved value of ϵ: {saved_e}")

            option = input("Are you satisfied with the results? (yes/no): ")
            if option != "no":
                break

        print(f"\nError metrics for k={self.k}, m={self.m}, e={saved_e}")
        print(tabulate(error_table_fav, tablefmt="pretty"))

        print("\nSending database to server ...")
        return saved_e, H_fav, privatized_fav

    def run(self):
        """
        Main execution function. Asks the user to choose between utility and privacy optimization.

        Invalid menu selections are rejected and the question is asked again.

        Returns:
            tuple: Optimized `e`, result, and privatized data.
        """
        while True:
            choice = input("Enter the optimization:\n1. Utility\n2. Privacy\nSelect: ")
            if choice in ("1", "2"):
                break
            print("Invalid choice. Please try again.")

        if choice == "1":
            print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for utility ...{Style.RESET_ALL}")
            while True:
                metric = input("Enter the metric to optimize \n1. MSE\n2. LP\n3. Porcentual Error \nSelect: ")
                if metric in ("1", "2", "3"):
                    break
                print("Invalid choice. Please try again.")

            if metric == "1":
                Lp = float(input("Enter the MSE to reach: "))
                p = 2
            elif metric == "2":
                Lp = float(input("Enter the Lp to reach: "))
                p = float(input("Enter the type of error (p): "))
            else:
                Lp = float(input("Enter the Porcentual Error to reach: "))
                p = 1
            n_trials = int(input("Enter the number of trials: "))
            e, result, privatized_data, _ = self.utility_error(Lp, p, metric, n_trials)
        else:
            print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for privacy ...{Style.RESET_ALL}")
            e, result, privatized_data = self.privacy_error()

        return e, result, privatized_data


def run_parameter_fitting(df, k, m, algorithm):
    """
    Initializes and runs the PrivacyUtilityOptimizer with the given parameters.

    Args:
        df (pd.DataFrame): Input dataset.
        k (int): Parameter k for the selected algorithm.
        m (int): Parameter m for the selected algorithm.
        algorithm (str): Algorithm choice ('1' for CMS, '2' for HCMS).

    Returns:
        tuple: Optimized `e`, result, and privatized data.
    """
    optimizer = PrivacyUtilityOptimizer(df, k, m, algorithm)
    e, result, privatized_data = optimizer.run()
    return e, result, privatized_data
class PrivacyUtilityOptimizer:
    """
    Optimizes the privacy-utility tradeoff by tuning the privacy parameter `e`.

    Attributes:
        df (pd.DataFrame): Input dataset; its 'value' column holds the values to be privatized.
        algorithm (str): Selected privacy algorithm ('1' for CMS, '2' for HCMS).
        k (int): Parameter k for the selected algorithm.
        m (int): Parameter m for the selected algorithm.
        real_frequency (pd.DataFrame): True frequency distribution of elements in `df`.
        N (int): Total count of elements in `df`.
        headers (list): Column headers for displaying tabular results.
        frequency_estimation: Estimated frequencies from the latest `run_command`
            call; None until the first run.
    """

    def __init__(self, df, k, m, algorithm):
        """
        Initializes the optimizer with the dataset and algorithm parameters.

        Args:
            df (pd.DataFrame): Input dataset.
            k (int): Parameter k for the selected algorithm.
            m (int): Parameter m for the selected algorithm.
            algorithm (str): Algorithm choice ('1' for CMS, '2' for HCMS).
        """
        self.df = df
        self.algorithm = algorithm
        self.k = k
        self.m = m
        # Defined up front so `frequencies()` does not raise AttributeError
        # when it is called before the first `run_command`.
        self.frequency_estimation = None

        self.real_frequency = self.get_real_frequency()
        self.N = self.real_frequency['Frequency'].sum()
        self.headers = ["Element", "Real Frequency", "Real Percentage", "Estimated Frequency", "Estimated Percentage", "Estimation Difference", "Percentage Error"]

    def function_LP(self, f_estimated, f_real, p):
        """
        Computes the mean p-th power error between estimated and real frequencies.

        Both tables are inner-joined on "Element", so only elements present in
        both contribute. The value is sum(|estimated - real| ** p) / N; no p-th
        root is taken.

        Args:
            f_estimated (pd.DataFrame): Estimated frequencies ("Element", "Frequency").
            f_real (pd.DataFrame): Real frequencies ("Element", "Frequency").
            p (float): Exponent applied to each absolute difference.

        Returns:
            float: Computed error.
        """
        merged = f_estimated.merge(f_real, on="Element", suffixes=("_estimated", "_real"))
        deviation = (merged["Frequency_estimated"] - merged["Frequency_real"]).abs()
        return (1 / self.N) * float((deviation ** p).sum())

    def run_command(self, e):
        """
        Runs the selected privacy algorithm with a given privacy budget `e`.

        The estimated frequencies returned by the client are stored in
        `self.frequency_estimation` instead of being returned.

        Args:
            e (float): Privacy parameter.

        Returns:
            tuple: (result, data_table, error_table, privatized_data).

        Raises:
            ValueError: If `self.algorithm` is neither '1' nor '2'.
        """
        algorithm = str(self.algorithm)
        if algorithm == '1':
            runner = run_private_cms_client
        elif algorithm == '2':
            runner = run_private_hcms_client
        else:
            raise ValueError(f"Unknown algorithm {self.algorithm!r}: expected '1' (CMS) or '2' (HCMS)")

        result, data_table, error_table, privatized_data, df_estimated = runner(self.k, self.m, e, self.df)
        self.frequency_estimation = df_estimated
        return result, data_table, error_table, privatized_data

    def get_real_frequency(self):
        """
        Computes the real frequency distribution from the dataset.

        Returns:
            pd.DataFrame: Columns "Element" and "Frequency", one row per
            distinct entry of `df['value']`.
        """
        # Name the index and the counts explicitly: the default column names
        # produced by value_counts().reset_index() differ between pandas 1.x
        # and pandas 2.x.
        counts = self.df['value'].value_counts()
        return counts.rename_axis('Element').reset_index(name='Frequency')

    def frequencies(self):
        """
        Returns both the estimated and real frequency distributions.

        Returns:
            tuple: Estimated frequency and real frequency DataFrames.
        """
        return self.frequency_estimation, self.get_real_frequency()

    def optimize_e_with_optuna(self, target_error, p, metric, n_trials):
        """
        Optimizes the privacy parameter `ϵ` using Optuna to reach a target error.

        Args:
            target_error (float): Desired error value.
            p (float): Exponent passed to `function_LP`.
            metric (str): Metric type ('1' = MSE, '2' = Lp norm, '3' = Percentage Error).
            n_trials (int): Number of Optuna trials to run.

        Returns:
            tuple: Best `ϵ`, privatized data, error table, result, and data table.

        Raises:
            ValueError: If `metric` is not '1', '2' or '3'.
        """
        if metric not in ("1", "2", "3"):
            raise ValueError(f"Unknown metric {metric!r}: expected '1', '2' or '3'")

        # The dataset does not change between trials, so compute this once.
        real_frequency = self.get_real_frequency()

        def objective(trial):
            e = trial.suggest_float('e', 0.01, 20, step=0.01)
            result, data_table, error_table, privatized_data = self.run_command(e)

            # Keep the artefacts of each trial so the best one can be returned.
            trial.set_user_attr('result', result)
            trial.set_user_attr('privatized_data', privatized_data)
            trial.set_user_attr('error_table', error_table)
            trial.set_user_attr('data_table', data_table)

            print(tabulate(data_table, headers=self.headers, tablefmt="grid"))

            error = self.function_LP(self.frequency_estimation, real_frequency, p)
            if metric == "3":
                # NOTE(review): function_LP already divides by N, so this
                # divides by N twice - confirm this is the intended percentage.
                error = (error / self.N) * 100

            # Minimize the distance between the achieved and the target error.
            return abs(error - target_error)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=n_trials)

        best_e = study.best_params['e']
        best_attrs = study.best_trial.user_attrs
        privatized_data = best_attrs['privatized_data']
        error_table = best_attrs['error_table']
        result = best_attrs['result']
        data_table = best_attrs['data_table']

        print("\n================ e Optimization finished ====================")
        print(f"Best value of ϵ: {best_e}")
        print(f"Closest error (LP - target_error): {study.best_value}")

        return best_e, privatized_data, error_table, result, data_table

    def utility_error(self, Lp, p, metric, n_trials=20):
        """
        Optimizes the privacy parameter `ϵ` for utility preservation.

        Repeats the Optuna search until the user accepts the outcome and
        returns the values of the accepted search.

        Args:
            Lp (float): Target error value.
            p (float): Exponent passed to `function_LP`.
            metric (str): Metric type ('1' = MSE, '2' = Lp norm, '3' = Percentage Error).
            n_trials (int): Number of Optuna trials per search.

        Returns:
            tuple: Optimized `ϵ`, result, privatized data, and data table.
        """
        while True:
            e, privatized_data, error_table, result, data_table = self.optimize_e_with_optuna(Lp, p, metric, n_trials)

            print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid"))

            option = input("Are you satisfied with the results? (yes/no): ")
            if option != "no":
                break

        print(f"\nError metrics for parameters k={self.k}, m={self.m} and ϵ={e}")
        print(tabulate(error_table, tablefmt="fancy_grid"))

        return e, result, privatized_data, data_table

    def privacy_error(self):
        """
        Interactively selects the privacy parameter `e` for privacy preservation.

        Sweeps an integer range of `e` values entered by the user, lets the
        user save one candidate, and repeats the whole dialogue until the user
        accepts the outcome. If nothing was saved, the user types the value.

        Returns:
            tuple: Chosen `e`, result, and privatized data.
        """
        from individual_method import main

        while True:
            p = float(input("\n→ Enter the type of error ρ: "))

            # None (rather than 0) marks "nothing saved", so a saved ϵ = 0 is
            # not mistaken for an empty selection.
            saved_e = None
            H_fav = None
            error_table_fav = []
            privatized_fav = None

            while True:
                e_min = input(f"→ Enter the {Style.BRIGHT}minimum{Style.RESET_ALL} value of ϵ: ")
                e_max = input(f"→ Enter the {Style.BRIGHT}maximum{Style.RESET_ALL} value of ϵ: ")
                step = input(f"→ Enter the {Style.BRIGHT}step{Style.RESET_ALL} value: ")

                saved_e = None  # every sweep starts without a saved candidate

                for e in range(int(e_min), int(e_max), int(step)):
                    result, data_table, error_table, privatized_data = self.run_command(e)
                    f_estimated, f_real = self.frequencies()
                    error = self.function_LP(f_estimated, f_real, p)

                    print(f"\nError for ϵ = {e}: {error}")
                    print(tabulate(data_table, headers=self.headers, tablefmt="grid"))

                    save = input("Do you want to save this privatized values? (yes/no): ")
                    if save == "yes":
                        saved_e = e
                        H_fav = result
                        error_table_fav = error_table
                        privatized_fav = privatized_data

                print(f"\nOptimization finished:{Fore.RED} What do you want to do?{Style.RESET_ALL}")
                choice = input("\n1. Change e\n2. Change k or m\n3. Continue\nSelect: ")
                if choice == "2":
                    # Presumably re-enters the k/m selection flow - confirm
                    # against individual_method.main.
                    main(2)
                    break
                elif choice == "3":
                    break

            if saved_e is None:
                # Convert to a number: the algorithms receive `e` as a value,
                # and the chosen value must also be the one returned.
                saved_e = float(input("Enter the value of ϵ to use: "))

                H_fav, data_table, error_table_fav, privatized_fav = self.run_command(saved_e)
                print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid"))
            else:
                print(f"Using the saved value of ϵ: {saved_e}")

            option = input("Are you satisfied with the results? (yes/no): ")
            if option != "no":
                break

        print(f"\nError metrics for k={self.k}, m={self.m}, e={saved_e}")
        print(tabulate(error_table_fav, tablefmt="pretty"))

        print("\nSending database to server ...")
        return saved_e, H_fav, privatized_fav

    def run(self):
        """
        Main execution function. Asks the user to choose between utility and privacy optimization.

        Invalid menu selections are rejected and the question is asked again.

        Returns:
            tuple: Optimized `e`, result, and privatized data.
        """
        while True:
            choice = input("Enter the optimization:\n1. Utility\n2. Privacy\nSelect: ")
            if choice in ("1", "2"):
                break
            print("Invalid choice. Please try again.")

        if choice == "1":
            print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for utility ...{Style.RESET_ALL}")
            while True:
                metric = input("Enter the metric to optimize \n1. MSE\n2. LP\n3. Porcentual Error \nSelect: ")
                if metric in ("1", "2", "3"):
                    break
                print("Invalid choice. Please try again.")

            if metric == "1":
                Lp = float(input("Enter the MSE to reach: "))
                p = 2
            elif metric == "2":
                Lp = float(input("Enter the Lp to reach: "))
                p = float(input("Enter the type of error (p): "))
            else:
                Lp = float(input("Enter the Porcentual Error to reach: "))
                p = 1
            n_trials = int(input("Enter the number of trials: "))
            e, result, privatized_data, _ = self.utility_error(Lp, p, metric, n_trials)
        else:
            print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for privacy ...{Style.RESET_ALL}")
            e, result, privatized_data = self.privacy_error()

        return e, result, privatized_data
Optimizes the privacy-utility tradeoff by tuning the privacy parameter `e`.
Attributes:
df (pd.DataFrame): Input dataset containing values to be privatized.
algorithm (str): Selected privacy algorithm (1 for CMS, 2 for HCMS).
k (int): Parameter k for the selected algorithm.
m (int): Parameter m for the selected algorithm.
real_frequency (pd.DataFrame): True frequency distribution of elements in `df`.
N (int): Total count of elements in `df`.
headers (list): Column headers for displaying tabular results.
23 def __init__(self, df, k, m, algorithm): 24 """ 25 Initializes the PrivacyUtilityOptimizer class with dataset and algorithm parameters. 26 27 Args: 28 df (pd.DataFrame): Input dataset. 29 k (int): Parameter k for the selected algorithm. 30 m (int): Parameter m for the selected algorithm. 31 algorithm (str): Algorithm choice (1 for CMS, 2 for HCMS). 32 """ 33 self.df = df 34 self.algorithm = algorithm 35 self.k = k 36 self.m = m 37 38 self.real_frequency = self.get_real_frequency() 39 self.N = self.real_frequency['Frequency'].sum() 40 self.headers =[ "Element", "Real Frequency", "Real Percentage", "Estimated Frequency", "Estimated Percentage", "Estimation Difference", "Percentage Error"]
Initializes the PrivacyUtilityOptimizer class with dataset and algorithm parameters.
Args: df (pd.DataFrame): Input dataset. k (int): Parameter k for the selected algorithm. m (int): Parameter m for the selected algorithm. algorithm (str): Algorithm choice (1 for CMS, 2 for HCMS).
def function_LP(self, f_estimated, f_real, p):
    """
    Computes the mean p-th power error between estimated and real frequencies.

    Both tables are inner-joined on "Element", so only elements present in
    both contribute. The value is sum(|estimated - real| ** p) / N; no p-th
    root is taken.

    Args:
        f_estimated (pd.DataFrame): Estimated frequencies ("Element", "Frequency").
        f_real (pd.DataFrame): Real frequencies ("Element", "Frequency").
        p (float): Exponent applied to each absolute difference.

    Returns:
        float: Computed error.
    """
    merged = f_estimated.merge(f_real, on="Element", suffixes=("_estimated", "_real"))
    # Column arithmetic replaces the per-row iterrows() loop.
    deviation = (merged["Frequency_estimated"] - merged["Frequency_real"]).abs()
    return (1 / self.N) * float((deviation ** p).sum())
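Computes the Lp error between estimated and real frequencies: the sum of |estimated − real|^p over the elements present in both tables, divided by N (no p-th root is taken).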
Args: f_estimated (pd.DataFrame): Estimated frequency distribution. f_real (pd.DataFrame): Real frequency distribution. p (float): Order of the Lp norm.
Returns: float: Computed Lp error.
def run_command(self, e):
    """
    Runs the selected privacy algorithm with a given privacy budget `e`.

    The estimated frequencies returned by the client are stored in
    `self.frequency_estimation` instead of being returned.

    Args:
        e (float): Privacy parameter.

    Returns:
        tuple: (result, data_table, error_table, privatized_data).

    Raises:
        ValueError: If `self.algorithm` is neither '1' nor '2'.
    """
    # Compare as text so an integer 1/2 selects the same client as '1'/'2'.
    algorithm = str(self.algorithm)
    if algorithm == '1':
        runner = run_private_cms_client
    elif algorithm == '2':
        runner = run_private_hcms_client
    else:
        # Previously this fell through to an UnboundLocalError below.
        raise ValueError(f"Unknown algorithm {self.algorithm!r}: expected '1' (CMS) or '2' (HCMS)")

    result, data_table, error_table, privatized_data, df_estimated = runner(self.k, self.m, e, self.df)
    self.frequency_estimation = df_estimated
    return result, data_table, error_table, privatized_data
Runs the selected privacy algorithm with a given privacy budget `e`.
Args: e (float): Privacy parameter.
Returns: tuple: Containing result, data table, error table, and privatized data. The estimated frequencies are stored in `self.frequency_estimation` rather than returned.
def get_real_frequency(self):
    """
    Computes the real frequency distribution from the dataset.

    Returns:
        pd.DataFrame: Columns "Element" and "Frequency", one row per distinct
        entry of `df['value']`.
    """
    # Name the index and the counts explicitly: the default column names
    # produced by value_counts().reset_index() differ between pandas 1.x
    # ('index', 'value') and pandas 2.x ('value', 'count'), so renaming
    # them afterwards only works on one of the two.
    counts = self.df['value'].value_counts()
    return counts.rename_axis('Element').reset_index(name='Frequency')
Computes the real frequency distribution from the dataset.
Returns: pd.DataFrame: DataFrame with element frequencies.
def frequencies(self):
    """
    Provides the estimated and the real frequency tables as a pair.

    The estimate is whatever is currently stored in
    `self.frequency_estimation`; the real table is recomputed through
    `get_real_frequency()` on every call.

    Returns:
        tuple: (estimated frequencies, real frequencies).
    """
    estimated = self.frequency_estimation
    real = self.get_real_frequency()
    return estimated, real
Returns both the estimated and real frequency distributions.
Returns: tuple: Estimated frequency and real frequency DataFrames.
def optimize_e_with_optuna(self, target_error, p, metric, n_trials):
    """
    Optimizes the privacy parameter `ϵ` using Optuna to reach a target error.

    Each trial samples `ϵ` in [0.01, 20] (step 0.01), runs the selected
    algorithm, and scores the trial by the absolute distance between the
    achieved error and `target_error`.

    Args:
        target_error (float): Desired error value.
        p (float): Exponent passed to `function_LP`.
        metric (str): Metric type ('1' = MSE, '2' = Lp norm, '3' = Percentage Error).
        n_trials (int): Number of Optuna trials to run.

    Returns:
        tuple: Best `ϵ`, privatized data, error table, result, and data table.

    Raises:
        ValueError: If `metric` is not '1', '2' or '3'.
    """
    # Fail before any trial runs; an unknown metric used to surface as an
    # UnboundLocalError inside the objective.
    if metric not in ("1", "2", "3"):
        raise ValueError(f"Unknown metric {metric!r}: expected '1', '2' or '3'")

    # The dataset does not change between trials, so compute this once.
    real_frequency = self.get_real_frequency()

    def objective(trial):
        e = trial.suggest_float('e', 0.01, 20, step=0.01)
        result, data_table, error_table, privatized_data = self.run_command(e)

        # Keep the artefacts of each trial so the best one can be returned.
        trial.set_user_attr('result', result)
        trial.set_user_attr('privatized_data', privatized_data)
        trial.set_user_attr('error_table', error_table)
        trial.set_user_attr('data_table', data_table)

        print(tabulate(data_table, headers=self.headers, tablefmt="grid"))

        error = self.function_LP(self.frequency_estimation, real_frequency, p)
        if metric == "3":
            # NOTE(review): function_LP already divides by N, so this divides
            # by N twice - confirm this is the intended percentage.
            error = (error / self.N) * 100

        # Minimize the distance between the achieved and the target error.
        return abs(error - target_error)

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    best_e = study.best_params['e']
    best_attrs = study.best_trial.user_attrs
    privatized_data = best_attrs['privatized_data']
    error_table = best_attrs['error_table']
    result = best_attrs['result']
    data_table = best_attrs['data_table']

    print("\n================ e Optimization finished ====================")
    print(f"Best value of ϵ: {best_e}")
    print(f"Closest error (LP - target_error): {study.best_value}")

    return best_e, privatized_data, error_table, result, data_table
Optimizes the privacy parameter `ϵ` using Optuna to reach a target error.
Args: target_error (float): Desired error value. p (float): Order of the Lp norm. metric (str): Metric type (1 = MSE, 2 = Lp norm, 3 = Percentage Error). n_trials (int): Number of Optuna trials to run.
Returns:
tuple: Best `ϵ`, privatized data, error table, result, and data table.
def utility_error(self, Lp, p, metric, n_trials=20):
    """
    Optimizes the privacy parameter `ϵ` for utility preservation.

    Runs the Optuna search and shows the outcome. While the user answers
    "no", the search is repeated with the same settings; the values of the
    accepted search are the ones returned.

    Args:
        Lp (float): Target error value.
        p (float): Exponent passed to the error function.
        metric (str): Metric type ('1' = MSE, '2' = Lp norm, '3' = Percentage Error).
        n_trials (int): Number of Optuna trials per search.

    Returns:
        tuple: Optimized `ϵ`, result, privatized data, and data table.
    """
    # A loop replaces the former recursive retry, which discarded the retry's
    # result (returning the rejected values) and dropped `n_trials`.
    while True:
        e, privatized_data, error_table, result, data_table = self.optimize_e_with_optuna(Lp, p, metric, n_trials)

        print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid"))

        option = input("Are you satisfied with the results? (yes/no): ")
        if option != "no":
            break

    print(f"\nError metrics for parameters k={self.k}, m={self.m} and ϵ={e}")
    print(tabulate(error_table, tablefmt="fancy_grid"))

    return e, result, privatized_data, data_table
Optimizes the privacy parameter `ϵ` for utility preservation.
Args: Lp (float): Target error value. p (float): Order of the Lp norm. metric (str): Metric type (1 = MSE, 2 = Lp norm, 3 = Percentage Error). n_trials (int): Number of Optuna trials (default 20).
Returns:
tuple: Optimized `ϵ`, result, privatized data, and data table.
def privacy_error(self):
    """
    Interactively selects the privacy parameter `e` for privacy preservation.

    Sweeps an integer range of `e` values entered by the user (the maximum is
    exclusive, as in `range`), shows the error of each run, and lets the user
    save one candidate. If nothing was saved, the user types the value to
    use. The whole dialogue repeats until the user accepts the outcome.

    Returns:
        tuple: Chosen `e`, result, and privatized data.
    """
    from individual_method import main

    # A loop replaces the former recursive retry, whose result was discarded.
    while True:
        p = float(input("\n→ Enter the type of error ρ: "))

        # None (rather than 0) marks "nothing saved", so a saved ϵ = 0 is not
        # mistaken for an empty selection.
        saved_e = None
        H_fav = None
        error_table_fav = []
        privatized_fav = None

        while True:
            e_min = input(f"→ Enter the {Style.BRIGHT}minimum{Style.RESET_ALL} value of ϵ: ")
            e_max = input(f"→ Enter the {Style.BRIGHT}maximum{Style.RESET_ALL} value of ϵ: ")
            step = input(f"→ Enter the {Style.BRIGHT}step{Style.RESET_ALL} value: ")

            saved_e = None  # every sweep starts without a saved candidate

            for e in range(int(e_min), int(e_max), int(step)):
                result, data_table, error_table, privatized_data = self.run_command(e)
                f_estimated, f_real = self.frequencies()
                error = self.function_LP(f_estimated, f_real, p)

                print(f"\nError for ϵ = {e}: {error}")
                print(tabulate(data_table, headers=self.headers, tablefmt="grid"))

                save = input("Do you want to save this privatized values? (yes/no): ")
                if save == "yes":
                    saved_e = e
                    H_fav = result
                    error_table_fav = error_table
                    privatized_fav = privatized_data

            print(f"\nOptimization finished:{Fore.RED} What do you want to do?{Style.RESET_ALL}")
            choice = input("\n1. Change e\n2. Change k or m\n3. Continue\nSelect: ")
            if choice == "2":
                # Presumably re-enters the k/m selection flow - confirm
                # against individual_method.main.
                main(2)
                break
            elif choice == "3":
                break
            # Any other answer (including "1") asks for a new range.

        if saved_e is None:
            # Convert to a number: the algorithms receive `e` as a value, and
            # the chosen value must also be the one reported and returned.
            saved_e = float(input("Enter the value of ϵ to use: "))

            H_fav, data_table, error_table_fav, privatized_fav = self.run_command(saved_e)
            print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid"))
        else:
            print(f"Using the saved value of ϵ: {saved_e}")

        option = input("Are you satisfied with the results? (yes/no): ")
        if option != "no":
            break

    print(f"\nError metrics for k={self.k}, m={self.m}, e={saved_e}")
    print(tabulate(error_table_fav, tablefmt="pretty"))

    print("\nSending database to server ...")
    return saved_e, H_fav, privatized_fav
Optimizes the privacy parameter `e` for privacy preservation.
Returns:
tuple: Optimized `e`, result, and privatized data.
def run(self):
    """
    Main execution function. Asks the user to choose between utility and privacy optimization.

    Invalid menu selections are rejected and the question is asked again,
    instead of falling through to a return of unassigned variables.

    Returns:
        tuple: Optimized `e`, result, and privatized data.
    """
    while True:
        choice = input("Enter the optimization:\n1. Utility\n2. Privacy\nSelect: ")
        if choice in ("1", "2"):
            break
        print("Invalid choice. Please try again.")

    if choice == "1":
        print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for utility ...{Style.RESET_ALL}")
        # Re-ask until a known metric is chosen; otherwise Lp and p would be
        # left unassigned below.
        while True:
            metric = input("Enter the metric to optimize \n1. MSE\n2. LP\n3. Porcentual Error \nSelect: ")
            if metric in ("1", "2", "3"):
                break
            print("Invalid choice. Please try again.")

        if metric == "1":
            Lp = float(input("Enter the MSE to reach: "))
            p = 2
        elif metric == "2":
            Lp = float(input("Enter the Lp to reach: "))
            p = float(input("Enter the type of error (p): "))
        else:
            Lp = float(input("Enter the Porcentual Error to reach: "))
            p = 1
        n_trials = int(input("Enter the number of trials: "))
        e, result, privatized_data, _ = self.utility_error(Lp, p, metric, n_trials)
    else:
        print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for privacy ...{Style.RESET_ALL}")
        e, result, privatized_data = self.privacy_error()

    return e, result, privatized_data
Main execution function. Asks the user to choose between utility and privacy optimization.
Returns:
tuple: Optimized `e`, result, and privatized data.
def run_parameter_fitting(df, k, m, algorithm):
    """
    Builds a PrivacyUtilityOptimizer for the given inputs and runs it.

    Args:
        df (pd.DataFrame): Input dataset.
        k (int): Parameter k for the selected algorithm.
        m (int): Parameter m for the selected algorithm.
        algorithm (str): Algorithm choice (1 for CMS, 2 for HCMS).

    Returns:
        tuple: Optimized `e`, result, and privatized data, as produced by
        `PrivacyUtilityOptimizer.run`.
    """
    # Unpack into exactly three values so the returned tuple keeps its shape.
    best_e, outcome, private_values = PrivacyUtilityOptimizer(df, k, m, algorithm).run()
    return best_e, outcome, private_values
Initializes and runs the PrivacyUtilityOptimizer with the given parameters.
Args: df (pd.DataFrame): Input dataset. k (int): Parameter k for the selected algorithm. m (int): Parameter m for the selected algorithm. algorithm (str): Algorithm choice (1 for CMS, 2 for HCMS).
Returns:
tuple: Optimized `e`, result, and privatized data.