src.scripts.parameter_fitting

  1import optuna
  2from colorama import Fore, Style
  3from tabulate import tabulate
  4
  5from count_mean.private_cms_client import run_private_cms_client
  6from hadamard_count_mean.private_hcms_client import run_private_hcms_client
  7
  8class PrivacyUtilityOptimizer:
  9    """
 10    Optimizes the privacy-utility tradeoff by tuning the privacy parameter `e`.
 11    
 12    Attributes:
 13        df (pd.DataFrame): Input dataset containing values to be privatized.
 14        algorithm (str): Selected privacy algorithm (1 for CMS, 2 for HCMS).
 15        k (int): Parameter k for the selected algorithm.
 16        m (int): Parameter m for the selected algorithm.
 17        real_frequency (pd.DataFrame): True frequency distribution of elements in `df`.
 18        N (int): Total count of elements in `df`.
 19        headers (list): Column headers for displaying tabular results.
 20    """
 21    def __init__(self, df, k, m, algorithm):
 22        """
 23        Initializes the PrivacyUtilityOptimizer class with dataset and algorithm parameters.
 24
 25        Args:
 26            df (pd.DataFrame): Input dataset.
 27            k (int): Parameter k for the selected algorithm.
 28            m (int): Parameter m for the selected algorithm.
 29            algorithm (str): Algorithm choice (1 for CMS, 2 for HCMS).
 30        """
 31        self.df = df
 32        self.algorithm = algorithm
 33        self.k = k
 34        self.m = m
 35
 36        self.real_frequency = self.get_real_frequency()
 37        self.N = self.real_frequency['Frequency'].sum()
 38        self.headers =[ "Element", "Real Frequency", "Real Percentage", "Estimated Frequency", "Estimated Percentage", "Estimation Difference", "Percentage Error"]
 39    
 40
 41    def function_LP(self, f_estimated, f_real, p):
 42        """
 43        Computes the Lp norm error between estimated and real frequencies.
 44        
 45        Args:
 46            f_estimated (pd.DataFrame): Estimated frequency distribution.
 47            f_real (pd.DataFrame): Real frequency distribution.
 48            p (float): Order of the Lp norm.
 49        
 50        Returns:
 51            float: Computed Lp error.
 52        """
 53        merged = f_estimated.merge(f_real, on="Element", suffixes=("_estimated", "_real"))
 54        return (1 / self.N) * sum(abs(row["Frequency_estimated"] - row["Frequency_real"]) ** p for _, row in merged.iterrows())
 55
 56    def run_command(self, e):
 57        """
 58        Runs the selected privacy algorithm with a given privacy budget `e`.
 59        
 60        Args:
 61            e (float): Privacy parameter.
 62        
 63        Returns:
 64            tuple: Containing result, data table, error table, privatized data, and estimated frequencies.
 65        """
 66        if self.algorithm == '1':
 67            result, data_table, error_table, privatized_data, df_estimated = run_private_cms_client(self.k, self.m, e, self.df)
 68        elif self.algorithm == '2':
 69            result, data_table, error_table, privatized_data, df_estimated = run_private_hcms_client(self.k, self.m, e, self.df)
 70        
 71        self.frequency_estimation = df_estimated
 72        return result, data_table, error_table, privatized_data
 73
 74    def get_real_frequency(self):
 75        """
 76        Computes the real frequency distribution from the dataset.
 77        
 78        Returns:
 79            pd.DataFrame: DataFrame with element frequencies.
 80        """
 81        count = self.df['value'].value_counts().reset_index()
 82        return count.rename(columns={'value': 'Element', 'count': 'Frequency'})
 83
 84    def frequencies(self):
 85        """
 86        Returns both the estimated and real frequency distributions.
 87        
 88        Returns:
 89            tuple: Estimated frequency and real frequency DataFrames.
 90        """
 91        return self.frequency_estimation, self.get_real_frequency()
 92
 93    def optimize_e_with_optuna(self, target_error, p, metric, n_trials):
 94        """
 95        Optimizes the privacy parameter `ϵ` using Optuna to reach a target error.
 96        
 97        Args:
 98            target_error (float): Desired error value.
 99            p (float): Order of the Lp norm.
100            metric (str): Metric type (1 = MSE, 2 = Lp norm, 3 = Percentage Error).
101        
102        Returns:
103            tuple: Best `ϵ`, privatized data, error table, result, and data table.
104        """
105        def objective(trial):
106            e = trial.suggest_float('e', 0.01, 20, step = 0.01)
107            result, data_table, error_table, privatized_data = self.run_command(e)
108
109            trial.set_user_attr('result', result)
110            trial.set_user_attr('privatized_data', privatized_data)
111            trial.set_user_attr('error_table', error_table)
112            trial.set_user_attr('data_table', data_table)
113
114            print(tabulate(data_table, headers=self.headers, tablefmt="grid"))
115    
116            if metric == "1" or metric == "2":
117                Lp_target = self.function_LP(self.frequency_estimation, self.get_real_frequency(), p)
118            elif metric == "3":
119                Lp_target = (self.function_LP(self.frequency_estimation, self.get_real_frequency(), p) / self.N) * 100
120            
121            # Minimize the diference: LP - target_error
122            return abs(Lp_target - target_error)
123
124        study = optuna.create_study(direction='minimize') # minimize the difference
125        study.optimize(objective, n_trials=n_trials)
126
127        best_e = study.best_params['e']
128        privatized_data = study.best_trial.user_attrs['privatized_data']
129        error_table = study.best_trial.user_attrs['error_table']
130        result = study.best_trial.user_attrs['result']
131        data_table = study.best_trial.user_attrs['data_table']
132
133        print("\n================ e Optimization finished ====================")
134        print(f"Best value of ϵ: {best_e}")
135        print(f"Closest error (LP - target_error): {study.best_value}")
136        
137        return best_e, privatized_data, error_table, result, data_table
138
139    def utility_error(self, Lp, p, metric, n_trials=20):
140        """
141        Optimizes the privacy parameter `ϵ` for utility preservation.
142        
143        Args:
144            Lp (float): Target error value.
145            p (float): Order of the Lp norm.
146            metric (str): Metric type (1 = MSE, 2 = Lp norm, 3 = Percentage Error).
147        
148        Returns:
149            tuple: Optimized `ϵ`, result, and privatized data.
150        """
151        e, privatized_data, error_table, result, data_table = self.optimize_e_with_optuna(Lp, p, metric, n_trials) # Adjust the value of e to reach the desired error
152
153        print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid")) # Show database with the e
154
155        option = input("Are you satisfied with the results? (yes/no): ") # Ask the user if he is satisfied with the results
156        if option == "no":
157            self.utility_error(Lp, p, metric)
158        else:
159            print(f"\nError metrics for parameters k={self.k}, m={self.m} and ϵ={e}")
160            print(tabulate(error_table, tablefmt="fancy_grid"))
161
162        return e, result, privatized_data, data_table
163
164    def privacy_error(self):
165        """
166        Optimizes the privacy parameter `e` for privacy preservation.
167        
168        Returns:
169            tuple: Optimized `e`, result, and privatized data.
170        """
171        from individual_method import main
172        
173        p = float(input("\n→ Enter the type of error ρ: "))
174
175        error_table = []
176        error_table_fav = []
177        privatized_fav = None
178
179        while True:
180            e_min = input(f"→ Enter the {Style.BRIGHT}minimum{Style.RESET_ALL} value of ϵ: ")
181            e_max = input(f"→ Enter the {Style.BRIGHT}maximum{Style.RESET_ALL} value of ϵ: ")
182            step = input(f"→ Enter the {Style.BRIGHT}step{Style.RESET_ALL} value: ")
183
184            saved_e = 0
185
186            for e in range(int(e_min), int(e_max), int(step)): # Optimize e
187                result, data_table, error_table, privatized_data = self.run_command(e)
188                f_estimated, f_real = self.frequencies()
189                error = self.function_LP(f_estimated, f_real, p)
190
191                print(f"\nError for ϵ = {e}: {error}")
192                print(tabulate(data_table, headers=self.headers, tablefmt="grid"))
193
194                save = input("Do you want to save this privatized values? (yes/no): ")
195                if save == "yes":
196                    saved_e = e
197                    H_fav = result
198                    error_table_fav = error_table
199                    privatized_fav = privatized_data
200            print(f"\nOptimization finished:{Fore.RED} What do you want to do?{Style.RESET_ALL}")
201            choice = input("\n1. Change e\n2. Change k or m\n3. Continue\nSelect: ")
202            if choice == "2":
203                main(2)
204                break
205            elif choice == "3":
206                break
207        
208        if saved_e == 0:
209            e = input("Enter the value of ϵ to use: ")
210            
211            H_fav, data_table, error_table_fav, privatized_fav = self.run_command(e)
212            print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid")) # Show database with the e
213        else:
214            print(f"Using the saved value of ϵ: {saved_e}")
215
216        option = input("Are you satisfied with the results? (yes/no): ")
217        if option == "no":
218            self.privacy_error()
219        else:
220            print(f"\nError metrics for k={self.k}, m={self.m}, e={saved_e}")
221            print(tabulate(error_table_fav, tablefmt="pretty"))
222
223            print("\nSending database to server ...")
224        return saved_e, H_fav, privatized_fav
225
226    def run(self):
227        """
228        Main execution function. Asks the user to choose between utility and privacy optimization.
229        
230        Returns:
231            tuple: Optimized `e`, result, and privatized data.
232        """
233        e = 0
234        choice = input("Enter the optimization:\n1. Utility\n2. Privacy\nSelect: ")
235        if choice == "1":
236            print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for utility ...{Style.RESET_ALL}")
237            metric = input("Enter the metric to optimize \n1. MSE\n2. LP\n3. Porcentual Error \nSelect: ")
238            if metric == "1":
239                Lp = float(input("Enter the MSE to reach: "))
240                p = 2
241            elif metric == "2":
242                Lp = float(input("Enter the Lp to reach: "))
243                p = float(input("Enter the type of error (p): "))
244            elif metric == "3":
245                Lp = float(input("Enter the Porcentual Error to reach: "))
246                p = 1
247            n_trials = int(input("Enter the number of trials: "))
248            e, result, privatized_data, _ = self.utility_error(Lp, p, metric, n_trials)
249        elif choice == "2":
250            print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for privacy ...{Style.RESET_ALL}")
251            e, result, privatized_data = self.privacy_error()
252        else:
253            print("Invalid choice. Please try again.")
254        return e, result, privatized_data
255
256    
def run_parameter_fitting(df, k, m, algorithm):
    """
    Builds a PrivacyUtilityOptimizer for the given inputs and runs it.

    Args:
        df (pd.DataFrame): Input dataset.
        k (int): Parameter k for the selected algorithm.
        m (int): Parameter m for the selected algorithm.
        algorithm (str): Algorithm choice (1 for CMS, 2 for HCMS).

    Returns:
        tuple: Optimized `e`, result, and privatized data, exactly as
        returned by `PrivacyUtilityOptimizer.run`.
    """
    best_e, outcome, private_values = PrivacyUtilityOptimizer(df, k, m, algorithm).run()
    return best_e, outcome, private_values
273
274    
class PrivacyUtilityOptimizer:
 10class PrivacyUtilityOptimizer:
 11    """
 12    Optimizes the privacy-utility tradeoff by tuning the privacy parameter `e`.
 13    
 14    Attributes:
 15        df (pd.DataFrame): Input dataset containing values to be privatized.
 16        algorithm (str): Selected privacy algorithm (1 for CMS, 2 for HCMS).
 17        k (int): Parameter k for the selected algorithm.
 18        m (int): Parameter m for the selected algorithm.
 19        real_frequency (pd.DataFrame): True frequency distribution of elements in `df`.
 20        N (int): Total count of elements in `df`.
 21        headers (list): Column headers for displaying tabular results.
 22    """
 23    def __init__(self, df, k, m, algorithm):
 24        """
 25        Initializes the PrivacyUtilityOptimizer class with dataset and algorithm parameters.
 26
 27        Args:
 28            df (pd.DataFrame): Input dataset.
 29            k (int): Parameter k for the selected algorithm.
 30            m (int): Parameter m for the selected algorithm.
 31            algorithm (str): Algorithm choice (1 for CMS, 2 for HCMS).
 32        """
 33        self.df = df
 34        self.algorithm = algorithm
 35        self.k = k
 36        self.m = m
 37
 38        self.real_frequency = self.get_real_frequency()
 39        self.N = self.real_frequency['Frequency'].sum()
 40        self.headers =[ "Element", "Real Frequency", "Real Percentage", "Estimated Frequency", "Estimated Percentage", "Estimation Difference", "Percentage Error"]
 41    
 42
 43    def function_LP(self, f_estimated, f_real, p):
 44        """
 45        Computes the Lp norm error between estimated and real frequencies.
 46        
 47        Args:
 48            f_estimated (pd.DataFrame): Estimated frequency distribution.
 49            f_real (pd.DataFrame): Real frequency distribution.
 50            p (float): Order of the Lp norm.
 51        
 52        Returns:
 53            float: Computed Lp error.
 54        """
 55        merged = f_estimated.merge(f_real, on="Element", suffixes=("_estimated", "_real"))
 56        return (1 / self.N) * sum(abs(row["Frequency_estimated"] - row["Frequency_real"]) ** p for _, row in merged.iterrows())
 57
 58    def run_command(self, e):
 59        """
 60        Runs the selected privacy algorithm with a given privacy budget `e`.
 61        
 62        Args:
 63            e (float): Privacy parameter.
 64        
 65        Returns:
 66            tuple: Containing result, data table, error table, privatized data, and estimated frequencies.
 67        """
 68        if self.algorithm == '1':
 69            result, data_table, error_table, privatized_data, df_estimated = run_private_cms_client(self.k, self.m, e, self.df)
 70        elif self.algorithm == '2':
 71            result, data_table, error_table, privatized_data, df_estimated = run_private_hcms_client(self.k, self.m, e, self.df)
 72        
 73        self.frequency_estimation = df_estimated
 74        return result, data_table, error_table, privatized_data
 75
 76    def get_real_frequency(self):
 77        """
 78        Computes the real frequency distribution from the dataset.
 79        
 80        Returns:
 81            pd.DataFrame: DataFrame with element frequencies.
 82        """
 83        count = self.df['value'].value_counts().reset_index()
 84        return count.rename(columns={'value': 'Element', 'count': 'Frequency'})
 85
 86    def frequencies(self):
 87        """
 88        Returns both the estimated and real frequency distributions.
 89        
 90        Returns:
 91            tuple: Estimated frequency and real frequency DataFrames.
 92        """
 93        return self.frequency_estimation, self.get_real_frequency()
 94
 95    def optimize_e_with_optuna(self, target_error, p, metric, n_trials):
 96        """
 97        Optimizes the privacy parameter `ϵ` using Optuna to reach a target error.
 98        
 99        Args:
100            target_error (float): Desired error value.
101            p (float): Order of the Lp norm.
102            metric (str): Metric type (1 = MSE, 2 = Lp norm, 3 = Percentage Error).
103        
104        Returns:
105            tuple: Best `ϵ`, privatized data, error table, result, and data table.
106        """
107        def objective(trial):
108            e = trial.suggest_float('e', 0.01, 20, step = 0.01)
109            result, data_table, error_table, privatized_data = self.run_command(e)
110
111            trial.set_user_attr('result', result)
112            trial.set_user_attr('privatized_data', privatized_data)
113            trial.set_user_attr('error_table', error_table)
114            trial.set_user_attr('data_table', data_table)
115
116            print(tabulate(data_table, headers=self.headers, tablefmt="grid"))
117    
118            if metric == "1" or metric == "2":
119                Lp_target = self.function_LP(self.frequency_estimation, self.get_real_frequency(), p)
120            elif metric == "3":
121                Lp_target = (self.function_LP(self.frequency_estimation, self.get_real_frequency(), p) / self.N) * 100
122            
123            # Minimize the diference: LP - target_error
124            return abs(Lp_target - target_error)
125
126        study = optuna.create_study(direction='minimize') # minimize the difference
127        study.optimize(objective, n_trials=n_trials)
128
129        best_e = study.best_params['e']
130        privatized_data = study.best_trial.user_attrs['privatized_data']
131        error_table = study.best_trial.user_attrs['error_table']
132        result = study.best_trial.user_attrs['result']
133        data_table = study.best_trial.user_attrs['data_table']
134
135        print("\n================ e Optimization finished ====================")
136        print(f"Best value of ϵ: {best_e}")
137        print(f"Closest error (LP - target_error): {study.best_value}")
138        
139        return best_e, privatized_data, error_table, result, data_table
140
141    def utility_error(self, Lp, p, metric, n_trials=20):
142        """
143        Optimizes the privacy parameter `ϵ` for utility preservation.
144        
145        Args:
146            Lp (float): Target error value.
147            p (float): Order of the Lp norm.
148            metric (str): Metric type (1 = MSE, 2 = Lp norm, 3 = Percentage Error).
149        
150        Returns:
151            tuple: Optimized `ϵ`, result, and privatized data.
152        """
153        e, privatized_data, error_table, result, data_table = self.optimize_e_with_optuna(Lp, p, metric, n_trials) # Adjust the value of e to reach the desired error
154
155        print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid")) # Show database with the e
156
157        option = input("Are you satisfied with the results? (yes/no): ") # Ask the user if he is satisfied with the results
158        if option == "no":
159            self.utility_error(Lp, p, metric)
160        else:
161            print(f"\nError metrics for parameters k={self.k}, m={self.m} and ϵ={e}")
162            print(tabulate(error_table, tablefmt="fancy_grid"))
163
164        return e, result, privatized_data, data_table
165
166    def privacy_error(self):
167        """
168        Optimizes the privacy parameter `e` for privacy preservation.
169        
170        Returns:
171            tuple: Optimized `e`, result, and privatized data.
172        """
173        from individual_method import main
174        
175        p = float(input("\n→ Enter the type of error ρ: "))
176
177        error_table = []
178        error_table_fav = []
179        privatized_fav = None
180
181        while True:
182            e_min = input(f"→ Enter the {Style.BRIGHT}minimum{Style.RESET_ALL} value of ϵ: ")
183            e_max = input(f"→ Enter the {Style.BRIGHT}maximum{Style.RESET_ALL} value of ϵ: ")
184            step = input(f"→ Enter the {Style.BRIGHT}step{Style.RESET_ALL} value: ")
185
186            saved_e = 0
187
188            for e in range(int(e_min), int(e_max), int(step)): # Optimize e
189                result, data_table, error_table, privatized_data = self.run_command(e)
190                f_estimated, f_real = self.frequencies()
191                error = self.function_LP(f_estimated, f_real, p)
192
193                print(f"\nError for ϵ = {e}: {error}")
194                print(tabulate(data_table, headers=self.headers, tablefmt="grid"))
195
196                save = input("Do you want to save this privatized values? (yes/no): ")
197                if save == "yes":
198                    saved_e = e
199                    H_fav = result
200                    error_table_fav = error_table
201                    privatized_fav = privatized_data
202            print(f"\nOptimization finished:{Fore.RED} What do you want to do?{Style.RESET_ALL}")
203            choice = input("\n1. Change e\n2. Change k or m\n3. Continue\nSelect: ")
204            if choice == "2":
205                main(2)
206                break
207            elif choice == "3":
208                break
209        
210        if saved_e == 0:
211            e = input("Enter the value of ϵ to use: ")
212            
213            H_fav, data_table, error_table_fav, privatized_fav = self.run_command(e)
214            print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid")) # Show database with the e
215        else:
216            print(f"Using the saved value of ϵ: {saved_e}")
217
218        option = input("Are you satisfied with the results? (yes/no): ")
219        if option == "no":
220            self.privacy_error()
221        else:
222            print(f"\nError metrics for k={self.k}, m={self.m}, e={saved_e}")
223            print(tabulate(error_table_fav, tablefmt="pretty"))
224
225            print("\nSending database to server ...")
226        return saved_e, H_fav, privatized_fav
227
228    def run(self):
229        """
230        Main execution function. Asks the user to choose between utility and privacy optimization.
231        
232        Returns:
233            tuple: Optimized `e`, result, and privatized data.
234        """
235        e = 0
236        choice = input("Enter the optimization:\n1. Utility\n2. Privacy\nSelect: ")
237        if choice == "1":
238            print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for utility ...{Style.RESET_ALL}")
239            metric = input("Enter the metric to optimize \n1. MSE\n2. LP\n3. Porcentual Error \nSelect: ")
240            if metric == "1":
241                Lp = float(input("Enter the MSE to reach: "))
242                p = 2
243            elif metric == "2":
244                Lp = float(input("Enter the Lp to reach: "))
245                p = float(input("Enter the type of error (p): "))
246            elif metric == "3":
247                Lp = float(input("Enter the Porcentual Error to reach: "))
248                p = 1
249            n_trials = int(input("Enter the number of trials: "))
250            e, result, privatized_data, _ = self.utility_error(Lp, p, metric, n_trials)
251        elif choice == "2":
252            print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for privacy ...{Style.RESET_ALL}")
253            e, result, privatized_data = self.privacy_error()
254        else:
255            print("Invalid choice. Please try again.")
256        return e, result, privatized_data

Optimizes the privacy-utility tradeoff by tuning the privacy parameter e.

Attributes: df (pd.DataFrame): Input dataset containing values to be privatized. algorithm (str): Selected privacy algorithm (1 for CMS, 2 for HCMS). k (int): Parameter k for the selected algorithm. m (int): Parameter m for the selected algorithm. real_frequency (pd.DataFrame): True frequency distribution of elements in df. N (int): Total count of elements in df. headers (list): Column headers for displaying tabular results.

PrivacyUtilityOptimizer(df, k, m, algorithm)
23    def __init__(self, df, k, m, algorithm):
24        """
25        Initializes the PrivacyUtilityOptimizer class with dataset and algorithm parameters.
26
27        Args:
28            df (pd.DataFrame): Input dataset.
29            k (int): Parameter k for the selected algorithm.
30            m (int): Parameter m for the selected algorithm.
31            algorithm (str): Algorithm choice (1 for CMS, 2 for HCMS).
32        """
33        self.df = df
34        self.algorithm = algorithm
35        self.k = k
36        self.m = m
37
38        self.real_frequency = self.get_real_frequency()
39        self.N = self.real_frequency['Frequency'].sum()
40        self.headers =[ "Element", "Real Frequency", "Real Percentage", "Estimated Frequency", "Estimated Percentage", "Estimation Difference", "Percentage Error"]

Initializes the PrivacyUtilityOptimizer class with dataset and algorithm parameters.

Args: df (pd.DataFrame): Input dataset. k (int): Parameter k for the selected algorithm. m (int): Parameter m for the selected algorithm. algorithm (str): Algorithm choice (1 for CMS, 2 for HCMS).

df
algorithm
k
m
real_frequency
N
headers
def function_LP(self, f_estimated, f_real, p):
43    def function_LP(self, f_estimated, f_real, p):
44        """
45        Computes the Lp norm error between estimated and real frequencies.
46        
47        Args:
48            f_estimated (pd.DataFrame): Estimated frequency distribution.
49            f_real (pd.DataFrame): Real frequency distribution.
50            p (float): Order of the Lp norm.
51        
52        Returns:
53            float: Computed Lp error.
54        """
55        merged = f_estimated.merge(f_real, on="Element", suffixes=("_estimated", "_real"))
56        return (1 / self.N) * sum(abs(row["Frequency_estimated"] - row["Frequency_real"]) ** p for _, row in merged.iterrows())

Computes the Lp norm error between estimated and real frequencies.

Args: f_estimated (pd.DataFrame): Estimated frequency distribution. f_real (pd.DataFrame): Real frequency distribution. p (float): Order of the Lp norm.

Returns: float: Computed Lp error.

def run_command(self, e):
58    def run_command(self, e):
59        """
60        Runs the selected privacy algorithm with a given privacy budget `e`.
61        
62        Args:
63            e (float): Privacy parameter.
64        
65        Returns:
66            tuple: Containing result, data table, error table, privatized data, and estimated frequencies.
67        """
68        if self.algorithm == '1':
69            result, data_table, error_table, privatized_data, df_estimated = run_private_cms_client(self.k, self.m, e, self.df)
70        elif self.algorithm == '2':
71            result, data_table, error_table, privatized_data, df_estimated = run_private_hcms_client(self.k, self.m, e, self.df)
72        
73        self.frequency_estimation = df_estimated
74        return result, data_table, error_table, privatized_data

Runs the selected privacy algorithm with a given privacy budget e.

Args: e (float): Privacy parameter.

Returns: tuple: Containing result, data table, error table, and privatized data. The estimated frequencies are not returned; they are stored on the instance as frequency_estimation.

def get_real_frequency(self):
76    def get_real_frequency(self):
77        """
78        Computes the real frequency distribution from the dataset.
79        
80        Returns:
81            pd.DataFrame: DataFrame with element frequencies.
82        """
83        count = self.df['value'].value_counts().reset_index()
84        return count.rename(columns={'value': 'Element', 'count': 'Frequency'})

Computes the real frequency distribution from the dataset.

Returns: pd.DataFrame: DataFrame with element frequencies.

def frequencies(self):
86    def frequencies(self):
87        """
88        Returns both the estimated and real frequency distributions.
89        
90        Returns:
91            tuple: Estimated frequency and real frequency DataFrames.
92        """
93        return self.frequency_estimation, self.get_real_frequency()

Returns both the estimated and real frequency distributions.

Returns: tuple: Estimated frequency and real frequency DataFrames.

def optimize_e_with_optuna(self, target_error, p, metric, n_trials):
 95    def optimize_e_with_optuna(self, target_error, p, metric, n_trials):
 96        """
 97        Optimizes the privacy parameter `ϵ` using Optuna to reach a target error.
 98        
 99        Args:
100            target_error (float): Desired error value.
101            p (float): Order of the Lp norm.
102            metric (str): Metric type (1 = MSE, 2 = Lp norm, 3 = Percentage Error).
103        
104        Returns:
105            tuple: Best `ϵ`, privatized data, error table, result, and data table.
106        """
107        def objective(trial):
108            e = trial.suggest_float('e', 0.01, 20, step = 0.01)
109            result, data_table, error_table, privatized_data = self.run_command(e)
110
111            trial.set_user_attr('result', result)
112            trial.set_user_attr('privatized_data', privatized_data)
113            trial.set_user_attr('error_table', error_table)
114            trial.set_user_attr('data_table', data_table)
115
116            print(tabulate(data_table, headers=self.headers, tablefmt="grid"))
117    
118            if metric == "1" or metric == "2":
119                Lp_target = self.function_LP(self.frequency_estimation, self.get_real_frequency(), p)
120            elif metric == "3":
121                Lp_target = (self.function_LP(self.frequency_estimation, self.get_real_frequency(), p) / self.N) * 100
122            
123            # Minimize the diference: LP - target_error
124            return abs(Lp_target - target_error)
125
126        study = optuna.create_study(direction='minimize') # minimize the difference
127        study.optimize(objective, n_trials=n_trials)
128
129        best_e = study.best_params['e']
130        privatized_data = study.best_trial.user_attrs['privatized_data']
131        error_table = study.best_trial.user_attrs['error_table']
132        result = study.best_trial.user_attrs['result']
133        data_table = study.best_trial.user_attrs['data_table']
134
135        print("\n================ e Optimization finished ====================")
136        print(f"Best value of ϵ: {best_e}")
137        print(f"Closest error (LP - target_error): {study.best_value}")
138        
139        return best_e, privatized_data, error_table, result, data_table

Optimizes the privacy parameter ϵ using Optuna to reach a target error.

Args: target_error (float): Desired error value. p (float): Order of the Lp norm. metric (str): Metric type (1 = MSE, 2 = Lp norm, 3 = Percentage Error). n_trials (int): Number of Optuna trials to run.

Returns: tuple: Best ϵ, privatized data, error table, result, and data table.

def utility_error(self, Lp, p, metric, n_trials=20):
141    def utility_error(self, Lp, p, metric, n_trials=20):
142        """
143        Optimizes the privacy parameter `ϵ` for utility preservation.
144        
145        Args:
146            Lp (float): Target error value.
147            p (float): Order of the Lp norm.
148            metric (str): Metric type (1 = MSE, 2 = Lp norm, 3 = Percentage Error).
149        
150        Returns:
151            tuple: Optimized `ϵ`, result, and privatized data.
152        """
153        e, privatized_data, error_table, result, data_table = self.optimize_e_with_optuna(Lp, p, metric, n_trials) # Adjust the value of e to reach the desired error
154
155        print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid")) # Show database with the e
156
157        option = input("Are you satisfied with the results? (yes/no): ") # Ask the user if he is satisfied with the results
158        if option == "no":
159            self.utility_error(Lp, p, metric)
160        else:
161            print(f"\nError metrics for parameters k={self.k}, m={self.m} and ϵ={e}")
162            print(tabulate(error_table, tablefmt="fancy_grid"))
163
164        return e, result, privatized_data, data_table

Optimizes the privacy parameter ϵ for utility preservation.

Args: Lp (float): Target error value. p (float): Order of the Lp norm. metric (str): Metric type (1 = MSE, 2 = Lp norm, 3 = Percentage Error).

Returns: tuple: Optimized ϵ, result, privatized data, and data table.

def privacy_error(self):
166    def privacy_error(self):
167        """
168        Optimizes the privacy parameter `e` for privacy preservation.
169        
170        Returns:
171            tuple: Optimized `e`, result, and privatized data.
172        """
173        from individual_method import main
174        
175        p = float(input("\n→ Enter the type of error ρ: "))
176
177        error_table = []
178        error_table_fav = []
179        privatized_fav = None
180
181        while True:
182            e_min = input(f"→ Enter the {Style.BRIGHT}minimum{Style.RESET_ALL} value of ϵ: ")
183            e_max = input(f"→ Enter the {Style.BRIGHT}maximum{Style.RESET_ALL} value of ϵ: ")
184            step = input(f"→ Enter the {Style.BRIGHT}step{Style.RESET_ALL} value: ")
185
186            saved_e = 0
187
188            for e in range(int(e_min), int(e_max), int(step)): # Optimize e
189                result, data_table, error_table, privatized_data = self.run_command(e)
190                f_estimated, f_real = self.frequencies()
191                error = self.function_LP(f_estimated, f_real, p)
192
193                print(f"\nError for ϵ = {e}: {error}")
194                print(tabulate(data_table, headers=self.headers, tablefmt="grid"))
195
196                save = input("Do you want to save this privatized values? (yes/no): ")
197                if save == "yes":
198                    saved_e = e
199                    H_fav = result
200                    error_table_fav = error_table
201                    privatized_fav = privatized_data
202            print(f"\nOptimization finished:{Fore.RED} What do you want to do?{Style.RESET_ALL}")
203            choice = input("\n1. Change e\n2. Change k or m\n3. Continue\nSelect: ")
204            if choice == "2":
205                main(2)
206                break
207            elif choice == "3":
208                break
209        
210        if saved_e == 0:
211            e = input("Enter the value of ϵ to use: ")
212            
213            H_fav, data_table, error_table_fav, privatized_fav = self.run_command(e)
214            print(tabulate(data_table, headers=self.headers, tablefmt="fancy_grid")) # Show database with the e
215        else:
216            print(f"Using the saved value of ϵ: {saved_e}")
217
218        option = input("Are you satisfied with the results? (yes/no): ")
219        if option == "no":
220            self.privacy_error()
221        else:
222            print(f"\nError metrics for k={self.k}, m={self.m}, e={saved_e}")
223            print(tabulate(error_table_fav, tablefmt="pretty"))
224
225            print("\nSending database to server ...")
226        return saved_e, H_fav, privatized_fav

Optimizes the privacy parameter e for privacy preservation.

Returns: tuple: Optimized e, result, and privatized data.

def run(self):
228    def run(self):
229        """
230        Main execution function. Asks the user to choose between utility and privacy optimization.
231        
232        Returns:
233            tuple: Optimized `e`, result, and privatized data.
234        """
235        e = 0
236        choice = input("Enter the optimization:\n1. Utility\n2. Privacy\nSelect: ")
237        if choice == "1":
238            print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for utility ...{Style.RESET_ALL}")
239            metric = input("Enter the metric to optimize \n1. MSE\n2. LP\n3. Porcentual Error \nSelect: ")
240            if metric == "1":
241                Lp = float(input("Enter the MSE to reach: "))
242                p = 2
243            elif metric == "2":
244                Lp = float(input("Enter the Lp to reach: "))
245                p = float(input("Enter the type of error (p): "))
246            elif metric == "3":
247                Lp = float(input("Enter the Porcentual Error to reach: "))
248                p = 1
249            n_trials = int(input("Enter the number of trials: "))
250            e, result, privatized_data, _ = self.utility_error(Lp, p, metric, n_trials)
251        elif choice == "2":
252            print(f"\n{Fore.GREEN}🔎 Optimizing ϵ for privacy ...{Style.RESET_ALL}")
253            e, result, privatized_data = self.privacy_error()
254        else:
255            print("Invalid choice. Please try again.")
256        return e, result, privatized_data

Main execution function. Asks the user to choose between utility and privacy optimization.

Returns: tuple: Optimized e, result, and privatized data.

def run_parameter_fitting(df, k, m, algorithm):
def run_parameter_fitting(df, k, m, algorithm):
    """
    Builds a PrivacyUtilityOptimizer for the given inputs and runs it.

    Args:
        df (pd.DataFrame): Input dataset.
        k (int): Parameter k for the selected algorithm.
        m (int): Parameter m for the selected algorithm.
        algorithm (str): Algorithm choice (1 for CMS, 2 for HCMS).

    Returns:
        tuple: Optimized `e`, result, and privatized data, exactly as
        produced by `PrivacyUtilityOptimizer.run`.
    """
    outcome = PrivacyUtilityOptimizer(df, k, m, algorithm).run()
    # Unpack into exactly three values so an unexpected shape still fails here.
    epsilon, fitted_result, private_values = outcome
    return epsilon, fitted_result, private_values

Initializes and runs the PrivacyUtilityOptimizer with the given parameters.

Args: df (pd.DataFrame): Input dataset. k (int): Parameter k for the selected algorithm. m (int): Parameter m for the selected algorithm. algorithm (str): Algorithm choice (1 for CMS, 2 for HCMS).

Returns: tuple: Optimized e, result, and privatized data.