src.private_count_mean.private_cms_client
import random
import numpy as np
from sympy import primerange
import pandas as pd
from rich.progress import Progress

from utils.utils import load_dataset, generate_hash_functions, display_results, generate_error_table


class privateCMSClient:
    """
    Single-process simulation of the private Count-Mean Sketch (CMS) protocol.

    Each record is encoded as a vector of -1s of length ``m`` with a single
    +1 at its hashed position, and every entry is independently sign-flipped
    for privacy. The server de-biases the reports, accumulates them in a
    ``k x m`` sketch and estimates frequencies by averaging over the ``k``
    hash rows.

    Attributes:
        df: DataFrame containing the dataset (read through its 'value' column).
        epsilon: Privacy parameter for the privatization process.
        k: Number of hash functions (rows of the sketch).
        m: Number of columns of the sketch.
        dataset: List of values from the dataset.
        domain: Unique values in the dataset.
        N: Total number of elements in the dataset.
        M: Sketch matrix of shape (k, m).
        client_matrix: List of (privatized vector, hash index) reports.
        H: List of hash functions.
    """

    def __init__(self, epsilon, k, m, df):
        """
        Initializes the simulation state and draws the hash family.

        Args:
            epsilon (float): Privacy parameter; must be > 0.
            k (int): Number of hash functions; must be >= 1.
            m (int): Number of sketch columns; must be >= 2.
            df (DataFrame): Dataset to be processed; needs a 'value' column.

        Raises:
            ValueError: If ``epsilon``, ``k`` or ``m`` is out of range.
        """
        # Fail fast: epsilon == 0 makes the de-biasing constant infinite,
        # m == 1 divides by zero in the estimator, and k < 1 leaves no hash
        # function to pick from.
        if epsilon <= 0:
            raise ValueError("epsilon must be > 0")
        if k < 1:
            raise ValueError("k must be >= 1")
        if m < 2:
            raise ValueError("m must be >= 2")

        self.df = df
        self.epsilon = epsilon
        self.k = k
        self.m = m
        self.dataset = self.df['value'].tolist()
        self.domain = self.df['value'].unique().tolist()
        self.N = len(self.dataset)

        # Sketch accumulator, one row per hash function
        self.M = np.zeros((self.k, self.m))

        # Every report produced by client() is also kept here
        self.client_matrix = []

        # A prime drawn uniformly from [10**6, 10**7) parameterizes the hash
        # family. NOTE(review): the literal 3 is forwarded as-is; presumably
        # the independence degree of the family - confirm in utils.utils.
        p = random.choice(list(primerange(10**6, 10**7)))
        self.H = generate_hash_functions(self.k, p, 3, self.m)

    def bernoulli_vector(self):
        """
        Draws the random sign vector used to privatize one report.

        Each entry is +1 with probability e^(eps/2) / (e^(eps/2) + 1) and -1
        otherwise.

        Returns:
            numpy.ndarray: Vector of length ``m`` with values -1 and 1.
        """
        # Logistic form of e^x / (e^x + 1); unlike the direct quotient it
        # does not turn into inf/inf = nan when epsilon is large.
        p_keep = 1.0 / (1.0 + np.exp(-self.epsilon / 2))
        draws = np.random.binomial(1, p_keep, self.m)
        return 2 * draws - 1  # map {0, 1} -> {-1, +1}

    def client(self, d):
        """
        Produces one privatized report for a single element.

        Args:
            d (element): The element to encode.

        Returns:
            tuple: The privatized vector and the index of the hash function
            that was chosen at random.
        """
        j = random.randint(0, self.k - 1)
        v = np.full(self.m, -1)
        v[self.H[j](d)] = 1
        report = (v * self.bernoulli_vector(), j)
        # The report is stored on the instance as well as returned
        self.client_matrix.append(report)
        return report

    def update_sketch_matrix(self, v, j):
        """
        De-biases one privatized report and adds it to row ``j`` of the sketch.

        Args:
            v (numpy.ndarray): The privatized vector (length ``m``).
            j (int): The index of the hash function used for ``v``.
        """
        # c_e = (e^(eps/2) + 1) / (e^(eps/2) - 1) = coth(eps / 4); the tanh
        # form stays finite for large epsilon.
        c_e = 1.0 / np.tanh(self.epsilon / 4)
        x = self.k * ((c_e / 2) * np.asarray(v) + 0.5)
        self.M[j] += x

    def estimate_client(self, d):
        """
        Estimates the frequency of an element from the sketch.

        Args:
            d (element): The element whose frequency is estimated.

        Returns:
            float: The estimated frequency of the element.
        """
        total = sum(self.M[i, self.H[i](d)] for i in range(self.k))
        # Average over the rows, then remove the expected collision mass
        return (self.m / (self.m - 1)) * ((total / self.k) - (self.N / self.m))

    def execute_client(self):
        """
        Runs ``client`` on every element of the dataset.

        Returns:
            list: One (privatized vector, hash index) tuple per element.
        """
        privatized_data = []
        with Progress() as progress:
            bar = progress.add_task("Processing client data", total=len(self.dataset))
            for d in self.dataset:
                privatized_data.append(self.client(d))
                progress.update(bar, advance=1)
        return privatized_data

    def server_simulator(self, privatized_data):
        """
        Accumulates the reports into the sketch and estimates all frequencies.

        The sketch is not reset first, so repeated calls keep adding to it.

        Args:
            privatized_data (list): (privatized vector, hash index) tuples.

        Returns:
            tuple: A dict mapping each domain element to its estimated
            frequency, and the list of hash functions.
        """
        F_estimated = {}
        with Progress() as progress:
            bar = progress.add_task('Update sketch matrix', total=len(privatized_data))
            for data in privatized_data:
                self.update_sketch_matrix(data[0], data[1])
                progress.update(bar, advance=1)

            bar = progress.add_task('Estimate frequencies', total=len(self.domain))
            for x in self.domain:
                F_estimated[x] = self.estimate_client(x)
                progress.update(bar, advance=1)

        return F_estimated, self.H


def run_private_cms_client(k, m, e, df):
    """
    Runs the private Count-Mean Sketch simulation end to end.

    Args:
        k (int): Number of hash functions.
        m (int): Number of sketch columns.
        e (float): Privacy parameter.
        df (DataFrame): Dataset to be processed.

    Returns:
        tuple: The hash functions, the results table, the error table, the
        privatized data, and the estimated frequency DataFrame.
    """
    # Initialize the private Count-Mean Sketch
    PCMS = privateCMSClient(e, k, m, df)

    # Client side: privatize every record
    privatized_data = PCMS.execute_client()

    # Server side: aggregate the reports and estimate frequencies
    f_estimated, H = PCMS.server_simulator(privatized_data)

    # Tabulate the estimates (nothing is written to disk here)
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    # display_results returns the results table and the error table
    data_table, error_table = display_results(df, f_estimated)

    return H, data_table, error_table, privatized_data, df_estimated
class privateCMSClient:
    """
    Single-process simulation of the private Count-Mean Sketch (CMS) protocol.

    Each record is encoded as a vector of -1s of length ``m`` with a single
    +1 at its hashed position, and every entry is independently sign-flipped
    for privacy. The server de-biases the reports, accumulates them in a
    ``k x m`` sketch and estimates frequencies by averaging over the ``k``
    hash rows.

    Attributes:
        df: DataFrame containing the dataset (read through its 'value' column).
        epsilon: Privacy parameter for the privatization process.
        k: Number of hash functions (rows of the sketch).
        m: Number of columns of the sketch.
        dataset: List of values from the dataset.
        domain: Unique values in the dataset.
        N: Total number of elements in the dataset.
        M: Sketch matrix of shape (k, m).
        client_matrix: List of (privatized vector, hash index) reports.
        H: List of hash functions.
    """

    def __init__(self, epsilon, k, m, df):
        """
        Initializes the simulation state and draws the hash family.

        Args:
            epsilon (float): Privacy parameter; must be > 0.
            k (int): Number of hash functions; must be >= 1.
            m (int): Number of sketch columns; must be >= 2.
            df (DataFrame): Dataset to be processed; needs a 'value' column.

        Raises:
            ValueError: If ``epsilon``, ``k`` or ``m`` is out of range.
        """
        # Fail fast: epsilon == 0 makes the de-biasing constant infinite,
        # m == 1 divides by zero in the estimator, and k < 1 leaves no hash
        # function to pick from.
        if epsilon <= 0:
            raise ValueError("epsilon must be > 0")
        if k < 1:
            raise ValueError("k must be >= 1")
        if m < 2:
            raise ValueError("m must be >= 2")

        self.df = df
        self.epsilon = epsilon
        self.k = k
        self.m = m
        self.dataset = self.df['value'].tolist()
        self.domain = self.df['value'].unique().tolist()
        self.N = len(self.dataset)

        # Sketch accumulator, one row per hash function
        self.M = np.zeros((self.k, self.m))

        # Every report produced by client() is also kept here
        self.client_matrix = []

        # A prime drawn uniformly from [10**6, 10**7) parameterizes the hash
        # family. NOTE(review): the literal 3 is forwarded as-is; presumably
        # the independence degree of the family - confirm in utils.utils.
        p = random.choice(list(primerange(10**6, 10**7)))
        self.H = generate_hash_functions(self.k, p, 3, self.m)

    def bernoulli_vector(self):
        """
        Draws the random sign vector used to privatize one report.

        Each entry is +1 with probability e^(eps/2) / (e^(eps/2) + 1) and -1
        otherwise.

        Returns:
            numpy.ndarray: Vector of length ``m`` with values -1 and 1.
        """
        # Logistic form of e^x / (e^x + 1); unlike the direct quotient it
        # does not turn into inf/inf = nan when epsilon is large.
        p_keep = 1.0 / (1.0 + np.exp(-self.epsilon / 2))
        draws = np.random.binomial(1, p_keep, self.m)
        return 2 * draws - 1  # map {0, 1} -> {-1, +1}

    def client(self, d):
        """
        Produces one privatized report for a single element.

        Args:
            d (element): The element to encode.

        Returns:
            tuple: The privatized vector and the index of the hash function
            that was chosen at random.
        """
        j = random.randint(0, self.k - 1)
        v = np.full(self.m, -1)
        v[self.H[j](d)] = 1
        report = (v * self.bernoulli_vector(), j)
        # The report is stored on the instance as well as returned
        self.client_matrix.append(report)
        return report

    def update_sketch_matrix(self, v, j):
        """
        De-biases one privatized report and adds it to row ``j`` of the sketch.

        Args:
            v (numpy.ndarray): The privatized vector (length ``m``).
            j (int): The index of the hash function used for ``v``.
        """
        # c_e = (e^(eps/2) + 1) / (e^(eps/2) - 1) = coth(eps / 4); the tanh
        # form stays finite for large epsilon.
        c_e = 1.0 / np.tanh(self.epsilon / 4)
        x = self.k * ((c_e / 2) * np.asarray(v) + 0.5)
        self.M[j] += x

    def estimate_client(self, d):
        """
        Estimates the frequency of an element from the sketch.

        Args:
            d (element): The element whose frequency is estimated.

        Returns:
            float: The estimated frequency of the element.
        """
        total = sum(self.M[i, self.H[i](d)] for i in range(self.k))
        # Average over the rows, then remove the expected collision mass
        return (self.m / (self.m - 1)) * ((total / self.k) - (self.N / self.m))

    def execute_client(self):
        """
        Runs ``client`` on every element of the dataset.

        Returns:
            list: One (privatized vector, hash index) tuple per element.
        """
        privatized_data = []
        with Progress() as progress:
            bar = progress.add_task("Processing client data", total=len(self.dataset))
            for d in self.dataset:
                privatized_data.append(self.client(d))
                progress.update(bar, advance=1)
        return privatized_data

    def server_simulator(self, privatized_data):
        """
        Accumulates the reports into the sketch and estimates all frequencies.

        The sketch is not reset first, so repeated calls keep adding to it.

        Args:
            privatized_data (list): (privatized vector, hash index) tuples.

        Returns:
            tuple: A dict mapping each domain element to its estimated
            frequency, and the list of hash functions.
        """
        F_estimated = {}
        with Progress() as progress:
            bar = progress.add_task('Update sketch matrix', total=len(privatized_data))
            for data in privatized_data:
                self.update_sketch_matrix(data[0], data[1])
                progress.update(bar, advance=1)

            bar = progress.add_task('Estimate frequencies', total=len(self.domain))
            for x in self.domain:
                F_estimated[x] = self.estimate_client(x)
                progress.update(bar, advance=1)

        return F_estimated, self.H
A class to represent the privatized Count-Mean Sketch (privateCMS) client.
Attributes: df: DataFrame containing the dataset. epsilon: Privacy parameter for the privatization process. k: Number of hash functions used in the CMS. m: Number of columns of the sketch matrix. dataset: List of values from the dataset. domain: Unique values in the dataset. N: Total number of elements in the dataset. M: Sketch matrix of shape (k, m). client_matrix: List of (privatized vector, hash index) tuples generated by the client. H: List of hash functions.
Methods: bernoulli_vector(): Generates a Bernoulli vector for privatization based on the epsilon value. client(d): Simulates the client side of the private CMS, returning a privatized sketch vector and the index of the hash function used. update_sketch_matrix(v, j): Updates the sketch matrix based on the privatized sketch vector. estimate_client(d): Estimates the frequency of an element using the private CMS sketch matrix. execute_client(): Simulates the client side of the private CMS for all elements in the dataset. server_simulator(privatized_data): Simulates the server side of the private CMS, processes the privatized data, and estimates frequencies.
def __init__(self, epsilon, k, m, df):
    """
    Initializes the privateCMSClient with the given parameters.

    Args:
        epsilon (float): Privacy parameter for the privatization process; must be > 0.
        k (int): Number of hash functions; must be >= 1.
        m (int): Size of the sketch matrix (number of columns); must be >= 2.
        df (DataFrame): Dataset to be processed; read through its 'value' column.

    Raises:
        ValueError: If ``epsilon``, ``k`` or ``m`` is out of range.
    """
    # Fail fast: epsilon == 0 makes the de-biasing constant infinite,
    # m == 1 divides by zero in the estimator, and k < 1 leaves no hash
    # function to pick from.
    if epsilon <= 0:
        raise ValueError("epsilon must be > 0")
    if k < 1:
        raise ValueError("k must be >= 1")
    if m < 2:
        raise ValueError("m must be >= 2")

    self.df = df
    self.epsilon = epsilon
    self.k = k
    self.m = m
    self.dataset = self.df['value'].tolist()
    self.domain = self.df['value'].unique().tolist()
    self.N = len(self.dataset)

    # Creation of the sketch matrix (one row per hash function)
    self.M = np.zeros((self.k, self.m))

    # List to store the privatized reports
    self.client_matrix = []

    # A prime drawn uniformly from [10**6, 10**7) parameterizes the hash
    # family. NOTE(review): the literal 3 is forwarded as-is; presumably
    # the independence degree of the family - confirm in utils.utils.
    p = random.choice(list(primerange(10**6, 10**7)))
    self.H = generate_hash_functions(self.k, p, 3, self.m)
Initializes the privateCMSClient with the given parameters.
Args: epsilon (float): Privacy parameter for the privatization process. k (int): Number of hash functions. m (int): Size of the sketch matrix. df (DataFrame): Dataset to be processed.
def bernoulli_vector(self):
    """
    Generates the random sign vector used for privatization.

    Each entry is +1 with probability e^(eps/2) / (e^(eps/2) + 1) and -1
    otherwise, where eps is ``self.epsilon``.

    Returns:
        numpy.ndarray: A vector of length ``self.m`` with values -1 and 1.
    """
    # Logistic form of e^x / (e^x + 1). The direct quotient overflows to
    # inf/inf = nan for large epsilon, which np.random.binomial rejects.
    p_keep = 1.0 / (1.0 + np.exp(-self.epsilon / 2))
    draws = np.random.binomial(1, p_keep, self.m)
    return 2 * draws - 1  # Convert 0 to -1
Generates a Bernoulli vector for privatization based on the epsilon value.
Returns: numpy.ndarray: A Bernoulli vector with values -1 and 1.
def client(self, d):
    """
    Produces one privatized report for a single element.

    A hash function is picked uniformly at random, the element is encoded
    as a vector of -1s with a single +1 at its hashed position, and the
    encoding is multiplied entry-wise by a fresh ``bernoulli_vector`` draw.
    The report is also appended to ``self.client_matrix``.

    Args:
        d (element): The element for which the report is generated.

    Returns:
        tuple: The privatized sketch vector and the index of the chosen hash function.
    """
    hash_index = random.randint(0, self.k - 1)

    encoded = np.full(self.m, -1)
    position = self.H[hash_index](d)
    encoded[position] = 1

    report = (encoded * self.bernoulli_vector(), hash_index)
    # Keep a copy of the report on the instance
    self.client_matrix.append(report)
    return report
Simulates the client side of the privatized Count-Mean Sketch.
Args: d (element): The element for which the privatized sketch vector is generated.
Returns: tuple: A tuple containing the privatized sketch vector and the index of the chosen hash function.
def update_sketch_matrix(self, v, j):
    """
    De-biases one privatized report and adds it to row ``j`` of the sketch.

    The contribution is ``k * (c_e / 2 * v + 1 / 2)`` with
    ``c_e = (e^(eps/2) + 1) / (e^(eps/2) - 1)``.

    Args:
        v (numpy.ndarray): The privatized sketch vector (length ``self.m``).
        j (int): The index of the selected hash function.
    """
    # c_e equals coth(eps / 4); the tanh form stays finite for large
    # epsilon, where exp(eps / 2) would overflow to inf and give nan.
    c_e = 1.0 / np.tanh(self.epsilon / 4)
    x = self.k * ((c_e / 2) * np.asarray(v) + 0.5)
    # One vectorized row update instead of a Python loop over m entries
    self.M[j] += x
Updates the sketch matrix based on the given privatized sketch vector.
Args: v (numpy.ndarray): The privatized sketch vector. j (int): The index of the selected hash function.
def estimate_client(self, d):
    """
    Estimates the frequency of an element from the sketch matrix.

    The cell selected by each hash function is read from its own row, the
    ``k`` readings are averaged, and the result is corrected with
    ``N / m`` and rescaled by ``m / (m - 1)``.

    Args:
        d (element): The element whose frequency is estimated.

    Returns:
        float: The estimated frequency of the element.
    """
    row_total = sum(self.M[row, self.H[row](d)] for row in range(self.k))

    row_mean = row_total / self.k
    collision_mass = self.N / self.m
    scale = self.m / (self.m - 1)
    return scale * (row_mean - collision_mass)
Estimates the frequency of an element based on the private CMS sketch matrix.
Args: d (element): The element whose frequency is estimated.
Returns: float: The estimated frequency of the element.
def execute_client(self):
    """
    Runs ``client`` on every element of the dataset, with a progress bar.

    Returns:
        list: One (privatized sketch vector, hash index) tuple per element,
        in dataset order.
    """
    reports = []
    with Progress() as progress:
        task = progress.add_task("Processing client data", total=len(self.dataset))

        for element in self.dataset:
            vector, hash_index = self.client(element)
            reports.append((vector, hash_index))
            progress.update(task, advance=1)

    return reports
Simulates the client side of the privatized Count-Mean Sketch for all elements in the dataset.
Returns: list: A list of (privatized sketch vector, hash index) tuples, one for each element in the dataset.
def server_simulator(self, privatized_data):
    """
    Aggregates the privatized reports and estimates every domain frequency.

    Each report is folded into the sketch with ``update_sketch_matrix``;
    afterwards ``estimate_client`` is evaluated for every element of
    ``self.domain``. Both phases report progress.

    Args:
        privatized_data (list): (privatized sketch vector, hash index) tuples.

    Returns:
        tuple: A dict mapping each domain element to its estimated
        frequency, and the hash functions used.
    """
    estimates = {}
    with Progress() as progress:
        update_task = progress.add_task('Update sketch matrix', total=len(privatized_data))
        for report in privatized_data:
            vector, hash_index = report[0], report[1]
            self.update_sketch_matrix(vector, hash_index)
            progress.update(update_task, advance=1)

        estimate_task = progress.add_task('Estimate frequencies', total=len(self.domain))
        for element in self.domain:
            estimates[element] = self.estimate_client(element)
            progress.update(estimate_task, advance=1)

    return estimates, self.H
Simulates the server side of the privatized Count-Mean Sketch, processes the privatized data, and estimates frequencies.
Args: privatized_data (list): List of (privatized sketch vector, hash index) tuples.
Returns: tuple: A tuple containing the estimated frequencies and the hash functions used.
def run_private_cms_client(k, m, e, df):
    """
    Runs the private Count-Mean Sketch simulation end to end.

    Builds a ``privateCMSClient``, privatizes the dataset, aggregates the
    reports, tabulates the estimates and passes them to ``display_results``.

    Args:
        k (int): Number of hash functions.
        m (int): Size of the sketch matrix.
        e (float): Privacy parameter.
        df (DataFrame): Dataset to be processed.

    Returns:
        tuple: The hash functions, the results table, the error table, the
        privatized data, and the estimated frequency DataFrame.
    """
    sketch = privateCMSClient(e, k, m, df)

    # Client phase, then server phase on the reports it produced
    privatized_data = sketch.execute_client()
    f_estimated, H = sketch.server_simulator(privatized_data)

    # Two-column table of (element, estimated frequency) pairs
    estimate_rows = list(f_estimated.items())
    df_estimated = pd.DataFrame(estimate_rows, columns=['Element', 'Frequency'])

    # display_results returns the results table and the error table
    tables = display_results(df, f_estimated)
    data_table, error_table = tables

    return H, data_table, error_table, privatized_data, df_estimated
Runs the privatized Count-Mean Sketch algorithm and displays the results.
Args: k (int): Number of hash functions. m (int): Size of the sketch matrix. e (float): Privacy parameter. df (DataFrame): Dataset to be processed.
Returns: tuple: A tuple containing the hash functions, the results table, the error table, the privatized data, and the estimated frequency DataFrame.