src.hadamard_count_mean.private_hcms_client
import random

import numpy as np
import pandas as pd
from rich.progress import Progress
from sympy import primerange

from utils.utils import generate_hash_functions, display_results


class privateHCMSClient:
    """
    Client/server simulation of the private Hadamard Count-Mean Sketch (HCMS).

    Each record is hashed with one randomly chosen hash function; one randomly
    chosen coordinate of the Hadamard transform of its one-hot encoding is
    kept and its sign is randomly flipped according to epsilon. The server
    side accumulates the debiased reports in a sketch matrix, maps it back
    with the Hadamard matrix and reads frequency estimates from it.

    Attributes:
        df (pandas.DataFrame): The input data; its 'value' column is used.
        epsilon (float): The privacy parameter for differential privacy.
        k (int): The number of hash functions (rows of the sketch).
        m (int): The sketch width; must be a positive power of two.
        dataset (list): All entries of the 'value' column.
        domain (list): The distinct entries of the 'value' column.
        H (numpy.ndarray): The m x m Hadamard matrix.
        N (int): The number of entries in the dataset.
        M (numpy.ndarray): The k x m sketch matrix.
        client_matrix (list): Every (value, j, l) report produced by client().
        hashes (list): The hash functions returned by generate_hash_functions.
    """

    def __init__(self, epsilon, k, m, df):
        """
        Stores the parameters and builds the Hadamard matrix, sketch and hashes.

        Args:
            epsilon (float): The privacy parameter for differential privacy.
            k (int): The number of hash functions.
            m (int): The size of the sketch matrix (a positive power of two).
            df (pandas.DataFrame): The dataset; must contain a 'value' column.

        Raises:
            ValueError: If m is not a positive power of two.
        """
        self.df = df
        self.epsilon = epsilon
        self.k = k
        self.m = m
        self.dataset = self.df['value'].tolist()
        self.domain = self.df['value'].unique().tolist()
        self.H = self.hadamard_matrix(self.m)
        self.N = len(self.dataset)

        # Sketch matrix: one row per hash function, one column per bucket
        self.M = np.zeros((self.k, self.m))

        # Log of every privatized report emitted by client()
        self.client_matrix = []

        # Hash family built from a random prime in [10**6, 10**7); the
        # arguments (k, p, 3, m) are passed to the project helper unchanged.
        primes = list(primerange(10**6, 10**7))
        p = primes[random.randint(0, len(primes) - 1)]
        self.hashes = generate_hash_functions(self.k, p, 3, self.m)

    def hadamard_matrix(self, n):
        """
        Builds the n x n Hadamard matrix by repeated Sylvester doubling.

        Args:
            n (int): The size of the matrix; must be a positive power of two.

        Returns:
            numpy.ndarray: The n x n matrix with entries +1 / -1.

        Raises:
            ValueError: If n is not a positive power of two. Without this
                check n = 0 never terminates and other sizes silently yield
                a matrix of the wrong shape.
        """
        size = int(n)
        if size != n or size < 1 or size & (size - 1):
            raise ValueError(f"n must be a positive power of two, got {n!r}")

        h = np.array([[1]])
        while h.shape[0] < size:
            h = np.block([[h, h], [h, -h]])
        return h

    def client(self, d):
        """
        Privatizes one element and records the resulting report.

        Args:
            d (any): The element to be privatized.

        Returns:
            tuple: (privatized value, hash function index j, coordinate l).
        """
        j = random.randint(0, self.k - 1)
        bucket = self.hashes[j](d)
        l = random.randint(0, self.m - 1)

        # H @ e_bucket is column `bucket` of H, so the sampled coordinate is
        # H[l, bucket]; no m x m matrix-vector product is required.
        w_l = self.H[l, bucket]

        # Keep the sign with probability e^eps / (e^eps + 1), flip otherwise
        exp_eps = np.exp(self.epsilon)
        b = 1 if random.random() <= exp_eps / (exp_eps + 1) else -1

        report = (b * w_l, j, l)
        self.client_matrix.append(report)
        return report

    def update_sketch_matrix(self, w, j, l):
        """
        Adds one debiased report to the sketch matrix.

        Args:
            w (float): The privatized value.
            j (int): The index of the hash function (sketch row).
            l (int): The sampled Hadamard coordinate (sketch column).
        """
        # client() keeps the sign with probability e^eps / (e^eps + 1), so the
        # expected sign is (e^eps - 1) / (e^eps + 1); c_e is its inverse and
        # therefore uses eps, not eps / 2.
        exp_eps = np.exp(self.epsilon)
        c_e = (exp_eps + 1) / (exp_eps - 1)
        self.M[j, l] += self.k * c_e * w

    def traspose_M(self):
        """
        Replaces M with M @ H^T, mapping rows back from the Hadamard basis.
        """
        self.M = self.M @ np.transpose(self.H)

    def estimate_client(self, d):
        """
        Estimates the frequency of an element using the sketch matrix.

        Args:
            d (any): The element whose frequency is to be estimated.

        Returns:
            float: The estimated frequency of the element.
        """
        cells = [self.M[i, self.hashes[i](d)] for i in range(self.k)]
        return (self.m / (self.m - 1)) * (1 / self.k * np.sum(cells) - self.N / self.m)

    def execute_client(self):
        """
        Privatizes every element of the dataset while showing a progress bar.

        Returns:
            list: One (value, j, l) tuple per dataset element.
        """
        with Progress() as progress:
            task = progress.add_task('[cyan]Processing client data', total=len(self.dataset))
            privatized_data = []
            for d in self.dataset:
                w_i, j_i, l_i = self.client(d)
                privatized_data.append((w_i, j_i, l_i))
                progress.update(task, advance=1)

        return privatized_data

    def server_simulator(self, privatized_data):
        """
        Builds the sketch from the reports and estimates every domain element.

        Args:
            privatized_data (list): The (value, j, l) reports to aggregate.

        Returns:
            tuple: (dict mapping element -> estimate, list of hash functions).
        """
        with Progress() as progress:
            task = progress.add_task('[cyan]Update sketch matrix', total=len(privatized_data))
            for data in privatized_data:
                self.update_sketch_matrix(data[0], data[1], data[2])
                progress.update(task, advance=1)

            # Map the accumulated sketch back with the Hadamard matrix
            self.traspose_M()

            # Estimate the frequencies
            F_estimated = {}
            task = progress.add_task('[cyan]Obtaining histogram of estimated frequencies', total=len(self.domain))
            for x in self.domain:
                F_estimated[x] = self.estimate_client(x)
                progress.update(task, advance=1)
        return F_estimated, self.hashes


def run_private_hcms_client(k, m, e, df):
    """
    Runs the private HCMS client over a dataset and the simulated server.

    Args:
        k (int): The number of hash functions.
        m (int): The size of the sketch matrix.
        e (float): The privacy parameter epsilon for differential privacy.
        df (pandas.DataFrame): The dataset in DataFrame format.

    Returns:
        tuple: (hash functions, data table, error table, privatized data,
        DataFrame of estimated frequencies).
    """
    # Initialize the client
    client = privateHCMSClient(e, k, m, df)

    # Client side: process the private data
    privatized_data = client.execute_client()

    # Simulate the server side
    f_estimated, hashes = client.server_simulator(privatized_data)

    # Tabulate the estimates (nothing is written to disk here)
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    data_table, error_table = display_results(df, f_estimated)

    return hashes, data_table, error_table, privatized_data, df_estimated
class privateHCMSClient:
    """
    Client/server simulation of the private Hadamard Count-Mean Sketch (HCMS).

    Each record is hashed with one randomly chosen hash function; one randomly
    chosen coordinate of the Hadamard transform of its one-hot encoding is
    kept and its sign is randomly flipped according to epsilon. The server
    side accumulates the debiased reports in a sketch matrix, maps it back
    with the Hadamard matrix and reads frequency estimates from it.

    Attributes:
        df (pandas.DataFrame): The input data; its 'value' column is used.
        epsilon (float): The privacy parameter for differential privacy.
        k (int): The number of hash functions (rows of the sketch).
        m (int): The sketch width; must be a positive power of two.
        dataset (list): All entries of the 'value' column.
        domain (list): The distinct entries of the 'value' column.
        H (numpy.ndarray): The m x m Hadamard matrix.
        N (int): The number of entries in the dataset.
        M (numpy.ndarray): The k x m sketch matrix.
        client_matrix (list): Every (value, j, l) report produced by client().
        hashes (list): The hash functions returned by generate_hash_functions.
    """

    def __init__(self, epsilon, k, m, df):
        """
        Stores the parameters and builds the Hadamard matrix, sketch and hashes.

        Args:
            epsilon (float): The privacy parameter for differential privacy.
            k (int): The number of hash functions.
            m (int): The size of the sketch matrix (a positive power of two).
            df (pandas.DataFrame): The dataset; must contain a 'value' column.

        Raises:
            ValueError: If m is not a positive power of two.
        """
        self.df = df
        self.epsilon = epsilon
        self.k = k
        self.m = m
        self.dataset = self.df['value'].tolist()
        self.domain = self.df['value'].unique().tolist()
        self.H = self.hadamard_matrix(self.m)
        self.N = len(self.dataset)

        # Sketch matrix: one row per hash function, one column per bucket
        self.M = np.zeros((self.k, self.m))

        # Log of every privatized report emitted by client()
        self.client_matrix = []

        # Hash family built from a random prime in [10**6, 10**7); the
        # arguments (k, p, 3, m) are passed to the project helper unchanged.
        primes = list(primerange(10**6, 10**7))
        p = primes[random.randint(0, len(primes) - 1)]
        self.hashes = generate_hash_functions(self.k, p, 3, self.m)

    def hadamard_matrix(self, n):
        """
        Builds the n x n Hadamard matrix by repeated Sylvester doubling.

        Args:
            n (int): The size of the matrix; must be a positive power of two.

        Returns:
            numpy.ndarray: The n x n matrix with entries +1 / -1.

        Raises:
            ValueError: If n is not a positive power of two. Without this
                check n = 0 never terminates and other sizes silently yield
                a matrix of the wrong shape.
        """
        size = int(n)
        if size != n or size < 1 or size & (size - 1):
            raise ValueError(f"n must be a positive power of two, got {n!r}")

        h = np.array([[1]])
        while h.shape[0] < size:
            h = np.block([[h, h], [h, -h]])
        return h

    def client(self, d):
        """
        Privatizes one element and records the resulting report.

        Args:
            d (any): The element to be privatized.

        Returns:
            tuple: (privatized value, hash function index j, coordinate l).
        """
        j = random.randint(0, self.k - 1)
        bucket = self.hashes[j](d)
        l = random.randint(0, self.m - 1)

        # H @ e_bucket is column `bucket` of H, so the sampled coordinate is
        # H[l, bucket]; no m x m matrix-vector product is required.
        w_l = self.H[l, bucket]

        # Keep the sign with probability e^eps / (e^eps + 1), flip otherwise
        exp_eps = np.exp(self.epsilon)
        b = 1 if random.random() <= exp_eps / (exp_eps + 1) else -1

        report = (b * w_l, j, l)
        self.client_matrix.append(report)
        return report

    def update_sketch_matrix(self, w, j, l):
        """
        Adds one debiased report to the sketch matrix.

        Args:
            w (float): The privatized value.
            j (int): The index of the hash function (sketch row).
            l (int): The sampled Hadamard coordinate (sketch column).
        """
        # client() keeps the sign with probability e^eps / (e^eps + 1), so the
        # expected sign is (e^eps - 1) / (e^eps + 1); c_e is its inverse and
        # therefore uses eps, not eps / 2.
        exp_eps = np.exp(self.epsilon)
        c_e = (exp_eps + 1) / (exp_eps - 1)
        self.M[j, l] += self.k * c_e * w

    def traspose_M(self):
        """
        Replaces M with M @ H^T, mapping rows back from the Hadamard basis.
        """
        self.M = self.M @ np.transpose(self.H)

    def estimate_client(self, d):
        """
        Estimates the frequency of an element using the sketch matrix.

        Args:
            d (any): The element whose frequency is to be estimated.

        Returns:
            float: The estimated frequency of the element.
        """
        cells = [self.M[i, self.hashes[i](d)] for i in range(self.k)]
        return (self.m / (self.m - 1)) * (1 / self.k * np.sum(cells) - self.N / self.m)

    def execute_client(self):
        """
        Privatizes every element of the dataset while showing a progress bar.

        Returns:
            list: One (value, j, l) tuple per dataset element.
        """
        with Progress() as progress:
            task = progress.add_task('[cyan]Processing client data', total=len(self.dataset))
            privatized_data = []
            for d in self.dataset:
                w_i, j_i, l_i = self.client(d)
                privatized_data.append((w_i, j_i, l_i))
                progress.update(task, advance=1)

        return privatized_data

    def server_simulator(self, privatized_data):
        """
        Builds the sketch from the reports and estimates every domain element.

        Args:
            privatized_data (list): The (value, j, l) reports to aggregate.

        Returns:
            tuple: (dict mapping element -> estimate, list of hash functions).
        """
        with Progress() as progress:
            task = progress.add_task('[cyan]Update sketch matrix', total=len(privatized_data))
            for data in privatized_data:
                self.update_sketch_matrix(data[0], data[1], data[2])
                progress.update(task, advance=1)

            # Map the accumulated sketch back with the Hadamard matrix
            self.traspose_M()

            # Estimate the frequencies
            F_estimated = {}
            task = progress.add_task('[cyan]Obtaining histogram of estimated frequencies', total=len(self.domain))
            for x in self.domain:
                F_estimated[x] = self.estimate_client(x)
                progress.update(task, advance=1)
        return F_estimated, self.hashes
This class implements the private Hadamard Count-Mean Sketch (private HCMS) for differential privacy. It privatizes each element of the dataset on the client side and estimates frequencies on the simulated server side.
Attributes: epsilon (float): The privacy parameter for differential privacy. k (int): The number of hash functions. m (int): The size of the sketch matrix. dataset (list): The dataset containing values to be processed. domain (list): The unique values in the dataset. H (numpy.ndarray): The Hadamard matrix used in the privatization process. N (int): The total number of elements in the dataset. M (numpy.ndarray): The sketch matrix used to store frequency estimates. client_matrix (list): A list to store privatized matrices for each client. hashes (list): A list of hash functions.
def __init__(self, epsilon, k, m, df):
    """
    Sets up the client state from the privacy/sketch parameters and the data.

    Args:
        epsilon (float): The privacy parameter for differential privacy.
        k (int): The number of hash functions.
        m (int): The size of the sketch matrix.
        df (pandas.DataFrame): The dataset; its 'value' column is read.
    """
    # Parameters, stored verbatim
    self.df = df
    self.epsilon = epsilon
    self.k = k
    self.m = m

    # Raw records and their distinct values, both taken from the 'value' column
    self.dataset = self.df['value'].tolist()
    self.domain = self.df['value'].unique().tolist()

    # m x m Hadamard matrix used by the privatization step
    self.H = self.hadamard_matrix(self.m)
    self.N = len(self.dataset)

    # Empty k x m sketch and empty log of privatized reports
    self.M = np.zeros((self.k, self.m))
    self.client_matrix = []

    # Draw one prime uniformly from [10**6, 10**7) and hand it, together with
    # k, the constant 3 and m, to the project's hash-family generator.
    candidate_primes = list(primerange(10**6, 10**7))
    chosen_index = random.randint(0, len(candidate_primes) - 1)
    prime = candidate_primes[chosen_index]
    self.hashes = generate_hash_functions(self.k, prime, 3, self.m)
Initializes the privateHCMSClient class with the given parameters.
Args: epsilon (float): The privacy parameter for differential privacy. k (int): The number of hash functions. m (int): The size of the sketch matrix. df (pandas.DataFrame): The dataset in DataFrame format.
def hadamard_matrix(self, n):
    """
    Builds the n x n Hadamard matrix by repeated Sylvester doubling.

    Args:
        n (int): The size of the matrix; must be a positive power of two.

    Returns:
        numpy.ndarray: The n x n matrix with entries +1 / -1.

    Raises:
        ValueError: If n is not a positive power of two. Without this check
            n = 0 never terminates and other sizes silently yield a matrix
            of the wrong shape (e.g. 2 x 2 for n = 3).
    """
    size = int(n)
    if size != n or size < 1 or size & (size - 1):
        raise ValueError(f"n must be a positive power of two, got {n!r}")

    # Start from the 1 x 1 matrix and double until the requested size
    h = np.array([[1]])
    while h.shape[0] < size:
        h = np.block([[h, h], [h, -h]])
    return h
Generates the Hadamard matrix recursively.
Args: n (int): The size of the matrix.
Returns: numpy.ndarray: The generated Hadamard matrix.
def client(self, d):
    """
    Privatizes one element and records the resulting report.

    A hash function index j and a coordinate l are drawn uniformly at random;
    coordinate l of the Hadamard transform of the one-hot encoding of the
    hashed element is kept and its sign is randomly flipped.

    Args:
        d (any): The element to be privatized.

    Returns:
        tuple: (privatized value, hash function index j, coordinate l).
    """
    j = random.randint(0, self.k - 1)
    bucket = self.hashes[j](d)
    l = random.randint(0, self.m - 1)

    # H @ e_bucket is column `bucket` of H, so the sampled coordinate is
    # H[l, bucket]; this avoids an m x m matrix-vector product per element.
    w_l = self.H[l, bucket]

    # Keep the sign with probability e^eps / (e^eps + 1), flip otherwise
    exp_eps = np.exp(self.epsilon)
    b = 1 if random.random() <= exp_eps / (exp_eps + 1) else -1

    report = (b * w_l, j, l)
    self.client_matrix.append(report)
    return report
Applies privatization to the data using a random hash function and the Hadamard matrix.
Args: d (any): The element to be privatized.
Returns: tuple: A tuple containing the privatized value, hash function index, and matrix index.
def update_sketch_matrix(self, w, j, l):
    """
    Adds one debiased report to the sketch matrix.

    Args:
        w (float): The privatized value.
        j (int): The index of the hash function (sketch row).
        l (int): The sampled Hadamard coordinate (sketch column).
    """
    # The client keeps the sign with probability e^eps / (e^eps + 1), so the
    # expected sign is (e^eps - 1) / (e^eps + 1). The debiasing constant is
    # the inverse of that and therefore uses eps, not eps / 2.
    exp_eps = np.exp(self.epsilon)
    c_e = (exp_eps + 1) / (exp_eps - 1)
    self.M[j, l] += self.k * c_e * w
Updates the sketch matrix based on the privatized value.
Args: w (float): The privatized value. j (int): The index of the hash function. l (int): The index of the matrix.
def traspose_M(self):
    """
    Replaces the sketch matrix M with the product M @ H^T.

    Despite the name, M itself is not transposed: it is right-multiplied by
    the transpose of the Hadamard matrix stored in self.H.
    """
    hadamard_t = self.H.T
    self.M = np.matmul(self.M, hadamard_t)
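Right-multiplies the sketch matrix by the transpose of the Hadamard matrix (M ← M·Hᵀ). Despite the method name, the sketch matrix itself is not transposed.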
def estimate_client(self, d):
    """
    Computes the frequency estimate of one element from the sketch matrix.

    For every row i the cell M[i, hashes[i](d)] is read; the k cells are
    summed and divided by k, N/m is subtracted, and the result is scaled by
    m / (m - 1).

    Args:
        d (any): The element whose frequency is to be estimated.

    Returns:
        float: The estimated frequency of the element.
    """
    scale = self.m / (self.m - 1)

    cells = []
    for row in range(self.k):
        column = self.hashes[row](d)
        cells.append(self.M[row, column])

    row_mean = 1 / self.k * np.sum(cells)
    baseline = self.N / self.m
    return scale * (row_mean - baseline)
Estimates the frequency of an element using the sketch matrix.
Args: d (any): The element whose frequency is to be estimated.
Returns: float: The estimated frequency of the element.
def execute_client(self):
    """
    Runs the client-side privatization over the whole dataset.

    Every element of self.dataset is passed to self.client() while a progress
    bar is advanced by one step per element.

    Returns:
        list: One (value, j, l) tuple per dataset element, in dataset order.
    """
    reports = []
    with Progress() as progress:
        bar = progress.add_task('[cyan]Processing client data', total=len(self.dataset))
        for element in self.dataset:
            value, hash_index, coordinate = self.client(element)
            reports.append((value, hash_index, coordinate))
            progress.update(bar, advance=1)
    return reports
Executes the client-side privatization and stores the privatized data.
Returns: list: A list of privatized data.
def server_simulator(self, privatized_data):
    """
    Aggregates the privatized reports and estimates every domain element.

    The reports are added to the sketch one by one, the sketch is then
    multiplied by the transposed Hadamard matrix (traspose_M), and finally
    estimate_client() is evaluated for each element of self.domain.

    Args:
        privatized_data (list): The (value, j, l) reports to aggregate.

    Returns:
        tuple: (dict mapping element -> estimated frequency, hash functions).
    """
    estimates = {}
    with Progress() as progress:
        # Phase 1: accumulate every report into the sketch
        update_bar = progress.add_task('[cyan]Update sketch matrix', total=len(privatized_data))
        for report in privatized_data:
            value, hash_index, coordinate = report[0], report[1], report[2]
            self.update_sketch_matrix(value, hash_index, coordinate)
            progress.update(update_bar, advance=1)

        # Phase 2: apply the Hadamard matrix to the accumulated sketch
        self.traspose_M()

        # Phase 3: read one estimate per distinct element
        estimate_bar = progress.add_task('[cyan]Obtaining histogram of estimated frequencies', total=len(self.domain))
        for element in self.domain:
            estimates[element] = self.estimate_client(element)
            progress.update(estimate_bar, advance=1)
    return estimates, self.hashes
Simulates the server-side process by updating the sketch matrix and estimating frequencies.
Args: privatized_data (list): The list of privatized data.
Returns: tuple: A tuple containing the estimated frequencies and the hash functions used.
def run_private_hcms_client(k, m, e, df):
    """
    Runs the private HCMS pipeline: client privatization, then the server.

    Args:
        k (int): The number of hash functions.
        m (int): The size of the sketch matrix.
        e (float): The privacy parameter epsilon for differential privacy.
        df (pandas.DataFrame): The dataset in DataFrame format.

    Returns:
        tuple: (hash functions, data table, error table, privatized data,
        DataFrame with 'Element' and 'Frequency' columns).
    """
    simulator = privateHCMSClient(e, k, m, df)

    # Client side, then simulated server side
    privatized_data = simulator.execute_client()
    f_estimated, hashes = simulator.server_simulator(privatized_data)

    # Tabulate the estimates as (element, frequency) rows; nothing is
    # written to disk here.
    rows = list(f_estimated.items())
    df_estimated = pd.DataFrame(rows, columns=['Element', 'Frequency'])

    # Project helper; returns the two tables that are passed through below
    data_table, error_table = display_results(df, f_estimated)

    return hashes, data_table, error_table, privatized_data, df_estimated
Runs the private Hadamard Count-Mean Sketch (HCMS) client, privatizes the data, and estimates frequencies on the simulated server.
Args: k (int): The number of hash functions. m (int): The size of the sketch matrix. e (float): The privacy parameter epsilon for differential privacy. df (pandas.DataFrame): The dataset in DataFrame format.
Returns: tuple: A tuple containing the hash functions, data table, error table, privatized data, and the estimated frequencies.