src.private_count_min.private_cmins_client
"""Client and simulated server for a locally private Count-Min sketch."""
import argparse
import importlib.util
import os
import pickle
import random
import statistics
import sys
import time

import numpy as np
import pandas as pd
from progress.bar import Bar
from sympy import primerange
from tabulate import tabulate

from utils.utils import load_dataset, generate_hash_functions, display_results, generate_error_table


class privateCMinSClient:
    """Locally private Count-Min sketch: client encoding plus a simulated server.

    Every client value is hashed with one randomly chosen hash function,
    encoded as a +/-1 vector of length ``m`` and multiplied by a random +/-1
    vector before being reported.  The server rescales each report and adds
    it to one row of a ``k x m`` sketch; a frequency is estimated as the
    smallest usable counter over all rows.
    """

    def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
        """Store the parameters, allocate the sketch and draw the hash family.

        Args:
            epsilon: Privacy parameter used for the sign-flip probability and
                for the server-side rescaling constant.
            k: Number of hash functions (rows of the sketch).
            m: Number of buckets per hash function (columns of the sketch).
            dataset: Sized iterable with one value per client.
            domain: Iterable of values whose frequency is estimated.
            dataset_name: Prefix used for the files written by
                ``execute_client``.
        """
        self.dataset_name = dataset_name
        self.epsilon = epsilon
        self.k = k
        self.m = m
        self.dataset = dataset
        self.domain = domain
        self.N = len(dataset)

        # Sketch matrix: one row per hash function, one column per bucket.
        self.M = np.zeros((self.k, self.m))

        # Every privatized report produced by ``client`` is also kept here.
        self.client_matrix = []

        # Hash family built from a prime drawn uniformly from [10**6, 10**7).
        primes = list(primerange(10**6, 10**7))
        p = primes[random.randint(0, len(primes) - 1)]
        self.H = generate_hash_functions(self.k, p, 3, self.m)

    def bernoulli_vector(self):
        """Return a length-``m`` vector of +1/-1 entries.

        Each entry is +1 with probability ``e^(eps/2) / (e^(eps/2) + 1)``.
        """
        exp_half = np.exp(self.epsilon / 2)
        b = np.random.binomial(1, exp_half / (exp_half + 1), self.m)
        return 2 * b - 1  # map {0, 1} to {-1, +1}

    def client(self, d):
        """Privatize a single value.

        Args:
            d: The client's value; it must be accepted by the hash functions.

        Returns:
            Tuple ``(v, j)`` with the privatized +/-1 vector and the index of
            the hash function that was used.
        """
        j = random.randint(0, self.k - 1)
        v = np.full(self.m, -1)
        v[self.H[j](d)] = 1
        v_aux = v * self.bernoulli_vector()
        # Keep a copy of every report that was sent.
        self.client_matrix.append((v_aux, j))
        return v_aux, j

    def update_sketch_matrix(self, v, j):
        """Add one privatized report to row ``j`` of the sketch.

        Args:
            v: Privatized +/-1 vector of length ``m``.
            j: Index of the hash function (sketch row) the report belongs to.
        """
        exp_half = np.exp(self.epsilon / 2)
        c_e = (exp_half + 1) / (exp_half - 1)
        # Rescale the +/-1 report; the whole row is updated in one operation.
        self.M[j] += (c_e / 2) * v + 1 / 2

    def estimate_client(self, d):
        """Estimate the frequency of ``d`` as the smallest usable counter.

        A counter is usable when it is strictly positive and different from
        ``k``.  The raw minimum is returned without further correction.

        Args:
            d: Value whose frequency is estimated.

        Returns:
            The smallest usable counter, or ``0.0`` when no row has one.
        """
        candidates = []
        for row in range(self.k):
            # Hash once per row instead of once per comparison.
            count = self.M[row, self.H[row](d)]
            if count > 0 and count != self.k:
                candidates.append(count)
        # Noisy counters can all be non-positive; report 0 instead of failing.
        return min(candidates, default=0.0)

    def execute_client(self):
        """Privatize the whole dataset and persist the reports.

        The reports are written as ``<dataset_name>_private.pkl`` and
        ``<dataset_name>_private.csv`` to ``../../data/privatized`` relative
        to this file.

        Returns:
            List of ``(v, j)`` tuples, one per dataset entry.
        """
        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
        privatized_data = []
        for d in self.dataset:
            privatized_data.append(self.client(d))
            bar.next()
        bar.finish()

        df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])

        script_dir = os.path.dirname(os.path.abspath(__file__))
        output_dir = os.path.join(script_dir, "../../data/privatized")
        # Both writes below fail if the directory is missing.
        os.makedirs(output_dir, exist_ok=True)

        output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
        with open(output_file, 'wb') as f:
            pickle.dump(privatized_data, f)

        df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
        return privatized_data

    def server_simulator(self, privatized_data):
        """Aggregate the reports and estimate the frequency of every domain value.

        Args:
            privatized_data: Sequence of ``(v, j)`` reports.

        Returns:
            Tuple ``(F_estimated, H)``: a dict mapping each domain value to
            its estimate, and the hash functions of this sketch.
        """
        bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
        for v, j in privatized_data:
            self.update_sketch_matrix(v, j)
            bar.next()
        bar.finish()

        # The estimation pass gets its own bar: the first one is already
        # finished and was sized for the reports, not for the domain.
        bar = Bar('Estimating frequencies', max=len(self.domain), suffix='%(percent)d%%')
        F_estimated = {}
        for x in self.domain:
            F_estimated[x] = self.estimate_client(x)
            bar.next()
        bar.finish()
        return F_estimated, self.H


def run_private_cmins_client(k, m, e, d):
    """Run the private Count-Min sketch pipeline on a named dataset.

    Args:
        k: Number of hash functions.
        m: Number of buckets per hash function.
        e: Privacy parameter epsilon.
        d: Dataset name; ``"<d>_filtered"`` is passed to ``load_dataset``.

    Returns:
        Tuple ``(H, error_table, f_estimated)``.
    """
    dataset, df, domain = load_dataset(f"{d}_filtered")

    # Initialize the private Count-Min sketch
    PCMS = privateCMinSClient(e, k, m, dataset, domain, d)

    # Client side: process the private data
    privatized_data = PCMS.execute_client()

    # Simulate the server side
    f_estimated, H = PCMS.server_simulator(privatized_data)

    # Save f_estimated to a file
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/frequencies")
    # The CSV write fails if the directory is missing.
    os.makedirs(output_dir, exist_ok=True)
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results
    error_table, _ = display_results(df, f_estimated)

    return H, error_table, f_estimated
class
privateCMinSClient:
class privateCMinSClient:
    """Locally private Count-Min sketch: client encoding plus a simulated server.

    Every client value is hashed with one randomly chosen hash function,
    encoded as a +/-1 vector of length ``m`` and multiplied by a random +/-1
    vector before being reported.  The server rescales each report and adds
    it to one row of a ``k x m`` sketch; a frequency is estimated as the
    smallest usable counter over all rows.
    """

    def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
        """Store the parameters, allocate the sketch and draw the hash family.

        Args:
            epsilon: Privacy parameter used for the sign-flip probability and
                for the server-side rescaling constant.
            k: Number of hash functions (rows of the sketch).
            m: Number of buckets per hash function (columns of the sketch).
            dataset: Sized iterable with one value per client.
            domain: Iterable of values whose frequency is estimated.
            dataset_name: Prefix used for the files written by
                ``execute_client``.
        """
        self.dataset_name = dataset_name
        self.epsilon = epsilon
        self.k = k
        self.m = m
        self.dataset = dataset
        self.domain = domain
        self.N = len(dataset)

        # Sketch matrix: one row per hash function, one column per bucket.
        self.M = np.zeros((self.k, self.m))

        # Every privatized report produced by ``client`` is also kept here.
        self.client_matrix = []

        # Hash family built from a prime drawn uniformly from [10**6, 10**7).
        primes = list(primerange(10**6, 10**7))
        p = primes[random.randint(0, len(primes) - 1)]
        self.H = generate_hash_functions(self.k, p, 3, self.m)

    def bernoulli_vector(self):
        """Return a length-``m`` vector of +1/-1 entries.

        Each entry is +1 with probability ``e^(eps/2) / (e^(eps/2) + 1)``.
        """
        exp_half = np.exp(self.epsilon / 2)
        b = np.random.binomial(1, exp_half / (exp_half + 1), self.m)
        return 2 * b - 1  # map {0, 1} to {-1, +1}

    def client(self, d):
        """Privatize a single value.

        Args:
            d: The client's value; it must be accepted by the hash functions.

        Returns:
            Tuple ``(v, j)`` with the privatized +/-1 vector and the index of
            the hash function that was used.
        """
        j = random.randint(0, self.k - 1)
        v = np.full(self.m, -1)
        v[self.H[j](d)] = 1
        v_aux = v * self.bernoulli_vector()
        # Keep a copy of every report that was sent.
        self.client_matrix.append((v_aux, j))
        return v_aux, j

    def update_sketch_matrix(self, v, j):
        """Add one privatized report to row ``j`` of the sketch.

        Args:
            v: Privatized +/-1 vector of length ``m``.
            j: Index of the hash function (sketch row) the report belongs to.
        """
        exp_half = np.exp(self.epsilon / 2)
        c_e = (exp_half + 1) / (exp_half - 1)
        # Rescale the +/-1 report; the whole row is updated in one operation.
        self.M[j] += (c_e / 2) * v + 1 / 2

    def estimate_client(self, d):
        """Estimate the frequency of ``d`` as the smallest usable counter.

        A counter is usable when it is strictly positive and different from
        ``k``.  The raw minimum is returned without further correction.

        Args:
            d: Value whose frequency is estimated.

        Returns:
            The smallest usable counter, or ``0.0`` when no row has one.
        """
        candidates = []
        for row in range(self.k):
            # Hash once per row instead of once per comparison.
            count = self.M[row, self.H[row](d)]
            if count > 0 and count != self.k:
                candidates.append(count)
        # Noisy counters can all be non-positive; report 0 instead of failing.
        return min(candidates, default=0.0)

    def execute_client(self):
        """Privatize the whole dataset and persist the reports.

        The reports are written as ``<dataset_name>_private.pkl`` and
        ``<dataset_name>_private.csv`` to ``../../data/privatized`` relative
        to this file.

        Returns:
            List of ``(v, j)`` tuples, one per dataset entry.
        """
        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
        privatized_data = []
        for d in self.dataset:
            privatized_data.append(self.client(d))
            bar.next()
        bar.finish()

        df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])

        script_dir = os.path.dirname(os.path.abspath(__file__))
        output_dir = os.path.join(script_dir, "../../data/privatized")
        # Both writes below fail if the directory is missing.
        os.makedirs(output_dir, exist_ok=True)

        output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
        with open(output_file, 'wb') as f:
            pickle.dump(privatized_data, f)

        df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
        return privatized_data

    def server_simulator(self, privatized_data):
        """Aggregate the reports and estimate the frequency of every domain value.

        Args:
            privatized_data: Sequence of ``(v, j)`` reports.

        Returns:
            Tuple ``(F_estimated, H)``: a dict mapping each domain value to
            its estimate, and the hash functions of this sketch.
        """
        bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
        for v, j in privatized_data:
            self.update_sketch_matrix(v, j)
            bar.next()
        bar.finish()

        # The estimation pass gets its own bar: the first one is already
        # finished and was sized for the reports, not for the domain.
        bar = Bar('Estimating frequencies', max=len(self.domain), suffix='%(percent)d%%')
        F_estimated = {}
        for x in self.domain:
            F_estimated[x] = self.estimate_client(x)
            bar.next()
        bar.finish()
        return F_estimated, self.H
privateCMinSClient(epsilon, k, m, dataset, domain, dataset_name)
def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
    """Set up the sketch parameters, the empty sketch and the hash family.

    Args:
        epsilon: Privacy parameter.
        k: Number of hash functions (rows of the sketch).
        m: Number of buckets per hash function (columns of the sketch).
        dataset: Sized iterable with one value per client.
        domain: Iterable of values whose frequency is estimated.
        dataset_name: Name used for the output files.
    """
    self.dataset_name = dataset_name
    self.epsilon = epsilon
    self.k, self.m = k, m
    self.dataset, self.domain = dataset, domain
    self.N = len(dataset)

    # Sketch matrix with k rows and m columns, initially all zeros.
    self.M = np.zeros((k, m))

    # Privatized reports are collected in this list.
    self.client_matrix = []

    # Pick one prime uniformly from [10**6, 10**7) and build the hash
    # family from it (third argument 3, as in the helper's signature).
    candidate_primes = list(primerange(10**6, 10**7))
    chosen_prime = candidate_primes[random.randrange(len(candidate_primes))]
    self.H = generate_hash_functions(k, chosen_prime, 3, m)
def
estimate_client(self, d):
def estimate_client(self, d):
    """Estimate the frequency of ``d`` as the smallest usable counter.

    A counter is usable when it is strictly positive and different from
    ``k``.  The raw minimum is returned without further correction.

    Args:
        d: Value whose frequency is estimated.

    Returns:
        The smallest usable counter, or ``0.0`` when no row has one.
    """
    candidates = []
    for row in range(self.k):
        # Hash once per row instead of once per comparison.
        count = self.M[row, self.H[row](d)]
        if count > 0 and count != self.k:
            candidates.append(count)
    # Noisy counters can all be non-positive; report 0 instead of failing.
    return min(candidates, default=0.0)
def
execute_client(self):
def execute_client(self):
    """Privatize the whole dataset and persist the reports.

    The reports are written as ``<dataset_name>_private.pkl`` and
    ``<dataset_name>_private.csv`` to ``../../data/privatized`` relative to
    this file.

    Returns:
        List of ``(v, j)`` tuples, one per dataset entry.
    """
    bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
    privatized_data = []
    for d in self.dataset:
        privatized_data.append(self.client(d))
        bar.next()
    bar.finish()

    df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/privatized")
    # Both writes below fail if the directory is missing.
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
    with open(output_file, 'wb') as f:
        pickle.dump(privatized_data, f)

    df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
    return privatized_data
def
server_simulator(self, privatized_data):
def server_simulator(self, privatized_data):
    """Aggregate the reports and estimate the frequency of every domain value.

    Args:
        privatized_data: Sequence of ``(v, j)`` reports.

    Returns:
        Tuple ``(F_estimated, H)``: a dict mapping each domain value to its
        estimate, and the hash functions of this sketch.
    """
    bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
    for v, j in privatized_data:
        self.update_sketch_matrix(v, j)
        bar.next()
    bar.finish()

    # The estimation pass gets its own bar: the first one is already
    # finished and was sized for the reports, not for the domain.
    bar = Bar('Estimating frequencies', max=len(self.domain), suffix='%(percent)d%%')
    F_estimated = {}
    for x in self.domain:
        F_estimated[x] = self.estimate_client(x)
        bar.next()
    bar.finish()
    return F_estimated, self.H
def
run_private_cmins_client(k, m, e, d):
def run_private_cmins_client(k, m, e, d):
    """Run the private Count-Min sketch pipeline on a named dataset.

    The estimated frequencies are written to
    ``../../data/frequencies/<d>_freq_estimated_cms.csv`` relative to this
    file.

    Args:
        k: Number of hash functions.
        m: Number of buckets per hash function.
        e: Privacy parameter epsilon.
        d: Dataset name; ``"<d>_filtered"`` is passed to ``load_dataset``.

    Returns:
        Tuple ``(H, error_table, f_estimated)``.
    """
    dataset, df, domain = load_dataset(f"{d}_filtered")

    # Initialize the private Count-Min sketch
    PCMS = privateCMinSClient(e, k, m, dataset, domain, d)

    # Client side: process the private data
    privatized_data = PCMS.execute_client()

    # Simulate the server side
    f_estimated, H = PCMS.server_simulator(privatized_data)

    # Save f_estimated to a file
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/frequencies")
    # The CSV write fails if the directory is missing.
    os.makedirs(output_dir, exist_ok=True)
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results
    error_table, _ = display_results(df, f_estimated)

    return H, error_table, f_estimated