src.private_count_sketch.private_cs_client
"""Client/server simulation of a privatized Count Sketch frequency estimator."""
from sympy import primerange
import random
import numpy as np
import importlib.util
import os
import argparse
import time
from progress.bar import Bar
from tabulate import tabulate
import sys
import pandas as pd
import pickle
import statistics

from utils.utils import load_dataset, generate_hash_functions, display_results, generate_error_table, generate_hash_function_G


class privateCSClient:
    """Simulate the client and server sides of a privatized Count Sketch.

    The client side turns every dataset element into a length-``m`` report
    tied to one randomly chosen sketch row. The server side accumulates the
    reports into a ``k x m`` matrix and reads estimates back as a median
    over the rows.
    """

    def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
        """Store the parameters and build the empty sketch and hash families.

        Args:
            epsilon: Privacy parameter used by the random sign flips and by
                the server-side scaling constant.
            k: Number of sketch rows; one ``H`` and one ``G`` hash per row.
            m: Number of columns per sketch row.
            dataset: Sized iterable of elements to privatize.
            domain: Iterable of elements whose frequency is estimated.
            dataset_name: Prefix of the files written by ``execute_client``.
        """
        self.dataset_name = dataset_name
        self.dataset = dataset
        self.domain = domain
        self.N = len(dataset)
        self.epsilon = epsilon
        self.k = k
        self.m = m

        # Sketch matrix, filled in by update_sketch_matrix.
        self.M = np.zeros((self.k, self.m))

        # Every (report, row) pair produced by client().
        self.client_matrix = []

        # Bucket hashes ("3 by 3" family in the original notes), built over
        # a prime drawn uniformly from [10**6, 10**7).
        candidate_primes = list(primerange(10**6, 10**7))
        p = candidate_primes[random.randrange(len(candidate_primes))]
        self.H = generate_hash_functions(self.k, p, 3, self.m)

        # "4 by 4" hash family over the Mersenne prime 2**61 - 1.
        # Presumably these are the +/-1 sign hashes -- confirm in utils.
        prime = 2**61 - 1
        self.G = generate_hash_function_G(self.k, prime)

    def bernoulli_vector(self):
        """Draw ``m`` random signs in {-1, +1}.

        Each entry is +1 with probability ``e^(eps/2) / (e^(eps/2) + 1)``.
        """
        growth = np.exp(self.epsilon / 2)
        draws = np.random.binomial(1, growth / (growth + 1), self.m)
        return 2 * draws - 1

    def client(self, d):
        """Build the report for element ``d`` and record it.

        Returns:
            Tuple ``(v_aux, j)``: the length-``m`` report and the index of
            the sketch row it belongs to.
        """
        j = random.randint(0, self.k - 1)
        v = np.zeros(self.m)
        v[self.H[j](d)] = 1 * self.G[j](d)

        # NOTE(review): v is zero everywhere except the hashed bucket, so
        # after the element-wise product only that bucket is non-zero --
        # confirm this is the intended privatization.
        v_aux = v * self.bernoulli_vector()

        self.client_matrix.append((v_aux, j))
        return v_aux, j

    def update_sketch_matrix(self, v, j):
        """Add the rescaled report ``v`` to row ``j`` of the sketch.

        The contribution is ``k * (c_e / 2 * v + 1 / 2)`` with
        ``c_e = (e^(eps/2) + 1) / (e^(eps/2) - 1)``.
        """
        growth = np.exp(self.epsilon / 2)
        c_e = (growth + 1) / (growth - 1)
        x = self.k * ((c_e / 2) * v + (1 / 2) * np.ones_like(v))
        # One vectorised row update instead of a Python loop over m columns.
        self.M[j] += x

    def estimate_client(self, d):
        """Return the frequency estimate of ``d`` read from the sketch.

        For every row the bucket ``H[row](d)`` is read and multiplied by
        ``G[row](d)``; the median of those values is scaled by ``m / (m - 1)``.
        No ``N / m`` offset is subtracted (an earlier variant that did so
        was left disabled in the original code).
        """
        row_values = [
            self.M[row, self.H[row](d)] * self.G[row](d)
            for row in range(self.k)
        ]
        return (self.m / (self.m - 1)) * statistics.median(row_values)

    def execute_client(self):
        """Privatize the whole dataset and persist the reports.

        Writes ``<dataset_name>_private.pkl`` and ``<dataset_name>_private.csv``
        under ``../../data/privatized`` relative to this file.

        Returns:
            List of ``(report, row)`` tuples, one per dataset element.
        """
        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
        privatized_data = []
        for d in self.dataset:
            privatized_data.append(self.client(d))
            bar.next()
        bar.finish()

        df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])

        script_dir = os.path.dirname(os.path.abspath(__file__))
        output_dir = os.path.join(script_dir, "../../data/privatized")
        # Create the target directory so the writes below cannot fail on a
        # fresh checkout.
        os.makedirs(output_dir, exist_ok=True)

        output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
        with open(output_file, 'wb') as f:
            pickle.dump(privatized_data, f)

        df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
        return privatized_data

    def server_simulator(self, privatized_data):
        """Aggregate the reports and estimate every element of the domain.

        Args:
            privatized_data: Sequence of ``(report, row)`` pairs.

        Returns:
            Tuple ``(F_estimated, H, G)`` where ``F_estimated`` maps each
            domain element to its estimate.
        """
        update_bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
        for data in privatized_data:
            self.update_sketch_matrix(data[0], data[1])
            update_bar.next()
        update_bar.finish()

        # The estimation pass gets its own bar: the first one is already
        # finished and was sized for the number of reports, not the domain.
        domain = list(self.domain)
        estimate_bar = Bar('Estimate frequencies', max=len(domain), suffix='%(percent)d%%')
        F_estimated = {}
        for x in domain:
            F_estimated[x] = self.estimate_client(x)
            estimate_bar.next()
        estimate_bar.finish()
        return F_estimated, self.H, self.G


def run_private_cs_client(k, m, e, d):
    """Run the private Count Sketch pipeline on dataset ``d``.

    Args:
        k: Number of sketch rows.
        m: Number of sketch columns.
        e: Privacy parameter epsilon.
        d: Dataset name; ``"<d>_filtered"`` is loaded.

    Returns:
        Tuple ``(H, data_table, G)``: the bucket hashes, the table returned
        by ``display_results`` and the ``G`` hashes.
    """
    dataset, df, domain = load_dataset(f"{d}_filtered")

    # Initialize the private Count Sketch client.
    sketch_client = privateCSClient(e, k, m, dataset, domain, d)

    # Client side: privatize the data.
    privatized_data = sketch_client.execute_client()

    # Server side: aggregate and estimate.
    f_estimated, H, G = sketch_client.server_simulator(privatized_data)

    # Save the estimates. The "_cms" suffix is kept as-is so existing
    # readers of the file still find it.
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/frequencies")
    os.makedirs(output_dir, exist_ok=True)
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results.
    data_table = display_results(df, f_estimated)
    return H, data_table, G
class
privateCSClient:
class privateCSClient:
    """Simulate the client and server sides of a privatized Count Sketch.

    The client side turns every dataset element into a length-``m`` report
    tied to one randomly chosen sketch row. The server side accumulates the
    reports into a ``k x m`` matrix and reads estimates back as a median
    over the rows.
    """

    def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
        """Store the parameters and build the empty sketch and hash families.

        Args:
            epsilon: Privacy parameter used by the random sign flips and by
                the server-side scaling constant.
            k: Number of sketch rows; one ``H`` and one ``G`` hash per row.
            m: Number of columns per sketch row.
            dataset: Sized iterable of elements to privatize.
            domain: Iterable of elements whose frequency is estimated.
            dataset_name: Prefix of the files written by ``execute_client``.
        """
        self.dataset_name = dataset_name
        self.dataset = dataset
        self.domain = domain
        self.N = len(dataset)
        self.epsilon = epsilon
        self.k = k
        self.m = m

        # Sketch matrix, filled in by update_sketch_matrix.
        self.M = np.zeros((self.k, self.m))

        # Every (report, row) pair produced by client().
        self.client_matrix = []

        # Bucket hashes ("3 by 3" family in the original notes), built over
        # a prime drawn uniformly from [10**6, 10**7).
        candidate_primes = list(primerange(10**6, 10**7))
        p = candidate_primes[random.randrange(len(candidate_primes))]
        self.H = generate_hash_functions(self.k, p, 3, self.m)

        # "4 by 4" hash family over the Mersenne prime 2**61 - 1.
        # Presumably these are the +/-1 sign hashes -- confirm in utils.
        prime = 2**61 - 1
        self.G = generate_hash_function_G(self.k, prime)

    def bernoulli_vector(self):
        """Draw ``m`` random signs in {-1, +1}.

        Each entry is +1 with probability ``e^(eps/2) / (e^(eps/2) + 1)``.
        """
        growth = np.exp(self.epsilon / 2)
        draws = np.random.binomial(1, growth / (growth + 1), self.m)
        return 2 * draws - 1

    def client(self, d):
        """Build the report for element ``d`` and record it.

        Returns:
            Tuple ``(v_aux, j)``: the length-``m`` report and the index of
            the sketch row it belongs to.
        """
        j = random.randint(0, self.k - 1)
        v = np.zeros(self.m)
        v[self.H[j](d)] = 1 * self.G[j](d)

        # NOTE(review): v is zero everywhere except the hashed bucket, so
        # after the element-wise product only that bucket is non-zero --
        # confirm this is the intended privatization.
        v_aux = v * self.bernoulli_vector()

        self.client_matrix.append((v_aux, j))
        return v_aux, j

    def update_sketch_matrix(self, v, j):
        """Add the rescaled report ``v`` to row ``j`` of the sketch.

        The contribution is ``k * (c_e / 2 * v + 1 / 2)`` with
        ``c_e = (e^(eps/2) + 1) / (e^(eps/2) - 1)``.
        """
        growth = np.exp(self.epsilon / 2)
        c_e = (growth + 1) / (growth - 1)
        x = self.k * ((c_e / 2) * v + (1 / 2) * np.ones_like(v))
        # One vectorised row update instead of a Python loop over m columns.
        self.M[j] += x

    def estimate_client(self, d):
        """Return the frequency estimate of ``d`` read from the sketch.

        For every row the bucket ``H[row](d)`` is read and multiplied by
        ``G[row](d)``; the median of those values is scaled by ``m / (m - 1)``.
        No ``N / m`` offset is subtracted (an earlier variant that did so
        was left disabled in the original code).
        """
        row_values = [
            self.M[row, self.H[row](d)] * self.G[row](d)
            for row in range(self.k)
        ]
        return (self.m / (self.m - 1)) * statistics.median(row_values)

    def execute_client(self):
        """Privatize the whole dataset and persist the reports.

        Writes ``<dataset_name>_private.pkl`` and ``<dataset_name>_private.csv``
        under ``../../data/privatized`` relative to this file.

        Returns:
            List of ``(report, row)`` tuples, one per dataset element.
        """
        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
        privatized_data = []
        for d in self.dataset:
            privatized_data.append(self.client(d))
            bar.next()
        bar.finish()

        df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])

        script_dir = os.path.dirname(os.path.abspath(__file__))
        output_dir = os.path.join(script_dir, "../../data/privatized")
        # Create the target directory so the writes below cannot fail on a
        # fresh checkout.
        os.makedirs(output_dir, exist_ok=True)

        output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
        with open(output_file, 'wb') as f:
            pickle.dump(privatized_data, f)

        df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
        return privatized_data

    def server_simulator(self, privatized_data):
        """Aggregate the reports and estimate every element of the domain.

        Args:
            privatized_data: Sequence of ``(report, row)`` pairs.

        Returns:
            Tuple ``(F_estimated, H, G)`` where ``F_estimated`` maps each
            domain element to its estimate.
        """
        update_bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
        for data in privatized_data:
            self.update_sketch_matrix(data[0], data[1])
            update_bar.next()
        update_bar.finish()

        # The estimation pass gets its own bar: the first one is already
        # finished and was sized for the number of reports, not the domain.
        domain = list(self.domain)
        estimate_bar = Bar('Estimate frequencies', max=len(domain), suffix='%(percent)d%%')
        F_estimated = {}
        for x in domain:
            F_estimated[x] = self.estimate_client(x)
            estimate_bar.next()
        estimate_bar.finish()
        return F_estimated, self.H, self.G
privateCSClient(epsilon, k, m, dataset, domain, dataset_name)
def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
    """Store the parameters and build the empty sketch and hash families.

    Args:
        epsilon: Privacy parameter kept for the later privatization steps.
        k: Number of sketch rows; one ``H`` and one ``G`` hash per row.
        m: Number of columns per sketch row.
        dataset: Sized iterable of elements to privatize.
        domain: Iterable of elements whose frequency is estimated.
        dataset_name: Name stored for the output files.
    """
    self.dataset_name = dataset_name
    self.dataset = dataset
    self.domain = domain
    self.N = len(dataset)
    self.epsilon = epsilon
    self.k = k
    self.m = m

    # Sketch matrix, one row per hash function.
    self.M = np.zeros((self.k, self.m))

    # Privatized reports collected so far.
    self.client_matrix = []

    # Bucket hashes ("3 by 3" family in the original notes), built over a
    # prime drawn uniformly from [10**6, 10**7).
    candidate_primes = list(primerange(10**6, 10**7))
    p = candidate_primes[random.randrange(len(candidate_primes))]
    self.H = generate_hash_functions(self.k, p, 3, self.m)

    # "4 by 4" hash family over the Mersenne prime 2**61 - 1.
    # Presumably these are the +/-1 sign hashes -- confirm in utils.
    prime = 2**61 - 1
    self.G = generate_hash_function_G(self.k, prime)
def
estimate_client(self, d):
def estimate_client(self, d):
    """Return the frequency estimate of ``d`` read from the sketch.

    For every row the bucket ``H[row](d)`` is read and multiplied by
    ``G[row](d)``; the median of those values is scaled by ``m / (m - 1)``.
    No ``N / m`` offset is subtracted (an earlier variant that did so was
    left disabled in the original code).
    """
    row_values = [
        self.M[row, self.H[row](d)] * self.G[row](d)
        for row in range(self.k)
    ]
    scale = self.m / (self.m - 1)
    return scale * statistics.median(row_values)
def
execute_client(self):
def execute_client(self):
    """Privatize the whole dataset and persist the reports.

    Every element of ``self.dataset`` goes through ``self.client``. The
    reports are written as ``<dataset_name>_private.pkl`` and
    ``<dataset_name>_private.csv`` under ``../../data/privatized`` relative
    to this file.

    Returns:
        List of ``(report, row)`` tuples, one per dataset element.
    """
    bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
    privatized_data = []
    for d in self.dataset:
        privatized_data.append(self.client(d))
        bar.next()
    bar.finish()

    df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/privatized")
    # Create the target directory so the writes below cannot fail on a
    # fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
    with open(output_file, 'wb') as f:
        pickle.dump(privatized_data, f)

    df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
    return privatized_data
def
server_simulator(self, privatized_data):
def server_simulator(self, privatized_data):
    """Aggregate the reports and estimate every element of the domain.

    Args:
        privatized_data: Sequence of ``(report, row)`` pairs; each one is
            passed to ``update_sketch_matrix``.

    Returns:
        Tuple ``(F_estimated, H, G)`` where ``F_estimated`` maps each
        element of ``self.domain`` to the value of ``estimate_client``.
    """
    update_bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
    for data in privatized_data:
        self.update_sketch_matrix(data[0], data[1])
        update_bar.next()
    update_bar.finish()

    # The estimation pass gets its own bar: the first one is already
    # finished and was sized for the number of reports, not the domain.
    domain = list(self.domain)
    estimate_bar = Bar('Estimate frequencies', max=len(domain), suffix='%(percent)d%%')
    F_estimated = {}
    for x in domain:
        F_estimated[x] = self.estimate_client(x)
        estimate_bar.next()
    estimate_bar.finish()
    return F_estimated, self.H, self.G
def
run_private_cs_client(k, m, e, d):
def run_private_cs_client(k, m, e, d):
    """Run the private Count Sketch pipeline on dataset ``d``.

    Args:
        k: Number of sketch rows.
        m: Number of sketch columns.
        e: Privacy parameter epsilon.
        d: Dataset name; ``"<d>_filtered"`` is loaded.

    Returns:
        Tuple ``(H, data_table, G)``: the bucket hashes, the table returned
        by ``display_results`` and the ``G`` hashes.
    """
    dataset, df, domain = load_dataset(f"{d}_filtered")

    # Initialize the private Count Sketch client.
    sketch_client = privateCSClient(e, k, m, dataset, domain, d)

    # Client side: privatize the data.
    privatized_data = sketch_client.execute_client()

    # Server side: aggregate and estimate.
    f_estimated, H, G = sketch_client.server_simulator(privatized_data)

    # Save the estimates. The "_cms" suffix is kept as-is so existing
    # readers of the file still find it.
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/frequencies")
    # Make sure the target directory exists before writing.
    os.makedirs(output_dir, exist_ok=True)
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results.
    data_table = display_results(df, f_estimated)
    return H, data_table, G