src.private_count_min.cms_client_min
"""Count-Min Sketch frequency estimation over a dataset."""

import argparse
import importlib.util
import os
import pickle
import random
import sys
import time

import numpy as np
import pandas as pd
from progress.bar import Bar
from sympy import primerange
from tabulate import tabulate

from utils.utils import (
    load_dataset,
    generate_error_table,
    generate_hash_functions,
    display_results,
)


class CMSClient:
    """Count-Min Sketch built from a dataset, queried over a domain.

    Attributes:
        k: Number of hash functions, i.e. rows of the sketch matrix.
        m: Number of columns of the sketch matrix.
        dataset: Sized iterable of observed elements fed into the sketch.
        domain: Sized iterable of elements whose frequency is estimated.
        N: ``len(dataset)``.
        M: ``k x m`` NumPy array of counters, initialised to zero.
        H: Hash functions returned by ``generate_hash_functions``; ``H[i]``
            is called with an element and its result indexes row ``i``.
    """

    def __init__(self, k, m, dataset, domain):
        """Allocate the sketch matrix and build the hash family."""
        self.k = k
        self.m = m
        self.dataset = dataset
        self.domain = domain
        self.N = len(dataset)

        # Creation of the sketch matrix
        self.M = np.zeros((self.k, self.m))

        # Definition of the hash family "3 by 3" (original wording): a random
        # prime in [10**6, 10**7) plus the literal 3 are handed to the helper.
        # NOTE(review): 3 is presumably the independence degree -- confirm in
        # utils.utils.generate_hash_functions.
        p = random.choice(list(primerange(10**6, 10**7)))
        self.H = generate_hash_functions(self.k, p, 3, self.m)

    def client(self, d):
        """Encode element ``d`` with one randomly chosen hash function.

        Returns:
            A pair ``(v, j)`` where ``j`` is a random row index in
            ``[0, k - 1]`` and ``v`` is a length-``m`` vector filled with
            ``-1`` except for a ``1`` at position ``H[j](d)``.
        """
        j = random.randint(0, self.k - 1)
        v = np.full(self.m, -1)
        v[self.H[j](d)] = 1
        return v, j

    def update_sketch_matrix(self, d):
        """Increment, in every row, the counter that ``d`` hashes to."""
        for row in range(self.k):
            self.M[row, self.H[row](d)] += 1

    def estimate_client(self, d):
        """Return the smallest counter ``d`` hashes to across all rows.

        Returns ``float('inf')`` when ``k`` is 0 (no rows to inspect).
        """
        return min(
            (self.M[row, self.H[row](d)] for row in range(self.k)),
            default=float('inf'),
        )

    def server_simulator(self):
        """Feed the whole dataset into the sketch and estimate the domain.

        Returns:
            Dict mapping each element of ``domain`` to its estimate.
        """
        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
        try:
            for d in self.dataset:
                self.update_sketch_matrix(d)
                bar.next()
        finally:
            # Always close the bar so an error does not leave it half-drawn.
            bar.finish()

        F_estimated = {}
        bar = Bar('Obtaining histogram of estimated frequencies', max=len(self.domain), suffix='%(percent)d%%')
        try:
            for x in self.domain:
                F_estimated[x] = self.estimate_client(x)
                bar.next()
        finally:
            bar.finish()
        return F_estimated


def run_cmins_client(k, m, d):
    """Run the Count-Min Sketch on dataset ``d`` and save the estimates.

    Args:
        k: Number of hash functions (sketch rows).
        m: Number of sketch columns.
        d: Dataset name; ``"{d}_filtered"`` is passed to ``load_dataset``.

    Returns:
        Whatever ``display_results`` returns for the loaded frame and the
        estimated frequencies.
    """
    # Load the dataset
    dataset_name = f"{d}_filtered"
    dataset, df, domain = load_dataset(dataset_name)

    # Initialize the CMSClient
    PCMS = CMSClient(k, m, dataset, domain)

    # Simulate the server side
    f_estimated = PCMS.server_simulator()

    # Save f_estimated to a file
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/frequencies")
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results
    data_table = display_results(df, f_estimated)

    return data_table
class
CMSClient:
class CMSClient:
    """Count-Min Sketch built from a dataset, queried over a domain.

    Attributes:
        k: Number of hash functions, i.e. rows of the sketch matrix.
        m: Number of columns of the sketch matrix.
        dataset: Sized iterable of observed elements fed into the sketch.
        domain: Sized iterable of elements whose frequency is estimated.
        N: ``len(dataset)``.
        M: ``k x m`` NumPy array of counters, initialised to zero.
        H: Hash functions returned by ``generate_hash_functions``; ``H[i]``
            is called with an element and its result indexes row ``i``.
    """

    def __init__(self, k, m, dataset, domain):
        """Allocate the sketch matrix and build the hash family."""
        self.k = k
        self.m = m
        self.dataset = dataset
        self.domain = domain
        self.N = len(dataset)

        # Creation of the sketch matrix
        self.M = np.zeros((self.k, self.m))

        # Definition of the hash family "3 by 3" (original wording): a random
        # prime in [10**6, 10**7) plus the literal 3 are handed to the helper.
        # NOTE(review): 3 is presumably the independence degree -- confirm in
        # utils.utils.generate_hash_functions.
        p = random.choice(list(primerange(10**6, 10**7)))
        self.H = generate_hash_functions(self.k, p, 3, self.m)

    def client(self, d):
        """Encode element ``d`` with one randomly chosen hash function.

        Returns:
            A pair ``(v, j)`` where ``j`` is a random row index in
            ``[0, k - 1]`` and ``v`` is a length-``m`` vector filled with
            ``-1`` except for a ``1`` at position ``H[j](d)``.
        """
        j = random.randint(0, self.k - 1)
        v = np.full(self.m, -1)
        v[self.H[j](d)] = 1
        return v, j

    def update_sketch_matrix(self, d):
        """Increment, in every row, the counter that ``d`` hashes to."""
        for row in range(self.k):
            self.M[row, self.H[row](d)] += 1

    def estimate_client(self, d):
        """Return the smallest counter ``d`` hashes to across all rows.

        Returns ``float('inf')`` when ``k`` is 0 (no rows to inspect).
        """
        return min(
            (self.M[row, self.H[row](d)] for row in range(self.k)),
            default=float('inf'),
        )

    def server_simulator(self):
        """Feed the whole dataset into the sketch and estimate the domain.

        Returns:
            Dict mapping each element of ``domain`` to its estimate.
        """
        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
        try:
            for d in self.dataset:
                self.update_sketch_matrix(d)
                bar.next()
        finally:
            # Always close the bar so an error does not leave it half-drawn.
            bar.finish()

        F_estimated = {}
        bar = Bar('Obtaining histogram of estimated frequencies', max=len(self.domain), suffix='%(percent)d%%')
        try:
            for x in self.domain:
                F_estimated[x] = self.estimate_client(x)
                bar.next()
        finally:
            bar.finish()
        return F_estimated
CMSClient(k, m, dataset, domain)
def __init__(self, k, m, dataset, domain):
    """Store the configuration, allocate the sketch and build the hashes.

    Args:
        k: Number of hash functions (rows of the sketch matrix).
        m: Number of columns of the sketch matrix.
        dataset: Sized collection of observed elements.
        domain: Collection of elements to estimate later.
    """
    self.k, self.m = k, m
    self.dataset, self.domain = dataset, domain
    self.N = len(dataset)

    # Sketch matrix: k rows by m columns, all counters start at zero.
    self.M = np.zeros((k, m))

    # Hash family "3 by 3" (original wording), parameterised by a prime
    # drawn at random from [10**6, 10**7).
    prime_pool = list(primerange(10**6, 10**7))
    prime = random.choice(prime_pool)
    self.H = generate_hash_functions(k, prime, 3, m)
def
server_simulator(self):
def server_simulator(self):
    """Feed the whole dataset into the sketch and estimate the domain.

    Every element of ``self.dataset`` is passed to
    ``update_sketch_matrix``; afterwards ``estimate_client`` is evaluated
    for every element of ``self.domain``. A progress bar is shown for each
    phase.

    Returns:
        Dict mapping each element of ``self.domain`` to its estimate.
    """
    bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
    try:
        for d in self.dataset:
            self.update_sketch_matrix(d)
            bar.next()
    finally:
        # Always close the bar so an error does not leave it half-drawn.
        bar.finish()

    F_estimated = {}
    bar = Bar('Obtaining histogram of estimated frequencies', max=len(self.domain), suffix='%(percent)d%%')
    try:
        for x in self.domain:
            F_estimated[x] = self.estimate_client(x)
            bar.next()
    finally:
        bar.finish()
    return F_estimated
def
run_cmins_client(k, m, d):
def run_cmins_client(k, m, d):
    """Run the Count-Min Sketch on dataset ``d`` and save the estimates.

    The estimates are written to
    ``../../data/frequencies/{d}_freq_estimated_cms.csv`` relative to this
    file; the directory is created when it does not exist yet.

    Args:
        k: Number of hash functions (sketch rows).
        m: Number of sketch columns.
        d: Dataset name; ``"{d}_filtered"`` is passed to ``load_dataset``.

    Returns:
        Whatever ``display_results`` returns for the loaded frame and the
        estimated frequencies.
    """
    # Load the dataset
    dataset, df, domain = load_dataset(f"{d}_filtered")

    # Build the sketch and simulate the server side
    sketch = CMSClient(k, m, dataset, domain)
    f_estimated = sketch.server_simulator()

    # Save f_estimated to a file
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/frequencies")
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results
    return display_results(df, f_estimated)