src.private_count_min.cms_client_min

 1from sympy import primerange
 2import random
 3import numpy as np
 4import importlib.util
 5import os
 6import argparse
 7import time
 8from progress.bar import Bar
 9from tabulate import tabulate
10import sys
11import pandas as pd
12import pickle
13
14
15from utils.utils import load_dataset, generate_error_table, generate_hash_functions, display_results
16
class CMSClient:
    """Count-Min Sketch simulator: builds a sketch from a dataset and queries it.

    Attributes:
        k: Number of hash functions (rows of the sketch matrix).
        m: Number of buckets per hash function (columns of the sketch matrix).
        dataset: Collection of observed elements fed into the sketch.
        domain: Collection of elements whose frequencies are estimated.
        N: Number of elements in ``dataset``.
        M: ``k x m`` NumPy matrix of counters, initialised to zero.
        H: Hash functions returned by ``generate_hash_functions``; ``H[i]`` is
            called with an element and its result indexes a column of row ``i``.
    """

    def __init__(self, k, m, dataset, domain):
        """Store the parameters, allocate the sketch and build the hash family.

        Args:
            k: Number of hash functions.
            m: Number of buckets per hash function.
            dataset: Sized collection of observed elements.
            domain: Collection of elements to estimate frequencies for.
        """
        self.k = k
        self.m = m
        self.dataset = dataset
        self.domain = domain
        self.N = len(dataset)

        # One row of counters per hash function, one column per bucket.
        self.M = np.zeros((self.k, self.m))

        # Draw a random prime from [10**6, 10**7) and hand it to the hash
        # family generator.
        # NOTE(review): the literal 3 is presumably the independence degree of
        # the hash family -- confirm against utils.generate_hash_functions.
        candidate_primes = list(primerange(10**6, 10**7))
        prime = random.choice(candidate_primes)
        self.H = generate_hash_functions(self.k, prime, 3, self.m)

    def client(self, d):
        """Encode element ``d`` using one randomly selected hash function.

        Args:
            d: Element to encode.

        Returns:
            Tuple ``(v, j)``: ``j`` is the randomly drawn hash index in
            ``0..k-1`` and ``v`` is a length-``m`` array filled with -1 except
            for a 1 at position ``H[j](d)``.
        """
        row = random.randrange(self.k)
        encoded = np.full(self.m, -1)
        encoded[self.H[row](d)] = 1
        return encoded, row

    def update_sketch_matrix(self, d):
        """Increment, in every row, the counter that ``d`` hashes to.

        Args:
            d: Element to record in the sketch.
        """
        for row in range(self.k):
            column = self.H[row](d)
            self.M[row, column] += 1

    def estimate_client(self, d):
        """Return the smallest counter associated with ``d`` across all rows.

        Args:
            d: Element whose frequency is estimated.

        Returns:
            The minimum of ``M[i, H[i](d)]`` over all rows, or ``inf`` when
            ``k`` is 0.
        """
        # Seeding the list with inf keeps the result defined for k == 0.
        counters = [float('inf')]
        counters.extend(self.M[row, self.H[row](d)] for row in range(self.k))
        return min(counters)

    def server_simulator(self):
        """Feed the whole dataset into the sketch, then estimate the domain.

        Returns:
            dict mapping each element of ``self.domain`` to its estimate.
        """
        ingest_bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
        for element in self.dataset:
            self.update_sketch_matrix(element)
            ingest_bar.next()
        ingest_bar.finish()

        estimates = {}
        query_bar = Bar('Obtaining histogram of estimated frequencies', max=len(self.domain), suffix='%(percent)d%%')
        for candidate in self.domain:
            estimates[candidate] = self.estimate_client(candidate)
            query_bar.next()
        query_bar.finish()
        return estimates
68
def run_cmins_client(k, m, d):
    """Run the Count-Min Sketch simulation on a dataset and save the estimates.

    Args:
        k: Number of hash functions passed to ``CMSClient``.
        m: Number of buckets per hash function passed to ``CMSClient``.
        d: Base name of the dataset; ``"{d}_filtered"`` is loaded and the
            estimates are written to ``"{d}_freq_estimated_cms.csv"``.

    Returns:
        The value returned by ``display_results`` for the loaded dataframe
        and the estimated frequencies.
    """
    # Load the dataset
    dataset_name = f"{d}_filtered"
    dataset, df, domain = load_dataset(dataset_name)

    # Initialize the CMSClient
    PCMS = CMSClient(k, m, dataset, domain)

    # Simulate the server side
    f_estimated = PCMS.server_simulator()

    # Save f_estimated to a file
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Resolve the ".." components up front: os.makedirs is documented to get
    # confused by pardir elements, and realpath yields the same location the
    # OS would reach when following the relative path.
    output_dir = os.path.realpath(os.path.join(script_dir, "../../data/frequencies"))
    # DataFrame.to_csv does not create missing directories, so make sure the
    # destination exists instead of failing after the whole simulation ran.
    os.makedirs(output_dir, exist_ok=True)
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results
    data_table = display_results(df, f_estimated)

    return data_table
91
92
93
94
95  
class CMSClient:
class CMSClient:
    """Count-Min Sketch simulator: builds a sketch from a dataset and queries it.

    Attributes:
        k: Number of hash functions (rows of the sketch matrix).
        m: Number of buckets per hash function (columns of the sketch matrix).
        dataset: Collection of observed elements fed into the sketch.
        domain: Collection of elements whose frequencies are estimated.
        N: Number of elements in ``dataset``.
        M: ``k x m`` NumPy matrix of counters, initialised to zero.
        H: Hash functions returned by ``generate_hash_functions``; ``H[i]`` is
            called with an element and its result indexes a column of row ``i``.
    """

    def __init__(self, k, m, dataset, domain):
        """Store the parameters, allocate the sketch and build the hash family.

        Args:
            k: Number of hash functions.
            m: Number of buckets per hash function.
            dataset: Sized collection of observed elements.
            domain: Collection of elements to estimate frequencies for.
        """
        self.k = k
        self.m = m
        self.dataset = dataset
        self.domain = domain
        self.N = len(dataset)

        # One row of counters per hash function, one column per bucket.
        self.M = np.zeros((self.k, self.m))

        # Draw a random prime from [10**6, 10**7) and hand it to the hash
        # family generator.
        # NOTE(review): the literal 3 is presumably the independence degree of
        # the hash family -- confirm against utils.generate_hash_functions.
        candidate_primes = list(primerange(10**6, 10**7))
        prime = random.choice(candidate_primes)
        self.H = generate_hash_functions(self.k, prime, 3, self.m)

    def client(self, d):
        """Encode element ``d`` using one randomly selected hash function.

        Args:
            d: Element to encode.

        Returns:
            Tuple ``(v, j)``: ``j`` is the randomly drawn hash index in
            ``0..k-1`` and ``v`` is a length-``m`` array filled with -1 except
            for a 1 at position ``H[j](d)``.
        """
        row = random.randrange(self.k)
        encoded = np.full(self.m, -1)
        encoded[self.H[row](d)] = 1
        return encoded, row

    def update_sketch_matrix(self, d):
        """Increment, in every row, the counter that ``d`` hashes to.

        Args:
            d: Element to record in the sketch.
        """
        for row in range(self.k):
            column = self.H[row](d)
            self.M[row, column] += 1

    def estimate_client(self, d):
        """Return the smallest counter associated with ``d`` across all rows.

        Args:
            d: Element whose frequency is estimated.

        Returns:
            The minimum of ``M[i, H[i](d)]`` over all rows, or ``inf`` when
            ``k`` is 0.
        """
        # Seeding the list with inf keeps the result defined for k == 0.
        counters = [float('inf')]
        counters.extend(self.M[row, self.H[row](d)] for row in range(self.k))
        return min(counters)

    def server_simulator(self):
        """Feed the whole dataset into the sketch, then estimate the domain.

        Returns:
            dict mapping each element of ``self.domain`` to its estimate.
        """
        ingest_bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
        for element in self.dataset:
            self.update_sketch_matrix(element)
            ingest_bar.next()
        ingest_bar.finish()

        estimates = {}
        query_bar = Bar('Obtaining histogram of estimated frequencies', max=len(self.domain), suffix='%(percent)d%%')
        for candidate in self.domain:
            estimates[candidate] = self.estimate_client(candidate)
            query_bar.next()
        query_bar.finish()
        return estimates
CMSClient(k, m, dataset, domain)
19    def __init__(self, k, m, dataset, domain):
20        self.k = k 
21        self.m = m
22        self.dataset = dataset
23        self.domain = domain
24        self.N = len(dataset)
25        
26        # Creation of the sketch matrix
27        self.M = np.zeros((self.k, self.m))
28
29        # Definition of the hash family 3 by 3
30        primes = list(primerange(10**6, 10**7))
31        p = primes[random.randint(0, len(primes)-1)]
32        self.H = generate_hash_functions(self.k,p, 3,self.m)
k
m
dataset
domain
N
M
H
def client(self, d):
34    def client(self, d):
35        j = random.randint(0, self.k-1)
36        v = np.full(self.m, -1)
37        selected_hash = self.H[j]
38        v[selected_hash(d)] = 1
39        return v, j
def update_sketch_matrix(self, d):
41    def update_sketch_matrix(self, d):
42        for i in range (self.k):
43            selected_hash = self.H[i]
44            hash_index = selected_hash(d)
45            self.M[i ,hash_index] += 1
def estimate_client(self, d):
47    def estimate_client(self,d):
48        min_estimation = float('inf')
49        for i in range(self.k):
50            selected_hash = self.H[i]
51            min_estimation = min(min_estimation, self.M[i,selected_hash(d)])
52        return min_estimation
def server_simulator(self):
54    def server_simulator(self):
55        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
56
57        for d in self.dataset:
58            self.update_sketch_matrix(d)
59            bar.next()
60        bar.finish()
61
62        F_estimated = {}
63        bar = Bar('Obtaining histogram of estimated frequencies', max=len(self.domain), suffix='%(percent)d%%')
64        for x in self.domain:
65            F_estimated[x] = self.estimate_client(x)
66            bar.next()
67        bar.finish()
68        return F_estimated
def run_cmins_client(k, m, d):
def run_cmins_client(k, m, d):
    """Run the Count-Min Sketch simulation on a dataset and save the estimates.

    Args:
        k: Number of hash functions passed to ``CMSClient``.
        m: Number of buckets per hash function passed to ``CMSClient``.
        d: Base name of the dataset; ``"{d}_filtered"`` is loaded and the
            estimates are written to ``"{d}_freq_estimated_cms.csv"``.

    Returns:
        The value returned by ``display_results`` for the loaded dataframe
        and the estimated frequencies.
    """
    # Load the dataset
    dataset_name = f"{d}_filtered"
    dataset, df, domain = load_dataset(dataset_name)

    # Initialize the CMSClient
    PCMS = CMSClient(k, m, dataset, domain)

    # Simulate the server side
    f_estimated = PCMS.server_simulator()

    # Save f_estimated to a file
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Resolve the ".." components up front: os.makedirs is documented to get
    # confused by pardir elements, and realpath yields the same location the
    # OS would reach when following the relative path.
    output_dir = os.path.realpath(os.path.join(script_dir, "../../data/frequencies"))
    # DataFrame.to_csv does not create missing directories, so make sure the
    # destination exists instead of failing after the whole simulation ran.
    os.makedirs(output_dir, exist_ok=True)
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results
    data_table = display_results(df, f_estimated)

    return data_table