src.private_count_min.private_cmins_client

  1from sympy import primerange
  2import random
  3import numpy as np
  4import importlib.util
  5import os
  6import argparse
  7import time
  8from progress.bar import Bar
  9from tabulate import tabulate
 10import sys
 11import pandas as pd
 12import pickle
 13import statistics
 14
 15from utils.utils import load_dataset, generate_hash_functions, display_results, generate_error_table
 16
 17class privateCMinSClient:
 18    def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
 19        self.dataset_name = dataset_name
 20        self.epsilon = epsilon
 21        self.k = k
 22        self.m = m
 23        self.dataset = dataset
 24        self.domain = domain
 25        self.N = len(dataset)
 26
 27        # Creation of the sketch matrix
 28        self.M = np.zeros((self.k, self.m))
 29
 30        # List to store the privatized matrices
 31        self.client_matrix = []
 32
 33        # Definition of the hash family 3 by 3
 34        primes = list(primerange(10**6, 10**7))
 35        p = primes[random.randint(0, len(primes)-1)]
 36        self.H = generate_hash_functions(self.k, p, 3,self.m)
 37
 38    
 39    def bernoulli_vector(self):
 40        b = np.random.binomial(1, (np.exp(self.epsilon/2)) / ((np.exp(self.epsilon/2)) + 1), self.m)
 41        b = 2 * b - 1  # Convert 0 to -1
 42        return b
 43
 44    def client(self, d):
 45        j = random.randint(0, self.k-1)
 46        v = np.full(self.m, -1)
 47        selected_hash = self.H[j]
 48        v[selected_hash(d)] = 1
 49        b = self.bernoulli_vector()
 50        v_aux = v*b
 51        # Store the privatized matrix
 52        self.client_matrix.append((v_aux,j))
 53        return v_aux,j
 54
 55    def update_sketch_matrix(self,v,j):
 56        c_e = (np.exp(self.epsilon/2)+1) / ((np.exp(self.epsilon/2))-1)
 57        #x = self.k * ((c_e/2) * v + (1/2) * np.ones_like(v))
 58        x = ((c_e/2) * v + (1/2) * np.ones_like(v))
 59        for i in range (self.m):
 60            self.M[j,i] += x[i]
 61
 62    def estimate_client(self,d):
 63        v_minimum = []
 64        for i in range(self.k):
 65            selected_hash = self.H[i]
 66            if self.M[i, selected_hash(d)] > 0 and self.M[i, selected_hash(d)] != self.k:
 67                v_minimum.append(self.M[i, selected_hash(d)])
 68
 69        minimum = min(v_minimum)
 70        f_estimated = (self.m / (self.m - 1)) * ( minimum - (self.N/self.m))
 71 
 72        return minimum
 73    
 74    def execute_client(self):
 75        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
 76        privatized_data = []
 77        for d in self.dataset:
 78            v_i, j_i = self.client(d)
 79            privatized_data.append((v_i,j_i))
 80            bar.next()
 81        bar.finish()
 82        
 83        df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])
 84
 85        data_dict = df_client_matrix.to_dict(orient='list')
 86
 87        script_dir = os.path.dirname(os.path.abspath(__file__))
 88        output_dir = os.path.join(script_dir, "../../data/privatized")
 89
 90        output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
 91    
 92        with open(output_file, 'wb') as f:
 93            pickle.dump(privatized_data, f)
 94    
 95        df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
 96        return privatized_data
 97    
 98    def server_simulator(self,privatized_data):
 99        bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
100        
101        for data in privatized_data:
102            self.update_sketch_matrix(data[0],data[1])
103            bar.next()
104        bar.finish()
105
106        F_estimated = {}
107        for x in self.domain:
108            F_estimated[x] = self.estimate_client(x)
109            bar.next()
110        bar.finish()
111        return F_estimated, self.H
112
def run_private_cmins_client(k, m, e, d):
    """Run the private Count-Min Sketch pipeline on one dataset.

    Loads ``<d>_filtered``, privatizes it, aggregates the reports, writes
    the estimates to ``../../data/frequencies/<d>_freq_estimated_cms.csv``
    (relative to this file) and displays the results.

    Args:
        k: Number of hash functions.
        m: Number of buckets per hash function.
        e: Privacy parameter epsilon.
        d: Dataset name (without the ``_filtered`` suffix).

    Returns:
        ``(H, error_table, f_estimated)``: the hash functions, the first
        value returned by ``display_results``, and the estimate dict.
    """
    dataset, df, domain = load_dataset(f"{d}_filtered")

    # Initialize the private Count-Min Sketch
    PCMS = privateCMinSClient(e, k, m, dataset, domain, d)

    # Client side: process the private data
    privatized_data = PCMS.execute_client()

    # Simulate the server side
    f_estimated, H = PCMS.server_simulator(privatized_data)

    # Save f_estimated to a file
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/frequencies")
    # Create the target directory so to_csv does not fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results
    error_table, _ = display_results(df, f_estimated)

    return H, error_table, f_estimated
class privateCMinSClient:
 18class privateCMinSClient:
 19    def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
 20        self.dataset_name = dataset_name
 21        self.epsilon = epsilon
 22        self.k = k
 23        self.m = m
 24        self.dataset = dataset
 25        self.domain = domain
 26        self.N = len(dataset)
 27
 28        # Creation of the sketch matrix
 29        self.M = np.zeros((self.k, self.m))
 30
 31        # List to store the privatized matrices
 32        self.client_matrix = []
 33
 34        # Definition of the hash family 3 by 3
 35        primes = list(primerange(10**6, 10**7))
 36        p = primes[random.randint(0, len(primes)-1)]
 37        self.H = generate_hash_functions(self.k, p, 3,self.m)
 38
 39    
 40    def bernoulli_vector(self):
 41        b = np.random.binomial(1, (np.exp(self.epsilon/2)) / ((np.exp(self.epsilon/2)) + 1), self.m)
 42        b = 2 * b - 1  # Convert 0 to -1
 43        return b
 44
 45    def client(self, d):
 46        j = random.randint(0, self.k-1)
 47        v = np.full(self.m, -1)
 48        selected_hash = self.H[j]
 49        v[selected_hash(d)] = 1
 50        b = self.bernoulli_vector()
 51        v_aux = v*b
 52        # Store the privatized matrix
 53        self.client_matrix.append((v_aux,j))
 54        return v_aux,j
 55
 56    def update_sketch_matrix(self,v,j):
 57        c_e = (np.exp(self.epsilon/2)+1) / ((np.exp(self.epsilon/2))-1)
 58        #x = self.k * ((c_e/2) * v + (1/2) * np.ones_like(v))
 59        x = ((c_e/2) * v + (1/2) * np.ones_like(v))
 60        for i in range (self.m):
 61            self.M[j,i] += x[i]
 62
 63    def estimate_client(self,d):
 64        v_minimum = []
 65        for i in range(self.k):
 66            selected_hash = self.H[i]
 67            if self.M[i, selected_hash(d)] > 0 and self.M[i, selected_hash(d)] != self.k:
 68                v_minimum.append(self.M[i, selected_hash(d)])
 69
 70        minimum = min(v_minimum)
 71        f_estimated = (self.m / (self.m - 1)) * ( minimum - (self.N/self.m))
 72 
 73        return minimum
 74    
 75    def execute_client(self):
 76        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
 77        privatized_data = []
 78        for d in self.dataset:
 79            v_i, j_i = self.client(d)
 80            privatized_data.append((v_i,j_i))
 81            bar.next()
 82        bar.finish()
 83        
 84        df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])
 85
 86        data_dict = df_client_matrix.to_dict(orient='list')
 87
 88        script_dir = os.path.dirname(os.path.abspath(__file__))
 89        output_dir = os.path.join(script_dir, "../../data/privatized")
 90
 91        output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
 92    
 93        with open(output_file, 'wb') as f:
 94            pickle.dump(privatized_data, f)
 95    
 96        df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
 97        return privatized_data
 98    
 99    def server_simulator(self,privatized_data):
100        bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
101        
102        for data in privatized_data:
103            self.update_sketch_matrix(data[0],data[1])
104            bar.next()
105        bar.finish()
106
107        F_estimated = {}
108        for x in self.domain:
109            F_estimated[x] = self.estimate_client(x)
110            bar.next()
111        bar.finish()
112        return F_estimated, self.H
privateCMinSClient(epsilon, k, m, dataset, domain, dataset_name)
19    def __init__(self, epsilon, k, m, dataset, domain, dataset_name):
20        self.dataset_name = dataset_name
21        self.epsilon = epsilon
22        self.k = k
23        self.m = m
24        self.dataset = dataset
25        self.domain = domain
26        self.N = len(dataset)
27
28        # Creation of the sketch matrix
29        self.M = np.zeros((self.k, self.m))
30
31        # List to store the privatized matrices
32        self.client_matrix = []
33
34        # Definition of the hash family 3 by 3
35        primes = list(primerange(10**6, 10**7))
36        p = primes[random.randint(0, len(primes)-1)]
37        self.H = generate_hash_functions(self.k, p, 3,self.m)
dataset_name
epsilon
k
m
dataset
domain
N
M
client_matrix
H
def bernoulli_vector(self):
40    def bernoulli_vector(self):
41        b = np.random.binomial(1, (np.exp(self.epsilon/2)) / ((np.exp(self.epsilon/2)) + 1), self.m)
42        b = 2 * b - 1  # Convert 0 to -1
43        return b
def client(self, d):
45    def client(self, d):
46        j = random.randint(0, self.k-1)
47        v = np.full(self.m, -1)
48        selected_hash = self.H[j]
49        v[selected_hash(d)] = 1
50        b = self.bernoulli_vector()
51        v_aux = v*b
52        # Store the privatized matrix
53        self.client_matrix.append((v_aux,j))
54        return v_aux,j
def update_sketch_matrix(self, v, j):
56    def update_sketch_matrix(self,v,j):
57        c_e = (np.exp(self.epsilon/2)+1) / ((np.exp(self.epsilon/2))-1)
58        #x = self.k * ((c_e/2) * v + (1/2) * np.ones_like(v))
59        x = ((c_e/2) * v + (1/2) * np.ones_like(v))
60        for i in range (self.m):
61            self.M[j,i] += x[i]
def estimate_client(self, d):
63    def estimate_client(self,d):
64        v_minimum = []
65        for i in range(self.k):
66            selected_hash = self.H[i]
67            if self.M[i, selected_hash(d)] > 0 and self.M[i, selected_hash(d)] != self.k:
68                v_minimum.append(self.M[i, selected_hash(d)])
69
70        minimum = min(v_minimum)
71        f_estimated = (self.m / (self.m - 1)) * ( minimum - (self.N/self.m))
72 
73        return minimum
def execute_client(self):
75    def execute_client(self):
76        bar = Bar('Processing client data', max=len(self.dataset), suffix='%(percent)d%%')
77        privatized_data = []
78        for d in self.dataset:
79            v_i, j_i = self.client(d)
80            privatized_data.append((v_i,j_i))
81            bar.next()
82        bar.finish()
83        
84        df_client_matrix = pd.DataFrame(privatized_data, columns=['v', 'j'])
85
86        data_dict = df_client_matrix.to_dict(orient='list')
87
88        script_dir = os.path.dirname(os.path.abspath(__file__))
89        output_dir = os.path.join(script_dir, "../../data/privatized")
90
91        output_file = os.path.join(output_dir, f"{self.dataset_name}_private.pkl")
92    
93        with open(output_file, 'wb') as f:
94            pickle.dump(privatized_data, f)
95    
96        df_client_matrix.to_csv(os.path.join(output_dir, f"{self.dataset_name}_private.csv"), index=False)
97        return privatized_data
def server_simulator(self, privatized_data):
 99    def server_simulator(self,privatized_data):
100        bar = Bar('Update sketch matrix', max=len(privatized_data), suffix='%(percent)d%%')
101        
102        for data in privatized_data:
103            self.update_sketch_matrix(data[0],data[1])
104            bar.next()
105        bar.finish()
106
107        F_estimated = {}
108        for x in self.domain:
109            F_estimated[x] = self.estimate_client(x)
110            bar.next()
111        bar.finish()
112        return F_estimated, self.H
def run_private_cmins_client(k, m, e, d):
def run_private_cmins_client(k, m, e, d):
    """Run the private Count-Min Sketch pipeline on one dataset.

    Loads ``<d>_filtered``, privatizes it, aggregates the reports, writes
    the estimates to ``../../data/frequencies/<d>_freq_estimated_cms.csv``
    (relative to this file) and displays the results.

    Args:
        k: Number of hash functions.
        m: Number of buckets per hash function.
        e: Privacy parameter epsilon.
        d: Dataset name (without the ``_filtered`` suffix).

    Returns:
        ``(H, error_table, f_estimated)``: the hash functions, the first
        value returned by ``display_results``, and the estimate dict.
    """
    dataset, df, domain = load_dataset(f"{d}_filtered")

    # Initialize the private Count-Min Sketch
    PCMS = privateCMinSClient(e, k, m, dataset, domain, d)

    # Client side: process the private data
    privatized_data = PCMS.execute_client()

    # Simulate the server side
    f_estimated, H = PCMS.server_simulator(privatized_data)

    # Save f_estimated to a file
    df_estimated = pd.DataFrame(list(f_estimated.items()), columns=['Element', 'Frequency'])

    script_dir = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(script_dir, "../../data/frequencies")
    # Create the target directory so to_csv does not fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    df_estimated.to_csv(os.path.join(output_dir, f"{d}_freq_estimated_cms.csv"), index=False)

    # Show the results
    error_table, _ = display_results(df, f_estimated)

    return H, error_table, f_estimated