src.hadamard_count_mean.private_hcms_client

  1from sympy import primerange
  2import random
  3import numpy as np
  4from rich.progress import Progress
  5import pandas as pd
  6
  7from utils.utils import generate_hash_functions, display_results
  8
  9
 10class privateHCMSClient:
 11    """
 12    This class implements the private Count-Min Sketch (privateCMS) for differential privacy.
 13    It processes the dataset, applies privatization, and estimates frequencies on the server side.
 14
 15    Attributes:
 16        epsilon (float): The privacy parameter for differential privacy.
 17        k (int): The number of hash functions.
 18        m (int): The size of the sketch matrix.
 19        dataset (list): The dataset containing values to be processed.
 20        domain (list): The unique values in the dataset.
 21        H (numpy.ndarray): The Hadamard matrix used in the privatization process.
 22        N (int): The total number of elements in the dataset.
 23        M (numpy.ndarray): The sketch matrix used to store frequency estimates.
 24        client_matrix (list): A list to store privatized matrices for each client.
 25        hashes (list): A list of hash functions.
 26    """
 27    def __init__(self, epsilon, k, m, df):
 28        """
 29        Initializes the privateHCMSClient class with the given parameters.
 30
 31        Args:
 32            epsilon (float): The privacy parameter for differential privacy.
 33            k (int): The number of hash functions.
 34            m (int): The size of the sketch matrix.
 35            df (pandas.DataFrame): The dataset in DataFrame format.
 36        """
 37        self.df = df
 38        self.epsilon = epsilon
 39        self.k = k
 40        self.m = m
 41        self.dataset = self.df['value'].tolist()
 42        self.domain = self.df['value'].unique().tolist()
 43        self.H = self.hadamard_matrix(self.m)
 44        self.N = len(self.dataset)
 45
 46        # Creation of the sketch matrix
 47        self.M = np.zeros((self.k, self.m))
 48
 49        # List to store the privatized matrices
 50        self.client_matrix = []
 51
 52        # Definition of the hash family 3 by 3
 53        primes = list(primerange(10**6, 10**7))
 54        p = primes[random.randint(0, len(primes)-1)]
 55        self.hashes = generate_hash_functions(self.k,p, 3,self.m)
 56    
 57    def hadamard_matrix(self,n):
 58        """
 59        Generates the Hadamard matrix recursively.
 60
 61        Args:
 62            n (int): The size of the matrix.
 63
 64        Returns:
 65            numpy.ndarray: The generated Hadamard matrix.
 66        """
 67        if n == 1:
 68            return np.array([[1]])
 69        else:
 70            # Recursive function to generate the Hadamard matrix
 71            h_half = self.hadamard_matrix(n // 2)
 72            h = np.block([[h_half, h_half], [h_half, -h_half]])
 73        return h
 74
 75    def client(self,d):
 76        """
 77        Applies privatization to the data using a random hash function and the Hadamard matrix.
 78
 79        Args:
 80            d (any): The element to be privatized.
 81
 82        Returns:
 83            tuple: A tuple containing the privatized value, hash function index, and matrix index.
 84        """
 85        j = random.randint(0, self.k-1)
 86        v = np.full(self.m, 0)
 87        selected_hash = self.hashes[j]
 88        v[selected_hash(d)] = 1
 89        w = np.dot(self.H, v)
 90        l = random.randint(0, self.m-1)
 91
 92        P_active = np.exp(self.epsilon) / (np.exp(self.epsilon) + 1)
 93        if random.random() <= P_active:
 94            b = 1
 95        else:
 96            b = -1
 97    
 98        self.client_matrix.append((b * w[l], j, l))
 99        return b * w[l],j,l
100
101    def update_sketch_matrix(self, w, j, l):
102        """
103        Updates the sketch matrix based on the privatized value.
104
105        Args:
106            w (float): The privatized value.
107            j (int): The index of the hash function.
108            l (int): The index of the matrix.
109        """
110        c_e = (np.exp(self.epsilon/2)+1) / ((np.exp(self.epsilon/2))-1)
111        x = self.k * c_e * w
112        self.M[j,l] =  self.M[j,l] + x
113
114    def traspose_M(self):
115        """
116        Transposes the sketch matrix.
117        """
118        self.M = self.M @ np.transpose(self.H)
119
120    def estimate_client(self,d):
121        """
122        Estimates the frequency of an element using the sketch matrix.
123
124        Args:
125            d (any): The element whose frequency is to be estimated.
126
127        Returns:
128            float: The estimated frequency of the element.
129        """
130        return (self.m / (self.m-1)) * (1/self.k * np.sum([self.M[i,self.hashes[i](d)] for i in range(self.k)]) - self.N/self.m)
131    
132    def execute_client(self):
133        """
134        Executes the client-side privatization and stores the privatized data.
135
136        Returns:
137            list: A list of privatized data.
138        """
139        with Progress() as progress:
140            task = progress.add_task('[cyan]Processing client data', total=len(self.dataset))
141            privatized_data = []
142            for d in self.dataset:
143                w_i, j_i, l_i = self.client(d)
144                privatized_data.append((w_i,j_i,l_i))
145                progress.update(task, advance=1)
146
147        return privatized_data
148
149    def server_simulator(self, privatized_data):
150        """
151        Simulates the server-side process by updating the sketch matrix and estimating frequencies.
152
153        Args:
154            privatized_data (list): The list of privatized data.
155
156        Returns:
157            tuple: A tuple containing the estimated frequencies and the hash functions used.
158        """
159        with Progress() as progress:
160            task = progress.add_task('[cyan]Update sketch matrix', total=len(privatized_data))
161            for data in privatized_data:
162                self.update_sketch_matrix(data[0],data[1],data[2])
163                progress.update(task, advance=1)
164
165            # Transpose the matrix
166            self.traspose_M()
167
168            # Estimate the frequencies
169            F_estimated = {}
170            task = progress.add_task('[cyan]Obtaining histogram of estimated frequencies', total=len(self.domain))
171            for x in self.domain:
172                F_estimated[x] = self.estimate_client(x)
173                progress.update(task, advance=1)
174        return F_estimated, self.hashes
175    
def run_private_hcms_client(k, m, e, df):
    """
    Runs the private HCMS pipeline: client privatization, then server estimation.

    Args:
        k (int): The number of hash functions.
        m (int): The size of the sketch matrix.
        e (float): The privacy parameter epsilon for differential privacy.
        df (pandas.DataFrame): The dataset in DataFrame format.

    Returns:
        tuple: The hash functions, the data table, the error table, the
        privatized data and a DataFrame of the estimated frequencies.
    """
    hcms = privateHCMSClient(e, k, m, df)

    # Client side: privatize every record
    privatized_data = hcms.execute_client()

    # Server side: aggregate the reports and estimate the frequencies
    f_estimated, hashes = hcms.server_simulator(privatized_data)

    # Tabulate the estimates (kept in memory only; nothing is written to disk)
    estimate_rows = list(f_estimated.items())
    df_estimated = pd.DataFrame(estimate_rows, columns=['Element', 'Frequency'])

    data_table, error_table = display_results(df, f_estimated)

    return hashes, data_table, error_table, privatized_data, df_estimated
204
205
206  
class privateHCMSClient:
 11class privateHCMSClient:
 12    """
 13    This class implements the private Count-Min Sketch (privateCMS) for differential privacy.
 14    It processes the dataset, applies privatization, and estimates frequencies on the server side.
 15
 16    Attributes:
 17        epsilon (float): The privacy parameter for differential privacy.
 18        k (int): The number of hash functions.
 19        m (int): The size of the sketch matrix.
 20        dataset (list): The dataset containing values to be processed.
 21        domain (list): The unique values in the dataset.
 22        H (numpy.ndarray): The Hadamard matrix used in the privatization process.
 23        N (int): The total number of elements in the dataset.
 24        M (numpy.ndarray): The sketch matrix used to store frequency estimates.
 25        client_matrix (list): A list to store privatized matrices for each client.
 26        hashes (list): A list of hash functions.
 27    """
 28    def __init__(self, epsilon, k, m, df):
 29        """
 30        Initializes the privateHCMSClient class with the given parameters.
 31
 32        Args:
 33            epsilon (float): The privacy parameter for differential privacy.
 34            k (int): The number of hash functions.
 35            m (int): The size of the sketch matrix.
 36            df (pandas.DataFrame): The dataset in DataFrame format.
 37        """
 38        self.df = df
 39        self.epsilon = epsilon
 40        self.k = k
 41        self.m = m
 42        self.dataset = self.df['value'].tolist()
 43        self.domain = self.df['value'].unique().tolist()
 44        self.H = self.hadamard_matrix(self.m)
 45        self.N = len(self.dataset)
 46
 47        # Creation of the sketch matrix
 48        self.M = np.zeros((self.k, self.m))
 49
 50        # List to store the privatized matrices
 51        self.client_matrix = []
 52
 53        # Definition of the hash family 3 by 3
 54        primes = list(primerange(10**6, 10**7))
 55        p = primes[random.randint(0, len(primes)-1)]
 56        self.hashes = generate_hash_functions(self.k,p, 3,self.m)
 57    
 58    def hadamard_matrix(self,n):
 59        """
 60        Generates the Hadamard matrix recursively.
 61
 62        Args:
 63            n (int): The size of the matrix.
 64
 65        Returns:
 66            numpy.ndarray: The generated Hadamard matrix.
 67        """
 68        if n == 1:
 69            return np.array([[1]])
 70        else:
 71            # Recursive function to generate the Hadamard matrix
 72            h_half = self.hadamard_matrix(n // 2)
 73            h = np.block([[h_half, h_half], [h_half, -h_half]])
 74        return h
 75
 76    def client(self,d):
 77        """
 78        Applies privatization to the data using a random hash function and the Hadamard matrix.
 79
 80        Args:
 81            d (any): The element to be privatized.
 82
 83        Returns:
 84            tuple: A tuple containing the privatized value, hash function index, and matrix index.
 85        """
 86        j = random.randint(0, self.k-1)
 87        v = np.full(self.m, 0)
 88        selected_hash = self.hashes[j]
 89        v[selected_hash(d)] = 1
 90        w = np.dot(self.H, v)
 91        l = random.randint(0, self.m-1)
 92
 93        P_active = np.exp(self.epsilon) / (np.exp(self.epsilon) + 1)
 94        if random.random() <= P_active:
 95            b = 1
 96        else:
 97            b = -1
 98    
 99        self.client_matrix.append((b * w[l], j, l))
100        return b * w[l],j,l
101
102    def update_sketch_matrix(self, w, j, l):
103        """
104        Updates the sketch matrix based on the privatized value.
105
106        Args:
107            w (float): The privatized value.
108            j (int): The index of the hash function.
109            l (int): The index of the matrix.
110        """
111        c_e = (np.exp(self.epsilon/2)+1) / ((np.exp(self.epsilon/2))-1)
112        x = self.k * c_e * w
113        self.M[j,l] =  self.M[j,l] + x
114
115    def traspose_M(self):
116        """
117        Transposes the sketch matrix.
118        """
119        self.M = self.M @ np.transpose(self.H)
120
121    def estimate_client(self,d):
122        """
123        Estimates the frequency of an element using the sketch matrix.
124
125        Args:
126            d (any): The element whose frequency is to be estimated.
127
128        Returns:
129            float: The estimated frequency of the element.
130        """
131        return (self.m / (self.m-1)) * (1/self.k * np.sum([self.M[i,self.hashes[i](d)] for i in range(self.k)]) - self.N/self.m)
132    
133    def execute_client(self):
134        """
135        Executes the client-side privatization and stores the privatized data.
136
137        Returns:
138            list: A list of privatized data.
139        """
140        with Progress() as progress:
141            task = progress.add_task('[cyan]Processing client data', total=len(self.dataset))
142            privatized_data = []
143            for d in self.dataset:
144                w_i, j_i, l_i = self.client(d)
145                privatized_data.append((w_i,j_i,l_i))
146                progress.update(task, advance=1)
147
148        return privatized_data
149
150    def server_simulator(self, privatized_data):
151        """
152        Simulates the server-side process by updating the sketch matrix and estimating frequencies.
153
154        Args:
155            privatized_data (list): The list of privatized data.
156
157        Returns:
158            tuple: A tuple containing the estimated frequencies and the hash functions used.
159        """
160        with Progress() as progress:
161            task = progress.add_task('[cyan]Update sketch matrix', total=len(privatized_data))
162            for data in privatized_data:
163                self.update_sketch_matrix(data[0],data[1],data[2])
164                progress.update(task, advance=1)
165
166            # Transpose the matrix
167            self.traspose_M()
168
169            # Estimate the frequencies
170            F_estimated = {}
171            task = progress.add_task('[cyan]Obtaining histogram of estimated frequencies', total=len(self.domain))
172            for x in self.domain:
173                F_estimated[x] = self.estimate_client(x)
174                progress.update(task, advance=1)
175        return F_estimated, self.hashes

This class implements the private Hadamard Count-Mean Sketch (private HCMS) for differential privacy. It privatizes each value of the dataset on the client side and estimates frequencies on the server side.

Attributes: epsilon (float): The privacy parameter for differential privacy. k (int): The number of hash functions. m (int): The size of the sketch matrix. dataset (list): The dataset containing values to be processed. domain (list): The unique values in the dataset. H (numpy.ndarray): The Hadamard matrix used in the privatization process. N (int): The total number of elements in the dataset. M (numpy.ndarray): The sketch matrix used to store frequency estimates. client_matrix (list): A list to store privatized matrices for each client. hashes (list): A list of hash functions.

privateHCMSClient(epsilon, k, m, df)
28    def __init__(self, epsilon, k, m, df):
29        """
30        Initializes the privateHCMSClient class with the given parameters.
31
32        Args:
33            epsilon (float): The privacy parameter for differential privacy.
34            k (int): The number of hash functions.
35            m (int): The size of the sketch matrix.
36            df (pandas.DataFrame): The dataset in DataFrame format.
37        """
38        self.df = df
39        self.epsilon = epsilon
40        self.k = k
41        self.m = m
42        self.dataset = self.df['value'].tolist()
43        self.domain = self.df['value'].unique().tolist()
44        self.H = self.hadamard_matrix(self.m)
45        self.N = len(self.dataset)
46
47        # Creation of the sketch matrix
48        self.M = np.zeros((self.k, self.m))
49
50        # List to store the privatized matrices
51        self.client_matrix = []
52
53        # Definition of the hash family 3 by 3
54        primes = list(primerange(10**6, 10**7))
55        p = primes[random.randint(0, len(primes)-1)]
56        self.hashes = generate_hash_functions(self.k,p, 3,self.m)

Initializes the privateHCMSClient class with the given parameters.

Args: epsilon (float): The privacy parameter for differential privacy. k (int): The number of hash functions. m (int): The size of the sketch matrix. df (pandas.DataFrame): The dataset in DataFrame format.

df
epsilon
k
m
dataset
domain
H
N
M
client_matrix
hashes
def hadamard_matrix(self, n):
58    def hadamard_matrix(self,n):
59        """
60        Generates the Hadamard matrix recursively.
61
62        Args:
63            n (int): The size of the matrix.
64
65        Returns:
66            numpy.ndarray: The generated Hadamard matrix.
67        """
68        if n == 1:
69            return np.array([[1]])
70        else:
71            # Recursive function to generate the Hadamard matrix
72            h_half = self.hadamard_matrix(n // 2)
73            h = np.block([[h_half, h_half], [h_half, -h_half]])
74        return h

Generates the Hadamard matrix recursively.

Args: n (int): The size of the matrix.

Returns: numpy.ndarray: The generated Hadamard matrix.

def client(self, d):
 76    def client(self,d):
 77        """
 78        Applies privatization to the data using a random hash function and the Hadamard matrix.
 79
 80        Args:
 81            d (any): The element to be privatized.
 82
 83        Returns:
 84            tuple: A tuple containing the privatized value, hash function index, and matrix index.
 85        """
 86        j = random.randint(0, self.k-1)
 87        v = np.full(self.m, 0)
 88        selected_hash = self.hashes[j]
 89        v[selected_hash(d)] = 1
 90        w = np.dot(self.H, v)
 91        l = random.randint(0, self.m-1)
 92
 93        P_active = np.exp(self.epsilon) / (np.exp(self.epsilon) + 1)
 94        if random.random() <= P_active:
 95            b = 1
 96        else:
 97            b = -1
 98    
 99        self.client_matrix.append((b * w[l], j, l))
100        return b * w[l],j,l

Applies privatization to the data using a random hash function and the Hadamard matrix.

Args: d (any): The element to be privatized.

Returns: tuple: A tuple containing the privatized value, hash function index, and matrix index.

def update_sketch_matrix(self, w, j, l):
102    def update_sketch_matrix(self, w, j, l):
103        """
104        Updates the sketch matrix based on the privatized value.
105
106        Args:
107            w (float): The privatized value.
108            j (int): The index of the hash function.
109            l (int): The index of the matrix.
110        """
111        c_e = (np.exp(self.epsilon/2)+1) / ((np.exp(self.epsilon/2))-1)
112        x = self.k * c_e * w
113        self.M[j,l] =  self.M[j,l] + x

Updates the sketch matrix based on the privatized value.

Args: w (float): The privatized value. j (int): The index of the hash function. l (int): The index of the matrix.

def traspose_M(self):
115    def traspose_M(self):
116        """
117        Transposes the sketch matrix.
118        """
119        self.M = self.M @ np.transpose(self.H)

Right-multiplies the sketch matrix by the transposed Hadamard matrix (M ← M · Hᵀ); the sketch itself is not transposed.

def estimate_client(self, d):
121    def estimate_client(self,d):
122        """
123        Estimates the frequency of an element using the sketch matrix.
124
125        Args:
126            d (any): The element whose frequency is to be estimated.
127
128        Returns:
129            float: The estimated frequency of the element.
130        """
131        return (self.m / (self.m-1)) * (1/self.k * np.sum([self.M[i,self.hashes[i](d)] for i in range(self.k)]) - self.N/self.m)

Estimates the frequency of an element using the sketch matrix.

Args: d (any): The element whose frequency is to be estimated.

Returns: float: The estimated frequency of the element.

def execute_client(self):
133    def execute_client(self):
134        """
135        Executes the client-side privatization and stores the privatized data.
136
137        Returns:
138            list: A list of privatized data.
139        """
140        with Progress() as progress:
141            task = progress.add_task('[cyan]Processing client data', total=len(self.dataset))
142            privatized_data = []
143            for d in self.dataset:
144                w_i, j_i, l_i = self.client(d)
145                privatized_data.append((w_i,j_i,l_i))
146                progress.update(task, advance=1)
147
148        return privatized_data

Executes the client-side privatization and stores the privatized data.

Returns: list: A list of privatized data.

def server_simulator(self, privatized_data):
150    def server_simulator(self, privatized_data):
151        """
152        Simulates the server-side process by updating the sketch matrix and estimating frequencies.
153
154        Args:
155            privatized_data (list): The list of privatized data.
156
157        Returns:
158            tuple: A tuple containing the estimated frequencies and the hash functions used.
159        """
160        with Progress() as progress:
161            task = progress.add_task('[cyan]Update sketch matrix', total=len(privatized_data))
162            for data in privatized_data:
163                self.update_sketch_matrix(data[0],data[1],data[2])
164                progress.update(task, advance=1)
165
166            # Transpose the matrix
167            self.traspose_M()
168
169            # Estimate the frequencies
170            F_estimated = {}
171            task = progress.add_task('[cyan]Obtaining histogram of estimated frequencies', total=len(self.domain))
172            for x in self.domain:
173                F_estimated[x] = self.estimate_client(x)
174                progress.update(task, advance=1)
175        return F_estimated, self.hashes

Simulates the server-side process by updating the sketch matrix and estimating frequencies.

Args: privatized_data (list): The list of privatized data.

Returns: tuple: A tuple containing the estimated frequencies and the hash functions used.

def run_private_hcms_client(k, m, e, df):
def run_private_hcms_client(k, m, e, df):
    """
    Runs the private HCMS pipeline: client privatization, then server estimation.

    Args:
        k (int): The number of hash functions.
        m (int): The size of the sketch matrix.
        e (float): The privacy parameter epsilon for differential privacy.
        df (pandas.DataFrame): The dataset in DataFrame format.

    Returns:
        tuple: The hash functions, the data table, the error table, the
        privatized data and a DataFrame of the estimated frequencies.
    """
    hcms = privateHCMSClient(e, k, m, df)

    # Client side: privatize every record
    privatized_data = hcms.execute_client()

    # Server side: aggregate the reports and estimate the frequencies
    f_estimated, hashes = hcms.server_simulator(privatized_data)

    # Tabulate the estimates (kept in memory only; nothing is written to disk)
    estimate_rows = list(f_estimated.items())
    df_estimated = pd.DataFrame(estimate_rows, columns=['Element', 'Frequency'])

    data_table, error_table = display_results(df, f_estimated)

    return hashes, data_table, error_table, privatized_data, df_estimated

Runs the private Hadamard Count-Mean Sketch (HCMS) client, processes the data, and estimates frequencies on the server.

Args: k (int): The number of hash functions. m (int): The size of the sketch matrix. e (float): The privacy parameter epsilon for differential privacy. df (pandas.DataFrame): The dataset in DataFrame format.

Returns: tuple: A tuple containing the hash functions, data table, error table, privatized data, and the estimated frequencies.