src.count_mean.private_cms_client

  1import random
  2import numpy as np
  3from sympy import primerange
  4import pandas as pd
  5from rich.progress import Progress
  6
  7from utils.utils import generate_hash_functions, display_results
  8
  9class privateCMSClient:
 10    """
 11    A class to represent the privatized Count-Min Sketch (privateCMS) Client.
 12
 13    Attributes:
 14        df: DataFrame containing the dataset.
 15        epsilon: Privacy parameter for the privatization process.
 16        k: Number of hash functions used in the CMS.
 17        m: Size of the sketch matrix.
 18        dataset: List of values from the dataset.
 19        domain: Unique values in the dataset.
 20        N: Total number of elements in the dataset.
 21        M: Count-Min Sketch matrix.
 22        client_matrix: List of privatized matrices generated by the client.
 23        H: List of hash functions.
 24    
 25    Methods:
 26        bernoulli_vector():
 27            Generates a Bernoulli vector for privatization based on the epsilon value.
 28        client(d):
 29            Simulates the client side of the private CMS, returning a privatized sketch vector.
 30        update_sketch_matrix(v, j):
 31            Updates the sketch matrix based on the privatized sketch vector.
 32        estimate_client(d):
 33            Estimates the frequency of an element using the private CMS sketch matrix.
 34        execute_client():
 35            Simulates the client side of the private CMS for all elements in the dataset.
 36        server_simulator(privatized_data):
 37            Simulates the server side of the private CMS, processes the privatized data, and estimates frequencies.
 38    """
 39    def __init__(self, epsilon, k, m, df):
 40        """
 41        Initializes the privateCMSClient with the given parameters.
 42
 43        Args:
 44            epsilon (float): Privacy parameter for the privatization process.
 45            k (int): Number of hash functions.
 46            m (int): Size of the sketch matrix.
 47            df (DataFrame): Dataset to be processed.
 48        """
 49        self.df = df
 50        self.epsilon = epsilon
 51        self.k = k
 52        self.m = m
 53        self.dataset = self.df['value'].tolist()
 54        self.domain = self.df['value'].unique().tolist()
 55        self.N = len(self.dataset)
 56
 57        # Creation of the sketch matrix
 58        self.M = np.zeros((self.k, self.m))
 59
 60        # List to store the privatized matrices
 61        self.client_matrix = []
 62
 63        # Definition of the hash family 3 by 3
 64        primes = list(primerange(10**6, 10**7))
 65        p = primes[random.randint(0, len(primes)-1)]
 66        self.H = generate_hash_functions(self.k,p, 3,self.m)
 67
 68    
 69    def bernoulli_vector(self):
 70        """
 71        Generates a Bernoulli vector for privatization based on the epsilon value.
 72
 73        Returns:
 74            numpy.ndarray: A Bernoulli vector with values -1 and 1.
 75        """
 76        b = np.random.binomial(1, (np.exp(self.epsilon/2)) / ((np.exp(self.epsilon/2)) + 1), self.m)
 77        b = 2 * b - 1  # Convert 0 to -1
 78        return b
 79
 80    def client(self, d):
 81        """
 82        Simulates the client side of the privatized Count-Min Sketch.
 83
 84        Args:
 85            d (element): The element for which the privatized sketch vector is generated.
 86
 87        Returns:
 88            tuple: A tuple containing the privatized sketch vector and the index of the chosen hash function.
 89        """
 90        j = random.randint(0, self.k-1)
 91        v = np.full(self.m, -1)
 92        selected_hash = self.H[j]
 93        v[selected_hash(d)] = 1
 94        b = self.bernoulli_vector()
 95        v_aux = v*b
 96        # Store the privatized matrix
 97        self.client_matrix.append((v_aux,j))
 98        return v_aux,j
 99
100    def update_sketch_matrix(self,v,j):
101        """
102        Updates the sketch matrix based on the given privatized sketch vector.
103
104        Args:
105            v (numpy.ndarray): The privatized sketch vector.
106            j (int): The index of the selected hash function.
107        """
108        c_e = (np.exp(self.epsilon/2)+1) / ((np.exp(self.epsilon/2))-1)
109        x = self.k * ((c_e/2) * v + (1/2) * np.ones_like(v))
110        for i in range (self.m):
111            self.M[j,i] += x[i]
112
113    def estimate_client(self,d):
114        """
115        Estimates the frequency of an element based on the private CMS sketch matrix.
116
117        Args:
118            d (element): The element whose frequency is estimated.
119
120        Returns:
121            float: The estimated frequency of the element.
122        """
123        sum_aux = 0
124        for i in range(self.k):
125            selected_hash = self.H[i]
126            sum_aux += self.M[i, selected_hash(d)]
127
128        f_estimated = (self.m/(self.m-1))*((sum_aux/self.k)-(self.N/self.m))
129        return f_estimated
130    
131    def execute_client(self):
132        """
133        Simulates the client side of the privatized Count-Min Sketch for all elements in the dataset.
134
135        Returns:
136            list: A list of privatized sketch vectors for all elements in the dataset.
137        """
138        with Progress() as progress:
139            bar = progress.add_task("Processing client data", total=len(self.dataset))
140            
141            privatized_data = []
142            for d in self.dataset:
143                v_i, j_i = self.client(d)
144                privatized_data.append((v_i,j_i))
145                progress.update(bar, advance=1)
146        
147        return privatized_data
148    
149    def server_simulator(self,privatized_data):
150        """
151        Simulates the server side of the privatized Count-Min Sketch, processes the privatized data, and estimates frequencies.
152
153        Args:
154            privatized_data (list): List of privatized sketch vectors.
155
156        Returns:
157            tuple: A tuple containing the estimated frequencies and the hash functions used.
158        """
159        with Progress() as progress:
160            bar = progress.add_task('Update sketch matrix', total=len(privatized_data))
161            
162            for data in privatized_data:
163                self.update_sketch_matrix(data[0],data[1])
164                progress.update(bar, advance=1)
165
166            bar = progress.add_task('Estimate frequencies', total=len(self.domain))
167            F_estimated = {}
168            for x in self.domain:
169                F_estimated[x] = self.estimate_client(x)
170                progress.update(bar, advance=1)
171
172        return F_estimated, self.H
173
def run_private_cms_client(k, m, e, df):
    """
    Runs the full private Count-Mean Sketch pipeline and displays the results.

    Args:
        k (int): Number of hash functions.
        m (int): Size of the sketch matrix.
        e (float): Privacy parameter.
        df (DataFrame): Dataset to be processed.

    Returns:
        tuple: The hash functions, the results table, the error table, the privatized data, and the estimated frequency DataFrame.
    """
    # Build the sketch object (note the argument order: epsilon comes first)
    sketch = privateCMSClient(e, k, m, df)

    # Client side: privatize every element of the dataset
    privatized_data = sketch.execute_client()

    # Server side: aggregate the reports and estimate the frequencies
    server_output = sketch.server_simulator(privatized_data)
    f_estimated = server_output[0]
    H = server_output[1]

    # Tabulate the estimates as (Element, Frequency) rows
    rows = list(f_estimated.items())
    df_estimated = pd.DataFrame(rows, columns=['Element', 'Frequency'])

    # Show the results
    tables = display_results(df, f_estimated)
    data_table, error_table = tables

    return H, data_table, error_table, privatized_data, df_estimated
class privateCMSClient:
 10class privateCMSClient:
 11    """
 12    A class to represent the privatized Count-Min Sketch (privateCMS) Client.
 13
 14    Attributes:
 15        df: DataFrame containing the dataset.
 16        epsilon: Privacy parameter for the privatization process.
 17        k: Number of hash functions used in the CMS.
 18        m: Size of the sketch matrix.
 19        dataset: List of values from the dataset.
 20        domain: Unique values in the dataset.
 21        N: Total number of elements in the dataset.
 22        M: Count-Min Sketch matrix.
 23        client_matrix: List of privatized matrices generated by the client.
 24        H: List of hash functions.
 25    
 26    Methods:
 27        bernoulli_vector():
 28            Generates a Bernoulli vector for privatization based on the epsilon value.
 29        client(d):
 30            Simulates the client side of the private CMS, returning a privatized sketch vector.
 31        update_sketch_matrix(v, j):
 32            Updates the sketch matrix based on the privatized sketch vector.
 33        estimate_client(d):
 34            Estimates the frequency of an element using the private CMS sketch matrix.
 35        execute_client():
 36            Simulates the client side of the private CMS for all elements in the dataset.
 37        server_simulator(privatized_data):
 38            Simulates the server side of the private CMS, processes the privatized data, and estimates frequencies.
 39    """
 40    def __init__(self, epsilon, k, m, df):
 41        """
 42        Initializes the privateCMSClient with the given parameters.
 43
 44        Args:
 45            epsilon (float): Privacy parameter for the privatization process.
 46            k (int): Number of hash functions.
 47            m (int): Size of the sketch matrix.
 48            df (DataFrame): Dataset to be processed.
 49        """
 50        self.df = df
 51        self.epsilon = epsilon
 52        self.k = k
 53        self.m = m
 54        self.dataset = self.df['value'].tolist()
 55        self.domain = self.df['value'].unique().tolist()
 56        self.N = len(self.dataset)
 57
 58        # Creation of the sketch matrix
 59        self.M = np.zeros((self.k, self.m))
 60
 61        # List to store the privatized matrices
 62        self.client_matrix = []
 63
 64        # Definition of the hash family 3 by 3
 65        primes = list(primerange(10**6, 10**7))
 66        p = primes[random.randint(0, len(primes)-1)]
 67        self.H = generate_hash_functions(self.k,p, 3,self.m)
 68
 69    
 70    def bernoulli_vector(self):
 71        """
 72        Generates a Bernoulli vector for privatization based on the epsilon value.
 73
 74        Returns:
 75            numpy.ndarray: A Bernoulli vector with values -1 and 1.
 76        """
 77        b = np.random.binomial(1, (np.exp(self.epsilon/2)) / ((np.exp(self.epsilon/2)) + 1), self.m)
 78        b = 2 * b - 1  # Convert 0 to -1
 79        return b
 80
 81    def client(self, d):
 82        """
 83        Simulates the client side of the privatized Count-Min Sketch.
 84
 85        Args:
 86            d (element): The element for which the privatized sketch vector is generated.
 87
 88        Returns:
 89            tuple: A tuple containing the privatized sketch vector and the index of the chosen hash function.
 90        """
 91        j = random.randint(0, self.k-1)
 92        v = np.full(self.m, -1)
 93        selected_hash = self.H[j]
 94        v[selected_hash(d)] = 1
 95        b = self.bernoulli_vector()
 96        v_aux = v*b
 97        # Store the privatized matrix
 98        self.client_matrix.append((v_aux,j))
 99        return v_aux,j
100
101    def update_sketch_matrix(self,v,j):
102        """
103        Updates the sketch matrix based on the given privatized sketch vector.
104
105        Args:
106            v (numpy.ndarray): The privatized sketch vector.
107            j (int): The index of the selected hash function.
108        """
109        c_e = (np.exp(self.epsilon/2)+1) / ((np.exp(self.epsilon/2))-1)
110        x = self.k * ((c_e/2) * v + (1/2) * np.ones_like(v))
111        for i in range (self.m):
112            self.M[j,i] += x[i]
113
114    def estimate_client(self,d):
115        """
116        Estimates the frequency of an element based on the private CMS sketch matrix.
117
118        Args:
119            d (element): The element whose frequency is estimated.
120
121        Returns:
122            float: The estimated frequency of the element.
123        """
124        sum_aux = 0
125        for i in range(self.k):
126            selected_hash = self.H[i]
127            sum_aux += self.M[i, selected_hash(d)]
128
129        f_estimated = (self.m/(self.m-1))*((sum_aux/self.k)-(self.N/self.m))
130        return f_estimated
131    
132    def execute_client(self):
133        """
134        Simulates the client side of the privatized Count-Min Sketch for all elements in the dataset.
135
136        Returns:
137            list: A list of privatized sketch vectors for all elements in the dataset.
138        """
139        with Progress() as progress:
140            bar = progress.add_task("Processing client data", total=len(self.dataset))
141            
142            privatized_data = []
143            for d in self.dataset:
144                v_i, j_i = self.client(d)
145                privatized_data.append((v_i,j_i))
146                progress.update(bar, advance=1)
147        
148        return privatized_data
149    
150    def server_simulator(self,privatized_data):
151        """
152        Simulates the server side of the privatized Count-Min Sketch, processes the privatized data, and estimates frequencies.
153
154        Args:
155            privatized_data (list): List of privatized sketch vectors.
156
157        Returns:
158            tuple: A tuple containing the estimated frequencies and the hash functions used.
159        """
160        with Progress() as progress:
161            bar = progress.add_task('Update sketch matrix', total=len(privatized_data))
162            
163            for data in privatized_data:
164                self.update_sketch_matrix(data[0],data[1])
165                progress.update(bar, advance=1)
166
167            bar = progress.add_task('Estimate frequencies', total=len(self.domain))
168            F_estimated = {}
169            for x in self.domain:
170                F_estimated[x] = self.estimate_client(x)
171                progress.update(bar, advance=1)
172
173        return F_estimated, self.H

A class to represent the privatized Count-Mean Sketch (privateCMS) Client.

Attributes: df: DataFrame containing the dataset. epsilon: Privacy parameter for the privatization process. k: Number of hash functions used in the CMS. m: Size of the sketch matrix. dataset: List of values from the dataset. domain: Unique values in the dataset. N: Total number of elements in the dataset. M: Count-Mean Sketch matrix. client_matrix: List of privatized matrices generated by the client. H: List of hash functions.

Methods: bernoulli_vector(): Generates a Bernoulli vector for privatization based on the epsilon value. client(d): Simulates the client side of the private CMS, returning a privatized sketch vector. update_sketch_matrix(v, j): Updates the sketch matrix based on the privatized sketch vector. estimate_client(d): Estimates the frequency of an element using the private CMS sketch matrix. execute_client(): Simulates the client side of the private CMS for all elements in the dataset. server_simulator(privatized_data): Simulates the server side of the private CMS, processes the privatized data, and estimates frequencies.

privateCMSClient(epsilon, k, m, df)
40    def __init__(self, epsilon, k, m, df):
41        """
42        Initializes the privateCMSClient with the given parameters.
43
44        Args:
45            epsilon (float): Privacy parameter for the privatization process.
46            k (int): Number of hash functions.
47            m (int): Size of the sketch matrix.
48            df (DataFrame): Dataset to be processed.
49        """
50        self.df = df
51        self.epsilon = epsilon
52        self.k = k
53        self.m = m
54        self.dataset = self.df['value'].tolist()
55        self.domain = self.df['value'].unique().tolist()
56        self.N = len(self.dataset)
57
58        # Creation of the sketch matrix
59        self.M = np.zeros((self.k, self.m))
60
61        # List to store the privatized matrices
62        self.client_matrix = []
63
64        # Definition of the hash family 3 by 3
65        primes = list(primerange(10**6, 10**7))
66        p = primes[random.randint(0, len(primes)-1)]
67        self.H = generate_hash_functions(self.k,p, 3,self.m)

Initializes the privateCMSClient with the given parameters.

Args: epsilon (float): Privacy parameter for the privatization process. k (int): Number of hash functions. m (int): Size of the sketch matrix. df (DataFrame): Dataset to be processed.

df
epsilon
k
m
dataset
domain
N
M
client_matrix
H
def bernoulli_vector(self):
70    def bernoulli_vector(self):
71        """
72        Generates a Bernoulli vector for privatization based on the epsilon value.
73
74        Returns:
75            numpy.ndarray: A Bernoulli vector with values -1 and 1.
76        """
77        b = np.random.binomial(1, (np.exp(self.epsilon/2)) / ((np.exp(self.epsilon/2)) + 1), self.m)
78        b = 2 * b - 1  # Convert 0 to -1
79        return b

Generates a Bernoulli vector for privatization based on the epsilon value.

Returns: numpy.ndarray: A Bernoulli vector with values -1 and 1.

def client(self, d):
81    def client(self, d):
82        """
83        Simulates the client side of the privatized Count-Min Sketch.
84
85        Args:
86            d (element): The element for which the privatized sketch vector is generated.
87
88        Returns:
89            tuple: A tuple containing the privatized sketch vector and the index of the chosen hash function.
90        """
91        j = random.randint(0, self.k-1)
92        v = np.full(self.m, -1)
93        selected_hash = self.H[j]
94        v[selected_hash(d)] = 1
95        b = self.bernoulli_vector()
96        v_aux = v*b
97        # Store the privatized matrix
98        self.client_matrix.append((v_aux,j))
99        return v_aux,j

Simulates the client side of the privatized Count-Mean Sketch.

Args: d (element): The element for which the privatized sketch vector is generated.

Returns: tuple: A tuple containing the privatized sketch vector and the index of the chosen hash function.

def update_sketch_matrix(self, v, j):
101    def update_sketch_matrix(self,v,j):
102        """
103        Updates the sketch matrix based on the given privatized sketch vector.
104
105        Args:
106            v (numpy.ndarray): The privatized sketch vector.
107            j (int): The index of the selected hash function.
108        """
109        c_e = (np.exp(self.epsilon/2)+1) / ((np.exp(self.epsilon/2))-1)
110        x = self.k * ((c_e/2) * v + (1/2) * np.ones_like(v))
111        for i in range (self.m):
112            self.M[j,i] += x[i]

Updates the sketch matrix based on the given privatized sketch vector.

Args: v (numpy.ndarray): The privatized sketch vector. j (int): The index of the selected hash function.

def estimate_client(self, d):
114    def estimate_client(self,d):
115        """
116        Estimates the frequency of an element based on the private CMS sketch matrix.
117
118        Args:
119            d (element): The element whose frequency is estimated.
120
121        Returns:
122            float: The estimated frequency of the element.
123        """
124        sum_aux = 0
125        for i in range(self.k):
126            selected_hash = self.H[i]
127            sum_aux += self.M[i, selected_hash(d)]
128
129        f_estimated = (self.m/(self.m-1))*((sum_aux/self.k)-(self.N/self.m))
130        return f_estimated

Estimates the frequency of an element based on the private CMS sketch matrix.

Args: d (element): The element whose frequency is estimated.

Returns: float: The estimated frequency of the element.

def execute_client(self):
132    def execute_client(self):
133        """
134        Simulates the client side of the privatized Count-Min Sketch for all elements in the dataset.
135
136        Returns:
137            list: A list of privatized sketch vectors for all elements in the dataset.
138        """
139        with Progress() as progress:
140            bar = progress.add_task("Processing client data", total=len(self.dataset))
141            
142            privatized_data = []
143            for d in self.dataset:
144                v_i, j_i = self.client(d)
145                privatized_data.append((v_i,j_i))
146                progress.update(bar, advance=1)
147        
148        return privatized_data

Simulates the client side of the privatized Count-Mean Sketch for all elements in the dataset.

Returns: list: A list of privatized sketch vectors for all elements in the dataset.

def server_simulator(self, privatized_data):
150    def server_simulator(self,privatized_data):
151        """
152        Simulates the server side of the privatized Count-Min Sketch, processes the privatized data, and estimates frequencies.
153
154        Args:
155            privatized_data (list): List of privatized sketch vectors.
156
157        Returns:
158            tuple: A tuple containing the estimated frequencies and the hash functions used.
159        """
160        with Progress() as progress:
161            bar = progress.add_task('Update sketch matrix', total=len(privatized_data))
162            
163            for data in privatized_data:
164                self.update_sketch_matrix(data[0],data[1])
165                progress.update(bar, advance=1)
166
167            bar = progress.add_task('Estimate frequencies', total=len(self.domain))
168            F_estimated = {}
169            for x in self.domain:
170                F_estimated[x] = self.estimate_client(x)
171                progress.update(bar, advance=1)
172
173        return F_estimated, self.H

Simulates the server side of the privatized Count-Mean Sketch, processes the privatized data, and estimates frequencies.

Args: privatized_data (list): List of privatized sketch vectors.

Returns: tuple: A tuple containing the estimated frequencies and the hash functions used.

def run_private_cms_client(k, m, e, df):
def run_private_cms_client(k, m, e, df):
    """
    Runs the full private Count-Mean Sketch pipeline and displays the results.

    Args:
        k (int): Number of hash functions.
        m (int): Size of the sketch matrix.
        e (float): Privacy parameter.
        df (DataFrame): Dataset to be processed.

    Returns:
        tuple: The hash functions, the results table, the error table, the privatized data, and the estimated frequency DataFrame.
    """
    # Build the sketch object (note the argument order: epsilon comes first)
    sketch = privateCMSClient(e, k, m, df)

    # Client side: privatize every element of the dataset
    privatized_data = sketch.execute_client()

    # Server side: aggregate the reports and estimate the frequencies
    server_output = sketch.server_simulator(privatized_data)
    f_estimated = server_output[0]
    H = server_output[1]

    # Tabulate the estimates as (Element, Frequency) rows
    rows = list(f_estimated.items())
    df_estimated = pd.DataFrame(rows, columns=['Element', 'Frequency'])

    # Show the results
    tables = display_results(df, f_estimated)
    data_table, error_table = tables

    return H, data_table, error_table, privatized_data, df_estimated

Runs the privatized Count-Min Sketch algorithm and displays the results.

Args: k (int): Number of hash functions. m (int): Size of the sketch matrix. e (float): Privacy parameter. df (DataFrame): Dataset to be processed.

Returns: tuple: A tuple containing the hash functions, the results table, the error table, the privatized data, and the estimated frequency DataFrame.