src.private_count_mean.cms_client_mean
import random
import numpy as np
from sympy import primerange
from rich.progress import Progress
import pandas as pd

from utils.utils import load_dataset, generate_error_table, generate_hash_functions, display_results


class CMSClient:
    """
    Client-side simulation of a Count-Mean Sketch (CMS).

    Each dataset value is passed through every one of the k hash functions and
    the matching counter of a k x m matrix is incremented. A frequency
    estimate is the average of the k counters an element hashes to.

    Attributes:
        df: DataFrame holding the dataset; its 'value' column is read.
        k: Number of hash functions (rows of the sketch matrix).
        m: Number of columns of the sketch matrix.
        dataset: List with every entry of the 'value' column.
        domain: List with the distinct entries of the 'value' column.
        N: Number of entries in dataset.
        M: Sketch matrix of shape (k, m), initialised with zeros.
        H: Hash functions returned by generate_hash_functions.

    Methods:
        client(d):
            Builds a -1/+1 vector for d using one randomly chosen hash function.
        update_sketch_matrix(d):
            Increments one counter per row for the element d.
        estimate_client(d):
            Averages the k counters that d hashes to.
        server_simulator():
            Feeds the whole dataset into the sketch and estimates every
            element of the domain.
    """

    def __init__(self, k, m, df):
        """
        Store the parameters, allocate the sketch and draw the hash family.

        Args:
            k (int): Number of hash functions.
            m (int): Number of columns of the sketch matrix.
            df (DataFrame): Dataset; must provide a 'value' column.
        """
        values = df['value']
        self.df = df
        self.k = k
        self.m = m
        self.dataset = values.tolist()
        self.domain = values.unique().tolist()
        self.N = len(self.dataset)

        # One row of m counters per hash function.
        self.M = np.zeros((k, m))

        # The modulus handed to the hash family is a prime drawn uniformly
        # from [10**6, 10**7).
        # NOTE(review): the literal 3 is presumably the independence degree of
        # the family ("3 by 3" in the original comment) - confirm in utils.
        candidate_primes = list(primerange(10**6, 10**7))
        prime = random.choice(candidate_primes)
        self.H = generate_hash_functions(k, prime, 3, m)

    def client(self, d):
        """
        Build the vector a single client would produce for an element.

        Args:
            d (element): The element to encode.

        Returns:
            tuple: (v, j) where j is the randomly chosen hash-function index
            and v is a length-m array of -1 with a 1 at position H[j](d).
        """
        row = random.randint(0, self.k - 1)
        vector = np.full(self.m, -1)
        vector[self.H[row](d)] = 1
        return vector, row

    def update_sketch_matrix(self, d):
        """
        Add one occurrence of an element to the sketch.

        Args:
            d (element): The element being counted.
        """
        for row in range(self.k):
            column = self.H[row](d)
            self.M[row, column] += 1

    def estimate_client(self, d):
        """
        Estimate the frequency of an element from the sketch matrix.

        Args:
            d (element): The element whose frequency is estimated.

        Returns:
            float: Mean of the k counters M[i, H[i](d)].
        """
        counters = (self.M[row, self.H[row](d)] for row in range(self.k))
        return sum(counters) / self.k

    def server_simulator(self):
        """
        Process the whole dataset and estimate every element of the domain.

        Returns:
            dict: Maps each element of self.domain to its estimated frequency.
        """
        with Progress() as progress:
            ingest_task = progress.add_task("[cyan]Processing client data...", total=len(self.dataset))
            for item in self.dataset:
                self.update_sketch_matrix(item)
                progress.update(ingest_task, advance=1)

            F_estimated = {}
            estimate_task = progress.add_task("[cyan]Obtaining histogram of estimated frequencies...", total=len(self.domain))
            for element in self.domain:
                F_estimated[element] = self.estimate_client(element)
                progress.update(estimate_task, advance=1)
        return F_estimated


def run_cms_client_mean(k, m, df):
    """
    Run the CMS client simulation on a dataset and display the results.

    Args:
        k (int): Number of hash functions.
        m (int): Number of columns of the sketch matrix.
        df (DataFrame): Dataset to be processed; its 'value' column is read.

    Returns:
        The first element of the pair returned by
        display_results(df, f_estimated).
    """
    sketch_client = CMSClient(k, m, df)

    # Feed the dataset into the sketch and estimate every domain element.
    f_estimated = sketch_client.server_simulator()

    # display_results returns a pair; only its first element is handed back.
    data_table, _ = display_results(df, f_estimated)
    return data_table
class CMSClient:
    """
    Client-side simulation of a CMS sketch whose estimates are counter means.

    Each dataset value is passed through every one of the k hash functions and
    the matching counter of a k x m matrix is incremented. A frequency
    estimate is the average of the k counters an element hashes to.

    Attributes:
        df: DataFrame holding the dataset; its 'value' column is read.
        k: Number of hash functions (rows of the sketch matrix).
        m: Number of columns of the sketch matrix.
        dataset: List with every entry of the 'value' column.
        domain: List with the distinct entries of the 'value' column.
        N: Number of entries in dataset.
        M: Sketch matrix of shape (k, m), initialised with zeros.
        H: Hash functions returned by generate_hash_functions.

    Methods:
        client(d):
            Builds a -1/+1 vector for d using one randomly chosen hash function.
        update_sketch_matrix(d):
            Increments one counter per row for the element d.
        estimate_client(d):
            Averages the k counters that d hashes to.
        server_simulator():
            Feeds the whole dataset into the sketch and estimates every
            element of the domain.
    """

    def __init__(self, k, m, df):
        """
        Store the parameters, allocate the sketch and draw the hash family.

        Args:
            k (int): Number of hash functions.
            m (int): Number of columns of the sketch matrix.
            df (DataFrame): Dataset; must provide a 'value' column.
        """
        values = df['value']
        self.df = df
        self.k = k
        self.m = m
        self.dataset = values.tolist()
        self.domain = values.unique().tolist()
        self.N = len(self.dataset)

        # One row of m counters per hash function.
        self.M = np.zeros((k, m))

        # The modulus handed to the hash family is a prime drawn uniformly
        # from [10**6, 10**7).
        # NOTE(review): the literal 3 is presumably the independence degree of
        # the family ("3 by 3" in the original comment) - confirm in utils.
        candidate_primes = list(primerange(10**6, 10**7))
        prime = random.choice(candidate_primes)
        self.H = generate_hash_functions(k, prime, 3, m)

    def client(self, d):
        """
        Build the vector a single client would produce for an element.

        Args:
            d (element): The element to encode.

        Returns:
            tuple: (v, j) where j is the randomly chosen hash-function index
            and v is a length-m array of -1 with a 1 at position H[j](d).
        """
        row = random.randint(0, self.k - 1)
        vector = np.full(self.m, -1)
        vector[self.H[row](d)] = 1
        return vector, row

    def update_sketch_matrix(self, d):
        """
        Add one occurrence of an element to the sketch.

        Args:
            d (element): The element being counted.
        """
        for row in range(self.k):
            column = self.H[row](d)
            self.M[row, column] += 1

    def estimate_client(self, d):
        """
        Estimate the frequency of an element from the sketch matrix.

        Args:
            d (element): The element whose frequency is estimated.

        Returns:
            float: Mean of the k counters M[i, H[i](d)].
        """
        counters = (self.M[row, self.H[row](d)] for row in range(self.k))
        return sum(counters) / self.k

    def server_simulator(self):
        """
        Process the whole dataset and estimate every element of the domain.

        Returns:
            dict: Maps each element of self.domain to its estimated frequency.
        """
        with Progress() as progress:
            ingest_task = progress.add_task("[cyan]Processing client data...", total=len(self.dataset))
            for item in self.dataset:
                self.update_sketch_matrix(item)
                progress.update(ingest_task, advance=1)

            F_estimated = {}
            estimate_task = progress.add_task("[cyan]Obtaining histogram of estimated frequencies...", total=len(self.domain))
            for element in self.domain:
                F_estimated[element] = self.estimate_client(element)
                progress.update(estimate_task, advance=1)
        return F_estimated
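A class representing the client of a Count-Mean Sketch (CMS): counts are kept in a k × m sketch matrix, and an element's frequency is estimated as the mean of the k counters it hashes to.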
Attributes: df: DataFrame containing the dataset. k: Number of hash functions used in the CMS. m: Size of the sketch matrix. dataset: List of values from the dataset. domain: Unique values in the dataset. N: Total number of elements in the dataset. M: Count-Min Sketch matrix. H: List of hash functions.
Methods: client(d): Simulates the client side of the CMS, returning a vector with hash values. update_sketch_matrix(d): Updates the sketch matrix based on the given element. estimate_client(d): Estimates the frequency of an element using the CMS sketch matrix. server_simulator(): Simulates the server side of the CMS, processes the data, and estimates frequencies.
def __init__(self, k, m, df):
    """
    Store the parameters, allocate the sketch and draw the hash family.

    Args:
        k (int): Number of hash functions.
        m (int): Number of columns of the sketch matrix.
        df (DataFrame): Dataset; must provide a 'value' column.
    """
    values = df['value']
    self.df = df
    self.k = k
    self.m = m
    self.dataset = values.tolist()
    self.domain = values.unique().tolist()
    self.N = len(self.dataset)

    # One row of m counters per hash function.
    self.M = np.zeros((k, m))

    # The modulus handed to the hash family is a prime drawn uniformly from
    # [10**6, 10**7).
    # NOTE(review): the literal 3 is presumably the independence degree of the
    # family ("3 by 3" in the original comment) - confirm in utils.
    candidate_primes = list(primerange(10**6, 10**7))
    prime = random.choice(candidate_primes)
    self.H = generate_hash_functions(k, prime, 3, m)
Initializes the CMSClient with the given parameters.
def client(self, d):
    """
    Build the vector a single client would produce for an element.

    Args:
        d (element): The element to encode.

    Returns:
        tuple: (v, j) where j is the randomly chosen hash-function index and
        v is a length-m array of -1 with a 1 at position H[j](d).
    """
    row = random.randint(0, self.k - 1)
    vector = np.full(self.m, -1)
    vector[self.H[row](d)] = 1
    return vector, row
Simulates the client side of the Count-Min Sketch.
Args: d (element): The element for which the sketch vector is generated.
Returns: tuple: A tuple containing the sketch vector and the index of the chosen hash function.
def update_sketch_matrix(self, d):
    """
    Add one occurrence of an element to the sketch.

    For every row i of the matrix, the counter at column H[i](d) is
    incremented by one.

    Args:
        d (element): The element being counted.
    """
    for row in range(self.k):
        column = self.H[row](d)
        self.M[row, column] += 1
Updates the sketch matrix based on the given element.
Args: d (element): The element to be used for updating the sketch matrix.
def estimate_client(self, d):
    """
    Estimate the frequency of an element from the sketch matrix.

    Args:
        d (element): The element whose frequency is estimated.

    Returns:
        float: Mean of the k counters M[i, H[i](d)].
    """
    counters = (self.M[row, self.H[row](d)] for row in range(self.k))
    return sum(counters) / self.k
Estimates the frequency of an element as the mean of the k sketch-matrix counters that the element hashes to (one counter per hash function).
Args: d (element): The element whose frequency is estimated.
Returns: float: The estimated frequency of the element.
def server_simulator(self):
    """
    Process the whole dataset and estimate every element of the domain.

    Every entry of self.dataset is first added to the sketch matrix; then
    each element of self.domain is estimated. Both passes report progress
    through a rich Progress display.

    Returns:
        dict: Maps each element of self.domain to its estimated frequency.
    """
    with Progress() as progress:
        ingest_task = progress.add_task("[cyan]Processing client data...", total=len(self.dataset))
        for item in self.dataset:
            self.update_sketch_matrix(item)
            progress.update(ingest_task, advance=1)

        F_estimated = {}
        estimate_task = progress.add_task("[cyan]Obtaining histogram of estimated frequencies...", total=len(self.domain))
        for element in self.domain:
            F_estimated[element] = self.estimate_client(element)
            progress.update(estimate_task, advance=1)
    return F_estimated
Simulates the server side of the CMS by processing the dataset and estimating the frequencies of each element.
Returns: dict: A dictionary with the elements and their estimated frequencies.
def run_cms_client_mean(k, m, df):
    """
    Run the CMS client simulation on a dataset and display the results.

    Args:
        k (int): Number of hash functions.
        m (int): Number of columns of the sketch matrix.
        df (DataFrame): Dataset to be processed; CMSClient reads its
            'value' column.

    Returns:
        The first element of the pair returned by
        display_results(df, f_estimated).
    """
    sketch_client = CMSClient(k, m, df)

    # Feed the dataset into the sketch and estimate every domain element.
    f_estimated = sketch_client.server_simulator()

    # The original also built a DataFrame from f_estimated that was never
    # read; that dead local has been dropped.
    # display_results returns a pair; only its first element is handed back.
    data_table, _ = display_results(df, f_estimated)
    return data_table
Runs the Count-Mean Sketch client simulation on the dataset and displays the results.
Args: k (int): Number of hash functions. m (int): Size of the sketch matrix. df (DataFrame): Dataset to be processed.
Returns: DataFrame: A table containing the elements and their estimated frequencies.