src.count_mean.cms_client_mean
import random

import numpy as np
from sympy import primerange
from rich.progress import Progress

from utils.utils import generate_hash_functions, display_results


class CMSClient:
    """Count-Min Sketch (CMS) client that estimates frequencies by averaging.

    A ``k`` x ``m`` counter matrix is filled from the ``'value'`` column of a
    DataFrame; the estimate for an element is the mean of the ``k`` counters
    addressed by its hash functions.

    Attributes:
        df: DataFrame holding the dataset (read through its ``'value'`` column).
        k: Number of hash functions, i.e. rows of the sketch matrix.
        m: Number of columns of the sketch matrix.
        dataset: All entries of ``df['value']`` as a list.
        domain: Distinct entries of ``df['value']`` as a list.
        N: Length of ``dataset``.
        M: Sketch matrix of shape ``(k, m)``, initialised to zeros.
        H: Hash functions returned by ``generate_hash_functions``.
    """

    def __init__(self, k, m, df):
        """Store the parameters, allocate the sketch and build the hash family.

        Args:
            k: Number of hash functions (rows of the sketch matrix).
            m: Number of columns of the sketch matrix.
            df: DataFrame with a ``'value'`` column.
        """
        self.df = df
        self.k, self.m = k, m

        values = df['value']
        self.dataset = values.tolist()
        self.domain = values.unique().tolist()
        self.N = len(self.dataset)

        # One row of counters per hash function.
        self.M = np.zeros((k, m))

        # Pick a random prime in [10**6, 10**7) as the modulus of the family.
        # NOTE(review): the literal 3 is presumably the independence degree of
        # the family ("3 by 3" in the original comment) - confirm against
        # utils.utils.generate_hash_functions.
        candidate_primes = list(primerange(10**6, 10**7))
        prime_index = random.randint(0, len(candidate_primes) - 1)
        self.H = generate_hash_functions(k, candidate_primes[prime_index], 3, m)

    def client(self, d):
        """Build the client-side report for one element.

        Args:
            d: Element to encode.

        Returns:
            tuple: ``(v, j)`` where ``j`` is the index of a randomly chosen
            hash function and ``v`` is a length-``m`` vector filled with -1
            except for a 1 at position ``H[j](d)``.
        """
        row = random.randint(0, self.k - 1)
        sketch_vector = np.full(self.m, -1)
        bucket = self.H[row](d)
        sketch_vector[bucket] = 1
        return sketch_vector, row

    def update_sketch_matrix(self, d):
        """Add one occurrence of ``d`` to every row of the sketch matrix.

        Args:
            d: Element being counted.
        """
        sketch, hashes = self.M, self.H
        for row in range(self.k):
            column = hashes[row](d)
            sketch[row, column] += 1

    def estimate_client(self, d):
        """Estimate the frequency of ``d`` from the sketch matrix.

        Args:
            d: Element whose frequency is requested.

        Returns:
            float: Mean over the ``k`` rows of the counter addressed by ``d``.
        """
        total = sum(self.M[row, self.H[row](d)] for row in range(self.k))
        return total / self.k

    def server_simulator(self):
        """Feed the whole dataset into the sketch and estimate every element.

        Progress of both phases is reported through a ``rich`` progress bar.

        Returns:
            dict: Maps each element of ``domain`` to its estimated frequency.
        """
        estimates = {}
        with Progress() as progress:
            ingest_task = progress.add_task(
                "[cyan]Processing client data...", total=len(self.dataset)
            )
            for item in self.dataset:
                self.update_sketch_matrix(item)
                progress.update(ingest_task, advance=1)

            estimate_task = progress.add_task(
                "[cyan]Obtaining histogram of estimated frequencies...",
                total=len(self.domain),
            )
            for element in self.domain:
                estimates[element] = self.estimate_client(element)
                progress.update(estimate_task, advance=1)
        return estimates


def run_cms_client_mean(k, m, df):
    """Run the sketch over ``df`` and return the results table.

    Args:
        k (int): Number of hash functions.
        m (int): Number of columns of the sketch matrix.
        df (DataFrame): Dataset to process.

    Returns:
        The first item of the pair returned by ``display_results``.
    """
    estimates = CMSClient(k, m, df).server_simulator()

    # display_results yields a pair; only its first item is handed back.
    results_table, _unused = display_results(df, estimates)
    return results_table
class CMSClient:
    """Count-Min Sketch (CMS) client that estimates frequencies by averaging.

    A ``k`` x ``m`` counter matrix is filled from the ``'value'`` column of a
    DataFrame; the estimate for an element is the mean of the ``k`` counters
    addressed by its hash functions.

    Attributes:
        df: DataFrame holding the dataset (read through its ``'value'`` column).
        k: Number of hash functions, i.e. rows of the sketch matrix.
        m: Number of columns of the sketch matrix.
        dataset: All entries of ``df['value']`` as a list.
        domain: Distinct entries of ``df['value']`` as a list.
        N: Length of ``dataset``.
        M: Sketch matrix of shape ``(k, m)``, initialised to zeros.
        H: Hash functions returned by ``generate_hash_functions``.
    """

    def __init__(self, k, m, df):
        """Store the parameters, allocate the sketch and build the hash family.

        Args:
            k: Number of hash functions (rows of the sketch matrix).
            m: Number of columns of the sketch matrix.
            df: DataFrame with a ``'value'`` column.
        """
        self.df = df
        self.k, self.m = k, m

        values = df['value']
        self.dataset = values.tolist()
        self.domain = values.unique().tolist()
        self.N = len(self.dataset)

        # One row of counters per hash function.
        self.M = np.zeros((k, m))

        # Pick a random prime in [10**6, 10**7) as the modulus of the family.
        # NOTE(review): the literal 3 is presumably the independence degree of
        # the family ("3 by 3" in the original comment) - confirm against
        # utils.utils.generate_hash_functions.
        candidate_primes = list(primerange(10**6, 10**7))
        prime_index = random.randint(0, len(candidate_primes) - 1)
        self.H = generate_hash_functions(k, candidate_primes[prime_index], 3, m)

    def client(self, d):
        """Build the client-side report for one element.

        Args:
            d: Element to encode.

        Returns:
            tuple: ``(v, j)`` where ``j`` is the index of a randomly chosen
            hash function and ``v`` is a length-``m`` vector filled with -1
            except for a 1 at position ``H[j](d)``.
        """
        row = random.randint(0, self.k - 1)
        sketch_vector = np.full(self.m, -1)
        bucket = self.H[row](d)
        sketch_vector[bucket] = 1
        return sketch_vector, row

    def update_sketch_matrix(self, d):
        """Add one occurrence of ``d`` to every row of the sketch matrix.

        Args:
            d: Element being counted.
        """
        sketch, hashes = self.M, self.H
        for row in range(self.k):
            column = hashes[row](d)
            sketch[row, column] += 1

    def estimate_client(self, d):
        """Estimate the frequency of ``d`` from the sketch matrix.

        Args:
            d: Element whose frequency is requested.

        Returns:
            float: Mean over the ``k`` rows of the counter addressed by ``d``.
        """
        total = sum(self.M[row, self.H[row](d)] for row in range(self.k))
        return total / self.k

    def server_simulator(self):
        """Feed the whole dataset into the sketch and estimate every element.

        Progress of both phases is reported through a ``rich`` progress bar.

        Returns:
            dict: Maps each element of ``domain`` to its estimated frequency.
        """
        estimates = {}
        with Progress() as progress:
            ingest_task = progress.add_task(
                "[cyan]Processing client data...", total=len(self.dataset)
            )
            for item in self.dataset:
                self.update_sketch_matrix(item)
                progress.update(ingest_task, advance=1)

            estimate_task = progress.add_task(
                "[cyan]Obtaining histogram of estimated frequencies...",
                total=len(self.domain),
            )
            for element in self.domain:
                estimates[element] = self.estimate_client(element)
                progress.update(estimate_task, advance=1)
        return estimates
A class to represent the Count-Min Sketch (CMS) Client.
Attributes:
    df: DataFrame containing the dataset.
    k: Number of hash functions used in the CMS (rows of the sketch matrix).
    m: Number of columns of the sketch matrix.
    dataset: List of values from the dataset.
    domain: Unique values in the dataset.
    N: Total number of elements in the dataset.
    M: Count-Min Sketch matrix of shape (k, m).
    H: List of hash functions.
Methods:
    client(d): Simulates the client side of the CMS, returning a vector with hash values.
    update_sketch_matrix(d): Updates the sketch matrix based on the given element.
    estimate_client(d): Estimates the frequency of an element using the CMS sketch matrix.
    server_simulator(): Simulates the server side of the CMS, processes the data, and estimates frequencies.
def __init__(self, k, m, df):
    """Store the parameters, allocate the sketch and build the hash family.

    Args:
        k: Number of hash functions (rows of the sketch matrix).
        m: Number of columns of the sketch matrix.
        df: DataFrame with a ``'value'`` column.
    """
    self.df = df
    self.k, self.m = k, m

    values = df['value']
    self.dataset = values.tolist()
    self.domain = values.unique().tolist()
    self.N = len(self.dataset)

    # One row of counters per hash function.
    self.M = np.zeros((k, m))

    # Pick a random prime in [10**6, 10**7) as the modulus of the family.
    # NOTE(review): the literal 3 is presumably the independence degree of
    # the family ("3 by 3" in the original comment) - confirm against
    # utils.utils.generate_hash_functions.
    candidate_primes = list(primerange(10**6, 10**7))
    prime_index = random.randint(0, len(candidate_primes) - 1)
    self.H = generate_hash_functions(k, candidate_primes[prime_index], 3, m)
Initializes the CMSClient with the given parameters.
53 def client(self, d): 54 """ 55 Simulates the client side of the Count-Min Sketch. 56 57 Args: 58 d (element): The element for which the sketch vector is generated. 59 60 Returns: 61 tuple: A tuple containing the sketch vector and the index of the chosen hash function. 62 """ 63 j = random.randint(0, self.k-1) 64 v = np.full(self.m, -1) 65 selected_hash = self.H[j] 66 v[selected_hash(d)] = 1 67 return v, j
Simulates the client side of the Count-Min Sketch.
Args: d (element): The element for which the sketch vector is generated.
Returns: tuple: A tuple containing the sketch vector and the index of the chosen hash function.
69 def update_sketch_matrix(self, d): 70 """ 71 Updates the sketch matrix based on the given element. 72 73 Args: 74 d (element): The element to be used for updating the sketch matrix. 75 """ 76 for i in range (self.k): 77 selected_hash = self.H[i] 78 hash_index = selected_hash(d) 79 self.M[i ,hash_index] += 1
Updates the sketch matrix based on the given element.
Args: d (element): The element to be used for updating the sketch matrix.
81 def estimate_client(self,d): 82 """ 83 Estimates the frequency of an element based on the sketch matrix. 84 85 Args: 86 d (element): The element whose frequency is estimated. 87 88 Returns: 89 float: The estimated frequency of the element. 90 """ 91 mean = 0 92 for i in range(self.k): 93 selected_hash = self.H[i] 94 mean += self.M[i,selected_hash(d)] 95 return mean/self.k
Estimates the frequency of an element based on the sketch matrix.
Args: d (element): The element whose frequency is estimated.
Returns: float: The estimated frequency of the element.
def server_simulator(self):
    """Feed the whole dataset into the sketch and estimate every element.

    Every entry of ``dataset`` is first added to the sketch matrix, then an
    estimate is computed for each element of ``domain``. Progress of both
    phases is reported through a ``rich`` progress bar.

    Returns:
        dict: Maps each element of ``domain`` to its estimated frequency.
    """
    estimates = {}
    with Progress() as progress:
        ingest_task = progress.add_task(
            "[cyan]Processing client data...", total=len(self.dataset)
        )
        for item in self.dataset:
            self.update_sketch_matrix(item)
            progress.update(ingest_task, advance=1)

        # The second task is only registered once ingestion has finished.
        estimate_task = progress.add_task(
            "[cyan]Obtaining histogram of estimated frequencies...",
            total=len(self.domain),
        )
        for element in self.domain:
            estimates[element] = self.estimate_client(element)
            progress.update(estimate_task, advance=1)
    return estimates
Simulates the server side of the CMS by processing the dataset and estimating the frequencies of each element.
Returns: dict: A dictionary with the elements and their estimated frequencies.
def run_cms_client_mean(k, m, df):
    """Run the sketch over ``df`` and return the results table.

    Args:
        k (int): Number of hash functions.
        m (int): Number of columns of the sketch matrix.
        df (DataFrame): Dataset to process.

    Returns:
        The first item of the pair returned by ``display_results``.
    """
    # Build the client and run the simulated server pass in one go.
    estimates = CMSClient(k, m, df).server_simulator()

    # display_results yields a pair; only its first item is handed back.
    results_table, _unused = display_results(df, estimates)
    return results_table
Runs the Count-Min Sketch algorithm and displays the results.
Args:
    k (int): Number of hash functions.
    m (int): Number of columns of the sketch matrix.
    df (DataFrame): Dataset to be processed.
Returns: DataFrame: A table containing the elements and their estimated frequencies.