src.private_count_mean.cms_client_mean

  1import random
  2import numpy as np
  3from sympy import primerange
  4from rich.progress import Progress
  5import pandas as pd
  6
  7from utils.utils import load_dataset, generate_error_table, generate_hash_functions, display_results
  8
  9class CMSClient:
 10    """
 11    A class to represent the Count-Min Sketch (CMS) Client.
 12
 13    Attributes:
 14        df: DataFrame containing the dataset.
 15        k: Number of hash functions used in the CMS.
 16        m: Size of the sketch matrix.
 17        dataset: List of values from the dataset.
 18        domain: Unique values in the dataset.
 19        N: Total number of elements in the dataset.
 20        M: Count-Min Sketch matrix.
 21        H: List of hash functions.
 22    
 23    Methods:
 24        client(d):
 25            Simulates the client side of the CMS, returning a vector with hash values.
 26        update_sketch_matrix(d):
 27            Updates the sketch matrix based on the given element.
 28        estimate_client(d):
 29            Estimates the frequency of an element using the CMS sketch matrix.
 30        server_simulator():
 31            Simulates the server side of the CMS, processes the data, and estimates frequencies.
 32    """
 33
 34    def __init__(self, k, m, df):
 35        """
 36        Initializes the CMSClient with the given parameters.
 37        """
 38        self.df = df
 39        self.k = k 
 40        self.m = m
 41        self.dataset = self.df['value'].tolist()
 42        self.domain = self.df['value'].unique().tolist()
 43        self.N = len(self.dataset)
 44        
 45        # Creation of the sketch matrix
 46        self.M = np.zeros((self.k, self.m))
 47
 48        # Definition of the hash family 3 by 3
 49        primes = list(primerange(10**6, 10**7))
 50        p = primes[random.randint(0, len(primes)-1)]
 51        self.H = generate_hash_functions(self.k,p, 3,self.m)
 52
 53    def client(self, d):
 54        """
 55        Simulates the client side of the Count-Min Sketch.
 56
 57        Args:
 58            d (element): The element for which the sketch vector is generated.
 59
 60        Returns:
 61            tuple: A tuple containing the sketch vector and the index of the chosen hash function.
 62        """
 63        j = random.randint(0, self.k-1)
 64        v = np.full(self.m, -1)
 65        selected_hash = self.H[j]
 66        v[selected_hash(d)] = 1
 67        return v, j
 68   
 69    def update_sketch_matrix(self, d):
 70        """
 71        Updates the sketch matrix based on the given element.
 72
 73        Args:
 74            d (element): The element to be used for updating the sketch matrix.
 75        """
 76        for i in range (self.k):
 77            selected_hash = self.H[i]
 78            hash_index = selected_hash(d)
 79            self.M[i ,hash_index] += 1
 80
 81    def estimate_client(self,d):
 82        """
 83        Estimates the frequency of an element based on the sketch matrix.
 84
 85        Args:
 86            d (element): The element whose frequency is estimated.
 87
 88        Returns:
 89            float: The estimated frequency of the element.
 90        """
 91        mean = 0
 92        for i in range(self.k):
 93            selected_hash = self.H[i]
 94            mean += self.M[i,selected_hash(d)]
 95        return mean/self.k
 96    
 97    def server_simulator(self):
 98        """
 99        Simulates the server side of the CMS by processing the dataset 
100        and estimating the frequencies of each element.
101
102        Returns:
103            dict: A dictionary with the elements and their estimated frequencies.
104        """
105        with Progress() as progress:
106            bar = progress.add_task("[cyan]Processing client data...", total=len(self.dataset))
107
108            for d in self.dataset:
109                self.update_sketch_matrix(d)
110                progress.update(bar, advance=1)
111
112            F_estimated = {}
113            bar = progress.add_task("[cyan]Obtaining histogram of estimated frequencies...", total=len(self.domain))
114            for x in self.domain:
115                F_estimated[x] = self.estimate_client(x)
116                progress.update(bar, advance=1)
117        return F_estimated
118
def run_cms_client_mean(k, m, df):
    """
    Build a CMSClient, run its server simulation and display the results.

    Args:
        k (int): Number of hash functions.
        m (int): Size of the sketch matrix.
        df (DataFrame): Dataset to be processed; its 'value' column is read.

    Returns:
        The first element of the pair returned by display_results
        (presumably a table of elements and estimated frequencies --
        confirm against utils.utils).
    """
    # Initialize the sketch and run the simulated server over the dataset
    sketch = CMSClient(k, m, df)
    f_estimated = sketch.server_simulator()

    # display_results consumes the frequency dict directly, so no
    # intermediate DataFrame has to be built here.
    data_table, _ = display_results(df, f_estimated)

    return data_table
142
143
144
145
146  
class CMSClient:
 10class CMSClient:
 11    """
 12    A class to represent the Count-Min Sketch (CMS) Client.
 13
 14    Attributes:
 15        df: DataFrame containing the dataset.
 16        k: Number of hash functions used in the CMS.
 17        m: Size of the sketch matrix.
 18        dataset: List of values from the dataset.
 19        domain: Unique values in the dataset.
 20        N: Total number of elements in the dataset.
 21        M: Count-Min Sketch matrix.
 22        H: List of hash functions.
 23    
 24    Methods:
 25        client(d):
 26            Simulates the client side of the CMS, returning a vector with hash values.
 27        update_sketch_matrix(d):
 28            Updates the sketch matrix based on the given element.
 29        estimate_client(d):
 30            Estimates the frequency of an element using the CMS sketch matrix.
 31        server_simulator():
 32            Simulates the server side of the CMS, processes the data, and estimates frequencies.
 33    """
 34
 35    def __init__(self, k, m, df):
 36        """
 37        Initializes the CMSClient with the given parameters.
 38        """
 39        self.df = df
 40        self.k = k 
 41        self.m = m
 42        self.dataset = self.df['value'].tolist()
 43        self.domain = self.df['value'].unique().tolist()
 44        self.N = len(self.dataset)
 45        
 46        # Creation of the sketch matrix
 47        self.M = np.zeros((self.k, self.m))
 48
 49        # Definition of the hash family 3 by 3
 50        primes = list(primerange(10**6, 10**7))
 51        p = primes[random.randint(0, len(primes)-1)]
 52        self.H = generate_hash_functions(self.k,p, 3,self.m)
 53
 54    def client(self, d):
 55        """
 56        Simulates the client side of the Count-Min Sketch.
 57
 58        Args:
 59            d (element): The element for which the sketch vector is generated.
 60
 61        Returns:
 62            tuple: A tuple containing the sketch vector and the index of the chosen hash function.
 63        """
 64        j = random.randint(0, self.k-1)
 65        v = np.full(self.m, -1)
 66        selected_hash = self.H[j]
 67        v[selected_hash(d)] = 1
 68        return v, j
 69   
 70    def update_sketch_matrix(self, d):
 71        """
 72        Updates the sketch matrix based on the given element.
 73
 74        Args:
 75            d (element): The element to be used for updating the sketch matrix.
 76        """
 77        for i in range (self.k):
 78            selected_hash = self.H[i]
 79            hash_index = selected_hash(d)
 80            self.M[i ,hash_index] += 1
 81
 82    def estimate_client(self,d):
 83        """
 84        Estimates the frequency of an element based on the sketch matrix.
 85
 86        Args:
 87            d (element): The element whose frequency is estimated.
 88
 89        Returns:
 90            float: The estimated frequency of the element.
 91        """
 92        mean = 0
 93        for i in range(self.k):
 94            selected_hash = self.H[i]
 95            mean += self.M[i,selected_hash(d)]
 96        return mean/self.k
 97    
 98    def server_simulator(self):
 99        """
100        Simulates the server side of the CMS by processing the dataset 
101        and estimating the frequencies of each element.
102
103        Returns:
104            dict: A dictionary with the elements and their estimated frequencies.
105        """
106        with Progress() as progress:
107            bar = progress.add_task("[cyan]Processing client data...", total=len(self.dataset))
108
109            for d in self.dataset:
110                self.update_sketch_matrix(d)
111                progress.update(bar, advance=1)
112
113            F_estimated = {}
114            bar = progress.add_task("[cyan]Obtaining histogram of estimated frequencies...", total=len(self.domain))
115            for x in self.domain:
116                F_estimated[x] = self.estimate_client(x)
117                progress.update(bar, advance=1)
118        return F_estimated

A class to represent the Count-Min Sketch (CMS) Client.

Attributes: df: DataFrame containing the dataset. k: Number of hash functions used in the CMS. m: Size of the sketch matrix. dataset: List of values from the dataset. domain: Unique values in the dataset. N: Total number of elements in the dataset. M: Count-Min Sketch matrix. H: List of hash functions.

Methods: client(d): Simulates the client side of the CMS, returning a tuple with the sketch vector (all -1 except a 1 at the hashed position) and the index of the chosen hash function. update_sketch_matrix(d): Updates the sketch matrix based on the given element. estimate_client(d): Estimates the frequency of an element using the CMS sketch matrix. server_simulator(): Simulates the server side of the CMS, processes the data, and estimates frequencies.

CMSClient(k, m, df)
35    def __init__(self, k, m, df):
36        """
37        Initializes the CMSClient with the given parameters.
38        """
39        self.df = df
40        self.k = k 
41        self.m = m
42        self.dataset = self.df['value'].tolist()
43        self.domain = self.df['value'].unique().tolist()
44        self.N = len(self.dataset)
45        
46        # Creation of the sketch matrix
47        self.M = np.zeros((self.k, self.m))
48
49        # Definition of the hash family 3 by 3
50        primes = list(primerange(10**6, 10**7))
51        p = primes[random.randint(0, len(primes)-1)]
52        self.H = generate_hash_functions(self.k,p, 3,self.m)

Initializes the CMSClient with the given parameters.

df
k
m
dataset
domain
N
M
H
def client(self, d):
54    def client(self, d):
55        """
56        Simulates the client side of the Count-Min Sketch.
57
58        Args:
59            d (element): The element for which the sketch vector is generated.
60
61        Returns:
62            tuple: A tuple containing the sketch vector and the index of the chosen hash function.
63        """
64        j = random.randint(0, self.k-1)
65        v = np.full(self.m, -1)
66        selected_hash = self.H[j]
67        v[selected_hash(d)] = 1
68        return v, j

Simulates the client side of the Count-Min Sketch.

Args: d (element): The element for which the sketch vector is generated.

Returns: tuple: A tuple containing the sketch vector and the index of the chosen hash function.

def update_sketch_matrix(self, d):
70    def update_sketch_matrix(self, d):
71        """
72        Updates the sketch matrix based on the given element.
73
74        Args:
75            d (element): The element to be used for updating the sketch matrix.
76        """
77        for i in range (self.k):
78            selected_hash = self.H[i]
79            hash_index = selected_hash(d)
80            self.M[i ,hash_index] += 1

Updates the sketch matrix based on the given element.

Args: d (element): The element to be used for updating the sketch matrix.

def estimate_client(self, d):
82    def estimate_client(self,d):
83        """
84        Estimates the frequency of an element based on the sketch matrix.
85
86        Args:
87            d (element): The element whose frequency is estimated.
88
89        Returns:
90            float: The estimated frequency of the element.
91        """
92        mean = 0
93        for i in range(self.k):
94            selected_hash = self.H[i]
95            mean += self.M[i,selected_hash(d)]
96        return mean/self.k

Estimates the frequency of an element based on the sketch matrix.

Args: d (element): The element whose frequency is estimated.

Returns: float: The estimated frequency of the element.

def server_simulator(self):
 98    def server_simulator(self):
 99        """
100        Simulates the server side of the CMS by processing the dataset 
101        and estimating the frequencies of each element.
102
103        Returns:
104            dict: A dictionary with the elements and their estimated frequencies.
105        """
106        with Progress() as progress:
107            bar = progress.add_task("[cyan]Processing client data...", total=len(self.dataset))
108
109            for d in self.dataset:
110                self.update_sketch_matrix(d)
111                progress.update(bar, advance=1)
112
113            F_estimated = {}
114            bar = progress.add_task("[cyan]Obtaining histogram of estimated frequencies...", total=len(self.domain))
115            for x in self.domain:
116                F_estimated[x] = self.estimate_client(x)
117                progress.update(bar, advance=1)
118        return F_estimated

Simulates the server side of the CMS by processing the dataset and estimating the frequencies of each element.

Returns: dict: A dictionary with the elements and their estimated frequencies.

def run_cms_client_mean(k, m, df):
def run_cms_client_mean(k, m, df):
    """
    Build a CMSClient, run its server simulation and display the results.

    Args:
        k (int): Number of hash functions.
        m (int): Size of the sketch matrix.
        df (DataFrame): Dataset to be processed; its 'value' column is read.

    Returns:
        The first element of the pair returned by display_results
        (presumably a table of elements and estimated frequencies --
        confirm against utils.utils).
    """
    # Initialize the sketch and run the simulated server over the dataset
    sketch = CMSClient(k, m, df)
    f_estimated = sketch.server_simulator()

    # display_results consumes the frequency dict directly, so no
    # intermediate DataFrame has to be built here.
    data_table, _ = display_results(df, f_estimated)

    return data_table

Runs the Count-Min Sketch algorithm and displays the results.

Args: k (int): Number of hash functions. m (int): Size of the sketch matrix. df (DataFrame): Dataset to be processed.

Returns: DataFrame: A table containing the elements and their estimated frequencies.