src.count_mean.cms_client_mean

  1import random
  2import numpy as np
  3from sympy import primerange
  4from rich.progress import Progress
  5
  6from utils.utils import generate_hash_functions, display_results
  7
  8class CMSClient:
  9    """
 10    A class to represent the Count-Min Sketch (CMS) Client.
 11
 12    Attributes:
 13        df: DataFrame containing the dataset.
 14        k: Number of hash functions used in the CMS.
 15        m: Size of the sketch matrix.
 16        dataset: List of values from the dataset.
 17        domain: Unique values in the dataset.
 18        N: Total number of elements in the dataset.
 19        M: Count-Min Sketch matrix.
 20        H: List of hash functions.
 21    
 22    Methods:
 23        client(d):
 24            Simulates the client side of the CMS, returning a vector with hash values.
 25        update_sketch_matrix(d):
 26            Updates the sketch matrix based on the given element.
 27        estimate_client(d):
 28            Estimates the frequency of an element using the CMS sketch matrix.
 29        server_simulator():
 30            Simulates the server side of the CMS, processes the data, and estimates frequencies.
 31    """
 32
 33    def __init__(self, k, m, df):
 34        """
 35        Initializes the CMSClient with the given parameters.
 36        """
 37        self.df = df
 38        self.k = k 
 39        self.m = m
 40        self.dataset = self.df['value'].tolist()
 41        self.domain = self.df['value'].unique().tolist()
 42        self.N = len(self.dataset)
 43        
 44        # Creation of the sketch matrix
 45        self.M = np.zeros((self.k, self.m))
 46
 47        # Definition of the hash family 3 by 3
 48        primes = list(primerange(10**6, 10**7))
 49        p = primes[random.randint(0, len(primes)-1)]
 50        self.H = generate_hash_functions(self.k,p, 3,self.m)
 51
 52    def client(self, d):
 53        """
 54        Simulates the client side of the Count-Min Sketch.
 55
 56        Args:
 57            d (element): The element for which the sketch vector is generated.
 58
 59        Returns:
 60            tuple: A tuple containing the sketch vector and the index of the chosen hash function.
 61        """
 62        j = random.randint(0, self.k-1)
 63        v = np.full(self.m, -1)
 64        selected_hash = self.H[j]
 65        v[selected_hash(d)] = 1
 66        return v, j
 67   
 68    def update_sketch_matrix(self, d):
 69        """
 70        Updates the sketch matrix based on the given element.
 71
 72        Args:
 73            d (element): The element to be used for updating the sketch matrix.
 74        """
 75        for i in range (self.k):
 76            selected_hash = self.H[i]
 77            hash_index = selected_hash(d)
 78            self.M[i ,hash_index] += 1
 79
 80    def estimate_client(self,d):
 81        """
 82        Estimates the frequency of an element based on the sketch matrix.
 83
 84        Args:
 85            d (element): The element whose frequency is estimated.
 86
 87        Returns:
 88            float: The estimated frequency of the element.
 89        """
 90        mean = 0
 91        for i in range(self.k):
 92            selected_hash = self.H[i]
 93            mean += self.M[i,selected_hash(d)]
 94        return mean/self.k
 95    
 96    def server_simulator(self):
 97        """
 98        Simulates the server side of the CMS by processing the dataset 
 99        and estimating the frequencies of each element.
100
101        Returns:
102            dict: A dictionary with the elements and their estimated frequencies.
103        """
104        with Progress() as progress:
105            bar = progress.add_task("[cyan]Processing client data...", total=len(self.dataset))
106
107            for d in self.dataset:
108                self.update_sketch_matrix(d)
109                progress.update(bar, advance=1)
110
111            F_estimated = {}
112            bar = progress.add_task("[cyan]Obtaining histogram of estimated frequencies...", total=len(self.domain))
113            for x in self.domain:
114                F_estimated[x] = self.estimate_client(x)
115                progress.update(bar, advance=1)
116        return F_estimated
117
def run_cms_client_mean(k, m, df):
    """
    Builds a CMSClient, runs its server simulation and displays the results.

    Args:
        k (int): Number of hash functions.
        m (int): Size of the sketch matrix.
        df (DataFrame): Dataset to be processed.

    Returns:
        DataFrame: A table containing the elements and their estimated frequencies
        (the first item of the pair returned by ``display_results``).
    """
    sketch = CMSClient(k, m, df)
    estimates = sketch.server_simulator()

    # display_results yields a pair; only its first item is handed back.
    table, _unused = display_results(df, estimates)
    return table
140
141
142
143
144  
class CMSClient:
  9class CMSClient:
 10    """
 11    A class to represent the Count-Min Sketch (CMS) Client.
 12
 13    Attributes:
 14        df: DataFrame containing the dataset.
 15        k: Number of hash functions used in the CMS.
 16        m: Size of the sketch matrix.
 17        dataset: List of values from the dataset.
 18        domain: Unique values in the dataset.
 19        N: Total number of elements in the dataset.
 20        M: Count-Min Sketch matrix.
 21        H: List of hash functions.
 22    
 23    Methods:
 24        client(d):
 25            Simulates the client side of the CMS, returning a vector with hash values.
 26        update_sketch_matrix(d):
 27            Updates the sketch matrix based on the given element.
 28        estimate_client(d):
 29            Estimates the frequency of an element using the CMS sketch matrix.
 30        server_simulator():
 31            Simulates the server side of the CMS, processes the data, and estimates frequencies.
 32    """
 33
 34    def __init__(self, k, m, df):
 35        """
 36        Initializes the CMSClient with the given parameters.
 37        """
 38        self.df = df
 39        self.k = k 
 40        self.m = m
 41        self.dataset = self.df['value'].tolist()
 42        self.domain = self.df['value'].unique().tolist()
 43        self.N = len(self.dataset)
 44        
 45        # Creation of the sketch matrix
 46        self.M = np.zeros((self.k, self.m))
 47
 48        # Definition of the hash family 3 by 3
 49        primes = list(primerange(10**6, 10**7))
 50        p = primes[random.randint(0, len(primes)-1)]
 51        self.H = generate_hash_functions(self.k,p, 3,self.m)
 52
 53    def client(self, d):
 54        """
 55        Simulates the client side of the Count-Min Sketch.
 56
 57        Args:
 58            d (element): The element for which the sketch vector is generated.
 59
 60        Returns:
 61            tuple: A tuple containing the sketch vector and the index of the chosen hash function.
 62        """
 63        j = random.randint(0, self.k-1)
 64        v = np.full(self.m, -1)
 65        selected_hash = self.H[j]
 66        v[selected_hash(d)] = 1
 67        return v, j
 68   
 69    def update_sketch_matrix(self, d):
 70        """
 71        Updates the sketch matrix based on the given element.
 72
 73        Args:
 74            d (element): The element to be used for updating the sketch matrix.
 75        """
 76        for i in range (self.k):
 77            selected_hash = self.H[i]
 78            hash_index = selected_hash(d)
 79            self.M[i ,hash_index] += 1
 80
 81    def estimate_client(self,d):
 82        """
 83        Estimates the frequency of an element based on the sketch matrix.
 84
 85        Args:
 86            d (element): The element whose frequency is estimated.
 87
 88        Returns:
 89            float: The estimated frequency of the element.
 90        """
 91        mean = 0
 92        for i in range(self.k):
 93            selected_hash = self.H[i]
 94            mean += self.M[i,selected_hash(d)]
 95        return mean/self.k
 96    
 97    def server_simulator(self):
 98        """
 99        Simulates the server side of the CMS by processing the dataset 
100        and estimating the frequencies of each element.
101
102        Returns:
103            dict: A dictionary with the elements and their estimated frequencies.
104        """
105        with Progress() as progress:
106            bar = progress.add_task("[cyan]Processing client data...", total=len(self.dataset))
107
108            for d in self.dataset:
109                self.update_sketch_matrix(d)
110                progress.update(bar, advance=1)
111
112            F_estimated = {}
113            bar = progress.add_task("[cyan]Obtaining histogram of estimated frequencies...", total=len(self.domain))
114            for x in self.domain:
115                F_estimated[x] = self.estimate_client(x)
116                progress.update(bar, advance=1)
117        return F_estimated

A class to represent the Count-Min Sketch (CMS) Client.

Attributes: df: DataFrame containing the dataset. k: Number of hash functions used in the CMS. m: Size of the sketch matrix. dataset: List of values from the dataset. domain: Unique values in the dataset. N: Total number of elements in the dataset. M: Count-Min Sketch matrix. H: List of hash functions.

Methods: client(d): Simulates the client side of the CMS, returning a vector with hash values. update_sketch_matrix(d): Updates the sketch matrix based on the given element. estimate_client(d): Estimates the frequency of an element using the CMS sketch matrix. server_simulator(): Simulates the server side of the CMS, processes the data, and estimates frequencies.

CMSClient(k, m, df)
34    def __init__(self, k, m, df):
35        """
36        Initializes the CMSClient with the given parameters.
37        """
38        self.df = df
39        self.k = k 
40        self.m = m
41        self.dataset = self.df['value'].tolist()
42        self.domain = self.df['value'].unique().tolist()
43        self.N = len(self.dataset)
44        
45        # Creation of the sketch matrix
46        self.M = np.zeros((self.k, self.m))
47
48        # Definition of the hash family 3 by 3
49        primes = list(primerange(10**6, 10**7))
50        p = primes[random.randint(0, len(primes)-1)]
51        self.H = generate_hash_functions(self.k,p, 3,self.m)

Initializes the CMSClient with the given parameters.

df
k
m
dataset
domain
N
M
H
def client(self, d):
53    def client(self, d):
54        """
55        Simulates the client side of the Count-Min Sketch.
56
57        Args:
58            d (element): The element for which the sketch vector is generated.
59
60        Returns:
61            tuple: A tuple containing the sketch vector and the index of the chosen hash function.
62        """
63        j = random.randint(0, self.k-1)
64        v = np.full(self.m, -1)
65        selected_hash = self.H[j]
66        v[selected_hash(d)] = 1
67        return v, j

Simulates the client side of the Count-Min Sketch.

Args: d (element): The element for which the sketch vector is generated.

Returns: tuple: A tuple containing the sketch vector and the index of the chosen hash function.

def update_sketch_matrix(self, d):
69    def update_sketch_matrix(self, d):
70        """
71        Updates the sketch matrix based on the given element.
72
73        Args:
74            d (element): The element to be used for updating the sketch matrix.
75        """
76        for i in range (self.k):
77            selected_hash = self.H[i]
78            hash_index = selected_hash(d)
79            self.M[i ,hash_index] += 1

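Updates the sketch matrix based on the given element: for each of the k hash functions, the counter in that function's row at the column the element hashes to is incremented by one.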

Args: d (element): The element to be used for updating the sketch matrix.

def estimate_client(self, d):
81    def estimate_client(self,d):
82        """
83        Estimates the frequency of an element based on the sketch matrix.
84
85        Args:
86            d (element): The element whose frequency is estimated.
87
88        Returns:
89            float: The estimated frequency of the element.
90        """
91        mean = 0
92        for i in range(self.k):
93            selected_hash = self.H[i]
94            mean += self.M[i,selected_hash(d)]
95        return mean/self.k

Estimates the frequency of an element based on the sketch matrix.

Args: d (element): The element whose frequency is estimated.

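Returns: float: The estimated frequency of the element, computed as the mean of the k sketch-matrix counters (one per hash function) that the element hashes to.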

def server_simulator(self):
 97    def server_simulator(self):
 98        """
 99        Simulates the server side of the CMS by processing the dataset 
100        and estimating the frequencies of each element.
101
102        Returns:
103            dict: A dictionary with the elements and their estimated frequencies.
104        """
105        with Progress() as progress:
106            bar = progress.add_task("[cyan]Processing client data...", total=len(self.dataset))
107
108            for d in self.dataset:
109                self.update_sketch_matrix(d)
110                progress.update(bar, advance=1)
111
112            F_estimated = {}
113            bar = progress.add_task("[cyan]Obtaining histogram of estimated frequencies...", total=len(self.domain))
114            for x in self.domain:
115                F_estimated[x] = self.estimate_client(x)
116                progress.update(bar, advance=1)
117        return F_estimated

Simulates the server side of the CMS by processing the dataset and estimating the frequencies of each element.

Returns: dict: A dictionary with the elements and their estimated frequencies.

def run_cms_client_mean(k, m, df):
def run_cms_client_mean(k, m, df):
    """
    Builds a CMSClient, runs its server simulation and displays the results.

    Args:
        k (int): Number of hash functions.
        m (int): Size of the sketch matrix.
        df (DataFrame): Dataset to be processed.

    Returns:
        DataFrame: A table containing the elements and their estimated frequencies
        (the first item of the pair returned by ``display_results``).
    """
    sketch = CMSClient(k, m, df)
    estimates = sketch.server_simulator()

    # display_results yields a pair; only its first item is handed back.
    table, _unused = display_results(df, estimates)
    return table

Runs the Count-Min Sketch algorithm and displays the results.

Args: k (int): Number of hash functions. m (int): Size of the sketch matrix. df (DataFrame): Dataset to be processed.

Returns: DataFrame: A table containing the elements and their estimated frequencies.