src.utils.utils

import numpy as np
import pandas as pd
import random
import os

def create_dataset(N: int, dist_type: str) -> tuple[list, pd.DataFrame, list]:
    """
    Creates a dataset for frequency estimation.

    Args:
        N (int): Number of elements in the dataset.
        dist_type (str): Distribution type ['exp' (exponential), 'norm' (normal), 'small' (values within a reduced domain)].

    Returns:
        values (list): Generated dataset in list format.
        df (DataFrame): Generated dataset in Pandas DataFrame format.
        unique_values (list): Unique values (domain) in the dataset.

    Examples:
        >>> create_dataset(10**6, 'exp')
        >>> create_dataset(1000, 'small')
    """
    if dist_type == 'exp':
        values = np.random.exponential(scale=2.0, size=N).astype(int).tolist()
    elif dist_type == 'norm':
        values = np.random.normal(loc=12, scale=2, size=N).astype(int).tolist()
    elif dist_type == 'small':
        elements = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
        frequencies = [0.29, 0.19, 0.15, 0.12, 0.1, 0.08, 0.05, 0.02]
        values = np.random.choice(elements, size=N, p=frequencies).tolist()
        np.random.shuffle(values)
    else:
        raise ValueError(f"Unknown dist_type: {dist_type!r}")

    df = pd.DataFrame({'value': values})
    unique_values = df['value'].unique().tolist()
    unique_values.sort()
    return values, df, unique_values

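A minimal usage sketch, assuming the module is importable as src.utils.utils:

from src.utils.utils import create_dataset

# Draw 1,000 items from the skewed eight-letter domain.
values, df, unique_values = create_dataset(1000, 'small')
print(len(values))                                        # 1000
print(unique_values)                                      # ['A', 'B', ..., 'H']
print(df['value'].value_counts(normalize=True).round(2))  # roughly the target frequencies
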
def load_dataset(csv_filename):
    """
    Loads a dataset from a CSV file and returns the values, the DataFrame, and unique 'value' entries.

    Args:
        csv_filename (str): Name of the CSV file (without the '.csv' extension) located in the 'data/filtered' folder.

    Returns:
        values (list): Dataset in list format.
        df (DataFrame): Dataset in Pandas DataFrame format.
        unique_values (list): Unique values (domain) of the dataset.
    """
    dataset_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'data/filtered', csv_filename + '.csv'))
    df = pd.read_csv(dataset_path)
    df = df[['value']]
    values = df['value'].tolist()
    unique_values = df['value'].unique().tolist()
    return values, df, unique_values

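For example, a file at data/filtered/my_dataset.csv ('my_dataset' is a hypothetical name; any CSV with a 'value' column works) would be loaded as:

from src.utils.utils import load_dataset

# 'my_dataset' is a placeholder; the function appends '.csv' itself.
values, df, unique_values = load_dataset('my_dataset')
print(df.shape, len(unique_values))
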
def generate_hash_functions(k, p, c, m):
    """
    Generates a set of k c-independent hash functions (D -> [0, m)).

    Args:
        k (int): Number of hash functions.
        p (int): Large prime number for hash function construction.
        c (int): Number of coefficients for c-independent hash functions.
        m (int): Size of the range the hash functions map into.

    Returns:
        hash_functions (list): Set of k hash functions.
    """
    hash_functions = []
    for _ in range(k):
        coefficients = [random.randint(1, p - 1) for _ in range(c)]
        # Degree-(c-1) polynomial in hash(x), evaluated mod p, then reduced mod m.
        hash_func = lambda x, coeffs=coefficients, p=p, c=c, m=m: (sum((coeffs[i] * pow(hash(x), i, p)) % p for i in range(c)) % p) % m
        hash_functions.append(hash_func)
    return hash_functions

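A sketch of typical usage, with p = 2**31 - 1 (a Mersenne prime) and the other parameters chosen purely for illustration:

from src.utils.utils import generate_hash_functions

p = 2**31 - 1        # illustrative large prime
k, c, m = 4, 2, 64   # 4 pairwise-independent (c=2) functions over 64 buckets
hash_functions = generate_hash_functions(k, p, c, m)

# Each function maps an item to a bucket in [0, m); note that Python's built-in
# hash() is salted per process for strings, so string buckets vary between runs.
for h in hash_functions:
    print(h('A'))
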
def generate_hash_function_G(k, p):
    """
    Generates a set of k sign hash functions (D -> {-1, +1}), each built from a
    random degree-3 polynomial evaluated mod p.

    Args:
        k (int): Number of hash functions.
        p (int): Large prime number for hash function construction.

    Returns:
        hash_functions (list): Set of k sign hash functions.
    """
    hash_functions = []
    for _ in range(k):
        a = random.randint(1, p - 1)
        b = random.randint(0, p - 1)
        c = random.randint(1, p - 1)
        d = random.randint(0, p - 1)

        def hash_func(x, a=a, b=b, c=c, d=d, p=p):
            # Items labelled "AOI <n>" are reduced to their numeric id before hashing.
            if isinstance(x, str) and x.startswith("AOI "):
                x = int(x.split()[1])
            x_mod = x % p
            h = (a + b * x_mod + c * pow(x_mod, 2, p) + d * pow(x_mod, 3, p)) % p
            # Map the parity of the polynomial value to a sign.
            return 1 if (h % 2) == 0 else -1

        hash_functions.append(hash_func)
    return hash_functions

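These ±1 hashes are the kind a Count Sketch uses to decide the sign of each update; a small sketch of their behaviour, with an illustrative prime:

from src.utils.utils import generate_hash_function_G

g_functions = generate_hash_function_G(k=3, p=2**31 - 1)

# Each g maps an integer (or an "AOI <n>" string) to +1 or -1.
for g in g_functions:
    print([g(x) for x in range(6)])   # e.g. [1, -1, -1, 1, 1, -1]
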
def generate_error_table(real_freq: pd.DataFrame, estimated_freq: dict):
    """
    Computes the per-item percentage error between real and estimated frequencies.

    Args:
        real_freq (DataFrame): Dataset with a 'value' column holding the real items.
        estimated_freq (dict): Estimated count per item.

    Returns:
        error_data (list): One dict per item with its percentage error.
    """
    f = real_freq['value'].value_counts()
    real_num_freq = f.sort_index().to_dict()

    error_data = []
    for element in real_num_freq:
        real_count = real_num_freq[element]
        estimated_count = estimated_freq.get(element, 0)
        if real_count > 0:
            percent_error = abs(real_count - estimated_count) / real_count * 100
        else:
            percent_error = 0.0
        error_data.append({
            "Item": element,
            "Percentage Error": f"{percent_error:.2f}%"
        })
    return error_data

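A usage sketch with hand-made estimates (a real run would query a sketch per item):

from src.utils.utils import generate_error_table
import pandas as pd

real = pd.DataFrame({'value': ['A', 'A', 'B', 'C']})
estimated = {'A': 2, 'B': 2, 'C': 1}   # illustrative estimates
for row in generate_error_table(real, estimated):
    print(row)   # {'Item': 'A', 'Percentage Error': '0.00%'}, ...
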
def display_results(real_freq: pd.DataFrame, estimated_freq: dict):
    """
    Builds per-item and summary error tables comparing real and estimated frequencies.

    Args:
        real_freq (DataFrame): Dataset with a 'value' column holding the real items.
        estimated_freq (dict): Estimated count per item.

    Returns:
        data_table (list): Per-item rows [item, real count, real %, estimated count, estimated %, difference, % error].
        error_table (list): Summary rows with aggregate error metrics.
    """
    N = real_freq.shape[0]
    f = real_freq['value'].value_counts()
    real_num_freq = f.sort_index().to_dict()
    real_percent_freq = ((f * 100 / N).sort_index()).to_dict()

    data_table = []
    for element in real_num_freq:
        if element in estimated_freq:
            real_count = real_num_freq[element]
            real_percent = real_percent_freq[element]
            estimated_count = estimated_freq[element]
            estimated_percent = (estimated_count / N) * 100
            diff = abs(real_count - estimated_count)

            if real_count > 0:
                percent_error = abs(real_count - estimated_count) / real_count * 100
            else:
                percent_error = 0.0

            data_table.append([
                element,
                real_count,
                f"{real_percent:.3f}%",
                f"{estimated_count:.2f}",
                f"{estimated_percent:.3f}%",
                f"{diff:.2f}",
                f"{percent_error:.2f}%"
            ])

    # Items estimated but never observed count as real frequency 0.
    errors = [abs(real_num_freq.get(key, 0) - estimated_freq[key]) for key in estimated_freq]
    mean_error = np.mean(errors)
    total_errors = np.sum(errors)
    max_freq = max(real_num_freq.values())
    min_freq = min(real_num_freq.values())
    mse = np.sum([(real_num_freq.get(key, 0) - estimated_freq[key]) ** 2 for key in estimated_freq]) / len(estimated_freq)
    # Guard against a zero range when all items share the same frequency.
    freq_range = max_freq - min_freq
    normalized_mse = mse / freq_range if freq_range else float('nan')

    error_table = [
        ['Total Errors', f"{total_errors:.2f}"],
        ['Mean Error', f"{mean_error:.2f}"],
        ['Percentage Error', f"{(mean_error / N) * 100:.2f}%"],
        ['MSE', f"{mse:.2f}"],
        ['RMSE', f"{np.sqrt(mse):.2f}"],
        ['Normalized MSE', f"{normalized_mse:.4f}"],
        ['Normalized RMSE', f"{np.sqrt(normalized_mse):.2f}"]
    ]

    return data_table, error_table
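
An end-to-end sketch; the 'estimator' below simply reports the exact counts, so every error metric comes out zero (a real run would plug in sketch estimates):

from src.utils.utils import create_dataset, display_results

values, df, unique_values = create_dataset(1000, 'small')

# Stand-in estimator: the exact counts themselves.
estimated = df['value'].value_counts().to_dict()

data_table, error_table = display_results(df, estimated)
for row in error_table:
    print(row)   # ['Total Errors', '0.00'], ['Mean Error', '0.00'], ...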