src.scripts.preprocess

 1import pandas as pd
 2from rich.progress import Progress
 3from colorama import Style
 4import os
 5
 6class DataProcessor:
 7    """
 8    Processes an Excel dataset containing eye-tracking data, filters relevant columns, 
 9    and extracts fixation and AOI (Area of Interest) hit information.
10    
11    Attributes:
12        file_name (str): Name of the dataset file.
13        columns (list): Relevant columns to extract from the dataset.
14        df (pd.DataFrame): Dataframe holding the dataset.
15        excel_file (str): Path to the input Excel file.
16        output_csv (str): Path to save the filtered CSV file.
17    """
18    def __init__(self):
19        """
20        Initializes the DataProcessor with the dataset name and determines file paths.
21        
22        Args:
23            dataset_name (str): Name of the dataset file (without extension).
24        """
25        self.columns = ['Participant', 'Fixation Position X [px]', 'Fixation Position Y [px]', 'AOI Name']
26        self.df = None
27
28        base_path_1 = os.path.join('..', '..', 'data', 'raw')
29        base_path_2 = os.path.join('..', 'data', 'raw')
30
31        if os.path.exists(base_path_1):
32            latest_file = max([f for f in os.listdir(base_path_1) if f.endswith('.xlsx')], key=lambda x: os.path.getmtime(os.path.join(base_path_1, x)))
33            self.excel_file = os.path.join(base_path_1, latest_file)
34            self.file_name = latest_file
35        else:
36            latest_file = max([f for f in os.listdir(base_path_2) if f.endswith('.xlsx')], key=lambda x: os.path.getmtime(os.path.join(base_path_2, x)))
37            self.excel_file = os.path.join(base_path_2, latest_file)
38            self.file_name = latest_file
39        
40        print(f"Processing {Style.BRIGHT}{self.file_name}{Style.RESET_ALL}")
41        self.df = pd.read_excel(self.excel_file)
42
43
44    def aoi_hits(self):
45        """
46        Processes the dataset to determine whether an AOI (Area of Interest) hit has been made.
47        Extracts relevant user IDs and the first AOI hit for each participant.
48        """
49        rows = []
50        with Progress() as progress:
51            task = progress.add_task("[cyan]🔍 Processing AOI Hits...", total=len(self.df))
52            for _, row in self.df.iterrows():
53                user_id = row['Participant']
54                for col in self.df.columns[1:]:
55                    if row[col] != "-":
56                        rows.append({'user_id': user_id, 'value': row[col]})
57                        break
58                progress.update(task, advance=1)
59        self.df = pd.DataFrame(rows)
60
61    def filter_fixation(self):
62        """
63        Removes rows where fixation position data is missing.
64        Drops unnecessary fixation position columns after filtering.
65        """
66        self.df = self.df[self.df['Fixation Position X [px]'] != '-']
67        self.df = self.df.drop(columns=['Fixation Position X [px]', 'Fixation Position Y [px]'])
68
69    def filter_columns(self):
70        """
71        Filters and preprocesses the dataset by keeping only relevant columns,
72        removing missing values, filtering fixation positions, and processing AOI hits.
73        
74        Returns:
75            pd.DataFrame: Processed and filtered DataFrame.
76        """
77        self.df = self.df[self.columns].dropna()
78        self.filter_fixation()
79        self.aoi_hits()
80        return self.df
81
def run_data_processor():
    """
    Runs the data processing pipeline on the automatically selected dataset.

    Takes no arguments: a ``DataProcessor`` is created without parameters and
    its ``filter_columns`` pipeline is executed.

    Returns:
        pd.DataFrame: The filtered dataset returned by ``filter_columns``.
    """
    return DataProcessor().filter_columns()
class DataProcessor:
 8class DataProcessor:
 9    """
10    Processes an Excel dataset containing eye-tracking data, filters relevant columns, 
11    and extracts fixation and AOI (Area of Interest) hit information.
12    
13    Attributes:
14        file_name (str): Name of the dataset file.
15        columns (list): Relevant columns to extract from the dataset.
16        df (pd.DataFrame): Dataframe holding the dataset.
17        excel_file (str): Path to the input Excel file.
18        output_csv (str): Path to save the filtered CSV file.
19    """
20    def __init__(self):
21        """
22        Initializes the DataProcessor with the dataset name and determines file paths.
23        
24        Args:
25            dataset_name (str): Name of the dataset file (without extension).
26        """
27        self.columns = ['Participant', 'Fixation Position X [px]', 'Fixation Position Y [px]', 'AOI Name']
28        self.df = None
29
30        base_path_1 = os.path.join('..', '..', 'data', 'raw')
31        base_path_2 = os.path.join('..', 'data', 'raw')
32
33        if os.path.exists(base_path_1):
34            latest_file = max([f for f in os.listdir(base_path_1) if f.endswith('.xlsx')], key=lambda x: os.path.getmtime(os.path.join(base_path_1, x)))
35            self.excel_file = os.path.join(base_path_1, latest_file)
36            self.file_name = latest_file
37        else:
38            latest_file = max([f for f in os.listdir(base_path_2) if f.endswith('.xlsx')], key=lambda x: os.path.getmtime(os.path.join(base_path_2, x)))
39            self.excel_file = os.path.join(base_path_2, latest_file)
40            self.file_name = latest_file
41        
42        print(f"Processing {Style.BRIGHT}{self.file_name}{Style.RESET_ALL}")
43        self.df = pd.read_excel(self.excel_file)
44
45
46    def aoi_hits(self):
47        """
48        Processes the dataset to determine whether an AOI (Area of Interest) hit has been made.
49        Extracts relevant user IDs and the first AOI hit for each participant.
50        """
51        rows = []
52        with Progress() as progress:
53            task = progress.add_task("[cyan]🔍 Processing AOI Hits...", total=len(self.df))
54            for _, row in self.df.iterrows():
55                user_id = row['Participant']
56                for col in self.df.columns[1:]:
57                    if row[col] != "-":
58                        rows.append({'user_id': user_id, 'value': row[col]})
59                        break
60                progress.update(task, advance=1)
61        self.df = pd.DataFrame(rows)
62
63    def filter_fixation(self):
64        """
65        Removes rows where fixation position data is missing.
66        Drops unnecessary fixation position columns after filtering.
67        """
68        self.df = self.df[self.df['Fixation Position X [px]'] != '-']
69        self.df = self.df.drop(columns=['Fixation Position X [px]', 'Fixation Position Y [px]'])
70
71    def filter_columns(self):
72        """
73        Filters and preprocesses the dataset by keeping only relevant columns,
74        removing missing values, filtering fixation positions, and processing AOI hits.
75        
76        Returns:
77            pd.DataFrame: Processed and filtered DataFrame.
78        """
79        self.df = self.df[self.columns].dropna()
80        self.filter_fixation()
81        self.aoi_hits()
82        return self.df

Processes an Excel dataset containing eye-tracking data, filters relevant columns, and extracts fixation and AOI (Area of Interest) hit information.

Attributes: file_name (str): Name of the dataset file. columns (list): Relevant columns to extract from the dataset. df (pd.DataFrame): Dataframe holding the dataset. excel_file (str): Path to the input Excel file. output_csv (str): Path to save the filtered CSV file (documented, but never assigned by the constructor shown on this page).

DataProcessor()
20    def __init__(self):
21        """
22        Initializes the DataProcessor with the dataset name and determines file paths.
23        
24        Args:
25            dataset_name (str): Name of the dataset file (without extension).
26        """
27        self.columns = ['Participant', 'Fixation Position X [px]', 'Fixation Position Y [px]', 'AOI Name']
28        self.df = None
29
30        base_path_1 = os.path.join('..', '..', 'data', 'raw')
31        base_path_2 = os.path.join('..', 'data', 'raw')
32
33        if os.path.exists(base_path_1):
34            latest_file = max([f for f in os.listdir(base_path_1) if f.endswith('.xlsx')], key=lambda x: os.path.getmtime(os.path.join(base_path_1, x)))
35            self.excel_file = os.path.join(base_path_1, latest_file)
36            self.file_name = latest_file
37        else:
38            latest_file = max([f for f in os.listdir(base_path_2) if f.endswith('.xlsx')], key=lambda x: os.path.getmtime(os.path.join(base_path_2, x)))
39            self.excel_file = os.path.join(base_path_2, latest_file)
40            self.file_name = latest_file
41        
42        print(f"Processing {Style.BRIGHT}{self.file_name}{Style.RESET_ALL}")
43        self.df = pd.read_excel(self.excel_file)

Initializes the DataProcessor with the dataset name and determines file paths.

Args: none. The constructor takes no parameters; the most recently modified .xlsx file in the raw-data directory is selected automatically.

columns
df
def aoi_hits(self):
46    def aoi_hits(self):
47        """
48        Processes the dataset to determine whether an AOI (Area of Interest) hit has been made.
49        Extracts relevant user IDs and the first AOI hit for each participant.
50        """
51        rows = []
52        with Progress() as progress:
53            task = progress.add_task("[cyan]🔍 Processing AOI Hits...", total=len(self.df))
54            for _, row in self.df.iterrows():
55                user_id = row['Participant']
56                for col in self.df.columns[1:]:
57                    if row[col] != "-":
58                        rows.append({'user_id': user_id, 'value': row[col]})
59                        break
60                progress.update(task, advance=1)
61        self.df = pd.DataFrame(rows)

Processes the dataset to determine whether an AOI (Area of Interest) hit has been made. For every row, records the participant ID together with the first value after the first column that is not the "-" placeholder; rows without such a value are skipped.

def filter_fixation(self):
63    def filter_fixation(self):
64        """
65        Removes rows where fixation position data is missing.
66        Drops unnecessary fixation position columns after filtering.
67        """
68        self.df = self.df[self.df['Fixation Position X [px]'] != '-']
69        self.df = self.df.drop(columns=['Fixation Position X [px]', 'Fixation Position Y [px]'])

Removes rows where fixation position data is missing. Drops unnecessary fixation position columns after filtering.

def filter_columns(self):
71    def filter_columns(self):
72        """
73        Filters and preprocesses the dataset by keeping only relevant columns,
74        removing missing values, filtering fixation positions, and processing AOI hits.
75        
76        Returns:
77            pd.DataFrame: Processed and filtered DataFrame.
78        """
79        self.df = self.df[self.columns].dropna()
80        self.filter_fixation()
81        self.aoi_hits()
82        return self.df

Filters and preprocesses the dataset by keeping only relevant columns, removing missing values, filtering fixation positions, and processing AOI hits.

Returns: pd.DataFrame: Processed and filtered DataFrame.

def run_data_processor():
def run_data_processor():
    """
    Runs the data processing pipeline on the automatically selected dataset.

    Takes no arguments: a ``DataProcessor`` is created without parameters and
    its ``filter_columns`` pipeline is executed.

    Returns:
        pd.DataFrame: The filtered dataset returned by ``filter_columns``.
    """
    return DataProcessor().filter_columns()

Runs the data processing pipeline for a given dataset.

Args: none. The function takes no parameters; the dataset is selected by DataProcessor itself.

Returns: pd.DataFrame: The filtered dataset.