src.scripts.preprocess
import os

import pandas as pd
from colorama import Style
from rich.progress import Progress


class DataProcessor:
    """
    Processes an Excel dataset containing eye-tracking data, filters relevant columns,
    and extracts fixation and AOI (Area of Interest) hit information.

    Attributes:
        file_name (str): Name of the dataset file that was loaded.
        columns (list): Relevant columns to extract from the dataset.
        df (pd.DataFrame): Dataframe holding the dataset.
        excel_file (str): Path to the input Excel file.
    """

    def __init__(self):
        """
        Locates the most recently modified ``.xlsx`` file in the raw-data
        directory and loads it into ``self.df``.

        The directory is ``../../data/raw`` when that path exists, otherwise
        ``../data/raw``. Both are resolved against the current working directory.

        Raises:
            FileNotFoundError: If the chosen directory does not exist or
                contains no usable ``.xlsx`` file.
        """
        self.columns = ['Participant', 'Fixation Position X [px]', 'Fixation Position Y [px]', 'AOI Name']
        self.df = None

        base_path_1 = os.path.join('..', '..', 'data', 'raw')
        base_path_2 = os.path.join('..', 'data', 'raw')
        raw_dir = base_path_1 if os.path.exists(base_path_1) else base_path_2

        # Skip Excel's "~$name.xlsx" owner/lock files: they appear while a
        # workbook is open, are always the newest entry, and are not readable.
        workbooks = [
            name for name in os.listdir(raw_dir)
            if name.endswith('.xlsx') and not name.startswith('~$')
        ]
        if not workbooks:
            # max() on an empty list would raise an unhelpful ValueError.
            raise FileNotFoundError(f"No .xlsx dataset found in {os.path.abspath(raw_dir)!r}")

        latest_file = max(workbooks, key=lambda name: os.path.getmtime(os.path.join(raw_dir, name)))
        self.excel_file = os.path.join(raw_dir, latest_file)
        self.file_name = latest_file

        print(f"Processing {Style.BRIGHT}{self.file_name}{Style.RESET_ALL}")
        self.df = pd.read_excel(self.excel_file)

    def aoi_hits(self):
        """
        Reduces ``self.df`` to one AOI hit per input row.

        For every row, the first column other than ``Participant`` whose value
        is not the placeholder ``"-"`` is recorded. Rows where every such
        column is ``"-"`` are dropped. ``self.df`` is replaced by a frame with
        the columns ``user_id`` and ``value``.
        """
        # Select by name rather than position so the participant ID can never
        # be mistaken for a hit value if the column order changes.
        value_columns = [col for col in self.df.columns if col != 'Participant']
        rows = []
        with Progress() as progress:
            task = progress.add_task("[cyan]🔍 Processing AOI Hits...", total=len(self.df))
            for _, row in self.df.iterrows():
                user_id = row['Participant']
                for col in value_columns:
                    if row[col] != "-":
                        rows.append({'user_id': user_id, 'value': row[col]})
                        break
                progress.update(task, advance=1)
        # Explicit columns keep the schema stable even when no row matched.
        self.df = pd.DataFrame(rows, columns=['user_id', 'value'])

    def filter_fixation(self):
        """
        Removes rows where fixation position data is missing.

        A row is kept only when neither fixation coordinate is the placeholder
        ``"-"``. Both fixation position columns are dropped afterwards.
        """
        x_col = 'Fixation Position X [px]'
        y_col = 'Fixation Position Y [px]'
        has_fixation = (self.df[x_col] != '-') & (self.df[y_col] != '-')
        self.df = self.df[has_fixation].drop(columns=[x_col, y_col])

    def filter_columns(self):
        """
        Runs the full preprocessing chain on ``self.df``.

        Keeps only ``self.columns``, drops rows containing missing values,
        filters fixation positions and finally extracts the AOI hits.

        Returns:
            pd.DataFrame: Processed and filtered DataFrame.
        """
        self.df = self.df[self.columns].dropna()
        self.filter_fixation()
        self.aoi_hits()
        return self.df


def run_data_processor():
    """
    Runs the data processing pipeline on the automatically selected dataset.

    Takes no arguments: ``DataProcessor`` picks the most recently modified
    ``.xlsx`` file in the raw-data directory on its own.

    Returns:
        pd.DataFrame: The filtered dataset.
    """
    processor = DataProcessor()
    return processor.filter_columns()
class DataProcessor:
    """
    Processes an Excel dataset containing eye-tracking data, filters relevant columns,
    and extracts fixation and AOI (Area of Interest) hit information.

    Attributes:
        file_name (str): Name of the dataset file that was loaded.
        columns (list): Relevant columns to extract from the dataset.
        df (pd.DataFrame): Dataframe holding the dataset.
        excel_file (str): Path to the input Excel file.
    """

    def __init__(self):
        """
        Locates the most recently modified ``.xlsx`` file in the raw-data
        directory and loads it into ``self.df``.

        The directory is ``../../data/raw`` when that path exists, otherwise
        ``../data/raw``. Both are resolved against the current working directory.

        Raises:
            FileNotFoundError: If the chosen directory does not exist or
                contains no usable ``.xlsx`` file.
        """
        self.columns = ['Participant', 'Fixation Position X [px]', 'Fixation Position Y [px]', 'AOI Name']
        self.df = None

        base_path_1 = os.path.join('..', '..', 'data', 'raw')
        base_path_2 = os.path.join('..', 'data', 'raw')
        raw_dir = base_path_1 if os.path.exists(base_path_1) else base_path_2

        # Skip Excel's "~$name.xlsx" owner/lock files: they appear while a
        # workbook is open, are always the newest entry, and are not readable.
        workbooks = [
            name for name in os.listdir(raw_dir)
            if name.endswith('.xlsx') and not name.startswith('~$')
        ]
        if not workbooks:
            # max() on an empty list would raise an unhelpful ValueError.
            raise FileNotFoundError(f"No .xlsx dataset found in {os.path.abspath(raw_dir)!r}")

        latest_file = max(workbooks, key=lambda name: os.path.getmtime(os.path.join(raw_dir, name)))
        self.excel_file = os.path.join(raw_dir, latest_file)
        self.file_name = latest_file

        print(f"Processing {Style.BRIGHT}{self.file_name}{Style.RESET_ALL}")
        self.df = pd.read_excel(self.excel_file)

    def aoi_hits(self):
        """
        Reduces ``self.df`` to one AOI hit per input row.

        For every row, the first column other than ``Participant`` whose value
        is not the placeholder ``"-"`` is recorded. Rows where every such
        column is ``"-"`` are dropped. ``self.df`` is replaced by a frame with
        the columns ``user_id`` and ``value``.
        """
        # Select by name rather than position so the participant ID can never
        # be mistaken for a hit value if the column order changes.
        value_columns = [col for col in self.df.columns if col != 'Participant']
        rows = []
        with Progress() as progress:
            task = progress.add_task("[cyan]🔍 Processing AOI Hits...", total=len(self.df))
            for _, row in self.df.iterrows():
                user_id = row['Participant']
                for col in value_columns:
                    if row[col] != "-":
                        rows.append({'user_id': user_id, 'value': row[col]})
                        break
                progress.update(task, advance=1)
        # Explicit columns keep the schema stable even when no row matched.
        self.df = pd.DataFrame(rows, columns=['user_id', 'value'])

    def filter_fixation(self):
        """
        Removes rows where fixation position data is missing.

        A row is kept only when neither fixation coordinate is the placeholder
        ``"-"``. Both fixation position columns are dropped afterwards.
        """
        x_col = 'Fixation Position X [px]'
        y_col = 'Fixation Position Y [px]'
        has_fixation = (self.df[x_col] != '-') & (self.df[y_col] != '-')
        self.df = self.df[has_fixation].drop(columns=[x_col, y_col])

    def filter_columns(self):
        """
        Runs the full preprocessing chain on ``self.df``.

        Keeps only ``self.columns``, drops rows containing missing values,
        filters fixation positions and finally extracts the AOI hits.

        Returns:
            pd.DataFrame: Processed and filtered DataFrame.
        """
        self.df = self.df[self.columns].dropna()
        self.filter_fixation()
        self.aoi_hits()
        return self.df
Processes an Excel dataset containing eye-tracking data, filters relevant columns, and extracts fixation and AOI (Area of Interest) hit information.
Attributes: file_name (str): Name of the dataset file. columns (list): Relevant columns to extract from the dataset. df (pd.DataFrame): Dataframe holding the dataset. excel_file (str): Path to the input Excel file. output_csv (str): Path to save the filtered CSV file (documented, but never set by the current implementation).
def __init__(self):
    """
    Locates the most recently modified ``.xlsx`` file in the raw-data
    directory and loads it into ``self.df``.

    The directory is ``../../data/raw`` when that path exists, otherwise
    ``../data/raw``. Both are resolved against the current working directory.

    Raises:
        FileNotFoundError: If the chosen directory does not exist or
            contains no usable ``.xlsx`` file.
    """
    self.columns = ['Participant', 'Fixation Position X [px]', 'Fixation Position Y [px]', 'AOI Name']
    self.df = None

    base_path_1 = os.path.join('..', '..', 'data', 'raw')
    base_path_2 = os.path.join('..', 'data', 'raw')
    raw_dir = base_path_1 if os.path.exists(base_path_1) else base_path_2

    # Skip Excel's "~$name.xlsx" owner/lock files: they appear while a
    # workbook is open, are always the newest entry, and are not readable.
    workbooks = [
        name for name in os.listdir(raw_dir)
        if name.endswith('.xlsx') and not name.startswith('~$')
    ]
    if not workbooks:
        # max() on an empty list would raise an unhelpful ValueError.
        raise FileNotFoundError(f"No .xlsx dataset found in {os.path.abspath(raw_dir)!r}")

    latest_file = max(workbooks, key=lambda name: os.path.getmtime(os.path.join(raw_dir, name)))
    self.excel_file = os.path.join(raw_dir, latest_file)
    self.file_name = latest_file

    print(f"Processing {Style.BRIGHT}{self.file_name}{Style.RESET_ALL}")
    self.df = pd.read_excel(self.excel_file)
Initializes the DataProcessor by locating the most recently modified .xlsx file in the raw-data directory and loading it into a DataFrame.
Args: None. The constructor takes no arguments; the dataset file is discovered automatically rather than being passed in as dataset_name.
def aoi_hits(self):
    """
    Reduces ``self.df`` to one AOI hit per input row.

    For every row, the first column other than ``Participant`` whose value
    is not the placeholder ``"-"`` is recorded. Rows where every such
    column is ``"-"`` are dropped. ``self.df`` is replaced by a frame with
    the columns ``user_id`` and ``value``.
    """
    # Select by name rather than position so the participant ID can never
    # be mistaken for a hit value if the column order changes.
    value_columns = [col for col in self.df.columns if col != 'Participant']
    rows = []
    with Progress() as progress:
        task = progress.add_task("[cyan]🔍 Processing AOI Hits...", total=len(self.df))
        for _, row in self.df.iterrows():
            user_id = row['Participant']
            for col in value_columns:
                if row[col] != "-":
                    rows.append({'user_id': user_id, 'value': row[col]})
                    break
            progress.update(task, advance=1)
    # Explicit columns keep the schema stable even when no row matched.
    self.df = pd.DataFrame(rows, columns=['user_id', 'value'])
Processes the dataset to determine whether an AOI (Area of Interest) hit has been made. For every row, records the participant ID together with the first value after the first column that is not the placeholder "-"; rows with no such value are skipped.
def filter_fixation(self):
    """
    Removes rows where fixation position data is missing.

    A row is kept only when neither fixation coordinate is the placeholder
    ``"-"``. Both fixation position columns are dropped afterwards.
    """
    x_col = 'Fixation Position X [px]'
    y_col = 'Fixation Position Y [px]'
    # Check both axes: a fixation with only one coordinate is still incomplete.
    has_fixation = (self.df[x_col] != '-') & (self.df[y_col] != '-')
    self.df = self.df[has_fixation].drop(columns=[x_col, y_col])
Removes rows where fixation position data is missing. Drops unnecessary fixation position columns after filtering.
def filter_columns(self):
    """
    Runs the full preprocessing chain on ``self.df``.

    Steps, in order: keep only the columns listed in ``self.columns``,
    discard rows containing missing values, apply ``filter_fixation`` and
    then ``aoi_hits``.

    Returns:
        pd.DataFrame: Processed and filtered DataFrame.
    """
    selected = self.df[self.columns]
    self.df = selected.dropna()

    # Order matters: aoi_hits expects the fixation columns to be gone already.
    for step in (self.filter_fixation, self.aoi_hits):
        step()

    return self.df
Filters and preprocesses the dataset by keeping only relevant columns, removing missing values, filtering fixation positions, and processing AOI hits.
Returns: pd.DataFrame: Processed and filtered DataFrame.
def run_data_processor():
    """
    Runs the data processing pipeline and returns its result.

    Takes no arguments: a ``DataProcessor`` is created with its defaults and
    its ``filter_columns`` method is invoked.

    Returns:
        pd.DataFrame: The filtered dataset.
    """
    return DataProcessor().filter_columns()
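Runs the data processing pipeline on the dataset that DataProcessor selects automatically (the most recently modified .xlsx file in the raw-data directory).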
Args: None. The function takes no arguments; no dataset_name is accepted.
Returns: pd.DataFrame: The filtered dataset.