import logging
import os
import random
import sys
import zipfile

import pandas as pd
import requests
import torch
from sklearn.preprocessing import StandardScaler

sys.path.append("../")

from .basedataset import BaseDataset
class BrowardDataset(BaseDataset):
    """COMPAS (Broward County) dataset with human judgments for 1000 defendants."""
    def __init__(
        self, data_dir, test_split=0.2, val_split=0.1, batch_size=1000, transforms=None
    ):
        """
        Data: https://farid.berkeley.edu/downloads/publications/scienceadvances17/allData.zip
        Paper: https://www.science.org/doi/10.1126/sciadv.aao5580

        data_dir: directory where the downloaded data files are stored
        test_split: fraction of the data used for the test set
        val_split: fraction of the data used for validation (taken from the training set)
        batch_size: batch size for training
        transforms: data transforms
        """
self.data_dir = data_dir
self.test_split = test_split
self.val_split = val_split
self.batch_size = batch_size
self.n_dataset = 2
self.train_split = 1 - test_split - val_split
self.transforms = transforms
self.generate_data()
    def generate_data(self):
        """
        Download the data if necessary, then generate the training, validation and test sets.
        """
        # download and unpack the data on first use
        if not os.path.exists(self.data_dir + "/allDataBroward"):
            logging.info("Downloading Broward data")
            r = requests.get(
                "https://farid.berkeley.edu/downloads/publications/scienceadvances17/allData.zip",
                allow_redirects=True,
            )
            with open(self.data_dir + "/allData.zip", "wb") as f:
                f.write(r.content)
            # create the data directory and unzip into it
            os.makedirs(self.data_dir + "/allDataBroward", exist_ok=True)
            with zipfile.ZipFile(self.data_dir + "/allData.zip", "r") as zip_ref:
                zip_ref.extractall(self.data_dir + "/allDataBroward")
            os.remove(self.data_dir + "/allData.zip")
            logging.info("Finished downloading Broward data")
        else:
            logging.info("Loading Broward data")
        try:
            broward_data = pd.read_csv(
                self.data_dir + "/allDataBroward/BROWARD_CLEAN_SUBSET.csv"
            )
            mturk_data = pd.read_csv(
                self.data_dir + "/allDataBroward/MTURK_RACE.csv"
            )
        except Exception:
            logging.error("Failed to load Broward data")
            raise
broward_data = broward_data.drop(["block_num", "id"], axis=1)
train_y = broward_data.two_year_recid.to_numpy()
broward_data = broward_data.drop(["two_year_recid"], axis=1)
train_x = broward_data.to_numpy()
        # standardize features to zero mean and unit variance
        scaler = StandardScaler()
        train_x = scaler.fit_transform(train_x)
        # reconstruct one human prediction per defendant from the MTurk
        # responses: each row stores per-worker correctness indicators
        # (1 = the worker's prediction matched the true label), so the
        # majority vote says whether the crowd agreed with the label
        human_predictions = []
        mturk_data = mturk_data.drop(["mTurk_code"], axis=1)
        for i in range(1, len(mturk_data)):
            row = mturk_data.iloc[i]
            # keep only the workers who responded for this defendant
            row = row[row.notna()]
            # a single random worker response; can be used in place of the
            # majority vote below
            random_sample = row.sample(n=1).values[0]
            most_common = row.value_counts().idxmax()
            if most_common == 1:
                human_predictions.append(train_y[i - 1])
            else:
                human_predictions.append(1 - train_y[i - 1])
human_predictions = torch.tensor(human_predictions)
train_y = torch.tensor(train_y)
train_x = torch.from_numpy(train_x).float()
self.total_samples = len(train_x)
self.d = len(train_x[0])
random_seed = random.randrange(10000)
train_size = int(self.train_split * self.total_samples)
val_size = int(self.val_split * self.total_samples)
test_size = self.total_samples - train_size - val_size
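        # the three random_split calls below share the same seed, so the
        # feature, label and human-prediction splits follow identical index
        # permutations and stay aligned row-for-row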
self.train_x, self.val_x, self.test_x = torch.utils.data.random_split(
train_x,
[train_size, val_size, test_size],
generator=torch.Generator().manual_seed(random_seed),
)
self.train_y, self.val_y, self.test_y = torch.utils.data.random_split(
train_y,
[train_size, val_size, test_size],
generator=torch.Generator().manual_seed(random_seed),
)
self.train_h, self.val_h, self.test_h = torch.utils.data.random_split(
human_predictions,
[train_size, val_size, test_size],
generator=torch.Generator().manual_seed(random_seed),
)
logging.info("train size: ", len(self.train_x))
logging.info("val size: ", len(self.val_x))
logging.info("test size: ", len(self.test_x))
self.data_train = torch.utils.data.TensorDataset(
self.train_x.dataset.data[self.train_x.indices],
self.train_y.dataset.data[self.train_y.indices],
self.train_h.dataset.data[self.train_h.indices],
)
self.data_val = torch.utils.data.TensorDataset(
self.val_x.dataset.data[self.val_x.indices],
self.val_y.dataset.data[self.val_y.indices],
self.val_h.dataset.data[self.val_h.indices],
)
self.data_test = torch.utils.data.TensorDataset(
self.test_x.dataset.data[self.test_x.indices],
self.test_y.dataset.data[self.test_y.indices],
self.test_h.dataset.data[self.test_h.indices],
)
self.data_train_loader = torch.utils.data.DataLoader(
self.data_train, batch_size=self.batch_size, shuffle=True
)
self.data_val_loader = torch.utils.data.DataLoader(
self.data_val, batch_size=self.batch_size, shuffle=True
)
self.data_test_loader = torch.utils.data.DataLoader(
self.data_test, batch_size=self.batch_size, shuffle=True
)
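

# A minimal usage sketch (hypothetical import path; assumes this module is
# imported as part of its package so the relative BaseDataset import
# resolves), shown as comments rather than live code:
#
#     from datasetsdefer.broward import BrowardDataset  # hypothetical path
#     dataset = BrowardDataset(data_dir="./data", batch_size=128)
#     x, y, h = next(iter(dataset.data_train_loader))
#     # x: standardized features, y: two-year recidivism labels,
#     # h: reconstructed human (MTurk) predictions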