---
title: Base dataset
keywords: fastai
sidebar: home_sidebar
summary: "Base class for dataset module."
description: "Base class for dataset module."
nb_path: "nbs/datasets/datasets.base.ipynb"
---
class YoochooseDataset(SessionDataset):
    """YooChoose (RecSys Challenge 2015) click-session dataset.

    Downloads the click log from a Google Drive mirror, filters out short
    sessions and rarely-clicked items, and writes train/valid splits as CSV.
    """

    # Google Drive file id of the zipped rsc15 click log.
    data_id = '1UEcKC4EfgMVD2n_zBvAyp0vRNyv7ndSF'

    def __init__(self,
                 root,
                 min_session_length: int = 2,
                 min_item_support: int = 5,
                 eval_sec: int = 86400,
                 ):
        # eval_sec: held-out window size in seconds (86400 = 1 day); the
        # actual splitting happens in the parent class -- TODO confirm.
        super().__init__(root, min_session_length, min_item_support, eval_sec)

    @property
    def raw_file_names(self) -> str:
        # Single raw file expected under raw_dir after download().
        return 'rsc15-clicks.dat'

    @property
    def processed_file_names(self) -> str:
        # NOTE(review): returns a list although annotated ``str``.
        return ['yoochoose_train.txt','yoochoose_valid.txt']

    def download(self):
        """Download and unpack the click log into ``raw_dir``."""
        from google_drive_downloader import GoogleDriveDownloader as gdd
        from shutil import move, rmtree
        path = osp.join(self.raw_dir, 'rsc15.zip')
        gdd.download_file_from_google_drive(self.data_id, path)
        extract_zip(path, self.raw_dir)
        # NOTE(review): moves the extracted 'rsc15/raw' entry to the expected
        # raw filename; verify this matches the archive's actual layout.
        move(osp.join(self.raw_dir, 'rsc15', 'raw', ),
             osp.join(self.raw_dir, self.raw_file_names))
        rmtree(osp.join(self.raw_dir, 'rsc15'))
        os.unlink(path)  # drop the zip once extracted

    def process(self):
        """Filter the raw log, split it, and persist both splits as CSV."""
        df = self.load_ratings_df()
        if self.min_session_length is not None:
            df = self.remove_short_sessions(df)
        if self.min_item_support is not None:
            df = self.remove_sparse_items(df)
        train, test = self.split_df(df)
        train.to_csv(self.processed_paths[0], sep=',', index=False)
        test.to_csv(self.processed_paths[1], sep=',', index=False)

    def load_ratings_df(self):
        # Columns 0..2 of the click log are read as int32 / str / int64 and
        # named uid/timestamp/sid; 'uid' presumably holds the session id --
        # TODO confirm against the rsc15 file format.
        df = pd.read_csv(self.raw_paths[0], header=None, usecols=[0, 1, 2],
                         dtype={0: np.int32, 1: str, 2: np.int64})
        df.columns = ['uid', 'timestamp', 'sid']
        # Convert ISO-8601 strings to POSIX seconds; naive strptime makes the
        # result depend on the local timezone -- NOTE(review): confirm intent.
        df['timestamp'] = df['timestamp'].apply(lambda x: datetime.datetime.strptime(
            x, '%Y-%m-%dT%H:%M:%S.%fZ').timestamp())
        return df
# Build the dataset; download()/process() are presumably triggered by the
# parent Dataset machinery on first instantiation -- TODO confirm.
ds = YoochooseDataset(root='/content/yoochoose')
# IPython shell escape: print the resulting directory tree with sizes.
!tree --du -h -C /content/yoochoose
# Smoke test for GraphData batching on a toy corpus:
# five input sessions paired with one target item each.
train_data = ([[1, 2, 3], [2, 3, 4], [1, 2, 4], [2, 3], [1]],
              [4, 5, 5, 4, 2])
tds = GraphData(train_data, shuffle=False)
print(tds.generate_batch(1))
print(tds.generate_batch(2))
print(tds.inputs)
# Same corpus with shuffling enabled; batch contents become order-dependent.
tds = GraphData(train_data, shuffle=True)
print(tds.generate_batch(1))
print(tds.generate_batch(2))
print(tds.inputs)
class DigineticaDataset(SessionGraphDataset):
    """Diginetica session-graph dataset.

    Downloads pre-split session files from the RecoHut-Datasets mirror and
    defers all processing to ``SessionGraphDataset``.
    """

    train_url = "https://github.com/RecoHut-Datasets/diginetica/raw/v2/train.txt"
    test_url = "https://github.com/RecoHut-Datasets/diginetica/raw/v2/test.txt"
    all_train_seq_url = "https://github.com/RecoHut-Datasets/diginetica/raw/v2/all_train_seq.txt"

    def __init__(self, root, shuffle=False, n_node=43097, is_train=True):
        # n_node: number of distinct items (graph nodes); 43097 is presumably
        # the Diginetica item-vocabulary size -- TODO confirm upstream.
        self.n_node = n_node
        self.shuffle = shuffle
        self.is_train = is_train
        super().__init__(root, shuffle, n_node)

    @property
    def raw_file_names(self) -> list:
        # Fixed: the annotation claimed ``str`` but a list is returned.
        split_file = 'train.txt' if self.is_train else 'test.txt'
        return [split_file, 'all_train_seq.txt']

    def download(self):
        """Fetch the full-train sequence file plus the requested split."""
        download_url(self.all_train_seq_url, self.raw_dir)
        split_url = self.train_url if self.is_train else self.test_url
        download_url(split_url, self.raw_dir)
# Materialize both splits under a shared root; only the train split shuffles.
root = '/content/diginetica'
train_data = DigineticaDataset(root=root, shuffle=True, is_train=True)
test_data = DigineticaDataset(root=root, shuffle=False, is_train=False)
class TmallDataset(SessionGraphDataset):
    """Tmall session-graph dataset.

    Downloads pre-split session files from the RecoHut-Datasets mirror and
    defers all processing to ``SessionGraphDataset``.
    """

    train_url = "https://github.com/RecoHut-Datasets/tmall/raw/v1/train.txt"
    test_url = "https://github.com/RecoHut-Datasets/tmall/raw/v1/test.txt"
    all_train_seq_url = "https://github.com/RecoHut-Datasets/tmall/raw/v1/all_train_seq.txt"

    def __init__(self, root, shuffle=False, n_node=40727, is_train=True):
        # n_node: number of distinct items (graph nodes) in the Tmall split.
        self.n_node = n_node
        self.shuffle = shuffle
        self.is_train = is_train
        super().__init__(root, shuffle, n_node)

    @property
    def raw_file_names(self) -> list:
        # Fixed: the annotation claimed ``str`` but a list is returned.
        split_file = 'train.txt' if self.is_train else 'test.txt'
        return [split_file, 'all_train_seq.txt']

    def download(self):
        """Fetch the full-train sequence file plus the requested split."""
        download_url(self.all_train_seq_url, self.raw_dir)
        split_url = self.train_url if self.is_train else self.test_url
        download_url(split_url, self.raw_dir)
# Materialize both splits under a shared root; only the train split shuffles.
root = '/content/tmall'
train_data = TmallDataset(root=root, shuffle=True, is_train=True)
test_data = TmallDataset(root=root, shuffle=False, is_train=False)
class RetailRocketDataset(SessionGraphDataset):
    """Retail Rocket session-graph dataset.

    Downloads pre-split session files from the RecoHut-Datasets mirror and
    defers all processing to ``SessionGraphDataset``.
    """

    train_url = "https://github.com/RecoHut-Datasets/retail_rocket/raw/v1/train.txt"
    test_url = "https://github.com/RecoHut-Datasets/retail_rocket/raw/v1/test.txt"
    all_train_seq_url = "https://github.com/RecoHut-Datasets/retail_rocket/raw/v1/all_train_seq.txt"

    def __init__(self, root, shuffle=False, n_node=40727, is_train=True):
        # NOTE(review): default n_node is identical to TmallDataset's; verify
        # 40727 really is the Retail Rocket item count (possible copy-paste).
        self.n_node = n_node
        self.shuffle = shuffle
        self.is_train = is_train
        super().__init__(root, shuffle, n_node)

    @property
    def raw_file_names(self) -> list:
        # Fixed: the annotation claimed ``str`` but a list is returned.
        split_file = 'train.txt' if self.is_train else 'test.txt'
        return [split_file, 'all_train_seq.txt']

    def download(self):
        """Fetch the full-train sequence file plus the requested split."""
        download_url(self.all_train_seq_url, self.raw_dir)
        split_url = self.train_url if self.is_train else self.test_url
        download_url(split_url, self.raw_dir)
# Materialize both splits under a shared root; only the train split shuffles.
root = '/content/retail_rocket'
train_data = RetailRocketDataset(root=root, shuffle=True, is_train=True)
test_data = RetailRocketDataset(root=root, shuffle=False, is_train=False)
class SampleDataset(SessionGraphDataset):
    """Small sample session-graph dataset for quick experiments.

    Downloads pre-split session files from the RecoHut-Datasets mirror and
    defers all processing to ``SessionGraphDataset``.
    """

    train_url = "https://github.com/RecoHut-Datasets/sample_session/raw/v2/train.txt"
    test_url = "https://github.com/RecoHut-Datasets/sample_session/raw/v2/test.txt"
    all_train_seq_url = "https://github.com/RecoHut-Datasets/sample_session/raw/v2/all_train_seq.txt"

    def __init__(self, root, shuffle=False, n_node=309, is_train=True):
        # n_node: number of distinct items (graph nodes) in the sample data.
        self.n_node = n_node
        self.shuffle = shuffle
        self.is_train = is_train
        super().__init__(root, shuffle, n_node)

    @property
    def raw_file_names(self) -> list:
        # Fixed: the annotation claimed ``str`` but a list is returned.
        split_file = 'train.txt' if self.is_train else 'test.txt'
        return [split_file, 'all_train_seq.txt']

    def download(self):
        """Fetch the full-train sequence file plus the requested split."""
        download_url(self.all_train_seq_url, self.raw_dir)
        split_url = self.train_url if self.is_train else self.test_url
        download_url(split_url, self.raw_dir)
# Materialize both splits under a shared root; only the train split shuffles.
root = '/content/sample'
train_data = SampleDataset(root=root, shuffle=True, is_train=True)
test_data = SampleDataset(root=root, shuffle=False, is_train=False)
class AmazonGamesDataset(RatingDataset):
    """Amazon Video Games ratings, served through ``RatingDataset``."""

    url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Video_Games.csv"

    @property
    def raw_file_names(self):
        # Name of the raw CSV expected under raw_dir.
        return 'ratings_Video_Games.csv'

    def download(self):
        # Fetch the raw ratings CSV into raw_dir.
        download_url(self.url, self.raw_dir)

    def load_ratings_df(self):
        # Headerless CSV; assign the canonical column names at read time.
        return pd.read_csv(self.raw_paths[0], header=None,
                           names=['uid', 'sid', 'rating', 'timestamp'])
# Build the Amazon Games ratings dataset; min_uc/min_sc presumably filter
# users/items by minimum interaction count -- TODO confirm in RatingDataset.
ds = AmazonGamesDataset(root='/content/amazon_games', min_uc=10, min_sc=5)
class AmazonBeautyDataset(RatingDataset):
    """Amazon Beauty ratings, served through ``RatingDataset``."""

    url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Beauty.csv"

    @property
    def raw_file_names(self):
        # Name of the raw CSV expected under raw_dir.
        return 'ratings_Beauty.csv'

    def download(self):
        # Fetch the raw ratings CSV into raw_dir.
        download_url(self.url, self.raw_dir)

    def load_ratings_df(self):
        # Headerless CSV; assign the canonical column names at read time.
        return pd.read_csv(self.raw_paths[0], header=None,
                           names=['uid', 'sid', 'rating', 'timestamp'])
# Fixed copy-paste bug: this cell instantiated AmazonGamesDataset while
# pointing at the amazon_beauty root; use AmazonBeautyDataset instead.
ds = AmazonBeautyDataset(root='/content/amazon_beauty', min_uc=10, min_sc=5)
class ML1mDataset(RatingDataset):
    """MovieLens-1M ratings, served through ``RatingDataset``."""

    url = "http://files.grouplens.org/datasets/movielens/ml-1m.zip"

    @property
    def raw_file_names(self):
        # Name of the ratings file expected under raw_dir after download().
        return 'ratings.dat'

    def download(self):
        """Download the ml-1m archive and keep only ratings.dat."""
        from shutil import move, rmtree
        archive = download_url(self.url, self.raw_dir)
        extract_zip(archive, self.raw_dir)
        # Hoist ratings.dat out of the extracted folder, then clean up.
        move(osp.join(self.raw_dir, 'ml-1m', self.raw_file_names), self.raw_dir)
        rmtree(osp.join(self.raw_dir, 'ml-1m'))
        os.unlink(archive)

    def load_ratings_df(self):
        # '::'-separated file; the multi-char separator requires the python
        # engine.
        ratings = pd.read_csv(self.raw_paths[0], sep='::', header=None, engine='python')
        ratings.columns = ['uid', 'sid', 'rating', 'timestamp']
        return ratings
# MovieLens-1M with the same support thresholds as the Amazon datasets.
ds = ML1mDataset(root='/content/ML1m', min_uc=10, min_sc=5)
class SteamGamesDataset(RatingDataset):
    """Steam game reviews used as implicit-feedback interactions.

    Each line of the raw dump is a Python-literal dict; only username,
    product_id and date are kept (there is no explicit rating column).
    """

    url = "http://cseweb.ucsd.edu/~wckang/steam_reviews.json.gz"

    @property
    def raw_file_names(self):
        # Name of the decompressed dump expected under raw_dir.
        return 'steam_reviews.json'

    def download(self):
        """Download and decompress the gzipped review dump."""
        path = download_url(self.url, self.raw_dir)
        extract_gz(path, self.raw_dir)
        os.unlink(path)  # drop the archive once extracted

    def load_ratings_df(self):
        """Parse the line-delimited review dump into a DataFrame.

        Fixed: the file handle was opened but never closed; a ``with``
        block now guarantees release. Lines are Python dict literals
        (single-quoted), hence ``ast.literal_eval`` rather than ``json``.
        """
        import ast
        rows = []
        with open(self.raw_paths[0], 'r', encoding='utf-8') as f:
            for line in f:  # stream instead of readlines() to cap memory
                rec = ast.literal_eval(line)
                rows.append([rec['username'], rec['product_id'], rec['date']])
        return pd.DataFrame(rows, columns=['uid', 'sid', 'timestamp'])
# Steam reviews with the same support thresholds as the other datasets.
ds = SteamGamesDataset(root='/content/steam', min_uc=10, min_sc=5)
class YoochooseDataset(RatingDataset):
    """YooChoose click log exposed through the ``RatingDataset`` interface."""

    url = "https://s3-eu-west-1.amazonaws.com/yc-rdata/yoochoose-data.7z"

    @property
    def raw_file_names(self):
        # Name of the clicks file expected under raw_dir after download().
        return 'yoochoose-clicks.dat'

    def download(self):
        """Download and unpack the 7z archive into raw_dir."""
        archive = download_url(self.url, self.raw_dir)
        # pip install pyunpack patool
        import pyunpack
        pyunpack.Archive(archive).extractall(self.raw_dir)
        os.unlink(archive)

    def load_ratings_df(self):
        # Headerless CSV; assign the canonical column names at read time.
        return pd.read_csv(self.raw_paths[0], header=None,
                           names=['uid', 'timestamp', 'sid', 'category'])
# YooChoose clicks with the same support thresholds as the other datasets.
ds = YoochooseDataset(root='/content/yoochoose', min_uc=10, min_sc=5)
class YoochooseDataset(TorchSessionDataset):
    """YooChoose sessions served through ``TorchSessionDataset``."""

    url = 'https://github.com/RecoHut-Datasets/yoochoose/raw/v3/yoochoose.csv'

    def __init__(self, root, maxlen, is_train=True):
        # Fetch the preprocessed CSV into ``root`` and hand the local path
        # to the base-class loader.
        local_path = download_url(url=self.url, folder=root)
        super().__init__(local_path, maxlen, is_train)
# Batch a few samples to sanity-check the dataset + DataLoader wiring.
dataset = YoochooseDataset(root='/content/yoochoose', maxlen=30)
sampler = torch.utils.data.DataLoader(dataset, batch_size=8, num_workers=2, pin_memory=True)
samples = next(iter(sampler))
samples  # displayed by the notebook
class NowplayingDataset(TorchSessionDataset):
    """Nowplaying sessions served through ``TorchSessionDataset``."""

    url = 'https://github.com/RecoHut-Datasets/nowplaying/raw/v3/nowplaying.csv'

    def __init__(self, root, maxlen, is_train=True):
        # Fetch the preprocessed CSV into ``root`` and hand the local path
        # to the base-class loader.
        local_path = download_url(url=self.url, folder=root)
        super().__init__(local_path, maxlen, is_train)
# Batch a few samples to sanity-check the dataset + DataLoader wiring.
dataset = NowplayingDataset(root='/content/nowplaying', maxlen=30)
sampler = torch.utils.data.DataLoader(dataset, batch_size=8, num_workers=2, pin_memory=True)
samples = next(iter(sampler))
samples  # displayed by the notebook
class DigineticaDataset(TorchSessionDataset):
    """Diginetica sessions served through ``TorchSessionDataset``."""

    url = 'https://github.com/RecoHut-Datasets/diginetica/raw/v4/diginetica.csv'

    def __init__(self, root, maxlen, is_train=True):
        # Fetch the preprocessed CSV into ``root`` and hand the local path
        # to the base-class loader.
        local_path = download_url(url=self.url, folder=root)
        super().__init__(local_path, maxlen, is_train)
# Batch a few samples to sanity-check the dataset + DataLoader wiring.
dataset = DigineticaDataset(root='/content/diginetica', maxlen=30)
sampler = torch.utils.data.DataLoader(dataset, batch_size=8, num_workers=2, pin_memory=True)
samples = next(iter(sampler))
samples  # displayed by the notebook
class LastfmDataset(TorchSessionDataset):
    """Last.fm sessions served through ``TorchSessionDataset``."""

    url = 'https://github.com/RecoHut-Datasets/lastfm/raw/v2/last_fm.csv'

    def __init__(self, root, maxlen, is_train=True):
        # Fetch the preprocessed CSV into ``root`` and hand the local path
        # to the base-class loader.
        local_path = download_url(url=self.url, folder=root)
        super().__init__(local_path, maxlen, is_train)
# Batch a few samples to sanity-check the dataset + DataLoader wiring.
dataset = LastfmDataset(root='/content/lastfm', maxlen=30)
sampler = torch.utils.data.DataLoader(dataset, batch_size=8, num_workers=2, pin_memory=True)
samples = next(iter(sampler))
samples  # displayed by the notebook