Module deepsport_utilities.dataset

import dataclasses
from enum import IntFlag
import errno
import os
import random

import numpy as np

from mlworkflow.datasets import batchify, Dataset, FilteredDataset, AugmentedDataset
from aleatorpy import pseudo_random, method # pylint: disable=unused-import




def collate_fn(items):
    return {f"batch_{k}": v for k,v in batchify(items).items()}


# This object is defined both in here and in experimentator repository
# Any change here should be reported in experimentator as well
class SubsetType(IntFlag):
    TRAIN = 1
    EVAL  = 2

# This object is defined both in here and in experimentator repository
# Any change here should be reported in experimentator as well
class Subset:
    def __init__(self, name: str, subset_type: SubsetType, dataset: Dataset, keys=None, repetitions=1, desc=None):
        keys = keys if keys is not None else dataset.keys.all()
        assert isinstance(keys, (tuple, list)), f"Received instance of {type(keys)} for subset {name}"
        self.name = name
        self.type = subset_type
        self.dataset = FilteredDataset(dataset, predicate=lambda k,v: v is not None)
        self._keys = keys
        self.keys = keys
        self.repetitions = repetitions
        self.desc = desc
        self.is_training = self.type == SubsetType.TRAIN
        loop = None if self.is_training else repetitions
        self.shuffled_keys = pseudo_random(evolutive=self.is_training)(self.shuffled_keys)
        self.dataset.query_item = pseudo_random(loop=loop, input_dependent=True)(self.dataset.query_item)

    def shuffled_keys(self): # pylint: disable=method-hidden
        keys = self.keys * self.repetitions
        return random.sample(keys, len(keys)) if self.is_training else keys

    def __len__(self):
        return len(self.keys)*self.repetitions

    def __str__(self):
        return f"{self.__class__.__name__}<{self.name}>({len(self)})"

    def batches(self, batch_size, keys=None, *args, **kwargs):
        keys = keys or self.shuffled_keys()
        yield from self.dataset.batches(batch_size=batch_size, keys=keys, collate_fn=collate_fn, *args, **kwargs)


class CombinedSubset(Subset):
    def __init__(self, name, *subsets):
        self.subsets = subsets
        self.name = name
        assert len(set(subset.type for subset in subsets)) == 1, "Combined Subsets must have the same type"
        self.type = subsets[0].type

    def __len__(self):
        return min(len(subset) for subset in self.subsets)*len(self.subsets)

    def batches(self, batch_size, **kwargs):
        assert batch_size % len(self.subsets) == 0, f"Batch size must be a multiple of the number of subsets ({len(self.subsets)})"
        batch_size = batch_size // len(self.subsets)
        iterators = [subset.batches(batch_size, **kwargs) for subset in self.subsets]
        while True:
            try:
                key_chunks, chunks = zip(*[next(it) for it in iterators])
            except StopIteration:
                break
            keys = [key for key_chunk in key_chunks for key in key_chunk]
            batch = {k: np.concatenate([chunk[k] for chunk in chunks]) for k in chunks[0]}
            yield keys, batch


class BalancedSubest(Subset):
    """ Subset intended to balance the sampled keys according to `balancing_attr`.
        Note: the balancing logic is not implemented yet.
    """
    def __init__(self, balancing_attr, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.balancing_attr = balancing_attr
    def shuffled_keys(self):
        # TODO: balancing logic based on `balancing_attr` not implemented yet
        return None

class MergedDataset(Dataset):
    def __init__(self, *ds):
        self.ds = ds
        self.cache = {}
    def yield_keys(self):
        for ds in self.ds:
            for key in ds.yield_keys():
                self.cache[key] = ds
                yield key
    def query_item(self, key):
        return self.cache[key].query_item(key)



class TolerentDataset(AugmentedDataset):
    def __init__(self, parent, retry=0):
        super().__init__(parent)
        self.retry = retry
    def augment(self, root_key, root_item):
        retry = self.retry
        while root_item is None and retry:
            root_item = self.parent.query_item(root_key)
            retry -= 1
        return root_item



class DatasetSamplerDataset(Dataset):
    def __init__(self, dataset, count):
        self.parent = dataset
        self.keys = random.sample(list(dataset.keys.all()), count)
    def yield_keys(self):
        for key in self.keys:
            yield key
    def query_item(self, key):
        return self.parent.query_item(key)


@dataclasses.dataclass
class BasicDatasetSplitter:
    validation_pc: int = 15
    testing_pc: int = 15
    def __post_init__(self):
        assert self.validation_pc + self.testing_pc < 100

    def __call__(self, dataset, fold=0):
        keys = list(dataset.keys.all())
        l = len(keys)

        # Backup random seed
        random_state = random.getstate()
        random.seed(fold)

        random.shuffle(keys)

        # Restore random seed
        random.setstate(random_state)

        u1 = self.validation_pc
        u2 = self.validation_pc + self.testing_pc

        validation_keys = keys[00*l//100:u1*l//100]
        testing_keys    = keys[u1*l//100:u2*l//100]
        training_keys   = keys[u2*l//100:]

        return [
            Subset("training", subset_type=SubsetType.TRAIN, keys=training_keys, dataset=dataset),
            Subset("validation", subset_type=SubsetType.EVAL, keys=validation_keys, dataset=dataset),
            Subset("testing", subset_type=SubsetType.EVAL, keys=testing_keys, dataset=dataset),
        ]


def split_equally(d, K):
    """ splits equally the keys of d given their values
        arguments:
            d (dict) - A dict {"label1": 30, "label2": 45, "label3": 22, ... "label<N>": 14}
            K (int)  - The number of split to make
        returns:
            A list of 'K' lists splitting equally the values of 'd':
            e.g. [[label1, label12, label19], [label2, label15], [label3, label10, label11], ...]
            where
            ```
               d["label1"]+d["label12"]+d["label19"]  ~=  d["label2"]+d["label15"]  ~=  d["label3"]+d["label10"]+d["label11]
            ```
    """
    s = sorted(d.items(), key=lambda kv: kv[1])
    f = [{"count": 0, "list": []} for _ in range(K)]
    while s:
        arena_label, count = s.pop(-1)
        index, _ = min(enumerate(f), key=(lambda x: x[1]["count"]))
        f[index]["count"] += count
        f[index]["list"].append(arena_label)
    return [x["list"] for x in f]



def find(path, dirs=None, verbose=True):
    if os.path.isabs(path):
        if not os.path.isfile(path) and not os.path.isdir(path):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
        return path

    dirs = dirs or [os.getcwd(), *os.getenv("DATA_PATH", "").split(":")]
    for dirname in dirs:
        if dirname is None:
            continue
        tmp_path = os.path.join(dirname, path)
        if os.path.isfile(tmp_path) or os.path.isdir(tmp_path):
            if verbose:
                print("{} found in {}".format(path, tmp_path))
            return tmp_path

    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                "{} (searched in {})".format(path, dirs))

Functions

def collate_fn(items)
def collate_fn(items):
    return {f"batch_{k}": v for k,v in batchify(items).items()}
def find(path, dirs=None, verbose=True)
def find(path, dirs=None, verbose=True):
    if os.path.isabs(path):
        if not os.path.isfile(path) and not os.path.isdir(path):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), path)
        return path

    dirs = dirs or [os.getcwd(), *os.getenv("DATA_PATH", "").split(":")]
    for dirname in dirs:
        if dirname is None:
            continue
        tmp_path = os.path.join(dirname, path)
        if os.path.isfile(tmp_path) or os.path.isdir(tmp_path):
            if verbose:
                print("{} found in {}".format(path, tmp_path))
            return tmp_path

    raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT),
                                "{} (searched in {})".format(path, dirs))
def split_equally(d, K)

Splits the keys of 'd' into groups whose values sum up approximately equally.

arguments:
    d (dict) - A dict {"label1": 30, "label2": 45, "label3": 22, ... "label<N>": 14}
    K (int)  - The number of splits to make
returns:
    A list of 'K' lists splitting the keys of 'd' such that the values of each list sum up approximately equally, e.g. [[label1, label12, label19], [label2, label15], [label3, label10, label11], ...] where d["label1"]+d["label12"]+d["label19"] ~= d["label2"]+d["label15"] ~= d["label3"]+d["label10"]+d["label11"]

def split_equally(d, K):
    """ splits equally the keys of d given their values
        arguments:
            d (dict) - A dict {"label1": 30, "label2": 45, "label3": 22, ... "label<N>": 14}
            K (int)  - The number of split to make
        returns:
            A list of 'K' lists splitting equally the values of 'd':
            e.g. [[label1, label12, label19], [label2, label15], [label3, label10, label11], ...]
            where
            ```
               d["label1"]+d["label12"]+d["label19"]  ~=  d["label2"]+d["label15"]  ~=  d["label3"]+d["label10"]+d["label11]
            ```
    """
    s = sorted(d.items(), key=lambda kv: kv[1])
    f = [{"count": 0, "list": []} for _ in range(K)]
    while s:
        arena_label, count = s.pop(-1)
        index, _ = min(enumerate(f), key=(lambda x: x[1]["count"]))
        f[index]["count"] += count
        f[index]["list"].append(arena_label)
    return [x["list"] for x in f]

Classes

class BalancedSubest (balancing_attr, *args, **kwargs)

Subset intended to balance the sampled keys according to balancing_attr. Note: the balancing logic is not implemented yet.

class BalancedSubest(Subset):
    """ Subset intended to balance the sampled keys according to `balancing_attr`.
        Note: the balancing logic is not implemented yet.
    """
    def __init__(self, balancing_attr, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.balancing_attr = balancing_attr
    def shuffled_keys(self):
        # TODO: balancing logic based on `balancing_attr` not implemented yet
        return None

Ancestors

  • Subset

Methods

def shuffled_keys(self)

def shuffled_keys(self):
    # TODO: balancing logic based on `balancing_attr` not implemented yet
    return None
class BasicDatasetSplitter (validation_pc: int = 15, testing_pc: int = 15)

BasicDatasetSplitter(validation_pc: int = 15, testing_pc: int = 15)

@dataclasses.dataclass
class BasicDatasetSplitter:
    validation_pc: int = 15
    testing_pc: int = 15
    def __post_init__(self):
        assert self.validation_pc + self.testing_pc < 100

    def __call__(self, dataset, fold=0):
        keys = list(dataset.keys.all())
        l = len(keys)

        # Backup random seed
        random_state = random.getstate()
        random.seed(fold)

        random.shuffle(keys)

        # Restore random seed
        random.setstate(random_state)

        u1 = self.validation_pc
        u2 = self.validation_pc + self.testing_pc

        validation_keys = keys[00*l//100:u1*l//100]
        testing_keys    = keys[u1*l//100:u2*l//100]
        training_keys   = keys[u2*l//100:]

        return [
            Subset("training", subset_type=SubsetType.TRAIN, keys=training_keys, dataset=dataset),
            Subset("validation", subset_type=SubsetType.EVAL, keys=validation_keys, dataset=dataset),
            Subset("testing", subset_type=SubsetType.EVAL, keys=testing_keys, dataset=dataset),
        ]

Class variables

var testing_pc : int
var validation_pc : int
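
A minimal usage sketch. ToyDataset is hypothetical; any mlworkflow Dataset should work, assuming the base class derives dataset.keys.all() from yield_keys(), which is the only thing the splitter relies on.

from mlworkflow.datasets import Dataset

from deepsport_utilities.dataset import BasicDatasetSplitter

class ToyDataset(Dataset):
    def yield_keys(self):
        yield from range(100)
    def query_item(self, key):
        return {"value": key}

# 70/15/15 split; the shuffle is seeded with `fold`, so it is reproducible.
subsets = BasicDatasetSplitter(validation_pc=15, testing_pc=15)(ToyDataset(), fold=0)
print([str(s) for s in subsets])
# ['Subset<training>(70)', 'Subset<validation>(15)', 'Subset<testing>(15)']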
class CombinedSubset (name, *subsets)
class CombinedSubset(Subset):
    def __init__(self, name, *subsets):
        self.subsets = subsets
        self.name = name
        assert len(set(subset.type for subset in subsets)) == 1, "Combined Subsets must have the same type"
        self.type = subsets[0].type

    def __len__(self):
        return min(len(subset) for subset in self.subsets)*len(self.subsets)

    def batches(self, batch_size, **kwargs):
        assert batch_size % len(self.subsets) == 0, f"Batch size must be a multiple of the number of subsets ({len(self.subsets)})"
        batch_size = batch_size // len(self.subsets)
        iterators = [subset.batches(batch_size, **kwargs) for subset in self.subsets]
        while True:
            try:
                key_chunks, chunks = zip(*[next(it) for it in iterators])
            except StopIteration:
                break
            keys = [key for key_chunk in key_chunks for key in key_chunk]
            batch = {k: np.concatenate([chunk[k] for chunk in chunks]) for k in chunks[0]}
            yield keys, batch

Ancestors

  • Subset

Methods

def batches(self, batch_size, **kwargs)
def batches(self, batch_size, **kwargs):
    assert batch_size % len(self.subsets) == 0, f"Batch size must be a multiple of the number of subsets ({len(self.subsets)})"
    batch_size = batch_size // len(self.subsets)
    iterators = [subset.batches(batch_size, **kwargs) for subset in self.subsets]
    while True:
        try:
            key_chunks, chunks = zip(*[next(it) for it in iterators])
        except StopIteration:
            break
        keys = [key for key_chunk in key_chunks for key in key_chunk]
        batch = {k: np.concatenate([chunk[k] for chunk in chunks]) for k in chunks[0]}
        yield keys, batch
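
An illustrative sketch with hypothetical stand-in subsets (they bypass Subset.__init__ and only implement what CombinedSubset actually uses): every batch takes an equal share from each subset and concatenates the per-field arrays; iteration stops once the shortest subset is exhausted.

import numpy as np

from deepsport_utilities.dataset import CombinedSubset, Subset, SubsetType

class StubSubset(Subset):
    def __init__(self, name, offset):  # deliberately skips Subset.__init__
        self.name, self.type, self.offset = name, SubsetType.TRAIN, offset
    def __len__(self):
        return 4
    def batches(self, batch_size, **kwargs):
        for i in range(0, 4, batch_size):
            keys = [self.offset + i + j for j in range(batch_size)]
            yield keys, {"x": np.array(keys)}

combined = CombinedSubset("train", StubSubset("a", 0), StubSubset("b", 100))
for keys, batch in combined.batches(batch_size=4):  # 2 keys drawn from each stub per batch
    print(keys, batch["x"])
# keys: [0, 1, 100, 101] then [2, 3, 102, 103]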
class DatasetSamplerDataset (dataset, count)
class DatasetSamplerDataset(Dataset):
    def __init__(self, dataset, count):
        self.parent = dataset
        self.keys = random.sample(list(dataset.keys.all()), count)
    def yield_keys(self):
        for key in self.keys:
            yield key
    def query_item(self, key):
        return self.parent.query_item(key)

Ancestors

  • mlworkflow.datasets.Dataset

Methods

def query_item(self, key)

Returns a tuple for one item, typically (Xi, Yi), or (Xi,)

def query_item(self, key):
    return self.parent.query_item(key)
def yield_keys(self)
def yield_keys(self):
    for key in self.keys:
        yield key
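
A quick sketch (assuming mlworkflow's DictDataset, and that its keys.all() returns the parent keys): keep a random sample of 2 out of 4 parent keys and forward item queries to the parent.

from mlworkflow.datasets import DictDataset

from deepsport_utilities.dataset import DatasetSamplerDataset

parent = DictDataset({k: {"value": k} for k in range(4)})
sampled = DatasetSamplerDataset(parent, count=2)
keys = list(sampled.yield_keys())
print(keys, [sampled.query_item(k) for k in keys])
# e.g. [3, 0] [{'value': 3}, {'value': 0}]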
class MergedDataset (*ds)
class MergedDataset(Dataset):
    def __init__(self, *ds):
        self.ds = ds
        self.cache = {}
    def yield_keys(self):
        for ds in self.ds:
            for key in ds.yield_keys():
                self.cache[key] = ds
                yield key
    def query_item(self, key):
        return self.cache[key].query_item(key)

Ancestors

  • mlworkflow.datasets.Dataset

Methods

def query_item(self, key)

Returns a tuple for one item, typically (Xi, Yi), or (Xi,)

def query_item(self, key):
    return self.cache[key].query_item(key)
def yield_keys(self)
def yield_keys(self):
    for ds in self.ds:
        for key in ds.yield_keys():
            self.cache[key] = ds
            yield key
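
A minimal sketch (again assuming mlworkflow's DictDataset): keys from both parents are exposed as a single dataset. Note that query_item relies on the key-to-parent cache that is filled while iterating yield_keys.

from mlworkflow.datasets import DictDataset

from deepsport_utilities.dataset import MergedDataset

merged = MergedDataset(DictDataset({"a": 1}), DictDataset({"b": 2}))
keys = list(merged.yield_keys())             # ['a', 'b'] -- also fills the key->parent cache
print([merged.query_item(k) for k in keys])  # [1, 2]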
class Subset (name: str, subset_type: SubsetType, dataset: mlworkflow.datasets.Dataset, keys=None, repetitions=1, desc=None)
class Subset:
    def __init__(self, name: str, subset_type: SubsetType, dataset: Dataset, keys=None, repetitions=1, desc=None):
        keys = keys if keys is not None else dataset.keys.all()
        assert isinstance(keys, (tuple, list)), f"Received instance of {type(keys)} for subset {name}"
        self.name = name
        self.type = subset_type
        self.dataset = FilteredDataset(dataset, predicate=lambda k,v: v is not None)
        self._keys = keys
        self.keys = keys
        self.repetitions = repetitions
        self.desc = desc
        self.is_training = self.type == SubsetType.TRAIN
        loop = None if self.is_training else repetitions
        self.shuffled_keys = pseudo_random(evolutive=self.is_training)(self.shuffled_keys)
        self.dataset.query_item = pseudo_random(loop=loop, input_dependent=True)(self.dataset.query_item)

    def shuffled_keys(self): # pylint: disable=method-hidden
        keys = self.keys * self.repetitions
        return random.sample(keys, len(keys)) if self.is_training else keys

    def __len__(self):
        return len(self.keys)*self.repetitions

    def __str__(self):
        return f"{self.__class__.__name__}<{self.name}>({len(self)})"

    def batches(self, batch_size, keys=None, *args, **kwargs):
        keys = keys or self.shuffled_keys()
        yield from self.dataset.batches(batch_size=batch_size, keys=keys, collate_fn=collate_fn, *args, **kwargs)

Subclasses

  • BalancedSubest
  • CombinedSubset

Methods

def batches(self, batch_size, keys=None, *args, **kwargs)
def batches(self, batch_size, keys=None, *args, **kwargs):
    keys = keys or self.shuffled_keys()
    yield from self.dataset.batches(batch_size=batch_size, keys=keys, collate_fn=collate_fn, *args, **kwargs)
def shuffled_keys(self)
def shuffled_keys(self): # pylint: disable=method-hidden
    keys = self.keys * self.repetitions
    return random.sample(keys, len(keys)) if self.is_training else keys
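
A small sketch of the key-repetition behaviour (assuming mlworkflow's DictDataset): an EVAL subset keeps its keys in order and repeats them `repetitions` times, whereas a TRAIN subset would shuffle them.

from mlworkflow.datasets import DictDataset

from deepsport_utilities.dataset import Subset, SubsetType

dataset = DictDataset({k: {"value": k} for k in range(4)})
subset = Subset("validation", SubsetType.EVAL, dataset, keys=list(range(4)), repetitions=2)
print(len(subset))             # 8
print(subset.shuffled_keys())  # [0, 1, 2, 3, 0, 1, 2, 3] -- not shuffled for EVAL subsets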
class SubsetType (value, names=None, *, module=None, qualname=None, type=None, start=1)

An enumeration.

class SubsetType(IntFlag):
    TRAIN = 1
    EVAL  = 2

Ancestors

  • enum.IntFlag
  • builtins.int
  • enum.Flag
  • enum.Enum

Class variables

var EVAL
var TRAIN
class TolerentDataset (parent, retry=0)

"Augments" a dataset in the sense that it can produce many child items from one root item of the dataset. The root key must be retrievable from the child key. By convention, the root key is in the first element of the child key. This is overridable with the root_key method.

>>> class PermutingDataset(AugmentedDataset):
...     def augment(self, root_key, root_item):
...         yield (root_key, 0), root_item
...         yield (root_key, 1), root_item[::-1]
>>> d = DictDataset({0: ("Denzel", "Washington"), 1: ("Tom", "Hanks")})
>>> d = PermutingDataset(d)
>>> new_keys = d.keys()
>>> new_keys
((0, 0), (0, 1), (1, 0), (1, 1))
>>> d.query(new_keys)
(array(['Denzel', 'Washington', 'Tom', 'Hanks'], ...),
 array(['Washington', 'Denzel', 'Hanks', 'Tom'], ...))
class TolerentDataset(AugmentedDataset):
    def __init__(self, parent, retry=0):
        super().__init__(parent)
        self.retry = retry
    def augment(self, root_key, root_item):
        retry = self.retry
        while root_item is None and retry:
            root_item = self.parent.query_item(root_key)
            retry -= 1
        return root_item

Ancestors

  • mlworkflow.datasets.AugmentedDataset
  • mlworkflow.datasets.Dataset

Methods

def augment(self, root_key, root_item)
def augment(self, root_key, root_item):
    retry = self.retry
    while root_item is None and retry:
        root_item = self.parent.query_item(root_key)
        retry -= 1
    return root_item