---
title: Common utils
keywords: fastai
sidebar: home_sidebar
summary: "A collection of utilities often used."
description: "A collection of utilities often used."
nb_path: "nbs/utils/utils.common_utils.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %}

Download

{% raw %}

wget_download[source]

wget_download(url, savepath)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

download_url[source]

download_url(url:str, folder:str, log:bool=True)

Downloads the content of a URL to a specific folder. Args: url (string): The URL. folder (string): The folder. log (bool, optional): If `False`, will not print anything to the console. (default: `True`)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Example: fetch the MovieLens-1M archive into ./data/bronze. The call logs the
# URL being downloaded and evaluates to the path of the saved file.
download_url('https://files.grouplens.org/datasets/movielens/ml-1m.zip',
             './data/bronze')
Downloading https://files.grouplens.org/datasets/movielens/ml-1m.zip
'./data/bronze/ml-1m.zip'
{% endraw %} {% raw %}
!tree ./data
./data
└── bronze
    └── ml-1m.zip

1 directory, 1 file
{% endraw %} {% raw %}
!tree --du -h -C ./data
./data
├── [ 24M]  bronze
│   └── [ 24M]  ml-1m
│       ├── [167K]  movies.dat
│       ├── [ 23M]  ratings.dat
│       ├── [5.4K]  README
│       └── [131K]  users.dat
└── [3.0M]  silver
    └── [3.0M]  ml-1m_min_rating0-min_uc5-min_sc5-splitleave_one_out
        └── [3.0M]  dataset.pkl

  27M used in 4 directories, 5 files
{% endraw %}

Extract

{% raw %}

extract_tar[source]

extract_tar(path:str, folder:str, mode:str='r:gz', log:bool=True)

Extracts a tar archive to a specific folder. Args: path (string): The path to the tar archive. folder (string): The folder. mode (string, optional): The compression mode. (default: `"r:gz"`) log (bool, optional): If `False`, will not print anything to the console. (default: `True`)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

extract_zip[source]

extract_zip(path:str, folder:str, log:bool=True)

Extracts a zip archive to a specific folder. Args: path (string): The path to the zip archive. folder (string): The folder. log (bool, optional): If `False`, will not print anything to the console. (default: `True`)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

extract_bz2[source]

extract_bz2(path:str, folder:str, log:bool=True)

Extracts a bz2 archive to a specific folder. Args: path (string): The path to the bz2 archive. folder (string): The folder. log (bool, optional): If `False`, will not print anything to the console. (default: `True`)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

extract_gz[source]

extract_gz(path:str, folder:str, log:bool=True)

Extracts a gz archive to a specific folder. Args: path (string): The path to the gz archive. folder (string): The folder. log (bool, optional): If `False`, will not print anything to the console. (default: `True`)

{% endraw %} {% raw %}
{% endraw %}

Tabulate

{% raw %}

print_result_as_table(results, tag=None)

Print results as a table.

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# One dict per model: the 'model' value becomes the column header and every
# other key becomes a metric row. NCF reports nDCG while MF does not, and the
# rendered table shows `--` in MF's nDCG cell.
results = [{'model':'MF', 'MRR':.35},
           {'model':'NCF', 'MRR':.42, 'nDCG':.25}]

print_result_as_table(results)
--------------------------------------------------------------------------------
+------+------+-------+
|      | MF   |   NCF |
|------+------+-------|
| MRR  | 0.35 |  0.42 |
| nDCG | --   |  0.25 |
+------+------+-------+
--------------------------------------------------------------------------------
{% endraw %}

Listing

{% raw %}

list_files[source]

list_files(startpath)

Util function to print the nested structure of a directory

{% endraw %} {% raw %}
{% endraw %} {% raw %}
# Example: print the nested contents of ./sample_data, one entry per line.
list_files('./sample_data')
sample_data/
    README.md
    anscombe.json
    california_housing_train.csv
    california_housing_test.csv
    mnist_test.csv
    mnist_train_small.csv
{% endraw %}

Deterministic

{% raw %}
{% endraw %} {% raw %}

seed_everything[source]

seed_everything(seed=40)

Sets the random seed to establish deterministic behavior.

Args: seed (int): the random seed integer

{% endraw %} {% raw %}
{% endraw %}

Mapping, Masking, and Padding

{% raw %}
{% endraw %} {% raw %}

map_column[source]

map_column(df:DataFrame, col_name:str)

Maps column values to integers.

{% endraw %} {% raw %}

get_context[source]

get_context(df:DataFrame, split:str, context_size:int=120, val_context_size:int=5, seed:int=42)

Create training / validation samples.

{% endraw %} {% raw %}

pad_arr[source]

pad_arr(arr:ndarray, expected_size:int=30)

Pad top of array when there is not enough history.

{% endraw %} {% raw %}

pad_list[source]

pad_list(list_integers, history_size:int, pad_val:int=0, mode='left')

Pad list from left or right

{% endraw %} {% raw %}

mask_list[source]

mask_list(l1, p=0.8, mask=1, seed=42)

{% endraw %} {% raw %}

mask_last_elements_list[source]

mask_last_elements_list(l1, val_context_size:int=5, seed=42)

{% endraw %} {% raw %}

masked_accuracy[source]

masked_accuracy(y_pred:Tensor, y_true:Tensor, mask:Tensor)

{% endraw %} {% raw %}

masked_ce[source]

masked_ce(y_pred, y_true, mask)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
import unittest
from numpy.testing import assert_array_equal
{% endraw %} {% raw %}
class TestUtils(unittest.TestCase):
    """Unit tests for the column-mapping, context-split and padding helpers."""

    def testColMapping(self):
        """test the column mapping function

        Maps the 'sid' column of a four-row frame and checks the values
        written to the new ``sid_mapped`` column.
        """
        frame = pd.DataFrame({'uid': [1, 2, 3, 4], 'sid': [1, 3, 5, 7]})

        # map_column returns three values; only the frame is inspected here.
        mapped_frame, _, _ = map_column(frame, col_name='sid')

        # The expected mapped ids are consecutive and start at 2.
        expected_ids = [2, 3, 4, 5]
        assert_array_equal(mapped_frame.sid_mapped.values, expected_ids)

    def testSplit(self):
        """test the train/test/val split

        Draws a 5-item training context from 50 sequential rows with a
        fixed seed and checks which ``sid`` values were selected.
        """
        SEED = 42
        columns = {name: list(np.arange(50)) for name in ('uid', 'sid')}
        frame = pd.DataFrame(columns)

        window = get_context(frame, split='train', context_size=5, seed=SEED)

        expected_sids = [12, 13, 14, 15, 16]
        assert_array_equal(window.sid.values, expected_sids)

    def testArrayPadding(self):
        """test array padding function

        Covers a two-row and a one-row input; in both expectations the
        first row is repeated at the top until ``expected_size`` rows exist.
        """
        two_rows = np.array([[1, 2, 3], [7, 8, 9]])
        one_row = np.array([[1, 2, 3]])

        padded_to_five = pad_arr(two_rows, expected_size=5)
        padded_to_three = pad_arr(one_row, expected_size=3)

        # Four copies of the first row, then the original last row.
        assert_array_equal(padded_to_five, [[1, 2, 3]] * 4 + [[7, 8, 9]])
        assert_array_equal(padded_to_three, [[1, 2, 3]] * 3)

    def testListPadding(self):
        """test list padding function

        Checks left padding with 0 up to length 5 and right padding with 1
        up to length 6.
        """
        history = [1, 2, 3]

        left_padded = pad_list(history, history_size=5, pad_val=0, mode='left')
        right_padded = pad_list(history, history_size=6, pad_val=1, mode='right')

        assert_array_equal(left_padded, [0, 0, 1, 2, 3])
        assert_array_equal(right_padded, [1, 2, 3, 1, 1, 1])
{% endraw %} {% raw %}
class TestModelUtils(unittest.TestCase):
    """Unit tests for the masked metrics and the list-masking helpers."""

    def testMaskedAccuracy(self):
        """test the masked accuracy

        Predictions and targets differ only in the last position. With
        every position unmasked 3 of 4 agree (0.75); with only the first
        and last positions unmasked 1 of 2 agree (0.5).
        """
        predicted = [[0, 1, 1, 0]]
        actual = [[0, 1, 1, 1]]

        # Fresh tensors are built for every call, as in the original test.
        acc_all = masked_accuracy(torch.Tensor(predicted),
                                  torch.Tensor(actual),
                                  torch.tensor([1, 1, 1, 1], dtype=torch.bool))
        acc_ends = masked_accuracy(torch.Tensor(predicted),
                                   torch.Tensor(actual),
                                   torch.tensor([1, 0, 0, 1], dtype=torch.bool))

        self.assertEqual(acc_all, torch.tensor(0.75, dtype=torch.float64))
        self.assertEqual(acc_ends, torch.tensor(0.5, dtype=torch.float64))

    def testMaskedCrossEntropy(self):
        # No docstring on purpose: unittest's verbose output prints the first
        # docstring line, and the recorded run shows none for this test.
        # Locals are named after masked_ce's parameters instead of `input`,
        # which shadowed the builtin.
        y_pred = [[1.1049, 1.5729, 1.4864],
                  [-1.8321, -0.3137, -0.3257]]
        y_true = [0, 2]

        # Only the first row contributes to the loss.
        loss_first_row = masked_ce(torch.tensor(y_pred),
                                   torch.tensor(y_true),
                                   torch.tensor([1, 0], dtype=torch.bool))
        # Both rows contribute to the loss.
        loss_both_rows = masked_ce(torch.tensor(y_pred),
                                   torch.tensor(y_true),
                                   torch.tensor([1, 1], dtype=torch.bool))

        # Losses are rounded to 4 decimals so they can be compared exactly
        # against the float32 reference values.
        assert_array_equal(loss_first_row.numpy().round(4),
                           np.array(1.4015, dtype=np.float32))
        assert_array_equal(loss_both_rows.numpy().round(4),
                           np.array(1.1026, dtype=np.float32))

    def testMaskList(self):
        # Same input under two seeds; the two expectations differ in which
        # positions hold 1 instead of their original value.
        items = [1, 2, 3, 4, 5, 6, 7, 8]

        assert_array_equal(mask_list(items, seed=42),
                           [1, 2, 3, 4, 5, 6, 1, 8])
        assert_array_equal(mask_list(items, seed=40),
                           [1, 1, 3, 4, 1, 6, 7, 8])

    def testMaskListLastElement(self):
        # Same input and seed under two val_context_size values (5 and 3).
        seed = 42
        items = [1, 2, 3, 4, 5, 6, 7, 8]

        masked_last_five = mask_last_elements_list(items, val_context_size=5, seed=seed)
        masked_last_three = mask_last_elements_list(items, val_context_size=3, seed=seed)

        assert_array_equal(masked_last_five, [1, 2, 3, 1, 5, 6, 7, 1])
        assert_array_equal(masked_last_three, [1, 2, 3, 4, 5, 1, 7, 8])
{% endraw %} {% raw %}
# Run every TestCase defined above from inside the notebook. argv=[''] makes
# unittest use this argument list instead of sys.argv (the kernel's own
# command line); verbosity=2 prints one line per test; exit=False stops
# unittest from calling sys.exit() when the run finishes.
unittest.main(argv=[''], verbosity=2, exit=False)
testMaskList (__main__.TestModelUtils) ... ok
testMaskListLastElement (__main__.TestModelUtils) ... ok
testMaskedAccuracy (__main__.TestModelUtils)
test the masked accuracy ... ok
testMaskedCrossEntropy (__main__.TestModelUtils) ... ok
testArrayPadding (__main__.TestUtils)
test array padding function ... ok
testColMapping (__main__.TestUtils)
test the column mapping function ... ok
testListPadding (__main__.TestUtils)
test list padding function ... ok
testSplit (__main__.TestUtils)
test the train/test/val split ... ok

----------------------------------------------------------------------
Ran 8 tests in 0.032s

OK
<unittest.main.TestProgram at 0x7fc8ba85ced0>
{% endraw %}