# Source code for cero

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#      ConCERO - a program to automate data format conversion and the execution of economic modelling software.
#      Copyright (C) 2018  CSIRO Energy Business Unit
#
#     This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License as published by
#     the Free Software Foundation, either version 3 of the License, or
#     (at your option) any later version.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
.. _cero:

A core concept in the operation of ConCERO is that of a 'Collins Economic Results Object' - a CERO - which \
serves as a standard format for data-interchange between economic modelling programs. Conceptually, the CERO \
is a set of instances of a 'fundamental data type', a discussion of which can be found in the :ref:`Design_Philosophy` \
documentation.

Software-wise, the CERO is a ``pandas.DataFrame`` with some additional constraints. Those constraints are:

    * ``cero.index`` must be an instance of the ``pandas.Index`` class, and
    * ``cero.columns`` must be an instance of the ``pandas.DatetimeIndex`` class, and
    * both ``cero.index`` and ``cero.columns`` values must be unique and
    * all index values must be valid *identifiers* (see below) and
    * ``cero`` data/array values must all be of 32-bit floating-point type (specifically, be instances of a \
    subclass of the ``numpy.float32`` class),

where ``cero`` is a CERO. The values of ``cero.index`` are referred to as *identifiers*.

.. _cero_ids:

CERO Identifiers
----------------

As mentioned previously, values of the index of a CERO are referred to as *identifiers*. Identifiers are \
subject to a couple of restrictions. They are:

    * The identifier must be unique - that is, no other value of ``cero.index`` can be exactly the same.
    * The identifier must be either:
        * a string (``str``) *with no commas*, or
        * a tuple of strings, where each string does not have any commas.

The comma constraint is a result of how ConCERO interprets commas when reading YAML files - ConCERO \
interprets commas as a string-splitting character. Thus, \
if a configuration file contains the string:

    ``"hello,world"``

in the context of CERO *identifiers*, then this will be interpreted as the python tuple:

    ``('hello','world')``

Note also that any whitespace is stripped when the string is split, so the string:

    ``"hello, world"``

also becomes:

    ``('hello','world')``

and this:

    ``" L_OUTPUT, Electricity, AUS"``

becomes:

    ``("L_OUTPUT","Electricity","AUS")``

The advantage of the tuple form of identifier is that it preserves ordered relationships, even though that \
ordered relationship has no meaning within the CERO itself. This is necessary to store data that is more \
than 2-dimensional in nature in 2-dimensions. It also allows for the implementation of ``sets`` (see :ref:`sets`), \
which provide the user with significant flexibility and power with respect to selecting identifiers of interest. \
In summary, ``sets`` allow the user to select large numbers of identifiers by just listing sets, \
as opposed to all the identifiers.


.. * VURM
.. * AusTIMES
.. * LUTO
.. * GALLM-E
.. * GALLM-T
.. * GTAP-E

.. The CERO provides methods for use with a pandas dataframe. Subclassing \
.. ``pandas.DataFrame`` is not a trivial exercise.

Created on Wed Dec 20 10:20:32 2017

@author: Lyle Collins
@email: Lyle.Collins@csiro.au
"""
import os

import pandas as pd
import numpy as np

from concero._identifier import _Identifier

class CERO(object):
    """Static helpers for creating, validating, reading and combining CEROs.

    A CERO is a plain ``pandas.DataFrame`` that passes :meth:`CERO.is_cero`.
    Every method on this class is a ``staticmethod``; the class only serves as
    a namespace for those helpers, their error messages and exception types.
    """

    # Messages used by ``is_cero`` when a check fails.
    _msg_inv_type = "Object not of CERO type (pandas.DataFrame)."
    _msg_bad_idx = r"Object's index is not of pandas.Index type."
    _msg_bad_col = "Object.columns is not of pandas.DatetimeIndex type."
    _msg_idx_nunique = "Index values are not unique."
    _msg_col_nunique = "Column values are not unique."
    _msg_val_type = "Values are not of numpy.float32 type."
    _msg_empty_cero = "Object is empty - not a valid CERO."
    _msg_invalid_ids = "Values in object's index are not valid identifiers."

    class InvalidCERO(TypeError):
        """Raised when an object fails one of the structural CERO checks."""

    class CEROIndexConflict(RuntimeError):
        """Raised by ``combine_ceros`` when identifiers clash and overwriting is not allowed."""

    class EmptyCERO(ValueError):
        """Raised by ``is_cero`` when ``empty_ok`` is false and every value is NaN."""

    @staticmethod
    def create_empty():
        """Return an empty CERO: no identifiers and no (datetime) columns."""
        # The index and columns are given explicitly so that both have the
        # exact types ``is_cero`` checks for (``pandas.Index`` and
        # ``pandas.DatetimeIndex``), whatever pandas picks as the default
        # index type for an empty frame.
        return pd.DataFrame(index=pd.Index([], dtype=object),
                            columns=pd.DatetimeIndex([]))

    @staticmethod
    def _duplicated_values(index):
        """Return the distinct values that occur more than once in ``index``, as a list."""
        return index[index.duplicated()].unique().tolist()

    @staticmethod
    def is_cero(obj, raise_exception=True, empty_ok=True):
        """Check whether ``obj`` satisfies every CERO constraint.

        The checks, in order, are: ``obj`` is a ``pandas.DataFrame``; its index
        is exactly of type ``pandas.Index`` (subclasses such as ``MultiIndex``
        are rejected); its columns are a ``pandas.DatetimeIndex``; index and
        column values are unique; every column dtype is ``numpy.float32``;
        every index value is accepted by ``_Identifier.is_valid``; and, if
        ``empty_ok`` is false, at least one value is not NaN.

        :param obj: The object that may or may not be a CERO.
        :param bool raise_exception: If `True` (the default) an exception is raised when \
        ``obj`` is not a CERO. If `False`, `False` is returned instead.
        :param bool empty_ok: If `False`, ``obj`` must have at least one value that is not \
        NaN to qualify as a CERO. `True` by default (all-NaN objects are accepted).
        :return bool: `True` if ``obj`` is a CERO, `False` if it is not and \
        ``raise_exception`` is `False`.
        :raises CERO.InvalidCERO: If a structural check fails and ``raise_exception`` is `True`.
        :raises CERO.EmptyCERO: If ``obj`` contains only NaNs, ``empty_ok`` is `False` and \
        ``raise_exception`` is `True`.
        """

        def _fail(message, exc_type=CERO.InvalidCERO):
            # Single exit point for a failed check, so ``raise_exception`` is
            # honoured uniformly by every check (including the first one).
            if raise_exception:
                raise exc_type(message)
            return False

        # Explicit ``if`` tests rather than ``assert``: assertions are removed
        # when Python runs with ``-O``, which would disable the validation.
        if not isinstance(obj, pd.DataFrame):
            return _fail(CERO._msg_inv_type)

        # Deliberately an exact type comparison, not ``isinstance``.
        if type(obj.index) is not pd.Index:
            return _fail(CERO._msg_bad_idx + (" Instead it is of type '%s'." % type(obj.index)))

        if not isinstance(obj.columns, pd.DatetimeIndex):
            return _fail(CERO._msg_bad_col)

        if not obj.index.is_unique:
            return _fail(CERO._msg_idx_nunique
                         + (" Duplicated values are: %s." % CERO._duplicated_values(obj.index)))

        if not obj.columns.is_unique:
            return _fail(CERO._msg_col_nunique
                         + (" Duplicated values are: %s." % CERO._duplicated_values(obj.columns)))

        # NOTE(review): the original author attributes the float32 requirement
        # to ``df.to_pickle()`` downsizing float64 to float32 - not verified here.
        if not all(issubclass(dtype.type, np.float32) for dtype in obj.dtypes):
            return _fail(CERO._msg_val_type)

        invalid_idents = [ident for ident in obj.index.tolist() if not _Identifier.is_valid(ident)]
        if invalid_idents:
            return _fail(CERO._msg_invalid_ids + (" Invalid index values are: %s." % invalid_idents))

        if not empty_ok and obj.isnull().all().all():
            return _fail(CERO._msg_empty_cero, CERO.EmptyCERO)

        return True

    @staticmethod
    def create_cero_index(values: 'List[str, tuple]'):
        """Creates pandas.Index object that adheres to CERO constraints.

        Each item of ``values`` is passed through ``_Identifier.tupleize_name``
        and the results are stored in a flat ``pandas.Index``
        (``tupleize_cols=False`` stops pandas building a ``MultiIndex`` from
        tuple values).
        """
        identifiers = [_Identifier.tupleize_name(value) for value in values]
        return pd.Index(identifiers, tupleize_cols=False)

    @staticmethod
    def read_xlsx(xlsx_file, *args, **kwargs):
        """Reads CEROs that have been exported to xlsx files.

        The header row is read first; every column to the left of the first
        datetime-typed header cell is treated as part of the identifier.

        :arg (str) xlsx_file: Name of xlsx file that CERO resides in.
        :arg args: Accepted for interface compatibility; not used.
        :arg kwargs: Accepted for interface compatibility; not used.
        :return pandas.DataFrame: The imported CERO.
        :raises ValueError: If no header cell is a datetime.
        :raises CERO.InvalidCERO: If the imported object is not a valid CERO.
        """
        # ``pandas.datetime`` no longer exists; use the standard library class.
        from datetime import datetime

        xlsx_file = os.path.abspath(xlsx_file)
        header = pd.read_excel(xlsx_file, nrows=0)  # Read header only

        # Identify where the index columns end (position of first datetime header).
        n_index_cols = next(
            (pos for pos, col in enumerate(header.columns) if isinstance(col, datetime)),
            None)
        if n_index_cols is None:
            raise ValueError("No datetime column header found in '%s'." % xlsx_file)

        cero = pd.read_excel(xlsx_file, index_col=list(range(n_index_cols)))
        cero = cero.astype(np.float32)
        cero.index = CERO.create_cero_index(cero.index.tolist())
        # Called directly (not inside ``assert``) so the check survives ``python -O``.
        CERO.is_cero(cero)
        return cero

    @staticmethod
    def read_csv(csv_file):
        """
        Reads CEROs that have been exported to csv file. It is assumed that ';' are used to separate the fields
        (if more than one) of the identifier. Column headers are parsed as four-digit years.

        :param str csv_file: Path to the file containing the CERO.
        :return pandas.DataFrame: The imported CERO.
        :raises CERO.InvalidCERO: If the imported object is not a valid CERO.
        """
        cero = pd.read_csv(csv_file, header=0, index_col=0)
        cero = cero.astype(np.float32)
        cero.index = CERO.create_cero_index(_Identifier.get_all_idents(cero.index.tolist(), sep=";"))
        cero.columns = pd.to_datetime(cero.columns.tolist(), format="%Y")
        # Called directly (not inside ``assert``) so the check survives ``python -O``.
        CERO.is_cero(cero)
        return cero

    @staticmethod
    def combine_ceros(ceros: list, overwrite=True, verify_cero=True) -> pd.DataFrame:
        """Combine multiple CEROs (provided as a ``list``) into a common CERO.

        If ``overwrite`` is True, a CERO that is later in ``ceros`` (i.e. has a higher index) will
        overwrite the merger of all preceding CEROs. If ``overwrite`` is False and duplicate indices are
        detected, an ``CERO.CEROIndexConflict`` exception will be raised.

        ``overwrite`` may also be a sequence of booleans, one per CERO (the first value is ignored,
        because nothing precedes the first CERO) or one per merge step (``len(ceros) - 1`` values).
        The sequence is never modified.

        If ``verify_cero`` is ``True``, then a check is performed before and after combination to ensure
        that only CEROs are combined with other CEROs, to form a CERO. By disabling this,
        ``combine_ceros`` can be applied to ``pandas.DataFrames`` as well.

        :raises TypeError: If ``ceros`` is not a list.
        :raises IndexError: If ``ceros`` is empty.
        :raises ValueError: If ``overwrite`` is a sequence whose length matches neither \
        ``len(ceros)`` nor ``len(ceros) - 1``.
        :raises CERO.InvalidCERO: If ``verify_cero`` is true and an input (or the result) is not a CERO.
        :raises CERO.CEROIndexConflict: If identifiers clash where overwriting is not allowed.
        """
        if not isinstance(ceros, list):
            raise TypeError("'ceros' must be provided as a list of CEROs, not '%s'." % type(ceros))
        if not ceros:
            raise IndexError("'ceros' is an empty list - there is nothing to combine.")

        if verify_cero:
            for i, cero in enumerate(ceros):
                try:
                    CERO.is_cero(cero)
                except CERO.InvalidCERO as e:
                    # Chain the original error so the failed check stays visible.
                    raise CERO.InvalidCERO(
                        "The '%d'th CERO in the list (zero-indexed) is invalid." % i) from e

        # Normalise ``overwrite`` to one flag per merge step, working on a
        # copy so the caller's sequence is left untouched.
        n_merges = len(ceros) - 1
        if isinstance(overwrite, bool):
            merge_flags = [overwrite] * n_merges
        else:
            merge_flags = list(overwrite)
            if len(merge_flags) == len(ceros):
                # The flag for the first CERO is ineffectual - drop it.
                merge_flags = merge_flags[1:]
            elif len(merge_flags) != n_merges:
                # Without this check ``zip`` below would silently skip CEROs.
                raise ValueError(
                    "'overwrite' has %d values, but %d or %d are required for %d CEROs."
                    % (len(merge_flags), len(ceros), n_merges, len(ceros)))

        combined = ceros[0]
        for next_cero, allow_overwrite in zip(ceros[1:], merge_flags):
            if not allow_overwrite:
                # Check the intersection of indices
                clash = combined.index.intersection(next_cero.index)
                if not clash.empty:
                    raise CERO.CEROIndexConflict(("Attempted to combine CEROs with duplicate indices "
                                                  + "(and 'overwrite' is not allowed). The duplicated indices are: %s." % clash.values))
            # Values of the later CERO take precedence over the merger so far.
            combined = next_cero.combine_first(combined)

        # TODO: combine_first can output array of different dtypes than the inputs. The circumstances
        # in which this occurs have not been isolated, so the dtype is restored here.
        combined = combined.astype(np.float32, copy=False)

        if verify_cero:
            CERO.is_cero(combined)  # check there have been no errors
        return combined

    @staticmethod
    def rename_index_values(cero: pd.DataFrame, map_dict: dict, inplace: bool=True):
        """Rename identifiers of ``cero`` according to ``map_dict``.

        :param cero: The CERO object to rename the index values of. The order of the CERO.index imposes
            order on the mapping operation - that is, the CERO retains its original order.
        :param map_dict: A `dict` of (old name, new name) are (key, value) pairs. Index values that are
            not keys of ``map_dict`` keep their name. Every resulting value is passed through
            ``_Identifier.tupleize_name``.
        :param inplace: If `True` (the default) ``cero`` itself is modified and `None` is returned.
            Otherwise a shallow copy is modified and returned.
        :return: `None` if ``inplace``, otherwise the renamed (shallow) copy.
        :raises TypeError: If ``map_dict`` is not a ``dict`` (or subclass).
        """
        if not isinstance(map_dict, dict):
            raise TypeError("The mapping dictionary must be a subclass of dict.")

        def _rename(name):
            # Adapted from pandas.core.generic.rename()
            return _Identifier.tupleize_name(map_dict.get(name, name))

        target = cero if inplace else cero.copy(deep=False)
        target.index = CERO._transform_index(target.index, _rename, tupleize_cols=False)
        return None if inplace else target

    @staticmethod
    def _transform_index(index, func, level=None, tupleize_cols=False):
        """
        Apply function to all values found in index.

        This includes transforming multiindex entries separately.
        Only apply function to one level of the MultiIndex if level is specified.

        :param index: The ``pandas.Index`` (or ``pandas.MultiIndex``) to transform.
        :param func: Callable applied to each value (or each level value of a ``MultiIndex``).
        :param level: For a ``MultiIndex``, the position of the only level to transform.
        :param tupleize_cols: Forwarded to ``pandas.Index``; `False` keeps tuple values in a flat index.
        :return: A new index of transformed values with the original name(s).
        """
        # Copied from pandas.core.internals._transform_index() with minor modification in response
        # to pandas bug #19497
        if isinstance(index, pd.MultiIndex):
            if level is not None:
                items = [tuple(func(y) if i == level else y for i, y in enumerate(x)) for x in index]
            else:
                items = [tuple(func(y) for y in x) for x in index]
            return pd.MultiIndex.from_tuples(items, names=index.names)

        items = [func(x) for x in index]
        return pd.Index(items, name=index.name, tupleize_cols=tupleize_cols)