readabs

Package to download timeseries data from the Australian Bureau of Statistics (ABS) and the Reserve Bank of Australia (RBA).

"""Package to download timeseries data from
the Australian Bureau of Statistics (ABS)
and the Reserve Bank of Australia (RBA)."""

# --- imports
import importlib.metadata

# --- local imports
# - ABS related -
from readabs.abs_catalogue import abs_catalogue
from readabs.print_abs_catalogue import print_abs_catalogue
from readabs.search_abs_meta import search_abs_meta, find_abs_id
from readabs.read_abs_cat import read_abs_cat
from readabs.read_abs_series import read_abs_series
from readabs.read_abs_by_desc import read_abs_by_desc
from readabs.grab_abs_url import grab_abs_url
from readabs.abs_meta_data import metacol

# - RBA related -
from readabs.rba_catalogue import print_rba_catalogue, rba_catalogue
from readabs.read_rba_table import read_rba_table, read_rba_ocr
from readabs.rba_meta_data import rba_metacol

# - Utilities -
from readabs.datatype import Datatype
from readabs.recalibrate import recalibrate, recalibrate_value
from readabs.utilities import (
    percent_change,
    annualise_rates,
    annualise_percentages,
    qtly_to_monthly,
    monthly_to_qtly,
)


# --- version and author
try:
    # prefer the version recorded in the installed package metadata
    __version__ = importlib.metadata.version(__name__)
except importlib.metadata.PackageNotFoundError:
    __version__ = "0.0.0"  # Fallback for development mode
__author__ = "Bryan Palmer"


# --- exposed functions and classes
__all__ = (
    # -- abs -- related
    "metacol",
    "read_abs_cat",
    "read_abs_series",
    "read_abs_by_desc",
    "search_abs_meta",
    "find_abs_id",
    "grab_abs_url",
    "print_abs_catalogue",
    "abs_catalogue",
    # -- rba -- related
    "print_rba_catalogue",
    "rba_catalogue",
    "read_rba_table",
    "rba_metacol",
    "read_rba_ocr",
    # -- utilities --
    "Datatype",
    "percent_change",
    "annualise_rates",
    "annualise_percentages",
    "qtly_to_monthly",
    "monthly_to_qtly",
    "recalibrate",
    "recalibrate_value",
)
__pdoc__ = {
    "download_cache": False,
    "get_abs_links": False,
    "read_support": False,
    "grab_abs_url": False,
}  # hide submodules from documentation
metacol = Metacol(did='Data Item Description', stype='Series Type', id='Series ID', start='Series Start', end='Series End', num='No. Obs.', unit='Unit', dtype='Data Type', freq='Freq.', cmonth='Collection Month', table='Table', tdesc='Table Description', cat='Catalogue number')
@cache
def read_abs_cat( cat: str, keep_non_ts: bool = False, **kwargs: Any) -> tuple[dict[str, pandas.core.frame.DataFrame], pandas.core.frame.DataFrame]:
@cache  # minimise slowness for any repeat business
def read_abs_cat(
    cat: str,
    keep_non_ts: bool = False,
    **kwargs: Any,
) -> tuple[dict[str, DataFrame], DataFrame]:
    """This function returns the complete ABS Catalogue information as a
    python dictionary of pandas DataFrames, as well as the associated metadata
    in a separate DataFrame. The function automates the collection of zip and
    excel files from the ABS website. If necessary, these files are downloaded,
    and saved into a cache directory. The files are then parsed to extract time
    series data, and the associated metadata.

    By default, the cache directory is `./.readabs_cache/`. You can change the
    default directory name by setting the shell environment variable
    `READABS_CACHE_DIR` with the name of the preferred directory.

    Parameters
    ----------

    cat : str
        The ABS Catalogue Number for the data to be downloaded and made
        available by this function. This argument must be specified in the
        function call.

    keep_non_ts : bool = False
        A flag for whether to keep the non-time-series tables
        that might form part of an ABS catalogue item. Normally, the
        non-time-series information is ignored, and not made available to
        the user.

    **kwargs : Any
        The following parameters may be passed as optional keyword arguments.

    history : str = ""
        Provide a month-year string to extract historical ABS data.
        For example, you can set history="dec-2023" to get the ABS data
        for a catalogue identifier that was originally published in respect
        of Q4 of 2023. Note: not all ABS data sources are structured so that
        this technique works in every case; but most are.

    verbose : bool = False
        Setting this to true may help diagnose why something
        might be going wrong with the data retrieval process.

    ignore_errors : bool = False
        Normally, this function will cease downloading when
        an error is encountered. However, sometimes the ABS website has
        malformed links, and changing this setting may be necessary. (Note:
        if you drop a message to the ABS, they will usually fix broken
        links within a business day).

    get_zip : bool = True
        Download the excel files in .zip files.

    get_excel_if_no_zip : bool = True
        Only try to download .xlsx files if there are no zip
        files available to be downloaded. Only downloading individual excel
        files when there are no zip files to download can speed up the
        download process.

    get_excel : bool = False
        The default value means that excel files are not
        automatically downloaded. Note: at least one of `get_zip`,
        `get_excel_if_no_zip`, or `get_excel` must be true. For most ABS
        catalogue items, it is sufficient to just download the one zip
        file. But note, some catalogue items do not have a zip file.
        Others have quite a number of zip files.

    single_excel_only : str = ""
        If this argument is set to a table name (without the
        .xlsx extension), only that excel file will be downloaded. If
        set, and only a limited subset of available data is needed,
        this can speed up download times significantly. Note: overrides
        `get_zip`, `get_excel_if_no_zip`, `get_excel` and `single_zip_only`.

    single_zip_only : str = ""
        If this argument is set to a zip file name (without
        the .zip extension), only that zip file will be downloaded.
        If set, and only a limited subset of available data is needed,
        this can speed up download times significantly. Note: overrides
        `get_zip`, `get_excel_if_no_zip`, and `get_excel`.

    cache_only : bool = False
        If set to True, this function will only access
        data that has been previously cached. Normally, the function
        checks the date of the cache data against the date of the data
        on the ABS website, before deciding whether the ABS has fresher
        data that needs to be downloaded to the cache.

    Returns
    -------------
    tuple[dict[str, DataFrame], DataFrame]
        The function returns a tuple of two items. The first item is a
        python dictionary of pandas DataFrames (which is the primary data
        associated with the ABS catalogue item). The second item is a
        DataFrame of ABS metadata for the ABS collection.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "6202.0"  # The ABS labour force survey
    data: tuple[dict[str, DataFrame], DataFrame] = ra.read_abs_cat(cat=cat_num)
    abs_dict, meta = data
    ```"""

    # --- get the time series data ---
    # fetch the raw files (cached or fresh), then parse into data + metadata
    raw_abs_dict = grab_abs_url(cat=cat, **kwargs)
    abs_dict, abs_meta = _get_time_series_data(
        cat, raw_abs_dict, keep_non_ts=keep_non_ts, **kwargs
    )

    return abs_dict, abs_meta

This function returns the complete ABS Catalogue information as a python dictionary of pandas DataFrames, as well as the associated metadata in a separate DataFrame. The function automates the collection of zip and excel files from the ABS website. If necessary, these files are downloaded, and saved into a cache directory. The files are then parsed to extract time series data, and the associated metadata.

By default, the cache directory is ./.readabs_cache/. You can change the default directory name by setting the shell environment variable READABS_CACHE_DIR with the name of the preferred directory.

Parameters

cat : str The ABS Catalogue Number for the data to be downloaded and made available by this function. This argument must be specified in the function call.

keep_non_ts : bool = False A flag for whether to keep the non-time-series tables that might form part of an ABS catalogue item. Normally, the non-time-series information is ignored, and not made available to the user.

**kwargs : Any The following parameters may be passed as optional keyword arguments.

history : str = "" Provide a month-year string to extract historical ABS data. For example, you can set history="dec-2023" to get the ABS data for a catalogue identifier that was originally published in respect of Q4 of 2023. Note: not all ABS data sources are structured so that this technique works in every case; but most are.

verbose : bool = False Setting this to true may help diagnose why something might be going wrong with the data retrieval process.

ignore_errors : bool = False Normally, this function will cease downloading when an error is encountered. However, sometimes the ABS website has malformed links, and changing this setting may be necessary. (Note: if you drop a message to the ABS, they will usually fix broken links within a business day).

get_zip : bool = True Download the excel files in .zip files.

get_excel_if_no_zip : bool = True Only try to download .xlsx files if there are no zip files available to be downloaded. Only downloading individual excel files when there are no zip files to download can speed up the download process.

get_excel : bool = False The default value means that excel files are not automatically downloaded. Note: at least one of get_zip, get_excel_if_no_zip, or get_excel must be true. For most ABS catalogue items, it is sufficient to just download the one zip file. But note, some catalogue items do not have a zip file. Others have quite a number of zip files.

single_excel_only : str = "" If this argument is set to a table name (without the .xlsx extension), only that excel file will be downloaded. If set, and only a limited subset of available data is needed, this can speed up download times significantly. Note: overrides get_zip, get_excel_if_no_zip, get_excel and single_zip_only.

single_zip_only : str = "" If this argument is set to a zip file name (without the .zip extension), only that zip file will be downloaded. If set, and only a limited subset of available data is needed, this can speed up download times significantly. Note: overrides get_zip, get_excel_if_no_zip, and get_excel.

cache_only : bool = False If set to True, this function will only access data that has been previously cached. Normally, the function checks the date of the cache data against the date of the data on the ABS website, before deciding whether the ABS has fresher data that needs to be downloaded to the cache.

Returns

tuple[dict[str, DataFrame], DataFrame] The function returns a tuple of two items. The first item is a python dictionary of pandas DataFrames (which is the primary data associated with the ABS catalogue item). The second item is a DataFrame of ABS metadata for the ABS collection.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "6202.0"  # The ABS labour force survey
data: tuple[dict[str, DataFrame], DataFrame] = ra.read_abs_cat(cat=cat_num)
abs_dict, meta = data
def read_abs_series( cat: str, series_id: Union[str, Sequence[str]], **kwargs: Any) -> tuple[pandas.core.frame.DataFrame, pandas.core.frame.DataFrame]:
def read_abs_series(
    cat: str,
    series_id: str | Sequence[str],
    **kwargs: Any,
) -> tuple[DataFrame, DataFrame]:
    """Get specific ABS data series by their ABS catalogue and series identifiers.

    Parameters
    ----------
    cat : str
        The ABS catalogue ID.

    series_id : str | Sequence[str]
        An ABS series ID or a sequence of ABS series IDs.

    **kwargs : Any
        Keyword arguments for the read_abs_series function,
        which are the same as the keyword arguments for the
        read_abs_cat function.

    Returns
    -------
    tuple[DataFrame, DataFrame]
        A tuple of two DataFrames, one for the primary data and one for the metadata.

    Raises
    ------
    ValueError
        If a series ID is not found in the catalogue, or if a series'
        index frequency does not match the frequency of series already
        collected (unless ignore_errors is set).

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "6202.0"  # The ABS labour force survey
    unemployment_rate = "A84423050A"
    seo = "6202001"  # The ABS table name
    data, meta = ra.read_abs_series(
        cat=cat_num, series_id=unemployment_rate, single_excel_only=seo
    )
    ```"""

    # check for unexpected keyword arguments/get defaults
    check_kwargs(kwargs, "read_abs_series")
    args = get_args(kwargs, "read_abs_series")

    # read the ABS category data
    cat_data, cat_meta = read_abs_cat(cat, **args)

    # drop repeated series_ids in the meta data,
    # make unique series_ids the index
    cat_meta.index = Index(cat_meta[metacol.id])
    cat_meta = cat_meta.groupby(cat_meta.index).first()

    # get the ABS series data
    if isinstance(series_id, str):
        series_id = [series_id]
    return_data, return_meta = DataFrame(), DataFrame()
    for identifier in series_id:

        # confirm that the series ID is in the catalogue
        if identifier not in cat_meta.index:
            if args["verbose"]:
                print(f"Series ID {identifier} not found in ABS catalogue ID {cat}")
            if args["ignore_errors"]:
                continue
            raise ValueError(f"Series ID {identifier} not found in catalogue {cat}")

        # confirm that the index of the series is compatible
        table = str(cat_meta.loc[identifier, metacol.table])  # str for mypy
        data_series = cat_data[table][identifier]
        if (
            len(return_data) > 0
            and cast(PeriodIndex, return_data.index).freq
            != cast(PeriodIndex, data_series.index).freq
        ):
            if args["verbose"]:
                print(f"Frequency mismatch for series ID {identifier}")
            if args["ignore_errors"]:
                continue
            raise ValueError(f"Frequency mismatch for series ID {identifier}")

        # add the series data and meta data to the return values
        if len(return_data) > 0:
            # align on the union of indexes so differing spans can coexist
            return_data = return_data.reindex(
                return_data.index.union(data_series.index)
            )
        return_data[identifier] = data_series
        return_meta = concat([return_meta, cat_meta.loc[identifier]], axis=1)

    return return_data, return_meta.T

Get specific ABS data series by their ABS catalogue and series identifiers.

Parameters

cat : str The ABS catalogue ID.

series_id : str | Sequence[str] An ABS series ID or a sequence of ABS series IDs.

**kwargs : Any Keyword arguments for the read_abs_series function, which are the same as the keyword arguments for the read_abs_cat function.

Returns

tuple[DataFrame, DataFrame] A tuple of two DataFrames, one for the primary data and one for the metadata.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "6202.0"  # The ABS labour force survey
unemployment_rate = "A84423050A"
seo = "6202001"  # The ABS table name
data, meta = ra.read_abs_series(
    cat=cat_num, series_id=unemployment_rate, single_excel_only=seo
)
def read_abs_by_desc( wanted: list[str] | dict[str, str] | dict[str, dict[str, typing.Any]], **kwargs: Any) -> tuple[dict[str, pandas.core.series.Series], pandas.core.frame.DataFrame]:
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    - wanted : list of str, dict of str:str, or dict of str:dict - the data
        item descriptions to search for. If a list, it will be a list of
        descriptions to search for. If a dictionary, the keys will be a name.
        The dictionary values can be either a string (the data item
        description to search for) or a dictionary of keyword arguments, one of
        which would be the data item description to search for.
    - kwargs : Any - keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
            read_abs_cat()).
        - abs_meta : DataFrame - the metadata for the ABS data (from
            read_abs_cat()).
        - for the retrieval of data, the "cat" argument must be present.
            The following arguments, if present, will also be used (ie.
            passed to read_abs_cat()): ["ignore_errors", "get_zip",
            "get_excel_if_no_zip", "get_excel", "cache_only",
            "single_excel_only", "single_zip_only", "verbose"].
        - for the selection of data, the following metacol names, if present,
            will be used to construct the selector: "cat", "did"
            "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
            "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the find_abs_id()
            and search_abs_meta() functions: ["validate_unique", "exact_match",
            "regex", "verbose"].

    Notes:
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
        include sufficient keys from the metacol dataclass to get the data.
        Typically, the "cat" key, the "table" key, and the "stype" key would
        be required. The did key would be taken from the wanted list or
        dictionary.
    if wanted is of type dict[str, dict[str, Any]], the inner dictionary
        must contain a "did" key. The other keys that can be used for the
        data retrieval are the same as the metacol dataclass fields, namely:
        "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
        "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
        used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
        type dict[str, dict[str, Any]] and (2) the inner dictionary must
        contain a "cat" key so the data can be retrieved. Other keys that
        can be used for the data retrieval are the same as for read_abs_cat(),
        namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
        "get_excel", "single_excel_only", "single_zip_only", "cache_only"].


    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, where the keys are the series
      descriptions. The series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.

    Raises
    ------
    TypeError
        If a value in the wanted dictionary is neither a string nor a
        dictionary.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```"""

    # - preparation
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()
    if isinstance(wanted, list):
        # normalise: a list of descriptions becomes a {name: description} dict
        wanted = _wlist_to_wdict(wanted)
    abs_dict = kwargs.get("abs_dict", {})
    abs_meta = kwargs.get("abs_meta", pd.DataFrame())
    kwarg_selector = _get_search_terms(kwargs, {})
    search_args = _get_search_args(kwargs, {})

    return_dict = {}
    return_meta = pd.DataFrame()
    for key, value in wanted.items():

        # copies so per-item searches do not pollute each other
        item_selector = kwarg_selector.copy()
        item_search_args = search_args.copy()
        if isinstance(value, str):
            series, meta = _get_item_from_str(
                item=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
            )

        elif isinstance(value, dict):
            series, meta = _get_item_from_dict(
                item_dict=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
                **kwargs,
            )
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string "
                + "or a dictionary."
            )

        # save search results
        return_dict[key] = series
        return_meta = pd.concat([return_meta, meta])

    return return_dict, return_meta

Get specific ABS data series by searching the ABS meta data.

Parameters

  • wanted : list of str, dict of str:str, or dict of str:dict - the data item descriptions to search for. If a list, it will be a list of descriptions to search for. If a dictionary, the keys will be a name. The dictionary values can be either a string (the data item description to search for) or a dictionary of keyword arguments, one of which would be the data item description to search for.
  • kwargs : Any - keyword arguments to control the data retrieval. The keyword arguments can include the following:
    • abs_dict : dict - the dictionary of ABS data to search (from read_abs_cat()).
    • abs_meta : DataFrame - the metadata for the ABS data (from read_abs_cat()).
    • for the retrieval of data, the "cat" argument must be present. The following arguments, if present, will also be used (ie. passed to read_abs_cat()): ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "cache_only", "single_excel_only", "single_zip_only", "verbose"].
    • for the selection of data, the following metacol names, if present, will be used to construct the selector: "cat", "did" "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
    • finally, the following arguments will be passed to the find_abs_id() and search_abs_meta() functions: ["validate_unique", "exact_match", "regex", "verbose"].

Notes:

  • if "wanted" is of type list[str] or dict[str, str], the kwargs should include sufficient keys from the metacol dataclass to get the data. Typically, the "cat" key, the "table" key, and the "stype" key would be required. The did key would be taken from the wanted list or dictionary. if wanted is of type dict[str, dict[str, Any]], the inner dictionary must contain a "did" key. The other keys that can be used for the data retrieval are the same as the metacol dataclass fields, namely: "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
  • if abs_dict and abs_meta are provided within the kwargs, they will be used to locate and extract the selected data.
  • if abs_dict and abs_meta are not provided, then, (1) wanted must be of type dict[str, dict[str, Any]] and (2) the inner dictionary must contain a "cat" key so the data can be retrieved. Other keys that can be used for the data retrieval are the same as for read_abs_cat(), namely ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "single_excel_only", "single_zip_only", "cache_only"].

Returns

Returns a tuple of two items:

  • A dictionary of pandas Series objects, where the keys are the series descriptions. The series.name attribute will be the ABS series-id.
  • A pandas DataFrame containing the metadata for the series.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "5206.0"  # The ABS National Accounts
data, meta = ra.read_abs_cat(cat=cat_num)
wanted = ["Gross domestic product: Chain volume measures ;",]
selected, selected_meta = ra.read_abs_by_desc(
    wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
)
def search_abs_meta( meta: pandas.core.frame.DataFrame, search_terms: dict[str, str], exact_match: bool = False, regex: bool = False, validate_unique=False, **kwargs: Any) -> pandas.core.frame.DataFrame:
 13def search_abs_meta(
 14    meta: DataFrame,  # sourced from read_abs_series() or read_abs_cat()
 15    search_terms: dict[str, str],  # {search_term: meta_data_column_name, ...}
 16    exact_match: bool = False,
 17    regex: bool = False,
 18    validate_unique=False,  # useful safety-net if you expect only one match
 19    **kwargs: Any,
 20) -> DataFrame:
 21    """Extract from the ABS meta data those rows that match the
 22    search_terms, by iteratively searching the meta data one
 23    search_term at a time.
 24
 25    Parameters
 26    ----------
 27    meta : DataFrame
 28        A pandas DataFrame of metadata from the ABS
 29        (via read_abs_cat() or read_abs_series()).
 30    search_terms : dict[str, str]
 31        A dictionary {search_phrase: meta_column_name, ...} of search terms.
 32        Note: the search terms must be unique, as a dictionary cannot hold the
 33        same search term to be applied to different columns.
 34    exact_match : bool = False
 35        Whether to match using == (exact) or .str.contains() (inexact).
 36    regex : bool = False
 37        Whether to use regular expressions in the search.
 38    validate_unique : bool = False
 39        Raise a ValueError if the search result is not unique.
 40    **kwargs : Any
 41        Additional keyword arguments. The only keyword argument
 42        that is used is verbose.
 43    verbose : bool = False
 44        Print additional information while searching; which can
 45        be useful when diagnosing problems with search terms.
 46
 47    Returns
 48    -------
 49    DataFrame
 50        Returns a pandas DataFrame of matching rows (subseted from meta).
 51        Note, The index for the returned meta data will always comprise ABS
 52        series_ids. Duplicate indexes will be removed from the meta data
 53        (ie. where the same ABS series appears in more than one table, this
 54        function will only report the first match).
 55
 56    Metacol
 57    -------
 58    Because the meta data is a DataFrame, the columns can be referenced by either
 59    their full textual name, or by the short name defined in the metacol object.
 60    For example, if metacol is imported as mc, to refer to the
 61    `Data Item Description` column, the user can refer to it as mc.did.
 62
 63    Example
 64    -------
 65    ```python
 66    from readabs import metacol as mc  # alias for the ABS meta data column names
 67    from readabs import read_abs_cat, search_abs_meta
 68    cat_num = "6202.0"  # The ABS labour force survey
 69    data, meta = read_abs_cat(cat_num)
 70    search_terms = {
 71        "Unemployment rate": mc.did,  # the data item description
 72        "Persons": mc.did,
 73        "Seasonally Adjusted": mc.stype,
 74        "Percent": mc.unit,
 75        "6202001": mc.table,
 76    }
 77    rows = search_abs_meta(meta, search_terms, verbose=True)
 78    print(rows)  # should have three rows : FT/PT/All Unemployment rates
 79    ```"""
 80
 81    # get the verbose-flag from kwargs
 82    verbose = kwargs.get("verbose", False)
 83
 84    # establish the starting point
 85    meta_select = meta.copy()  # preserve the original meta data
 86    if verbose:
 87        print(f"In search_abs_meta() {exact_match=} {regex=} {verbose=}")
 88        print(
 89            f"In search_abs_meta() starting with {len(meta_select)} rows in the meta_data."
 90        )
 91
 92    # iteratively search
 93    for phrase, column in search_terms.items():
 94        if verbose:
 95            print(f"Searching {len(meta_select)}: term: {phrase} in-column: {column}")
 96
 97        pick_me = (
 98            (meta_select[column] == phrase)
 99            if (exact_match or column == mc.table)
100            else meta_select[column].str.contains(phrase, regex=regex)
101        )
102        meta_select = meta_select[pick_me]
103        if verbose:
104            print(f"In find_rows() have found {len(meta_select)}")
105
106    # search complete - check results - and return
107    meta_select.index = Index(meta_select[mc.id])
108    meta_select = meta_select[~meta_select.index.duplicated(keep="first")]
109
110    if verbose:
111        print(f"Final selection is {len(meta_select)} rows.")
112
113    elif len(meta_select) == 0:
114        print("Nothing selected?")
115
116    if validate_unique and len(meta_select) != 1:
117        raise ValueError("The selected meta data should only contain one row.")
118
119    return meta_select

Extract from the ABS meta data those rows that match the search_terms, by iteratively searching the meta data one search_term at a time.

Parameters

meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. exact_match : bool = False Whether to match using == (exact) or .str.contains() (inexact). regex : bool = False Whether to use regular expressions in the search. validate_unique : bool = False Raise a ValueError if the search result is not unique. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is verbose. verbose : bool = False Print additional information while searching; which can be useful when diagnosing problems with search terms.

Returns

DataFrame Returns a pandas DataFrame of matching rows (subsetted from meta). Note, The index for the returned meta data will always comprise ABS series_ids. Duplicate indexes will be removed from the meta data (ie. where the same ABS series appears in more than one table, this function will only report the first match).

Metacol

Because the meta data is a DataFrame, the columns can be referenced by either their full textual name, or by the short name defined in the metacol object. For example, if metacol is imported as mc, to refer to the Data Item Description column, the user can refer to it as mc.did.

Example

from readabs import metacol as mc  # alias for the ABS meta data column names
from readabs import read_abs_cat, search_abs_meta
cat_num = "6202.0"  # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
    "Unemployment rate": mc.did,  # the data item description
    "Persons": mc.did,
    "Seasonally Adjusted": mc.stype,
    "Percent": mc.unit,
    "6202001": mc.table,
}
rows = search_abs_meta(meta, search_terms, verbose=True)
print(rows)  # should have three rows : FT/PT/All Unemployment rates
def find_abs_id(
    meta: DataFrame,
    search_terms: dict[str, str],
    **kwargs: Any,
) -> tuple[str, str, str]:
    """Find a unique ABS series identifier in the ABS metadata.

    Parameters
    ----------
    meta : DataFrame
        A pandas DataFrame of metadata from the ABS
        (via read_abs_cat() or read_abs_series()).
    search_terms : dict[str, str]
        A dictionary {search_phrase: meta_column_name, ...} of search terms.
        Note: the search terms must be unique, as a dictionary cannot hold the
        same search term to be applied to different columns.
    **kwargs : Any
        Additional keyword arguments. The only additional keyword argument
        that is used is validate_unique.
    validate_unique : bool = True
        Raise a ValueError if the search result is not a single
        unique match. Note: the default is True for safety.

    Returns
    -------
    tuple[str, str, str]
        A tuple of (table, series_id, units) for the unique
        series_id that matches the search terms.

    Metacol
    -------
    Because the meta data is a DataFrame, the columns can be referenced by either
    their full textual name, or by the short name defined in the metacol object.
    For example, if metacol is imported as mc, to refer to the
    `Data Item Description` column, the user can refer to it as mc.did.

    Example
    -------
    ```python
    from readabs import metacol as mc  # alias for the ABS meta data column names
    from readabs import read_abs_cat, find_abs_id, recalibrate
    cat_num = "6202.0"  # The ABS labour force survey
    data, meta = read_abs_cat(cat_num)
    search_terms = {
        "Employed total ;  Persons ;": mc.did,
        "Seasonally Adjusted": mc.stype,
        "6202001": mc.table,
    }
    table, series_id, units = find_abs_id(meta, search_terms)
    print(f"Table: {table} Series ID: {series_id} Units: {units}")
    recal_series, recal_units = recalibrate(data[table][series_id], units)
    ```"""

    # default to a uniqueness check unless the caller opts out
    must_be_unique = kwargs.pop("validate_unique", True)
    matches = search_abs_meta(
        meta, search_terms, validate_unique=must_be_unique, **kwargs
    )
    first_row = matches.iloc[0]  # the (validated) first match
    return first_row[mc.table], first_row[mc.id], first_row[mc.unit]

Find a unique ABS series identifier in the ABS metadata.

Parameters

meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. **kwargs : Any Additional keyword arguments. The only additional keyword argument that is used is validate_unique. validate_unique : bool = True Raise a ValueError if the search result is not a single unique match. Note: the default is True for safety.

Returns

tuple[str, str, str] A tuple of the table, series_id and units for the unique series_id that matches the search terms.

Metacol

Because the meta data is a DataFrame, the columns can be referenced by either their full textual name, or by the short name defined in the metacol object. For example, if metacol is imported as mc, to refer to the Data Item Description column, the user can refer to it as mc.did.

Example

from readabs import metacol as mc  # alias for the ABS meta data column names
from readabs import read_abs_cat, find_abs_id, recalibrate
cat_num = "6202.0"  # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
    "Employed total ;  Persons ;": mc.did,
    "Seasonally Adjusted": mc.stype,
    "6202001": mc.table,
}
table, series_id, units = find_abs_id(meta, search_terms)
print(f"Table: {table} Series ID: {series_id} Units: {units}")
recal_series, recal_units = recalibrate(data[table][series_id], units)
@cache  # repeated calls with the same arguments are answered from memory
def grab_abs_url(
    url: str = "",
    **kwargs: Any,
) -> dict[str, DataFrame]:
    """For a given URL, extract the data from the Excel and ZIP file
    links found on that page. The data is returned as a dictionary of
    DataFrames. The Excel files are converted into DataFrames, with
    each sheet in each Excel file becoming a separate DataFrame. ZIP
    files are examined for Excel files, which are similarly converted into
    DataFrames. The dictionary of DataFrames is returned.

    The preferred mechanism for reading ABS data is to use the `read_abs_cat()`
    or `read_abs_series()` functions. This function is provided for those
    cases where the data is not available in the ABS catalogue, where the
    data is not a timeseries, or where the user wants to extract data from
    a specific ABS landing page.

    Parameters
    ----------
    url : str = ""
        A URL for an ABS Catalogue landing page. Either a url or
        a catalogue number must be provided. If both are provided, the
        URL will be used.
    **kwargs : Any
        Accepts the same keyword arguments as `read_abs_cat()`. Additionally,
        a cat argument can be provided, which will be used to get the URL
        (see below).
    cat : str = ""
        An ABS Catalogue number. If provided, and the URL is not
        provided, then the Catalogue number will be used to get the URL.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames."""

    # resolve and validate the keyword arguments
    url = _get_url(url, kwargs)  # note: removes "cat" from kwargs
    check_kwargs(kwargs, "grab_abs_url")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_url")  # get the valid kwargs
    verbose = args["verbose"]
    if verbose:
        print(f"grab_abs_url(): {url=}, {args=}")

    # locate the links to the relevant ABS data files on that webpage
    links = get_abs_links(url, **args)
    if not links:
        print(f"No data files found at URL: {url}")
        return {}  # nothing to do

    frames: dict[str, DataFrame] = {}

    # "single file only" short-circuits: fetch just the nominated file.
    # Excel is checked first, then zip, matching the argument precedence.
    for selector, suffix, adder in (
        ("single_excel_only", ".xlsx", _add_excel),
        ("single_zip_only", ".zip", _add_zip),
    ):
        wanted = args[selector]
        if wanted:
            link = _find_url(links, suffix, wanted, verbose)
            if link:
                return adder(frames, link, **args)

    # otherwise, harvest everything: all zips first, then any Excel files
    zip_links = links.get(".zip", [])
    if args["get_zip"]:
        for link in zip_links:
            frames = _add_zip(frames, link, **args)

    take_excel = (
        args["get_excel"]
        or (args["get_excel_if_no_zip"] and not args["get_zip"])
        or (args["get_excel_if_no_zip"] and not zip_links)
    )
    if take_excel:
        for link in links.get(".xlsx", []):
            frames = _add_excel(frames, link, **args)

    return frames

For a given URL, extract the data from the Excel and ZIP file links found on that page. The data is returned as a dictionary of DataFrames. The Excel files are converted into DataFrames, with each sheet in each Excel file becoming a separate DataFrame. ZIP files are examined for Excel files, which are similarly converted into DataFrames. The dictionary of DataFrames is returned.

The preferred mechanism for reading ABS data is to use the read_abs_cat() or read_abs_series() functions. This function is provided for those cases where the data is not available in the ABS catalogue, where the data is not a timeseries, or where the user wants to extract data from a specific ABS landing page.

Parameters

url : str = "" A URL for an ABS Catalogue landing page. Either a url or a catalogue number must be provided. If both are provided, the URL will be used.

**kwargs : Any Accepts the same keyword arguments as read_abs_cat(). Additionally, a cat argument can be provided, which will be used to get the URL (see below).

cat : str = "" An ABS Catalogue number. If provided, and the URL is not provided, then the Catalogue number will be used to get the URL.

Returns

dict[str, DataFrame] A dictionary of DataFrames.

@cache
def abs_catalogue(cache_only=False, verbose=False) -> DataFrame:
    """Return a DataFrame of ABS Catalogue numbers. In the first instance,
    this is downloaded from the ABS website, and cached for future use.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        A DataFrame of ABS Catalogue numbers.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.abs_catalogue()
    ```"""

    # fetch the ABS directory page and parse the second table on it
    url = "https://www.abs.gov.au/about/data-services/help/abs-time-series-directory"
    page_bytes = get_file(url, cache_only=cache_only, verbose=verbose)
    directory = read_html(
        StringIO(page_bytes.decode("utf-8")), extract_links="body"
    )[1]  # second table on the page

    # pull out the catalogue numbers and topic URLs
    cats = directory["Catalogue Number"].apply(Series)[0]
    urls = directory["Topic"].apply(Series)[1]

    # derive Theme / Parent Topic / Topic from the URL path
    root = "https://www.abs.gov.au/statistics/"
    snip = urls.str.replace(root, "")
    snip = (
        snip[~snip.str.contains("http")].str.replace("-", " ").str.title()
    )  # remove bad cases
    frame = snip.str.split("/", expand=True).iloc[:, :3]
    frame.columns = Index(["Theme", "Parent Topic", "Topic"])
    frame["URL"] = urls

    # flag ceased series and index the frame by catalogue identifier
    cats = cats[frame.index]
    cat_index = cats.str.replace("(Ceased)", "").str.strip()
    status = Series(" ", index=cats.index).where(cat_index == cats, "Ceased")
    frame["Status"] = status
    frame.index = Index(cat_index)
    frame.index.name = "Catalogue ID"
    return frame

Return a DataFrame of ABS Catalogue numbers. In the first instance, this is downloaded from the ABS website, and cached for future use.

Parameters

cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.

Returns

DataFrame A DataFrame of ABS Catalogue numbers.

Example

import readabs as ra
catalogue = ra.abs_catalogue()
@cache
def rba_catalogue(cache_only=False, verbose=False) -> DataFrame:
    """Return a DataFrame of RBA Catalogue numbers. In the first instance,
    this is downloaded from the RBA website, and cached for future use.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        A DataFrame of RBA Catalogue numbers.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.rba_catalogue()
    ```"""

    # the catalogue is simply the table of links scraped from the RBA site
    catalogue = _get_rba_links(cache_only=cache_only, verbose=verbose)
    return catalogue

Return a DataFrame of RBA Catalogue numbers. In the first instance, this is downloaded from the RBA website, and cached for future use.

Parameters

cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.

Returns

DataFrame A DataFrame of RBA Catalogue numbers.

Example

import readabs as ra
catalogue = ra.rba_catalogue()
def read_rba_table(table: str, **kwargs: Any) -> tuple[DataFrame, DataFrame]:
    """Read a table from the RBA website and return the actual data
    and the meta data in a tuple of two DataFrames.

    Parameters
    ----------
    table : str
        The table to read from the RBA website.
    **kwargs : Any
        Additional keyword arguments.
        The only keyword argument that is used is ignore_errors.
    ignore_errors : bool = False
        If True, then any major errors encountered will be printed and the function
        will return empty DataFrames. If False, then any major errors encountered
        will raise an exception.

    Returns
    -------
    tuple[DataFrame, DataFrame]
        The primary data and the meta data in a tuple of two DataFrames.

    Examples
    --------
    ```python
    data, meta = read_rba_table("C1")
    ```"""

    # set-up
    ignore_errors = kwargs.get("ignore_errors", False)
    data, meta = DataFrame(), DataFrame()

    # fetch the Excel file from the RBA website
    excel = _get_excel_file(table, ignore_errors, **kwargs)
    if excel is None:
        return data, meta

    # parse the raw Excel bytes into a DataFrame
    try:
        raw = read_excel(BytesIO(excel), header=None, index_col=None)
    except Exception as e:  # deliberately broad - governed by ignore_errors
        if ignore_errors:
            print(f"Ignoring error: {e}")
            return data, meta
        raise

    # --- meta data: rows 1-10, transposed so series run down the index
    meta = raw.iloc[1:11, :].T.copy()
    meta.columns = Index(meta.iloc[0])
    meta = meta.rename(columns={"Mnemonic": rm.id})  # historical data is inconsistent
    meta = meta.iloc[1:, :]
    meta.index = Index(meta[rm.id])
    meta[rm.table] = table
    meta[rm.tdesc] = raw.iloc[0, 0]
    meta = meta.dropna(how="all", axis=1)  # drop columns with all NaNs

    # --- data: everything from row 10 down, first column is the date
    data = raw.iloc[10:, :].copy()
    data.columns = Index(data.iloc[0])
    data = data.iloc[1:, :]
    data.index = DatetimeIndex(data.iloc[:, 0])
    data = data.iloc[:, 1:]
    data = data.dropna(how="all", axis=1)  # drop columns with all NaNs

    # infer a PeriodIndex frequency from the gaps between observations;
    # fall back to daily if the gaps are irregular
    gaps = data.index.to_series().diff(1).dropna().dt.days
    freq = "D"
    for low, high, code in ((28, 31, "M"), (90, 92, "Q"), (365, 366, "Y")):
        if gaps.min() >= low and gaps.max() <= high:
            freq = code
            break
    data.index = PeriodIndex(data.index, freq=freq)

    return data, meta

Read a table from the RBA website and return the actual data and the meta data in a tuple of two DataFrames.

Parameters

table : str The table to read from the RBA website. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return empty DataFrames. If False, then any major errors encountered will raise an exception.

Returns

tuple[DataFrame, DataFrame] The primary data and the meta data in a tuple of two DataFrames.

Examples

data, meta = read_rba_table("C1")
rba_metacol = _RbaMetacol(title='Title', desc='Description', freq='Frequency', type='Type', unit='Units', src='Source', pub='Publication date', id='Series ID', table='Table', tdesc='Table Description')
def read_rba_ocr(monthly: bool = True, **kwargs: Any) -> Series:
    """Read the Official Cash Rate (OCR) from the RBA website and return it
    in a pandas Series, with either a daily or monthly PeriodIndex,
    depending on the value of the monthly parameter. The default is monthly.

    Parameters
    ----------
    monthly : bool = True
        If True, then the data will be returned with a monthly PeriodIndex.
        If False, then the data will be returned with a daily PeriodIndex.
    **kwargs : Any
        Additional keyword arguments. The only keyword argument that is used is ignore_errors.
    ignore_errors : bool = False
        If True, then any major errors encountered will be printed and the function
        will return an empty Series. If False, then any major errors encountered
        will raise an exception.

    Returns
    -------
    Series
        The OCR data in a pandas Series, with an index of either daily or monthly Periods.

    Examples
    --------
    ```python
    ocr = read_rba_ocr(monthly=True)
    ```"""

    # table A2 holds the cash rate; it should arrive with a daily PeriodIndex
    frame, _frame_meta = read_rba_table("A2", **kwargs)
    ocr = (
        frame.loc[frame.index >= "1990-08-02", "ARBAMPCNCRT"]
        .astype(float)
        .sort_index()
    )
    ocr.name = "RBA Official Cash Rate"

    # extend the last observation to the current period
    current = Period(Timestamp.today(), freq=cast(PeriodIndex, ocr.index).freqstr)
    if ocr.index[-1] < current:
        ocr[current] = ocr.iloc[-1]

    if monthly:
        # collapse to months (keeping the last value within each month),
        # then forward-fill any missing months
        ocr.index = PeriodIndex(ocr.index, freq="M")
        ocr = ocr[~ocr.index.duplicated(keep="last")]
        months = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="M")
        return ocr.reindex(months, method="ffill")

    # daily output: fill in any missing days by carrying the rate forward
    days = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="D")
    ocr = ocr.reindex(days).ffill()
    return ocr

Read the Official Cash Rate (OCR) from the RBA website and return it in a pandas Series, with either a daily or monthly PeriodIndex, depending on the value of the monthly parameter. The default is monthly.

Parameters

monthly : bool = True If True, then the data will be returned with a monthly PeriodIndex. If False, then the data will be returned with a daily PeriodIndex. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return an empty Series. If False, then any major errors encountered will raise an exception.

Returns

Series The OCR data in a pandas Series, with an index of either daily or monthly Periods.

Examples

ocr = read_rba_ocr(monthly=True)
def percent_change(data: DataT, n_periods: int) -> DataT:
    """Calculate a percentage change in a contiguous, ordered series over n_periods.

    Parameters
    ----------
    data : pandas Series or DataFrame
        The data to calculate the percentage change for.
    n_periods : int
        The number of periods to calculate the percentage change over.
        Typically 4 for quarterly data, and 12 for monthly data.

    Returns
    -------
    pandas Series or DataFrame
        The percentage change in the data over n_periods. For DataFrame input,
        the percentage change is calculated for each column.
    """

    lagged = data.shift(n_periods)  # value n_periods earlier
    return (data / lagged - 1) * 100

Calculate a percentage change in a contiguous, ordered series over n_periods.

Parameters

data : pandas Series or DataFrame The data to calculate the percentage change for. n_periods : int The number of periods to calculate the percentage change over. Typically 4 for quarterly data, and 12 for monthly data.

Returns

pandas Series or DataFrame The percentage change in the data over n_periods. For DataFrame input, the percentage change is calculated for each column.

def annualise_rates(data: DataT, periods_per_year: int | float = 12) -> DataT:
    """Annualise a growth rate for a period.
    Note: returns a percentage value (and not a rate)!

    Parameters
    ----------
    data : pandas Series or DataFrame
        The growth rate to annualise. Note a growth rate of 0.05 is 5%.
    periods_per_year : int or float, default 12
        The number of periods in a year. For monthly data, this is 12.

    Returns
    -------
    pandas Series or DataFrame
        The annualised growth expressed as a percentage (not a rate).
        For DataFrame input, the annualised growth rate is calculated
        for each column."""

    # compound the per-period rate over a full year, then express as a percentage
    compounded = (1 + data) ** periods_per_year
    return (compounded - 1) * 100

Annualise a growth rate for a period. Note: returns a percentage value (and not a rate)!

Parameters

data : pandas Series or DataFrame The growth rate to annualise. Note a growth rate of 0.05 is 5%. periods_per_year : int or float, default 12 The number of periods in a year. For monthly data, this is 12.

Returns

pandas Series or DataFrame The annualised growth expressed as a percentage (not a rate). For DataFrame input, the annualised growth rate is calculated for each column.

def annualise_percentages(data: DataT, periods_per_year: int | float = 12) -> DataT:
    """Annualise a growth rate (expressed as a percentage) for a period.

    Parameters
    ----------
    data : pandas Series or DataFrame
        The growth rate (expressed as a percentage) to annualise. Note a
        growth percentage of 5% is a growth rate of 0.05.
    periods_per_year : int or float, default 12
        The number of periods in a year. For monthly data, this is 12.

    Returns
    -------
    pandas Series or DataFrame
        The annualised growth expressed as a percentage. For DataFrame input,
        the annualised growth rate is calculated for each column."""

    # convert the percentage to a rate, then delegate to annualise_rates()
    return annualise_rates(data / 100.0, periods_per_year)

Annualise a growth rate (expressed as a percentage) for a period.

Parameters

data : pandas Series or DataFrame The growth rate (expressed as a percentage) to annualise. Note a growth percentage of 5% is a growth rate of 0.05. periods_per_year : int or float, default 12 The number of periods in a year. For monthly data, this is 12.

Returns

pandas Series or DataFrame The annualised growth expressed as a percentage. For DataFrame input, the annualised growth rate is calculated for each column.

def qtly_to_monthly(
    data: DataT,
    interpolate: bool = True,
    limit: Optional[int] = 2,  # only relevant when interpolate is True
    dropna: bool = True,
) -> DataT:
    """Convert a pandas timeseries with a Quarterly PeriodIndex to a
    timeseries with a Monthly PeriodIndex.

    Parameters
    ----------
    data - either a pandas Series or DataFrame - assumes the index is unique.
        The data to convert to monthly frequency.
    interpolate: bool, default True
        Whether to interpolate the missing monthly data.
    limit: int, default 2
        The maximum number of consecutive missing months to interpolate.
    dropna: bool, default True
        Whether to drop NA data

    Returns
    -------
    pandas Series or DataFrame
        The data with a Monthly PeriodIndex. If interpolate is True, the
        missing monthly data is interpolated. If dropna is True, any NA
        data is removed."""

    # sanity checks: a unique, sorted, quarterly PeriodIndex is required
    assert isinstance(data.index, PeriodIndex)
    assert data.index.freqstr[0] == "Q"
    assert data.index.is_unique
    assert data.index.is_monotonic_increasing

    def as_monthly_periods(frame: DataT) -> DataT:
        """Relabel a DatetimeIndex axis as a Monthly PeriodIndex."""

        monthly = cast(DatetimeIndex, frame.index).to_period(freq="M")
        return frame.set_axis(labels=monthly, axis="index")

    # move to quarter-end timestamps, resample to months (introducing NaNs
    # for the new months), then relabel with monthly periods
    quarter_ends = data.index.to_timestamp(how="end")
    result = (
        data.set_axis(labels=quarter_ends, axis="index", copy=True)
        .resample(rule="ME")  # adds in every missing month
        .first(min_count=1)  # generates nans for new months
        # assumes only one value per quarter (ie. unique index)
        .pipe(as_monthly_periods)
    )

    if interpolate:
        result = result.interpolate(limit_area="inside", limit=limit)
    if dropna:
        result = result.dropna()

    return result

Convert a pandas timeseries with a Quarterly PeriodIndex to a timeseries with a Monthly PeriodIndex.

Parameters

data - either a pandas Series or DataFrame - assumes the index is unique. The data to convert to monthly frequency. interpolate: bool, default True Whether to interpolate the missing monthly data. limit: int, default 2 The maximum number of consecutive missing months to interpolate. dropna: bool, default True Whether to drop NA data

Returns

pandas Series or DataFrame The data with a Monthly PeriodIndex. If interpolate is True, the missing monthly data is interpolated. If dropna is True, any NA data is removed.

def monthly_to_qtly(data: DataT, q_ending="DEC", f: str = "mean") -> DataT:
    """Convert monthly data to quarterly data by taking the mean (or sum)
    of the three months in each quarter. Ignore quarters with less than
    or more than three months data. Drop NA items. Change f to "sum"
    for a quarterly sum.

    Parameters
    ----------
    data : pandas Series or DataFrame
        The data to convert to quarterly frequency.
    q_ending : str, default DEC
        The month in which the quarter ends. For example, "DEC" for December.
    f : str, default "mean"
        The function to apply to the three months in each quarter.
        Change to "sum" for a quarterly sum. The default is a
        quarterly mean.

    Returns
    -------
    pandas Series or DataFrame
        The data with a quarterly PeriodIndex. If a quarter has less than
        three months data, the quarter is dropped. If the quarter has more
        than three months data, the quarter is dropped. Any NA data is removed.
        For DataFrame input, the function is applied to each column."""

    if isinstance(data, Series):
        return _monthly_to_qtly_series(data, q_ending, f)

    if isinstance(data, DataFrame):
        # apply the series conversion column by column
        converted = {
            column: _monthly_to_qtly_series(data[column], q_ending, f)
            for column in data.columns
        }
        return DataFrame(converted)

    raise ValueError("data must be a pandas Series or DataFrame")

Convert monthly data to quarterly data by taking the mean (or sum) of the three months in each quarter. Ignore quarters with less than or more than three months data. Drop NA items. Change f to "sum" for a quarterly sum.

Parameters

data : pandas Series or DataFrame The data to convert to quarterly frequency. q_ending : str, default DEC The month in which the quarter ends. For example, "DEC" for December. f : str, default "mean" The function to apply to the three months in each quarter. Change to "sum" for a quarterly sum. The default is a quarterly mean.

Returns

pandas Series or DataFrame The data with a quarterly PeriodIndex. If a quarter has less than three months data, the quarter is dropped. If the quarter has more than three months data, the quarter is dropped. Any NA data is removed. For DataFrame input, the function is applied to each column.

def recalibrate(
    data: DataT,
    units: str,
) -> tuple[DataT, str]:
    """Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.
    Change the name of the units to reflect the recalibration.

    Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar.
    If you provide a Series, you will get a Series back. If you provide a DataFrame,
    you will get a DataFrame back.

    Parameters
    ----------
    data : Series or DataFrame
        The data to recalibrate.
    units : str
        The units of the data. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    Series or DataFrame
        The recalibrated data will be a Series if a Series was provided,
        or a DataFrame if a DataFrame was provided.

    Examples
    --------
    ```python
    from pandas import Series
    from readabs import recalibrate
    s = Series([1_000, 10_000, 100_000, 1_000_000])
    recalibrated, units = recalibrate(s, "$")
    print(f"{recalibrated=}, {units=}")
    ```"""

    if not isinstance(data, (Series, DataFrame)):
        raise TypeError("data must be a Series or DataFrame")

    # rescale the flattened values; the helpers also adjust the units label
    units, restore_name = _prepare_units(units)
    flattened = data.to_numpy().flatten()
    flattened, units = _recalibrate(flattened, units)

    # reattach any stripped name, dropping a redundant "number(s)" token
    if restore_name:
        units = f"{restore_name} {units}"
        for token in "numbers", "number":
            if token in units:
                units = units.replace(token, "").strip()
                break
    units = units.title()

    # rebuild the original pandas container shape, index and labels
    is_frame = len(data.shape) == 2
    container = DataFrame if is_frame else Series
    result = container(flattened.reshape(data.shape))
    result.index = data.index
    if is_frame:
        result.columns = data.columns
    if len(data.shape) == 1:
        result.name = data.name
    return result, units

Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000. Change the name of the units to reflect the recalibration.

Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar. If you provide a Series, you will get a Series back. If you provide a DataFrame, you will get a DataFrame back.

Parameters

data : Series or DataFrame The data to recalibrate. units : str The units of the data. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.

Returns

Series or DataFrame The recalibrated data will be a Series if a Series was provided, or a DataFrame if a DataFrame was provided.

Examples

from pandas import Series
from readabs import recalibrate
s = Series([1_000, 10_000, 100_000, 1_000_000])
recalibrated, units = recalibrate(s, "$")
print(f"{recalibrated=}, {units=}")
def recalibrate_value(value: float, units: str) -> tuple[float, str]:
    """Recalibrate a floating point value. The value will be recalibrated
    so it is in the range -1000 to 1000. The units will be changed to reflect
    the recalibration.

    Parameters
    ----------
    value : float
        The value to recalibrate.
    units : str
        The units of the value. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[float, str]
        A tuple containing the recalibrated value and the recalibrated units.

    Examples
    --------
    ```python
    from readabs import recalibrate_value
    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
    print(recalibrated, units)
    ```"""

    # wrap the scalar in a Series so the Series/DataFrame machinery can be reused
    wrapped = Series([value])
    scaled, units = recalibrate(wrapped, units)
    return scaled.values[0], units

Recalibrate a floating point value. The value will be recalibrated so it is in the range -1000 to 1000. The units will be changed to reflect the recalibration.

Parameters

value : float The value to recalibrate. units : str The units of the value. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.

Returns

tuple[float, str] A tuple containing the recalibrated value and the recalibrated units.

Examples

from readabs import recalibrate_value
recalibrated, units = recalibrate_value(10_000_000, "Thousand")
print(recalibrated, units)