readabs.read_abs_by_desc

Get specific ABS data series by searching for the ABS data item descriptions.

  1"""Get specific ABS data series by searching for the ABS 
  2data item descriptions."""
  3
  4# --- imports
  5# system imports
  6from typing import Any
  7import inspect
  8
  9# Analytic imports
 10import pandas as pd
 11
 12# local imports
 13from readabs.abs_meta_data import metacol as mc
 14from readabs.search_abs_meta import find_abs_id
 15from readabs.read_abs_cat import read_abs_cat
 16
 17
 18# --- private functions
 19def _work_to_do(wanted: Any) -> bool:
 20    """Check if there is any work to do."""
 21    if wanted is None or len(wanted) == 0:
 22        print("No data requested.")
 23        return False
 24    return True
 25
 26
 27def _wlist_to_wdict(wanted: list[str]) -> dict[str, str]:
 28    """Convert a list of strings to a dictionary of strings:strings.
 29    Note: the keys and values are the same.
 30    Note: any duplicate elements in the list will be lost."""
 31    return {k: k for k in wanted}
 32
 33
 34def _get_search_terms(input_dict, output_dict) -> dict[str, str]:
 35    """Build a selector dictionary from the input dictionary."""
 36    search_names = {
 37        abbr: term for abbr, term in inspect.getmembers(mc) if not abbr.startswith("_")
 38    }
 39    for mc_abbr, meta_column in search_names.items():
 40        if mc_abbr in input_dict:
 41            # the selector dictionary is back-to_front
 42            # ie. {value_sought: column_name}
 43            output_dict[input_dict[mc_abbr]] = meta_column
 44    return output_dict
 45
 46
 47def _get_args(keys: list[str], input_dict, output_dict) -> dict[str, Any]:
 48    """Build a retrieval dictionary from the input dictionary."""
 49    for key in keys:
 50        if key in input_dict:
 51            output_dict[key] = input_dict[key]
 52    return output_dict
 53
 54
 55def _get_search_args(input_dict: dict, output_dict: dict) -> dict[str, Any]:
 56    """Extract the search arguments from the input dictionary."""
 57    keys = ["validate_unique", "exact_match", "regex", "verbose"]
 58    return _get_args(keys, input_dict, output_dict)
 59
 60
 61def _get_retrieval_args(input_dict: dict, output_dict: dict) -> dict[str, Any]:
 62    """Extract the retrieval arguments from the input dictionary."""
 63    keys = [
 64        "ignore_errors",
 65        "get_zip",
 66        "get_excel_if_no_zip",
 67        "get_excel",
 68        "cache_only",
 69        "single_excel_only",
 70        "single_zip_only",
 71        "verbose",
 72    ]
 73    return _get_args(keys, input_dict, output_dict)
 74
 75
 76def _get_item_from_str(
 77    item: str,
 78    data_dict,
 79    data_meta,
 80    item_selector,
 81    search_args,
 82) -> tuple[pd.Series, pd.DataFrame]:
 83    """Get a data series from the data dictionary and metadata.
 84    Give the series its series-id as a name."""
 85
 86    if not data_dict or data_meta.empty:
 87        raise ValueError(
 88            "If the wanted data is a string, a populated abs_dict "
 89            + "and abs_meta must be provided."
 90        )
 91    item_selector[item] = mc.did  # back_to_front
 92    table, series_id, units = find_abs_id(data_meta, item_selector, **search_args)
 93
 94    series = data_dict[table][series_id]
 95    series.name = series_id
 96    series_meta = data_meta.loc[
 97        (data_meta[mc.table] == table)
 98        & (data_meta[mc.id] == series_id)
 99        & (data_meta[mc.unit] == units)
100    ]
101    return series, series_meta
102
103
104def _get_item_from_dict(
105    item_dict: dict[str, Any],
106    data_dict: dict[str, pd.DataFrame],
107    data_meta: pd.DataFrame,
108    item_selector: dict[str, str],
109    search_args: dict[str, Any],
110    **kwargs,
111) -> tuple[pd.Series, pd.DataFrame]:
112
113    # preparation
114    if "did" not in item_dict:
115        raise ValueError("Each inner dictionary must contain a 'did' key.")
116    item = item_dict.pop("did")
117    item_selector = _get_search_terms(item_dict, item_selector)
118    item_search_args = _get_search_args(item_dict, search_args)
119
120    if not data_dict or data_meta.empty:
121        # data retrieval reqquired
122        if "cat" not in item_dict:
123            raise ValueError(
124                "Each inner dictionary must contain a 'cat' key, "
125                + "if an abs_dict is not provided/empty or the "
126                + "abs_meta is not provided/empty."
127            )
128        ret_args = _get_retrieval_args(kwargs, {})
129        ret_args = _get_retrieval_args(item_dict, ret_args)
130        data_dict, data_meta = read_abs_cat(cat=item_dict["cat"], **ret_args)
131
132    # series extraction based on search terms
133    series, series_meta = _get_item_from_str(
134        item=item,
135        data_dict=data_dict,
136        data_meta=data_meta,
137        item_selector=item_selector,
138        search_args=item_search_args,
139    )
140    return series, series_meta
141
142
143# --- public functions
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    - wanted : list of str, dict of str:str, or dict of str:dict - the data
        item descriptions to search for. If a list, it will be a list of
        descriptions to search for. If a dictionary, the keys will be a name.
        The dictionary values can be either a string (the data item
        description to search for) or a dictionary of keyword arguments, one of
        which would be the data item description to search for. The
        dictionaries passed in are not modified.
    - kwargs : Any - keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
            read_abs_cat()). None is treated as not provided.
        - abs_meta : DataFrame - the metadata for the ABS data (from
            read_abs_cat()). None is treated as not provided.
        - for the retrieval of data, the "cat" argument must be present.
            The following arguments, if present, will also be used (ie.
            passed to read_abs_cat()): ["ignore_errors", "get_zip",
            "get_excel_if_no_zip", "get_excel", "cache_only",
            "single_excel_only", "single_zip_only", "verbose"].
        - for the selection of data, the following metacol names, if present,
            will be used to construct the selector: "cat", "did",
            "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
            "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the find_abs_id()
            and search_abs_meta() functions: ["validate_unique", "exact_match",
            "regex", "verbose"].

    Notes:
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
        include sufficient keys from the metacol dataclass to get the data.
        Typically, the "cat" key, the "table" key, and the "stype" key would
        be required. The did key would be taken from the wanted list or
        dictionary.
    - if wanted is of type dict[str, dict[str, Any]], the inner dictionary
        must contain a "did" key. The other keys that can be used for the
        data retrieval are the same as the metacol dataclass fields, namely:
        "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
        "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
        used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
        type dict[str, dict[str, Any]] and (2) the inner dictionary must
        contain a "cat" key so the data can be retrieved. Other keys that
        can be used for the data retrieval are the same as for read_abs_cat(),
        namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
        "get_excel", "single_excel_only", "single_zip_only", "cache_only"].


    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, where the keys are the series
      descriptions. The series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.

    Raises
    ------
    TypeError if a value in wanted is neither a string nor a dictionary.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```"""

    # - preparation
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()
    if isinstance(wanted, list):
        wanted = _wlist_to_wdict(wanted)
    # an explicit None is treated the same as "not provided"
    abs_dict = kwargs.get("abs_dict") or {}
    abs_meta = kwargs.get("abs_meta")
    if abs_meta is None:
        abs_meta = pd.DataFrame()
    kwarg_selector = _get_search_terms(kwargs, {})
    search_args = _get_search_args(kwargs, {})

    return_dict: dict[str, pd.Series] = {}
    meta_frames: list[pd.DataFrame] = []
    for key, value in wanted.items():

        # each item starts from its own copy of the call-wide settings
        item_selector = kwarg_selector.copy()
        item_search_args = search_args.copy()
        if isinstance(value, str):
            series, meta = _get_item_from_str(
                item=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
            )

        elif isinstance(value, dict):
            series, meta = _get_item_from_dict(
                # shallow copy, so the caller's inner dictionary is never
                # modified and "wanted" can be re-used in a later call
                item_dict=dict(value),
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
                **kwargs,
            )
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string "
                + "or a dictionary."
            )

        # save search results
        return_dict[key] = series
        meta_frames.append(meta)

    # concatenate once at the end rather than re-copying on every iteration
    return_meta = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
    return return_dict, return_meta
264
265
266# --- testing ---
if __name__ == "__main__":
    # --- test 1: get a list of dids
    def test1():
        """Smoke test: select a list of data item descriptions from
        data that has already been read with read_abs_cat()."""

        table = "5206001_Key_Aggregates"
        stype = "Seasonally Adjusted"
        data_dict, data_meta = read_abs_cat(
            cat="5206.0", single_excel_only=table, verbose=False
        )

        # every description in the chosen table/series type that is a
        # chain volume measure expressed in millions
        row_mask = (
            (data_meta[mc.table] == table)
            & (data_meta[mc.stype] == stype)
            & data_meta[mc.unit].str.contains("Million")
            & data_meta[mc.did].str.contains("Chain volume measures")
        )
        get_these = data_meta.loc[row_mask][mc.did].to_list()
        print(f"get_these: {get_these}")

        selected, selected_meta = read_abs_by_desc(
            wanted=get_these,
            abs_dict=data_dict,
            abs_meta=data_meta,
            table=table,
            stype=stype,
        )
        print(selected, selected_meta)

    test1()

    # --- test 2: get a dictionary of dids
    def test2():
        """Smoke test: select series described by inner dictionaries,
        with no abs_dict/abs_meta supplied by the caller."""

        gdp_table = "5206001_Key_Aggregates"
        uer_table = "6202001"
        sa = "Seasonally Adjusted"

        # two series, each from two different ABS Catalogue Numbers
        gdp_request = {
            "cat": "5206.0",
            "table": gdp_table,
            "stype": sa,
            "did": "Gross domestic product: Chain volume measures ;",
            "single_excel_only": gdp_table,
        }
        uer_request = {
            "cat": "6202.0",
            "table": uer_table,
            "stype": sa,
            "did": "Unemployment rate ;  Persons ;",
            "single_excel_only": uer_table,
        }
        get_these = {"GDP": gdp_request, "Unemployment Rate": uer_request}
        selected, selected_meta = read_abs_by_desc(
            wanted=get_these,
        )

        print(selected_meta)
        print(selected)

    test2()
def read_abs_by_desc( wanted: list[str] | dict[str, str] | dict[str, dict[str, typing.Any]], **kwargs: Any) -> tuple[dict[str, pandas.core.series.Series], pandas.core.frame.DataFrame]:
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    - wanted : list of str, dict of str:str, or dict of str:dict - the data
        item descriptions to search for. If a list, it will be a list of
        descriptions to search for. If a dictionary, the keys will be a name.
        The dictionary values can be either a string (the data item
        description to search for) or a dictionary of keyword arguments, one of
        which would be the data item description to search for. The
        dictionaries passed in are not modified.
    - kwargs : Any - keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
            read_abs_cat()). None is treated as not provided.
        - abs_meta : DataFrame - the metadata for the ABS data (from
            read_abs_cat()). None is treated as not provided.
        - for the retrieval of data, the "cat" argument must be present.
            The following arguments, if present, will also be used (ie.
            passed to read_abs_cat()): ["ignore_errors", "get_zip",
            "get_excel_if_no_zip", "get_excel", "cache_only",
            "single_excel_only", "single_zip_only", "verbose"].
        - for the selection of data, the following metacol names, if present,
            will be used to construct the selector: "cat", "did",
            "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
            "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the find_abs_id()
            and search_abs_meta() functions: ["validate_unique", "exact_match",
            "regex", "verbose"].

    Notes:
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
        include sufficient keys from the metacol dataclass to get the data.
        Typically, the "cat" key, the "table" key, and the "stype" key would
        be required. The did key would be taken from the wanted list or
        dictionary.
    - if wanted is of type dict[str, dict[str, Any]], the inner dictionary
        must contain a "did" key. The other keys that can be used for the
        data retrieval are the same as the metacol dataclass fields, namely:
        "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
        "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
        used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
        type dict[str, dict[str, Any]] and (2) the inner dictionary must
        contain a "cat" key so the data can be retrieved. Other keys that
        can be used for the data retrieval are the same as for read_abs_cat(),
        namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
        "get_excel", "single_excel_only", "single_zip_only", "cache_only"].


    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, where the keys are the series
      descriptions. The series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.

    Raises
    ------
    TypeError if a value in wanted is neither a string nor a dictionary.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```"""

    # - preparation
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()
    if isinstance(wanted, list):
        wanted = _wlist_to_wdict(wanted)
    # an explicit None is treated the same as "not provided"
    abs_dict = kwargs.get("abs_dict") or {}
    abs_meta = kwargs.get("abs_meta")
    if abs_meta is None:
        abs_meta = pd.DataFrame()
    kwarg_selector = _get_search_terms(kwargs, {})
    search_args = _get_search_args(kwargs, {})

    return_dict: dict[str, pd.Series] = {}
    meta_frames: list[pd.DataFrame] = []
    for key, value in wanted.items():

        # each item starts from its own copy of the call-wide settings
        item_selector = kwarg_selector.copy()
        item_search_args = search_args.copy()
        if isinstance(value, str):
            series, meta = _get_item_from_str(
                item=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
            )

        elif isinstance(value, dict):
            series, meta = _get_item_from_dict(
                # shallow copy, so the caller's inner dictionary is never
                # modified and "wanted" can be re-used in a later call
                item_dict=dict(value),
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
                **kwargs,
            )
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string "
                + "or a dictionary."
            )

        # save search results
        return_dict[key] = series
        meta_frames.append(meta)

    # concatenate once at the end rather than re-copying on every iteration
    return_meta = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
    return return_dict, return_meta

Get specific ABS data series by searching the ABS meta data.

Parameters

  • wanted : list of str, dict of str:str, or dict of str:dict - the data item descriptions to search for. If a list, it will be a list of descriptions to search for. If a dictionary, the keys will be a name. The dictionary values can be either a string (the data item description to search for) or a dictionary of keyword arguments, one of which would be the data item description to search for.
  • kwargs : Any - keyword arguments to control the data retrieval. The keyword arguments can include the following:
    • abs_dict : dict - the dictionary of ABS data to search (from read_abs_cat()).
    • abs_meta : DataFrame - the metadata for the ABS data (from read_abs_cat()).
    • for the retrieval of data, the "cat" argument must be present. The following arguments, if present, will also be used (ie. passed to read_abs_cat()): ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "cache_only", "single_excel_only", "single_zip_only", "verbose"].
    • for the selection of data, the following metacol names, if present, will be used to construct the selector: "cat", "did", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
    • finally, the following arguments will be passed to the find_abs_id() and search_abs_meta() functions: ["validate_unique", "exact_match", "regex", "verbose"].

Notes:

  • if "wanted" is of type list[str] or dict[str, str], the kwargs should include sufficient keys from the metacol dataclass to get the data. Typically, the "cat" key, the "table" key, and the "stype" key would be required. The did key would be taken from the wanted list or dictionary.
  • if wanted is of type dict[str, dict[str, Any]], the inner dictionary must contain a "did" key. The other keys that can be used for the data retrieval are the same as the metacol dataclass fields, namely: "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
  • if abs_dict and abs_meta are provided within the kwargs, they will be used to locate and extract the selected data.
  • if abs_dict and abs_meta are not provided, then, (1) wanted must be of type dict[str, dict[str, Any]] and (2) the inner dictionary must contain a "cat" key so the data can be retrieved. Other keys that can be used for the data retrieval are the same as for read_abs_cat(), namely ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "single_excel_only", "single_zip_only", "cache_only"].

Returns

Returns a tuple of two items:

  • A dictionary of pandas Series objects, where the keys are the series descriptions. The series.name attribute will be the ABS series-id.
  • A pandas DataFrame containing the metadata for the series.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "5206.0"  # The ABS National Accounts
data, meta = ra.read_abs_cat(cat=cat_num)
wanted = ["Gross domestic product: Chain volume measures ;",]
selected, selected_meta = ra.read_abs_by_desc(
    wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
)