readabs.read_abs_by_desc
Get specific ABS data series by searching for the ABS data item descriptions.
"""Get specific ABS data series by searching for the ABS
data item descriptions."""

# --- imports
# system imports
from typing import Any
import inspect

# Analytic imports
import pandas as pd

# local imports
from readabs.abs_meta_data import metacol as mc
from readabs.search_abs_meta import find_abs_id
from readabs.read_abs_cat import read_abs_cat


# --- private functions
def _work_to_do(wanted: Any) -> bool:
    """Check if there is any work to do.

    Prints a short notice and returns False when wanted is None or has
    zero length; returns True otherwise."""
    if wanted is None or len(wanted) == 0:
        print("No data requested.")
        return False
    return True


def _wlist_to_wdict(wanted: list[str]) -> dict[str, str]:
    """Convert a list of strings to a dictionary of strings:strings.
    Note: the keys and values are the same.
    Note: any duplicate elements in the list will be lost."""
    return {k: k for k in wanted}


def _get_search_terms(
    input_dict: dict[str, Any],
    output_dict: dict[str, str],
) -> dict[str, str]:
    """Build a selector dictionary from the input dictionary.

    Every public attribute name of the metacol object that also appears
    as a key in input_dict contributes one selector entry. The
    output_dict is updated in place and also returned."""
    search_names = {
        abbr: term for abbr, term in inspect.getmembers(mc) if not abbr.startswith("_")
    }
    for mc_abbr, meta_column in search_names.items():
        if mc_abbr in input_dict:
            # the selector dictionary is back-to-front
            # ie. {value_sought: column_name}
            output_dict[input_dict[mc_abbr]] = meta_column
    return output_dict


def _get_args(
    keys: list[str],
    input_dict: dict[str, Any],
    output_dict: dict[str, Any],
) -> dict[str, Any]:
    """Build a retrieval dictionary from the input dictionary.

    Copies each of the named keys that is present in input_dict into
    output_dict. The output_dict is updated in place and also returned."""
    for key in keys:
        if key in input_dict:
            output_dict[key] = input_dict[key]
    return output_dict


def _get_search_args(input_dict: dict, output_dict: dict) -> dict[str, Any]:
    """Extract the search arguments from the input dictionary."""
    keys = ["validate_unique", "exact_match", "regex", "verbose"]
    return _get_args(keys, input_dict, output_dict)


def _get_retrieval_args(input_dict: dict, output_dict: dict) -> dict[str, Any]:
    """Extract the retrieval arguments from the input dictionary."""
    keys = [
        "ignore_errors",
        "get_zip",
        "get_excel_if_no_zip",
        "get_excel",
        "cache_only",
        "single_excel_only",
        "single_zip_only",
        "verbose",
    ]
    return _get_args(keys, input_dict, output_dict)


def _get_item_from_str(
    item: str,
    data_dict: dict[str, pd.DataFrame],
    data_meta: pd.DataFrame,
    item_selector: dict[str, str],
    search_args: dict[str, Any],
) -> tuple[pd.Series, pd.DataFrame]:
    """Get a data series from the data dictionary and metadata.
    Give the series its series-id as a name.

    Raises a ValueError if data_dict is empty or data_meta is empty.
    Note: item_selector is updated in place with the item being sought."""

    if not data_dict or data_meta.empty:
        raise ValueError(
            "If the wanted data is a string, a populated abs_dict "
            + "and abs_meta must be provided."
        )
    item_selector[item] = mc.did  # back_to_front
    table, series_id, units = find_abs_id(data_meta, item_selector, **search_args)

    series = data_dict[table][series_id]
    series.name = series_id
    series_meta = data_meta.loc[
        (data_meta[mc.table] == table)
        & (data_meta[mc.id] == series_id)
        & (data_meta[mc.unit] == units)
    ]
    return series, series_meta


def _get_item_from_dict(
    item_dict: dict[str, Any],
    data_dict: dict[str, pd.DataFrame],
    data_meta: pd.DataFrame,
    item_selector: dict[str, str],
    search_args: dict[str, Any],
    **kwargs,
) -> tuple[pd.Series, pd.DataFrame]:
    """Get a data series described by a dictionary of keyword arguments.

    The item_dict must contain a "did" key (the data item description).
    If data_dict is empty or data_meta is empty, item_dict must also
    contain a "cat" key, and the data is retrieved with read_abs_cat()
    using the retrieval arguments found in kwargs and then item_dict
    (the item_dict values take precedence).

    Raises a ValueError if a required key is missing.
    Note: item_dict is not modified; item_selector and search_args are
    updated in place."""

    # preparation
    if "did" not in item_dict:
        raise ValueError("Each inner dictionary must contain a 'did' key.")
    # work on a shallow copy so the caller's dictionary keeps its "did" key
    # and can be passed in again on a later call
    item_dict = dict(item_dict)
    item = item_dict.pop("did")
    item_selector = _get_search_terms(item_dict, item_selector)
    item_search_args = _get_search_args(item_dict, search_args)

    if not data_dict or data_meta.empty:
        # data retrieval required
        if "cat" not in item_dict:
            raise ValueError(
                "Each inner dictionary must contain a 'cat' key, "
                + "if an abs_dict is not provided/empty or the "
                + "abs_meta is not provided/empty."
            )
        ret_args = _get_retrieval_args(kwargs, {})
        ret_args = _get_retrieval_args(item_dict, ret_args)
        data_dict, data_meta = read_abs_cat(cat=item_dict["cat"], **ret_args)

    # series extraction based on search terms
    series, series_meta = _get_item_from_str(
        item=item,
        data_dict=data_dict,
        data_meta=data_meta,
        item_selector=item_selector,
        search_args=item_search_args,
    )
    return series, series_meta


# --- public functions
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    - wanted : list of str, dict of str:str, or dict of str:dict - the data
      item descriptions to search for. If a list, it will be a list of
      descriptions to search for. If a dictionary, the keys will be names.
      The dictionary values can be either a string (the data item
      description to search for) or a dictionary of keyword arguments, one of
      which would be the data item description to search for.
    - kwargs : Any - keyword arguments to control the data retrieval.
      The keyword arguments can include the following:
      - abs_dict : dict - the dictionary of ABS data to search (from
        read_abs_cat()).
      - abs_meta : DataFrame - the metadata for the ABS data (from
        read_abs_cat()).
      - for the retrieval of data, the "cat" argument must be present.
        The following arguments, if present, will also be used (ie.
        passed to read_abs_cat()): ["ignore_errors", "get_zip",
        "get_excel_if_no_zip", "get_excel", "cache_only",
        "single_excel_only", "single_zip_only", "verbose"].
      - for the selection of data, the following metacol names, if present,
        will be used to construct the selector: "cat", "did",
        "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
        "cmonth", "table", "tdesc".
      - finally, the following arguments will be passed to the find_abs_id()
        and search_abs_meta() functions: ["validate_unique", "exact_match",
        "regex", "verbose"].

    Notes:
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
      include sufficient keys from the metacol dataclass to get the data.
      Typically, the "cat" key, the "table" key, and the "stype" key would
      be required. The did key would be taken from the wanted list or
      dictionary.
    - if "wanted" is of type dict[str, dict[str, Any]], the inner dictionary
      must contain a "did" key. The other keys that can be used for the
      data retrieval are the same as the metacol dataclass fields, namely:
      "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
      "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
      used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
      type dict[str, dict[str, Any]] and (2) the inner dictionary must
      contain a "cat" key so the data can be retrieved. Other keys that
      can be used for the data retrieval are the same as for read_abs_cat(),
      namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
      "get_excel", "single_excel_only", "single_zip_only", "cache_only"].
    - the dictionaries passed in as "wanted" are not modified.


    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, keyed by the keys of "wanted"
      (which are the series descriptions themselves when "wanted" is a
      list). The series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.

    Raises
    ------
    - TypeError if a value in "wanted" is neither a string nor a dictionary.
    - ValueError if the data needed to satisfy a request is not provided
      and cannot be retrieved (see the Notes above).

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```"""

    # - preparation
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()
    if isinstance(wanted, list):
        wanted = _wlist_to_wdict(wanted)
    abs_dict = kwargs.get("abs_dict", {})
    abs_meta = kwargs.get("abs_meta", pd.DataFrame())
    kwarg_selector = _get_search_terms(kwargs, {})
    search_args = _get_search_args(kwargs, {})

    return_dict: dict[str, pd.Series] = {}
    # collect the per-item metadata and concatenate once at the end, rather
    # than re-copying a growing DataFrame on every iteration
    meta_frames: list[pd.DataFrame] = []
    for key, value in wanted.items():

        # each item gets its own copies, as the helpers update them in place
        item_selector = kwarg_selector.copy()
        item_search_args = search_args.copy()
        if isinstance(value, str):
            series, meta = _get_item_from_str(
                item=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
            )

        elif isinstance(value, dict):
            series, meta = _get_item_from_dict(
                item_dict=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
                **kwargs,
            )
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string "
                + "or a dictionary."
            )

        # save search results
        return_dict[key] = series
        meta_frames.append(meta)

    return_meta = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
    return return_dict, return_meta


# --- testing ---
if __name__ == "__main__":
    # --- test 1: get a list of dids
    def test1():
        """Test case: get a list of dids."""

        cat = "5206.0"
        table = "5206001_Key_Aggregates"
        data_dict, data_meta = read_abs_cat(
            cat=cat, single_excel_only=table, verbose=False
        )
        stype = "Seasonally Adjusted"
        get_these = data_meta.loc[
            (data_meta[mc.table] == table)
            & (data_meta[mc.stype] == stype)
            & data_meta[mc.unit].str.contains("Million")
            & data_meta[mc.did].str.contains("Chain volume measures")
        ][mc.did].to_list()
        print(f"get_these: {get_these}")

        selected, selected_meta = read_abs_by_desc(
            wanted=get_these,
            abs_dict=data_dict,
            abs_meta=data_meta,
            # exact_match=True, verbose=True,
            table=table,
            stype=stype,
        )
        print(selected, selected_meta)

    test1()

    # --- test 2: get a dictionary of dids
    def test2():
        """Test case: get a dictionary of dids."""

        gdp_table = "5206001_Key_Aggregates"
        uer_table = "6202001"
        sa = "Seasonally Adjusted"
        get_these = {
            # two series, each from two different ABS Catalogue Numbers
            "GDP": {
                "cat": "5206.0",
                "table": gdp_table,
                "stype": sa,
                "did": "Gross domestic product: Chain volume measures ;",
                "single_excel_only": gdp_table,
            },
            "Unemployment Rate": {
                "cat": "6202.0",
                "table": uer_table,
                "stype": sa,
                "did": "Unemployment rate ; Persons ;",
                "single_excel_only": uer_table,
            },
        }
        selected, selected_meta = read_abs_by_desc(
            wanted=get_these,
        )

        print(selected_meta)
        print(selected)

    test2()
def
read_abs_by_desc( wanted: list[str] | dict[str, str] | dict[str, dict[str, typing.Any]], **kwargs: Any) -> tuple[dict[str, pandas.core.series.Series], pandas.core.frame.DataFrame]:
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    - wanted : list of str, dict of str:str, or dict of str:dict - the data
      item descriptions to search for. If a list, it will be a list of
      descriptions to search for. If a dictionary, the keys will be names.
      The dictionary values can be either a string (the data item
      description to search for) or a dictionary of keyword arguments, one of
      which would be the data item description to search for.
    - kwargs : Any - keyword arguments to control the data retrieval.
      The keyword arguments can include the following:
      - abs_dict : dict - the dictionary of ABS data to search (from
        read_abs_cat()).
      - abs_meta : DataFrame - the metadata for the ABS data (from
        read_abs_cat()).
      - for the retrieval of data, the "cat" argument must be present.
        The following arguments, if present, will also be used (ie.
        passed to read_abs_cat()): ["ignore_errors", "get_zip",
        "get_excel_if_no_zip", "get_excel", "cache_only",
        "single_excel_only", "single_zip_only", "verbose"].
      - for the selection of data, the following metacol names, if present,
        will be used to construct the selector: "cat", "did",
        "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
        "cmonth", "table", "tdesc".
      - finally, the following arguments will be passed to the find_abs_id()
        and search_abs_meta() functions: ["validate_unique", "exact_match",
        "regex", "verbose"].

    Notes:
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
      include sufficient keys from the metacol dataclass to get the data.
      Typically, the "cat" key, the "table" key, and the "stype" key would
      be required. The did key would be taken from the wanted list or
      dictionary.
    - if "wanted" is of type dict[str, dict[str, Any]], the inner dictionary
      must contain a "did" key. The other keys that can be used for the
      data retrieval are the same as the metacol dataclass fields, namely:
      "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
      "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
      used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
      type dict[str, dict[str, Any]] and (2) the inner dictionary must
      contain a "cat" key so the data can be retrieved. Other keys that
      can be used for the data retrieval are the same as for read_abs_cat(),
      namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
      "get_excel", "single_excel_only", "single_zip_only", "cache_only"].


    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, keyed by the keys of "wanted"
      (which are the series descriptions themselves when "wanted" is a
      list). The series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.

    Raises
    ------
    - TypeError if a value in "wanted" is neither a string nor a dictionary.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```"""

    # - preparation
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()
    if isinstance(wanted, list):
        wanted = _wlist_to_wdict(wanted)
    abs_dict = kwargs.get("abs_dict", {})
    abs_meta = kwargs.get("abs_meta", pd.DataFrame())
    kwarg_selector = _get_search_terms(kwargs, {})
    search_args = _get_search_args(kwargs, {})

    return_dict: dict[str, pd.Series] = {}
    # collect the per-item metadata and concatenate once at the end, rather
    # than re-copying a growing DataFrame on every iteration
    meta_frames: list[pd.DataFrame] = []
    for key, value in wanted.items():

        # each item gets its own copies of the shared selector/search args
        item_selector = kwarg_selector.copy()
        item_search_args = search_args.copy()
        if isinstance(value, str):
            series, meta = _get_item_from_str(
                item=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
            )

        elif isinstance(value, dict):
            series, meta = _get_item_from_dict(
                item_dict=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
                **kwargs,
            )
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string "
                + "or a dictionary."
            )

        # save search results
        return_dict[key] = series
        meta_frames.append(meta)

    return_meta = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
    return return_dict, return_meta
Get specific ABS data series by searching the ABS meta data.
Parameters
- wanted : list of str, dict of str:str, or dict of str:dict - the data item descriptions to search for. If a list, it will be a list of descriptions to search for. If a dictionary, the keys will be names. The dictionary values can be either a string (the data item description to search for) or a dictionary of keyword arguments, one of which would be the data item description to search for.
- kwargs : Any - keyword arguments to control the data retrieval.
The keyword arguments can include the following:
- abs_dict : dict - the dictionary of ABS data to search (from read_abs_cat()).
- abs_meta : DataFrame - the metadata for the ABS data (from read_abs_cat()).
- for the retrieval of data, the "cat" argument must be present. The following arguments, if present, will also be used (ie. passed to read_abs_cat()): ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "cache_only", "single_excel_only", "single_zip_only", "verbose"].
- for the selection of data, the following metacol names, if present, will be used to construct the selector: "cat", "did", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
- finally, the following arguments will be passed to the find_abs_id() and search_abs_meta() functions: ["validate_unique", "exact_match", "regex", "verbose"].
Notes:
- if "wanted" is of type list[str] or dict[str, str], the kwargs should include sufficient keys from the metacol dataclass to get the data. Typically, the "cat" key, the "table" key, and the "stype" key would be required. The did key would be taken from the wanted list or dictionary. If "wanted" is of type dict[str, dict[str, Any]], the inner dictionary must contain a "did" key. The other keys that can be used for the data retrieval are the same as the metacol dataclass fields, namely: "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
- if abs_dict and abs_meta are provided within the kwargs, they will be used to locate and extract the selected data.
- if abs_dict and abs_meta are not provided, then, (1) wanted must be of type dict[str, dict[str, Any]] and (2) the inner dictionary must contain a "cat" key so the data can be retrieved. Other keys that can be used for the data retrieval are the same as for read_abs_cat(), namely ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "single_excel_only", "single_zip_only", "cache_only"].
Returns
Returns a tuple of two items:
- A dictionary of pandas Series objects, keyed by the keys of "wanted" (which are the series descriptions themselves when "wanted" is a list). The series.name attribute will be the ABS series-id.
- A pandas DataFrame containing the metadata for the series.
Example
import readabs as ra
from pandas import DataFrame
cat_num = "5206.0" # The ABS National Accounts
data, meta = ra.read_abs_cat(cat=cat_num)
wanted = ["Gross domestic product: Chain volume measures ;",]
selected, selected_meta = ra.read_abs_by_desc(
wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
)