readabs
Package to download timeseries data from the Australian Bureau of Statistics (ABS) and the Reserve Bank of Australia (RBA).
1"""Package to download timeseries data from 2the Australian Bureau of Statistics (ABS) 3and the Reserve Bank of Australia (RBA).""" 4 5# --- imports 6import importlib.metadata 7 8# --- local imports 9# - ABS related - 10from readabs.abs_catalogue import abs_catalogue 11from readabs.print_abs_catalogue import print_abs_catalogue 12from readabs.search_abs_meta import search_abs_meta, find_abs_id 13from readabs.read_abs_cat import read_abs_cat 14from readabs.read_abs_series import read_abs_series 15from readabs.read_abs_by_desc import read_abs_by_desc 16from readabs.grab_abs_url import grab_abs_url 17from readabs.abs_meta_data import metacol 18 19# - RBA related - 20from readabs.rba_catalogue import print_rba_catalogue, rba_catalogue 21from readabs.read_rba_table import read_rba_table, read_rba_ocr 22from readabs.rba_meta_data import rba_metacol 23 24# - Utilities - 25from readabs.datatype import Datatype 26from readabs.recalibrate import recalibrate, recalibrate_value 27from readabs.utilities import ( 28 percent_change, 29 annualise_rates, 30 annualise_percentages, 31 qtly_to_monthly, 32 monthly_to_qtly, 33) 34 35 36# --- version and author 37try: 38 __version__ = importlib.metadata.version(__name__) 39except importlib.metadata.PackageNotFoundError: 40 __version__ = "0.0.0" # Fallback for development mode 41__author__ = "Bryan Palmer" 42 43 44# --- exposed functions and classes 45__all__ = ( 46 # -- abs -- related 47 "metacol", 48 "read_abs_cat", 49 "read_abs_series", 50 "read_abs_by_desc", 51 "search_abs_meta", 52 "find_abs_id", 53 "grab_abs_url", 54 "print_abs_catalogue", 55 "abs_catalogue", 56 # -- rba -- related 57 "print_rba_catalogue", 58 "rba_catalogue", 59 "read_rba_table", 60 "rba_metacol", 61 "read_rba_ocr", 62 # -- utilities -- 63 "Datatype", 64 "percent_change", 65 "annualise_rates", 66 "annualise_percentages", 67 "qtly_to_monthly", 68 "monthly_to_qtly", 69 "recalibrate", 70 "recalibrate_value", 71) 72__pdoc__ = { 73 "download_cache": False, 74 "get_abs_links": False, 75 "read_support": False, 76 "grab_abs_url": False, 77} # hide submodules from documentation
23@cache # minimise slowness for any repeat business 24def read_abs_cat( 25 cat: str, 26 keep_non_ts: bool = False, 27 **kwargs: Any, 28) -> tuple[dict[str, DataFrame], DataFrame]: 29 """This function returns the complete ABS Catalogue information as a 30 python dictionary of pandas DataFrames, as well as the associated metadata 31 in a separate DataFrame. The function automates the collection of zip and 32 excel files from the ABS website. If necessary, these files are downloaded, 33 and saved into a cache directory. The files are then parsed to extract time 34 series data, and the associated metadata. 35 36 By default, the cache directory is `./.readabs_cache/`. You can change the 37 default directory name by setting the shell environment variable 38 `READABS_CACHE_DIR` with the name of the preferred directory. 39 40 Parameters 41 ---------- 42 43 cat : str 44 The ABS Catalogue Number for the data to be downloaded and made 45 available by this function. This argument must be specified in the 46 function call. 47 48 keep_non_ts : bool = False 49 A flag for whether to keep the non-time-series tables 50 that might form part of an ABS catalogue item. Normally, the 51 non-time-series information is ignored, and not made available to 52 the user. 53 54 **kwargs : Any 55 The following parameters may be passed as optional keyword arguments. 56 57 history : str = "" 58 Orovide a month-year string to extract historical ABS data. 59 For example, you can set history="dec-2023" to the get the ABS data 60 for a catalogue identifier that was originally published in respect 61 of Q4 of 2023. Note: not all ABS data sources are structured so that 62 this technique works in every case; but most are. 63 64 verbose : bool = False 65 Setting this to true may help diagnose why something 66 might be going wrong with the data retrieval process. 67 68 ignore_errors : bool = False 69 Normally, this function will cease downloading when 70 an error in encountered. However, sometimes the ABS website has 71 malformed links, and changing this setting is necessitated. (Note: 72 if you drop a message to the ABS, they will usually fix broken 73 links with a business day). 74 75 get_zip : bool = True 76 Download the excel files in .zip files. 77 78 get_excel_if_no_zip : bool = True 79 Only try to download .xlsx files if there are no zip 80 files available to be downloaded. Only downloading individual excel 81 files when there are no zip files to download can speed up the 82 download process. 83 84 get_excel : bool = False 85 The default value means that excel files are not 86 automatically download. Note: at least one of `get_zip`, 87 `get_excel_if_no_zip`, or `get_excel` must be true. For most ABS 88 catalogue items, it is sufficient to just download the one zip 89 file. But note, some catalogue items do not have a zip file. 90 Others have quite a number of zip files. 91 92 single_excel_only : str = "" 93 If this argument is set to a table name (without the 94 .xlsx extension), only that excel file will be downloaded. If 95 set, and only a limited subset of available data is needed, 96 this can speed up download times significantly. Note: overrides 97 `get_zip`, `get_excel_if_no_zip`, `get_excel` and `single_zip_only`. 98 99 single_zip_only : str = "" 100 If this argument is set to a zip file name (without 101 the .zip extension), only that zip file will be downloaded. 102 If set, and only a limited subset of available data is needed, 103 this can speed up download times significantly. Note: overrides 104 `get_zip`, `get_excel_if_no_zip`, and `get_excel`. 105 106 cache_only : bool = False 107 If set to True, this function will only access 108 data that has been previously cached. Normally, the function 109 checks the date of the cache data against the date of the data 110 on the ABS website, before deciding whether the ABS has fresher 111 data that needs to be downloaded to the cache. 112 113 Returns 114 ------------- 115 tuple[dict[str, DataFrame], DataFrame] 116 The function returns a tuple of two items. The first item is a 117 python dictionary of pandas DataFrames (which is the primary data 118 associated with the ABS catalogue item). The second item is a 119 DataFrame of ABS metadata for the ABS collection. 120 121 Example 122 ------- 123 124 ```python 125 import readabs as ra 126 from pandas import DataFrame 127 cat_num = "6202.0" # The ABS labour force survey 128 data: tuple[dict[str, DataFrame], DataFrame] = ra.read_abs_cat(cat=cat_num) 129 abs_dict, meta = data 130 ```""" 131 132 # --- get the time series data --- 133 raw_abs_dict = grab_abs_url(cat=cat, **kwargs) 134 abs_dict, abs_meta = _get_time_series_data( 135 cat, raw_abs_dict, keep_non_ts=keep_non_ts, **kwargs 136 ) 137 138 return abs_dict, abs_meta
This function returns the complete ABS Catalogue information as a python dictionary of pandas DataFrames, as well as the associated metadata in a separate DataFrame. The function automates the collection of zip and excel files from the ABS website. If necessary, these files are downloaded, and saved into a cache directory. The files are then parsed to extract time series data, and the associated metadata.
By default, the cache directory is ./.readabs_cache/
. You can change the
default directory name by setting the shell environment variable
READABS_CACHE_DIR
with the name of the preferred directory.
Parameters
cat : str The ABS Catalogue Number for the data to be downloaded and made available by this function. This argument must be specified in the function call.
keep_non_ts : bool = False A flag for whether to keep the non-time-series tables that might form part of an ABS catalogue item. Normally, the non-time-series information is ignored, and not made available to the user.
**kwargs : Any The following parameters may be passed as optional keyword arguments.
history : str = "" Orovide a month-year string to extract historical ABS data. For example, you can set history="dec-2023" to the get the ABS data for a catalogue identifier that was originally published in respect of Q4 of 2023. Note: not all ABS data sources are structured so that this technique works in every case; but most are.
verbose : bool = False Setting this to true may help diagnose why something might be going wrong with the data retrieval process.
ignore_errors : bool = False Normally, this function will cease downloading when an error in encountered. However, sometimes the ABS website has malformed links, and changing this setting is necessitated. (Note: if you drop a message to the ABS, they will usually fix broken links with a business day).
get_zip : bool = True Download the excel files in .zip files.
get_excel_if_no_zip : bool = True Only try to download .xlsx files if there are no zip files available to be downloaded. Only downloading individual excel files when there are no zip files to download can speed up the download process.
get_excel : bool = False
The default value means that excel files are not
automatically download. Note: at least one of get_zip
,
get_excel_if_no_zip
, or get_excel
must be true. For most ABS
catalogue items, it is sufficient to just download the one zip
file. But note, some catalogue items do not have a zip file.
Others have quite a number of zip files.
single_excel_only : str = ""
If this argument is set to a table name (without the
.xlsx extension), only that excel file will be downloaded. If
set, and only a limited subset of available data is needed,
this can speed up download times significantly. Note: overrides
get_zip
, get_excel_if_no_zip
, get_excel
and single_zip_only
.
single_zip_only : str = ""
If this argument is set to a zip file name (without
the .zip extension), only that zip file will be downloaded.
If set, and only a limited subset of available data is needed,
this can speed up download times significantly. Note: overrides
get_zip
, get_excel_if_no_zip
, and get_excel
.
cache_only : bool = False If set to True, this function will only access data that has been previously cached. Normally, the function checks the date of the cache data against the date of the data on the ABS website, before deciding whether the ABS has fresher data that needs to be downloaded to the cache.
Returns
tuple[dict[str, DataFrame], DataFrame] The function returns a tuple of two items. The first item is a python dictionary of pandas DataFrames (which is the primary data associated with the ABS catalogue item). The second item is a DataFrame of ABS metadata for the ABS collection.
Example
import readabs as ra
from pandas import DataFrame
cat_num = "6202.0" # The ABS labour force survey
data: tuple[dict[str, DataFrame], DataFrame] = ra.read_abs_cat(cat=cat_num)
abs_dict, meta = data
18def read_abs_series( 19 cat: str, 20 series_id: str | Sequence[str], 21 **kwargs: Any, 22) -> tuple[DataFrame, DataFrame]: 23 """Get specific ABS data series by their ABS catalogue and series identifiers. 24 25 Parameters 26 ---------- 27 cat : str 28 The ABS catalogue ID. 29 30 series_id : str | Sequence[str] 31 An ABS series ID or a sequence of ABS series IDs. 32 33 **kwargs : Any 34 Keyword arguments for the read_abs_series function, 35 which are the same as the keyword arguments for the 36 read_abs_cat function. 37 38 Returns 39 ------- 40 tuple[DataFrame, DataFrame] 41 A tuple of two DataFrames, one for the primary data and one for the metadata. 42 43 Example 44 ------- 45 46 ```python 47 import readabs as ra 48 from pandas import DataFrame 49 cat_num = "6202.0" # The ABS labour force survey 50 unemployment_rate = "A84423050A" 51 seo = "6202001" # The ABS table name 52 data, meta = ra.read_abs_series( 53 cat=cat_num, series_id=unemployment_rate, single_excel_only=seo 54 ) 55 ```""" 56 57 # check for unexpected keyword arguments/get defaults 58 check_kwargs(kwargs, "read_abs_series") 59 args = get_args(kwargs, "read_abs_series") 60 61 # read the ABS category data 62 cat_data, cat_meta = read_abs_cat(cat, **args) 63 64 # drop repeated series_ids in the meta data, 65 # make unique series_ids the index 66 cat_meta.index = Index(cat_meta[metacol.id]) 67 cat_meta = cat_meta.groupby(cat_meta.index).first() 68 69 # get the ABS series data 70 if isinstance(series_id, str): 71 series_id = [series_id] 72 return_data, return_meta = DataFrame(), DataFrame() 73 for identifier in series_id: 74 75 # confirm that the series ID is in the catalogue 76 if identifier not in cat_meta.index: 77 if args["verbose"]: 78 print(f"Series ID {identifier} not found in ABS catalogue ID {cat}") 79 if args["ignore_errors"]: 80 continue 81 raise ValueError(f"Series ID {identifier} not found in catalogue {cat}") 82 83 # confirm thay the index of the series is compatible 84 table = str(cat_meta.loc[identifier, metacol.table]) # str for mypy 85 data_series = cat_data[table][identifier] 86 if ( 87 len(return_data) > 0 88 and cast(PeriodIndex, return_data.index).freq 89 != cast(PeriodIndex, data_series.index).freq 90 ): 91 if args["verbose"]: 92 print(f"Frequency mismatch for series ID {identifier}") 93 if args["ignore_errors"]: 94 continue 95 raise ValueError(f"Frequency mismatch for series ID {identifier}") 96 97 # add the series data and meta data to the return values 98 if len(return_data) > 0: 99 return_data = return_data.reindex( 100 return_data.index.union(data_series.index) 101 ) 102 return_data[identifier] = data_series 103 return_meta = concat([return_meta, cat_meta.loc[identifier]], axis=1) 104 105 return return_data, return_meta.T
Get specific ABS data series by their ABS catalogue and series identifiers.
Parameters
cat : str The ABS catalogue ID.
series_id : str | Sequence[str] An ABS series ID or a sequence of ABS series IDs.
**kwargs : Any Keyword arguments for the read_abs_series function, which are the same as the keyword arguments for the read_abs_cat function.
Returns
tuple[DataFrame, DataFrame] A tuple of two DataFrames, one for the primary data and one for the metadata.
Example
import readabs as ra
from pandas import DataFrame
cat_num = "6202.0" # The ABS labour force survey
unemployment_rate = "A84423050A"
seo = "6202001" # The ABS table name
data, meta = ra.read_abs_series(
cat=cat_num, series_id=unemployment_rate, single_excel_only=seo
)
145def read_abs_by_desc( 146 wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]], 147 **kwargs: Any, 148) -> tuple[dict[str, pd.Series], pd.DataFrame]: 149 """Get specific ABS data series by searching the ABS meta data. 150 151 Parameters 152 ---------- 153 - wanted : list of str, dict of str:str, or dict of str:dict - the data 154 item descriptions to search for. If a list, it will be a list of 155 descriptions to search for. If a dictionary, the keys will a name. 156 The dixtionary values can be either a string (the data item 157 description to search for) or a dictionary of keyword arguments, one of 158 which would be the data item description to search for. 159 - kwargs : Any - keyword arguments to control the data retrieval. 160 The keyword arguments can include the following: 161 - abs_dict : dict - the dictionary of ABS data to search (from 162 read_abs_cat()). 163 - abs_meta : DataFrame - the metadata for the ABS data (from 164 read_abs_cat()). 165 - for the retrieval of data, the "cat" argument must be present. 166 The following arguments, if present, will also be used (ie. 167 passed to read_abs_cat()): ["ignore_errors", "get_zip", 168 "get_excel_if_no_zip", "get_excel", "cache_only", 169 "single_excel_only", "single_zip_only", "verbose"]. 170 - for the selection of data, the following metacol names, if present, 171 will be used to construct the selector: "cat", "did" 172 "stype", "id", "start", "end", "num", "unit", "dtype", "freq", 173 "cmonth", "table", "tdesc". 174 - finally, the following arguments will be passed to the find_abs_id() 175 and search_abs_meta() functions: ["validate_unique", "exact_match", 176 "regex", "verbose"]. 177 178 Notes: 179 - if "wanted" is of type list[str] or dict[str, str], the kwargs should 180 include sufficient keys from the metacol dataclass to get the data. 181 Typically, the "cat" key, the "table" key, and the "stype" key would 182 be required. The did key would taken from the wanted list or 183 dictionary. 184 if wanted is of type dict[str, dict[str, Any]], the inner dictionary 185 must contain a "did" key. The other keys that can be used for the 186 data retrieval are the same as the metacol dataclass fileds, namely: 187 "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", 188 "cmonth", "table", "tdesc". 189 - if abs_dict and abs_meta are provided within the kwargs, they will be 190 used to locate and extract the selected data. 191 - if abs_dict and abs_meta are not provided, then, (1) wanted must be of 192 type dict[str, dict[str, Any]] and (2) the inner dictionary must 193 contain a "cat" key so the data can be retrieved. Other keys that 194 can be used for the data retrieval are the same as for read_abs_cat(), 195 namely ["ignore_errors", "get_zip", "get_excel_if_no_zip", 196 "get_excel", "single_excel_only", "single_zip_only", "cache_only"]. 197 198 199 Returns 200 ------- 201 Returns a tuple of two items: 202 - A dictionary of pandas Series objects, where the keys are the series 203 descriptions. The series.name attribute will be the ABS series-id. 204 - A pandas DataFrame containing the metadata for the series. 205 206 Example 207 ------- 208 209 ```python 210 import readabs as ra 211 from pandas import DataFrame 212 cat_num = "5206.0" # The ABS National Accounts 213 data, meta = ra.read_abs_cat(cat=cat_num) 214 wanted = ["Gross domestic product: Chain volume measures ;",] 215 selected, selected_meta = ra.read_abs_by_desc( 216 wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates" 217 ) 218 ```""" 219 220 # - preparation 221 if not _work_to_do(wanted): 222 return {}, pd.DataFrame() 223 if isinstance(wanted, list): 224 wanted = _wlist_to_wdict(wanted) 225 abs_dict = kwargs.get("abs_dict", {}) 226 abs_meta = kwargs.get("abs_meta", pd.DataFrame()) 227 kwarg_selector = _get_search_terms(kwargs, {}) 228 search_args = _get_search_args(kwargs, {}) 229 230 return_dict = {} 231 return_meta = pd.DataFrame() 232 for key, value in wanted.items(): 233 234 item_selector = kwarg_selector.copy() 235 item_search_args = search_args.copy() 236 if isinstance(value, str): 237 series, meta = _get_item_from_str( 238 item=value, 239 data_dict=abs_dict, 240 data_meta=abs_meta, 241 item_selector=item_selector, 242 search_args=item_search_args, 243 ) 244 245 elif isinstance(value, dict): 246 series, meta = _get_item_from_dict( 247 item_dict=value, 248 data_dict=abs_dict, 249 data_meta=abs_meta, 250 item_selector=item_selector, 251 search_args=item_search_args, 252 **kwargs, 253 ) 254 else: 255 raise TypeError( 256 "Each value in the wanted list/dictionary must be either a string " 257 + "or a dictionary." 258 ) 259 260 # save search results 261 return_dict[key] = series 262 return_meta = pd.concat([return_meta, meta]) 263 264 return return_dict, return_meta
Get specific ABS data series by searching the ABS meta data.
Parameters
- wanted : list of str, dict of str:str, or dict of str:dict - the data item descriptions to search for. If a list, it will be a list of descriptions to search for. If a dictionary, the keys will a name. The dixtionary values can be either a string (the data item description to search for) or a dictionary of keyword arguments, one of which would be the data item description to search for.
- kwargs : Any - keyword arguments to control the data retrieval.
The keyword arguments can include the following:
- abs_dict : dict - the dictionary of ABS data to search (from read_abs_cat()).
- abs_meta : DataFrame - the metadata for the ABS data (from read_abs_cat()).
- for the retrieval of data, the "cat" argument must be present. The following arguments, if present, will also be used (ie. passed to read_abs_cat()): ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "cache_only", "single_excel_only", "single_zip_only", "verbose"].
- for the selection of data, the following metacol names, if present, will be used to construct the selector: "cat", "did" "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
- finally, the following arguments will be passed to the find_abs_id() and search_abs_meta() functions: ["validate_unique", "exact_match", "regex", "verbose"].
Notes:
- if "wanted" is of type list[str] or dict[str, str], the kwargs should include sufficient keys from the metacol dataclass to get the data. Typically, the "cat" key, the "table" key, and the "stype" key would be required. The did key would taken from the wanted list or dictionary. if wanted is of type dict[str, dict[str, Any]], the inner dictionary must contain a "did" key. The other keys that can be used for the data retrieval are the same as the metacol dataclass fileds, namely: "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
- if abs_dict and abs_meta are provided within the kwargs, they will be used to locate and extract the selected data.
- if abs_dict and abs_meta are not provided, then, (1) wanted must be of type dict[str, dict[str, Any]] and (2) the inner dictionary must contain a "cat" key so the data can be retrieved. Other keys that can be used for the data retrieval are the same as for read_abs_cat(), namely ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "single_excel_only", "single_zip_only", "cache_only"].
Returns
Returns a tuple of two items:
- A dictionary of pandas Series objects, where the keys are the series descriptions. The series.name attribute will be the ABS series-id.
- A pandas DataFrame containing the metadata for the series.
Example
import readabs as ra
from pandas import DataFrame
cat_num = "5206.0" # The ABS National Accounts
data, meta = ra.read_abs_cat(cat=cat_num)
wanted = ["Gross domestic product: Chain volume measures ;",]
selected, selected_meta = ra.read_abs_by_desc(
wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
)
13def search_abs_meta( 14 meta: DataFrame, # sourced from read_abs_series() or read_abs_cat() 15 search_terms: dict[str, str], # {search_term: meta_data_column_name, ...} 16 exact_match: bool = False, 17 regex: bool = False, 18 validate_unique=False, # useful safety-net if you expect only one match 19 **kwargs: Any, 20) -> DataFrame: 21 """Extract from the ABS meta data those rows that match the 22 search_terms, by iteratively searching the meta data one 23 search_term at a time. 24 25 Parameters 26 ---------- 27 meta : DataFrame 28 A pandas DataFrame of metadata from the ABS 29 (via read_abs_cat() or read_abs_series()). 30 search_terms : dict[str, str] 31 A dictionary {search_phrase: meta_column_name, ...} of search terms. 32 Note: the search terms must be unique, as a dictionary cannot hold the 33 same search term to be applied to different columns. 34 exact_match : bool = False 35 Whether to match using == (exact) or .str.contains() (inexact). 36 regex : bool = False 37 Whether to use regular expressions in the search. 38 validate_unique : bool = False 39 Raise a ValueError if the search result is not unique. 40 **kwargs : Any 41 Additional keyword arguments. The only keyword argument 42 that is used is verbose. 43 verbose : bool = False 44 Print additional information while searching; which can 45 be useful when diagnosing problems with search terms. 46 47 Returns 48 ------- 49 DataFrame 50 Returns a pandas DataFrame of matching rows (subseted from meta). 51 Note, The index for the returned meta data will always comprise ABS 52 series_ids. Duplicate indexes will be removed from the meta data 53 (ie. where the same ABS series appears in more than one table, this 54 function will only report the first match). 55 56 Metacol 57 ------- 58 Because the meta data is a DataFrame, the columns can be referenced by either 59 their full textual name, or by the short name defined in the metacol object. 60 For example, if metacol is imported as mc, to refer to the 61 `Data Item Description` column, the user can refer to it as mc.did. 62 63 Example 64 ------- 65 ```python 66 from readabs import metacol as mc # alias for the ABS meta data column names 67 from readabs import read_abs_cat, search_abs_meta 68 cat_num = "6202.0" # The ABS labour force survey 69 data, meta = read_abs_cat(cat_num) 70 search_terms = { 71 "Unemployment rate": mc.did, # the data item description 72 "Persons": mc.did, 73 "Seasonally Adjusted": mc.stype, 74 "Percent": mc.unit, 75 "6202001": mc.table, 76 } 77 rows = search_abs_meta(meta, search_terms, verbose=True) 78 print(rows) # should have three rows : FT/PT/All Unemployment rates 79 ```""" 80 81 # get the verbose-flag from kwargs 82 verbose = kwargs.get("verbose", False) 83 84 # establish the starting point 85 meta_select = meta.copy() # preserve the original meta data 86 if verbose: 87 print(f"In search_abs_meta() {exact_match=} {regex=} {verbose=}") 88 print( 89 f"In search_abs_meta() starting with {len(meta_select)} rows in the meta_data." 90 ) 91 92 # iteratively search 93 for phrase, column in search_terms.items(): 94 if verbose: 95 print(f"Searching {len(meta_select)}: term: {phrase} in-column: {column}") 96 97 pick_me = ( 98 (meta_select[column] == phrase) 99 if (exact_match or column == mc.table) 100 else meta_select[column].str.contains(phrase, regex=regex) 101 ) 102 meta_select = meta_select[pick_me] 103 if verbose: 104 print(f"In find_rows() have found {len(meta_select)}") 105 106 # search complete - check results - and return 107 meta_select.index = Index(meta_select[mc.id]) 108 meta_select = meta_select[~meta_select.index.duplicated(keep="first")] 109 110 if verbose: 111 print(f"Final selection is {len(meta_select)} rows.") 112 113 elif len(meta_select) == 0: 114 print("Nothing selected?") 115 116 if validate_unique and len(meta_select) != 1: 117 raise ValueError("The selected meta data should only contain one row.") 118 119 return meta_select
Extract from the ABS meta data those rows that match the search_terms, by iteratively searching the meta data one search_term at a time.
Parameters
meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. exact_match : bool = False Whether to match using == (exact) or .str.contains() (inexact). regex : bool = False Whether to use regular expressions in the search. validate_unique : bool = False Raise a ValueError if the search result is not unique. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is verbose. verbose : bool = False Print additional information while searching; which can be useful when diagnosing problems with search terms.
Returns
DataFrame Returns a pandas DataFrame of matching rows (subseted from meta). Note, The index for the returned meta data will always comprise ABS series_ids. Duplicate indexes will be removed from the meta data (ie. where the same ABS series appears in more than one table, this function will only report the first match).
Metacol
Because the meta data is a DataFrame, the columns can be referenced by either
their full textual name, or by the short name defined in the metacol object.
For example, if metacol is imported as mc, to refer to the
Data Item Description
column, the user can refer to it as mc.did.
Example
from readabs import metacol as mc # alias for the ABS meta data column names
from readabs import read_abs_cat, search_abs_meta
cat_num = "6202.0" # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
"Unemployment rate": mc.did, # the data item description
"Persons": mc.did,
"Seasonally Adjusted": mc.stype,
"Percent": mc.unit,
"6202001": mc.table,
}
rows = search_abs_meta(meta, search_terms, verbose=True)
print(rows) # should have three rows : FT/PT/All Unemployment rates
122def find_abs_id( 123 meta: DataFrame, 124 search_terms: dict[str, str], 125 **kwargs: Any, 126) -> tuple[str, str, str]: # table, series_id, units 127 """Find a unique ABS series identifier in the ABS metadata. 128 129 Parameters 130 ---------- 131 meta : DataFrame 132 A pandas DataFrame of metadata from the ABS 133 (via read_abs_cat() or read_abs_series()). 134 search_terms : dict[str, str] 135 A dictionary {search_phrase: meta_column_name, ...} of search terms. 136 Note: the search terms must be unique, as a dictionary cannot hold the 137 same search term to be applied to different columns. 138 **kwargs : Any 139 Additional keyword arguments. The only additional keyword argument 140 that is used is validate_unique. 141 validate_unique : bool = True 142 Raise a ValueError if the search result is not a single 143 unique match. Note: the default is True for safety. 144 145 Returns 146 ------- 147 tuple[str, str, str] 148 A tuple of the table, series_id and units for the unique 149 series_id that matches the search terms. 150 151 Metacol 152 ------- 153 Because the meta data is a DataFrame, the columns can be referenced by either 154 their full textual name, or by the short name defined in the metacol object. 155 For example, if metacol is imported as mc, to refer to the 156 `Data Item Description` column, the user can refer to it as mc.did. 157 158 Example 159 ------- 160 ```python 161 from readabs import metacol as mc # alias for the ABS meta data column names 162 from readabs import read_abs_cat, find_abs_id, recalibrate 163 cat_num = "6202.0" # The ABS labour force survey 164 data, meta = read_abs_cat(cat_num) 165 search_terms = { 166 "Employed total ; Persons ;": mc.did, 167 "Seasonally Adjusted": mc.stype, 168 "6202001": mc.table, 169 } 170 table, series_id, units = find_abs_id(meta, search_terms) 171 print(f"Table: {table} Series ID: {series_id} Units: {units}") 172 recal_series, recal_units = recalibrate(data[table][series_id], units) 173 ```""" 174 175 validate_unique = kwargs.pop("validate_unique", True) 176 found = search_abs_meta( 177 meta, search_terms, validate_unique=validate_unique, **kwargs 178 ).iloc[0] 179 table, series_id, units = ( 180 found[mc.table], 181 found[mc.id], 182 found[mc.unit], 183 ) 184 185 return table, series_id, units
Find a unique ABS series identifier in the ABS metadata.
Parameters
meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. **kwargs : Any Additional keyword arguments. The only additional keyword argument that is used is validate_unique. validate_unique : bool = True Raise a ValueError if the search result is not a single unique match. Note: the default is True for safety.
Returns
tuple[str, str, str] A tuple of the table, series_id and units for the unique series_id that matches the search terms.
Metacol
Because the meta data is a DataFrame, the columns can be referenced by either
their full textual name, or by the short name defined in the metacol object.
For example, if metacol is imported as mc, to refer to the
Data Item Description
column, the user can refer to it as mc.did.
Example
from readabs import metacol as mc # alias for the ABS meta data column names
from readabs import read_abs_cat, find_abs_id, recalibrate
cat_num = "6202.0" # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
"Employed total ; Persons ;": mc.did,
"Seasonally Adjusted": mc.stype,
"6202001": mc.table,
}
table, series_id, units = find_abs_id(meta, search_terms)
print(f"Table: {table} Series ID: {series_id} Units: {units}")
recal_series, recal_units = recalibrate(data[table][series_id], units)
23@cache # minimise slowness with repeat business 24def grab_abs_url( 25 url: str = "", 26 **kwargs: Any, 27) -> dict[str, DataFrame]: 28 """For a given URL, extract the data from the Excel and ZIP file 29 links found on that page. The data is returned as a dictionary of 30 DataFrames. The Excel files are converted into DataFrames, with 31 each sheet in each Excel file becoming a separate DataFrame. ZIP 32 files are examined for Excel files, which are similarly converted into 33 DataFrames. The dictionary of DataFrames is returned. 34 35 The preferred mechanism for reading ABS data is to use the `read_abs_cat()` 36 or `read_abs_series()` functions. This function is provided for those 37 cases where the data is not available in the ABS catalogue, where the 38 data is not a timeseries, or where the user wants to extract data from 39 a specific ABS landingpage. 40 41 42 Parameters 43 ---------- 44 url : str = "" 45 A URL for an ABS Catalogue landing page. Either a url or 46 a catalogue number must be provided. If both are provided, the 47 URL will be used. 48 49 **kwargs : Any 50 Accepts the same keyword arguments as `read_abs_cat()`. Additionally, 51 a cat argument can be provided, which will be used to get the URL 52 (see below). 53 54 cat : str = "" 55 An ABS Catalogue number. If provided, and the URL is not 56 provided, then the Catalogue number will be used to get the URL. 57 58 Returns 59 ------- 60 dict[str, DataFrame] 61 A dictionary of DataFrames.""" 62 63 # check/get the keyword arguments 64 url = _get_url(url, kwargs) # note: removes "cat" from kwargs 65 check_kwargs(kwargs, "grab_abs_url") # warn if invalid kwargs 66 args = get_args(kwargs, "grab_abs_url") # get the valid kwargs 67 if verbose := args["verbose"]: 68 print(f"grab_abs_url(): {url=}, {args=}") 69 70 # get the URL links to the relevant ABS data files on that webpage 71 links = get_abs_links(url, **args) 72 if not links: 73 print(f"No data files found at URL: {url}") 74 return {} # return an empty Dictionary 75 76 # read the data files into a dictionary of DataFrames 77 abs_dict: dict[str, DataFrame] = {} 78 79 # use the args, and the found links to get the data ... 80 if args["single_excel_only"]: 81 link = _find_url(links, ".xlsx", args["single_excel_only"], verbose) 82 if link: 83 abs_dict = _add_excel(abs_dict, link, **args) 84 return abs_dict 85 86 if args["single_zip_only"]: 87 link = _find_url(links, ".zip", args["single_zip_only"], verbose) 88 if link: 89 abs_dict = _add_zip(abs_dict, link, **args) 90 return abs_dict 91 92 for link_type in ".zip", ".xlsx": # .zip must come first 93 for link in links.get(link_type, []): 94 95 if link_type == ".zip" and args["get_zip"]: 96 abs_dict = _add_zip(abs_dict, link, **args) 97 98 elif link_type == ".xlsx": 99 if ( 100 args["get_excel"] 101 or (args["get_excel_if_no_zip"] and not args["get_zip"]) 102 or (args["get_excel_if_no_zip"] and not links.get(".zip", [])) 103 ): 104 abs_dict = _add_excel(abs_dict, link, **args) 105 106 return abs_dict
For a given URL, extract the data from the Excel and ZIP file links found on that page. The data is returned as a dictionary of DataFrames. The Excel files are converted into DataFrames, with each sheet in each Excel file becoming a separate DataFrame. ZIP files are examined for Excel files, which are similarly converted into DataFrames. The dictionary of DataFrames is returned.
The preferred mechanism for reading ABS data is to use the read_abs_cat()
or read_abs_series()
functions. This function is provided for those
cases where the data is not available in the ABS catalogue, where the
data is not a timeseries, or where the user wants to extract data from
a specific ABS landingpage.
Parameters
url : str = "" A URL for an ABS Catalogue landing page. Either a url or a catalogue number must be provided. If both are provided, the URL will be used.
**kwargs : Any
Accepts the same keyword arguments as read_abs_cat()
. Additionally,
a cat argument can be provided, which will be used to get the URL
(see below).
cat : str = "" An ABS Catalogue number. If provided, and the URL is not provided, then the Catalogue number will be used to get the URL.
Returns
dict[str, DataFrame] A dictionary of DataFrames.
7def print_abs_catalogue(cache_only=False, verbose=False) -> None: 8 """This function prints to standard output a table of the ABS 9 Catalogue Numbers that contain time-series data. In addition to the 10 Catalogue Numbers, the table includes the theme, parent topic and 11 topic for the collection represented by each Catalogue Number. 12 13 It is primarily a convenience function: The first parameter for 14 the read_abs_cat() and read_abs_series() functions is the ABS 15 Catalogue Number from which data is sought. 16 17 Parameters 18 ---------- 19 cache_only : bool = False 20 If True, only use the cache. 21 verbose : bool = False 22 If True, print progress messages. 23 24 Return values 25 ------------- 26 27 The function does not return anything. 28 29 Example 30 ------- 31 32 ```python 33 import readabs as ra 34 ra.print_abs_catalogue() 35 ```""" 36 37 catalogue = abs_catalogue(cache_only=cache_only, verbose=verbose) 38 print(catalogue.loc[:, catalogue.columns != "URL"].to_markdown())
This function prints to standard output a table of the ABS Catalogue Numbers that contain time-series data. In addition to the Catalogue Numbers, the table includes the theme, parent topic and topic for the collection represented by each Catalogue Number.
It is primarily a convenience function: The first parameter for the read_abs_cat() and read_abs_series() functions is the ABS Catalogue Number from which data is sought.
Parameters
cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.
Return values
The function does not return anything.
Example
import readabs as ra
ra.print_abs_catalogue()
10@cache 11def abs_catalogue(cache_only=False, verbose=False) -> DataFrame: 12 """Return a DataFrame of ABS Catalogue numbers. In the first instance, 13 this is downloaded from the ABS website, and cached for future use. 14 15 Parameters 16 ---------- 17 cache_only : bool = False 18 If True, only use the cache. 19 verbose : bool = False 20 If True, print progress messages. 21 22 Returns 23 ------- 24 DataFrame 25 A DataFrame of ABS Catalogue numbers. 26 27 Example 28 ------- 29 ```python 30 import readabs as ra 31 catalogue = ra.abs_catalogue() 32 ```""" 33 34 # get ABS web page of catalogue numbers 35 url = "https://www.abs.gov.au/about/data-services/help/abs-time-series-directory" 36 abs_bytes = get_file(url, cache_only=cache_only, verbose=verbose) 37 links = read_html(StringIO(abs_bytes.decode("utf-8")), extract_links="body")[ 38 1 39 ] # second table on the page 40 41 # extract catalogue numbers 42 cats = links["Catalogue Number"].apply(Series)[0] 43 urls = links["Topic"].apply(Series)[1] 44 root = "https://www.abs.gov.au/statistics/" 45 snip = urls.str.replace(root, "") 46 snip = ( 47 snip[~snip.str.contains("http")].str.replace("-", " ").str.title() 48 ) # remove bad cases 49 frame = snip.str.split("/", expand=True).iloc[:, :3] 50 frame.columns = Index(["Theme", "Parent Topic", "Topic"]) 51 frame["URL"] = urls 52 cats = cats[frame.index] 53 cat_index = cats.str.replace("(Ceased)", "").str.strip() 54 status = Series(" ", index=cats.index).where(cat_index == cats, "Ceased") 55 frame["Status"] = status 56 frame.index = Index(cat_index) 57 frame.index.name = "Catalogue ID" 58 return frame
Return a DataFrame of ABS Catalogue numbers. In the first instance, this is downloaded from the ABS website, and cached for future use.
Parameters
cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.
Returns
DataFrame A DataFrame of ABS Catalogue numbers.
Example
import readabs as ra
catalogue = ra.abs_catalogue()
45def print_rba_catalogue(cache_only=False, verbose=False) -> None: 46 """This function prints to standard output a table of the RBA 47 Catalogue Numbers. 48 49 Parameters 50 ---------- 51 cache_only : bool = False 52 If True, only use the cache. 53 verbose : bool = False 54 If True, print progress messages. 55 56 Return values 57 ------------- 58 59 The function does not return anything. 60 61 Example 62 ------- 63 64 ```python 65 import readabs as ra 66 ra.print_rba_catalogue() 67 ```""" 68 69 rba_catalog = rba_catalogue(cache_only=cache_only, verbose=verbose) 70 print(rba_catalog.loc[:, rba_catalog.columns != "URL"].to_markdown())
This function prints to standard output a table of the RBA Catalogue Numbers.
Parameters
cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.
Return values
The function does not return anything.
Example
import readabs as ra
ra.print_rba_catalogue()
18@cache 19def rba_catalogue(cache_only=False, verbose=False) -> DataFrame: 20 """Return a DataFrame of RBA Catalogue numbers. In the first instance, 21 this is downloaded from the RBA website, and cached for future use. 22 23 Parameters 24 ---------- 25 cache_only : bool = False 26 If True, only use the cache. 27 verbose : bool = False 28 If True, print progress messages. 29 30 Returns 31 ------- 32 DataFrame 33 A DataFrame of RBA Catalogue numbers. 34 35 Example 36 ------- 37 ```python 38 import readabs as ra 39 catalogue = ra.rba_catalogue() 40 ```""" 41 42 return _get_rba_links(cache_only=cache_only, verbose=verbose)
Return a DataFrame of RBA Catalogue numbers. In the first instance, this is downloaded from the RBA website, and cached for future use.
Parameters
cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.
Returns
DataFrame A DataFrame of RBA Catalogue numbers.
Example
import readabs as ra
catalogue = ra.rba_catalogue()
77def read_rba_table(table: str, **kwargs: Any) -> tuple[DataFrame, DataFrame]: 78 """Read a table from the RBA website and return the actual data 79 and the meta data in a tuple of two DataFrames. 80 81 Parameters 82 ---------- 83 table : str 84 The table to read from the RBA website. 85 **kwargs : Any 86 Additional keyword arguments. 87 The only keyword argument that is used is ignore_errors. 88 ignore_errors : bool = False 89 If True, then any major errors encountered will be printed and the function 90 will return empty DataFrames. If False, then any major errors encountered 91 will raise an exception. 92 93 Returns 94 ------- 95 tuple[DataFrame, DataFrame] 96 The primary data and the meta data in a tuple of two DataFrames. 97 98 Examples 99 -------- 100 ```python 101 data, meta = read_rba_table("C1") 102 ```""" 103 104 # set-up 105 ignore_errors = kwargs.get("ignore_errors", False) 106 data, meta = DataFrame(), DataFrame() 107 108 # get the Excel file 109 excel = _get_excel_file(table, ignore_errors, **kwargs) 110 if excel is None: 111 return data, meta 112 113 # read Excel file into DataFrame 114 try: 115 raw = read_excel(BytesIO(excel), header=None, index_col=None) 116 except Exception as e: 117 if ignore_errors: 118 print(f"Ignoring error: {e}") 119 return data, meta 120 raise 121 122 # extract the meta data 123 meta = raw.iloc[1:11, :].T.copy() 124 meta.columns = Index(meta.iloc[0]) 125 renamer = { 126 "Mnemonic": rm.id, 127 } # historical data is inconsistent 128 meta = meta.rename(columns=renamer) 129 meta = meta.iloc[1:, :] 130 meta.index = Index(meta[rm.id]) 131 meta[rm.table] = table 132 meta[rm.tdesc] = raw.iloc[0, 0] 133 meta = meta.dropna(how="all", axis=1) # drop columns with all NaNs 134 135 # extract the data 136 data = raw.iloc[10:, :].copy() 137 data.columns = Index(data.iloc[0]) 138 data = data.iloc[1:, :] 139 data.index = DatetimeIndex(data.iloc[:, 0]) 140 data = data.iloc[:, 1:] 141 data = data.dropna(how="all", axis=1) # drop columns with all NaNs 142 143 # can we make the index into a PeriodIndex? 144 days = data.index.to_series().diff(1).dropna().dt.days 145 if days.min() >= 28 and days.max() <= 31: 146 data.index = PeriodIndex(data.index, freq="M") 147 elif days.min() >= 90 and days.max() <= 92: 148 data.index = PeriodIndex(data.index, freq="Q") 149 elif days.min() >= 365 and days.max() <= 366: 150 data.index = PeriodIndex(data.index, freq="Y") 151 else: 152 data.index = PeriodIndex(data.index, freq="D") 153 154 return data, meta
Read a table from the RBA website and return the actual data and the meta data in a tuple of two DataFrames.
Parameters
table : str The table to read from the RBA website. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return empty DataFrames. If False, then any major errors encountered will raise an exception.
Returns
tuple[DataFrame, DataFrame] The primary data and the meta data in a tuple of two DataFrames.
Examples
data, meta = read_rba_table("C1")
157def read_rba_ocr(monthly: bool = True, **kwargs: Any) -> Series: 158 """Read the Official Cash Rate (OCR) from the RBA website and return it 159 in a pandas Series, with either a daily or monthly PeriodIndex, 160 depending on the value of the monthly parameter. The default is monthly. 161 162 Parameters 163 ---------- 164 monthly : bool = True 165 If True, then the data will be returned with a monthly PeriodIndex. 166 If False, then the data will be returned with a daily PeriodIndex. 167 **kwargs : Any 168 Additional keyword arguments. The only keyword argument that is used is ignore_errors. 169 ignore_errors : bool = False 170 If True, then any major errors encountered will be printed and the function 171 will return an empty Series. If False, then any major errors encountered 172 will raise an exception. 173 174 Returns 175 ------- 176 Series 177 The OCR data in a pandas Series, with an index of either daily or monthly Periods. 178 179 Examples 180 -------- 181 ```python 182 ocr = read_rba_ocr(monthly=True) 183 ```""" 184 185 # read the OCR table from the RBA website, make float and sort, name the series 186 rba, _rba_meta = read_rba_table("A2", **kwargs) # should have a daily PeriodIndex 187 ocr = ( 188 rba.loc[lambda x: x.index >= "1990-08-02", "ARBAMPCNCRT"] 189 .astype(float) 190 .sort_index() 191 ) 192 ocr.name = "RBA Official Cash Rate" 193 194 # bring up to date 195 today = Period(Timestamp.today(), freq=cast(PeriodIndex, ocr.index).freqstr) 196 if ocr.index[-1] < today: 197 ocr[today] = ocr.iloc[-1] 198 199 if not monthly: 200 # fill in missing days and return daily data 201 daily_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="D") 202 ocr = ocr.reindex(daily_index).ffill() 203 return ocr 204 205 # convert to monthly data, keeping last value if duplicates in month 206 # fill in missing months 207 ocr.index = PeriodIndex(ocr.index, freq="M") 208 ocr = ocr[~ocr.index.duplicated(keep="last")] 209 monthly_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="M") 210 ocr = ocr.reindex(monthly_index, method="ffill") 211 return ocr
Read the Official Cash Rate (OCR) from the RBA website and return it in a pandas Series, with either a daily or monthly PeriodIndex, depending on the value of the monthly parameter. The default is monthly.
Parameters
monthly : bool = True If True, then the data will be returned with a monthly PeriodIndex. If False, then the data will be returned with a daily PeriodIndex. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return an empty Series. If False, then any major errors encountered will raise an exception.
Returns
Series The OCR data in a pandas Series, with an index of either daily or monthly Periods.
Examples
ocr = read_rba_ocr(monthly=True)
13def percent_change(data: DataT, n_periods: int) -> DataT: 14 """Calculate an percentage change in a contiguous, ordered series over n_periods. 15 16 Parameters 17 ---------- 18 data : pandas Series or DataFrame 19 The data to calculate the percentage change for. 20 n_periods : int 21 The number of periods to calculate the percentage change over. 22 Typically 4 for quarterly data, and 12 for monthly data. 23 24 Returns 25 ------- 26 pandas Series or DataFrame 27 The percentage change in the data over n_periods. For DataFrame input, 28 the percentage change is calculated for each column. 29 """ 30 31 return (data / data.shift(n_periods) - 1) * 100
Calculate an percentage change in a contiguous, ordered series over n_periods.
Parameters
data : pandas Series or DataFrame The data to calculate the percentage change for. n_periods : int The number of periods to calculate the percentage change over. Typically 4 for quarterly data, and 12 for monthly data.
Returns
pandas Series or DataFrame The percentage change in the data over n_periods. For DataFrame input, the percentage change is calculated for each column.
34def annualise_rates(data: DataT, periods_per_year: int | float = 12) -> DataT: 35 """Annualise a growth rate for a period. 36 Note: returns a percentage value (and not a rate)! 37 38 Parameters 39 ---------- 40 data : pandas Series or DataFrame 41 The growth rate to annualise. Note a growth rate of 0.05 is 5%. 42 periods_per_year : int or float, default 12 43 The number of periods in a year. For monthly data, this is 12. 44 45 Returns 46 ------- 47 pandas Series or DataFrame 48 The annualised growth expressed as a percentage (not a rate). 49 For DataFrame input, the annualised growth rate is calculated 50 for each column.""" 51 return (((1 + data) ** periods_per_year) - 1) * 100
Annualise a growth rate for a period. Note: returns a percentage value (and not a rate)!
Parameters
data : pandas Series or DataFrame The growth rate to annualise. Note a growth rate of 0.05 is 5%. periods_per_year : int or float, default 12 The number of periods in a year. For monthly data, this is 12.
Returns
pandas Series or DataFrame The annualised growth expressed as a percentage (not a rate). For DataFrame input, the annualised growth rate is calculated for each column.
54def annualise_percentages(data: DataT, periods_per_year: int | float = 12) -> DataT: 55 """Annualise a growth rate (expressed as a percentage) for a period. 56 57 Parameters 58 ---------- 59 data : pandas Series or DataFrame 60 The growth rate (expresed as a percentage) to annualise. Note a 61 growth percentage of 5% is a growth rate of 0.05. 62 periods_per_year : int or float, default 12 63 The number of periods in a year. For monthly data, this is 12. 64 65 Returns 66 ------- 67 pandas Series or DataFrame 68 The annualised growth expressed as a percentage. For DataFrame input, 69 the annualised growth rate is calculated for each column.""" 70 71 rates = data / 100.0 72 return annualise_rates(rates, periods_per_year)
Annualise a growth rate (expressed as a percentage) for a period.
Parameters
data : pandas Series or DataFrame The growth rate (expresed as a percentage) to annualise. Note a growth percentage of 5% is a growth rate of 0.05. periods_per_year : int or float, default 12 The number of periods in a year. For monthly data, this is 12.
Returns
pandas Series or DataFrame The annualised growth expressed as a percentage. For DataFrame input, the annualised growth rate is calculated for each column.
75def qtly_to_monthly( 76 data: DataT, 77 interpolate: bool = True, 78 limit: Optional[int] = 2, # only used if interpolate is True 79 dropna: bool = True, 80) -> DataT: 81 """Convert a pandas timeseries with a Quarterly PeriodIndex to an 82 timeseries with a Monthly PeriodIndex. 83 84 Parameters 85 ---------- 86 data - either a pandas Series or DataFrame - assumes the index is unique. 87 The data to convert to monthly frequency. 88 interpolate: bool, default True 89 Whether to interpolate the missing monthly data. 90 limit: int, default 2 91 The maximum number of consecutive missing months to interpolate. 92 dropna: bool, default True 93 Whether to drop NA data 94 95 Returns 96 ------- 97 pandas Series or DataFrame 98 The data with a Monthly PeriodIndex. If interpolate is True, the 99 missing monthly data is interpolated. If dropna is True, any NA 100 data is removed.""" 101 102 # sanity checks 103 assert isinstance(data.index, PeriodIndex) 104 assert data.index.freqstr[0] == "Q" 105 assert data.index.is_unique 106 assert data.index.is_monotonic_increasing 107 108 def set_axis_monthly_periods(x: DataT) -> DataT: 109 """Convert a DatetimeIndex to a Monthly PeriodIndex.""" 110 111 return x.set_axis( 112 labels=cast(DatetimeIndex, x.index).to_period(freq="M"), axis="index" 113 ) 114 115 # do the heavy lifting 116 data = ( 117 data.set_axis( 118 labels=data.index.to_timestamp(how="end"), axis="index", copy=True 119 ) 120 .resample(rule="ME") # adds in every missing month 121 .first(min_count=1) # generates nans for new months 122 # assumes only one value per quarter (ie. unique index) 123 .pipe(set_axis_monthly_periods) 124 ) 125 126 if interpolate: 127 data = data.interpolate(limit_area="inside", limit=limit) 128 if dropna: 129 data = data.dropna() 130 131 return data
Convert a pandas timeseries with a Quarterly PeriodIndex to an timeseries with a Monthly PeriodIndex.
Parameters
data - either a pandas Series or DataFrame - assumes the index is unique. The data to convert to monthly frequency. interpolate: bool, default True Whether to interpolate the missing monthly data. limit: int, default 2 The maximum number of consecutive missing months to interpolate. dropna: bool, default True Whether to drop NA data
Returns
pandas Series or DataFrame The data with a Monthly PeriodIndex. If interpolate is True, the missing monthly data is interpolated. If dropna is True, any NA data is removed.
134def monthly_to_qtly(data: DataT, q_ending="DEC", f: str = "mean") -> DataT: 135 """Convert monthly data to quarterly data by taking the mean (or sum) 136 of the three months in each quarter. Ignore quarters with less than 137 or more than three months data. Drop NA items. Change f to "sum" 138 for a quarterly sum. 139 140 Parameters 141 ---------- 142 data : pandas Series or DataFrame 143 The data to convert to quarterly frequency. 144 q_ending : str, default DEC 145 The month in which the quarter ends. For example, "DEC" for December. 146 f : str, default "mean" 147 The function to apply to the three months in each quarter. 148 Change to "sum" for a quarterly sum. The default is a 149 quarterly mean. 150 151 Returns 152 ------- 153 pandas Series or DataFrame 154 The data with a quarterly PeriodIndex. If a quarter has less than 155 three months data, the quarter is dropped. If the quarter has more 156 than three months data, the quarter is dropped. Any NA data is removed. 157 For DataFrame input, the function is applied to each column.""" 158 159 if isinstance(data, Series): 160 return _monthly_to_qtly_series(data, q_ending, f) 161 162 if isinstance(data, DataFrame): 163 chamber = {} 164 for col in data.columns: 165 chamber[col] = _monthly_to_qtly_series(data[col], q_ending, f) 166 return DataFrame(chamber) 167 168 raise ValueError("data must be a pandas Series or DataFrame")
Convert monthly data to quarterly data by taking the mean (or sum) of the three months in each quarter. Ignore quarters with less than or more than three months data. Drop NA items. Change f to "sum" for a quarterly sum.
Parameters
data : pandas Series or DataFrame The data to convert to quarterly frequency. q_ending : str, default DEC The month in which the quarter ends. For example, "DEC" for December. f : str, default "mean" The function to apply to the three months in each quarter. Change to "sum" for a quarterly sum. The default is a quarterly mean.
Returns
pandas Series or DataFrame The data with a quarterly PeriodIndex. If a quarter has less than three months data, the quarter is dropped. If the quarter has more than three months data, the quarter is dropped. Any NA data is removed. For DataFrame input, the function is applied to each column.
15def recalibrate( 16 data: DataT, 17 units: str, 18) -> tuple[DataT, str]: 19 """Recalibrate a Series or DataFrame so the data in in the range -1000 to 1000. 20 Change the name of the units to reflect the recalibration. 21 22 Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar. 23 If you provide a Series, you will get a Series back. If you provide a DataFrame, 24 you will get a DataFrame back. 25 26 Parameters 27 ---------- 28 data : Series or DataFrame 29 The data to recalibrate. 30 units : str 31 The units of the data. This string should be in the form of 32 "Number", "Thousands", "Millions", "Billions", etc. The units 33 should be in title case. 34 35 Returns 36 ------- 37 Series or DataFrame 38 The recalibrated data will be a Series if a Series was provided, 39 or a DataFrame if a DataFrame was provided. 40 41 Examples 42 -------- 43 ```python 44 from pandas import Series 45 from readabs import recalibrate 46 s = Series([1_000, 10_000, 100_000, 1_000_000]) 47 recalibrated, units = recalibrate(s, "$") 48 print(f"{recalibrated=}, {units=}") 49 ```""" 50 51 if not isinstance(data, (Series, DataFrame)): 52 raise TypeError("data must be a Series or DataFrame") 53 units, restore_name = _prepare_units(units) 54 flat_data = data.to_numpy().flatten() 55 flat_data, units = _recalibrate(flat_data, units) 56 57 if restore_name: 58 units = f"{restore_name} {units}" 59 for n in "numbers", "number": 60 if n in units: 61 units = units.replace(n, "").strip() 62 break 63 units = units.title() 64 65 restore_pandas = DataFrame if len(data.shape) == 2 else Series 66 result = restore_pandas(flat_data.reshape(data.shape)) 67 result.index = data.index 68 if len(data.shape) == 2: 69 result.columns = data.columns 70 if len(data.shape) == 1: 71 result.name = data.name 72 return result, units
Recalibrate a Series or DataFrame so the data in in the range -1000 to 1000. Change the name of the units to reflect the recalibration.
Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar. If you provide a Series, you will get a Series back. If you provide a DataFrame, you will get a DataFrame back.
Parameters
data : Series or DataFrame The data to recalibrate. units : str The units of the data. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.
Returns
Series or DataFrame The recalibrated data will be a Series if a Series was provided, or a DataFrame if a DataFrame was provided.
Examples
from pandas import Series
from readabs import recalibrate
s = Series([1_000, 10_000, 100_000, 1_000_000])
recalibrated, units = recalibrate(s, "$")
print(f"{recalibrated=}, {units=}")
75def recalibrate_value(value: float, units: str) -> tuple[float, str]: 76 """Recalibrate a floating point value. The value will be recalibrated 77 so it is in the range -1000 to 1000. The units will be changed to reflect 78 the recalibration. 79 80 Parameters 81 ---------- 82 value : float 83 The value to recalibrate. 84 units : str 85 The units of the value. This string should be in the form of 86 "Number", "Thousands", "Millions", "Billions", etc. The units 87 should be in title case. 88 89 Returns 90 ------- 91 tuple[float, str] 92 A tuple containing the recalibrated value and the recalibrated units. 93 94 Examples 95 -------- 96 ```python 97 from readabs import recalibrate_value 98 recalibrated, units = recalibrate_value(10_000_000, "Thousand") 99 print(recalibrated, units) 100 ```""" 101 102 series = Series([value]) 103 output, units = recalibrate(series, units) 104 return output.values[0], units
Recalibrate a floating point value. The value will be recalibrated so it is in the range -1000 to 1000. The units will be changed to reflect the recalibration.
Parameters
value : float The value to recalibrate. units : str The units of the value. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.
Returns
tuple[float, str] A tuple containing the recalibrated value and the recalibrated units.
Examples
from readabs import recalibrate_value
recalibrated, units = recalibrate_value(10_000_000, "Thousand")
print(recalibrated, units)