readabs.rba_catalogue
Extract links to RBA data files from the RBA website.
1"""Extract links to RBA data files from the RBA website.""" 2 3# system imports 4import re 5from typing import Any 6from functools import cache 7 8# analutic imports 9from bs4 import BeautifulSoup 10from pandas import DataFrame 11 12# local imports 13from readabs.download_cache import get_file, HttpError, CacheError 14 15 16# --- public functions --- 17@cache 18def rba_catalogue(cache_only=False, verbose=False) -> DataFrame: 19 """Return a DataFrame of RBA Catalogue numbers. In the first instance, 20 this is downloaded from the RBA website, and cached for future use. 21 22 Parameters 23 ---------- 24 cache_only : bool = False 25 If True, only use the cache. 26 verbose : bool = False 27 If True, print progress messages. 28 29 Returns 30 ------- 31 DataFrame 32 A DataFrame of RBA Catalogue numbers. 33 34 Example 35 ------- 36 ```python 37 import readabs as ra 38 catalogue = ra.rba_catalogue() 39 ```""" 40 41 return _get_rba_links(cache_only=cache_only, verbose=verbose) 42 43 44def print_rba_catalogue(cache_only=False, verbose=False) -> None: 45 """This function prints to standard output a table of the RBA 46 Catalogue Numbers. 47 48 Parameters 49 ---------- 50 cache_only : bool = False 51 If True, only use the cache. 52 verbose : bool = False 53 If True, print progress messages. 54 55 Return values 56 ------------- 57 58 The function does not return anything. 59 60 Example 61 ------- 62 63 ```python 64 import readabs as ra 65 ra.print_rba_catalogue() 66 ```""" 67 68 rba_catalog = rba_catalogue(cache_only=cache_only, verbose=verbose) 69 print(rba_catalog.loc[:, rba_catalog.columns != "URL"].to_markdown()) 70 71 72# --- private functions --- 73def _get_soup(url: str, **kwargs: Any) -> BeautifulSoup | None: 74 """Return a BeautifulSoup object from a URL. 75 Returns None on error.""" 76 77 try: 78 page = get_file(url, **kwargs) 79 except (HttpError, CacheError) as e: 80 print(f"Error: {e}") 81 return None 82 83 # remove those pesky span tags - possibly not necessary 84 page = re.sub(b"<span[^>]*>", b" ", page) 85 page = re.sub(b"</span>", b" ", page) 86 page = re.sub(b"\\s+", b" ", page) # tidy up white space 87 88 return BeautifulSoup(page, "html.parser") 89 90 91def _historical_name_fix( 92 moniker: str, 93 foretext: str, 94 prefix: str, 95) -> tuple[str, str]: 96 """Fix the historical data names. Returns a tuple of moniker and foretext.""" 97 98 if "Exchange Rates" in foretext: 99 foretext = f"{foretext} - {moniker}" 100 moniker = "F11.1" 101 102 for word in ["Daily", "Monthly", "Detailed", "Summary", "Allotted"]: 103 if word in foretext: 104 moniker = f"{moniker}-{word}" 105 break 106 107 last = foretext.rsplit(" ", 1)[-1] 108 if re.match(r"\d{4}", last): 109 moniker = f"{moniker}-{last}" 110 111 moniker = f"{prefix}{moniker}" 112 113 return moniker, foretext 114 115 116def _excel_link_capture( 117 soup: BeautifulSoup, 118 prefix: str, 119) -> dict[str, dict[str, str]]: 120 """Capture all links (of Microsoft Excel types) from the 121 BeautifulSoup object. Returns a dictionary with the following 122 structure: {moniker: {"Description": text, "URL": url}}.""" 123 124 # The RBA has a number of historic tables that are not well 125 # formated. We will exclude these from the dictionary. 126 historic_exclusions = ("E4", "E5", "E6", "E7", "J1", "J2") 127 128 link_dict = {} 129 for link in soup.findAll("a"): 130 131 url = link.get("href").strip() 132 if not url or url is None: 133 continue 134 135 tail = url.rsplit("/", 1)[-1].lower() 136 if "." 
not in tail: 137 continue 138 if not tail.endswith(".xls") and not tail.endswith(".xlsx"): 139 continue 140 text, url = link.text, _make_absolute_url(url.strip()) 141 text = text.replace("–", "-").strip() 142 143 pair = text.rsplit(" - ", 1) 144 if len(pair) != 2: 145 continue 146 foretext, moniker = pair 147 148 if prefix: 149 # Remove historical data that does not easily 150 # parse under the same rules as for the current data. 151 if moniker in historic_exclusions: 152 continue 153 if "Occasional Paper" in moniker: 154 continue 155 156 # The historical data is a bit ugly. Let's clean it up. 157 moniker, foretext = _historical_name_fix(moniker, foretext, prefix) 158 159 if moniker in link_dict: 160 print(f"Warning: {moniker} already exists in the dictionary {tail}") 161 if tail != ".xlsx": 162 # do not replace a .xlsx link with an .xls link 163 continue 164 link_dict[moniker] = {"Description": foretext.strip(), "URL": url} 165 166 return link_dict 167 168 169@cache 170def _get_rba_links(**kwargs: Any) -> DataFrame: 171 """Extract links to RBA data files in Excel format 172 from the RBA website. Returns a DataFrame with the 173 following columns: 'Description' and 'URL'. The index 174 is the 'Table' number. Returns an empty DataFrame on error.""" 175 176 urls = [ 177 # (url, prefix) 178 ("https://www.rba.gov.au/statistics/tables/", ""), # current 179 ("https://www.rba.gov.au/statistics/historical-data.html", "Z:"), # history 180 ] 181 182 link_dict = {} 183 for url, prefix in urls: 184 soup = _get_soup(url, **kwargs) 185 if soup is not None: 186 link_dict.update(_excel_link_capture(soup, prefix)) 187 188 rba_catalog = DataFrame(link_dict).T.sort_index() 189 rba_catalog.index.name = "Table" 190 return rba_catalog 191 192 193# private 194def _make_absolute_url(url: str, prefix: str = "https://www.rba.gov.au") -> str: 195 """Convert a relative URL address found on the RBA site to 196 an absolute URL address.""" 197 198 # remove a prefix if it already exists (just to be sure) 199 url = url.replace(prefix, "") 200 url = url.replace(prefix.replace("https://", "http://"), "") 201 # then add the prefix (back) ... 202 return f"{prefix}{url}" 203 204 205# --- testing --- 206if __name__ == "__main__": 207 print_rba_catalogue(cache_only=False, verbose=False)
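The private helpers are easiest to follow by example. The sketch below imports them directly, purely for illustration (they are not part of the public API), and runs them on made-up inputs modelled on the RBA pages; the file path and link text are hypothetical.

```python
from readabs.rba_catalogue import _historical_name_fix, _make_absolute_url

# Relative links found on the RBA site are anchored to the RBA domain
# (the path below is hypothetical).
print(_make_absolute_url("/statistics/tables/xls/example.xls"))
# https://www.rba.gov.au/statistics/tables/xls/example.xls

# A made-up historical "Exchange Rates" entry: the moniker is rewritten
# to F11.1, tagged with the frequency word found in the description,
# suffixed with the trailing year, and given the "Z:" history prefix.
moniker, foretext = _historical_name_fix("1983 to 1986", "Exchange Rates - Daily", "Z:")
print(moniker)   # Z:F11.1-Daily-1986
print(foretext)  # Exchange Rates - Daily - 1983 to 1986
```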
```python
@cache
def rba_catalogue(cache_only=False, verbose=False) -> pandas.core.frame.DataFrame:
```
````python
@cache
def rba_catalogue(cache_only=False, verbose=False) -> DataFrame:
    """Return a DataFrame of RBA Catalogue numbers. In the first instance,
    this is downloaded from the RBA website, and cached for future use.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        A DataFrame of RBA Catalogue numbers.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.rba_catalogue()
    ```"""

    return _get_rba_links(cache_only=cache_only, verbose=verbose)
````
Return a DataFrame of RBA Catalogue numbers. In the first instance, this is downloaded from the RBA website, and cached for future use.
Parameters
cache_only : bool = False
    If True, only use the cache.
verbose : bool = False
    If True, print progress messages.
Returns
DataFrame
    A DataFrame of RBA Catalogue numbers.
Example
```python
import readabs as ra
catalogue = ra.rba_catalogue()
```
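Because `_get_rba_links()` builds the frame with "Description" and "URL" columns indexed by the "Table" number, ordinary pandas selection works on the result. A minimal sketch; the table number "F11.1" is only an assumption about what the live catalogue contains, so the lookup is guarded.

```python
import readabs as ra

catalogue = ra.rba_catalogue()

# Look up one table's download link ("F11.1" is assumed to exist).
if "F11.1" in catalogue.index:
    print(catalogue.loc["F11.1", "URL"])

# Find every table whose description mentions exchange rates.
rates = catalogue[catalogue["Description"].str.contains("exchange rate", case=False)]
print(rates["Description"])
```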
```python
def print_rba_catalogue(cache_only=False, verbose=False) -> None:
```
````python
def print_rba_catalogue(cache_only=False, verbose=False) -> None:
    """Print a table of the RBA Catalogue numbers to standard output.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    None
        The table is printed as a side effect; the "URL" column is omitted.

    Example
    -------
    ```python
    import readabs as ra
    ra.print_rba_catalogue()
    ```"""

    rba_catalog = rba_catalogue(cache_only=cache_only, verbose=verbose)
    print(rba_catalog.loc[:, rba_catalog.columns != "URL"].to_markdown())
````
Prints a table of the RBA Catalogue numbers to standard output; the "URL" column is omitted.
Parameters
cache_only : bool = False
    If True, only use the cache.
verbose : bool = False
    If True, print progress messages.
Returns

None. The table is printed as a side effect.
Example
```python
import readabs as ra
ra.print_rba_catalogue()
```
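A hedged sketch of offline use: with cache_only=True only previously cached pages are read, and because `_get_soup()` catches HttpError and CacheError, a cold cache prints an error message and an empty table rather than raising.

```python
import readabs as ra

# Use only previously cached copies of the RBA pages; no network access.
# On a cold cache this prints an error from _get_soup() and an empty table.
ra.print_rba_catalogue(cache_only=True, verbose=True)
```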