readabs.rba_catalogue

Extract links to RBA data files from the RBA website.
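
For orientation, a minimal usage sketch based on the two public functions documented below (rba_catalogue returns the catalogue as a DataFrame; print_rba_catalogue writes it to standard output):

```python
import readabs as ra

# Download (or load from cache) the catalogue of RBA tables.
catalogue = ra.rba_catalogue()

# Or print the catalogue as a markdown table to standard output.
ra.print_rba_catalogue()
```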

  1"""Extract links to RBA data files from the RBA website."""
  2
  3# system imports
  4import re
  5from typing import Any
  6from functools import cache
  7
  8# analytic imports
  9from bs4 import BeautifulSoup
 10from pandas import DataFrame
 11
 12# local imports
 13from readabs.download_cache import get_file, HttpError, CacheError
 14
 15
 16# --- public functions ---
 17@cache
 18def rba_catalogue(cache_only=False, verbose=False) -> DataFrame:
 19    """Return a DataFrame of RBA Catalogue numbers. In the first instance,
 20    this is downloaded from the RBA website, and cached for future use.
 21
 22    Parameters
 23    ----------
 24    cache_only : bool = False
 25        If True, only use the cache.
 26    verbose : bool = False
 27        If True, print progress messages.
 28
 29    Returns
 30    -------
 31    DataFrame
 32        A DataFrame of RBA Catalogue numbers.
 33
 34    Example
 35    -------
 36    ```python
 37    import readabs as ra
 38    catalogue = ra.rba_catalogue()
 39    ```"""
 40
 41    return _get_rba_links(cache_only=cache_only, verbose=verbose)
 42
 43
 44def print_rba_catalogue(cache_only=False, verbose=False) -> None:
 45    """This function prints to standard output a table of the RBA
 46    Catalogue Numbers.
 47
 48    Parameters
 49    ----------
 50    cache_only : bool = False
 51        If True, only use the cache.
 52    verbose : bool = False
 53        If True, print progress messages.
 54
  55    Returns
  56    -------
  57    None
  58        This function does not return anything.
 59
 60    Example
 61    -------
 62
 63    ```python
 64    import readabs as ra
 65    ra.print_rba_catalogue()
 66    ```"""
 67
 68    rba_catalog = rba_catalogue(cache_only=cache_only, verbose=verbose)
 69    print(rba_catalog.loc[:, rba_catalog.columns != "URL"].to_markdown())
 70
 71
 72# --- private functions ---
 73def _get_soup(url: str, **kwargs: Any) -> BeautifulSoup | None:
 74    """Return a BeautifulSoup object from a URL.
 75    Returns None on error."""
 76
 77    try:
 78        page = get_file(url, **kwargs)
 79    except (HttpError, CacheError) as e:
 80        print(f"Error: {e}")
 81        return None
 82
 83    # remove those pesky span tags - possibly not necessary
 84    page = re.sub(b"<span[^>]*>", b" ", page)
 85    page = re.sub(b"</span>", b" ", page)
 86    page = re.sub(b"\\s+", b" ", page)  # tidy up white space
 87
 88    return BeautifulSoup(page, "html.parser")
 89
 90
 91def _historical_name_fix(
 92    moniker: str,
 93    foretext: str,
 94    prefix: str,
 95) -> tuple[str, str]:
 96    """Fix the historical data names. Returns a tuple of moniker and foretext."""
 97
 98    if "Exchange Rates" in foretext:
 99        foretext = f"{foretext} - {moniker}"
100        moniker = "F11.1"
101
102    for word in ["Daily", "Monthly", "Detailed", "Summary", "Allotted"]:
103        if word in foretext:
104            moniker = f"{moniker}-{word}"
105            break
106
107    last = foretext.rsplit(" ", 1)[-1]
108    if re.match(r"\d{4}", last):
109        moniker = f"{moniker}-{last}"
110
111    moniker = f"{prefix}{moniker}"
112
113    return moniker, foretext
114
115
116def _excel_link_capture(
117    soup: BeautifulSoup,
118    prefix: str,
119) -> dict[str, dict[str, str]]:
120    """Capture all links (of Microsoft Excel types) from the
121    BeautifulSoup object. Returns a dictionary with the following
122    structure: {moniker: {"Description": text, "URL": url}}."""
123
124    # The RBA has a number of historic tables that are not well
 125    # formatted. We will exclude these from the dictionary.
126    historic_exclusions = ("E4", "E5", "E6", "E7", "J1", "J2")
127
128    link_dict = {}
 129    for link in soup.find_all("a"):
 130
 131        url = (link.get("href") or "").strip()
 132        if not url:
 133            continue
134
135        tail = url.rsplit("/", 1)[-1].lower()
136        if "." not in tail:
137            continue
138        if not tail.endswith(".xls") and not tail.endswith(".xlsx"):
139            continue
140        text, url = link.text, _make_absolute_url(url.strip())
141        text = text.replace("–", "-").strip()
142
143        pair = text.rsplit(" - ", 1)
144        if len(pair) != 2:
145            continue
146        foretext, moniker = pair
147
148        if prefix:
149            # Remove historical data that does not easily
150            # parse under the same rules as for the current data.
151            if moniker in historic_exclusions:
152                continue
153            if "Occasional Paper" in moniker:
154                continue
155
156            # The historical data is a bit ugly. Let's clean it up.
157            moniker, foretext = _historical_name_fix(moniker, foretext, prefix)
158
159        if moniker in link_dict:
160            print(f"Warning: {moniker} already exists in the dictionary {tail}")
 161            if not tail.endswith(".xlsx"):
162                # do not replace a .xlsx link with an .xls link
163                continue
164        link_dict[moniker] = {"Description": foretext.strip(), "URL": url}
165
166    return link_dict
167
168
169@cache
170def _get_rba_links(**kwargs: Any) -> DataFrame:
171    """Extract links to RBA data files in Excel format
172    from the RBA website.  Returns a DataFrame with the
173    following columns: 'Description' and 'URL'. The index
174    is the 'Table' number. Returns an empty DataFrame on error."""
175
176    urls = [
177        # (url, prefix)
178        ("https://www.rba.gov.au/statistics/tables/", ""),  # current
179        ("https://www.rba.gov.au/statistics/historical-data.html", "Z:"),  # history
180    ]
181
182    link_dict = {}
183    for url, prefix in urls:
184        soup = _get_soup(url, **kwargs)
185        if soup is not None:
186            link_dict.update(_excel_link_capture(soup, prefix))
187
188    rba_catalog = DataFrame(link_dict).T.sort_index()
189    rba_catalog.index.name = "Table"
190    return rba_catalog
191
192
193# private
194def _make_absolute_url(url: str, prefix: str = "https://www.rba.gov.au") -> str:
195    """Convert a relative URL address found on the RBA site to
196    an absolute URL address."""
197
198    # remove a prefix if it already exists (just to be sure)
199    url = url.replace(prefix, "")
200    url = url.replace(prefix.replace("https://", "http://"), "")
201    # then add the prefix (back) ...
202    return f"{prefix}{url}"
203
204
205# --- testing ---
206if __name__ == "__main__":
207    print_rba_catalogue(cache_only=False, verbose=False)
@cache
def rba_catalogue(cache_only=False, verbose=False) -> pandas.core.frame.DataFrame:

Return a DataFrame of RBA Catalogue numbers. In the first instance, this is downloaded from the RBA website, and cached for future use.

Parameters

cache_only : bool = False
    If True, only use the cache.
verbose : bool = False
    If True, print progress messages.

Returns

DataFrame
    A DataFrame of RBA Catalogue numbers.

Example

import readabs as ra
catalogue = ra.rba_catalogue()
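
The returned catalogue is indexed by the RBA table number ("Table"), with "Description" and "URL" columns (see _get_rba_links in the source above). A short follow-up sketch of looking up one table's download link; the table code "A1" is illustrative only and may not be present in the catalogue:

```python
import readabs as ra

catalogue = ra.rba_catalogue()
print(catalogue.index.name)       # "Table"
print(list(catalogue.columns))    # ["Description", "URL"]

# Look up the download URL for a single table code.
# "A1" is illustrative; use print_rba_catalogue() to see the available codes.
table = "A1"
if table in catalogue.index:
    print(catalogue.loc[table, "URL"])
```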