readabs.abs_catalogue

Catalogue map for ABS data.

 1"""Catalogue map for ABS data."""
 2
 3from functools import cache
 4from io import StringIO
 5from pandas import DataFrame, Series, Index, read_html
 6from readabs.download_cache import get_file
 7
 8
 9@cache
def abs_catalogue(cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return a DataFrame of ABS Catalogue numbers. In the first instance,
    this is downloaded from the ABS website, and cached for future use.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache. Passed through to `get_file`.
    verbose : bool = False
        If True, print progress messages. Passed through to `get_file`.

    Returns
    -------
    DataFrame
        A DataFrame of ABS Catalogue numbers, indexed by "Catalogue ID",
        with the columns "Theme", "Parent Topic", "Topic", "URL" and
        "Status". "Status" is "Ceased" where the catalogue number was
        marked "(Ceased)" on the web page, and a single space otherwise.

    Notes
    -----
    The function is wrapped in `functools.cache`, so repeated calls with
    the same arguments return the same DataFrame object. Take a copy
    before modifying the result.

    Rows whose topic link is missing, or does not sit under
    https://www.abs.gov.au/statistics/, are dropped.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.abs_catalogue()
    ```"""

    # get ABS web page of catalogue numbers
    url = "https://www.abs.gov.au/about/data-services/help/abs-time-series-directory"
    abs_bytes = get_file(url, cache_only=cache_only, verbose=verbose)
    links = read_html(StringIO(abs_bytes.decode("utf-8")), extract_links="body")[
        1
    ]  # second table on the page

    # with extract_links="body" each cell is a (text, href) pair:
    # take the text of the catalogue number and the href of the topic
    cats = links["Catalogue Number"].apply(Series)[0]
    urls = links["Topic"].apply(Series)[1]

    # reduce each URL to its "theme/parent-topic/topic" path; regex=False
    # keeps the "." characters in the root literal on every pandas version
    root = "https://www.abs.gov.au/statistics/"
    snip = urls.str.replace(root, "", regex=False)

    # remove bad cases: links outside the statistics root still contain
    # "http"; na=True also drops rows that have no link at all
    bad = snip.str.contains("http", regex=False, na=True)
    snip = snip[~bad].str.replace("-", " ", regex=False).str.title()

    frame = snip.str.split("/", expand=True).iloc[:, :3]
    frame.columns = Index(["Theme", "Parent Topic", "Topic"])
    frame["URL"] = urls  # aligned on the surviving row labels

    # keep only the catalogue numbers for the rows that survived
    cats = cats[frame.index]

    # "(Ceased)" is a literal marker, not a regex group
    cat_index = cats.str.replace("(Ceased)", "", regex=False).str.strip()

    # flag on the marker itself, so stray whitespace in a catalogue
    # number is not mistaken for a ceased series
    ceased = cats.str.contains("(Ceased)", regex=False, na=False)
    frame["Status"] = Series(" ", index=cats.index).where(~ceased, "Ceased")

    frame.index = Index(cat_index)
    frame.index.name = "Catalogue ID"
    return frame
58
59
if __name__ == "__main__":
    # Script entry point: build the catalogue table and print it.
    catalogue = abs_catalogue()
    print(catalogue)
@cache
def abs_catalogue(cache_only=False, verbose=False) -> pandas.core.frame.DataFrame:
10@cache
def abs_catalogue(cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return a DataFrame of ABS Catalogue numbers. In the first instance,
    this is downloaded from the ABS website, and cached for future use.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache. Passed through to `get_file`.
    verbose : bool = False
        If True, print progress messages. Passed through to `get_file`.

    Returns
    -------
    DataFrame
        A DataFrame of ABS Catalogue numbers, indexed by "Catalogue ID",
        with the columns "Theme", "Parent Topic", "Topic", "URL" and
        "Status". "Status" is "Ceased" where the catalogue number was
        marked "(Ceased)" on the web page, and a single space otherwise.

    Notes
    -----
    The function is wrapped in `functools.cache`, so repeated calls with
    the same arguments return the same DataFrame object. Take a copy
    before modifying the result.

    Rows whose topic link is missing, or does not sit under
    https://www.abs.gov.au/statistics/, are dropped.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.abs_catalogue()
    ```"""

    # get ABS web page of catalogue numbers
    url = "https://www.abs.gov.au/about/data-services/help/abs-time-series-directory"
    abs_bytes = get_file(url, cache_only=cache_only, verbose=verbose)
    links = read_html(StringIO(abs_bytes.decode("utf-8")), extract_links="body")[
        1
    ]  # second table on the page

    # with extract_links="body" each cell is a (text, href) pair:
    # take the text of the catalogue number and the href of the topic
    cats = links["Catalogue Number"].apply(Series)[0]
    urls = links["Topic"].apply(Series)[1]

    # reduce each URL to its "theme/parent-topic/topic" path; regex=False
    # keeps the "." characters in the root literal on every pandas version
    root = "https://www.abs.gov.au/statistics/"
    snip = urls.str.replace(root, "", regex=False)

    # remove bad cases: links outside the statistics root still contain
    # "http"; na=True also drops rows that have no link at all
    bad = snip.str.contains("http", regex=False, na=True)
    snip = snip[~bad].str.replace("-", " ", regex=False).str.title()

    frame = snip.str.split("/", expand=True).iloc[:, :3]
    frame.columns = Index(["Theme", "Parent Topic", "Topic"])
    frame["URL"] = urls  # aligned on the surviving row labels

    # keep only the catalogue numbers for the rows that survived
    cats = cats[frame.index]

    # "(Ceased)" is a literal marker, not a regex group
    cat_index = cats.str.replace("(Ceased)", "", regex=False).str.strip()

    # flag on the marker itself, so stray whitespace in a catalogue
    # number is not mistaken for a ceased series
    ceased = cats.str.contains("(Ceased)", regex=False, na=False)
    frame["Status"] = Series(" ", index=cats.index).where(~ceased, "Ceased")

    frame.index = Index(cat_index)
    frame.index.name = "Catalogue ID"
    return frame

Return a DataFrame of ABS Catalogue numbers. In the first instance, this is downloaded from the ABS website, and cached for future use.

Parameters

- cache_only : bool = False — If True, only use the cache.
- verbose : bool = False — If True, print progress messages.

Returns

DataFrame — A DataFrame of ABS Catalogue numbers.

Example

import readabs as ra
catalogue = ra.abs_catalogue()