readabs.abs_catalogue
Catalogue map for ABS data.
"""Catalogue map for ABS data."""

from functools import cache
from io import StringIO
from pandas import DataFrame, Series, Index, read_html
from readabs.download_cache import get_file


@cache
def abs_catalogue(cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return a DataFrame of ABS Catalogue numbers. In the first instance,
    this is downloaded from the ABS website, and cached for future use.

    The result is also memoised in-process by `functools.cache`, so
    repeated calls with the same arguments return the very same DataFrame
    object. Take a copy before modifying it.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        A DataFrame indexed by "Catalogue ID", with the columns "Theme",
        "Parent Topic", "Topic", "URL" and "Status". "Status" is "Ceased"
        where the catalogue number was marked "(Ceased)" on the web page,
        and a single space otherwise.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.abs_catalogue()
    ```"""

    # get ABS web page of catalogue numbers
    url = "https://www.abs.gov.au/about/data-services/help/abs-time-series-directory"
    abs_bytes = get_file(url, cache_only=cache_only, verbose=verbose)
    # extract_links="body" turns every body cell into a (text, href) tuple
    links = read_html(StringIO(abs_bytes.decode("utf-8")), extract_links="body")[
        1
    ]  # second table on the page

    # extract catalogue numbers (cell text) and topic hyperlinks (cell href)
    cats = links["Catalogue Number"].apply(Series)[0]
    urls = links["Topic"].apply(Series)[1]

    # Reduce each URL to its path below the statistics root. regex=False is
    # passed explicitly because pandas < 2.0 treated the pattern as a regular
    # expression by default, which mis-handles these literal strings.
    root = "https://www.abs.gov.au/statistics/"
    paths = urls.str.replace(root, "", regex=False)
    # remove bad cases: anything still holding "http" is not under the root
    paths = paths[~paths.str.contains("http")]
    snip = paths.str.replace("-", " ", regex=False).str.title()

    # the first three path components become the topic hierarchy
    frame = snip.str.split("/", expand=True).iloc[:, :3]
    frame.columns = Index(["Theme", "Parent Topic", "Topic"])
    frame["URL"] = urls

    # keep only the catalogue numbers for the rows that survived the filter
    cats = cats[frame.index]
    # "(Ceased)" must be matched literally; as a regex it is a capture group
    # and would leave a stray "()" in the catalogue ID
    cat_index = cats.str.replace("(Ceased)", "", regex=False).str.strip()
    # rows whose text changed when "(Ceased)" was stripped are ceased series
    status = Series(" ", index=cats.index).where(cat_index == cats, "Ceased")
    frame["Status"] = status
    frame.index = Index(cat_index)
    frame.index.name = "Catalogue ID"
    return frame


if __name__ == "__main__":
    print(abs_catalogue())
@cache
def
abs_catalogue(cache_only=False, verbose=False) -> pandas.core.frame.DataFrame:
@cache
def abs_catalogue(cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return a DataFrame of ABS Catalogue numbers. In the first instance,
    this is downloaded from the ABS website, and cached for future use.

    The result is also memoised in-process by `functools.cache`, so
    repeated calls with the same arguments return the very same DataFrame
    object. Take a copy before modifying it.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        A DataFrame indexed by "Catalogue ID", with the columns "Theme",
        "Parent Topic", "Topic", "URL" and "Status". "Status" is "Ceased"
        where the catalogue number was marked "(Ceased)" on the web page,
        and a single space otherwise.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.abs_catalogue()
    ```"""

    # get ABS web page of catalogue numbers
    url = "https://www.abs.gov.au/about/data-services/help/abs-time-series-directory"
    abs_bytes = get_file(url, cache_only=cache_only, verbose=verbose)
    # extract_links="body" turns every body cell into a (text, href) tuple
    links = read_html(StringIO(abs_bytes.decode("utf-8")), extract_links="body")[
        1
    ]  # second table on the page

    # extract catalogue numbers (cell text) and topic hyperlinks (cell href)
    cats = links["Catalogue Number"].apply(Series)[0]
    urls = links["Topic"].apply(Series)[1]

    # Reduce each URL to its path below the statistics root. regex=False is
    # passed explicitly because pandas < 2.0 treated the pattern as a regular
    # expression by default, which mis-handles these literal strings.
    root = "https://www.abs.gov.au/statistics/"
    paths = urls.str.replace(root, "", regex=False)
    # remove bad cases: anything still holding "http" is not under the root
    paths = paths[~paths.str.contains("http")]
    snip = paths.str.replace("-", " ", regex=False).str.title()

    # the first three path components become the topic hierarchy
    frame = snip.str.split("/", expand=True).iloc[:, :3]
    frame.columns = Index(["Theme", "Parent Topic", "Topic"])
    frame["URL"] = urls

    # keep only the catalogue numbers for the rows that survived the filter
    cats = cats[frame.index]
    # "(Ceased)" must be matched literally; as a regex it is a capture group
    # and would leave a stray "()" in the catalogue ID
    cat_index = cats.str.replace("(Ceased)", "", regex=False).str.strip()
    # rows whose text changed when "(Ceased)" was stripped are ceased series
    status = Series(" ", index=cats.index).where(cat_index == cats, "Ceased")
    frame["Status"] = status
    frame.index = Index(cat_index)
    frame.index.name = "Catalogue ID"
    return frame
Return a DataFrame of ABS Catalogue numbers. In the first instance, this is downloaded from the ABS website, and cached for future use.
Parameters
cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.
Returns
DataFrame A DataFrame of ABS Catalogue numbers, indexed by "Catalogue ID", with the columns "Theme", "Parent Topic", "Topic", "URL" and "Status".
Example
import readabs as ra
catalogue = ra.abs_catalogue()