readabs.grab_abs_url

Find and extract DataFrames from an ABS webpage.

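A minimal usage sketch (assuming readabs is installed and exposes
grab_abs_url at the package level; the catalogue number and URL below are
the ones exercised by the test code at the bottom of this module):

    from readabs import grab_abs_url  # assumed package-level re-export

    # look the landing page up by ABS catalogue number ...
    frames = grab_abs_url(cat="6202.0")

    # ... or point directly at an ABS landing page
    frames = grab_abs_url(
        url="https://www.abs.gov.au/statistics/labour/jobs/"
        "weekly-payroll-jobs/latest-release"
    )

    for key in frames:
        print(key)  # keys follow the "table---sheet" naming pattern
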
  1"""Find and extract DataFrames from an ABS webpage."""
  2
  3# --- imports ---
  4# standard library imports
  5import zipfile
  6from functools import cache
  7from io import BytesIO
  8from typing import Any
  9
 10# analytic imports
 11import pandas as pd
 12from pandas import DataFrame
 13
 14# local imports
 15from readabs.get_abs_links import get_abs_links, get_table_name
 16from readabs.read_support import check_kwargs, get_args, HYPHEN
 17from readabs.download_cache import get_file
 18from readabs.abs_catalogue import abs_catalogue
 19
 20
 21# --- public - primary entry point for this module
 22@cache  # minimise slowness with repeat business
 23def grab_abs_url(
 24    url: str = "",
 25    **kwargs: Any,
 26) -> dict[str, DataFrame]:
 27    """For a given URL, extract the data from the Excel and ZIP file
 28    links found on that page. The data is returned as a dictionary of
 29    DataFrames. The Excel files are converted into DataFrames, with
 30    each sheet in each Excel file becoming a separate DataFrame. ZIP
 31    files are examined for Excel files, which are similarly converted into
 32    DataFrames. The dictionary of DataFrames is returned.
 33
 34    The preferred mechanism for reading ABS data is to use the `read_abs_cat()`
 35    or `read_abs_series()` functions. This function is provided for those
 36    cases where the data is not available in the ABS catalogue, where the
 37    data is not a timeseries, or where the user wants to extract data from
 38    a specific ABS landingpage.
 39
 40
 41    Parameters
 42    ----------
 43    url : str = ""
 44        A URL for an ABS Catalogue landing page. Either a url or
 45        a catalogue number must be provided. If both are provided, the
 46        URL will be used.
 47
 48    **kwargs : Any
 49        Accepts the same keyword arguments as `read_abs_cat()`. Additionally,
 50        a cat argument can be provided, which will be used to get the URL
 51        (see below).
 52
 53    cat : str = ""
 54        An ABS Catalogue number. If provided, and the URL is not
 55        provided, then the Catalogue number will be used to get the URL.
 56
 57    Returns
 58    -------
 59    dict[str, DataFrame]
 60        A dictionary of DataFrames."""

    # check/get the keyword arguments
    url = _get_url(url, kwargs)  # note: removes "cat" from kwargs
    check_kwargs(kwargs, "grab_abs_url")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_url")  # get the valid kwargs
    if verbose := args["verbose"]:
        print(f"grab_abs_url(): {url=}, {args=}")

    # get the URL links to the relevant ABS data files on that webpage
    links = get_abs_links(url, **args)
    if not links:
        print(f"No data files found at URL: {url}")
        return {}  # return an empty dictionary

    # read the data files into a dictionary of DataFrames
    abs_dict: dict[str, DataFrame] = {}

    # use the args, and the found links to get the data ...
    if args["single_excel_only"]:
        link = _find_url(links, ".xlsx", args["single_excel_only"], verbose)
        if link:
            abs_dict = _add_excel(abs_dict, link, **args)
            return abs_dict

    if args["single_zip_only"]:
        link = _find_url(links, ".zip", args["single_zip_only"], verbose)
        if link:
            abs_dict = _add_zip(abs_dict, link, **args)
            return abs_dict

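    # note: ".zip" is handled before ".xlsx" so that, via the duplicate-name
    # check in _add_excel(), tables already extracted from a zip archive are
    # not downloaded a second time as stand-alone Excel files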
    for link_type in ".zip", ".xlsx":  # .zip must come first
        for link in links.get(link_type, []):

            if link_type == ".zip" and args["get_zip"]:
                abs_dict = _add_zip(abs_dict, link, **args)

            elif link_type == ".xlsx":
                if (
                    args["get_excel"]
                    or (args["get_excel_if_no_zip"] and not args["get_zip"])
                    or (args["get_excel_if_no_zip"] and not links.get(".zip", []))
                ):
                    abs_dict = _add_excel(abs_dict, link, **args)

    return abs_dict


# --- private
def _find_url(
    links: dict[str, list[str]], targ_type: str, target: str, verbose: bool = False
) -> str:
    """Find the URL for a target file type.
    Returns the URL if found, otherwise an empty string."""

    targ_list = links.get(targ_type, [])
    if not targ_list:
        return ""
    goal = f"{target}{targ_type}"
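    # e.g. target="6202001" with targ_type=".xlsx" yields goal="6202001.xlsx"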
    if verbose:
        print(f"_find_url(): looking for {goal} in {targ_list}.")
    for link in targ_list:
        if link.endswith(goal):
            return link
    return ""


def _get_url(url: str, kwargs: dict) -> str:
    """If an ABS 'cat' is provided and a URL is not, get the URL for
    the ABS data files from the ABS catalogue. Otherwise, return the
    URL provided. Either the 'url' or 'cat' argument must be provided.

    Note: kwargs is passed as a dictionary so that it can be
    modified in place. This is a common Python idiom."""

    cat: str = kwargs.pop("cat", "")  # this takes cat out of kwargs
    cat_map = abs_catalogue()
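    # abs_catalogue() returns a table indexed by catalogue number with a "URL" column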
    if not url and cat and cat in cat_map.index:
        url = str(cat_map.loc[cat, "URL"])
    if not url:
        raise ValueError("_get_url(): no URL/cat provided.")

    return url


def _add_zip(abs_dict: dict[str, DataFrame], link: str, **args) -> dict[str, DataFrame]:
    """Read in the zip file at the URL in the 'link' argument.
    Iterate over the contents of that zip file, calling
    _add_excel_bytes() to put those contents into the dictionary of
    DataFrames given by 'abs_dict'. When done, return the dictionary
    of DataFrames."""

    zip_contents = get_file(link, **args)
    if len(zip_contents) == 0:
        return abs_dict

    with zipfile.ZipFile(BytesIO(zip_contents)) as zipped:
        for element in zipped.infolist():
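            # each archive member is assumed to be an Excel workbook;
            # _add_excel_bytes() prints a message and skips anything it cannot parse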

            # get the zipped workbook into pandas
            table_name = get_table_name(url=element.filename)
            raw_bytes = zipped.read(element.filename)
            abs_dict = _add_excel_bytes(abs_dict, raw_bytes, table_name, args)

    return abs_dict


def _add_excel_bytes(
    abs_dict: dict[str, DataFrame],
    raw_bytes: bytes,
    name: str,
    args: Any,
) -> dict[str, DataFrame]:
    """Assume the bytes in 'raw_bytes' represent an Excel file.
    Convert each sheet in the Excel file to a DataFrame, and place
    those DataFrames in the dictionary of DataFrames given by
    'abs_dict', using 'name---sheet_name' as a key.
    When done, return the dictionary of DataFrames."""

    verbose = args.get("verbose", False)

    if len(raw_bytes) == 0:
        if verbose:
            print("_add_excel_bytes(): the raw bytes are empty.")
        return abs_dict

    # convert the raw bytes into a pandas ExcelFile
    try:
        excel = pd.ExcelFile(BytesIO(raw_bytes))
    except Exception as e:  # pylint: disable=broad-exception-caught
        message = f"With {name}: could not convert raw bytes to ExcelFile.\n{e}"
        print(message)
        return abs_dict

    # iterate over the sheets in the Excel file
    for sheet_name in excel.sheet_names:
        # grab and go - no treatment of the data
        sheet_data = excel.parse(
            sheet_name,
        )
        if len(sheet_data) == 0:
            if verbose:
                print(f"_add_excel_bytes(): sheet {sheet_name} in {name} is empty.")
            continue
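        # each key is e.g. "6202001---Data1" when HYPHEN is "---"
        # (matching the 'name---sheet_name' pattern in the docstring)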
        abs_dict[f"{name}{HYPHEN}{sheet_name}"] = sheet_data

    # return the dictionary of DataFrames
    return abs_dict


def _add_excel(
    abs_dict: dict[str, DataFrame],
    link: str,
    **args: Any,
) -> dict[str, DataFrame]:
    """Read in an Excel file at the URL in the 'link' argument.
    Pass those bytes to _add_excel_bytes() to put the contents
    into the dictionary of DataFrames given by 'abs_dict'. When done,
    return the dictionary of DataFrames."""

    name = get_table_name(link)

    if any(key.startswith(f"{name}{HYPHEN}") for key in abs_dict):
        # table already in the dictionary (keys are "name---sheet" strings,
        # so match on the name prefix rather than on 'name' alone)
        return abs_dict

    raw_bytes = get_file(link, **args)

    abs_dict = _add_excel_bytes(abs_dict, raw_bytes, name, args)

    return abs_dict


# --- main ---
if __name__ == "__main__":

    def simple_test() -> None:
        """Simple test of the grab_abs_url function."""

        def test(name: str, **kwargs: Any) -> None:
            print(f"TEST -- {name}")
            data_dict = grab_abs_url(**kwargs)
            print("---")
            if not data_dict:
                print("PROBLEM -- No data found.")
            print(data_dict.keys())
            print(f"Done.\n{'=' * 20}\n")

        name = "1 -- grab a single zip file"
        test(
            name,
            cat="6291.0.55.001",
            single_zip_only="p6291_all_quartely_spreadsheets",
            get_zip=True,
            verbose=True,
        )

        name = "2 -- grab a single Excel file"
        test(
            name,
            cat="6202.0",
            get_excel=True,
            single_excel_only="6202001",
            verbose=False,
        )

        # 3 -- grab the whole shebang
        urls = [
            "https://www.abs.gov.au/statistics/labour/jobs/"
            + "weekly-payroll-jobs/latest-release",
            "https://www.abs.gov.au/statistics/people/population/"
            + "national-state-and-territory-population/dec-2023",
        ]
        for i, url_ in enumerate(urls):
            name = f"3.{i} -- grab the whole shebang {url_}"
            test(name, url=url_, verbose=True)

    simple_test()