readabs.grab_abs_url
Find and extract DataFrames from an ABS webpage.
1"""Find and extract DataFrames from an ABS webpage.""" 2 3# --- imports --- 4# standard library imports 5import zipfile 6from functools import cache 7from io import BytesIO 8from typing import Any 9 10# analytic imports 11import pandas as pd 12from pandas import DataFrame 13 14# local imports 15from readabs.get_abs_links import get_abs_links, get_table_name 16from readabs.read_support import check_kwargs, get_args, HYPHEN 17from readabs.download_cache import get_file 18from readabs.abs_catalogue import abs_catalogue 19 20 21# --- public - primary entry point for this module 22@cache # minimise slowness with repeat business 23def grab_abs_url( 24 url: str = "", 25 **kwargs: Any, 26) -> dict[str, DataFrame]: 27 """For a given URL, extract the data from the Excel and ZIP file 28 links found on that page. The data is returned as a dictionary of 29 DataFrames. The Excel files are converted into DataFrames, with 30 each sheet in each Excel file becoming a separate DataFrame. ZIP 31 files are examined for Excel files, which are similarly converted into 32 DataFrames. The dictionary of DataFrames is returned. 33 34 The preferred mechanism for reading ABS data is to use the `read_abs_cat()` 35 or `read_abs_series()` functions. This function is provided for those 36 cases where the data is not available in the ABS catalogue, where the 37 data is not a timeseries, or where the user wants to extract data from 38 a specific ABS landingpage. 39 40 41 Parameters 42 ---------- 43 url : str = "" 44 A URL for an ABS Catalogue landing page. Either a url or 45 a catalogue number must be provided. If both are provided, the 46 URL will be used. 47 48 **kwargs : Any 49 Accepts the same keyword arguments as `read_abs_cat()`. Additionally, 50 a cat argument can be provided, which will be used to get the URL 51 (see below). 52 53 cat : str = "" 54 An ABS Catalogue number. If provided, and the URL is not 55 provided, then the Catalogue number will be used to get the URL. 
56 57 Returns 58 ------- 59 dict[str, DataFrame] 60 A dictionary of DataFrames.""" 61 62 # check/get the keyword arguments 63 url = _get_url(url, kwargs) # note: removes "cat" from kwargs 64 check_kwargs(kwargs, "grab_abs_url") # warn if invalid kwargs 65 args = get_args(kwargs, "grab_abs_url") # get the valid kwargs 66 if verbose := args["verbose"]: 67 print(f"grab_abs_url(): {url=}, {args=}") 68 69 # get the URL links to the relevant ABS data files on that webpage 70 links = get_abs_links(url, **args) 71 if not links: 72 print(f"No data files found at URL: {url}") 73 return {} # return an empty Dictionary 74 75 # read the data files into a dictionary of DataFrames 76 abs_dict: dict[str, DataFrame] = {} 77 78 # use the args, and the found links to get the data ... 79 if args["single_excel_only"]: 80 link = _find_url(links, ".xlsx", args["single_excel_only"], verbose) 81 if link: 82 abs_dict = _add_excel(abs_dict, link, **args) 83 return abs_dict 84 85 if args["single_zip_only"]: 86 link = _find_url(links, ".zip", args["single_zip_only"], verbose) 87 if link: 88 abs_dict = _add_zip(abs_dict, link, **args) 89 return abs_dict 90 91 for link_type in ".zip", ".xlsx": # .zip must come first 92 for link in links.get(link_type, []): 93 94 if link_type == ".zip" and args["get_zip"]: 95 abs_dict = _add_zip(abs_dict, link, **args) 96 97 elif link_type == ".xlsx": 98 if ( 99 args["get_excel"] 100 or (args["get_excel_if_no_zip"] and not args["get_zip"]) 101 or (args["get_excel_if_no_zip"] and not links.get(".zip", [])) 102 ): 103 abs_dict = _add_excel(abs_dict, link, **args) 104 105 return abs_dict 106 107 108# --- private 109def _find_url( 110 links: dict[str, list[str]], targ_type: str, target: str, verbose=False 111) -> str: 112 """Find the URL for a target file type. 
113 Returns the URL if found, otherwise an empty string.""" 114 115 targ_list = links.get(targ_type, []) 116 if not targ_list: 117 return "" 118 goal = f"{target}{targ_type}" 119 if verbose: 120 print(f"_find_url(): looking for {goal} in {targ_list}.") 121 for link in targ_list: 122 if link.endswith(goal): 123 return link 124 return "" 125 126 127def _get_url(url: str, kwargs: dict) -> str: 128 """If an ABS 'cat' is provided and url is not provided, 129 get the URL for the ABS data files on the ABS webpage. 130 Otherwise, return the URL provided. Either the 'url' or 131 'cat' argument must be provided. 132 133 Note: kwargs is passed as a dictionary, so that it can be 134 modified in place. This is a common Python idiom.""" 135 136 cat: str = kwargs.pop("cat", "") # this takes cat out of kwargs 137 cat_map = abs_catalogue() 138 if not url and cat and cat in cat_map.index: 139 url = str(cat_map.loc[cat, "URL"]) 140 if not url: 141 raise ValueError("_grab_url(): no URL/cat provided.") 142 143 return url 144 145 146def _add_zip(abs_dict: dict[str, DataFrame], link: str, **args) -> dict[str, DataFrame]: 147 """Read in the zip-zip file at the URL in the 'link' argument. 148 Iterate over the contents of that zip-file, calling 149 _add_excel_bytes() to put those contents into the dictionary of 150 DataFrames given by 'abs_dict'. 
When done, return the dictionary 151 of DataFrames.""" 152 153 zip_contents = get_file(link, **args) 154 if len(zip_contents) == 0: 155 return abs_dict 156 157 with zipfile.ZipFile(BytesIO(zip_contents)) as zipped: 158 for element in zipped.infolist(): 159 160 # get the zipfile into pandas 161 table_name = get_table_name(url=element.filename) 162 raw_bytes = zipped.read(element.filename) 163 abs_dict = _add_excel_bytes(abs_dict, raw_bytes, table_name, args) 164 165 return abs_dict 166 167 168def _add_excel_bytes( 169 abs_dict: dict[str, DataFrame], 170 raw_bytes: bytes, 171 name: str, 172 args: Any, 173) -> dict[str, DataFrame]: 174 """Assume the bytes at 'raw_bytes' represemt an Excel file. 175 Convert each sheet in the Excel file to a DataFrame, and place 176 those DataFrames in the dictionary of DataFrames given by 177 'abs_dict', using 'name---sheet_name' as a key. 178 When done, return the dictionary of DataFrames.""" 179 180 verbose = args.get("verbose", False) 181 182 if len(raw_bytes) == 0: 183 if verbose: 184 print("_add_excel_bytes(): the raw bytes are empty.") 185 return abs_dict 186 187 # convert the raw bytes into a pandas ExcelFile 188 try: 189 excel = pd.ExcelFile(BytesIO(raw_bytes)) 190 except Exception as e: # pylint: disable=broad-exception-caught 191 message = f"With {name}: could not convert raw bytes to ExcelFile.\n{e}" 192 print(message) 193 return abs_dict 194 195 # iterate over the sheets in the Excel file 196 for sheet_name in excel.sheet_names: 197 # grab and go - no treatment of the data 198 sheet_data = excel.parse( 199 sheet_name, 200 ) 201 if len(sheet_data) == 0: 202 if verbose: 203 print(f"_add_excel_bytes(): sheet {sheet_name} in {name} is empty.") 204 continue 205 abs_dict[f"{name}{HYPHEN}{sheet_name}"] = sheet_data 206 207 # return the dictionary of DataFrames 208 return abs_dict 209 210 211def _add_excel( 212 abs_dict: dict[str, DataFrame], 213 link: str, 214 **args: Any, 215) -> dict[str, DataFrame]: 216 """Read in an Excel file 
at the URL in the 'link' argument. 217 Pass those bytes to _add_excel_bytes() to put the contents 218 into the dictionary of DataFrames given by 'abs_dict'. When done, 219 return the dictionary of DataFrames.""" 220 221 name = get_table_name(link) 222 223 if name in abs_dict: 224 # table already in the dictionary 225 return abs_dict 226 227 raw_bytes = get_file(link, **args) 228 229 abs_dict = _add_excel_bytes(abs_dict, raw_bytes, name, args) 230 231 return abs_dict 232 233 234# --- main --- 235if __name__ == "__main__": 236 237 def simple_test() -> None: 238 """Simple test of the grab_abs_url function.""" 239 240 def test(name: str, **kwargs: Any) -> None: 241 print(f"TEST -- {name}") 242 data_dict = grab_abs_url(**kwargs) 243 print("---") 244 if not data_dict: 245 print("PROBLEM -- No data found.") 246 print(data_dict.keys()) 247 print(f"Done.\n{'=' * 20}\n") 248 249 name = "1 -- grab a single zip file" 250 test( 251 name, 252 cat="6291.0.55.001", 253 single_zip_only="p6291_all_quartely_spreadsheets", 254 get_zip=True, 255 verbose=True, 256 ) 257 258 name = "2 -- grab a single Excel file" 259 test( 260 name, 261 cat="6202.0", 262 get_excel=True, 263 single_excel_only="6202001", 264 verbose=False, 265 ) 266 267 # 3 -- grab the whole shebang 268 urls = [ 269 "https://www.abs.gov.au/statistics/labour/jobs/" 270 + "weekly-payroll-jobs/latest-release", 271 "https://www.abs.gov.au/statistics/people/population/" 272 + "national-state-and-territory-population/dec-2023", 273 ] 274 for i, url_ in enumerate(urls): 275 name = f"3.{i} -- grab the whole shebang {url_}" 276 test(name, url=url_, verbose=True) 277 278 simple_test()
@cache  # avoid repeating slow downloads when called again with the same arguments
def grab_abs_url(
    url: str = "",
    **kwargs: Any,
) -> dict[str, DataFrame]:
    """Extract DataFrames from the Excel and ZIP links on an ABS webpage.

    Every Excel file found on the page is converted sheet-by-sheet into
    DataFrames; ZIP files are opened and any Excel files inside are
    converted the same way. Prefer `read_abs_cat()` / `read_abs_series()`
    where the data is in the ABS catalogue; this function covers landing
    pages, non-catalogue data and non-timeseries data.

    Parameters
    ----------
    url : str = ""
        An ABS Catalogue landing-page URL. Either a url or a catalogue
        number must be supplied; the URL wins if both are given.

    **kwargs : Any
        The same keyword arguments as `read_abs_cat()`, plus an optional
        cat argument:

        cat : str = ""
            An ABS Catalogue number, used to look up the URL when no
            url argument is given.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames."""

    # resolve the target URL (consumes any "cat" keyword) and vet the kwargs
    url = _get_url(url, kwargs)
    check_kwargs(kwargs, "grab_abs_url")
    args = get_args(kwargs, "grab_abs_url")
    verbose = args["verbose"]
    if verbose:
        print(f"grab_abs_url(): {url=}, {args=}")

    # discover the data-file links on the page
    links = get_abs_links(url, **args)
    if not links:
        print(f"No data files found at URL: {url}")
        return {}

    frames: dict[str, DataFrame] = {}

    # single-file short-circuits: Excel is checked before ZIP
    for flag, extension, adder in (
        ("single_excel_only", ".xlsx", _add_excel),
        ("single_zip_only", ".zip", _add_zip),
    ):
        if args[flag]:
            found = _find_url(links, extension, args[flag], verbose)
            if found:
                frames = adder(frames, found, **args)
            return frames

    # otherwise walk every link; ZIP files must be processed before Excel
    take_excel = (
        args["get_excel"]
        or (args["get_excel_if_no_zip"] and not args["get_zip"])
        or (args["get_excel_if_no_zip"] and not links.get(".zip", []))
    )
    for zip_link in links.get(".zip", []):
        if args["get_zip"]:
            frames = _add_zip(frames, zip_link, **args)
    for excel_link in links.get(".xlsx", []):
        if take_excel:
            frames = _add_excel(frames, excel_link, **args)

    return frames
For a given URL, extract the data from the Excel and ZIP file links found on that page. The data is returned as a dictionary of DataFrames. The Excel files are converted into DataFrames, with each sheet in each Excel file becoming a separate DataFrame. ZIP files are examined for Excel files, which are similarly converted into DataFrames. The dictionary of DataFrames is returned.
The preferred mechanism for reading ABS data is to use the `read_abs_cat()` or `read_abs_series()` functions. This function is provided for those cases where the data is not available in the ABS catalogue, where the data is not a timeseries, or where the user wants to extract data from a specific ABS landing page.
Parameters
url : str = "" A URL for an ABS Catalogue landing page. Either a url or a catalogue number must be provided. If both are provided, the URL will be used.
**kwargs : Any
Accepts the same keyword arguments as `read_abs_cat()`. Additionally, a cat argument can be provided, which will be used to get the URL (see below).
cat : str = "" An ABS Catalogue number. If provided, and the URL is not provided, then the Catalogue number will be used to get the URL.
Returns
dict[str, DataFrame] A dictionary of DataFrames.