readabs.grab_abs_url
Find and extract DataFrames from an ABS webpage.
"""Find and extract DataFrames from an ABS webpage."""

# --- imports ---
# standard library imports
import zipfile
from functools import cache
from io import BytesIO
from pathlib import Path
from typing import Any, Unpack

# analytic imports
import pandas as pd
from pandas import DataFrame

from readabs.abs_catalogue import abs_catalogue
from readabs.download_cache import get_file

# local imports
from readabs.get_abs_links import get_abs_links, get_table_name
from readabs.read_support import HYPHEN, ReadArgs, check_kwargs, get_args

# --- constants ---
# File extensions for ABS data files
ZIP_EXTENSION = ".zip"
EXCEL_EXTENSION = ".xlsx"

# Processing order: ZIP files must be processed before Excel files
# This prevents duplicate data when ZIP files contain Excel files
FILE_EXTENSIONS_PROCESSING_ORDER = (ZIP_EXTENSION, EXCEL_EXTENSION)

# Default values and limits
EMPTY_BYTES_LENGTH = 0


# --- public - primary entry point for this module
@cache  # minimise slowness with repeat business
def grab_abs_url(
    cat: str = "",
    url: str = "",
    **kwargs: Unpack[ReadArgs],
) -> dict[str, DataFrame]:
    """For a given URL, extract the data from the Excel and ZIP file links found on that page.

    The data is returned as a dictionary of DataFrames. The Excel files are converted
    into DataFrames, with each sheet in each Excel file becoming a separate DataFrame.
    ZIP files are examined for Excel files, which are similarly converted into
    DataFrames. The dictionary of DataFrames is returned.

    The preferred mechanism for reading ABS data is to use the `read_abs_cat()`
    or `read_abs_series()` functions. This function is provided for those
    cases where the data is not available in the ABS catalogue, where the
    data is not a timeseries, or where the user wants to extract data from
    a specific ABS landing page.

    Parameters
    ----------
    cat : str = ""
        An ABS Catalogue number. If provided, and the URL is not
        provided, then the Catalogue number will be used to get the URL.

    url : str = ""
        A URL for an ABS Catalogue landing page. Either a url or
        a catalogue number must be provided. If both are provided, the
        URL will be used.

    **kwargs : Unpack[ReadArgs]
        Accepts the same keyword arguments as `read_abs_cat()`.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames. An empty dictionary is returned when
        no data file links are found on the page.

    Raises
    ------
    ValueError
        If neither a URL nor a resolvable catalogue number is provided.

    Notes
    -----
    The function is wrapped in `functools.cache`, so every argument value
    must be hashable, and repeat calls with the same arguments return the
    same dictionary object rather than a fresh copy.

    """
    # check/get the keyword arguments
    url = _get_url(url, cat)
    check_kwargs(kwargs, "grab_abs_url")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_url")  # get the valid kwargs
    if verbose := args["verbose"]:
        print(f"grab_abs_url(): {url=}, {args=}")

    # get the URL links to the relevant ABS data files on that webpage
    links = get_abs_links(url, **args)
    if not links:
        print(f"No data files found at URL: {url}")
        return {}  # return an empty Dictionary

    # read the data files into a dictionary of DataFrames
    abs_dict: dict[str, DataFrame] = {}

    # Process single file requests first
    abs_dict = _process_single_files(abs_dict, links, args, verbose=verbose)
    if abs_dict:  # If single file was found and processed, return it
        return abs_dict

    # Process all files based on configuration
    return _process_all_files(abs_dict, links, args)


def grab_abs_zip(
    zip_path: Path | str,
    **kwargs: Unpack[ReadArgs]
) -> dict[str, DataFrame]:
    """Grab and process a single ABS ZIP file from a file system location.

    This is a convenience function that opens an ABS ZIP file from a local
    filesystem path. It is expected to be used rarely.

    Parameters
    ----------
    zip_path : Path | str
        The local filesystem path of the ABS ZIP file to open and process.

    **kwargs : Unpack[ReadArgs]
        Additional keyword arguments for file retrieval and processing.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames extracted from the ZIP file.

    """
    check_kwargs(kwargs, "grab_abs_zip")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_zip")  # get the valid kwargs

    zp: Path = zip_path if isinstance(zip_path, Path) else Path(zip_path)
    zip_bytes = zp.read_bytes()
    abs_dict: dict[str, DataFrame] = {}
    return _process_zip(abs_dict, zip_bytes, **args)


# --- private
def _process_single_files(
    abs_dict: dict[str, DataFrame],
    links: dict[str, list[str]],
    args: dict[str, Any],  # ReadArgs after processing
    *,
    verbose: bool,
) -> dict[str, DataFrame]:
    """Process single file requests (single_excel_only, selected_excel, or single_zip_only).

    The three options are tried in that order, and the first one that
    locates a matching link determines the result.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        links: Dictionary mapping file extensions to lists of URLs
        args: Dictionary of processing arguments
        verbose: Whether to print debug information

    Returns:
        dict[str, DataFrame]: The updated dictionary; it is returned unchanged
            when no single-file option is set or no matching link is found

    """
    if args["single_excel_only"]:
        link = _find_url(links, EXCEL_EXTENSION, args["single_excel_only"], verbose=verbose)
        if link:
            return _add_excel(abs_dict, link, **args)

    if args["selected_excel"]:
        for target in args["selected_excel"]:
            link = _find_url(links, EXCEL_EXTENSION, target, verbose=verbose)
            if link:
                abs_dict = _add_excel(abs_dict, link, **args)
        if abs_dict:
            return abs_dict

    if args["single_zip_only"]:
        link = _find_url(links, ZIP_EXTENSION, args["single_zip_only"], verbose=verbose)
        if link:
            return _add_zip(abs_dict, link, **args)

    return abs_dict


def _process_all_files(
    abs_dict: dict[str, DataFrame],
    links: dict[str, list[str]],
    args: dict[str, Any],  # ReadArgs after processing
) -> dict[str, DataFrame]:
    """Process all files based on configuration (get_zip, get_excel, etc.).

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        links: Dictionary mapping file extensions to lists of URLs
        args: Dictionary of processing arguments

    Returns:
        dict[str, DataFrame]: Updated dictionary with the DataFrames that were read

    """
    # ZIP links are visited before Excel links (see FILE_EXTENSIONS_PROCESSING_ORDER)
    for link_type in FILE_EXTENSIONS_PROCESSING_ORDER:
        for link in links.get(link_type, []):
            if link_type == ZIP_EXTENSION and args["get_zip"]:
                abs_dict = _add_zip(abs_dict, link, **args)
            elif link_type == EXCEL_EXTENSION and _should_process_excel_file(args, links):
                abs_dict = _add_excel(abs_dict, link, **args)
    return abs_dict


def _should_process_excel_file(args: dict[str, Any], links: dict[str, list[str]]) -> bool:
    """Determine if Excel files should be processed based on configuration.

    Excel files are processed if:
    1. get_excel is explicitly True, or
    2. get_excel_if_no_zip is True AND (get_zip is False OR no ZIP files are available)

    Args:
        args: Configuration arguments from user
        links: Dictionary of available file links by type

    Returns:
        bool: True if Excel files should be processed

    """
    # Always process if explicitly requested
    if args["get_excel"]:
        return True

    # Process Excel if requested when no ZIP files, and either:
    # - ZIP processing is disabled, or
    # - No ZIP files are available
    if args["get_excel_if_no_zip"]:
        zip_processing_disabled = not args["get_zip"]
        no_zip_files_available = not links.get(ZIP_EXTENSION, [])
        return zip_processing_disabled or no_zip_files_available

    return False


def _find_url(links: dict[str, list[str]], targ_type: str, target: str, *, verbose: bool = False) -> str:
    """Find the URL for a target file type.

    Args:
        links: Dictionary mapping file types to lists of URLs
        targ_type: Target file extension (e.g., '.xlsx', '.zip')
        target: Target filename without extension
        verbose: Whether to print debug information

    Returns:
        str: The first URL ending with the target name plus extension,
            otherwise an empty string

    """
    targ_list = links.get(targ_type, [])
    if not targ_list:
        return ""
    goal = f"{target}{targ_type}"
    if verbose:
        print(f"_find_url(): looking for {goal} in {targ_list}.")
    for link in targ_list:
        if link.endswith(goal):
            return link
    return ""


def _get_url(url: str, cat: str) -> str:
    """Get URL from provided URL or catalogue number.

    If an ABS catalogue number is provided and URL is not provided,
    get the URL for the ABS data files on the ABS webpage.
    Otherwise, return the URL provided. Either the 'url' or
    'cat' argument must be provided.

    Args:
        url: The URL to use if provided
        cat: The catalogue number to use if URL is not provided

    Returns:
        str: The URL to use for data retrieval

    Raises:
        ValueError: If neither URL nor valid catalogue number is provided

    """
    if not url and cat:
        try:
            cat_map = abs_catalogue()
            if cat in cat_map.index:
                url = str(cat_map.loc[cat, "URL"])
        except (KeyError, IndexError) as e:
            raise ValueError(f"Catalogue number '{cat}' not found in ABS catalogue: {e}") from e
        except (ConnectionError, TimeoutError) as e:
            raise ValueError(f"Network error retrieving catalogue for '{cat}': {e}") from e
        except (ValueError, TypeError) as e:
            raise ValueError(f"Invalid catalogue data for '{cat}': {e}") from e

    # the catalogue lookup completed but did not yield a URL for this number
    if not url and cat:
        raise ValueError(
            f"Catalogue number '{cat}' not found in the ABS Time Series Directory. "
            f"This may be a discontinued series. If you know the ABS landing page URL, "
            f"you can use: read_abs_cat(cat='{cat}', url='https://www.abs.gov.au/...')"
        )

    if not url:
        raise ValueError("_get_url(): no URL or valid catalogue number provided.")

    return url


def _process_zip(
    abs_dict: dict[str, DataFrame],
    zip_contents: bytes,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Read and process a ZIP file's contents from bytes.

    Every member of the archive is handed to _add_excel_bytes(), which
    adds the sheets of each member it can read as an Excel file.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        zip_contents: Bytes content of the ZIP file
        **args: Processing arguments passed on to _add_excel_bytes()

    Returns:
        dict[str, DataFrame]: Updated dictionary; unchanged when zip_contents is empty

    """
    if len(zip_contents) == EMPTY_BYTES_LENGTH:
        return abs_dict

    with zipfile.ZipFile(BytesIO(zip_contents)) as zipped:
        for element in zipped.infolist():
            # get the zipfile into pandas
            table_name = get_table_name(url=element.filename)
            raw_bytes = zipped.read(element.filename)
            abs_dict = _add_excel_bytes(abs_dict, raw_bytes, table_name, args)

    return abs_dict


def _add_zip(
    abs_dict: dict[str, DataFrame],
    link: str,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Read and process a ZIP file from a URL.

    Downloads the ZIP file and iterates over its contents, calling
    _add_excel_bytes() to extract Excel files and add their contents
    to the DataFrames dictionary.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        link: URL to the ZIP file
        **args: Additional arguments passed to file retrieval functions

    Returns:
        dict[str, DataFrame]: Updated dictionary with new DataFrames from ZIP contents

    """
    zip_contents = get_file(link, **args)
    return _process_zip(abs_dict, zip_contents, **args)


def _add_excel_bytes(
    abs_dict: dict[str, DataFrame],
    raw_bytes: bytes,
    name: str,
    args: dict[str, Any],  # ReadArgs after processing
) -> dict[str, DataFrame]:
    """Convert Excel file bytes to DataFrames and add to dictionary.

    Processes the bytes as an Excel file, converting each sheet to a DataFrame
    and adding them to the dictionary. Each key is the table name, HYPHEN
    and the sheet name joined together. Bytes that cannot be opened as an
    Excel file are reported with a printed message and otherwise ignored.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        raw_bytes: Bytes content of the Excel file
        name: Base name for the Excel file
        args: Dictionary of processing arguments

    Returns:
        dict[str, DataFrame]: Updated dictionary with new DataFrames from Excel sheets

    """
    verbose = args.get("verbose", False)

    if len(raw_bytes) == EMPTY_BYTES_LENGTH:
        if verbose:
            print("_add_excel_bytes(): the raw bytes are empty.")
        return abs_dict

    # convert the raw bytes into a pandas ExcelFile
    try:
        excel = pd.ExcelFile(BytesIO(raw_bytes))
    except (ValueError, TypeError, zipfile.BadZipFile) as e:
        # BadZipFile: an .xlsx workbook is a ZIP container, so a corrupt one
        # is treated the same way as any other unreadable file
        message = f"With {name}: could not convert raw bytes to ExcelFile.\n{e}"
        print(message)
        return abs_dict

    # the context manager closes the workbook once all sheets have been read
    with excel:
        # iterate over the sheets in the Excel file
        for sheet_name in excel.sheet_names:
            # grab and go - no treatment of the data
            sheet_data = excel.parse(
                sheet_name,
            )
            if len(sheet_data) == 0:  # no data rows in this sheet
                if verbose:
                    print(f"_add_excel_bytes(): sheet {sheet_name} in {name} is empty.")
                continue
            abs_dict[f"{name}{HYPHEN}{sheet_name}"] = sheet_data

    # return the dictionary of DataFrames
    return abs_dict


def _add_excel(
    abs_dict: dict[str, DataFrame],
    link: str,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Read in an Excel file at the URL in the 'link' argument.

    Pass those bytes to _add_excel_bytes() to put the contents
    into the dictionary of DataFrames given by 'abs_dict'. When done,
    return the dictionary of DataFrames.

    The file is not downloaded when sheets for the same table name are
    already in the dictionary (for example, because they were read from
    a ZIP file processed earlier).

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        link: URL to the Excel file
        **args: Additional arguments passed to file retrieval functions

    Returns:
        dict[str, DataFrame]: Updated dictionary with new DataFrames from the Excel file

    """
    name = get_table_name(link)

    # Keys are stored as f"{name}{HYPHEN}{sheet_name}" by _add_excel_bytes(),
    # so the table is recognised by that key prefix, not by the bare name.
    key_prefix = f"{name}{HYPHEN}"
    if any(key.startswith(key_prefix) for key in abs_dict):
        # table already in the dictionary
        return abs_dict

    raw_bytes = get_file(link, **args)

    return _add_excel_bytes(abs_dict, raw_bytes, name, args)


# --- main ---
if __name__ == "__main__":

    def simple_test() -> None:
        """Test the grab_abs_url and grab_abs_zip functions."""

        def test(name: str, **kwargs: Any) -> None:  # ReadArgs compatible
            print(f"TEST -- {name}")
            try:
                data_dict = grab_abs_url(**kwargs)
                print("---")
                if not data_dict:
                    print("PROBLEM -- No data found.")
                else:
                    print(f"SUCCESS -- Found {len(data_dict)} datasets: {list(data_dict.keys())}")
            except Exception as e:  # pylint: disable=broad-except
                print(f"ERROR -- Test failed with exception: {e}")
            print(f"Done.\n{'=' * 20}\n")

        def test_zip(zip_path: Path, **kwargs: Any) -> None:  # ReadArgs compatible
            print(f"TEST -- grab_abs_zip() with {zip_path}")
            try:
                data_dict = grab_abs_zip(zip_path, **kwargs)
                print("---")
                if not data_dict:
                    print("PROBLEM -- No data found.")
                else:
                    print(f"SUCCESS -- Found {len(data_dict)} datasets: {list(data_dict.keys())}")
            except Exception as e:  # pylint: disable=broad-except
                print(f"ERROR -- Test failed with exception: {e}")
            print(f"Done.\n{'=' * 20}\n")

        # 4 -- grab from a local ZIP file
        _name = ".test-data/Qrtly-CPI-Time-series-spreadsheets-all.zip"
        _zip_path = Path(_name)
        test_zip(_zip_path, verbose=True)

        # --- various grab_abs_url() tests

        name = "1 -- grab a single zip file"
        test(
            name,
            cat="6291.0.55.001",
            single_zip_only="p6291_all_quarterly_spreadsheets",
            get_zip=True,
            verbose=True,
        )

        name = "2 -- grab a single Excel file"
        test(
            name,
            cat="6202.0",
            get_excel=True,
            single_excel_only="6202001",
            verbose=False,
        )

        # 3 -- grab the whole shebang
        urls = [
            "https://www.abs.gov.au/statistics/labour/jobs/weekly-payroll-jobs/latest-release",
            "https://www.abs.gov.au/statistics/people/population/national-state-and-territory-population/dec-2023",
        ]
        for i, url_ in enumerate(urls):
            name = f"3.{i} -- grab the whole shebang {url_}"
            test(name, url=url_, verbose=True)

    simple_test()
@cache  # minimise slowness with repeat business
def grab_abs_url(
    cat: str = "",
    url: str = "",
    **kwargs: Unpack[ReadArgs],
) -> dict[str, DataFrame]:
    """Collect the Excel and ZIP data linked from an ABS page as DataFrames.

    Every sheet of every Excel file found (directly, or inside a ZIP file)
    becomes one DataFrame in the returned dictionary.

    `read_abs_cat()` and `read_abs_series()` remain the preferred way to
    read ABS data. Use this function when the data is not in the ABS
    catalogue, is not a timeseries, or lives on a specific ABS landing page.

    Parameters
    ----------
    cat : str = ""
        An ABS Catalogue number. It is only used to look up the URL when
        no URL is given.

    url : str = ""
        A URL for an ABS Catalogue landing page. One of `url` or `cat`
        must be supplied; `url` wins when both are given.

    **kwargs : Unpack[ReadArgs]
        Accepts the same keyword arguments as `read_abs_cat()`.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames; empty when the page has no data links.

    """
    # resolve the landing page, then validate and normalise the options
    url = _get_url(url, cat)
    check_kwargs(kwargs, "grab_abs_url")  # warns about invalid kwargs
    args = get_args(kwargs, "grab_abs_url")  # the valid kwargs
    verbose = args["verbose"]
    if verbose:
        print(f"grab_abs_url(): {url=}, {args=}")

    # discover the data-file links published on the page
    links = get_abs_links(url, **args)
    if not links:
        print(f"No data files found at URL: {url}")
        return {}

    # a single-file request takes precedence over a bulk download
    found = _process_single_files({}, links, args, verbose=verbose)
    if not found:
        # nothing came back from the single-file options: process everything
        found = _process_all_files(found, links, args)
    return found
For a given URL, extract the data from the Excel and ZIP file links found on that page.
The data is returned as a dictionary of DataFrames. The Excel files are converted into DataFrames, with each sheet in each Excel file becoming a separate DataFrame. ZIP files are examined for Excel files, which are similarly converted into DataFrames. The dictionary of DataFrames is returned.
The preferred mechanism for reading ABS data is to use the read_abs_cat()
or read_abs_series() functions. This function is provided for those
cases where the data is not available in the ABS catalogue, where the
data is not a timeseries, or where the user wants to extract data from
a specific ABS landing page.
Parameters
url : str = "" A URL for an ABS Catalogue landing page. Either a url or a catalogue number must be provided. If both are provided, the URL will be used.
cat : str = "" An ABS Catalogue number. If provided, and the URL is not provided, then the Catalogue number will be used to get the URL.
**kwargs : Unpack[ReadArgs]
Accepts the same keyword arguments as read_abs_cat().
Returns
dict[str, DataFrame] A dictionary of DataFrames.
def grab_abs_zip(
    zip_path: Path | str,
    **kwargs: Unpack[ReadArgs]
) -> dict[str, DataFrame]:
    """Load the tables held in one ABS ZIP file stored on the local filesystem.

    A rarely needed convenience wrapper: the archive is read from disk and
    its contents are handed to the same ZIP processing used for downloads.

    Parameters
    ----------
    zip_path : Path | str
        The local filesystem path of the ABS ZIP file to open and process.

    **kwargs : Unpack[ReadArgs]
        Additional keyword arguments for file retrieval and processing.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames extracted from the ZIP file.

    """
    check_kwargs(kwargs, "grab_abs_zip")  # warns about invalid kwargs
    args = get_args(kwargs, "grab_abs_zip")  # the valid kwargs

    # accept either a ready-made Path or a plain string
    if isinstance(zip_path, Path):
        archive = zip_path
    else:
        archive = Path(zip_path)
    payload = archive.read_bytes()

    frames: dict[str, DataFrame] = {}
    return _process_zip(frames, payload, **args)
Grab and process a single ABS ZIP file from a file system location.
This is a convenience function that opens an ABS ZIP file from a local filesystem path. It is expected to be used rarely.
Parameters
zip_path : Path | str The local filesystem path of the ABS ZIP file to open and process.
**kwargs : Unpack[ReadArgs] Additional keyword arguments for file retrieval and processing.
Returns
dict[str, DataFrame] A dictionary of DataFrames extracted from the ZIP file.