readabs.grab_abs_url

Find and extract DataFrames from an ABS webpage.

  1"""Find and extract DataFrames from an ABS webpage."""
  2
  3# --- imports ---
  4# standard library imports
  5import zipfile
  6from functools import cache
  7from io import BytesIO
  8from pathlib import Path
  9from typing import Any, Unpack
 10
 11# analytic imports
 12import pandas as pd
 13from pandas import DataFrame
 14
 15from readabs.abs_catalogue import abs_catalogue
 16from readabs.download_cache import get_file
 17
 18# local imports
 19from readabs.get_abs_links import get_abs_links, get_table_name
 20from readabs.read_support import HYPHEN, ReadArgs, check_kwargs, get_args
 21
# --- constants ---
# File extensions for ABS data files.
# These are also the keys used to look up link lists in the `links` mapping.
ZIP_EXTENSION = ".zip"
EXCEL_EXTENSION = ".xlsx"

# Processing order: ZIP files must be processed before Excel files
# This prevents duplicate data when ZIP files contain Excel files
FILE_EXTENSIONS_PROCESSING_ORDER = (ZIP_EXTENSION, EXCEL_EXTENSION)

# Default values and limits
# Zero-length sentinel: compared against len() of downloaded byte strings
# and against len() of parsed sheets to detect "nothing to process".
EMPTY_BYTES_LENGTH = 0
 33
 34
 35# --- public - primary entry point for this module
 36@cache  # minimise slowness with repeat business
 37def grab_abs_url(
 38    cat: str = "",
 39    url: str = "",
 40    **kwargs: Unpack[ReadArgs],
 41) -> dict[str, DataFrame]:
 42    """For a given URL, extract the data from the Excel and ZIP file links found on that page.
 43
 44    The data is returned as a dictionary of DataFrames. The Excel files are converted
 45    into DataFrames, with each sheet in each Excel file becoming a separate DataFrame.
 46    ZIP files are examined for Excel files, which are similarly converted into
 47    DataFrames. The dictionary of DataFrames is returned.
 48
 49    The preferred mechanism for reading ABS data is to use the `read_abs_cat()`
 50    or `read_abs_series()` functions. This function is provided for those
 51    cases where the data is not available in the ABS catalogue, where the
 52    data is not a timeseries, or where the user wants to extract data from
 53    a specific ABS landingpage.
 54
 55
 56    Parameters
 57    ----------
 58    url : str = ""
 59        A URL for an ABS Catalogue landing page. Either a url or
 60        a catalogue number must be provided. If both are provided, the
 61        URL will be used.
 62
 63    cat : str = ""
 64        An ABS Catalogue number. If provided, and the URL is not
 65        provided, then the Catalogue number will be used to get the URL.
 66
 67    **kwargs : Unpack[ReadArgs]
 68        Accepts the same keyword arguments as `read_abs_cat()`.
 69
 70    Returns
 71    -------
 72    dict[str, DataFrame]
 73        A dictionary of DataFrames.
 74
 75    """
 76    # check/get the keyword arguments
 77    url = _get_url(url, cat)
 78    check_kwargs(kwargs, "grab_abs_url")  # warn if invalid kwargs
 79    args = get_args(kwargs, "grab_abs_url")  # get the valid kwargs
 80    if verbose := args["verbose"]:
 81        print(f"grab_abs_url(): {url=}, {args=}")
 82
 83    # get the URL links to the relevant ABS data files on that webpage
 84    links = get_abs_links(url, **args)
 85    if not links:
 86        print(f"No data files found at URL: {url}")
 87        return {}  # return an empty Dictionary
 88
 89    # read the data files into a dictionary of DataFrames
 90    abs_dict: dict[str, DataFrame] = {}
 91
 92    # Process single file requests first
 93    abs_dict = _process_single_files(abs_dict, links, args, verbose=verbose)
 94    if abs_dict:  # If single file was found and processed, return it
 95        return abs_dict
 96
 97    # Process all files based on configuration
 98    return _process_all_files(abs_dict, links, args)
 99
100
def grab_abs_zip(
    zip_path: Path | str,
    **kwargs: Unpack[ReadArgs]
) -> dict[str, DataFrame]:
    """Load the DataFrames held in an ABS ZIP file stored on the local filesystem.

    A rarely needed convenience wrapper: the archive is read from disk and
    handed to the same ZIP processing used for downloaded archives.

    Parameters
    ----------
    zip_path : Path | str
        The local filesystem path of the ABS ZIP file to open and process.

    **kwargs : Unpack[ReadArgs]
        Additional keyword arguments for file retrieval and processing.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames extracted from the ZIP file.

    """
    check_kwargs(kwargs, "grab_abs_zip")  # warns about unknown kwargs
    args = get_args(kwargs, "grab_abs_zip")  # the validated kwargs

    # Path() accepts both str and Path, so no isinstance dispatch is needed
    archive_bytes = Path(zip_path).read_bytes()
    return _process_zip({}, archive_bytes, **args)
131
132
133# --- private
134def _process_single_files(
135    abs_dict: dict[str, DataFrame],
136    links: dict[str, list[str]],
137    args: dict[str, Any],  # ReadArgs after processing
138    *,
139    verbose: bool,
140) -> dict[str, DataFrame]:
141    """Process single file requests (single_excel_only, selected_excel, or single_zip_only)."""
142    if args["single_excel_only"]:
143        link = _find_url(links, EXCEL_EXTENSION, args["single_excel_only"], verbose=verbose)
144        if link:
145            return _add_excel(abs_dict, link, **args)
146
147    if args["selected_excel"]:
148        for target in args["selected_excel"]:
149            link = _find_url(links, EXCEL_EXTENSION, target, verbose=verbose)
150            if link:
151                abs_dict = _add_excel(abs_dict, link, **args)
152        if abs_dict:
153            return abs_dict
154
155    if args["single_zip_only"]:
156        link = _find_url(links, ZIP_EXTENSION, args["single_zip_only"], verbose=verbose)
157        if link:
158            return _add_zip(abs_dict, link, **args)
159
160    return abs_dict
161
162
def _process_all_files(
    abs_dict: dict[str, DataFrame],
    links: dict[str, list[str]],
    args: dict[str, Any],  # ReadArgs after processing
) -> dict[str, DataFrame]:
    """Process every link on the page, subject to get_zip / get_excel settings.

    Link types are visited in FILE_EXTENSIONS_PROCESSING_ORDER, so ZIP
    links are handled before Excel links.
    """
    for extension in FILE_EXTENSIONS_PROCESSING_ORDER:
        for link in links.get(extension, []):
            if extension == ZIP_EXTENSION:
                # ZIP links are only fetched when ZIP processing is enabled
                if args["get_zip"]:
                    abs_dict = _add_zip(abs_dict, link, **args)
                continue
            if extension == EXCEL_EXTENSION and _should_process_excel_file(args, links):
                abs_dict = _add_excel(abs_dict, link, **args)
    return abs_dict
176
177
178def _should_process_excel_file(args: dict[str, Any], links: dict[str, list[str]]) -> bool:
179    """Determine if Excel files should be processed based on configuration.
180
181    Excel files are processed if:
182    1. get_excel is explicitly True, or
183    2. get_excel_if_no_zip is True AND (get_zip is False OR no ZIP files are available)
184
185    Args:
186        args: Configuration arguments from user
187        links: Dictionary of available file links by type
188
189    Returns:
190        bool: True if Excel files should be processed
191
192    """
193    # Always process if explicitly requested
194    if args["get_excel"]:
195        return True
196
197    # Process Excel if requested when no ZIP files, and either:
198    # - ZIP processing is disabled, or
199    # - No ZIP files are available
200    if args["get_excel_if_no_zip"]:
201        zip_processing_disabled = not args["get_zip"]
202        no_zip_files_available = not links.get(ZIP_EXTENSION, [])
203        return zip_processing_disabled or no_zip_files_available
204
205    return False
206
207
208def _find_url(links: dict[str, list[str]], targ_type: str, target: str, *, verbose: bool = False) -> str:
209    """Find the URL for a target file type.
210
211    Args:
212        links: Dictionary mapping file types to lists of URLs
213        targ_type: Target file extension (e.g., '.xlsx', '.zip')
214        target: Target filename without extension
215        verbose: Whether to print debug information
216
217    Returns:
218        str: The matching URL if found, otherwise an empty string
219
220    """
221    targ_list = links.get(targ_type, [])
222    if not targ_list:
223        return ""
224    goal = f"{target}{targ_type}"
225    if verbose:
226        print(f"_find_url(): looking for {goal} in {targ_list}.")
227    for link in targ_list:
228        if link.endswith(goal):
229            return link
230    return ""
231
232
233def _get_url(url: str, cat: str) -> str:
234    """Get URL from provided URL or catalogue number.
235
236    If an ABS catalogue number is provided and URL is not provided,
237    get the URL for the ABS data files on the ABS webpage.
238    Otherwise, return the URL provided. Either the 'url' or
239    'cat' argument must be provided.
240
241    Args:
242        url: The URL to use if provided
243        cat: The catalogue number to use if URL is not provided
244
245    Returns:
246        str: The URL to use for data retrieval
247
248    Raises:
249        ValueError: If neither URL nor valid catalogue number is provided
250
251    """
252    if not url and cat:
253        try:
254            cat_map = abs_catalogue()
255            if cat in cat_map.index:
256                url = str(cat_map.loc[cat, "URL"])
257        except (KeyError, IndexError) as e:
258            raise ValueError(f"Catalogue number '{cat}' not found in ABS catalogue: {e}") from e
259        except (ConnectionError, TimeoutError) as e:
260            raise ValueError(f"Network error retrieving catalogue for '{cat}': {e}") from e
261        except (ValueError, TypeError) as e:
262            raise ValueError(f"Invalid catalogue data for '{cat}': {e}") from e
263
264    if not url and cat:
265        raise ValueError(
266            f"Catalogue number '{cat}' not found in the ABS Time Series Directory. "
267            f"This may be a discontinued series. If you know the ABS landing page URL, "
268            f"you can use: read_abs_cat(cat='{cat}', url='https://www.abs.gov.au/...')"
269        )
270
271    if not url:
272        raise ValueError("_get_url(): no URL or valid catalogue number provided.")
273
274    return url
275
276
def _process_zip(
    abs_dict: dict[str, DataFrame],
    zip_contents: bytes,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Unpack ZIP bytes and add each member's sheets to abs_dict.

    Every member of the archive is read and handed to _add_excel_bytes().
    Empty input leaves abs_dict unchanged.
    """
    if len(zip_contents) == EMPTY_BYTES_LENGTH:
        return abs_dict

    with zipfile.ZipFile(BytesIO(zip_contents)) as archive:
        for member_name in archive.namelist():
            # the member's file name determines the table name
            table_name = get_table_name(url=member_name)
            member_bytes = archive.read(member_name)
            abs_dict = _add_excel_bytes(abs_dict, member_bytes, table_name, args)

    return abs_dict
294
295
def _add_zip(
    abs_dict: dict[str, DataFrame],
    link: str,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Fetch the ZIP file at a URL and add its contents to abs_dict.

    The downloaded bytes are passed to _process_zip(), which walks the
    archive members and adds their sheets to the dictionary.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        link: URL to the ZIP file
        **args: Additional arguments passed to file retrieval functions

    Returns:
        dict[str, DataFrame]: Updated dictionary with new DataFrames from ZIP contents

    """
    return _process_zip(abs_dict, get_file(link, **args), **args)
318
319
320def _add_excel_bytes(
321    abs_dict: dict[str, DataFrame],
322    raw_bytes: bytes,
323    name: str,
324    args: dict[str, Any],  # ReadArgs after processing
325) -> dict[str, DataFrame]:
326    """Convert Excel file bytes to DataFrames and add to dictionary.
327
328    Processes the bytes as an Excel file, converting each sheet to a DataFrame
329    and adding them to the dictionary using 'name---sheet_name' as keys.
330
331    Args:
332        abs_dict: Dictionary to store extracted DataFrames
333        raw_bytes: Bytes content of the Excel file
334        name: Base name for the Excel file
335        args: Dictionary of processing arguments
336
337    Returns:
338        dict[str, DataFrame]: Updated dictionary with new DataFrames from Excel sheets
339
340    """
341    verbose = args.get("verbose", False)
342
343    if len(raw_bytes) == EMPTY_BYTES_LENGTH:
344        if verbose:
345            print("_add_excel_bytes(): the raw bytes are empty.")
346        return abs_dict
347
348    # convert the raw bytes into a pandas ExcelFile
349    try:
350        excel = pd.ExcelFile(BytesIO(raw_bytes))
351    except (ValueError, TypeError) as e:
352        message = f"With {name}: could not convert raw bytes to ExcelFile.\n{e}"
353        print(message)
354        return abs_dict
355
356    # iterate over the sheets in the Excel file
357    for sheet_name in excel.sheet_names:
358        # grab and go - no treatment of the data
359        sheet_data = excel.parse(
360            sheet_name,
361        )
362        if len(sheet_data) == EMPTY_BYTES_LENGTH:
363            if verbose:
364                print(f"_add_excel_bytes(): sheet {sheet_name} in {name} is empty.")
365            continue
366        abs_dict[f"{name}{HYPHEN}{sheet_name}"] = sheet_data
367
368    # return the dictionary of DataFrames
369    return abs_dict
370
371
def _add_excel(
    abs_dict: dict[str, DataFrame],
    link: str,
    **args: Any,  # ReadArgs compatible
) -> dict[str, DataFrame]:
    """Read in an Excel file at the URL in the 'link' argument.

    The file is skipped (not downloaded) when sheets for the same table
    name are already present in 'abs_dict'. Otherwise the downloaded bytes
    are passed to _add_excel_bytes(), which adds one DataFrame per sheet.

    Args:
        abs_dict: Dictionary to store extracted DataFrames
        link: URL to the Excel file
        **args: Additional arguments passed to file retrieval functions

    Returns:
        dict[str, DataFrame]: The dictionary, updated with this file's sheets

    """
    name = get_table_name(link)

    # _add_excel_bytes() stores sheets under f"{name}{HYPHEN}{sheet_name}", so
    # the bare table name is never itself a key. Detect an already-loaded
    # table by its key prefix (the bare-name test is kept for compatibility).
    # NOTE(review): assumes get_table_name() yields the same name for a ZIP
    # member as for the matching Excel link - confirm.
    sheet_key_prefix = f"{name}{HYPHEN}"
    if name in abs_dict or any(key.startswith(sheet_key_prefix) for key in abs_dict):
        # table already in the dictionary
        return abs_dict

    raw_bytes = get_file(link, **args)

    return _add_excel_bytes(abs_dict, raw_bytes, name, args)
392
393
394# --- main ---
if __name__ == "__main__":

    def simple_test() -> None:
        """Exercise grab_abs_zip() and grab_abs_url() and print the outcomes."""

        def attempt(fetch: Any) -> None:
            """Call fetch() and report what it returned, or the exception it raised."""
            try:
                data_dict = fetch()
                print("---")
                if data_dict:
                    print(f"SUCCESS -- Found {len(data_dict)} datasets: {list(data_dict.keys())}")
                else:
                    print("PROBLEM -- No data found.")
            except Exception as e:  # pylint: disable=broad-except
                print(f"ERROR -- Test failed with exception: {e}")
            print(f"Done.\n{'=' * 20}\n")

        def test(name: str, **kwargs: Any) -> None:  # ReadArgs compatible
            print(f"TEST -- {name}")
            attempt(lambda: grab_abs_url(**kwargs))

        def test_zip(zip_path: Path, **kwargs: Any) -> None:  # ReadArgs compatible
            print(f"TEST -- grab_abs_zip() with {zip_path}")
            attempt(lambda: grab_abs_zip(zip_path, **kwargs))

        # --- grab_abs_zip(): read a ZIP file from the local filesystem
        test_zip(Path(".test-data/Qrtly-CPI-Time-series-spreadsheets-all.zip"), verbose=True)

        # --- various grab_abs_url() tests

        test(
            "1 -- grab a single zip file",
            cat="6291.0.55.001",
            single_zip_only="p6291_all_quarterly_spreadsheets",
            get_zip=True,
            verbose=True,
        )

        test(
            "2 -- grab a single Excel file",
            cat="6202.0",
            get_excel=True,
            single_excel_only="6202001",
            verbose=False,
        )

        # 3 -- grab the whole shebang
        landing_pages = (
            "https://www.abs.gov.au/statistics/labour/jobs/weekly-payroll-jobs/latest-release",
            "https://www.abs.gov.au/statistics/people/population/national-state-and-territory-population/dec-2023",
        )
        for i, url_ in enumerate(landing_pages):
            test(f"3.{i} -- grab the whole shebang {url_}", url=url_, verbose=True)

    simple_test()
ZIP_EXTENSION = '.zip'
EXCEL_EXTENSION = '.xlsx'
FILE_EXTENSIONS_PROCESSING_ORDER = ('.zip', '.xlsx')
EMPTY_BYTES_LENGTH = 0
@cache
def grab_abs_url( cat: str = '', url: str = '', **kwargs: Unpack[readabs.ReadArgs]) -> dict[str, pandas.DataFrame]:
@cache  # minimise slowness with repeat business
def grab_abs_url(
    cat: str = "",
    url: str = "",
    **kwargs: Unpack[ReadArgs],
) -> dict[str, DataFrame]:
    """Extract every Excel sheet reachable from an ABS landing page.

    The Excel and ZIP links found on the page are downloaded. Each sheet of
    each Excel file (including Excel files packed inside ZIP files) becomes
    one DataFrame in the returned dictionary.

    Requests for specific files (`single_excel_only`, `selected_excel`,
    `single_zip_only`) are honoured first; only when they yield nothing
    does the function fall back to processing all links on the page.

    `read_abs_cat()` and `read_abs_series()` remain the preferred way to
    read ABS data. Use this function when the data is not in the ABS
    catalogue, is not a timeseries, or when a specific ABS landing page
    must be scraped.

    Results are memoised with `functools.cache`, so a repeat call with the
    same arguments returns the same dictionary object.

    Parameters
    ----------
    cat : str = ""
        An ABS Catalogue number. Only consulted when `url` is empty, in
        which case it is used to look up the landing page URL.

    url : str = ""
        A URL for an ABS Catalogue landing page. Either `url` or `cat`
        must be provided; when both are given the URL wins.

    **kwargs : Unpack[ReadArgs]
        Accepts the same keyword arguments as `read_abs_cat()`.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames; empty when the page offers no data files.

    """
    # resolve the landing page, then validate and normalise the keyword arguments
    url = _get_url(url, cat)
    check_kwargs(kwargs, "grab_abs_url")  # warns about unknown kwargs
    args = get_args(kwargs, "grab_abs_url")  # the validated kwargs
    verbose = args["verbose"]
    if verbose:
        print(f"grab_abs_url(): {url=}, {args=}")

    # discover the data-file links on the page
    found_links = get_abs_links(url, **args)
    if not found_links:
        print(f"No data files found at URL: {url}")
        return {}

    # targeted requests take priority over "grab everything"
    targeted = _process_single_files({}, found_links, args, verbose=verbose)
    if targeted:
        return targeted

    # nothing targeted was found: process all links per the configuration
    return _process_all_files(targeted, found_links, args)

For a given URL, extract the data from the Excel and ZIP file links found on that page.

The data is returned as a dictionary of DataFrames. The Excel files are converted into DataFrames, with each sheet in each Excel file becoming a separate DataFrame. ZIP files are examined for Excel files, which are similarly converted into DataFrames. The dictionary of DataFrames is returned.

The preferred mechanism for reading ABS data is to use the read_abs_cat() or read_abs_series() functions. This function is provided for those cases where the data is not available in the ABS catalogue, where the data is not a timeseries, or where the user wants to extract data from a specific ABS landing page.

Parameters

url : str = "" A URL for an ABS Catalogue landing page. Either a url or a catalogue number must be provided. If both are provided, the URL will be used.

cat : str = "" An ABS Catalogue number. If provided, and the URL is not provided, then the Catalogue number will be used to get the URL.

**kwargs : Unpack[ReadArgs] Accepts the same keyword arguments as read_abs_cat().

Returns

dict[str, DataFrame] A dictionary of DataFrames.

def grab_abs_zip( zip_path: pathlib.Path | str, **kwargs: Unpack[readabs.ReadArgs]) -> dict[str, pandas.DataFrame]:
def grab_abs_zip(
    zip_path: Path | str,
    **kwargs: Unpack[ReadArgs]
) -> dict[str, DataFrame]:
    """Load the DataFrames held in an ABS ZIP file stored on the local filesystem.

    A rarely needed convenience wrapper: the archive is read from disk and
    handed to the same ZIP processing used for downloaded archives.

    Parameters
    ----------
    zip_path : Path | str
        The local filesystem path of the ABS ZIP file to open and process.

    **kwargs : Unpack[ReadArgs]
        Additional keyword arguments for file retrieval and processing.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames extracted from the ZIP file.

    """
    check_kwargs(kwargs, "grab_abs_zip")  # warns about unknown kwargs
    args = get_args(kwargs, "grab_abs_zip")  # the validated kwargs

    # Path() accepts both str and Path, so no isinstance dispatch is needed
    archive_bytes = Path(zip_path).read_bytes()
    return _process_zip({}, archive_bytes, **args)

Grab and process a single ABS ZIP file from a file system location.

This is a convenience function that opens an ABS ZIP file from a local filesystem path. It is expected to be used rarely.

Parameters

zip_path : Path | str The local filesystem path of the ABS ZIP file to open and process.

**kwargs : Unpack[ReadArgs] Additional keyword arguments for file retrieval and processing.

Returns

dict[str, DataFrame] A dictionary of DataFrames extracted from the ZIP file.