readabs

Package to download timeseries data from the Australian Bureau of Statistics and RBA.

This package provides functions to download and process timeseries data from the Australian Bureau of Statistics (ABS) and the Reserve Bank of Australia (RBA).

 1"""Package to download timeseries data from the Australian Bureau of Statistics and RBA.
 2
 3This package provides functions to download and process timeseries data from
 4the Australian Bureau of Statistics (ABS) and the Reserve Bank of Australia (RBA).
 5"""
 6
 7import importlib.metadata
 8
 9# ABS related imports
10from readabs.abs_catalogue import abs_catalogue
11from readabs.abs_meta_data import metacol
12
13# Utility imports
14from readabs.datatype import Datatype
15from readabs.grab_abs_url import grab_abs_url, grab_abs_zip
16from readabs.print_abs_catalogue import print_abs_catalogue
17
18# RBA related imports
19from readabs.rba_catalogue import print_rba_catalogue, rba_catalogue
20from readabs.rba_meta_data import rba_metacol
21from readabs.read_abs_by_desc import read_abs_by_desc
22from readabs.read_abs_cat import read_abs_cat
23from readabs.read_abs_series import read_abs_series
24from readabs.read_rba_table import read_rba_ocr, read_rba_table
25from readabs.read_support import ReadArgs
26from readabs.recalibrate import recalibrate, recalibrate_value
27from readabs.search_abs_meta import find_abs_id, search_abs_meta
28from readabs.splice import select, select_and_splice, select_one, splice
29from readabs.utilities import (
30    annualise_percentages,
31    annualise_rates,
32    monthly_to_qtly,
33    percent_change,
34    qtly_to_monthly,
35)
36
37# Version and author information
# Look up the package version under this module's own name (__name__).
try:
    __version__ = importlib.metadata.version(__name__)
except importlib.metadata.PackageNotFoundError:
    # No package metadata was found under this name, so use a placeholder
    # version instead of failing at import time.
    __version__ = "0.0.0"  # Fallback for development mode
__author__ = "Bryan Palmer"


# Exposed functions and classes
# Every name listed here is imported at the top of this module.
__all__ = (
    "Datatype",
    "ReadArgs",
    "abs_catalogue",
    "annualise_percentages",
    "annualise_rates",
    "find_abs_id",
    "grab_abs_url",
    "grab_abs_zip",
    "metacol",
    "monthly_to_qtly",
    "percent_change",
    "print_abs_catalogue",
    "print_rba_catalogue",
    "qtly_to_monthly",
    "rba_catalogue",
    "rba_metacol",
    "read_abs_by_desc",
    "read_abs_cat",
    "read_abs_series",
    "read_rba_ocr",
    "read_rba_table",
    "recalibrate",
    "recalibrate_value",
    "search_abs_meta",
    "select",
    "select_and_splice",
    "select_one",
    "splice",
)
# Documentation-generator overrides: presumably a name mapped to False is left
# out of the generated docs (confirm against the pdoc version in use).
# NOTE(review): "grab_abs_url" is the name of a submodule and also of a
# function exported in __all__ above - confirm that this entry hides only the
# submodule and not the public function.
__pdoc__ = {
    "download_cache": False,
    "get_abs_links": False,
    "read_support": False,
    "grab_abs_url": False,
}  # hide submodules from documentation
class ReadArgs(typing.TypedDict):
class ReadArgs(TypedDict):
    """Type definition for ABS data reading arguments.

    Describes the keyword arguments that the ABS reading functions accept
    through ``**kwargs: Unpack[ReadArgs]``. Every key is declared
    ``NotRequired``, so callers may pass any subset of them.
    """

    # NOTE(review): the meaning of each key is not defined in this class.
    # The notes below are inferred from the key names and types only; confirm
    # them against the code that consumes these arguments.
    verbose: NotRequired[bool]
    ignore_errors: NotRequired[bool]
    get_zip: NotRequired[bool]
    get_excel_if_no_zip: NotRequired[bool]
    get_excel: NotRequired[bool]
    # The two "single_*_only" keys are strings, not flags - presumably the
    # name of the one file to fetch (TODO confirm).
    single_zip_only: NotRequired[str]
    single_excel_only: NotRequired[str]
    selected_excel: NotRequired[tuple[str, ...]]
    history: NotRequired[str]
    cache_only: NotRequired[bool]
    keep_non_ts: NotRequired[bool]
    zip_file: NotRequired[str]
    url: NotRequired[str]

Type definition for ABS data reading arguments.

verbose: NotRequired[bool]
ignore_errors: NotRequired[bool]
get_zip: NotRequired[bool]
get_excel_if_no_zip: NotRequired[bool]
get_excel: NotRequired[bool]
single_zip_only: NotRequired[str]
single_excel_only: NotRequired[str]
selected_excel: NotRequired[tuple[str, ...]]
history: NotRequired[str]
cache_only: NotRequired[bool]
keep_non_ts: NotRequired[bool]
zip_file: NotRequired[str]
url: NotRequired[str]
@cache
def abs_catalogue(*, cache_only: bool = False, verbose: bool = False) -> pandas.DataFrame:
 24@cache
 25def abs_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
 26    """Return a DataFrame of ABS Catalogue numbers.
 27
 28    Downloads catalogue data from the ABS website on first call and caches
 29    for future use. The returned DataFrame contains catalogue numbers with
 30    their topics, themes, URLs, and status.
 31
 32    Parameters
 33    ----------
 34    cache_only : bool, default False
 35        If True, only use cached data and don't attempt to download.
 36    verbose : bool, default False
 37        If True, print progress messages.
 38
 39    Returns
 40    -------
 41    DataFrame
 42        DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status']
 43        and index of catalogue IDs.
 44
 45    Raises
 46    ------
 47    CatalogueError
 48        If the catalogue data cannot be retrieved or parsed.
 49    HttpError
 50        If there's a network error downloading the catalogue.
 51    CacheError
 52        If cache_only=True but no cached data is available.
 53
 54    Example
 55    -------
 56    >>> import readabs as ra
 57    >>> catalogue = ra.abs_catalogue()
 58    >>> print(catalogue.head())
 59
 60    """
 61    try:
 62        # Download ABS catalogue page
 63        abs_bytes = get_file(ABS_CATALOGUE_URL, cache_only=cache_only, verbose=verbose)
 64
 65        if not abs_bytes:
 66            raise CatalogueError("No data retrieved from ABS catalogue URL")
 67
 68        # Parse HTML content
 69        try:
 70            html_content = abs_bytes.decode(DEFAULT_ENCODING, errors="replace")
 71        except UnicodeDecodeError as e:
 72            raise CatalogueError(f"Failed to decode HTML content: {e}") from e
 73
 74        # Extract tables from HTML
 75        try:
 76            tables = read_html(StringIO(html_content), extract_links="body")
 77            if not tables:
 78                raise CatalogueError("No tables found in HTML content")
 79            links = tables[-1]  # Get the last table
 80        except (ValueError, IndexError) as e:
 81            raise CatalogueError(f"Failed to parse HTML tables: {e}") from e
 82
 83        # Validate required columns exist
 84        required_cols = ["Catalogue number", "Topic"]
 85        missing_cols = [col for col in required_cols if col not in links.columns]
 86        if missing_cols:
 87            raise CatalogueError(f"Missing required columns: {missing_cols}")
 88
 89        # Extract catalogue numbers and URLs
 90        try:
 91            cats = links["Catalogue number"].apply(Series)[0]
 92            urls = links["Topic"].apply(Series)[1]
 93        except (KeyError, IndexError) as e:
 94            raise CatalogueError(f"Failed to extract catalogue data: {e}") from e
 95
 96        # Process topic URLs to create hierarchical structure
 97        url_snippets = _process_topic_urls(urls)
 98
 99        # Create main DataFrame with hierarchical topic structure
100        frame = _create_topic_frame(url_snippets)
101        frame["URL"] = urls
102
103        # Align catalogue numbers with processed frame
104        cats = cats[frame.index]
105
106        # Process catalogue status (active vs ceased)
107        cat_index, status = _process_catalogue_status(cats)
108
109        frame["Status"] = status
110        frame.index = Index(cat_index)
111        frame.index.name = CATALOGUE_INDEX_NAME
112
113    except (HttpError, CacheError, ValueError) as e:
114        raise CatalogueError(f"Error retrieving ABS catalogue: {e}") from e
115
116    return frame

Return a DataFrame of ABS Catalogue numbers.

Downloads catalogue data from the ABS website on first call and caches for future use. The returned DataFrame contains catalogue numbers with their topics, themes, URLs, and status.

Parameters

cache_only : bool, default False If True, only use cached data and don't attempt to download. verbose : bool, default False If True, print progress messages.

Returns

DataFrame DataFrame with columns ['Theme', 'Parent Topic', 'Topic', 'URL', 'Status'] and index of catalogue IDs.

Raises

CatalogueError If the catalogue data cannot be retrieved or parsed. Network errors (HttpError) and cache failures when cache_only=True (CacheError) are caught and re-raised as CatalogueError, with the original exception chained as the cause.

Example

>>> import readabs as ra
>>> catalogue = ra.abs_catalogue()
>>> print(catalogue.head())
def annualise_percentages(data: ~Datatype, *, periods_per_year: float) -> ~Datatype:
 95def annualise_percentages(data: DataT, *, periods_per_year: float) -> DataT:
 96    """Annualise a growth rate (expressed as a percentage) for a period.
 97
 98    Args:
 99        data : pandas Series or DataFrame - The growth rate (expressed as a
100            percentage) to annualise. Note a growth percentage of 5% is a growth
101            rate of 0.05.
102        periods_per_year : int or float, default 12 - The number of periods in a
103            year. For monthly data, this is 12.
104
105    Returns:
106        pandas Series or DataFrame - The annualised growth expressed as a percentage.
107            For DataFrame input, the annualised growth rate is calculated for each column.
108
109    Raises:
110        InvalidParameterError - If periods_per_year is not positive.
111        InvalidDataError - If data is not a Series or DataFrame.
112
113    """
114    if not isinstance(data, (Series, DataFrame)):
115        raise InvalidDataError("data must be a pandas Series or DataFrame")
116
117    if not isinstance(periods_per_year, (int, float)) or periods_per_year <= 0:
118        raise InvalidParameterError("periods_per_year must be a positive number")
119
120    try:
121        rates = data / 100.0
122        return annualise_rates(rates, periods_per_year=periods_per_year)
123    except Exception as e:
124        raise InvalidDataError(f"Error annualising percentages: {e}") from e

Annualise a growth rate (expressed as a percentage) for a period.

Args: data : pandas Series or DataFrame - The growth rate (expressed as a percentage) to annualise. Note a growth percentage of 5% is a growth rate of 0.05. periods_per_year : int or float, keyword-only and required (there is no default) - The number of periods in a year. For monthly data, this is 12.

Returns: pandas Series or DataFrame - The annualised growth expressed as a percentage. For DataFrame input, the annualised growth rate is calculated for each column.

Raises: InvalidParameterError - If periods_per_year is not positive. InvalidDataError - If data is not a Series or DataFrame.

def annualise_rates(data: ~Datatype, *, periods_per_year: float) -> ~Datatype:
def annualise_rates(data: DataT, *, periods_per_year: float) -> DataT:
    """Annualise a per-period growth rate by compounding it over a year.

    Note: the input is a rate, but the result is a percentage!

    Args:
        data : pandas Series or DataFrame - The per-period growth rate
            (a growth rate of 0.05 is 5%).
        periods_per_year : int or float, keyword-only and required - The number
            of periods in a year. For monthly data, this is 12.

    Returns:
        pandas Series or DataFrame - The annualised growth expressed as a
            percentage (not a rate), computed element-wise as
            ((1 + rate) ** periods_per_year - 1) * 100.

    Raises:
        InvalidDataError - If data is not a Series or DataFrame, or if the
            calculation itself fails.
        InvalidParameterError - If periods_per_year is not a positive number.

    """
    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    is_numeric = isinstance(periods_per_year, (int, float))
    if not is_numeric or periods_per_year <= 0:
        raise InvalidParameterError("periods_per_year must be a positive number")

    try:
        compounded = (1 + data) ** periods_per_year
        return (compounded - 1) * 100
    except Exception as e:
        raise InvalidDataError(f"Error annualising rates: {e}") from e

Annualise a growth rate for a period.

Note: returns a percentage value (and not a rate)!

Args: data : pandas Series or DataFrame - The growth rate to annualise. Note a growth rate of 0.05 is 5%. periods_per_year : int or float, keyword-only and required (there is no default) - The number of periods in a year. For monthly data, this is 12.

Returns: pandas Series or DataFrame - The annualised growth expressed as a percentage (not a rate). For DataFrame input, the annualised growth rate is calculated for each column.

Raises: InvalidParameterError - If periods_per_year is not positive. InvalidDataError - If data is not a Series or DataFrame.

def find_abs_id( meta: pandas.DataFrame, search_terms: dict[str, str], **kwargs: Any) -> tuple[str, str, str]:
def find_abs_id(
    meta: DataFrame,
    search_terms: dict[str, str],
    **kwargs: Any,
) -> tuple[str, str, str]:  # table, series_id, units
    """Locate a single ABS series in the metadata and return its identifiers.

    The search itself is delegated to search_abs_meta(); the first row of
    its result supplies the returned values.

    Parameters
    ----------
    meta : DataFrame
        A pandas DataFrame of metadata from the ABS
        (via read_abs_cat() or read_abs_series()).
    search_terms : dict[str, str]
        A dictionary {search_phrase: meta_column_name, ...} of search terms.
        Because the phrases are dictionary keys, the same phrase cannot be
        applied to two different columns.
    **kwargs : Any
        Additional keyword arguments, forwarded to search_abs_meta().
        validate_unique is extracted here first (default True) and then
        passed on explicitly.
    validate_unique : bool = True
        Forwarded to search_abs_meta(). It is intended to make a search
        that does not yield exactly one match fail with a ValueError
        (behaviour implemented in search_abs_meta(), not here).

    Returns
    -------
    tuple[str, str, str]
        The table, series_id and units taken from the first matching
        metadata row.

    Metacol
    -------
    Because the meta data is a DataFrame, the columns can be referenced by either
    their full textual name, or by the short name defined in the metacol object.
    For example, if metacol is imported as mc, to refer to the
    `Data Item Description` column, the user can refer to it as mc.did.

    Example
    -------
    ```python
    from readabs import metacol as mc  # alias for the ABS meta data column names
    from readabs import read_abs_cat, find_abs_id, recalibrate
    cat_num = "6202.0"  # The ABS labour force survey
    data, meta = read_abs_cat(cat_num)
    search_terms = {
        "Employed total ;  Persons ;": mc.did,
        "Seasonally Adjusted": mc.stype,
        "6202001": mc.table,
    }
    table, series_id, units = find_abs_id(meta, search_terms)
    print(f"Table: {table} Series ID: {series_id} Units: {units}")
    recal_series, recal_units = recalibrate(data[table][series_id], units)
    ```

    """
    validate_unique = kwargs.pop("validate_unique", True)
    matches = search_abs_meta(meta, search_terms, validate_unique=validate_unique, **kwargs)
    first_match = matches.iloc[0]  # only the first matching row is used

    return first_match[mc.table], first_match[mc.id], first_match[mc.unit]

Find a unique ABS series identifier in the ABS metadata.

Parameters

meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. **kwargs : Any Additional keyword arguments. The only additional keyword argument that is used is validate_unique. validate_unique : bool = True Raise a ValueError if the search result is not a single unique match. Note: the default is True for safety.

Returns

tuple[str, str, str] A tuple of the table, series_id and units for the unique series_id that matches the search terms.

Metacol

Because the meta data is a DataFrame, the columns can be referenced by either their full textual name, or by the short name defined in the metacol object. For example, if metacol is imported as mc, to refer to the Data Item Description column, the user can refer to it as mc.did.

Example

from readabs import metacol as mc  # alias for the ABS meta data column names
from readabs import read_abs_cat, find_abs_id, recalibrate
cat_num = "6202.0"  # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
    "Employed total ;  Persons ;": mc.did,
    "Seasonally Adjusted": mc.stype,
    "6202001": mc.table,
}
table, series_id, units = find_abs_id(meta, search_terms)
print(f"Table: {table} Series ID: {series_id} Units: {units}")
recal_series, recal_units = recalibrate(data[table][series_id], units)
@cache
def grab_abs_url( cat: str = '', url: str = '', **kwargs: Unpack[ReadArgs]) -> dict[str, pandas.DataFrame]:
@cache  # minimise slowness with repeat business
def grab_abs_url(
    cat: str = "",
    url: str = "",
    **kwargs: Unpack[ReadArgs],
) -> dict[str, DataFrame]:
    """Collect the data files linked from an ABS web page into DataFrames.

    The page's data-file links are obtained with get_abs_links(). A
    single-file request is tried first; if that yields nothing, all files
    are processed according to the supplied arguments. Results are memoised
    per argument combination by ``@cache``.

    The preferred mechanism for reading ABS data is to use the `read_abs_cat()`
    or `read_abs_series()` functions. This function is provided for those
    cases where the data is not available in the ABS catalogue, where the
    data is not a timeseries, or where the user wants to extract data from
    a specific ABS landing page.

    Parameters
    ----------
    url : str = ""
        A URL for an ABS Catalogue landing page. Either a url or
        a catalogue number must be provided. If both are provided, the
        URL will be used.

    cat : str = ""
        An ABS Catalogue number. If provided, and the URL is not
        provided, then the Catalogue number will be used to get the URL.

    **kwargs : Unpack[ReadArgs]
        Accepts the same keyword arguments as `read_abs_cat()`.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames. Empty if the page has no data-file
        links (a message is printed in that case).

    """
    # resolve the target page, then validate and normalise the options
    url = _get_url(url, cat)
    check_kwargs(kwargs, "grab_abs_url")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_url")  # get the valid kwargs
    verbose = args["verbose"]
    if verbose:
        print(f"grab_abs_url(): {url=}, {args=}")

    # find the data files linked from the page
    links = get_abs_links(url, **args)
    if not links:
        print(f"No data files found at URL: {url}")
        return {}

    # a single-file request takes priority over processing everything
    collected: dict[str, DataFrame] = _process_single_files({}, links, args, verbose=verbose)
    if collected:
        return collected

    # otherwise process all files based on configuration
    return _process_all_files(collected, links, args)

For a given URL, extract the data from the Excel and ZIP file links found on that page.

The data is returned as a dictionary of DataFrames. The Excel files are converted into DataFrames, with each sheet in each Excel file becoming a separate DataFrame. ZIP files are examined for Excel files, which are similarly converted into DataFrames. The dictionary of DataFrames is returned.

The preferred mechanism for reading ABS data is to use the read_abs_cat() or read_abs_series() functions. This function is provided for those cases where the data is not available in the ABS catalogue, where the data is not a timeseries, or where the user wants to extract data from a specific ABS landing page.

Parameters

url : str = "" A URL for an ABS Catalogue landing page. Either a url or a catalogue number must be provided. If both are provided, the URL will be used.

cat : str = "" An ABS Catalogue number. If provided, and the URL is not provided, then the Catalogue number will be used to get the URL.

**kwargs : Unpack[ReadArgs] Accepts the same keyword arguments as read_abs_cat().

Returns

dict[str, DataFrame] A dictionary of DataFrames.

def grab_abs_zip( zip_path: pathlib.Path | str, **kwargs: Unpack[ReadArgs]) -> dict[str, pandas.DataFrame]:
def grab_abs_zip(
    zip_path: Path | str,
    **kwargs: Unpack[ReadArgs]
) -> dict[str, DataFrame]:
    """Read an ABS ZIP file from the local filesystem into DataFrames.

    A convenience wrapper for the rare case where the ZIP file is already
    on disk: the file's bytes are read and handed to the ZIP processor.

    Parameters
    ----------
    zip_path : Path | str
        The local filesystem path of the ABS ZIP file to open and process.

    **kwargs : Unpack[ReadArgs]
        Additional keyword arguments for file retrieval and processing.

    Returns
    -------
    dict[str, DataFrame]
        A dictionary of DataFrames extracted from the ZIP file.

    """
    check_kwargs(kwargs, "grab_abs_zip")  # warn if invalid kwargs
    args = get_args(kwargs, "grab_abs_zip")  # get the valid kwargs

    # accept either a Path or a plain string location
    if isinstance(zip_path, Path):
        archive = zip_path
    else:
        archive = Path(zip_path)

    extracted: dict[str, DataFrame] = {}
    return _process_zip(extracted, archive.read_bytes(), **args)

Grab and process a single ABS ZIP file from a file system location.

This is a convenience function that opens an ABS ZIP file from a local filesystem path. Expect to be used rarely.

Parameters

zip_path : Path | str The local filesystem path of the ABS ZIP file to open and process.

**kwargs : Unpack[ReadArgs] Additional keyword arguments for file retrieval and processing.

Returns

dict[str, DataFrame] A dictionary of DataFrames extracted from the ZIP file.

metacol = Metacol(did='Data Item Description', stype='Series Type', id='Series ID', start='Series Start', end='Series End', num='No. Obs.', unit='Unit', dtype='Data Type', freq='Freq.', cmonth='Collection Month', table='Table', tdesc='Table Description', cat='Catalogue number')
def monthly_to_qtly(data: ~Datatype, q_ending: str = 'DEC', f: str = 'mean') -> ~Datatype:
def monthly_to_qtly(data: DataT, q_ending: str = "DEC", f: str = "mean") -> DataT:
    """Convert monthly data to quarterly data.

    This is done by aggregating (by default, averaging) the three months in
    each quarter. Ignore quarters with less than or more than three months
    data. Drop NA items. Change f to "sum" for a quarterly sum.

    Args:
        data : pandas Series or DataFrame
            The data to convert to quarterly frequency.
        q_ending : str, default "DEC"
            The month in which the quarter ends, as a three-letter month
            abbreviation. For example, "DEC" for December. Matching is
            case-insensitive; the value is upper-cased before use.
        f : str, default "mean"
            The aggregation to apply to the three months in each quarter.
            One of "mean", "sum", "min", "max", "std" or "var".

    Returns:
        pandas Series or DataFrame
            The data with a quarterly PeriodIndex. If a quarter has less than
            three months data, the quarter is dropped. If the quarter has more
            than three months data, the quarter is dropped. Any NA data is removed.
            For DataFrame input, the function is applied to each column.

    Raises:
        InvalidDataError - If data is not a Series or DataFrame, or if the
            conversion itself fails.
        InvalidParameterError - If q_ending or f parameters are invalid.

    """
    # Validate inputs
    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    valid_endings = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]
    # A non-string would otherwise fail on .upper() with an AttributeError
    # rather than the documented InvalidParameterError.
    if not isinstance(q_ending, str) or q_ending.upper() not in valid_endings:
        raise InvalidParameterError(f"q_ending must be one of {valid_endings}")
    # Pass on the same upper-case form that was validated, so that "dec"
    # and "DEC" are treated identically downstream.
    q_ending = q_ending.upper()

    valid_aggregations = ["mean", "sum", "min", "max", "std", "var"]
    if f not in valid_aggregations:
        raise InvalidParameterError(f"f must be one of {valid_aggregations}")

    try:
        if isinstance(data, Series):
            return _monthly_to_qtly_series(data, q_ending, f)
        # Validation above guarantees a DataFrame here: convert column by
        # column and rebuild a frame of the same class.
        converted = {col: _monthly_to_qtly_series(data[col], q_ending, f) for col in data.columns}
        return data.__class__(converted)
    except Exception as e:
        raise InvalidDataError(f"Error converting monthly to quarterly data: {e}") from e

Convert monthly data to quarterly data.

This is done by taking the mean (or sum) of the three months in each quarter. Ignore quarters with less than or more than three months data. Drop NA items. Change f to "sum" for a quarterly sum.

Args: data : pandas Series or DataFrame The data to convert to quarterly frequency. q_ending : str, default "DEC" The month in which the quarter ends. For example, "DEC" for December. f : str, default "mean" The function to apply to the three months in each quarter. Change to "sum" for a quarterly sum. The default is a quarterly mean.

Returns: pandas Series or DataFrame The data with a quarterly PeriodIndex. If a quarter has less than three months data, the quarter is dropped. If the quarter has more than three months data, the quarter is dropped. Any NA data is removed. For DataFrame input, the function is applied to each column.

Raises: InvalidDataError - If data is not a Series or DataFrame. InvalidParameterError - If q_ending or f parameters are invalid.

def percent_change(data: ~Datatype, n_periods: int) -> ~Datatype:
def percent_change(data: DataT, n_periods: int) -> DataT:
    """Compute the percentage change over n_periods in a contiguous, ordered series.

    Each value is compared with the value n_periods rows earlier.

    Args:
        data : pandas Series or DataFrame
            The data to calculate the percentage change for.
        n_periods : int
            The number of periods to calculate the percentage change over.
            Typically 4 for quarterly data, and 12 for monthly data.

    Returns:
        pandas Series or DataFrame - The percentage change in the data over
            n_periods, computed as (data / data.shift(n_periods) - 1) * 100.
            For DataFrame input, this applies to each column.

    Raises:
        InvalidParameterError - If n_periods is not a positive integer.
        InvalidDataError - If data is not a Series or DataFrame, or if the
            calculation itself fails.

    """
    # n_periods is checked before data, so it is reported first if both are bad
    if not isinstance(n_periods, int) or n_periods <= 0:
        raise InvalidParameterError("n_periods must be a positive integer")

    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    try:
        ratio = data / data.shift(n_periods)
        return (ratio - 1) * 100
    except Exception as e:
        raise InvalidDataError(f"Error calculating percentage change: {e}") from e

Calculate a percentage change in a contiguous, ordered series over n_periods.

Args: data : pandas Series or DataFrame The data to calculate the percentage change for. n_periods : int The number of periods to calculate the percentage change over. Typically 4 for quarterly data, and 12 for monthly data.

Returns: pandas Series or DataFrame - The percentage change in the data over n_periods. For DataFrame input, the percentage change is calculated for each column.

Raises: InvalidParameterError - If n_periods is not a positive integer. InvalidDataError - If data is not a Series or DataFrame.

def qtly_to_monthly( data: ~Datatype, *, interpolate: bool = True, limit: int | None = 2, dropna: bool = True) -> ~Datatype:
def qtly_to_monthly(
    data: DataT,
    *,
    interpolate: bool = True,
    limit: int | None = 2,  # only used if interpolate is True
    dropna: bool = True,
) -> DataT:
    """Re-index quarterly data onto a monthly PeriodIndex.

    Each quarterly value is placed at the end of its quarter, the missing
    months are inserted as NA, and the gaps are optionally interpolated
    and/or dropped.

    Args:
        data: Series or DataFrame with a quarterly PeriodIndex that is unique
            and monotonic increasing. The data to convert to monthly frequency.
        interpolate: bool, default True
            Whether to interpolate the missing monthly data.
        limit: int or None, default 2 - The maximum number of consecutive
            missing months to interpolate. Only used if interpolate is True.
        dropna: bool, default True - Whether to drop NA data

    Returns:
        pandas Series or DataFrame - The data with a Monthly PeriodIndex.
            If interpolate is True, the missing monthly data is interpolated.
            If dropna is True, any NA data is removed.

    Raises:
        InvalidDataError - If data is not a Series or DataFrame, if its index
            is not a unique, monotonic increasing quarterly PeriodIndex, or
            if the conversion itself fails.
        InvalidParameterError - If limit is neither None nor a non-negative integer.

    """
    # Validate input data
    if not isinstance(data, (Series, DataFrame)):
        raise InvalidDataError("data must be a pandas Series or DataFrame")

    index = data.index
    if not isinstance(index, PeriodIndex):
        raise InvalidDataError("data index must be a PeriodIndex")

    freq_name = index.freqstr
    if not freq_name or freq_name[0] != "Q":
        raise InvalidDataError("data index must have quarterly frequency")

    if not index.is_unique:
        raise InvalidDataError("data index must be unique")

    if not index.is_monotonic_increasing:
        raise InvalidDataError("data index must be monotonic increasing")

    if limit is not None and (not isinstance(limit, int) or limit < 0):
        raise InvalidParameterError("limit must be a non-negative integer or None")

    # do the heavy lifting
    try:
        quarter_ends = index.to_timestamp(how="end")
        stamped = data.set_axis(labels=quarter_ends, axis="index", copy=True)
        # resampling adds every missing month; first(min_count=1) leaves the
        # new months as NaN (relies on one value per quarter, i.e. a unique index)
        by_month = stamped.resample(rule="ME").first(min_count=1)
        result = _set_axis_monthly_periods(by_month)
    except Exception as e:
        raise InvalidDataError(f"Error in quarterly to monthly conversion: {e}") from e

    if interpolate:
        result = result.interpolate(limit_area="inside", limit=limit)
    if dropna:
        result = result.dropna()

    return result

Convert data from Quarterly PeriodIndex to a Monthly PeriodIndex.

Args: data: Series or DataFrame with quarterly PeriodIndex. Assumes the index is unique. The data to convert to monthly frequency. interpolate: bool, default True Whether to interpolate the missing monthly data. limit: int or None, default 2 - The maximum number of consecutive missing months to interpolate. Only used if interpolate is True. dropna: bool, default True - Whether to drop NA data

Returns: pandas Series or DataFrame - The data with a Monthly PeriodIndex. If interpolate is True, the missing monthly data is interpolated. If dropna is True, any NA data is removed.

Raises: InvalidDataError - If data index is not a quarterly PeriodIndex or has issues. InvalidParameterError - If limit parameter is invalid.

@cache
def rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> pandas.DataFrame:
@cache
def rba_catalogue(*, cache_only: bool = False, verbose: bool = False) -> DataFrame:
    """Return the RBA Catalogue numbers as a DataFrame.

    The catalogue is initially downloaded from the RBA website, and is
    then cached for future use. The work is delegated to
    `_get_rba_links()`, which receives both arguments unchanged.

    Parameters
    ----------
    cache_only : bool = False
        If True, only use the cache.
    verbose : bool = False
        If True, print progress messages.

    Returns
    -------
    DataFrame
        A DataFrame of RBA Catalogue numbers.

    Example
    -------
    ```python
    import readabs as ra
    catalogue = ra.rba_catalogue()
    ```

    """
    catalogue = _get_rba_links(cache_only=cache_only, verbose=verbose)
    return catalogue

Return a DataFrame of RBA Catalogue numbers.

In the first instance, this is downloaded from the RBA website, and cached for future use.

Parameters

cache_only : bool = False If True, only use the cache. verbose : bool = False If True, print progress messages.

Returns

DataFrame A DataFrame of RBA Catalogue numbers.

Example

import readabs as ra
catalogue = ra.rba_catalogue()
rba_metacol = _RbaMetacol(title='Title', desc='Description', freq='Frequency', type='Type', unit='Units', src='Source', pub='Publication date', id='Series ID', table='Table', tdesc='Table Description')
def read_abs_by_desc( wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]], **kwargs: Any) -> tuple[dict[str, pandas.Series], pandas.DataFrame]:
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    wanted : list of str, dict of str:str, or dict of str:dict
        The data item descriptions to search for. If a list, it is a list
        of descriptions to search for. If a dictionary, each key is a name
        and each value is either a string (the data item description to
        search for) or a dictionary of keyword arguments, one of which
        would be the data item description to search for.
    **kwargs : Any
        Keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
            read_abs_cat()).
        - abs_meta : DataFrame - the metadata for the ABS data (from
            read_abs_cat()).
        - for the retrieval of data, the "cat" argument must be present.
            The following arguments, if present, will also be used (ie.
            passed to read_abs_cat()): ["ignore_errors", "get_zip",
            "get_excel_if_no_zip", "get_excel", "cache_only",
            "single_excel_only", "selected_excel", "single_zip_only",
            "verbose"].
        - for the selection of data, the following metacol names, if
            present, will be used to construct the selector: "cat", "did",
            "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
            "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the
            find_abs_id() and search_abs_meta() functions:
            ["validate_unique", "exact_match", "regex", "verbose"].

    Notes
    -----
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
        include sufficient keys from the metacol dataclass to get the data.
        Typically, the "cat" key, the "table" key, and the "stype" key would
        be required. The "did" key would be taken from the wanted list or
        dictionary.
    - if "wanted" is of type dict[str, dict[str, Any]], the inner dictionary
        must contain a "did" key. The other keys that can be used for the
        data retrieval are the same as the metacol dataclass fields, namely:
        "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
        "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
        used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
        type dict[str, dict[str, Any]] and (2) the inner dictionary must
        contain a "cat" key so the data can be retrieved. Other keys that
        can be used for the data retrieval are the same as for
        read_abs_cat(), namely ["ignore_errors", "get_zip",
        "get_excel_if_no_zip", "get_excel", "single_excel_only",
        "selected_excel", "single_zip_only", "cache_only"].

    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, where the keys are the series
      descriptions. The series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.

    Raises
    ------
    TypeError
        If a value in "wanted" is neither a string nor a dictionary.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```

    """
    # nothing requested: hand back empty containers straight away
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()

    # normalise the request to dictionary form
    wanted_items = _wlist_to_wdict(wanted) if isinstance(wanted, list) else wanted

    # call-wide settings drawn from the keyword arguments
    abs_dict = kwargs.get("abs_dict", {})
    abs_meta = kwargs.get("abs_meta", pd.DataFrame())
    base_selector = _get_search_terms(kwargs, {})
    base_search_args = _get_search_args(kwargs, {})

    found_series = {}
    found_meta = pd.DataFrame()
    for name, spec in wanted_items.items():
        # each item works on its own copies of the selector and search args
        shared_args = {
            "data_dict": abs_dict,
            "data_meta": abs_meta,
            "item_selector": base_selector.copy(),
            "search_args": base_search_args.copy(),
        }
        if isinstance(spec, str):
            series, meta = _get_item_from_str(item=spec, **shared_args)
        elif isinstance(spec, dict):
            series, meta = _get_item_from_dict(item_dict=spec, **shared_args, **kwargs)
        else:
            raise TypeError(
                "Each value in the wanted list/dictionary must be either a string or a dictionary."
            )

        # accumulate the search results
        found_series[name] = series
        found_meta = pd.concat([found_meta, meta])

    return found_series, found_meta

Get specific ABS data series by searching the ABS meta data.

Parameters

wanted : list of str, dict of str:str, or dict of str:dict The data item descriptions to search for. If a list, it will be a list of descriptions to search for. If a dictionary, the keys will be a name. The dictionary values can be either a string (the data item description to search for) or a dictionary of keyword arguments, one of which would be the data item description to search for. **kwargs : Any Keyword arguments to control the data retrieval. The keyword arguments can include the following: - abs_dict : dict - the dictionary of ABS data to search (from read_abs_cat()). - abs_meta : DataFrame - the metadata for the ABS data (from read_abs_cat()). - for the retrieval of data, the "cat" argument must be present. The following arguments, if present, will also be used (ie. passed to read_abs_cat()): ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "cache_only", "single_excel_only", "selected_excel", "single_zip_only", "verbose"]. - for the selection of data, the following metacol names, if present, will be used to construct the selector: "cat", "did", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc". - finally, the following arguments will be passed to the find_abs_id() and search_abs_meta() functions: ["validate_unique", "exact_match", "regex", "verbose"].

Notes

  • if "wanted" is of type list[str] or dict[str, str], the kwargs should include sufficient keys from the metacol dataclass to get the data. Typically, the "cat" key, the "table" key, and the "stype" key would be required. The did key would be taken from the wanted list or dictionary.
  • if wanted is of type dict[str, dict[str, Any]], the inner dictionary must contain a "did" key. The other keys that can be used for the data retrieval are the same as the metacol dataclass fields, namely: "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
  • if abs_dict and abs_meta are provided within the kwargs, they will be used to locate and extract the selected data.
  • if abs_dict and abs_meta are not provided, then, (1) wanted must be of type dict[str, dict[str, Any]] and (2) the inner dictionary must contain a "cat" key so the data can be retrieved. Other keys that can be used for the data retrieval are the same as for read_abs_cat(), namely ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "single_excel_only", "selected_excel", "single_zip_only", "cache_only"].

Returns

Returns a tuple of two items:

  • A dictionary of pandas Series objects, where the keys are the series descriptions. The series.name attribute will be the ABS series-id.
  • A pandas DataFrame containing the metadata for the series.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "5206.0"  # The ABS National Accounts
data, meta = ra.read_abs_cat(cat=cat_num)
wanted = ["Gross domestic product: Chain volume measures ;",]
selected, selected_meta = ra.read_abs_by_desc(
    wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
)
@cache
def read_abs_cat( cat: str, **kwargs: Unpack[ReadArgs]) -> tuple[dict[str, pandas.DataFrame], pandas.DataFrame]:
 27@cache  # minimise slowness for any repeat business
 28def read_abs_cat(
 29    cat: str,
 30    **kwargs: Unpack[ReadArgs],
 31) -> tuple[dict[str, DataFrame], DataFrame]:
 32    """For a specific catalogue identifier, return the complete ABS Catalogue information as DataFrames.
 33
 34    This function returns the complete ABS Catalogue information as a
 35    python dictionary of pandas DataFrames, as well as the associated metadata
 36    in a separate DataFrame. The function automates the collection of zip and
 37    excel files from the ABS website. If necessary, these files are downloaded,
 38    and saved into a cache directory. The files are then parsed to extract time
 39    series data, and the associated metadata.
 40
 41    By default, the cache directory is `./.readabs_cache/`. You can change the
 42    default directory name by setting the shell environment variable
 43    `READABS_CACHE_DIR` with the name of the preferred directory.
 44
 45    Parameters
 46    ----------
 47    cat : str
 48        The ABS Catalogue Number for the data to be downloaded and made
 49        available by this function. This argument must be specified in the
 50        function call.
 51
 52    **kwargs : Unpack[ReadArgs]
 53        The following parameters may be passed as optional keyword arguments.
 54
 55    url : str = ""
 56        The URL of an ABS landing page. Use this for discontinued series
 57        that are no longer in the ABS Time Series Directory. If provided,
 58        data will be retrieved from this URL instead of looking up the
 59        catalogue number. Example:
 60        `read_abs_cat(cat="8501.0", url="https://www.abs.gov.au/.../jun-2025")`
 61
 62    keep_non_ts : bool = False
 63        A flag for whether to keep the non-time-series tables
 64        that might form part of an ABS catalogue item. Normally, the
 65        non-time-series information is ignored, and not made available to
 66        the user.
 67
 68    history : str = ""
 69        Provide a month-year string to extract historical ABS data.
 70        For example, you can set history="dec-2023" to the get the ABS data
 71        for a catalogue identifier that was originally published in respect
 72        of Q4 of 2023. Note: not all ABS data sources are structured so that
 73        this technique works in every case; but most are.
 74
 75    verbose : bool = False
 76        Setting this to true may help diagnose why something
 77        might be going wrong with the data retrieval process.
 78
 79    ignore_errors : bool = False
 80        Normally, this function will cease downloading when
 81        an error in encountered. However, sometimes the ABS website has
 82        malformed links, and changing this setting is necessitated. (Note:
 83        if you drop a message to the ABS, they will usually fix broken
 84        links with a business day).
 85
 86    get_zip : bool = True
 87        Download the excel files in .zip files.
 88
 89    get_excel_if_no_zip : bool = True
 90        Only try to download .xlsx files if there are no zip
 91        files available to be downloaded. Only downloading individual excel
 92        files when there are no zip files to download can speed up the
 93        download process.
 94
 95    get_excel : bool = False
 96        The default value means that excel files are not
 97        automatically download. Note: at least one of `get_zip`,
 98        `get_excel_if_no_zip`, or `get_excel` must be true. For most ABS
 99        catalogue items, it is sufficient to just download the one zip
100        file. But note, some catalogue items do not have a zip file.
101        Others have quite a number of zip files.
102
103    single_excel_only : str = ""
104        If this argument is set to a table name (without the
105        .xlsx extension), only that excel file will be downloaded. If
106        set, and only a limited subset of available data is needed,
107        this can speed up download times significantly. Note: overrides
108        `get_zip`, `get_excel_if_no_zip`, `get_excel` and `single_zip_only`.
109
110    selected_excel : tuple[str, ...] = ()
111        If set to a tuple of table names (without the .xlsx extension),
112        only those excel files will be downloaded. Useful when several
113        specific tables are needed and downloading the full zip would
114        be wasteful. Example:
115        `selected_excel=("62020001", "62020017", "62020X28")`.
116        Must be a tuple (not a list) because `read_abs_cat` uses an
117        internal cache that requires hashable arguments. Note: overrides
118        `get_zip`, `get_excel_if_no_zip`, `get_excel` and `single_zip_only`
119        when at least one matching file is found.
120
121    single_zip_only : str = ""
122        If this argument is set to a zip file name (without
123        the .zip extension), only that zip file will be downloaded.
124        If set, and only a limited subset of available data is needed,
125        this can speed up download times significantly. Note: overrides
126        `get_zip`, `get_excel_if_no_zip`, and `get_excel`.
127
128    cache_only : bool = False
129        If set to True, this function will only access
130        data that has been previously cached. Normally, the function
131        checks the date of the cache data against the date of the data
132        on the ABS website, before deciding whether the ABS has fresher
133        data that needs to be downloaded to the cache.
134
135    zip_file: str | Path = ""
136        If set to a specific zip file name (with or without the .zip
137        extension), this function will only extract data from that zip file
138        on the local file system. This may be useful for debugging purposes.
139
140    Returns
141    -------
142    tuple[dict[str, DataFrame], DataFrame]
143        The function returns a tuple of two items. The first item is a
144        python dictionary of pandas DataFrames (which is the primary data
145        associated with the ABS catalogue item). The second item is a
146        DataFrame of ABS metadata for the ABS collection.
147
148        Note:
149        You can retrieve non-timeseries data using the grab_abs_url()
150        function. That takes the URL for the ABS landing page for the ABS
151        collection you are interested in. The read_abs_cat function is for
152        ABS catalogue identifiers which are timeseries data, for which the
153        metadata can be extracted.
154
155    Example
156    -------
157
158    ```python
159    import readabs as ra
160    from pandas import DataFrame
161    cat_num = "6202.0"  # The ABS labour force survey
162    data: tuple[dict[str, DataFrame], DataFrame] = ra.read_abs_cat(cat=cat_num)
163    abs_dict, meta = data
164    ```
165
166    """
167    # --- get the time series data ---
168    if kwargs.get("zip_file"):
169        raw_abs_dict = grab_abs_zip(kwargs["zip_file"], **kwargs)
170    else:
171        raw_abs_dict = grab_abs_url(cat=cat, **kwargs)
172    response = _get_time_series_data(cat, raw_abs_dict, **kwargs)
173
174    if not response:
175        response = {}, DataFrame()
176
177    return response  # dictionary of DataFrames, and a DataFrame of metadata

For a specific catalogue identifier, return the complete ABS Catalogue information as DataFrames.

This function returns the complete ABS Catalogue information as a python dictionary of pandas DataFrames, as well as the associated metadata in a separate DataFrame. The function automates the collection of zip and excel files from the ABS website. If necessary, these files are downloaded, and saved into a cache directory. The files are then parsed to extract time series data, and the associated metadata.

By default, the cache directory is ./.readabs_cache/. You can change the default directory name by setting the shell environment variable READABS_CACHE_DIR with the name of the preferred directory.

Parameters

cat : str The ABS Catalogue Number for the data to be downloaded and made available by this function. This argument must be specified in the function call.

**kwargs : Unpack[ReadArgs] The following parameters may be passed as optional keyword arguments.

url : str = "" The URL of an ABS landing page. Use this for discontinued series that are no longer in the ABS Time Series Directory. If provided, data will be retrieved from this URL instead of looking up the catalogue number. Example: read_abs_cat(cat="8501.0", url="https://www.abs.gov.au/.../jun-2025")

keep_non_ts : bool = False A flag for whether to keep the non-time-series tables that might form part of an ABS catalogue item. Normally, the non-time-series information is ignored, and not made available to the user.

history : str = "" Provide a month-year string to extract historical ABS data. For example, you can set history="dec-2023" to get the ABS data for a catalogue identifier that was originally published in respect of Q4 of 2023. Note: not all ABS data sources are structured so that this technique works in every case; but most are.

verbose : bool = False Setting this to true may help diagnose why something might be going wrong with the data retrieval process.

ignore_errors : bool = False Normally, this function will cease downloading when an error is encountered. However, sometimes the ABS website has malformed links, and changing this setting is necessitated. (Note: if you drop a message to the ABS, they will usually fix broken links within a business day).

get_zip : bool = True Download the excel files in .zip files.

get_excel_if_no_zip : bool = True Only try to download .xlsx files if there are no zip files available to be downloaded. Only downloading individual excel files when there are no zip files to download can speed up the download process.

get_excel : bool = False The default value means that excel files are not automatically downloaded. Note: at least one of get_zip, get_excel_if_no_zip, or get_excel must be true. For most ABS catalogue items, it is sufficient to just download the one zip file. But note, some catalogue items do not have a zip file. Others have quite a number of zip files.

single_excel_only : str = "" If this argument is set to a table name (without the .xlsx extension), only that excel file will be downloaded. If set, and only a limited subset of available data is needed, this can speed up download times significantly. Note: overrides get_zip, get_excel_if_no_zip, get_excel and single_zip_only.

selected_excel : tuple[str, ...] = () If set to a tuple of table names (without the .xlsx extension), only those excel files will be downloaded. Useful when several specific tables are needed and downloading the full zip would be wasteful. Example: selected_excel=("62020001", "62020017", "62020X28"). Must be a tuple (not a list) because read_abs_cat uses an internal cache that requires hashable arguments. Note: overrides get_zip, get_excel_if_no_zip, get_excel and single_zip_only when at least one matching file is found.

single_zip_only : str = "" If this argument is set to a zip file name (without the .zip extension), only that zip file will be downloaded. If set, and only a limited subset of available data is needed, this can speed up download times significantly. Note: overrides get_zip, get_excel_if_no_zip, and get_excel.

cache_only : bool = False If set to True, this function will only access data that has been previously cached. Normally, the function checks the date of the cache data against the date of the data on the ABS website, before deciding whether the ABS has fresher data that needs to be downloaded to the cache.

zip_file: str | Path = "" If set to a specific zip file name (with or without the .zip extension), this function will only extract data from that zip file on the local file system. This may be useful for debugging purposes.

Returns

tuple[dict[str, DataFrame], DataFrame] The function returns a tuple of two items. The first item is a python dictionary of pandas DataFrames (which is the primary data associated with the ABS catalogue item). The second item is a DataFrame of ABS metadata for the ABS collection.

Note:
You can retrieve non-timeseries data using the grab_abs_url()
function. That takes the URL for the ABS landing page for the ABS
collection you are interested in. The read_abs_cat function is for
ABS catalogue identifiers which are timeseries data, for which the
metadata can be extracted.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "6202.0"  # The ABS labour force survey
data: tuple[dict[str, DataFrame], DataFrame] = ra.read_abs_cat(cat=cat_num)
abs_dict, meta = data
def read_abs_series( cat: str, series_id: str | Sequence[str], **kwargs: Unpack[ReadArgs]) -> tuple[pandas.DataFrame, pandas.DataFrame]:
def read_abs_series(
    cat: str,
    series_id: str | Sequence[str],
    **kwargs: Unpack[ReadArgs],
) -> tuple[DataFrame, DataFrame]:
    """Get specific ABS data series by their ABS catalogue and series identifiers.

    Parameters
    ----------
    cat : str
        The ABS catalogue ID.

    series_id : str | Sequence[str]
        An ABS series ID or a sequence of ABS series IDs.

    **kwargs : Unpack[ReadArgs]
        Keyword arguments for the read_abs_series function,
        which are the same as the keyword arguments for the
        read_abs_cat function.

    Returns
    -------
    tuple[DataFrame, DataFrame]
        A tuple of two DataFrames, one for the primary data (one column
        per series ID) and one for the metadata (one row per series ID).

    Raises
    ------
    ValueError
        If a series ID is not found in the catalogue, or if a series has a
        different index frequency from the series already collected. Not
        raised when `ignore_errors` is set; the offending series is
        skipped instead.

    Example
    -------

    ```python
    import readabs as ra
    from pandas import DataFrame
    cat_num = "6202.0"  # The ABS labour force survey
    unemployment_rate = "A84423050A"
    seo = "6202001"  # The ABS table name
    data, meta = ra.read_abs_series(
        cat=cat_num, series_id=unemployment_rate, single_excel_only=seo
    )
    ```

    """
    # check for unexpected keyword arguments/get defaults
    check_kwargs(kwargs, "read_abs_series")
    args = get_args(kwargs, "read_abs_series")

    # read the ABS category data
    cat_data, cat_meta = read_abs_cat(cat, **args)

    # drop repeated series_ids in the meta data,
    # make unique series_ids the index
    # NOTE: read_abs_cat() is cached, so the frame it returns is shared with
    # every other caller. Re-index a new object (set_axis returns one) rather
    # than assigning to .index, which would alter the cached metadata in place.
    cat_meta = cat_meta.set_axis(Index(cat_meta[metacol.id]), axis="index")
    cat_meta = cat_meta.groupby(cat_meta.index).first()

    # get the ABS series data
    if isinstance(series_id, str):
        series_id = [series_id]
    return_data, return_meta = DataFrame(), DataFrame()
    for identifier in series_id:
        # confirm that the series ID is in the catalogue
        if identifier not in cat_meta.index:
            if args["verbose"]:
                print(f"Series ID {identifier} not found in ABS catalogue ID {cat}")
            if args["ignore_errors"]:
                continue
            raise ValueError(f"Series ID {identifier} not found in catalogue {cat}")

        # confirm that the index of the series is compatible
        table = str(cat_meta.loc[identifier, metacol.table])  # str for mypy
        data_series = cat_data[table][identifier]
        if (
            len(return_data) > 0
            and cast("PeriodIndex", return_data.index).freq != cast("PeriodIndex", data_series.index).freq
        ):
            if args["verbose"]:
                print(f"Frequency mismatch for series ID {identifier}")
            if args["ignore_errors"]:
                continue
            raise ValueError(f"Frequency mismatch for series ID {identifier}")

        # add the series data and meta data to the return values
        if len(return_data) > 0:
            # widen the index first so no periods from either side are lost
            return_data = return_data.reindex(return_data.index.union(data_series.index))
        return_data[identifier] = data_series
        return_meta = concat([return_meta, cat_meta.loc[identifier]], axis=1)

    # metadata was accumulated column-wise; transpose to one row per series
    return return_data, return_meta.T

Get specific ABS data series by their ABS catalogue and series identifiers.

Parameters

cat : str The ABS catalogue ID.

series_id : str | Sequence[str] An ABS series ID or a sequence of ABS series IDs.

**kwargs : Any Keyword arguments for the read_abs_series function, which are the same as the keyword arguments for the read_abs_cat function.

Returns

tuple[DataFrame, DataFrame] A tuple of two DataFrames, one for the primary data and one for the metadata.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "6202.0"  # The ABS labour force survey
unemployment_rate = "A84423050A"
seo = "6202001"  # The ABS table name
data, meta = ra.read_abs_series(
    cat=cat_num, series_id=unemployment_rate, single_excel_only=seo
)
def read_rba_ocr(*, monthly: bool = True, **kwargs: Any) -> pandas.Series:
def read_rba_ocr(*, monthly: bool = True, **kwargs: Any) -> Series:  # ignore_errors
    """Read the Official Cash Rate (OCR) from the RBA website.

    Return it in a pandas Series, with either a daily or monthly PeriodIndex,
    depending on the value of the monthly parameter. The default is monthly.

    Parameters
    ----------
    monthly : bool = True
        If True, then the data will be returned with a monthly PeriodIndex.
        If False, then the data will be returned with a daily PeriodIndex.
    **kwargs : Any
        Additional keyword arguments, passed on to read_rba_table().
        The only keyword argument that is used is ignore_errors.
    ignore_errors : bool = False
        If True, then any major errors encountered will be printed and the function
        will return an empty Series. If False, then any major errors encountered
        will raise an exception.

    Returns
    -------
    Series
        The OCR data in a pandas Series, with an index of either daily or monthly Periods.
        The Series is empty if ignore_errors is True and table A2 could not be read.

    Examples
    --------
    ```python
    ocr = read_rba_ocr(monthly=True)
    ```

    """
    # read the OCR table from the RBA website, make float and sort, name the series
    rba, _rba_meta = read_rba_table("A2", **kwargs)  # should have a daily PeriodIndex

    # read_rba_table() returns empty DataFrames when it has ignored an error;
    # honour the documented contract rather than failing on the missing column
    if kwargs.get("ignore_errors", False) and (rba.empty or "ARBAMPCNCRT" not in rba.columns):
        return Series(name="RBA Official Cash Rate", dtype=float)

    ocr_series = rba.loc[lambda x: x.index >= "1990-08-02", "ARBAMPCNCRT"]
    ocr = ocr_series.astype(float).sort_index()  # pyright: ignore[reportAttributeAccessIssue]
    ocr.name = "RBA Official Cash Rate"

    # bring up to date: carry the latest rate forward to the current period
    today = Period(Timestamp.today(), freq=cast("PeriodIndex", ocr.index).freqstr)
    last_period = cast("Period", ocr.index[-1])
    if last_period < today:
        ocr[today] = ocr.iloc[-1]

    if not monthly:
        # fill in missing days and return daily data
        daily_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="D")
        return ocr.reindex(daily_index).ffill()

    # convert to monthly data, keeping last value if duplicates in month
    # fill in missing months
    ocr.index = PeriodIndex(ocr.index, freq="M")
    ocr = ocr[~ocr.index.duplicated(keep="last")]
    monthly_index = period_range(start=ocr.index.min(), end=ocr.index.max(), freq="M")
    return ocr.reindex(monthly_index, method="ffill")

Read the Official Cash Rate (OCR) from the RBA website.

Return it in a pandas Series, with either a daily or monthly PeriodIndex, depending on the value of the monthly parameter. The default is monthly.

Parameters

monthly : bool = True If True, then the data will be returned with a monthly PeriodIndex. If False, then the data will be returned with a daily PeriodIndex. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return an empty Series. If False, then any major errors encountered will raise an exception.

Returns

Series The OCR data in a pandas Series, with an index of either daily or monthly Periods.

Examples

ocr = read_rba_ocr(monthly=True)
def read_rba_table(table: str, **kwargs: Any) -> tuple[pandas.DataFrame, pandas.DataFrame]:
 89def read_rba_table(table: str, **kwargs: Any) -> tuple[DataFrame, DataFrame]:  # ignore_errors
 90    """Read a table from the RBA website and return the actual data and meta data.
 91
 92    Returns the actual data and the meta data in a tuple of two DataFrames.
 93
 94    Parameters
 95    ----------
 96    table : str
 97        The table to read from the RBA website.
 98    **kwargs : Any
 99        Additional keyword arguments.
100        The only keyword argument that is used is ignore_errors.
101    ignore_errors : bool = False
102        If True, then any major errors encountered will be printed and the function
103        will return empty DataFrames. If False, then any major errors encountered
104        will raise an exception.
105
106    Returns
107    -------
108    tuple[DataFrame, DataFrame]
109        The primary data and the meta data in a tuple of two DataFrames.
110
111    Examples
112    --------
113    ```python
114    data, meta = read_rba_table("C1")
115    ```
116
117    """
118    # set-up
119    ignore_errors = kwargs.get("ignore_errors", False)
120    data, meta = DataFrame(), DataFrame()
121
122    # get the Excel file
123    excel = _get_excel_file(table, ignore_errors=ignore_errors, **kwargs)
124    if excel is None:
125        return data, meta
126
127    # read Excel file into DataFrame
128    try:
129        raw = read_excel(BytesIO(excel), header=None, index_col=None)
130    except Exception as e:
131        if ignore_errors:
132            print(f"Ignoring error: {e}")
133            return data, meta
134        raise
135
136    # extract the meta data
137    meta = raw.iloc[1:11, :].T.copy()
138    meta.columns = Index(meta.iloc[0])
139    renamer = {
140        "Mnemonic": rm.id,
141    }  # historical data is inconsistent
142    meta = meta.rename(columns=renamer)
143    meta = meta.iloc[1:, :]
144    meta.index = Index(meta[rm.id])
145    meta[rm.table] = table
146    meta[rm.tdesc] = raw.iloc[0, 0]
147    meta = meta.dropna(how="all", axis=1)  # drop columns with all NaNs
148
149    # extract the data
150    data = raw.iloc[10:, :].copy()
151    data.columns = Index(data.iloc[0])
152    data = data.iloc[1:, :]
153    data.index = DatetimeIndex(data.iloc[:, 0])
154    data = data.iloc[:, 1:]
155    data = data.dropna(how="all", axis=1)  # drop columns with all NaNs
156
157    # can we make the index into a PeriodIndex?
158    days = data.index.to_series().diff(1).dropna().dt.days
159    if days.min() >= MONTHLY_MIN_DAYS and days.max() <= MONTHLY_MAX_DAYS:
160        data.index = PeriodIndex(data.index, freq="M")
161    elif days.min() >= QUARTERLY_MIN_DAYS and days.max() <= QUARTERLY_MAX_DAYS:
162        data.index = PeriodIndex(data.index, freq="Q")
163    elif days.min() >= YEARLY_MIN_DAYS and days.max() <= YEARLY_MAX_DAYS:
164        data.index = PeriodIndex(data.index, freq="Y")
165    else:
166        data.index = PeriodIndex(data.index, freq="D")
167
168    return data, meta

Read a table from the RBA website and return the actual data and meta data.

Returns the actual data and the meta data in a tuple of two DataFrames.

Parameters

table : str The table to read from the RBA website. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is ignore_errors. ignore_errors : bool = False If True, then any major errors encountered will be printed and the function will return empty DataFrames. If False, then any major errors encountered will raise an exception.

Returns

tuple[DataFrame, DataFrame] The primary data and the meta data in a tuple of two DataFrames.

Examples

data, meta = read_rba_table("C1")
def recalibrate(data: ~Datatype, units: str) -> tuple[~Datatype, str]:
def recalibrate(
    data: DataT,
    units: str,
) -> tuple[DataT, str]:
    """Rescale a Series or DataFrame so its values sit in the range -1000 to 1000.

    The units label is adjusted to reflect the rescaling. The numeric rescaling
    itself is delegated to the module's ``_recalibrate`` helper.

    Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar.
    A Series in gives a Series out; a DataFrame in gives a DataFrame out.

    Parameters
    ----------
    data : Series or DataFrame
        The data to recalibrate.
    units : str
        The units of the data. This string should be in the form of
        "Number", "Thousands", "Millions", "Billions", etc. The units
        should be in title case.

    Returns
    -------
    tuple[Series or DataFrame, str]
        The recalibrated data (same type, shape, index, and columns/name as
        the input) and the recalibrated units label in title case.

    Raises
    ------
    TypeError
        If data is neither a Series nor a DataFrame.

    Examples
    --------
    ```python
    from pandas import Series
    from readabs import recalibrate
    s = Series([1_000, 10_000, 100_000, 1_000_000])
    recalibrated, units = recalibrate(s, "$")
    print(f"{recalibrated=}, {units=}")
    ```

    """
    if not isinstance(data, (Series, DataFrame)):
        raise TypeError("data must be a Series or DataFrame")

    # work on a flat array of values; the original shape is restored below
    prepared_units, prefix = _prepare_units(units)
    scaled, new_units = _recalibrate(data.to_numpy().flatten(), prepared_units)

    if prefix:
        new_units = f"{prefix} {new_units}"
        # drop the first of "numbers"/"number" found in the label
        # ("numbers" is tried first so the plural is removed whole)
        token = next((t for t in ("numbers", "number") if t in new_units), None)
        if token is not None:
            new_units = new_units.replace(token, "").strip()
    new_units = new_units.title()

    # rebuild an object of the caller's type and restore its labels
    rebuilt = type(data)(scaled.reshape(data.shape))
    rebuilt.index = data.index
    n_dims = len(data.shape)
    if n_dims == NDIM_DATAFRAME:
        rebuilt.columns = data.columns
    if n_dims == NDIM_SERIES:
        rebuilt.name = data.name  # pyright: ignore[reportAttributeAccessIssue]
    return rebuilt, new_units

Recalibrate a Series or DataFrame so the data is in the range -1000 to 1000.

Change the name of the units to reflect the recalibration.

Note, DataT = TypeVar("DataT", Series, DataFrame). DataT is a constrained typevar. If you provide a Series, you will get a Series back. If you provide a DataFrame, you will get a DataFrame back.

Parameters

data : Series or DataFrame The data to recalibrate. units : str The units of the data. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.

Returns

tuple[Series or DataFrame, str] The recalibrated data (a Series if a Series was provided, or a DataFrame if a DataFrame was provided), together with the recalibrated units.

Examples

from pandas import Series
from readabs import recalibrate
s = Series([1_000, 10_000, 100_000, 1_000_000])
recalibrated, units = recalibrate(s, "$")
print(f"{recalibrated=}, {units=}")
def recalibrate_value(value: float, units: str) -> tuple[float, str]:
 85def recalibrate_value(value: float, units: str) -> tuple[float, str]:
 86    """Recalibrate a floating point value.
 87
 88    The value will be recalibrated so it is in the range -1000 to 1000.
 89    The units will be changed to reflect the recalibration.
 90
 91    Parameters
 92    ----------
 93    value : float
 94        The value to recalibrate.
 95    units : str
 96        The units of the value. This string should be in the form of
 97        "Number", "Thousands", "Millions", "Billions", etc. The units
 98        should be in title case.
 99
100    Returns
101    -------
102    tuple[float, str]
103        A tuple containing the recalibrated value and the recalibrated units.
104
105    Examples
106    --------
107    ```python
108    from readabs import recalibrate_value
109    recalibrated, units = recalibrate_value(10_000_000, "Thousand")
110    print(recalibrated, units)
111    ```
112
113    """
114    series = Series([value])
115    output, units = recalibrate(series, units)
116    return output.to_numpy()[0], units

Recalibrate a floating point value.

The value will be recalibrated so it is in the range -1000 to 1000. The units will be changed to reflect the recalibration.

Parameters

value : float The value to recalibrate. units : str The units of the value. This string should be in the form of "Number", "Thousands", "Millions", "Billions", etc. The units should be in title case.

Returns

tuple[float, str] A tuple containing the recalibrated value and the recalibrated units.

Examples

from readabs import recalibrate_value
recalibrated, units = recalibrate_value(10_000_000, "Thousand")
print(recalibrated, units)
def search_abs_meta( meta: pandas.DataFrame, search_terms: dict[str, str], *, exact_match: bool = False, regex: bool = False, validate_unique: bool = False, **kwargs: Any) -> pandas.DataFrame:
 17def search_abs_meta(
 18    meta: DataFrame,  # sourced from read_abs_series() or read_abs_cat()
 19    search_terms: dict[str, str],  # {search_term: meta_data_column_name, ...}
 20    *,
 21    exact_match: bool = False,
 22    regex: bool = False,
 23    validate_unique: bool = False,  # useful safety-net if you expect only one match
 24    **kwargs: Any,  # verbose flag
 25) -> DataFrame:
 26    """Extract from the ABS meta data those rows that match the search_terms.
 27
 28    Iteratively search the meta data one search_term at a time.
 29
 30    Parameters
 31    ----------
 32    meta : DataFrame
 33        A pandas DataFrame of metadata from the ABS
 34        (via read_abs_cat() or read_abs_series()).
 35    search_terms : dict[str, str]
 36        A dictionary {search_phrase: meta_column_name, ...} of search terms.
 37        Note: the search terms must be unique, as a dictionary cannot hold the
 38        same search term to be applied to different columns.
 39    exact_match : bool = False
 40        Whether to match using == (exact) or .str.contains() (inexact).
 41    regex : bool = False
 42        Whether to use regular expressions in the search.
 43    validate_unique : bool = False
 44        Raise a ValueError if the search result is not unique.
 45    **kwargs : Any
 46        Additional keyword arguments. The only keyword argument
 47        that is used is verbose.
 48    verbose : bool = False
 49        Print additional information while searching; which can
 50        be useful when diagnosing problems with search terms.
 51
 52    Returns
 53    -------
 54    DataFrame
 55        Returns a pandas DataFrame of matching rows (subseted from meta).
 56        Note, The index for the returned meta data will always comprise ABS
 57        series_ids. Duplicate indexes will be removed from the meta data
 58        (ie. where the same ABS series appears in more than one table, this
 59        function will only report the first match).
 60
 61    Metacol
 62    -------
 63    Because the meta data is a DataFrame, the columns can be referenced by either
 64    their full textual name, or by the short name defined in the metacol object.
 65    For example, if metacol is imported as mc, to refer to the
 66    `Data Item Description` column, the user can refer to it as mc.did.
 67
 68    Example
 69    -------
 70    ```python
 71    from readabs import metacol as mc  # alias for the ABS meta data column names
 72    from readabs import read_abs_cat, search_abs_meta
 73    cat_num = "6202.0"  # The ABS labour force survey
 74    data, meta = read_abs_cat(cat_num)
 75    search_terms = {
 76        "Unemployment rate": mc.did,  # the data item description
 77        "Persons": mc.did,
 78        "Seasonally Adjusted": mc.stype,
 79        "Percent": mc.unit,
 80        "6202001": mc.table,
 81    }
 82    rows = search_abs_meta(meta, search_terms, verbose=True)
 83    print(rows)  # should have three rows : FT/PT/All Unemployment rates
 84    ```
 85
 86    """
 87    # get the verbose-flag from kwargs
 88    verbose = kwargs.get("verbose", False)
 89
 90    # establish the starting point
 91    meta_select = meta.copy()  # preserve the original meta data
 92    if verbose:
 93        print(f"In search_abs_meta() {exact_match=} {regex=} {verbose=}")
 94        print(f"In search_abs_meta() starting with {len(meta_select)} rows in the meta_data.")
 95
 96    # iteratively search
 97    for phrase, column in search_terms.items():
 98        if verbose:
 99            print(f"Searching {len(meta_select)}: term: {phrase} in-column: {column}")
100
101        pick_me = (
102            (meta_select[column] == phrase)
103            if (exact_match or column == mc.table)
104            else meta_select[column].str.contains(phrase, regex=regex)
105        )
106        meta_select = meta_select[pick_me]
107        if verbose:
108            print(f"In find_rows() have found {len(meta_select)}")
109
110    # search complete - check results - and return
111    meta_select.index = Index(meta_select[mc.id])
112    meta_select = meta_select[~meta_select.index.duplicated(keep="first")]
113
114    if verbose:
115        print(f"Final selection is {len(meta_select)} rows.")
116
117    elif len(meta_select) == 0:
118        print("Nothing selected?")
119
120    if validate_unique and len(meta_select) != 1:
121        raise ValueError("The selected meta data should only contain one row.")
122
123    return meta_select

Extract from the ABS meta data those rows that match the search_terms.

Iteratively search the meta data one search_term at a time.

Parameters

meta : DataFrame A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()). search_terms : dict[str, str] A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns. exact_match : bool = False Whether to match using == (exact) or .str.contains() (inexact). regex : bool = False Whether to use regular expressions in the search. validate_unique : bool = False Raise a ValueError if the search result is not unique. **kwargs : Any Additional keyword arguments. The only keyword argument that is used is verbose. verbose : bool = False Print additional information while searching; which can be useful when diagnosing problems with search terms.

Returns

DataFrame Returns a pandas DataFrame of matching rows (subsetted from meta). Note: the index for the returned meta data will always comprise ABS series_ids. Duplicate indexes will be removed from the meta data (i.e. where the same ABS series appears in more than one table, this function will only report the first match).

Metacol

Because the meta data is a DataFrame, the columns can be referenced by either their full textual name, or by the short name defined in the metacol object. For example, if metacol is imported as mc, to refer to the Data Item Description column, the user can refer to it as mc.did.

Example

from readabs import metacol as mc  # alias for the ABS meta data column names
from readabs import read_abs_cat, search_abs_meta
cat_num = "6202.0"  # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
    "Unemployment rate": mc.did,  # the data item description
    "Persons": mc.did,
    "Seasonally Adjusted": mc.stype,
    "Percent": mc.unit,
    "6202001": mc.table,
}
rows = search_abs_meta(meta, search_terms, verbose=True)
print(rows)  # should have three rows : FT/PT/All Unemployment rates
def select( sources: Iterable[tuple[dict[str, pandas.DataFrame], pandas.DataFrame, dict[str, str]]], *, require_same_units: bool = True) -> list[pandas.Series]:
def select(sources: Iterable[Source], *, require_same_units: bool = True) -> list[Series]:
    """Pick one Series per ``(data, meta, selector)`` source — iterable in, list out.

    Each source is resolved with :func:`select_one`, which looks the series up
    via ``find_abs_id`` with ``validate_unique=True``.  The resulting list can be
    handed to :func:`splice` directly, or after a per-series transform.

    Parameters
    ----------
    sources
        Iterable of ``(data, meta, selector)``:

        - ``data``   — ``dict[table_name, DataFrame]`` from ``read_abs_cat``.
        - ``meta``   — the matching metadata DataFrame.
        - ``selector`` — ``{search_value: meta_column}`` for ``find_abs_id``, e.g.
          ``{"Index Numbers ;  All groups CPI ;  Australia ;": mc.did,
          "Index Numbers": mc.unit, "Quarter": mc.freq}``.
    require_same_units
        If ``True`` (default) **raise** when the selected series do not all
        carry the same ``attrs["unit"]``.  Set ``False`` to select series with
        different units together on purpose (e.g. two counts and a rate that
        you will combine yourself).

    Returns
    -------
    list[Series]
        One Series per source, in source order, each named by its Series ID
        with its ABS unit in ``series.attrs["unit"]``.

    Raises
    ------
    ValueError
        If ``require_same_units`` and the selected series carry mixed units.

    """
    chosen: list[Series] = []
    for data, meta, selector in sources:
        chosen.append(select_one(data, meta, selector))

    # unit guard is opt-out: nothing more to check when it is disabled
    if not require_same_units:
        return chosen

    # a series without a unit attr is compared as the empty string
    labels = [str(series.attrs.get("unit", "")) for series in chosen]
    if len(set(labels)) <= 1:
        return chosen

    detail = ", ".join(f"{s.name}={u!r}" for s, u in zip(chosen, labels, strict=True))
    raise ValueError(
        f"select: selected series have mismatched units ({detail}). Pass "
        f"require_same_units=False to select different-unit series together."
    )

Select a series for each (data, meta, selector) — the iterable in, iterable out.

The composable selection primitive: takes the iterable of (data, meta, selector) sources and returns the matching list of Series, ready to hand to splice() (directly, or after a per-series transform). Each selection goes through readabs.find_abs_id with validate_unique=True, which de-duplicates on Series ID first — so a selector matching the same series in several tables resolves cleanly, while one matching two genuinely different series raises rather than guessing.

Parameters

sources Iterable of (data, meta, selector):

- ``data``   — ``dict[table_name, DataFrame]`` from ``read_abs_cat``.
- ``meta``   — the matching metadata DataFrame.
- ``selector`` — ``{search_value: meta_column}`` for ``find_abs_id``, e.g.
  ``{"Index Numbers ;  All groups CPI ;  Australia ;": mc.did,
  "Index Numbers": mc.unit, "Quarter": mc.freq}``.

require_same_units If True (default) raise when the selected series do not all share the same ABS unit — units must cohere to be spliced. Set False when you deliberately select different-unit series together (e.g. two counts and a rate that you will combine yourself).

Returns

list[Series] One Series per source, each named by its Series ID with its ABS unit in series.attrs["unit"]. Unpack it (a, b = select([...])), map a transform over it, or pass it straight to splice(). A later transform drops the unit attr — correctly, since the unit is then no longer the ABS one.

Raises

ValueError If require_same_units and the selected series carry mixed units.

def select_and_splice( sources: Iterable[tuple[dict[str, pandas.DataFrame], pandas.DataFrame, dict[str, str]]], *, target: str | None = None, rebase: bool = False, agg: str = 'mean', output: str | None = None, fill: Literal['ffill', 'interpolate'] | None = None, name: str | None = None, require_same_units: bool = True) -> tuple[pandas.Series, str, pandas.DataFrame]:
def select_and_splice(
    sources: Iterable[Source],
    *,
    target: str | None = None,
    rebase: bool = False,
    agg: str = "mean",
    output: str | None = None,
    fill: Literal["ffill", "interpolate"] | None = None,
    name: str | None = None,
    require_same_units: bool = True,
) -> tuple[Series, str, DataFrame]:
    """Select one series per source and :func:`splice` them — the no-transform case.

    Sugar for ``splice(select(sources))`` with a unit guard.  When a transform
    is needed *between* selecting and splicing (e.g. a growth rate), call
    :func:`select` and :func:`splice` yourself instead.

    Parameters
    ----------
    sources
        Ordered iterable of ``(data, meta, selector)``, **highest priority
        first** (same priority rule as :func:`splice`):

        - ``data``   — ``dict[table_name, DataFrame]`` from ``read_abs_cat``.
        - ``meta``   — the matching metadata DataFrame.
        - ``selector`` — ``{search_value: meta_column}`` for ``find_abs_id``,
          e.g. ``{"Index Numbers ;  All groups CPI ;  Australia ;": mc.did,
          "Index Numbers": mc.unit, "Quarter": mc.freq}``.  In the common case
          the only thing differing between two sources is the frequency, so a
          shared *base* selector composes with ``base | {"Quarter": mc.freq}``.
    target, rebase, agg, output, fill, name
        Passed straight through to :func:`splice`.
    require_same_units
        Forwarded to :func:`select`: if ``True`` (default) raise when the
        selected segments carry mixed units; ``False`` overrides (the result is
        then labelled with the first, highest-priority, segment's unit).

    Returns
    -------
    tuple[Series, str, DataFrame]
        The spliced series, the unit of the first (highest-priority) segment,
        and the :func:`splice` join report.  A non-empty report gains
        ``series_id`` and ``unit`` columns (inserted at positions 1 and 2)
        recording what each reported segment resolved to.

    """
    chosen = select(sources, require_same_units=require_same_units)
    unit_labels = [str(series.attrs.get("unit", "")) for series in chosen]

    splice_options = {
        "target": target,
        "rebase": rebase,
        "agg": agg,
        "output": output,
        "fill": fill,
        "name": name,
    }
    spliced, report = splice(chosen, **splice_options)

    # Audit trail: the report's "segment" column is used as a position into
    # the selected segments, so each row can be tagged with its id and unit.
    if len(report) > 0:
        positions = [int(pos) for pos in report["segment"]]
        report.insert(1, "series_id", [str(chosen[pos].name) for pos in positions])
        report.insert(2, "unit", [unit_labels[pos] for pos in positions])

    return spliced, unit_labels[0], report

Select one series per source and splice() them — the no-transform case.

Sugar for splice(select(sources)) with a unit guard. When you need a transform between selecting and splicing (e.g. a growth rate), compose select() and splice() directly instead — that is the whole reason select() is exposed separately.

Parameters

sources Ordered iterable of (data, meta, selector), highest priority first (same priority rule as splice()):

- ``data``   — ``dict[table_name, DataFrame]`` from ``read_abs_cat``.
- ``meta``   — the matching metadata DataFrame.
- ``selector`` — ``{search_value: meta_column}`` for ``find_abs_id``,
  e.g. ``{"Index Numbers ;  All groups CPI ;  Australia ;": mc.did,
  "Index Numbers": mc.unit, "Quarter": mc.freq}``.  In the common case
  the only thing differing between two sources is the frequency, so a
  shared *base* selector composes with ``base | {"Quarter": mc.freq}``.

target, rebase, agg, output, fill, name Passed straight through to splice(). require_same_units Forwarded to select(): if True (default) raise when the selected segments carry mixed units; False overrides (the result is then labelled with the highest-priority segment's unit).

Returns

tuple[Series, str, DataFrame] The spliced series, its unit (the highest-priority segment's unit), and the splice() join report, augmented with series_id and unit columns recording what each segment resolved to.

def select_one( data: dict[str, pandas.DataFrame], meta: pandas.DataFrame, selector: dict[str, str]) -> pandas.Series:
def select_one(data: dict[str, DataFrame], meta: DataFrame, selector: dict[str, str]) -> Series:
    """Return the single Series identified by one ``(data, meta, selector)`` source.

    The selector is resolved with ``find_abs_id(..., validate_unique=True)``,
    which yields the table name, Series ID and unit.  The matching column is
    copied out of ``data[table]``, named by its Series ID, and tagged with its
    unit (as a string) in ``.attrs["unit"]``.  Equivalent to
    ``select([(data, meta, selector)])[0]``.
    """
    located = find_abs_id(meta, selector, validate_unique=True)
    table, series_id, unit = located

    # copy so that renaming/tagging does not touch the caller's DataFrame
    picked = data[table][series_id].copy()
    picked.name = series_id
    picked.attrs["unit"] = str(unit)
    return picked

Select the single Series for one (data, meta, selector) — the single-source wrapper.

Convenience for the common one-selector case; equivalent to select([(data, meta, selector)])[0]. Returns the Series named by its Series ID, with its ABS unit on .attrs["unit"].

def splice( segments: Iterable[pandas.Series], *, target: str | None = None, rebase: bool = False, agg: str = 'mean', output: str | None = None, fill: Literal['ffill', 'interpolate'] | None = None, name: str | None = None) -> tuple[pandas.Series, pandas.DataFrame]:
def splice(
    segments: Iterable[Series],
    *,
    target: str | None = None,
    rebase: bool = False,
    agg: str = "mean",
    output: str | None = None,
    fill: Literal["ffill", "interpolate"] | None = None,
    name: str | None = None,
) -> tuple[Series, DataFrame]:
    """Join mixed-frequency *segments* into a single series, highest priority first.

    Parameters
    ----------
    segments
        Ordered pandas Series (PeriodIndex or DatetimeIndex).  The first one
        has the highest priority: its values win wherever periods overlap and,
        when ``rebase`` is on, it fixes the level the others are rebased to.
    target
        Frequency of the common grid (e.g. ``"M"``, ``"Q-DEC"``).  When not
        given, the finest frequency present is used (anchor clashes step one
        rank finer).
    rebase
        Off by default, so segments are coalesced at their **raw** levels and
        the data is never silently transformed.  ``True`` *multiplicatively*
        rescales each lower-priority segment to the level of the running
        result before it is coalesced.  That assumes **ratio-scale** inputs
        (meaningful zero, proportional discrepancy between segments); index
        series on different base periods (CPI, price/volume indices) are the
        case that needs it.  It is wrong for zero-crossing series (rates,
        balances) and for additive level breaks, and it *invents* a correction
        when same-unit segments already agree — hence opt-in.  See the module
        docstring's *rebase* step.
    agg
        Aggregator applied when a segment is finer than the grid, and when
        downsampling to *output*.  ``"mean"`` suits index levels; use
        ``"sum"`` for flows.
    output
        Optional frequency to resample the spliced result to at the end.
    fill
        Optional gap fill.  With the default ``None`` the result holds only
        periods that actually carry data: no NaN rows are inserted for the
        gaps a coarse segment leaves on a finer grid, and nothing is
        interpolated.  ``"ffill"`` and ``"interpolate"`` first densify the
        result onto the full grid and then fill.
    name
        Name given to the result series; falls back to the first segment's
        name.

    Returns
    -------
    tuple[Series, DataFrame]
        The spliced series and a report with one row per junction.

    Raises
    ------
    ValueError
        If *segments* is empty, or if ``rebase`` is on and a rebase factor is
        non-finite or non-positive.

    """
    ordered = list(segments)
    if not ordered:
        raise ValueError("splice() needs at least one segment.")

    grid = target or _pick_target(ordered)
    gridded = [_to_grid(piece, grid, agg) for piece in ordered]

    merged = gridded[0].copy()
    junctions: list[dict[str, object]] = []
    for i in range(1, len(gridded)):
        seg = gridded[i]
        # Values recorded in the report when rebasing is switched off.
        factor, how, overlap_count, lo, hi = 1.0, "off", 0, None, None
        if rebase:
            factor, how, overlap_count, lo, hi = _rebase_factor(merged, seg)
            # Only sign and finiteness are policed.  A non-finite factor comes
            # from a near-zero denominator; a non-positive one means the
            # overlap means have opposite signs and would flip the
            # back-history.  Either way the inputs are not ratio-scale, so
            # refuse loudly.  Sheer size is not a problem: a genuine
            # base-period difference can legitimately call for a 50x factor.
            if not math.isfinite(factor) or factor <= 0:
                raise ValueError(
                    f"splice: rebase factor for segment {i} ('{seg.name}') is {factor} over "
                    f"{lo}..{hi}. Multiplicative rebasing needs ratio-scale inputs (meaningful "
                    f"zero, proportional discrepancy); a non-finite or non-positive factor means "
                    f"the segments cross zero or differ additively. Pass rebase=False to coalesce "
                    f"raw levels instead."
                )
        scaled = seg * factor
        junctions.append(
            {
                "segment": i,
                "name": str(seg.name),
                "freq_in": str(_pidx(ordered[i]).freqstr),
                "method": how,
                "overlap_n": overlap_count,
                "window_start": "" if lo is None else str(lo),
                "window_end": "" if hi is None else str(hi),
                "factor": round(factor, 6),
                "fills_from": str(seg.dropna().index.min()),
            }
        )
        # Higher-priority values already in place are kept; the new segment
        # only supplies periods that are still missing.
        merged = merged.combine_first(scaled)

    # Default behaviour: retain just the periods that really have data.  No
    # reindexing onto a dense grid (that would manufacture NaN in the gaps a
    # coarse back-history leaves on a finer grid) and no interpolation, so an
    # old, coarse stretch stays sparse and still plots as one unbroken line
    # without holes or invented points.
    merged = merged.dropna().sort_index()

    if output and output != grid:
        merged = _to_grid(merged, output, agg).dropna().sort_index()
        grid = output

    wants_fill = fill in ("ffill", "interpolate")
    if wants_fill and len(merged):
        # Explicitly requested: expand to every period of the grid, then fill.
        every_period = pd.period_range(merged.index.min(), merged.index.max(), freq=grid)
        dense = merged.reindex(every_period)
        if fill == "ffill":
            merged = dense.ffill()
        else:
            merged = dense.interpolate()

    merged.name = name or str(ordered[0].name)
    return merged, DataFrame(junctions)

Splice mixed-frequency segments into one series, highest priority first.

Parameters

segments: Ordered list of pandas Series (PeriodIndex or DatetimeIndex). The first is highest priority: it wins where periods overlap and (when rebase is on) sets the level everything else is rebased to.

target: Common-grid frequency (e.g. "M", "Q-DEC"). Defaults to the finest frequency present (anchor clashes step one rank finer).

rebase: Off by default — segments are coalesced at their raw levels, with no silent transformation of your data. Set True to multiplicatively rescale each lower-priority segment to the running result's level before coalescing. Rebasing assumes ratio-scale inputs (meaningful zero, proportional discrepancy between segments) — splicing index series on different base periods (CPI, price/volume indices) is the case that needs it. It is wrong for zero-crossing series (rates, balances) or additive level breaks, and it invents a correction when same-unit segments already agree — which is why it is opt-in. A non-finite or non-positive factor raises. See the module docstring's rebase step.

agg: Aggregator used when a segment is finer than the grid (or when downsampling to output). "mean" for index levels; use "sum" for flows.

output: Optional final frequency to resample the spliced result to.

fill: Optional gap fill. By default (None) the result contains only the periods that actually have data — no NaN rows are inserted for the gaps a coarse segment leaves on a finer grid, and nothing is interpolated. "ffill" or "interpolate" densify the result onto the full grid first and then fill.

name: Name for the result series (defaults to the first segment's name).

Returns

tuple[Series, DataFrame]: The spliced series and a one-row-per-junction report.