readabs.read_abs_by_desc

Get specific ABS data series by searching for the ABS data item descriptions.

This module provides functionality to search and retrieve ABS data series by their descriptions rather than series IDs.

  1"""Get specific ABS data series by searching for the ABS data item descriptions.
  2
  3This module provides functionality to search and retrieve ABS data series
  4by their descriptions rather than series IDs.
  5"""
  6
  7import inspect
  8from typing import Any
  9
 10# Analytic imports
 11import pandas as pd
 12
 13# local imports
 14from readabs.abs_meta_data import metacol as mc
 15from readabs.read_abs_cat import read_abs_cat
 16from readabs.search_abs_meta import find_abs_id
 17
 18
 19# --- private functions
 20def _work_to_do(wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]] | None) -> bool:
 21    """Check if there is any work to do."""
 22    if wanted is None or len(wanted) == 0:
 23        print("No data requested.")
 24        return False
 25    return True
 26
 27
 28def _wlist_to_wdict(wanted: list[str]) -> dict[str, str]:
 29    """Convert a list of strings to a dictionary of strings:strings.
 30
 31    Note: the keys and values are the same.
 32    Note: any duplicate elements in the list will be lost.
 33    """
 34    return {k: k for k in wanted}
 35
 36
 37def _get_search_terms(input_dict: dict[str, Any], output_dict: dict[str, str]) -> dict[str, str]:
 38    """Build a selector dictionary from the input dictionary."""
 39    search_names = {abbr: term for abbr, term in inspect.getmembers(mc) if not abbr.startswith("_")}
 40    for mc_abbr, meta_column in search_names.items():
 41        if mc_abbr in input_dict:
 42            # the selector dictionary is back-to_front
 43            # ie. {value_sought: column_name}
 44            output_dict[input_dict[mc_abbr]] = meta_column
 45    return output_dict
 46
 47
 48def _get_args(keys: list[str], input_dict: dict[str, Any], output_dict: dict[str, Any]) -> dict[str, Any]:
 49    """Build a retrieval dictionary from the input dictionary."""
 50    for key in keys:
 51        if key in input_dict:
 52            output_dict[key] = input_dict[key]
 53    return output_dict
 54
 55
 56def _get_search_args(input_dict: dict[str, Any], output_dict: dict[str, Any]) -> dict[str, Any]:
 57    """Extract the search arguments from the input dictionary."""
 58    keys = ["validate_unique", "exact_match", "regex", "verbose"]
 59    return _get_args(keys, input_dict, output_dict)
 60
 61
 62def _get_retrieval_args(input_dict: dict[str, Any], output_dict: dict[str, Any]) -> dict[str, Any]:
 63    """Extract the retrieval arguments from the input dictionary."""
 64    keys = [
 65        "ignore_errors",
 66        "get_zip",
 67        "get_excel_if_no_zip",
 68        "get_excel",
 69        "cache_only",
 70        "single_excel_only",
 71        "selected_excel",
 72        "single_zip_only",
 73        "verbose",
 74    ]
 75    return _get_args(keys, input_dict, output_dict)
 76
 77
 78def _get_item_from_str(
 79    item: str,
 80    data_dict: dict[str, pd.DataFrame],
 81    data_meta: pd.DataFrame,
 82    item_selector: dict[str, str],
 83    search_args: dict[str, Any],
 84) -> tuple[pd.Series, pd.DataFrame]:
 85    """Get a data series from the data dictionary and metadata.
 86
 87    Give the series its series-id as a name.
 88    """
 89    if not data_dict or data_meta.empty:
 90        raise ValueError(
 91            "If the wanted data is a string, a populated abs_dict " + "and abs_meta must be provided."
 92        )
 93    item_selector[item] = mc.did  # back_to_front
 94    table, series_id, units = find_abs_id(data_meta, item_selector, **search_args)
 95
 96    series = data_dict[table][series_id]
 97    series.name = series_id
 98    series_meta = data_meta.loc[
 99        (data_meta[mc.table] == table) & (data_meta[mc.id] == series_id) & (data_meta[mc.unit] == units)
100    ]
101    return series, series_meta
102
103
104def _get_item_from_dict(
105    item_dict: dict[str, Any],
106    data_dict: dict[str, pd.DataFrame],
107    data_meta: pd.DataFrame,
108    item_selector: dict[str, str],
109    search_args: dict[str, Any],
110    **kwargs: Any,
111) -> tuple[pd.Series, pd.DataFrame]:
112    # preparation
113    if "did" not in item_dict:
114        raise ValueError("Each inner dictionary must contain a 'did' key.")
115    item = item_dict.pop("did")
116    item_selector = _get_search_terms(item_dict, item_selector)
117    item_search_args = _get_search_args(item_dict, search_args)
118
119    if not data_dict or data_meta.empty:
120        # data retrieval reqquired
121        if "cat" not in item_dict:
122            raise ValueError(
123                "Each inner dictionary must contain a 'cat' key, "
124                "if an abs_dict is not provided/empty or the "
125                "abs_meta is not provided/empty."
126            )
127        ret_args = _get_retrieval_args(kwargs, {})
128        ret_args = _get_retrieval_args(item_dict, ret_args)
129        data_dict, data_meta = read_abs_cat(cat=item_dict["cat"], **ret_args)
130
131    # series extraction based on search terms
132    series, series_meta = _get_item_from_str(
133        item=item,
134        data_dict=data_dict,
135        data_meta=data_meta,
136        item_selector=item_selector,
137        search_args=item_search_args,
138    )
139    return series, series_meta
140
141
142# --- public functions
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    wanted : list of str, dict of str:str, or dict of str:dict
        The data item descriptions to search for. If a list, it will be a
        list of descriptions to search for. If a dictionary, the keys will
        be a name. The dictionary values can be either a string (the data
        item description to search for) or a dictionary of keyword
        arguments, one of which would be the data item description to
        search for.
    **kwargs : Any
        Keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
            read_abs_cat()).
        - abs_meta : DataFrame - the metadata for the ABS data (from
            read_abs_cat()).
        - for the retrieval of data, the "cat" argument must be present.
            The following arguments, if present, will also be used (ie.
            passed to read_abs_cat()): ["ignore_errors", "get_zip",
            "get_excel_if_no_zip", "get_excel", "cache_only",
            "single_excel_only", "selected_excel", "single_zip_only",
            "verbose"].
        - for the selection of data, the following metacol names, if present,
            will be used to construct the selector: "cat", "did",
            "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
            "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the find_abs_id()
            and search_abs_meta() functions: ["validate_unique", "exact_match",
            "regex", "verbose"].

    Notes
    -----
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
        include sufficient keys from the metacol dataclass to get the data.
        Typically, the "cat" key, the "table" key, and the "stype" key would
        be required. The did key would be taken from the wanted list or
        dictionary.
    - if wanted is of type dict[str, dict[str, Any]], the inner dictionary
        must contain a "did" key. The other keys that can be used for the
        data retrieval are the same as the metacol dataclass fields, namely:
        "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
        "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
        used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
        type dict[str, dict[str, Any]] and (2) the inner dictionary must
        contain a "cat" key so the data can be retrieved. Other keys that
        can be used for the data retrieval are the same as for read_abs_cat(),
        namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
        "get_excel", "single_excel_only", "selected_excel",
        "single_zip_only", "cache_only"].

    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, where the keys are the keys of
      `wanted` (the descriptions themselves when `wanted` is a list). The
      series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.
    If nothing is requested, an empty dictionary and an empty DataFrame are
    returned.

    Raises
    ------
    TypeError
        If a value in `wanted` is neither a string nor a dictionary.

    Example
    -------

    ```python
    import readabs as ra
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```

    """
    # - preparation
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()
    if isinstance(wanted, list):
        wanted = _wlist_to_wdict(wanted)
    # an explicit None is treated the same as an omitted argument
    abs_dict = kwargs.get("abs_dict")
    if abs_dict is None:
        abs_dict = {}
    abs_meta = kwargs.get("abs_meta")
    if abs_meta is None:
        abs_meta = pd.DataFrame()
    kwarg_selector = _get_search_terms(kwargs, {})
    search_args = _get_search_args(kwargs, {})

    return_dict: dict[str, pd.Series] = {}
    meta_frames: list[pd.DataFrame] = []
    for key, value in wanted.items():
        # each item gets its own copies, because the helpers extend them in place
        item_selector = kwarg_selector.copy()
        item_search_args = search_args.copy()
        if isinstance(value, str):
            series, meta = _get_item_from_str(
                item=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
            )
        elif isinstance(value, dict):
            series, meta = _get_item_from_dict(
                item_dict=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
                **kwargs,
            )
        else:
            raise TypeError("Each value in the wanted list/dictionary must be either a string or a dictionary.")

        # save search results
        return_dict[key] = series
        meta_frames.append(meta)

    # concatenate once: avoids repeated copying and avoids concatenating
    # onto an empty DataFrame
    return_meta = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
    return return_dict, return_meta
267
268
269# --- testing ---
if __name__ == "__main__":

    def test1() -> None:
        """Test case: get a list of dids from already-loaded data."""
        cat = "5206.0"
        table = "5206001_Key_Aggregates"
        stype = "Seasonally Adjusted"
        data_dict, data_meta = read_abs_cat(cat=cat, single_excel_only=table, verbose=False)

        # choose the descriptions to request from the loaded metadata
        row_filter = (
            (data_meta[mc.table] == table)
            & (data_meta[mc.stype] == stype)
            & data_meta[mc.unit].str.contains("Million")
            & data_meta[mc.did].str.contains("Chain volume measures")
        )
        matching_rows = data_meta.loc[row_filter]
        descriptions = matching_rows[mc.did].to_list()
        print(f"get_these: {descriptions}")

        selected, selected_meta = read_abs_by_desc(
            wanted=descriptions,
            abs_dict=data_dict,
            abs_meta=data_meta,
            table=table,
            stype=stype,
        )
        print(selected, selected_meta)

    def test2() -> None:
        """Test case: get a dictionary of dids, retrieving the data per item."""
        seasonally_adjusted = "Seasonally Adjusted"
        gdp_table = "5206001_Key_Aggregates"
        uer_table = "6202001"

        # two series, each from a different ABS Catalogue Number
        gdp_request = {
            "cat": "5206.0",
            "table": gdp_table,
            "stype": seasonally_adjusted,
            "did": "Gross domestic product: Chain volume measures ;",
            "single_excel_only": gdp_table,
        }
        uer_request = {
            "cat": "6202.0",
            "table": uer_table,
            "stype": seasonally_adjusted,
            "did": "Unemployment rate ;  Persons ;",
            "single_excel_only": uer_table,
        }
        requests = {"GDP": gdp_request, "Unemployment Rate": uer_request}

        selected, selected_meta = read_abs_by_desc(wanted=requests)
        print(selected_meta)
        print(selected)

    # --- test 1: get a list of dids, then test 2: get a dictionary of dids
    for run_test in (test1, test2):
        run_test()
def read_abs_by_desc( wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]], **kwargs: Any) -> tuple[dict[str, pandas.Series], pandas.DataFrame]:
def read_abs_by_desc(
    wanted: list[str] | dict[str, str] | dict[str, dict[str, Any]],
    **kwargs: Any,
) -> tuple[dict[str, pd.Series], pd.DataFrame]:
    """Get specific ABS data series by searching the ABS meta data.

    Parameters
    ----------
    wanted : list of str, dict of str:str, or dict of str:dict
        The data item descriptions to search for. If a list, it will be a
        list of descriptions to search for. If a dictionary, the keys will
        be a name. The dictionary values can be either a string (the data
        item description to search for) or a dictionary of keyword
        arguments, one of which would be the data item description to
        search for.
    **kwargs : Any
        Keyword arguments to control the data retrieval.
        The keyword arguments can include the following:
        - abs_dict : dict - the dictionary of ABS data to search (from
            read_abs_cat()).
        - abs_meta : DataFrame - the metadata for the ABS data (from
            read_abs_cat()).
        - for the retrieval of data, the "cat" argument must be present.
            The following arguments, if present, will also be used (ie.
            passed to read_abs_cat()): ["ignore_errors", "get_zip",
            "get_excel_if_no_zip", "get_excel", "cache_only",
            "single_excel_only", "selected_excel", "single_zip_only",
            "verbose"].
        - for the selection of data, the following metacol names, if present,
            will be used to construct the selector: "cat", "did",
            "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
            "cmonth", "table", "tdesc".
        - finally, the following arguments will be passed to the find_abs_id()
            and search_abs_meta() functions: ["validate_unique", "exact_match",
            "regex", "verbose"].

    Notes
    -----
    - if "wanted" is of type list[str] or dict[str, str], the kwargs should
        include sufficient keys from the metacol dataclass to get the data.
        Typically, the "cat" key, the "table" key, and the "stype" key would
        be required. The did key would be taken from the wanted list or
        dictionary.
    - if wanted is of type dict[str, dict[str, Any]], the inner dictionary
        must contain a "did" key. The other keys that can be used for the
        data retrieval are the same as the metacol dataclass fields, namely:
        "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq",
        "cmonth", "table", "tdesc".
    - if abs_dict and abs_meta are provided within the kwargs, they will be
        used to locate and extract the selected data.
    - if abs_dict and abs_meta are not provided, then, (1) wanted must be of
        type dict[str, dict[str, Any]] and (2) the inner dictionary must
        contain a "cat" key so the data can be retrieved. Other keys that
        can be used for the data retrieval are the same as for read_abs_cat(),
        namely ["ignore_errors", "get_zip", "get_excel_if_no_zip",
        "get_excel", "single_excel_only", "selected_excel",
        "single_zip_only", "cache_only"].

    Returns
    -------
    Returns a tuple of two items:
    - A dictionary of pandas Series objects, where the keys are the keys of
      `wanted` (the descriptions themselves when `wanted` is a list). The
      series.name attribute will be the ABS series-id.
    - A pandas DataFrame containing the metadata for the series.
    If nothing is requested, an empty dictionary and an empty DataFrame are
    returned.

    Raises
    ------
    TypeError
        If a value in `wanted` is neither a string nor a dictionary.

    Example
    -------

    ```python
    import readabs as ra
    cat_num = "5206.0"  # The ABS National Accounts
    data, meta = ra.read_abs_cat(cat=cat_num)
    wanted = ["Gross domestic product: Chain volume measures ;",]
    selected, selected_meta = ra.read_abs_by_desc(
        wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
    )
    ```

    """
    # - preparation
    if not _work_to_do(wanted):
        return {}, pd.DataFrame()
    if isinstance(wanted, list):
        wanted = _wlist_to_wdict(wanted)
    # an explicit None is treated the same as an omitted argument
    abs_dict = kwargs.get("abs_dict")
    if abs_dict is None:
        abs_dict = {}
    abs_meta = kwargs.get("abs_meta")
    if abs_meta is None:
        abs_meta = pd.DataFrame()
    kwarg_selector = _get_search_terms(kwargs, {})
    search_args = _get_search_args(kwargs, {})

    return_dict: dict[str, pd.Series] = {}
    meta_frames: list[pd.DataFrame] = []
    for key, value in wanted.items():
        # each item gets its own copies, because the helpers extend them in place
        item_selector = kwarg_selector.copy()
        item_search_args = search_args.copy()
        if isinstance(value, str):
            series, meta = _get_item_from_str(
                item=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
            )
        elif isinstance(value, dict):
            series, meta = _get_item_from_dict(
                item_dict=value,
                data_dict=abs_dict,
                data_meta=abs_meta,
                item_selector=item_selector,
                search_args=item_search_args,
                **kwargs,
            )
        else:
            raise TypeError("Each value in the wanted list/dictionary must be either a string or a dictionary.")

        # save search results
        return_dict[key] = series
        meta_frames.append(meta)

    # concatenate once: avoids repeated copying and avoids concatenating
    # onto an empty DataFrame
    return_meta = pd.concat(meta_frames) if meta_frames else pd.DataFrame()
    return return_dict, return_meta

Get specific ABS data series by searching the ABS meta data.

Parameters

wanted : list of str, dict of str:str, or dict of str:dict The data item descriptions to search for. If a list, it will be a list of descriptions to search for. If a dictionary, the keys will be a name. The dictionary values can be either a string (the data item description to search for) or a dictionary of keyword arguments, one of which would be the data item description to search for. **kwargs : Any Keyword arguments to control the data retrieval. The keyword arguments can include the following: - abs_dict : dict - the dictionary of ABS data to search (from read_abs_cat()). - abs_meta : DataFrame - the metadata for the ABS data (from read_abs_cat()). - for the retrieval of data, the "cat" argument must be present. The following arguments, if present, will also be used (ie. passed to read_abs_cat()): ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "cache_only", "single_excel_only", "selected_excel", "single_zip_only", "verbose"]. - for the selection of data, the following metacol names, if present, will be used to construct the selector: "cat", "did", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc". - finally, the following arguments will be passed to the find_abs_id() and search_abs_meta() functions: ["validate_unique", "exact_match", "regex", "verbose"].

Notes

  • if "wanted" is of type list[str] or dict[str, str], the kwargs should include sufficient keys from the metacol dataclass to get the data. Typically, the "cat" key, the "table" key, and the "stype" key would be required. The did key would be taken from the wanted list or dictionary.
  • if wanted is of type dict[str, dict[str, Any]], the inner dictionary must contain a "did" key. The other keys that can be used for the data retrieval are the same as the metacol dataclass fields, namely: "cat", "stype", "id", "start", "end", "num", "unit", "dtype", "freq", "cmonth", "table", "tdesc".
  • if abs_dict and abs_meta are provided within the kwargs, they will be used to locate and extract the selected data.
  • if abs_dict and abs_meta are not provided, then, (1) wanted must be of type dict[str, dict[str, Any]] and (2) the inner dictionary must contain a "cat" key so the data can be retrieved. Other keys that can be used for the data retrieval are the same as for read_abs_cat(), namely ["ignore_errors", "get_zip", "get_excel_if_no_zip", "get_excel", "single_excel_only", "selected_excel", "single_zip_only", "cache_only"].

Returns

Returns a tuple of two items:

  • A dictionary of pandas Series objects, where the keys are the keys of wanted (the series descriptions themselves when wanted is a list). The series.name attribute will be the ABS series-id.
  • A pandas DataFrame containing the metadata for the series.

Example

import readabs as ra
from pandas import DataFrame
cat_num = "5206.0"  # The ABS National Accounts
data, meta = ra.read_abs_cat(cat=cat_num)
wanted = ["Gross domestic product: Chain volume measures ;",]
selected, selected_meta = ra.read_abs_by_desc(
    wanted=wanted, abs_dict=data, abs_meta=meta, table="5206001_Key_Aggregates"
)