readabs.search_abs_meta

Search a DataFrame of ABS meta data, using a dictionary of search terms, to identify the row or rows that match all of the search terms.

  1"""Search a DataFrame of ABS meta data, using a dictionary of search terms,
  2to identify the row or rows that match all of the search terms."""
  3
  4from typing import Any
  5from pandas import DataFrame, Index
  6
  7# local imports
  8from readabs.abs_meta_data import metacol as mc
  9from readabs.read_abs_cat import read_abs_cat
 10
 11
 12def search_abs_meta(
 13    meta: DataFrame,  # sourced from read_abs_series() or read_abs_cat()
 14    search_terms: dict[str, str],  # {search_term: meta_data_column_name, ...}
 15    exact_match: bool = False,
 16    regex: bool = False,
 17    validate_unique=False,  # useful safety-net if you expect only one match
 18    **kwargs: Any,
 19) -> DataFrame:
 20    """Extract from the ABS meta data those rows that match the
 21    search_terms, by iteratively searching the meta data one
 22    search_term at a time.
 23
 24    Parameters
 25    ----------
 26    meta : DataFrame
 27        A pandas DataFrame of metadata from the ABS
 28        (via read_abs_cat() or read_abs_series()).
 29    search_terms : dict[str, str]
 30        A dictionary {search_phrase: meta_column_name, ...} of search terms.
 31        Note: the search terms must be unique, as a dictionary cannot hold the
 32        same search term to be applied to different columns.
 33    exact_match : bool = False
 34        Whether to match using == (exact) or .str.contains() (inexact).
 35    regex : bool = False
 36        Whether to use regular expressions in the search.
 37    validate_unique : bool = False
 38        Raise a ValueError if the search result is not unique.
 39    **kwargs : Any
 40        Additional keyword arguments. The only keyword argument
 41        that is used is verbose.
 42    verbose : bool = False
 43        Print additional information while searching; which can
 44        be useful when diagnosing problems with search terms.
 45
 46    Returns
 47    -------
 48    DataFrame
 49        Returns a pandas DataFrame of matching rows (subseted from meta).
 50        Note, The index for the returned meta data will always comprise ABS
 51        series_ids. Duplicate indexes will be removed from the meta data
 52        (ie. where the same ABS series appears in more than one table, this
 53        function will only report the first match).
 54
 55    Metacol
 56    -------
 57    Because the meta data is a DataFrame, the columns can be referenced by either
 58    their full textual name, or by the short name defined in the metacol object.
 59    For example, if metacol is imported as mc, to refer to the
 60    `Data Item Description` column, the user can refer to it as mc.did.
 61
 62    Example
 63    -------
 64    ```python
 65    from readabs import metacol as mc  # alias for the ABS meta data column names
 66    from readabs import read_abs_cat, search_abs_meta
 67    cat_num = "6202.0"  # The ABS labour force survey
 68    data, meta = read_abs_cat(cat_num)
 69    search_terms = {
 70        "Unemployment rate": mc.did,  # the data item description
 71        "Persons": mc.did,
 72        "Seasonally Adjusted": mc.stype,
 73        "Percent": mc.unit,
 74        "6202001": mc.table,
 75    }
 76    rows = search_abs_meta(meta, search_terms, verbose=True)
 77    print(rows)  # should have three rows : FT/PT/All Unemployment rates
 78    ```"""
 79
 80    # get the verbose-flag from kwargs
 81    verbose = kwargs.get("verbose", False)
 82
 83    # establish the starting point
 84    meta_select = meta.copy()  # preserve the original meta data
 85    if verbose:
 86        print(f"In search_abs_meta() {exact_match=} {regex=} {verbose=}")
 87        print(
 88            f"In search_abs_meta() starting with {len(meta_select)} rows in the meta_data."
 89        )
 90
 91    # iteratively search
 92    for phrase, column in search_terms.items():
 93        if verbose:
 94            print(f"Searching {len(meta_select)}: term: {phrase} in-column: {column}")
 95
 96        pick_me = (
 97            (meta_select[column] == phrase)
 98            if (exact_match or column == mc.table)
 99            else meta_select[column].str.contains(phrase, regex=regex)
100        )
101        meta_select = meta_select[pick_me]
102        if verbose:
103            print(f"In find_rows() have found {len(meta_select)}")
104
105    # search complete - check results - and return
106    meta_select.index = Index(meta_select[mc.id])
107    meta_select = meta_select[~meta_select.index.duplicated(keep="first")]
108
109    if verbose:
110        print(f"Final selection is {len(meta_select)} rows.")
111
112    elif len(meta_select) == 0:
113        print("Nothing selected?")
114
115    if validate_unique and len(meta_select) != 1:
116        raise ValueError("The selected meta data should only contain one row.")
117
118    return meta_select
119
120
def find_abs_id(
    meta: DataFrame,
    search_terms: dict[str, str],
    **kwargs: Any,
) -> tuple[str, str, str]:  # table, series_id, units
    """Locate a single ABS series in the metadata and report where it lives.

    The search itself is delegated to search_abs_meta(); this function
    takes the first row of that result and returns three of its fields.

    Parameters
    ----------
    meta : DataFrame
        A pandas DataFrame of metadata from the ABS
        (via read_abs_cat() or read_abs_series()).
    search_terms : dict[str, str]
        A dictionary {search_phrase: meta_column_name, ...} of search terms.
        Note: the search terms must be unique, as a dictionary cannot hold the
        same search term to be applied to different columns.
    **kwargs : Any
        Additional keyword arguments. validate_unique is consumed here;
        everything else is forwarded to search_abs_meta().
    validate_unique : bool = True
        Raise a ValueError if the search result is not a single
        unique match. Note: the default is True for safety.

    Returns
    -------
    tuple[str, str, str]
        A tuple of the table, series_id and units taken from the first
        row that matches the search terms.

    Raises
    ------
    ValueError
        Raised by search_abs_meta() when validate_unique is True and the
        search does not select exactly one row.
    IndexError
        Raised by .iloc[0] when validate_unique is False and nothing matches.

    Metacol
    -------
    Because the meta data is a DataFrame, the columns can be referenced by either
    their full textual name, or by the short name defined in the metacol object.
    For example, if metacol is imported as mc, to refer to the
    `Data Item Description` column, the user can refer to it as mc.did.

    Example
    -------
    ```python
    from readabs import metacol as mc  # alias for the ABS meta data column names
    from readabs import read_abs_cat, find_abs_id, recalibrate
    cat_num = "6202.0"  # The ABS labour force survey
    data, meta = read_abs_cat(cat_num)
    search_terms = {
        "Employed total ;  Persons ;": mc.did,
        "Seasonally Adjusted": mc.stype,
        "6202001": mc.table,
    }
    table, series_id, units = find_abs_id(meta, search_terms)
    print(f"Table: {table} Series ID: {series_id} Units: {units}")
    recal_series, recal_units = recalibrate(data[table][series_id], units)
    ```"""

    # uniqueness checking is on unless the caller explicitly switches it off;
    # pop it so it is not passed twice when the rest of kwargs is forwarded
    must_be_unique = kwargs.pop("validate_unique", True)

    matches = search_abs_meta(
        meta, search_terms, validate_unique=must_be_unique, **kwargs
    )
    first_row = matches.iloc[0]

    return first_row[mc.table], first_row[mc.id], first_row[mc.unit]
185
186
if __name__ == "__main__":

    def test_search_abs_meta():
        """Smoke-test search_abs_meta() using the ABS labour force survey."""

        catalogue = "6202.0"  # The ABS labour force survey
        _unused, metadata = read_abs_cat(catalogue)
        terms = {
            "Unemployment rate": mc.did,  # the data item description
            "Persons": mc.did,
            "Seasonally Adjusted": mc.stype,
            "Percent": mc.unit,
            "6202001": mc.table,
        }
        matched = search_abs_meta(metadata, terms, verbose=True)
        print(matched)  # should have three rows : FT/PT/All Unemployment rates

    def test_find_abs_id():
        """Smoke-test find_abs_id() using the ABS labour force survey."""

        catalogue = "6202.0"  # The ABS labour force survey
        _unused, metadata = read_abs_cat(catalogue)
        terms = {
            "Employed total ;  Persons ;": mc.did,
            "Seasonally Adjusted": mc.stype,
            "6202001": mc.table,
        }
        table, series_id, units = find_abs_id(metadata, terms)
        print(f"Table: {table} Series ID: {series_id} Units: {units}")

    # run the smoke tests: the multi-row search first, then the unique lookup
    test_search_abs_meta()
    test_find_abs_id()
def search_abs_meta( meta: pandas.core.frame.DataFrame, search_terms: dict[str, str], exact_match: bool = False, regex: bool = False, validate_unique=False, **kwargs: Any) -> pandas.core.frame.DataFrame:
 13def search_abs_meta(
 14    meta: DataFrame,  # sourced from read_abs_series() or read_abs_cat()
 15    search_terms: dict[str, str],  # {search_term: meta_data_column_name, ...}
 16    exact_match: bool = False,
 17    regex: bool = False,
 18    validate_unique=False,  # useful safety-net if you expect only one match
 19    **kwargs: Any,
 20) -> DataFrame:
 21    """Extract from the ABS meta data those rows that match the
 22    search_terms, by iteratively searching the meta data one
 23    search_term at a time.
 24
 25    Parameters
 26    ----------
 27    meta : DataFrame
 28        A pandas DataFrame of metadata from the ABS
 29        (via read_abs_cat() or read_abs_series()).
 30    search_terms : dict[str, str]
 31        A dictionary {search_phrase: meta_column_name, ...} of search terms.
 32        Note: the search terms must be unique, as a dictionary cannot hold the
 33        same search term to be applied to different columns.
 34    exact_match : bool = False
 35        Whether to match using == (exact) or .str.contains() (inexact).
 36    regex : bool = False
 37        Whether to use regular expressions in the search.
 38    validate_unique : bool = False
 39        Raise a ValueError if the search result is not unique.
 40    **kwargs : Any
 41        Additional keyword arguments. The only keyword argument
 42        that is used is verbose.
 43    verbose : bool = False
 44        Print additional information while searching; which can
 45        be useful when diagnosing problems with search terms.
 46
 47    Returns
 48    -------
 49    DataFrame
 50        Returns a pandas DataFrame of matching rows (subseted from meta).
 51        Note, The index for the returned meta data will always comprise ABS
 52        series_ids. Duplicate indexes will be removed from the meta data
 53        (ie. where the same ABS series appears in more than one table, this
 54        function will only report the first match).
 55
 56    Metacol
 57    -------
 58    Because the meta data is a DataFrame, the columns can be referenced by either
 59    their full textual name, or by the short name defined in the metacol object.
 60    For example, if metacol is imported as mc, to refer to the
 61    `Data Item Description` column, the user can refer to it as mc.did.
 62
 63    Example
 64    -------
 65    ```python
 66    from readabs import metacol as mc  # alias for the ABS meta data column names
 67    from readabs import read_abs_cat, search_abs_meta
 68    cat_num = "6202.0"  # The ABS labour force survey
 69    data, meta = read_abs_cat(cat_num)
 70    search_terms = {
 71        "Unemployment rate": mc.did,  # the data item description
 72        "Persons": mc.did,
 73        "Seasonally Adjusted": mc.stype,
 74        "Percent": mc.unit,
 75        "6202001": mc.table,
 76    }
 77    rows = search_abs_meta(meta, search_terms, verbose=True)
 78    print(rows)  # should have three rows : FT/PT/All Unemployment rates
 79    ```"""
 80
 81    # get the verbose-flag from kwargs
 82    verbose = kwargs.get("verbose", False)
 83
 84    # establish the starting point
 85    meta_select = meta.copy()  # preserve the original meta data
 86    if verbose:
 87        print(f"In search_abs_meta() {exact_match=} {regex=} {verbose=}")
 88        print(
 89            f"In search_abs_meta() starting with {len(meta_select)} rows in the meta_data."
 90        )
 91
 92    # iteratively search
 93    for phrase, column in search_terms.items():
 94        if verbose:
 95            print(f"Searching {len(meta_select)}: term: {phrase} in-column: {column}")
 96
 97        pick_me = (
 98            (meta_select[column] == phrase)
 99            if (exact_match or column == mc.table)
100            else meta_select[column].str.contains(phrase, regex=regex)
101        )
102        meta_select = meta_select[pick_me]
103        if verbose:
104            print(f"In find_rows() have found {len(meta_select)}")
105
106    # search complete - check results - and return
107    meta_select.index = Index(meta_select[mc.id])
108    meta_select = meta_select[~meta_select.index.duplicated(keep="first")]
109
110    if verbose:
111        print(f"Final selection is {len(meta_select)} rows.")
112
113    elif len(meta_select) == 0:
114        print("Nothing selected?")
115
116    if validate_unique and len(meta_select) != 1:
117        raise ValueError("The selected meta data should only contain one row.")
118
119    return meta_select

Extract from the ABS meta data those rows that match the search_terms, by iteratively searching the meta data one search_term at a time.

Parameters

- meta : DataFrame — A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()).
- search_terms : dict[str, str] — A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns.
- exact_match : bool = False — Whether to match using == (exact) or .str.contains() (inexact).
- regex : bool = False — Whether to use regular expressions in the search.
- validate_unique : bool = False — Raise a ValueError if the search result is not unique.
- **kwargs : Any — Additional keyword arguments. The only keyword argument that is used is verbose.
- verbose : bool = False — Print additional information while searching, which can be useful when diagnosing problems with search terms.

Returns

DataFrame — Returns a pandas DataFrame of matching rows (subsetted from meta). Note: the index for the returned meta data will always comprise ABS series_ids. Duplicate indexes will be removed from the meta data (i.e. where the same ABS series appears in more than one table, this function will only report the first match).

Metacol

Because the meta data is a DataFrame, the columns can be referenced by either their full textual name, or by the short name defined in the metacol object. For example, if metacol is imported as mc, to refer to the Data Item Description column, the user can refer to it as mc.did.

Example

from readabs import metacol as mc  # alias for the ABS meta data column names
from readabs import read_abs_cat, search_abs_meta
cat_num = "6202.0"  # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
    "Unemployment rate": mc.did,  # the data item description
    "Persons": mc.did,
    "Seasonally Adjusted": mc.stype,
    "Percent": mc.unit,
    "6202001": mc.table,
}
rows = search_abs_meta(meta, search_terms, verbose=True)
print(rows)  # should have three rows : FT/PT/All Unemployment rates
def find_abs_id( meta: pandas.core.frame.DataFrame, search_terms: dict[str, str], **kwargs: Any) -> tuple[str, str, str]:
def find_abs_id(
    meta: DataFrame,
    search_terms: dict[str, str],
    **kwargs: Any,
) -> tuple[str, str, str]:  # table, series_id, units
    """Locate a single ABS series in the metadata and report where it lives.

    The search itself is delegated to search_abs_meta(); this function
    takes the first row of that result and returns three of its fields.

    Parameters
    ----------
    meta : DataFrame
        A pandas DataFrame of metadata from the ABS
        (via read_abs_cat() or read_abs_series()).
    search_terms : dict[str, str]
        A dictionary {search_phrase: meta_column_name, ...} of search terms.
        Note: the search terms must be unique, as a dictionary cannot hold the
        same search term to be applied to different columns.
    **kwargs : Any
        Additional keyword arguments. validate_unique is consumed here;
        everything else is forwarded to search_abs_meta().
    validate_unique : bool = True
        Raise a ValueError if the search result is not a single
        unique match. Note: the default is True for safety.

    Returns
    -------
    tuple[str, str, str]
        A tuple of the table, series_id and units taken from the first
        row that matches the search terms.

    Raises
    ------
    ValueError
        Raised by search_abs_meta() when validate_unique is True and the
        search does not select exactly one row.
    IndexError
        Raised by .iloc[0] when validate_unique is False and nothing matches.

    Metacol
    -------
    Because the meta data is a DataFrame, the columns can be referenced by either
    their full textual name, or by the short name defined in the metacol object.
    For example, if metacol is imported as mc, to refer to the
    `Data Item Description` column, the user can refer to it as mc.did.

    Example
    -------
    ```python
    from readabs import metacol as mc  # alias for the ABS meta data column names
    from readabs import read_abs_cat, find_abs_id, recalibrate
    cat_num = "6202.0"  # The ABS labour force survey
    data, meta = read_abs_cat(cat_num)
    search_terms = {
        "Employed total ;  Persons ;": mc.did,
        "Seasonally Adjusted": mc.stype,
        "6202001": mc.table,
    }
    table, series_id, units = find_abs_id(meta, search_terms)
    print(f"Table: {table} Series ID: {series_id} Units: {units}")
    recal_series, recal_units = recalibrate(data[table][series_id], units)
    ```"""

    # uniqueness checking is on unless the caller explicitly switches it off;
    # pop it so it is not passed twice when the rest of kwargs is forwarded
    must_be_unique = kwargs.pop("validate_unique", True)

    matches = search_abs_meta(
        meta, search_terms, validate_unique=must_be_unique, **kwargs
    )
    first_row = matches.iloc[0]

    return first_row[mc.table], first_row[mc.id], first_row[mc.unit]

Find a unique ABS series identifier in the ABS metadata.

Parameters

- meta : DataFrame — A pandas DataFrame of metadata from the ABS (via read_abs_cat() or read_abs_series()).
- search_terms : dict[str, str] — A dictionary {search_phrase: meta_column_name, ...} of search terms. Note: the search terms must be unique, as a dictionary cannot hold the same search term to be applied to different columns.
- **kwargs : Any — Additional keyword arguments. The only additional keyword argument that is used is validate_unique.
- validate_unique : bool = True — Raise a ValueError if the search result is not a single unique match. Note: the default is True for safety.

Returns

tuple[str, str, str] A tuple of the table, series_id and units for the unique series_id that matches the search terms.

Metacol

Because the meta data is a DataFrame, the columns can be referenced by either their full textual name, or by the short name defined in the metacol object. For example, if metacol is imported as mc, to refer to the Data Item Description column, the user can refer to it as mc.did.

Example

from readabs import metacol as mc  # alias for the ABS meta data column names
from readabs import read_abs_cat, find_abs_id, recalibrate
cat_num = "6202.0"  # The ABS labour force survey
data, meta = read_abs_cat(cat_num)
search_terms = {
    "Employed total ;  Persons ;": mc.did,
    "Seasonally Adjusted": mc.stype,
    "6202001": mc.table,
}
table, series_id, units = find_abs_id(meta, search_terms)
print(f"Table: {table} Series ID: {series_id} Units: {units}")
recal_series, recal_units = recalibrate(data[table][series_id], units)