Skip to content

Date/Time Derivations

derive_vars_dt

admiralpy.derive_vars_dt.derive_vars_dt(dataset, new_vars_prefix, dtc, highest_imputation='n', date_imputation='first', flag_imputation='auto')

Derive a date variable (*DT) from a character date variable (--DTC).

Mirrors derive_vars_dt() from the admiral R package.

Parameters:

Name Type Description Default
dataset DataFrame

Input dataset. Must contain the column named by dtc.

required
new_vars_prefix str

Prefix for the output variables. For example, "AST" creates ASTDT (and ASTDTF when the imputation flag is requested).

required
dtc str

Name of the source character date variable.

required
highest_imputation str

Highest imputation level allowed. One of:

  • "n" – no imputation (default)
  • "D" – impute missing day
  • "M" – impute missing month and/or day
  • "Y" – impute missing year, month, and/or day
'n'
date_imputation str

How to impute missing date components. One of:

  • "first" – first day/month (default)
  • "last" – last day/month
  • "mid" – mid day/month (15th for day, 30 Jun for month)
  • "MM-DD" – a specific month-day string, e.g. "06-15"
'first'
flag_imputation str

Controls creation of the date imputation flag *DTF. One of:

  • "auto" – create flag when highest_imputation != "n" (default)
  • "date" – always create flag
  • "none" – never create flag
'auto'

Returns:

Type Description
DataFrame

Input dataset with {new_vars_prefix}DT (and optionally {new_vars_prefix}DTF) appended.

Raises:

Type Description
ValueError

If dtc is not present in dataset or if highest_imputation is not a recognised value.

Examples:

>>> import pandas as pd
>>> from admiralpy import derive_vars_dt
>>> df = pd.DataFrame({
...     "MHSTDTC": ["2019-07-18T15:25:40", "2019-07-18", "2019-02", "2019", ""]
... })
>>> derive_vars_dt(df, new_vars_prefix="AST", dtc="MHSTDTC",
...                highest_imputation="M", date_imputation="first")
Source code in admiralpy/derive_vars_dt.py
def derive_vars_dt(
    dataset: pd.DataFrame,
    new_vars_prefix: str,
    dtc: str,
    highest_imputation: str = "n",
    date_imputation: str = "first",
    flag_imputation: str = "auto",
) -> pd.DataFrame:
    """
    Derive a date variable (``*DT``) from a character date variable (``--DTC``).

    Mirrors ``derive_vars_dt()`` from the admiral R package.  The input
    dataset is left untouched; the derived columns are added to a copy.

    Parameters
    ----------
    dataset : pd.DataFrame
        Input dataset. Must contain the column named by ``dtc``.
    new_vars_prefix : str
        Prefix for the output variables.  For example, ``"AST"`` creates
        ``ASTDT`` (and ``ASTDTF`` when the imputation flag is requested).
    dtc : str
        Name of the source character date variable.
    highest_imputation : str, optional
        Highest imputation level allowed.  One of:

        * ``"n"`` – no imputation (default)
        * ``"D"`` – impute missing day
        * ``"M"`` – impute missing month and/or day
        * ``"Y"`` – impute missing year, month, and/or day
    date_imputation : str, optional
        How to impute missing date components.  One of:

        * ``"first"`` – first day/month (default)
        * ``"last"``  – last day/month
        * ``"mid"``   – mid day/month (15th for day, 30 Jun for month)
        * ``"MM-DD"`` – a specific month-day string, e.g. ``"06-15"``
    flag_imputation : str, optional
        Controls creation of the date imputation flag ``*DTF``.  One of:

        * ``"auto"``  – create flag when ``highest_imputation != "n"`` (default)
        * ``"date"``  – always create flag
        * ``"none"``  – never create flag

    Returns
    -------
    pd.DataFrame
        Copy of the input dataset with ``{new_vars_prefix}DT`` (dtype
        ``datetime64[ns]``, ``NaT`` where no date could be derived) and
        optionally ``{new_vars_prefix}DTF`` appended.

    Raises
    ------
    ValueError
        If ``dtc`` is not present in ``dataset``, or if ``highest_imputation``
        or ``flag_imputation`` is not a recognised value.

    Examples
    --------
    >>> import pandas as pd
    >>> from admiralpy import derive_vars_dt
    >>> df = pd.DataFrame({
    ...     "MHSTDTC": ["2019-07-18T15:25:40", "2019-07-18", "2019-02", "2019", ""]
    ... })
    >>> derive_vars_dt(df, new_vars_prefix="AST", dtc="MHSTDTC",
    ...                highest_imputation="M", date_imputation="first")
    """
    if dtc not in dataset.columns:
        raise ValueError(f"Column '{dtc}' not found in dataset.")

    valid_highest = ("n", "D", "M", "Y")
    if highest_imputation not in valid_highest:
        raise ValueError(
            f"highest_imputation must be one of {valid_highest}, "
            f"got '{highest_imputation}'."
        )

    # Reject unknown values up front: previously a typo such as "datetime"
    # was silently treated like "none" and the flag column went missing.
    valid_flag = ("auto", "date", "none")
    if flag_imputation not in valid_flag:
        raise ValueError(
            f"flag_imputation must be one of {valid_flag}, "
            f"got '{flag_imputation}'."
        )

    dt_var = f"{new_vars_prefix}DT"
    dtf_var = f"{new_vars_prefix}DTF"

    derive_flag = flag_imputation == "date" or (
        flag_imputation == "auto" and highest_imputation != "n"
    )

    dates = []
    flags = []

    for val in dataset[dtc]:
        parsed_date, imputation_flag = _parse_date_from_dtc(
            val, highest_imputation, date_imputation
        )
        # The helper signals "no date derivable" with None; store it as NaT
        # so the column can be held as datetime64[ns].
        dates.append(
            pd.NaT if parsed_date is None else pd.Timestamp(parsed_date)
        )
        flags.append(imputation_flag)

    # Work on a copy so the caller's DataFrame is never mutated.
    result = dataset.copy()
    result[dt_var] = pd.array(dates, dtype="datetime64[ns]")

    if derive_flag:
        result[dtf_var] = flags

    return result

derive_vars_dtm

admiralpy.derive_vars_dt.derive_vars_dtm(dataset, new_vars_prefix, dtc, highest_imputation='n', date_imputation='first', time_imputation='first', flag_imputation='auto')

Derive a datetime variable (*DTM) from a character datetime variable (--DTC).

Mirrors derive_vars_dtm() from the admiral R package.

Parameters:

Name Type Description Default
dataset DataFrame

Input dataset. Must contain the column named by dtc.

required
new_vars_prefix str

Prefix for the output variables. For example, "AST" creates ASTDTM (and ASTDTMF when the imputation flag is requested).

required
dtc str

Name of the source character datetime variable.

required
highest_imputation str

Highest imputation level allowed for the date part. One of:

  • "n" – no imputation (default)
  • "D" – impute missing day
  • "M" – impute missing month and/or day
  • "Y" – impute missing year, month, and/or day
'n'
date_imputation str

How to impute missing date components ("first", "last", "mid", or "MM-DD"). Default is "first".

'first'
time_imputation str

How to impute missing time components. One of:

  • "first" – 00:00:00 (default)
  • "last" – 23:59:59
'first'
flag_imputation str

Controls creation of the imputation flag *DTMF. One of:

  • "auto" – create flag when highest_imputation != "n" (default)
  • "datetime" – always create flag
  • "none" – never create flag
'auto'

Returns:

Type Description
DataFrame

Input dataset with {new_vars_prefix}DTM (and optionally {new_vars_prefix}DTMF) appended.

Raises:

Type Description
ValueError

If dtc is not present in dataset or if highest_imputation is not a recognised value.

Examples:

>>> import pandas as pd
>>> from admiralpy import derive_vars_dtm
>>> df = pd.DataFrame({
...     "EXSTDTC": ["2019-07-18T15:25:40", "2019-07-18T15:25", "2019-07-18", ""]
... })
>>> derive_vars_dtm(df, new_vars_prefix="AST", dtc="EXSTDTC",
...                 highest_imputation="D", time_imputation="first")
Source code in admiralpy/derive_vars_dt.py
def derive_vars_dtm(
    dataset: pd.DataFrame,
    new_vars_prefix: str,
    dtc: str,
    highest_imputation: str = "n",
    date_imputation: str = "first",
    time_imputation: str = "first",
    flag_imputation: str = "auto",
) -> pd.DataFrame:
    """
    Derive a datetime variable (``*DTM``) from a character datetime variable (``--DTC``).

    Mirrors ``derive_vars_dtm()`` from the admiral R package.  The input
    dataset is left untouched; the derived columns are added to a copy.

    Parameters
    ----------
    dataset : pd.DataFrame
        Input dataset. Must contain the column named by ``dtc``.
    new_vars_prefix : str
        Prefix for the output variables.  For example, ``"AST"`` creates
        ``ASTDTM`` (and ``ASTDTMF`` when the imputation flag is requested).
    dtc : str
        Name of the source character datetime variable.
    highest_imputation : str, optional
        Highest imputation level allowed for the date part.  One of:

        * ``"n"`` – no imputation (default)
        * ``"D"`` – impute missing day
        * ``"M"`` – impute missing month and/or day
        * ``"Y"`` – impute missing year, month, and/or day
    date_imputation : str, optional
        How to impute missing date components (``"first"``, ``"last"``,
        ``"mid"``, or ``"MM-DD"``).  Default is ``"first"``.
    time_imputation : str, optional
        How to impute missing time components.  One of:

        * ``"first"`` – ``00:00:00`` (default)
        * ``"last"``  – ``23:59:59``
    flag_imputation : str, optional
        Controls creation of the imputation flag ``*DTMF``.  One of:

        * ``"auto"``  – create flag when ``highest_imputation != "n"`` (default)
        * ``"datetime"`` – always create flag
        * ``"none"``  – never create flag

    Returns
    -------
    pd.DataFrame
        Copy of the input dataset with ``{new_vars_prefix}DTM`` (dtype
        ``datetime64[ns]``, ``NaT`` where no date could be derived) and
        optionally ``{new_vars_prefix}DTMF`` appended.  For each row the flag
        is the date imputation flag when one is set, otherwise the time
        imputation flag; rows without a derivable date get ``None``.

    Raises
    ------
    ValueError
        If ``dtc`` is not present in ``dataset``, or if ``highest_imputation``
        or ``flag_imputation`` is not a recognised value.

    Examples
    --------
    >>> import pandas as pd
    >>> from admiralpy import derive_vars_dtm
    >>> df = pd.DataFrame({
    ...     "EXSTDTC": ["2019-07-18T15:25:40", "2019-07-18T15:25", "2019-07-18", ""]
    ... })
    >>> derive_vars_dtm(df, new_vars_prefix="AST", dtc="EXSTDTC",
    ...                 highest_imputation="D", time_imputation="first")
    """
    if dtc not in dataset.columns:
        raise ValueError(f"Column '{dtc}' not found in dataset.")

    valid_highest = ("n", "D", "M", "Y")
    if highest_imputation not in valid_highest:
        raise ValueError(
            f"highest_imputation must be one of {valid_highest}, "
            f"got '{highest_imputation}'."
        )

    # Reject unknown values up front: previously anything unrecognised
    # (e.g. "date" or "both") was silently treated like "none" and the flag
    # column went missing.
    valid_flag = ("auto", "datetime", "none")
    if flag_imputation not in valid_flag:
        raise ValueError(
            f"flag_imputation must be one of {valid_flag}, "
            f"got '{flag_imputation}'."
        )

    dtm_var = f"{new_vars_prefix}DTM"
    dtmf_var = f"{new_vars_prefix}DTMF"

    derive_flag = flag_imputation == "datetime" or (
        flag_imputation == "auto" and highest_imputation != "n"
    )

    datetimes = []
    flags = []

    for val in dataset[dtc]:
        parsed_date, date_flag = _parse_date_from_dtc(
            val, highest_imputation, date_imputation
        )
        if parsed_date is None:
            # No usable date part: the time part is not consulted at all.
            datetimes.append(pd.NaT)
            flags.append(None)
            continue

        h, mi, s, time_flag = _parse_time_from_dtc(val, time_imputation)

        datetimes.append(
            pd.Timestamp(
                year=parsed_date.year,
                month=parsed_date.month,
                day=parsed_date.day,
                hour=h,
                minute=mi,
                second=s,
            )
        )
        # Combine flags: a date imputation flag takes precedence; the time
        # flag is reported only when no date flag is set.
        flags.append(date_flag or time_flag)

    # Work on a copy so the caller's DataFrame is never mutated.
    result = dataset.copy()
    result[dtm_var] = pd.array(datetimes, dtype="datetime64[ns]")

    if derive_flag:
        result[dtmf_var] = flags

    return result