# Source code for audformat.core.column

from __future__ import annotations

import itertools
import typing
import warnings

import numpy as np
import pandas as pd

from audformat.core import define
from audformat.core.common import HeaderBase
from audformat.core.common import to_audformat_dtype
from audformat.core.common import to_pandas_dtype
from audformat.core.index import index_type
from audformat.core.index import is_scalar
from audformat.core.index import to_array
from audformat.core.rater import Rater
from audformat.core.typing import Values


if typing.TYPE_CHECKING:
    # Fix to make mypy work without circular imports,
    # compare
    # https://adamj.eu/tech/2021/05/13/python-type-hints-how-to-fix-circular-imports/
    from audformat.core.scheme import Scheme  # pragma: nocover


def assert_values(
    values: Values,
    scheme: Scheme,
):
    r"""Raise error if values do not match scheme.

    Values are only inspected
    if the scheme defines labels, a minimum or a maximum.
    Missing entries (``None`` or anything :func:`pandas.isna` reports)
    are ignored.

    Args:
        values: scalar, sequence or series of values to check
        scheme: scheme the values have to conform to

    Raises:
        ValueError: if a value is not contained in the scheme labels,
            or if the scheme is numeric and a value
            is smaller than its minimum or larger than its maximum

    """
    error_msg = ""

    if (
        scheme.labels is not None
        or scheme.minimum is not None
        or scheme.maximum is not None
    ):
        # Normalize input to a plain list without missing entries
        if is_scalar(values):
            values = [values]
        elif isinstance(values, pd.Series):
            values = values.values
        values = [v for v in values if v is not None and not pd.isna(v)]
        if not values:
            # Nothing left to validate
            return

    if scheme.labels is not None:
        bad_values = set(values) - set(scheme.labels_as_list)
        if bad_values:
            # Convert only `max_display` entries from set to list
            max_display = 10
            shown = list(itertools.islice(bad_values, max_display))
            try:
                show_bad_values = sorted(shown)
            except TypeError:
                # Offending values of mixed types (e.g. str and int)
                # cannot be compared with each other.
                # Sort by their string representation instead,
                # so that the ValueError below is still raised
                show_bad_values = sorted(shown, key=str)
            error_msg = str(show_bad_values)[1:-1]
            if len(bad_values) > max_display:
                error_msg += ", ..."
            error_msg += "\n"

    if scheme.is_numeric:
        if scheme.minimum is not None:
            min_value = min(values)
            if float(min_value) < scheme.minimum:
                error_msg += f"minimum {min_value} smaller than scheme minimum\n"
        if scheme.maximum is not None:
            max_value = max(values)
            if float(max_value) > scheme.maximum:
                error_msg += f"maximum {max_value} larger than scheme maximum\n"

    if error_msg:
        raise ValueError(
            f"Some value(s) do not match scheme\n{scheme}\n"
            f"with scheme ID '{scheme._id}':\n"
            f"{error_msg}"
        )


class Column(HeaderBase):
    r"""Table column.

    Represents a table column (see :class:`audformat.Table`) and
    optionally links it to a scheme (see :class:`audformat.Scheme`) and
    a rater (see :class:`audformat.Rater`).

    Args:
        scheme_id: scheme identifier (must exist)
        rater_id: rater identifier (must exist)
        description: column description
        meta: additional meta fields

    Examples:
        >>> Column(scheme_id="emotion")
        {scheme_id: emotion}

    """

    def __init__(
        self,
        *,
        scheme_id: str | None = None,
        rater_id: str | None = None,
        description: str | None = None,
        meta: dict | None = None,
    ):
        super().__init__(description=description, meta=meta)

        self.scheme_id = scheme_id
        r"""Scheme identifier"""
        self.rater_id = rater_id
        r"""Rater identifier"""
        # Table the column belongs to and its ID inside that table.
        # Both stay ``None`` until the column is assigned to a table
        self._table = None
        self._id = None

    @property
    def rater(self) -> Rater | None:
        r"""Rater object.

        Returns:
            rater object or ``None`` if not available

        """
        if (
            (self.rater_id is not None)
            and (self.table is not None)
            and (self.table.db is not None)
        ):
            return self.table.db.raters[self.rater_id]
        return None

    @property
    def scheme(self) -> Scheme | None:
        r"""Scheme object.

        Returns:
            scheme object or ``None`` if not available

        """
        if (
            (self.scheme_id is not None)
            and (self.table is not None)
            and (self.table.db is not None)
        ):
            return self.table.db.schemes[self.scheme_id]
        return None

    @property
    def table(self):
        r"""Table object.

        Returns:
            table object or ``None`` if not assigned yet

        """
        return self._table

    def get(
        self,
        index: pd.Index | None = None,
        *,
        map: str | None = None,
        copy: bool = True,
        as_segmented: bool = False,
        allow_nat: bool = True,
        root: str | None = None,
        num_workers: int | None = 1,
        verbose: bool = False,
    ) -> pd.Series:
        r"""Get labels.

        By default, all labels of the column are returned,
        use ``index`` to get a subset.

        Examples are provided with the
        :ref:`table specifications <data-tables:Tables>`.

        Args:
            index: index conform to
                :ref:`table specifications <data-tables:Tables>`
            map: :ref:`map scheme or scheme field to column values
                <map-scheme-labels>`.
                For example if your column holds speaker IDs and is
                assigned to a scheme that contains a dict mapping
                speaker IDs to age entries, ``map='age'``
                will replace the ID values with the age of the speaker
            copy: return a copy of the labels
            as_segmented: if set to ``True``
                and column has a filewise index,
                the index of the returned column
                will be converted to a segmented index.
                ``start`` will be set to ``0`` and
                ``end`` to ``NaT`` or to the file duration
                if ``allow_nat`` is set to ``False``.
                If column belongs to a miscellaneous table,
                this and the following arguments have no effect
            allow_nat: if set to ``False``,
                ``end=NaT`` is replaced with file duration
            root: root directory under which the files are stored.
                Provide if file names are relative and
                database was not saved or loaded from disk.
                If ``None`` :attr:`audformat.Database.root` is used.
                Only relevant if ``allow_nat`` is set to ``False``
            num_workers: number of parallel jobs.
                If ``None`` will be set to the number of processors
                on the machine multiplied by 5
            verbose: show progress bar

        Returns:
            labels

        Raises:
            FileNotFoundError: if file is not found
            RuntimeError: if column is not assigned to a table
            ValueError: if trying to map without a scheme,
                or from a scheme that has no labels,
                or from a scheme that has only a list of labels,
                or to a non-existing field

        """
        if self._table is None:
            raise RuntimeError("Column is not assigned to a table.")

        if hasattr(self._table, "type"):
            # Table has a type (filewise / segmented),
            # so the segment related arguments are forwarded
            result = self._table.get(
                index,
                copy=False,
                as_segmented=as_segmented,
                allow_nat=allow_nat,
                root=root,
                num_workers=num_workers,
                verbose=verbose,
            )
        else:
            # Table without a type,
            # only index selection is forwarded
            result = self._table.get(
                index,
                copy=False,
            )
        result = result[self._id]

        if map is not None:
            copy = False  # to avoid another copy

            if self.scheme_id is None:
                raise ValueError(f"Column '{self._id}' is not assigned to a scheme.")

            scheme = self._table._db.schemes[self.scheme_id]
            labels = scheme._labels_to_dict()

            if labels is None:
                raise ValueError(f"Scheme '{self.scheme_id}' has no labels.")

            if not any(labels.values()):
                raise ValueError(
                    f"Scheme '{self.scheme_id}' provides no mapping for its labels."
                )

            # Check that at least one key is available for map
            # if labels are stored as dictionary
            keys = sorted(
                {
                    key
                    for value in labels.values()
                    if isinstance(value, dict)
                    for key in value
                }
            )
            if len(keys) > 0 and map not in keys:
                raise ValueError(
                    f"Cannot map "
                    f"'{self._id}' "
                    f"to "
                    f"'{map}'. "
                    f"Expected one of "
                    f"{keys}."
                )

            # Labels holding a dict are replaced by the requested field
            # (NaN if the field is missing for a label),
            # all other label values are used as they are
            mapping = {}
            for key, value in labels.items():
                if isinstance(value, dict):
                    value = value.get(map, np.nan)
                mapping[key] = value

            result = result.map(mapping)
            result.name = map

            if (
                scheme.uses_table
                and self._table._db[scheme.labels][map].scheme is not None
                #                       ^           ^
                #                   misc table   column
            ):
                # Infer dtype from misc table
                misc_table_id = scheme.labels
                column = self._table._db[misc_table_id][map]
                dtype = column.scheme.to_pandas_dtype()
            else:
                # Infer dtype from actual labels
                dtype = pd.api.types.infer_dtype(list(result.values))
                dtype = to_pandas_dtype(to_audformat_dtype(dtype))

            result = result.astype(dtype)

        return result.copy() if copy else result

    def set(
        self,
        values: Values,
        *,
        index: pd.Index | None = None,
    ):
        r"""Set labels.

        By default, all labels of the column are replaced,
        use ``index`` to set a subset.
        If column is assigned to a :class:`Scheme`
        values will be automatically converted
        to match its dtype.

        Examples are provided with the
        :ref:`table specifications <data-tables:Tables>`.

        Args:
            values: list of values
            index: index conform to
                :ref:`table specifications <data-tables:Tables>`

        Raises:
            RuntimeError: if column is not assigned to a table
            ValueError: if trying to set values of a filewise column
                using a segmented index
            ValueError: if values cannot be converted
                to match the schemes dtype

        """
        if self._table is None:
            raise RuntimeError("Column is not assigned to a table.")

        column_id = self._id
        df = self._table.df

        if index is None:
            index = df.index

        if self.scheme_id is not None:
            # Validate against the scheme and use its dtype
            scheme = self._table._db.schemes[self.scheme_id]
            assert_values(values, scheme)
            dtype = scheme.to_pandas_dtype()
        else:
            # Without a scheme keep the current dtype of the column
            dtype = df[column_id].dtype

        if hasattr(self._table, "type") and self._table.type != index_type(index):
            # special case where a filewise / segmented table
            # is requested with an index of the other type
            if not self._table.is_filewise:
                # Select all rows of the table
                # that belong to the requested files
                # and set the values on that index instead
                files = index.get_level_values(define.IndexField.FILE)
                index = df.loc[files].index
                return self.set(values, index=index)
            else:
                raise ValueError(
                    "Cannot set values of a filewise column using a segmented index."
                )
        else:
            if is_scalar(values):
                # Repeat scalar for every entry of the index
                values = [values] * len(index)
            values = to_array(values)
            if dtype == "datetime64[ns]":
                # Ensure all date values are timezone unaware,
                # see https://github.com/audeering/audformat/issues/364
                values = [
                    pd.to_datetime(value).tz_localize(None)
                    if value is not None
                    else value
                    for value in values
                ]
            with warnings.catch_warnings():
                # Avoid FutureWarning and DeprecationWarning
                # for pandas 1.5.0 to 1.5.3
                # for setting values in place
                # as introduced at
                # https://pandas.pydata.org/docs/dev/whatsnew/v1.5.0.html#inplace-operation-when-setting-values-with-loc-and-iloc
                # For pandas >=2.0.0 values are always set in place
                for warning in [FutureWarning, DeprecationWarning]:
                    warnings.simplefilter(action="ignore", category=warning)
                df.loc[index, column_id] = pd.Series(
                    values,
                    index=index,
                    dtype=dtype,
                )

    def __eq__(
        self,
        other: Column,
    ) -> bool:
        r"""Compare if column equals another column.

        Two columns are equal if their dumped headers match
        and either both are not assigned to a table,
        or both are assigned to a table and hold equal data.

        Args:
            other: column to compare with

        Returns:
            ``True`` if both columns are equal,
            ``NotImplemented`` if ``other`` is not a column

        """
        if not isinstance(other, Column):
            # Let Python fall back to the reflected comparison
            # instead of failing with an AttributeError
            return NotImplemented
        if self.dump() != other.dump():
            return False
        if self._table is not None and other._table is not None:
            return self._table.df[self._id].equals(other._table.df[other._id])
        # Equal only if neither column is assigned to a table
        return self._table is None and other._table is None