# Source code for audb.core.api

import os
import tempfile
import typing

import pandas as pd

import audbackend
import audeer
import audformat

from audb.core import define
from audb.core import utils
from audb.core.cache import database_cache_root
from audb.core.cache import default_cache_root
from audb.core.config import config
from audb.core.dependencies import Dependencies
from audb.core.flavor import Flavor
from audb.core.lock import FolderLock
from audb.core.repository import Repository


def available(
    *,
    only_latest: bool = False,
) -> pd.DataFrame:
    r"""List all databases that are available to the user.

    Every repository in :attr:`config.REPOSITORIES` is scanned.
    A repository that raises :class:`audbackend.BackendError`
    while being scanned is skipped.

    Args:
        only_latest: include only latest version of database

    Returns:
        table with database name as index,
        and backend, host, repository, version as columns.
        Rows are ordered by database name
        and, if ``only_latest`` is ``False``,
        by ascending version within each database

    Examples:
        >>> df = audb.available(only_latest=True)
        >>> df.loc[["air", "emodb"]]
                   backend                                    host   repository version
        name
        air    artifactory  https://audeering.jfrog.io/artifactory  data-public   1.4.2
        emodb  artifactory  https://audeering.jfrog.io/artifactory  data-public   1.4.1

    """  # noqa: E501
    databases = []
    for repository in config.REPOSITORIES:
        try:
            backend = utils.access_backend(repository)
            if isinstance(backend, audbackend.Artifactory):
                # avoid backend.ls('/')
                # which is very slow on Artifactory
                # see https://github.com/audeering/audbackend/issues/132
                for p in backend._repo.path:
                    name = p.name
                    for version in [str(x).split("/")[-1] for x in p / "db"]:
                        databases.append(
                            [
                                name,
                                repository.backend,
                                repository.host,
                                repository.name,
                                version,
                            ]
                        )
            else:
                for path, version in backend.ls("/"):
                    if path.endswith(define.HEADER_FILE):
                        name = path.split("/")[1]
                        databases.append(
                            [
                                name,
                                repository.backend,
                                repository.host,
                                repository.name,
                                version,
                            ]
                        )
        except audbackend.BackendError:
            continue

    df = pd.DataFrame.from_records(
        databases,
        columns=["name", "backend", "host", "repository", "version"],
    )
    if only_latest:
        # Pick latest version for every database, see
        # https://stackoverflow.com/a/53842408
        latest = df.groupby("name")["version"].transform(
            lambda x: audeer.sort_versions(x)[-1]
        )
        df = df[df["version"] == latest]
    else:
        # Sort by version.
        # The ``key`` callable of ``sort_values()``
        # has to return one sort key per row,
        # so every version string is mapped to its rank
        # in the order given by ``audeer.sort_versions()``
        # instead of passing ``audeer.sort_versions`` directly
        # (which returns the sorted values, not per-row keys)
        ordered = audeer.sort_versions(df["version"].unique().tolist())
        rank = {version: position for position, version in enumerate(ordered)}
        df = df.sort_values(
            by=["version"],
            key=lambda column: column.map(rank),
            kind="mergesort",
        )
    # A stable sort is required here,
    # otherwise the version order established above
    # is not guaranteed to survive the sort by name
    df = df.sort_values(by=["name"], kind="mergesort")
    return df.set_index("name")


def cached(
    cache_root: str = None,
    *,
    name: str = None,
    shared: bool = False,
) -> pd.DataFrame:
    r"""List databases and flavors that are stored in the cache.

    The cache folder is walked three levels deep
    (database name, version, flavor ID).
    A flavor folder is listed
    only if it contains the database header file.

    Args:
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used
        name: name of database.
            If provided,
            it will show only cached versions of that database
        shared: include databases from shared cache

    Returns:
        cached databases
        with cache path as index,
        and name,
        flavor_id,
        version,
        complete,
        bit_depth,
        channels,
        format,
        mixdown,
        sampling_rate
        as columns

    Examples:
        >>> db = audb.load(
        ...     "emodb",
        ...     version="1.4.1",
        ...     only_metadata=True,
        ...     full_path=False,
        ...     verbose=False,
        ... )
        >>> df = cached()
        >>> print(df.iloc[0].to_string())
        name                emodb
        flavor_id        d3b62a9b
        version             1.4.1
        complete            False
        bit_depth            None
        channels             None
        format               None
        mixdown             False
        sampling_rate        None

    """  # noqa: E501
    cache_root = audeer.path(cache_root or default_cache_root(shared=shared))

    df = pd.DataFrame(
        [],
        columns=[
            "name",
            "flavor_id",
            "version",
            "complete",
            "bit_depth",
            "channels",
            "format",
            "mixdown",
            "sampling_rate",
        ],
    )

    # Nothing cached yet, return the empty table
    if not os.path.exists(cache_root):
        return df

    for database_path in audeer.list_dir_names(cache_root):
        database = os.path.basename(database_path)

        # Restrict to the requested database, if one was given
        if name is not None and database != name:
            continue

        for version_path in audeer.list_dir_names(database_path):
            version = os.path.basename(version_path)

            # Temporary folders end with "~" (e.g. 1.0.0~)
            if version.endswith("~"):  # pragma: no cover
                continue

            flavor_id_paths = audeer.list_dir_names(version_path)

            # A version folder that holds neither
            # the dependency file nor its cached variant
            # stems from an old audb cache (audb<1.0.0)
            # and is ignored completely
            version_files = audeer.list_file_names(version_path)
            deps_file = os.path.join(version_path, define.DEPENDENCIES_FILE)
            cached_deps_file = os.path.join(
                version_path,
                define.CACHED_DEPENDENCIES_FILE,
            )
            if not (deps_file in version_files or cached_deps_file in version_files):
                continue  # pragma: no cover

            for flavor_id_path in flavor_id_paths:
                basenames = [
                    os.path.basename(file)
                    for file in audeer.list_file_names(flavor_id_path)
                ]
                # Only folders with a database header are valid flavors
                if define.HEADER_FILE not in basenames:
                    continue

                db = audformat.Database.load(
                    flavor_id_path,
                    load_data=False,
                )
                audb_meta = db.meta["audb"]
                flavor = audb_meta["flavor"]
                df.loc[flavor_id_path] = [
                    database,
                    os.path.basename(flavor_id_path),
                    version,
                    audb_meta["complete"],
                    flavor["bit_depth"],
                    flavor["channels"],
                    flavor["format"],
                    flavor["mixdown"],
                    flavor["sampling_rate"],
                ]

    # Replace NaN with None
    return df.where(df.notnull(), None)


def dependencies(
    name: str,
    *,
    version: str = None,
    cache_root: str = None,
    verbose: bool = False,
) -> Dependencies:
    r"""Load the dependencies of a database.

    The dependencies are read from the cached dependency file
    in the cache folder of the database.
    If that file cannot be loaded,
    the database archive is fetched from the backend,
    the dependency file it contains is loaded,
    and the result is stored as cached dependency file.
    The cache folder of the database is locked during this process.

    Args:
        name: name of database
        version: version of database, latest if ``None``
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used
        verbose: show debug messages

    Returns:
        dependency object

    Examples:
        >>> deps = dependencies("emodb", version="1.4.1")
        >>> deps.version("db.emotion.csv")
        '1.1.0'

    """
    if version is None:
        version = latest_version(name)

    db_root = database_cache_root(name, version, cache_root=cache_root)
    cached_deps_file = os.path.join(db_root, define.CACHED_DEPENDENCIES_FILE)
    deps = Dependencies()

    with FolderLock(db_root):
        try:
            deps.load(cached_deps_file)
        except (AttributeError, FileNotFoundError, ValueError, EOFError):
            # The cached file is missing or could not be read,
            # fall back to the dependency file published on the backend
            # and refresh the cache with it
            backend = utils.lookup_backend(name, version)
            remote_archive = backend.join("/", name, define.DB + ".zip")
            with tempfile.TemporaryDirectory() as tmp_dir:
                backend.get_archive(
                    remote_archive,
                    tmp_dir,
                    version,
                    verbose=verbose,
                )
                deps.load(os.path.join(tmp_dir, define.DEPENDENCIES_FILE))
                deps.save(cached_deps_file)

    return deps


def exists(
    name: str,
    *,
    version: str = None,
    bit_depth: int = None,
    channels: typing.Union[int, typing.Sequence[int]] = None,
    format: str = None,
    mixdown: bool = False,
    sampling_rate: int = None,
    cache_root: str = None,
) -> bool:
    r"""Check if specified database flavor exists in local cache folder.

    Only the one flavor described by the arguments is looked up,
    other flavors of the same database are not considered.
    Note, that using only the name, e.g. ``audb.exists('emodb')``
    is also a single flavor.

    If ``cache_root`` is ``None``,
    the shared cache is checked first
    and the user cache afterwards.

    To list all available flavors of a particular database, use:

    .. code-block::

        audb.cached(name='emodb')

    Args:
        name: name of database
        version: version string, latest if ``None``
        bit_depth: bit depth, one of ``16``, ``24``, ``32``
        channels: channel selection, see :func:`audresample.remix`
        format: file format, one of ``'flac'``, ``'wav'``
        mixdown: apply mono mix-down
        sampling_rate: sampling rate in Hz, one of
            ``8000``, ``16000``, ``22500``, ``44100``, ``48000``
        cache_root: cache folder where databases are stored.
            If not set :meth:`audb.default_cache_root` is used

    Returns:
        ``True`` if database flavor exists

    Examples:
        >>> db = audb.load(
        ...     "emodb",
        ...     version="1.4.1",
        ...     only_metadata=True,
        ...     verbose=False,
        ... )
        >>> audb.exists("emodb", version="1.4.1")
        True
        >>> audb.exists("emodb", version="1.4.1", format="wav")
        False

    """
    if version is None:
        version = latest_version(name)

    relative_flavor_path = flavor_path(
        name,
        version,
        bit_depth=bit_depth,
        channels=channels,
        format=format,
        mixdown=mixdown,
        sampling_rate=sampling_rate,
    )

    if cache_root is None:
        candidate_roots = [
            default_cache_root(True),  # check shared cache first
            default_cache_root(False),
        ]
    else:
        candidate_roots = [audeer.path(cache_root, follow_symlink=True)]

    return any(
        os.path.exists(os.path.join(root, relative_flavor_path))
        for root in candidate_roots
    )


def flavor_path(
    name: str,
    version: str,
    *,
    bit_depth: int = None,
    channels: typing.Union[int, typing.Sequence[int]] = None,
    format: str = None,
    mixdown: bool = False,
    sampling_rate: int = None,
) -> str:
    r"""Flavor cache path.

    Builds the location
    at which :func:`audb.load` places a given flavor of a database
    inside the cache folder.
    It has the form:

     ``<name>/<version>/<flavor-id>/``

    The result is a relative path.
    Join it with the cache root to obtain the absolute path:

    .. code-block::

        os.path.join(
            audb.default_cache_root(...),
            audb.flavor_path(...),
        )

    Args:
        name: name of database
        version: version string
        bit_depth: bit depth, one of ``16``, ``24``, ``32``
        channels: channel selection, see :func:`audresample.remix`
        format: file format, one of ``'flac'``, ``'wav'``
        mixdown: apply mono mix-down
        sampling_rate: sampling rate in Hz, one of
            ``8000``, ``16000``, ``22500``, ``44100``, ``48000``

    Returns:
        flavor path relative to cache folder

    Raises:
        ValueError: if a non-supported ``bit_depth``,
            ``format``,
            or ``sampling_rate``
            is requested

    Examples:
        >>> flavor_path("emodb", version="1.4.1").split(os.path.sep)
        ['emodb', '1.4.1', 'd3b62a9b']

    """
    return Flavor(
        bit_depth=bit_depth,
        channels=channels,
        format=format,
        mixdown=mixdown,
        sampling_rate=sampling_rate,
    ).path(name, version)


def latest_version(
    name: str,
) -> str:
    r"""Latest version of database.

    Returns the last entry
    of the list provided by :func:`audb.versions`.

    Args:
        name: name of database

    Returns:
        version string

    Raises:
        RuntimeError: if no version exists for the requested database

    Examples:
        >>> latest_version("emodb")
        '1.4.1'

    """
    vs = versions(name)
    if not vs:
        # An empty list means the database was not found
        # in any of the repositories
        raise RuntimeError(
            f"Cannot find a version for database '{name}'.",
        )
    return vs[-1]


def remove_media(
    name: str,
    files: typing.Union[str, typing.Sequence[str]],
    *,
    verbose: bool = False,
):
    r"""Remove media from all versions.

    Be careful,
    this removes files from all published versions
    on all backends.
    Those files cannot be restored afterwards.

    For every version returned by :func:`audb.versions`
    the dependency file is downloaded from the backend.
    Each requested file that is listed as media in the dependencies
    is deleted from its media archive
    (if that archive exists for the version),
    the archive is published again with the remaining files,
    and the file is removed from the dependencies.
    If at least one file was removed,
    the updated dependency file is published as well.

    Args:
        name: name of database
        files: list of files that should be removed
        verbose: show debug messages

    """
    # Accept a single file name as well as a sequence of file names
    if isinstance(files, str):
        files = [files]

    for version in versions(name):
        backend = utils.lookup_backend(name, version)

        # All downloads of this version go into one temporary folder,
        # which is deleted when the version has been processed
        with tempfile.TemporaryDirectory() as db_root:
            # download dependencies
            archive = backend.join("/", name, define.DB + ".zip")
            # NOTE(review): the first entry returned by get_archive()
            # is taken as the dependency file,
            # presumably a path relative to db_root -- confirm
            deps_path = backend.get_archive(
                archive,
                db_root,
                version,
                verbose=verbose,
            )[0]
            deps_path = os.path.join(db_root, deps_path)
            deps = Dependencies()
            deps.load(deps_path)
            # Set to True as soon as the dependencies were changed
            upload = False

            for file in audeer.progress_bar(
                files,
                disable=not verbose,
                desc=f"Remove media from v{version}",
            ):
                # Files that are not media of this version are ignored
                if file in deps.media:
                    archive = deps.archive(file)

                    # if archive exists in this version,
                    # remove file from it and re-publish
                    remote_archive = backend.join(
                        "/",
                        name,
                        define.DEPEND_TYPE_NAMES[define.DependType.MEDIA],
                        archive + ".zip",
                    )
                    if backend.exists(remote_archive, version):
                        files_in_archive = backend.get_archive(
                            remote_archive,
                            db_root,
                            version,
                        )

                        # On Windows convert the path separator to "/"
                        # before comparing with the requested file
                        if os.name == "nt":  # pragma: no cover
                            files_in_archive = [
                                file.replace(os.path.sep, "/")
                                for file in files_in_archive
                            ]

                        # skip if file was already deleted
                        if file in files_in_archive:
                            # Delete the extracted file
                            # and publish the archive again
                            # with the remaining files only
                            os.remove(os.path.join(db_root, file))
                            files_in_archive.remove(file)
                            backend.put_archive(
                                db_root,
                                remote_archive,
                                version,
                                files=files_in_archive,
                            )

                    # mark file as removed
                    # (also done when the archive is not published
                    # for this version or does no longer contain the file)
                    deps._remove(file)
                    upload = True

            # upload dependencies
            if upload:
                deps.save(deps_path)
                remote_archive = backend.join("/", name, define.DB + ".zip")
                backend.put_archive(
                    db_root,
                    remote_archive,
                    version,
                    files=define.DEPENDENCIES_FILE,
                    verbose=verbose,
                )


def repository(
    name: str,
    version: str,
) -> Repository:
    r"""Return repository that stores the requested database.

    If the database is stored in several repositories,
    only the first one is returned.
    The order of the repositories
    to look for the database
    is given by :attr:`config.REPOSITORIES`.

    Args:
        name: database name
        version: version string

    Returns:
        repository that contains the database

    Raises:
        RuntimeError: if database or version is not found

    Examples:
        >>> audb.repository("emodb", "1.4.1")
        Repository('data-public', 'https://audeering.jfrog.io/artifactory', 'artifactory')

    """  # noqa: E501
    # A database without any published version is reported as missing
    if not versions(name):
        raise RuntimeError(f"Cannot find database '{name}'.")
    # The repository is the first entry of the lookup result
    return utils._lookup(name, version)[0]


def versions(
    name: str,
) -> typing.List[str]:
    r"""Available versions of database.

    The versions are collected
    from all repositories in :attr:`config.REPOSITORIES`
    and returned in the order given by :func:`audeer.sort_versions`.

    Args:
        name: name of database

    Returns:
        list of versions

    Examples:
        >>> versions("emodb")
        ['1.1.0', '1.1.1', '1.2.0', '1.3.0', '1.4.0', '1.4.1']

    """
    vs = []
    for repository in config.REPOSITORIES:
        backend = utils.access_backend(repository)
        if isinstance(backend, audbackend.Artifactory):
            # Avoid using ls() on Artifactory
            # see https://github.com/devopshq/artifactory/issues/423
            folder = backend.join("/", name, "db")
            path = audbackend.core.artifactory._artifactory_path(
                backend._expand(folder),
                backend._username,
                backend._api_key,
            )
            if path.exists():
                for p in path:
                    # The last path component is used as version,
                    # and it only counts
                    # if the matching header file exists
                    version = p.parts[-1]
                    header = p.joinpath(f"db-{version}.yaml")
                    if header.exists():
                        vs.append(version)
        else:
            header = backend.join("/", name, "db.yaml")
            vs.extend(backend.versions(header, suppress_backend_errors=True))
    return audeer.sort_versions(vs)