# Source code for pycldf.ext.discovery

"""
This module provides a function (:func:`get_dataset`) implementing dataset discovery.

The scope of discoverable datasets can be extended by plugins, i.e. Python packages which
register additional :class:`DatasetResolver` subclasses using the entry point
`pycldf_dataset_resolver`.

`pycldf` itself comes with two resolvers:

- :class:`LocalResolver`
- :class:`GenericUrlResolver`

Additional resolvers:

- The `cldfzenodo <https://pypi.org/project/cldfzenodo>`_ package (>=1.0) provides a dataset
  resolver for DOI URLs pointing to the Zenodo archive.
"""
import typing
import pathlib
import warnings
import functools
import urllib.parse
from importlib.metadata import entry_points

from csvw.utils import is_url

from pycldf import Dataset, iter_datasets, sniff
from pycldf.util import url_without_fragment

__all__ = ['get_dataset', 'DatasetResolver']
# Entry point group name under which plugins register additional resolver classes.
EP = 'pycldf_dataset_resolver'

# Module-level cache of instantiated resolvers; populated lazily by get_resolvers().
_resolvers = []


class DatasetResolver:
    """
    Virtual base class for dataset resolvers.

    Concrete resolvers are callables mapping a locator string (and a download directory)
    to a dataset, a local path, or `None`.

    :ivar priority: A number between 0 and 10 determining the call order of registered \
    resolvers; resolvers with higher priority are called earlier, so a resolver claiming a \
    high priority should decide quickly whether it applies to a locator.
    """
    # Default priority; subclasses override to move themselves up or down the call order.
    priority = 5

    def __call__(self, loc: str, download_dir: pathlib.Path) \
            -> typing.Union[None, Dataset, pathlib.Path]:
        """
        :param loc: URL pointing to a place where datasets are archived.
        :param download_dir: A directory to which resolvers can download data.
        :return: `None` if the resolver does not apply to `loc`; a `Dataset` instance if a \
        candidate dataset was found; or a local path pointing to a metadata file or a \
        directory to be searched for metadata files.
        """
        raise NotImplementedError()  # pragma: no cover
class LocalResolver(DatasetResolver):
    """
    Resolves dataset locators specifying local file paths.
    """
    # Highest priority: checking for a local path is cheap and should happen first.
    priority = 100

    def __call__(self, loc: str, download_dir, base: typing.Optional[pathlib.Path]) \
            -> typing.Union[None, pathlib.Path]:
        """
        :param loc: Dataset locator; only non-URL locators are handled here.
        :param download_dir: Unused; present to conform to the resolver interface.
        :param base: Optional directory relative to which a relative `loc` is interpreted.
        :return: An existing local path, or `None` if the locator does not apply.
        """
        if isinstance(loc, str) and is_url(loc):
            return  # URLs are left to other resolvers.
        loc = pathlib.Path(loc)
        if loc.resolve() != loc and base:
            # A relative path, to be interpreted relative to base.
            loc = base.resolve().joinpath(loc)
        if loc.exists():
            return loc


class GenericUrlResolver(DatasetResolver):
    """
    URL resolver which works for generic URLs provided they point to a CLDF metadata file.
    """
    # Lowest priority: only used as a fallback when no specialized resolver applied.
    priority = -1

    def __call__(self, loc, download_dir):
        """
        :param loc: Dataset locator; only URLs are handled here.
        :param download_dir: Unused; present to conform to the resolver interface.
        :return: A `Dataset` read from `loc`, or `None` if `loc` is not a URL.
        """
        if is_url(loc):
            # NOTE(review): a bare `except: raise` is equivalent to no handling at all —
            # any failure to read metadata from the URL propagates to the caller.
            # (Removed an unreachable `pass` that followed the `raise`.)
            try:
                return Dataset.from_metadata(loc)
            except:  # noqa: E722  # pragma: no cover
                raise


class DatasetLocator(str):
    """
    A dataset locator: a URL string whose optional fragment encodes a property condition.
    """
    @functools.cached_property
    def parsed_url(self) -> urllib.parse.ParseResult:
        # Parsed once and cached, since match() and url_without_fragment both need it.
        return urllib.parse.urlparse(self)

    @property
    def url_without_fragment(self) -> str:
        # The fragment is a selection criterion for us, not part of the resource address.
        return url_without_fragment(self.parsed_url)

    def match(self, dataset: Dataset) -> bool:
        """
        :param dataset: Candidate dataset to check against this locator's fragment.
        :return: `True` if `dataset` satisfies the fragment condition (or there is none).
        """
        if self.parsed_url.fragment:
            # Fragment is either "key" (property must exist) or "key=value" (must be equal).
            key, _, value = self.parsed_url.fragment.partition('=')
            return dataset.properties.get(key) == value if value else key in dataset.properties
        return True


def get_resolvers() -> list:
    """
    Load all resolvers registered under the `pycldf_dataset_resolver` entry point.

    Resolver instances are cached at module level, so entry points are only loaded once.

    :return: List of resolver instances, sorted by descending priority.
    """
    if not _resolvers:
        eps = entry_points()
        # importlib.metadata changed its API: `select` exists from Python 3.10 on,
        # older versions expose a dict-like interface.
        for ep in set(eps.select(group=EP) if hasattr(eps, 'select') else eps.get(EP, [])):
            try:
                _resolvers.append(ep.load()())
            except ImportError:  # pragma: no cover
                # A broken plugin must not prevent the other resolvers from loading.
                # (Removed a redundant `continue` that was the last statement of the loop.)
                warnings.warn('ImportError loading entry point {0.name}'.format(ep))
    return sorted(_resolvers, key=lambda res: -res.priority)


def _get_dataset(locator: DatasetLocator, location: typing.Union[None, Dataset, pathlib.Path]):
    """
    Turn a resolver result into a matching `Dataset`, if possible.

    :param locator: The locator whose fragment condition candidate datasets must satisfy.
    :param location: A resolver result: a `Dataset`, a directory, or a metadata/data file.
    :return: The first matching `Dataset`, or `None`.
    """
    if isinstance(location, Dataset):
        if locator.match(location):
            return location
        return
    if location.is_dir():
        # Search the directory for any dataset satisfying the locator.
        for ds in iter_datasets(location):
            if locator.match(ds):
                return ds
    else:
        # A single file: sniff whether it is a metadata file or a plain data file.
        ds = Dataset.from_metadata(location) if sniff(location) else Dataset.from_data(location)
        if locator.match(ds):
            return ds
def get_dataset(locator: str,
                download_dir: pathlib.Path,
                base: typing.Optional[pathlib.Path] = None) -> Dataset:
    """
    Discover and return the dataset identified by `locator`.

    Registered resolvers are tried in order of descending priority; the first one that
    yields a matching dataset wins.

    :param locator: Dataset locator as specified in "Dataset discovery".
    :param download_dir: Directory to which to download remote data if necessary.
    :param base: Optional path relative to which local paths in `locator` must be resolved.
    :return: The discovered `Dataset`.
    :raises ValueError: If no resolver could turn `locator` into a matching dataset.
    """
    loc = DatasetLocator(locator)
    for resolver in get_resolvers():
        args = (loc.url_without_fragment, download_dir)
        if isinstance(resolver, LocalResolver):
            # Local paths may need to be resolved relative to another path (e.g. the
            # location of a CLDF markdown document), so this resolver gets `base` too.
            args += (base,)
        candidate = resolver(*args)
        if candidate:
            ds = _get_dataset(loc, candidate)
            if ds:
                return ds
    raise ValueError('Could not resolve dataset locator {}'.format(loc))