Source code for pycldf.ext.discovery

"""
This module provides a function (:func:`get_dataset`) implementing dataset discovery.

The scope of discoverable datasets can be extended by plugins, i.e. Python packages which
register additional :class:`DatasetResolver` subclasses using the entry point
`pycldf_dataset_resolver`.

`pycldf` itself comes with two resolvers

- :class:`LocalResolver`
- :class:`GenericUrlResolver`

Additional resolvers:

- The `cldfzenodo <https://pypi.org/project/cldfzenodo>`_ package (>=1.0) provides a dataset
  resolver for DOI URLs pointing to the Zenodo archive.
"""
import re
from typing import Optional, Union
import pathlib
import zipfile
import warnings
import functools
import urllib.parse
import urllib.request
from importlib.metadata import entry_points

from csvw.utils import is_url

from pycldf import Dataset, iter_datasets, sniff
from pycldf.urlutil import url_without_fragment
from pycldf._compat import entry_points_select

__all__ = ['get_dataset', 'DatasetResolver']
EP = 'pycldf_dataset_resolver'

_resolvers = []


class DatasetResolver:  # pylint: disable=R0903
    """
    Abstract base for dataset resolvers; concrete resolvers subclass it and override `__call__`.

    :ivar priority: A number between 0 and 10 which fixes the order in which registered
        resolvers are consulted: the higher the priority, the earlier the resolver is called.
        A resolver claiming a high priority should therefore be quick to decide whether a
        locator concerns it at all.
    """
    priority = 5

    def __call__(
            self,
            loc: str,
            download_dir: pathlib.Path,
    ) -> Optional[Union[Dataset, pathlib.Path]]:
        """
        Try to resolve a dataset locator.

        :param loc: URL pointing to a place where datasets are archived.
        :param download_dir: A directory to which resolvers can download data.
        :return: `None` if the resolver does not apply to `loc`, a `Dataset` instance if a
            candidate dataset was found, or a local path pointing to a metadata file or to a
            directory to be searched for metadata files.
        """
        # Subclasses must provide the actual resolution logic.
        raise NotImplementedError  # pragma: no cover


class LocalResolver(DatasetResolver):  # pylint: disable=R0903
    """
    Resolves dataset locators specifying local file paths.
    """
    # Well above the 0-10 range, so local paths are checked before any other resolver runs.
    priority = 100

    def __call__(
            self,
            loc: Union[str, pathlib.Path],
            download_dir,
            base: Optional[pathlib.Path] = None,
    ) -> Optional[pathlib.Path]:
        """
        :param loc: Local path (URLs are rejected).
        :param download_dir: Unused; accepted for compatibility with the resolver interface.
        :param base: Optional directory relative to which a relative `loc` is interpreted. \
        Defaults to `None`, so the resolver can also be called with the two-argument \
        signature of the base class.
        :return: The existing local path (file or directory), or `None` if `loc` is a URL \
        or does not exist.
        """
        if isinstance(loc, str) and is_url(loc):
            return None
        loc = pathlib.Path(loc)
        if loc.resolve() != loc and base:
            # A relative path, to be interpreted relative to base. The comparison is also true
            # for non-normalized absolute paths, but joinpath() returns absolute paths unchanged.
            loc = base.resolve().joinpath(loc)
        if loc.exists():
            return loc
        return None  # pragma: no cover


class GenericUrlResolver(DatasetResolver):  # pylint: disable=R0903
    """
    URL resolver which works for generic URLs provided they point to a CLDF metadata file.
    """
    # Lowest priority: this is the fallback tried after every more specific resolver.
    priority = -1

    def __call__(self, loc, download_dir) -> Optional[Dataset]:
        """
        :param loc: Dataset locator (without fragment).
        :param download_dir: Unused; accepted for compatibility with the resolver interface.
        :return: The dataset loaded from the metadata at `loc`, or `None` if `loc` is no URL.
        """
        if is_url(loc):
            return Dataset.from_metadata(loc)
        return None  # pragma: no cover


class GitHubResolver(DatasetResolver):  # pylint: disable=R0903
    """
    Resolves dataset locators of the form "https://github.com/<org>/<repos>/tree/<tag>", e.g.
    https://github.com/cldf-datasets/petersonsouthasia/tree/v1.1
    or
    https://github.com/cldf-datasets/petersonsouthasia/releases/tag/v1.1
    """
    priority = 3

    def __call__(self, loc, download_dir) -> Optional[pathlib.Path]:
        """
        Download and unpack the zip archive GitHub provides for a tag.

        :param loc: Dataset locator (without fragment).
        :param download_dir: Directory into which the archive is downloaded and extracted.
        :return: The extracted top-level directory, or `None` if `loc` is not a GitHub URL \
        ending in a version-like tag.
        :raises ValueError: if the archive does not contain exactly one top-level directory.
        """
        url = urllib.parse.urlparse(loc)
        if url.netloc != 'github.com' or not re.search(r'/[v.0-9]+$', url.path):
            return None

        comps = url.path.split('/')
        # The path must provide organisation, repository and a separate tag component;
        # otherwise (e.g. "https://github.com/v1") this is not a repository URL.
        if len(comps) < 4 or not (comps[1] and comps[2]):
            return None
        org, repo, tag = comps[1], comps[2], comps[-1]

        archive = download_dir / f'{org}-{repo}-{tag}.zip'
        try:
            urllib.request.urlretrieve(
                f"https://github.com/{org}/{repo}/archive/refs/tags/{tag}.zip", archive)
            with zipfile.ZipFile(archive) as zf:
                dirs = {info.filename.split('/')[0] for info in zf.infolist()}
                # An explicit check rather than an assert, so that it survives `python -O`.
                if len(dirs) != 1:
                    raise ValueError(
                        f'Expected exactly one top-level directory in archive for {loc}')
                zf.extractall(download_dir)
        finally:
            # Never leave the archive behind, not even when download or extraction failed
            # (in which case the file may not exist at all).
            archive.unlink(missing_ok=True)
        return download_dir / dirs.pop()


class DatasetLocator(str):
    """Dataset locators are URLs with identifying information added to the fragment."""
    @functools.cached_property
    def parsed_url(self) -> urllib.parse.ParseResult:
        """The locator split into URL components; parsed once and cached on the instance."""
        return urllib.parse.urlparse(self)

    @property
    def url_without_fragment(self):
        """The locator with the fragment removed, as computed by `url_without_fragment`."""
        return url_without_fragment(self.parsed_url)

    def match(self, dataset: Dataset) -> bool:
        """
        Check whether `dataset` satisfies the condition encoded in the locator's fragment.

        :param dataset: Candidate dataset.
        :return: `True` if the locator has no fragment; for a fragment `key=value`, whether \
        the dataset property `key` equals `value`; for a fragment `key` (no or empty value), \
        whether the dataset has a property `key`.
        """
        fragment = self.parsed_url.fragment
        if not fragment:
            return True
        key, _, value = fragment.partition('=')
        if value:
            return dataset.properties.get(key) == value
        return key in dataset.properties


def get_resolvers() -> list[DatasetResolver]:
    """
    Register resolvers defined via entry points.

    Entry points are loaded and instantiated on the first call only and cached at module
    level; entry points failing with an `ImportError` are skipped with a warning.

    :return: The resolver instances, ordered by descending priority.
    """
    if not _resolvers:
        for ep in set(entry_points_select(entry_points(), EP)):
            try:
                _resolvers.append(ep.load()())
            except ImportError:  # pragma: no cover
                warnings.warn(f'ImportError loading entry point {ep.name}')
    return sorted(_resolvers, key=lambda res: -res.priority)


def _get_dataset(
        locator: DatasetLocator,
        location: Union[None, Dataset, pathlib.Path],
) -> Optional[Dataset]:
    """
    Determine whether locator matches location and if so, resolve to a Dataset instance.

    :param locator: The locator, whose fragment is matched against candidate datasets.
    :param location: What a resolver returned: a dataset, a path to a directory to be \
    searched for datasets, a path to a metadata or data file, or `None`.
    :return: The first matching dataset or `None`.
    """
    if location is None:
        # The signature admits `None` (resolver did not apply); there is nothing to match.
        return None
    if isinstance(location, Dataset):
        return location if locator.match(location) else None
    if location.is_dir():
        for ds in iter_datasets(location):
            if locator.match(ds):
                return ds
    else:
        # A single file: either a metadata file or a data file of a metadata-free dataset.
        ds = Dataset.from_metadata(location) if sniff(location) else Dataset.from_data(location)
        if locator.match(ds):
            return ds
    return None  # pragma: no cover


def get_dataset(
        locator: str,
        download_dir: pathlib.Path,
        base: Optional[pathlib.Path] = None,
) -> Dataset:
    """
    Ask the registered resolvers, in order, for the dataset a locator refers to.

    :param locator: Dataset locator as specified in "Dataset discovery".
    :param download_dir: Directory to which to download remote data if necessary.
    :param base: Optional path relative to which local paths in `locator` must be resolved.
    :return: The first dataset found by a resolver that matches `locator`.
    :raises ValueError: if no resolver yields a matching dataset.
    """
    locator = DatasetLocator(locator)
    for resolver in get_resolvers():
        # Local paths may need to be resolved relative to another path (e.g. the location of
        # a CLDF markdown document), so only the local resolver receives `base`.
        extra_args = (base,) if isinstance(resolver, LocalResolver) else ()
        candidate = resolver(locator.url_without_fragment, download_dir, *extra_args)
        if not candidate:
            continue
        dataset = _get_dataset(locator, candidate)
        if dataset:
            return dataset
    raise ValueError(f'Could not resolve dataset locator {locator}')