"""
This module provides tools to build a CLDF Markdown renderer.
For an example, see :class:`FilenameToComponent`.
"""
import re
from typing import Optional, Union, Any
import pathlib
import warnings
import collections
from collections.abc import Mapping
import dataclasses
import yaml
import jmespath
import frontmatter
import clldutils
from clldutils.markup import MarkdownLink
from pycldf.util import pkg_path, MD_SUFFIX
from pycldf.urlutil import url_without_fragment
from pycldf.sources import Source
from pycldf import Dataset
from pycldf import orm
from .discovery import get_dataset
__all__ = ['CLDFMarkdownLink', 'CLDFMarkdownText', 'FilenameToComponent']
#: The YAML frontmatter key to specify dataset mappings:
DATASETS_MAPPING = 'cldf-datasets'
#: Component name reported for links whose path is the dataset's `bibname` (or this name itself):
SOURCE_COMPONENT = 'Source'
#: Component name reported for links whose path is the dataset's `filename` (or this name itself):
METADATA_COMPONENT = 'Metadata'
class DatasetMapping(Mapping):
    """
    A read-only mapping of prefixes to datasets.

    Keys are dataset prefixes (`str` fully matching `key_pattern`) or `None` for the un-prefixed
    dataset; after initialization all values are `Dataset` instances.
    """
    key_pattern = re.compile('[a-zA-Z0-9_]+')

    @staticmethod
    def to_dict(o):  # pylint: disable=C0116
        """
        Normalize a dataset specification to a `dict`-like object.

        - A `DatasetMapping` yields its underlying dict (not a copy),
        - a falsy value yields a new empty `dict`,
        - a single `str` locator or `Dataset` is mapped to the `None` prefix,
        - anything else (expected to be a mapping) is returned unchanged.
        """
        if isinstance(o, DatasetMapping):
            return o.m
        if not o:
            return {}
        if isinstance(o, (str, Dataset)):
            return {None: o}
        return o

    def __init__(self,
                 m1,
                 m2=None,
                 doc_path: Optional[pathlib.Path] = None,
                 download_dir: Optional[pathlib.Path] = None):
        """
        :param m1: Mapping of prefixes to datasets (locators).
        :param m2: Mapping of prefixes to datasets (locators) to update `m1`.
        :param doc_path: Path of a CLDF markdown document, relative to which dataset locators \
        are to be resolved.
        :param download_dir: Path to an existing directory to which to download datasets \
        (if necessary).
        :raises ValueError: If a prefix is neither `None` nor a string matching `key_pattern`.
        """
        # Work on a shallow copy: `to_dict` may hand back the caller's own dict (or the dict of
        # another `DatasetMapping`), and both the update and the locator resolution below would
        # otherwise modify that object in place.
        self.m = dict(self.to_dict(m1))
        self.m.update(self.to_dict(m2))
        for prefix in self.m:
            if prefix is None:
                continue
            # Non-string keys (e.g. integers from YAML) are invalid prefixes, too.
            if not (isinstance(prefix, str) and DatasetMapping.key_pattern.fullmatch(prefix)):
                raise ValueError('Invalid dataset prefix')
        # Resolve every value that is not yet a `Dataset` (i.e. a locator). Iterating over a
        # snapshot keeps the loop independent of the assignments into `self.m`.
        for prefix, spec in list(self.m.items()):
            if not isinstance(spec, Dataset):
                self.m[prefix] = get_dataset(spec, download_dir, doc_path)

    def __getitem__(self, prefix: Union[str, None]) -> Dataset:
        """
        Get a `Dataset` mapped to a prefix.
        """
        return self.m[prefix]

    def __iter__(self):
        return iter(self.m)

    def __len__(self):
        return len(self.m)
@dataclasses.dataclass
class CLDFMarkdownLink(MarkdownLink):
    """
    CLDF Markdown links are specified using URLs of a particular format.

    A link is recognized as CLDF Markdown link if the fragment of its URL starts with `cldf:` or
    `cldf-<prefix>:`; the remainder of the fragment is the object identifier.

    Instances of `CLDFMarkdownLink` are supplied as sole argument when calling the replacement
    function passed to `CLDFMarkdownLink.replace` .
    """
    fragment_pattern = re.compile(r'cldf(-(?P<prefix>[a-zA-Z0-9_]+))?:')

    @property
    def url_without_fragment(self) -> str:
        """Return the HREF value of the link without the fragment."""
        return url_without_fragment(self.parsed_url)

    @classmethod
    def from_component(cls, comp, objid='__all__', label=None, prefix=None) -> 'CLDFMarkdownLink':
        """
        Create a CLDF Markdown link for an object in a component.

        :param comp: Component name, used as path of the URL.
        :param objid: Identifier of the referenced object; defaults to `__all__`.
        :param label: Link label; defaults to `<comp>:<objid>`.
        :param prefix: Optional dataset prefix.
        """
        # NOTE(review): `format_url` is not defined in the visible part of this module -
        # presumably provided elsewhere on this class or by a base class; confirm.
        return cls(label=label or f'{comp}:{objid}', url=cls.format_url(comp, objid, prefix=prefix))

    @property
    def is_cldf_link(self) -> bool:
        """
        Flag signaling whether the markdown link is indeed referencing a CLDF object.
        """
        return bool(self.fragment_pattern.match(self.parsed_url.fragment))

    @property
    def prefix(self) -> Optional[str]:
        """
        The dataset prefix associated with a CLDF Markdown link.

        `None` if the link has no prefix or is not a CLDF Markdown link.
        """
        # Match once and reuse the result for both the check and the group lookup.
        match = self.fragment_pattern.match(self.parsed_url.fragment)
        if match:
            return match.group('prefix')
        return None  # pragma: no cover

    @property
    def table_or_fname(self) -> Optional[str]:
        """
        The last path component of the URL of a CLDF Markdown link.

        `None` if the link is not a CLDF Markdown link.
        """
        if self.is_cldf_link:
            return self.parsed_url.path.split('/')[-1]
        return None  # pragma: no cover

    def component(
            self,
            cldf: Optional[Union[Dataset, dict[str, Dataset], DatasetMapping]] = None,
    ) -> Union[str, None]:
        """
        :param cldf: `pycldf.Dataset` instance to which the link refers, or a mapping of dataset \
        prefixes to `Dataset` instances from which the dataset is selected by the link's prefix.
        :return: Name of the CLDF component the link pertains to or `None`.
        """
        name = self.table_or_fname
        if name is None:
            # Not a CLDF Markdown link, so there is no component. Without this guard the lookup
            # below would evaluate `None + MD_SUFFIX` (MD_SUFFIX presumably being a `str`) and
            # raise `TypeError` instead of returning the documented `None`.
            return None
        if cldf is None:
            # If no CLDF dataset is passed as context, we can only detect links using proper
            # component names as path:
            if name in (SOURCE_COMPONENT, METADATA_COMPONENT):
                return name
            return name if pkg_path('components', name + MD_SUFFIX).exists() else None

        if isinstance(cldf, (dict, DatasetMapping)):
            cldf = cldf[self.prefix]

        if name in (cldf.bibname, SOURCE_COMPONENT):
            return SOURCE_COMPONENT
        if name in (cldf.filename, METADATA_COMPONENT):
            return METADATA_COMPONENT
        try:
            return cldf.get_tabletype(cldf[name])
        except (KeyError, ValueError):
            # Lookup failed for `name` in this dataset: report "no component".
            return None

    @property
    def objid(self) -> Optional[str]:
        """
        The identifier of the object referenced by a CLDF Markdown link.

        This is the part of the URL fragment after the first `:`.
        """
        if self.is_cldf_link:
            return self.parsed_url.fragment.split(':', maxsplit=1)[-1]
        return None  # pragma: no cover

    @property
    def all(self) -> bool:
        """
        Flag signaling whether the link is referencing the special `__all__` identifier.
        """
        return self.objid == '__all__'

    def get_row(self, cldf: Union[Dataset, DatasetMapping]) -> dict:
        """
        Resolve the reference in a CLDF Markdown link to a row (`dict`) in the CLDF `Dataset`.

        :raises AssertionError: If the link is no CLDF Markdown link, has an empty object \
        identifier or references `__all__`.
        """
        assert self.is_cldf_link and self.objid and (not self.all)
        ds = DatasetMapping(cldf)[self.prefix]
        return ds.get_row(self.component(cldf=ds), self.objid)

    def get_object(self, cldf: Union[Dataset, DatasetMapping]) -> orm.Object:
        """
        Resolve the reference in a CLDF Markdown link to an ORM object in the CLDF `Dataset`.

        :raises AssertionError: If the link is no CLDF Markdown link, has an empty object \
        identifier or references `__all__`.
        """
        assert self.is_cldf_link and self.objid and (not self.all)
        ds = DatasetMapping(cldf)[self.prefix]
        return ds.get_object(self.component(cldf=ds), self.objid)
class CLDFMarkdownText:
    """
    A CLDF Markdown document.

    To implement basic CLDF Markdown rendering, subclass and overwrite `render_link`; the
    `render` method then returns the markdown string with all CLDF Markdown links replaced.

    A trivial renderer, replacing each CLDF Markdown link with the link label, would look as
    follows:

    .. code-block:: python

        from pycldf.ext.markdown import CLDFMarkdownText

        class Renderer(CLDFMarkdownText):
            def render_link(self, link):
                return str(link.label)

        assert Renderer('[Example 1](ExampleTable#cldf:ex1)').render() == 'Example 1'

    :ivar text: `str` containing the markdown text (with YAML frontmatter removed).
    :ivar metadata: `dict` of document metadata read from YAML frontmatter.
    :ivar dataset_mapping: :class:`DatasetMapping` instance, linking prefixes used in CLDF \
    Markdown links to :class:`pycldf.Dataset` instances.
    """
    def __init__(self,
                 text: Union[pathlib.Path, str],
                 dataset_mapping: Optional[Union[str, Dataset, dict]] = None,
                 download_dir: Optional[pathlib.Path] = None):
        """
        :param text: CLDF Markdown text either to be read from a path or specified as `str`.
        :param dataset_mapping: Mapping of dataset prefixes to `Dataset` instances. May override \
        the mapping provided in YAML frontmatter as part of the text.
        :param download_dir: Optional path to a directory to download data for remote datasets.
        """
        if isinstance(text, str):
            post = frontmatter.loads(text)
        else:
            post = frontmatter.load(str(text))
        # Only a document read from a `pathlib.Path` provides a base directory for locators.
        doc_dir = text.parent if isinstance(text, pathlib.Path) else None

        self.metadata: dict[str, Any] = post.metadata
        self.dataset_mapping: Mapping[Union[str, None], Dataset] = DatasetMapping(
            post.get(DATASETS_MAPPING),
            dataset_mapping,
            doc_dir,
            download_dir,
        )
        self.text: str = post.content
        # Per-prefix cache: component name (or table name) -> data of that component.
        # Sources and metadata are loaded eagerly, other components on first access.
        self._datadict = collections.defaultdict(dict)
        for prefix, ds in self.dataset_mapping.items():
            per_dataset = self._datadict[prefix]
            per_dataset[SOURCE_COMPONENT] = {src.id: src for src in ds.sources}
            per_dataset[METADATA_COMPONENT] = ds.tablegroup.asdict(omit_defaults=True)

    @property
    def frontmatter(self) -> str:
        """
        The metadata of the markdown document, formatted as YAML frontmatter.
        """
        return '---\n' + yaml.dump(self.metadata) + '---'

    def get_object(self, ml: CLDFMarkdownLink) -> Union[list, orm.Object, Source, dict]:
        """
        Resolve the reference in a CLDF Markdown link to the matching object from a mapped dataset.

        The returned object is

        - an :class:`pycldf.orm.Object` instance for items in ORM mapped components,
        - a row `dict` for items in custom tables,
        - a :class:`pycldf.sources.Source` instance for source references,
        - a `list` of the above for the special `__all__` identifier,
        - a `jmespath.search` result for referenced items in the Metadata component (or the
          complete metadata `dict` for the special `__all__` identifier).

        This method can be used within :meth:`render_link` implementations.
        """
        prefix = ml.prefix
        cldf = self.dataset_mapping[prefix]
        comp = ml.component(cldf)
        # Fall back to the table or file name for tables which are no CLDF component.
        key = comp or ml.table_or_fname
        cache = self._datadict[prefix]

        if key == METADATA_COMPONENT:
            metadata = cache[METADATA_COMPONENT]
            if ml.all:
                return metadata
            return jmespath.search(ml.objid, metadata)

        if key not in cache:  # A new type of data is referenced.
            if comp:
                objs = cldf.objects(comp)
            else:
                objs = cldf.iter_rows(key, 'id')
            index = {}
            for obj in objs:
                oid = obj.id if isinstance(obj, orm.Object) else obj['id']
                index[oid] = obj
            cache[key] = index

        items = cache[key]
        if ml.all:
            return list(items.values())
        return items[ml.objid]

    def _render_link(self, link: CLDFMarkdownLink) -> Union[str, CLDFMarkdownLink]:
        """Hand CLDF links to `render_link`; pass any other link through unchanged."""
        if not link.is_cldf_link:
            return link
        return self.render_link(link)

    def render_link(self, cldf_link: CLDFMarkdownLink) -> Union[str, CLDFMarkdownLink]:
        """
        CLDF Markdown renderers must implement this method.
        """
        raise NotImplementedError()  # pragma: no cover

    def render(
            self,
            simple_link_detection: bool = True,
            markdown_kw: Optional[dict[str, Any]] = None,
    ) -> str:
        """
        A markdown string with CLDF Markdown links replaced.

        :param simple_link_detection: Passed as `simple` to `CLDFMarkdownLink.replace` \
        (with clldutils>=3.14 only).
        :param markdown_kw: Passed as `markdown_kw` to `CLDFMarkdownLink.replace` \
        (with clldutils>=3.14 only).
        """
        major_minor = tuple(int(part) for part in clldutils.__version__.split('.')[:2])
        if major_minor >= (3, 14):
            kw = {'simple': simple_link_detection, 'markdown_kw': markdown_kw}
        else:  # pragma: no cover
            # Older clldutils does not accept the extra keyword arguments.
            kw = {}
            if markdown_kw or not simple_link_detection:
                warnings.warn(
                    'Extended markdown link detection is only supported with clldutils>=3.14',
                    category=UserWarning)
        return CLDFMarkdownLink.replace(self.text, self._render_link, **kw)
class FilenameToComponent(CLDFMarkdownText):
    """
    Renderer to replace filenames in CLDF Markdown links with CLDF component names.
    """
    def render_link(self, cldf_link: CLDFMarkdownLink) -> CLDFMarkdownLink:
        """
        Rewrites the URL of CLDF Markdown links, using the component name as path component.

        Links for which no component can be determined are returned unchanged.
        """
        comp = cldf_link.component(cldf=self.dataset_mapping)
        if comp:
            # Reuse the component resolved above rather than resolving it a second time.
            return cldf_link.update_url(path=comp)
        return cldf_link