"""
This module provides tools to build a CLDF Markdown renderer.
For an example, see :class:`FilenameToComponent`.
"""
import re
import typing
import pathlib
import warnings
import collections.abc
import yaml
import jmespath
import attr
import frontmatter
import clldutils
from clldutils.markup import MarkdownLink
from .discovery import get_dataset
from pycldf.util import pkg_path, url_without_fragment
from pycldf.dataset import MD_SUFFIX
from pycldf.sources import Source
from pycldf import Dataset
from pycldf import orm
# Names exported as the public API of this module:
__all__ = ['CLDFMarkdownLink', 'CLDFMarkdownText', 'FilenameToComponent']

#: The YAML frontmatter key to specify dataset mappings:
DATASETS_MAPPING = 'cldf-datasets'
#: Name of the special "Source" component:
SOURCE_COMPONENT = 'Source'
#: Name of the special "Metadata" component:
METADATA_COMPONENT = 'Metadata'
class DatasetMapping(collections.abc.Mapping):
    """
    A read-only mapping of prefixes to datasets.

    Keys are dataset prefixes (`str` fully matching :attr:`key_pattern`) or `None`; values are
    `Dataset` instances. Values which are not `Dataset` instances on input are treated as
    dataset locators and resolved with `get_dataset` at construction time.
    """
    key_pattern = re.compile('[a-zA-Z0-9_]+')

    @staticmethod
    def to_dict(o):
        """
        Normalize a dataset mapping specification to a `dict`.

        :param o: A `DatasetMapping`, a mapping of prefixes to datasets (locators), a single \
        dataset (locator) as `str` or `Dataset`, or a falsy value.
        :return: The underlying `dict` for a `DatasetMapping`, an empty `dict` for falsy input, \
        `{None: o}` for a single dataset (locator), otherwise `o` itself. Note that the returned \
        object may be shared with the input - copy it before mutating.
        """
        if isinstance(o, DatasetMapping):
            return o.m
        return {} if not o else ({None: o} if isinstance(o, (str, Dataset)) else o)

    def __init__(self,
                 m1,
                 m2=None,
                 doc_path: typing.Optional[pathlib.Path] = None,
                 download_dir: typing.Optional[pathlib.Path] = None):
        """
        :param m1: Mapping of prefixes to datasets (locators).
        :param m2: Mapping of prefixes to datasets (locators) to update `m1`.
        :param doc_path: Path of a CLDF markdown document, relative to which dataset locators are \
        to be resolved.
        :param download_dir: Path to an existing directory to which to download datasets \
        (if necessary).
        :raises ValueError: If a prefix is neither `None` nor fully matches `key_pattern`.
        """
        # Work on a copy: `to_dict` may hand back the caller's own dict (or the internal dict of
        # another `DatasetMapping`), which must not be altered by the update and the locator
        # resolution below.
        self.m = dict(self.to_dict(m1))
        self.m.update(self.to_dict(m2))
        if not all(k is None or DatasetMapping.key_pattern.fullmatch(k) for k in self.m):
            raise ValueError('Invalid dataset prefix')
        # Only values are replaced, so iterating over the items while assigning is safe.
        for k, v in self.m.items():
            if not isinstance(v, Dataset):
                self.m[k] = get_dataset(v, download_dir, doc_path)

    def __getitem__(self, prefix: typing.Union[str, None]) -> 'Dataset':
        """
        Get a `Dataset` mapped to a prefix.

        :raises KeyError: If no dataset is mapped to `prefix`.
        """
        return self.m[prefix]

    def __iter__(self):
        """Iterate over the prefixes."""
        return iter(self.m)

    def __len__(self):
        """Number of mapped prefixes."""
        return len(self.m)
@attr.s
class CLDFMarkdownLink(MarkdownLink):
    """
    CLDF Markdown links are specified using URLs of a particular format.

    Instances of `CLDFMarkdownLink` are supplied as sole argument when calling the replacement
    function passed to `CLDFMarkdownLink.replace`.

    The URL fragment of a CLDF Markdown link starts with ``cldf:`` or ``cldf-<prefix>:``, followed
    by an object identifier; the last path component of the URL names a table, file or component.
    """
    fragment_pattern = re.compile(r'cldf(-(?P<prefix>[a-zA-Z0-9_]+))?:')

    @property
    def url_without_fragment(self):
        """
        Result of passing the parsed link URL to :func:`pycldf.util.url_without_fragment`.
        """
        # Within the method body the name resolves to the imported module-level function,
        # not to this property.
        return url_without_fragment(self.parsed_url)

    @staticmethod
    def format_url(path, objid, prefix=None):
        """
        Format a CLDF Markdown link URL as ``<path>#cldf[-<prefix>]:<objid>``.
        """
        return '{}#cldf{}:{}'.format(path, '-' + prefix if prefix else '', objid)

    @classmethod
    def from_component(cls, comp, objid='__all__', label=None, prefix=None) -> 'CLDFMarkdownLink':
        """
        Create a link for an object in a component.

        :param comp: Component name, used as URL path.
        :param objid: Object identifier; defaults to the special `__all__` identifier.
        :param label: Link label; defaults to ``<comp>:<objid>``.
        :param prefix: Optional dataset prefix.
        """
        return cls(
            label=label or '{}:{}'.format(comp, objid),
            url=cls.format_url(comp, objid, prefix=prefix))

    @property
    def is_cldf_link(self) -> bool:
        """
        Flag signaling whether the markdown link is indeed referencing a CLDF object.
        """
        return bool(self.fragment_pattern.match(self.parsed_url.fragment))

    @property
    def prefix(self) -> typing.Union[None, str]:
        """
        The dataset prefix associated with a CLDF Markdown link.

        `None` if the link is not a CLDF Markdown link or specifies no prefix.
        """
        m = self.fragment_pattern.match(self.parsed_url.fragment)
        return m.group('prefix') if m else None

    @property
    def table_or_fname(self) -> typing.Union[None, str]:
        """
        The last path component of the URL of a CLDF Markdown link.

        `None` if the link is not a CLDF Markdown link.
        """
        if self.is_cldf_link:
            return self.parsed_url.path.split('/')[-1]
        return None

    def component(self,
                  cldf: typing.Optional[
                      typing.Union[Dataset, typing.Dict[str, Dataset], DatasetMapping]] = None,
                  ) -> typing.Union[str, None]:
        """
        :param cldf: `pycldf.Dataset` instance to which the link refers, or a mapping of dataset \
        prefixes to `pycldf.Dataset` instances from which the dataset is selected by link prefix.
        :return: Name of the CLDF component the link pertains to or `None`.
        """
        name = self.table_or_fname
        if name is None:
            # Not a CLDF Markdown link: there is nothing to look up. Without this guard the
            # code below would fail on `None + MD_SUFFIX` or on looking up the `None` prefix.
            return None
        if cldf is None:
            # If no CLDF dataset is passed as context, we can only detect links using proper
            # component names as path:
            if name in [SOURCE_COMPONENT, METADATA_COMPONENT] \
                    or pkg_path('components', name + MD_SUFFIX).exists():
                return name
            return None

        if isinstance(cldf, (dict, DatasetMapping)):
            cldf = cldf[self.prefix]

        if name == cldf.bibname or name == SOURCE_COMPONENT:
            return SOURCE_COMPONENT
        if name == cldf.filename or name == METADATA_COMPONENT:
            return METADATA_COMPONENT
        try:
            return cldf.get_tabletype(cldf[name])
        except (KeyError, ValueError):
            return None

    @property
    def objid(self) -> typing.Union[None, str]:
        """
        The identifier of the object referenced by a CLDF Markdown link.

        This is the part of the URL fragment after the first colon; `None` if the link is not a
        CLDF Markdown link.
        """
        if self.is_cldf_link:
            return self.parsed_url.fragment.split(':', maxsplit=1)[-1]
        return None

    @property
    def all(self) -> bool:
        """
        Flag signaling whether the link is referencing the special `__all__` identifier.
        """
        return self.objid == '__all__'

    def get_row(self, cldf: typing.Union[Dataset, DatasetMapping]) -> dict:
        """
        Resolve the reference in a CLDF Markdown link to a row (`dict`) in the CLDF `Dataset`.

        The link must be a CLDF Markdown link with a non-empty object identifier other than
        `__all__`; this precondition is checked with an `assert`.
        """
        assert self.is_cldf_link and self.objid and (not self.all)
        ds = DatasetMapping(cldf)[self.prefix]
        return ds.get_row(self.component(cldf=ds), self.objid)

    def get_object(self, cldf: typing.Union[Dataset, DatasetMapping]) -> orm.Object:
        """
        Resolve the reference in a CLDF Markdown link to an ORM object in the CLDF `Dataset`.

        The link must be a CLDF Markdown link with a non-empty object identifier other than
        `__all__`; this precondition is checked with an `assert`.
        """
        assert self.is_cldf_link and self.objid and (not self.all)
        ds = DatasetMapping(cldf)[self.prefix]
        return ds.get_object(self.component(cldf=ds), self.objid)
class CLDFMarkdownText:
    """
    A CLDF Markdown document.

    Basic CLDF Markdown rendering can be implemented by overwriting the `render_link` method.
    Then, calling the `render` method will return a markdown string with CLDF Markdown links
    replaced.

    A trivial renderer, replacing each CLDF Markdown link with the link label, would look as
    follows:

    .. code-block:: python

        from pycldf.ext.markdown import CLDFMarkdownText

        class Renderer(CLDFMarkdownText):
            def render_link(self, link):
                return str(link.label)

        assert Renderer('[Example 1](ExampleTable#cldf:ex1)').render() == 'Example 1'

    :ivar text: `str` containing the markdown text (with YAML frontmatter removed).
    :ivar metadata: `dict` of document metadata read from YAML frontmatter.
    :ivar dataset_mapping: :class:`DatasetMapping` instance, linking prefixes used in CLDF
        Markdown links to :class:`pycldf.Dataset` instances.
    :cvar source_component: Name of the special "Source" component.
    :cvar metadata_component: Name of the special "Metadata" component.
    """
    def __init__(self,
                 text: typing.Union[pathlib.Path, str],
                 dataset_mapping: typing.Optional[typing.Union[str, Dataset, dict]] = None,
                 download_dir: typing.Optional[pathlib.Path] = None):
        """
        :param text: CLDF Markdown text either to be read from a path or specified as `str`.
        :param dataset_mapping: Mapping of dataset prefixes to `Dataset` instances. May override
            the mapping provided in YAML frontmatter as part of the text.
        :param download_dir: Optional path to a directory to download data for remote datasets.
        """
        if isinstance(text, str):
            post = frontmatter.loads(text)
        else:
            post = frontmatter.load(str(text))
        # Dataset locators are only resolved relative to the document when it was read from
        # a `pathlib.Path`.
        doc_dir = text.parent if isinstance(text, pathlib.Path) else None

        self.metadata = post.metadata
        self.dataset_mapping = DatasetMapping(
            post.get(DATASETS_MAPPING), dataset_mapping, doc_dir, download_dir)
        self.text = post.content

        # Per-prefix lookup tables, keyed by component or table name. Sources and metadata are
        # loaded eagerly; other components are added on demand in `get_object`.
        self._datadict = collections.defaultdict(dict)
        for prefix, ds in self.dataset_mapping.items():
            per_dataset = self._datadict[prefix]
            per_dataset[SOURCE_COMPONENT] = {src.id: src for src in ds.sources}
            per_dataset[METADATA_COMPONENT] = ds.tablegroup.asdict(omit_defaults=True)

    @property
    def frontmatter(self) -> str:
        """
        The markdown document's metadata formatted as YAML frontmatter.
        """
        dumped = yaml.dump(self.metadata)
        return '---\n{}---'.format(dumped)

    def get_object(self, ml: CLDFMarkdownLink) -> typing.Union[list, orm.Object, Source, dict]:
        """
        Resolve the reference in a CLDF Markdown link to the matching object from a mapped dataset.

        The returned object is

        - an :class:`pycldf.orm.Object` instance for items in ORM mapped components,
        - a row `dict` for items in custom tables,
        - a :class:`pycldf.sources.Source` instance for source references,
        - a `list` of the above for the special `__all__` identifier,
        - a `jmespath.search` result for referenced items in the Metadata component (or the
          complete metadata `dict` for the special `__all__` identifier).

        This method can be used within :meth:`render_link` implementations.
        """
        cldf = self.dataset_mapping[ml.prefix]
        comp = ml.component(cldf)
        key = comp or ml.table_or_fname
        cache = self._datadict[ml.prefix]

        if key == METADATA_COMPONENT:
            metadata = cache[METADATA_COMPONENT]
            if ml.all:
                return metadata
            return jmespath.search(ml.objid, metadata)

        if key not in cache:  # A new type of data is referenced.
            if comp:
                objs = cldf.objects(comp)
            else:
                objs = cldf.iter_rows(key, 'id')
            by_id = {}
            for item in objs:
                item_id = item.id if isinstance(item, orm.Object) else item['id']
                by_id[item_id] = item
            cache[key] = by_id

        items = cache[key]
        if ml.all:
            return list(items.values())
        return items[ml.objid]

    def _render_link(self, link):
        """
        Replacement callback: dispatch CLDF Markdown links to `render_link`, pass others through.
        """
        if not link.is_cldf_link:
            return link
        return self.render_link(link)

    def render_link(self, cldf_link: CLDFMarkdownLink) -> typing.Union[str, CLDFMarkdownLink]:
        """
        CLDF Markdown renderers must implement this method.
        """
        raise NotImplementedError()  # pragma: no cover

    def render(self,
               simple_link_detection: bool = True,
               markdown_kw: typing.Optional[dict] = None) -> str:
        """
        A markdown string with CLDF Markdown links replaced.

        :param simple_link_detection: Passed as `simple` to `CLDFMarkdownLink.replace` with
            clldutils>=3.14.
        :param markdown_kw: Passed as `markdown_kw` to `CLDFMarkdownLink.replace` with
            clldutils>=3.14.
        """
        installed = tuple(map(int, clldutils.__version__.split('.')[:2]))
        if installed < (3, 14):  # pragma: no cover
            # Older clldutils does not accept the extra keyword arguments.
            if markdown_kw or not simple_link_detection:
                warnings.warn(
                    'Extended markdown link detection is only supported with clldutils>=3.14',
                    category=UserWarning)
            kw = {}
        else:
            kw = dict(simple=simple_link_detection, markdown_kw=markdown_kw)
        return CLDFMarkdownLink.replace(self.text, self._render_link, **kw)
class FilenameToComponent(CLDFMarkdownText):
    """
    Renderer to replace filenames in CLDF Markdown links with CLDF component names.
    """
    def render_link(self, cldf_link):
        """
        Rewrites the URL of CLDF Markdown links, using the component name as path component.

        :param cldf_link: The CLDF Markdown link to rewrite.
        :return: The result of `cldf_link.update_url` with the component name as path, or the \
        link itself if no component could be determined.
        """
        # Determine the component once and reuse it for the rewritten path.
        comp = cldf_link.component(cldf=self.dataset_mapping)
        if comp:
            return cldf_link.update_url(path=comp)
        return cldf_link