# Source code for pycldf.trees

"""
Support for the CLDF TreeTable component.

The peculiarity of a tree object in CLDF lies in the fact that the actual tree data is pulled in
from a media file in Newick or Nexus format. This "pulling in" is implemented in the method
:meth:`Tree.newick`.

Accessing `Tree` instances associated with a dataset is done using a :class:`TreeTable` instance.

.. code-block:: python

    >>> from pycldf import Dataset
    >>> from pycldf.trees import TreeTable
    >>> ds = Dataset.from_metadata('tests/data/dataset_with_trees/metadata.json')
    >>> trees = list(TreeTable(ds))
    >>> print(trees[0].newick().ascii_art())
              ┌─l1
         ┌────┤
         │    └─l2
    ─────┤
         ├─l3
         └─l4
"""
import typing
import logging
import pathlib

from clldutils.misc import log_or_raise
from commonnexus import Nexus
import newick

import pycldf
from pycldf.media import MediaTable, File


__all__ = ['Tree', 'TreeTable']


class Tree:
    """
    Represents a tree object as specified in a row of `TreeTable`.

    Besides `row`, `id`, `name`, `file` and `trees`, instances expose the optional properties
    of the row as attributes `description`, `tree_type`, `tree_is_rooted` and
    `tree_branch_length_unit`. These are `None` if no column is available for the property
    or if the row has no entry for the column.
    """
    def __init__(self, trees: 'TreeTable', row: dict, file: File):
        """
        :param trees: The `TreeTable` the tree belongs to; provides the column lookup and the \
        cache of parsed tree files.
        :param row: The row of the TreeTable describing the tree.
        :param file: The media file containing the tree data.
        """
        self.row = row
        self.id = row[trees.cols['id'].name]
        self.name = row[trees.cols['name'].name]
        self.file = file
        for prop in ['description', 'treeType', 'treeIsRooted', 'treeBranchLengthUnit']:
            # Turn the camelCase property name into a snake_case attribute name,
            # e.g. "treeIsRooted" -> "tree_is_rooted".
            attrib = ''.join('_' + c.lower() if c.isupper() else c for c in prop)
            col = trees.cols[prop]
            setattr(self, attrib, row.get(col.name) if col else None)
        self.trees = trees

    def newick_string(self, d: typing.Optional[pathlib.Path] = None) -> str:
        """
        Retrieve the Newick representation of the tree from the associated tree file.

        The file is read and parsed on first access only. The result is cached on the
        `TreeTable`, keyed by file ID, so it is shared by all trees stored in the same file.

        :param d: Directory where the tree file was saved earlier, using \
        :meth:`pycldf.media.File.save`.
        :return: Newick representation of the associated tree.
        :raises KeyError: If the parsed file has no tree stored under the name of this tree.
        """
        parsed = self.trees._parsed_files
        if self.file.id not in parsed:
            content = self.file.read(d=d)
            if self.file.mimetype == 'text/x-nh':
                # Trees in a Newick file are keyed by their 1-based position in the file.
                # NOTE: Splitting on ";" assumes that no semicolon appears inside a tree
                # (e.g. in a quoted label or a comment).
                statements = (t.strip() for t in content.split(';'))
                parsed[self.file.id] = {
                    str(index): nwk
                    for index, nwk in enumerate((t for t in statements if t), start=1)}
            else:
                # Files with any other mimetype are parsed as Nexus; trees are keyed by the
                # names given in the TREES block.
                parsed[self.file.id] = {
                    tree.name: tree.newick_string for tree in Nexus(content).TREES.trees}

        return parsed[self.file.id][self.name]

    def newick(self,
               d: typing.Optional[pathlib.Path] = None,
               strip_comments: bool = False) -> newick.Node:
        """
        Retrieve a `newick.Node` instance for the tree from the associated tree file.

        :param d: Directory where the tree file was saved earlier, using \
        :meth:`pycldf.media.File.save`.
        :param strip_comments: Flag signaling whether to strip comments enclosed in square \
        brackets.
        :return: `newick.Node` representing the root of the associated tree.
        :raises KeyError: If the parsed file has no tree stored under the name of this tree.
        """
        # `newick.loads` returns a list of trees; the string holds exactly one.
        return newick.loads(self.newick_string(d=d), strip_comments=strip_comments)[0]


class TreeTable(pycldf.ComponentWithValidation):
    """
    Container class for a `Dataset`'s TreeTable.

    Iterating over an instance yields a :class:`Tree` object for each row of the table.
    """
    def __init__(self, ds: pycldf.Dataset):
        """
        :param ds: The dataset whose TreeTable is to be accessed.
        """
        super().__init__(ds)
        self.media = MediaTable(ds)
        # Index the rows of the MediaTable by ID, to resolve the media references of trees.
        self.media_rows = {row[self.media.id_col.name]: row for row in ds['MediaTable']}
        # Maps property names to whatever `Dataset.get` returns for the property in this
        # table (presumably a column object, or `None` if the column is not defined).
        self.cols = {
            prop: self.ds.get((self.table, prop)) for prop in [
                'id', 'name', 'description', 'mediaReference',
                'treeIsRooted', 'treeType', 'treeBranchLengthUnit']}
        # Since reading and parsing tree files is expensive, we cache them. The cache maps
        # file IDs to dicts mapping tree names to Newick strings.
        self._parsed_files = {}

    def __iter__(self) -> typing.Generator[Tree, None, None]:
        """
        Yield a `Tree` for each row of the table, linked to the media file it references.
        """
        media_ref = self.cols['mediaReference'].name
        for row in self.table:
            yield Tree(self, row, File(self.media, self.media_rows[row[media_ref]]))

    def validate(self,
                 success: bool = True,
                 log: typing.Optional[logging.Logger] = None) -> bool:
        """
        Make sure a Newick tree can be retrieved for each row of the table, and that all
        node labels in these trees are IDs of rows in the dataset's LanguageTable.

        :param success: Validation status so far; returned as is if no problem is found.
        :param log: Logger passed on to `log_or_raise` to report problems.
        :return: `False` if any problem was reported, `success` otherwise.
        """
        lids = {r['id'] for r in self.ds.iter_rows('LanguageTable', 'id')}
        for tree in self:
            try:
                nwk = tree.newick()
            except KeyError:
                # The tree file has no tree stored under the name given in the row.
                log_or_raise(
                    'No newick tree found for name "{}"'.format(tree.name),
                    log=log)
                success = False
                continue

            if not nwk:
                continue

            for node in nwk.walk():
                # Unnamed nodes are not checked.
                if node.name and (node.name not in lids):
                    log_or_raise(
                        'Newick node label "{}" is not a LanguageTable ID'.format(node.name),
                        log=log)
                    success = False
        return success