Source code for pycldf.trees

"""
Support for the CLDF TreeTable component.

The peculiarity of a tree object in CLDF lies in the fact that the actual tree data is pulled in
from a media file in newick or Nexus format. This "pulling in" is implemented in the method
:meth:`Tree.newick`.

Accessing `Tree` instances associated with a dataset is done using a :class:`Trees` instance.

.. code-block:: python

    >>> from pycldf import Dataset
    >>> from pycldf.trees import TreeTable
    >>> ds = Dataset.from_metadata('tests/data/dataset_with_trees/metadata.json')
    >>> trees = list(TreeTable(ds))
    >>> print(trees[0].newick().ascii_art())
              ┌─l1
         ┌────┤
         │    └─l2
    ─────┤
         ├─l3
         └─l4
"""
import typing
import logging
import pathlib

from clldutils.misc import log_or_raise
from commonnexus import Nexus
import newick

import pycldf
from pycldf.media import MediaTable, File


__all__ = ['Tree', 'TreeTable']


[docs]class Tree: """ Represents a tree object as specified in a row of `TreeTable`. """ def __init__(self, trees: 'TreeTable', row: dict, file: File): self.row = row self.id = row[trees.cols['id'].name] self.name = row[trees.cols['name'].name] self.file = file for prop in ['description', 'treeType', 'treeIsRooted', 'treeBranchLengthUnit']: attrib = ''.join('_' + c.lower() if c.isupper() else c for c in prop) setattr(self, attrib, row.get(trees.cols[prop].name) if trees.cols[prop] else None) self.trees = trees
[docs] def newick_string(self, d: typing.Optional[pathlib.Path] = None) -> str: """ Retrieve the Newick representation of the tree from the associated tree file. :param d: Directory where the tree file was saved earlier, using \ :meth:`pycldf.media.File.save`. :return: Newick representation of the associated tree. """ if self.file.id not in self.trees._parsed_files: content = self.file.read(d=d) if self.file.mimetype == 'text/x-nh': self.trees._parsed_files[self.file.id] = { str(index): nwk for index, nwk in enumerate( [t.strip() for t in content.split(';') if t.strip()], start=1)} else: self.trees._parsed_files[self.file.id] = { tree.name: tree.newick_string for tree in Nexus(content).TREES.trees} return self.trees._parsed_files[self.file.id][self.name]
[docs] def newick(self, d: typing.Optional[pathlib.Path] = None, strip_comments: bool = False) -> newick.Node: """ Retrieve a `newick.Node` instance for the tree from the associated tree file. :param d: Directory where the tree file was saved earlier, using \ :meth:`pycldf.media.File.save`. :param strip_comments: Flag signaling whether to strip comments enclosed in square \ brackets. :return: `newick.Node` representing the root of the associated tree. """ return newick.loads(self.newick_string(d=d), strip_comments=strip_comments)[0]
[docs]class TreeTable(pycldf.ComponentWithValidation): """ Container class for a `Dataset`'s TreeTable. """ def __init__(self, ds: pycldf.Dataset): super().__init__(ds) self.media = MediaTable(ds) self.media_rows = {row[self.media.id_col.name]: row for row in ds['MediaTable']} self.cols = { prop: self.ds.get((self.table, prop)) for prop in [ 'id', 'name', 'description', 'mediaReference', 'treeIsRooted', 'treeType', 'treeBranchLengthUnit']} # Since reading and parsing tree files is expensive, we cache them. self._parsed_files = {} def __iter__(self) -> typing.Generator[Tree, None, None]: for row in self.table: yield Tree( self, row, File(self.media, self.media_rows[row[self.cols['mediaReference'].name]])) def validate(self, success: bool = True, log: logging.Logger = None) -> bool: lids = {r['id'] for r in self.ds.iter_rows('LanguageTable', 'id')} for tree in self: try: nwk = tree.newick() except KeyError: log_or_raise( 'No newick tree found for name "{}"'.format(tree.name), log=log) success = False nwk = None if nwk: for node in nwk.walk(): if node.name and (node.name not in lids): log_or_raise( 'Newick node label "{}" is not a LanguageTable ID'.format(node.name), log=log) success = False return success