"""
Object oriented (read-only) access to CLDF data
To read ORM objects from a `pycldf.Dataset`, there are two generic methods:
* :meth:`pycldf.Dataset.objects`
* :meth:`pycldf.Dataset.get_object`
Both will return default implementations of the objects, i.e. instances of the corresponding
class defined in this module. To customize these objects,
1. subclass the default and specify \
the appropriate component (i.e. the table of the CLDF dataset which holds rows to be transformed\
to this type):
.. code-block:: python
from pycldf.orm import Language
class Variety(Language):
__component__ = 'LanguageTable'
def custom_method(self):
pass
2. pass the class into the `objects` or `get_object` method.
In addition, module-specific subclasses of :class:`pycldf.Dataset` provide more meaningful
properties and methods, as shortcuts to the methods above. See
`<./dataset.html#subclasses-supporting-specific-cldf-modules>`_ for details.
Limitations:
------------
* We only support foreign key constraints for CLDF reference properties targeting either a \
component's CLDF id or its primary key. This is because CSVW does not support unique constraints \
other than the one implied by the primary key declaration.
* This functionality comes with the typical "more convenient API vs. less performance and bigger \
memory footprint" trade-off. If you are running into problems with this, you might want to load \
your data into a SQLite db using the `pycldf.db` module, and access via SQL. \
Some numbers (to be interpreted relative to each other): \
Reading ~400,000 rows from a ValueTable of a StructureDataset takes
* ~2secs with csvcut, i.e. only making sure it's valid CSV
* ~15secs iterating over ``pycldf.Dataset['ValueTable']``
* ~35secs iterating over ``pycldf.Dataset.objects('ValueTable')``
"""
import types
import typing
import functools
import collections
import csvw.metadata
from tabulate import tabulate
from pycldf.terms import TERMS, term_uri
from pycldf.util import DictTuple
from pycldf.sources import Reference
if typing.TYPE_CHECKING:
from pycldf import Dataset # pragma: no cover
[docs]class Object:
"""
Represents a row of a CLDF component table.
Subclasses of `Object` are instantiated when calling `Dataset.objects` or `Dataset.get_object`.
:ivar dataset: Reference to the `Dataset` instance, this object was loaded from.
:ivar data: An `OrderedDict` with a copy of the row the object was instantiated with.
:ivar cldf: A `dict` with CLDF-specified properties of the row, keyed with CLDF terms.
:ivar id: The value of the CLDF id property of the row.
:ivar name: The value of the CLDF name property of the row.
:ivar description: The value of the CLDF description property of the row.
:ivar pk: The value of the column specified as primary key for the table. (May differ from id)
"""
# If a subclass name can not be used to derive the CLDF component name, the component can be
# specified here:
__component__ = None
def __init__(self, dataset: 'Dataset', row: dict):
# Get a mapping of column names to pairs (CLDF property name, list-valued) for columns
# present in the component specified by class name.
cldf_cols = {
v[0]: (k, v[1])
for k, v in vars(getattr(dataset.readonly_column_names, self.component)).items()
if v}
self._listvalued = set(v[0] for v in cldf_cols.values() if v[1])
self.cldf = {}
self.data = collections.OrderedDict()
for k, v in row.items():
# We go through the items of the row and slot them into the appropriate bags:
self.data[k] = v
if k in cldf_cols:
self.cldf[cldf_cols[k][0]] = v
# Make cldf properties accessible as attributes:
self.cldf = types.SimpleNamespace(**self.cldf)
self.dataset = dataset
self.id = self.cldf.id
self.pk = None
t = dataset[self.component_name()]
if t.tableSchema.primaryKey and len(t.tableSchema.primaryKey) == 1:
self.pk = self.data[dataset[self.component_name()].tableSchema.primaryKey[0]]
self.name = getattr(self.cldf, 'name', None)
self.description = getattr(self.cldf, 'name', None)
def __repr__(self):
return '<{}.{} id="{}">'.format(self.__class__.__module__, self.__class__.__name__, self.id)
@classmethod
def component_name(cls) -> str:
return cls.__component__ or (cls.__name__ + 'Table')
@property
def component(self) -> str:
"""
Name of the CLDF component the object belongs to. Can be used to lookup the corresponding \
table via `obj.dataset[obj.component_name()]`.
"""
return self.__class__.component_name()
@property
def key(self) -> typing.Tuple[int, str, str]:
return id(self.dataset), self.__class__.__name__, self.id
def __hash__(self):
return hash(self.key)
def __eq__(self, other):
if isinstance(self, Object):
return self.key == other.key
return NotImplemented # pragma: no cover
def _expand_uritemplate(self, attr, col):
"""
CSVW cells can specify various URI templates which must be expanded supplying the full
row as context. Thus, expansion is available as method on this row object.
"""
col = self.dataset[self.component, col]
variables = {k: v for k, v in vars(self.cldf).items()}
variables.update(self.data)
if getattr(col, attr, None):
return getattr(col, attr).expand(**variables)
[docs] def aboutUrl(self, col='id') -> typing.Union[str, None]:
"""
The table's `aboutUrl` property, expanded with the object's row as context.
"""
return self._expand_uritemplate('aboutUrl', col)
[docs] def valueUrl(self, col='id') -> typing.Union[str, None]:
"""
The table's `valueUrl` property, expanded with the object's row as context.
"""
return self._expand_uritemplate('valueUrl', col)
[docs] def propertyUrl(self, col='id') -> typing.Union[str, None]:
"""
The table's `propertyUrl` property, expanded with the object's row as context.
"""
return self._expand_uritemplate('propertyUrl', col)
@functools.cached_property
def references(self) -> typing.Tuple[Reference]:
"""
`pycldf.Reference` instances associated with the object.
>>> obj.references[0].source['title']
>>> obj.references[0].fields.title
>>> obj.references[0].description # The "context", typically cited pages
"""
return DictTuple(
self.dataset.sources.expand_refs(getattr(self.cldf, 'source', []) or []),
key=lambda r: r.source.id,
multi=True,
)
class _WithLanguageMixin:
@property
def language(self):
return self.related('languageReference')
@property
def languages(self):
return self.all_related('languageReference')
class _WithParameterMixin:
@functools.cached_property
def parameter(self):
return self.related('parameterReference')
@property
def parameters(self):
return self.all_related('parameterReference')
[docs]class Borrowing(Object):
@property
def targetForm(self):
return self.related('targetFormReference')
@property
def sourceForm(self):
return self.related('sourceFormReference')
[docs]class Code(Object, _WithParameterMixin):
pass
[docs]class Cognateset(Object):
@property
def cognates(self):
return DictTuple(v for v in self.dataset.objects('CognateTable') if v.cognateset == self)
[docs]class Cognate(Object):
@property
def form(self):
return self.related('formReference')
@property
def cognateset(self):
return self.related('cognatesetReference')
[docs]class Contribution(Object):
@property
def sentences(self):
res = []
if self.dataset.module == 'TextCorpus':
# Return the list of lines, ordered by position.
for e in self.dataset.objects('ExampleTable'):
if e.cldf.contributionReference == self.id:
if not getattr(e.cldf, 'exampleReference', None):
# Not just an alternative translation line.
res.append(e)
if res and hasattr(res[0].cldf, 'position'):
return sorted(res, key=lambda e: getattr(e.cldf, 'position'))
return res
[docs]class Entry(Object, _WithLanguageMixin):
@property
def senses(self):
return DictTuple(v for v in self.dataset.objects('SenseTable') if self in v.entries)
[docs]class Example(Object, _WithLanguageMixin):
@property
def metaLanguage(self):
return self.related('metaLanguageReference')
@property
def igt(self):
return '{0}\n{1}\n{2}'.format(
self.cldf.primaryText,
tabulate([self.cldf.gloss], self.cldf.analyzedWord, tablefmt='plain'),
self.cldf.translatedText,
)
@property
def text(self):
"""
Examples in a TextCorpus are interpreted as lines of text.
"""
if self.dataset.module == 'TextCorpus' and hasattr(self.cldf, 'contributionReference'):
return self.related('contributionReference')
@property
def alternative_translations(self):
res = []
if hasattr(self.cldf, 'exampleReference'):
# There's a self-referential foreign key. We assume this to link together full examples
# and alternative translations.
for ex in self.dataset.objects('ExampleTable'):
if ex.cldf.exampleReference == self.id:
res.append(ex)
return res
[docs]class FunctionalEquivalentset(Object):
pass
[docs]class FunctionalEquivalent(Object):
@property
def form(self): # pragma: no cover
return self.related('formReference')
[docs]class Language(Object):
"""
FIXME: describe usage!
"""
@property
def lonlat(self):
"""
:return: (longitude, latitude) pair
"""
if hasattr(self.cldf, 'latitude'):
return (self.cldf.longitude, self.cldf.latitude)
@property
def as_geojson_feature(self):
if self.lonlat:
return {
"type": "Feature",
"geometry": {"type": "Point", "coordinates": list(self.lonlat)},
"properties": self.cldf,
}
@functools.cached_property
def speaker_area(self):
from pycldf.media import File
if getattr(self.cldf, 'speakerArea', None):
return File.from_dataset(self.dataset, self.related('speakerArea'))
@functools.cached_property
def speaker_area_as_geojson_feature(self):
if self.speaker_area and self.speaker_area.mimetype.subtype == 'geo+json':
res = self.speaker_area.read_json()
if res['type'] == 'FeatureCollection':
for feature in res['features']:
if feature['properties']['cldf:languageReference'] == self.id:
return feature
else:
assert res['type'] == 'Feature'
return res
@property
def values(self):
return DictTuple(v for v in self.dataset.objects('ValueTable') if self in v.languages)
@property
def forms(self):
return DictTuple(v for v in self.dataset.objects('FormTable') if self in v.languages)
[docs] def glottolog_languoid(self, glottolog_api):
"""
Get a Glottolog languoid associated with the `Language`.
:param glottolog_api: `pyglottolog.Glottolog` instance or `dict` mapping glottocodes to \
`pyglottolog.langoids.Languoid` instances.
:return: `pyglottolog.langoids.Languoid` instance or `None`.
"""
if isinstance(glottolog_api, dict):
return glottolog_api.get(self.cldf.glottocode)
return glottolog_api.languoid(self.cldf.glottocode)
class ParameterNetworkEdge(Object):
__component__ = 'ParameterNetwork'
[docs]class Parameter(Object):
@functools.cached_property
def columnSpec(self):
columnSpec = getattr(self.cldf, 'columnSpec', None)
if columnSpec:
return csvw.metadata.Column.fromvalue(columnSpec)
@functools.cached_property
def datatype(self):
if 'datatype' in self.data \
and self.dataset['ParameterTable', 'datatype'].datatype.base == 'json':
if self.data['datatype']:
return csvw.metadata.Datatype.fromvalue(self.data['datatype'])
@property
def codes(self):
return DictTuple(v for v in self.dataset.objects('CodeTable') if v.parameter == self)
@property
def values(self):
return DictTuple(v for v in self.dataset.objects('ValueTable') if self in v.parameters)
@property
def forms(self):
return DictTuple(v for v in self.dataset.objects('FormTable') if self in v.parameters)
[docs] def concepticon_conceptset(self, concepticon_api):
"""
Get a Concepticon conceptset associated with the `Parameter`.
:param concepticon_api: `pyconcepticon.Concepticon` instance or `dict` mapping conceptset \
IDs to `pyconcepticon.models.Conceptset` instances.
:return: `pyconcepticon.models.Conceptset` instance or `None`.
"""
if isinstance(concepticon_api, dict):
return concepticon_api.get(self.cldf.concepticonReference)
return concepticon_api.conceptsets.get(self.cldf.concepticonReference)
[docs]class Sense(Object):
@property
def entry(self):
return self.related('entryReference')
@property
def entries(self):
return self.all_related('entryReference')
class Tree(Object):
pass
[docs]class Value(Object, _WithLanguageMixin, _WithParameterMixin):
"""
Value objects correspond to rows in a dataset's ``ValueTable``.
While a Value's string representation is typically available from the ``value`` column,
i.e. as ``Value.cldf.value``, The interpretation of this value may be dictated by other
metadata.
- Categorical data will often describe possible values (aka "codes") using a ``CodeTable``.
In this case, the associated ``Code`` object of a ``Value`` is available as ``Value.code``.
- Typed data may use a ``columnSpec`` property in ``ParameterTable`` to specify how to read
the string value.
.. code-block:: python
>>> from csvw.metadata import Column
>>> from pycldf import StructureDataset
>>> cs = Column.fromvalue(dict(datatype=dict(base='integer', maximum=5), separator=' '))
>>> ds = StructureDataset.in_dir('.')
>>> ds.add_component('ParameterTable')
>>> ds.write(
... ParameterTable=[dict(ID='1', ColumnSpec=cs.asdict())],
... ValueTable=[dict(ID='1', Language_ID='l', Parameter_ID='1', Value='1 2 3')],
... )
>>> v = ds.objects('ValueTable')[0]
>>> v.cldf.value
'1 2 3'
>>> v.typed_value
[1, 2, 3]
"""
@property
def typed_value(self):
if self.parameter.columnSpec:
return self.parameter.columnSpec.read(self.cldf.value)
if self.parameter.datatype:
return self.parameter.datatype.read(self.cldf.value)
return self.cldf.value
@property
def code(self):
return self.related('codeReference')
@property
def examples(self):
return self.all_related('exampleReference')