Source code for pycldf.sources

import re
import types
import typing
import pathlib
import zipfile
import tempfile
import collections
from urllib.error import HTTPError
from urllib.request import urlopen, urlretrieve

from csvw.metadata import is_url
from pybtex import database
from pybtex.database.output.bibtex import Writer as BaseWriter
from clldutils.source import Source as BaseSource
from clldutils.source import ID_PATTERN

from pycldf.util import update_url

# Names exported by `from pycldf.sources import *`.
__all__ = ['Source', 'Sources', 'Reference']

# Matches a run of decimal digits which does not start with `0`.
# `Sources.expand_refs` uses it to recognize source IDs which are not in the bibliography
# but look like Glottolog reference IDs.
GLOTTOLOG_ID_PATTERN = re.compile('^[1-9][0-9]*$')


class Writer(BaseWriter):
    """
    BibTeX writer which wraps values in curly braces and whose `_encode` hook is a no-op.
    """
    def quote(self, s):
        """
        Return `s` enclosed in curly braces.

        `check_braces` is inherited and called on `s` first - presumably it rejects values
        with unbalanced braces (TODO confirm against the pybtex base class).
        """
        self.check_braces(s)
        return '{%s}' % s

    def _encode(self, text):
        """
        Return `text` unchanged.
        """
        #
        # FIXME: We overwrite a private method here!
        #
        return text


class Source(BaseSource):
    """
    A bibliographical record, describing a source for some data in a CLDF dataset.
    """
    @property
    def entry(self):
        """
        The record converted to a `pybtex.database.Entry`.

        Author and editor names are passed as persons; all other non-empty fields are passed
        as fields, sorted by field name.
        """
        roles = ('author', 'editor')

        people = collections.OrderedDict()
        for role in roles:
            people[role] = list(self.persons(self.get(role, '')))

        fields = collections.OrderedDict()
        for name, value in sorted(self.items()):
            if value and name not in roles:
                fields[name] = value

        return database.Entry(self.genre, fields=fields, persons=people)

    def __str__(self):
        return self.text()

    def __repr__(self):
        return '<%s %s>' % (self.__class__.__name__, self.id)

    @classmethod
    def from_entry(cls, key, entry, **_kw):
        """
        Create a `cls` instance from a `pybtex` entry object.

        :param key: BibTeX citation key of the entry
        :param entry: `pybtex.database.Entry` instance
        :param _kw: Non-bib-metadata keywords to be passed for `cls` instantiation
        :return: `cls` instance
        """
        # Fields of the entry take precedence over keywords passed by the caller.
        for field, value in entry.fields.items():
            _kw[field] = value
        _kw.setdefault('_check_id', False)

        # Persons are serialized back into the " and "-separated BibTeX notation.
        for role in entry.persons:
            people = entry.persons[role]
            if people:
                _kw[role] = ' and '.join('%s' % person for person in people)

        return cls(entry.type, key, **_kw)

    @staticmethod
    def persons(s):
        """
        Split a string of names into `pybtex.database.Person` objects.

        Names are separated by "&" or "and" (surrounded by whitespace). A chunk containing
        more than one comma is split at the commas, and each piece is taken as one name.

        :param s: Content of an author or editor field
        :return: Generator of `pybtex.database.Person` instances
        """
        for name in re.split(r'\s+&\s+|\s+and\s+', s.strip()):
            if not name:
                continue
            pieces = name.split(',')
            if len(pieces) <= 2:
                yield database.Person(name)
                continue
            for piece in pieces:
                yield database.Person(piece.strip())

    def refkey(self, year_brackets='round'):
        """
        Compute a short author-year label for the source.

        :param year_brackets: Kind of brackets to put around the year; one of `None`, \
        `'round'`, `'square'` or `'curly'`
        :return: Label of the form "Name (year)", "Name and Name (year)" or \
        "Name et al. (year)"
        """
        brackets = {
            None: ('', ''),
            'round': ('(', ')'),
            'square': ('[', ']'),
            'curly': ('{', '}'),
        }

        # Editors are only used when there are no authors.
        people = self.entry.persons.get('author')
        if not people:
            people = self.entry.persons.get('editor', [])

        if people:
            label = ' '.join(people[0].prelast_names + people[0].last_names)
        else:
            label = 'n.a.'

        count = len(people)
        if count == 2:
            label += ' and {}'.format(' '.join(people[1].last_names))
        elif count > 2:
            label += ' et al.'

        # Curly braces (BibTeX markup) are dropped from the names.
        label = label.replace('{', '').replace('}', '')
        opening, closing = brackets[year_brackets]
        return label + ' {}{}{}'.format(opening, self.get('year', 'n.d.'), closing)
class Reference(object):
    """
    A reference links a piece of data to a `Source`, optionally with some citation context
    such as page numbers.
    """
    def __init__(self, source: Source, desc: typing.Union[str, None]):
        # These characters are delimiters in the string representation of references.
        if desc and any(char in desc for char in ('[', ']', ';')):
            raise ValueError('invalid ref description: %s' % desc)

        self.source = source
        if isinstance(self.source, dict):
            # Expose the items of dict-like sources as attributes.
            self.fields = types.SimpleNamespace(**self.source)
        else:
            self.fields = {}
        self.description = desc

    def __str__(self):
        """
        String representation of a reference according to the CLDF specification.

        .. seealso:: https://github.com/cldf/cldf#sources
        """
        # Sources without an `id` attribute are used as they are.
        key = getattr(self.source, 'id', self.source)
        if self.description:
            key += '[%s]' % self.description
        return key

    def __repr__(self):
        return '<%s %s>' % (self.__class__.__name__, self)
class Sources(object):
    """
    A `dict` like container for all sources linked to data in a CLDF dataset.
    """
    def __init__(self):
        self._bibdata = database.BibliographyData()

    @classmethod
    def from_file(cls, fname):
        """
        Create a `Sources` object from a BibTeX file.

        :param fname: URL or local path of the BibTeX file. If a local path does not exist, \
        a file with the same name plus the suffix `.zip` in the same directory is tried.
        :return: `Sources` instance; empty, if neither local file exists.
        """
        zipped = False
        res = cls()
        if not is_url(fname):
            fname = pathlib.Path(fname)
            if not fname.exists():
                # Fall back to the zipped variant of the file.
                fname = fname.parent / '{}.zip'.format(fname.name)
                zipped = True
            if fname.exists():
                assert fname.is_file(), 'Bibfile {} must be a file!'.format(fname)
                res.read(fname, zipped=zipped)
        else:
            res.read(fname)
        return res

    def __bool__(self):
        return bool(self._bibdata.entries)

    __nonzero__ = __bool__

    def keys(self):
        """
        :return: The citation keys of all sources.
        """
        return self._bibdata.entries.keys()

    def items(self):
        """
        Iterate over all sources.

        Note that - unlike `dict.items` - this yields `Source` objects, not key-value pairs.
        """
        for key, entry in self._bibdata.entries.items():
            yield Source.from_entry(key, entry)

    def __iter__(self):
        return self.items()

    def __len__(self):
        return len(self._bibdata.entries)

    def __getitem__(self, item):
        """
        :raises ValueError: if there is no source with citation key `item`.
        """
        try:
            return Source.from_entry(item, self._bibdata.entries[item])
        except KeyError:
            raise ValueError('missing citekey: %s' % item)

    def __contains__(self, item):
        return item in self._bibdata.entries

    @staticmethod
    def format_refs(*refs):
        """
        :return: `list` of the string representations of `refs`.
        """
        return ['%s' % ref for ref in refs]

    @staticmethod
    def parse(ref: str) -> typing.Tuple[str, typing.Optional[str]]:
        """
        Parse the string representation of a reference into source ID and context.

        :return: Pair (source ID, context); the context is `None` if `ref` contains no `[`.
        :raises ValueError: if the reference does not match the expected format.
        """
        sid, pages = ref.strip(), None
        if '[' in sid:
            sid, pages = [ss.strip() for ss in sid.split('[', 1)]
            # The ID must not be empty and the context must be closed by a bracket.
            if not (sid and pages.endswith(']')):
                raise ValueError(ref)
            pages = pages[:-1].strip()
        return sid, pages

    def validate(self, refs):
        """
        Make sure all references point to known sources.

        :param refs: A single reference string or an iterable of reference strings.
        :raises ValueError: if a reference is malformed or its source ID is unknown.
        """
        for sid, _ in map(self.parse, [refs] if isinstance(refs, str) else refs):
            if sid not in self:
                raise ValueError('missing source key: {0}'.format(sid))

    def expand_refs(self, refs: typing.Iterable[str], **kw) -> typing.Iterable[Reference]:
        """
        Turn a list of string references into proper :class:`Reference` instances, looking up
        sources in `self`.

        Unknown source IDs which match `GLOTTOLOG_ID_PATTERN` are added on the fly as sources
        of type `misc` with the ID as `glottolog_id`.

        This can be used from a :class:`pycldf.Dataset` as follows:

        .. code-block:: python

            >>> for row in dataset.iter_rows('ValueTable', 'source'):
            ...     for ref in dataset.sources.expand_refs(row['source']):
            ...         print(ref.source)

        :param refs: A single reference string or an iterable of reference strings.
        :param kw: Keyword arguments passed on when adding sources on the fly.
        :raises ValueError: if a reference is malformed or its source ID is unknown.
        """
        for sid, pages in map(self.parse, [refs] if isinstance(refs, str) else refs):
            if sid not in self and GLOTTOLOG_ID_PATTERN.match(sid):
                self._add_entries(Source('misc', sid, glottolog_id=sid), **kw)
            yield Reference(self[sid], pages)

    def _add_entries(self, data, **kw):
        """
        Add the entries from `data`, skipping keys which are already present.

        :param data: `Source` or `pybtex.database.BibliographyData` instance
        :param kw: If `_check_id` is passed and true, keys must match `ID_PATTERN`.
        :raises ValueError: for other types of `data`, for invalid keys, or if the entry \
        is rejected by the bibliography.
        """
        if isinstance(data, Source):
            entries = [(data.id, data.entry)]
        elif isinstance(data, database.BibliographyData):
            entries = data.entries.items()
        else:
            raise ValueError(data)

        for key, entry in entries:
            if kw.get('_check_id', False) and not ID_PATTERN.match(key):
                raise ValueError('invalid source ID: %s' % key)
            if key not in self._bibdata.entries:
                try:
                    self._bibdata.add_entry(key, entry)
                except database.BibliographyDataError as e:  # pragma: no cover
                    raise ValueError('%s' % e) from e

    @staticmethod
    def _read_zipped(path):
        """
        Return the content of the first member of the zip archive at `path`, decoded as UTF-8.
        """
        with zipfile.ZipFile(path, 'r') as zf:
            return zf.read(zf.namelist()[0]).decode('utf8')

    def read(self, fname, zipped=False, **kw):
        """
        Read sources from a BibTeX file and add them to `self`.

        :param fname: URL or local path of the file. If a URL is answered with HTTP status \
        404, the same URL with `.zip` appended to the path is tried.
        :param zipped: Whether the local file `fname` is a zip archive containing the \
        BibTeX file as first member. Ignored for URLs.
        :param kw: Keyword arguments passed on when adding the entries, e.g. `_check_id`.
        :raises urllib.error.HTTPError: for HTTP errors other than 404 on the plain URL.
        """
        if is_url(fname):
            try:
                content = urlopen(fname).read().decode('utf-8')
            except HTTPError as e:
                if e.code != 404:
                    # Only a missing file triggers the zip fallback. Swallowing other errors
                    # would leave `content` unbound and mask the actual problem.
                    raise
                zip_url = update_url(
                    fname, lambda u: (u.scheme, u.netloc, u.path + '.zip', u.query, u.fragment))
                with tempfile.TemporaryDirectory() as tmp:
                    zfname = pathlib.Path(tmp) / 'sources.zip'
                    urlretrieve(zip_url, zfname)
                    content = self._read_zipped(zfname)
        elif zipped:
            content = self._read_zipped(fname)
        else:
            content = pathlib.Path(fname).read_text(encoding='utf-8')

        self._add_entries(database.parse_string(content, bib_format='bibtex'), **kw)

    def write(self, fname, ids=None, zipped=False, **kw):
        """
        Write the sources to a BibTeX file.

        :param fname: Path (`str` or `pathlib.Path`) of the BibTeX file.
        :param ids: If not empty, only sources with a citation key in `ids` are written.
        :param zipped: If true, the BibTeX file is moved into a zip archive named like \
        `fname` plus the suffix `.zip`, i.e. `fname` itself does not exist afterwards.
        :return: `fname` as passed in, or `None` if there were no sources to write.
        """
        if ids:
            bibdata = database.BibliographyData()
            for key, entry in self._bibdata.entries.items():
                if key in ids:
                    bibdata.add_entry(key, entry)
        else:
            bibdata = self._bibdata

        if not bibdata.entries:
            # NOTE(review): no file is written - and `None` is returned - for an empty
            # bibliography; confirm that callers rely on exactly this.
            return None

        # Normalize once, so `str` paths work for the zipped case, too.
        path = pathlib.Path(fname)
        with path.open('w', encoding='utf8') as fp:
            Writer().write_stream(bibdata, fp)
        if zipped:
            with zipfile.ZipFile(
                    path.parent / '{}.zip'.format(path.name),
                    'w',
                    compression=zipfile.ZIP_DEFLATED) as zf:
                zf.write(path, path.name)
            path.unlink()
        return fname

    def add(self, *entries: typing.Union[str, Source], **kw):
        """
        Add a source, either specified as BibTeX string or as :class:`Source`.

        :param entries: BibTeX strings or :class:`Source` instances.
        :param kw: Keyword arguments passed on when adding the entries, e.g. `_check_id`.
        """
        for entry in entries:
            if isinstance(entry, str):
                self._add_entries(database.parse_string(entry, bib_format='bibtex'), **kw)
            else:
                self._add_entries(entry, **kw)