# Source code for pymorphy2.tagset

# -*- coding: utf-8 -*-
"""
Utils for working with grammatical tags.
"""
from __future__ import absolute_import, unicode_literals
import collections

try:
    from sys import intern
except ImportError:
    # python 2.x has builtin ``intern`` function
    pass

# a bit of *heavy* magic...
class _select_grammeme_from(object):
    """
    Descriptor object for accessing grammemes of certain classes
    (e.g. number or voice).
    """
    def __init__(self, grammeme_set):
        self.grammeme_set = grammeme_set

        # ... are descriptors not magical enough?

        # In order to fight typos, raise an exception
        # if a result is compared to a grammeme which
        # is not in a set of allowed grammemes.
        _str = type("unicode string")

        class TypedGrammeme(_str):
            def __eq__(self, other):
                if other is None:
                    return False
                if other not in grammeme_set:
                    raise ValueError("'%s' is not a valid grammeme for this attribute." % other)
                return _str.__eq__(self, other)

            def __ne__(self, other):
                return not self.__eq__(other)

            def __hash__(self):
                return _str.__hash__(self)

        self.TypedGrammeme = TypedGrammeme

    def __get__(self, instance, owner):
        grammemes = self.grammeme_set & instance.grammemes

        if not grammemes:
            # XXX: type checks are not enforced in this case
            return None

        res = next(iter(grammemes))
        return self.TypedGrammeme(res) if owner.typed_grammemes else res


# Design notes: Tag objects should be immutable.
class OpencorporaTag(object):
    """
    Wrapper class for OpenCorpora.org tags.

    .. warning::

        In order to work properly, the class has to be globally
        initialized with actual grammemes (using _init_grammemes method).

        Pymorphy2 initializes it when loading a dictionary;
        it may be not a good idea to use this class directly.
        If possible, use ``morph_analyzer.TagClass`` instead.

    Example::

        >>> from pymorphy2 import MorphAnalyzer
        >>> morph = MorphAnalyzer()
        >>> Tag = morph.TagClass # get an initialized Tag class
        >>> tag = Tag('VERB,perf,tran plur,impr,excl')
        >>> tag
        OpencorporaTag('VERB,perf,tran plur,impr,excl')

    Tag instances have attributes for accessing grammemes::

        >>> print(tag.POS)
        VERB
        >>> print(tag.number)
        plur
        >>> print(tag.case)
        None

    Available attributes are: POS, animacy, aspect, case, gender, involvement,
    mood, number, person, tense, transitivity and voice.

    You may check if a grammeme is in tag or if all grammemes
    from a given set are in tag::

        >>> 'perf' in tag
        True
        >>> 'nomn' in tag
        False
        >>> 'Geox' in tag
        False
        >>> set(['VERB', 'perf']) in tag
        True
        >>> set(['VERB', 'perf', 'sing']) in tag
        False

    In order to fight typos, for unknown grammemes an exception is raised::

        >>> 'foobar' in tag
        Traceback (most recent call last):
        ...
        ValueError: Grammeme is unknown: foobar
        >>> set(['NOUN', 'foo', 'bar']) in tag
        Traceback (most recent call last):
        ...
        ValueError: Grammemes are unknown: {'bar', 'foo'}

    This also works for attributes::

        >>> tag.POS == 'plur'
        Traceback (most recent call last):
        ...
        ValueError: 'plur' is not a valid grammeme for this attribute.

    Tags compare and hash by their ordered grammemes; comparing a tag
    with an object that is not a tag is simply "not equal".

    """

    __slots__ = ['_grammemes_tuple', '_grammemes_cache', '_str']

    # Grammeme categories
    # (see http://opencorpora.org/dict.php?act=gram for a full set)
    # -------------------------------------------------------------

    PARTS_OF_SPEECH = frozenset([
        'NOUN', # noun
        'ADJF', # adjective (full form)
        'ADJS', # adjective (short form)
        'COMP', # comparative
        'VERB', # verb (finite form)
        'INFN', # verb (infinitive)
        'PRTF', # participle (full form)
        'PRTS', # participle (short form)
        'GRND', # adverbial participle (gerund)
        'NUMR', # numeral
        'ADVB', # adverb
        'NPRO', # pronoun (noun-like)
        'PRED', # predicative
        'PREP', # preposition
        'CONJ', # conjunction
        'PRCL', # particle
        'INTJ', # interjection
    ])

    ANIMACY = frozenset([
        'anim', # animate
        'inan', # inanimate
    ])

    GENDERS = frozenset([
        'masc', # masculine gender
        'femn', # feminine gender
        'neut', # neuter gender
    ])

    NUMBERS = frozenset([
        'sing', # singular
        'plur', # plural
    ])

    CASES = frozenset([
        'nomn', # nominative case
        'gent', # genitive case
        'datv', # dative case
        'accs', # accusative case
        'ablt', # instrumental case
        'loct', # prepositional case
        'voct', # vocative case
        'gen1', # first genitive case
        'gen2', # second genitive (partitive) case
        'acc2', # second accusative case
        'loc1', # first prepositional case
        'loc2', # second prepositional (locative) case
    ])

    ASPECTS = frozenset([
        'perf', # perfective aspect
        'impf', # imperfective aspect
    ])

    TRANSITIVITY = frozenset([
        'tran', # transitive
        'intr', # intransitive
    ])

    PERSONS = frozenset([
        '1per', # 1st person
        '2per', # 2nd person
        '3per', # 3rd person
    ])

    TENSES = frozenset([
        'pres', # present tense
        'past', # past tense
        'futr', # future tense
    ])

    MOODS = frozenset([
        'indc', # indicative mood
        'impr', # imperative mood
    ])

    VOICES = frozenset([
        'actv', # active voice
        'pssv', # passive voice
    ])

    INVOLVEMENT = frozenset([
        'incl', # the speaker is included in the action
        'excl', # the speaker is not included in the action
    ])

    # Set this to False (as a class attribute) to disable strict
    # grammeme type checking for tag.POS, tag.voice, etc. attributes.
    # Without type checks comparisons are about 2x faster.
    typed_grammemes = True

    # Tag format identifier
    # (compatible with https://github.com/kmike/russian-tagsets)
    # ----------------------------------------------------------
    FORMAT = 'opencorpora-int'


    # Helper attributes for inflection/declension routines
    # ----------------------------------------------------
    _NON_PRODUCTIVE_CLASSES = frozenset(['NUMR', 'NPRO', 'PRED', 'PREP',
                                   'CONJ', 'PRCL', 'INTJ'])
    _EXTRA_INCOMPATIBLE = { # XXX: is it a good idea to have these rules?
        'plur': set(['GNdr']),
        # XXX: how to use rules from OpenCorpora?
        # (they have "lexeme/form" separation)
    }
    # Placeholders; _init_grammemes rebinds all three on the class
    # it is called on.
    _GRAMMEME_INDICES = collections.defaultdict(lambda: 0)
    _GRAMMEME_INCOMPATIBLE = collections.defaultdict(set)
    _KNOWN_GRAMMEMES = None

    def __init__(self, tag):
        """
        Create a tag from its string form, e.g.
        ``'VERB,perf,tran plur,impr,excl'``.
        """
        self._str = tag

        # XXX: we lose information about which grammemes
        # belong to lexeme and which belong to form
        # (but this information seems useless for pymorphy2).

        # Hacks for better memory usage:
        # - store grammemes in a tuple and build a set only when needed;
        # - use byte strings for grammemes under Python 2.x;
        # - grammemes are interned.

        # Only the first space is turned into a comma before splitting.
        grammemes = tag.replace(' ', ',', 1).split(',')
        self._grammemes_tuple = tuple([intern(str(g)) for g in grammemes])
        self._grammemes_cache = None

    @property
    def grammemes(self):
        """ A frozenset with grammemes for this tag. """
        # Built lazily on first access and cached afterwards.
        if self._grammemes_cache is None:
            self._grammemes_cache = frozenset(self._grammemes_tuple)
        return self._grammemes_cache

    # attributes for grammeme categories
    POS = _select_grammeme_from(PARTS_OF_SPEECH)
    animacy = _select_grammeme_from(ANIMACY)
    aspect = _select_grammeme_from(ASPECTS)
    case = _select_grammeme_from(CASES)
    gender = _select_grammeme_from(GENDERS)
    involvement = _select_grammeme_from(INVOLVEMENT)
    mood = _select_grammeme_from(MOODS)
    number = _select_grammeme_from(NUMBERS)
    person = _select_grammeme_from(PERSONS)
    tense = _select_grammeme_from(TENSES)
    transitivity = _select_grammeme_from(TRANSITIVITY)
    voice = _select_grammeme_from(VOICES)

    def __contains__(self, grammeme):
        """
        Check whether a grammeme, or every grammeme of a set/frozenset,
        is in this tag.

        Raises ``ValueError`` when the answer is negative and a grammeme
        is not known to the class, and ``RuntimeError`` when the answer
        is negative and the class has not been initialized.
        """

        # {'NOUN', 'sing'} in tag
        if isinstance(grammeme, (set, frozenset)):
            if grammeme <= self.grammemes:
                return True

            # grammeme_is_known also reports an uninitialized class with
            # RuntimeError, the same way the single-grammeme branch does.
            unknown = [g for g in grammeme if not self.grammeme_is_known(g)]
            if unknown:
                unknown_repr = ", ".join(["'%s'" % g for g in sorted(unknown)])
                raise ValueError("Grammemes are unknown: {%s}" % unknown_repr)
            return False

        # 'NOUN' in tag
        if grammeme in self.grammemes:
            return True

        if not self.grammeme_is_known(grammeme):
            raise ValueError("Grammeme is unknown: %s" % grammeme)
        return False

    # FIXME: __repr__ and __str__ always return unicode,
    # but they should return a byte string under Python 2.x.
    def __str__(self):
        return self._str

    def __repr__(self):
        return "OpencorporaTag('%s')" % self


    # Comparisons work on the ordered grammeme tuple. For operands that
    # are not tags NotImplemented is returned so that Python can apply
    # its regular fallback instead of failing with AttributeError.
    def __eq__(self, other):
        if not isinstance(other, OpencorporaTag):
            return NotImplemented
        return self._grammemes_tuple == other._grammemes_tuple

    def __ne__(self, other):
        if not isinstance(other, OpencorporaTag):
            return NotImplemented
        return self._grammemes_tuple != other._grammemes_tuple

    def __lt__(self, other):
        if not isinstance(other, OpencorporaTag):
            return NotImplemented
        return self._grammemes_tuple < other._grammemes_tuple

    def __gt__(self, other):
        if not isinstance(other, OpencorporaTag):
            return NotImplemented
        return self._grammemes_tuple > other._grammemes_tuple

    def __hash__(self):
        return hash(self._grammemes_tuple)


    def is_productive(self):
        """
        Return True unless the first grammeme of the tag is one of
        ``_NON_PRODUCTIVE_CLASSES``.
        """
        # We use `self._grammemes_tuple[0]` instead of `self.POS` here because
        # it is faster and this method is heavily used by MorphAnalyzer.
        return self._grammemes_tuple[0] not in self._NON_PRODUCTIVE_CLASSES

    @classmethod
    def grammeme_is_known(cls, grammeme):
        """
        Return True if ``grammeme`` was registered by ``_init_grammemes``.

        Raises ``RuntimeError`` if no grammemes are registered
        for the class.
        """
        if not cls._KNOWN_GRAMMEMES:
            msg = "The class was not properly initialized."
            raise RuntimeError(msg)
        return grammeme in cls._KNOWN_GRAMMEMES

    def updated_grammemes(self, required):
        """
        Return a new set of grammemes with ``required`` grammemes added
        and incompatible grammemes removed.

        Raises ``ValueError`` if one of ``required`` grammemes is unknown.
        """
        new_grammemes = self.grammemes | required
        for grammeme in required:
            if not self.grammeme_is_known(grammeme):
                raise ValueError("Unknown grammeme: %s" % grammeme)
            new_grammemes -= self._GRAMMEME_INCOMPATIBLE[grammeme]
        return new_grammemes

    @classmethod
    def _init_grammemes(cls, dict_grammemes):
        """
        Initialize various class attributes with grammeme
        information obtained from XML dictionary.

        ``dict_grammemes`` is a list of tuples::

            [
                (name, parent, alias, description),
                ...
            ]

        ``_KNOWN_GRAMMEMES``, ``_GRAMMEME_INDICES`` and
        ``_GRAMMEME_INCOMPATIBLE`` are rebuilt from scratch and bound to
        ``cls``, so initializing a subclass does not alter the data of its
        base class and a repeated initialization leaves no stale entries.
        """
        gr = dict((name, parent) for (name, parent, alias, description) in dict_grammemes)
        cls._KNOWN_GRAMMEMES = frozenset(gr.keys())

        # figure out parents & children
        children = collections.defaultdict(set)
        for name, parent, alias, description in dict_grammemes:
            if parent:
                children[parent].add(name)
            if gr.get(parent, None): # parent's parent
                children[gr[parent]].add(name)

        # expand EXTRA_INCOMPATIBLE
        for grammeme, g_set in cls._EXTRA_INCOMPATIBLE.items():
            for g in g_set.copy():
                g_set.update(children[g])

        # fill GRAMMEME_INDICES and GRAMMEME_INCOMPATIBLE
        indices = collections.defaultdict(lambda: 0)
        incompatible_map = collections.defaultdict(set)
        for index, (name, parent, alias, description) in enumerate(dict_grammemes):
            indices[name] = index
            # A grammeme is incompatible with the extra rules for it and
            # with the other children of its parent, but never with itself.
            incompatible = cls._EXTRA_INCOMPATIBLE.get(name, set())
            incompatible = (incompatible | children[parent]) - set([name])

            incompatible_map[name] = frozenset(incompatible)

        cls._GRAMMEME_INDICES = indices
        cls._GRAMMEME_INCOMPATIBLE = incompatible_map

    # XXX: do we still need these methods?
    @classmethod
    def _from_internal_tag(cls, tag):
        """ Return tag string given internal tag string """
        return tag

    @classmethod
    def _from_internal_grammeme(cls, grammeme):
        """ Return grammeme given internal grammeme """
        return grammeme


class CyrillicOpencorporaTag(OpencorporaTag):
    """
    Tag class that uses Cyrillic tag names.

    .. warning::

        This class is experimental and incomplete, do not use
        it because it may be removed in future!
    """

    FORMAT = 'opencorpora-ext'

    # internal grammeme name -> alias; filled by _init_alias_map
    _GRAMMEME_ALIAS_MAP = dict()

    @classmethod
    def _from_internal_tag(cls, tag):
        """
        Return tag string given internal tag string.

        Grammemes are translated one by one; the positions of spaces and
        commas are preserved. Only whole grammemes are translated, so
        a name that is a part of a longer grammeme is never touched.
        """
        translate = cls._from_internal_grammeme
        return ' '.join([
            ','.join([translate(g) for g in part.split(',')])
            for part in tag.split(' ')
        ])

    @classmethod
    def _from_internal_grammeme(cls, grammeme):
        """
        Return the alias of an internal grammeme; grammemes without
        an alias (unknown, or with an empty alias) are returned unchanged.
        """
        return cls._GRAMMEME_ALIAS_MAP.get(grammeme) or grammeme

    @classmethod
    def _init_grammemes(cls, dict_grammemes):
        """
        Initialize various class attributes with grammeme
        information obtained from XML dictionary.

        After the regular initialization the keys and values of
        ``_GRAMMEME_INDICES``, ``_GRAMMEME_INCOMPATIBLE`` and
        ``_NON_PRODUCTIVE_CLASSES`` are translated to aliases.
        """
        cls._init_alias_map(dict_grammemes)
        super(CyrillicOpencorporaTag, cls)._init_grammemes(dict_grammemes)

        GRAMMEME_INDICES = collections.defaultdict(lambda: 0)
        for name, idx in cls._GRAMMEME_INDICES.items():
            GRAMMEME_INDICES[cls._from_internal_grammeme(name)] = idx
        cls._GRAMMEME_INDICES = GRAMMEME_INDICES

        GRAMMEME_INCOMPATIBLE = collections.defaultdict(set)
        for name, value in cls._GRAMMEME_INCOMPATIBLE.items():
            GRAMMEME_INCOMPATIBLE[cls._from_internal_grammeme(name)] = set([
                cls._from_internal_grammeme(gr) for gr in value
            ])
        cls._GRAMMEME_INCOMPATIBLE = GRAMMEME_INCOMPATIBLE

        cls._NON_PRODUCTIVE_CLASSES = set([
            cls._from_internal_grammeme(gr) for gr in cls._NON_PRODUCTIVE_CLASSES
        ])

    @classmethod
    def _init_alias_map(cls, dict_grammemes):
        """
        Remember the alias of every grammeme from ``dict_grammemes``
        (a list of ``(name, parent, alias, description)`` tuples).
        """
        for name, parent, alias, description in dict_grammemes:
            cls._GRAMMEME_ALIAS_MAP[name] = alias


# Tag classes available in this module, keyed by their FORMAT identifier.
registry = {}

for tag_type in (CyrillicOpencorporaTag, OpencorporaTag):
    registry[tag_type.FORMAT] = tag_type
# Source code for pymorphy2.tagset
#
# Project Versions
#
# На этой странице
#
# Просмотр
#
# Source code for pymorphy2.tagset
#
# Project Versions
#
# RTD Search
#
# На этой странице
#
# Быстрый поиск
#
# Просмотр