Source code for pymorphy2.tagset

# -*- coding: utf-8 -*-
"""
Utils for working with grammatical tags.
"""
from __future__ import absolute_import, unicode_literals
import collections

try:
    from sys import intern
except ImportError:
    # python 2.x has builtin ``intern`` function
    pass

# a bit of *heavy* magic...
class _select_grammeme_from(object):
    """
    Descriptor object for accessing grammemes of certain classes
    (e.g. number or voice).
    """
    def __init__(self, grammeme_set):
        self.grammeme_set = grammeme_set

        # ... are descriptors not magical enough?

        # In order to fight typos, raise an exception
        # if a result is compared to a grammeme which
        # is not in a set of allowed grammemes.
        _str = type("unicode string")

        class TypedGrammeme(_str):
            def __eq__(self, other):
                if other is None:
                    return False
                if other not in grammeme_set:
                    raise ValueError("'%s' is not a valid grammeme for this attribute." % other)
                return _str.__eq__(self, other)

            def __ne__(self, other):
                return not self.__eq__(other)

            def __hash__(self):
                return _str.__hash__(self)

        self.TypedGrammeme = TypedGrammeme

    def __get__(self, instance, owner):
        grammemes = self.grammeme_set & instance.grammemes

        if not grammemes:
            # XXX: type checks are not enforced in this case
            return None

        res = next(iter(grammemes))
        return self.TypedGrammeme(res) if owner.typed_grammemes else res


# Design notes: Tag objects should be immutable.
[docs]class OpencorporaTag(object): """ Wrapper class for OpenCorpora.org tags. .. warning:: In order to work properly, the class has to be globally initialized with actual grammemes (using _init_grammemes method). Pymorphy2 initializes it when loading a dictionary; it may be not a good idea to use this class directly. If possible, use ``morph_analyzer.TagClass`` instead. Example:: >>> from pymorphy2 import MorphAnalyzer >>> morph = MorphAnalyzer() >>> Tag = morph.TagClass # get an initialzed Tag class >>> tag = Tag('VERB,perf,tran plur,impr,excl') >>> tag OpencorporaTag('VERB,perf,tran plur,impr,excl') Tag instances have attributes for accessing grammemes:: >>> print(tag.POS) VERB >>> print(tag.number) plur >>> print(tag.case) None Available attributes are: POS, animacy, aspect, case, gender, involvement, mood, number, person, tense, transitivity and voice. You may check if a grammeme is in tag or if all grammemes from a given set are in tag:: >>> 'perf' in tag True >>> 'nomn' in tag False >>> 'Geox' in tag False >>> set(['VERB', 'perf']) in tag True >>> set(['VERB', 'perf', 'sing']) in tag False In order to fight typos, for unknown grammemes an exception is raised:: >>> 'foobar' in tag Traceback (most recent call last): ... ValueError: Grammeme is unknown: foobar >>> set(['NOUN', 'foo', 'bar']) in tag Traceback (most recent call last): ... ValueError: Grammemes are unknown: {'bar', 'foo'} This also works for attributes:: >>> tag.POS == 'plur' Traceback (most recent call last): ... ValueError: 'plur' is not a valid grammeme for this attribute. """ __slots__ = ['_grammemes_tuple', '_grammemes_cache', '_str'] # Grammeme categories # (see http://opencorpora.org/dict.php?act=gram for a full set) # ------------------------------------------------------------- PARTS_OF_SPEECH = frozenset([ 'NOUN', # имя существительное 'ADJF', # имя прилагательное (полное) 'ADJS', # имя прилагательное (краткое) 'COMP', # компаратив 'VERB', # глагол (личная форма) 'INFN', # глагол (инфинитив) 'PRTF', # причастие (полное) 'PRTS', # причастие (краткое) 'GRND', # деепричастие 'NUMR', # числительное 'ADVB', # наречие 'NPRO', # местоимение-существительное 'PRED', # предикатив 'PREP', # предлог 'CONJ', # союз 'PRCL', # частица 'INTJ', # междометие ]) ANIMACY = frozenset([ 'anim', # одушевлённое 'inan', # неодушевлённое ]) GENDERS = frozenset([ 'masc', # мужской род 'femn', # женский род 'neut', # средний род ]) NUMBERS = frozenset([ 'sing', # единственное число 'plur', # множественное число ]) CASES = frozenset([ 'nomn', # именительный падеж 'gent', # родительный падеж 'datv', # дательный падеж 'accs', # винительный падеж 'ablt', # творительный падеж 'loct', # предложный падеж 'voct', # звательный падеж 'gen1', # первый родительный падеж 'gen2', # второй родительный (частичный) падеж 'acc2', # второй винительный падеж 'loc1', # первый предложный падеж 'loc2', # второй предложный (местный) падеж ]) ASPECTS = frozenset([ 'perf', # совершенный вид 'impf', # несовершенный вид ]) TRANSITIVITY = frozenset([ 'tran', # переходный 'intr', # непереходный ]) PERSONS = frozenset([ '1per', # 1 лицо '2per', # 2 лицо '3per', # 3 лицо ]) TENSES = frozenset([ 'pres', # настоящее время 'past', # прошедшее время 'futr', # будущее время ]) MOODS = frozenset([ 'indc', # изъявительное наклонение 'impr', # повелительное наклонение ]) VOICES = frozenset([ 'actv', # действительный залог 'pssv', # страдательный залог ]) INVOLVEMENT = frozenset([ 'incl', # говорящий включён в действие 'excl', # говорящий не включён в действие ]) # Set this to False (as a class attribute) to disable strict # grammeme type checking for tag.POS, tag.voice, etc. attributes. # Without type checks comparisons are about 2x faster. typed_grammemes = True # Tag format identifier # (compatible with https://github.com/kmike/russian-tagsets) # ---------------------------------------------------------- FORMAT = 'opencorpora-int' # Helper attributes for inflection/declension routines # ---------------------------------------------------- _NON_PRODUCTIVE_CLASSES = frozenset(['NUMR', 'NPRO', 'PRED', 'PREP', 'CONJ', 'PRCL', 'INTJ']) _EXTRA_INCOMPATIBLE = { # XXX: is it a good idea to have these rules? 'plur': set(['GNdr']), # XXX: how to use rules from OpenCorpora? # (they have "lexeme/form" separation) } _GRAMMEME_INDICES = collections.defaultdict(lambda: 0) _GRAMMEME_INCOMPATIBLE = collections.defaultdict(set) _KNOWN_GRAMMEMES = None def __init__(self, tag): self._str = tag # XXX: we loose information about which grammemes # belongs to lexeme and which belongs to form # (but this information seems useless for pymorphy2). # Hacks for better memory usage: # - store grammemes in a tuple and build a set only when needed; # - use byte strings for grammemes under Python 2.x; # - grammemes are interned. grammemes = tag.replace(' ', ',', 1).split(',') self._grammemes_tuple = tuple([intern(str(g)) for g in grammemes]) self._grammemes_cache = None @property
[docs] def grammemes(self): """ A frozenset with grammemes for this tag. """ if self._grammemes_cache is None: self._grammemes_cache = frozenset(self._grammemes_tuple) return self._grammemes_cache # attributes for grammeme categories
POS = _select_grammeme_from(PARTS_OF_SPEECH) animacy = _select_grammeme_from(ANIMACY) aspect = _select_grammeme_from(ASPECTS) case = _select_grammeme_from(CASES) gender = _select_grammeme_from(GENDERS) involvement = _select_grammeme_from(INVOLVEMENT) mood = _select_grammeme_from(MOODS) number = _select_grammeme_from(NUMBERS) person = _select_grammeme_from(PERSONS) tense = _select_grammeme_from(TENSES) transitivity = _select_grammeme_from(TRANSITIVITY) voice = _select_grammeme_from(VOICES) def __contains__(self, grammeme): # {'NOUN', 'sing'} in tag if isinstance(grammeme, (set, frozenset)): if grammeme <= self.grammemes: return True if not grammeme <= self._KNOWN_GRAMMEMES: unknown = grammeme - self._KNOWN_GRAMMEMES unknown_repr = ", ".join(["'%s'" % g for g in sorted(unknown)]) raise ValueError("Grammemes are unknown: {%s}" % unknown_repr) return False # 'NOUN' in tag if grammeme in self.grammemes: return True else: if not self.grammeme_is_known(grammeme): raise ValueError("Grammeme is unknown: %s" % grammeme) return False # FIXME: __repr__ and __str__ always return unicode, # but they should return a byte string under Python 2.x. def __str__(self): return self._str def __repr__(self): return "OpencorporaTag('%s')" % self def __eq__(self, other): return self._grammemes_tuple == other._grammemes_tuple def __ne__(self, other): return self._grammemes_tuple != other._grammemes_tuple def __lt__(self, other): return self._grammemes_tuple < other._grammemes_tuple def __gt__(self, other): return self._grammemes_tuple > other._grammemes_tuple def __hash__(self): return hash(self._grammemes_tuple) def is_productive(self): # We use `self._grammemes_tuple[0]` instead of `self.POS` here because # it is faster and this method is heavily used by MorphAnalyzer. return not self._grammemes_tuple[0] in self._NON_PRODUCTIVE_CLASSES @classmethod def grammeme_is_known(cls, grammeme): if not cls._KNOWN_GRAMMEMES: msg = "The class was not properly initialized." raise RuntimeError(msg) return grammeme in cls._KNOWN_GRAMMEMES
[docs] def updated_grammemes(self, required): """ Return a new set of grammemes with ``required`` grammemes added and incompatible grammemes removed. """ new_grammemes = self.grammemes | required for grammeme in required: if not self.grammeme_is_known(grammeme): raise ValueError("Unknown grammeme: %s" % grammeme) new_grammemes -= self._GRAMMEME_INCOMPATIBLE[grammeme] return new_grammemes
@classmethod def _init_grammemes(cls, dict_grammemes): """ Initialize various class attributes with grammeme information obtained from XML dictionary. ``dict_grammemes`` is a list of tuples:: [ (name, parent, alias, description), ... ] """ gr = dict((name, parent) for (name, parent, alias, description) in dict_grammemes) cls._KNOWN_GRAMMEMES = frozenset(gr.keys()) # figure out parents & children children = collections.defaultdict(set) for index, (name, parent, alias, description) in enumerate(dict_grammemes): if parent: children[parent].add(name) if gr.get(parent, None): # parent's parent children[gr[parent]].add(name) # expand EXTRA_INCOMPATIBLE for grammeme, g_set in cls._EXTRA_INCOMPATIBLE.items(): for g in g_set.copy(): g_set.update(children[g]) # fill GRAMMEME_INDICES and GRAMMEME_INCOMPATIBLE for index, (name, parent, alias, description) in enumerate(dict_grammemes): cls._GRAMMEME_INDICES[name] = index incompatible = cls._EXTRA_INCOMPATIBLE.get(name, set()) incompatible = (incompatible | children[parent]) - set([name]) cls._GRAMMEME_INCOMPATIBLE[name] = frozenset(incompatible) # XXX: do we still need these methods? @classmethod def _from_internal_tag(cls, tag): """ Return tag string given internal tag string """ return tag @classmethod def _from_internal_grammeme(cls, grammeme): return grammeme
class CyrillicOpencorporaTag(OpencorporaTag): """ Tag class that uses Cyrillic tag names. .. warning:: This class is experimental and incomplete, do not use it because it may be removed in future! """ FORMAT = 'opencorpora-ext' _GRAMMEME_ALIAS_MAP = dict() @classmethod def _from_internal_tag(cls, tag): for name, alias in cls._GRAMMEME_ALIAS_MAP.items(): if alias: tag = tag.replace(name, alias) return tag @classmethod def _from_internal_grammeme(cls, grammeme): return cls._GRAMMEME_ALIAS_MAP.get(grammeme, grammeme) @classmethod def _init_grammemes(cls, dict_grammemes): """ Initialize various class attributes with grammeme information obtained from XML dictionary. """ cls._init_alias_map(dict_grammemes) super(CyrillicOpencorporaTag, cls)._init_grammemes(dict_grammemes) GRAMMEME_INDICES = collections.defaultdict(lambda: 0) for name, idx in cls._GRAMMEME_INDICES.items(): GRAMMEME_INDICES[cls._from_internal_grammeme(name)] = idx cls._GRAMMEME_INDICES = GRAMMEME_INDICES GRAMMEME_INCOMPATIBLE = collections.defaultdict(set) for name, value in cls._GRAMMEME_INCOMPATIBLE.items(): GRAMMEME_INCOMPATIBLE[cls._from_internal_grammeme(name)] = set([ cls._from_internal_grammeme(gr) for gr in value ]) cls._GRAMMEME_INCOMPATIBLE = GRAMMEME_INCOMPATIBLE cls._NON_PRODUCTIVE_CLASSES = set([ cls._from_internal_grammeme(gr) for gr in cls._NON_PRODUCTIVE_CLASSES ]) @classmethod def _init_alias_map(cls, dict_grammemes): for name, parent, alias, description in dict_grammemes: cls._GRAMMEME_ALIAS_MAP[name] = alias registry = dict() for tag_type in [CyrillicOpencorporaTag, OpencorporaTag]: registry[tag_type.FORMAT] = tag_type

Project Versions

На этой странице