Source code for pymorphy2.analyzer

# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals, division
import os
import heapq
import collections
import logging

from pymorphy2 import opencorpora_dict
from pymorphy2 import units

logger = logging.getLogger(__name__)

_Parse = collections.namedtuple('Parse', 'word, tag, normal_form, estimate, methods_stack')

[docs]class Parse(_Parse):
    """
    Parse result wrapper.
    """

    _morph = None
    """ :type _morph: MorphAnalyzer """

    _dict = None
    """ :type _dict: pymorphy2.opencorpora_dict.Dictionary """

[docs]    def inflect(self, required_grammemes):
        res = self._morph._inflect(self, required_grammemes)
        return None if not res else res[0]

    @property
[docs]    def lexeme(self):
        """ A lexeme this form belongs to. """
        return self._morph.get_lexeme(self)

    @property
[docs]    def is_known(self):
        """ True if this form is a known dictionary form. """
        # return self.estimate == 1?
        return self._dict.word_is_known(self.word, strict_ee=True)

    @property
[docs]    def normalized(self):
        """ A :class:`Parse` instance for :attr:`self.normal_form`. """
        last_method = self.methods_stack[-1]
        return self.__class__(*last_method[0].normalized(self))

    # @property
    # def paradigm(self):
    #     return self._dict.build_paradigm_info(self.para_id)



[docs]class MorphAnalyzer(object):
    """
    Morphological analyzer for Russian language.

    For a given word it can find all possible inflectional paradigms
    and thus compute all possible tags and normal forms.

    Analyzer uses morphological word features and a lexicon
    (dictionary compiled from XML available at OpenCorpora.org);
    for unknown words heuristic algorithm is used.

    Create a :class:`MorphAnalyzer` object::

        >>> import pymorphy2
        >>> morph = pymorphy2.MorphAnalyzer()

    MorphAnalyzer uses dictionaries from ``pymorphy2-dicts`` package
    (which can be installed via ``pip install pymorphy2-dicts``).

    Alternatively (e.g. if you have your own precompiled dictionaries),
    either create ``PYMORPHY2_DICT_PATH`` environment variable
    with a path to dictionaries, or pass ``path`` argument
    to :class:`pymorphy2.MorphAnalyzer` constructor::

        >>> morph = pymorphy2.MorphAnalyzer('/path/to/dictionaries') # doctest: +SKIP

    By default, methods of this class return parsing results
    as namedtuples :class:`Parse`. This has performance implications
    under CPython, so if you need maximum speed then pass
    ``result_type=None`` to make analyzer return plain unwrapped tuples::

        >>> morph = pymorphy2.MorphAnalyzer(result_type=None)

    """

    ENV_VARIABLE = 'PYMORPHY2_DICT_PATH'
    DEFAULT_UNITS = [
        units.DictionaryAnalyzer,

        units.NumberAnalyzer,
        units.PunctuationAnalyzer,
        units.LatinAnalyzer,

        units.HyphenSeparatedParticleAnalyzer,
        units.HyphenAdverbAnalyzer,
        units.HyphenatedWordsAnalyzer,
        units.KnownPrefixAnalyzer,
        units.UnknownPrefixAnalyzer,
        units.KnownSuffixAnalyzer,
    ]

    def __init__(self, path=None, result_type=Parse, units=None):

        path = self.choose_dictionary_path(path)
        self.dictionary = opencorpora_dict.Dictionary(path)

        if result_type is not None:
            # create a subclass with the same name,
            # but with _morph attribute bound to self
            res_type = type(
                result_type.__name__,
                (result_type,),
                {'_morph': self, '_dict': self.dictionary}
            )
            self._result_type = res_type
        else:
            self._result_type = None

        # initialize units
        if units is None:
            units = self.DEFAULT_UNITS

        self._units = [cls(self) for cls in units]


    @classmethod
[docs]    def choose_dictionary_path(cls, path=None):
        if path is not None:
            return path

        if cls.ENV_VARIABLE in os.environ:
            return os.environ[cls.ENV_VARIABLE]

        try:
            import pymorphy2_dicts
            return pymorphy2_dicts.get_path()
        except ImportError:
            msg = ("Can't find dictionaries. "
                   "Please either pass a path to dictionaries, "
                   "or install 'pymorphy2-dicts' package, "
                   "or set %s environment variable.") % cls.ENV_VARIABLE
            raise ValueError(msg)


[docs]    def parse(self, word):
        """
        Analyze the word and return a list of :class:`pymorphy2.analyzer.Parse`
        namedtuples:

            Parse(word, tag, normal_form, para_id, idx, _estimate)

        (or plain tuples if ``result_type=None`` was used in constructor).
        """
        res = []
        seen = set()
        word_lower = word.lower()

        for analyzer in self._units:
            res.extend(analyzer.parse(word, word_lower, seen))

            if res and analyzer.terminal:
                break

        if self._result_type is None:
            return res

        return [self._result_type(*p) for p in res]


[docs]    def tag(self, word):
        res = []
        seen = set()
        word_lower = word.lower()

        for analyzer in self._units:
            res.extend(analyzer.tag(word, word_lower, seen))

            if res and analyzer.terminal:
                break

        return res


[docs]    def normal_forms(self, word):
        """
        Return a list of word normal forms.
        """
        seen = set()
        result = []

        for p in self.parse(word):
            normal_form = p[2]
            if normal_form not in seen:
                result.append(normal_form)
                seen.add(normal_form)
        return result

    # ==== inflection ========

[docs]    def get_lexeme(self, form):
        """
        Return the lexeme this parse belongs to.
        """
        methods_stack = form[4]
        last_method = methods_stack[-1]
        result = last_method[0].get_lexeme(form)

        if self._result_type is None:
            return result
        return [self._result_type(*p) for p in result]


    def _inflect(self, form, required_grammemes):
        grammemes = form[1].updated_grammemes(required_grammemes)

        possible_results = [f for f in self.get_lexeme(form)
                            if required_grammemes <= f[1].grammemes]

        def similarity(frm):
            tag = frm[1]
            return len(grammemes & tag.grammemes)

        return heapq.nlargest(1, possible_results, key=similarity)


    # ====== misc =========

[docs]    def iter_known_word_parses(self, prefix=""):
        """
        Return an iterator over parses of dictionary words that starts
        with a given prefix (default empty prefix means "all words").
        """

        # XXX: this method currently assumes that
        # units.DictionaryAnalyzer is the first analyzer unit.
        for word, tag, normal_form, para_id, idx in self.dictionary.iter_known_words(prefix):
            methods = ((self._units[0], word, para_id, idx),)
            parse = (word, tag, normal_form, 1.0, methods)
            if self._result_type is None:
                yield parse
            else:
                yield self._result_type(*parse)


[docs]    def word_is_known(self, word, strict_ee=False):
        """
        Check if a ``word`` is in the dictionary.
        Pass ``strict_ee=True`` if ``word`` is guaranteed to
        have correct е/ё letters.

        .. note::

            Dictionary words are not always correct words;
            the dictionary also contains incorrect forms which
            are commonly used. So for spellchecking tasks this
            method should be used with extra care.

        """
        return self.dictionary.word_is_known(word.lower(), strict_ee)


    @property
[docs]    def TagClass(self):
        return self.dictionary.Tag
Source code for pymorphy2.analyzer

Project Versions

На этой странице

Просмотр

Source code for pymorphy2.analyzer

Project Versions

RTD Search

На этой странице

Быстрый поиск

Просмотр