# Source code for pymorphy2.analyzer

# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals, division
import os
import heapq
import collections
import logging

from pymorphy2 import opencorpora_dict
from pymorphy2 import units

logger = logging.getLogger(__name__)

_Parse = collections.namedtuple('Parse', 'word, tag, normal_form, estimate, methods_stack')


class Parse(_Parse):
    """
    Parse result wrapper.

    A namedtuple with the fields ``word``, ``tag``, ``normal_form``,
    ``estimate`` and ``methods_stack``, extended with convenience
    accessors that delegate to the analyzer (``_morph``) and the
    dictionary (``_dict``) bound to the class.

    On this base class both ``_morph`` and ``_dict`` are ``None``;
    :class:`MorphAnalyzer` creates a same-named subclass with these
    attributes set, so the accessors below only work on instances of
    such a bound subclass.
    """

    #: :type _morph: MorphAnalyzer
    _morph = None

    #: :type _dict: pymorphy2.opencorpora_dict.Dictionary
    _dict = None

    def inflect(self, required_grammemes):
        """
        Return the form of this word's lexeme selected by the analyzer
        for ``required_grammemes``, or ``None`` if the analyzer
        finds no suitable form.
        """
        res = self._morph._inflect(self, required_grammemes)
        return None if not res else res[0]

    @property
    def lexeme(self):
        """ A lexeme this form belongs to. """
        return self._morph.get_lexeme(self)

    @property
    def is_known(self):
        """ True if this form is a known dictionary form. """
        # return self.estimate == 1?
        return self._dict.word_is_known(self.word, strict_ee=True)

    @property
    def normalized(self):
        """ A :class:`Parse` instance for :attr:`self.normal_form`. """
        # The first item of the last methods_stack entry is the analyzer
        # unit that produced this parse; it knows how to normalize it.
        last_method = self.methods_stack[-1]
        return self.__class__(*last_method[0].normalized(self))
class MorphAnalyzer(object):
    """
    Morphological analyzer for Russian language.

    For a given word it can find all possible inflectional paradigms
    and thus compute all possible tags and normal forms.

    Analyzer uses morphological word features and a lexicon
    (dictionary compiled from XML available at OpenCorpora.org);
    for unknown words a heuristic algorithm is used.

    Create a :class:`MorphAnalyzer` object::

        >>> import pymorphy2
        >>> morph = pymorphy2.MorphAnalyzer()

    MorphAnalyzer uses dictionaries from ``pymorphy2-dicts`` package
    (which can be installed via ``pip install pymorphy2-dicts``).

    Alternatively (e.g. if you have your own precompiled dictionaries),
    either create ``PYMORPHY2_DICT_PATH`` environment variable
    with a path to dictionaries, or pass ``path`` argument
    to :class:`pymorphy2.MorphAnalyzer` constructor::

        >>> morph = pymorphy2.MorphAnalyzer('/path/to/dictionaries') # doctest: +SKIP

    By default, methods of this class return parsing results
    as namedtuples :class:`Parse`. This has performance implications
    under CPython, so if you need maximum speed then pass
    ``result_type=None`` to make analyzer return plain unwrapped tuples::

        >>> morph = pymorphy2.MorphAnalyzer(result_type=None)

    """

    # Name of the environment variable consulted for the dictionary path.
    ENV_VARIABLE = 'PYMORPHY2_DICT_PATH'

    # Analyzer unit classes tried, in this order, by ``parse`` and ``tag``.
    DEFAULT_UNITS = [
        units.DictionaryAnalyzer,

        units.NumberAnalyzer,
        units.PunctuationAnalyzer,
        units.LatinAnalyzer,

        units.HyphenSeparatedParticleAnalyzer,
        units.HyphenAdverbAnalyzer,
        units.HyphenatedWordsAnalyzer,
        units.KnownPrefixAnalyzer,
        units.UnknownPrefixAnalyzer,
        units.KnownSuffixAnalyzer,
    ]

    def __init__(self, path=None, result_type=Parse, units=None):
        """
        Load the dictionary and instantiate the analyzer units.

        :param path: path to the compiled dictionaries; when ``None`` it is
            resolved by :meth:`choose_dictionary_path`.
        :param result_type: class used to wrap parse tuples, or ``None``
            to return plain tuples.
        :param units: iterable of analyzer unit classes, each called with
            this analyzer as its only argument; ``None`` means
            :attr:`DEFAULT_UNITS`.
        """
        path = self.choose_dictionary_path(path)
        self.dictionary = opencorpora_dict.Dictionary(path)

        if result_type is not None:
            # create a subclass with the same name,
            # but with _morph attribute bound to self
            res_type = type(
                result_type.__name__,
                (result_type,),
                {'_morph': self, '_dict': self.dictionary}
            )
            self._result_type = res_type
        else:
            self._result_type = None

        # initialize units
        # (inside this method ``units`` is the argument, not the module)
        if units is None:
            units = self.DEFAULT_UNITS
        self._units = [cls(self) for cls in units]

    @classmethod
    def choose_dictionary_path(cls, path=None):
        """
        Return the dictionary path to use.

        The first available source wins: an explicit ``path`` argument,
        then the :attr:`ENV_VARIABLE` environment variable, then the
        path reported by the ``pymorphy2_dicts`` package.

        :raises ValueError: if ``path`` is ``None``, the environment
            variable is not set and ``pymorphy2_dicts`` can't be imported.
        """
        if path is not None:
            return path

        if cls.ENV_VARIABLE in os.environ:
            return os.environ[cls.ENV_VARIABLE]

        try:
            import pymorphy2_dicts
            return pymorphy2_dicts.get_path()
        except ImportError:
            msg = ("Can't find dictionaries. "
                   "Please either pass a path to dictionaries, "
                   "or install 'pymorphy2-dicts' package, "
                   "or set %s environment variable.") % cls.ENV_VARIABLE
            raise ValueError(msg)

    def parse(self, word):
        """
        Analyze the word and return a list of
        :class:`pymorphy2.analyzer.Parse` namedtuples:

            Parse(word, tag, normal_form, estimate, methods_stack)

        (or plain tuples if ``result_type=None`` was used in constructor).
        """
        res = []
        seen = set()  # shared by all units for this call
        word_lower = word.lower()

        for analyzer in self._units:
            res.extend(analyzer.parse(word, word_lower, seen))

            # A terminal unit ends the search once anything has been found.
            if res and analyzer.terminal:
                break

        if self._result_type is None:
            return res

        return [self._result_type(*p) for p in res]

    def tag(self, word):
        """
        Return a list with the results of the analyzer units' ``tag``
        methods for ``word``.

        Units are consulted in the same order and with the same stopping
        rule as in :meth:`parse`; results are returned unwrapped.
        """
        res = []
        seen = set()  # shared by all units for this call
        word_lower = word.lower()

        for analyzer in self._units:
            res.extend(analyzer.tag(word, word_lower, seen))

            # A terminal unit ends the search once anything has been found.
            if res and analyzer.terminal:
                break

        return res

    def normal_forms(self, word):
        """
        Return a list of word normal forms.

        Duplicates are removed; the order of first occurrence
        in :meth:`parse` results is kept.
        """
        seen = set()
        result = []

        for p in self.parse(word):
            normal_form = p[2]
            if normal_form not in seen:
                result.append(normal_form)
                seen.add(normal_form)
        return result

    # ==== inflection ========

    def get_lexeme(self, form):
        """
        Return the lexeme this parse belongs to.
        """
        methods_stack = form[4]
        # The first item of the last methods_stack entry is the analyzer
        # unit that produced the parse; the lexeme is requested from it.
        last_method = methods_stack[-1]
        result = last_method[0].get_lexeme(form)

        if self._result_type is None:
            return result
        return [self._result_type(*p) for p in result]

    def _inflect(self, form, required_grammemes):
        """
        Return a list with at most one form from the lexeme of ``form``.

        Candidates are lexeme forms whose tag grammemes include all of
        ``required_grammemes`` (compared with ``<=``). Among them the form
        sharing the most grammemes with the tag of ``form`` updated by
        ``required_grammemes`` is selected. The list is empty
        when there are no candidates.
        """
        grammemes = form[1].updated_grammemes(required_grammemes)

        possible_results = [f for f in self.get_lexeme(form)
                            if required_grammemes <= f[1].grammemes]

        def similarity(frm):
            tag = frm[1]
            return len(grammemes & tag.grammemes)

        return heapq.nlargest(1, possible_results, key=similarity)

    # ====== misc =========

    def iter_known_word_parses(self, prefix=""):
        """
        Return an iterator over parses of dictionary words that starts
        with a given prefix (default empty prefix means "all words").
        """
        # XXX: this method currently assumes that
        # units.DictionaryAnalyzer is the first analyzer unit.
        for word, tag, normal_form, para_id, idx in self.dictionary.iter_known_words(prefix):
            methods = ((self._units[0], word, para_id, idx),)
            # Every parse yielded here gets the fixed estimate 1.0.
            parse = (word, tag, normal_form, 1.0, methods)
            if self._result_type is None:
                yield parse
            else:
                yield self._result_type(*parse)

    def word_is_known(self, word, strict_ee=False):
        """
        Check if a ``word`` is in the dictionary.
        Pass ``strict_ee=True`` if ``word`` is guaranteed to
        have correct е/ё letters.

        .. note::

            Dictionary words are not always correct words;
            the dictionary also contains incorrect forms which
            are commonly used. So for spellchecking tasks this
            method should be used with extra care.

        """
        return self.dictionary.word_is_known(word.lower(), strict_ee)

    @property
    def TagClass(self):
        """ The ``Tag`` attribute of the loaded dictionary. """
        return self.dictionary.Tag

# Project Versions

# On this page