# Source code for pymorphy2.units.by_lookup

# -*- coding: utf-8 -*-
"""
Dictionary analyzer unit
------------------------
"""
from __future__ import absolute_import, division, unicode_literals
import logging
from pymorphy2.units.base import BaseAnalyzerUnit


# Module-level logger named after this module, per stdlib logging convention.
logger = logging.getLogger(__name__)


class DictionaryAnalyzer(BaseAnalyzerUnit):
    """ Analyzer unit that analyzes word using dictionary. """

    def parse(self, word, word_lower, seen_parses):
        """ Parse a word using this dictionary. """
        para_data = self.dict.words.similar_items(
            word_lower, self.morph.char_substitutes)

        found = []
        for fixed_word, parses in para_data:
            # `fixed_word` is a word with proper substitute (e.g. ё) letters
            for para_id, idx in parses:
                normal_form = self.dict.build_normal_form(para_id, idx, fixed_word)
                tag = self.dict.build_tag_info(para_id, idx)
                methods_stack = ((self, fixed_word, para_id, idx),)
                found.append((fixed_word, tag, normal_form, 1.0, methods_stack))
        return found

    def tag(self, word, word_lower, seen_tags):
        """ Tag a word using this dictionary. """
        para_data = self.dict.words.similar_item_values(
            word_lower, self.morph.char_substitutes)

        # Hot path: bind dictionary attributes to locals once, outside the loop.
        paradigms = self.dict.paradigms
        gramtab = self.dict.gramtab

        result = []
        for parse in para_data:
            for para_id, idx in parse:
                # Inlined equivalent of self.dict.build_tag_info(para_id, idx),
                # unrolled for speed: tag ids occupy the middle third of the
                # flat paradigm array.
                paradigm = paradigms[para_id]
                paradigm_len = len(paradigm) // 3
                result.append(gramtab[paradigm[paradigm_len + idx]])
        return result

    def get_lexeme(self, form):
        """ Return a lexeme (given a parsed word). """
        fixed_word, tag, normal_form, score, methods_stack = form
        _, para_id, idx = self._extract_para_info(methods_stack)

        stem = self.dict.build_stem(
            self.dict.paradigms[para_id], idx, fixed_word)

        lexeme = []
        paradigm_info = self.dict.build_paradigm_info(para_id)  # XXX: reuse the raw paradigm?
        for form_index, (prefix, form_tag, suffix) in enumerate(paradigm_info):
            form_word = prefix + stem + suffix
            stack = self._fix_stack(methods_stack, form_word, para_id, form_index)
            lexeme.append((form_word, form_tag, normal_form, 1.0, stack))
        return lexeme

    def normalized(self, form):
        """ Return the normal form of a parsed word. """
        fixed_word, tag, normal_form, score, methods_stack = form
        original_word, para_id, idx = self._extract_para_info(methods_stack)

        # Form index 0 within a paradigm is the normal form itself.
        if idx == 0:
            return form

        normal_tag = self.dict.build_tag_info(para_id, 0)
        stack = self._fix_stack(methods_stack, normal_form, para_id, 0)
        return (normal_form, normal_tag, normal_form, 1.0, stack)

    def _extract_para_info(self, methods_stack):
        # This method assumes that DictionaryAnalyzer is the first
        # and the only method in methods_stack.
        analyzer, original_word, para_id, idx = methods_stack[0]
        assert analyzer is self
        return original_word, para_id, idx

    def _fix_stack(self, methods_stack, word, para_id, idx):
        # Replace the head stack entry with the updated word and form index,
        # keeping any remaining entries untouched.
        return ((self, word, para_id, idx),) + methods_stack[1:]