Source code for pymorphy2.units.by_analogy

# -*- coding: utf-8 -*-
"""
Analogy analyzer units
----------------------

This module provides analyzer units that analyzes unknown words by looking
at how similar known words are analyzed.

"""

from __future__ import absolute_import, unicode_literals, division

import operator

from pymorphy2.units.base import AnalogyAnalizerUnit
from pymorphy2.units.by_lookup import DictionaryAnalyzer
from pymorphy2.units.utils import (
    add_parse_if_not_seen,
    add_tag_if_not_seen,
    without_fixed_prefix,
    with_prefix
)
from pymorphy2.utils import word_splits
from pymorphy2.dawg import PrefixMatcher

_cnt_getter = operator.itemgetter(3)


class _PrefixAnalyzer(AnalogyAnalizerUnit):

    def normalizer(self, form, this_method):
        prefix = this_method[1]
        normal_form = yield without_fixed_prefix(form, len(prefix))
        yield with_prefix(normal_form, prefix)

    def lexemizer(self, form, this_method):
        prefix = this_method[1]
        lexeme = yield without_fixed_prefix(form, len(prefix))
        yield [with_prefix(f, prefix) for f in lexeme]


[docs]class KnownPrefixAnalyzer(_PrefixAnalyzer): """ Parse the word by checking if it starts with a known prefix and parsing the remainder. Example: псевдокошка -> (псевдо) + кошка. """ _repr_skip_value_params = ['known_prefixes'] def __init__(self, known_prefixes, score_multiplier=0.75, min_remainder_length=3): self.known_prefixes = known_prefixes self.score_multiplier = score_multiplier self.min_remainder_length = min_remainder_length def init(self, morph): super(KnownPrefixAnalyzer, self).init(morph) self.get_prefixes = PrefixMatcher(self.known_prefixes).prefixes def parse(self, word, word_lower, seen_parses): result = [] for prefix, unprefixed_word in self.possible_splits(word_lower): method = (self, prefix) parses = self.morph.parse(unprefixed_word) for fixed_word, tag, normal_form, score, methods_stack in parses: if not tag.is_productive(): continue parse = ( prefix + fixed_word, tag, prefix + normal_form, score * self.score_multiplier, methods_stack + (method,) ) add_parse_if_not_seen(parse, result, seen_parses) return result def tag(self, word, word_lower, seen_tags): result = [] for prefix, unprefixed_word in self.possible_splits(word_lower): for tag in self.morph.tag(unprefixed_word): if not tag.is_productive(): continue add_tag_if_not_seen(tag, result, seen_tags) return result def possible_splits(self, word): word_prefixes = self.get_prefixes(word) word_prefixes.sort(key=len, reverse=True) for prefix in word_prefixes: unprefixed_word = word[len(prefix):] if len(unprefixed_word) < self.min_remainder_length: continue yield prefix, unprefixed_word
[docs]class UnknownPrefixAnalyzer(_PrefixAnalyzer): """ Parse the word by parsing only the word suffix (with restrictions on prefix & suffix lengths). Example: байткод -> (байт) + код """ def __init__(self, score_multiplier=0.5): self.score_multiplier = score_multiplier def init(self, morph): super(AnalogyAnalizerUnit, self).init(morph) self.dict_analyzer = DictionaryAnalyzer() self.dict_analyzer.init(morph) def parse(self, word, word_lower, seen_parses): result = [] for prefix, unprefixed_word in word_splits(word_lower): method = (self, prefix) parses = self.dict_analyzer.parse(unprefixed_word, unprefixed_word, seen_parses) for fixed_word, tag, normal_form, score, methods_stack in parses: if not tag.is_productive(): continue parse = ( prefix + fixed_word, tag, prefix + normal_form, score * self.score_multiplier, methods_stack + (method,) ) add_parse_if_not_seen(parse, result, seen_parses) return result def tag(self, word, word_lower, seen_tags): result = [] for _, unprefixed_word in word_splits(word_lower): tags = self.dict_analyzer.tag(unprefixed_word, unprefixed_word, seen_tags) for tag in tags: if not tag.is_productive(): continue add_tag_if_not_seen(tag, result, seen_tags) return result
[docs]class KnownSuffixAnalyzer(AnalogyAnalizerUnit): """ Parse the word by checking how the words with similar suffixes are parsed. Example: бутявкать -> ...вкать """
[docs] class FakeDictionary(DictionaryAnalyzer): """ This is just a DictionaryAnalyzer with different __repr__ """ pass
def __init__(self, score_multiplier=0.5, min_word_length=4): self.min_word_length = min_word_length self.score_multiplier = score_multiplier def init(self, morph): super(KnownSuffixAnalyzer, self).init(morph) self._paradigm_prefixes = list(reversed(list(enumerate(self.dict.paradigm_prefixes)))) self._prediction_splits = list(reversed(range(1, self._max_suffix_length()+1))) self.fake_dict = self.FakeDictionary() self.fake_dict.init(morph) def _max_suffix_length(self): try: return self.dict.meta['compile_options']['max_suffix_length'] except KeyError: # dicts v2.4 support return self.dict.meta['prediction_options']['max_suffix_length'] def parse(self, word, word_lower, seen_parses): result = [] if len(word) < self.min_word_length: return result # smoothing; XXX: isn't max_cnt better? # or maybe use a proper discounting? total_counts = [1] * len(self._paradigm_prefixes) for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word_lower): for i in self._prediction_splits: # XXX: this should be counted once, not for each prefix word_start, word_end = word_lower[:-i], word_lower[-i:] para_data = suffixes_dawg.similar_items(word_end, self.morph.char_substitutes) for fixed_suffix, parses in para_data: fixed_word = word_start + fixed_suffix for cnt, para_id, idx in parses: tag = self.dict.build_tag_info(para_id, idx) # skip non-productive tags if not tag.is_productive(): continue total_counts[prefix_id] += cnt # avoid duplicate parses reduced_parse = fixed_word, tag, para_id if reduced_parse in seen_parses: continue seen_parses.add(reduced_parse) # ok, build the result normal_form = self.dict.build_normal_form(para_id, idx, fixed_word) methods = ( (self.fake_dict, fixed_word, para_id, idx), (self, fixed_suffix), ) parse = (cnt, fixed_word, tag, normal_form, prefix_id, methods) result.append(parse) if total_counts[prefix_id] > 1: break result = [ (fixed_word, tag, normal_form, cnt/total_counts[prefix_id] * self.score_multiplier, methods_stack) for (cnt, fixed_word, tag, normal_form, prefix_id, methods_stack) in result ] result.sort(key=_cnt_getter, reverse=True) return result def tag(self, word, word_lower, seen_tags): # XXX: the result order may be different from # ``self.parse(...)``. result = [] if len(word) < self.min_word_length: return result for prefix_id, prefix, suffixes_dawg in self._possible_prefixes(word_lower): for i in self._prediction_splits: # XXX: end should be counted once, not for each prefix end = word_lower[-i:] para_data = suffixes_dawg.similar_items(end, self.morph.char_substitutes) found = False for fixed_suffix, parses in para_data: for cnt, para_id, idx in parses: tag = self.dict.build_tag_info(para_id, idx) if not tag.is_productive(): continue found = True if tag in seen_tags: continue seen_tags.add(tag) result.append((cnt, tag)) if found: break result.sort(reverse=True) return [tag for cnt, tag in result] def _possible_prefixes(self, word): for prefix_id, prefix in self._paradigm_prefixes: if not word.startswith(prefix): continue suffixes_dawg = self.dict.prediction_suffixes_dawgs[prefix_id] yield prefix_id, prefix, suffixes_dawg