Source code for pymorphy2.units.by_shape

# -*- coding: utf-8 -*-
"""
Analyzer units that analyze non-word tokens
-------------------------------------------
"""

from __future__ import absolute_import, unicode_literals, division

from pymorphy2.units.base import BaseAnalyzerUnit
from pymorphy2.shapes import is_latin, is_punctuation, is_roman_number


class _ShapeAnalyzer(BaseAnalyzerUnit):
    """
    Base class for analyzer units that tag a token by its surface shape.

    Subclasses provide ``check_shape`` (is the token of the handled shape?)
    and ``get_tag`` (which tag corresponds to the detected shape); this class
    turns their answers into ``parse``/``tag`` results.
    """

    # Score put into every parse produced by this unit.
    SCORE = 0.9
    # Latin grammeme names and their Cyrillic counterparts; the two lists
    # are paired element by element in ``__init__``.
    EXTRA_GRAMMEMES = []
    EXTRA_GRAMMEMES_CYR = []

    def __init__(self, morph):
        """
        Initialize the base unit and register the extra grammemes.

        Every (latin, cyrillic) pair taken from ``EXTRA_GRAMMEMES`` and
        ``EXTRA_GRAMMEMES_CYR`` is passed to
        ``self.morph.TagClass.add_grammemes_to_known``.
        """
        super(_ShapeAnalyzer, self).__init__(morph)

        grammeme_pairs = zip(self.EXTRA_GRAMMEMES, self.EXTRA_GRAMMEMES_CYR)
        for latin_name, cyrillic_name in grammeme_pairs:
            self.morph.TagClass.add_grammemes_to_known(
                latin_name, cyrillic_name
            )

    def parse(self, word, word_lower, seen_parses):
        """
        Return a one-element list with the parse of ``word``, or ``[]``
        when ``check_shape`` reports a falsy shape.

        The parse is the tuple
        ``(word_lower, tag, word_lower, SCORE, ((self, word),))``.
        ``seen_parses`` is not used.
        """
        detected = self.check_shape(word, word_lower)
        if detected:
            tag = self.get_tag(word, detected)
            methods_stack = ((self, word),)
            return [(word_lower, tag, word_lower, self.SCORE, methods_stack)]
        return []

    def tag(self, word, word_lower, seen_tags):
        """
        Return ``[tag]`` for a token of the handled shape, ``[]`` otherwise.

        ``seen_tags`` is not used.
        """
        detected = self.check_shape(word, word_lower)
        return [self.get_tag(word, detected)] if detected else []

    def get_lexeme(self, form):
        """Return a lexeme consisting of ``form`` alone."""
        return [form]

    def normalized(self, form):
        """Return ``form`` unchanged."""
        return form

    # The two hooks below must be implemented by subclasses.
    def check_shape(self, word, word_lower):
        """
        Return a truthy shape marker when the token is handled by this
        unit and a falsy value otherwise.
        """
        raise NotImplementedError()

    def get_tag(self, word, shape):
        """Return the tag for ``word`` given the ``shape`` from ``check_shape``."""
        raise NotImplementedError()


class _SingleShapeAnalyzer(_ShapeAnalyzer):
    """
    Shape analyzer that always answers with one fixed tag.

    Subclasses set ``TAG_STR`` (comma-separated Latin grammemes) and
    ``TAG_STR_CYR`` (their Cyrillic counterparts) and implement
    ``check_shape``.
    """

    TAG_STR = None
    TAG_STR_CYR = None

    def __init__(self, morph):
        """
        Derive the extra grammemes from the tag strings, run the parent
        initialization and build the tag object returned by ``get_tag``.
        """
        # Both tag strings have to be provided by the concrete subclass.
        assert self.TAG_STR is not None
        assert self.TAG_STR_CYR is not None

        latin_grammemes = self.TAG_STR.split(',')
        cyrillic_grammemes = self.TAG_STR_CYR.split(',')
        # Set on the instance before the parent __init__ runs, because the
        # parent reads these attributes to register the grammemes.
        self.EXTRA_GRAMMEMES = latin_grammemes
        self.EXTRA_GRAMMEMES_CYR = cyrillic_grammemes

        super(_SingleShapeAnalyzer, self).__init__(morph)

        # Created only after the grammemes have been registered above.
        self._tag = self.morph.TagClass(self.TAG_STR)

    def get_tag(self, word, shape):
        """Return the single tag built in ``__init__``; arguments are ignored."""
        return self._tag


class PunctuationAnalyzer(_SingleShapeAnalyzer):
    """
    This analyzer tags punctuation marks as "PNCT".
    Example: "," -> PNCT
    """
    TAG_STR = 'PNCT'
    TAG_STR_CYR = 'ЗПР'  # aot.ru uses this name

    def check_shape(self, word, word_lower):
        """
        Return the result of ``is_punctuation(word)``.

        ``word_lower`` is not used.
        """
        return is_punctuation(word)
class LatinAnalyzer(_SingleShapeAnalyzer):
    """
    This analyzer marks latin words with "LATN" tag.
    Example: "pdf" -> LATN
    """
    TAG_STR = 'LATN'
    TAG_STR_CYR = 'ЛАТ'

    def check_shape(self, word, word_lower):
        """
        Return the result of ``is_latin(word)``.

        ``word_lower`` is not used.
        """
        return is_latin(word)
class NumberAnalyzer(_ShapeAnalyzer):
    """
    This analyzer marks numbers with "NUMB,intg" or "NUMB,real" tags.

    Example: "12" -> NUMB,intg; "12.4" -> NUMB,real

    .. note::

        Don't confuse it with "NUMR": "тридцать" -> NUMR

    """
    EXTRA_GRAMMEMES = ['NUMB', 'intg', 'real']
    EXTRA_GRAMMEMES_CYR = ['ЧИСЛО', 'цел', 'вещ']

    def __init__(self, morph):
        """Run the parent initialization and build one tag per number shape."""
        super(NumberAnalyzer, self).__init__(morph)
        # Keyed by the shape markers returned from ``check_shape``.
        self._tags = {
            'intg': morph.TagClass('NUMB,intg'),
            'real': morph.TagClass('NUMB,real'),
        }

    def check_shape(self, word, word_lower):
        """
        Classify ``word`` as an integer or a real number.

        Returns ``'intg'`` when ``int(word)`` succeeds, ``'real'`` when
        ``float`` accepts the word (with "," treated as the decimal
        separator), and ``False`` otherwise. ``word_lower`` is not used.
        """
        # float() also accepts the digit-less spellings "nan", "inf" and
        # "infinity" (any case, optional sign); those are ordinary words,
        # not numbers, so a token without a single digit is rejected.
        if not any(char.isdigit() for char in word):
            return False

        try:
            int(word)
            return 'intg'
        except ValueError:
            pass

        try:
            float(word.replace(',', '.'))
            return 'real'
        except ValueError:
            return False

    def get_tag(self, word, shape):
        """Return the tag prepared for ``shape`` (``'intg'`` or ``'real'``)."""
        return self._tags[shape]
class RomanNumberAnalyzer(_SingleShapeAnalyzer):
    """
    This analyzer marks tokens accepted by ``is_roman_number``
    with "ROMN" tag.
    """
    TAG_STR = 'ROMN'
    TAG_STR_CYR = 'РИМ'

    def check_shape(self, word, word_lower):
        """
        Return the result of ``is_roman_number(word)``.

        ``word_lower`` is not used.
        """
        return is_roman_number(word)