# Source code for pymorphy2.units.by_shape

# -*- coding: utf-8 -*-
"""
Analyzer units that analyze non-word tokens
-------------------------------------------
"""

from __future__ import absolute_import, unicode_literals, division

from pymorphy2.units.base import BaseAnalyzerUnit
from pymorphy2.shapes import is_latin, is_punctuation, is_roman_number


class _ShapeAnalyzer(BaseAnalyzerUnit):
    """
    Base class for analyzer units that recognize a token by its shape.

    ``parse`` and ``tag`` are built on top of two hooks, ``check_shape``
    and ``get_tag``, which every subclass has to provide.
    """
    # score put into every parse produced by ``parse``
    SCORE = 0.9
    # grammeme names passed pairwise (by position) to
    # ``TagClass.add_grammemes_to_known`` in ``__init__``
    EXTRA_GRAMMEMES = []
    EXTRA_GRAMMEMES_CYR = []

    def __init__(self, morph):
        """Initialize the unit and register the extra grammemes."""
        super(_ShapeAnalyzer, self).__init__(morph)

        grammeme_pairs = zip(self.EXTRA_GRAMMEMES, self.EXTRA_GRAMMEMES_CYR)
        for latin_name, cyrillic_name in grammeme_pairs:
            self.morph.TagClass.add_grammemes_to_known(latin_name, cyrillic_name)

    def parse(self, word, word_lower, seen_parses):
        """
        Return a one-element list with the parse tuple for ``word``,
        or an empty list if ``check_shape`` returns a falsy value.
        """
        detected = self.check_shape(word, word_lower)
        if detected:
            history = ((self, word),)
            return [
                (
                    word_lower,
                    self.get_tag(word, detected),
                    word_lower,
                    self.SCORE,
                    history,
                )
            ]
        return []

    def tag(self, word, word_lower, seen_tags):
        """
        Return a one-element list with the tag for ``word``,
        or an empty list if ``check_shape`` returns a falsy value.
        """
        detected = self.check_shape(word, word_lower)
        return [self.get_tag(word, detected)] if detected else []

    def get_lexeme(self, form):
        """Return a list that contains only ``form`` itself."""
        return [form]

    def normalized(self, form):
        """Return ``form`` unchanged."""
        return form

    # the two hooks below must be overridden in a subclass:
    def check_shape(self, word, word_lower):
        """Return a truthy shape value if the token matches, a falsy one otherwise."""
        raise NotImplementedError()

    def get_tag(self, word, shape):
        """Return the tag for ``word`` given the value returned by ``check_shape``."""
        raise NotImplementedError()


class _SingleShapeAnalyzer(_ShapeAnalyzer):
    """
    Shape analyzer that returns one and the same tag for every match.

    Subclasses set ``TAG_STR`` and ``TAG_STR_CYR`` (comma-separated
    grammeme names) and implement ``check_shape``.
    """
    TAG_STR = None
    TAG_STR_CYR = None

    def __init__(self, morph):
        """Derive the extra grammemes from the tag strings and build the tag."""
        latin_tag = self.TAG_STR
        cyrillic_tag = self.TAG_STR_CYR
        assert latin_tag is not None
        assert cyrillic_tag is not None

        # These are assigned before calling the parent constructor because
        # _ShapeAnalyzer.__init__ reads them to register the grammemes.
        self.EXTRA_GRAMMEMES = latin_tag.split(',')
        self.EXTRA_GRAMMEMES_CYR = cyrillic_tag.split(',')
        super(_SingleShapeAnalyzer, self).__init__(morph)

        self._tag = self.morph.TagClass(latin_tag)

    def get_tag(self, word, shape):
        """Return the tag built in ``__init__``; both arguments are ignored."""
        return self._tag


class PunctuationAnalyzer(_SingleShapeAnalyzer):
    """
    This analyzer tags punctuation marks as "PNCT".
    Example: "," -> PNCT
    """
    TAG_STR = 'PNCT'
    TAG_STR_CYR = 'ЗПР'  # aot.ru uses this name

    def check_shape(self, word, word_lower):
        """
        Return the result of ``is_punctuation(word)``.

        Only ``word`` is inspected; ``word_lower`` is not used.
        """
        return is_punctuation(word)


class LatinAnalyzer(_SingleShapeAnalyzer):
    """
    This analyzer marks latin words with "LATN" tag.
    Example: "pdf" -> LATN
    """
    TAG_STR = 'LATN'
    TAG_STR_CYR = 'ЛАТ'

    def check_shape(self, word, word_lower):
        """
        Return the result of ``is_latin(word)``.

        Only ``word`` is inspected; ``word_lower`` is not used.
        """
        return is_latin(word)


class NumberAnalyzer(_ShapeAnalyzer):
    """
    This analyzer marks numbers with "NUMB,intg" or "NUMB,real" tags.
    Example: "12" -> NUMB,intg; "12.4" -> NUMB,real; "12,4" -> NUMB,real

    .. note::

        Don't confuse it with "NUMR": "тридцать" -> NUMR

    """
    EXTRA_GRAMMEMES = ['NUMB', 'intg', 'real']
    EXTRA_GRAMMEMES_CYR = ['ЧИСЛО', 'цел', 'вещ']

    def __init__(self, morph):
        """Initialize the unit and pre-build one tag per supported shape."""
        super(NumberAnalyzer, self).__init__(morph)
        self._tags = {
            'intg': morph.TagClass('NUMB,intg'),
            'real': morph.TagClass('NUMB,real'),
        }

    def check_shape(self, word, word_lower):
        """
        Classify ``word`` as a number.

        Returns ``'intg'`` if ``int(word)`` succeeds, ``'real'`` if
        ``float()`` accepts the word (a comma is treated as the decimal
        separator), and ``False`` otherwise.
        """
        # float() also accepts the digit-less literals "nan", "inf" and
        # "infinity" (any case, optionally signed). Those are ordinary
        # latin tokens, not numbers, so a word without a single digit is
        # rejected up front.
        if not any(char.isdigit() for char in word):
            return False

        try:
            int(word)
        except ValueError:
            pass
        else:
            return 'intg'

        try:
            float(word.replace(',', '.'))
        except ValueError:
            return False
        return 'real'

    def get_tag(self, word, shape):
        """Return the pre-built tag for ``shape`` (``'intg'`` or ``'real'``)."""
        return self._tags[shape]


class RomanNumberAnalyzer(_SingleShapeAnalyzer):
    """
    This analyzer marks tokens accepted by ``is_roman_number``
    with "ROMN" tag.
    """
    TAG_STR = 'ROMN'
    TAG_STR_CYR = 'РИМ'

    def check_shape(self, word, word_lower):
        """Return the result of ``is_roman_number(word)``; ``word_lower`` is not used."""
        looks_roman = is_roman_number(word)
        return looks_roman