Source code for pymorphy2.opencorpora_dict.storage

# -*- coding: utf-8 -*-
"""
:mod:`pymorphy2.opencorpora_dict.storage` is a
module for saving and loading pymorphy2 dictionaries.
"""
from __future__ import absolute_import, unicode_literals
import datetime
import os
import logging
import collections
import itertools
import array
import struct

# Use the lazy ``itertools.izip`` when the interpreter provides it;
# otherwise fall back to the builtin ``zip``.
izip = getattr(itertools, 'izip', zip)

import pymorphy2
from pymorphy2 import tagset
from pymorphy2 import dawg
from pymorphy2.constants import PARADIGM_PREFIXES, PREDICTION_PREFIXES
from pymorphy2.utils import json_write, json_read

logger = logging.getLogger(__name__)

# Version of the dictionary format written by ``save_compiled_dict``;
# ``_assert_format_is_compatible`` compares its major part on load.
CURRENT_FORMAT_VERSION = '2.0'

# Everything ``load_dict`` returns, in the order it is passed
# to the constructor.
LoadedDictionary = collections.namedtuple(
    'LoadedDictionary',
    [
        'meta',
        'gramtab',
        'suffixes',
        'paradigms',
        'words',
        'prediction_prefixes',
        'prediction_suffixes_dawgs',
        'Tag',
        'paradigm_prefixes',
    ]
)


def load_dict(path, gramtab_format='opencorpora-int'):
    """
    Load pymorphy2 dictionary.

    ``path`` is a folder name with dictionary data;
    ``gramtab_format`` selects which tag class and gramtab file are used.

    Returns a :class:`LoadedDictionary` tuple.

    Raises ``ValueError`` if the dictionary format version is not
    compatible or the requested gramtab format is unsupported/unavailable.
    """
    def in_dict_dir(name):
        # Resolve a file name relative to the dictionary folder.
        return os.path.join(path, name)

    # Metadata goes first: it tells whether the rest can be read at all.
    meta = _load_meta(in_dict_dir('meta.json'))
    _assert_format_is_compatible(meta, path)

    Tag = _load_tag_class(gramtab_format, in_dict_dir('grammemes.json'))
    tag_strings = _load_gramtab(meta, gramtab_format, path)
    gramtab = [Tag(tag_str) for tag_str in tag_strings]

    suffixes = json_read(in_dict_dir('suffixes.json'))
    paradigm_prefixes = json_read(in_dict_dir('paradigm-prefixes.json'))
    paradigms = _load_paradigms(in_dict_dir('paradigms.array'))
    words = dawg.WordsDawg().load(in_dict_dir('words.dawg'))
    prediction_prefixes = dawg.DAWG().load(
        in_dict_dir('prediction-prefixes.dawg')
    )

    # There is one prediction-suffixes DAWG file per paradigm prefix.
    prediction_suffixes_dawgs = [
        dawg.PredictionSuffixesDAWG().load(
            in_dict_dir('prediction-suffixes-%s.dawg' % prefix_id)
        )
        for prefix_id in range(len(paradigm_prefixes))
    ]

    return LoadedDictionary(
        meta=meta,
        gramtab=gramtab,
        suffixes=suffixes,
        paradigms=paradigms,
        words=words,
        prediction_prefixes=prediction_prefixes,
        prediction_suffixes_dawgs=prediction_suffixes_dawgs,
        Tag=Tag,
        paradigm_prefixes=paradigm_prefixes,
    )
def save_compiled_dict(compiled_dict, out_path):
    """
    Save a compiled_dict to ``out_path``.

    ``out_path`` should be a name of folder where to put dictionaries.

    Files are written in this order: ``grammemes.json``, one
    ``gramtab-<format>.json`` per registered tag format,
    ``paradigms.array``, ``suffixes.json``, ``words.dawg``,
    ``prediction-suffixes-<N>.dawg``, ``prediction-prefixes.dawg``,
    ``paradigm-prefixes.json`` and finally ``meta.json``.
    """
    logger.info("Saving...")

    def target(name):
        # Resolve a file name relative to the output folder.
        return os.path.join(out_path, name)

    def count_keys(some_dawg):
        # The number of keys is obtained by walking all of them.
        return sum(1 for _ in some_dawg.iterkeys())

    json_write(target('grammemes.json'), compiled_dict.parsed_dict.grammemes)

    # Write the gramtab once for every registered tag format and remember
    # which file holds which format (this mapping goes to the metadata).
    gramtab_formats = {}
    for format_name, tag_cls in tagset.registry.items():
        tag_cls._init_grammemes(compiled_dict.parsed_dict.grammemes)
        converted_gramtab = [
            tag_cls._from_internal_tag(tag) for tag in compiled_dict.gramtab
        ]
        file_name = "gramtab-%s.json" % format_name
        gramtab_formats[format_name] = file_name
        json_write(target(file_name), converted_gramtab)

    # paradigms.array layout: paradigm count, then for each paradigm its
    # length followed by the raw array data. Counts use the "<H" format.
    count_format = str("<H")
    with open(target('paradigms.array'), 'wb') as out:
        out.write(struct.pack(count_format, len(compiled_dict.paradigms)))
        for paradigm in compiled_dict.paradigms:
            out.write(struct.pack(count_format, len(paradigm)))
            paradigm.tofile(out)

    json_write(target('suffixes.json'), compiled_dict.suffixes)

    compiled_dict.words_dawg.save(target('words.dawg'))

    suffixes_dawgs = compiled_dict.prediction_suffixes_dawgs
    for prefix_id, suffixes_dawg in enumerate(suffixes_dawgs):
        suffixes_dawg.save(
            target('prediction-suffixes-%s.dawg' % prefix_id)
        )

    dawg.DAWG(PREDICTION_PREFIXES).save(target('prediction-prefixes.dawg'))
    json_write(target('paradigm-prefixes.json'), PARADIGM_PREFIXES)

    logger.debug("computing metadata..")

    logger.debug(' words_dawg_len')
    words_dawg_len = count_keys(compiled_dict.words_dawg)

    logger.debug(' prediction_suffixes_dawgs_len')
    prediction_suffixes_dawg_lengths = [
        count_keys(suffixes_dawg)
        for suffixes_dawg in compiled_dict.prediction_suffixes_dawgs
    ]

    # A list of pairs (not a dict) so that the key order is kept in the file.
    meta = [
        ['format_version', CURRENT_FORMAT_VERSION],
        ['pymorphy2_version', pymorphy2.__version__],
        ['compiled_at', datetime.datetime.utcnow().isoformat()],

        ['source', 'opencorpora.org'],
        ['source_version', compiled_dict.parsed_dict.version],
        ['source_revision', compiled_dict.parsed_dict.revision],
        ['source_lexemes_count', len(compiled_dict.parsed_dict.lexemes)],
        ['source_links_count', len(compiled_dict.parsed_dict.links)],

        ['gramtab_length', len(compiled_dict.gramtab)],
        ['gramtab_formats', gramtab_formats],
        ['paradigms_length', len(compiled_dict.paradigms)],
        ['suffixes_length', len(compiled_dict.suffixes)],

        ['words_dawg_length', words_dawg_len],
        ['prediction_options', compiled_dict.prediction_options],
        ['prediction_suffixes_dawg_lengths', prediction_suffixes_dawg_lengths],
        ['prediction_prefixes_dawg_length', len(PREDICTION_PREFIXES)],
        ['paradigm_prefixes_length', len(PARADIGM_PREFIXES)],
    ]

    json_write(target('meta.json'), meta, indent=4)
def _load_meta(filename): """ Load metadata. """ meta = json_read(filename, parse_float=str) if hasattr(collections, 'OrderedDict'): return collections.OrderedDict(meta) return dict(meta) def _load_tag_class(gramtab_format, grammemes_filename): """ Load and initialize Tag class (according to ``gramtab_format``). """ if gramtab_format not in tagset.registry: raise ValueError("This gramtab format ('%s') is unsupported." % gramtab_format) grammemes = json_read(grammemes_filename) Tag = tagset.registry[gramtab_format] Tag._init_grammemes(grammemes) return Tag def _load_gramtab(meta, gramtab_format, path): """ Load gramtab (a list of tags) """ gramtab_formats = meta.get('gramtab_formats', {}) if gramtab_format not in gramtab_formats: raise ValueError("This gramtab format (%s) is unavailable; available formats: %s" % (gramtab_format, gramtab_formats.keys())) gramtab_filename = os.path.join(path, gramtab_formats[gramtab_format]) return json_read(gramtab_filename) def _load_paradigms(filename): """ Load paradigms data """ paradigms = [] with open(filename, 'rb') as f: paradigms_count = struct.unpack(str("<H"), f.read(2))[0] for x in range(paradigms_count): paradigm_len = struct.unpack(str("<H"), f.read(2))[0] para = array.array(str("H")) para.fromfile(f, paradigm_len) paradigms.append(para) return paradigms def _assert_format_is_compatible(meta, path): """ Raise an exception if dictionary format is not compatible """ format_version = str(meta.get('format_version', '0.0')) if '.' not in format_version: raise ValueError('Invalid format_version: %s' % format_version) major, minor = format_version.split('.') curr_major, curr_minor = CURRENT_FORMAT_VERSION.split('.') if major != curr_major: msg = ("Error loading dictionaries from %s: " "the format ('%s') is not supported; " "required format is '%s.x'.") % (path, format_version, curr_major) raise ValueError(msg)

Project Versions

На этой странице