# Source code for pymorphy2.opencorpora_dict.parse
# -*- coding: utf-8 -*-
"""
:mod:`pymorphy2.opencorpora_dict.parse` is a
module for parsing OpenCorpora XML dictionaries.
"""
from __future__ import absolute_import, unicode_literals, division
import logging
import collections
# Module-level logger used for progress messages while parsing.
logger = logging.getLogger(__name__)

# Result container returned by ``parse_opencorpora_xml``:
#   lexemes   - dict: lemma ``id`` attribute -> list of (word, tags) pairs
#   links     - list of (from, to, type) attribute tuples of ``link`` elements
#   grammemes - list of (name, parent, alias, description) tuples
#   version   - ``version`` attribute of the ``dictionary`` element
#   revision  - ``revision`` attribute of the ``dictionary`` element
ParsedDictionary = collections.namedtuple(
    'ParsedDictionary',
    ['lexemes', 'links', 'grammemes', 'version', 'revision'],
)
def parse_opencorpora_xml(filename):
    """
    Parse OpenCorpora dict XML and return a ``ParsedDictionary`` namedtuple.

    The document is read incrementally with ``lxml.etree.iterparse``; every
    handled element is cleared right after it has been processed.

    :param filename: passed unchanged to ``lxml.etree.iterparse``.
    :return: ``ParsedDictionary(lexemes, links, grammemes, version, revision)``
        where

        * ``lexemes`` maps the ``id`` attribute of each ``lemma`` element to
          the word forms list built by ``_word_forms_from_xml_elem``;
        * ``links`` is a list of ``(from, to, type)`` attribute tuples, one
          per ``link`` element;
        * ``grammemes`` is a list of ``(name, parent, alias, description)``
          tuples, one per ``grammeme`` element;
        * ``version`` and ``revision`` are attributes of the ``dictionary``
          element (``None`` if no such element was encountered).
    """
    # Imported inside the function, so lxml is only needed when parsing
    # is actually requested.
    from lxml import etree

    links = []
    lexemes = {}
    grammemes = []
    version, revision = None, None
    reported_count = 0  # number of lexemes at the last progress message

    def _clear(elem):
        # Drop the element's own content, then delete the preceding
        # siblings (already processed) from the parent to free memory.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

    for _, elem in etree.iterparse(filename):
        tag = elem.tag

        if tag == 'grammeme':
            grammemes.append((
                elem.find('name').text,
                elem.get('parent'),
                elem.find('alias').text,
                elem.find('description').text,
            ))
            _clear(elem)

        elif tag == 'dictionary':
            # Attributes must be read before the element is cleared.
            version = elem.get('version')
            revision = elem.get('revision')
            _clear(elem)

        elif tag == 'lemma':
            if not lexemes:
                logger.info('parsing xml:lemmas...')

            lex_id, word_forms = _word_forms_from_xml_elem(elem)
            lexemes[lex_id] = word_forms
            _clear(elem)

        elif tag == 'link':
            if not links:
                logger.info('parsing xml:links...')

            links.append((
                elem.get('from'),
                elem.get('to'),
                elem.get('type'),
            ))
            _clear(elem)

        # Report progress once for every 50000 parsed lexemes; the count
        # check prevents repeating the message for non-lemma elements.
        parsed_count = len(lexemes)
        if parsed_count != reported_count and not (parsed_count % 50000):
            logger.debug("%d lexemes parsed", parsed_count)
            reported_count = parsed_count

    return ParsedDictionary(lexemes, links, grammemes, version, revision)
def _tags_from_elem(elem):
return ",".join(g.get('v') for g in elem.findall('g'))
def _word_forms_from_xml_elem(elem):
"""
Return a list of (word, tags) pairs given "lemma" XML element.
"""
lexeme = []
lex_id = elem.get('id')
if len(elem) == 0: # deleted lexeme?
return lex_id, lexeme
base_info = elem.findall('l')
assert len(base_info) == 1
base_tags = _tags_from_elem(base_info[0])
for form_elem in elem.findall('f'):
tags = _tags_from_elem(form_elem)
form = form_elem.get('t').lower()
lexeme.append(
(form, " ".join([base_tags, tags]).strip())
)
return lex_id, lexeme