# Source code for pymorphy2.utils

# -*- coding: utf-8 -*-
from __future__ import absolute_import
# unicode_literals here would break tests

import os
import heapq
import itertools
import codecs
import json


def get_mem_usage():
    """
    Return memory usage of the current process, in bytes.

    The value is the ``rss`` field of the memory info reported by psutil
    for the process whose pid is ``os.getpid()``.

    Requires psutil Python package; it is imported lazily, so an
    ``ImportError`` is raised only when this function is called without
    psutil installed.
    """
    import psutil
    process = psutil.Process(os.getpid())
    try:
        return process.memory_info().rss
    except AttributeError:
        # psutil < 2.x: the method has a different name; the first item
        # of its result is used as the value.
        return process.get_memory_info()[0]


def combinations_of_all_lengths(it):
    """
    Return an iterable with all possible non-empty combinations of items
    from ``it``, shorter combinations first:

        >>> for comb in combinations_of_all_lengths('ABC'):
        ...     print("".join(comb))
        A
        B
        C
        AB
        AC
        BC
        ABC

    ``it`` may be any finite iterable, including a generator or another
    one-shot iterator; it is consumed once. Each combination is a tuple
    of items in their original order. An empty ``it`` gives an empty
    result.
    """
    # Materialize once: the number of items is needed to know the longest
    # combination, and a one-shot iterator supports neither len() nor
    # being read again for every combination length.
    pool = tuple(it)
    return itertools.chain.from_iterable(
        itertools.combinations(pool, size)
        for size in range(1, len(pool) + 1)
    )


def longest_common_substring(data):
    """
    Return a longest common substring of a list of strings:

        >>> longest_common_substring(["apricot", "rice", "cricket"])
        'ric'
        >>> longest_common_substring(["apricot", "banana"])
        'a'
        >>> longest_common_substring(["foo", "bar", "baz"])
        ''
        >>> longest_common_substring(["", "foo"])
        ''
        >>> longest_common_substring(["apricot"])
        'apricot'
        >>> longest_common_substring([])
        ''

    When several common substrings share the maximum length, the one
    that starts earliest in the first string is returned. An empty
    string is returned when there is no common substring.

    See http://stackoverflow.com/questions/2892931/.
    """
    if not data:
        return ''
    if len(data) == 1:
        return data[0]

    # Any common substring is a substring of the first string, so only
    # its slices have to be tried.
    reference = data[0]
    ref_len = len(reference)

    # Try the longest candidates first and, within one length, the
    # leftmost first: the first hit is then the answer and the search
    # can stop immediately.
    for length in range(ref_len, 0, -1):
        for start in range(ref_len - length + 1):
            candidate = reference[start:start + length]
            if all(candidate in item for item in data):
                return candidate
    return ''


def json_write(filename, obj, **json_options):
    """
    Create file ``filename`` with ``obj`` serialized to JSON.

    The file is written in UTF-8. ``json_options`` are passed to
    ``json.dump``; unless the caller overrides them, ``ensure_ascii``
    defaults to ``False`` (non-ASCII characters are written as is) and
    ``indent`` defaults to ``2``.
    """
    # setdefault keeps explicitly passed values and only fills the gaps.
    json_options.setdefault('ensure_ascii', False)
    json_options.setdefault('indent', 2)
    with codecs.open(filename, 'w', 'utf8') as f:
        json.dump(obj, f, **json_options)


def json_read(filename, **json_options):
    """
    Read an object from a json file ``filename``.

    The file is decoded as UTF-8; ``json_options`` are passed to
    ``json.load`` unchanged.
    """
    with codecs.open(filename, 'r', 'utf8') as f:
        return json.load(f, **json_options)


def largest_elements(iterable, key, n=1):
    """
    Return a list of large elements of the ``iterable``
    (according to ``key`` function).

    ``n`` is a number of top element values to consider; when n==1
    (default) only largest elements are returned; when n==2 - elements
    with one of the top-2 values, etc.

    Elements are returned in their original order. ``key`` is called
    exactly once per element; its results must be hashable and
    comparable with each other. ``iterable`` is consumed once, so
    one-shot iterators are fine.

    >>> s = [-4, 3, 5, 7, 4, -7]
    >>> largest_elements(s, abs)
    [7, -7]
    >>> largest_elements(s, abs, 2)
    [5, 7, -7]
    >>> largest_elements(s, abs, 3)
    [-4, 5, 7, 4, -7]

    """
    # Pair every element with its key up front, so that ``key`` is
    # evaluated once per element instead of once for ranking and once
    # more for filtering.
    keyed = [(key(el), el) for el in iterable]

    # Rank distinct key values only: duplicates must not use up the
    # ``n`` slots.
    distinct_keys = {k for k, _ in keyed}
    top_keys = set(heapq.nlargest(n, distinct_keys))

    return [el for k, el in keyed if k in top_keys]


def word_splits(word, min_reminder=3, max_prefix_length=5):
    """
    Return all splits of a word (taking into account min_reminder and
    max_prefix_length).

    The result is a list of ``(prefix, remainder)`` tuples ordered by
    growing prefix length. Prefixes are non-empty and at most
    ``max_prefix_length`` characters long; remainders are at least
    ``min_reminder`` characters long. The list is empty when the word is
    too short to be split under these limits.
    """
    # The longest allowed prefix is bounded both by max_prefix_length
    # and by the need to leave min_reminder characters; if this is less
    # than 1 the range below is empty.
    max_split = min(max_prefix_length, len(word) - min_reminder)
    return [(word[:i], word[i:]) for i in range(1, max_split + 1)]


def kwargs_repr(kwargs=None, dont_show_value=None):
    """
    Return a ``name=value, ...`` string for the ``kwargs`` dict.

    Items are sorted by name and values are rendered with ``repr``.
    Values of the names listed in ``dont_show_value`` are replaced with
    ``<...>``. Missing or empty ``kwargs`` give an empty string.

    >>> kwargs_repr(dict(foo="123", a=5, x=8))
    "a=5, foo='123', x=8"
    >>> kwargs_repr(dict(foo="123", a=5, x=8), dont_show_value=['foo'])
    'a=5, foo=<...>, x=8'
    >>> kwargs_repr()
    ''
    """
    kwargs = kwargs or {}
    hidden = set(dont_show_value or [])
    return ", ".join(
        "%s=%s" % (name, "<...>" if name in hidden else repr(value))
        for name, value in sorted(kwargs.items())
    )


def with_progress(iterable, desc=None, total=None, leave=True):
    """
    Return an iterator which prints the iteration progress using tqdm package.
    Return iterable intact if tqdm is not available.

    ``desc``, ``total`` and ``leave`` are passed to ``tqdm``. When
    ``total`` is None it is taken from ``len(iterable)``, or set to 0
    if the length can't be obtained. When ``leave`` is true an empty
    line is printed after the iteration is finished.
    """
    # Only the import is guarded: a missing tqdm is the one failure that
    # should fall back to the plain iterable.
    try:
        from tqdm import tqdm
    except ImportError:
        return iterable

    # workarounds for tqdm bugs
    def _it(iterable, desc, total, leave):
        if total is None:
            try:
                total = len(iterable)
            except Exception:
                # no usable length (e.g. a generator)
                total = 0
        for el in tqdm(iterable, desc=desc, total=total, leave=leave):
            yield el
        if leave:
            print("")

    return _it(iterable, desc, total, leave)