Source code for pymorphy2.utils

# -*- coding: utf-8 -*-
from __future__ import absolute_import
# unicode_literals here would break tests

import os
import heapq
import itertools
import codecs
import json


def get_mem_usage():
    """
    Return memory usage (the ``rss`` value reported by psutil) of the
    current process, in bytes.

    Requires the ``psutil`` Python package; it is imported lazily, on call.
    """
    import psutil

    current_process = psutil.Process(os.getpid())
    try:
        info = current_process.memory_info()
        return info.rss
    except AttributeError:
        # psutil < 2.x: fall back to the older accessor and take its
        # first field.
        return current_process.get_memory_info()[0]
def combinations_of_all_lengths(it):
    """
    Return an iterable with all possible non-empty combinations of items
    from ``it``, shorter combinations first:

    >>> for comb in combinations_of_all_lengths('ABC'):
    ...     print("".join(comb))
    A
    B
    C
    AB
    AC
    BC
    ABC

    ``it`` may be any iterable, including a generator; it is consumed
    once, when this function is called. An empty input yields nothing.
    """
    # Snapshot the input: iterators/generators have no len() and can only
    # be traversed once, while every combination size needs the full data.
    items = tuple(it)
    return itertools.chain.from_iterable(
        itertools.combinations(items, size)
        for size in range(1, len(items) + 1)
    )
def longest_common_substring(data):
    """
    Return a longest common substring of a list of strings:

    >>> longest_common_substring(["apricot", "rice", "cricket"])
    'ric'
    >>> longest_common_substring(["apricot", "banana"])
    'a'
    >>> longest_common_substring(["foo", "bar", "baz"])
    ''
    >>> longest_common_substring(["", "foo"])
    ''
    >>> longest_common_substring(["apricot"])
    'apricot'
    >>> longest_common_substring([])
    ''

    Candidates are taken from the first string, scanning start positions
    left to right; a candidate replaces the current result only when it
    is strictly longer.

    See http://stackoverflow.com/questions/2892931/.
    """
    if len(data) == 1:
        return data[0]
    if not data or not data[0]:
        return ''

    reference = data[0]
    ref_len = len(reference)
    best = ''
    for start in range(ref_len):
        # Only slices strictly longer than the current best can improve
        # the result, so begin right past that length.
        for stop in range(start + len(best) + 1, ref_len + 1):
            candidate = reference[start:stop]
            if all(candidate in item for item in data):
                best = candidate
    return best
def json_write(filename, obj, **json_options):
    """
    Create file ``filename`` with ``obj`` serialized to JSON.

    The file is written as UTF-8. Extra keyword arguments are passed to
    ``json.dump``; unless overridden, ``ensure_ascii=False`` and
    ``indent=2`` are used.
    """
    dump_options = {'ensure_ascii': False, 'indent': 2}
    # Caller-supplied options take precedence over the defaults above.
    dump_options.update(json_options)

    with codecs.open(filename, 'w', 'utf8') as out:
        json.dump(obj, out, **dump_options)
def json_read(filename, **json_options):
    """
    Read an object from a json file ``filename``.

    The file is decoded as UTF-8; extra keyword arguments are passed on
    to the JSON parser.
    """
    with codecs.open(filename, 'r', 'utf8') as source:
        text = source.read()
        return json.loads(text, **json_options)
def largest_elements(iterable, key, n=1):
    """
    Return a list of large elements of the ``iterable``
    (according to ``key`` function).

    ``n`` is a number of top element values to consider; when n==1
    (default) only largest elements are returned; when n==2 - elements
    with one of the top-2 values, etc.

    >>> s = [-4, 3, 5, 7, 4, -7]
    >>> largest_elements(s, abs)
    [7, -7]
    >>> largest_elements(s, abs, 2)
    [5, 7, -7]
    >>> largest_elements(s, abs, 3)
    [-4, 5, 7, 4, -7]

    Elements keep their original relative order. ``key`` is called
    exactly once per element, and ``iterable`` is traversed once, so
    one-shot iterators are fine. Key values must be hashable.
    """
    # Compute every key once and remember it next to its element, instead
    # of evaluating ``key`` both for ranking and again for filtering.
    keyed = [(key(el), el) for el in iterable]
    distinct_keys = set(k for k, _ in keyed)
    top_keys = set(heapq.nlargest(n, distinct_keys))
    return [el for k, el in keyed if k in top_keys]
def word_splits(word, min_reminder=3, max_prefix_length=5):
    """
    Return all splits of a word as a list of ``(prefix, remainder)``
    tuples (taking in account min_reminder and max_prefix_length).

    The prefix is at least 1 and at most ``max_prefix_length`` characters
    long, and the remainder is at least ``min_reminder`` characters long.
    An empty list is returned when no split satisfies both limits.
    """
    longest_prefix = min(len(word) - min_reminder, max_prefix_length)
    return [
        (word[:cut], word[cut:])
        for cut in range(1, longest_prefix + 1)
    ]
def kwargs_repr(kwargs=None, dont_show_value=None):
    """
    Format a dict of keyword arguments as a ``name=value`` list, sorted
    by name. Values of the names listed in ``dont_show_value`` are
    replaced with ``<...>``.

    >>> kwargs_repr(dict(foo="123", a=5, x=8))
    "a=5, foo='123', x=8"
    >>> kwargs_repr(dict(foo="123", a=5, x=8), dont_show_value=['foo'])
    'a=5, foo=<...>, x=8'
    >>> kwargs_repr()
    ''
    """
    hidden_names = frozenset(dont_show_value or ())
    parts = []
    for name, value in sorted((kwargs or {}).items()):
        shown = "<...>" if name in hidden_names else repr(value)
        parts.append("%s=%s" % (name, shown))
    return ", ".join(parts)
def with_progress(iterable, desc=None, total=None, leave=True):
    """
    Return an iterator which prints the iteration progress using
    tqdm package. Return iterable intact if tqdm is not available.

    ``desc``, ``total`` and ``leave`` are passed to ``tqdm``. When
    ``total`` is None, ``len(iterable)`` is used, or 0 if the length
    can't be determined.
    """
    try:
        from tqdm import tqdm
    except ImportError:
        # No tqdm installed: hand the input back without any wrapping.
        return iterable

    # workarounds for tqdm bugs
    def _tracked(items, label, count, keep):
        if count is None:
            try:
                count = len(items)
            except Exception:
                count = 0
        for item in tqdm(items, desc=label, total=count, leave=keep):
            yield item
        if keep:
            print("")

    return _tracked(iterable, desc, total, leave)