1 | """ |
---|
2 | Functions for splitting lists and arrays |
---|
3 | """ |
---|
4 | |
---|
5 | |
---|
6 | import numpy as np |
---|
7 | |
---|
8 | |
---|
9 | __all__ = ['split_corpus', 'mp_split_ls', 'split_documents'] |
---|
10 | |
---|
11 | |
---|
12 | |
---|
def split_corpus(arr, indices):
    """
    Splits the given array by the indices into list of sub-arrays.

    :param arr: An array to be split. Must be a numpy array (its
        `size` and `dtype` attributes are used).
    :type arr: array
    :param indices: 1-dimensional sequence of integers that indicates
        where the array is split.
    :type indices: array, list or tuple

    :returns: A list of sub-arrays split at the indices. If `indices`
        is empty, `arr` itself is returned unchanged (not wrapped in a
        list).

    **Examples**

    >>> arr = np.arange(8)
    >>> indices = np.array([2,4,7])
    >>> split_corpus(arr, indices)
    [array([0, 1]), array([2, 3]), array([4, 5, 6]), array([7])]
    """
    # Historical behaviour kept for callers: nothing to split on means
    # the input is handed back as-is.
    if len(indices) == 0:
        return arr

    # Accept any integer sequence (list, tuple, array) so that the
    # element-wise comparison below is always available.
    indices = np.asarray(indices)

    out = np.split(arr, indices)

    # np.split always emits one more piece after the last index. When an
    # index reaches the end of `arr` that final piece (empty for sorted
    # indices) is dropped so it is not reported as a sub-array.
    if (indices >= arr.size).any():
        out = out[:-1]

    # Replace every empty piece with a fresh 1-D empty array of the
    # input's dtype.
    for i, piece in enumerate(out):
        if piece.size == 0:
            out[i] = np.array([], dtype=arr.dtype)

    return out
---|
48 | |
---|
49 | |
---|
50 | |
---|
def mp_split_ls(ls, n):
    """
    Split list into a list of at most `n` arrays of near-equal length.

    :param ls: List to be split.
    :type ls: list

    :param n: Number of splits.
    :type n: int

    :returns: List of arrays. Its length is `n`, or `len(ls)` when the
        list has fewer than `n` items, so no array in the result is
        empty. An empty `ls` yields an empty list.

    :raises ValueError: If `ls` is non-empty and `n` is not positive
        (raised by `np.array_split`).

    **Examples**

    >>> ls = [1,5,6,8,2,8]
    >>> mp_split_ls(ls, 4)
    [array([1, 5]), array([6, 8]), array([2]), array([8])]
    """
    # np.array_split rejects a section count of 0, which is what
    # min(len(ls), n) evaluates to for an empty input.
    if len(ls) == 0:
        return []

    # Cap the number of sections at the number of items so that every
    # returned array holds at least one element.
    return np.array_split(ls, min(len(ls), n))
---|
69 | |
---|
70 | |
---|
def split_documents(corpus, indices, n_partitions):
    """
    Groups document spans into partitions that follow an even split of
    the corpus.

    Each entry of `indices` is treated as the end offset of a document;
    a document starts where the previous one ends (the first starts at
    0). The corpus is divided into `n_partitions` near-equal chunks and
    every document is assigned to the first chunk whose end offset is
    greater than or equal to the document's end offset.

    :param corpus: The sequence the documents refer to. Only its
        length along the first axis is used.
    :type corpus: array
    :param indices: 1-dimensional sequence of integers giving the end
        offset of each document. NOTE(review): presumably sorted in
        ascending order, as `np.searchsorted` requires -- confirm
        against callers.
    :type indices: array
    :param n_partitions: Number of partitions to produce.
    :type n_partitions: int

    :returns: A list of `n_partitions` structured arrays (dtype
        'i8, i8') whose records are the `(start, end)` offsets of the
        documents in that partition. Partitions may be empty.

    **Examples**

    >>> parts = split_documents(np.arange(10), [3, 5, 10], 2)
    >>> [p.tolist() for p in parts]
    [[(0, 3), (3, 5)], [(5, 10)]]
    """
    # Pair every document end with the preceding end (or 0) to obtain
    # (start, end) spans. zip stops at the shorter sequence, so an empty
    # `indices` simply yields no spans.
    ends = list(indices)
    starts = [0] + ends[:-1]
    docs = np.array(list(zip(starts, ends)), dtype='i8, i8')

    # Cumulative end offsets of an even split of the corpus.
    corpus_chunks = np.array_split(corpus, n_partitions)
    chunk_indices = np.cumsum([len(chunk) for chunk in corpus_chunks])

    # side='right' counts the documents whose end offset is <= each
    # chunk boundary, i.e. the position in `docs` where the next
    # partition begins.
    doc_indices = np.searchsorted(indices, chunk_indices, side='right')

    # The last boundary only marks the end of the final partition, so
    # it is not used as a split point.
    doc_partitions = np.split(docs, doc_indices[:-1])

    return doc_partitions
---|