1 | import unittest2 as unittest |
---|
2 | import numpy as np |
---|
3 | import os |
---|
4 | from vsm.corpus import * |
---|
5 | from vsm.split import split_corpus |
---|
6 | from tempfile import NamedTemporaryFile |
---|
7 | |
---|
8 | |
---|
class TestCorpus(unittest.TestCase):
    """Tests for ``BaseCorpus``, ``Corpus``, ``align_corpora`` and
    ``split_corpus``, run against two small fixtures built in ``setUp``.
    """

    def setUp(self):
        """Build the fixtures shared by every test.

        ``self.bc`` is a ``BaseCorpus`` over 20 integer tokens with eight
        'document' contexts; 'doc3' and 'doc4' share the end index 11.
        ``self.corpus`` is a ``Corpus`` over six word tokens with three
        'sentence' contexts.
        """
        # Builtin ``int`` is what the ``np.int`` alias pointed to; the alias
        # itself was deprecated in NumPy 1.20 and removed in 1.24.
        corpus = np.array([0, 3, 2, 1, 0, 3, 0, 2, 3, 0, 2, 3, 1, 2, 0, 3,
                           2, 1, 2, 2], dtype=int)
        context_data = np.array(
            [(3, 'doc0'), (5, 'doc1'), (7, 'doc2'), (11, 'doc3'),
             (11, 'doc4'), (15, 'doc5'), (18, 'doc6'), (20, 'doc7')],
            dtype=[('idx', '<i8'), ('doc', '|S4')])

        self.bc = BaseCorpus(corpus, context_data=[context_data],
                             context_types=['document'])

        text = ['I', 'came', 'I', 'saw', 'I', 'conquered']
        ctx_data = [np.array([(2, 'Veni'), (4, 'Vidi'), (6, 'Vici')],
                             dtype=[('idx', '<i8'), ('sent', '|S6')])]

        self.corpus = Corpus(text, context_data=ctx_data,
                             context_types=['sentence'])

    def _assert_array_lists_equal(self, actual, expected):
        """Assert two sequences of arrays have equal length and equal items.

        The explicit length check matters: comparing only the first
        ``len(actual)`` items would let a too-short result pass unnoticed.
        """
        self.assertEqual(len(actual), len(expected))
        for got, want in zip(actual, expected):
            np.testing.assert_array_equal(got, want)

    #TODO: Move this test to vsm.split
    def test_SplitCorpus(self):
        """Split the word-id array at odd and at even indices."""
        odd = split_corpus(self.corpus.corpus, [1, 3, 5])
        even = split_corpus(self.corpus.corpus, [2, 4, 6])

        odd_expected = [np.array([0]), np.array([1, 0]),
                        np.array([3, 0]), np.array([2])]
        even_expected = [np.array([0, 1]), np.array([0, 3]),
                         np.array([0, 2])]

        self._assert_array_lists_equal(odd, odd_expected)
        self._assert_array_lists_equal(even, even_expected)

    def test_align_corpora(self):
        """Align the text fixture with empty, identical and overlapping
        corpora, in both argument orders where relevant."""
        # Fixture first, empty corpus second.
        out = align_corpora(self.corpus, Corpus([]))
        self.assertEqual(len(out.corpus), 0)
        self.assertEqual(len(out.words), 4)
        self.assertEqual(len(out.words_int), 4)

        # Empty corpus first, fixture second.
        out = align_corpora(Corpus([], remove_empty=False), self.corpus)
        self.assertEqual(len(out.corpus), 0)
        self.assertEqual(len(out.words), 0)
        self.assertEqual(len(out.words_int), 0)

        # Aligning the fixture with itself must reproduce it.
        out = align_corpora(self.corpus, self.corpus)
        self.assertEqual(len(out.corpus), len(self.corpus.corpus))
        np.testing.assert_array_equal(out.corpus, self.corpus.corpus)
        self.assertEqual(len(out.words), len(self.corpus.words))
        np.testing.assert_array_equal(out.words, self.corpus.words)
        self.assertEqual(out.words_int, self.corpus.words_int)

        # A corpus that shares three of its four words with the fixture.
        new_corp = Corpus(
            ['came', 'saw', 'and', 'conquered'],
            context_data=[np.array([(4, )], dtype=[('idx', '<i8')])])
        out = align_corpora(self.corpus, new_corp)
        self.assertEqual(len(out.corpus), 3)
        for w in out.corpus:
            self.assertEqual(out.words[w], self.corpus.words[w])
        self.assertEqual(len(out.words), 4)
        np.testing.assert_array_equal(out.words, self.corpus.words)
        self.assertEqual(out.words_int, self.corpus.words_int)

    def test_ValidateIndices(self):
        """Every context's 'idx' column must pass ``_validate_indices``."""
        for t in self.bc.context_data:
            self.assertTrue(self.bc._validate_indices(t['idx']))

    def test_RemoveEmpty(self):
        """After ``remove_empty`` the 'doc4' context must be gone."""
        self.bc.remove_empty()
        # Same dtype as the fixture so the comparison is field-for-field.
        expected = np.array(
            [(3, 'doc0'), (5, 'doc1'), (7, 'doc2'), (11, 'doc3'),
             (15, 'doc5'), (18, 'doc6'), (20, 'doc7')],
            dtype=[('idx', '<i8'), ('doc', '|S4')])
        # Element-wise comparison; asserting the truthiness of an
        # ``np.equal`` result would not verify the contents.
        np.testing.assert_array_equal(self.bc.context_data[0], expected)

    def test_ViewMetadata(self):
        """``view_metadata('document')`` must equal the stored context data."""
        meta = self.bc.view_metadata('document')
        np.testing.assert_array_equal(self.bc.context_data[0], meta)

    def test_bc_ViewContexts(self):
        """``view_contexts('document')`` must yield the seven expected
        token arrays of the integer fixture."""
        ctx = self.bc.view_contexts('document')
        expected = [np.array([0, 3, 2]), np.array([1, 0]), np.array([3, 0]),
                    np.array([2, 3, 0, 2]), np.array([3, 1, 2, 0]),
                    np.array([3, 2, 1]), np.array([2, 2])]
        self._assert_array_lists_equal(ctx, expected)

    def test_MetaInt(self):
        """Look up the position of the context whose 'doc' is 'doc3'."""
        i = self.bc.meta_int('document', {'doc': 'doc3'})
        self.assertEqual(3, i)

    def test_GetMetadatum(self):
        """Fetch the 'doc' field of the context whose 'doc' is 'doc0'."""
        s = self.bc.get_metadatum('document', {'doc': 'doc0'}, 'doc')
        self.assertEqual('doc0', s)

    def test_SetWordsInt(self):
        """Check the word-to-integer mapping built for the text fixture."""
        d = {'I': 0, 'came': 1, 'conquered': 2, 'saw': 3}
        self.assertEqual(self.corpus.words_int, d)

    def test_ViewContexts(self):
        """``view_contexts('sentence', as_strings=True)`` must yield the
        three word pairs of the text fixture."""
        expected = [np.array(['I', 'came']), np.array(['I', 'saw']),
                    np.array(['I', 'conquered'])]
        ctx = self.corpus.view_contexts('sentence', as_strings=True)
        self._assert_array_lists_equal(ctx, expected)

    def test_SaveLoad(self):
        """Save the text fixture to a temporary ``.npz`` file, load it back
        and compare every attribute with the original."""
        # Created outside ``try`` so ``finally`` never sees an unbound name
        # if the temporary file cannot be created.
        tmp = NamedTemporaryFile(delete=False, suffix='.npz')
        try:
            # Only the path is needed; release the handle before the file
            # is reopened by name.
            tmp.close()
            self.corpus.save(tmp.name)
            c_reloaded = Corpus.load(tmp.name)

            np.testing.assert_array_equal(self.corpus.corpus,
                                          c_reloaded.corpus)
            np.testing.assert_array_equal(self.corpus.words,
                                          c_reloaded.words)
            self.assertEqual(self.corpus.words_int, c_reloaded.words_int)
            self.assertEqual(self.corpus.context_types,
                             c_reloaded.context_types)
            self._assert_array_lists_equal(c_reloaded.context_data,
                                           self.corpus.context_data)
        finally:
            os.remove(tmp.name)
---|
138 | |
---|
139 | |
---|
# Collect every test method of TestCorpus into one suite and run it with
# verbosity 2.  NOTE(review): this runs whenever the module is executed
# or imported -- confirm that import-time execution is intended.
_loader = unittest.TestLoader()
suite = _loader.loadTestsFromTestCase(TestCorpus)
_runner = unittest.TextTestRunner(verbosity=2)
_runner.run(suite)
---|