1 | import unittest2 as unittest |
---|
2 | |
---|
3 | from vsm.extensions.corpusbuilders import * |
---|
4 | from vsm.extensions.corpusbuilders.corpusbuilders import file_tokenize, coll_tokenize, dir_tokenize, corpus_fromlist |
---|
5 | import numpy as np |
---|
6 | |
---|
7 | |
---|
class TestCorpusbuilders(unittest.TestCase):
    """Tests for vsm.extensions.corpusbuilders: corpus constructors and tokenizers.

    Each tokenizer test checks three things against small hand-built fixtures:
    the total token count, the per-context-type break indices (``idx``), and
    the per-context label arrays.
    """

    def test_empty_corpus(self):
        """empty_corpus() yields no tokens and a single 'document' context type."""
        c = empty_corpus()
        self.assertTrue((np.array([]) == c.corpus).all())
        self.assertTrue(['document'] == c.context_types)
        self.assertTrue((np.array([]) == c.view_contexts('document')).all())

    def test_corpus_fromlist(self):
        """Empty documents are dropped; labels keep the original list positions."""
        l = [[], ['Not', 'an', 'empty', 'document'], [],
             ['Another', 'non-empty', 'document'], []]

        c = corpus_fromlist(l, context_type='sentence')

        self.assertTrue(c.context_types == ['sentence'])
        # Break indices mark the end of each non-empty document (4 and 3 words).
        self.assertTrue((c.context_data[0]['idx'] == [4, 7]).all())
        # Labels are numbered by position in the *input* list, not the output.
        self.assertTrue((c.context_data[0]['sentence_label'] ==
                         ['sentence_1', 'sentence_3']).all())

    def test_toy_corpus(self):
        """toy_corpus builds a corpus from a string or a file, with stoplist options."""
        keats = ('She dwells with Beauty - Beauty that must die;\n\n'
                 'And Joy, whose hand is ever at his lips\n\n'
                 'Bidding adieu; and aching Pleasure nigh,\n\n'
                 'Turning to poison while the bee-mouth sips:\n\n'
                 'Ay, in the very temple of Delight\n\n'
                 'Veil\'d Melancholy has her sovran shrine,\n\n'
                 'Though seen of none save him whose strenuous tongue\n\n'
                 'Can burst Joy\'s grape against his palate fine;\n\n'
                 'His soul shall taste the sadness of her might,\n\n'
                 'And be among her cloudy trophies hung.')

        # Smoke-test each stoplist configuration: a truthy corpus is enough here.
        self.assertTrue(toy_corpus(keats))
        self.assertTrue(toy_corpus(keats, nltk_stop=True))
        self.assertTrue(toy_corpus(keats, stop_freq=1))
        self.assertTrue(toy_corpus(keats, add_stop=['and', 'with']))
        self.assertTrue(toy_corpus(keats, nltk_stop=True,
                                   stop_freq=1, add_stop=['ay']))

        import os
        from tempfile import NamedTemporaryFile as NFT

        # mode='w' so writing a str works on both Python 2 and 3
        # (the default is binary mode on Python 3).
        tmp = NFT(mode='w', delete=False)
        tmp.write(keats)
        tmp.close()

        c = toy_corpus(tmp.name, is_filename=True,
                       nltk_stop=True, add_stop=['ay'])

        self.assertTrue(c)
        os.remove(tmp.name)

    def test_file_tokenize(self):
        """file_tokenize splits a string into paragraph and sentence contexts."""
        text = 'foo foo foo\n\nfoo foo. Foo bar. Foo bar. foo\n\nfoo'

        words, context_data = file_tokenize(text)

        self.assertTrue(len(words) == 11)
        self.assertTrue(len(context_data['paragraph']) == 3)
        self.assertTrue(len(context_data['sentence']) == 6)

        self.assertTrue((context_data['paragraph']['idx'] ==
                         [3, 10, 11]).all())
        self.assertTrue((context_data['paragraph']['paragraph_label'] ==
                         ['0', '1', '2']).all())
        self.assertTrue((context_data['sentence']['idx'] ==
                         [3, 5, 7, 9, 10, 11]).all())
        self.assertTrue((context_data['sentence']['paragraph_label'] ==
                         ['0', '1', '1', '1', '1', '2']).all())
        self.assertTrue((context_data['sentence']['sentence_label'] ==
                         ['0', '1', '2', '3', '4', '5']).all())

    def test_file_corpus(self):
        """file_corpus builds a corpus from a file on disk."""
        text = 'foo foo foo\n\nfoo foo. Foo bar. Foo bar. foo\n\nfoo'

        import os
        from tempfile import NamedTemporaryFile as NFT

        # mode='w' so writing a str works on both Python 2 and 3.
        tmp = NFT(mode='w', delete=False)
        tmp.write(text)
        tmp.close()

        c = file_corpus(tmp.name)

        self.assertTrue(c)
        os.remove(tmp.name)

    # TODO: tests for dir_corpus, coll_corpus
    def test_dir_tokenize(self):
        """dir_tokenize adds an 'article' context per chunk; empty chunks yield
        empty contexts (repeated break indices) but keep their labels."""
        chunks = ['foo foo foo\n\nfoo foo',
                  'Foo bar. Foo bar.',
                  '',
                  'foo\n\nfoo']

        labels = [str(i) for i in range(len(chunks))]
        words, context_data = dir_tokenize(chunks, labels)

        self.assertTrue(len(words) == 11)
        self.assertTrue(len(context_data['article']) == 4)
        self.assertTrue(len(context_data['paragraph']) == 6)
        self.assertTrue(len(context_data['sentence']) == 6)

        # The repeated 9 marks the empty third chunk.
        self.assertTrue((context_data['article']['idx'] == [5, 9, 9, 11]).all())
        self.assertTrue((context_data['article']['article_label'] ==
                         ['0', '1', '2', '3']).all())
        self.assertTrue((context_data['paragraph']['idx'] ==
                         [3, 5, 9, 9, 10, 11]).all())
        self.assertTrue((context_data['paragraph']['article_label'] ==
                         ['0', '0', '1', '2', '3', '3']).all())
        self.assertTrue((context_data['paragraph']['paragraph_label'] ==
                         ['0', '1', '2', '3', '4', '5']).all())
        self.assertTrue((context_data['sentence']['idx'] ==
                         [3, 5, 7, 9, 10, 11]).all())
        self.assertTrue((context_data['sentence']['article_label'] ==
                         ['0', '0', '1', '1', '3', '3']).all())
        self.assertTrue((context_data['sentence']['paragraph_label'] ==
                         ['0', '1', '2', '2', '4', '5']).all())
        self.assertTrue((context_data['sentence']['sentence_label'] ==
                         ['0', '1', '2', '3', '4', '5']).all())

    def test_coll_tokenize(self):
        """coll_tokenize adds book and page contexts; each page records the
        source file name it came from."""
        books = [[('foo foo foo.\n\nfoo foo', '1'),
                  ('Foo bar. Foo bar.', '2')],
                 [('', '3'),
                  ('foo.\n\nfoo', '4')]]

        book_names = [str(i) for i in range(len(books))]
        words, context_data = coll_tokenize(books, book_names)

        self.assertTrue(len(words) == 11)
        self.assertTrue(len(context_data['book']) == 2)
        self.assertTrue(len(context_data['page']) == 4)
        self.assertTrue(len(context_data['sentence']) == 6)
        self.assertTrue((context_data['book']['idx'] == [9, 11]).all())
        self.assertTrue((context_data['book']['book_label'] == ['0', '1']).all())
        # The repeated 9 marks the empty page '3'.
        self.assertTrue((context_data['page']['idx'] == [5, 9, 9, 11]).all())
        self.assertTrue((context_data['page']['page_label'] ==
                         ['0', '1', '2', '3']).all())
        self.assertTrue((context_data['page']['book_label'] ==
                         ['0', '0', '1', '1']).all())
        self.assertTrue((context_data['sentence']['idx'] ==
                         [3, 5, 7, 9, 10, 11]).all())
        self.assertTrue((context_data['sentence']['sentence_label'] ==
                         ['0', '1', '2', '3', '4', '5']).all())
        self.assertTrue((context_data['sentence']['page_label'] ==
                         ['0', '0', '1', '1', '3', '3']).all())
        self.assertTrue((context_data['sentence']['book_label'] ==
                         ['0', '0', '0', '0', '1', '1']).all())
        self.assertTrue((context_data['page']['file'] ==
                         ['1', '2', '3', '4']).all())
        self.assertTrue((context_data['sentence']['file'] ==
                         ['1', '1', '2', '2', '4', '4']).all())
---|
# Keep `suite` importable for callers that aggregate it elsewhere.
suite = unittest.TestLoader().loadTestsFromTestCase(TestCorpusbuilders)

if __name__ == '__main__':
    # Run the suite only when executed as a script, not on import:
    # importing a test module must not trigger a test run as a side effect.
    unittest.TextTestRunner(verbosity=2).run(suite)