import os
from tempfile import NamedTemporaryFile

import numpy as np
import unittest2 as unittest

from vsm.extensions.corpusbuilders import *
from vsm.extensions.corpusbuilders.corpusbuilders import file_tokenize, coll_tokenize, dir_tokenize, corpus_fromlist


class TestCorpusbuilders(unittest.TestCase):
    """Tests for the corpus builders and tokenizers in
    ``vsm.extensions.corpusbuilders``.
    """

    def _assert_all_equal(self, actual, expected):
        """Assert that ``actual == expected`` holds for every element.

        ``np.all`` is used rather than the ``.all()`` method so that a
        comparison which collapses to a plain ``bool`` is reported as a
        test failure instead of raising ``AttributeError``.
        """
        self.assertTrue(np.all(actual == expected),
                        '%r != %r' % (actual, expected))

    @staticmethod
    def _write_temp_file(text):
        """Write ``text`` to a new named temporary file and return its path.

        The file is created with ``delete=False``; the caller is
        responsible for removing it.
        """
        with NamedTemporaryFile(delete=False) as tmp:
            # The default mode is binary, so write bytes. For the ASCII
            # fixtures used in this class the bytes on disk are unchanged.
            tmp.write(text.encode('utf-8'))
        return tmp.name

    def test_empty_corpus(self):
        """``empty_corpus`` yields no words and one 'document' context type."""
        c = empty_corpus()

        self._assert_all_equal(np.array([]), c.corpus)
        self.assertEqual(['document'], c.context_types)
        self._assert_all_equal(np.array([]), c.view_contexts('document'))

    def test_corpus_fromlist(self):
        """``corpus_fromlist`` records only the non-empty documents.

        The expected labels keep the position of each document in the
        input list ('sentence_1', 'sentence_3').
        """
        docs = [[], ['Not', 'an', 'empty', 'document'], [],
                ['Another', 'non-empty', 'document'], []]

        c = corpus_fromlist(docs, context_type='sentence')

        self.assertEqual(c.context_types, ['sentence'])
        self._assert_all_equal(c.context_data[0]['idx'], [4, 7])
        self._assert_all_equal(c.context_data[0]['sentence_label'],
                               ['sentence_1', 'sentence_3'])

    def test_toy_corpus(self):
        """``toy_corpus`` builds a truthy corpus from a string or a file,
        with various stop-word options.

        Returns the corpus built from the temporary file.
        """
        keats = ('She dwells with Beauty - Beauty that must die;\n\n'
                 'And Joy, whose hand is ever at his lips\n\n'
                 'Bidding adieu; and aching Pleasure nigh,\n\n'
                 'Turning to poison while the bee-mouth sips:\n\n'
                 'Ay, in the very temple of Delight\n\n'
                 'Veil\'d Melancholy has her sovran shrine,\n\n'
                 'Though seen of none save him whose strenuous tongue\n\n'
                 'Can burst Joy\'s grape against his palate fine;\n\n'
                 'His soul shall taste the sadness of her might,\n\n'
                 'And be among her cloudy trophies hung.')

        self.assertTrue(toy_corpus(keats))
        self.assertTrue(toy_corpus(keats, nltk_stop=True))
        self.assertTrue(toy_corpus(keats, stop_freq=1))
        self.assertTrue(toy_corpus(keats, add_stop=['and', 'with']))
        self.assertTrue(toy_corpus(keats, nltk_stop=True,
                                   stop_freq=1, add_stop=['ay']))

        path = self._write_temp_file(keats)
        try:
            c = toy_corpus(path, is_filename=True,
                           nltk_stop=True, add_stop=['ay'])
            self.assertTrue(c)
        finally:
            # Remove the file even when building or asserting fails.
            os.remove(path)

        return c

    def test_file_tokenize(self):
        """``file_tokenize`` splits text into words, paragraphs and sentences."""
        text = 'foo foo foo\n\nfoo foo. Foo bar. Foo bar. foo\n\nfoo'

        words, context_data = file_tokenize(text)

        self.assertEqual(len(words), 11)
        self.assertEqual(len(context_data['paragraph']), 3)
        self.assertEqual(len(context_data['sentence']), 6)

        self._assert_all_equal(context_data['paragraph']['idx'],
                               [3, 10, 11])
        self._assert_all_equal(context_data['paragraph']['paragraph_label'],
                               ['0', '1', '2'])
        self._assert_all_equal(context_data['sentence']['idx'],
                               [3, 5, 7, 9, 10, 11])
        self._assert_all_equal(context_data['sentence']['paragraph_label'],
                               ['0', '1', '1', '1', '1', '2'])
        self._assert_all_equal(context_data['sentence']['sentence_label'],
                               ['0', '1', '2', '3', '4', '5'])

    def test_file_corpus(self):
        """``file_corpus`` builds a truthy corpus from a file on disk.

        Returns the corpus built from the temporary file.
        """
        text = 'foo foo foo\n\nfoo foo. Foo bar. Foo bar. foo\n\nfoo'

        path = self._write_temp_file(text)
        try:
            c = file_corpus(path)
            self.assertTrue(c)
        finally:
            # Remove the file even when building or asserting fails.
            os.remove(path)

        return c

    # TODO: tests for dir_corpus, coll_corpus
    def test_dir_tokenize(self):
        """``dir_tokenize`` tokenizes a list of chunks (one of them empty)
        into article, paragraph and sentence contexts.
        """
        chunks = ['foo foo foo\n\nfoo foo',
                  'Foo bar. Foo bar.',
                  '',
                  'foo\n\nfoo']

        labels = [str(i) for i in range(len(chunks))]
        words, context_data = dir_tokenize(chunks, labels)

        self.assertEqual(len(words), 11)
        self.assertEqual(len(context_data['article']), 4)
        self.assertEqual(len(context_data['paragraph']), 6)
        self.assertEqual(len(context_data['sentence']), 6)

        self._assert_all_equal(context_data['article']['idx'],
                               [5, 9, 9, 11])
        self._assert_all_equal(context_data['article']['article_label'],
                               ['0', '1', '2', '3'])
        self._assert_all_equal(context_data['paragraph']['idx'],
                               [3, 5, 9, 9, 10, 11])
        self._assert_all_equal(context_data['paragraph']['article_label'],
                               ['0', '0', '1', '2', '3', '3'])
        self._assert_all_equal(context_data['paragraph']['paragraph_label'],
                               ['0', '1', '2', '3', '4', '5'])
        self._assert_all_equal(context_data['sentence']['idx'],
                               [3, 5, 7, 9, 10, 11])
        self._assert_all_equal(context_data['sentence']['article_label'],
                               ['0', '0', '1', '1', '3', '3'])
        self._assert_all_equal(context_data['sentence']['paragraph_label'],
                               ['0', '1', '2', '2', '4', '5'])
        self._assert_all_equal(context_data['sentence']['sentence_label'],
                               ['0', '1', '2', '3', '4', '5'])

    def test_coll_tokenize(self):
        """``coll_tokenize`` tokenizes books of ``(text, file)`` pages into
        book, page and sentence contexts.
        """
        books = [[('foo foo foo.\n\nfoo foo', '1'),
                  ('Foo bar. Foo bar.', '2')],
                 [('', '3'),
                  ('foo.\n\nfoo', '4')]]

        book_names = [str(i) for i in range(len(books))]
        words, context_data = coll_tokenize(books, book_names)

        self.assertEqual(len(words), 11)
        self.assertEqual(len(context_data['book']), 2)
        self.assertEqual(len(context_data['page']), 4)
        self.assertEqual(len(context_data['sentence']), 6)

        self._assert_all_equal(context_data['book']['idx'], [9, 11])
        self._assert_all_equal(context_data['book']['book_label'],
                               ['0', '1'])
        self._assert_all_equal(context_data['page']['idx'], [5, 9, 9, 11])
        self._assert_all_equal(context_data['page']['page_label'],
                               ['0', '1', '2', '3'])
        self._assert_all_equal(context_data['page']['book_label'],
                               ['0', '0', '1', '1'])
        self._assert_all_equal(context_data['sentence']['idx'],
                               [3, 5, 7, 9, 10, 11])
        self._assert_all_equal(context_data['sentence']['sentence_label'],
                               ['0', '1', '2', '3', '4', '5'])
        self._assert_all_equal(context_data['sentence']['page_label'],
                               ['0', '0', '1', '1', '3', '3'])
        self._assert_all_equal(context_data['sentence']['book_label'],
                               ['0', '0', '0', '0', '1', '1'])
        self._assert_all_equal(context_data['page']['file'],
                               ['1', '2', '3', '4'])
        self._assert_all_equal(context_data['sentence']['file'],
                               ['1', '1', '2', '2', '4', '4'])


# Collect every test defined on TestCorpusbuilders and run them with
# verbose (per-test) output.
# NOTE(review): this executes unconditionally, including on import;
# consider an ``if __name__ == '__main__':`` guard.
suite = unittest.TestLoader().loadTestsFromTestCase(TestCorpusbuilders)
runner = unittest.TextTestRunner(verbosity=2)
runner.run(suite)