1 | import unittest2 as unittest |
---|
2 | |
---|
3 | from vsm.extensions.corpusbuilders import * |
---|
4 | from vsm.extensions.corpusbuilders.corpusbuilders import file_tokenize, coll_tokenize, dir_tokenize, corpus_fromlist |
---|
5 | import numpy as np |
---|
6 | |
---|
7 | |
---|
class TestCorpusbuilders(unittest.TestCase):
    """Tests for vsm.extensions.corpusbuilders: corpus constructors and tokenizers.

    Each tokenizer test checks three things against small hand-built fixtures:
    the total token count, the per-context-type break indices (``idx``), and
    the per-context label arrays.
    """

    def test_empty_corpus(self):
        """empty_corpus() yields no tokens and a single 'document' context type."""
        c = empty_corpus()
        self.assertTrue((np.array([]) == c.corpus).all())
        self.assertTrue(['document'] == c.context_types)
        self.assertTrue((np.array([]) == c.view_contexts('document')).all())

    def test_corpus_fromlist(self):
        """Empty documents are dropped; labels keep the original list positions."""
        l = [[], ['Not', 'an', 'empty', 'document'], [],
             ['Another', 'non-empty', 'document'], []]

        c = corpus_fromlist(l, context_type='sentence')

        self.assertTrue(c.context_types == ['sentence'])
        # Break indices mark the end of each non-empty document (4 and 3 words).
        self.assertTrue((c.context_data[0]['idx'] == [4, 7]).all())
        # Labels are numbered by position in the *input* list, not the output.
        self.assertTrue((c.context_data[0]['sentence_label'] ==
                         ['sentence_1', 'sentence_3']).all())

    def test_toy_corpus(self):
        """toy_corpus builds a corpus from a string or a file, with stoplist options."""
        keats = ('She dwells with Beauty - Beauty that must die;\n\n'
                 'And Joy, whose hand is ever at his lips\n\n'
                 'Bidding adieu; and aching Pleasure nigh,\n\n'
                 'Turning to poison while the bee-mouth sips:\n\n'
                 'Ay, in the very temple of Delight\n\n'
                 'Veil\'d Melancholy has her sovran shrine,\n\n'
                 'Though seen of none save him whose strenuous tongue\n\n'
                 'Can burst Joy\'s grape against his palate fine;\n\n'
                 'His soul shall taste the sadness of her might,\n\n'
                 'And be among her cloudy trophies hung.')

        # Smoke-test each stoplist configuration: a truthy corpus is enough here.
        self.assertTrue(toy_corpus(keats))
        self.assertTrue(toy_corpus(keats, nltk_stop=True))
        self.assertTrue(toy_corpus(keats, stop_freq=1))
        self.assertTrue(toy_corpus(keats, add_stop=['and', 'with']))
        self.assertTrue(toy_corpus(keats, nltk_stop=True,
                                   stop_freq=1, add_stop=['ay']))

        import os
        from tempfile import NamedTemporaryFile as NFT

        # mode='w' so writing a str works on both Python 2 and 3
        # (the default is binary mode on Python 3).
        tmp = NFT(mode='w', delete=False)
        tmp.write(keats)
        tmp.close()

        c = toy_corpus(tmp.name, is_filename=True,
                       nltk_stop=True, add_stop=['ay'])

        self.assertTrue(c)
        os.remove(tmp.name)

    def test_file_tokenize(self):
        """file_tokenize splits a string into paragraph and sentence contexts."""
        text = 'foo foo foo\n\nfoo foo. Foo bar. Foo bar. foo\n\nfoo'

        words, context_data = file_tokenize(text)

        self.assertTrue(len(words) == 11)
        self.assertTrue(len(context_data['paragraph']) == 3)
        self.assertTrue(len(context_data['sentence']) == 6)

        self.assertTrue((context_data['paragraph']['idx'] ==
                         [3, 10, 11]).all())
        self.assertTrue((context_data['paragraph']['paragraph_label'] ==
                         ['0', '1', '2']).all())
        self.assertTrue((context_data['sentence']['idx'] ==
                         [3, 5, 7, 9, 10, 11]).all())
        self.assertTrue((context_data['sentence']['paragraph_label'] ==
                         ['0', '1', '1', '1', '1', '2']).all())
        self.assertTrue((context_data['sentence']['sentence_label'] ==
                         ['0', '1', '2', '3', '4', '5']).all())

    def test_file_corpus(self):
        """file_corpus builds a corpus from a file on disk."""
        text = 'foo foo foo\n\nfoo foo. Foo bar. Foo bar. foo\n\nfoo'

        import os
        from tempfile import NamedTemporaryFile as NFT

        # mode='w' so writing a str works on both Python 2 and 3.
        tmp = NFT(mode='w', delete=False)
        tmp.write(text)
        tmp.close()

        c = file_corpus(tmp.name)

        self.assertTrue(c)
        os.remove(tmp.name)

    # TODO: tests for dir_corpus, coll_corpus
    def test_dir_tokenize(self):
        """dir_tokenize adds an 'article' context per chunk; empty chunks yield
        empty contexts (repeated break indices) but keep their labels."""
        chunks = ['foo foo foo\n\nfoo foo',
                  'Foo bar. Foo bar.',
                  '',
                  'foo\n\nfoo']

        labels = [str(i) for i in range(len(chunks))]
        words, context_data = dir_tokenize(chunks, labels)

        self.assertTrue(len(words) == 11)
        self.assertTrue(len(context_data['article']) == 4)
        self.assertTrue(len(context_data['paragraph']) == 6)
        self.assertTrue(len(context_data['sentence']) == 6)

        # The repeated 9 marks the empty third chunk.
        self.assertTrue((context_data['article']['idx'] == [5, 9, 9, 11]).all())
        self.assertTrue((context_data['article']['article_label'] ==
                         ['0', '1', '2', '3']).all())
        self.assertTrue((context_data['paragraph']['idx'] ==
                         [3, 5, 9, 9, 10, 11]).all())
        self.assertTrue((context_data['paragraph']['article_label'] ==
                         ['0', '0', '1', '2', '3', '3']).all())
        self.assertTrue((context_data['paragraph']['paragraph_label'] ==
                         ['0', '1', '2', '3', '4', '5']).all())
        self.assertTrue((context_data['sentence']['idx'] ==
                         [3, 5, 7, 9, 10, 11]).all())
        self.assertTrue((context_data['sentence']['article_label'] ==
                         ['0', '0', '1', '1', '3', '3']).all())
        self.assertTrue((context_data['sentence']['paragraph_label'] ==
                         ['0', '1', '2', '2', '4', '5']).all())
        self.assertTrue((context_data['sentence']['sentence_label'] ==
                         ['0', '1', '2', '3', '4', '5']).all())

    def test_coll_tokenize(self):
        """coll_tokenize adds book and page contexts; each page records the
        source file name it came from."""
        books = [[('foo foo foo.\n\nfoo foo', '1'),
                  ('Foo bar. Foo bar.', '2')],
                 [('', '3'),
                  ('foo.\n\nfoo', '4')]]

        book_names = [str(i) for i in range(len(books))]
        words, context_data = coll_tokenize(books, book_names)

        self.assertTrue(len(words) == 11)
        self.assertTrue(len(context_data['book']) == 2)
        self.assertTrue(len(context_data['page']) == 4)
        self.assertTrue(len(context_data['sentence']) == 6)
        self.assertTrue((context_data['book']['idx'] == [9, 11]).all())
        self.assertTrue((context_data['book']['book_label'] == ['0', '1']).all())
        # The repeated 9 marks the empty page '3'.
        self.assertTrue((context_data['page']['idx'] == [5, 9, 9, 11]).all())
        self.assertTrue((context_data['page']['page_label'] ==
                         ['0', '1', '2', '3']).all())
        self.assertTrue((context_data['page']['book_label'] ==
                         ['0', '0', '1', '1']).all())
        self.assertTrue((context_data['sentence']['idx'] ==
                         [3, 5, 7, 9, 10, 11]).all())
        self.assertTrue((context_data['sentence']['sentence_label'] ==
                         ['0', '1', '2', '3', '4', '5']).all())
        self.assertTrue((context_data['sentence']['page_label'] ==
                         ['0', '0', '1', '1', '3', '3']).all())
        self.assertTrue((context_data['sentence']['book_label'] ==
                         ['0', '0', '0', '0', '1', '1']).all())
        self.assertTrue((context_data['page']['file'] ==
                         ['1', '2', '3', '4']).all())
        self.assertTrue((context_data['sentence']['file'] ==
                         ['1', '1', '2', '2', '4', '4']).all())
---|
# Keep `suite` importable for callers that aggregate it elsewhere.
suite = unittest.TestLoader().loadTestsFromTestCase(TestCorpusbuilders)

if __name__ == '__main__':
    # Run the suite only when executed as a script, not on import:
    # importing a test module must not trigger a test run as a side effect.
    unittest.TextTestRunner(verbosity=2).run(suite)