1 | import unittest2 as unittest |
---|
2 | |
---|
3 | from vsm.corpus import add_metadata |
---|
4 | from vsm.extensions.corpusbuilders.util import * |
---|
5 | import numpy as np |
---|
6 | |
---|
7 | class TestCorpusUtil(unittest.TestCase): |
---|
8 | |
---|
9 | def test_strip_punc(self): |
---|
10 | |
---|
11 | tsent = ['foo-foo',',','3','foo','bars','bar_foo','2to1','.'] |
---|
12 | out = strip_punc(tsent) |
---|
13 | self.assertEqual(out, ['foo-foo','3','foo','bars','bar_foo','2to1']) |
---|
14 | |
---|
15 | |
---|
16 | def test_rem_num(self): |
---|
17 | |
---|
18 | tsent = ['foo-foo',',','3','foo','bars','2-parts','2-to-1','3words','.'] |
---|
19 | out = rem_num(tsent) |
---|
20 | self.assertEqual(out, ['foo-foo',',','3','foo','bars','2-parts','3words','.']) |
---|
21 | |
---|
22 | def test_rehyph(self): |
---|
23 | |
---|
24 | sent = 'foo foo 3 foo--bars barfoo -- 2to1.' |
---|
25 | out = rehyph(sent) |
---|
26 | self.assertEqual(out, 'foo foo 3 foo - bars barfoo - 2to1.') |
---|
27 | |
---|
28 | def test_add_metadata(self): |
---|
29 | |
---|
30 | from vsm.corpus.util.corpusbuilders import random_corpus |
---|
31 | |
---|
32 | c = random_corpus(1000, 50, 0, 20, context_type='sentence', metadata=True) |
---|
33 | n = c.view_metadata('sentence').size |
---|
34 | meta = ['m_{0}'.format(i) for i in xrange(n)] |
---|
35 | new_c = add_metadata(c, 'sentence', 'new_meta', meta) |
---|
36 | |
---|
37 | self.assertEqual(new_c.view_metadata('sentence')['new_meta'].tolist(), meta) |
---|
38 | |
---|
39 | |
---|
40 | def test_apply_stoplist(self): |
---|
41 | |
---|
42 | from vsm.corpus.util.corpusbuilders import random_corpus, corpus_fromlist |
---|
43 | |
---|
44 | c = random_corpus(1000, 50, 0, 20, context_type='sentence', metadata=True) |
---|
45 | new_c = apply_stoplist(c, nltk_stop=False, add_stop=['0','1'], freq=0) |
---|
46 | |
---|
47 | li = [[],['he','said'],['he','said','bar'],['bar','ate'],['I','foo']] |
---|
48 | wc = corpus_fromlist(li, context_type='sentence') |
---|
49 | new_wc = apply_stoplist(wc, nltk_stop=True, freq=1) |
---|
50 | |
---|
51 | self.assertTrue('0' in c.words) |
---|
52 | self.assertTrue('1' in c.words) |
---|
53 | self.assertFalse('0' in new_c.words) |
---|
54 | self.assertFalse('1' in new_c.words) |
---|
55 | |
---|
56 | self.assertTrue('said' in new_wc.words) |
---|
57 | self.assertTrue('bar' in new_wc.words) |
---|
58 | self.assertFalse('he' in new_wc.words) |
---|
59 | self.assertFalse('foo' in new_wc.words) |
---|
60 | self.assertFalse('ate' in new_wc.words) |
---|
61 | |
---|
62 | |
---|
63 | def test_filter_by_suffix(self): |
---|
64 | |
---|
65 | li = ['a.txt', 'b.json', 'c.txt'] |
---|
66 | filtered = filter_by_suffix(li, ['.txt']) |
---|
67 | filtered1 = filter_by_suffix(li, ['.json']) |
---|
68 | filtered2 = filter_by_suffix(li, ['.csv']) |
---|
69 | |
---|
70 | self.assertEqual(filtered, ['b.json']) |
---|
71 | self.assertEqual(filtered1, ['a.txt','c.txt']) |
---|
72 | self.assertEqual(filtered2, li) |
---|
73 | |
---|
74 | |
---|
# Gather every test method of TestCorpusUtil into one suite and run it with
# verbose output. These statements are at module level, so they execute
# whenever this file is run or imported.
suite = unittest.TestLoader().loadTestsFromTestCase(TestCorpusUtil)
runner = unittest.TextTestRunner(verbosity=2)
runner.run(suite)