[0ff122b] | 1 | import unittest2 as unittest |
---|
| 2 | |
---|
| 3 | from vsm.corpus import add_metadata |
---|
| 4 | from vsm.extensions.corpusbuilders.util import * |
---|
| 5 | import numpy as np |
---|
| 6 | |
---|
| 7 | class TestCorpusUtil(unittest.TestCase): |
---|
| 8 | |
---|
| 9 | def test_strip_punc(self): |
---|
| 10 | |
---|
| 11 | tsent = ['foo-foo',',','3','foo','bars','bar_foo','2to1','.'] |
---|
| 12 | out = strip_punc(tsent) |
---|
| 13 | self.assertEqual(out, ['foo-foo','3','foo','bars','bar_foo','2to1']) |
---|
| 14 | |
---|
| 15 | |
---|
| 16 | def test_rem_num(self): |
---|
| 17 | |
---|
| 18 | tsent = ['foo-foo',',','3','foo','bars','2-parts','2-to-1','3words','.'] |
---|
| 19 | out = rem_num(tsent) |
---|
| 20 | self.assertEqual(out, ['foo-foo',',','3','foo','bars','2-parts','3words','.']) |
---|
| 21 | |
---|
| 22 | def test_rehyph(self): |
---|
| 23 | |
---|
| 24 | sent = 'foo foo 3 foo--bars barfoo -- 2to1.' |
---|
| 25 | out = rehyph(sent) |
---|
| 26 | self.assertEqual(out, 'foo foo 3 foo - bars barfoo - 2to1.') |
---|
| 27 | |
---|
| 28 | def test_add_metadata(self): |
---|
| 29 | |
---|
| 30 | from vsm.corpus.util.corpusbuilders import random_corpus |
---|
| 31 | |
---|
| 32 | c = random_corpus(1000, 50, 0, 20, context_type='sentence', metadata=True) |
---|
| 33 | n = c.view_metadata('sentence').size |
---|
| 34 | meta = ['m_{0}'.format(i) for i in xrange(n)] |
---|
| 35 | new_c = add_metadata(c, 'sentence', 'new_meta', meta) |
---|
| 36 | |
---|
| 37 | self.assertEqual(new_c.view_metadata('sentence')['new_meta'].tolist(), meta) |
---|
| 38 | |
---|
| 39 | |
---|
| 40 | def test_apply_stoplist(self): |
---|
| 41 | |
---|
| 42 | from vsm.corpus.util.corpusbuilders import random_corpus, corpus_fromlist |
---|
| 43 | |
---|
| 44 | c = random_corpus(1000, 50, 0, 20, context_type='sentence', metadata=True) |
---|
| 45 | new_c = apply_stoplist(c, nltk_stop=False, add_stop=['0','1'], freq=0) |
---|
| 46 | |
---|
| 47 | li = [[],['he','said'],['he','said','bar'],['bar','ate'],['I','foo']] |
---|
| 48 | wc = corpus_fromlist(li, context_type='sentence') |
---|
| 49 | new_wc = apply_stoplist(wc, nltk_stop=True, freq=1) |
---|
| 50 | |
---|
| 51 | self.assertTrue('0' in c.words) |
---|
| 52 | self.assertTrue('1' in c.words) |
---|
| 53 | self.assertFalse('0' in new_c.words) |
---|
| 54 | self.assertFalse('1' in new_c.words) |
---|
| 55 | |
---|
| 56 | self.assertTrue('said' in new_wc.words) |
---|
| 57 | self.assertTrue('bar' in new_wc.words) |
---|
| 58 | self.assertFalse('he' in new_wc.words) |
---|
| 59 | self.assertFalse('foo' in new_wc.words) |
---|
| 60 | self.assertFalse('ate' in new_wc.words) |
---|
| 61 | |
---|
| 62 | |
---|
| 63 | def test_filter_by_suffix(self): |
---|
| 64 | |
---|
| 65 | li = ['a.txt', 'b.json', 'c.txt'] |
---|
| 66 | filtered = filter_by_suffix(li, ['.txt']) |
---|
| 67 | filtered1 = filter_by_suffix(li, ['.json']) |
---|
| 68 | filtered2 = filter_by_suffix(li, ['.csv']) |
---|
| 69 | |
---|
| 70 | self.assertEqual(filtered, ['b.json']) |
---|
| 71 | self.assertEqual(filtered1, ['a.txt','c.txt']) |
---|
| 72 | self.assertEqual(filtered2, li) |
---|
| 73 | |
---|
| 74 | |
---|
# Gather every test method of TestCorpusUtil into `suite` and run it
# immediately with per-test (verbosity=2) reporting.
# NOTE(review): this executes whenever the module is imported, not only when
# it is run as a script -- confirm that is intended.
suite = unittest.defaultTestLoader.loadTestsFromTestCase(TestCorpusUtil)
_runner = unittest.TextTestRunner(verbosity=2)
_runner.run(suite)
---|