1 | import nltk |
---|
2 | from vsm.corpus import Corpus |
---|
3 | from vsm.extensions.corpusbuilders.util import * |
---|
4 | |
---|
5 | |
---|
6 | |
---|
def apply_stoplist_len(corp, nltk_stop=True, add_stop=None,
                       word_len=3, freq=0):
    """
    Apply a stoplist that also removes short words from `corp`.

    The stoplist is the union of the NLTK English stopwords (when
    `nltk_stop` is truthy), the words in `add_stop` (when given) and
    every word of `corp.words` whose length is <= `word_len`. The
    length filter is a rough way of getting rid of bibliographic
    information and common foreign language particles.

    corp : object exposing `words` and
        `apply_stoplist(stoplist=..., freq=...)`.
    nltk_stop : boolean. Include NLTK's 'english' stopwords.
    add_stop : iterable of strings, optional. Extra stopwords.
    word_len : int. Words this long or shorter are stopped.
    freq : passed through unchanged to `corp.apply_stoplist`.

    Returns whatever `corp.apply_stoplist` returns.
    """
    if nltk_stop:
        blocked = set(nltk.corpus.stopwords.words('english'))
    else:
        blocked = set()

    if add_stop:
        blocked.update(add_stop)

    # Short tokens are collected from the corpus vocabulary itself.
    blocked.update(word for word in corp.words if len(word) <= word_len)

    return corp.apply_stoplist(stoplist=blocked, freq=freq)
---|
27 | |
---|
28 | |
---|
def apply_stoplist_nltk(corp, nltk_stop=None, add_stop=None,
                        word_len=0, freq=0):
    """
    Apply a stoplist built from NLTK stopword lists for one or more
    languages, optional extra words and optional short words.

    corp : object exposing `words` and
        `apply_stoplist(stoplist=..., freq=...)`.
    nltk_stop : string, iterable of strings, or None. Language name(s)
        handed to `nltk.corpus.stopwords.words`. A single string is
        treated as one language. None or an empty iterable adds no
        NLTK stopwords.
    add_stop : iterable of strings, optional. Extra stopwords.
    word_len : int. When > 0, every word in `corp.words` with length
        <= `word_len` is added to the stoplist.
    freq : passed through unchanged to `corp.apply_stoplist`.

    Returns whatever `corp.apply_stoplist` returns.
    """
    if nltk_stop is None:
        nltk_stop = ()
    elif isinstance(nltk_stop, (str, type(u''))):
        # A bare string would otherwise be iterated character by
        # character, with every character failing the lookup.
        nltk_stop = [nltk_stop]

    stoplist = set()
    for lang in nltk_stop:
        try:
            stoplist.update(nltk.corpus.stopwords.words(lang))
        except Exception:
            # Best effort: report the language that failed and carry on
            # with the remaining ones.
            message = "{0} language not found in nltk.corpus.stopwords"
            print(message.format(lang))

    if add_stop:
        stoplist.update(add_stop)

    if word_len > 0:
        stoplist.update(w for w in corp.words if len(w) <= word_len)

    return corp.apply_stoplist(stoplist=stoplist, freq=freq)
---|
56 | |
---|
57 | |
---|
def snowball_stem(corp, language='english'):
    """
    Builds a dictionary with words as keys and stems as the values.

    corp : object exposing `words`, an iterable of word strings.
    language : string. 'english', 'german', or 'french'.

    Each word is decoded from UTF-8 when it is a byte string, stripped
    of surrounding whitespace, stemmed with the matching NLTK snowball
    stemmer and passed through `unidecode`. The dictionary keys are the
    original, unmodified words.

    Raises ValueError when `language` is not one of the supported
    names.
    """
    # Class names inside nltk.stem.snowball, keyed by language. The
    # language is validated before touching nltk so that a typo fails
    # immediately with a clear message.
    stemmer_names = {
        'english': 'EnglishStemmer',
        'german': 'GermanStemmer',
        'french': 'FrenchStemmer',
    }
    stemmer_name = stemmer_names.get(language)
    if stemmer_name is None:
        raise ValueError(
            "unsupported language {0!r}; expected one of {1}".format(
                language, sorted(stemmer_names)))
    stemmer = getattr(nltk.stem.snowball, stemmer_name)()

    stemdict = {}
    for w in corp.words:
        # Only byte strings need decoding; text is used as is.
        text = w.decode('utf-8') if isinstance(w, bytes) else w
        # NOTE(review): `unidecode` is not imported explicitly in this
        # file; presumably it arrives via the star import -- confirm.
        stemdict[w] = unidecode(stemmer.stem(text.strip()))

    return stemdict
---|
77 | |
---|
78 | |
---|
def porter_stem(corp):
    """
    Map every word in `corp.words` to its Porter stem.

    Returns a dictionary with words as keys and the result of
    `PorterStemmer().stem(word)` as values.
    """
    from porterstemmer import PorterStemmer

    stemmer = PorterStemmer()
    return {word: stemmer.stem(word) for word in corp.words}
---|
91 | |
---|
92 | |
---|
def stem_int(corp, stemdict):
    """
    Returns a dictionary to replace corp.words_int

    corp : object exposing `words` (iterable of words) and `words_int`
        (mapping from word to integer).
    stemdict : mapping from every word in `corp.words` to its stem.

    For each word, in the order of `corp.words`:
    - a word equal to its stem keeps its own integer;
    - a word whose stem is itself in `corp.words` gets the stem's
      integer;
    - otherwise, consecutive words sharing the same stem all get the
      integer of the first word of that run. Words with the same stem
      that are not adjacent in `corp.words` are not merged.
    """
    # Hash-based membership replaces a linear scan of corp.words on
    # every iteration.
    vocab = set(corp.words)
    words_int = corp.words_int

    wordint = {}
    # State of the current run of words whose shared stem is not in the
    # vocabulary. None means "no open run"; it cannot collide with an
    # integer id and replaces the scan over wordint.values().
    run_stem = None
    run_int = None

    for word in corp.words:
        stem = stemdict[word]

        if word == stem:
            wordint[word] = words_int[word]
            run_int = None
        elif stem in vocab:  # replace it with stem
            wordint[word] = words_int[stem]
            run_int = None
        elif run_int is not None and stem == run_stem:
            # same stem as the run opened earlier: share its integer
            wordint[word] = run_int
        else:  # new stem, open a new run
            run_int = words_int[word]
            run_stem = stem
            wordint[word] = run_int

    return wordint
---|
124 | |
---|
125 | |
---|
def word_stem(corp, stemdict):
    """
    Returns a dictionary with integer maps in corp.words_int
    as keys and integers for stems as values.

    corp : object exposing `words` (iterable of words) and `words_int`
        (mapping from word to integer).
    stemdict : mapping from every word in `corp.words` to its stem.

    For each word, in the order of `corp.words`, the key is the word's
    own integer and the value is:
    - the same integer when the word equals its stem;
    - the stem's integer when the stem is itself in `corp.words`;
    - otherwise the integer of the first word in the current run of
      consecutive words sharing that stem. Words with the same stem
      that are not adjacent in `corp.words` are not merged.
    """
    # Hash-based membership replaces a linear scan of corp.words on
    # every iteration.
    vocab = set(corp.words)
    words_int = corp.words_int

    intint = {}
    # State of the current run of words whose shared stem is not in the
    # vocabulary. None means "no open run"; it cannot collide with an
    # integer id and replaces the scan over intint.values().
    run_stem = None
    run_int = None

    for word in corp.words:
        stem = stemdict[word]
        orig = words_int[word]

        if word == stem:  # same as corp.words_int
            intint[orig] = orig
            run_int = None
        elif stem in vocab:  # replace it with stem
            intint[orig] = words_int[stem]
            run_int = None
        elif run_int is not None and stem == run_stem:
            # same stem as the run opened earlier: share its integer
            intint[orig] = run_int
        else:  # new stem, open a new run
            run_int = orig
            run_stem = stem
            intint[orig] = run_int

    return intint
---|