Navegación de contexto

source: consulta_publica/vsm/vsm/extensions/corpuscleanup.py @ 32be06f

Ramas: base, constituyente, estudiantes, general, plan_patria, sala

Last change on this file since 32be06f was 0ff122b, checked in by rudmanmrrod <rudman22@…>, 7 años ago
Agregado módulo de gestión de perfiles de procesamiento, incorporado el módulo de visualización de modelado de tópicos
Propiedad mode establecida a `100644`
File size: 4.2 KB

Línea
1	import nltk
2	from vsm.corpus import Corpus
3	from vsm.extensions.corpusbuilders.util import *
4
5
6
def apply_stoplist_len(corp, nltk_stop=True, add_stop=None,
                       word_len=3, freq=0):
    """
    Apply a stoplist to `corp` that also removes short words.

    The stoplist is the union of:

    * the NLTK 'english' stopwords, when `nltk_stop` is true;
    * the items of `add_stop`, when it is given and non-empty;
    * every word in `corp.words` whose length is <= `word_len`.

    The length filter is a rough solution for getting rid of
    bibliographic information and common foreign language particles.

    :param corp: Object exposing `words` (iterable of words) and
        `apply_stoplist(stoplist=..., freq=...)`.
    :param nltk_stop: If true, include NLTK's 'english' stopwords.
    :param add_stop: Optional iterable of additional words to stop.
    :param word_len: Words of this length or shorter are stopped.
    :param freq: Passed through unchanged to `corp.apply_stoplist`.

    :returns: The result of `corp.apply_stoplist`.
    """
    stoplist = set()
    if nltk_stop:
        stoplist.update(nltk.corpus.stopwords.words('english'))
    if add_stop:
        stoplist.update(add_stop)
    stoplist.update(w for w in corp.words if len(w) <= word_len)

    return corp.apply_stoplist(stoplist=stoplist, freq=freq)
27
28
def apply_stoplist_nltk(corp, nltk_stop=None, add_stop=None,
                        word_len=0, freq=0):
    """
    Apply a stoplist built from NLTK stopword lists to `corp`.

    Originally `nltk_stop` was a boolean that filtered 'english'.
    It now names the language(s) to load from
    `nltk.corpus.stopwords`: either a single language string or an
    iterable of language strings. If `nltk_stop` is None or empty,
    no stopwords are added from the NLTK corpus.

    :param corp: Object exposing `words` (iterable of words) and
        `apply_stoplist(stoplist=..., freq=...)`.
    :param nltk_stop: Language name, iterable of language names, or
        None.
    :param add_stop: Optional iterable of additional words to stop.
    :param word_len: When > 0, every word in `corp.words` of this
        length or shorter is also stopped.
    :param freq: Passed through unchanged to `corp.apply_stoplist`.

    :returns: The result of `corp.apply_stoplist`.
    """
    if nltk_stop is None:
        languages = ()
    elif isinstance(nltk_stop, (str, type(u''))):
        # A bare string is one language, not a sequence of characters.
        languages = (nltk_stop,)
    else:
        languages = nltk_stop

    stoplist = set()
    for lang in languages:
        try:
            stoplist.update(nltk.corpus.stopwords.words(lang))
        except Exception:
            # Best effort, as before: report the language that failed
            # and keep going with the remaining ones.
            print("{0} language not found in nltk.corpus"
                  ".stopwords".format(lang))
    if add_stop:
        stoplist.update(add_stop)

    if word_len > 0:
        stoplist.update(w for w in corp.words if len(w) <= word_len)

    return corp.apply_stoplist(stoplist=stoplist, freq=freq)
56
57
def snowball_stem(corp, language='english'):
    """
    Builds a dictionary with words as keys and stems as the values.

    :param corp: Object exposing `words`, the iterable of words to
        stem.
    :param language: Name of a language supported by NLTK's Snowball
        stemmer, e.g. 'english', 'german' or 'french'.

    :returns: Dict mapping each original word to the result of
        `unidecode` applied to its stem.

    :raises ValueError: If NLTK has no Snowball stemmer for
        `language`.
    """
    # SnowballStemmer dispatches on the language name and rejects
    # unknown languages up front, instead of failing later on a
    # placeholder object that has no `stem` method.
    stemmer = nltk.stem.snowball.SnowballStemmer(language)

    stemdict = {}
    for w in corp.words:
        # Only byte strings need decoding; text is used as it is.
        text = w.decode('utf-8') if isinstance(w, bytes) else w
        # NOTE(review): `unidecode` is not imported by name in this
        # module; presumably it arrives through the star import from
        # vsm.extensions.corpusbuilders.util -- confirm.
        stemdict[w] = unidecode(stemmer.stem(text.strip()))

    return stemdict
77
78
def porter_stem(corp):
    """
    Map every word in `corp.words` to its Porter stem.

    :param corp: Object exposing `words`, the iterable of words to
        stem.

    :returns: Dict with the original words as keys and the value of
        `PorterStemmer().stem(word)` as values.
    """
    # Imported when the function is called, not at module import.
    from porterstemmer import PorterStemmer

    stem = PorterStemmer().stem
    return {word: stem(word) for word in corp.words}
91
92
def stem_int(corp, stemdict):
    """
    Returns a dictionary to replace corp.words_int

    Each word in `corp.words` is mapped to an integer as follows:

    * a word that is its own stem keeps its integer;
    * a word whose stem is itself in `corp.words` takes the stem's
      integer;
    * otherwise the word starts, or joins, a run of consecutive words
      sharing the same stem, and every word of the run takes the
      integer of the run's first word.

    Only words adjacent in `corp.words` are merged in the last case,
    so grouping depends on the order of `corp.words`.

    :param corp: Object exposing `words` (iterable of words) and
        `words_int` (mapping word -> integer).
    :param stemdict: Mapping word -> stem with an entry for every
        word in `corp.words`.

    :returns: Dict mapping word -> integer.
    """
    # Built once so each membership test is O(1), not a vocabulary scan.
    vocab = set(corp.words)

    wordint = {}
    run_open = False  # True while a run of an unseen stem is active
    run_stem = None   # stem shared by the words of the active run
    run_int = None    # integer of the first word of the active run
    for k in corp.words:
        stem = stemdict[k]

        if k == stem:
            wordint[k] = corp.words_int[k]
            run_open = False

        elif stem in vocab:  # replace it with stem
            wordint[k] = corp.words_int[stem]
            run_open = False

        elif run_open and stem == run_stem:  # continue the run
            wordint[k] = run_int

        else:  # new stem, new run
            run_int = wordint[k] = corp.words_int[k]
            run_stem = stem
            run_open = True

    return wordint
124
125
def word_stem(corp, stemdict):
    """
    Returns a dictionary with integer maps in corp.words_int
    as keys and integers for stems as values.

    For each word in `corp.words`, the key is the word's own integer
    and the value is chosen as follows:

    * a word that is its own stem maps to its own integer;
    * a word whose stem is itself in `corp.words` maps to the stem's
      integer;
    * otherwise the word starts, or joins, a run of consecutive words
      sharing the same stem, and every word of the run maps to the
      integer of the run's first word.

    Only words adjacent in `corp.words` are merged in the last case,
    so grouping depends on the order of `corp.words`.

    :param corp: Object exposing `words` (iterable of words) and
        `words_int` (mapping word -> integer).
    :param stemdict: Mapping word -> stem with an entry for every
        word in `corp.words`.

    :returns: Dict mapping integer -> integer.
    """
    # Built once so each membership test is O(1), not a vocabulary scan.
    vocab = set(corp.words)

    intint = {}
    run_open = False  # True while a run of an unseen stem is active
    run_stem = None   # stem shared by the words of the active run
    run_int = None    # integer of the first word of the active run
    for k in corp.words:
        stem = stemdict[k]
        orig = corp.words_int[k]

        if k == stem:  # same as corp.words_int
            intint[orig] = corp.words_int[k]
            run_open = False

        elif stem in vocab:  # replace it with stem
            intint[orig] = corp.words_int[stem]
            run_open = False

        elif run_open and stem == run_stem:  # continue the run
            intint[orig] = run_int

        else:  # new stem, new run
            run_int = intint[orig] = corp.words_int[k]
            run_stem = stem
            run_open = True

    return intint

Nota: Vea TracBrowser para ayuda de uso del navegador del repositorio.

Descargar en otros formatos: