Navegación de contexto

source: modelado_topicos/utils/freeling.py @ 5b95488

preprocesamientov1.0

Last change on this file since 5b95488 was 5b95488, checked in by rboet <rboet@…>, 9 años ago
agregado directorio see_topics, funcionalidad y nuevo directorio templates
Propiedad mode establecida a `100755`
File size: 8.1 KB

Línea
1	# coding: utf8
2	from subprocess import Popen, PIPE
3	import shlex, os
4	from codecs import open
5	from nltk.corpus import stopwords
6	import math, operator
7
8
def files_to_lower(path_orig,path_dest):
    """Write a lower-cased copy of every file in ``path_orig`` to ``path_dest``.

    Both paths are joined to the file name by plain string concatenation,
    so each must already end with a path separator. Files are read and
    written as UTF-8 and keep their original names.
    """
    for name in os.listdir(path_orig):
        # Context managers guarantee both handles are closed, even on error.
        with open(path_orig+name,'r','utf8') as source:
            file_string = source.read()
        with open(path_dest+name,'w','utf8') as target:
            target.write(file_string.lower())
16
# Exported at import time so that analyzer processes started from this
# module inherit the location of the FreeLing data directory.
os.environ['FREELINGSHARE'] = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling'


def call_freeling(freeling_cmd,file_string):
    """Run ``freeling_cmd`` feeding it ``file_string`` on standard input.

    Parameters:
        freeling_cmd: argument list handed to ``Popen``.
        file_string: text to analyse; it is encoded as UTF-8 before being
            written to the process.

    Returns:
        The raw standard output of the process, or the fixed error message
        string when the process exits with a non-zero status.
    """
    p = Popen(freeling_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    output, _err = p.communicate(file_string.encode('utf8'))

    # The exit status is the failure signal.  The captured stderr text is
    # not: comparing it with an integer never detects an error.
    if p.returncode != 0:
        return "ERROR: FALLÓ EJECUCIÓN DE FREELING"

    return output
26
27
def clean_words(words_fl):
    """Filter the tokens of one document and fold accented vowels.

    Every token is first converted to its ``unicode-escape`` form, so it is
    pure ASCII from then on. A token is kept only when that escaped form

    * is not a Spanish stopword,
    * does not occur as a substring of the punctuation string,
    * is not made only of digits, and
    * is longer than three characters.

    In the kept tokens, the escape sequences of grave- and acute-accented
    vowels are replaced by the plain vowel.

    Parameters:
        words_fl: iterable of tokens supporting ``encode('unicode-escape')``.

    Returns:
        List with the cleaned tokens, in input order.
    """
    # Grave accents first, then acute ones.  The table used to list u'\xfa'
    # twice, which left u'\xf9' (u with grave accent) unhandled.
    accent_table = (
        (u'\xe0', u'a'), (u'\xe8', u'e'), (u'\xec', u'i'), (u'\xf2', u'o'), (u'\xf9', u'u'),
        (u'\xe1', u'a'), (u'\xe9', u'e'), (u'\xed', u'i'), (u'\xf3', u'o'), (u'\xfa', u'u'),
    )
    escaped_accents = [(accented.encode('unicode-escape'), plain)
                       for accented, plain in accent_table]
    # NOTE: an earlier version also chained replace() calls on the literal
    # accented characters.  They could never match, because the token is
    # already pure ASCII here, so they were removed.

    # Load the stopword list once instead of once per token.
    spanish_stopwords = frozenset(stopwords.words('spanish'))
    punctuation = '*+.,?¿!¡":;-=/$@#“”()[]{}'

    cleaned = []
    for word in words_fl:
        escaped = word.encode('unicode-escape')
        if escaped in spanish_stopwords:
            continue
        # Substring test against the punctuation string, as before.
        if escaped in punctuation:
            continue
        if escaped.isdigit() or len(escaped) <= 3:
            continue
        for sequence, plain in escaped_accents:
            escaped = escaped.replace(sequence, plain)
        cleaned.append(escaped)
    return cleaned
31
def is_pos(word,pos_list):
    """Return True when ``word`` starts with any of the prefixes in ``pos_list``."""
    return any(word.startswith(prefix) for prefix in pos_list)
36
def complete_word(words_list,word):
    """Extend ``word`` with the tokens that follow every one of its occurrences.

    When ``word`` appears fewer than two times in ``words_list`` it is
    returned unchanged. Otherwise the tokens after the first occurrence are
    appended, joined with ``'-'``, for as long as every other occurrence is
    followed by the same token at the same distance. The extension stops at
    the first mismatch, when any occurrence runs past the end of the list,
    or when it reaches the second occurrence.
    """
    positions = [pos for pos, token in enumerate(words_list) if token == word]
    if len(positions) < 2:
        return word

    first = positions[0]
    others = positions[1:]
    total = len(words_list)
    phrase = word
    offset = 1
    while True:
        anchor = first + offset
        for other in others:
            # Running off the end at any occurrence ends the extension,
            # exactly like a differing token does.
            if anchor >= total or other + offset >= total:
                return phrase
            if words_list[anchor] != words_list[other + offset]:
                return phrase
        phrase += '-' + words_list[anchor]
        offset += 1
        if positions[1] == first + offset or first + offset == total:
            return phrase
57
def all_complete_words(words_list):
    """Return the distinct completed forms of the tokens in ``words_list``.

    Tokens are decoded from UTF-8 and each one is expanded with
    ``complete_word``. An expansion is skipped when it is a substring of
    the most recently kept expansion. Duplicates are removed through a set,
    so the order of the returned list is not the input order.
    """
    decoded = [raw.decode('utf8') for raw in words_list]
    previous = ''.encode('utf8')
    kept = []
    for token in decoded:
        candidate = complete_word(decoded,token)
        if candidate not in previous:
            previous = candidate
            kept.append(candidate)
    return list(set(kept))
70
71
72
def select_pos(words_fl,pos_list=('V','A','N','R','D','P','C','I','S')):
    """Pick the word forms whose tag starts with one of ``pos_list``.

    ``words_fl`` is split into lines and each line into space-separated
    fields. Field 0 is taken as the word form and field 2 as its tag.
    Lines with fewer than three fields are ignored, and so are forms that
    are Spanish stopwords after UTF-8 decoding.

    Parameters:
        words_fl: analyser output, one token per line.
        pos_list: tag prefixes to keep.

    Returns:
        List of the selected word forms (field 0), in input order.
    """
    # Load the stopword list once instead of once per line.
    spanish_stopwords = frozenset(stopwords.words('spanish'))
    selected = []
    for line in words_fl.split('\n'):
        fields = line.split(' ')
        if len(fields) < 3:
            # Blank or malformed line: nothing to select.
            continue
        form, tag = fields[0], fields[2]
        if form.decode('utf8') in spanish_stopwords:
            continue
        if is_pos(tag,pos_list):
            # Keeps the original word form; fields[1] would be the lemma.
            selected.append(form)
    return selected
85
86
def preprocess(corpus_path,do_fl=True):
    """Pre-process every file found in ``corpus_path``.

    For each file the analyser output is obtained, either by running
    FreeLing now or by reusing a stored result. Adjectives, adverbs, verbs
    and nouns are then selected, completed with ``all_complete_words`` and
    cleaned with ``clean_words``.

    Parameters:
        corpus_path: directory with the input files; it must end with a
            path separator because file names are appended to it directly.
        do_fl: when true, run FreeLing on each file and store its output in
            ``corpus_path + '../freeling/'``; when false, read the
            previously stored output from that directory instead.

    Returns:
        A tuple ``(words_by_file, vocabulary)``: a dict mapping each file
        name to its list of cleaned words, and the set of all cleaned words.
    """
    # shlex.split takes only the command string here; its second positional
    # parameter is the ``comments`` flag, not a delimiter.
    freeling_cmd = shlex.split('/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/bin/analyzer -f /home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling/config/es.cfg')
    freeling_corpus_path = corpus_path + '../freeling/'
    ret_val = dict()
    corpus_words = []
    for file_name in os.listdir(corpus_path):
        if do_fl:
            # Lemmatisation with FreeLing; the raw output is kept on disk so
            # later runs can pass do_fl=False.
            with open(corpus_path+file_name,'r','utf8') as source:
                file_string = source.read()
            words_fl = call_freeling(freeling_cmd,file_string)
            with open(freeling_corpus_path+file_name,'w','utf8') as fl_file:
                fl_file.write(words_fl.decode('utf-8'))
        else:
            with open(freeling_corpus_path+file_name,'r') as fl_file:
                words_fl = fl_file.read()

        # Tag prefixes: 'V' verbs, 'A' adjectives, 'N' nouns, 'R' adverbs,
        # 'D' determiners, 'P' pronouns, 'C' conjunctions,
        # 'I' interjections, 'S' prepositions.
        words_fl = select_pos(words_fl=words_fl,pos_list=['A','R','V','N'])

        # Remove stopwords and unwanted characters.
        words_pp = all_complete_words(words_fl)
        words_pp = clean_words(words_pp)
        ret_val[file_name] = words_pp
        corpus_words += words_pp

    return ret_val,set(corpus_words)
130
def idf(file_words_pp,corpus_words):
    """Compute a smoothed inverse document frequency for each word.

    For every word ``w`` in ``corpus_words`` the score is
    ``log(num_docs / (1 + df))``, where ``num_docs`` is the number of
    entries in ``file_words_pp`` and ``df`` is the number of entries whose
    word collection contains ``w``.

    Returns:
        Dict mapping each word to its score.
    """
    total_docs = float(len(file_words_pp))
    scores = {}
    for term in corpus_words:
        doc_freq = sum(1 for _name, doc_words in file_words_pp.items()
                       if term in doc_words)
        scores[term] = math.log(total_docs/float(1+doc_freq))
    return scores
140
141
142	if __name__ == '__main__':
143	"""
144	path_orig = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/orig/'
145	path_dest = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/lower/'
146	files_to_lower(path_orig,path_dest)
147	"""
148	corpus_path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/'
149	lower_corpus_path = corpus_path + 'lower/'
150	pp_corpus_path = corpus_path + 'pp/'
151
152	file_words_pp,corpus_words = preprocess(lower_corpus_path,do_fl=False)
153	exclude_words = ['descripcion','justificacion','construccion','desarrollo','comunidad','comunal','proyecto','prueblo','desarrollar','mismo','nacional','pueblo','sistema']
154	exclude_words = [w.encode('utf8') for w in exclude_words]
155	#vocab_idf = idf(file_words_pp,corpus_words)
156	#print sorted(vocab_idf.items(),key=operator.itemgetter(1), reverse=True)
157	excluded = open(corpus_path+'excluded.txt','w','utf8')
158	added_files = []
159	repeated_count = 0
160	flag = False
161	for file,words_pp in file_words_pp.items():
162	# Excluir documentos repetidos
163	for aux_words_pp in added_files:
164	if words_pp == aux_words_pp:
165	repeated_count += 1
166	print "Repetido: " + file
167	flag = True
168	break
169	if flag:
170	flag = False
171	continue
172
173	#coef = float(len(set(words_pp)))/float(len(words_pp))
174	#print coef, len(words_pp), file
175	#if (coef <= 0.5) or len(words_pp) <= 150: continue
176	if len(words_pp) <= 50: continue
177	# Guardar archivo
178	file_pp = open(pp_corpus_path+file,'w','utf8')
179	added_files.append(words_pp)
180	for w in words_pp:
181	#condition = vocab_idf[w]
182	#if condition >= 2.0 and condition <= 6.1 and not '_' in w:
183	#if condition >= 2.0 and not '_' in w:
184
185	if w not in exclude_words and not '_' in w:
186
187	#try:
188	# file_pp.write(w.encode('utf8') + ' ')
189	#except UnicodeDecodeError:
190	file_pp.write(w + ' ')
191	else:
192	try:
193	#excluded.write(w.encode('utf8') + ' ' + str(condition) + ' (' + file + ')\n')
194	excluded.write(w.encode('utf8') + ' (' + file + ')\n')
195	except UnicodeDecodeError:
196	#excluded.write(w + ' ' + str(condition) + ' (' + file + ')\n')
197	excluded.write(w + ' (' + file + ')\n')
198	file_pp.close()
199	excluded.close()
200
201	print "Documentos repetidos: ", repeated_count
202	print "Palabras en el vocabulario: ", len(corpus_words)

Nota: Vea TracBrowser para ayuda de uso del navegador del repositorio.

Descargar en otros formatos: