1 | # coding: utf8 |
---|
2 | from subprocess import Popen, PIPE |
---|
3 | import shlex, os |
---|
4 | from codecs import open |
---|
5 | from nltk.corpus import stopwords |
---|
6 | import math, operator |
---|
7 | |
---|
8 | |
---|
def files_to_lower(path_orig, path_dest):
    """Lower-case every file in *path_orig*, writing results to *path_dest*.

    Both paths must end with a path separator; destination files keep the
    original file names.  Files are read and written as UTF-8 (codecs.open).

    :param path_orig: source directory (trailing slash expected).
    :param path_dest: destination directory (trailing slash expected).
    """
    for name in os.listdir(path_orig):
        # Context managers close the handles; the original leaked both the
        # reader (open(...).read()) and relied on an explicit close for the
        # writer.
        with open(path_orig + name, 'r', 'utf8') as src:
            contents = src.read()
        with open(path_dest + name, 'w', 'utf8') as dst:
            dst.write(contents.lower())
---|
16 | |
---|
# FreeLing needs FREELINGSHARE to locate its data files; point it at the
# project-local share directory before any analyzer process is spawned.
os.environ['FREELINGSHARE'] = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling'
---|
def call_freeling(freeling_cmd, file_string):
    """Run the FreeLing analyzer over *file_string* and return its stdout.

    :param freeling_cmd: argv list for the analyzer (as built by shlex.split).
    :param file_string: unicode text to analyze; encoded to UTF-8 on the pipe.
    :returns: raw analyzer stdout (bytes), or an error message string when
        the process exits with a non-zero status.
    """
    p = Popen(freeling_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    output, err = p.communicate(file_string.encode('utf8'))

    # BUG FIX: the original tested `err < 0`, comparing the captured stderr
    # *text* against an integer -- that never detects a failure.  The exit
    # status is what signals an error.  (The message was also mojibake:
    # "FALLÃ" -> "FALLÓ".)
    if p.returncode != 0:
        return "ERROR: FALLÓ EJECUCIÓN DE FREELING"

    return output
---|
26 | |
---|
def clean_words(w):
    """Replace accented Spanish vowels in *w* with their plain equivalents.

    NOTE(review): this relies on Python 2 str/unicode coercion.  The word is
    first encoded with 'unicode-escape' (so e.g. u'\xe9' becomes the literal
    bytes '\\xe9'), then those escape sequences for the accented vowels
    (grave and acute, lowercase) are replaced.  The trailing replaces use
    mojibake literals (e.g. u'á') -- presumably to also catch words that were
    double-encoded upstream; confirm against the corpus before touching them.
    Characters with no replacement remain as escape sequences in the result.
    """
    w = w.encode('unicode-escape')
    return w.replace(u'\xe0'.encode('unicode-escape'),u'a').replace(u'\xe8'.encode('unicode-escape'),u'e').replace(u'\xec'.encode('unicode-escape'),u'i').replace(u'\xf2'.encode('unicode-escape'),u'o').replace(u'\xfa'.encode('unicode-escape'),u'u').replace(u'\xe1'.encode('unicode-escape'),u'a').replace(u'\xe9'.encode('unicode-escape'),u'e').replace(u'\xed'.encode('unicode-escape'),u'i').replace(u'\xf3'.encode('unicode-escape'),u'o').replace(u'\xfa'.encode('unicode-escape'),u'u').replace(u'á',u'a').replace(u'é',u'e').replace(u'Ã',u'i').replace(u'ó',u'o').replace(u'ú',u'u').replace(u'à ',u'a').replace(u'Ú',u'e').replace(u'ì',u'i').replace(u'ò',u'o').replace(u'ù',u'u')
---|
30 | |
---|
def is_pos(word, pos_list):
    """Return True when *word* starts with any of the prefixes in *pos_list*.

    Used to match FreeLing part-of-speech tags (e.g. tag 'VMIP3S0' matches
    the prefix 'V').
    """
    return any(word.startswith(prefix) for prefix in pos_list)
---|
35 | |
---|
def complete_word(words_list,word):
    """Expand *word* into a hyphen-joined phrase shared by its occurrences.

    If *word* appears several times in *words_list*, the words following the
    first occurrence are appended ('-'-joined) for as long as every other
    occurrence is followed by the same words.  A word that appears zero or
    one times is returned unchanged.
    """
    # Indices of every occurrence of the word.
    indexes = [i for i,j in enumerate(words_list) if j == word]
    if len(indexes) == 1: return word
    if len(indexes) == 0: return word

    index = 1
    complete_word = word
    i1 = indexes[0]
    while True:
        for i2 in indexes[1:]:
            try:
                # Stop as soon as any occurrence diverges from the first one.
                if words_list[i1+index] != words_list[i2+index]:
                    return complete_word
            except IndexError:
                # Some occurrence runs past the end of the list.
                return complete_word
        complete_word += '-' + words_list[i1+index]
        index += 1
        # Stop when the expansion reaches the second occurrence or the end
        # of the word list.
        if indexes[1] == i1+index or i1+index == len(words_list):
            return complete_word
---|
55 | |
---|
def all_complete_words(words_list):
    """Return the unique expanded phrases for every word in *words_list*.

    Each word is expanded via complete_word(); a candidate already contained
    in the previously accepted phrase is skipped so that the members of one
    run are not re-emitted individually.
    """
    words_list = [token.decode('utf8') for token in words_list]
    collected = []
    previous = ''.encode('utf8')
    for token in words_list:
        candidate = complete_word(words_list, token)
        # Skip candidates subsumed by the phrase accepted just before.
        if candidate in previous:
            continue
        previous = candidate
        collected.append(previous)
    return list(set(collected))
---|
68 | |
---|
69 | |
---|
70 | |
---|
def select_pos(path,words_fl,pos_list=('V','A','N','R','D','P','C','I','S')):
    """Filter FreeLing output down to words with an accepted POS tag.

    FreeLing emits one analysis per line: "word lemma tag ...".  Every
    accent-cleaned word is written (space-joined) to *path*; the returned
    list contains only the words that also pass the stopword/POS/length
    filters.

    :param path: destination file for the full cleaned word list.
    :param words_fl: raw FreeLing output (one analysis per line).
    :param pos_list: accepted POS-tag prefixes.  (A tuple now -- the original
        used a mutable default list, a classic Python pitfall.)
    :returns: list of cleaned words that pass all filters.
    """
    output_list = []
    all_words_list = []
    for item in words_fl.split('\n'):
        try:
            w = item.split(' ')[0]
            cleaned_word = clean_words(w.decode('utf8'))
            all_words_list += [cleaned_word]
            # Keep the word when it is not a Spanish stopword, its POS tag
            # is accepted, it is not punctuation, not a number and is longer
            # than 3 characters.  (The cleaned word is selected here; the
            # lemma would be item.split(' ')[1], the raw word [0].)
            if w.decode('utf8') not in stopwords.words('spanish') and is_pos(item.split(' ')[2],pos_list) and w not in '*+.,?¿!¡":;-=/$@#ââ()[]{}' and not w.isdigit() and len(w) > 3:
                output_list += [cleaned_word]
        except IndexError:
            # Blank or truncated FreeLing lines lack the three fields; skip.
            pass
    # Persist the full (unfiltered) cleaned word list; `with` guarantees the
    # handle is closed even on error.
    with open(path,'w','utf8') as na_file:
        na_file.write(' '.join(all_words_list))
    return output_list
---|
91 | |
---|
def preprocess(corpus_path,do_fl=True):
    """Preprocess every file of the corpus under *corpus_path*.

    Pipeline per file: (optionally) lemmatize with FreeLing and cache the raw
    output under ../freeling/, filter by part of speech via select_pos()
    (writing the accent-cleaned words under ../noaccent/), then collapse
    repeated word runs with all_complete_words().

    :param corpus_path: directory with the lower-cased corpus files
        (trailing slash expected).
    :param do_fl: when True run FreeLing; when False reuse the cached
        FreeLing output from a previous run.
    :returns: (dict mapping file name -> preprocessed word list,
               set of all words seen across the corpus).
    """
    freeling_corpus_path = corpus_path + '../freeling/'
    no_accent_path = corpus_path + '../noaccent/'
    ret_val = dict()
    corpus_words = []
    # Loop-invariant: build the analyzer command once.  NOTE: the original
    # called shlex.split(cmd, ' ') -- the second positional argument is the
    # *comments* flag, which ' ' accidentally enabled.
    freeling_cmd = shlex.split('/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/bin/analyzer -f /home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling/config/es.cfg')
    for file in os.listdir(corpus_path):
        with open(corpus_path+file,'r','utf8') as src:
            file_string = src.read()
        if do_fl:
            # Lemmatize with FreeLing and cache its raw output.
            words_fl = call_freeling(freeling_cmd,file_string)
            with open(freeling_corpus_path+file,'w','utf8') as fl_file:
                fl_file.write(words_fl.decode('utf-8'))
        else:
            with open(freeling_corpus_path+file,'r') as fl_file:
                words_fl = fl_file.read()
        ####################################
        # FreeLing POS tag prefixes:
        #   'V' verbs        'A' adjectives   'N' nouns
        #   'R' adverbs      'D' determiners  'P' pronouns
        #   'C' conjunctions 'I' interjections 'S' prepositions
        ####################################
        words_fl = select_pos(no_accent_path+file,words_fl=words_fl,pos_list=['A','R','V','N'])

        # Remove stopwords/undesired characters and merge repeated runs.
        words_pp = all_complete_words(words_fl)
        ret_val[file] = words_pp
        corpus_words += words_pp

    return ret_val,set(corpus_words)
---|
132 | |
---|
def idf(file_words_pp,corpus_words):
    """Compute inverse document frequency for every corpus word.

    :param file_words_pp: dict mapping file name -> its word list.
    :param corpus_words: iterable of words to score.
    :returns: dict mapping word -> log(num_docs / (1 + doc_frequency)).
    """
    num_docs = len(file_words_pp)
    scores = {}
    for term in corpus_words:
        # Number of documents containing the term; +1 smooths zero counts.
        doc_freq = sum(1 for words in file_words_pp.values() if term in words)
        scores[term] = math.log(float(num_docs) / float(1 + doc_freq))
    return scores
---|
142 | |
---|
143 | |
---|
if __name__ == '__main__':
    # One-off driver: preprocess the lower-cased corpus, drop duplicated or
    # too-short documents, and write the filtered word lists under pp/.
    # The disabled block below lower-cased the original corpus once.
    """
    path_orig = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/orig/'
    path_dest = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/lower/'
    files_to_lower(path_orig,path_dest)
    """
    #corpus_path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/'
    corpus_path = '/home/cenditel/Interpretacion/demo-data/corpus_propuestas/'
    lower_corpus_path = corpus_path + 'lower/'
    pp_corpus_path = corpus_path + 'pp/'

    # do_fl=False: reuse the cached FreeLing output from an earlier run.
    file_words_pp,corpus_words = preprocess(lower_corpus_path,do_fl=False)
    # Hand-picked overly common corpus words to drop from the output.
    exclude_words = ['descripcion','justificacion','construccion','desarrollo','comunidad','comunal','proyecto','prueblo','desarrollar','mismo','nacional','pueblo','sistema']
    exclude_words = [w.encode('utf8') for w in exclude_words]
    excluded = open(corpus_path+'excluded.txt','w','utf8')
    added_files = []
    repeated_count = 0
    flag = False
    for file,words_pp in file_words_pp.items():
        # Skip documents whose word list duplicates an already-added file.
        for aux_words_pp in added_files:
            if words_pp == aux_words_pp:
                repeated_count += 1
                print "Repetido: " + file
                flag = True
                break
        if flag:
            flag = False
            continue

        # Skip documents that are too short to be informative.
        if len(words_pp) <= 50: continue
        # Persist the filtered word list for this document.
        file_pp = open(pp_corpus_path+file,'w','utf8')
        added_files.append(words_pp)
        for w in words_pp:
            # Excluded words (and multi-word '_' tokens) are logged instead.
            if w not in exclude_words and not '_' in w:
                file_pp.write(w + ' ')
            else:
                try:
                    excluded.write(w.encode('utf8') + ' (' + file + ')\n')
                except UnicodeDecodeError:
                    excluded.write(w + ' (' + file + ')\n')
        file_pp.close()
    excluded.close()
---|
188 | |
---|
189 | |
---|