1 | # coding: utf8 |
---|
2 | from subprocess import Popen, PIPE |
---|
3 | import shlex, os |
---|
4 | from codecs import open |
---|
5 | from nltk.corpus import stopwords |
---|
6 | import math |
---|
7 | |
---|
8 | |
---|
def files_to_lower(path_orig, path_dest):
    """Copy every file in path_orig to path_dest, lower-casing its text.

    path_orig / path_dest: directory paths ending in a separator (they
    are concatenated directly with each bare filename).
    """
    for name in os.listdir(path_orig):
        # `with` guarantees both handles are closed; the original leaked
        # the read handle (open(...).read()) and shadowed builtin `file`.
        with open(path_orig + name, 'r', 'utf8') as src:
            content = src.read()
        with open(path_dest + name, 'w', 'utf8') as dst:
            dst.write(content.lower())
16 | |
---|
# Exported into this process's environment so the Freeling analyzer spawned
# by call_freeling() inherits it — presumably it tells Freeling where its
# shared data (dictionaries, configs) lives; confirm against Freeling docs.
os.environ['FREELINGSHARE'] = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling'
def call_freeling(freeling_cmd, file_string):
    """Run the Freeling analyzer over *file_string* and return its lemmas.

    freeling_cmd: argv list for the analyzer subprocess.
    file_string:  unicode text, sent UTF-8-encoded on the process's stdin.
    Returns a list of lemmas (second token of each output line) whose
    surface form (first token) is not a Spanish stopword, or an error
    string when the analyzer process fails.
    """
    p = Popen(freeling_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    output, err = p.communicate(file_string.encode('utf8'))

    # BUG FIX: the original tested `err < 0`, comparing the stderr *text*
    # with an int — always False in Python 2, so the error branch was dead.
    # The process exit status is what actually signals failure.
    # (Error string reconstructed from mojibake in the source file.)
    if p.returncode != 0:
        return "ERROR: FALLÓ EJECUCIÓN DE FREELING"

    # Hoisted out of the loop: stopwords.words() re-reads the NLTK corpus
    # on every call, and set membership is O(1) versus O(n) on the list.
    spanish_stopwords = set(stopwords.words('spanish'))

    lemmas = []
    for line in output.split('\n'):
        tokens = line.split(' ')
        # tokens[0] is the surface form, tokens[1] the lemma; lines with no
        # second token (blank lines, sentence markers) are skipped — this
        # replaces the original's try/except IndexError per line.
        if tokens[0] not in spanish_stopwords and len(tokens) > 1:
            lemmas.append(tokens[1])
    return lemmas
33 | |
---|
# Accented -> plain vowel table used to normalize lemmas.
# NOTE(review): this span was mojibake-damaged in the source; reconstructed
# as the standard Spanish accent-stripping table — confirm against original.
_ACCENT_MAP = [
    (u'á', u'a'), (u'é', u'e'), (u'í', u'i'), (u'ó', u'o'), (u'ú', u'u'),
    (u'à', u'a'), (u'è', u'e'), (u'ì', u'i'), (u'ò', u'o'), (u'ù', u'u'),
]

# Tokens that occur as a substring of this string are discarded — this is
# how the original filtered the punctuation tokens Freeling emits.
# NOTE(review): two characters here were unreadable mojibake; assumed to be
# curly quotes — confirm against the original file.
_PUNCT = u'*+.,?¿!¡":;-=/$@#“”()[]{}'


def clean_words(words_fl):
    """Filter and normalize a list of UTF-8 byte-string lemmas.

    Drops Spanish stopwords, punctuation tokens, pure digits and words of
    at most 3 bytes, then strips accents from the survivors.
    Returns a list of unicode strings.
    """
    # One stopword set for the whole call instead of one NLTK corpus read
    # per word, as the original single-expression version did.
    spanish_stopwords = set(stopwords.words('spanish'))
    cleaned = []
    for w in words_fl:
        u = w.decode('utf8')
        # `u in _PUNCT` is *substring* membership: it drops one-character
        # punctuation tokens (and any word that happens to be a substring
        # of _PUNCT), mirroring the original test exactly.
        # `len(w) > 3` is measured on the encoded bytes, as before.
        if u in spanish_stopwords or u in _PUNCT or u.isdigit() or len(w) <= 3:
            continue
        for accented, plain in _ACCENT_MAP:
            u = u.replace(accented, plain)
        cleaned.append(u)
    return cleaned
36 | |
---|
37 | def preprocess(corpus_path): |
---|
38 | freeling_cmd = shlex.split('/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/bin/analyzer -f /home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling/config/es.cfg',' ') |
---|
39 | freeling_corpus_path = corpus_path + 'freeling/' |
---|
40 | ret_val = dict() |
---|
41 | corpus_words = [] |
---|
42 | i = 0 |
---|
43 | for file in os.listdir(corpus_path): |
---|
44 | file_string = open(corpus_path+file,'r','utf8').read() |
---|
45 | # Lematización con FREELING |
---|
46 | words_fl = call_freeling(freeling_cmd,file_string) |
---|
47 | # Quitar STOPWORDS y caracteres no deseados |
---|
48 | words_pp = clean_words(words_fl) |
---|
49 | ret_val[file] = words_pp |
---|
50 | corpus_words += words_pp |
---|
51 | i += 1 |
---|
52 | print "Pre-procesado el archivo: " + file |
---|
53 | print "####################################" |
---|
54 | print words_pp , '(' + str(i) + ')' |
---|
55 | print "####################################" |
---|
56 | print "####################################" |
---|
57 | |
---|
58 | return ret_val,set(corpus_words) |
---|
59 | |
---|
def idf(file_words_pp, corpus_words):
    """Smoothed inverse document frequency for every word in corpus_words.

    file_words_pp: dict mapping filename -> list of preprocessed words.
    corpus_words:  iterable of vocabulary words to score.
    Returns a dict word -> log(num_docs / (1 + doc_count)); the +1
    smoothing avoids division by zero for words present in no document.
    """
    num_docs = len(file_words_pp)
    # One set per document, built once up front: membership drops from
    # O(len(words)) per test — the original re-scanned every word *list*
    # for every vocabulary word — to O(1). Only values are needed, so the
    # keys are not iterated at all.
    doc_word_sets = [set(words) for words in file_words_pp.values()]
    scores = {}
    for w in corpus_words:
        count = sum(1 for doc in doc_word_sets if w in doc)
        scores[w] = math.log(float(num_docs) / float(1 + count))
    return scores
69 | |
---|
70 | |
---|
if __name__ == '__main__':
    # One-off normalization step, kept (disabled) for reference: lower-case
    # the raw corpus into the lower/ directory that preprocess() reads.
    """
    path_orig = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/orig/'
    path_dest = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/lower/'
    files_to_lower(path_orig,path_dest)
    """
    corpus_path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/'
    lower_corpus_path = corpus_path + 'lower/'
    pp_corpus_path = corpus_path + 'pp/'

    # Lemmatize + clean every file, then score the vocabulary by IDF.
    file_words_pp,corpus_words = preprocess(lower_corpus_path)
    vocab_idf = idf(file_words_pp,corpus_words)
    # Words filtered out below are logged here with their IDF and source file.
    excluded = open(corpus_path+'excluded.txt','w','utf8')
    for file,words_pp in file_words_pp.items():
        # Save file
        file_pp = open(pp_corpus_path+file,'w','utf8')
        for w in words_pp:
            condition = vocab_idf[w]
            # Keep only mid-IDF words (neither ubiquitous nor ultra-rare)
            # and drop multiword lemmas (joined with '_').
            # NOTE(review): 1.2 / 6.1 look like empirically tuned cut-offs
            # for this corpus — confirm before reusing elsewhere.
            if condition >= 1.2 and condition <= 6.1 and not '_' in w:
                try:
                    # The codecs writer wants unicode; if the UTF-8 bytes
                    # can't be coerced back (implicit ascii decode fails),
                    # fall back to writing the unicode string directly.
                    file_pp.write(w.encode('utf8') + ' ')
                except UnicodeDecodeError:
                    file_pp.write(w + ' ')
            else:
                try:
                    # Same encode/fallback dance as above for the log line.
                    excluded.write(w.encode('utf8') + ' ' + str(condition) + ' (' + file + ')\n')
                except UnicodeDecodeError:
                    excluded.write(w + ' ' + str(condition) + ' (' + file + ')\n')
        file_pp.close()
    excluded.close()

    print "Palabras en el vocabulario: ", len(corpus_words)