# coding: utf8
from subprocess import Popen, PIPE
import shlex, os
from codecs import open
from nltk.corpus import stopwords
import math, operator


def files_to_lower(path_orig,path_dest):
    files = os.listdir(path_orig)
    for file in files:
        file_string = open(path_orig+file,'r','utf8').read()
        f = open(path_dest+file,'w','utf8')
        f.write(file_string.lower())
        f.close()

os.environ['FREELINGSHARE'] = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling'
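# Assumption: FreeLing's analyzer reads FREELINGSHARE to locate its language data,
# so it must be set before call_freeling() spawns the process; the path above is the
# project-local FreeLing install.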
def call_freeling(freeling_cmd,file_string):
    p = Popen(freeling_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    output, err = p.communicate(file_string.encode('utf8'))

    # A non-zero exit code means the analyzer failed
    if p.returncode != 0:
        return "ERROR: FALLÓ EJECUCIÓN DE FREELING"

    return output
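# Usage sketch (illustrative; the real command is built in preprocess() below):
#   cmd = shlex.split('/path/to/freeling/bin/analyzer -f /path/to/es.cfg')
#   tagged = call_freeling(cmd, u'texto de prueba')
# The returned text is FreeLing's tagged output, parsed line by line in select_pos().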

def clean_words(words_fl):
    # Keep only informative tokens: drop Spanish stopwords, punctuation, digits and
    # words of 3 characters or less, then strip accents from the vowels.
    accents = {u'\xe0': u'a', u'\xe1': u'a', u'\xe8': u'e', u'\xe9': u'e',
               u'\xec': u'i', u'\xed': u'i', u'\xf2': u'o', u'\xf3': u'o',
               u'\xf9': u'u', u'\xfa': u'u'}
    words_fl = [w for w in words_fl
                if w not in stopwords.words('spanish')
                and w not in u'*+.,?¿!¡":;-=/$@#“”()[]{}'
                and not w.isdigit() and len(w) > 3]
    ret_val = []
    for w in words_fl:
        w = w.encode('unicode-escape')
        for accented, plain in accents.items():
            w = w.replace(accented.encode('unicode-escape'), plain)
        ret_val.append(w)
    return ret_val
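# Illustrative example (assuming NLTK's Spanish stopword list is installed):
#   clean_words([u'educación', u'de', u'la']) -> [u'educacion']
# ('de' and 'la' are stopwords / too short; the accented vowel is flattened to ASCII).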

def is_pos(word,pos_list):
    for item in pos_list:
        if word.startswith(item): return True
    return False

def complete_word(words_list,word):
    indexes = [i for i,j in enumerate(words_list) if j == word]
    if len(indexes) == 1: return word
    if len(indexes) == 0: return word

    #if len(indexes) == 0: raise Exception("THE WORD IS NOT IN THE DOCUMENT: strange!")
    index = 1
    complete_word = word
    i1 = indexes[0]
    while True:
        for i2 in indexes[1:]:
            try:
                if words_list[i1+index] != words_list[i2+index]:
                    return complete_word
            except IndexError:
                return complete_word
        complete_word += '-' + words_list[i1+index]
        index += 1
        if indexes[1] == i1+index or i1+index == len(words_list):
            return complete_word
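# Worked example (hypothetical token list): complete_word() extends a word with the
# tokens that follow every one of its occurrences identically, e.g.
#   complete_word(['plan','de','desarrollo','plan','de','desarrollo'], 'plan')
#   -> 'plan-de-desarrollo'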

def all_complete_words(words_list):
    words_list = [w.decode('utf8') for w in words_list]
    ret_val = []
    c = ''.encode('utf8')
    for w in words_list:
        c_aux = complete_word(words_list,w)
        if c_aux in c:
            continue

        c = c_aux
        ret_val += [c]
    return list(set(ret_val))
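# Illustrative result for the same hypothetical token list:
#   all_complete_words(['plan','de','desarrollo','plan','de','desarrollo'])
#   -> [u'plan-de-desarrollo']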


def select_pos(words_fl,pos_list=['V','A','N','R','D','P','C','I','S']):
    output_list = []
    for item in words_fl.split('\n'):
        try:
            if item.split(' ')[0].decode('utf8') not in stopwords.words('spanish') and is_pos(item.split(' ')[2],pos_list):
                # Select the lemma
                #output_list += [item.split(' ')[1]]
                # Select the original word
                output_list += [item.split(' ')[0]]
        except IndexError:
            pass
    return output_list
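# Illustrative input line (FreeLing's usual "form lemma tag prob" output; the exact
# tags depend on the configured tagset): "propuestas propuesta NCFP000 1" would add
# 'propuestas' to the result whenever 'N' is in pos_list.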


def preprocess(corpus_path,do_fl=True):
    freeling_cmd = shlex.split('/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/bin/analyzer -f /home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling/config/es.cfg')
    freeling_corpus_path = corpus_path + '../freeling/'
    ret_val = dict()
    corpus_words = []
    i = 0
    for file in os.listdir(corpus_path):
        file_string = open(corpus_path+file,'r','utf8').read()
        if do_fl:
            # Lemmatization with FreeLing
            words_fl = call_freeling(freeling_cmd,file_string)
            fl_file = open(freeling_corpus_path+file,'w','utf8')
            fl_file.write(words_fl.decode('utf-8'))
            fl_file.close()
        else:
            words_fl = open(freeling_corpus_path+file,'r').read()
        ####################################
        ####################################
        #'V', verbs
        #'A', adjectives
        #'N', nouns
        #'R', adverbs
        #'D', determiners
        #'P', pronouns
        #'C', conjunctions
        #'I', interjections
        #'S', prepositions
        words_fl = select_pos(words_fl=words_fl,pos_list=['A','R','V','N'])
        ####################################
        ####################################
        # Remove stopwords and unwanted characters
        words_pp = all_complete_words(words_fl)
        words_pp = clean_words(words_pp)
        ret_val[file] = words_pp
        corpus_words += words_pp
        i += 1
        #print "Pre-procesado el archivo: " + file
        #print "####################################"
        #print words_pp , '(' + str(i) + ')'
        #print "####################################"
        #print "####################################"

    return ret_val,set(corpus_words)
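# Return shape: a dict mapping each corpus file name to its cleaned word list, plus
# the set of all distinct words in the corpus. Usage sketch (path is illustrative):
#   docs, vocab = preprocess('/path/to/corpus/lower/', do_fl=True)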

def idf(file_words_pp,corpus_words):
    idf = {}
    num_docs = len(file_words_pp)
    for w in corpus_words:
        count = 0
        for file,words in file_words_pp.items():
            if w in words: count += 1
        idf[w] = math.log(float(num_docs)/float(1+count))
    return idf
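# Worked example: with 4 documents and a word that appears in only one of them,
# idf = log(4 / (1 + 1)) = log(2) ≈ 0.693; a word present in every document gets
# log(4 / 5) ≈ -0.223, i.e. the +1 smoothing can push very common words slightly negative.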


if __name__ == '__main__':
    """
    path_orig = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/orig/'
    path_dest = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/lower/'
    files_to_lower(path_orig,path_dest)
    """
    corpus_path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/'
    lower_corpus_path = corpus_path + 'lower/'
    pp_corpus_path = corpus_path + 'pp/'

    file_words_pp,corpus_words = preprocess(lower_corpus_path,do_fl=False)
    exclude_words = ['descripcion','justificacion','construccion','desarrollo','comunidad','comunal','proyecto','prueblo','desarrollar','mismo','nacional','pueblo','sistema']
    exclude_words = [w.encode('utf8') for w in exclude_words]
    #vocab_idf = idf(file_words_pp,corpus_words)
    #print sorted(vocab_idf.items(),key=operator.itemgetter(1), reverse=True)
    excluded = open(corpus_path+'excluded.txt','w','utf8')
    added_files = []
    repeated_count = 0
    flag = False
    for file,words_pp in file_words_pp.items():
        # Skip duplicate documents
        for aux_words_pp in added_files:
            if words_pp == aux_words_pp:
                repeated_count += 1
                print "Repetido: " + file
                flag = True
                break
        if flag:
            flag = False
            continue

        #coef = float(len(set(words_pp)))/float(len(words_pp))
        #print coef, len(words_pp), file
        #if (coef <= 0.5) or len(words_pp) <= 150: continue
        if len(words_pp) <= 50: continue
        # Save the file
        file_pp = open(pp_corpus_path+file,'w','utf8')
        added_files.append(words_pp)
        for w in words_pp:
            #condition = vocab_idf[w]
            #if condition >= 2.0 and condition <= 6.1 and not '_' in w:
            #if condition >= 2.0 and not '_' in w:

            if w not in exclude_words and not '_' in w:

                #try:
                #    file_pp.write(w.encode('utf8') + ' ')
                #except UnicodeDecodeError:
                file_pp.write(w + ' ')
            else:
                try:
                    #excluded.write(w.encode('utf8') + ' ' + str(condition) + ' (' + file + ')\n')
                    excluded.write(w.encode('utf8') + ' (' + file + ')\n')
                except UnicodeDecodeError:
                    #excluded.write(w + ' ' + str(condition) + ' (' + file + ')\n')
                    excluded.write(w + ' (' + file + ')\n')
        file_pp.close()
    excluded.close()

    print "Documentos repetidos: ", repeated_count
    print "Palabras en el vocabulario: ", len(corpus_words)