# coding: utf8
from subprocess import Popen, PIPE
import shlex, os
from codecs import open
from nltk.corpus import stopwords
import math, operator


def files_to_lower(path_orig,path_dest):
    files = os.listdir(path_orig)
    for file in files:
        file_string = open(path_orig+file,'r','utf8').read()
        f = open(path_dest+file,'w','utf8')
        f.write(file_string.lower())
        f.close()

os.environ['FREELINGSHARE'] = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling'
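# Assumption: FreeLing's analyzer reads FREELINGSHARE to locate its language data,
# so it must be set before call_freeling() spawns the process; the path above is the
# project-local FreeLing install.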
def call_freeling(freeling_cmd,file_string):
    p = Popen(freeling_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    output, err = p.communicate(file_string.encode('utf8'))

    # A non-zero exit code means the analyzer failed
    if p.returncode != 0:
        return "ERROR: FALLÓ EJECUCIÓN DE FREELING"

    return output
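# Usage sketch (illustrative; the real command is built in preprocess() below):
#   cmd = shlex.split('/path/to/freeling/bin/analyzer -f /path/to/es.cfg')
#   tagged = call_freeling(cmd, u'texto de prueba')
# The returned text is FreeLing's tagged output, parsed line by line in select_pos().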

def clean_words(words_fl):
    # Keep only informative tokens: drop Spanish stopwords, punctuation, digits and
    # words of 3 characters or less, then strip accents from the vowels.
    accents = {u'\xe0': u'a', u'\xe1': u'a', u'\xe8': u'e', u'\xe9': u'e',
               u'\xec': u'i', u'\xed': u'i', u'\xf2': u'o', u'\xf3': u'o',
               u'\xf9': u'u', u'\xfa': u'u'}
    words_fl = [w for w in words_fl
                if w not in stopwords.words('spanish')
                and w not in u'*+.,?¿!¡":;-=/$@#“”()[]{}'
                and not w.isdigit() and len(w) > 3]
    ret_val = []
    for w in words_fl:
        w = w.encode('unicode-escape')
        for accented, plain in accents.items():
            w = w.replace(accented.encode('unicode-escape'), plain)
        ret_val.append(w)
    return ret_val
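# Illustrative example (assuming NLTK's Spanish stopword list is installed):
#   clean_words([u'educación', u'de', u'la']) -> [u'educacion']
# ('de' and 'la' are stopwords / too short; the accented vowel is flattened to ASCII).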

def is_pos(word,pos_list):
    for item in pos_list:
        if word.startswith(item): return True
    return False

def complete_word(words_list,word):
    indexes = [i for i,j in enumerate(words_list) if j == word]
    if len(indexes) == 1: return word
    if len(indexes) == 0: return word

    #if len(indexes) == 0: raise Exception("THE WORD IS NOT IN THE DOCUMENT: strange!")
    index = 1
    complete_word = word
    i1 = indexes[0]
    while True:
        for i2 in indexes[1:]:
            try:
                if words_list[i1+index] != words_list[i2+index]:
                    return complete_word
            except IndexError:
                return complete_word
        complete_word += '-' + words_list[i1+index]
        index += 1
        if indexes[1] == i1+index or i1+index == len(words_list):
            return complete_word
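# Worked example (hypothetical token list): complete_word() extends a word with the
# tokens that follow every one of its occurrences identically, e.g.
#   complete_word(['plan','de','desarrollo','plan','de','desarrollo'], 'plan')
#   -> 'plan-de-desarrollo'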

def all_complete_words(words_list):
    words_list = [w.decode('utf8') for w in words_list]
    ret_val = []
    c = ''.encode('utf8')
    for w in words_list:
        c_aux = complete_word(words_list,w)
        if c_aux in c:
            continue

        c = c_aux
        ret_val += [c]
    return list(set(ret_val))
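# Illustrative result for the same hypothetical token list:
#   all_complete_words(['plan','de','desarrollo','plan','de','desarrollo'])
#   -> [u'plan-de-desarrollo']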


def select_pos(words_fl,pos_list=['V','A','N','R','D','P','C','I','S']):
    output_list = []
    for item in words_fl.split('\n'):
        try:
            if item.split(' ')[0].decode('utf8') not in stopwords.words('spanish') and is_pos(item.split(' ')[2],pos_list):
                # Select the lemma
                #output_list += [item.split(' ')[1]]
                # Select the original word
                output_list += [item.split(' ')[0]]
        except IndexError:
            pass
    return output_list
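# Illustrative input line (FreeLing's usual "form lemma tag prob" output; the exact
# tags depend on the configured tagset): "propuestas propuesta NCFP000 1" would add
# 'propuestas' to the result whenever 'N' is in pos_list.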


def preprocess(corpus_path,do_fl=True):
    freeling_cmd = shlex.split('/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/bin/analyzer -f /home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling/config/es.cfg')
    freeling_corpus_path = corpus_path + '../freeling/'
    ret_val = dict()
    corpus_words = []
    i = 0
    for file in os.listdir(corpus_path):
        file_string = open(corpus_path+file,'r','utf8').read()
        if do_fl:
            # Lemmatization with FreeLing
            words_fl = call_freeling(freeling_cmd,file_string)
            fl_file = open(freeling_corpus_path+file,'w','utf8')
            fl_file.write(words_fl.decode('utf-8'))
            fl_file.close()
        else:
            words_fl = open(freeling_corpus_path+file,'r').read()
        ####################################
        ####################################
        #'V', verbs
        #'A', adjectives
        #'N', nouns
        #'R', adverbs
        #'D', determiners
        #'P', pronouns
        #'C', conjunctions
        #'I', interjections
        #'S', prepositions
        words_fl = select_pos(words_fl=words_fl,pos_list=['A','R','V','N'])
        ####################################
        ####################################
        # Remove stopwords and unwanted characters
        words_pp = all_complete_words(words_fl)
        words_pp = clean_words(words_pp)
        ret_val[file] = words_pp
        corpus_words += words_pp
        i += 1
        #print "Pre-procesado el archivo: " + file
        #print "####################################"
        #print words_pp , '(' + str(i) + ')'
        #print "####################################"
        #print "####################################"

    return ret_val,set(corpus_words)
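# Return shape: a dict mapping each corpus file name to its cleaned word list, plus
# the set of all distinct words in the corpus. Usage sketch (path is illustrative):
#   docs, vocab = preprocess('/path/to/corpus/lower/', do_fl=True)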

def idf(file_words_pp,corpus_words):
    idf = {}
    num_docs = len(file_words_pp)
    for w in corpus_words:
        count = 0
        for file,words in file_words_pp.items():
            if w in words: count += 1
        idf[w] = math.log(float(num_docs)/float(1+count))
    return idf
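# Worked example: with 4 documents and a word that appears in only one of them,
# idf = log(4 / (1 + 1)) = log(2) ≈ 0.693; a word present in every document gets
# log(4 / 5) ≈ -0.223, i.e. the +1 smoothing can push very common words slightly negative.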


if __name__ == '__main__':
    """
    path_orig = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/orig/'
    path_dest = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/lower/'
    files_to_lower(path_orig,path_dest)
    """
    corpus_path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/'
    lower_corpus_path = corpus_path + 'lower/'
    pp_corpus_path = corpus_path + 'pp/'

    file_words_pp,corpus_words = preprocess(lower_corpus_path,do_fl=False)
    exclude_words = ['descripcion','justificacion','construccion','desarrollo','comunidad','comunal','proyecto','prueblo','desarrollar','mismo','nacional','pueblo','sistema']
    exclude_words = [w.encode('utf8') for w in exclude_words]
    #vocab_idf = idf(file_words_pp,corpus_words)
    #print sorted(vocab_idf.items(),key=operator.itemgetter(1), reverse=True)
    excluded = open(corpus_path+'excluded.txt','w','utf8')
    added_files = []
    repeated_count = 0
    flag = False
    for file,words_pp in file_words_pp.items():
        # Skip duplicate documents
        for aux_words_pp in added_files:
            if words_pp == aux_words_pp:
                repeated_count += 1
                print "Repetido: " + file
                flag = True
                break
        if flag:
            flag = False
            continue

        #coef = float(len(set(words_pp)))/float(len(words_pp))
        #print coef, len(words_pp), file
        #if (coef <= 0.5) or len(words_pp) <= 150: continue
        if len(words_pp) <= 50: continue
        # Save the file
        file_pp = open(pp_corpus_path+file,'w','utf8')
        added_files.append(words_pp)
        for w in words_pp:
            #condition = vocab_idf[w]
            #if condition >= 2.0 and condition <= 6.1 and not '_' in w:
            #if condition >= 2.0 and not '_' in w:

            if w not in exclude_words and not '_' in w:

                #try:
                #    file_pp.write(w.encode('utf8') + ' ')
                #except UnicodeDecodeError:
                file_pp.write(w + ' ')
            else:
                try:
                    #excluded.write(w.encode('utf8') + ' ' + str(condition) + ' (' + file + ')\n')
                    excluded.write(w.encode('utf8') + ' (' + file + ')\n')
                except UnicodeDecodeError:
                    #excluded.write(w + ' ' + str(condition) + ' (' + file + ')\n')
                    excluded.write(w + ' (' + file + ')\n')
        file_pp.close()
    excluded.close()

    print "Documentos repetidos: ", repeated_count
    print "Palabras en el vocabulario: ", len(corpus_words)