1 | # coding: utf8 |
---|
2 | from subprocess import Popen, PIPE |
---|
3 | import shlex, os |
---|
4 | from codecs import open |
---|
5 | from nltk.corpus import stopwords |
---|
6 | import math, operator |
---|
7 | |
---|
8 | |
---|
def files_to_lower(path_orig, path_dest):
    """Lower-case every file in *path_orig*, writing results to *path_dest*.

    Both paths must end with a path separator; destination files keep the
    original file names.  Files are read and written as UTF-8 (codecs.open).

    :param path_orig: source directory (trailing slash expected).
    :param path_dest: destination directory (trailing slash expected).
    """
    for name in os.listdir(path_orig):
        # Context managers close the handles; the original leaked both the
        # reader (open(...).read()) and relied on an explicit close for the
        # writer.
        with open(path_orig + name, 'r', 'utf8') as src:
            contents = src.read()
        with open(path_dest + name, 'w', 'utf8') as dst:
            dst.write(contents.lower())
---|
16 | |
---|
# FreeLing needs FREELINGSHARE to locate its data files; point it at the
# project-local share directory before any analyzer process is spawned.
os.environ['FREELINGSHARE'] = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling'
---|
def call_freeling(freeling_cmd, file_string):
    """Run the FreeLing analyzer over *file_string* and return its stdout.

    :param freeling_cmd: argv list for the analyzer (as built by shlex.split).
    :param file_string: unicode text to analyze; encoded to UTF-8 on the pipe.
    :returns: raw analyzer stdout (bytes), or an error message string when
        the process exits with a non-zero status.
    """
    p = Popen(freeling_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    output, err = p.communicate(file_string.encode('utf8'))

    # BUG FIX: the original tested `err < 0`, comparing the captured stderr
    # *text* against an integer -- that never detects a failure.  The exit
    # status is what signals an error.  (The message was also mojibake:
    # "FALLÃ" -> "FALLÓ".)
    if p.returncode != 0:
        return "ERROR: FALLÓ EJECUCIÓN DE FREELING"

    return output
---|
26 | |
---|
def clean_words(w):
    """Replace accented Spanish vowels in *w* with their plain equivalents.

    NOTE(review): this relies on Python 2 str/unicode coercion.  The word is
    first encoded with 'unicode-escape' (so e.g. u'\xe9' becomes the literal
    bytes '\\xe9'), then those escape sequences for the accented vowels
    (grave and acute, lowercase) are replaced.  The trailing replaces use
    mojibake literals (e.g. u'á') -- presumably to also catch words that were
    double-encoded upstream; confirm against the corpus before touching them.
    Characters with no replacement remain as escape sequences in the result.
    """
    w = w.encode('unicode-escape')
    return w.replace(u'\xe0'.encode('unicode-escape'),u'a').replace(u'\xe8'.encode('unicode-escape'),u'e').replace(u'\xec'.encode('unicode-escape'),u'i').replace(u'\xf2'.encode('unicode-escape'),u'o').replace(u'\xfa'.encode('unicode-escape'),u'u').replace(u'\xe1'.encode('unicode-escape'),u'a').replace(u'\xe9'.encode('unicode-escape'),u'e').replace(u'\xed'.encode('unicode-escape'),u'i').replace(u'\xf3'.encode('unicode-escape'),u'o').replace(u'\xfa'.encode('unicode-escape'),u'u').replace(u'á',u'a').replace(u'é',u'e').replace(u'Ã',u'i').replace(u'ó',u'o').replace(u'ú',u'u').replace(u'à ',u'a').replace(u'Ú',u'e').replace(u'ì',u'i').replace(u'ò',u'o').replace(u'ù',u'u')
---|
30 | |
---|
def is_pos(word, pos_list):
    """Return True when *word* starts with any of the prefixes in *pos_list*.

    Used to match FreeLing part-of-speech tags (e.g. tag 'VMIP3S0' matches
    the prefix 'V').
    """
    return any(word.startswith(prefix) for prefix in pos_list)
---|
35 | |
---|
def complete_word(words_list,word):
    """Expand *word* into a hyphen-joined phrase shared by its occurrences.

    If *word* appears several times in *words_list*, the words following the
    first occurrence are appended ('-'-joined) for as long as every other
    occurrence is followed by the same words.  A word that appears zero or
    one times is returned unchanged.
    """
    # Indices of every occurrence of the word.
    indexes = [i for i,j in enumerate(words_list) if j == word]
    if len(indexes) == 1: return word
    if len(indexes) == 0: return word

    index = 1
    complete_word = word
    i1 = indexes[0]
    while True:
        for i2 in indexes[1:]:
            try:
                # Stop as soon as any occurrence diverges from the first one.
                if words_list[i1+index] != words_list[i2+index]:
                    return complete_word
            except IndexError:
                # Some occurrence runs past the end of the list.
                return complete_word
        complete_word += '-' + words_list[i1+index]
        index += 1
        # Stop when the expansion reaches the second occurrence or the end
        # of the word list.
        if indexes[1] == i1+index or i1+index == len(words_list):
            return complete_word
---|
55 | |
---|
def all_complete_words(words_list):
    """Return the unique expanded phrases for every word in *words_list*.

    Each word is expanded via complete_word(); a candidate already contained
    in the previously accepted phrase is skipped so that the members of one
    run are not re-emitted individually.
    """
    words_list = [token.decode('utf8') for token in words_list]
    collected = []
    previous = ''.encode('utf8')
    for token in words_list:
        candidate = complete_word(words_list, token)
        # Skip candidates subsumed by the phrase accepted just before.
        if candidate in previous:
            continue
        previous = candidate
        collected.append(previous)
    return list(set(collected))
---|
68 | |
---|
69 | |
---|
70 | |
---|
def select_pos(path,words_fl,pos_list=('V','A','N','R','D','P','C','I','S')):
    """Filter FreeLing output down to words with an accepted POS tag.

    FreeLing emits one analysis per line: "word lemma tag ...".  Every
    accent-cleaned word is written (space-joined) to *path*; the returned
    list contains only the words that also pass the stopword/POS/length
    filters.

    :param path: destination file for the full cleaned word list.
    :param words_fl: raw FreeLing output (one analysis per line).
    :param pos_list: accepted POS-tag prefixes.  (A tuple now -- the original
        used a mutable default list, a classic Python pitfall.)
    :returns: list of cleaned words that pass all filters.
    """
    output_list = []
    all_words_list = []
    for item in words_fl.split('\n'):
        try:
            w = item.split(' ')[0]
            cleaned_word = clean_words(w.decode('utf8'))
            all_words_list += [cleaned_word]
            # Keep the word when it is not a Spanish stopword, its POS tag
            # is accepted, it is not punctuation, not a number and is longer
            # than 3 characters.  (The cleaned word is selected here; the
            # lemma would be item.split(' ')[1], the raw word [0].)
            if w.decode('utf8') not in stopwords.words('spanish') and is_pos(item.split(' ')[2],pos_list) and w not in '*+.,?¿!¡":;-=/$@#ââ()[]{}' and not w.isdigit() and len(w) > 3:
                output_list += [cleaned_word]
        except IndexError:
            # Blank or truncated FreeLing lines lack the three fields; skip.
            pass
    # Persist the full (unfiltered) cleaned word list; `with` guarantees the
    # handle is closed even on error.
    with open(path,'w','utf8') as na_file:
        na_file.write(' '.join(all_words_list))
    return output_list
---|
91 | |
---|
def preprocess(corpus_path,do_fl=True):
    """Preprocess every file of the corpus under *corpus_path*.

    Pipeline per file: (optionally) lemmatize with FreeLing and cache the raw
    output under ../freeling/, filter by part of speech via select_pos()
    (writing the accent-cleaned words under ../noaccent/), then collapse
    repeated word runs with all_complete_words().

    :param corpus_path: directory with the lower-cased corpus files
        (trailing slash expected).
    :param do_fl: when True run FreeLing; when False reuse the cached
        FreeLing output from a previous run.
    :returns: (dict mapping file name -> preprocessed word list,
               set of all words seen across the corpus).
    """
    freeling_corpus_path = corpus_path + '../freeling/'
    no_accent_path = corpus_path + '../noaccent/'
    ret_val = dict()
    corpus_words = []
    # Loop-invariant: build the analyzer command once.  NOTE: the original
    # called shlex.split(cmd, ' ') -- the second positional argument is the
    # *comments* flag, which ' ' accidentally enabled.
    freeling_cmd = shlex.split('/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/bin/analyzer -f /home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling/config/es.cfg')
    for file in os.listdir(corpus_path):
        with open(corpus_path+file,'r','utf8') as src:
            file_string = src.read()
        if do_fl:
            # Lemmatize with FreeLing and cache its raw output.
            words_fl = call_freeling(freeling_cmd,file_string)
            with open(freeling_corpus_path+file,'w','utf8') as fl_file:
                fl_file.write(words_fl.decode('utf-8'))
        else:
            with open(freeling_corpus_path+file,'r') as fl_file:
                words_fl = fl_file.read()
        ####################################
        # FreeLing POS tag prefixes:
        #   'V' verbs        'A' adjectives   'N' nouns
        #   'R' adverbs      'D' determiners  'P' pronouns
        #   'C' conjunctions 'I' interjections 'S' prepositions
        ####################################
        words_fl = select_pos(no_accent_path+file,words_fl=words_fl,pos_list=['A','R','V','N'])

        # Remove stopwords/undesired characters and merge repeated runs.
        words_pp = all_complete_words(words_fl)
        ret_val[file] = words_pp
        corpus_words += words_pp

    return ret_val,set(corpus_words)
---|
132 | |
---|
def idf(file_words_pp,corpus_words):
    """Compute inverse document frequency for every corpus word.

    :param file_words_pp: dict mapping file name -> its word list.
    :param corpus_words: iterable of words to score.
    :returns: dict mapping word -> log(num_docs / (1 + doc_frequency)).
    """
    num_docs = len(file_words_pp)
    scores = {}
    for term in corpus_words:
        # Number of documents containing the term; +1 smooths zero counts.
        doc_freq = sum(1 for words in file_words_pp.values() if term in words)
        scores[term] = math.log(float(num_docs) / float(1 + doc_freq))
    return scores
---|
142 | |
---|
143 | |
---|
if __name__ == '__main__':
    # One-off driver: preprocess the lower-cased corpus, drop duplicated or
    # too-short documents, and write the filtered word lists under pp/.
    # The disabled block below lower-cased the original corpus once.
    """
    path_orig = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/orig/'
    path_dest = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/lower/'
    files_to_lower(path_orig,path_dest)
    """
    #corpus_path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/'
    corpus_path = '/home/cenditel/Interpretacion/demo-data/corpus_propuestas/'
    lower_corpus_path = corpus_path + 'lower/'
    pp_corpus_path = corpus_path + 'pp/'

    # do_fl=False: reuse the cached FreeLing output from an earlier run.
    file_words_pp,corpus_words = preprocess(lower_corpus_path,do_fl=False)
    # Hand-picked overly common corpus words to drop from the output.
    exclude_words = ['descripcion','justificacion','construccion','desarrollo','comunidad','comunal','proyecto','prueblo','desarrollar','mismo','nacional','pueblo','sistema']
    exclude_words = [w.encode('utf8') for w in exclude_words]
    excluded = open(corpus_path+'excluded.txt','w','utf8')
    added_files = []
    repeated_count = 0
    flag = False
    for file,words_pp in file_words_pp.items():
        # Skip documents whose word list duplicates an already-added file.
        for aux_words_pp in added_files:
            if words_pp == aux_words_pp:
                repeated_count += 1
                print "Repetido: " + file
                flag = True
                break
        if flag:
            flag = False
            continue

        # Skip documents that are too short to be informative.
        if len(words_pp) <= 50: continue
        # Persist the filtered word list for this document.
        file_pp = open(pp_corpus_path+file,'w','utf8')
        added_files.append(words_pp)
        for w in words_pp:
            # Excluded words (and multi-word '_' tokens) are logged instead.
            if w not in exclude_words and not '_' in w:
                file_pp.write(w + ' ')
            else:
                try:
                    excluded.write(w.encode('utf8') + ' (' + file + ')\n')
                except UnicodeDecodeError:
                    excluded.write(w + ' (' + file + ')\n')
        file_pp.close()
    excluded.close()
---|
188 | |
---|
189 | |
---|