1 | # coding: utf8 |
---|
2 | from subprocess import Popen, PIPE |
---|
3 | import shlex, os |
---|
4 | from codecs import open |
---|
5 | from nltk.corpus import stopwords |
---|
6 | import math |
---|
7 | |
---|
8 | |
---|
def files_to_lower(path_orig, path_dest):
    """Copy every file in path_orig to path_dest, lower-casing its text.

    path_orig / path_dest: directory paths ending in a separator (they
    are concatenated directly with each bare filename).
    """
    for name in os.listdir(path_orig):
        # `with` guarantees both handles are closed; the original leaked
        # the read handle (open(...).read()) and shadowed builtin `file`.
        with open(path_orig + name, 'r', 'utf8') as src:
            content = src.read()
        with open(path_dest + name, 'w', 'utf8') as dst:
            dst.write(content.lower())
16 | |
---|
# Exported into this process's environment so the Freeling analyzer spawned
# by call_freeling() inherits it — presumably it tells Freeling where its
# shared data (dictionaries, configs) lives; confirm against Freeling docs.
os.environ['FREELINGSHARE'] = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling'
def call_freeling(freeling_cmd, file_string):
    """Run the Freeling analyzer over *file_string* and return its lemmas.

    freeling_cmd: argv list for the analyzer subprocess.
    file_string:  unicode text, sent UTF-8-encoded on the process's stdin.
    Returns a list of lemmas (second token of each output line) whose
    surface form (first token) is not a Spanish stopword, or an error
    string when the analyzer process fails.
    """
    p = Popen(freeling_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    output, err = p.communicate(file_string.encode('utf8'))

    # BUG FIX: the original tested `err < 0`, comparing the stderr *text*
    # with an int — always False in Python 2, so the error branch was dead.
    # The process exit status is what actually signals failure.
    # (Error string reconstructed from mojibake in the source file.)
    if p.returncode != 0:
        return "ERROR: FALLÓ EJECUCIÓN DE FREELING"

    # Hoisted out of the loop: stopwords.words() re-reads the NLTK corpus
    # on every call, and set membership is O(1) versus O(n) on the list.
    spanish_stopwords = set(stopwords.words('spanish'))

    lemmas = []
    for line in output.split('\n'):
        tokens = line.split(' ')
        # tokens[0] is the surface form, tokens[1] the lemma; lines with no
        # second token (blank lines, sentence markers) are skipped — this
        # replaces the original's try/except IndexError per line.
        if tokens[0] not in spanish_stopwords and len(tokens) > 1:
            lemmas.append(tokens[1])
    return lemmas
33 | |
---|
# Accented -> plain vowel table used to normalize lemmas.
# NOTE(review): this span was mojibake-damaged in the source; reconstructed
# as the standard Spanish accent-stripping table — confirm against original.
_ACCENT_MAP = [
    (u'á', u'a'), (u'é', u'e'), (u'í', u'i'), (u'ó', u'o'), (u'ú', u'u'),
    (u'à', u'a'), (u'è', u'e'), (u'ì', u'i'), (u'ò', u'o'), (u'ù', u'u'),
]

# Tokens that occur as a substring of this string are discarded — this is
# how the original filtered the punctuation tokens Freeling emits.
# NOTE(review): two characters here were unreadable mojibake; assumed to be
# curly quotes — confirm against the original file.
_PUNCT = u'*+.,?¿!¡":;-=/$@#“”()[]{}'


def clean_words(words_fl):
    """Filter and normalize a list of UTF-8 byte-string lemmas.

    Drops Spanish stopwords, punctuation tokens, pure digits and words of
    at most 3 bytes, then strips accents from the survivors.
    Returns a list of unicode strings.
    """
    # One stopword set for the whole call instead of one NLTK corpus read
    # per word, as the original single-expression version did.
    spanish_stopwords = set(stopwords.words('spanish'))
    cleaned = []
    for w in words_fl:
        u = w.decode('utf8')
        # `u in _PUNCT` is *substring* membership: it drops one-character
        # punctuation tokens (and any word that happens to be a substring
        # of _PUNCT), mirroring the original test exactly.
        # `len(w) > 3` is measured on the encoded bytes, as before.
        if u in spanish_stopwords or u in _PUNCT or u.isdigit() or len(w) <= 3:
            continue
        for accented, plain in _ACCENT_MAP:
            u = u.replace(accented, plain)
        cleaned.append(u)
    return cleaned
36 | |
---|
37 | def preprocess(corpus_path): |
---|
38 | freeling_cmd = shlex.split('/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/bin/analyzer -f /home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling/config/es.cfg',' ') |
---|
39 | freeling_corpus_path = corpus_path + 'freeling/' |
---|
40 | ret_val = dict() |
---|
41 | corpus_words = [] |
---|
42 | i = 0 |
---|
43 | for file in os.listdir(corpus_path): |
---|
44 | file_string = open(corpus_path+file,'r','utf8').read() |
---|
45 | # Lematización con FREELING |
---|
46 | words_fl = call_freeling(freeling_cmd,file_string) |
---|
47 | # Quitar STOPWORDS y caracteres no deseados |
---|
48 | words_pp = clean_words(words_fl) |
---|
49 | ret_val[file] = words_pp |
---|
50 | corpus_words += words_pp |
---|
51 | i += 1 |
---|
52 | print "Pre-procesado el archivo: " + file |
---|
53 | print "####################################" |
---|
54 | print words_pp , '(' + str(i) + ')' |
---|
55 | print "####################################" |
---|
56 | print "####################################" |
---|
57 | |
---|
58 | return ret_val,set(corpus_words) |
---|
59 | |
---|
def idf(file_words_pp, corpus_words):
    """Smoothed inverse document frequency for every word in corpus_words.

    file_words_pp: dict mapping filename -> list of preprocessed words.
    corpus_words:  iterable of vocabulary words to score.
    Returns a dict word -> log(num_docs / (1 + doc_count)); the +1
    smoothing avoids division by zero for words present in no document.
    """
    num_docs = len(file_words_pp)
    # One set per document, built once up front: membership drops from
    # O(len(words)) per test — the original re-scanned every word *list*
    # for every vocabulary word — to O(1). Only values are needed, so the
    # keys are not iterated at all.
    doc_word_sets = [set(words) for words in file_words_pp.values()]
    scores = {}
    for w in corpus_words:
        count = sum(1 for doc in doc_word_sets if w in doc)
        scores[w] = math.log(float(num_docs) / float(1 + count))
    return scores
69 | |
---|
70 | |
---|
if __name__ == '__main__':
    # One-off normalization step, kept (disabled) for reference: lower-case
    # the raw corpus into the lower/ directory that preprocess() reads.
    """
    path_orig = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/orig/'
    path_dest = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/lower/'
    files_to_lower(path_orig,path_dest)
    """
    corpus_path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/'
    lower_corpus_path = corpus_path + 'lower/'
    pp_corpus_path = corpus_path + 'pp/'

    # Lemmatize + clean every file, then score the vocabulary by IDF.
    file_words_pp,corpus_words = preprocess(lower_corpus_path)
    vocab_idf = idf(file_words_pp,corpus_words)
    # Words filtered out below are logged here with their IDF and source file.
    excluded = open(corpus_path+'excluded.txt','w','utf8')
    for file,words_pp in file_words_pp.items():
        # Save file
        file_pp = open(pp_corpus_path+file,'w','utf8')
        for w in words_pp:
            condition = vocab_idf[w]
            # Keep only mid-IDF words (neither ubiquitous nor ultra-rare)
            # and drop multiword lemmas (joined with '_').
            # NOTE(review): 1.2 / 6.1 look like empirically tuned cut-offs
            # for this corpus — confirm before reusing elsewhere.
            if condition >= 1.2 and condition <= 6.1 and not '_' in w:
                try:
                    # The codecs writer wants unicode; if the UTF-8 bytes
                    # can't be coerced back (implicit ascii decode fails),
                    # fall back to writing the unicode string directly.
                    file_pp.write(w.encode('utf8') + ' ')
                except UnicodeDecodeError:
                    file_pp.write(w + ' ')
            else:
                try:
                    # Same encode/fallback dance as above for the log line.
                    excluded.write(w.encode('utf8') + ' ' + str(condition) + ' (' + file + ')\n')
                except UnicodeDecodeError:
                    excluded.write(w + ' ' + str(condition) + ' (' + file + ')\n')
        file_pp.close()
    excluded.close()

    print "Palabras en el vocabulario: ", len(corpus_words)