Conjunto de cambios 539bee2 en modelado_topicos
- Fecha y hora:
- 07/07/2015 15:50:24 (hace 9 años)
- Branches:
- master, preprocesamiento, v1.0
- Children:
- 5b95488
- Parents:
- ff807e0
- Ficheros:
-
- 5 editados
Leyenda
- No modificado
- Añadido
- Eliminado
-
django_topic_explorer/settings.py
rfb25bf8 r539bee2 95 95 ## TOPIC EXPLORER SETTINGS 96 96 TOPIC_EXPLORER_PATH = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/' 97 MODELS_PATH = TOPIC_EXPLORER_PATH + 'demo-data/programapatrio_propuestas_descripcion/../models/' 98 CORPUS_FILE = MODELS_PATH + 'programapatrio_propuestas_descripcion-nltk-en-freq5.npz' 99 MODEL_PATTERN = TOPIC_EXPLORER_PATH + 'demo-data/ap/../models/programapatrio_propuestas_descripcion-nltk-en-freq5-LDA-K{0}-document-20.npz' 97 MODELS_PATH = TOPIC_EXPLORER_PATH + 'demo-data/corpus_propuestas/models/' 98 CORPUS_FILE = MODELS_PATH + 'pp-nltk-en-freq5.npz' 99 #CORPUS_FILE = MODELS_PATH + 'ap-nltk-en-freq5.npz' 100 MODEL_PATTERN = MODELS_PATH + 'pp-nltk-en-freq5-LDA-K{0}-document-400.npz' 101 #MODEL_PATTERN = MODELS_PATH + 'ap-nltk-en-freq5-LDA-K{0}-document-20.npz' 100 102 CONTEXT_TYPE = 'document' 101 TOPICS = '10, 20, 30, 40, 50, 60, 70, 80, 90, 100' 103 TOPICS = '10, 20, 30, 40, 50, 60, 70' 104 #TOPICS = '10, 20, 30, 40, 50, 60, 70, 80, 90, 100' 105 #TOPICS = '10, 20, 30, 40, 50, 60' 102 106 CORPUS_NAME = 'Deafult' 103 107 ICONS = 'link' -
topic_explorer/templates/index.html
rfb25bf8 r539bee2 276 276 var tops; 277 277 d3.json(url, function(error, data) { 278 console.log(data); 278 279 $('#status .bar').css('width', '50%').text('Loading topics...'); 279 280 if (error) { -
topic_explorer/urls.py
rc617763 r539bee2 9 9 urlpatterns = patterns('', 10 10 url(r'^doc_topics/(?P<doc_id>\d+)/$', doc_topic_csv, name='doc_topic_csv'), 11 url(r'^docs/(?P< doc_id>)\d+/$',doc_csv , name='doc_csv'),11 url(r'^docs/(?P<k_param>\d+)/(?P<doc_id>.+)/$',doc_csv , name='doc_csv'), 12 12 url(r'^topics/(?P<k_param>\d+)/(?P<topic_no>\d+)/$', topic_json , name='topic_json'), 13 13 url(r'^docs_topics/(?P<doc_id>.+)/$', doc_topics , name='doc_topics'), -
topic_explorer/views.py
rfb25bf8 r539bee2 12 12 from vsm.viewer.ldagibbsviewer import LDAGibbsViewer as LDAViewer 13 13 from vsm.viewer.wrappers import doc_label_name 14 15 from StringIO import StringIO 16 import csv 14 17 15 18 … … 56 59 return HttpResponse(output.getvalue()) 57 60 58 def doc_csv(request, doc_id, threshold=0.2): 61 def doc_csv(request, k_param,doc_id,threshold=0.2): 62 lda_m = LCM.load(model_pattern.format(k_param)) 63 lda_v = LDAViewer(lda_c, lda_m) 59 64 data = lda_v.sim_doc_doc(doc_id) 60 65 … … 67 72 68 73 def topic_json(request,k_param,topic_no, N=40): 69 global lda_v74 #global lda_v 70 75 lda_m = LCM.load(model_pattern.format(k_param)) 71 76 lda_v = LDAViewer(lda_c, lda_m) … … 136 141 data = lda_v.topics() 137 142 for i,topic in enumerate(data): 138 js[str(i)].update({'words' : dict([(w, p) for w,p in topic[: 10]])})143 js[str(i)].update({'words' : dict([(w, p) for w,p in topic[:20]])}) 139 144 140 145 return HttpResponse(json.dumps(js)) -
utils/freeling.py
rff807e0 r539bee2 4 4 from codecs import open 5 5 from nltk.corpus import stopwords 6 import math 6 import math, operator 7 7 8 8 … … 23 23 return "ERROR: FALLà EJECUCIÃN DE FREELING" 24 24 25 return output 26 27 28 def clean_words(words_fl): 29 words_fl = [w.encode('unicode-escape') for w in words_fl] 30 return [w.replace(u'\xe0'.encode('unicode-escape'),u'a').replace(u'\xe8'.encode('unicode-escape'),u'e').replace(u'\xec'.encode('unicode-escape'),u'i').replace(u'\xf2'.encode('unicode-escape'),u'o').replace(u'\xfa'.encode('unicode-escape'),u'u').replace(u'\xe1'.encode('unicode-escape'),u'a').replace(u'\xe9'.encode('unicode-escape'),u'e').replace(u'\xed'.encode('unicode-escape'),u'i').replace(u'\xf3'.encode('unicode-escape'),u'o').replace(u'\xfa'.encode('unicode-escape'),u'u').replace(u'á',u'a').replace(u'é',u'e').replace(u'Ã',u'i').replace(u'ó',u'o').replace(u'ú',u'u').replace(u'à ',u'a').replace(u'Ú',u'e').replace(u'ì',u'i').replace(u'ò',u'o').replace(u'ù',u'u') for w in words_fl if w not in stopwords.words('spanish') and w not in '*+.,?¿!¡":;-=/$@#ââ()[]{}' and not w.isdigit() and len(w) > 3] 31 32 def is_pos(word,pos_list): 33 for item in pos_list: 34 if word.startswith(item): return True 35 return False 36 37 def complete_word(words_list,word): 38 indexes = [i for i,j in enumerate(words_list) if j == word] 39 if len(indexes) == 1: return word 40 if len(indexes) == 0: return word 41 42 #if len(indexes) == 0: raise Exception("LA PALABRA NO SE ENCUENTRA EN EL DOCUMENTO: cosa rara!") 43 index = 1 44 complete_word = word 45 i1 = indexes[0] 46 while True: 47 for i2 in indexes[1:]: 48 try: 49 if words_list[i1+index] != words_list[i2+index]: 50 return complete_word 51 except IndexError: 52 return complete_word 53 complete_word += '-' + words_list[i1+index] 54 index += 1 55 if indexes[1] == i1+index or i1+index == len(words_list): 56 return complete_word 57 58 def all_complete_words(words_list): 59 words_list = [w.decode('utf8') for w in words_list] 60 ret_val = [] 61 c 
= ''.encode('utf8') 62 for w in words_list: 63 c_aux = complete_word(words_list,w) 64 if c_aux in c: 65 continue 66 67 c = c_aux 68 ret_val += [c] 69 return list(set(ret_val)) 70 71 72 73 def select_pos(words_fl,pos_list=['V','A','N','R','D','P','C','I','S']): 25 74 output_list = [] 26 for item in output.split('\n'): 27 if item.split(' ')[0] not in stopwords.words('spanish'): 28 try: 29 output_list += [item.split(' ')[1]] 30 except IndexError: 31 pass 75 for item in words_fl.split('\n'): 76 try: 77 if item.split(' ')[0].decode('utf8') not in stopwords.words('spanish') and is_pos(item.split(' ')[2],pos_list): 78 # Selecciona el lema 79 #output_list += [item.split(' ')[1]] 80 # Selecciona la palabra original 81 output_list += [item.split(' ')[0]] 82 except IndexError: 83 pass 32 84 return output_list 33 85 34 def clean_words(words_fl): 35 return [w.decode('utf8').replace(u'á',u'a').replace(u'é',u'e').replace(u'Ã',u'i').replace(u'ó',u'o').replace(u'ú',u'u').replace(u'à ',u'a').replace(u'Ú',u'e').replace(u'ì',u'i').replace(u'ò',u'o').replace(u'ù',u'u') for w in words_fl if w.decode('utf8') not in stopwords.words('spanish') and w.decode('utf8') not in '*+.,?¿!¡":;-=/$@#ââ()[]{}'.decode('utf8') and not w.decode('utf8').isdigit() and len(w) > 3] 36 37 def preprocess(corpus_path): 86 87 def preprocess(corpus_path,do_fl=True): 38 88 freeling_cmd = shlex.split('/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/bin/analyzer -f /home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling/config/es.cfg',' ') 39 freeling_corpus_path = corpus_path + ' freeling/'89 freeling_corpus_path = corpus_path + '../freeling/' 40 90 ret_val = dict() 41 91 corpus_words = [] … … 43 93 for file in os.listdir(corpus_path): 44 94 file_string = open(corpus_path+file,'r','utf8').read() 45 # Lematización con FREELING 46 words_fl = call_freeling(freeling_cmd,file_string) 95 if do_fl: 96 # Lematización con FREELING 97 words_fl = call_freeling(freeling_cmd,file_string) 98 
fl_file = open(freeling_corpus_path+file,'w','utf8') 99 fl_file.write(words_fl.decode('utf-8')) 100 fl_file.close() 101 else: 102 words_fl = open(freeling_corpus_path+file,'r').read() 103 #################################### 104 #################################### 105 #'V', verbos 106 #'A', adjetivos 107 #'N', sustantivos 108 #'R', advervios 109 #'D', determinantes 110 #'P', pronombres 111 #'C', conjunciones 112 #'I', interjecciones 113 #'S', preposiciones 114 words_fl = select_pos(words_fl=words_fl,pos_list=['A','R','V','N']) 115 #################################### 116 #################################### 47 117 # Quitar STOPWORDS y caracteres no deseados 48 words_pp = clean_words(words_fl) 118 words_pp = all_complete_words(words_fl) 119 words_pp = clean_words(words_pp) 49 120 ret_val[file] = words_pp 50 121 corpus_words += words_pp 51 122 i += 1 52 print "Pre-procesado el archivo: " + file53 print "####################################"54 print words_pp , '(' + str(i) + ')'55 print "####################################"56 print "####################################"123 #print "Pre-procesado el archivo: " + file 124 #print "####################################" 125 #print words_pp , '(' + str(i) + ')' 126 #print "####################################" 127 #print "####################################" 57 128 58 129 return ret_val,set(corpus_words) … … 79 150 pp_corpus_path = corpus_path + 'pp/' 80 151 81 file_words_pp,corpus_words = preprocess(lower_corpus_path) 82 vocab_idf = idf(file_words_pp,corpus_words) 152 file_words_pp,corpus_words = preprocess(lower_corpus_path,do_fl=False) 153 exclude_words = ['descripcion','justificacion','construccion','desarrollo','comunidad','comunal','proyecto','prueblo','desarrollar','mismo','nacional','pueblo','sistema'] 154 exclude_words = [w.encode('utf8') for w in exclude_words] 155 #vocab_idf = idf(file_words_pp,corpus_words) 156 #print sorted(vocab_idf.items(),key=operator.itemgetter(1), reverse=True) 83 157 excluded = 
open(corpus_path+'excluded.txt','w','utf8') 158 added_files = [] 159 repeated_count = 0 160 flag = False 84 161 for file,words_pp in file_words_pp.items(): 162 # Excluir documentos repetidos 163 for aux_words_pp in added_files: 164 if words_pp == aux_words_pp: 165 repeated_count += 1 166 print "Repetido: " + file 167 flag = True 168 break 169 if flag: 170 flag = False 171 continue 172 173 #coef = float(len(set(words_pp)))/float(len(words_pp)) 174 #print coef, len(words_pp), file 175 #if (coef <= 0.5) or len(words_pp) <= 150: continue 176 if len(words_pp) <= 50: continue 85 177 # Guardar archivo 86 178 file_pp = open(pp_corpus_path+file,'w','utf8') 179 added_files.append(words_pp) 87 180 for w in words_pp: 88 condition = vocab_idf[w] 89 if condition >= 1.2 and condition <= 6.1 and not '_' in w: 90 try: 91 file_pp.write(w.encode('utf8') + ' ') 92 except UnicodeDecodeError: 93 file_pp.write(w + ' ') 181 #condition = vocab_idf[w] 182 #if condition >= 2.0 and condition <= 6.1 and not '_' in w: 183 #if condition >= 2.0 and not '_' in w: 184 185 if w not in exclude_words and not '_' in w: 186 187 #try: 188 # file_pp.write(w.encode('utf8') + ' ') 189 #except UnicodeDecodeError: 190 file_pp.write(w + ' ') 94 191 else: 95 192 try: 96 excluded.write(w.encode('utf8') + ' ' + str(condition) + ' (' + file + ')\n') 193 #excluded.write(w.encode('utf8') + ' ' + str(condition) + ' (' + file + ')\n') 194 excluded.write(w.encode('utf8') + ' (' + file + ')\n') 97 195 except UnicodeDecodeError: 98 excluded.write(w + ' ' + str(condition) + ' (' + file + ')\n') 196 #excluded.write(w + ' ' + str(condition) + ' (' + file + ')\n') 197 excluded.write(w + ' (' + file + ')\n') 99 198 file_pp.close() 100 199 excluded.close() 101 200 201 print "Documentos repetidos: ", repeated_count 102 202 print "Palabras en el vocabulario: ", len(corpus_words)
Nota: Consulte TracChangeset
para obtener ayuda sobre el uso del visor de conjuntos de cambios.