Conjunto 485135c en modelado_topicos
- Fecha y hora:
- 29/01/2016 10:08:40 (hace 8 años)
- Branches:
- master, preprocesamiento, v1.0
- Children:
- 21ab8e1
- Parents:
- 1a2167d
- Ficheros:
- 4 editados
Leyenda
- No modificado
- Añadido
- Eliminado
-
django_topic_explorer/settings.py
r1a2167d r485135c 112 112 #MODELS_PATH = TOPIC_EXPLORER_PATH + 'demo-data/corpus_propuestas/lda2vsm_models/' 113 113 CORPUS_FILE = MODELS_PATH + 'pp-nltk-en-freq5.npz' 114 LDA_DATA_PATH = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/test{0}/' 115 LDA_CORPUS_FILE = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/corpus.dat' 116 LDA_VOCAB_FILE = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/vocab.txt' 117 LDA_CORPUS_DIR = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/pp' 118 114 119 #MODEL_PATTERN = MODELS_PATH + 'model.npz' 115 120 … … 120 125 #TOPICS = '10, 20, 30, 40, 50, 60, 70' 121 126 #TOPICS = '10, 20, 30, 40, 50, 60, 70, 80, 90, 100' 122 TOPICS = '1 0, 20, 30, 40, 50, 60'127 TOPICS = '15, 30, 40, 50, 60, 70, 80, 90' 123 128 CORPUS_NAME = 'Deafult' 124 129 ICONS = 'link' -
topic_explorer/urls.py
rbd6e395 r485135c 9 9 urlpatterns = patterns('', 10 10 url(r'^doc_topics/(?P<doc_id>\d+)/$', doc_topic_csv, name='doc_topic_csv'), 11 url(r'^docs/(?P<k _param>\d+)/(?P<doc_id>.+)/$',doc_csv , name='doc_csv'),12 url(r'^topics/(?P<k _param>\d+)/(?P<topic_no>\d+)/$', topic_json , name='topic_json'),11 url(r'^docs/(?P<k>\d+)/(?P<doc_id>.+)/$',doc_csv , name='doc_csv'), 12 url(r'^topics/(?P<k>\d+)/(?P<topic_no>\d+)/$', topic_json , name='topic_json'), 13 13 url(r'^docs_topics/(?P<doc_id>.+)/$', doc_topics , name='doc_topics'), 14 14 url(r'^topics.json/$', topics , name='topics'), … … 16 16 url(r'^icons/$', icons , name='icons'), 17 17 url(r'^$', index , name='index'), 18 url(r'^doc/(?P<k _param>\d+)/(?P<filename>.+)/$', visualize , name='visualize'),19 url(r'^topic/(?P<k _param>\d+)/(?P<topic_no>\d+)/$', visualize , name='visualize'),18 url(r'^doc/(?P<k>\d+)/(?P<filename>.+)/$', visualize , name='visualize'), 19 url(r'^topic/(?P<k>\d+)/(?P<topic_no>\d+)/$', visualize , name='visualize'), 20 20 url(r'^see_topic',IrTopic.as_view(),name='see_topic'), 21 21 -
topic_explorer/views.py
r1a2167d r485135c 10 10 11 11 from utils import colorlib 12 from ldac2vsm import *12 from utils.ldac2vsm import * 13 13 import itertools 14 14 from vsm.corpus import Corpus … … 27 27 from django.utils.safestring import mark_safe 28 28 from django_topic_explorer.settings import FILES_PATH 29 30 from django_topic_explorer.settings import LDA_DATA_PATH 31 from django_topic_explorer.settings import LDA_CORPUS_FILE 32 from django_topic_explorer.settings import LDA_VOCAB_FILE 33 from django_topic_explorer.settings import LDA_CORPUS_DIR 34 29 35 30 36 #path = settings.PATH … … 42 48 doc_url_format = settings.DOC_URL_FORMAT 43 49 44 #global lda_m, lda_v 50 global k_param 51 k_param = None 52 global lda_c,lda_m, lda_v 45 53 46 54 # Integración LDA-c topic_explorer 47 lda_c,lda_m = corpus_model() 55 lda_c,lda_m = corpus_model(50,LDA_DATA_PATH.format(50), 56 LDA_CORPUS_FILE, 57 LDA_VOCAB_FILE, 58 LDA_CORPUS_DIR) 48 59 #lda_c = Corpus.load(corpus_file) 49 60 #lda_c.save('/home/jredondo/tmp/corpus.npz') … … 64 75 65 76 def doc_topic_csv(request, doc_id): 66 data = lda_v.doc_topics(doc_id) 67 68 output=StringIO() 69 writer = csv.writer(output) 70 writer.writerow(['topic','prob']) 71 writer.writerows([(t, "%6f" % p) for t,p in data]) 72 73 return HttpResponse(output.getvalue()) 74 75 def doc_csv(request, k_param,doc_id,threshold=0.2): 76 #lda_m = LCM.load(model_pattern.format(k_param)) 77 global lda_v 78 try: 79 data = lda_v.doc_topics(doc_id) 80 81 output=StringIO() 82 writer = csv.writer(output) 83 writer.writerow(['topic','prob']) 84 writer.writerows([(t, "%6f" % p) for t,p in data]) 85 86 return HttpResponse(output.getvalue()) 87 except: 88 return dump_exception() 89 90 def doc_csv(request, k,doc_id,threshold=0.2): 91 global k_param, lda_c, lda_m, lda_v 92 try: 93 if k != k_param: 94 k_param = k 95 lda_c,lda_m = corpus_model(k_param,LDA_DATA_PATH.format(k_param), 96 LDA_CORPUS_FILE, 97 LDA_VOCAB_FILE, 98 LDA_CORPUS_DIR) 99 lda_v = LDAViewer(lda_c, lda_m) 100 #lda_m = 
LCM.load(model_pattern.format(k_param)) 101 #lda_v = LDAViewer(lda_c, lda_m) 102 data = lda_v.sim_doc_doc(doc_id) 103 104 output=StringIO() 105 writer = csv.writer(output) 106 writer.writerow(['doc','prob']) 107 writer.writerows([(d, "%6f" % p) for d,p in data if p > threshold]) 108 109 return HttpResponse(output.getvalue()) 110 except: 111 return dump_exception() 112 113 def topic_json(request,k,topic_no, N=40): 114 global k_param, lda_c, lda_m, lda_v 115 try: 116 if k != k_param: 117 k_param = k 118 lda_c,lda_m = corpus_model(k_param,LDA_DATA_PATH.format(k_param), 119 LDA_CORPUS_FILE, 120 LDA_VOCAB_FILE, 121 LDA_CORPUS_DIR) 122 lda_v = LDAViewer(lda_c, lda_m) 123 #global lda_v 124 #lda_m = LCM.load(model_pattern.format(k_param)) 125 #lda_v = LDAViewer(lda_c, lda_m) 126 try: 127 N = int(request.query.n) 128 except: 129 pass 130 131 if N > 0: 132 data = lda_v.dist_top_doc([int(topic_no)])[:N] 133 else: 134 data = lda_v.dist_top_doc([int(topic_no)])[N:] 135 data = reversed(data) 136 137 docs = [doc for doc,prob in data] 138 doc_topics_mat = lda_v.doc_topics(docs) 139 140 js = [] 141 for doc_prob, topics in zip(data, doc_topics_mat): 142 doc, prob = doc_prob 143 js.append({'doc' : doc, 'label': label(doc), 'prob' : 1-prob, 144 'topics' : dict([(str(t), p) for t,p in topics])}) 145 return HttpResponse(json.dumps(js)) 146 except: 147 return dump_exception() 148 149 def doc_topics(request,doc_id, N=40): 150 global lda_v 151 #lda_c,lda_m = corpus_model(k_param,LDA_DATA_PATH.format(k_param), 152 # LDA_CORPUS_FILE, 153 # LDA_VOCAB_FILE, 154 # LDA_CORPUS_DIR) 77 155 #lda_v = LDAViewer(lda_c, lda_m) 78 data = lda_v.sim_doc_doc(doc_id)79 80 output=StringIO()81 writer = csv.writer(output)82 writer.writerow(['doc','prob'])83 writer.writerows([(d, "%6f" % p) for d,p in data if p > threshold])84 85 return HttpResponse(output.getvalue())86 87 def topic_json(request,k_param,topic_no, N=40):88 #global lda_v89 #lda_m = LCM.load(model_pattern.format(k_param))90 #lda_v = 
LDAViewer(lda_c, lda_m)91 try:92 N = int(request.query.n)93 except:94 pass95 96 if N > 0:97 data = lda_v.dist_top_doc([int(topic_no)])[:N]98 else:99 data = lda_v.dist_top_doc([int(topic_no)])[N:]100 data = reversed(data)101 102 docs = [doc for doc,prob in data]103 doc_topics_mat = lda_v.doc_topics(docs)104 105 js = []106 for doc_prob, topics in zip(data, doc_topics_mat):107 doc, prob = doc_prob108 js.append({'doc' : doc, 'label': label(doc), 'prob' : 1-prob,109 'topics' : dict([(str(t), p) for t,p in topics])})110 return HttpResponse(json.dumps(js))111 112 def doc_topics(request,doc_id, N=40):113 156 try: 114 157 try: … … 135 178 136 179 def topics(request): 137 try: 138 js=populateJson() 180 global lda_v 181 try: 182 #lda_c,lda_m = corpus_model(k_param,LDA_DATA_PATH.format(k_param), 183 # LDA_CORPUS_FILE, 184 # LDA_VOCAB_FILE, 185 # LDA_CORPUS_DIR) 186 #lda_v = LDAViewer(lda_c, lda_m) 187 js=populateJson(lda_v) 139 188 return HttpResponse(json.dumps(js)) 140 189 except: 141 190 return dump_exception() 142 191 143 def populateJson( ):192 def populateJson(lda_v): 144 193 # populate entropy values 145 194 data = lda_v.topic_oscillations() … … 164 213 165 214 def docs(request): 166 try: 215 global lda_v 216 try: 217 #lda_c,lda_m = corpus_model(k_param,LDA_DATA_PATH.format(k_param), 218 # LDA_CORPUS_FILE, 219 # LDA_VOCAB_FILE, 220 # LDA_CORPUS_DIR) 221 #lda_v = LDAViewer(lda_c, lda_m) 167 222 docs = lda_v.corpus.view_metadata(context_type)[doc_label_name(context_type)] 168 223 js = list() … … 178 233 179 234 def index(request): 180 global lda_m,lda_v 181 #lda_m = LCM.load(model_pattern.format(10)) 182 #lda_v = LDAViewer(lda_c, lda_m) 183 template_name = 'topic_explorer/index.html' 184 return render(request,template_name, 185 {'filename':None, 186 #'corpus_name' : corpus_name, 187 'corpus_link' : corpus_link, 188 'context_type' : context_type, 189 'topics_range' : topics_range, 190 'doc_title_format' : doc_title_format, 191 'doc_url_format' : doc_url_format}) 192 193 
def visualize(request,k_param,filename=None,topic_no=None): 194 global lda_m,lda_v 195 #lda_m = LCM.load(model_pattern.format(k_param)) 196 #lda_v = LDAViewer(lda_c, lda_m) 197 template_name = 'topic_explorer/index.html' 198 return render(request,template_name, 199 {'filename':filename, 200 'k_param':k_param, 201 'topic_no':topic_no, 202 #'corpus_name' : corpus_name, 203 'corpus_link' : corpus_link, 204 'context_type' : context_type, 205 'topics_range' : topics_range, 206 'doc_title_format' : doc_title_format, 207 'doc_url_format' : doc_url_format}) 235 try: 236 #global lda_m,lda_v 237 #lda_m = LCM.load(model_pattern.format(10)) 238 #lda_v = LDAViewer(lda_c, lda_m) 239 template_name = 'topic_explorer/index.html' 240 return render(request,template_name, 241 {'filename':None, 242 #'corpus_name' : corpus_name, 243 'corpus_link' : corpus_link, 244 'context_type' : context_type, 245 'topics_range' : topics_range, 246 'doc_title_format' : doc_title_format, 247 'doc_url_format' : doc_url_format}) 248 except: 249 return dump_exception() 250 251 def visualize(request,k,filename=None,topic_no=None): 252 global k_param,lda_c,lda_m,lda_v 253 try: 254 if k != k_param: 255 k_param = k 256 lda_c,lda_m = corpus_model(k_param,LDA_DATA_PATH.format(k_param), 257 LDA_CORPUS_FILE, 258 LDA_VOCAB_FILE, 259 LDA_CORPUS_DIR) 260 lda_v = LDAViewer(lda_c, lda_m) 261 #lda_m = LCM.load(model_pattern.format(k_param)) 262 #lda_v = LDAViewer(lda_c, lda_m) 263 template_name = 'topic_explorer/index.html' 264 return render(request,template_name, 265 {'filename':filename, 266 'k_param':k_param, 267 'topic_no':topic_no, 268 #'corpus_name' : corpus_name, 269 'corpus_link' : corpus_link, 270 'context_type' : context_type, 271 'topics_range' : topics_range, 272 'doc_title_format' : doc_title_format, 273 'doc_url_format' : doc_url_format}) 274 except: 275 return dump_exception() 208 276 209 277 class IrTopic(TemplateView): 210 278 template_name='topic_explorer/verTopico.html' 211 279 def post(self, 
request, *args, **kwargs): 280 global lda_v 281 #global k_param 282 #lda_c,lda_m = corpus_model(k_param,LDA_DATA_PATH.format(k_param), 283 # LDA_CORPUS_FILE, 284 # LDA_VOCAB_FILE, 285 # LDA_CORPUS_DIR) 286 #lda_v = LDAViewer(lda_c, lda_m) 212 287 propuesta = request.POST['nombre_propuesta'] 213 288 #url = reverse('verTopicos') 214 289 #Obtnener json 215 Topic_Json = populateJson( )290 Topic_Json = populateJson(lda_v) 216 291 Topic_Json = json.dumps(Topic_Json) 217 292 topicos = json.loads(Topic_Json) -
utils/ldac2vsm.py
r1a2167d r485135c 9 9 import numpy as np 10 10 11 path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/test50/' 12 corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/corpus.dat' 13 vocab_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/vocab.txt' 14 corpus_dir = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/noaccent' 11 #path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/test50/' 12 #path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/test15/' 13 #corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/corpus.dat' 14 #vocab_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/vocab.txt' 15 #corpus_dir = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/pp' 15 16 #path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/output/' 16 17 #corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/ap.dat' 17 18 #vocab_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/vocab.txt' 18 19 19 def likelihood(path =path):20 def likelihood(path): 20 21 with open(path + 'likelihood.dat') as f: 21 22 lh = f.readlines() 22 23 return np.array([item.strip('\n').split('\t')[0] for item in lh],dtype=np.float) 23 24 24 def beta(path =path):25 def beta(path): 25 26 b = [] 26 27 with open(path + 'final.beta') as f: … … 31 32 32 33 33 def alpha(path =path):34 def alpha(path): 34 35 with open(path + 'final.other') as f: 35 36 a = f.readlines() 36 37 return float(a[2].split()[1]) 37 38 38 def word_assigments(path =path):39 def word_assigments(path): 39 40 indices_tmp = [] 40 41 z_tmp = [] … … 54 55 return z,indices 55 56 56 def corpus(file =corpus_file):57 def corpus(file): 57 58 with open(file) as f: 58 59 c = f.readlines() … … 69 70 return c,indices 70 71 71 def vocab(file 
=vocab_file):72 def vocab(file): 72 73 with open(file) as f: 73 74 v = f.readlines() 74 75 return len(v) 75 76 76 def alpha_list(z,path =path):77 def alpha_list(z,path): 77 78 a = alpha(path) 78 79 a_list = [] … … 82 83 83 84 84 def top_doc(path =path):85 def top_doc(path): 85 86 z,indices = word_assigments(path) 86 87 b = beta(path) … … 88 89 return compute_top_doc(z, len(b), np.array(a_list)) 89 90 90 def word_top(path =path):91 def word_top(path): 91 92 c,indices = corpus() 92 93 z,indices = word_assigments(path) … … 95 96 return compute_word_top(c, z, len(b), v, np.transpose(b)) 96 97 97 def log_prob(path =path):98 def log_prob(path): 98 99 wt = word_top(path) 99 100 td = top_doc(path) … … 102 103 return compute_log_prob(c, z, wt, td) 103 104 104 def corpus_model( path=path):105 def corpus_model(k_param,path,corpus_file,vocab_file,corpus_dir): 105 106 z,indices = word_assigments(path) 106 107 zeta = [] … … 108 109 zeta.extend(item) 109 110 b = beta(path) 110 v = vocab( )111 v = vocab(vocab_file) 111 112 a = alpha_list(z,path) 112 113 c = import_corpus(corpusfilename=corpus_file, vocabfilename=vocab_file, path=corpus_dir ,context_type='propesta') … … 120 121 m = LdaCgsMulti(corpus=c, 121 122 context_type='propesta', 122 K=50, 123 #K=50, 124 K=int(k_param), 123 125 V=v, 124 126 #alpha=alpha,
Nota: Vea TracChangeset para obtener ayuda sobre el uso del visor de conjuntos de cambios.