Navegación de contexto

source: modelado_topicos/utils/ldac2vsm.py @ 80f1533

preprocesamientov1.0

Last change on this file since 80f1533 was 80f1533, checked in by Jorge Redondo Flames <jredondo@…>, 8 years ago
Scripts de interoperabilidad con LDA-C (Implementación Blei)
Propiedad mode establecida a `100644`
File size: 3.0 KB

Línea
1	from vsm.extensions.interop.ldac import import_corpus
2	from vsm.extensions.corpusbuilders import dir_corpus
3	from vsm.corpus import Corpus
4	from vsm.model.ldacgsmulti import LdaCgsMulti
5
6	from vsm.model.ldafunctions import *
7	import math
8	import numpy as np
9
10
# Prefix of the LDA-C output files. The loader functions below append file
# names to it by plain string concatenation, so the trailing slash is required.
path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/output/'
# Alternative corpus location, currently disabled:
#corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm_tmp/corpus.dat'
# Corpus file read by corpus() and passed to import_corpus() in the script section.
corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/ap.dat'
# Vocabulary file; vocab() counts its lines.
vocab_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/vocab.txt'
15
def likelihood(path=path):
    """Load the first column of ``likelihood.dat`` as a float array.

    Every line of the file is split on tab characters and only its first
    field is kept.

    Args:
        path: Prefix prepended verbatim to ``'likelihood.dat'``; it must end
            with a path separator when it names a directory.

    Returns:
        One-dimensional ``numpy`` array of floats, one entry per line.

    Raises:
        IOError/OSError: if the file cannot be opened.
        ValueError: if a first field is not a valid float.
    """
    with open(path + 'likelihood.dat') as f:
        first_fields = [line.strip('\n').split('\t')[0] for line in f]
    # ``np.float`` was only an alias of the builtin ``float`` and has been
    # removed from recent NumPy releases; the builtin gives the same dtype.
    return np.array(first_fields, dtype=float)
20
def beta(path=path):
    """Read ``final.beta`` and exponentiate every entry.

    Each line of the file becomes one row: its whitespace-separated values
    are parsed as floats and passed through ``math.exp``.

    Args:
        path: Prefix prepended verbatim to ``'final.beta'``.

    Returns:
        List of rows, each a list of floats.
    """
    # Natural exponential is used here; a base-10 variant (10**x) was tried
    # at some point and left disabled.
    with open(path + 'final.beta') as handle:
        rows = [[math.exp(float(token)) for token in row.split()]
                for row in handle]
    return rows
28
29
def alpha(path=path):
    """Return the number stored on the third line of ``final.other``.

    The third line is split on whitespace and its second field is converted
    to ``float``.

    Args:
        path: Prefix prepended verbatim to ``'final.other'``.

    Returns:
        The parsed value as a float.
    """
    with open(path + 'final.other') as handle:
        rows = list(handle)
    third_row_fields = rows[2].split()
    return float(third_row_fields[1])
34
def word_assigments(path=path):
    """Parse ``word-assignments.dat`` into per-line assignments and offsets.

    Every non-blank line is expected to look like
    ``<count> <a>:<b> <a>:<b> ...``. For each line the integer after the
    colon of every pair is collected, and the leading counts are accumulated
    into a running total.
    NOTE(review): the value after the colon is presumably the topic assigned
    to the word -- confirm against the LDA-C output format.

    Args:
        path: Prefix prepended verbatim to ``'word-assignments.dat'``.

    Returns:
        Tuple ``(z, indices)`` where ``z[i]`` is the list of integers taken
        from line ``i`` and ``indices[i]`` is the sum of the leading counts
        of lines ``0..i``.

    Raises:
        ValueError: if a count or the part after a colon is not an integer.
        IndexError: if a pair contains no colon.
    """
    z = []
    indices = []
    running_total = 0
    with open(path + 'word-assignments.dat') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            # A running total replaces re-summing the whole prefix for each
            # line, which was quadratic and relied on Python-2-only xrange.
            running_total += int(fields[0])
            indices.append(running_total)
            z.append([int(pair.split(':')[1]) for pair in fields[1:]])
    return z, indices
52
def corpus(file=corpus_file):
    """Parse a corpus file into per-line integer lists and offsets.

    Every non-blank line is expected to look like
    ``<count> <a>:<b> <a>:<b> ...``. For each line the integer before the
    colon of every pair is collected, and the leading counts are accumulated
    into a running total.
    NOTE(review): the value before the colon is presumably a vocabulary
    index -- confirm against the LDA-C input format.

    Args:
        file: Path of the corpus file to read.

    Returns:
        Tuple ``(c, indices)`` where ``c[i]`` is the list of integers taken
        from line ``i`` and ``indices[i]`` is the sum of the leading counts
        of lines ``0..i``.

    Raises:
        ValueError: if a count or the part before a colon is not an integer.
    """
    c = []
    indices = []
    running_total = 0
    with open(file) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            # A running total replaces re-summing the whole prefix for each
            # line, which was quadratic and relied on Python-2-only xrange.
            running_total += int(fields[0])
            indices.append(running_total)
            c.append([int(pair.split(':')[0]) for pair in fields[1:]])
    return c, indices
67
def vocab(file=vocab_file):
    """Return the number of lines in the vocabulary file.

    Args:
        file: Path of the vocabulary file to read.

    Returns:
        Line count as an int.
    """
    with open(file) as handle:
        return sum(1 for _ in handle)
72
def alpha_list(z, path=path):
    """Repeat the value returned by ``alpha(path)`` once per element of ``z``.

    Args:
        z: Any sized sequence; only its length is used.
        path: Forwarded unchanged to ``alpha``.

    Returns:
        List of ``len(z)`` copies of the alpha value.
    """
    value = alpha(path)
    return [value] * len(z)
79
80
def top_doc(path=path):
    """Feed the parsed LDA-C output to ``compute_top_doc``.

    Passes the word assignments, the number of rows of ``beta(path)`` and
    the alpha list (as a ``numpy`` array) to ``compute_top_doc``, which is
    presumably provided by the star import from ``vsm.model.ldafunctions``.

    Args:
        path: Forwarded to ``word_assigments``, ``beta`` and ``alpha_list``.

    Returns:
        Whatever ``compute_top_doc`` returns.
    """
    assignments, _ = word_assigments(path)
    row_count = len(beta(path))
    priors = np.array(alpha_list(assignments, path))
    return compute_top_doc(assignments, row_count, priors)
86
def word_top(path=path):
    """Feed the parsed corpus and LDA-C output to ``compute_word_top``.

    The corpus and vocabulary are read from their default locations; only
    the word assignments and beta values depend on ``path``. The beta rows
    are handed over transposed. ``compute_word_top`` is presumably provided
    by the star import from ``vsm.model.ldafunctions``.

    Args:
        path: Forwarded to ``word_assigments`` and ``beta``.

    Returns:
        Whatever ``compute_word_top`` returns.
    """
    docs, _ = corpus()
    assignments, _ = word_assigments(path)
    beta_rows = beta(path)
    vocab_size = vocab()
    return compute_word_top(
        docs,
        assignments,
        len(beta_rows),
        vocab_size,
        np.transpose(beta_rows),
    )
93
def log_prob(path=path):
    """Feed corpus, assignments and both matrices to ``compute_log_prob``.

    Builds the results of ``word_top(path)`` and ``top_doc(path)``, reloads
    the default corpus and the word assignments, and passes all four to
    ``compute_log_prob``, which is presumably provided by the star import
    from ``vsm.model.ldafunctions``.

    Args:
        path: Forwarded to ``word_top``, ``top_doc`` and ``word_assigments``.

    Returns:
        Whatever ``compute_log_prob`` returns.
    """
    word_topic = word_top(path)
    topic_doc = top_doc(path)
    docs = corpus()[0]
    assignments = word_assigments(path)[0]
    return compute_log_prob(docs, assignments, word_topic, topic_doc)
100
# Script entry point: load the LDA-C results from the default locations and
# use them to construct an LdaCgsMulti model.
if __name__=='__main__':

    # Word assignments and cumulative offsets parsed from the LDA-C output.
    z,indices = word_assigments(path)
    # Exponentiated contents of final.beta.
    b = beta(path)
    # Number of lines in the vocabulary file.
    v = vocab()
    # One copy of the alpha value per entry of z.
    a = alpha_list(z,path)
    # Corpus object built by vsm's LDA-C interop importer.
    c = import_corpus(corpus_file,vocab_file)
    # NOTE(review): K is hard-coded to 20; presumably it has to match the
    # number of rows in final.beta (len(b)) -- confirm.
    m = LdaCgsMulti(corpus=c,K=20,V=v,alpha=a,beta=b)
109
110

Nota: Vea TracBrowser para ayuda de uso del navegador del repositorio.

Descargar en otros formatos: