1 | from vsm.extensions.interop.ldac import import_corpus |
---|
2 | from vsm.extensions.corpusbuilders import dir_corpus |
---|
3 | from vsm.corpus import Corpus |
---|
4 | from vsm.model.ldacgsmulti import LdaCgsMulti |
---|
5 | from vsm.viewer.ldagibbsviewer import LDAGibbsViewer as LDAViewer |
---|
6 | |
---|
7 | from vsm.model.ldafunctions import * |
---|
8 | import math |
---|
9 | import numpy as np |
---|
10 | |
---|
# --- Configuration: filesystem locations for LDA-C input/output ------------
# Earlier run locations, kept for reference:
#path = '/home/rodrigo/Proyectos/Interpretacion/demo-data/data_ldac/test50'
#path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/test15/'
#corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/corpus.dat'
#vocab_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/vocab.txt'
#corpus_dir = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/pp'
#path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/output/'
#corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/ap.dat'
#vocab_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/vocab.txt'
# Directory holding the LDA-C estimation output files (likelihood.dat,
# final.beta, final.other, word-assignments.dat).  The trailing slash is
# required: the reader functions below concatenate filenames directly.
path = '/home/rodrigo/Proyectos/Interpretacion/recursos/lda-blei/lda-c-dist/pruebas10/'
# LDA-C formatted corpus file (one document per line: 'N term:count ...').
corpus_file = '/home/rodrigo/Proyectos/Interpretacion/demo-data/prueba/lda/corpus.dat'
# Vocabulary file, one term per line.
vocab_file = '/home/rodrigo/Proyectos/Interpretacion/demo-data/prueba/lda/vocab.txt'
# Directory of source documents handed to import_corpus().
corpus_dir = '/home/rodrigo/Proyectos/Interpretacion/demo-data/prueba/pp'
---|
23 | |
---|
def likelihood(path):
    """Read the per-iteration log likelihoods from '<path>likelihood.dat'.

    Each line of the file is tab-separated; only the first column (the
    likelihood value) is kept.  Returns a 1-D float ndarray.
    """
    with open(path + 'likelihood.dat') as f:
        lh = f.readlines()
    # np.float was a deprecated alias removed in NumPy 1.20;
    # the builtin float is the documented equivalent.
    return np.array([item.strip('\n').split('\t')[0] for item in lh],
                    dtype=float)
---|
28 | |
---|
def beta(path):
    """Read '<path>final.beta' (log word-topic probabilities, one topic
    per line) and return the exponentiated values as a list of rows."""
    rows = []
    with open(path + 'final.beta') as f:
        for line in f:
            row = []
            for token in line.strip('\n').split():
                row.append(math.exp(float(token)))
            rows.append(row)
    return rows
---|
36 | |
---|
37 | |
---|
def alpha(path):
    """Return the alpha hyperparameter from '<path>final.other'.

    The value is taken as the second whitespace-separated token on the
    third line of the file.
    """
    with open(path + 'final.other') as f:
        lines = f.readlines()
    third = lines[2]
    return float(third.split()[1])
---|
42 | |
---|
def word_assigments(path):
    """Parse '<path>word-assignments.dat' produced by LDA-C.

    Each line is 'N idx:topic idx:topic ...' for one document.  Returns
    (z, indices): z[i] is the list of integer topic assignments for
    document i, and indices is the cumulative sum of the per-line leading
    counts (document offsets).

    Fixes: py2-only xrange removed; the cumulative sum was O(n^2) via
    repeated slicing (sum(tmp[0:i+1]) per element) -- now a running total.
    """
    doc_lengths = []
    z = []
    with open(path + 'word-assignments.dat') as f:
        for line in f:
            tokens = line.strip('\n').split()
            doc_lengths.append(int(tokens[0]))
            # remaining tokens are 'wordindex:topic'; keep only the topic
            z.append([int(item.split(':')[1]) for item in tokens[1:]])

    indices = []
    total = 0
    for n in doc_lengths:
        total += n
        indices.append(total)

    return z, indices
---|
60 | |
---|
def corpus(file):
    """Parse an LDA-C corpus file.

    Each line is 'N term:count term:count ...' for one document.  Returns
    (c, indices): c[i] is the list of integer term indices appearing in
    document i, and indices is the cumulative sum of the per-line leading
    counts (document offsets).

    Fixes: py2-only xrange removed; the cumulative sum was O(n^2) via
    repeated slicing (sum(tmp[0:i+1]) per element) -- now a running total.
    """
    with open(file) as f:
        lines = f.readlines()

    counts = [int(line.strip('\n').split()[0]) for line in lines]
    indices = []
    total = 0
    for n in counts:
        total += n
        indices.append(total)

    # remaining tokens are 'termindex:count'; keep only the term index
    c = [[int(tok.split(':')[0]) for tok in line.strip('\n').split()[1:]]
         for line in lines]

    return c, indices
---|
75 | |
---|
def vocab(file):
    """Return the vocabulary size: the number of lines in *file*."""
    with open(file) as f:
        lines = f.readlines()
    return len(lines)
---|
80 | |
---|
def alpha_list(z,path):
    """Return a list containing one copy of alpha(path) per document in z."""
    return [alpha(path)] * len(z)
---|
87 | |
---|
88 | |
---|
def top_doc(path):
    """Build the topic-by-document matrix from the LDA-C output in *path*."""
    assignments, _offsets = word_assigments(path)
    topic_rows = beta(path)
    priors = alpha_list(assignments, path)
    return compute_top_doc(assignments, len(topic_rows), np.array(priors))
---|
94 | |
---|
def word_top(path):
    """Build the word-by-topic matrix from the LDA-C output in *path*.

    Bug fix: corpus() and vocab() were called with no arguments, which
    always raised TypeError since both require a file path.  They now
    receive the module-level corpus_file and vocab_file paths.
    """
    c, _ = corpus(corpus_file)
    z, _ = word_assigments(path)
    b = beta(path)
    v = vocab(vocab_file)
    return compute_word_top(c, z, len(b), v, np.transpose(b))
---|
101 | |
---|
def log_prob(path):
    """Compute the corpus log probability from the LDA-C output in *path*.

    Bug fix: corpus() was called with no argument, which always raised
    TypeError since it requires a file path.  It now receives the
    module-level corpus_file path.
    """
    wt = word_top(path)
    td = top_doc(path)
    c, _ = corpus(corpus_file)
    z, _ = word_assigments(path)
    return compute_log_prob(c, z, wt, td)
---|
108 | |
---|
def corpus_model(k_param,path,corpus_file,vocab_file,corpus_dir):
    """Build a vsm Corpus and an LdaCgsMulti model seeded with the topic
    assignments estimated by LDA-C.

    Parameters: k_param -- number of topics K; path -- LDA-C output dir
    (trailing slash); corpus_file/vocab_file -- LDA-C corpus and vocab
    files; corpus_dir -- directory of source documents.
    Returns (corpus, model).

    Fix: np.float (removed in NumPy 1.20) replaced with builtin float;
    local 'alpha' renamed to avoid shadowing the alpha() helper.
    """
    z, indices = word_assigments(path)
    # flatten the per-document topic assignments into one sequence
    zeta = []
    for doc in z:
        zeta.extend(doc)
    b = beta(path)
    v = vocab(vocab_file)
    a = alpha_list(z, path)
    c = import_corpus(corpusfilename=corpus_file, vocabfilename=vocab_file, path=corpus_dir, context_type='propesta')

    # one copy of the per-document alpha list per topic row
    alpha_rows = []
    for _ in range(len(b)):
        alpha_rows.append(a)
    alpha_rows = (np.array(alpha_rows, dtype=float).reshape(len(alpha_rows), len(alpha_rows[0])))

    # NOTE(review): this is a reshape, not a transpose -- element order is
    # preserved while the axes swap sizes; confirm np.transpose was not
    # intended here before re-enabling the beta= argument below.
    b = (np.array(b, dtype=float).reshape(len(b[0]), len(b)))
    m = LdaCgsMulti(corpus=c,
                    context_type='propesta',
                    K=int(k_param),
                    V=v,
                    #alpha=alpha_rows,
                    #beta=b,
                    Z=np.array(zeta))

    return c, m
---|
136 | |
---|
if __name__=='__main__':
    # Parenthesized print produces identical output under Python 2 and is
    # valid Python 3, consistent with the removal of xrange elsewhere.
    print("******************** MAIN **********************")
    # Destination directory for the saved Corpus and model (must exist).
    save_path = '/home/rodrigo/Proyectos/Interpretacion/demo-data/prueba/models/'
    c, m = corpus_model(50, path, corpus_file, vocab_file, corpus_dir)
    c.save(save_path + 'corpus.npz')
    save_lda(m, save_path + 'model.npz')
---|