1 | from vsm.extensions.interop.ldac import import_corpus |
---|
2 | from vsm.extensions.corpusbuilders import dir_corpus |
---|
3 | from vsm.corpus import Corpus |
---|
4 | from vsm.model.ldacgsmulti import LdaCgsMulti |
---|
5 | from vsm.viewer.ldagibbsviewer import LDAGibbsViewer as LDAViewer |
---|
6 | |
---|
7 | from vsm.model.ldafunctions import * |
---|
8 | import math |
---|
9 | import numpy as np |
---|
10 | |
---|
# Output directory of an LDA-C ("lda-c-dist") run; the functions below
# read likelihood.dat, final.beta, final.other and word-assignments.dat
# from it.
path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/test50/'
# Corpus in LDA-C format: one line per document, "N w:count w:count ...".
corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/corpus.dat'
# Vocabulary file: one term per line.
vocab_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/vocab.txt'
# Directory of the raw documents used by import_corpus() to rebuild a
# vsm Corpus object.
corpus_dir = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/pp'
# Alternative inputs (Blei's "ap" demo corpus), kept for reference:
#path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/output/'
#corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/ap.dat'
#vocab_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/vocab.txt'
---|
18 | |
---|
def likelihood(path=path):
    """Read the per-iteration log-likelihoods from ``likelihood.dat``.

    Each line is ``likelihood<TAB>converged``; only the first column is
    kept.

    Parameters
    ----------
    path : str
        Directory of the LDA-C run (must end with a path separator,
        since it is joined by plain concatenation).

    Returns
    -------
    numpy.ndarray
        1-d float array of log-likelihood values, one per iteration.
    """
    with open(path + 'likelihood.dat') as f:
        lines = f.readlines()
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin float is the documented equivalent.
    return np.array([line.strip('\n').split('\t')[0] for line in lines],
                    dtype=float)
---|
23 | |
---|
def beta(path=path):
    """Read ``final.beta`` and exponentiate its log-probabilities.

    ``final.beta`` stores one topic per line, each entry being a log
    word probability; the returned structure is a list of rows of plain
    probabilities (one list of floats per topic).
    """
    with open(path + 'final.beta') as f:
        rows = [[math.exp(float(tok)) for tok in line.strip('\n').split()]
                for line in f]
    return rows
---|
31 | |
---|
32 | |
---|
def alpha(path=path):
    """Return the alpha hyperparameter stored in ``final.other``.

    The value is taken as the second whitespace-separated token of the
    file's third line.
    """
    with open(path + 'final.other') as f:
        lines = f.readlines()
    alpha_line = lines[2]
    return float(alpha_line.split()[1])
---|
37 | |
---|
def word_assigments(path=path):
    """Parse LDA-C ``word-assignments.dat``.

    Each line has the form ``N w0:k0 w1:k1 ...`` where ``N`` is the
    token count of the document and each ``w:k`` pair maps a word index
    to its assigned topic.  (The name keeps the original "assigments"
    typo for backward compatibility with existing callers.)

    Parameters
    ----------
    path : str
        Directory of the LDA-C run (must end with a path separator).

    Returns
    -------
    z : list of list of int
        Per-document topic assignments (the ``k`` of each pair).
    indices : list of int
        Cumulative token counts: ``indices[i]`` is the total number of
        tokens in documents ``0..i``.
    """
    counts = []
    z = []
    with open(path + 'word-assignments.dat') as f:
        for line in f:
            tokens = line.strip('\n').split()
            counts.append(int(tokens[0]))
            # Keep only the topic id (after the ':') of each w:k pair.
            z.append([int(pair.split(':')[1]) for pair in tokens[1:]])

    # Running total replaces the original O(n^2) sum(counts[0:i+1])
    # prefix-sum loop; it also drops the Python-2-only xrange, which is
    # a NameError under Python 3.
    indices = []
    total = 0
    for n in counts:
        total += n
        indices.append(total)

    return z, indices
---|
55 | |
---|
def corpus(file=corpus_file):
    """Parse an LDA-C corpus file.

    Each line is ``N w0:c0 w1:c1 ...``: a token count followed by
    word-index:count pairs.

    Parameters
    ----------
    file : str
        Path to the corpus file.  The parameter name shadows the
        builtin but is kept so callers using ``file=...`` still work.

    Returns
    -------
    c : list of list of int
        Per-document word indices (the ``w`` of each pair).
    indices : list of int
        Cumulative token counts: ``indices[i]`` is the total number of
        tokens in documents ``0..i``.
    """
    counts = []
    docs = []
    with open(file) as f:
        for line in f:
            tokens = line.strip('\n').split()
            counts.append(int(tokens[0]))
            # Keep only the word index (before the ':') of each w:c pair.
            docs.append([int(pair.split(':')[0]) for pair in tokens[1:]])

    # Running total replaces the original O(n^2) sum(counts[0:i+1])
    # prefix-sum loop; it also drops the Python-2-only xrange, which is
    # a NameError under Python 3.
    indices = []
    total = 0
    for n in counts:
        total += n
        indices.append(total)

    return docs, indices
---|
70 | |
---|
def vocab(file=vocab_file):
    """Return the vocabulary size: the number of lines in *file*."""
    with open(file) as f:
        n_terms = sum(1 for _ in f)
    return n_terms
---|
75 | |
---|
def alpha_list(z, path=path):
    """Return the scalar alpha repeated once per document in *z*.

    Reads alpha from ``final.other`` via :func:`alpha` and replicates it
    ``len(z)`` times.
    """
    # Floats are immutable, so list repetition is equivalent to the
    # append loop it replaces.
    return [alpha(path)] * len(z)
---|
82 | |
---|
83 | |
---|
def top_doc(path=path):
    """Build the topic-by-document matrix from the LDA-C output in *path*.

    Delegates to ``compute_top_doc`` with the parsed topic assignments,
    the number of topics (rows of beta) and the per-document alphas.
    """
    assignments, _ = word_assigments(path)
    n_topics = len(beta(path))
    alphas = np.array(alpha_list(assignments, path))
    return compute_top_doc(assignments, n_topics, alphas)
---|
89 | |
---|
def word_top(path=path):
    """Build the word-by-topic matrix from the LDA-C output in *path*.

    Delegates to ``compute_word_top`` with the parsed corpus, the topic
    assignments, the topic count, the vocabulary size, and beta
    transposed to word-by-topic orientation.
    """
    docs, _ = corpus()
    assignments, _ = word_assigments(path)
    topic_word = beta(path)
    n_terms = vocab()
    return compute_word_top(docs, assignments, len(topic_word), n_terms,
                            np.transpose(topic_word))
---|
96 | |
---|
def log_prob(path=path):
    """Compute the corpus log-probability for the LDA-C run in *path*.

    Assembles the word-topic and topic-document matrices plus the parsed
    corpus and assignments, then delegates to ``compute_log_prob``.
    """
    docs, _ = corpus()
    assignments, _ = word_assigments(path)
    wt = word_top(path)
    td = top_doc(path)
    return compute_log_prob(docs, assignments, wt, td)
---|
103 | |
---|
def corpus_model(path=path):
    """Import an LDA-C run as a vsm (Corpus, LdaCgsMulti) pair.

    Parses the word assignments, beta, vocabulary and alpha from the
    run in *path*, rebuilds the vsm Corpus from the module-level file
    paths, and constructs an LdaCgsMulti seeded with the flat topic
    assignment vector.

    Returns
    -------
    c : Corpus
        The imported vsm corpus.
    m : LdaCgsMulti
        The model initialized from the LDA-C output.
    """
    z, indices = word_assigments(path)
    # Flatten the per-document topic assignments into one sequence.
    zeta = []
    for doc in z:
        zeta.extend(doc)
    b = beta(path)
    v = vocab()
    a = alpha_list(z, path)
    c = import_corpus(corpusfilename=corpus_file, vocabfilename=vocab_file, path=corpus_dir ,context_type='propesta')

    # One copy of the per-document alpha vector per topic.  Renamed from
    # "alpha" to avoid shadowing the module-level alpha() function.
    alphas = []
    for _ in range(len(b)):
        alphas.append(a)
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin float is the documented equivalent.
    alphas = np.array(alphas, dtype=float).reshape(len(alphas), len(alphas[0]))

    # NOTE(review): reshape does not transpose, so this reinterprets the
    # topic-major data as (V, K) without reordering; if a true transpose
    # was intended use np.transpose(b).  Currently harmless because beta
    # is commented out in the LdaCgsMulti call below — confirm before
    # re-enabling.
    b = np.array(b, dtype=float).reshape(len(b[0]), len(b))
    m = LdaCgsMulti(corpus=c,
                    context_type='propesta',
                    K=50,
                    V=v,
                    #alpha=alphas,
                    #beta=b,
                    Z=np.array(zeta))

    return c, m
---|
130 | |
---|
if __name__ == '__main__':
    # print as a function call: the Python 2 print statement is a
    # SyntaxError under Python 3, while print("...") with a single
    # argument behaves identically on both.
    print("******************** MAIN **********************")
    save_path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/lda2vsm_models/'
    c, m = corpus_model()
    # Persisting the results is currently disabled.
    #c.save(save_path+'corpus.npz')
    #save_lda(m,save_path+'model.npz')
---|