1 | from vsm.extensions.interop.ldac import import_corpus |
---|
2 | from vsm.extensions.corpusbuilders import dir_corpus |
---|
3 | from vsm.corpus import Corpus |
---|
4 | from vsm.model.ldacgsmulti import LdaCgsMulti |
---|
5 | |
---|
6 | from vsm.model.ldafunctions import * |
---|
7 | import math |
---|
8 | import numpy as np |
---|
9 | |
---|
10 | |
---|
# Directory holding the output files read by the loaders below
# (likelihood.dat, final.beta, final.other, word-assignments.dat).
# File names are appended with plain string concatenation, so the
# trailing '/' is required.
path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/output/'
#corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm_tmp/corpus.dat'
# Default corpus file parsed by corpus() and passed to import_corpus().
corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/ap.dat'
# Default vocabulary file; vocab() returns its number of lines.
vocab_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/vocab.txt'
---|
15 | |
---|
def likelihood(path=path):
    """Load the first column of ``likelihood.dat`` as a float array.

    Parameters
    ----------
    path : str
        Directory containing ``likelihood.dat``.  The file name is appended
        by plain concatenation, so ``path`` must end with a path separator.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per line of the file, taken from the
        text before the first tab character on that line.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If a first-column entry is not a valid float literal.
    """
    with open(path + 'likelihood.dat') as f:
        first_column = [line.strip('\n').split('\t')[0] for line in f]
    # ``np.float`` was only an alias of the builtin ``float``; it was
    # deprecated in NumPy 1.20 and removed in 1.24, so use the builtin.
    return np.array(first_column, dtype=float)
---|
20 | |
---|
def beta(path=path):
    """Read ``final.beta`` and return its values exponentiated.

    Each line of the file becomes one row: the line is split on whitespace
    and every token is converted with ``math.exp(float(token))``, i.e. the
    file is treated as holding natural logarithms.

    ``path`` is the directory prefix; the file name is appended by plain
    concatenation.  Returns a list of lists of floats, one inner list per
    line of the file.
    """
    with open(path + 'final.beta') as handle:
        rows = [
            [math.exp(float(token)) for token in raw.strip('\n').split()]
            for raw in handle
        ]
    return rows
---|
28 | |
---|
29 | |
---|
def alpha(path=path):
    """Return the value stored on the third line of ``final.other``.

    The third line is split on whitespace and its second token is converted
    to ``float``.  ``path`` is the directory prefix; the file name is
    appended by plain concatenation.

    Raises ``IndexError`` if the file has fewer than three lines or the
    third line has fewer than two tokens.
    """
    with open(path + 'final.other') as handle:
        rows = handle.readlines()
    fields = rows[2].split()
    return float(fields[1])
---|
34 | |
---|
def word_assigments(path=path):
    """Parse ``word-assignments.dat`` into per-document values and offsets.

    Every line is expected to look like ``<count> a:b a:b ...``.  For each
    line the integer after the ':' of every pair is collected, and the
    leading ``<count>`` values are accumulated into a running total.

    Parameters
    ----------
    path : str
        Directory containing ``word-assignments.dat``; the file name is
        appended by plain concatenation.

    Returns
    -------
    z : list of list of int
        One inner list per line, holding the integer after ':' of each pair.
    indices : list of int
        Cumulative sums of the leading count of each line, so
        ``indices[i]`` is the total of the counts of lines ``0..i``.

    Raises
    ------
    IndexError
        If a line is empty or a pair contains no ':'.
    ValueError
        If a count or the value after ':' is not an integer literal.
    """
    z = []
    indices = []
    running_total = 0
    with open(path + 'word-assignments.dat') as f:
        for line in f:
            # split() with no argument already discards the trailing newline.
            fields = line.split()
            # A running total replaces the original quadratic
            # ``sum(counts[0:i+1])`` and the Python-2-only ``xrange``.
            running_total += int(fields[0])
            indices.append(running_total)
            z.append([int(pair.split(':')[1]) for pair in fields[1:]])
    return z, indices
---|
52 | |
---|
def corpus(file=corpus_file):
    """Parse a corpus file into per-document values and offsets.

    Every line is expected to look like ``<count> a:b a:b ...``.  For each
    line the integer before the ':' of every pair is collected, and the
    leading ``<count>`` values are accumulated into a running total.

    Parameters
    ----------
    file : str
        Path of the corpus file to read.  The name shadows a Python 2
        builtin but is kept because callers may pass it by keyword.

    Returns
    -------
    c : list of list of int
        One inner list per line, holding the integer before ':' of each
        pair.
    indices : list of int
        Cumulative sums of the leading count of each line, so
        ``indices[i]`` is the total of the counts of lines ``0..i``.

    Raises
    ------
    IndexError
        If a line is empty.
    ValueError
        If a count or the value before ':' is not an integer literal.
    """
    c = []
    indices = []
    running_total = 0
    with open(file) as f:
        for line in f:
            # Split each line once; split() already drops the newline.
            fields = line.split()
            # A running total replaces the original quadratic
            # ``sum(counts[0:i+1])`` and the Python-2-only ``xrange``.
            running_total += int(fields[0])
            indices.append(running_total)
            c.append([int(pair.split(':')[0]) for pair in fields[1:]])
    return c, indices
---|
67 | |
---|
def vocab(file=vocab_file):
    """Return the number of lines in the vocabulary file ``file``."""
    with open(file) as handle:
        line_count = sum(1 for _ in handle)
    return line_count
---|
72 | |
---|
def alpha_list(z,path=path):
    """Return a list repeating ``alpha(path)`` once per element of ``z``.

    ``z`` is only used for its length; ``path`` is forwarded to ``alpha``.
    """
    value = alpha(path)
    return [value] * len(z)
---|
79 | |
---|
80 | |
---|
def top_doc(path=path):
    """Call ``compute_top_doc`` on the data found under ``path``.

    The arguments passed are the parsed word assignments, the number of
    rows of ``beta(path)``, and ``alpha_list`` converted to a NumPy array.
    ``compute_top_doc`` is not defined in this file; presumably it comes
    from the ``vsm.model.ldafunctions`` star import.
    """
    assignments, _ = word_assigments(path)
    n_rows = len(beta(path))
    priors = np.array(alpha_list(assignments, path))
    return compute_top_doc(assignments, n_rows, priors)
---|
86 | |
---|
def word_top(path=path):
    """Call ``compute_word_top`` on the default corpus and data in ``path``.

    The corpus and vocabulary size are read from their default files
    (``corpus()`` and ``vocab()`` are called without arguments); only the
    word assignments and beta values are taken from ``path``.  The beta
    rows are passed transposed.  ``compute_word_top`` is not defined in
    this file; presumably it comes from the ``vsm.model.ldafunctions``
    star import.
    """
    docs, _ = corpus()
    assignments, _ = word_assigments(path)
    beta_rows = beta(path)
    n_terms = vocab()
    return compute_word_top(
        docs, assignments, len(beta_rows), n_terms, np.transpose(beta_rows)
    )
---|
93 | |
---|
def log_prob(path=path):
    """Call ``compute_log_prob`` on the default corpus and data in ``path``.

    Passes the parsed default corpus, the word assignments from ``path``,
    and the results of ``word_top(path)`` and ``top_doc(path)``.
    ``compute_log_prob`` is not defined in this file; presumably it comes
    from the ``vsm.model.ldafunctions`` star import.
    """
    word_topic = word_top(path)
    topic_doc = top_doc(path)
    docs, _ = corpus()
    assignments, _ = word_assigments(path)
    return compute_log_prob(docs, assignments, word_topic, topic_doc)
---|
100 | |
---|
# Script entry point: load the files under the default locations and build
# an LdaCgsMulti model from them.
if __name__=='__main__':

    # Per-line values after ':' and cumulative counts from word-assignments.dat.
    z,indices = word_assigments(path)
    # Exponentiated rows of final.beta.
    b = beta(path)
    # Number of lines in the vocabulary file.
    v = vocab()
    # The alpha value repeated once per element of z.
    a = alpha_list(z,path)
    c = import_corpus(corpus_file,vocab_file)
    # NOTE(review): K is hard-coded to 20; presumably it should match
    # len(b) -- confirm against the output being loaded.
    m = LdaCgsMulti(corpus=c,K=20,V=v,alpha=a,beta=b)
---|
109 | |
---|
110 | |
---|