[0ff122b] | 1 | from vsm.extensions.interop.ldac import import_corpus |
---|
| 2 | from vsm.extensions.corpusbuilders import dir_corpus |
---|
| 3 | from vsm.corpus import Corpus |
---|
| 4 | from vsm.model.ldacgsmulti import LdaCgsMulti |
---|
| 5 | from vsm.viewer.ldagibbsviewer import LDAGibbsViewer as LDAViewer |
---|
| 6 | |
---|
| 7 | from vsm.model.ldafunctions import * |
---|
| 8 | import math |
---|
| 9 | import numpy as np |
---|
| 10 | |
---|
# Hard-coded input locations for the lda-c output and the vsm corpus files.
# NOTE(review): user-specific absolute paths; consider promoting these to
# CLI arguments or environment variables.
path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/test50/'  # lda-c output dir; trailing slash required (paths are built by concatenation)
corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/corpus.dat'   # lda-c-format corpus
vocab_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/vsm2ldac/vocab.txt'     # one vocabulary term per line
corpus_dir = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/pp'  # raw documents for import_corpus
# Alternate dataset (the AP corpus shipped with the lda-c distribution):
#path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/lda-c-dist/output/'
#corpus_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/ap.dat'
#vocab_file = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/lda-blei/ap/vocab.txt'
---|
def likelihood(path=path):
    """Read the per-iteration likelihood values written by lda-c.

    Parameters
    ----------
    path : str
        Directory (with trailing separator) containing 'likelihood.dat'.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per line: the first tab-separated
        column of 'likelihood.dat'.
    """
    with open(path + 'likelihood.dat') as f:
        lh = f.readlines()
    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # float is the documented, behaviorally identical replacement.
    return np.array([line.strip('\n').split('\t')[0] for line in lh],
                    dtype=float)
---|
| 23 | |
---|
def beta(path=path):
    """Read lda-c 'final.beta' and map it out of log space.

    Parameters
    ----------
    path : str
        Directory (with trailing separator) containing 'final.beta'.

    Returns
    -------
    list of list of float
        One row per topic; each stored value is exponentiated, so rows
        hold probabilities rather than the logs lda-c writes.
    """
    b = []
    with open(path + 'final.beta') as f:
        for line in f:
            # final.beta stores natural-log values; exp() recovers them.
            b.append([math.exp(float(item))
                      for item in line.strip('\n').split()])
    return b
---|
| 31 | |
---|
| 32 | |
---|
def alpha(path=path):
    """Return the Dirichlet alpha estimated by lda-c.

    Reads 'final.other' in *path* and parses the second whitespace-separated
    token of its third line as a float.
    """
    with open(path + 'final.other') as f:
        lines = f.readlines()
    # Third line is expected to look like: "alpha <value>".
    alpha_line = lines[2]
    return float(alpha_line.split()[1])
---|
| 37 | |
---|
def word_assigments(path=path):
    """Parse lda-c 'word-assignments.dat'.

    Each line has the form "<count> <word>:<topic> <word>:<topic> ...".

    Parameters
    ----------
    path : str
        Directory (with trailing separator) containing 'word-assignments.dat'.

    Returns
    -------
    (z, indices)
        z       -- list with one entry per line: the topic ids (ints) taken
                   from each "word:topic" pair.
        indices -- running cumulative totals of the leading count field.

    NOTE(review): the name keeps the original 'assigments' spelling so
    existing callers are unaffected.
    """
    lengths = []
    z = []
    with open(path + 'word-assignments.dat') as f:
        for line in f:
            # Split once per line (the original split each line twice).
            fields = line.strip('\n').split()
            lengths.append(int(fields[0]))
            # Keep only the topic id from each "word:topic" pair.
            z.append([int(pair.split(':')[1]) for pair in fields[1:]])

    # Running total in O(n). The original recomputed sum(lengths[0:i+1]) for
    # every i (O(n^2)) and iterated with Python-2-only xrange; range works
    # identically under both Python 2 and 3.
    indices = []
    total = 0
    for n in lengths:
        total += n
        indices.append(total)

    return z, indices
---|
| 55 | |
---|
def corpus(file=corpus_file):
    """Parse an lda-c-format corpus file.

    Each line has the form "<count> <word>:<freq> <word>:<freq> ...".

    Parameters
    ----------
    file : str
        Path to the corpus file. (Parameter name kept for API
        compatibility even though it shadows the builtin.)

    Returns
    -------
    (c, indices)
        c       -- list with one entry per line: the word ids (ints) taken
                   from each "word:freq" pair.
        indices -- running cumulative totals of the leading count field.
    """
    lengths = []
    c = []
    with open(file) as f:
        for line in f:
            # Split once per line (the original split each line repeatedly).
            fields = line.strip('\n').split()
            lengths.append(int(fields[0]))
            # Keep only the word id from each "word:freq" pair.
            c.append([int(pair.split(':')[0]) for pair in fields[1:]])

    # Running total in O(n). The original recomputed sum(lengths[0:i+1]) for
    # every i (O(n^2)) and iterated with Python-2-only xrange; range works
    # identically under both Python 2 and 3.
    indices = []
    total = 0
    for n in lengths:
        total += n
        indices.append(total)

    return c, indices
---|
| 70 | |
---|
def vocab(file=vocab_file):
    """Return the vocabulary size: the number of lines in *file*."""
    with open(file) as f:
        return sum(1 for _ in f)
---|
| 75 | |
---|
def alpha_list(z, path=path):
    """Return the estimated alpha repeated once per document in *z*."""
    # Floats are immutable, so repeating one shared value is equivalent to
    # appending it len(z) times.
    return [alpha(path)] * len(z)
---|
| 82 | |
---|
| 83 | |
---|
def top_doc(path=path):
    """Build the topic-by-document matrix from lda-c output via compute_top_doc."""
    assignments, _ = word_assigments(path)
    n_topics = len(beta(path))
    priors = np.array(alpha_list(assignments, path))
    return compute_top_doc(assignments, n_topics, priors)
---|
| 89 | |
---|
def word_top(path=path):
    """Build the word-by-topic matrix from lda-c output via compute_word_top."""
    docs, _ = corpus()
    assignments, _ = word_assigments(path)
    b_rows = beta(path)
    vocab_size = vocab()
    # compute_word_top receives beta transposed (topics as columns).
    return compute_word_top(docs, assignments, len(b_rows), vocab_size,
                            np.transpose(b_rows))
---|
| 96 | |
---|
def log_prob(path=path):
    """Compute the corpus log-probability from the reconstructed matrices."""
    word_topic = word_top(path)
    topic_doc = top_doc(path)
    docs, _ = corpus()
    assignments, _ = word_assigments(path)
    return compute_log_prob(docs, assignments, word_topic, topic_doc)
---|
| 103 | |
---|
def corpus_model(path=path):
    """Import lda-c output into a vsm Corpus and an LdaCgsMulti model.

    Parameters
    ----------
    path : str
        Directory (with trailing separator) holding the lda-c output files.

    Returns
    -------
    (c, m)
        c -- the Corpus produced by import_corpus.
        m -- an LdaCgsMulti seeded with the flattened lda-c topic
             assignments (Z).
    """
    z, indices = word_assigments(path)
    # Flatten the per-document topic assignments into one sequence for Z.
    zeta = []
    for item in z:
        zeta.extend(item)
    b = beta(path)
    v = vocab()
    a = alpha_list(z, path)
    c = import_corpus(corpusfilename=corpus_file, vocabfilename=vocab_file,
                      path=corpus_dir, context_type='propesta')

    # One copy of the per-document alpha list per topic.
    # NOTE(review): currently dead -- the alpha= kwarg below is commented out.
    alpha = []
    for i in range(len(b)):
        alpha.append(a)
    # np.float was removed in NumPy 1.24; the builtin float is the documented
    # replacement and is behaviorally identical here.
    alpha = np.array(alpha, dtype=float).reshape(len(alpha), len(alpha[0]))

    # NOTE(review): reshape(len(b[0]), len(b)) reinterprets the flat buffer
    # rather than transposing; if a topics-as-columns matrix was intended,
    # np.transpose(b) may be what was meant. Dead while beta= stays
    # commented out -- confirm before enabling.
    b = np.array(b, dtype=float).reshape(len(b[0]), len(b))
    m = LdaCgsMulti(corpus=c,
                    context_type='propesta',
                    K=50,
                    V=v,
                    #alpha=alpha,
                    #beta=b,
                    Z=np.array(zeta))

    return c, m
---|
| 130 | |
---|
if __name__ == '__main__':
    # Parenthesized print with a single argument behaves identically under
    # Python 2 and 3; the bare print statement is a SyntaxError on Python 3.
    print("******************** MAIN **********************")
    save_path = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/demo-data/corpus_propuestas/lda2vsm_models/'
    c, m = corpus_model()
    #c.save(save_path+'corpus.npz')
    #save_lda(m,save_path+'model.npz')
---|