[0ff122b] | 1 | # -*- coding: utf-8 -*- |
---|
| 2 | """ |
---|
| 3 | Sistema de Modelado de Tópicos |
---|
| 4 | |
---|
| 5 | Copyleft (@) 2014 CENDITEL nodo Mérida - https://planificacion.cenditel.gob.ve/trac/ |
---|
| 6 | """ |
---|
| 7 | ## @package django_topic_explorer.utils |
---|
| 8 | # |
---|
| 9 | # Métodos para generar los archivos del LDA resultantes de verificar el pre-procesamiento |
---|
| 10 | # @author Jorge Redondo (jredondo at cenditel.gob.ve) |
---|
| 11 | # @author <a href='http://www.cenditel.gob.ve'>Centro Nacional de Desarrollo e Investigación en Tecnologías Libres |
---|
| 12 | # (CENDITEL) nodo Mérida - Venezuela</a> |
---|
| 13 | # @copyright <a href='http://www.gnu.org/licenses/gpl-2.0.html'>GNU Public License versión 2 (GPLv2)</a> |
---|
| 14 | # @version 1.3 |
---|
| 15 | |
---|
| 16 | import glob |
---|
| 17 | import sys |
---|
| 18 | |
---|
def build_lda(path_corpus, path_output):
    """!
    Build the LDA input files ``corpus.dat`` and ``vocab.txt`` from a corpus.

    Every regular file directly under ``path_corpus`` is one document; only
    its first line is read, decoded as UTF-8 and split on single spaces.
    Byte-order marks are stripped from each token and empty tokens dropped.

    ``vocab.txt`` receives the sorted, de-duplicated vocabulary, one term per
    line.  ``corpus.dat`` receives one line per document, in file-name order:
    the document's total token count followed by ``index:count`` pairs
    (index = zero-based position of the term in ``vocab.txt``), each field
    followed by a single space.

    @author Jorge Redondo (jredondo at cenditel.gob.ve)
    @copyright GNU/GPLv2
    @param path_corpus Directory where the corpus files are located
    @param path_output Destination for the generated files: an existing
           directory (with or without a trailing separator), otherwise the
           value is used verbatim as a prefix for the two file names
    """
    # Imported locally: the module header only imports glob and sys.
    import io
    import os
    from collections import Counter

    bom = u'\ufeff'

    # Map file name -> token list; the vocabulary is derived from it below.
    documents = {}
    for path in glob.glob(path_corpus + '/*'):
        if not os.path.isfile(path):
            # The glob also matches sub-directories, which cannot be read.
            continue
        # Binary read + explicit decode behaves the same on Python 2 and 3;
        # readline() yields an empty string for an empty file instead of
        # failing on a missing first line.
        with io.open(path, 'rb') as infile:
            first_line = infile.readline().decode('utf-8')
        # Drop the line terminator so the last token is not "word\n".
        first_line = first_line.rstrip(u'\r\n')
        # Strip the BOM before filtering so a BOM-only token is discarded.
        tokens = [w.strip(bom) for w in first_line.split(u' ')]
        documents[os.path.basename(path)] = [w for w in tokens if w]

    vocab = sorted(set(w for words in documents.values() for w in words))
    # Constant-time term -> index lookup instead of list.index() per term.
    index_of = dict((term, i) for i, term in enumerate(vocab))

    if os.path.isdir(path_output):
        # join(..., '') appends the separator only when it is missing.
        out_prefix = os.path.join(path_output, '')
    else:
        out_prefix = path_output

    with io.open(out_prefix + 'corpus.dat', 'w', encoding='utf-8') as outfile:
        # Sorted names / indices make the output reproducible between runs.
        for name in sorted(documents):
            words = documents[name]
            pairs = sorted((index_of[term], count)
                           for term, count in Counter(words).items())
            # NOTE(review): the leading number is the total token count, as
            # before; LDA-C style readers expect the number of unique terms
            # here -- confirm against the consumer of corpus.dat.
            fields = [u'%d ' % len(words)]
            fields.extend(u'%d:%d ' % pair for pair in pairs)
            outfile.write(u''.join(fields) + u'\n')

    with io.open(out_prefix + 'vocab.txt', 'w', encoding='utf-8') as outfile:
        outfile.writelines(term + u'\n' for term in vocab)
---|
| 55 | |
---|
| 56 | |
---|
if __name__ == '__main__':
    # Command-line entry point: expects exactly two arguments after the
    # script name, the corpus directory and the output destination.
    if len(sys.argv) == 3:
        path_corpus = sys.argv[1]
        path_output = sys.argv[2]
        build_lda(path_corpus, path_output)
        # Single-argument print(...) parses and behaves the same on
        # Python 2 and Python 3.
        print("Se ejecutó el comando con éxito")
    else:
        print("Debe ingresar (2) argumentos a la función: path_corpus y path_output")
        # Report the usage error to the calling shell instead of exiting 0.
        sys.exit(1)
---|
| 65 | |
---|
| 66 | |
---|