Navegación de contexto

Conjunto 539bee2 en modelado_topicos

Fecha y hora:

07/07/2015 15:50:24 (hace 9 años)

Autor:

Jorge Redondo Flames <jredondo@…>

Branches:

master, preprocesamiento, v1.0

Children:

5b95488

Parents:

ff807e0

Mensaje:

Preprocesamiento con Freeling y otras opciones

Ficheros:

5 editados

django_topic_explorer/settings.py (modificado) (1 diferencia)
topic_explorer/templates/index.html (modificado) (1 diferencia)
topic_explorer/urls.py (modificado) (1 diferencia)
topic_explorer/views.py (modificado) (4 diferencias)
utils/freeling.py (modificado) (4 diferencias)

Leyenda

: No modificado
: Añadido
: Eliminado

django_topic_explorer/settings.py

-                      rfb25bf8
+                      r539bee2
 ## TOPIC EXPLORER SETTINGS
 TOPIC_EXPLORER_PATH = '/home/jredondo/Proyectos/Analisis_del_Discurso/src/topic-explorer/'
+MODELS_PATH = TOPIC_EXPLORER_PATH + 'demo-data/programapatrio_propuestas_descripcion/../models/'
+CORPUS_FILE = MODELS_PATH + 'programapatrio_propuestas_descripcion-nltk-en-freq5.npz'
+MODEL_PATTERN = TOPIC_EXPLORER_PATH + 'demo-data/ap/../models/programapatrio_propuestas_descripcion-nltk-en-freq5-LDA-K{0}-document-20.npz'
+MODELS_PATH = TOPIC_EXPLORER_PATH + 'demo-data/corpus_propuestas/models/'
+CORPUS_FILE = MODELS_PATH + 'pp-nltk-en-freq5.npz'
+#CORPUS_FILE = MODELS_PATH + 'ap-nltk-en-freq5.npz'
+MODEL_PATTERN = MODELS_PATH + 'pp-nltk-en-freq5-LDA-K{0}-document-400.npz'
+#MODEL_PATTERN = MODELS_PATH + 'ap-nltk-en-freq5-LDA-K{0}-document-20.npz'
 CONTEXT_TYPE = 'document'
+TOPICS = '10, 20, 30, 40, 50, 60, 70, 80, 90, 100'
+TOPICS = '10, 20, 30, 40, 50, 60, 70'
+#TOPICS = '10, 20, 30, 40, 50, 60, 70, 80, 90, 100'
+#TOPICS = '10, 20, 30, 40, 50, 60'
 CORPUS_NAME = 'Deafult'
 ICONS = 'link'

topic_explorer/templates/index.html

rfb25bf8	r539bee2
276	276	var tops;
277	277	d3.json(url, function(error, data) {
	278	console.log(data);
278	279	$('#status .bar').css('width', '50%').text('Loading topics...');
279	280	if (error) {

topic_explorer/urls.py

rc617763	r539bee2
9	9	urlpatterns = patterns('',
10	10	url(r'^doc_topics/(?P<doc_id>\d+)/$', doc_topic_csv, name='doc_topic_csv'),
11		url(r'^docs/(?P<~~doc_id>)\d+~~/$',doc_csv , name='doc_csv'),
	11	url(r'^docs/(?P<k_param>\d+)/(?P<doc_id>.+)/$',doc_csv , name='doc_csv'),
12	12	url(r'^topics/(?P<k_param>\d+)/(?P<topic_no>\d+)/$', topic_json , name='topic_json'),
13	13	url(r'^docs_topics/(?P<doc_id>.+)/$', doc_topics , name='doc_topics'),

topic_explorer/views.py

-                      rfb25bf8
+                      r539bee2
 from vsm.viewer.ldagibbsviewer import LDAGibbsViewer as LDAViewer
 from vsm.viewer.wrappers import doc_label_name
+from StringIO import StringIO
+import csv
 …
     return HttpResponse(output.getvalue())
+def doc_csv(request, doc_id, threshold=0.2):
+def doc_csv(request, k_param,doc_id,threshold=0.2):
+    lda_m = LCM.load(model_pattern.format(k_param))
+    lda_v = LDAViewer(lda_c, lda_m)
     data = lda_v.sim_doc_doc(doc_id)
 …
 def topic_json(request,k_param,topic_no, N=40):
     global lda_v
+    #global lda_v
     lda_m = LCM.load(model_pattern.format(k_param))
     lda_v = LDAViewer(lda_c, lda_m)
 …
         data = lda_v.topics()
         for i,topic in enumerate(data):
             js[str(i)].update({'words' : dict([(w, p) for w,p in topic[:10]])})
+            js[str(i)].update({'words' : dict([(w, p) for w,p in topic[:20]])})
         return HttpResponse(json.dumps(js))

utils/freeling.py

-                      rff807e0
+                      r539bee2
 from codecs import open
 from nltk.corpus import stopwords
 import math
+import math, operator
 …
         return "ERROR: FALLÓ EJECUCIÓN DE FREELING"
+    return output
+def clean_words(words_fl):
+    words_fl = [w.encode('unicode-escape') for w in words_fl]
+    return [w.replace(u'\xe0'.encode('unicode-escape'),u'a').replace(u'\xe8'.encode('unicode-escape'),u'e').replace(u'\xec'.encode('unicode-escape'),u'i').replace(u'\xf2'.encode('unicode-escape'),u'o').replace(u'\xfa'.encode('unicode-escape'),u'u').replace(u'\xe1'.encode('unicode-escape'),u'a').replace(u'\xe9'.encode('unicode-escape'),u'e').replace(u'\xed'.encode('unicode-escape'),u'i').replace(u'\xf3'.encode('unicode-escape'),u'o').replace(u'\xfa'.encode('unicode-escape'),u'u').replace(u'Ã¡',u'a').replace(u'Ã©',u'e').replace(u'Ã',u'i').replace(u'Ã³',u'o').replace(u'Ãº',u'u').replace(u'Ã ',u'a').replace(u'Ãš',u'e').replace(u'Ã¬',u'i').replace(u'Ã²',u'o').replace(u'Ã¹',u'u') for w in words_fl if w not in stopwords.words('spanish') and w not in '*+.,?Â¿!Â¡":;-=/$@#ââ()[]{}' and not w.isdigit() and len(w) > 3]
+def is_pos(word,pos_list):
+    for item in pos_list:
+        if word.startswith(item): return True
+    return False
+def complete_word(words_list,word):
+    indexes = [i for i,j in enumerate(words_list) if j == word]
+    if len(indexes) == 1: return word
+    if len(indexes) == 0: return word
+    #if len(indexes) == 0: raise Exception("LA PALABRA NO SE ENCUENTRA EN EL DOCUMENTO: cosa rara!")
+    index = 1
+    complete_word = word
+    i1 = indexes[0]
+    while True:
+        for i2 in indexes[1:]:
+            try:
+                if words_list[i1+index] != words_list[i2+index]:
+                    return complete_word
+            except IndexError:
+                return complete_word
+        complete_word += '-' + words_list[i1+index]
+        index += 1
+        if indexes[1] == i1+index or i1+index == len(words_list):
+            return complete_word
+def all_complete_words(words_list):
+    words_list = [w.decode('utf8') for w in words_list]
+    ret_val = []
+    c = ''.encode('utf8')
+    for w in words_list:
+        c_aux = complete_word(words_list,w)
+        if c_aux in c:
+            continue
+        c = c_aux
+        ret_val += [c]
+    return list(set(ret_val))
+def select_pos(words_fl,pos_list=['V','A','N','R','D','P','C','I','S']):
     output_list = []
+    for item in output.split('\n'):
+        if item.split(' ')[0] not in stopwords.words('spanish'):
+            try:
+                output_list += [item.split(' ')[1]]
+            except IndexError:
+                pass
+    for item in words_fl.split('\n'):
+        try:
+            if item.split(' ')[0].decode('utf8') not in stopwords.words('spanish') and is_pos(item.split(' ')[2],pos_list):
+                # Selecciona el lema
+                #output_list += [item.split(' ')[1]]
+                # Selecciona la palabra original
+                output_list += [item.split(' ')[0]]
+        except IndexError:
+            pass
     return output_list
+def clean_words(words_fl):
+    return [w.decode('utf8').replace(u'Ã¡',u'a').replace(u'Ã©',u'e').replace(u'Ã',u'i').replace(u'Ã³',u'o').replace(u'Ãº',u'u').replace(u'Ã ',u'a').replace(u'Ãš',u'e').replace(u'Ã¬',u'i').replace(u'Ã²',u'o').replace(u'Ã¹',u'u') for w in words_fl if w.decode('utf8') not in stopwords.words('spanish') and w.decode('utf8') not in '*+.,?Â¿!Â¡":;-=/$@#ââ()[]{}'.decode('utf8') and not w.decode('utf8').isdigit() and len(w) > 3]
+def preprocess(corpus_path):
+def preprocess(corpus_path,do_fl=True):
     freeling_cmd = shlex.split('/home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/bin/analyzer -f /home/jredondo/Proyectos/Analisis_del_Discurso/src/freeling/share/freeling/config/es.cfg',' ')
     freeling_corpus_path = corpus_path + 'freeling/'
+    freeling_corpus_path = corpus_path + '../freeling/'
     ret_val = dict()
     corpus_words = []
 …
     for file in os.listdir(corpus_path):
         file_string = open(corpus_path+file,'r','utf8').read()
+        # Lematización con FREELING
+        words_fl = call_freeling(freeling_cmd,file_string)
+        if do_fl:
+            # Lematización con FREELING
+            words_fl = call_freeling(freeling_cmd,file_string)
+            fl_file = open(freeling_corpus_path+file,'w','utf8')
+            fl_file.write(words_fl.decode('utf-8'))
+            fl_file.close()
+        else:
+            words_fl = open(freeling_corpus_path+file,'r').read()
+        ####################################
+        ####################################
+        #'V', verbos
+        #'A', adjetivos
+        #'N', sustantivos
+        #'R', advervios
+        #'D', determinantes
+        #'P', pronombres
+        #'C', conjunciones
+        #'I', interjecciones
+        #'S', preposiciones
+        words_fl = select_pos(words_fl=words_fl,pos_list=['A','R','V','N'])
+        ####################################
+        ####################################
         # Quitar STOPWORDS y caracteres no deseados
+        words_pp = clean_words(words_fl)
+        words_pp = all_complete_words(words_fl)
+        words_pp = clean_words(words_pp)
         ret_val[file] = words_pp
         corpus_words += words_pp
         i += 1
         print "Pre-procesado el archivo: " + file
         print "####################################"
         print words_pp , '(' + str(i) + ')'
         print "####################################"
         print "####################################"
+        #print "Pre-procesado el archivo: " + file
+        #print "####################################"
+        #print words_pp , '(' + str(i) + ')'
+        #print "####################################"
+        #print "####################################"
     return ret_val,set(corpus_words)
 …
     pp_corpus_path = corpus_path + 'pp/'
+    file_words_pp,corpus_words = preprocess(lower_corpus_path)
+    vocab_idf = idf(file_words_pp,corpus_words)
+    file_words_pp,corpus_words = preprocess(lower_corpus_path,do_fl=False)
+    exclude_words = ['descripcion','justificacion','construccion','desarrollo','comunidad','comunal','proyecto','prueblo','desarrollar','mismo','nacional','pueblo','sistema']
+    exclude_words = [w.encode('utf8') for w in exclude_words]
+    #vocab_idf = idf(file_words_pp,corpus_words)
+    #print sorted(vocab_idf.items(),key=operator.itemgetter(1), reverse=True)
     excluded = open(corpus_path+'excluded.txt','w','utf8')
+    added_files = []
+    repeated_count = 0
+    flag = False
     for file,words_pp in file_words_pp.items():
+        # Excluir documentos repetidos
+        for aux_words_pp in added_files:
+            if words_pp == aux_words_pp:
+                repeated_count += 1
+                print "Repetido: " + file
+                flag = True
+                break
+        if flag:
+            flag = False
+            continue
+        #coef = float(len(set(words_pp)))/float(len(words_pp))
+        #print coef, len(words_pp), file
+        #if (coef <= 0.5) or len(words_pp) <= 150: continue
+        if len(words_pp) <= 50: continue
         # Guardar archivo
         file_pp = open(pp_corpus_path+file,'w','utf8')
+        added_files.append(words_pp)
         for w in words_pp:
+            condition = vocab_idf[w]
+            if condition >= 1.2 and condition <= 6.1 and not '_' in w:
+                try:
+                    file_pp.write(w.encode('utf8') + ' ')
+                except UnicodeDecodeError:
+                    file_pp.write(w + ' ')
+            #condition = vocab_idf[w]
+            #if condition >= 2.0 and condition <= 6.1 and not '_' in w:
+            #if condition >= 2.0 and not '_' in w:
+            if w not in exclude_words and not '_' in w:
+                #try:
+                #    file_pp.write(w.encode('utf8') + ' ')
+                #except UnicodeDecodeError:
+                file_pp.write(w + ' ')
             else:
                 try:
+                    excluded.write(w.encode('utf8') + ' ' + str(condition) + ' (' + file + ')\n')
+                    #excluded.write(w.encode('utf8') + ' ' + str(condition) + ' (' + file + ')\n')
+                    excluded.write(w.encode('utf8') + ' (' + file + ')\n')
                 except UnicodeDecodeError:
+                    excluded.write(w + ' ' + str(condition) + ' (' + file + ')\n')
+                    #excluded.write(w + ' ' + str(condition) + ' (' + file + ')\n')
+                    excluded.write(w + ' (' + file + ')\n')
         file_pp.close()
     excluded.close()
+    print "Documentos repetidos: ", repeated_count
     print "Palabras en el vocabulario: ", len(corpus_words)

Nota: Vea TracChangeset para ayuda en el uso del visor de conjuntos de cambios.