source: consulta_publica/vsm/vsm/extensions/ldasentences.py @ 32be06f

Last change on this file since 32be06f was 0ff122b, checked in by rudmanmrrod <rudman22@…>, 7 years ago

Added the processing-profile management module; incorporated the topic-modeling visualization module

  • Property mode set to 100644
File size: 21.2 KB
import numpy as np
from vsm.corpus import Corpus
from vsm.extensions.corpusbuilders import *
from vsm.extensions.corpuscleanup import apply_stoplist_len
from vsm.extensions.htrc import vol_link_fn, add_link_
import os
import re

__all__ = ['CorpusSent', 'sim_sent_sent', 'sim_sent_sent_across',
           'file_tokenize', 'file_corpus', 'dir_tokenize', 'dir_corpus',
           'extend_sdd', 'extend_across']


class CorpusSent(Corpus):
    """
    A subclass of Corpus whose purpose is to store original
    sentence information in the Corpus.

    :See Also: :class:`Corpus`
    """
    def __init__(self, corpus, sentences, context_types=[], context_data=[],
                 remove_empty=False):

        super(CorpusSent, self).__init__(corpus, context_types=context_types,
                                         context_data=context_data,
                                         remove_empty=remove_empty)

        sentences = [re.sub('\n', ' ', s) for s in sentences]
        self.sentences = np.array(sentences)


    def __set_words_int(self):
        """
        Mapping of words to their integer representations.
        """
        self.words_int = dict((t, i) for i, t in enumerate(self.words))

    def apply_stoplist(self, stoplist=[], freq=0):
        """
        Takes a Corpus object and returns a copy of it with words in the
        stoplist removed and with words of frequency <= `freq` removed.

        :param stoplist: The list of words to be removed.
        :type stoplist: list

        :param freq: A threshold where words of frequency <= `freq` are
            removed. Default is 0.
        :type freq: integer, optional

        :returns: Copy of corpus with words in the stoplist and words of
            frequency <= `freq` removed.

        :See Also: :class:`Corpus`
        """
        if freq:
            # TODO: Use the TF model instead

            print 'Computing collection frequencies'
            cfs = np.zeros_like(self.words, dtype=self.corpus.dtype)

            for word in self.corpus:
                cfs[word] += 1

            print 'Selecting words of frequency <=', freq
            freq_stop = np.arange(cfs.size)[(cfs <= freq)]
            stop = set(freq_stop)
        else:
            stop = set()

        for t in stoplist:
            if t in self.words:
                stop.add(self.words_int[t])

        if not stop:
            print 'Stop list is empty.'
            return self

        print 'Removing stop words'
        f = np.vectorize(lambda x: x not in stop)
        corpus = self.corpus[f(self.corpus)]

        print 'Rebuilding corpus'
        corpus = [self.words[i] for i in corpus]
        context_data = []
        for i in xrange(len(self.context_data)):
            print 'Recomputing token breaks:', self.context_types[i]
            tokens = self.view_contexts(self.context_types[i])
            spans = [t[f(t)].size for t in tokens]
            tok = self.context_data[i].copy()
            tok['idx'] = np.cumsum(spans)
            context_data.append(tok)

        return CorpusSent(corpus, self.sentences, context_data=context_data,
                          context_types=self.context_types)

    @staticmethod
    def load(file):
        """
        Loads data into a Corpus object that has been stored using
        `save`.

        :param file: Designates the file to read. If `file` is a string ending
            in `.gz`, the file is first gunzipped. See `numpy.load`
            for further details.
        :type file: string-like or file-like object

        :returns: c : A Corpus object storing the data found in `file`.

        :See Also: :class:`Corpus`, :meth:`Corpus.load`, :meth:`numpy.load`
        """
        print 'Loading corpus from', file
        arrays_in = np.load(file)

        c = CorpusSent([], [])
        c.corpus = arrays_in['corpus']
        c.words = arrays_in['words']
        c.sentences = arrays_in['sentences']
        c.context_types = arrays_in['context_types'].tolist()

        c.context_data = list()
        for n in c.context_types:
            t = arrays_in['context_data_' + n]
            c.context_data.append(t)

        c.__set_words_int()

        return c

    def save(self, file):
        """
        Saves data from a CorpusSent object as an `npz` file.

        :param file: Designates the file to which to save data. See
            `numpy.savez` for further details.
        :type file: str-like or file-like object

        :returns: None

        :See Also: :class:`Corpus`, :meth:`Corpus.save`, :meth:`numpy.savez`
        """
        print 'Saving corpus as', file
        arrays_out = dict()
        arrays_out['corpus'] = self.corpus
        arrays_out['words'] = self.words
        arrays_out['sentences'] = self.sentences
        arrays_out['context_types'] = np.asarray(self.context_types)

        for i, t in enumerate(self.context_data):
            key = 'context_data_' + self.context_types[i]
            arrays_out[key] = t

        np.savez(file, **arrays_out)


    def sent_int(self, sent):
        """
        Returns the index of the sentence tokenization that contains all
        the words in `sent`, or a list of indices if more than one
        sentence matches.

        :param sent: A tokenized sentence, given as a list of word strings.
            It may be a subset of an existing sentence tokenization.
        :type sent: list of strings
        """
        tok = self.view_contexts('sentence', as_strings=True)
        sent_li = []
        for i in xrange(len(tok)):
            sent_li.append(sent)
        keys = [i for i in xrange(len(tok))
                if set(sent_li[i]).issubset(tok[i].tolist())]

        n = len(keys)
        if n == 0:
            raise Exception('No token fits {0}.'.format(sent))
        elif n > 1:
            return keys
        return keys[0]


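# Usage sketch (added for illustration; not part of the original module).
# Round-trips a CorpusSent through save/load and looks up a sentence index.
# The file name and the query words are hypothetical.
def _example_corpus_sent_roundtrip(c, query_words, path='corpus_sent.npz'):
    c = c.apply_stoplist(freq=1)          # drop words occurring at most once
    c.save(path)                          # writes an .npz archive
    c2 = CorpusSent.load(path)
    return c2.sent_int(query_words)       # index (or indices) of matching sentences

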
def sim_sent_sent(ldaviewer, sent, print_len=10):
    """
    ldaviewer : ldaviewer object
    sent : sentence index or sentence as a list of words

    Returns
    -------
    tokenized_sents : list of arrays
        List containing tokenized sentences as arrays.
    orig_sents : list of strings
        List containing original sentences as strings.
    sim_sents : numpy array
        (sentence index, probability) as (i, value) pairs.
    """
    from vsm.viewer.ldagibbsviewer import LDAGibbsViewer

    corp = ldaviewer.corpus
    ind = sent
    if isinstance(sent, list) and isinstance(sent[0], str):
        ind = corp.sent_int(sent)
    sim_sents = ldaviewer.sim_doc_doc(ind, print_len=print_len)
    lc = sim_sents['doc'][:print_len]
    lc = [s.split(', ') for s in lc]
    lc = [int(s[-1]) for s in lc]

    # only returns print_len length
    tokenized_sents, orig_sents = [], []
    for i in lc:
        tokenized_sents.append(corp.view_contexts('sentence', as_strings=True)[i])
        orig_sents.append(corp.sentences[i])

    return tokenized_sents, orig_sents, sim_sents


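# Usage sketch (added for illustration; not part of the original module).
# `viewer` is assumed to be an LDAGibbsViewer trained over a CorpusSent;
# `query` may be a sentence index or a list of word strings.
def _example_sim_sent_sent(viewer, query, print_len=10):
    toks, origs, table = sim_sent_sent(viewer, query, print_len=print_len)
    for s in origs:
        print s                           # original sentences, most similar first
    return table

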
def sim_sent_sent_across(ldavFrom, ldavTo, beagleviewer, sent, print_len=10,
                         label_fn=vol_link_fn):
    """
    ldavFrom : ldaviewer object the sentence is from.
    ldavTo : ldaviewer object in which to find similar sentences.
    beagleviewer : beagleviewer object, used to find similar words for
        words that do not exist in the other corpus.
    sent : sentence index of the corpus that corresponds to ldavFrom,
        or sentence as a list of words

    Returns
    -------
    sim_sents : numpy array
        (sentence index, probability) as (i, value) pairs.
    """
    from vsm.viewer.ldagibbsviewer import LDAGibbsViewer
    from vsm.viewer.beagleviewer import BeagleViewer

    def first_in_corp(corp, wordlist):
        """
        Goes down the list to find a word that's in `corp`.
        Assumes there is a word in `wordlist` that's in `corp`.
        """
        for w in wordlist:
            if w in corp.words:
                return w

    corp = ldavFrom.corpus # to get sent ind
    ind = sent
    word_list = []
    if isinstance(sent, list) and isinstance(sent[0], str):
        ind = corp.sent_int(sent)
        word_list = sent
    elif isinstance(sent, list):
        word_list = set()
        for i in sent:
            li = set(ldavFrom.corpus.view_contexts('sentence',
                    as_strings=True)[i])
            word_list.update(li)

    else: # if sent is an int index
        word_list = ldavFrom.corpus.view_contexts('sentence',
                    as_strings=True)[ind].tolist()

    word_list = list(word_list)
    # Before trying ldavTo.sim_word_word, make sure all words
    # in the list exist in ldavTo.corpus.
    wl = []
    for w in word_list:
        if w not in ldavTo.corpus.words:
            words = beagleviewer.sim_word_word(w)['word']
            replacement = first_in_corp(ldavTo.corpus, words)
            wl.append(replacement)
            print 'BEAGLE composite model replaced {0} by {1}'.format(w,
                                                        replacement)
        else:
            wl.append(w)

    # from ldavFrom:sent -> ldavTo:topics -> ldavTo:sent(doc)
    tops = ldavTo.sim_word_top(wl).first_cols[:(ldavTo.model.K/6)]
    tops = [int(t) for t in tops]
    print "Related topics: ", tops
    # sim_sents = ldavTo.sim_top_doc(tops, print_len=print_len,
    #                                as_strings=False)
    # lc = sim_sents['i'][:print_len]
    # tokenized_sents, orig_sents = [], []
    # for i in lc:
    #     tokenized_sents.append(ldavTo.corpus.view_contexts('sentence', as_strings=True)[i])
    #     orig_sents.append(ldavTo.corpus.sentences[i])
    sim_sents = ldavTo.sim_top_doc(tops, print_len=print_len,
                                   label_fn=label_fn)
    return sim_sents


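# Usage sketch (added for illustration; not part of the original module).
# Finds sentences in ldavTo.corpus that are similar to a sentence taken from
# ldavFrom.corpus; the BEAGLE viewer supplies substitutes for out-of-vocabulary
# words. All three viewer objects are assumed to exist already.
def _example_sim_across(ldavFrom, ldavTo, beagle_v, sent_index, print_len=10):
    return sim_sent_sent_across(ldavFrom, ldavTo, beagle_v, sent_index,
                                print_len=print_len, label_fn=vol_link_fn)

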
def extend_sdd(args, v, print_len=10):
    """
    Extend table resulting from sim_doc_doc with
    label_fn=vol_link_fn. Adds an ArgumentMap column.
    """
    from vsm.viewer.ldagibbsviewer import LDAGibbsViewer

    sdd = v.sim_doc_doc(args, label_fn=vol_link_fn, print_len=print_len)
    table_str = sdd._repr_html_()
    rows = table_str.split('</tr>')

    rows[0] = re.sub("2", "3", rows[0]) + '</tr>'
    rows[1] += '<th style="text-align: center; background: #EFF2FB;">Argument\
                Map</th></tr>'

    for i in xrange(2, len(rows)-1):
        a = rows[i].split('</a>, ')
        arg = a[1].split(',')[0]

        arg_map = find_arg(arg)
        rows[i] += '<td>{0}</td></tr>'.format(arg_map)

    return ''.join(rows)


def extend_across(vFrom, vTo, beagle_v, args, txtFrom, txtTo, print_len=10):
    """
    Extend table resulting from sim_sent_sent_across with
    label_fn=vol_link_fn. Adds ArgumentMap and Novelty columns.
    """
    from vsm.extensions.htrc import add_link_

    across = sim_sent_sent_across(vFrom, vTo, beagle_v, args, print_len=print_len)
    table_str = across._repr_html_()
    rows = table_str.split('</tr>')

    rows[0] = re.sub("2", "4", rows[0]) + '</tr>'
    rows[1] += '<th style="text-align: center; background: #EFF2FB;">\
                Argument Map</th><th style="text-align: center; background: \
                #EFF2FB;">Novelty</th></tr>'

    for i in xrange(2, len(rows)-1):
        a = rows[i].split('</a>, ')
        arg = a[1].split(',')[0]

        novelty = in_ed1(arg, txtTo, txtFrom)
        arg_map = find_arg(novelty)

        # add link to novelty when it's found in the corpusFrom.
        if not novelty == 'new':
            li = novelty.split(' ')
            idx = int(li[0])
            md = vFrom.corpus.view_metadata('sentence')[idx]
            link = add_link_(md['page_urls'], md['sentence_label'])
            li[0] = link
            novelty = ' '.join(li)

        rows[i] += '<td>{0}</td><td>{1}</td></tr>'.format(arg_map, novelty)

    return ''.join(rows)


def in_ed1(idx, difftxt, ed1txt):
    """
    Only for sim_sent_sent_across.
    Looks up `idx` in `difftxt`; if an equal ('=') match is found, returns
    the corresponding entry from `ed1txt`, otherwise returns that entry
    together with the match probability. Returns 'new' if `idx` does not
    appear in `difftxt`.
    """
    path = '/var/inphosemantics/data/20131214/Washburn/vsm-data/'

    with open(path + ed1txt, 'r') as f1:
        ed1 = f1.read()
        ed1 = ed1.split(',')

        with open(path + difftxt, 'r') as f:
            txt = f.read()
            entries = txt.split(',')

            for i in xrange(len(entries)):
                if entries[i].startswith(str(idx) + ' '):
                    if '=' in entries[i]:
                        return ed1[i]
                    else:
                        prob = entries[i].split(' ')[1]
                        return ed1[i] + ' ' + prob
            # didn't find idx in the table.
            return 'new'

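# Note (added): the exact contents of `difftxt` and `ed1txt` are not part of
# this module, so the following is an assumption inferred from the parsing in
# in_ed1 above. `difftxt` appears to be a comma-separated list of entries of
# the form "<idx> =" (exact match) or "<idx> <prob>", and `ed1txt` a
# comma-separated list of first-edition indices aligned position by position,
# e.g.
#
#     difftxt: "12 =,45 0.87,103 0.63"
#     ed1txt:  "7,31,98"
#
# in_ed1(12, difftxt, ed1txt) would then return '7', and
# in_ed1(45, difftxt, ed1txt) would return '31 0.87'.
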
def find_arg(i):
    """
    Find the arg (e.g. '422') if i is one of the analyzed args,
    otherwise return ''.
    """
    import json

    path = '/var/inphosemantics/data/20131214/Washburn/vsm-data/'

    if i == 'new' or '(' in i:
        return ''

    i = int(i)
    with open(path + 'arg_indices.json', 'r') as jsonf:
        indices = json.load(jsonf)

        for k in indices:
            if i in indices[k]:
                return str(k)
        return ''


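# Note (added): 'arg_indices.json' is not included in this module, so this is
# an assumption based on how find_arg reads it. The file is expected to map
# argument labels to lists of sentence indices, e.g.
#
#     {"422": [101, 102, 103], "423": [210, 211]}
#
# find_arg('102') would then return '422'.
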
def file_tokenize(text):
    """
    `file_tokenize` is a helper function for :meth:`file_corpus`.

    Takes a string that is content in a file and returns words
    and corpus data.

    :param text: Content in a plain text file.
    :type text: string

    :returns: words : List of words.
        Words in the `text` tokenized by :meth:`vsm.corpus.util.word_tokenize`.
        corpus_data : Dictionary with context types as keys and
        corresponding tokenizations as values. The tokenizations
        are np.arrays.
        sent_orig : List of the original sentence strings.
    """
    words, par_tokens, sent_tokens, sent_orig = [], [], [], []
    sent_break, par_n, sent_n = 0, 0, 0

    pars = paragraph_tokenize(text)

    for par in pars:
        sents = sentence_tokenize(par)

        for sent in sents:
            w = word_tokenize(sent)
            words.extend(w)
            sent_break += len(w)
            sent_tokens.append((sent_break, par_n, sent_n))
            sent_orig.append(sent)
            sent_n += 1

        par_tokens.append((sent_break, par_n))
        par_n += 1

    idx_dt = ('idx', np.int32)
    sent_label_dt = ('sentence_label', np.array(sent_n, np.str_).dtype)
    par_label_dt = ('paragraph_label', np.array(par_n, np.str_).dtype)

    corpus_data = dict()
    dtype = [idx_dt, par_label_dt]
    corpus_data['paragraph'] = np.array(par_tokens, dtype=dtype)
    dtype = [idx_dt, par_label_dt, sent_label_dt]
    corpus_data['sentence'] = np.array(sent_tokens, dtype=dtype)

    return words, corpus_data, sent_orig


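# Usage sketch (added for illustration; not part of the original module).
# The file name is hypothetical.
#
#     text = open('mill.txt').read()
#     words, corpus_data, sent_orig = file_tokenize(text)
#     # words       : flat list of word tokens
#     # corpus_data : dict with 'paragraph' and 'sentence' record arrays
#     # sent_orig   : original sentence strings, parallel to the 'sentence' rows
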
def file_corpus(filename, nltk_stop=True, stop_freq=1, add_stop=None):
    """
    `file_corpus` is a convenience function for generating Corpus
    objects from a plain text corpus contained in a single file.
    `file_corpus` will strip punctuation and arabic numerals outside
    the range 1-29. All letters are made lowercase.

    :param filename: File name of the plain text file.
    :type filename: string-like

    :param nltk_stop: If `True` then the corpus object is masked
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :returns: c : a Corpus object
        Contains the tokenized corpus built from the input plain-text
        file. Context types are `paragraph` and `sentence`.

    :See Also: :class:`vsm.corpus.Corpus`,
        :meth:`file_tokenize`,
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    with open(filename, mode='r') as f:
        text = f.read()

    words, tok, sent = file_tokenize(text)
    names, data = zip(*tok.items())

    c = CorpusSent(words, sent, context_data=data, context_types=names,
                   remove_empty=False)
    c = apply_stoplist(c, nltk_stop=nltk_stop,
                       freq=stop_freq, add_stop=add_stop)

    return c


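# Usage sketch (added for illustration; not part of the original module).
# Builds a sentence-aware corpus from a single plain-text file; the path is
# hypothetical.
#
#     c = file_corpus('mill.txt', nltk_stop=True, stop_freq=1)
#     sents = c.view_contexts('sentence', as_strings=True)
#     print c.sentences[0]              # original wording of the first sentence
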
def dir_tokenize(chunks, labels, chunk_name='article', paragraphs=True):
    """
    `dir_tokenize` is a helper function for :meth:`dir_corpus`.

    Takes a list of strings (one per file) and their labels and returns
    words, corpus data and the original sentence strings.
    """
    words, chk_tokens, sent_tokens, sent_orig = [], [], [], []
    sent_break, chk_n, sent_n = 0, 0, 0

    if paragraphs:
        par_tokens = []
        par_n = 0

        for chk, label in zip(chunks, labels):
            print 'Tokenizing', label
            pars = paragraph_tokenize(chk)

            for par in pars:
                sents = sentence_tokenize(par)

                for sent in sents:
                    w = word_tokenize(sent)
                    words.extend(w)
                    sent_break += len(w)
                    sent_tokens.append((sent_break, label, par_n, sent_n))
                    sent_orig.append(sent)
                    sent_n += 1

                par_tokens.append((sent_break, label, par_n))
                par_n += 1

            chk_tokens.append((sent_break, label))
            chk_n += 1
    else:
        for chk, label in zip(chunks, labels):
            print 'Tokenizing', label
            sents = sentence_tokenize(chk)

            for sent in sents:
                w = word_tokenize(sent)
                words.extend(w)
                sent_break += len(w)
                sent_tokens.append((sent_break, label, sent_n))
                sent_orig.append(sent)
                sent_n += 1

            chk_tokens.append((sent_break, label))
            chk_n += 1

    idx_dt = ('idx', np.int32)
    label_dt = (chunk_name + '_label', np.array(labels).dtype)
    sent_label_dt = ('sentence_label', np.array(sent_n, np.str_).dtype)
    corpus_data = dict()
    dtype = [idx_dt, label_dt]
    corpus_data[chunk_name] = np.array(chk_tokens, dtype=dtype)

    if paragraphs:
        par_label_dt = ('paragraph_label', np.array(par_n, np.str_).dtype)
        dtype = [idx_dt, label_dt, par_label_dt]
        corpus_data['paragraph'] = np.array(par_tokens, dtype=dtype)
        dtype = [idx_dt, label_dt, par_label_dt, sent_label_dt]
        corpus_data['sentence'] = np.array(sent_tokens, dtype=dtype)
    else:
        dtype = [idx_dt, label_dt, sent_label_dt]
        corpus_data['sentence'] = np.array(sent_tokens, dtype=dtype)

    return words, corpus_data, sent_orig


def dir_corpus(plain_dir, chunk_name='article', paragraphs=True, word_len=2,
               nltk_stop=True, stop_freq=1, add_stop=None, corpus_sent=True,
               ignore=['.log', '.pickle', '.xml']):
    """
    `dir_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    `dir_corpus` will retain file-level tokenization and perform
    sentence and word tokenizations. Optionally, it will provide
    paragraph-level tokenizations.

    It will also strip punctuation and arabic numerals outside the
    range 1-29. All letters are made lowercase.

    :param plain_dir: String containing the directory containing a
        plain-text corpus.
    :type plain_dir: string-like

    :param chunk_name: The name of the tokenization corresponding
        to individual files. For example, if the files are pages
        of a book, one might set `chunk_name` to `pages`. Default
        is `article`.
    :type chunk_name: string-like, optional

    :param paragraphs: If `True`, a paragraph-level tokenization
        is included. Defaults to `True`.
    :type paragraphs: boolean, optional

    :param word_len: Filters words whose lengths are <= word_len.
        Default is 2.
    :type word_len: int, optional

    :param nltk_stop: If `True` then the corpus object is masked
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional

    :param corpus_sent: If `True` a CorpusSent object is returned.
        Otherwise a Corpus object is returned. Default is `True`.
    :type corpus_sent: boolean, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is
        ['.log', '.pickle', '.xml'].
    :type ignore: list of strings, optional

    :returns: c : Corpus or CorpusSent
        Contains the tokenized corpus built from the input plain-text
        corpus. File-level tokens are named by `chunk_name`.

    :See Also: :class:`Corpus`, :class:`CorpusSent`, :meth:`dir_tokenize`,
        :meth:`apply_stoplist`
    """
    chunks = []
    filenames = os.listdir(plain_dir)
    filenames = filter_by_suffix(filenames, ignore)
    filenames.sort()

    for filename in filenames:
        filename = os.path.join(plain_dir, filename)
        with open(filename, mode='r') as f:
            chunks.append(f.read())

    words, tok, sent = dir_tokenize(chunks, filenames, chunk_name=chunk_name,
                                    paragraphs=paragraphs)
    names, data = zip(*tok.items())

    if corpus_sent:
        c = CorpusSent(words, sent, context_data=data, context_types=names,
                       remove_empty=False)
    else:
        c = Corpus(words, context_data=data, context_types=names)
    c = apply_stoplist_len(c, nltk_stop=nltk_stop, add_stop=add_stop,
                           word_len=word_len, freq=stop_freq)

    return c
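

# Usage sketch (added for illustration; not part of the original module).
# Builds a CorpusSent from a directory of plain-text files; the path is
# hypothetical.
#
#     c = dir_corpus('/path/to/plain', chunk_name='article', paragraphs=True,
#                    nltk_stop=True, stop_freq=1, corpus_sent=True)
#     md = c.view_metadata('sentence')
#     print md['sentence_label'][:5]    # labels of the first five sentences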