import numpy as np
from vsm.corpus import Corpus
from vsm.extensions.corpusbuilders import *
from vsm.extensions.corpuscleanup import apply_stoplist_len
from vsm.extensions.htrc import vol_link_fn, add_link_
import os
import re

__all__ = ['CorpusSent', 'sim_sent_sent', 'sim_sent_sent_across',
           'file_tokenize', 'file_corpus', 'dir_tokenize', 'dir_corpus',
           'extend_sdd', 'extend_across']


class CorpusSent(Corpus):
    """
    A subclass of Corpus that stores the original sentence strings
    alongside the tokenized corpus.

    :See Also: :class:`Corpus`
    """
    def __init__(self, corpus, sentences, context_types=[], context_data=[],
                 remove_empty=False):

        super(CorpusSent, self).__init__(corpus, context_types=context_types,
                                         context_data=context_data,
                                         remove_empty=remove_empty)

        sentences = [re.sub('\n', ' ', s) for s in sentences]
        self.sentences = np.array(sentences)


    def __set_words_int(self):
        """
        Mapping of words to their integer representations.
        """
        self.words_int = dict((t, i) for i, t in enumerate(self.words))

    def apply_stoplist(self, stoplist=[], freq=0):
        """
        Returns a copy of the corpus with words in the stoplist removed
        and with words of frequency <= `freq` removed.

        :param stoplist: The list of words to be removed.
        :type stoplist: list

        :param freq: A threshold where words of frequency <= `freq` are
            removed. Default is 0.
        :type freq: integer, optional

        :returns: Copy of the corpus with words in the stoplist and words
            of frequency <= `freq` removed.

        :See Also: :class:`Corpus`
        """
        if freq:
            #TODO: Use the TF model instead

            print 'Computing collection frequencies'
            cfs = np.zeros_like(self.words, dtype=self.corpus.dtype)

            for word in self.corpus:
                cfs[word] += 1

            print 'Selecting words of frequency <=', freq
            freq_stop = np.arange(cfs.size)[(cfs <= freq)]
            stop = set(freq_stop)
        else:
            stop = set()

        for t in stoplist:
            if t in self.words:
                stop.add(self.words_int[t])

        if not stop:
            print 'Stop list is empty.'
            return self

        print 'Removing stop words'
        f = np.vectorize(lambda x: x not in stop)
        corpus = self.corpus[f(self.corpus)]

        print 'Rebuilding corpus'
        corpus = [self.words[i] for i in corpus]
        context_data = []
        for i in xrange(len(self.context_data)):
            print 'Recomputing token breaks:', self.context_types[i]
            tokens = self.view_contexts(self.context_types[i])
            spans = [t[f(t)].size for t in tokens]
            tok = self.context_data[i].copy()
            tok['idx'] = np.cumsum(spans)
            context_data.append(tok)

        return CorpusSent(corpus, self.sentences, context_data=context_data,
                          context_types=self.context_types)
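
    # Example (sketch): CorpusSent.apply_stoplist returns a copy of the
    # corpus with the given words, and all words of frequency <= `freq`,
    # removed. The stop words below are illustrative.
    #
    #     c_stopped = c.apply_stoplist(stoplist=['the', 'of', 'and'], freq=1)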

    @staticmethod
    def load(file):
        """
        Loads data into a CorpusSent object that has been stored using
        `save`.

        :param file: Designates the file to read. If `file` is a string
            ending in `.gz`, the file is first gunzipped. See `numpy.load`
            for further details.
        :type file: string-like or file-like object

        :returns: c : A CorpusSent object storing the data found in `file`.

        :See Also: :class:`Corpus`, :meth:`Corpus.load`, :meth:`numpy.load`
        """
        print 'Loading corpus from', file
        arrays_in = np.load(file)

        c = CorpusSent([], [])
        c.corpus = arrays_in['corpus']
        c.words = arrays_in['words']
        c.sentences = arrays_in['sentences']
        c.context_types = arrays_in['context_types'].tolist()

        c.context_data = list()
        for n in c.context_types:
            t = arrays_in['context_data_' + n]
            c.context_data.append(t)

        c.__set_words_int()

        return c

    def save(self, file):
        """
        Saves data from a CorpusSent object as an `npz` file.

        :param file: Designates the file to which to save data. See
            `numpy.savez` for further details.
        :type file: str-like or file-like object

        :returns: None

        :See Also: :class:`Corpus`, :meth:`Corpus.save`, :meth:`numpy.savez`
        """
        print 'Saving corpus as', file
        arrays_out = dict()
        arrays_out['corpus'] = self.corpus
        arrays_out['words'] = self.words
        arrays_out['sentences'] = self.sentences
        arrays_out['context_types'] = np.asarray(self.context_types)

        for i, t in enumerate(self.context_data):
            key = 'context_data_' + self.context_types[i]
            arrays_out[key] = t

        np.savez(file, **arrays_out)
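
    # Example (sketch): round-tripping a CorpusSent through save/load.
    # The file name is illustrative; the data are written with numpy.savez.
    #
    #     c.save('corpus_sent.npz')
    #     c = CorpusSent.load('corpus_sent.npz')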


    def sent_int(self, sent):
        """
        :param sent: A sentence tokenization given as a list of word
            strings. The list may be a subset of one of the stored
            sentence tokenizations.
        :type sent: list of strings

        :returns: The index of the sentence context containing `sent`,
            or the list of indices if more than one sentence matches.
        """
        tok = self.view_contexts('sentence', as_strings=True)
        keys = [i for i in xrange(len(tok))
                if set(sent).issubset(tok[i].tolist())]

        n = len(keys)
        if n == 0:
            raise Exception('No token fits {0}.'.format(sent))
        elif n > 1:
            return keys
        return keys[0]
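
    # Example (sketch): CorpusSent.sent_int maps a tokenized query back to
    # its sentence index. The token list is illustrative; it must be a
    # subset of one of the stored sentence tokenizations.
    #
    #     idx = c.sent_int(['some', 'tokenized', 'sentence'])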


def sim_sent_sent(ldaviewer, sent, print_len=10):
    """
    ldaviewer : ldaviewer object
    sent : sentence index or sentence as a list of words

    Returns
    -------
    tokenized_sents : list of arrays
        List containing tokenized sentences as arrays.
    orig_sents : list of strings
        List containing original sentences as strings.
    sim_sents : numpy array
        (sentence index, probability) as (i, value) pairs.
    """
    from vsm.viewer.ldagibbsviewer import LDAGibbsViewer

    corp = ldaviewer.corpus
    ind = sent
    if isinstance(sent, list) and isinstance(sent[0], str):
        ind = corp.sent_int(sent)
    sim_sents = ldaviewer.sim_doc_doc(ind, print_len=print_len)
    lc = sim_sents['doc'][:print_len]
    lc = [s.split(', ') for s in lc]
    lc = [int(s[-1]) for s in lc]

    # only returns print_len length
    tokenized_sents, orig_sents = [], []
    for i in lc:
        tokenized_sents.append(corp.view_contexts('sentence', as_strings=True)[i])
        orig_sents.append(corp.sentences[i])

    return tokenized_sents, orig_sents, sim_sents
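

# Example (sketch): given an LDAGibbsViewer `v` trained on a CorpusSent,
# find the sentences most similar to sentence 42 of the same corpus. The
# viewer and the sentence index are illustrative.
#
#     tok_sents, orig_sents, sims = sim_sent_sent(v, 42, print_len=10)
#     print orig_sents[0]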


def sim_sent_sent_across(ldavFrom, ldavTo, beagleviewer, sent, print_len=10,
                         label_fn=vol_link_fn):
    """
    ldavFrom : ldaviewer object for the corpus the sentence comes from.
    ldavTo : ldaviewer object in which to find similar sentences.
    beagleviewer : beagleviewer object, used to find similar words for
        words that don't exist in the target corpus.
    sent : sentence index in the corpus behind `ldavFrom`, or a sentence
        given as a list of words.

    Returns
    -------
    sim_sents : table of the sentences (documents) in the corpus behind
        `ldavTo` most similar to the topics inferred from `sent`, as
        returned by `ldavTo.sim_top_doc` with labels drawn from `label_fn`.
    """
    from vsm.viewer.ldagibbsviewer import LDAGibbsViewer
    from vsm.viewer.beagleviewer import BeagleViewer

    def first_in_corp(corp, wordlist):
        """
        Goes down the list to find a word that's in `corp`.
        Assumes there is a word in `wordlist` that's in `corp`.
        """
        for w in wordlist:
            if w in corp.words:
                return w

    corp = ldavFrom.corpus # to get sent ind
    ind = sent
    word_list = []
    if isinstance(sent, list) and isinstance(sent[0], str):
        ind = corp.sent_int(sent)
        word_list = sent
    elif isinstance(sent, list):
        word_list = set()
        for i in sent:
            li = set(ldavFrom.corpus.view_contexts('sentence',
                                                   as_strings=True)[i])
            word_list.update(li)

    else: # if sent is an int index
        word_list = ldavFrom.corpus.view_contexts('sentence',
                                                  as_strings=True)[ind].tolist()

    word_list = list(word_list)
    # Before trying ldavTo.sim_word_word, make sure all words
    # in the list exist in ldavTo.corpus.
    wl = []
    for w in word_list:
        if w not in ldavTo.corpus.words:
            words = beagleviewer.sim_word_word(w)['word']
            replacement = first_in_corp(ldavTo.corpus, words)
            wl.append(replacement)
            print 'BEAGLE composite model replaced {0} by {1}'.format(w,
                                                                      replacement)
        else:
            wl.append(w)

    # from ldavFrom:sent -> ldavTo:topics -> ldavTo:sent(doc)
    tops = ldavTo.sim_word_top(wl).first_cols[:(ldavTo.model.K/6)]
    tops = [int(t) for t in tops]
    print "Related topics: ", tops
    # sim_sents = ldavTo.sim_top_doc(tops, print_len=print_len,
    #                                as_strings=False)
    # lc = sim_sents['i'][:print_len]
    # tokenized_sents, orig_sents = [], []
    # for i in lc:
    #     tokenized_sents.append(ldavTo.corpus.view_contexts('sentence', as_strings=True)[i])
    #     orig_sents.append(ldavTo.corpus.sentences[i])
    sim_sents = ldavTo.sim_top_doc(tops, print_len=print_len,
                                   label_fn=label_fn)
    return sim_sents
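

# Example (sketch): starting from sentence 42 of the corpus behind `ldav1`,
# find similar sentences in the corpus behind `ldav2`, using a BeagleViewer
# `bv` to substitute words missing from the target corpus. All three viewers
# are illustrative and assumed to be trained already.
#
#     table = sim_sent_sent_across(ldav1, ldav2, bv, 42, print_len=10)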


def extend_sdd(args, v, print_len=10):
    """
    Extends the HTML table produced by sim_doc_doc (with
    label_fn=vol_link_fn) by adding an Argument Map column.
    """
    from vsm.viewer.ldagibbsviewer import LDAGibbsViewer

    sdd = v.sim_doc_doc(args, label_fn=vol_link_fn, print_len=print_len)
    table_str = sdd._repr_html_()
    rows = table_str.split('</tr>')

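    # rows[0] holds the opening of the HTML table emitted by _repr_html_;
    # the "2" replaced below is presumably its column-span value, widened
    # to "3" to make room for the Argument Map column appended to each row.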
    rows[0] = re.sub("2", "3", rows[0]) + '</tr>'
    rows[1] += ('<th style="text-align: center; background: #EFF2FB;">'
                'Argument Map</th></tr>')

    for i in xrange(2, len(rows)-1):
        a = rows[i].split('</a>, ')
        arg = a[1].split(',')[0]

        arg_map = find_arg(arg)
        rows[i] += '<td>{0}</td></tr>'.format(arg_map)

    return ''.join(rows)


def extend_across(vFrom, vTo, beagle_v, args, txtFrom, txtTo, print_len=10):
    """
    Extends the HTML table produced by sim_sent_sent_across (with
    label_fn=vol_link_fn) by adding Argument Map and Novelty columns.
    """
    from vsm.extensions.htrc import add_link_

    across = sim_sent_sent_across(vFrom, vTo, beagle_v, args,
                                  print_len=print_len)
    table_str = across._repr_html_()
    rows = table_str.split('</tr>')

    rows[0] = re.sub("2", "4", rows[0]) + '</tr>'
    rows[1] += ('<th style="text-align: center; background: #EFF2FB;">'
                'Argument Map</th>'
                '<th style="text-align: center; background: #EFF2FB;">'
                'Novelty</th></tr>')

    for i in xrange(2, len(rows)-1):
        a = rows[i].split('</a>, ')
        arg = a[1].split(',')[0]

        novelty = in_ed1(arg, txtTo, txtFrom)
        arg_map = find_arg(novelty)

        # add link to novelty when it's found in the corpusFrom.
        if not novelty == 'new':
            li = novelty.split(' ')
            idx = int(li[0])
            md = vFrom.corpus.view_metadata('sentence')[idx]
            link = add_link_(md['page_urls'], md['sentence_label'])
            li[0] = link
            novelty = ' '.join(li)

        rows[i] += '<td>{0}</td><td>{1}</td></tr>'.format(arg_map, novelty)

    return ''.join(rows)


def in_ed1(idx, difftxt, ed1txt):
    """
    Only for sim_sent_sent_across.
    Returns the corresponding entry from `ed1txt` if `idx` is found in
    `difftxt`: the entry alone when the match is marked as exact ('='),
    otherwise the entry together with the recorded probability.
    Returns 'new' if `idx` does not appear in the table.
    """
    path = '/var/inphosemantics/data/20131214/Washburn/vsm-data/'

    with open(path + ed1txt, 'r') as f1:
        ed1 = f1.read()
        ed1 = ed1.split(',')

    with open(path + difftxt, 'r') as f:
        txt = f.read()
        entries = txt.split(',')

    for i in xrange(len(entries)):
        if entries[i].startswith(str(idx) + ' '):
            if '=' in entries[i]:
                return ed1[i]
            else:
                prob = entries[i].split(' ')[1]
                return ed1[i] + ' ' + prob
    # didn't find idx in the table.
    return 'new'


def find_arg(i):
    """
    Find the arg (e.g. '422') if `i` is one of the analyzed args,
    otherwise return ''.
    """
    import json

    path = '/var/inphosemantics/data/20131214/Washburn/vsm-data/'

    if i == 'new' or '(' in i:
        return ''

    i = int(i)
    with open(path + 'arg_indices.json', 'r') as jsonf:
        indices = json.load(jsonf)

    for k in indices:
        if i in indices[k]:
            return str(k)
    return ''


def file_tokenize(text):
    """
    `file_tokenize` is a helper function for :meth:`file_corpus`.

    Takes a string that is the content of a file and returns words
    and corpus data.

    :param text: Content of a plain text file.
    :type text: string

    :returns: words : List of words.
        Words in `text` tokenized by :meth:`vsm.corpus.util.word_tokenize`.
        corpus_data : Dictionary with context types as keys and
        corresponding tokenizations as values. The tokenizations
        are np.arrays.
        sent_orig : List of the original sentence strings.
    """
    words, par_tokens, sent_tokens, sent_orig = [], [], [], []
    sent_break, par_n, sent_n = 0, 0, 0

    pars = paragraph_tokenize(text)

    for par in pars:
        sents = sentence_tokenize(par)

        for sent in sents:
            w = word_tokenize(sent)
            words.extend(w)
            sent_break += len(w)
            sent_tokens.append((sent_break, par_n, sent_n))
            sent_orig.append(sent)
            sent_n += 1

        par_tokens.append((sent_break, par_n))
        par_n += 1

    idx_dt = ('idx', np.int32)
    sent_label_dt = ('sentence_label', np.array(sent_n, np.str_).dtype)
    par_label_dt = ('paragraph_label', np.array(par_n, np.str_).dtype)

    corpus_data = dict()
    dtype = [idx_dt, par_label_dt]
    corpus_data['paragraph'] = np.array(par_tokens, dtype=dtype)
    dtype = [idx_dt, par_label_dt, sent_label_dt]
    corpus_data['sentence'] = np.array(sent_tokens, dtype=dtype)

    return words, corpus_data, sent_orig
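

# For reference, file_tokenize returns three parallel structures (field names
# taken from the dtypes built above; the layout is a sketch):
#
#   words       -> flat list of word tokens for the whole file
#   corpus_data -> {'paragraph': record array with fields (idx, paragraph_label),
#                   'sentence':  record array with fields (idx, paragraph_label,
#                                                          sentence_label)}
#   sent_orig   -> the original, untokenized sentence strings, in order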


def file_corpus(filename, nltk_stop=True, stop_freq=1, add_stop=None):
    """
    `file_corpus` is a convenience function for generating Corpus
    objects from a plain text corpus contained in a single file.
    `file_corpus` will strip punctuation and arabic numerals outside
    the range 1-29. All letters are made lowercase.

    :param filename: File name of the plain text file.
    :type filename: string-like

    :param nltk_stop: If `True` then the corpus object is masked
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :returns: c : a CorpusSent object
        Contains the tokenized corpus built from the input plain-text
        corpus. Context types are `paragraph` and `sentence`.

    :See Also: :class:`vsm.corpus.Corpus`,
        :meth:`file_tokenize`,
        :meth:`vsm.corpus.util.apply_stoplist`
    """
    with open(filename, mode='r') as f:
        text = f.read()

    words, tok, sent = file_tokenize(text)
    names, data = zip(*tok.items())

    c = CorpusSent(words, sent, context_data=data, context_types=names,
                   remove_empty=False)
    c = apply_stoplist(c, nltk_stop=nltk_stop,
                       freq=stop_freq, add_stop=add_stop)

    return c



def dir_tokenize(chunks, labels, chunk_name='article', paragraphs=True):
    """
    """
    words, chk_tokens, sent_tokens, sent_orig = [], [], [], []
    sent_break, chk_n, sent_n = 0, 0, 0

    if paragraphs:
        par_tokens = []
        par_n = 0

        for chk, label in zip(chunks, labels):
            print 'Tokenizing', label
            pars = paragraph_tokenize(chk)

            for par in pars:
                sents = sentence_tokenize(par)

                for sent in sents:
                    w = word_tokenize(sent)
                    words.extend(w)
                    sent_break += len(w)
                    sent_tokens.append((sent_break, label, par_n, sent_n))
                    sent_orig.append(sent)
                    sent_n += 1

                par_tokens.append((sent_break, label, par_n))
                par_n += 1

            chk_tokens.append((sent_break, label))
            chk_n += 1
    else:
        for chk, label in zip(chunks, labels):
            print 'Tokenizing', label
            sents = sentence_tokenize(chk)

            for sent in sents:
                w = word_tokenize(sent)
                words.extend(w)
                sent_break += len(w)
                sent_tokens.append((sent_break, label, sent_n))
                sent_orig.append(sent)
                sent_n += 1

            chk_tokens.append((sent_break, label))
            chk_n += 1

    idx_dt = ('idx', np.int32)
    label_dt = (chunk_name + '_label', np.array(labels).dtype)
    sent_label_dt = ('sentence_label', np.array(sent_n, np.str_).dtype)
    corpus_data = dict()
    dtype = [idx_dt, label_dt]
    corpus_data[chunk_name] = np.array(chk_tokens, dtype=dtype)

    if paragraphs:
        par_label_dt = ('paragraph_label', np.array(par_n, np.str_).dtype)
        dtype = [idx_dt, label_dt, par_label_dt]
        corpus_data['paragraph'] = np.array(par_tokens, dtype=dtype)
        dtype = [idx_dt, label_dt, par_label_dt, sent_label_dt]
        corpus_data['sentence'] = np.array(sent_tokens, dtype=dtype)
    else:
        dtype = [idx_dt, label_dt, sent_label_dt]
        corpus_data['sentence'] = np.array(sent_tokens, dtype=dtype)

    return words, corpus_data, sent_orig



def dir_corpus(plain_dir, chunk_name='article', paragraphs=True, word_len=2,
               nltk_stop=True, stop_freq=1, add_stop=None, corpus_sent=True,
               ignore=['.log', '.pickle', '.xml']):
    """
    `dir_corpus` is a convenience function for generating Corpus
    objects from a directory of plain text files.

    `dir_corpus` will retain file-level tokenization and perform
    sentence and word tokenizations. Optionally, it will provide
    paragraph-level tokenizations.

    It will also strip punctuation and arabic numerals outside the
    range 1-29. All letters are made lowercase.

    :param plain_dir: String containing directory containing a
        plain-text corpus.
    :type plain_dir: string-like

    :param chunk_name: The name of the tokenization corresponding
        to individual files. For example, if the files are pages
        of a book, one might set `chunk_name` to `pages`. Default
        is `article`.
    :type chunk_name: string-like, optional

    :param paragraphs: If `True`, a paragraph-level tokenization
        is included. Defaults to `True`.
    :type paragraphs: boolean, optional

    :param word_len: Filters words whose lengths are <= word_len.
        Default is 2.
    :type word_len: int, optional

    :param nltk_stop: If `True` then the corpus object is masked
        using the NLTK English stop words. Default is `True`.
    :type nltk_stop: boolean, optional

    :param stop_freq: The upper bound for a word to be masked on
        the basis of its collection frequency. Default is 1.
    :type stop_freq: int, optional

    :param corpus_sent: If `True` a CorpusSent object is returned.
        Otherwise a Corpus object is returned. Default is `True`.
    :type corpus_sent: boolean, optional

    :param add_stop: A list of stop words. Default is `None`.
    :type add_stop: array-like, optional

    :param ignore: The list containing suffixes of files to be filtered.
        The suffix strings are normally file types. Default is
        ['.log', '.pickle', '.xml'].
    :type ignore: list of strings, optional

    :returns: c : Corpus or CorpusSent
        Contains the tokenized corpus built from the input plain-text
        corpus. File-level tokens are named by `chunk_name`.

    :See Also: :class:`Corpus`, :class:`CorpusSent`, :meth:`dir_tokenize`,
        :meth:`apply_stoplist`
    """
    chunks = []
    filenames = os.listdir(plain_dir)
    filenames = filter_by_suffix(filenames, ignore)
    filenames.sort()

    for filename in filenames:
        filename = os.path.join(plain_dir, filename)
        with open(filename, mode='r') as f:
            chunks.append(f.read())

    words, tok, sent = dir_tokenize(chunks, filenames, chunk_name=chunk_name,
                                    paragraphs=paragraphs)
    names, data = zip(*tok.items())

    if corpus_sent:
        c = CorpusSent(words, sent, context_data=data, context_types=names,
                       remove_empty=False)
    else:
        c = Corpus(words, context_data=data, context_types=names)
    c = apply_stoplist_len(c, nltk_stop=nltk_stop, add_stop=add_stop,
                           word_len=word_len, freq=stop_freq)

    return c
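

# End-to-end sketch (hypothetical paths; the LDA training step is elided):
# build a sentence-aware corpus from a directory of plain-text files,
# persist it, and, given an LDAGibbsViewer `v` trained on it, query for
# sentences similar to sentence 0.
#
#     c = dir_corpus('/path/to/plain_text_dir', chunk_name='article')
#     c.save('corpus_sent.npz')
#     c = CorpusSent.load('corpus_sent.npz')
#     tok_sents, orig_sents, sims = sim_sent_sent(v, 0, print_len=10)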