{ "metadata": { "name": "", "signature": "sha256:1a2c284881d0d090a8144448ddffa2622095c15049758d95804689d0f0432b74" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "from vsm import *\n", "from vsm.extensions.corpusbuilders import toy_corpus\n", "\n", "\n", "plain_corpus = \"\"\"\n", "His theology challenged the Pope of the Roman Catholic Church by\n", "teaching that the Bible is the only source of divinely revealed\n", "knowledge.\n", "\n", "Augustine is held in the Catholic Church to be the model teacher.\n", "\n", "Augustine was recognized as a Doctor of the Church by Pope Boniface\n", "VIII.\n", "\n", "Roman Catholic theology stated that faith alone cannot justify man.\n", "\n", "In the Catholic Church the Pope is regarded as the successor of Saint\n", "Peter.\n", "\n", "Alonzo Church was an American mathematician and logician who made\n", "major contributions to mathematical logic and the foundations of\n", "theoretical computer science.\n", "\n", "The lambda calculus was introduced by mathematician Alonzo Church as\n", "an investigation into the foundations of mathematics.\n", "\n", "The Church Turing thesis states that a function is algorithmically\n", "computable if and only if it is computable by a Turing machine.\n", "\n", "Mathematical logic has close connections to the foundations of\n", "mathematics, theoretical computer science.\n", "\n", "A Turing machine can be adapted to simulate the logic of any computer\n", "algorithm.\n", "\"\"\"\n", "\n", "metadata = ['Ecclesiastical ' + str(i) for i in xrange(1, 6)]\n", "metadata += ['Logic ' + str(i) for i in xrange(1, 6)]\n", "\n", "c = toy_corpus(plain_corpus, nltk_stop=True, metadata=metadata)\n", "\n", "tf_m = TfMulti(c, 'document')\n", "tf_m.train(2)\n", "\n", "tfidf_m = TfIdf(tf_m.matrix, 'document')\n", "tfidf_m.train()\n", "\n", "lsa_m = Lsa(tfidf_m.matrix, 'document')\n", "lsa_m.train(k_factors=3)\n", "\n", "tf_v = TfViewer(c, tf_m)\n", "tfidf_v = TfIdfViewer(c, tfidf_m)\n", "lsa_v = LsaViewer(c, lsa_m)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Mapping\n" ] }, { "output_type": "stream", "stream": "stderr", "text": [ "/usr/lib/pymodules/python2.7/numpy/ctypeslib.py:411: RuntimeWarning: Item size computed from the PEP 3118 buffer format string does not match the actual item size.\n", " return array(obj, copy=False)\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "Reducing\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "tf_v.dist_word_word('logic', print_len=24)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
Words: logic
Word Distance Word Distance
logic 0.00000 close 0.95532
computer 0.00000 adapted 0.95532
theoretical 0.61548 algorithm 0.95532
science 0.61548 american 0.95532
mathematical 0.61548 mathematician 1.15026
foundations 0.84107 alonzo 1.15026
simulate 0.95532 machine 1.15026
major 0.95532 mathematics 1.15026
made 0.95532 turing 1.30964
logician 0.95532 church 1.35081
contributions 0.95532 pope 1.57080
connections 0.95532 recognized 1.57080
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 2, "text": [ "LabeledColumn([('logic', 0.0), ('computer', 0.0),\n", " ('theoretical', 0.6154797086703874),\n", " ('science', 0.6154797086703874),\n", " ('mathematical', 0.6154797086703874),\n", " ('foundations', 0.84106867056793), ('simulate', 0.9553166181245092),\n", " ('major', 0.9553166181245092), ('made', 0.9553166181245092),\n", " ('logician', 0.9553166181245092),\n", " ('contributions', 0.9553166181245092),\n", " ('connections', 0.9553166181245092), ('close', 0.9553166181245092),\n", " ('adapted', 0.9553166181245092), ('algorithm', 0.9553166181245092),\n", " ('american', 0.9553166181245092),\n", " ('mathematician', 1.1502619915109316),\n", " ('alonzo', 1.1502619915109316), ('machine', 1.1502619915109316),\n", " ('mathematics', 1.1502619915109316), ('turing', 1.3096389158918722),\n", " ('church', 1.3508083493994372), ('pope', 1.5707963267948966),\n", " ('recognized', 1.5707963267948966),\n", " ('regarded', 1.5707963267948966), ('revealed', 1.5707963267948966),\n", " ('roman', 1.5707963267948966), ('saint', 1.5707963267948966),\n", " ('algorithmically', 1.5707963267948966),\n", " ('source', 1.5707963267948966), ('stated', 1.5707963267948966),\n", " ('states', 1.5707963267948966), ('successor', 1.5707963267948966),\n", " ('teacher', 1.5707963267948966), ('teaching', 1.5707963267948966),\n", " ('theology', 1.5707963267948966), ('thesis', 1.5707963267948966),\n", " ('alone', 1.5707963267948966), ('peter', 1.5707963267948966),\n", " ('computable', 1.5707963267948966), ('man', 1.5707963267948966),\n", " ('challenged', 1.5707963267948966),\n", " ('catholic', 1.5707963267948966), ('calculus', 1.5707963267948966),\n", " ('divinely', 1.5707963267948966), ('doctor', 1.5707963267948966),\n", " ('faith', 1.5707963267948966), ('boniface', 1.5707963267948966),\n", " ('model', 1.5707963267948966), ('function', 1.5707963267948966),\n", " ('introduced', 1.5707963267948966),\n", " ('investigation', 1.5707963267948966),\n", " ('justify', 1.5707963267948966), ('knowledge', 1.5707963267948966),\n", " ('lambda', 1.5707963267948966), ('bible', 1.5707963267948966),\n", " ('augustine', 1.5707963267948966), ('held', 1.5707963267948966),\n", " ('viii', 1.5707963267948966)], \n", " dtype=[('word', '|S15'), ('value', 'Words: logicWord Distance Word Distance logic 0.00000 close 0.95532 computer 0.00000 adapted 0.95532 theoretical 0.61548 algorithm 0.95532 science 0.61548 american 0.95532 mathematical 0.61548 mathematician 1.15026 foundations 0.84107 alonzo 1.15026 simulate 0.95532 machine 1.15026 major 0.95532 mathematics 1.15026 made 0.95532 turing 1.30964 logician 0.95532 church 1.35081 contributions 0.95532 pope 1.57080 connections 0.95532 recognized 1.57080 " ], "metadata": {}, "output_type": "pyout", "prompt_number": 3, "text": [ "LabeledColumn([('logic', 0.0), ('computer', 0.0),\n", " ('theoretical', 0.6154797086703874),\n", " ('science', 0.6154797086703874),\n", " ('mathematical', 0.6154797086703874),\n", " ('foundations', 0.8410686705679303),\n", " ('simulate', 0.9553166181245094), ('major', 0.9553166181245094),\n", " ('made', 0.9553166181245094), ('logician', 0.9553166181245094),\n", " ('contributions', 0.9553166181245094),\n", " ('connections', 0.9553166181245094), ('close', 0.9553166181245094),\n", " ('adapted', 0.9553166181245094), ('algorithm', 0.9553166181245094),\n", " ('american', 0.9553166181245094),\n", " ('mathematician', 1.1502619915109316),\n", " ('alonzo', 1.1502619915109316), ('machine', 1.1502619915109316),\n", " ('mathematics', 1.1502619915109316), ('turing', 1.3096389158918724),\n", " ('church', 1.3508083493994372), ('pope', 1.5707963267948966),\n", " ('recognized', 1.5707963267948966),\n", " ('regarded', 1.5707963267948966), ('revealed', 1.5707963267948966),\n", " ('roman', 1.5707963267948966), ('saint', 1.5707963267948966),\n", " ('algorithmically', 1.5707963267948966),\n", " ('source', 1.5707963267948966), ('stated', 1.5707963267948966),\n", " ('states', 1.5707963267948966), ('successor', 1.5707963267948966),\n", " ('teacher', 1.5707963267948966), ('teaching', 1.5707963267948966),\n", " ('theology', 1.5707963267948966), ('thesis', 1.5707963267948966),\n", " ('alone', 1.5707963267948966), ('peter', 1.5707963267948966),\n", " ('computable', 1.5707963267948966), ('man', 1.5707963267948966),\n", " ('challenged', 1.5707963267948966),\n", " ('catholic', 1.5707963267948966), ('calculus', 1.5707963267948966),\n", " ('divinely', 1.5707963267948966), ('doctor', 1.5707963267948966),\n", " ('faith', 1.5707963267948966), ('boniface', 1.5707963267948966),\n", " ('model', 1.5707963267948966), ('function', 1.5707963267948966),\n", " ('introduced', 1.5707963267948966),\n", " ('investigation', 1.5707963267948966),\n", " ('justify', 1.5707963267948966), ('knowledge', 1.5707963267948966),\n", " ('lambda', 1.5707963267948966), ('bible', 1.5707963267948966),\n", " ('augustine', 1.5707963267948966), ('held', 1.5707963267948966),\n", " ('viii', 1.5707963267948966)], \n", " dtype=[('word', '|S15'), ('value', 'Words: logicWord Distance Word Distance logic 0.00000 contributions 0.16133 computer 0.00000 major 0.16133 connections 0.14615 alonzo 0.16665 close 0.14615 mathematician 0.16665 theoretical 0.15619 calculus 0.18072 science 0.15619 introduced 0.18072 mathematical 0.15619 lambda 0.18072 mathematics 0.16088 investigation 0.18072 foundations 0.16111 church 0.85232 made 0.16133 simulate 0.94910 american 0.16133 adapted 0.94910 logician 0.16133 algorithm 0.94910 " ], "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ "LabeledColumn([('logic', 2.1073424255447017e-08),\n", " ('computer', 2.1073424255447017e-08),\n", " ('connections', 0.1461497327030728), ('close', 0.1461497327030728),\n", " ('theoretical', 0.1561931875925689),\n", " ('science', 0.1561931875925689),\n", " ('mathematical', 0.1561931875925689),\n", " ('mathematics', 0.16088388222038064),\n", " ('foundations', 0.16111028907696195), ('made', 0.16132699126936187),\n", " ('american', 0.16132699126936187),\n", " ('logician', 0.16132699126936187),\n", " ('contributions', 0.16132699126936187),\n", " ('major', 0.16132699126936187), ('alonzo', 0.166652610177689),\n", " ('mathematician', 0.166652610177689),\n", " ('calculus', 0.18071500661099674),\n", " ('introduced', 0.18071500661099674),\n", " ('lambda', 0.18071500661099674),\n", " ('investigation', 0.18071500661099674),\n", " ('church', 0.8523211636647396), ('simulate', 0.9491017050433752),\n", " ('adapted', 0.9491017050433752), ('algorithm', 0.9491017050433752),\n", " ('machine', 1.353933874937494), ('turing', 1.3993762078367922),\n", " ('thesis', 1.455329082734973), ('states', 1.455329082734973),\n", " ('algorithmically', 1.455329082734973),\n", " ('computable', 1.455329082734973), ('function', 1.455329082734973),\n", " ('held', 1.4942581924548715), ('teacher', 1.4942581924548715),\n", " ('model', 1.4942581924548715), ('augustine', 1.5036434979434157),\n", " ('recognized', 1.5100453465502426), ('viii', 1.5100453465502426),\n", " ('boniface', 1.5100453465502426), ('doctor', 1.5100453465502426),\n", " ('peter', 1.537296434294445), ('successor', 1.537296434294445),\n", " ('saint', 1.537296434294445), ('regarded', 1.537296434294445),\n", " ('pope', 1.569538765830574), ('catholic', 1.5741457973418245),\n", " ('challenged', 1.5786643468083752), ('bible', 1.5786643468083752),\n", " ('source', 1.5786643468083752), ('knowledge', 1.5786643468083752),\n", " ('teaching', 1.5786643468083752), ('revealed', 1.5786643468083752),\n", " ('divinely', 1.5786643468083752), ('roman', 1.5807179416878123),\n", " ('theology', 1.5807179416878123), ('alone', 1.5857499354125162),\n", " ('stated', 1.5857499354125162), ('justify', 1.5857499354125162),\n", " ('man', 1.5857499354125162), ('faith', 1.5857499354125162)], \n", " dtype=[('word', '|S15'), ('value', 'Documents: Document Distance Ecclesiastical 4 0.00000 Ecclesiastical 1 1.25961 Ecclesiastical 2 1.42595 Ecclesiastical 5 1.43676 Ecclesiastical 3 1.57080 Logic 1 1.57080 Logic 2 1.57080 Logic 3 1.57080 Logic 4 1.57080 Logic 5 1.57080 " ], "metadata": {}, "output_type": "pyout", "prompt_number": 5, "text": [ "LabeledColumn([('Ecclesiastical 4', 1.4901161193847656e-08),\n", " ('Ecclesiastical 1', 1.2596120825173864),\n", " ('Ecclesiastical 2', 1.4259528297963369),\n", " ('Ecclesiastical 5', 1.4367647653836777),\n", " ('Ecclesiastical 3', 1.5707963267948966),\n", " ('Logic 1', 1.5707963267948966), ('Logic 2', 1.5707963267948966),\n", " ('Logic 3', 1.5707963267948966), ('Logic 4', 1.5707963267948966),\n", " ('Logic 5', 1.5707963267948966)], \n", " dtype=[('doc', '|S16'), ('value', 'Documents: Document Distance Ecclesiastical 4 0.00000 Ecclesiastical 1 1.41228 Ecclesiastical 2 1.53742 Ecclesiastical 5 1.54051 Ecclesiastical 3 1.57080 Logic 1 1.57080 Logic 2 1.57080 Logic 3 1.57080 Logic 4 1.57080 Logic 5 1.57080 " ], "metadata": {}, "output_type": "pyout", "prompt_number": 6, "text": [ "LabeledColumn([('Ecclesiastical 4', 0.0), ('Ecclesiastical 1', 1.4122756403782417),\n", " ('Ecclesiastical 2', 1.5374225855153376),\n", " ('Ecclesiastical 5', 1.5405050906164508),\n", " ('Ecclesiastical 3', 1.5707963267948966),\n", " ('Logic 1', 1.5707963267948966), ('Logic 2', 1.5707963267948966),\n", " ('Logic 3', 1.5707963267948966), ('Logic 4', 1.5707963267948966),\n", " ('Logic 5', 1.5707963267948966)], \n", " dtype=[('doc', '|S16'), ('value', 'Words: Document Distance Logic 5 1.32242 Logic 4 1.32915 Logic 1 1.38815 Ecclesiastical 1 1.57080 Ecclesiastical 2 1.57080 Ecclesiastical 3 1.57080 Ecclesiastical 4 1.57080 Ecclesiastical 5 1.57080 Logic 2 1.57080 Logic 3 1.57080 " ], "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ "LabeledColumn([('Logic 5', 1.3224154697632784), ('Logic 4', 1.3291532220319409),\n", " ('Logic 1', 1.388146251745384),\n", " ('Ecclesiastical 1', 1.5707963267948966),\n", " ('Ecclesiastical 2', 1.5707963267948966),\n", " ('Ecclesiastical 3', 1.5707963267948966),\n", " ('Ecclesiastical 4', 1.5707963267948966),\n", " ('Ecclesiastical 5', 1.5707963267948966),\n", " ('Logic 2', 1.5707963267948966), ('Logic 3', 1.5707963267948966)], \n", " dtype=[('doc', '|S16'), ('value', 'Words: Document Distance Ecclesiastical 2 0.73633 Ecclesiastical 3 0.75591 Ecclesiastical 5 0.78818 Ecclesiastical 1 0.83776 Logic 5 0.84555 Ecclesiastical 4 0.84617 Logic 4 0.94199 Logic 2 0.94433 Logic 1 0.94556 Logic 3 1.11314 " ], "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ "LabeledColumn([('Ecclesiastical 2', 0.7363264837344443),\n", " ('Ecclesiastical 3', 0.7559069891816871),\n", " ('Ecclesiastical 5', 0.7881792424138612),\n", " ('Ecclesiastical 1', 0.8377568378998803),\n", " ('Logic 5', 0.8455464728606694),\n", " ('Ecclesiastical 4', 0.8461724249009318),\n", " ('Logic 4', 0.9419921003724298), ('Logic 2', 0.9443332777787585),\n", " ('Logic 1', 0.9455622593137681), ('Logic 3', 1.1131440244046271)], \n", " dtype=[('doc', '|S16'), ('value', 'Documents: Document Distance Ecclesiastical 4 0.00000 Ecclesiastical 1 0.00843 Ecclesiastical 5 0.05804 Ecclesiastical 3 0.09036 Ecclesiastical 2 0.10991 Logic 2 1.56321 Logic 3 1.57501 Logic 1 1.57808 Logic 4 1.58446 Logic 5 1.58765 " ], "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ "LabeledColumn([('Ecclesiastical 4', 0.0),\n", " ('Ecclesiastical 1', 0.008426996338275176),\n", " ('Ecclesiastical 5', 0.05803666882089707),\n", " ('Ecclesiastical 3', 0.09036042875240273),\n", " ('Ecclesiastical 2', 0.10990663361013094),\n", " ('Logic 2', 1.563206166779967), ('Logic 3', 1.5750098468112752),\n", " ('Logic 1', 1.5780801982346484), ('Logic 4', 1.58446244677414),\n", " ('Logic 5', 1.5876493531936278)], \n", " dtype=[('doc', '|S16'), ('value', 'Collection FrequenciesWord Counts Word Counts Word Counts Word Counts church 7 computable 2 calculus 1 states 1 catholic 4 augustine 2 american 1 stated 1 foundations 3 roman 2 close 1 source 1 turing 3 science 2 algorithmically 1 simulate 1 computer 3 alone 1 held 1 saint 1 pope 3 contributions 1 function 1 revealed 1 logic 3 algorithm 1 viii 1 regarded 1 alonzo 2 faith 1 introduced 1 recognized 1 mathematics 2 doctor 1 investigation 1 peter 1 theoretical 2 divinely 1 thesis 1 model 1 machine 2 bible 1 teaching 1 man 1 theology 2 boniface 1 teacher 1 major 1 mathematical 2 connections 1 successor 1 made 1 mathematician 2 challenged 1 " ], "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ "LabeledColumn([('church', 7), ('catholic', 4), ('foundations', 3), ('turing', 3),\n", " ('computer', 3), ('pope', 3), ('logic', 3), ('alonzo', 2),\n", " ('mathematics', 2), ('theoretical', 2), ('machine', 2),\n", " ('theology', 2), ('mathematical', 2), ('mathematician', 2),\n", " ('computable', 2), ('augustine', 2), ('roman', 2), ('science', 2),\n", " ('alone', 1), ('contributions', 1), ('algorithm', 1), ('faith', 1),\n", " ('doctor', 1), ('divinely', 1), ('bible', 1), ('boniface', 1),\n", " ('connections', 1), ('challenged', 1), ('calculus', 1),\n", " ('american', 1), ('close', 1), ('algorithmically', 1), ('held', 1),\n", " ('function', 1), ('viii', 1), ('introduced', 1),\n", " ('investigation', 1), ('thesis', 1), ('teaching', 1),\n", " ('teacher', 1), ('successor', 1), ('states', 1), ('stated', 1),\n", " ('source', 1), ('simulate', 1), ('saint', 1), ('revealed', 1),\n", " ('regarded', 1), ('recognized', 1), ('peter', 1), ('model', 1),\n", " ('man', 1), ('major', 1), ('made', 1), ('logician', 1),\n", " ('lambda', 1), ('knowledge', 1), ('justify', 1), ('adapted', 1)], \n", " dtype=[('word', '|S15'), ('value', 'Words: logicWord Distance Word Distance logic 0.00000 theoretical 0.06325 computer 0.00000 mathematics 0.07048 mathematician 0.04921 connections 0.09843 alonzo 0.04921 close 0.09843 foundations 0.05166 introduced 0.12865 american 0.05320 lambda 0.12865 contributions 0.05320 investigation 0.12865 logician 0.05320 calculus 0.12865 made 0.05320 church 0.18155 major 0.05320 simulate 0.19473 mathematical 0.06325 adapted 0.19473 science 0.06325 algorithm 0.19473 " ], "metadata": {}, "output_type": "pyout", "prompt_number": 18, "text": [ "LabeledColumn([('logic', 0.0), ('computer', 0.0),\n", " ('mathematician', 0.0492069763182823),\n", " ('alonzo', 0.0492069763182823),\n", " ('foundations', 0.05166368875941544),\n", " ('american', 0.05320430492811313),\n", " ('contributions', 0.05320430492811313),\n", " ('logician', 0.05320430492811313), ('made', 0.05320430492811313),\n", " ('major', 0.05320430492811313),\n", " ('mathematical', 0.0632503959697633),\n", " ('science', 0.0632503959697633),\n", " ('theoretical', 0.0632503959697633),\n", " ('mathematics', 0.07048230197449873),\n", " ('connections', 0.09842980401516312),\n", " ('close', 0.09842980401516312), ('introduced', 0.12864998450849807),\n", " ('lambda', 0.12864998450849807),\n", " ('investigation', 0.12864998450849807),\n", " ('calculus', 0.12864998450849807), ('church', 0.18154738827057404),\n", " ('simulate', 0.19473020810363867), ('adapted', 0.19473020810363867),\n", " ('algorithm', 0.19473020810363867), ('held', 0.22833958546575822),\n", " ('teacher', 0.22833958546575822), ('model', 0.22833958546575822),\n", " ('recognized', 0.22893465574112978),\n", " ('boniface', 0.22893465574112978), ('viii', 0.22893465574112978),\n", " ('doctor', 0.22893465574112978), ('augustine', 0.2290393895483237),\n", " ('peter', 0.23082222876942637), ('saint', 0.23082222876942637),\n", " ('successor', 0.23082222876942637),\n", " ('regarded', 0.23082222876942637), ('alone', 0.26236426820101866),\n", " ('justify', 0.26236426820101866), ('stated', 0.26236426820101866),\n", " ('faith', 0.26236426820101866), ('man', 0.26236426820101866),\n", " ('pope', 0.2993972085117987), ('catholic', 0.30013287605358824),\n", " ('machine', 0.3040698811395842),\n", " ('algorithmically', 0.35328304712159975),\n", " ('function', 0.35328304712159975), ('states', 0.35328304712159975),\n", " ('thesis', 0.35328304712159975), ('theology', 0.3796994235501325),\n", " ('roman', 0.3796994235501325), ('revealed', 0.38318073578287704),\n", " ('knowledge', 0.38318073578287704),\n", " ('teaching', 0.38318073578287704), ('bible', 0.38318073578287704),\n", " ('divinely', 0.38318073578287704),\n", " ('challenged', 0.38318073578287704),\n", " ('source', 0.38318073578287704), ('turing', 0.47942958346148457),\n", " ('computable', 0.6109097308593834)], \n", " dtype=[('word', '|S15'), ('value', '