Skip to content

Commit

Permalink
Fix code/docstring style (piskvorky#1650)
Browse files Browse the repository at this point in the history
* replace open->smart_open in annoy tutorial

* style fixes for lda model diff

* fix for piskvorky#1390

* fix for piskvorky#1423

* fix doc in Phrases
  • Loading branch information
menshikh-iv authored and horpto committed Oct 28, 2017
1 parent 83a6d95 commit 754ea54
Show file tree
Hide file tree
Showing 5 changed files with 2,189 additions and 245 deletions.
210 changes: 89 additions & 121 deletions docs/notebooks/annoytutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -41,21 +41,21 @@
"name": "stdout",
"output_type": "stream",
"text": [
"CPython 3.6.0\n",
"IPython 6.0.0\n",
"CPython 3.5.3\n",
"IPython 6.2.1\n",
"\n",
"gensim 2.1.0\n",
"numpy 1.12.1\n",
"scipy 0.19.0\n",
"psutil 5.2.2\n",
"matplotlib 2.0.0\n",
"gensim 3.0.1\n",
"numpy 1.13.3\n",
"scipy 1.0.0\n",
"psutil 5.4.0\n",
"matplotlib 2.1.0\n",
"\n",
"compiler : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)\n",
"compiler : GCC 6.3.0 20170406\n",
"system : Linux\n",
"release : 4.9.27-moby\n",
"release : 4.10.0-37-generic\n",
"machine : x86_64\n",
"processor : x86_64\n",
"CPU cores : 4\n",
"CPU cores : 8\n",
"interpreter: 64bit\n"
]
}
Expand All @@ -76,9 +76,7 @@
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"import os.path\n",
Expand All @@ -98,9 +96,7 @@
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"LOGS = False\n",
Expand Down Expand Up @@ -136,28 +132,18 @@
"from gensim.models import Word2Vec, KeyedVectors\n",
"from gensim.models.word2vec import Text8Corpus\n",
"\n",
"# using params from Word2Vec_FastText_Comparison\n",
"\n",
"lr = 0.05\n",
"dim = 100\n",
"ws = 5\n",
"epoch = 5\n",
"minCount = 5\n",
"neg = 5\n",
"loss = 'ns'\n",
"t = 1e-4\n",
"# Using params from Word2Vec_FastText_Comparison\n",
"\n",
"# Same values as used for fastText training above\n",
"params = {\n",
" 'alpha': lr,\n",
" 'size': dim,\n",
" 'window': ws,\n",
" 'iter': epoch,\n",
" 'min_count': minCount,\n",
" 'sample': t,\n",
" 'alpha': 0.05,\n",
" 'size': 100,\n",
" 'window': 5,\n",
" 'iter': 5,\n",
" 'min_count': 5,\n",
" 'sample': 1e-4,\n",
" 'sg': 1,\n",
" 'hs': 0,\n",
" 'negative': neg\n",
" 'negative': 5\n",
"}\n",
"\n",
"model = Word2Vec(Text8Corpus('text8'), **params)\n",
Expand All @@ -181,16 +167,11 @@
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"#Set up the model and vector that we are using in the comparison\n",
"try:\n",
" from gensim.similarities.index import AnnoyIndexer\n",
"except ImportError:\n",
" raise ValueError(\"SKIP: Please install the annoy indexer\")\n",
"# Set up the model and vector that we are using in the comparison\n",
"from gensim.similarities.index import AnnoyIndexer\n",
"\n",
"model.init_sims()\n",
"annoy_index = AnnoyIndexer(model, 100)"
Expand All @@ -204,11 +185,11 @@
{
"data": {
"text/plain": [
"[('the', 1.0000001192092896),\n",
" ('of', 0.8333191275596619),\n",
" ('in', 0.8258030414581299),\n",
" ('a', 0.7722446918487549),\n",
" ('and', 0.7408151626586914)]"
"[('the', 0.9999999403953552),\n",
" ('of', 0.8254586458206177),\n",
" ('in', 0.8207480907440186),\n",
" ('a', 0.7935141324996948),\n",
" ('and', 0.7539303302764893)]"
]
},
"execution_count": 6,
Expand All @@ -226,9 +207,7 @@
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"import time\n",
Expand All @@ -238,9 +217,7 @@
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"def avg_query_time(annoy_index=None, queries=1000):\n",
Expand All @@ -266,10 +243,10 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Gensim (s/query):\t0.01409\n",
"Annoy (s/query):\t0.00031\n",
"Gensim (s/query):\t0.02066\n",
"Annoy (s/query):\t0.00038\n",
"\n",
"Annoy is 44.94 times faster on average on this particular run\n"
"Annoy is 54.59 times faster on average on this particular run\n"
]
}
],
Expand Down Expand Up @@ -329,30 +306,30 @@
"output_type": "stream",
"text": [
"Approximate Neighbors\n",
"('science', 0.9998273665114539)\n",
"('multidisciplinary', 0.6123671233654022)\n",
"('sciences', 0.6045806407928467)\n",
"('astrobiology', 0.5991603136062622)\n",
"('aaas', 0.5971885621547699)\n",
"('bimonthly', 0.5882039070129395)\n",
"('interdisciplinary', 0.5875678360462189)\n",
"('psychohistory', 0.5828642845153809)\n",
"('protoscience', 0.5820913016796112)\n",
"('scientific', 0.5779787003993988)\n",
"('transhumanism', 0.5754979848861694)\n",
"('science', 1.0)\n",
"('multidisciplinary', 0.6066591441631317)\n",
"('astrobiology', 0.5995452105998993)\n",
"('actuarial', 0.5984143614768982)\n",
"('robotics', 0.5919757187366486)\n",
"('sciences', 0.5884003043174744)\n",
"('scientific', 0.5805909633636475)\n",
"('interdisciplinary', 0.5763890445232391)\n",
"('astronautics', 0.5748652517795563)\n",
"('psychohistory', 0.5744689702987671)\n",
"('aaas', 0.574154257774353)\n",
"\n",
"Normal (not Annoy-indexed) Neighbors\n",
"('science', 0.9999998807907104)\n",
"('fiction', 0.7650254964828491)\n",
"('multidisciplinary', 0.6994814872741699)\n",
"('sciences', 0.6872870922088623)\n",
"('astrobiology', 0.6786551475524902)\n",
"('aaas', 0.6754858493804932)\n",
"('technology', 0.6748392581939697)\n",
"('bimonthly', 0.6608479619026184)\n",
"('interdisciplinary', 0.6597993969917297)\n",
"('astronautics', 0.6552520990371704)\n",
"('psychohistory', 0.6519955396652222)\n"
"('science', 1.0)\n",
"('fiction', 0.7570418119430542)\n",
"('multidisciplinary', 0.6905661225318909)\n",
"('astrobiology', 0.6792721152305603)\n",
"('actuarial', 0.6774581670761108)\n",
"('robotics', 0.6670321822166443)\n",
"('vinge', 0.6633784770965576)\n",
"('sciences', 0.6611713767051697)\n",
"('vernor', 0.6521490812301636)\n",
"('popularizer', 0.6499912738800049)\n",
"('scientific', 0.648192286491394)\n"
]
}
],
Expand Down Expand Up @@ -406,9 +383,7 @@
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"fname = '/tmp/mymodel.index'\n",
Expand All @@ -432,17 +407,17 @@
"name": "stdout",
"output_type": "stream",
"text": [
"('science', 0.9998273665114539)\n",
"('multidisciplinary', 0.6123671233654022)\n",
"('sciences', 0.6045806407928467)\n",
"('astrobiology', 0.5991603136062622)\n",
"('aaas', 0.5971885621547699)\n",
"('bimonthly', 0.5882039070129395)\n",
"('interdisciplinary', 0.5875678360462189)\n",
"('psychohistory', 0.5828642845153809)\n",
"('protoscience', 0.5820913016796112)\n",
"('scientific', 0.5779787003993988)\n",
"('transhumanism', 0.5754979848861694)\n"
"('science', 1.0)\n",
"('multidisciplinary', 0.6066591441631317)\n",
"('astrobiology', 0.5995452105998993)\n",
"('actuarial', 0.5984143614768982)\n",
"('robotics', 0.5919757187366486)\n",
"('sciences', 0.5884003043174744)\n",
"('scientific', 0.5805909633636475)\n",
"('interdisciplinary', 0.5763890445232391)\n",
"('astronautics', 0.5748652517795563)\n",
"('psychohistory', 0.5744689702987671)\n",
"('aaas', 0.574154257774353)\n"
]
}
],
Expand Down Expand Up @@ -477,9 +452,7 @@
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# Remove verbosity from code below (if logging active)\n",
Expand All @@ -491,9 +464,7 @@
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"from multiprocessing import Process\n",
Expand All @@ -517,16 +488,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Process Id: 311\n",
"Process Id: 18708\n",
"\n",
"Memory used by process 311: pmem(rss=534646784, vms=1907343360, shared=12107776, text=4096, lib=0, data=563171328, dirty=0) \n",
"Memory used by process 18708: pmem(rss=544612352, vms=2047995904, shared=10641408, text=4120576, lib=0, data=823377920, dirty=0)\n",
"---\n",
"Process Id: 320\n",
"Process Id: 18715\n",
"\n",
"Memory used by process 320: pmem(rss=534663168, vms=1907343360, shared=12107776, text=4096, lib=0, data=563204096, dirty=0) \n",
"Memory used by process 18715: pmem(rss=544624640, vms=2047995904, shared=10641408, text=4120576, lib=0, data=823386112, dirty=0)\n",
"---\n",
"CPU times: user 540 ms, sys: 180 ms, total: 720 ms\n",
"Wall time: 24.5 s\n"
"CPU times: user 464 ms, sys: 68 ms, total: 532 ms\n",
"Wall time: 45.3 s\n"
]
}
],
Expand Down Expand Up @@ -569,16 +540,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Process Id: 329\n",
"Process Id: 18733\n",
"\n",
"Memory used by process 329: pmem(rss=514174976, vms=1885904896, shared=142942208, text=4096, lib=0, data=411869184, dirty=0) \n",
"Memory used by process 18733: pmem(rss=525369344, vms=2028597248, shared=140480512, text=4120576, lib=0, data=674148352, dirty=0)\n",
"---\n",
"Process Id: 338\n",
"Process Id: 18740\n",
"\n",
"Memory used by process 338: pmem(rss=514174976, vms=1885904896, shared=142942208, text=4096, lib=0, data=411869184, dirty=0) \n",
"Memory used by process 18740: pmem(rss=525365248, vms=2028597248, shared=140480512, text=4120576, lib=0, data=674148352, dirty=0)\n",
"---\n",
"CPU times: user 490 ms, sys: 210 ms, total: 700 ms\n",
"Wall time: 2.62 s\n"
"CPU times: user 444 ms, sys: 96 ms, total: 540 ms\n",
"Wall time: 2.06 s\n"
]
}
],
Expand Down Expand Up @@ -617,9 +588,7 @@
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
Expand Down Expand Up @@ -719,9 +688,7 @@
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"metadata": {},
"outputs": [],
"source": [
"# To export our model as text\n",
Expand All @@ -737,18 +704,19 @@
"name": "stdout",
"output_type": "stream",
"text": [
"71290 100\n",
"the 0.405333 0.074649 0.154192 0.091247 -0.036666 -0.079057 0.056531 0.012814 0.046281 0.056158 0.209166 -0.046209 0.252618 0.022687 0.239388 -0.122108 -0.028497 -0.098760 -0.334427 0.029130 0.117470 -0.237462 0.064778 -0.053481 -0.165359 0.223160 0.104593 0.144142 0.115136 0.142812 0.201899 0.171716 0.256478 0.142440 -0.150566 -0.175130 0.144592 0.156056 -0.181402 0.103827 -0.173085 0.053641 -0.085016 0.367614 -0.225947 0.033068 0.079073 0.134803 -0.303063 -0.104457 0.079638 -0.135635 -0.072654 0.001361 0.187478 -0.221080 -0.111177 0.071005 0.091342 0.020156 -0.157671 -0.075755 0.098052 -0.065106 0.201720 -0.064369 0.080100 -0.238081 -0.078123 -0.156004 -0.053440 0.234423 -0.117426 -0.127303 0.180088 -0.004023 -0.042677 0.059902 0.453670 -0.063391 -0.049869 0.060019 0.104559 0.085386 -0.071030 -0.117753 -0.032831 0.009222 0.100854 0.082896 -0.288745 -0.015596 -0.138211 0.017519 -0.044955 -0.002358 -0.084262 -0.127057 0.155300 0.342515\n",
"of 0.302899 0.135698 0.276234 0.060655 -0.121023 -0.036229 0.251403 0.087931 0.143489 0.086507 0.171695 -0.108421 0.168884 0.031430 0.128453 -0.157933 -0.041587 -0.012564 -0.242977 -0.134526 0.098855 -0.125527 0.114153 -0.197138 -0.167243 0.415763 -0.067183 0.244922 0.044159 0.178697 0.244680 0.156735 0.322327 0.050362 -0.196953 -0.211732 0.300875 0.184376 -0.071861 -0.000714 0.028612 0.156463 0.046373 0.274268 -0.103168 -0.144895 0.079764 0.314170 -0.236254 -0.108111 0.012367 -0.053291 0.079590 -0.057262 0.221644 -0.259905 -0.120234 0.005212 0.096316 -0.044126 -0.212473 -0.228809 0.089850 -0.023453 0.316282 0.087361 0.168300 -0.239052 0.062733 -0.178071 -0.023161 0.146075 -0.150015 -0.191352 0.136295 0.082557 -0.043620 0.213094 0.413238 -0.205452 -0.115454 -0.051733 0.132394 0.093741 -0.128791 -0.159032 0.015310 -0.135258 -0.099603 -0.042002 -0.193415 -0.032718 -0.341820 0.002871 -0.069954 -0.009055 -0.073843 -0.043583 0.052326 0.348435\n"
"b'71290 100'\n",
"b'the 0.141686 0.255228 -0.191478 0.232801 0.094346 0.120224 0.075487 0.032936 0.154292 -0.063886 -0.321305 0.128102 0.072219 0.081531 -0.080868 -0.000505 -0.094688 -0.031570 -0.022748 -0.030894 0.118537 -0.091672 0.268565 0.017336 -0.158142 0.028882 -0.354505 -0.248104 0.114017 -0.132821 -0.068284 -0.311653 -0.109148 0.071787 0.391749 0.027252 -0.192908 0.323144 0.100474 -0.049426 -0.157461 -0.289598 0.148029 0.059920 -0.084889 -0.012278 0.041439 0.109375 -0.123536 -0.001224 0.112495 -0.138175 0.114445 -0.208958 0.253858 -0.033594 0.145608 0.295680 -0.008925 0.032524 0.192903 0.035965 0.135603 -0.103187 0.162365 0.031851 0.017547 -0.106019 0.094497 0.071965 0.068053 0.024725 -0.003645 0.001062 0.078102 -0.172048 0.093869 -0.035663 -0.166211 0.176462 0.049964 -0.114905 0.024031 -0.058539 -0.117258 -0.351215 -0.025666 -0.211885 0.036296 -0.326675 -0.182654 -0.019680 -0.189521 -0.206698 -0.100391 0.120583 0.076890 -0.010218 0.084345 -0.277560'\n",
"b'of 0.042654 0.329115 -0.062874 0.331052 0.041591 0.141496 0.023409 0.054587 0.003090 0.059803 -0.190404 0.169919 -0.001547 -0.005588 0.060066 0.089611 -0.072265 -0.230048 -0.028314 -0.115761 0.126566 -0.054547 0.366766 0.045456 0.011724 0.010946 -0.237676 -0.323509 0.232554 -0.039293 -0.049269 -0.085853 -0.215061 0.130000 0.347488 0.165928 -0.169574 0.305217 -0.017916 0.034427 -0.133006 -0.144247 0.150204 0.120708 0.053237 -0.183496 0.053565 0.030120 -0.115428 0.030555 0.115227 -0.206632 -0.043280 -0.194560 0.220410 -0.107236 -0.003629 0.253298 0.048558 -0.040416 0.225557 0.091650 0.052787 -0.052910 0.101683 0.113876 -0.105539 -0.056264 0.159010 0.211075 0.057890 -0.017479 0.124350 0.032155 0.097972 -0.220727 0.148302 -0.019309 -0.098981 0.180954 -0.064003 -0.011532 0.148809 0.071048 0.002689 -0.310323 -0.272785 -0.213483 0.030733 -0.217041 -0.346220 0.031555 -0.209962 -0.303856 -0.218638 0.012904 0.188286 0.030006 0.090853 -0.374457'\n"
]
}
],
"source": [
"from smart_open import smart_open\n",
"# View the first 3 lines of the exported file\n",
"\n",
"# The first line has the total number of entries and the vector dimension count. \n",
"# The next lines have a key (a string) followed by its vector.\n",
"with open('/tmp/vectors.txt') as myfile:\n",
"with smart_open('/tmp/vectors.txt') as myfile:\n",
" for i in range(3):\n",
" print(myfile.readline().strip())"
]
Expand Down Expand Up @@ -900,7 +868,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
"version": "3.5.3"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 754ea54

Please sign in to comment.