use smart_open for doc2vec-IMDB (#1278)

* fix the compatibility between python2 & 3
* require explicit corpus size, epochs for train()
* make all train() calls use explicit count, epochs
* add tests to make sure that ValueError is indeed thrown
* update test
* fix the word2vec's reset_from()
* require explicit corpus size, epochs for train()
* make all train() calls use explicit count, epochs
* fix some error
* fix test error
* drop codecs, use smart_open
piskvorky · May 2, 2017 · d3b4fc3 · d3b4fc3
1 parent 50a18b9
commit d3b4fc3
Showing 1 changed file with 25 additions and 15 deletions.
diff --git a/docs/notebooks/doc2vec-IMDB.ipynb b/docs/notebooks/doc2vec-IMDB.ipynb
@@ -50,13 +50,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {
-    "collapsed": true,
+    "collapsed": false,
     "deletable": true,
     "editable": true
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total running time:  0.0014970000000000816\n"
+     ]
+    }
+   ],
    "source": [
     "import locale\n",
     "import glob\n",
@@ -65,6 +73,7 @@
     "import tarfile\n",
     "import sys\n",
     "import codecs\n",
+    "import smart_open\n",
     "\n",
     "dirname = 'aclImdb'\n",
     "filename = 'aclImdb_v1.tar.gz'\n",
@@ -113,11 +122,11 @@
     "        output = fol.replace('/', '-') + '.txt'\n",
     "\n",
     "        # Is there a better pattern to use?\n",
-    "        txt_files = glob.glob('/'.join([dirname, fol, '*.txt']))\n",
+    "        txt_files = glob.glob(os.path.join(dirname, fol, '*.txt'))\n",
     "\n",
     "        for txt in txt_files:\n",
-    "            with codecs.open(txt, 'r', encoding='utf-8') as t:\n",
-    "                t_clean = t.read()\n",
+    "            with smart_open.smart_open(txt, \"rb\") as t:\n",
+    "                t_clean = t.read().decode(\"utf-8\")\n",
     "\n",
     "                for c in control_chars:\n",
     "                    t_clean = t_clean.replace(c, ' ')\n",
@@ -127,15 +136,16 @@
     "            temp += \"\\n\"\n",
     "\n",
     "        temp_norm = normalize_text(temp)\n",
-    "        with codecs.open('/'.join([dirname, output]), 'w', encoding='utf-8') as n:\n",
-    "            n.write(temp_norm)\n",
+    "\n",
+    "        with smart_open.smart_open(os.path.join(dirname, output), \"wb\") as n:\n",
+    "            n.write(temp_norm.encode(\"utf-8\"))\n",
     "\n",
     "        alldata += temp_norm\n",
     "\n",
-    "    with codecs.open('/'.join([dirname, 'alldata-id.txt']), 'w', encoding='utf-8') as f:\n",
+    "    with smart_open.smart_open(os.path.join(dirname, 'alldata-id.txt'), 'wb') as f:\n",
     "        for idx, line in enumerate(alldata.splitlines()):\n",
     "            num_line = u\"_*{0} {1}\\n\".format(idx, line)\n",
-    "            f.write(num_line)\n",
+    "            f.write(num_line.encode(\"utf-8\"))\n",
     "\n",
     "end = time.clock()\n",
     "print (\"total running time: \", end-start)"
@@ -1092,21 +1102,21 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 2",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "python2"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
-    "version": 2
+    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython2",
-   "version": "2.7.6"
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
   }
  },
  "nbformat": 4,