Skip to content

Commit

Permalink
use smart_open for doc2vec-IMDB (#1278)
Browse files Browse the repository at this point in the history
* fix compatibility between Python 2 and Python 3

* require explicit corpus size, epochs for train()

* make all train() calls use explicit count, epochs

* add tests to make sure that ValueError is indeed thrown

* update test

* fix word2vec's reset_from()

* require explicit corpus size, epochs for train()

* make all train() calls use explicit count, epochs

* fix some errors

* fix test error

* drop codecs, use smart_open
  • Loading branch information
robotcator authored and tmylk committed May 2, 2017
1 parent 50a18b9 commit d3b4fc3
Showing 1 changed file with 25 additions and 15 deletions.
40 changes: 25 additions & 15 deletions docs/notebooks/doc2vec-IMDB.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,21 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {
"collapsed": true,
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total running time: 0.0014970000000000816\n"
]
}
],
"source": [
"import locale\n",
"import glob\n",
Expand All @@ -65,6 +73,7 @@
"import tarfile\n",
"import sys\n",
"import codecs\n",
"import smart_open\n",
"\n",
"dirname = 'aclImdb'\n",
"filename = 'aclImdb_v1.tar.gz'\n",
Expand Down Expand Up @@ -113,11 +122,11 @@
" output = fol.replace('/', '-') + '.txt'\n",
"\n",
" # Is there a better pattern to use?\n",
" txt_files = glob.glob('/'.join([dirname, fol, '*.txt']))\n",
" txt_files = glob.glob(os.path.join(dirname, fol, '*.txt'))\n",
"\n",
" for txt in txt_files:\n",
" with codecs.open(txt, 'r', encoding='utf-8') as t:\n",
" t_clean = t.read()\n",
" with smart_open.smart_open(txt, \"rb\") as t:\n",
" t_clean = t.read().decode(\"utf-8\")\n",
"\n",
" for c in control_chars:\n",
" t_clean = t_clean.replace(c, ' ')\n",
Expand All @@ -127,15 +136,16 @@
" temp += \"\\n\"\n",
"\n",
" temp_norm = normalize_text(temp)\n",
" with codecs.open('/'.join([dirname, output]), 'w', encoding='utf-8') as n:\n",
" n.write(temp_norm)\n",
"\n",
" with smart_open.smart_open(os.path.join(dirname, output), \"wb\") as n:\n",
" n.write(temp_norm.encode(\"utf-8\"))\n",
"\n",
" alldata += temp_norm\n",
"\n",
" with codecs.open('/'.join([dirname, 'alldata-id.txt']), 'w', encoding='utf-8') as f:\n",
" with smart_open.smart_open(os.path.join(dirname, 'alldata-id.txt'), 'wb') as f:\n",
" for idx, line in enumerate(alldata.splitlines()):\n",
" num_line = u\"_*{0} {1}\\n\".format(idx, line)\n",
" f.write(num_line)\n",
" f.write(num_line.encode(\"utf-8\"))\n",
"\n",
"end = time.clock()\n",
"print (\"total running time: \", end-start)"
Expand Down Expand Up @@ -1092,21 +1102,21 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"display_name": "Python 3",
"language": "python",
"name": "python2"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
Expand Down

0 comments on commit d3b4fc3

Please sign in to comment.