Replace open() with smart_open() in notebooks. Fix #1789 (#1812)

Merged · 4 commits · Mar 10, 2018
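The substitution this PR applies is mechanical: every built-in open() call in the notebooks becomes smart_open(), which handles local paths as well as S3/HTTP URIs and compressed files. A minimal sketch of the pattern, assuming the smart_open 1.x API (where the function is imported as from smart_open import smart_open; newer releases expose smart_open.open instead) and an illustrative local file path:

from smart_open import smart_open

# Before: built-in open(), local plain-text files only.
# for line in open('datasets/mycorpus.txt'):
#     print(line.lower().split())

# After: smart_open() transparently handles local, remote and compressed files.
# Note that 'rb' yields bytes, so decode before doing text operations.
for line in smart_open('datasets/mycorpus.txt', 'rb'):
    print(line.decode('utf-8').lower().split())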
6 changes: 4 additions & 2 deletions docs/notebooks/Corpora_and_Vector_Spaces.ipynb
@@ -279,9 +279,10 @@
},
"outputs": [],
"source": [
"from smart_open import smart_open\n",
"class MyCorpus(object):\n",
" def __iter__(self):\n",
" for line in open('datasets/mycorpus.txt'):\n",
" for line in smart_open('datasets/mycorpus.txt', 'rb'):\n",
" # assume there's one document per line, tokens separated by whitespace\n",
" yield dictionary.doc2bow(line.lower().split())"
]
@@ -374,9 +375,10 @@
],
"source": [
"from six import iteritems\n",
"from smart_open import smart_open\n",
"\n",
"# collect statistics about all tokens\n",
"dictionary = corpora.Dictionary(line.lower().split() for line in open('datasets/mycorpus.txt'))\n",
"dictionary = corpora.Dictionary(line.lower().split() for line in smart_open('datasets/mycorpus.txt', 'rb'))\n",
"\n",
"# remove stop words and words that appear only once\n",
"stop_ids = [dictionary.token2id[stopword] for stopword in stoplist \n",
16 changes: 8 additions & 8 deletions docs/notebooks/Poincare Evaluation.ipynb
@@ -697,7 +697,7 @@
" parts = first_line.rstrip().split(\"\\t\")\n",
" model_size = len(parts) - 1\n",
" vocab_size = len(lines)\n",
" with open(output_file, 'w') as f:\n",
" with smart_open(output_file, 'w') as f:\n",
" f.write('%d %d\\n' % (vocab_size, model_size))\n",
" for line in lines:\n",
" f.write(line.replace('\\t', ' '))\n",
@@ -709,7 +709,7 @@
" \n",
" model_size = random_embedding.shape[0]\n",
" vocab_size = len(np_embeddings)\n",
" with open(output_file, 'w') as f:\n",
" with smart_open(output_file, 'w') as f:\n",
" f.write('%d %d\\n' % (vocab_size, model_size))\n",
" for key, vector in np_embeddings.items():\n",
" vector_string = ' '.join('%.6f' % value for value in vector)\n",
@@ -1113,7 +1113,7 @@
" test_line_candidates = []\n",
" line_count = 0\n",
" all_nodes = set()\n",
" with open(data_file, 'rb') as f:\n",
" with smart_open(data_file, 'rb') as f:\n",
" for i, line in enumerate(f):\n",
" node_1, node_2 = line.split()\n",
" all_nodes.update([node_1, node_2])\n",
@@ -1135,9 +1135,9 @@
" train_line_indices = set(l for l in range(line_count) if l not in test_line_indices)\n",
" \n",
" train_set_nodes = set()\n",
" with open(data_file, 'rb') as f:\n",
" train_file = open(train_filename, 'wb')\n",
" test_file = open(test_filename, 'wb')\n",
" with smart_open(data_file, 'rb') as f:\n",
" train_file = smart_open(train_filename, 'wb')\n",
" test_file = smart_open(test_filename, 'wb')\n",
" for i, line in enumerate(f):\n",
" if i in train_line_indices:\n",
" train_set_nodes.update(line.split())\n",
@@ -1169,13 +1169,13 @@
" \"\"\"\n",
" root_candidates = set()\n",
" leaf_candidates = set()\n",
" with open(data_file, 'rb') as f:\n",
" with smart_open(data_file, 'rb') as f:\n",
" for line in f:\n",
" nodes = line.split()\n",
" root_candidates.update(nodes)\n",
" leaf_candidates.update(nodes)\n",
" \n",
" with open(data_file, 'rb') as f:\n",
" with smart_open(data_file, 'rb') as f:\n",
" for line in f:\n",
" node_1, node_2 = line.split()\n",
" if node_1 == node_2:\n",
9 changes: 5 additions & 4 deletions docs/notebooks/Tensorboard_visualizations.ipynb
@@ -624,6 +624,7 @@
"import pandas as pd\n",
"import smart_open\n",
"import random\n",
"from smart_open import smart_open\n",
"\n",
"# read data\n",
"dataframe = pd.read_csv('movie_plots.csv')\n",
@@ -803,7 +804,7 @@
},
"outputs": [],
"source": [
"with open('movie_plot_metadata.tsv','w') as w:\n",
"with smart_open('movie_plot_metadata.tsv','w') as w:\n",
" w.write('Titles\\tGenres\\n')\n",
" for i,j in zip(dataframe.Titles, dataframe.Genres):\n",
" w.write(\"%s\\t%s\\n\" % (i,j))"
@@ -1024,14 +1025,14 @@
"outputs": [],
"source": [
"# create file for tensors\n",
"with open('doc_lda_tensor.tsv','w') as w:\n",
"with smart_open('doc_lda_tensor.tsv','w') as w:\n",
" for doc_topics in all_topics:\n",
" for topics in doc_topics:\n",
" w.write(str(topics[1])+ \"\\t\")\n",
" w.write(\"\\n\")\n",
" \n",
"# create file for metadata\n",
"with open('doc_lda_metadata.tsv','w') as w:\n",
"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
" w.write('Titles\\tGenres\\n')\n",
" for j, k in zip(dataframe.Titles, dataframe.Genres):\n",
" w.write(\"%s\\t%s\\n\" % (j, k))"
@@ -1084,7 +1085,7 @@
"\n",
"# overwrite metadata file\n",
"i=0\n",
"with open('doc_lda_metadata.tsv','w') as w:\n",
"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
" w.write('Titles\\tGenres\\n')\n",
" for j,k in zip(dataframe.Titles, dataframe.Genres):\n",
" w.write(\"%s\\t%s\\n\" % (''.join((str(j), str(tensors[i]))),k))\n",
3 changes: 2 additions & 1 deletion docs/notebooks/WMD_tutorial.ipynb
@@ -302,6 +302,7 @@
"start = time()\n",
"\n",
"import json\n",
"from smart_open import smart_open\n",
"\n",
"# Business IDs of the restaurants.\n",
"ids = ['4bEjOyTaDG24SY5TxsaUNQ', '2e2e7WgqU1BnpxmQL5jbfw', 'zt1TpTuJ6y9n551sw9TaEg',\n",
@@ -310,7 +311,7 @@
"w2v_corpus = [] # Documents to train word2vec on (all 6 restaurants).\n",
"wmd_corpus = [] # Documents to run queries against (only one restaurant).\n",
"documents = [] # wmd_corpus, with no pre-processing (so we can see the original documents).\n",
"with open('/data/yelp_academic_dataset_review.json') as data_file:\n",
"with smart_open('/data/yelp_academic_dataset_review.json', 'rb') as data_file:\n",
" for line in data_file:\n",
" json_line = json.loads(line)\n",
" \n",
3 changes: 2 additions & 1 deletion docs/notebooks/Word2Vec_FastText_Comparison.ipynb
@@ -57,11 +57,12 @@
],
"source": [
"import nltk\n",
"from smart_open import smart_open\n",
"nltk.download('brown') \n",
"# Only the brown corpus is needed in case you don't have it.\n",
"\n",
"# Generate brown corpus text file\n",
"with open('brown_corp.txt', 'w+') as f:\n",
"with smart_open('brown_corp.txt', 'w+') as f:\n",
" for word in nltk.corpus.brown.words():\n",
" f.write('{word} '.format(word=word))\n",
"\n",
8 changes: 5 additions & 3 deletions docs/notebooks/Wordrank_comparisons.ipynb
@@ -38,20 +38,21 @@
],
"source": [
"import nltk\n",
"from smart_open import smart_open\n",
"from gensim.parsing.preprocessing import strip_punctuation, strip_multiple_whitespaces\n",
"\n",
"# Only the brown corpus is needed in case you don't have it.\n",
"nltk.download('brown') \n",
"\n",
"# Generate brown corpus text file\n",
"with open('brown_corp.txt', 'w+') as f:\n",
"with smart_open('brown_corp.txt', 'w+') as f:\n",
" for word in nltk.corpus.brown.words():\n",
" f.write('{word} '.format(word=word))\n",
" f.seek(0)\n",
" brown = f.read()\n",
"\n",
"# Preprocess brown corpus\n",
"with open('proc_brown_corp.txt', 'w') as f:\n",
"with smart_open('proc_brown_corp.txt', 'w') as f:\n",
" proc_brown = strip_punctuation(brown)\n",
" proc_brown = strip_multiple_whitespaces(proc_brown).lower()\n",
" f.write(proc_brown)\n",
@@ -1004,12 +1005,13 @@
"import copy\n",
"import multiprocessing\n",
"import numpy as np\n",
"from smart_open import smart_open\n",
"\n",
"\n",
"def compute_accuracies(model, freq):\n",
" # mean_freq will contain analogies together with the mean frequency of 4 words involved\n",
" mean_freq = {}\n",
" with open(word_analogies_file, 'r') as r:\n",
" with smart_open(word_analogies_file, 'r') as r:\n",
" for i, line in enumerate(r):\n",
" if ':' not in line:\n",
" analogy = tuple(line.split())\n",
6 changes: 4 additions & 2 deletions docs/notebooks/atmodel_tutorial.ipynb
@@ -105,6 +105,7 @@
"outputs": [],
"source": [
"import os, re\n",
"from smart_open import smart_open\n",
"\n",
"# Folder containing all NIPS papers.\n",
"data_dir = '/tmp/nipstxt/' # Set this path to the data on your machine.\n",
@@ -125,7 +126,7 @@
" \n",
" # Read document text.\n",
" # Note: ignoring characters that cause encoding errors.\n",
" with open(data_dir + yr_dir + '/' + filen, errors='ignore', encoding='utf-8') as fid:\n",
" with smart_open(data_dir + yr_dir + '/' + filen, encoding='utf-8', 'rb') as fid:\n",
" txt = fid.read()\n",
" \n",
" # Replace any whitespace (newline, tabs, etc.) by a single space.\n",
@@ -149,6 +150,7 @@
},
"outputs": [],
"source": [
"from smart_open import smart_open\n",
"filenames = [data_dir + 'idx/a' + yr + '.txt' for yr in yrs] # Using the years defined in previous cell.\n",
"\n",
"# Get all author names and their corresponding document IDs.\n",
@@ -157,7 +159,7 @@
"for yr in yrs:\n",
" # The files \"a00.txt\" and so on contain the author-document mappings.\n",
" filename = data_dir + 'idx/a' + yr + '.txt'\n",
" for line in open(filename, errors='ignore', encoding='utf-8'):\n",
" for line in smart_open(filename, errors='ignore', encoding='utf-8', 'rb'):\n",
" # Each line corresponds to one author.\n",
" contents = re.split(',', line)\n",
" author_name = (contents[1] + contents[0]).strip()\n",
7 changes: 5 additions & 2 deletions docs/notebooks/doc2vec-IMDB.ipynb
@@ -109,6 +109,7 @@
" return norm_text\n",
"\n",
"import time\n",
"import smart_open\n",
"start = time.clock()\n",
"\n",
"if not os.path.isfile('aclImdb/alldata-id.txt'):\n",
@@ -118,7 +119,7 @@
" print(\"Downloading IMDB archive...\")\n",
" url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename\n",
" r = requests.get(url)\n",
" with open(filename, 'wb') as f:\n",
" with smart_open.smart_open(filename, 'wb') as f:\n",
" f.write(r.content)\n",
" tar = tarfile.open(filename, mode='r')\n",
" tar.extractall()\n",
@@ -190,11 +191,13 @@
"import gensim\n",
"from gensim.models.doc2vec import TaggedDocument\n",
"from collections import namedtuple\n",
"from smart_open import smart_open\n",
"\n",
"SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')\n",
"\n",
"alldocs = [] # Will hold all docs in original order\n",
"with open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:\n",
"with smart_open('aclImdb/alldata-id.txt', 'rb') as alldata:\n",
" alldata = alldata.read().decode("utf-8")\n",
" for line_no, line in enumerate(alldata):\n",
" tokens = gensim.utils.to_unicode(line).split()\n",
" words = tokens[1:]\n",
5 changes: 3 additions & 2 deletions docs/notebooks/gensim_news_classification.ipynb
@@ -63,6 +63,7 @@
"from gensim.models.wrappers import LdaMallet\n",
"from gensim.corpora import Dictionary\n",
"from pprint import pprint\n",
"from smart_open import smart_open\n",
"\n",
"%matplotlib inline"
]
@@ -122,7 +123,7 @@
}
],
"source": [
"with open(lee_train_file) as f:\n",
"with smart_open(lee_train_file, 'rb') as f:\n",
" for n, l in enumerate(f):\n",
" if n < 5:\n",
" print([l])"
@@ -151,7 +152,7 @@
" -------\n",
" yields preprocessed line\n",
" \"\"\"\n",
" with open(fname) as f:\n",
" with smart_open(fname, 'rb') as f:\n",
" for line in f:\n",
" yield gensim.utils.simple_preprocess(line, deacc=True, min_len=3)"
]
3 changes: 2 additions & 1 deletion docs/notebooks/lda_training_tips.ipynb
@@ -53,6 +53,7 @@
"# Read data.\n",
"\n",
"import os\n",
"from smart_open import smart_open\n",
"\n",
"# Folder containing all NIPS papers.\n",
"data_dir = 'nipstxt/'\n",
@@ -67,7 +68,7 @@
" files = os.listdir(data_dir + yr_dir)\n",
" for filen in files:\n",
" # Note: ignoring characters that cause encoding errors.\n",
" with open(data_dir + yr_dir + '/' + filen, errors='ignore') as fid:\n",
" with smart_open(data_dir + yr_dir + '/' + filen, 'rb') as fid:\n",
@piskvorky (Owner) commented on Apr 19, 2018:

Bug (not in this PR, notebook already bad): use os.path.join for joining filesystem paths.

Also, why the change in errors='ignore'?
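A sketch of what the suggested fix could look like (not part of this PR; the path components are hypothetical example values standing in for the data_dir, yr_dir and filen variables the notebook defines, and the explicit decode is one way to keep the errors='ignore' behaviour after switching to binary mode):

import os
from smart_open import smart_open

# Hypothetical example values; the notebook builds these in a loop.
data_dir, yr_dir, filen = 'nipstxt/', 'nips12', '0123.txt'

# Join path components portably instead of concatenating with '/'.
path = os.path.join(data_dir, yr_dir, filen)
with smart_open(path, 'rb') as fid:
    # 'rb' yields bytes; decoding with errors='ignore' mirrors the
    # original open(..., errors='ignore') behaviour.
    txt = fid.read().decode('utf-8', errors='ignore')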

" txt = fid.read()\n",
" docs.append(txt)"
]
5 changes: 3 additions & 2 deletions docs/notebooks/online_w2v_tutorial.ipynb
@@ -28,7 +28,8 @@
"from gensim.models.word2vec import Word2Vec, LineSentence\n",
"from pprint import pprint\n",
"from copy import deepcopy\n",
"from multiprocessing import cpu_count"
"from multiprocessing import cpu_count\n",
"from smart_open import smart_open"
]
},
{
@@ -93,7 +94,7 @@
"outputs": [],
"source": [
"def write_wiki(wiki, name, titles = []):\n",
" with open('{}.wiki'.format(name), 'wb') as f:\n",
" with smart_open('{}.wiki'.format(name), 'wb') as f:\n",
" wiki.metadata = True\n",
" for text, (page_id, title) in wiki.get_texts():\n",
" if title not in titles:\n",
2 changes: 1 addition & 1 deletion docs/notebooks/poincare/poincare_numpy.patch
@@ -291,7 +291,7 @@ index ecae36e..f85bf22 100644
+ emb[neg[1]] = update(emb[neg[1]], -1*der_neg[1])
+ print('Epoch #%d, time taken: %.2f seconds' % (epoch + 1, time.time() - last_time))
+ last_time = time.time()
+ pickle.dump(emb, open(output_file, 'wb'))
+ pickle.dump(emb, smart_open(output_file, 'wb'))
+
+
+if __name__ == "__main__":
3 changes: 2 additions & 1 deletion docs/notebooks/test_notebooks.py
@@ -6,6 +6,7 @@
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
from nbconvert.preprocessors.execute import CellExecutionError
"from smart_open import smart_open\n",


def _notebook_run(path):
@@ -16,7 +17,7 @@ def _notebook_run(path):
this_file_directory = os.path.dirname(__file__)
errors = []
with tempfile.NamedTemporaryFile(suffix=".ipynb", mode='wt') as fout:
with open(path) as f:
with smart_open(path, 'rb') as f:
nb = nbformat.read(f, as_version=4)
nb.metadata.get('kernelspec', {})['name'] = kernel_name
ep = ExecutePreprocessor(kernel_name=kernel_name, timeout=10)
9 changes: 5 additions & 4 deletions docs/notebooks/topic_coherence-movies.ipynb
@@ -38,7 +38,8 @@
"from datetime import datetime\n",
"\n",
"from gensim.models import CoherenceModel\n",
"from gensim.corpora.dictionary import Dictionary"
"from gensim.corpora.dictionary import Dictionary\n",
"from smart_open import smart_open"
]
},
{
@@ -114,7 +115,7 @@
" # as well as pages about a single year.\n",
" # As a result, this preprocessing differs from the paper.\n",
" \n",
" with open(os.path.join(data_dir, fname)) as f:\n",
" with smart_open(os.path.join(data_dir, fname), 'rb') as f:\n",
Owner comment:

This looks like a bug. If you open the file in binary mode, operations like split and lower have a different meaning, compared to text.

I see the same problem in many places in this PR.
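A small illustration of the difference (a sketch only, assuming UTF-8 content; data_dir and fname are hypothetical stand-ins for the notebook's variables):

import os
from smart_open import smart_open

data_dir, fname = 'data', 'topics.txt'  # hypothetical example values

with smart_open(os.path.join(data_dir, fname), 'rb') as f:  # 'rb' yields bytes
    for line in f:
        # On bytes, lower()/split() produce bytes tokens, so b'the' != 'the'
        # and comparisons against str stop words silently fail.
        byte_tokens = line.lower().split()
        # Decoding first restores the text-mode behaviour the notebook expects.
        str_tokens = line.decode('utf-8').lower().split()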

" for line in f:\n",
" # lower case all words\n",
" lowered = line.lower()\n",
@@ -206,7 +207,7 @@
],
"source": [
"topics = [] # list of 100 topics\n",
"with open(topics_path) as f:\n",
"with smart_open(topics_path, 'rb') as f:\n",
" topics = [line.split() for line in f if line]\n",
"len(topics)"
]
@@ -231,7 +232,7 @@
],
"source": [
"human_scores = []\n",
"with open(human_scores_path) as f:\n",
"with smart_open(human_scores_path, 'rb') as f:\n",
" for line in f:\n",
" human_scores.append(float(line.strip()))\n",
"len(human_scores)"
3 changes: 2 additions & 1 deletion docs/notebooks/word2vec.ipynb
@@ -109,13 +109,14 @@
"metadata": {},
"outputs": [],
"source": [
"from smart_open import smart_open\n",
"class MySentences(object):\n",
" def __init__(self, dirname):\n",
" self.dirname = dirname\n",
" \n",
" def __iter__(self):\n",
" for fname in os.listdir(self.dirname):\n",
" for line in open(os.path.join(self.dirname, fname)):\n",
" for line in smart_open(os.path.join(self.dirname, fname), 'rb'):\n",
" yield line.split()"
]
},
Expand Down