diff --git a/docs/notebooks/Corpora_and_Vector_Spaces.ipynb b/docs/notebooks/Corpora_and_Vector_Spaces.ipynb index 72bb57b47e..0d6c21760b 100644 --- a/docs/notebooks/Corpora_and_Vector_Spaces.ipynb +++ b/docs/notebooks/Corpora_and_Vector_Spaces.ipynb @@ -279,9 +279,10 @@ }, "outputs": [], "source": [ + "from smart_open import smart_open\n", "class MyCorpus(object):\n", " def __iter__(self):\n", - " for line in open('datasets/mycorpus.txt'):\n", + " for line in smart_open('datasets/mycorpus.txt', 'rb'):\n", " # assume there's one document per line, tokens separated by whitespace\n", " yield dictionary.doc2bow(line.lower().split())" ] @@ -374,9 +375,10 @@ ], "source": [ "from six import iteritems\n", + "from smart_open import smart_open\n", "\n", "# collect statistics about all tokens\n", - "dictionary = corpora.Dictionary(line.lower().split() for line in open('datasets/mycorpus.txt'))\n", + "dictionary = corpora.Dictionary(line.lower().split() for line in smart_open('datasets/mycorpus.txt', 'rb'))\n", "\n", "# remove stop words and words that appear only once\n", "stop_ids = [dictionary.token2id[stopword] for stopword in stoplist \n", diff --git a/docs/notebooks/Poincare Evaluation.ipynb b/docs/notebooks/Poincare Evaluation.ipynb index cffd7c8491..0d3f8bb851 100644 --- a/docs/notebooks/Poincare Evaluation.ipynb +++ b/docs/notebooks/Poincare Evaluation.ipynb @@ -697,7 +697,7 @@ " parts = first_line.rstrip().split(\"\\t\")\n", " model_size = len(parts) - 1\n", " vocab_size = len(lines)\n", - " with open(output_file, 'w') as f:\n", + " with smart_open(output_file, 'w') as f:\n", " f.write('%d %d\\n' % (vocab_size, model_size))\n", " for line in lines:\n", " f.write(line.replace('\\t', ' '))\n", @@ -709,7 +709,7 @@ " \n", " model_size = random_embedding.shape[0]\n", " vocab_size = len(np_embeddings)\n", - " with open(output_file, 'w') as f:\n", + " with smart_open(output_file, 'w') as f:\n", " f.write('%d %d\\n' % (vocab_size, model_size))\n", " for key, vector in np_embeddings.items():\n", " vector_string = ' '.join('%.6f' % value for value in vector)\n", @@ -1113,7 +1113,7 @@ " test_line_candidates = []\n", " line_count = 0\n", " all_nodes = set()\n", - " with open(data_file, 'rb') as f:\n", + " with smart_open(data_file, 'rb') as f:\n", " for i, line in enumerate(f):\n", " node_1, node_2 = line.split()\n", " all_nodes.update([node_1, node_2])\n", @@ -1135,9 +1135,9 @@ " train_line_indices = set(l for l in range(line_count) if l not in test_line_indices)\n", " \n", " train_set_nodes = set()\n", - " with open(data_file, 'rb') as f:\n", - " train_file = open(train_filename, 'wb')\n", - " test_file = open(test_filename, 'wb')\n", + " with smart_open(data_file, 'rb') as f:\n", + " train_file = smart_open(train_filename, 'wb')\n", + " test_file = smart_open(test_filename, 'wb')\n", " for i, line in enumerate(f):\n", " if i in train_line_indices:\n", " train_set_nodes.update(line.split())\n", @@ -1169,13 +1169,13 @@ " \"\"\"\n", " root_candidates = set()\n", " leaf_candidates = set()\n", - " with open(data_file, 'rb') as f:\n", + " with smart_open(data_file, 'rb') as f:\n", " for line in f:\n", " nodes = line.split()\n", " root_candidates.update(nodes)\n", " leaf_candidates.update(nodes)\n", " \n", - " with open(data_file, 'rb') as f:\n", + " with smart_open(data_file, 'rb') as f:\n", " for line in f:\n", " node_1, node_2 = line.split()\n", " if node_1 == node_2:\n", diff --git a/docs/notebooks/Tensorboard_visualizations.ipynb b/docs/notebooks/Tensorboard_visualizations.ipynb index 
915878097e..a2d88e9619 100644 --- a/docs/notebooks/Tensorboard_visualizations.ipynb +++ b/docs/notebooks/Tensorboard_visualizations.ipynb @@ -624,6 +624,7 @@ "import pandas as pd\n", "import smart_open\n", "import random\n", + "from smart_open import smart_open\n", "\n", "# read data\n", "dataframe = pd.read_csv('movie_plots.csv')\n", @@ -803,7 +804,7 @@ }, "outputs": [], "source": [ - "with open('movie_plot_metadata.tsv','w') as w:\n", + "with smart_open('movie_plot_metadata.tsv','w') as w:\n", " w.write('Titles\\tGenres\\n')\n", " for i,j in zip(dataframe.Titles, dataframe.Genres):\n", " w.write(\"%s\\t%s\\n\" % (i,j))" @@ -1024,14 +1025,14 @@ "outputs": [], "source": [ "# create file for tensors\n", - "with open('doc_lda_tensor.tsv','w') as w:\n", + "with smart_open('doc_lda_tensor.tsv','w') as w:\n", " for doc_topics in all_topics:\n", " for topics in doc_topics:\n", " w.write(str(topics[1])+ \"\\t\")\n", " w.write(\"\\n\")\n", " \n", "# create file for metadata\n", - "with open('doc_lda_metadata.tsv','w') as w:\n", + "with smart_open('doc_lda_metadata.tsv','w') as w:\n", " w.write('Titles\\tGenres\\n')\n", " for j, k in zip(dataframe.Titles, dataframe.Genres):\n", " w.write(\"%s\\t%s\\n\" % (j, k))" @@ -1084,7 +1085,7 @@ "\n", "# overwrite metadata file\n", "i=0\n", - "with open('doc_lda_metadata.tsv','w') as w:\n", + "with smart_open('doc_lda_metadata.tsv','w') as w:\n", " w.write('Titles\\tGenres\\n')\n", " for j,k in zip(dataframe.Titles, dataframe.Genres):\n", " w.write(\"%s\\t%s\\n\" % (''.join((str(j), str(tensors[i]))),k))\n", diff --git a/docs/notebooks/WMD_tutorial.ipynb b/docs/notebooks/WMD_tutorial.ipynb index aca190edc7..3a529f471e 100644 --- a/docs/notebooks/WMD_tutorial.ipynb +++ b/docs/notebooks/WMD_tutorial.ipynb @@ -302,6 +302,7 @@ "start = time()\n", "\n", "import json\n", + "from smart_open import smart_open\n", "\n", "# Business IDs of the restaurants.\n", "ids = ['4bEjOyTaDG24SY5TxsaUNQ', '2e2e7WgqU1BnpxmQL5jbfw', 'zt1TpTuJ6y9n551sw9TaEg',\n", @@ -310,7 +311,7 @@ "w2v_corpus = [] # Documents to train word2vec on (all 6 restaurants).\n", "wmd_corpus = [] # Documents to run queries against (only one restaurant).\n", "documents = [] # wmd_corpus, with no pre-processing (so we can see the original documents).\n", - "with open('/data/yelp_academic_dataset_review.json') as data_file:\n", + "with smart_open('/data/yelp_academic_dataset_review.json', 'rb') as data_file:\n", " for line in data_file:\n", " json_line = json.loads(line)\n", " \n", diff --git a/docs/notebooks/Word2Vec_FastText_Comparison.ipynb b/docs/notebooks/Word2Vec_FastText_Comparison.ipynb index a9191dcb28..b1d3f914e3 100644 --- a/docs/notebooks/Word2Vec_FastText_Comparison.ipynb +++ b/docs/notebooks/Word2Vec_FastText_Comparison.ipynb @@ -57,11 +57,12 @@ ], "source": [ "import nltk\n", + "from smart_open import smart_open\n", "nltk.download('brown') \n", "# Only the brown corpus is needed in case you don't have it.\n", "\n", "# Generate brown corpus text file\n", - "with open('brown_corp.txt', 'w+') as f:\n", + "with smart_open('brown_corp.txt', 'w+') as f:\n", " for word in nltk.corpus.brown.words():\n", " f.write('{word} '.format(word=word))\n", "\n", diff --git a/docs/notebooks/Wordrank_comparisons.ipynb b/docs/notebooks/Wordrank_comparisons.ipynb index 7bb7fd22c6..26bac2e880 100644 --- a/docs/notebooks/Wordrank_comparisons.ipynb +++ b/docs/notebooks/Wordrank_comparisons.ipynb @@ -38,20 +38,21 @@ ], "source": [ "import nltk\n", + "from smart_open import smart_open\n", "from gensim.parsing.preprocessing 
import strip_punctuation, strip_multiple_whitespaces\n", "\n", "# Only the brown corpus is needed in case you don't have it.\n", "nltk.download('brown') \n", "\n", "# Generate brown corpus text file\n", - "with open('brown_corp.txt', 'w+') as f:\n", + "with smart_open('brown_corp.txt', 'w+') as f:\n", " for word in nltk.corpus.brown.words():\n", " f.write('{word} '.format(word=word))\n", " f.seek(0)\n", " brown = f.read()\n", "\n", "# Preprocess brown corpus\n", - "with open('proc_brown_corp.txt', 'w') as f:\n", + "with smart_open('proc_brown_corp.txt', 'w') as f:\n", " proc_brown = strip_punctuation(brown)\n", " proc_brown = strip_multiple_whitespaces(proc_brown).lower()\n", " f.write(proc_brown)\n", @@ -1004,12 +1005,13 @@ "import copy\n", "import multiprocessing\n", "import numpy as np\n", + "from smart_open import smart_open\n", "\n", "\n", "def compute_accuracies(model, freq):\n", " # mean_freq will contain analogies together with the mean frequency of 4 words involved\n", " mean_freq = {}\n", - " with open(word_analogies_file, 'r') as r:\n", + " with smart_open(word_analogies_file, 'r') as r:\n", " for i, line in enumerate(r):\n", " if ':' not in line:\n", " analogy = tuple(line.split())\n", diff --git a/docs/notebooks/atmodel_tutorial.ipynb b/docs/notebooks/atmodel_tutorial.ipynb index 80bb993537..2699e3519c 100644 --- a/docs/notebooks/atmodel_tutorial.ipynb +++ b/docs/notebooks/atmodel_tutorial.ipynb @@ -105,6 +105,7 @@ "outputs": [], "source": [ "import os, re\n", + "from smart_open import smart_open\n", "\n", "# Folder containing all NIPS papers.\n", "data_dir = '/tmp/nipstxt/' # Set this path to the data on your machine.\n", @@ -125,7 +126,7 @@ " \n", " # Read document text.\n", " # Note: ignoring characters that cause encoding errors.\n", - " with open(data_dir + yr_dir + '/' + filen, errors='ignore', encoding='utf-8') as fid:\n", + " with smart_open(data_dir + yr_dir + '/' + filen, 'r', errors='ignore', encoding='utf-8') as fid:\n", " txt = fid.read()\n", " \n", " # Replace any whitespace (newline, tabs, etc.) 
by a single space.\n", @@ -149,6 +150,7 @@ }, "outputs": [], "source": [ + "from smart_open import smart_open\n", "filenames = [data_dir + 'idx/a' + yr + '.txt' for yr in yrs] # Using the years defined in previous cell.\n", "\n", "# Get all author names and their corresponding document IDs.\n", "for yr in yrs:\n", " # The files \"a00.txt\" and so on contain the author-document mappings.\n", " filename = data_dir + 'idx/a' + yr + '.txt'\n", - " for line in open(filename, errors='ignore', encoding='utf-8'):\n", + " for line in smart_open(filename, 'r', errors='ignore', encoding='utf-8'):\n", " # Each line corresponds to one author.\n", " contents = re.split(',', line)\n", " author_name = (contents[1] + contents[0]).strip()\n", diff --git a/docs/notebooks/doc2vec-IMDB.ipynb b/docs/notebooks/doc2vec-IMDB.ipynb index 415f9ae837..ea66f1db7b 100644 --- a/docs/notebooks/doc2vec-IMDB.ipynb +++ b/docs/notebooks/doc2vec-IMDB.ipynb @@ -109,6 +109,7 @@ " return norm_text\n", "\n", "import time\n", + "import smart_open\n", "start = time.clock()\n", "\n", "if not os.path.isfile('aclImdb/alldata-id.txt'):\n", @@ -118,7 +119,7 @@ " print(\"Downloading IMDB archive...\")\n", " url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename\n", " r = requests.get(url)\n", - " with open(filename, 'wb') as f:\n", + " with smart_open.smart_open(filename, 'wb') as f:\n", " f.write(r.content)\n", " tar = tarfile.open(filename, mode='r')\n", " tar.extractall()\n", @@ -190,11 +191,13 @@ "import gensim\n", "from gensim.models.doc2vec import TaggedDocument\n", "from collections import namedtuple\n", + "from smart_open import smart_open\n", "\n", "SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')\n", "\n", "alldocs = [] # Will hold all docs in original order\n", - "with open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:\n", + "with smart_open('aclImdb/alldata-id.txt', 'rb') as alldata:\n", + " alldata = alldata.read().decode(\"utf-8\").splitlines()\n", " for line_no, line in enumerate(alldata):\n", " tokens = gensim.utils.to_unicode(line).split()\n", " words = tokens[1:]\n", diff --git a/docs/notebooks/gensim_news_classification.ipynb b/docs/notebooks/gensim_news_classification.ipynb index 6beae15285..e3d7ac8b9b 100644 --- a/docs/notebooks/gensim_news_classification.ipynb +++ b/docs/notebooks/gensim_news_classification.ipynb @@ -63,6 +63,7 @@ "from gensim.models.wrappers import LdaMallet\n", "from gensim.corpora import Dictionary\n", "from pprint import pprint\n", + "from smart_open import smart_open\n", "\n", "%matplotlib inline" ] @@ -122,7 +123,7 @@ } ], "source": [ - "with open(lee_train_file) as f:\n", + "with smart_open(lee_train_file, 'rb') as f:\n", " for n, l in enumerate(f):\n", " if n < 5:\n", " print([l])" ] @@ -151,7 +152,7 @@ " -------\n", " yields preprocessed line\n", " \"\"\"\n", - " with open(fname) as f:\n", + " with smart_open(fname, 'rb') as f:\n", " for line in f:\n", " yield gensim.utils.simple_preprocess(line, deacc=True, min_len=3)" ] diff --git a/docs/notebooks/lda_training_tips.ipynb b/docs/notebooks/lda_training_tips.ipynb index 3987ffe1a2..674973172c 100644 --- a/docs/notebooks/lda_training_tips.ipynb +++ b/docs/notebooks/lda_training_tips.ipynb @@ -53,6 +53,7 @@ "# Read data.\n", "\n", "import os\n", + "from smart_open import smart_open\n", "\n", "# Folder containing all NIPS papers.\n", "data_dir = 'nipstxt/'\n", @@ -67,7 +68,7 @@ " files = os.listdir(data_dir + yr_dir)\n", " for filen in files:\n", " # Note: ignoring characters that cause 
encoding errors.\n", - " with open(data_dir + yr_dir + '/' + filen, errors='ignore') as fid:\n", + " with smart_open(data_dir + yr_dir + '/' + filen, 'rb') as fid:\n", " txt = fid.read()\n", " docs.append(txt)" ] diff --git a/docs/notebooks/online_w2v_tutorial.ipynb b/docs/notebooks/online_w2v_tutorial.ipynb index ed51565272..02923a7a04 100644 --- a/docs/notebooks/online_w2v_tutorial.ipynb +++ b/docs/notebooks/online_w2v_tutorial.ipynb @@ -28,7 +28,8 @@ "from gensim.models.word2vec import Word2Vec, LineSentence\n", "from pprint import pprint\n", "from copy import deepcopy\n", - "from multiprocessing import cpu_count" + "from multiprocessing import cpu_count\n", + "from smart_open import smart_open" ] }, { @@ -93,7 +94,7 @@ "outputs": [], "source": [ "def write_wiki(wiki, name, titles = []):\n", - " with open('{}.wiki'.format(name), 'wb') as f:\n", + " with smart_open('{}.wiki'.format(name), 'wb') as f:\n", " wiki.metadata = True\n", " for text, (page_id, title) in wiki.get_texts():\n", " if title not in titles:\n", diff --git a/docs/notebooks/poincare/poincare_numpy.patch b/docs/notebooks/poincare/poincare_numpy.patch index 8134bea4a3..81c42e1c07 100644 --- a/docs/notebooks/poincare/poincare_numpy.patch +++ b/docs/notebooks/poincare/poincare_numpy.patch @@ -291,7 +291,7 @@ index ecae36e..f85bf22 100644 + emb[neg[1]] = update(emb[neg[1]], -1*der_neg[1]) + print('Epoch #%d, time taken: %.2f seconds' % (epoch + 1, time.time() - last_time)) + last_time = time.time() -+ pickle.dump(emb, open(output_file, 'wb')) ++ pickle.dump(emb, smart_open(output_file, 'wb')) + + +if __name__ == "__main__": diff --git a/docs/notebooks/test_notebooks.py b/docs/notebooks/test_notebooks.py index 77633b7037..1a0b73f44b 100644 --- a/docs/notebooks/test_notebooks.py +++ b/docs/notebooks/test_notebooks.py @@ -6,6 +6,7 @@ import nbformat from nbconvert.preprocessors import ExecutePreprocessor from nbconvert.preprocessors.execute import CellExecutionError +from smart_open import smart_open def _notebook_run(path): @@ -16,7 +17,7 @@ def _notebook_run(path): this_file_directory = os.path.dirname(__file__) errors = [] with tempfile.NamedTemporaryFile(suffix=".ipynb", mode='wt') as fout: - with open(path) as f: + with smart_open(path, 'rb') as f: nb = nbformat.read(f, as_version=4) nb.metadata.get('kernelspec', {})['name'] = kernel_name ep = ExecutePreprocessor(kernel_name=kernel_name, timeout=10) diff --git a/docs/notebooks/topic_coherence-movies.ipynb b/docs/notebooks/topic_coherence-movies.ipynb index 983905b31e..3cb4eb2752 100644 --- a/docs/notebooks/topic_coherence-movies.ipynb +++ b/docs/notebooks/topic_coherence-movies.ipynb @@ -38,7 +38,8 @@ "from datetime import datetime\n", "\n", "from gensim.models import CoherenceModel\n", - "from gensim.corpora.dictionary import Dictionary" + "from gensim.corpora.dictionary import Dictionary\n", + "from smart_open import smart_open" ] }, { @@ -114,7 +115,7 @@ " # as well as pages about a single year.\n", " # As a result, this preprocessing differs from the paper.\n", " \n", - " with open(os.path.join(data_dir, fname)) as f:\n", + " with smart_open(os.path.join(data_dir, fname), 'rb') as f:\n", " for line in f:\n", " # lower case all words\n", " lowered = line.lower()\n", @@ -206,7 +207,7 @@ ], "source": [ "topics = [] # list of 100 topics\n", - "with open(topics_path) as f:\n", + "with smart_open(topics_path, 'rb') as f:\n", " topics = [line.split() for line in f if line]\n", "len(topics)" ] @@ -231,7 +232,7 @@ ], "source": [ "human_scores = []\n", - "with 
open(human_scores_path) as f:\n", + "with smart_open(human_scores_path, 'rb') as f:\n", " for line in f:\n", " human_scores.append(float(line.strip()))\n", "len(human_scores)" diff --git a/docs/notebooks/word2vec.ipynb b/docs/notebooks/word2vec.ipynb index a2e944420f..db9b9487b0 100644 --- a/docs/notebooks/word2vec.ipynb +++ b/docs/notebooks/word2vec.ipynb @@ -109,13 +109,14 @@ "metadata": {}, "outputs": [], "source": [ + "from smart_open import smart_open\n", "class MySentences(object):\n", " def __init__(self, dirname):\n", " self.dirname = dirname\n", " \n", " def __iter__(self):\n", " for fname in os.listdir(self.dirname):\n", - " for line in open(os.path.join(self.dirname, fname)):\n", + " for line in smart_open(os.path.join(self.dirname, fname), 'rb'):\n", " yield line.split()" ] },