-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
* Fix typo * Make `save_corpus` private * Annotate `bleicorpus.py` * Make __save_corpus weakly private * Fix _save_corpus in tests * Fix _save_corpus[2] * Document bleicorpus in Numpy style * Document indexedcorpus * Annotate csvcorpus * Add "Yields" section * Make `_save_corpus` public * Annotate bleicorpus * Fix indentation in bleicorpus * `_save_corpus` -> `save_corpus` * Annotate bleicorpus * Convert dictionary docs to numpy style * Convert hashdictionary docs to numpy style * Convert indexedcorpus docs to numpy style * Convert lowcorpus docs to numpy style * Convert malletcorpus docs to numpy style * Convert mmcorpus docs to numpy style * Convert sharded_corpus docs to numpy style * Convert svmlightcorpus docs to numpy style * Convert textcorpus docs to numpy style * Convert ucicorpus docs to numpy style * Convert wikicorpus docs to numpy style * Add sphinx tweaks * Remove trailing whitespaces * Annotate wikicorpus * SVMLight Corpus annotated * Fix TODO * Fix grammar mistake * Undo changes to dictionary * Undo changes to hashdictionary * Document indexedcorpus * Document indexedcorpus[2] Fix indentation * Remove redundant files * Add more dots. :) * Fix monospace * remove useless method * fix bleicorpus * fix csvcorpus * fix indexedcorpus * fix svmlightcorpus * fix wikicorpus[1] * fix wikicorpus[2] * fix wikicorpus[3] * fix review comments
- Loading branch information
1 parent
74dae4d
commit c5f487d
Showing
5 changed files
with
553 additions
and
202 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,10 +4,7 @@ | |
# Copyright (C) 2013 Zygmunt Zając <[email protected]> | ||
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html | ||
|
||
""" | ||
Corpus in CSV format. | ||
""" | ||
"""Corpus in CSV format.""" | ||
|
||
|
||
from __future__ import with_statement | ||
|
@@ -18,22 +15,28 @@ | |
|
||
from gensim import interfaces, utils | ||
|
||
logger = logging.getLogger('gensim.corpora.csvcorpus') | ||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class CsvCorpus(interfaces.CorpusABC): | ||
""" | ||
Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically | ||
based on the file content. | ||
"""Corpus in CSV format. | ||
Notes | ||
----- | ||
The CSV delimiter, headers etc. are guessed automatically based on the file content. | ||
All row values are expected to be ints/floats. | ||
""" | ||
|
||
def __init__(self, fname, labels): | ||
""" | ||
Initialize the corpus from a file. | ||
`labels` = are class labels present in the input file? => skip the first column | ||
Parameters | ||
---------- | ||
fname : str | ||
Path to corpus. | ||
labels : bool | ||
If True - ignore first column (class labels). | ||
""" | ||
logger.info("loading corpus from %s", fname) | ||
|
@@ -48,8 +51,12 @@ def __init__(self, fname, labels): | |
logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers) | ||
|
||
def __iter__(self): | ||
""" | ||
Iterate over the corpus, returning one sparse vector at a time. | ||
"""Iterate over the corpus, returning one BoW vector at a time. | ||
Yields | ||
------ | ||
list of (int, float) | ||
Document in BoW format. | ||
""" | ||
reader = csv.reader(utils.smart_open(self.fname), self.dialect) | ||
|
Oops, something went wrong.