From 353174665ee816b01c79f75ed950f96c1f8b14df Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 6 Aug 2021 11:23:00 -0500 Subject: [PATCH] Remove Project.index. (#593) * Make Project._index method private, refactor to remove unused arguments. * Remove xfailed crawler test (functionality no longer needed). * Delete indexing module. * Clean up references to index in docs. --- doc/api.rst | 1 - signac/__main__.py | 2 +- signac/contrib/indexing.py | 451 ------------------------------------- signac/contrib/project.py | 72 ++---- signac/contrib/utility.py | 41 ---- tests/test_indexing.py | 135 ----------- tests/test_project.py | 65 +----- 7 files changed, 25 insertions(+), 742 deletions(-) delete mode 100644 signac/contrib/indexing.py delete mode 100644 tests/test_indexing.py diff --git a/doc/api.rst b/doc/api.rst index 99a0f1292..a8e0fbb83 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -34,7 +34,6 @@ The Project Project.groupbydoc Project.import_from Project.id - Project.index Project.isfile Project.min_len_unique_id Project.num_jobs diff --git a/signac/__main__.py b/signac/__main__.py index 2c451e02e..6e746a840 100644 --- a/signac/__main__.py +++ b/signac/__main__.py @@ -174,7 +174,7 @@ def main_project(args): """Handle project subcommand.""" project = get_project() if args.index: - for doc in project.index(): + for doc in project._index(): print(json.dumps(doc)) return if args.workspace: diff --git a/signac/contrib/indexing.py b/signac/contrib/indexing.py deleted file mode 100644 index ee93aa9b7..000000000 --- a/signac/contrib/indexing.py +++ /dev/null @@ -1,451 +0,0 @@ -# Copyright (c) 2017 The Regents of the University of Michigan -# All rights reserved. -# This software is licensed under the BSD 3-Clause License. -"""Indexing features.""" - -import errno -import hashlib -import json -import logging -import math -import os -import re -import warnings - -from ..common import errors -from .hashing import calc_id -from .utility import walkdepth - -logger = logging.getLogger(__name__) - -KEY_PROJECT = "project" -KEY_FILENAME = "filename" -KEY_PATH = "root" -KEY_PAYLOAD = "format" - - -def _compute_file_md5(file): - """Calculate and return the md5 hash value for the file data.""" - m = hashlib.md5() - for chunk in iter(lambda: file.read(4096), b""): - m.update(chunk) - return m.hexdigest() - - -class _BaseCrawler: - """Crawl through ``root`` and index all files. - - The crawler creates an index on data, which can be exported to a database - for easier access. - """ - - tags = None - - def __init__(self, root): - """Initialize a _BaseCrawler instance. - - Parameters - ---------- - root : str - The path to the root directory to crawl through. - - """ - self.root = os.path.expanduser(root) - self.tags = set() if self.tags is None else set(self.tags) - - def docs_from_file(self, dirpath, fn): - """Implement this method to generate documents from files. - - Parameters - ---------- - dirpath : str - The path of the file, relative to ``root``. - fn : str - The filename. - - Yields - ------ - dict - Index documents. 
- - """ - raise NotImplementedError() - - def fetch(self, doc, mode="r"): - """Implement this generator method to associate data with a document.""" - raise errors.FetchError(f"Unable to fetch object for '{doc}'.") - - @classmethod - def _calculate_hash(cls, doc, dirpath, fn): - blob = json.dumps(doc, sort_keys=True) - m = hashlib.md5() - m.update(dirpath.encode("utf-8")) - m.update(fn.encode("utf-8")) - m.update(blob.encode("utf-8")) - return m.hexdigest() - - def crawl(self, depth=0): - """Crawl through the ``root`` directory. - - The crawler will inspect every file and directory up - until the specified ``depth`` and call the - :meth:`docs_from_file` method. - - Parameters - ---------- - depth : int - Maximum directory depth to crawl. A value of 0 specifies no limit. - - Yields - ------ - dict - Document. - - """ - logger.info(f"Crawling '{self.root}' (depth={depth})...") - for dirpath, dirnames, filenames in walkdepth(self.root, depth): - for fn in filenames: - for doc in self.docs_from_file(dirpath, fn): - logger.debug(f"doc from file: '{os.path.join(dirpath, fn)}'.") - doc.setdefault(KEY_PAYLOAD, None) - doc.setdefault("_id", self._calculate_hash(doc, dirpath, fn)) - yield doc - logger.info(f"Crawl of '{self.root}' done.") - - def process(self, doc, dirpath, fn): - """Implement this method for processing generated documents. - - The default implementation will return the unmodified ``doc``. - - Parameters - ---------- - dirpath : str - The path of the file, relative to `root`. - fn : str - The filename. - - Returns - ------- - dict - A document. - - """ - return doc - - -class _RegexFileCrawler(_BaseCrawler): - r"""Generate documents from filenames and associate each file with a data type. - - The :py:class:`_RegexFileCrawler` uses regular expressions to generate - data from files. This is a particular easy method to retrieve metadata - associated with files. Inherit from this class to configure a crawler - for your data structure. - - Let's assume we want to index text files, with a naming pattern, that - specifies a parameter ``a`` through the filename, e.g.: - - .. code-block:: python - - ~/my_project/a_0.txt - ~/my_project/a_1.txt - ... - - A valid regular expression to match this pattern would - be: ``.*\/a_(?P\d+)\.txt`` which may be defined for a crawler as such: - - .. code-block:: python - - MyCrawler(_RegexFileCrawler): - pass - - MyCrawler.define('.*\/a_(?P\d+)\.txt', 'TextFile') - - """ - - "Mapping of compiled regex objects and associated formats." - definitions = {} # type: ignore - - @classmethod - def define(cls, regex, format_=None): - """Define a format for a particular regular expression. - - Parameters - ---------- - regex : str - A regular expression used to match files of the specified format. - format_ : object - The format associated with all matching files. - - """ - if isinstance(regex, str): - regex = re.compile(regex) - definitions = dict(cls.definitions) - definitions[regex] = format_ - cls.definitions = definitions - - @classmethod - def compute_file_id(cls, doc, file): - """Compute the file id for a given doc and the associated file. - - The resulting id is assigned to ``doc["md5"]``. - - Parameters - ---------- - doc : dict - The index document. - file : file-like object - The associated file - - Returns - ------- - str - The file id. - - """ - file_id = doc["md5"] = _compute_file_md5(file) - return file_id - - def docs_from_file(self, dirpath, fn): - """Generate documents from filenames. 
- - This method implements the abstract - :py:meth:`~._BaseCrawler.docs_from_file` and yields index - documents associated with files. - - Notes - ----- - It is not recommended to reimplement this method to modify - documents generated from filenames. - See :py:meth:`~_RegexFileCrawler.process` instead. - - Parameters - ---------- - dirpath : str - The path of the file relative to root. - fn : str - The filename of the file. - - Yields - ------ - dict - Index document. - - """ - for regex, format_ in self.definitions.items(): - m = regex.match(os.path.join(dirpath, fn)) - if m: - doc = self.process(m.groupdict(), dirpath, fn) - doc[KEY_FILENAME] = os.path.relpath( - os.path.join(dirpath, fn), self.root - ) - doc[KEY_PATH] = os.path.abspath(self.root) - doc[KEY_PAYLOAD] = str(format_) - with open(os.path.join(dirpath, fn), "rb") as file: - doc["file_id"] = self.compute_file_id(doc, file) - yield doc - - def fetch(self, doc, mode="r"): - """Fetch the data associated with ``doc``. - - Parameters - ---------- - doc : dict - An index document. - mode : str - Mode used to open file object. - - Returns - ------- - file-like object - The file associated with the index document. - - """ - fn = doc.get(KEY_FILENAME) - if fn: - for regex, format_ in self.definitions.items(): - ffn = os.path.join(self.root, fn) - m = regex.match(ffn) - if m: - if isinstance(format_, str): - return open(ffn, mode=mode) - else: - for meth in ("read", "close"): - if not callable(getattr(format_, meth, None)): - msg = f"Format {format_} has no {meth}() method." - warnings.warn(msg) - return format_(open(ffn, mode=mode)) - else: - raise errors.FetchError( - f"Unable to match file path of doc '{doc}' to format definition." - ) - else: - raise errors.FetchError(f"Insufficient metadata in doc '{doc}'.") - - def process(self, doc, dirpath, fn): - """Post-process documents generated from filenames. - - Examples - -------- - .. code-block:: python - - MyCrawler(signac.indexing._RegexFileCrawler): - def process(self, doc, dirpath, fn): - doc['long_name_for_a'] = doc['a'] - return super(MyCrawler, self).process(doc, dirpath, fn) - - Parameters - ---------- - dirpath : str - The path of the file, relative to ``root``. - fn : str - The filename. - - Returns - ------- - dict - An index document. 
- - """ - result = {} - for key, value in doc.items(): - if value is None or isinstance(value, bool): - result[key] = value - continue - try: - value = float(value) - except Exception: - result[key] = value - else: - if not math.isnan(value) or math.isinf(value): - if float(value) == int(value): - result[key] = int(value) - else: - result[key] = float(value) - return super().process(result, dirpath, fn) - - def crawl(self, depth=0): - if self.definitions: - yield from super().crawl(depth=depth) - else: - return - - -def _index_signac_project_workspace( - root, - include_job_document=True, - fn_statepoint="signac_statepoint.json", - fn_job_document="signac_job_document.json", - statepoint_index="statepoint", - signac_id_alias="_id", - encoding="utf-8", - statepoint_dict=None, -): - """Yield standard index documents for a signac project workspace.""" - logger.debug(f"Indexing workspace '{root}'...") - m = re.compile(r"[a-f0-9]{32}") - try: - job_ids = [jid for jid in os.listdir(root) if m.match(jid)] - except OSError as error: - if error.errno == errno.ENOENT: - return - else: - raise - for i, job_id in enumerate(job_ids): - if not m.match(job_id): - continue - doc = {"signac_id": job_id, KEY_PATH: root} - if signac_id_alias: - doc[signac_id_alias] = job_id - fn_sp = os.path.join(root, job_id, fn_statepoint) - with open(fn_sp, "rb") as file: - sp = json.loads(file.read().decode(encoding)) - if statepoint_dict is not None: - statepoint_dict[job_id] = sp - if statepoint_index: - doc[statepoint_index] = sp - else: - doc.update(sp) - if include_job_document: - fn_doc = os.path.join(root, job_id, fn_job_document) - try: - with open(fn_doc, "rb") as file: - doc.update(json.loads(file.read().decode(encoding))) - except OSError as error: - if error.errno != errno.ENOENT: - raise - yield doc - if job_ids: - logger.debug(f"Indexed workspace '{root}', {i + 1} entries.") - - -class _SignacProjectCrawler(_RegexFileCrawler): - """Index a signac project workspace. - - Without any file format definitions, this crawler yields index documents - for each job, including the state point and the job document. - - See Also - -------- - :py:class:`~._RegexFileCrawler` - - Parameters - ---------- - root : str - The path to the project's root directory. 
- - """ - - encoding = "utf-8" - statepoint_index = "statepoint" - fn_statepoint = "signac_statepoint.json" - fn_job_document = "signac_job_document.json" - signac_id_alias = "_id" - - def __init__(self, root): - from .project import get_project - - root = get_project(root=root).workspace() - self._statepoints = {} - return super().__init__(root=root) - - def _get_job_id(self, dirpath): - return os.path.relpath(dirpath, self.root).split("/")[0] - - def _read_statepoint(self, job_id): - fn_sp = os.path.join(self.root, job_id, self.fn_statepoint) - with open(fn_sp, "rb") as file: - return json.loads(file.read().decode(self.encoding)) - - def _get_statepoint(self, job_id): - sp = self._statepoints.setdefault(job_id, self._read_statepoint(job_id)) - assert calc_id(sp) == job_id - return sp - - def get_statepoint(self, dirpath): - job_id = self._get_job_id(dirpath) - return job_id, self._get_statepoint(self, job_id) - - def process(self, doc, dirpath, fn): - if dirpath is not None: - job_id = self._get_job_id(dirpath) - statepoint = self._get_statepoint(job_id) - doc["signac_id"] = job_id - if self.statepoint_index: - doc[self.statepoint_index] = statepoint - else: - doc.update(statepoint) - return super().process(doc, dirpath, fn) - - def crawl(self, depth=0): - for doc in _index_signac_project_workspace( - root=self.root, - fn_statepoint=self.fn_statepoint, - fn_job_document=self.fn_job_document, - statepoint_index=self.statepoint_index, - signac_id_alias=self.signac_id_alias, - encoding=self.encoding, - statepoint_dict=self._statepoints, - ): - yield self.process(doc, None, None) - for doc in super().crawl(depth=depth): - yield doc diff --git a/signac/contrib/project.py b/signac/contrib/project.py index e926d25d1..9a92430bd 100644 --- a/signac/contrib/project.py +++ b/signac/contrib/project.py @@ -38,7 +38,6 @@ ) from .filterparse import _add_prefix, _root_keys, parse_filter from .hashing import calc_id -from .indexing import _SignacProjectCrawler from .job import Job from .schema import ProjectSchema from .utility import _mkdir_p, _nested_dicts_to_dotted_keys, _split_and_print_progress @@ -816,9 +815,7 @@ def detect_schema(self, exclude_const=False, subset=None, index=None): from .schema import _build_job_statepoint_index if index is None: - index = self.index(include_job_document=False) - else: - warnings.warn(INDEX_DEPRECATION_WARNING, FutureWarning) + index = self._index(include_job_document=False) if subset is not None: subset = {str(s) for s in subset} index = [doc for doc in index if doc["_id"] in subset] @@ -884,9 +881,9 @@ def _find_job_ids(self, filter=None, doc_filter=None, index=None): if doc_filter: warnings.warn(DOC_FILTER_WARNING, FutureWarning) filter.update(parse_filter(_add_prefix("doc.", doc_filter))) - index = self.index(include_job_document=True) + index = self._index(include_job_document=True) elif "doc" in _root_keys(filter): - index = self.index(include_job_document=True) + index = self._index(include_job_document=True) else: index = self._sp_index() else: @@ -1883,15 +1880,7 @@ def _read_cache(self): logger.debug(f"Read cache in {delta:.3f} seconds.") return cache - @deprecated( - deprecated_in="1.8", - removed_in="2.0", - current_version=__version__, - details="Indexing is deprecated.", - ) - def index( - self, formats=None, depth=0, skip_errors=False, include_job_document=True - ): + def _index(self, *, include_job_document=True): r"""Generate an index of the project's workspace. 
This generator function indexes every file in the project's @@ -1901,11 +1890,6 @@ def index( See :ref:`signac project -i ` for the command line equivalent. - .. code-block:: python - - for doc in project.index({r'.*\.txt', 'TextFile'}): - print(doc) - Parameters ---------- formats : str, dict @@ -1928,42 +1912,28 @@ def index( Index document. """ - if formats is None: - root = self.workspace() + root = self.workspace() - def _full_doc(doc): - """Add `signac_id` and `root` to the index document. + def _full_doc(doc): + """Add `signac_id` and `root` to the index document. - Parameters - ---------- - doc : dict - Index document. + Parameters + ---------- + doc : dict + Index document. - Returns - ------- - dict - Modified index document. + Returns + ------- + dict + Modified index document. - """ - doc["signac_id"] = doc["_id"] - doc["root"] = root - return doc + """ + doc["signac_id"] = doc["_id"] + doc["root"] = root + return doc - docs = self._build_index(include_job_document=include_job_document) - docs = map(_full_doc, docs) - else: - if isinstance(formats, str): - formats = {formats: "File"} - - class Crawler(_SignacProjectCrawler): - pass - - for pattern, fmt in formats.items(): - Crawler.define(pattern, fmt) - crawler = Crawler(self.root_directory()) - docs = crawler.crawl(depth=depth) - if skip_errors: - docs = _skip_errors(docs, logger.critical) + docs = self._build_index(include_job_document=include_job_document) + docs = map(_full_doc, docs) for doc in docs: yield doc diff --git a/signac/contrib/utility.py b/signac/contrib/utility.py index 5753e9681..216377a3a 100644 --- a/signac/contrib/utility.py +++ b/signac/contrib/utility.py @@ -88,47 +88,6 @@ def add_verbosity_argument(parser, default=0): ) -def walkdepth(path, depth=0): - """Transverse the directory starting from path. - - Parameters - ---------- - path :str - Directory passed to walk (transverse from). - depth : int - (Default value = 0) - - Yields - ------ - str - When depth==0. - tuple - When depth>0. - - Raises - ------ - ValueError - When the value of depth is negative. - OSError - When path is not name of a directory. - - """ - if depth == 0: - yield from os.walk(path) - elif depth > 0: - path = path.rstrip(os.path.sep) - if not os.path.isdir(path): - raise OSError(f"Not a directory: '{path}'.") - num_sep = path.count(os.path.sep) - for root, dirs, files in os.walk(path): - yield root, dirs, files - num_sep_this = root.count(os.path.sep) - if num_sep + depth <= num_sep_this: - del dirs[:] - else: - raise ValueError("The value of depth must be non-negative.") - - def _mkdir_p(path): """Make a new directory, or do nothing if the directory already exists. diff --git a/tests/test_indexing.py b/tests/test_indexing.py deleted file mode 100644 index c8019c5f0..000000000 --- a/tests/test_indexing.py +++ /dev/null @@ -1,135 +0,0 @@ -# Copyright (c) 2017 The Regents of the University of Michigan -# All rights reserved. -# This software is licensed under the BSD 3-Clause License. 
-import json -import os -import re -from tempfile import TemporaryDirectory -from unittest.mock import Mock - -import pytest - -from signac import Collection -from signac.contrib import indexing -from signac.errors import FetchError - - -class TestFormat: - def read(self): - assert 0 - - def close(self): - assert 0 - - -class TestIndexingBase: - @pytest.fixture(autouse=True) - def setUp(self, request): - self._tmp_dir = TemporaryDirectory(prefix="signac_") - request.addfinalizer(self._tmp_dir.cleanup) - - def setup_project(self): - def fn(name): - return os.path.join(self._tmp_dir.name, name) - - with open(fn("a_0.txt"), "w") as file: - file.write('{"a": 0}') - with open(fn("a_1.txt"), "w") as file: - file.write('{"a": 1}') - with open(fn("a_0.json"), "w") as file: - json.dump(dict(a=0), file) - with open(fn("a_1.json"), "w") as file: - json.dump(dict(a=1), file) - - def get_index_collection(self): - c = Collection() - return Mock(spec=c, wraps=c) - - def test_base_crawler(self): - crawler = indexing._BaseCrawler(root=self._tmp_dir.name) - assert len(list(crawler.crawl())) == 0 - doc = dict(a=0) - with pytest.raises(FetchError): - assert crawler.fetch(doc) is None - assert doc == crawler.process(doc, None, None) - with pytest.raises(NotImplementedError): - for doc in crawler.docs_from_file(None, None): - pass - - def test_regex_file_crawler_pre_compiled(self): - self.setup_project() - - class Crawler(indexing._RegexFileCrawler): - pass - - regex = re.compile(r".*a_(?P\d)\.txt") - Crawler.define(regex, TestFormat) - crawler = Crawler(root=self._tmp_dir.name) - no_find = True - for doc in crawler.crawl(): - no_find = False - ffn = os.path.join(doc["root"], doc["filename"]) - m = regex.match(ffn) - assert m is not None - assert os.path.isfile(ffn) - with open(ffn) as file: - doc2 = json.load(file) - assert doc2["a"] == doc["a"] - assert not no_find - - def test_regex_file_crawler(self): - self.setup_project() - - class Crawler(indexing._RegexFileCrawler): - pass - - # First test without pattern - crawler = Crawler(root=self._tmp_dir.name) - assert len(list(crawler.crawl())) == 0 - - # Now with pattern(s) - pattern = r".*a_(?P\d)\.txt" - regex = re.compile(pattern) - Crawler.define(pattern, TestFormat) - Crawler.define("negativematch", "negativeformat") - crawler = Crawler(root=self._tmp_dir.name) - no_find = True - for doc in crawler.crawl(): - no_find = False - ffn = os.path.join(doc["root"], doc["filename"]) - m = regex.match(ffn) - assert m is not None - assert os.path.isfile(ffn) - with open(ffn) as file: - doc2 = json.load(file) - assert doc2["a"] == doc["a"] - assert not no_find - with pytest.raises(FetchError): - crawler.fetch(dict()) - with pytest.raises(FetchError): - crawler.fetch({"filename": "shouldnotmatch"}) - - def test_regex_file_crawler_inheritance(self): - self.setup_project() - - class CrawlerA(indexing._RegexFileCrawler): - pass - - class CrawlerB(indexing._RegexFileCrawler): - pass - - CrawlerA.define("a", TestFormat) - CrawlerB.define("b", TestFormat) - assert len(CrawlerA.definitions) == 1 - assert len(CrawlerB.definitions) == 1 - - class CrawlerC(CrawlerA): - pass - - assert len(CrawlerA.definitions) == 1 - assert len(CrawlerC.definitions) == 1 - assert len(CrawlerB.definitions) == 1 - CrawlerC.define("c", TestFormat) - assert len(CrawlerA.definitions) == 1 - assert len(CrawlerB.definitions) == 1 - assert len(CrawlerC.definitions) == 2 diff --git a/tests/test_project.py b/tests/test_project.py index c5cb70d12..5c92543e9 100644 --- a/tests/test_project.py +++ 
b/tests/test_project.py @@ -630,77 +630,18 @@ def test_repair_corrupted_workspace(self): logging.disable(logging.NOTSET) def test_index(self): - docs = list(self.project.index(include_job_document=True)) + docs = list(self.project._index(include_job_document=True)) assert len(docs) == 0 - docs = list(self.project.index(include_job_document=False)) + docs = list(self.project._index(include_job_document=False)) assert len(docs) == 0 statepoints = [{"a": i} for i in range(5)] for sp in statepoints: self.project.open_job(sp).document["test"] = True job_ids = {job.id for job in self.project.find_jobs()} - docs = list(self.project.index()) + docs = list(self.project._index()) job_ids_cmp = {doc["_id"] for doc in docs} assert job_ids == job_ids_cmp assert len(docs) == len(statepoints) - for sp in statepoints: - with self.project.open_job(sp): - with open("test.txt", "w"): - pass - docs = list( - self.project.index( - {".*" + re.escape(os.path.sep) + r"test\.txt": "TextFile"} - ) - ) - assert len(docs) == 2 * len(statepoints) - assert len({doc["_id"] for doc in docs}) == len(docs) - - # Index schema is changed - @pytest.mark.xfail() - def test_signac_project_crawler(self): - statepoints = [{"a": i} for i in range(5)] - for sp in statepoints: - self.project.open_job(sp).document["test"] = True - job_ids = {job.id for job in self.project.find_jobs()} - index = {} - for doc in self.project.index(): - index[doc["_id"]] = doc - assert len(index) == len(job_ids) - assert set(index.keys()) == set(job_ids) - crawler = signac.contrib._SignacProjectCrawler(self.project.root_directory()) - index2 = {} - for doc in crawler.crawl(): - index2[doc["_id"]] = doc - for _id, _id2 in zip(index, index2): - assert _id == _id2 - assert index[_id] == index2[_id] - assert index == index2 - for job in self.project.find_jobs(): - with open(job.fn("test.txt"), "w") as file: - file.write("test\n") - formats = {r".*" + re.escape(os.path.sep) + r"test\.txt": "TextFile"} - index = {} - for doc in self.project.index(formats): - index[doc["_id"]] = doc - assert len(index) == 2 * len(job_ids) - - class Crawler(signac.contrib._SignacProjectCrawler): - called = False - - def process(self_, doc, dirpath, fn): - Crawler.called = True - doc = super().process(doc=doc, dirpath=dirpath, fn=fn) - if "format" in doc and doc["format"] is None: - assert doc["_id"] == doc["signac_id"] - return doc - - for p, fmt in formats.items(): - with pytest.deprecated_call(): - Crawler.define(p, fmt) - index2 = {} - for doc in Crawler(root=self.project.root_directory()).crawl(): - index2[doc["_id"]] = doc - assert index == index2 - assert Crawler.called def test_custom_project(self): class CustomProject(signac.Project):
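For reference, a minimal sketch of how the now-private generator is consumed after this change, mirroring the `signac project -i` code path in signac/__main__.py above. It assumes an existing signac project in the current working directory; apart from the names taken from the patch (`_index`, `include_job_document`), nothing here is prescriptive.

    import json

    import signac

    project = signac.get_project()
    # Each yielded document carries "_id", "signac_id", and "root"
    # (added by the _full_doc helper in project.py above), plus the job
    # document contents when include_job_document=True.
    for doc in project._index(include_job_document=True):
        print(json.dumps(doc))

The same documents are what `signac project -i` now prints, since the CLI handler calls `project._index()` directly rather than the removed public `Project.index()` method.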