Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Trial APIs --local should respect .detignore. [MLG-1352] #8683

Merged
merged 4 commits into from
Jan 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 9 additions & 118 deletions harness/determined/common/context.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,13 @@
import base64
import collections
import os
import pathlib
import tarfile
from typing import Any, Dict, Iterable, List, Optional

from determined.common import constants, util
from determined.common import constants, detignore, util, v1file_utils
from determined.common.api import bindings

LegacyContext = List[Dict[str, Any]]


def v1File_size(f: bindings.v1File) -> int:
if f.content:
# The content is already base64-encoded, and we want the real length.
return len(f.content) // 4 * 3
return 0


def v1File_to_dict(f: bindings.v1File) -> Dict[str, Any]:
d = {
"path": f.path,
"type": f.type,
"uid": f.uid,
"gid": f.gid,
# Echo API expects int-type int64 value
"mtime": int(f.mtime),
"mode": f.mode,
}
if f.type in (ord(tarfile.REGTYPE), ord(tarfile.DIRTYPE)):
d["content"] = f.content
return d


def v1File_from_local_file(archive_path: str, path: pathlib.Path) -> bindings.v1File:
with path.open("rb") as f:
content = base64.b64encode(f.read()).decode("utf8")
st = path.stat()
return bindings.v1File(
path=archive_path,
type=ord(tarfile.REGTYPE),
content=content,
# Protobuf expects string-encoded int64
mtime=str(int(st.st_mtime)),
mode=st.st_mode,
uid=0,
gid=0,
)


def v1File_from_local_dir(archive_path: str, path: pathlib.Path) -> bindings.v1File:
st = path.stat()
return bindings.v1File(
path=archive_path,
type=ord(tarfile.DIRTYPE),
content="",
# Protobuf expects string-encoded int64
mtime=str(int(st.st_mtime)),
mode=st.st_mode,
uid=0,
gid=0,
)


class _Builder:
def __init__(self, limit: int) -> None:
self.limit = limit
Expand All @@ -73,7 +18,7 @@ def __init__(self, limit: int) -> None:

def add_v1File(self, f: bindings.v1File) -> None:
self.items.append(f)
self.size += v1File_size(f)
self.size += v1file_utils.v1File_size(f)
if self.size > self.limit:
raise ValueError(
"The total size of context directory and included files and directories exceeds "
Expand All @@ -97,70 +42,16 @@ def add(self, root_path: pathlib.Path, entry_prefix: pathlib.Path) -> None:
raise ValueError(f"Path '{root_path}' doesn't exist")

if root_path.is_file():
self.add_v1File(v1File_from_local_file(root_path.name, root_path))
self.add_v1File(v1file_utils.v1File_from_local_file(root_path.name, root_path))
return

if str(entry_prefix) != ".":
# For non-context directories, include the root directory.
self.add_v1File(v1File_from_local_dir(str(entry_prefix), root_path))

ignore = list(constants.DEFAULT_DETIGNORE)
ignore_path = root_path.joinpath(".detignore")
if ignore_path.is_file():
with ignore_path.open("r") as detignore_file:
ignore.extend(detignore_file)

import pathspec

ignore_spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ignore)

# We could use pathlib.Path.rglob for scanning the directory;
# however, the Python documentation claims a warning that rglob may be
# inefficient on large directory trees, so we use the older os.walk().
for parent, dirs, files in os.walk(str(root_path)):
keep_dirs = []
for directory in dirs:
dir_path = pathlib.Path(parent).joinpath(directory)
dir_rel_path = dir_path.relative_to(root_path)

# If the directory matches any path specified in .detignore, then ignore it.
if ignore_spec.match_file(str(dir_rel_path)):
continue
if ignore_spec.match_file(str(dir_rel_path) + "/"):
continue
keep_dirs.append(directory)
# Determined only supports POSIX-style file paths. Use as_posix() in case this code
# is executed in a non-POSIX environment.
entry_path = (entry_prefix / dir_rel_path).as_posix()

self.add_v1File(v1File_from_local_dir(entry_path, dir_path))
# We can modify dirs in-place so that we do not recurse into ignored directories
# See https://docs.python.org/3/library/os.html#os.walk
dirs[:] = keep_dirs

for file in files:
file_path = pathlib.Path(parent).joinpath(file)
file_rel_path = file_path.relative_to(root_path)

# If the file is the .detignore file or matches one of the
# paths specified in .detignore, then ignore it.
if file_rel_path.name == ".detignore":
continue
if ignore_spec.match_file(str(file_rel_path)):
continue

# Determined only supports POSIX-style file paths. Use as_posix() in case this code
# is executed in a non-POSIX environment.
entry_path = (entry_prefix / file_rel_path).as_posix()

try:
entry = v1File_from_local_file(entry_path, file_path)
except OSError:
print(f"Error reading '{entry_path}', skipping this file.")
continue

self.add_v1File(entry)
self.update_msg()
self.add_v1File(v1file_utils.v1File_from_local_dir(str(entry_prefix), root_path))

for file in detignore.os_walk_to_v1Files(root_path, entry_prefix):
self.add_v1File(file)
self.update_msg()

def get_items(self) -> List[bindings.v1File]:
print()
Expand Down Expand Up @@ -217,4 +108,4 @@ def read_legacy_context(
includes: Iterable[pathlib.Path] = (),
limit: int = constants.MAX_CONTEXT_SIZE,
) -> LegacyContext:
return [v1File_to_dict(f) for f in read_v1_context(context_root, includes, limit)]
return [v1file_utils.v1File_to_dict(f) for f in read_v1_context(context_root, includes, limit)]
96 changes: 96 additions & 0 deletions harness/determined/common/detignore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import os
import pathlib
from typing import TYPE_CHECKING, Callable, Iterable, List, Set

if TYPE_CHECKING:
import pathspec

from determined.common import constants, v1file_utils
from determined.common.api import bindings


def _build_detignore_pathspec(root_path: pathlib.Path) -> "pathspec.PathSpec":
ignore = list(constants.DEFAULT_DETIGNORE)
ignore_path = root_path / ".detignore"
if ignore_path.is_file():
with ignore_path.open("r") as detignore_file:
ignore.extend(detignore_file)

# Lazy import to speed up load time.
# See https://github.com/determined-ai/determined/pull/6590 for details.
import pathspec

return pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ignore)


def make_shutil_ignore(root_path: pathlib.Path) -> Callable:
ignore_spec = _build_detignore_pathspec(root_path)

def _ignore(path: str, names: List[str]) -> Set[str]:
ignored_names = set() # type: Set[str]
for name in names:
if name == ".detignore":
ignored_names.add(name)
continue

file_path = pathlib.Path(path) / name
file_rel_path = file_path.relative_to(root_path)

if (
file_path.is_dir() and ignore_spec.match_file(str(file_rel_path) + "/")
) or ignore_spec.match_file(str(file_rel_path)):
Comment on lines +36 to +41
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: why bounce back and forth between pathlib.Path and str? Why not just use os.path.join() and os.path.relpath()?

non-blocking; feel free to ignore my inner C programmer which cringes at unnecessary malloc() calls.

ignored_names.add(name)

return ignored_names

return _ignore


def os_walk_to_v1Files(
root_path: pathlib.Path, entry_prefix: pathlib.Path
) -> Iterable[bindings.v1File]:
ignore_spec = _build_detignore_pathspec(root_path)

for parent, dirs, files in os.walk(str(root_path)):
keep_dirs = []
for directory in dirs:
dir_path = pathlib.Path(parent).joinpath(directory)
dir_rel_path = dir_path.relative_to(root_path)

# If the directory matches any path specified in .detignore, then ignore it.
if ignore_spec.match_file(str(dir_rel_path)):
continue
if ignore_spec.match_file(str(dir_rel_path) + "/"):
continue
keep_dirs.append(directory)
# Determined only supports POSIX-style file paths. Use as_posix() in case this code
# is executed in a non-POSIX environment.
entry_path = (entry_prefix / dir_rel_path).as_posix()

yield v1file_utils.v1File_from_local_dir(entry_path, dir_path)
# We can modify dirs in-place so that we do not recurse into ignored directories
# See https://docs.python.org/3/library/os.html#os.walk
dirs[:] = keep_dirs

for file in files:
file_path = pathlib.Path(parent).joinpath(file)
file_rel_path = file_path.relative_to(root_path)

# If the file is the .detignore file or matches one of the
# paths specified in .detignore, then ignore it.
if file_rel_path.name == ".detignore":
continue
if ignore_spec.match_file(str(file_rel_path)):
continue

# Determined only supports POSIX-style file paths. Use as_posix() in case this code
# is executed in a non-POSIX environment.
entry_path = (entry_prefix / file_rel_path).as_posix()

try:
entry = v1file_utils.v1File_from_local_file(entry_path, file_path)
except OSError:
print(f"Error reading '{entry_path}', skipping this file.")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you set file=sys.stderr on this print statement?

continue

yield entry
58 changes: 58 additions & 0 deletions harness/determined/common/v1file_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import base64
import pathlib
import tarfile
from typing import Any, Dict

from determined.common.api import bindings


def v1File_size(f: bindings.v1File) -> int:
if f.content:
# The content is already base64-encoded, and we want the real length.
return len(f.content) // 4 * 3
return 0


def v1File_to_dict(f: bindings.v1File) -> Dict[str, Any]:
d = {
"path": f.path,
"type": f.type,
"uid": f.uid,
"gid": f.gid,
# Echo API expects int-type int64 value
"mtime": int(f.mtime),
"mode": f.mode,
}
if f.type in (ord(tarfile.REGTYPE), ord(tarfile.DIRTYPE)):
d["content"] = f.content
return d


def v1File_from_local_file(archive_path: str, path: pathlib.Path) -> bindings.v1File:
with path.open("rb") as f:
content = base64.b64encode(f.read()).decode("utf8")
st = path.stat()
return bindings.v1File(
path=archive_path,
type=ord(tarfile.REGTYPE),
content=content,
# Protobuf expects string-encoded int64
mtime=str(int(st.st_mtime)),
mode=st.st_mode,
uid=0,
gid=0,
)


def v1File_from_local_dir(archive_path: str, path: pathlib.Path) -> bindings.v1File:
st = path.stat()
return bindings.v1File(
path=archive_path,
type=ord(tarfile.DIRTYPE),
content="",
# Protobuf expects string-encoded int64
mtime=str(int(st.st_mtime)),
mode=st.st_mode,
uid=0,
gid=0,
)
9 changes: 7 additions & 2 deletions harness/determined/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@

import determined as det
from determined import constants
from determined.common import check, util
from determined.common import check, detignore, util

logger = logging.getLogger("determined")

Expand Down Expand Up @@ -259,11 +259,16 @@ def write_user_code(path: pathlib.Path, on_cluster: bool) -> None:
# since it is rather common that users mount large, non-model files into their working directory
# (like data or their entire HOME directory), when we are training on-cluster we use a
# specially-prepared clean copy of the model rather than the working directory.
#
# When running locally, we filter out the files using detignore.
if on_cluster:
model_dir = constants.MANAGED_TRAINING_MODEL_COPY
ignore_func = None
else:
model_dir = "."
shutil.copytree(model_dir, code_path, ignore=shutil.ignore_patterns("__pycache__"))
ignore_func = detignore.make_shutil_ignore(pathlib.Path(model_dir))

shutil.copytree(model_dir, code_path, ignore=ignore_func)
os.chmod(code_path, 0o755)


Expand Down
14 changes: 13 additions & 1 deletion harness/tests/filetree.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import contextlib
import os
import shutil
import tempfile
from pathlib import Path
from types import TracebackType
from typing import ContextManager, Dict, Optional, Type, Union
from typing import ContextManager, Dict, Iterator, Optional, Type, Union


class FileTree(ContextManager[Path]):
Expand Down Expand Up @@ -39,3 +41,13 @@ def __exit__(
traceback: Optional[TracebackType],
) -> None:
shutil.rmtree(str(self.dir), ignore_errors=True)


@contextlib.contextmanager
def chdir(path: Path) -> Iterator:
old = os.getcwd()
os.chdir(str(path))
try:
yield
finally:
os.chdir(old)
Loading