Skip to content

Commit

Permalink
Merge pull request #501 from luca-medeiros-reef/fix/exclude
Browse files Browse the repository at this point in the history
Perform exclusions before accessing file/dir
  • Loading branch information
mjurbanski-reef authored Jun 15, 2024
2 parents 19c6cbb + 4ea9a57 commit 1b3189e
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 92 deletions.
90 changes: 32 additions & 58 deletions b2sdk/_internal/scan/folder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from pathlib import Path
from typing import Iterator

from ..utils import fix_windows_path_limit, get_file_mtime, is_file_readable, validate_b2_file_name
from ..utils import fix_windows_path_limit, get_file_mtime, validate_b2_file_name
from .exception import (
EmptyDirectory,
EnvironmentEncodingError,
Expand Down Expand Up @@ -219,90 +219,64 @@ def _walk_relative_paths(
# a0.txt
#
# This is because in Unicode '.' comes before '/', which comes before '0'.
names = [] # list of (name, local_path, relative_file_path)

visited_symlinks = visited_symlinks or set()

if local_dir.is_symlink():
real_path = local_dir.resolve()
inode_number = real_path.stat().st_ino

visited_symlinks_count = len(visited_symlinks)

# Add symlink to visited_symlinks to prevent infinite symlink loops
visited_symlinks.add(inode_number)

# Check if set size has changed, if not, symlink has already been visited
if len(visited_symlinks) == visited_symlinks_count:
# Infinite symlink loop detected, report warning and skip symlink
if reporter is not None:
inode_number = local_dir.resolve().stat().st_ino
if inode_number in visited_symlinks:
if reporter:
reporter.circular_symlink_skipped(str(local_dir))
return

return # Skip if symlink already visited
visited_symlinks.add(inode_number)

for local_path in local_dir.iterdir():
for local_path in sorted(local_dir.iterdir()):
name = local_path.name
relative_file_path = join_b2_path(relative_dir_path, name)

if policies_manager.exclude_all_symlinks and local_path.is_symlink():
if reporter is not None:
reporter.symlink_skipped(str(local_path))
continue
try:
validate_b2_file_name(name)
except ValueError as e:
if reporter is not None:
reporter.invalid_name(str(local_path), str(e))
continue

# Skip broken symlinks or other inaccessible files
if not is_file_readable(str(local_path), reporter):
continue

if policies_manager.exclude_all_symlinks and local_path.is_symlink():
if reporter is not None:
reporter.symlink_skipped(str(local_path))
continue

if local_path.is_dir():
name += '/'
if policies_manager.should_exclude_local_directory(str(relative_file_path)):
continue

# remove the leading './' from the relative path to ensure backward compatibility
relative_file_path_str = str(relative_file_path)
if relative_file_path_str.startswith("./"):
relative_file_path_str = relative_file_path_str[2:]
names.append((name, local_path, relative_file_path_str))

# Yield all of the answers.
#
# Sorting the list of triples puts them in the right order because 'name',
# the sort key, is the first thing in the triple.
for (name, local_path, relative_file_path) in sorted(names):
if name.endswith('/'):
continue # Skip excluded directories
# Recurse into directories
yield from self._walk_relative_paths(
local_path,
relative_file_path,
reporter,
policies_manager,
visited_symlinks,
local_path, relative_file_path, reporter, policies_manager, visited_symlinks
)
else:
# Check that the file still exists and is accessible, since it can take a long time
# to iterate through large folders
if is_file_readable(str(local_path), reporter):
if policies_manager.should_exclude_relative_path(relative_file_path):
continue # Skip excluded files
try:
file_mod_time = get_file_mtime(str(local_path))
file_size = local_path.stat().st_size
except OSError:
if reporter is not None:
reporter.local_access_error(str(local_path))
continue

local_scan_path = LocalPath(
absolute_path=self.make_full_path(str(relative_file_path)),
relative_path=str(relative_file_path),
mod_time=file_mod_time,
size=file_size,
)
local_scan_path = LocalPath(
absolute_path=self.make_full_path(str(relative_file_path)),
relative_path=str(relative_file_path),
mod_time=file_mod_time,
size=file_size
)
if policies_manager.should_exclude_local_path(local_scan_path):
continue # Skip excluded files

if policies_manager.should_exclude_local_path(local_scan_path):
if not os.access(local_path, os.R_OK):
if reporter is not None:
reporter.local_permission_error(str(local_path))
continue

yield local_scan_path
yield local_scan_path

@classmethod
def _handle_non_unicode_file_name(cls, name):
Expand Down
6 changes: 3 additions & 3 deletions b2sdk/_internal/scan/policies.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def __init__(
exclude_uploaded_before, exclude_uploaded_after
)

def _should_exclude_relative_path(self, relative_path: str):
def should_exclude_relative_path(self, relative_path: str):
if self._include_file_set.matches(relative_path):
return False
return self._exclude_file_set.matches(relative_path)
Expand All @@ -197,7 +197,7 @@ def should_exclude_local_path(self, local_path: LocalPath):
"""
if local_path.mod_time not in self._include_mod_time_range:
return True
return self._should_exclude_relative_path(local_path.relative_path)
return self.should_exclude_relative_path(local_path.relative_path)

def should_exclude_b2_file_version(self, file_version: FileVersion, relative_path: str):
"""
Expand All @@ -209,7 +209,7 @@ def should_exclude_b2_file_version(self, file_version: FileVersion, relative_pat
return True
if file_version.mod_time_millis not in self._include_mod_time_range:
return True
return self._should_exclude_relative_path(relative_path)
return self.should_exclude_relative_path(relative_path)

def should_exclude_b2_directory(self, dir_path: str):
"""
Expand Down
20 changes: 0 additions & 20 deletions b2sdk/_internal/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,26 +252,6 @@ def validate_b2_file_name(name):
raise ValueError("file names segments (between '/') can be at most 250 utf-8 bytes")


def is_file_readable(local_path, reporter=None):
"""
Check if the local file has read permissions.
:param local_path: a file path
:type local_path: str
:param reporter: reporter object to put errors on
:rtype: bool
"""
if not os.path.exists(local_path):
if reporter is not None:
reporter.local_access_error(local_path)
return False
elif not os.access(local_path, os.R_OK):
if reporter is not None:
reporter.local_permission_error(local_path)
return False
return True


def get_file_mtime(local_path):
"""
Get modification time of a file in milliseconds.
Expand Down
3 changes: 3 additions & 0 deletions b2sdk/v1/sync/scan_policies.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,9 @@ def __init__(self, scan_policies_manager: ScanPoliciesManager):
def __repr__(self):
return f"{self.__class__.__name__}({self.scan_policies_manager})"

def should_exclude_relative_path(self, relative_path: str):
self.scan_policies_manager.should_exclude_file(relative_path)

def should_exclude_local_path(self, local_path: v2.LocalSyncPath):
if self.scan_policies_manager.should_exclude_file_version(
_translate_local_path_to_file(local_path).latest_version()
Expand Down
1 change: 1 addition & 0 deletions changelog.d/456.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Move scan filters before a read on filesystem access attempt. This will prevent unnecessary warnings and IO operations on paths that are not relevant to the operation.
70 changes: 69 additions & 1 deletion test/unit/scan/test_folder_traversal.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import platform
import re
import sys
from unittest.mock import MagicMock
from unittest.mock import MagicMock, patch

import pytest

Expand Down Expand Up @@ -654,3 +654,71 @@ def test_folder_all_files__dir_excluded_by_regex(self, tmp_path):
assert absolute_paths == [
fix_windows_path_limit(str(d1_dir / "file1.txt")),
]

def test_excluded_no_access_check(self, tmp_path):
"""Test that a directory/file is not checked for access if it is excluded."""
# Create directories and files
excluded_dir = tmp_path / "excluded_dir"
excluded_dir.mkdir()
excluded_file = excluded_dir / "excluded_file.txt"
excluded_file.touch()
included_dir = tmp_path / "included_dir"
included_dir.mkdir()
(included_dir / "excluded_file.txt").touch()

# Setup exclusion regex that matches the excluded directory/file name
scan_policy = ScanPoliciesManager(
exclude_dir_regexes=[r"excluded_dir$"], exclude_file_regexes=[r'.*excluded_file.txt']
)
reporter = ProgressReport(sys.stdout, False)

# Patch os.access to monitor if it is called on the excluded file
with patch('os.access', MagicMock(return_value=True)) as mocked_access:
folder = LocalFolder(str(tmp_path))
list(folder.all_files(reporter=reporter, policies_manager=scan_policy))

# Verify os.access was not called for the excluded directory/file
mocked_access.assert_not_called()

reporter.close()

def test_excluded_without_permissions(self, tmp_path):
"""Test that a excluded directory/file without permissions is not processed and no warning is issued."""
excluded_dir = tmp_path / "excluded_dir"
excluded_dir.mkdir()
(excluded_dir / "file.txt").touch()

included_dir = tmp_path / "included_dir"
included_dir.mkdir()
(included_dir / "excluded_file.txt").touch()
(included_dir / "included_file.txt").touch()

# Modify directory permissions to simulate lack of access
(included_dir / "excluded_file.txt").chmod(0o000)
excluded_dir.chmod(0o000)

scan_policy = ScanPoliciesManager(
exclude_dir_regexes=[r"excluded_dir$"], exclude_file_regexes=[r'.*excluded_file.txt']
)
reporter = ProgressReport(sys.stdout, False)

folder = LocalFolder(str(tmp_path))
local_paths = folder.all_files(reporter=reporter, policies_manager=scan_policy)
absolute_paths = [path.absolute_path for path in local_paths]

# Restore directory permissions to clean up
(included_dir / "excluded_file.txt").chmod(0o755)
excluded_dir.chmod(0o755)

# Check that only included_dir/included_file.txt was return
assert any('included_file.txt' in path for path in absolute_paths)

# Check that no access warnings are issued for the excluded directory/file
assert not any(
re.match(
r"WARNING: .*excluded_.* could not be accessed \(no permissions to read\?\)",
warning,
) for warning in reporter.warnings
), "Access warning was issued for the excluded directory/file"

reporter.close()
10 changes: 5 additions & 5 deletions test/unit/v0/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ class TestFolder(TestSync):

NAMES = [
'.dot_file',
'hello.',
os.path.join('hello', 'a', '1'),
os.path.join('hello', 'a', '2'),
os.path.join('hello', 'b'),
'hello.',
'hello0',
os.path.join('inner', 'a.bin'),
os.path.join('inner', 'a.txt'),
Expand Down Expand Up @@ -106,10 +106,10 @@ def assert_filtered_files(self, scan_results, expected_scan_results):
def test_exclusions(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'hello0',
'inner/a.txt',
'inner/b.txt',
Expand All @@ -129,10 +129,10 @@ def test_exclude_all(self):
def test_exclusions_inclusions(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'hello0',
'inner/a.bin',
'inner/a.txt',
Expand All @@ -151,8 +151,8 @@ def test_exclusions_inclusions(self):
def test_exclude_matches_prefix(self):
expected_list = [
'.dot_file',
'hello.',
'hello/b',
'hello.',
'hello0',
'inner/b.bin',
'inner/b.txt',
Expand Down Expand Up @@ -204,10 +204,10 @@ def test_exclude_directory_trailing_slash_does_not_match(self):
def test_exclusion_with_exact_match(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'inner/a.bin',
'inner/a.txt',
'inner/b.bin',
Expand Down
10 changes: 5 additions & 5 deletions test/unit/v1/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,10 @@ class TestFolder(TestSync):

NAMES = [
'.dot_file',
'hello.',
os.path.join('hello', 'a', '1'),
os.path.join('hello', 'a', '2'),
os.path.join('hello', 'b'),
'hello.',
'hello0',
os.path.join('inner', 'a.bin'),
os.path.join('inner', 'a.txt'),
Expand Down Expand Up @@ -109,10 +109,10 @@ def assert_filtered_files(self, scan_results, expected_scan_results):
def test_exclusions(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'hello0',
'inner/a.txt',
'inner/b.txt',
Expand All @@ -132,10 +132,10 @@ def test_exclude_all(self):
def test_exclusions_inclusions(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'hello0',
'inner/a.bin',
'inner/a.txt',
Expand All @@ -154,8 +154,8 @@ def test_exclusions_inclusions(self):
def test_exclude_matches_prefix(self):
expected_list = [
'.dot_file',
'hello.',
'hello/b',
'hello.',
'hello0',
'inner/b.bin',
'inner/b.txt',
Expand Down Expand Up @@ -207,10 +207,10 @@ def test_exclude_directory_trailing_slash_does_not_match(self):
def test_exclusion_with_exact_match(self):
expected_list = [
'.dot_file',
'hello.',
'hello/a/1',
'hello/a/2',
'hello/b',
'hello.',
'inner/a.bin',
'inner/a.txt',
'inner/b.bin',
Expand Down

0 comments on commit 1b3189e

Please sign in to comment.