Skip to content

Commit

Permalink
feat: increase file scanning performance (#486)
Browse files: browse the repository at this point in the history
* feat: increase file scanning performance

* fix: correct typo in comment

* refactor: use `continue` in place of nested `ifs`
  • Loading branch information
CyanVoxel authored Sep 12, 2024
1 parent dfa4079 commit 6490cc9
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 51 deletions.
99 changes: 54 additions & 45 deletions tagstudio/src/core/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,7 +737,9 @@ def _map_filenames_to_entry_ids(self):
"""Maps a full filepath to its corresponding Entry's ID."""
self.filename_to_entry_id_map.clear()
for entry in self.entries:
self.filename_to_entry_id_map[(entry.path / entry.filename)] = entry.id
self.filename_to_entry_id_map[
(self.library_dir / entry.path / entry.filename)
] = entry.id

# def _map_filenames_to_entry_ids(self):
# """Maps the file paths of entries to their index in the library list."""
Expand Down Expand Up @@ -884,59 +886,71 @@ def refresh_dir(self) -> Generator:

# Scans the directory for files, keeping track of:
# - Total file count
# - Files without library entries
# for type in TYPES:
start_time = time.time()
# - Files without Library entries
start_time_total = time.time()
start_time_loop = time.time()
ext_set = set(self.ext_list) # Should be slightly faster
for f in self.library_dir.glob("**/*"):
end_time_loop = time.time()
# Yield output every 1/30 of a second
if (end_time_loop - start_time_loop) > 0.034:
yield self.dir_file_count
start_time_loop = time.time()
try:
# Skip this file if it should be excluded
ext: str = f.suffix.lower()
if (ext in ext_set and self.is_exclude_list) or (
ext not in ext_set and not self.is_exclude_list
):
continue

# Finish if the file/path is already mapped in the Library
if self.filename_to_entry_id_map.get(f) is not None:
# No other checks are required.
self.dir_file_count += 1
continue

# If the file is new, check for validity
if (
"$RECYCLE.BIN" not in f.parts
and TS_FOLDER_NAME not in f.parts
and "tagstudio_thumbs" not in f.parts
and not f.is_dir()
"$RECYCLE.BIN" in f.parts
or TS_FOLDER_NAME in f.parts
or "tagstudio_thumbs" in f.parts
or f.is_dir()
):
if f.suffix.lower() not in self.ext_list and self.is_exclude_list:
self.dir_file_count += 1
file = f.relative_to(self.library_dir)
if file not in self.filename_to_entry_id_map:
self.files_not_in_library.append(file)
elif f.suffix.lower() in self.ext_list and not self.is_exclude_list:
self.dir_file_count += 1
file = f.relative_to(self.library_dir)
try:
_ = self.filename_to_entry_id_map[file]
except KeyError:
# print(file)
self.files_not_in_library.append(file)
continue

# Add the validated new file to the Library
self.dir_file_count += 1
self.files_not_in_library.append(f)

except PermissionError:
logging.info(
f"The File/Folder {f} cannot be accessed, because it requires higher permission!"
)
end_time = time.time()
# Yield output every 1/30 of a second
if (end_time - start_time) > 0.034:
yield self.dir_file_count
start_time = time.time()
# Sorts the files by date modified, descending.
logging.info(f'[LIBRARY] Cannot access "{f}": PermissionError')

yield self.dir_file_count
end_time_total = time.time()
logging.info(
f"[LIBRARY] Scanned directories in {(end_time_total - start_time_total):.3f} seconds"
)
# Sorts the files by date modified, descending
if len(self.files_not_in_library) <= 150000:
try:
if platform.system() == "Windows" or platform.system() == "Darwin":
self.files_not_in_library = sorted(
self.files_not_in_library,
key=lambda t: -(self.library_dir / t).stat().st_birthtime, # type: ignore[attr-defined]
key=lambda t: -(t).stat().st_birthtime, # type: ignore[attr-defined]
)
else:
self.files_not_in_library = sorted(
self.files_not_in_library,
key=lambda t: -(self.library_dir / t).stat().st_ctime,
key=lambda t: -(t).stat().st_ctime,
)
except (FileExistsError, FileNotFoundError):
print(
"[LIBRARY] [ERROR] Couldn't sort files, some were moved during the scanning/sorting process."
logging.info(
"[LIBRARY][ERROR] Couldn't sort files, some were moved during the scanning/sorting process."
)
pass
else:
print(
logging.info(
"[LIBRARY][INFO] Not bothering to sort files because there's OVER 150,000! Better sorting methods will be added in the future."
)

Expand All @@ -957,7 +971,7 @@ def remove_entry(self, entry_id: int) -> None:
# Step [1/2]:
# Remove this Entry from the Entries list.
entry = self.get_entry(entry_id)
path = entry.path / entry.filename
path = self.library_dir / entry.path / entry.filename
# logging.info(f'Removing path: {path}')

del self.filename_to_entry_id_map[path]
Expand Down Expand Up @@ -1087,8 +1101,8 @@ def refresh_dupe_files(self, results_filepath: str | Path):
)
)
for match in matches:
file_1 = files[match[0]].relative_to(self.library_dir)
file_2 = files[match[1]].relative_to(self.library_dir)
file_1 = files[match[0]]
file_2 = files[match[1]]

if (
file_1 in self.filename_to_entry_id_map.keys()
Expand Down Expand Up @@ -1289,8 +1303,7 @@ def add_new_files_as_entries(self) -> list[int]:
"""Adds files from the `files_not_in_library` list to the Library as Entries. Returns list of added indices."""
new_ids: list[int] = []
for file in self.files_not_in_library:
path = Path(file)
# print(os.path.split(file))
path = Path(*file.parts[len(self.library_dir.parts) :])
entry = Entry(
id=self._next_entry_id, filename=path.name, path=path.parent, fields=[]
)
Expand All @@ -1301,8 +1314,6 @@ def add_new_files_as_entries(self) -> list[int]:
self.files_not_in_library.clear()
return new_ids

self.files_not_in_library.clear()

def get_entry(self, entry_id: int) -> Entry:
"""Returns an Entry object given an Entry ID."""
return self.entries[self._entry_id_to_index_map[int(entry_id)]]
Expand All @@ -1323,9 +1334,7 @@ def get_entry_id_from_filepath(self, filename: Path):
"""Returns an Entry ID given the full filepath it points to."""
try:
if self.entries:
return self.filename_to_entry_id_map[
Path(filename).relative_to(self.library_dir)
]
return self.filename_to_entry_id_map[filename]
except KeyError:
return -1

Expand Down
11 changes: 5 additions & 6 deletions tagstudio/src/qt/modals/drop_import.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def copy_files(self):
continue

dest_file = self.get_relative_path(file)
full_dest_path: Path = self.driver.lib.library_dir / dest_file

if file in self.duplicate_files:
duplicated_files_progress += 1
Expand All @@ -115,14 +116,12 @@ def copy_files(self):
if self.choice == 2: # rename
new_name = self.get_renamed_duplicate_filename_in_lib(dest_file)
dest_file = dest_file.with_name(new_name)
self.driver.lib.files_not_in_library.append(dest_file)
self.driver.lib.files_not_in_library.append(full_dest_path)
else: # override is simply copying but not adding a new entry
self.driver.lib.files_not_in_library.append(dest_file)
self.driver.lib.files_not_in_library.append(full_dest_path)

(self.driver.lib.library_dir / dest_file).parent.mkdir(
parents=True, exist_ok=True
)
shutil.copyfile(file, self.driver.lib.library_dir / dest_file)
(full_dest_path).parent.mkdir(parents=True, exist_ok=True)
shutil.copyfile(file, full_dest_path)

fileCount += 1
yield [fileCount, duplicated_files_progress]
Expand Down

0 comments on commit 6490cc9

Please sign in to comment.