feat: increase file scanning performance (#486)

* feat: increase file scanning performance
* fix: correct typo in comment
* refactor: use `continue` in place of nested `if`s
TagStudioDev · Sep 12, 2024 · 6490cc9 · 6490cc9
1 parent dfa4079
commit 6490cc9
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 51 deletions.
diff --git a/tagstudio/src/core/library.py b/tagstudio/src/core/library.py
@@ -737,7 +737,9 @@ def _map_filenames_to_entry_ids(self):
         """Maps a full filepath to its corresponding Entry's ID."""
         self.filename_to_entry_id_map.clear()
         for entry in self.entries:
-            self.filename_to_entry_id_map[(entry.path / entry.filename)] = entry.id
+            self.filename_to_entry_id_map[
+                (self.library_dir / entry.path / entry.filename)
+            ] = entry.id
 
     # def _map_filenames_to_entry_ids(self):
     # 	"""Maps the file paths of entries to their index in the library list."""
@@ -884,59 +886,71 @@ def refresh_dir(self) -> Generator:
 
         # Scans the directory for files, keeping track of:
         #   - Total file count
-        #   - Files without library entries
-        # for type in TYPES:
-        start_time = time.time()
+        #   - Files without Library entries
+        start_time_total = time.time()
+        start_time_loop = time.time()
+        ext_set = set(self.ext_list)  # Should be slightly faster
         for f in self.library_dir.glob("**/*"):
+            end_time_loop = time.time()
+            # Yield output every 1/30 of a second
+            if (end_time_loop - start_time_loop) > 0.034:
+                yield self.dir_file_count
+                start_time_loop = time.time()
             try:
+                # Skip this file if it should be excluded
+                ext: str = f.suffix.lower()
+                if (ext in ext_set and self.is_exclude_list) or (
+                    ext not in ext_set and not self.is_exclude_list
+                ):
+                    continue
+
+                # Finish if the file/path is already mapped in the Library
+                if self.filename_to_entry_id_map.get(f) is not None:
+                    # No other checks are required.
+                    self.dir_file_count += 1
+                    continue
+
+                # If the file is new, check for validity
                 if (
-                    "$RECYCLE.BIN" not in f.parts
-                    and TS_FOLDER_NAME not in f.parts
-                    and "tagstudio_thumbs" not in f.parts
-                    and not f.is_dir()
+                    "$RECYCLE.BIN" in f.parts
+                    or TS_FOLDER_NAME in f.parts
+                    or "tagstudio_thumbs" in f.parts
+                    or f.is_dir()
                 ):
-                    if f.suffix.lower() not in self.ext_list and self.is_exclude_list:
-                        self.dir_file_count += 1
-                        file = f.relative_to(self.library_dir)
-                        if file not in self.filename_to_entry_id_map:
-                            self.files_not_in_library.append(file)
-                    elif f.suffix.lower() in self.ext_list and not self.is_exclude_list:
-                        self.dir_file_count += 1
-                        file = f.relative_to(self.library_dir)
-                        try:
-                            _ = self.filename_to_entry_id_map[file]
-                        except KeyError:
-                            # print(file)
-                            self.files_not_in_library.append(file)
+                    continue
+
+                # Record the validated new file as not yet in the Library
+                self.dir_file_count += 1
+                self.files_not_in_library.append(f)
+
             except PermissionError:
-                logging.info(
-                    f"The File/Folder {f} cannot be accessed, because it requires higher permission!"
-                )
-            end_time = time.time()
-            # Yield output every 1/30 of a second
-            if (end_time - start_time) > 0.034:
-                yield self.dir_file_count
-                start_time = time.time()
-        # Sorts the files by date modified, descending.
+                logging.info(f'[LIBRARY] Cannot access "{f}": PermissionError')
+
+        yield self.dir_file_count
+        end_time_total = time.time()
+        logging.info(
+            f"[LIBRARY] Scanned directories in {(end_time_total - start_time_total):.3f} seconds"
+        )
+        # Sorts the files by st_birthtime (Windows/macOS) or st_ctime (other platforms), descending
         if len(self.files_not_in_library) <= 150000:
             try:
                 if platform.system() == "Windows" or platform.system() == "Darwin":
                     self.files_not_in_library = sorted(
                         self.files_not_in_library,
-                        key=lambda t: -(self.library_dir / t).stat().st_birthtime,  # type: ignore[attr-defined]
+                        key=lambda t: -(t).stat().st_birthtime,  # type: ignore[attr-defined]
                     )
                 else:
                     self.files_not_in_library = sorted(
                         self.files_not_in_library,
-                        key=lambda t: -(self.library_dir / t).stat().st_ctime,
+                        key=lambda t: -(t).stat().st_ctime,
                     )
             except (FileExistsError, FileNotFoundError):
-                print(
-                    "[LIBRARY] [ERROR] Couldn't sort files, some were moved during the scanning/sorting process."
+                logging.info(
+                    "[LIBRARY][ERROR] Couldn't sort files, some were moved during the scanning/sorting process."
                 )
                 pass
         else:
-            print(
+            logging.info(
                 "[LIBRARY][INFO] Not bothering to sort files because there's OVER 150,000! Better sorting methods will be added in the future."
             )
 
@@ -957,7 +971,7 @@ def remove_entry(self, entry_id: int) -> None:
         # Step [1/2]:
         # Remove this Entry from the Entries list.
         entry = self.get_entry(entry_id)
-        path = entry.path / entry.filename
+        path = self.library_dir / entry.path / entry.filename
         # logging.info(f'Removing path: {path}')
 
         del self.filename_to_entry_id_map[path]
@@ -1087,8 +1101,8 @@ def refresh_dupe_files(self, results_filepath: str | Path):
                             )
                         )
                 for match in matches:
-                    file_1 = files[match[0]].relative_to(self.library_dir)
-                    file_2 = files[match[1]].relative_to(self.library_dir)
+                    file_1 = files[match[0]]
+                    file_2 = files[match[1]]
 
                     if (
                         file_1 in self.filename_to_entry_id_map.keys()
@@ -1289,8 +1303,7 @@ def add_new_files_as_entries(self) -> list[int]:
         """Adds files from the `files_not_in_library` list to the Library as Entries. Returns list of added indices."""
         new_ids: list[int] = []
         for file in self.files_not_in_library:
-            path = Path(file)
-            # print(os.path.split(file))
+            path = Path(*file.parts[len(self.library_dir.parts) :])
             entry = Entry(
                 id=self._next_entry_id, filename=path.name, path=path.parent, fields=[]
             )
@@ -1301,8 +1314,6 @@ def add_new_files_as_entries(self) -> list[int]:
         self.files_not_in_library.clear()
         return new_ids
 
-        self.files_not_in_library.clear()
-
     def get_entry(self, entry_id: int) -> Entry:
         """Returns an Entry object given an Entry ID."""
         return self.entries[self._entry_id_to_index_map[int(entry_id)]]
@@ -1323,9 +1334,7 @@ def get_entry_id_from_filepath(self, filename: Path):
         """Returns an Entry ID given the full filepath it points to."""
         try:
             if self.entries:
-                return self.filename_to_entry_id_map[
-                    Path(filename).relative_to(self.library_dir)
-                ]
+                return self.filename_to_entry_id_map[filename]
         except KeyError:
             return -1
 

diff --git a/tagstudio/src/qt/modals/drop_import.py b/tagstudio/src/qt/modals/drop_import.py
@@ -106,6 +106,7 @@ def copy_files(self):
                 continue
 
             dest_file = self.get_relative_path(file)
+            full_dest_path: Path = self.driver.lib.library_dir / dest_file
 
             if file in self.duplicate_files:
                 duplicated_files_progress += 1
@@ -115,14 +116,12 @@ def copy_files(self):
                 if self.choice == 2:  # rename
                     new_name = self.get_renamed_duplicate_filename_in_lib(dest_file)
                     dest_file = dest_file.with_name(new_name)
-                    self.driver.lib.files_not_in_library.append(dest_file)
+                    self.driver.lib.files_not_in_library.append(full_dest_path)
             else:  # override is simply copying but not adding a new entry
-                self.driver.lib.files_not_in_library.append(dest_file)
+                self.driver.lib.files_not_in_library.append(full_dest_path)
 
-            (self.driver.lib.library_dir / dest_file).parent.mkdir(
-                parents=True, exist_ok=True
-            )
-            shutil.copyfile(file, self.driver.lib.library_dir / dest_file)
+            (full_dest_path).parent.mkdir(parents=True, exist_ok=True)
+            shutil.copyfile(file, full_dest_path)
 
             fileCount += 1
             yield [fileCount, duplicated_files_progress]