Skip to content

Commit

Permalink
Improve logging and error handling when ingesting an entire folder (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
NetroScript authored Oct 30, 2023
1 parent 5d1be6e commit b0e2582
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 6 deletions.
6 changes: 6 additions & 0 deletions docs/description.md
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,12 @@ and optionally watch changes on it with the command:
make ingest /path/to/folder -- --watch
```

To log the processed and failed files to an additional file, use:

```bash
make ingest /path/to/folder -- --watch --log-file /path/to/log/file.log
```

After ingestion is complete, you should be able to chat with your documents
by navigating to http://localhost:8001 and using the option `Query documents`,
or using the completions / chat API.
Expand Down
54 changes: 48 additions & 6 deletions scripts/ingest_folder.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import argparse
import sys
import logging
from pathlib import Path

from private_gpt.di import root_injector
from private_gpt.server.ingest.ingest_service import IngestService
from private_gpt.server.ingest.ingest_watcher import IngestWatcher

# Module-level logger used for all progress and failure messages in this script.
logger = logging.getLogger(__name__)

# Resolve the ingestion service from the dependency injector exactly once;
# every ingest call in this script goes through this instance.
ingest_service = root_injector.get(IngestService)

parser = argparse.ArgumentParser(prog="ingest_folder.py")
parser.add_argument("folder", help="Folder to ingest")
Expand All @@ -17,29 +18,70 @@
action=argparse.BooleanOptionalAction,
default=False,
)
# Optional log destination: when given, a file handler is attached to the
# module logger so progress and failure messages are also written to that file.
parser.add_argument(
    "--log-file",
    help="Optional path to a log file. If provided, logs will be written to this file.",
    type=str,
    default=None,
)
# Parse the command line once; the rest of the script reads its settings from `args`.
args = parser.parse_args()

# Set up logging to a file if a path is provided.
if args.log_file:
    # Append so repeated runs accumulate in the same file. The encoding is
    # pinned to UTF-8 because the logged messages contain file paths, which may
    # hold characters the platform's default encoding cannot represent.
    file_handler = logging.FileHandler(args.log_file, mode="a", encoding="utf-8")
    file_handler.setFormatter(
        logging.Formatter(
            "[%(asctime)s.%(msecs)03d] [%(levelname)s] %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    # NOTE(review): no level is set on `logger` here, so INFO records reach this
    # handler only if logging is configured at INFO or lower elsewhere — confirm.
    logger.addHandler(file_handler)


# Progress counters shared by count_documents() and _recursive_ingest_folder().
# total_documents: number of files found by count_documents().
# current_document_count: number of files handed to ingestion so far.
total_documents = 0
current_document_count = 0


def count_documents(folder_path: Path) -> None:
    """Add the number of files under *folder_path* to ``total_documents``.

    Subfolders are walked recursively. The result is accumulated into the
    module-level counter rather than returned.
    """
    global total_documents
    for entry in folder_path.iterdir():
        if entry.is_dir():
            # Descend first; the nested call adds its own files to the counter.
            count_documents(entry)
        elif entry.is_file():
            total_documents += 1


def _recursive_ingest_folder(folder_path: Path) -> None:
    """Ingest every file under *folder_path*, descending into subfolders.

    Before each file a progress line ("Document N of M (P%)") is logged, then
    the file is handed to ``_do_ingest``. The module-level
    ``current_document_count`` is incremented for every file encountered.
    """
    global current_document_count, total_documents
    for file_path in folder_path.iterdir():
        if file_path.is_file():
            current_document_count += 1
            # The folder can change after counting (or counting may have found
            # nothing), so never let the denominator fall below the running
            # count: this avoids a ZeroDivisionError and percentages above 100.
            expected_total = max(total_documents, current_document_count)
            percent = (current_document_count / expected_total) * 100
            logger.info(
                f"Document {current_document_count} of {expected_total} ({percent:.2f}%)"
            )
            _do_ingest(file_path)
        elif file_path.is_dir():
            _recursive_ingest_folder(file_path)


def _do_ingest(changed_path: Path) -> None:
    """Ingest a single file, logging its start, completion, or failure.

    Used both for the initial folder walk and as the callback passed to
    ``IngestWatcher``. A path that does not exist is skipped without logging.
    Any exception raised during ingestion is logged and swallowed so that one
    bad document does not abort the remaining files.
    """
    try:
        if changed_path.exists():
            logger.info(f"Started ingesting {changed_path}")
            ingest_service.ingest(changed_path.name, changed_path)
            logger.info(f"Completed ingesting {changed_path}")
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a best-effort
        # batch. logger.exception records the traceback along with the message.
        logger.exception(f"Failed to ingest document: {changed_path}. Error: {e}")


# Resolve the target folder and fail early if it is missing.
path = Path(args.folder)
if not path.exists():
    raise ValueError(f"Path {args.folder} does not exist")

# Count total documents before ingestion so progress can be reported as "N of M".
count_documents(path)

# Initial pass: ingest everything currently in the folder tree.
_recursive_ingest_folder(path)
if args.watch:
    # Keep running and ingest files as they change; each change is passed to
    # _do_ingest by the watcher.
    logger.info(f"Watching {args.folder} for changes, press Ctrl+C to stop...")
    watcher = IngestWatcher(args.folder, _do_ingest)
    watcher.start()

0 comments on commit b0e2582

Please sign in to comment.