Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add score filtering #634

Merged
merged 12 commits into from
Jul 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,15 @@ The following options apply only to the `download` command. This command downloa
- This skips all submissions from the specified subreddit
- Can be specified multiple times
- Also accepts CSV subreddit names
- `--min-score`
- This skips all submissions which have fewer than specified upvotes
- `--max-score`
- This skips all submissions which have more than specified upvotes
- `--min-score-ratio`
- This skips all submissions which have lower than specified upvote ratio
- `--max-score-ratio`
- This skips all submissions which have higher than specified upvote ratio


### Archiver Options

Expand Down
4 changes: 4 additions & 0 deletions bdfr/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@
click.option('--skip', default=None, multiple=True),
click.option('--skip-domain', default=None, multiple=True),
click.option('--skip-subreddit', default=None, multiple=True),
click.option('--min-score', type=int, default=None),
click.option('--max-score', type=int, default=None),
click.option('--min-score-ratio', type=float, default=None),
click.option('--max-score-ratio', type=float, default=None),
]

_archiver_options = [
Expand Down
4 changes: 4 additions & 0 deletions bdfr/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ def __init__(self):
self.skip: list[str] = []
self.skip_domain: list[str] = []
self.skip_subreddit: list[str] = []
self.min_score = None
self.max_score = None
self.min_score_ratio = None
self.max_score_ratio = None
self.sort: str = 'hot'
self.submitted: bool = False
self.subscribed: bool = False
Expand Down
13 changes: 13 additions & 0 deletions bdfr/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,19 @@ def _download_submission(self, submission: praw.models.Submission):
f'Submission {submission.id} in {submission.subreddit.display_name} skipped'
f' due to {submission.author.name if submission.author else "DELETED"} being an ignored user')
return
elif self.args.min_score and submission.score < self.args.min_score:
logger.debug(
f"Submission {submission.id} filtered due to score {submission.score} < [{self.args.min_score}]")
return
elif self.args.max_score and self.args.max_score < submission.score:
logger.debug(
f"Submission {submission.id} filtered due to score {submission.score} > [{self.args.max_score}]")
return
elif (self.args.min_score_ratio and submission.upvote_ratio < self.args.min_score_ratio) or (
self.args.max_score_ratio and self.args.max_score_ratio < submission.upvote_ratio
):
logger.debug(f"Submission {submission.id} filtered due to score ratio ({submission.upvote_ratio})")
return
elif not isinstance(submission, praw.models.Submission):
logger.warning(f'{submission.id} is not a submission')
return
Expand Down
1 change: 1 addition & 0 deletions scripts/extract_successful_ids.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ fi
grep 'Download filter' "$file" | awk '{ print $(NF-3) }' ;
grep 'already exists, continuing' "$file" | awk '{ print $(NF-3) }' ;
grep 'Hard link made' "$file" | awk '{ print $(NF) }' ;
grep 'filtered due to score' "$file" | awk '{ print $9 }'
}
2 changes: 2 additions & 0 deletions scripts/tests/example_logfiles/succeed_score_filter.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[2022-07-23 14:04:14,095 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 15 < [50]
[2022-07-23 14:04:14,104 - bdfr.downloader - DEBUG] - Submission ljyy27 filtered due to score 16 > [1]
15 changes: 10 additions & 5 deletions scripts/tests/test_extract_failed_ids.bats
Original file line number Diff line number Diff line change
Expand Up @@ -13,31 +13,36 @@ teardown() {
}

@test "fail no downloader module" {
run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt >> failed.txt
run ../extract_failed_ids.sh ./example_logfiles/failed_no_downloader.txt
echo "$output" > failed.txt
assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "3" ];
assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail resource error" {
run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt >> failed.txt
run ../extract_failed_ids.sh ./example_logfiles/failed_resource_error.txt
echo "$output" > failed.txt
assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail site downloader error" {
run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt >> failed.txt
run ../extract_failed_ids.sh ./example_logfiles/failed_sitedownloader_error.txt
echo "$output" > failed.txt
assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "2" ];
assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail failed file write" {
run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt >> failed.txt
run ../extract_failed_ids.sh ./example_logfiles/failed_write_error.txt
echo "$output" > failed.txt
assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}

@test "fail disabled module" {
run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt >> failed.txt
run ../extract_failed_ids.sh ./example_logfiles/failed_disabled_module.txt
echo "$output" > failed.txt
assert [ "$( wc -l 'failed.txt' | awk '{ print $1 }' )" -eq "1" ];
assert [ "$( grep -Ecv '\w{6,7}' 'failed.txt' )" -eq "0" ];
}
22 changes: 17 additions & 5 deletions scripts/tests/test_extract_successful_ids.bats
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,43 @@ teardown() {
}

@test "success downloaded submission" {
run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt >> ./successful.txt
run ../extract_successful_ids.sh ./example_logfiles/succeed_downloaded_submission.txt
echo "$output" > successful.txt
assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "7" ];
assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success resource hash" {
run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt >> ./successful.txt
run ../extract_successful_ids.sh ./example_logfiles/succeed_resource_hash.txt
echo "$output" > successful.txt
assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ];
assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success download filter" {
run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt >> ./successful.txt
run ../extract_successful_ids.sh ./example_logfiles/succeed_download_filter.txt
echo "$output" > successful.txt
assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ];
assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success already exists" {
run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt >> ./successful.txt
run ../extract_successful_ids.sh ./example_logfiles/succeed_already_exists.txt
echo "$output" > successful.txt
assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "3" ];
assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success hard link" {
run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt >> ./successful.txt
run ../extract_successful_ids.sh ./example_logfiles/succeed_hard_link.txt
echo "$output" > successful.txt
assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "1" ];
assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}

@test "success score filter" {
run ../extract_successful_ids.sh ./example_logfiles/succeed_score_filter.txt
echo "$output" > successful.txt
assert [ "$( wc -l 'successful.txt' | awk '{ print $1 }' )" -eq "2" ];
assert [ "$( grep -Ecv '\w{6,7}' 'successful.txt' )" -eq "0" ];
}
17 changes: 17 additions & 0 deletions tests/integration_tests/test_download_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,3 +351,20 @@ def test_cli_download_ignore_user(test_args: list[str], tmp_path: Path):
assert result.exit_code == 0
assert 'Downloaded submission' not in result.output
assert 'being an ignored user' in result.output


@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.skipif(not does_test_config_exist, reason='A test config file is required for integration tests')
@pytest.mark.parametrize(('test_args', 'was_filtered'), (
(['-l', 'ljyy27', '--min-score', '50'], True),
(['-l', 'ljyy27', '--min-score', '1'], False),
(['-l', 'ljyy27', '--max-score', '1'], True),
(['-l', 'ljyy27', '--max-score', '100'], False),
))
def test_cli_download_score_filter(test_args: list[str], was_filtered: bool, tmp_path: Path):
runner = CliRunner()
test_args = create_basic_args_for_download_runner(test_args, tmp_path)
result = runner.invoke(cli, test_args)
assert result.exit_code == 0
assert ('filtered due to score' in result.output) == was_filtered
104 changes: 104 additions & 0 deletions tests/test_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,3 +200,107 @@ def test_download_submission(
RedditDownloader._download_submission(downloader_mock, submission)
folder_contents = list(tmp_path.iterdir())
assert len(folder_contents) == expected_files_len


@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'min_score'), (
('ljyy27', 1),
))
def test_download_submission_min_score_above(
test_submission_id: str,
min_score: int,
downloader_mock: MagicMock,
reddit_instance: praw.Reddit,
tmp_path: Path,
capsys: pytest.CaptureFixture,
):
setup_logging(3)
downloader_mock.reddit_instance = reddit_instance
downloader_mock.download_filter.check_url.return_value = True
downloader_mock.args.folder_scheme = ''
downloader_mock.args.min_score = min_score
downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock)
downloader_mock.download_directory = tmp_path
submission = downloader_mock.reddit_instance.submission(id=test_submission_id)
RedditDownloader._download_submission(downloader_mock, submission)
output = capsys.readouterr()
assert 'filtered due to score' not in output.out


@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'min_score'), (
('ljyy27', 25),
))
def test_download_submission_min_score_below(
test_submission_id: str,
min_score: int,
downloader_mock: MagicMock,
reddit_instance: praw.Reddit,
tmp_path: Path,
capsys: pytest.CaptureFixture,
):
setup_logging(3)
downloader_mock.reddit_instance = reddit_instance
downloader_mock.download_filter.check_url.return_value = True
downloader_mock.args.folder_scheme = ''
downloader_mock.args.min_score = min_score
downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock)
downloader_mock.download_directory = tmp_path
submission = downloader_mock.reddit_instance.submission(id=test_submission_id)
RedditDownloader._download_submission(downloader_mock, submission)
output = capsys.readouterr()
assert 'filtered due to score' in output.out


@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'max_score'), (
('ljyy27', 25),
))
def test_download_submission_max_score_below(
test_submission_id: str,
max_score: int,
downloader_mock: MagicMock,
reddit_instance: praw.Reddit,
tmp_path: Path,
capsys: pytest.CaptureFixture,
):
setup_logging(3)
downloader_mock.reddit_instance = reddit_instance
downloader_mock.download_filter.check_url.return_value = True
downloader_mock.args.folder_scheme = ''
downloader_mock.args.max_score = max_score
downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock)
downloader_mock.download_directory = tmp_path
submission = downloader_mock.reddit_instance.submission(id=test_submission_id)
RedditDownloader._download_submission(downloader_mock, submission)
output = capsys.readouterr()
assert 'filtered due to score' not in output.out


@pytest.mark.online
@pytest.mark.reddit
@pytest.mark.parametrize(('test_submission_id', 'max_score'), (
('ljyy27', 1),
))
def test_download_submission_max_score_above(
test_submission_id: str,
max_score: int,
downloader_mock: MagicMock,
reddit_instance: praw.Reddit,
tmp_path: Path,
capsys: pytest.CaptureFixture,
):
setup_logging(3)
downloader_mock.reddit_instance = reddit_instance
downloader_mock.download_filter.check_url.return_value = True
downloader_mock.args.folder_scheme = ''
downloader_mock.args.max_score = max_score
downloader_mock.file_name_formatter = RedditConnector.create_file_name_formatter(downloader_mock)
downloader_mock.download_directory = tmp_path
submission = downloader_mock.reddit_instance.submission(id=test_submission_id)
RedditDownloader._download_submission(downloader_mock, submission)
output = capsys.readouterr()
assert 'filtered due to score' in output.out