# noinspection PyUnusedLocal,PyMissingTypeHints,PyTypeChecker
@create_rule("text repeated in {}", title=False, body_summary=True, max_rep=10000, max_score=10000)
def body_text_repeated(s, site):
    """
    Detect a body that consists of one phrase repeated ten or more times.

    Do some hacks to reduce the need for regex backtracking for this rule:
    instead of one generic back-referencing pattern, the phrase is discovered
    in steps and then matched as a fully literal pattern.

      1. Take the first three words of the body.
      2. Find the shortest stretch (1-40 characters) after them that is
         followed by those same three words again; the three words plus that
         stretch form one period of the repetition.
      3. Build a literal pattern from the words of that period and require it
         to match at least ten times in a row from the start of the body.

    :param s: the text to check; trailing newlines and a single wrapping
              ``<p>...</p>`` pair are removed before matching
    :param site: not used by this rule
    :return: ``(True, reason)`` when the repetition is found, otherwise
             ``(False, "")``
    """
    # One or more non-word characters (underscore counts as a separator).
    separator = r"[\W_]+"

    s = s.rstrip("\n")
    if s.startswith("<p>") and s.endswith("</p>"):
        s = s[3:-4]  # len("<p>") == 3, len("</p>") == 4

    initial_words = regex.match(r"\A([^\W_]+)[\W_]+([^\W_]+)[\W_]+([^\W_]+)", s)
    if not initial_words:
        return False, ""
    escaped_initial_words = [regex.escape(x) for x in initial_words.groups()]
    initial_phrase = separator.join(escaped_initial_words)

    # The lazy group captures whatever lies between the first and the second
    # occurrence of the three initial words.
    period = regex.match(
        r"\A" + initial_phrase + separator + r"(.{1,40}?)" + initial_phrase + r"(?=$|[\W_])", s)
    if not period:
        return False, ""

    # The captured text usually ends (and can begin) with separator characters,
    # which makes split() return empty strings at the ends.  An empty token
    # would add an extra mandatory separator to the pattern below, so that the
    # final repetition (which has no trailing separator) never matches and the
    # count comes up one short.  Drop empty tokens.
    period_words = [x for x in regex.split(separator, period.group(1)) if x]
    escaped_words = escaped_initial_words + [regex.escape(x) for x in period_words]

    repeats_regex = r"\A(" + separator.join(escaped_words) + r"[\W_]*){10,}"
    repeats = regex.match(repeats_regex, s)
    if repeats:
        # group(1) holds the text of the last matched repetition.
        return True, "Body contains repeated phrase '%s'" % repeats.group(1)
    return False, ""

Some 1-844i8O2i7S3S fbody

', 'a username', 'math.stackexchange.com', False, False, True), ('homoglyph phone numbers 08', '

Some 844-8O2-7S3S foobody

', 'a username', 'math.stackexchange.com', False, False, True), ('Multiple consecutive homoglyph numbers 1', '

SomeI-888-884-Olll 888-884-OIII +I-972-S34-S446 972-S34-S446 I-628-21S-2I66 628-21S-2l66 1-844i8O2i7S3S 844a8O2a7S3S body

', 'a username', 'math.stackexchange.com', False, False, True), + ('repeated body test', 'need enough interesting text to avoid few unique characters rule' * 15, 'luser', 'stackoverflow.com', False, False, True), ]) def test_findspam(title, body, username, site, body_is_summary, is_answer, expected_spam): post = Post(api_response={'title': title, 'body': body,