Charcoal-SE · tripleee · May 19, 2022 · makyen · May 21, 2022 · makyen
diff --git a/findspam.py b/findspam.py
@@ -794,6 +794,34 @@ def misleading_link(s, site):
         return False, ''
 
 
+# noinspection PyUnusedLocal,PyMissingTypeHints,PyTypeChecker
+@create_rule("text repeated in {}", title=False, body_summary=True, max_rep=10000, max_score=10000)
+def body_text_repeated(s, site):
+    """
+    Detect a body which opens with the same phrase repeated at least ten times; returns (bool, why).
+    The phrase is located with two cheap anchored matches to reduce regex backtracking in the final test.
+    """
+    s = s.rstrip("\n")
+    if s.startswith("<p>") and s.endswith("</p>"):
+        s = s[3:-4]
+    initial_words = regex.match(r"\A([^\W_]+)[\W_]+([^\W_]+)[\W_]+([^\W_]+)", s)
+    if not initial_words:
+        return False, ""
+    escaped_initial_words = [regex.escape(x) for x in initial_words.groups()]
+    period = regex.match(
+        r"\A%s[\W_]+%s[\W_]+%s[\W_]+(.{1,40}?)%s[\W_]+%s[\W_]+%s(?=$|[\W_])" % (
+            tuple(escaped_initial_words * 2)), s)
+    if not period:
+        return False, ""
+    # Drop empty split() results: each adds an adjacent [\W_]+ to the loop, inviting heavy backtracking.
+    period_words = [x for x in regex.split(r"[\W_]+", period.group(1)) if x]
+    escaped_words = escaped_initial_words + [regex.escape(x) for x in period_words]
+    repeats = regex.match(r"\A(" + r"[\W_]+".join(escaped_words) + r"[\W_]*){10,}", s)
+    if repeats:
+        return True, "Body contains repeated phrase '%s'" % repeats.group(1)
+    return False, ""
+
+
 # noinspection PyUnusedLocal,PyMissingTypeHints,PyTypeChecker
 @create_rule("repeating words in {}", max_rep=11, stripcodeblocks=True)
 def has_repeating_words(s, site):

diff --git a/test/test_findspam.py b/test/test_findspam.py
@@ -136,6 +136,7 @@
     ('homoglyph phone numbers 07', '<p>Some 1-844i8O2i7S3S fbody</p>', 'a username', 'math.stackexchange.com', False, False, True),
     ('homoglyph phone numbers 08', '<p>Some 844-8O2-7S3S foobody</p>', 'a username', 'math.stackexchange.com', False, False, True),
     ('Multiple consecutive homoglyph numbers 1', '<p>SomeI-888-884-Olll 888-884-OIII +I-972-S34-S446 972-S34-S446 I-628-21S-2I66 628-21S-2l66 1-844i8O2i7S3S 844a8O2a7S3S body</p>', 'a username', 'math.stackexchange.com', False, False, True),
+    ('repeated body test', 'need enough interesting text to avoid few unique characters rule' * 15, 'luser', 'stackoverflow.com', False, False, True),
 ])
 def test_findspam(title, body, username, site, body_is_summary, is_answer, expected_spam):
     post = Post(api_response={'title': title, 'body': body,