Merge pull request #2210 from freelawproject/2204-harvard-opinions-import-duplicates-with-some-case-names

Keep acronyms/abbreviations in case name when searching for duplicates in Harvard opinion importer
freelawproject · Nov 22, 2022 · e043f42 · e043f42
2 parents fbcad53 + 52a5ae4
commit e043f42
Show file tree

Hide file tree

Showing 2 changed files with 62 additions and 4 deletions.
diff --git a/cl/corpus_importer/management/commands/harvard_opinions.py b/cl/corpus_importer/management/commands/harvard_opinions.py
@@ -999,18 +999,33 @@ def winnow_case_name(case_name: str) -> Set:
         "|".join(map(re.escape, false_positive_strings))
     )
 
-    # Remove all non alphanumeric characters
+    # Fix case name to be cleaner
     case_name = harmonize(case_name)
+
+    # Join abbreviations/acronyms. e.g. "D.L.M. v. T.J.S." -> "DLM v. TJS"
+    case_name = re.sub(
+        r"\b[a-zA-Z][a-zA-Z\.]*[A-Za-z]\b\.?",
+        lambda m: m.group().replace(".", ""),
+        case_name,
+    )
+
+    # Remove all non-alphanumeric characters
     case_title = re.sub(r"[^a-z0-9 ]", " ", case_name.lower())
 
     # Remove strings that can cause an unnecessary overlap
     case_title = false_positive_strings_regex.sub("", case_title)
 
-    # Remove one letter words, initials etc.
+    # Remove one-letter words, initials etc.
     case_title = re.sub(r"\b[^ ]\b", "", case_title)
+
+    if not case_title:
+        # Log the case name if the process reduces it to blank
+        logger.warning(f"Case name: {case_name} reduced to blank.")
+
+    # Convert case name to set of words
     cleaned_set = set(case_title.split())
 
-    # Lastly remove our ever growing set of false positive words
+    # Lastly remove our ever-growing set of false positive words
     # This is different from bad words, but may have some overlap.
     return cleaned_set - (cleaned_set & false_positive_set)
 

diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py
@@ -37,7 +37,7 @@
     OpinionCluster,
     RECAPDocument,
 )
-from cl.settings import INSTALL_ROOT, MEDIA_ROOT
+from cl.settings import MEDIA_ROOT
 from cl.tests.cases import SimpleTestCase, TestCase
 
 
@@ -829,3 +829,46 @@ def test_case_name_winnowing_comparison(self):
             harvard_case
         )
         self.assertEqual(len(overlap), 0)
+
+    def test_case_names_with_abbreviations(self):
+        """
+        Test that case names containing abbreviations overlap as expected after winnowing
+        """
+
+        # Check against itself, there must be an overlap
+        case_1_data = {
+            "case_name_full": "In the matter of S.J.S., a minor child. "
+            "D.L.M. and D.E.M., Petitioners/Respondents v."
+            " T.J.S.",
+            "case_name_abbreviation": "D.L.M. v. T.J.S.",
+            "case_name_cl": "D.L.M. v. T.J.S.",
+            "overlaps": 2,
+        }
+
+        case_2_data = {
+            "case_name_full": "Appeal of HAMILTON & CHAMBERS CO., INC.",
+            "case_name_abbreviation": "Appeal of Hamilton & Chambers Co.",
+            "case_name_cl": "Appeal of Hamilton & Chambers Co.",
+            "overlaps": 4,
+        }
+
+        # Check against a different case name, there shouldn't be an overlap
+        case_3_data = {
+            "case_name_full": "Henry B. Wesselman et al., as Executors of "
+            "Blanche Wesselman, Deceased, Respondents, "
+            "v. The Engel Company, Inc., et al., "
+            "Appellants, et al., Defendants",
+            "case_name_abbreviation": "Wesselman v. Engel Co.",
+            "case_name_cl": " McQuillan v. Schechter",
+            "overlaps": 0,
+        }
+
+        cases = [case_1_data, case_2_data, case_3_data]
+
+        for case in cases:
+            harvard_case = f"{case.get('case_name_full')} {case.get('case_name_abbreviation')}"
+            overlap = winnow_case_name(
+                case.get("case_name_cl")
+            ) & winnow_case_name(harvard_case)
+
+            self.assertEqual(len(overlap), case.get("overlaps"))