Skip to content

Commit

Permalink
Merge pull request #2210 from freelawproject/2204-harvard-opinions-im…
Browse files Browse the repository at this point in the history
…port-duplicates-with-some-case-names

Keep acronyms/abbreviations in case name when searching for duplicates in Harvard opinion importer
  • Loading branch information
flooie authored Nov 22, 2022
2 parents fbcad53 + 52a5ae4 commit e043f42
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 4 deletions.
21 changes: 18 additions & 3 deletions cl/corpus_importer/management/commands/harvard_opinions.py
Original file line number Diff line number Diff line change
Expand Up @@ -999,18 +999,33 @@ def winnow_case_name(case_name: str) -> Set:
"|".join(map(re.escape, false_positive_strings))
)

# Remove all non alphanumeric characters
# Fix case name to be cleaner
case_name = harmonize(case_name)

# Join abbreviations/acronyms. e.g. "D.L.M. v. T.J.S." -> "DLM v. TJS"
case_name = re.sub(
r"\b[a-zA-Z][a-zA-Z\.]*[A-Za-z]\b\.?",
lambda m: m.group().replace(".", ""),
case_name,
)

# Remove all non-alphanumeric characters
case_title = re.sub(r"[^a-z0-9 ]", " ", case_name.lower())

# Remove strings that can cause an unnecessary overlap
case_title = false_positive_strings_regex.sub("", case_title)

# Remove one letter words, initials etc.
# Remove one-letter words, initials etc.
case_title = re.sub(r"\b[^ ]\b", "", case_title)

if not case_title:
# Log the case name if the process reduces it to blank
logger.warning(f"Case name: {case_name} reduced to blank.")

# Convert case name to set of words
cleaned_set = set(case_title.split())

# Lastly remove our ever growing set of false positive words
# Lastly remove our ever-growing set of false positive words
# This is different from bad words, but may have some overlap.
return cleaned_set - (cleaned_set & false_positive_set)

Expand Down
45 changes: 44 additions & 1 deletion cl/corpus_importer/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
OpinionCluster,
RECAPDocument,
)
from cl.settings import INSTALL_ROOT, MEDIA_ROOT
from cl.settings import MEDIA_ROOT
from cl.tests.cases import SimpleTestCase, TestCase


Expand Down Expand Up @@ -829,3 +829,46 @@ def test_case_name_winnowing_comparison(self):
harvard_case
)
self.assertEqual(len(overlap), 0)

def test_case_names_with_abbreviations(self):
    """Check overlap counts for case names that contain abbreviations.

    For every case, the Harvard full name and abbreviation are joined
    into one string and winnowed, then intersected with the winnowed
    CourtListener case name. The number of shared words must equal the
    case's expected "overlaps" value.
    """

    # Same case on both sides, so an overlap is expected
    case_1_data = {
        "case_name_full": "In the matter of S.J.S., a minor child. "
        "D.L.M. and D.E.M., Petitioners/Respondents v."
        " T.J.S.",
        "case_name_abbreviation": "D.L.M. v. T.J.S.",
        "case_name_cl": "D.L.M. v. T.J.S.",
        "overlaps": 2,
    }

    # Same case on both sides again, this time with a company name
    case_2_data = {
        "case_name_full": "Appeal of HAMILTON & CHAMBERS CO., INC.",
        "case_name_abbreviation": "Appeal of Hamilton & Chambers Co.",
        "case_name_cl": "Appeal of Hamilton & Chambers Co.",
        "overlaps": 4,
    }

    # Different case names on each side, so no overlap is expected
    case_3_data = {
        "case_name_full": "Henry B. Wesselman et al., as Executors of "
        "Blanche Wesselman, Deceased, Respondents, "
        "v. The Engel Company, Inc., et al., "
        "Appellants, et al., Defendants",
        "case_name_abbreviation": "Wesselman v. Engel Co.",
        "case_name_cl": " McQuillan v. Schechter",
        "overlaps": 0,
    }

    cases = [case_1_data, case_2_data, case_3_data]

    for case in cases:
        # subTest keeps the remaining cases running after a failure and
        # labels the report with the case name that failed.
        with self.subTest(case_name_cl=case["case_name_cl"]):
            harvard_case = (
                f"{case['case_name_full']} "
                f"{case['case_name_abbreviation']}"
            )
            overlap = winnow_case_name(
                case["case_name_cl"]
            ) & winnow_case_name(harvard_case)

            self.assertEqual(
                len(overlap),
                case["overlaps"],
                msg=f"Unexpected overlapping words: {sorted(overlap)}",
            )

0 comments on commit e043f42

Please sign in to comment.