From 9832fb9ae243faada49ab6a3dbed8d55c6f8153d Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Mon, 21 Nov 2022 14:11:35 -0600
Subject: [PATCH 1/3] feat(recap): Accept recap uploads for docket iQuery pages

---
 ..._alter_processingqueue_upload_type_noop.py | 23 +++++
 ...alter_processingqueue_upload_type_noop.sql |  8 ++
 cl/recap/models.py                            |  2 +
 cl/recap/tasks.py                             | 84 +++++++++++++++++++
 cl/recap/tests.py                             | 20 +++++
 5 files changed, 137 insertions(+)
 create mode 100644 cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.py
 create mode 100644 cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.sql

diff --git a/cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.py b/cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.py
new file mode 100644
index 0000000000..dc924dc111
--- /dev/null
+++ b/cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.py
@@ -0,0 +1,23 @@
+# Generated by Django 3.2.16 on 2022-11-21 20:07
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('recap', '0008_alter_nos_noop'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='pacerhtmlfiles',
+            name='upload_type',
+            field=models.SmallIntegerField(choices=[(1, 'HTML Docket'), (2, 'HTML attachment page'), (3, 'PDF'), (4, 'Docket history report'), (5, 'Appellate HTML docket'), (6, 'Appellate HTML attachment page'), (7, 'Internet Archive XML docket'), (8, 'Case report (iquery.pl) page'), (9, 'Claims register page'), (10, 'Zip archive of RECAP Documents'), (11, 'Email in the SES storage format'), (12, 'Docket iQuery page')], help_text='The type of object that is uploaded'),
+        ),
+        migrations.AlterField(
+            model_name='processingqueue',
+            name='upload_type',
+            field=models.SmallIntegerField(choices=[(1, 'HTML Docket'), (2, 'HTML attachment page'), (3, 'PDF'), (4, 'Docket history report'), (5, 'Appellate HTML docket'), (6, 'Appellate HTML attachment page'), (7, 'Internet Archive XML docket'), (8, 'Case report (iquery.pl) page'), (9, 'Claims register page'), (10, 'Zip archive of RECAP Documents'), (11, 'Email in the SES storage format'), (12, 'Docket iQuery page')], help_text='The type of object that is uploaded'),
+        ),
+    ]
diff --git a/cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.sql b/cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.sql
new file mode 100644
index 0000000000..534936f930
--- /dev/null
+++ b/cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.sql
@@ -0,0 +1,8 @@
+BEGIN;
+--
+-- Alter field upload_type on pacerhtmlfiles
+--
+--
+-- Alter field upload_type on processingqueue
+--
+COMMIT;
diff --git a/cl/recap/models.py b/cl/recap/models.py
index f71fe3d9ee..15e3b47a29 100644
--- a/cl/recap/models.py
+++ b/cl/recap/models.py
@@ -20,6 +20,7 @@ class UPLOAD_TYPE:
     CLAIMS_REGISTER = 9
     DOCUMENT_ZIP = 10
     SES_EMAIL = 11
+    IQUERY_PAGE = 12
 
     NAMES = (
         (DOCKET, "HTML Docket"),
@@ -33,6 +34,7 @@ class UPLOAD_TYPE:
         (CLAIMS_REGISTER, "Claims register page"),
         (DOCUMENT_ZIP, "Zip archive of RECAP Documents"),
         (SES_EMAIL, "Email in the SES storage format"),
+        (IQUERY_PAGE, "Docket iQuery page"),
     )
 
 
diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py
index b50dde554c..e17dfacd98 100644
--- a/cl/recap/tasks.py
+++ b/cl/recap/tasks.py
@@ -118,6 +118,8 @@ def process_recap_upload(pq: ProcessingQueue) -> None:
         process_recap_claims_register.delay(pq.pk)
     elif pq.upload_type == UPLOAD_TYPE.DOCUMENT_ZIP:
         process_recap_zip.delay(pq.pk)
+    elif pq.upload_type == UPLOAD_TYPE.IQUERY_PAGE:
+        process_docket_iquery_page.delay(pq.pk)
 
 
 def do_pacer_fetch(fq: PacerFetchQueue):
@@ -823,6 +825,88 @@ def process_recap_docket_history_report(self, pk):
     }
 
 
+@app.task(
+    bind=True, max_retries=3, interval_start=5 * 60, interval_step=5 * 60
+)
+def process_docket_iquery_page(self, pk):
+    """Process the Docket iQuery page.
+
+    :param pk: The primary key of the processing queue item you want to work on
+    :returns: A dict indicating whether the docket needs Solr re-indexing.
+    """
+
+    pq = ProcessingQueue.objects.get(pk=pk)
+    mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS)
+    logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}")
+
+    try:
+        text = pq.filepath_local.read().decode()
+    except IOError as exc:
+        msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
+        if (self.request.retries == self.max_retries) or pq.debug:
+            mark_pq_status(pq, msg, PROCESSING_STATUS.FAILED)
+            return None
+        else:
+            mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
+            raise self.retry(exc=exc)
+
+    report = DocketHistoryReport(map_cl_to_pacer_id(pq.court_id))
+    report._parse_text(text)
+    data = report.data
+    logger.info(f"Parsing completed for item {pq}")
+
+    if data == {}:
+        # Bad docket iquery page.
+        msg = "Not a valid docket iquery page upload."
+        mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
+        self.request.chain = None
+        return None
+
+    # Merge the contents of the docket into CL.
+    d = find_docket_object(
+        pq.court_id, pq.pacer_case_id, data["docket_number"]
+    )
+    d.add_recap_source()
+    update_docket_metadata(d, data)
+
+    if pq.debug:
+        mark_pq_successful(pq, d_id=d.pk)
+        self.request.chain = None
+        return {"docket_pk": d.pk, "content_updated": False}
+
+    try:
+        d.save()
+    except IntegrityError as exc:
+        logger.warning(
+            "Race condition experienced while attempting docket save."
+        )
+        error_message = "Unable to save docket due to IntegrityError."
+        if self.request.retries == self.max_retries:
+            mark_pq_status(pq, error_message, PROCESSING_STATUS.FAILED)
+            self.request.chain = None
+            return None
+        else:
+            mark_pq_status(
+                pq, error_message, PROCESSING_STATUS.QUEUED_FOR_RETRY
+            )
+            raise self.retry(exc=exc)
+
+    # Add the HTML to the docket in case we need it someday.
+    pacer_file = PacerHtmlFiles(
+        content_object=d, upload_type=UPLOAD_TYPE.IQUERY_PAGE
+    )
+    pacer_file.filepath.save(
+        # We only care about the ext w/S3PrivateUUIDStorageTest
+        "docket_iquery_page.html",
+        ContentFile(text.encode()),
+    )
+    mark_pq_successful(pq, d_id=d.pk)
+    return {
+        "docket_pk": d.pk,
+        "content_updated": False,
+    }
+
+
 @app.task(bind=True, max_retries=3, ignore_result=True)
 def process_recap_appellate_docket(self, pk):
     """Process an uploaded appellate docket from the RECAP API endpoint.
diff --git a/cl/recap/tests.py b/cl/recap/tests.py
index a985c85468..a9bee0f26d 100644
--- a/cl/recap/tests.py
+++ b/cl/recap/tests.py
@@ -287,6 +287,26 @@ def test_ensure_no_users_in_response(self, mock):
             j[bad_key]
         mock.assert_called()
 
+    def test_uploading_a_docket_iquery_page(self, mock):
+        """Can we upload a docket iquery page and have it be saved correctly?
+
+        Note that this works fine even though we're not actually uploading a
+        docket due to the mock.
+ """ + self.data.update( + {"upload_type": UPLOAD_TYPE.IQUERY_PAGE, "document_number": ""} + ) + del self.data["pacer_doc_id"] + r = self.client.post(self.path, self.data) + self.assertEqual(r.status_code, HTTP_201_CREATED) + + j = json.loads(r.content) + path = reverse( + "processingqueue-detail", kwargs={"version": "v3", "pk": j["id"]} + ) + r = self.client.get(path) + self.assertEqual(r.status_code, HTTP_200_OK) + @mock.patch("cl.recap.tasks.DocketReport", new=fakes.FakeDocketReport) @mock.patch( From 5aaf1fd85ea447efaafc38024fb611cd6e95ed30 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Tue, 22 Nov 2022 16:58:58 -0600 Subject: [PATCH 2/3] fix(recap): Use CaseQuery report to parse iquery pages uploaded via recap extension --- ..._alter_processingqueue_upload_type_noop.py | 6 ++-- cl/recap/models.py | 4 +-- cl/recap/tasks.py | 31 +++++++++++++------ cl/recap/tests.py | 4 +-- 4 files changed, 29 insertions(+), 16 deletions(-) diff --git a/cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.py b/cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.py index dc924dc111..9e378db6cf 100644 --- a/cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.py +++ b/cl/recap/migrations/0009_alter_processingqueue_upload_type_noop.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.16 on 2022-11-21 20:07 +# Generated by Django 3.2.16 on 2022-11-22 22:57 from django.db import migrations, models @@ -13,11 +13,11 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='pacerhtmlfiles', name='upload_type', - field=models.SmallIntegerField(choices=[(1, 'HTML Docket'), (2, 'HTML attachment page'), (3, 'PDF'), (4, 'Docket history report'), (5, 'Appellate HTML docket'), (6, 'Appellate HTML attachment page'), (7, 'Internet Archive XML docket'), (8, 'Case report (iquery.pl) page'), (9, 'Claims register page'), (10, 'Zip archive of RECAP Documents'), (11, 'Email in the SES storage format'), (12, 'Docket iQuery page')], help_text='The type of object that is uploaded'), + field=models.SmallIntegerField(choices=[(1, 'HTML Docket'), (2, 'HTML attachment page'), (3, 'PDF'), (4, 'Docket history report'), (5, 'Appellate HTML docket'), (6, 'Appellate HTML attachment page'), (7, 'Internet Archive XML docket'), (8, 'Case report (iquery.pl) page'), (9, 'Claims register page'), (10, 'Zip archive of RECAP Documents'), (11, 'Email in the SES storage format'), (12, 'Case query page')], help_text='The type of object that is uploaded'), ), migrations.AlterField( model_name='processingqueue', name='upload_type', - field=models.SmallIntegerField(choices=[(1, 'HTML Docket'), (2, 'HTML attachment page'), (3, 'PDF'), (4, 'Docket history report'), (5, 'Appellate HTML docket'), (6, 'Appellate HTML attachment page'), (7, 'Internet Archive XML docket'), (8, 'Case report (iquery.pl) page'), (9, 'Claims register page'), (10, 'Zip archive of RECAP Documents'), (11, 'Email in the SES storage format'), (12, 'Docket iQuery page')], help_text='The type of object that is uploaded'), + field=models.SmallIntegerField(choices=[(1, 'HTML Docket'), (2, 'HTML attachment page'), (3, 'PDF'), (4, 'Docket history report'), (5, 'Appellate HTML docket'), (6, 'Appellate HTML attachment page'), (7, 'Internet Archive XML docket'), (8, 'Case report (iquery.pl) page'), (9, 'Claims register page'), (10, 'Zip archive of RECAP Documents'), (11, 'Email in the SES storage format'), (12, 'Case query page')], help_text='The type of object that is uploaded'), ), ] diff --git a/cl/recap/models.py b/cl/recap/models.py index 
--- a/cl/recap/models.py
+++ b/cl/recap/models.py
@@ -20,7 +20,7 @@ class UPLOAD_TYPE:
     CLAIMS_REGISTER = 9
     DOCUMENT_ZIP = 10
     SES_EMAIL = 11
-    IQUERY_PAGE = 12
+    CASE_QUERY_PAGE = 12
 
     NAMES = (
         (DOCKET, "HTML Docket"),
@@ -34,7 +34,7 @@ class UPLOAD_TYPE:
         (CLAIMS_REGISTER, "Claims register page"),
         (DOCUMENT_ZIP, "Zip archive of RECAP Documents"),
         (SES_EMAIL, "Email in the SES storage format"),
-        (IQUERY_PAGE, "Docket iQuery page"),
+        (CASE_QUERY_PAGE, "Case query page"),
     )
 
 
diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py
index e17dfacd98..4aa0d6c0d8 100644
--- a/cl/recap/tasks.py
+++ b/cl/recap/tasks.py
@@ -21,6 +21,7 @@
 from juriscraper.pacer import (
     AppellateDocketReport,
     AttachmentPage,
+    CaseQuery,
     ClaimsRegister,
     DocketHistoryReport,
     DocketReport,
@@ -118,8 +119,11 @@ def process_recap_upload(pq: ProcessingQueue) -> None:
         process_recap_claims_register.delay(pq.pk)
     elif pq.upload_type == UPLOAD_TYPE.DOCUMENT_ZIP:
         process_recap_zip.delay(pq.pk)
-    elif pq.upload_type == UPLOAD_TYPE.IQUERY_PAGE:
-        process_docket_iquery_page.delay(pq.pk)
+    elif pq.upload_type == UPLOAD_TYPE.CASE_QUERY_PAGE:
+        chain(
+            process_case_query_page.s(pq.pk),
+            add_or_update_recap_docket.s(),
+        ).apply_async()
 
 
 def do_pacer_fetch(fq: PacerFetchQueue):
@@ -828,8 +832,8 @@ def process_recap_docket_history_report(self, pk):
 @app.task(
     bind=True, max_retries=3, interval_start=5 * 60, interval_step=5 * 60
 )
-def process_docket_iquery_page(self, pk):
-    """Process the Docket iQuery page.
+def process_case_query_page(self, pk):
+    """Process the case query (iquery.pl) page.
 
     :param pk: The primary key of the processing queue item you want to work on
     :returns: A dict indicating whether the docket needs Solr re-indexing.
@@ -850,14 +854,14 @@ def process_case_query_page(self, pk):
         mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
         raise self.retry(exc=exc)
 
-    report = DocketHistoryReport(map_cl_to_pacer_id(pq.court_id))
+    report = CaseQuery(map_cl_to_pacer_id(pq.court_id))
     report._parse_text(text)
     data = report.data
     logger.info(f"Parsing completed for item {pq}")
 
     if data == {}:
         # Bad docket iquery page.
-        msg = "Not a valid docket iquery page upload."
+        msg = "Not a valid case query page upload."
         mark_pq_status(pq, msg, PROCESSING_STATUS.INVALID_CONTENT)
         self.request.chain = None
         return None
@@ -866,9 +870,16 @@ def process_case_query_page(self, pk):
     d = find_docket_object(
         pq.court_id, pq.pacer_case_id, data["docket_number"]
     )
+    content_updated = False
+    current_case_name = d.case_name
     d.add_recap_source()
     update_docket_metadata(d, data)
 
+    if current_case_name != d.case_name or not d.pk:
+        # This docket should be added to Solr or updated since it is new or
+        # the case name has changed.
+        content_updated = True
+
     if pq.debug:
         mark_pq_successful(pq, d_id=d.pk)
         self.request.chain = None
@@ -876,6 +887,7 @@ def process_case_query_page(self, pk):
 
     try:
         d.save()
+        add_bankruptcy_data_to_docket(d, data)
     except IntegrityError as exc:
         logger.warning(
             "Race condition experienced while attempting docket save."
@@ -893,17 +905,18 @@ def process_case_query_page(self, pk):
 
     # Add the HTML to the docket in case we need it someday.
     pacer_file = PacerHtmlFiles(
-        content_object=d, upload_type=UPLOAD_TYPE.IQUERY_PAGE
+        content_object=d, upload_type=UPLOAD_TYPE.CASE_QUERY_PAGE
     )
     pacer_file.filepath.save(
         # We only care about the ext w/S3PrivateUUIDStorageTest
-        "docket_iquery_page.html",
+        "case_report.html",
         ContentFile(text.encode()),
     )
+
     mark_pq_successful(pq, d_id=d.pk)
     return {
         "docket_pk": d.pk,
-        "content_updated": False,
+        "content_updated": content_updated,
     }
 
 
diff --git a/cl/recap/tests.py b/cl/recap/tests.py
index a9bee0f26d..ebe43b057d 100644
--- a/cl/recap/tests.py
+++ b/cl/recap/tests.py
@@ -287,14 +287,14 @@ def test_ensure_no_users_in_response(self, mock):
             j[bad_key]
         mock.assert_called()
 
-    def test_uploading_a_docket_iquery_page(self, mock):
+    def test_uploading_a_case_query_page(self, mock):
         """Can we upload a docket iquery page and have it be saved correctly?
 
         Note that this works fine even though we're not actually uploading a
         docket due to the mock.
         """
         self.data.update(
-            {"upload_type": UPLOAD_TYPE.IQUERY_PAGE, "document_number": ""}
+            {"upload_type": UPLOAD_TYPE.CASE_QUERY_PAGE, "document_number": ""}
         )
         del self.data["pacer_doc_id"]
         r = self.client.post(self.path, self.data)

From c166e50e73f36d77c92cc688851115875a79bcf4 Mon Sep 17 00:00:00 2001
From: Alberto Islas
Date: Wed, 23 Nov 2022 11:06:25 -0600
Subject: [PATCH 3/3] fix(recap): Only update case query report in SOLR if it is not new and contains docket entries

---
 cl/recap/tasks.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py
index 4aa0d6c0d8..b48605f56d 100644
--- a/cl/recap/tasks.py
+++ b/cl/recap/tasks.py
@@ -870,15 +870,16 @@ def process_case_query_page(self, pk):
     d = find_docket_object(
         pq.court_id, pq.pacer_case_id, data["docket_number"]
     )
-    content_updated = False
     current_case_name = d.case_name
     d.add_recap_source()
     update_docket_metadata(d, data)
 
-    if current_case_name != d.case_name or not d.pk:
-        # This docket should be added to Solr or updated since it is new or
-        # the case name has changed.
-        content_updated = True
+    # Update the docket in SOLR only if it is not new, its case name has
+    # changed, and it contains docket entries.
+    content_updated = False
+    if current_case_name != d.case_name and d.pk:
+        if d.docket_entries.exists():
+            content_updated = True
 
     if pq.debug:
         mark_pq_successful(pq, d_id=d.pk)
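A note on the rule that PATCH 3/3 settles: process_case_query_page returns {"docket_pk": ..., "content_updated": ...}, and the chained add_or_update_recap_docket task presumably uses content_updated to decide whether the docket is re-indexed in Solr. The sketch below restates that decision as a standalone, runnable function so the rule is easy to eyeball. DocketStub is a hypothetical stand-in for the Docket model (the real check is d.docket_entries.exists()), so this is an illustration under those assumptions, not code from the series.

from dataclasses import dataclass
from typing import Optional


@dataclass
class DocketStub:
    """Hypothetical stand-in for the Docket model, for this sketch only."""

    pk: Optional[int]  # None means the docket was just created
    case_name: str
    entry_count: int = 0  # stands in for docket_entries.exists()


def needs_solr_update(docket: DocketStub, case_name_before_merge: str) -> bool:
    """Mirror the content_updated rule from PATCH 3/3."""
    if not docket.pk:
        # A brand-new docket: nothing is indexed yet, so skip Solr.
        return False
    if case_name_before_merge == docket.case_name:
        # The case query page did not change the indexed case name.
        return False
    # Existing docket with a changed name: re-index only if it has entries.
    return docket.entry_count > 0


# The flag stays False for new dockets and for unchanged case names...
assert not needs_solr_update(DocketStub(pk=None, case_name="A v. B"), "A v. B")
assert not needs_solr_update(DocketStub(pk=1, case_name="A v. B", entry_count=3), "A v. B")
# ...and flips True only when a docket that already has entries is renamed.
assert needs_solr_update(DocketStub(pk=1, case_name="A v. B Corp.", entry_count=3), "A v. B")

In short: a case query upload never pushes a brand-new docket to Solr, and an existing docket is re-indexed only when the page renamed a case that already has docket entries.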