Skip to content

Commit ead3d6a

Browse files
committed
DocketReport._get_pacer_doc_id_and_seq_no() document fragility
Move comment from interior to docstring and then document the heck out of how fragile this method is.
1 parent ee0a2f7 commit ead3d6a

1 file changed

Lines changed: 31 additions & 5 deletions

File tree

juriscraper/pacer/docket_report.py

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1411,7 +1411,7 @@ def docket_entries(self):
14111411
number = de["document_number"]
14121412
if number is not None and not number.isdigit():
14131413
# Per b04951853, some courts use the word "doc"
1414-
# instead of a docket number.
1414+
# instead of a docket number. See lawb_18072.
14151415
raise ValueError(f"Unexpected document number {number}")
14161416

14171417
docket_entries.append(de)
@@ -1572,13 +1572,39 @@ def _set_metadata_values(self):
15721572

15731573
@staticmethod
15741574
def _get_pacer_doc_id_and_seq_no(cell, document_number):
1575+
"""Parse the first <a> tag that contains the document number in a text
1576+
node.
1577+
1578+
This code is somewhat fragile, as noted in commented examples below.
1579+
It would benefit from some assertions and checks.
1580+
1581+
It was written to handle the case of txnb, which combines the
1582+
second (entry number with pagecount) and third (docket entry
1583+
text) columns in its docket report."""
1584+
15751585
if not document_number:
15761586
return None, None
15771587
else:
1578-
# We find the first link having the document number as text.
1579-
# This is needed because txnb combines the second and third
1580-
# column in their docket report.
15811588
anchors = cell.xpath(".//a")
1589+
#
1590+
_ = """
1591+
# This code can go terribly wrong, resulting in things like:
1592+
[ 'pacer_doc_id': 'Dis0layReceipt.pl' ]
1593+
# which occurred when this was invoked with document_number as 'view'.
1594+
1595+
# This happened when the anchors list began:
1596+
(Pdb) tostring(anchors[0])
1597+
b'<a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a>'
1598+
1599+
# which came from this:
1600+
(Pdb) tostring(cell)
1601+
b'<td align="right"><span class="iconContainer"><a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111363917" onclick="goDLS(\'/doc1/101111363917\',\'230820\',\'26\',\'\',\'1\',\'1\',\'\',\'\',\'\');return(false);" rel="noopener noreferrer">4</a></span>&#160;</td>'
1602+
1603+
1604+
# This code was designed to deal with txsb:
1605+
<a href='/cgi-bin/show_doc.pl?caseid=322636&de_seq_num=2&dm_id=21705446&doc_num=1&pdf_header=0' id='documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0'>1</a><script>DocLink('documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0');</script>
1606+
1607+
""" # noqa
15821608
if len(anchors) == 0:
15831609
# Docket entry exists, but cannot download document (it's
15841610
# sealed, a minute entry, or otherwise unavailable in PACER).
@@ -1633,7 +1659,7 @@ def _get_document_number(self, cell):
16331659
16341660
<span class="iconContainer"><a href="/cgi-bin/DisplayReceipt.pl?230820,8"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101011362785" onclick="goDLS('/doc1/101011362785','230820','8','','1','1','','','');return(false);">1</a></span>&nbsp;
16351661
1636-
# Or, if the RECAP extension has modified the ODM:
1662+
# Or, if the RECAP extension has modified the DOM:
16371663
16381664
<span class="iconContainer"><a href="/cgi-bin/DisplayReceipt.pl?230877,20"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111365370" onclick="goDLS('/doc1/101111365370','230877','20','','1','1','','','');return(false);">3</a><a class="recap-inline" title="Available for free from the RECAP Archive." href="https://storage.courtlistener.com/recap/gov.uscourts.mnd.230877/gov.uscourts.mnd.230877.3.0.pdf"><img src="moz-extension://c2baafb3-8fb5-4cba-a00c-0947a021a9bf/assets/images/icon-16.png"></a></span>
16391665

0 commit comments

Comments
 (0)