@@ -1411,7 +1411,7 @@ def docket_entries(self):
14111411 number = de ["document_number" ]
14121412 if number is not None and not number .isdigit ():
14131413 # Per b04951853, some courts use the word "doc"
1414- # instead of a docket number.
1414+ # instead of a docket number. See lawb_18072.
14151415 raise ValueError (f"Unexpected document number { number } " )
14161416
14171417 docket_entries .append (de )
@@ -1572,13 +1572,39 @@ def _set_metadata_values(self):
15721572
15731573 @staticmethod
15741574 def _get_pacer_doc_id_and_seq_no (cell , document_number ):
1575+ """Parse the first <a> tag that contains the document number in a text
1576+ node.
1577+
1578+ This code is somewhat fragile, as noted in commented examples below.
1579+ It would benefit from some assertions and checks.
1580+
1581+ It was written to handle the case of txnb, which combines the
1582+ second (entry number with pagecount) and third (docket entry
1583+ text) columns in its docket report."""
1584+
15751585 if not document_number :
15761586 return None , None
15771587 else :
1578- # We find the first link having the document number as text.
1579- # This is needed because txnb combines the second and third
1580- # column in their docket report.
15811588 anchors = cell .xpath (".//a" )
1589+ #
1590+ _ = """
1591+ # This code can go terribly wrong, resulting in things like:
1592+ [ 'pacer_doc_id': 'Dis0layReceipt.pl' ]
1593+ # which occurred when this was invoked with document_number as 'view'.
1594+
1595+ # This happened when the anchors list began:
1596+ (Pdb) tostring(anchors[0])
1597+ b'<a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a>'
1598+
1599+ # which came from this:
1600+ (Pdb) tostring(cell)
1601+ b'<td align="right"><span class="iconContainer"><a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111363917" onclick="goDLS(\' /doc1/101111363917\' ,\' 230820\' ,\' 26\' ,\' \' ,\' 1\' ,\' 1\' ,\' \' ,\' \' ,\' \' );return(false);" rel="noopener noreferrer">4</a></span> </td>'
1602+
1603+
1604+ # This code was designed to deal with txsb:
1605+ <a href='/cgi-bin/show_doc.pl?caseid=322636&de_seq_num=2&dm_id=21705446&doc_num=1&pdf_header=0' id='documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0'>1</a><script>DocLink('documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0');</script>
1606+
1607+ """ # noqa
15821608 if len (anchors ) == 0 :
15831609 # Docket entry exists, but cannot download document (it's
15841610 # sealed, a minute entry, or otherwise unavailable in PACER).
@@ -1633,7 +1659,7 @@ def _get_document_number(self, cell):
16331659
16341660<span class="iconContainer"><a href="/cgi-bin/DisplayReceipt.pl?230820,8"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101011362785" onclick="goDLS('/doc1/101011362785','230820','8','','1','1','','','');return(false);">1</a></span>
16351661
1636- # Or, if the RECAP extension has modified the ODM :
1662+ # Or, if the RECAP extension has modified the DOM :
16371663
16381664<span class="iconContainer"><a href="/cgi-bin/DisplayReceipt.pl?230877,20"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111365370" onclick="goDLS('/doc1/101111365370','230877','20','','1','1','','','');return(false);">3</a><a class="recap-inline" title="Available for free from the RECAP Archive." href="https://storage.courtlistener.com/recap/gov.uscourts.mnd.230877/gov.uscourts.mnd.230877.3.0.pdf"><img src="moz-extension://c2baafb3-8fb5-4cba-a00c-0947a021a9bf/assets/images/icon-16.png"></a></span>
16391665
0 commit comments