freelawproject · johnhawkinson · Jan 28, 2026 · Jan 28, 2026 · Jan 27, 2026 · Jan 28, 2026
diff --git a/juriscraper/pacer/docket_report.py b/juriscraper/pacer/docket_report.py
@@ -45,6 +45,8 @@ class BaseDocketReport:
     # (U+2013), and ASCII slashes.
     DATE_REGEX = r"[—\d\-–/]+"
 
+    WHITESPACE_WITH_NBSP = r"[\s\u00a0]"  # Whitespace including nbsp
+
     date_entered_regex = re.compile(r"Entered:\s+(%s)" % DATE_REGEX)
     date_terminated_regex = re.compile(
         r"[tT]erminated:\s+(%s)" % DATE_REGEX, flags=re.IGNORECASE
@@ -1408,9 +1410,10 @@ def docket_entries(self):
 
             number = de["document_number"]
             if number is not None and not number.isdigit():
-                # Some courts use the word "doc" instead of a docket number. We
-                # skip these for now.
-                continue
+                # Per b04951853, Some courts use the word "doc"
+                # instead of a docket number. See lawb_18072.
+                raise ValueError(f"Unexpected document number {number}")
+
             docket_entries.append(de)
 
         docket_entries = clean_court_object(docket_entries)
@@ -1569,13 +1572,39 @@ def _set_metadata_values(self):
 
     @staticmethod
     def _get_pacer_doc_id_and_seq_no(cell, document_number):
+        """Parse the first <a> tag that contains the document number in a text
+        node.
+
+        This code is somewhat fragile, as noted in commented examples below.
+        It would benefit from some assertions and checks.
+
+        It was written to handle the case of txnb, which combines the
+        second (entry number with pagecount) and third (docket entry
+        text) columns in its docket report."""
+
         if not document_number:
             return None, None
         else:
-            # We find the first link having the document number as text.
-            # This is needed because txnb combines the second and third
-            # column in their docket report.
             anchors = cell.xpath(".//a")
+            #
+            _ = """
+            # This code can go terribly wrong, resulting in things like:
+[ 'pacer_doc_id': 'Dis0layReceipt.pl' ]
+            # which occurred when this was invoked with document_number as 'view'.
+
+            # This happened when the anchors list began:
+(Pdb) tostring(anchors[0])
+b'<a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a>'
+
+            # which came from this:
+(Pdb) tostring(cell)
+b'<td align="right"><span class="iconContainer"><a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111363917" onclick="goDLS(\'/doc1/101111363917\',\'230820\',\'26\',\'\',\'1\',\'1\',\'\',\'\',\'\');return(false);" rel="noopener noreferrer">4</a></span>&#160;</td>'
+
+
+            # This code was designed to deal with txsb:
+<a href='/cgi-bin/show_doc.pl?caseid=322636&de_seq_num=2&dm_id=21705446&doc_num=1&pdf_header=0' id='documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0'>1</a><script>DocLink('documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0');</script>
+
+"""  # noqa
-            #
-            _ = """
-            # This code can go terribly wrong, resulting in things like:
-[ 'pacer_doc_id': 'Dis0layReceipt.pl' ]
-            # which occurred when this was invoked with document_number as 'view'.
-
-            # This happened when the anchors list began:
-(Pdb) tostring(anchors[0])
-b'<a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a>'
-
-            # which came from this:
-(Pdb) tostring(cell)
-b'<td align="right"><span class="iconContainer"><a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111363917" onclick="goDLS(\'/doc1/101111363917\',\'230820\',\'26\',\'\',\'1\',\'1\',\'\',\'\',\'\');return(false);" rel="noopener noreferrer">4</a></span>&#160;</td>'
-
-
-            # This code was designed to deal with txsb:
-<a href='/cgi-bin/show_doc.pl?caseid=322636&de_seq_num=2&dm_id=21705446&doc_num=1&pdf_header=0' id='documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0'>1</a><script>DocLink('documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0');</script>
-
-"""  # noqa
+            """
+            # This code can go terribly wrong, resulting in things like:
+[ 'pacer_doc_id': 'Dis0layReceipt.pl' ]
+            # which occurred when this was invoked with document_number as 'view'.
+
+            # This happened when the anchors list began:
+(Pdb) tostring(anchors[0])
+b'<a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a>'
+
+            # which came from this:
+(Pdb) tostring(cell)
+b'<td align="right"><span class="iconContainer"><a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111363917" onclick="goDLS(\'/doc1/101111363917\',\'230820\',\'26\',\'\',\'1\',\'1\',\'\',\'\',\'\');return(false);" rel="noopener noreferrer">4</a></span>&#160;</td>'
+
+
+            # This code was designed to deal with txsb:
+<a href='/cgi-bin/show_doc.pl?caseid=322636&de_seq_num=2&dm_id=21705446&doc_num=1&pdf_header=0' id='documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0'>1</a><script>DocLink('documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0');</script>
+            """
-            #
-            _ = """
-            # This code can go terribly wrong, resulting in things like:
-[ 'pacer_doc_id': 'Dis0layReceipt.pl' ]
-            # which occurred when this was invoked with document_number as 'view'.
-
-            # This happened when the anchors list began:
-(Pdb) tostring(anchors[0])
-b'<a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a>'
-
-            # which came from this:
-(Pdb) tostring(cell)
-b'<td align="right"><span class="iconContainer"><a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111363917" onclick="goDLS(\'/doc1/101111363917\',\'230820\',\'26\',\'\',\'1\',\'1\',\'\',\'\',\'\');return(false);" rel="noopener noreferrer">4</a></span>&#160;</td>'
-
-
-            # This code was designed to deal with txsb:
-<a href='/cgi-bin/show_doc.pl?caseid=322636&de_seq_num=2&dm_id=21705446&doc_num=1&pdf_header=0' id='documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0'>1</a><script>DocLink('documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0');</script>
-
-"""  # noqa
+            """
+            # This code can go terribly wrong, resulting in things like:
+[ 'pacer_doc_id': 'Dis0layReceipt.pl' ]
+            # which occurred when this was invoked with document_number as 'view'.
+
+            # This happened when the anchors list began:
+(Pdb) tostring(anchors[0])
+b'<a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a>'
+
+            # which came from this:
+(Pdb) tostring(cell)
+b'<td align="right"><span class="iconContainer"><a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111363917" onclick="goDLS(\'/doc1/101111363917\',\'230820\',\'26\',\'\',\'1\',\'1\',\'\',\'\',\'\');return(false);" rel="noopener noreferrer">4</a></span>&#160;</td>'
+
+
+            # This code was designed to deal with txsb:
+<a href='/cgi-bin/show_doc.pl?caseid=322636&de_seq_num=2&dm_id=21705446&doc_num=1&pdf_header=0' id='documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0'>1</a><script>DocLink('documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0');</script>
+            """
             if len(anchors) == 0:
                 # Docket entry exists, but cannot download document (it's
                 # sealed, a minute entry, or otherwise unavailable in PACER).
@@ -1593,23 +1622,93 @@ def _get_pacer_doc_id_and_seq_no(cell, document_number):
     def _get_document_number(self, cell):
         """Get the document number.
 
-        Some jurisdictions have the number as, "13 (5 pgs)" so some processing
-        is needed. See flsb, 09-02199-JKO.
+        CM/ECF can add things to this cell that are not numbers, in a
+        variety of ways:
+
+        . For filing users, the docket report has a "Links to Notices
+        of Electronic Filing" checkbox, which is on by default in some
+        districts. This produces an <a> linking to DisplayReceipt.pl,
+        which in CSS is rendered as a "silver ball" icon, and has a
+        text node of the value "view".
+
+        . For court users, a gavel icon is displayed for motions that
+        have not been resolved. We are unlikely to see this.
+
+        . Some jurisdictions have additional information, such as
+        "13 (5 pgs)". See flsb, 09-02199-JKO.
+
+        . Some jurisdictions, e.g. lawb, can have unnumbered PDF
+        attachments that appear as merely "doc" instead of a number.
+        We can't represent such non-unique non-numeric docket entry
+        numbers in our schema, so throw away the so-called "number"
+        ("doc"), and also throw away the doc1 link to the PDF.
         """
+
+        _ = """
+        # Examples of the HTML.
+        # A normal case, nysd:
+<a href="https://ecf.nysd.uscourts.gov/doc1/1270456659" onClick="goDLS('/doc1/1270456659','39589','2','','1','1','','');return(false);">1</a>&nbsp;
+
+        # An unnumbered entry, also nysd:
+1&nbsp;
+
+        # The flsb case, with pagecount parenthesis:
+<a href='/doc1/050010759404' onContextMenu='this.href="https://ecf.flsb.uscourts.gov/doc1/050010759404"'>13</a> <br><nobr>(5&nbsp;pgs)</nobr></nobr>
+
+        # With silver bells, from mnd:
+
+<span class="iconContainer"><a href="/cgi-bin/DisplayReceipt.pl?230820,8"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101011362785" onclick="goDLS('/doc1/101011362785','230820','8','','1','1','','','');return(false);">1</a></span>&nbsp;
+
+        # Or, if the RECAP extension has modified the DOM:
+
+<span class="iconContainer"><a href="/cgi-bin/DisplayReceipt.pl?230877,20"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111365370" onclick="goDLS('/doc1/101111365370','230877','20','','1','1','','','');return(false);">3</a><a class="recap-inline" title="Available for free from the RECAP Archive." href="https://storage.courtlistener.com/recap/gov.uscourts.mnd.230877/gov.uscourts.mnd.230877.3.0.pdf"><img src="moz-extension://c2baafb3-8fb5-4cba-a00c-0947a021a9bf/assets/images/icon-16.png"></a></span>
+
+	# Or, in lawb, unnumbered document entries can have PDF
+	# document attachments.
+
+<a href="/doc1/0880393551" oncontextmenu="this.href=&quot;https://ecf.lawb.uscourts.gov/doc1/0880393551&quot;">doc</a>
+
+"""  # noqa
+
+        # Possible approaches to silver bell complexity:
+        # 1.   Look for the /doc1 href and take its contents
+        # 2.   Ignore the /cgi-bin/DisplayReceipt href and its contents
+        # 2(a) Ignore all /cgi-bin links hrefs?
+        # 3.   Ignore the <span class="receiptLink">
+        # 4.   Ignore "view" text nodes.
+        #
+        # Given that we use self._br_split() which only returns text nodes,
+        # only #4 is easy to do as the others require HTML tag processing.
+        # So we go with #4.
+
         words = [
             word for phrase in self._br_split(cell) for word in phrase.split()
         ]
+
+        # XXX: an unfortunately consequence of removing the word
+        # "doc" from the set of possible docket entry numbers is that
+        # we fail to capture the PDF attachment to this docket entry.
+        # But better that than not capturing the docket text at all.
+        for _ in ["view", "doc"]:
+            try:
+                words.remove(_)
+            except ValueError:
+                pass
+
         if words:
-            first_word = re.sub("[\\s\u00a0]", "", words[0])
-            if self.court_id == "txnb":
+            first_word = re.sub(self.WHITESPACE_WITH_NBSP, "", words[0])
+            if self.court_id != "txnb":
+                return first_word
+            else:
                 # txnb merges the second and third columns, so if the first
                 # word is a number, return it. Otherwise, assume doc number
                 # isn't listed for the item.
                 if first_word.isdigit():
                     return first_word
-            else:
-                return first_word
-        return None
+                else:
+                    return None
+        else:
+            return None
 
     def _get_description(self, cells):
         if self.court_id != "txnb":
@@ -1620,7 +1719,7 @@ def _get_description(self, cells):
         # combined. The field can have one of four formats. Attempt the most
         # detailed first, then work our way down to just giving up and
         # capturing it all.
-        ws = "[\\s\u00a0]"  # Whitespace including nbsp
+        ws = self.WHITESPACE_WITH_NBSP
         regexes = [
             # 2 (23 pgs; 4 docs) Blab blah (happens when attachments exist and
             # page numbers are on)

diff --git a/tests/examples/pacer/dockets/bankruptcy/lawb_18072.json b/tests/examples/pacer/dockets/bankruptcy/lawb_18072.json
@@ -41,6 +41,14 @@
       "pacer_doc_id": null,
       "pacer_seq_no": null
     },
+    {
+      "date_entered": "2001-03-01",
+      "date_filed": "2001-03-01",
+      "description": "Courts BNC Certificate of Service RE: [0-0] First Meeting . Notices sent: 16 (bnc) (Entered: 03/01/2001)",
+      "document_number": null,
+      "pacer_doc_id": null,
+      "pacer_seq_no": null
+    },
     {
       "date_entered": "2001-03-26",
       "date_filed": "2001-03-26",
@@ -89,6 +97,14 @@
       "pacer_doc_id": "0880432493",
       "pacer_seq_no": null
     },
+    {
+      "date_entered": "2001-07-13",
+      "date_filed": "2001-07-13",
+      "description": "Courts BNC Certificate of Service RE: [9-1] Discharge Order . Notices sent: 16 (bnc) (Entered: 07/13/2001)",
+      "document_number": null,
+      "pacer_doc_id": null,
+      "pacer_seq_no": null
+    },
     {
       "date_entered": "2001-07-16",
       "date_filed": "2001-07-16",
@@ -177,4 +193,4 @@
     }
   ],
   "referred_to_str": ""
-}
+}