Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 113 additions & 14 deletions juriscraper/pacer/docket_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ class BaseDocketReport:
# (U+2013), and ASCII slashes.
DATE_REGEX = r"[—\d\-–/]+"

WHITESPACE_WITH_NBSP = r"[\s\u00a0]" # Whitespace including nbsp

date_entered_regex = re.compile(r"Entered:\s+(%s)" % DATE_REGEX)
date_terminated_regex = re.compile(
r"[tT]erminated:\s+(%s)" % DATE_REGEX, flags=re.IGNORECASE
Expand Down Expand Up @@ -1408,9 +1410,10 @@ def docket_entries(self):

number = de["document_number"]
if number is not None and not number.isdigit():
# Some courts use the word "doc" instead of a docket number. We
# skip these for now.
continue
# Per b04951853, some courts use the word "doc"
# instead of a docket number. See lawb_18072.
raise ValueError(f"Unexpected document number {number}")
Comment thread
mlissner marked this conversation as resolved.

docket_entries.append(de)

docket_entries = clean_court_object(docket_entries)
Expand Down Expand Up @@ -1569,13 +1572,39 @@ def _set_metadata_values(self):

@staticmethod
def _get_pacer_doc_id_and_seq_no(cell, document_number):
"""Parse the first <a> tag that contains the document number in a text
node.

This code is somewhat fragile, as noted in commented examples below.
It would benefit from some assertions and checks.

It was written to handle the case of txnb, which combines the
second (entry number with pagecount) and third (docket entry
text) columns in its docket report."""

if not document_number:
return None, None
else:
# We find the first link having the document number as text.
# This is needed because txnb combines the second and third
# column in their docket report.
anchors = cell.xpath(".//a")
#
_ = """
# This code can go terribly wrong, resulting in things like:
[ 'pacer_doc_id': 'Dis0layReceipt.pl' ]
# which occurred when this was invoked with document_number as 'view'.

# This happened when the anchors list began:
(Pdb) tostring(anchors[0])
b'<a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a>'

# which came from this:
(Pdb) tostring(cell)
b'<td align="right"><span class="iconContainer"><a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111363917" onclick="goDLS(\'/doc1/101111363917\',\'230820\',\'26\',\'\',\'1\',\'1\',\'\',\'\',\'\');return(false);" rel="noopener noreferrer">4</a></span>&#160;</td>'


# This code was designed to deal with txsb:
<a href='/cgi-bin/show_doc.pl?caseid=322636&de_seq_num=2&dm_id=21705446&doc_num=1&pdf_header=0' id='documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0'>1</a><script>DocLink('documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0');</script>

""" # noqa
Comment on lines +1589 to +1607
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't seen this style before.

Suggested change
#
_ = """
# This code can go terribly wrong, resulting in things like:
[ 'pacer_doc_id': 'Dis0layReceipt.pl' ]
# which occurred when this was invoked with document_number as 'view'.
# This happened when the anchors list began:
(Pdb) tostring(anchors[0])
b'<a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a>'
# which came from this:
(Pdb) tostring(cell)
b'<td align="right"><span class="iconContainer"><a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111363917" onclick="goDLS(\'/doc1/101111363917\',\'230820\',\'26\',\'\',\'1\',\'1\',\'\',\'\',\'\');return(false);" rel="noopener noreferrer">4</a></span>&#160;</td>'
# This code was designed to deal with txsb:
<a href='/cgi-bin/show_doc.pl?caseid=322636&de_seq_num=2&dm_id=21705446&doc_num=1&pdf_header=0' id='documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0'>1</a><script>DocLink('documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0');</script>
""" # noqa
"""
# This code can go terribly wrong, resulting in things like:
[ 'pacer_doc_id': 'Dis0layReceipt.pl' ]
# which occurred when this was invoked with document_number as 'view'.
# This happened when the anchors list began:
(Pdb) tostring(anchors[0])
b'<a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a>'
# which came from this:
(Pdb) tostring(cell)
b'<td align="right"><span class="iconContainer"><a href="https://ecf.mnd.uscourts.gov/cgi-bin/DisplayReceipt.pl?230820,26" rel="noopener noreferrer"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111363917" onclick="goDLS(\'/doc1/101111363917\',\'230820\',\'26\',\'\',\'1\',\'1\',\'\',\'\',\'\');return(false);" rel="noopener noreferrer">4</a></span>&#160;</td>'
# This code was designed to deal with txsb:
<a href='/cgi-bin/show_doc.pl?caseid=322636&de_seq_num=2&dm_id=21705446&doc_num=1&pdf_header=0' id='documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0'>1</a><script>DocLink('documentKcaseidV322636Kde_seq_numV2Kdm_idV21705446Kdoc_numV1Kpdf_headerV0');</script>
"""

What's this approach for? Can you fix it here and elsewhere?

Copy link
Copy Markdown
Contributor Author

@johnhawkinson johnhawkinson Jan 29, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The use of the triple-quoted strings as comments is for lengthy comments that exceed the linter's line-length constraints.
The _ = """ assignment is unnecessary, I guess it's just a habit I can't explain.
But the # noqa is necessary to pacify linters like flake8.
Without it:

(juriscraper) jhawk@lrr juriscraper % flake8 juriscraper/pacer/docket_report.py > /tmp/d1
# (manual edit to remove "noqa" from line 1671)
(juriscraper) jhawk@lrr juriscraper % flake8 juriscraper/pacer/docket_report.py > /tmp/d2
(juriscraper) jhawk@lrr juriscraper % diff -u /tmp/d[12]
--- /tmp/d1	2026-01-28 22:22:39.156195760 -0500
+++ /tmp/d2	2026-01-28 22:22:54.287570031 -0500
@@ -14,3 +14,12 @@
 juriscraper/pacer/docket_report.py:1359:80: E501 line too long (80 > 79 characters)
 juriscraper/pacer/docket_report.py:1390:80: E501 line too long (90 > 79 characters)
 juriscraper/pacer/docket_report.py:1391:80: E501 line too long (87 > 79 characters)
+juriscraper/pacer/docket_report.py:1650:80: E501 line too long (147 > 79 characters)
+juriscraper/pacer/docket_report.py:1656:80: E501 line too long (148 > 79 characters)
+juriscraper/pacer/docket_report.py:1660:80: E501 line too long (276 > 79 characters)
+juriscraper/pacer/docket_report.py:1664:80: E501 line too long (544 > 79 characters)
+juriscraper/pacer/docket_report.py:1666:1: W191 indentation contains tabs
+juriscraper/pacer/docket_report.py:1666:1: E101 indentation contains mixed spaces and tabs
+juriscraper/pacer/docket_report.py:1667:1: W191 indentation contains tabs
+juriscraper/pacer/docket_report.py:1667:1: E101 indentation contains mixed spaces and tabs
+juriscraper/pacer/docket_report.py:1669:80: E501 line too long (118 > 79 characters)

I realize you're now using "Ruff" (which confusingly doesn't seem to do proper linting in my dev environment and I'm not sure why), but I would like my code to pass flake8.

(Although the spaces/tabs probably should be fixed)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool. Let's just drop the _ = """ business, please.

if len(anchors) == 0:
# Docket entry exists, but cannot download document (it's
# sealed, a minute entry, or otherwise unavailable in PACER).
Expand All @@ -1593,23 +1622,93 @@ def _get_pacer_doc_id_and_seq_no(cell, document_number):
def _get_document_number(self, cell):
"""Get the document number.

Some jurisdictions have the number as, "13 (5 pgs)" so some processing
is needed. See flsb, 09-02199-JKO.
CM/ECF can add things to this cell that are not numbers, in a
variety of ways:

. For filing users, the docket report has a "Links to Notices
of Electronic Filing" checkbox, which is on by default in some
districts. This produces an <a> linking to DisplayReceipt.pl,
which in CSS is rendered as a "silver ball" icon, and has a
text node of the value "view".

. For court users, a gavel icon is displayed for motions that
have not been resolved. We are unlikely to see this.

. Some jurisdictions have additional information, such as
"13 (5 pgs)". See flsb, 09-02199-JKO.

. Some jurisdictions, e.g. lawb, can have unnumbered PDF
attachments that appear as merely "doc" instead of a number.
We can't represent such non-unique non-numeric docket entry
numbers in our schema, so throw away the so-called "number"
("doc"), and also throw away the doc1 link to the PDF.
"""

_ = """
# Examples of the HTML.
# A normal case, nysd:
<a href="https://ecf.nysd.uscourts.gov/doc1/1270456659" onClick="goDLS('/doc1/1270456659','39589','2','','1','1','','');return(false);">1</a>&nbsp;

# An unnumbered entry, also nysd:
1&nbsp;

# The flsb case, with pagecount parenthesis:
<a href='/doc1/050010759404' onContextMenu='this.href="https://ecf.flsb.uscourts.gov/doc1/050010759404"'>13</a> <br><nobr>(5&nbsp;pgs)</nobr></nobr>

# With silver balls, from mnd:

<span class="iconContainer"><a href="/cgi-bin/DisplayReceipt.pl?230820,8"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101011362785" onclick="goDLS('/doc1/101011362785','230820','8','','1','1','','','');return(false);">1</a></span>&nbsp;

# Or, if the RECAP extension has modified the DOM:

<span class="iconContainer"><a href="/cgi-bin/DisplayReceipt.pl?230877,20"><span class="receiptLink">view</span></a><a href="https://ecf.mnd.uscourts.gov/doc1/101111365370" onclick="goDLS('/doc1/101111365370','230877','20','','1','1','','','');return(false);">3</a><a class="recap-inline" title="Available for free from the RECAP Archive." href="https://storage.courtlistener.com/recap/gov.uscourts.mnd.230877/gov.uscourts.mnd.230877.3.0.pdf"><img src="moz-extension://c2baafb3-8fb5-4cba-a00c-0947a021a9bf/assets/images/icon-16.png"></a></span>

# Or, in lawb, unnumbered document entries can have PDF
# document attachments.

<a href="/doc1/0880393551" oncontextmenu="this.href=&quot;https://ecf.lawb.uscourts.gov/doc1/0880393551&quot;">doc</a>

""" # noqa

# Possible approaches to silver ball complexity:
# 1. Look for the /doc1 href and take its contents
# 2. Ignore the /cgi-bin/DisplayReceipt href and its contents
# 2(a) Ignore all /cgi-bin links hrefs?
# 3. Ignore the <span class="receiptLink">
# 4. Ignore "view" text nodes.
#
# Given that we use self._br_split() which only returns text nodes,
# only #4 is easy to do as the others require HTML tag processing.
# So we go with #4.

words = [
word for phrase in self._br_split(cell) for word in phrase.split()
]

# XXX: an unfortunate consequence of removing the word
# "doc" from the set of possible docket entry numbers is that
# we fail to capture the PDF attachment to this docket entry.
# But better that than not capturing the docket text at all.
for _ in ["view", "doc"]:
try:
words.remove(_)
except ValueError:
pass

if words:
first_word = re.sub("[\\s\u00a0]", "", words[0])
if self.court_id == "txnb":
first_word = re.sub(self.WHITESPACE_WITH_NBSP, "", words[0])
if self.court_id != "txnb":
return first_word
else:
# txnb merges the second and third columns, so if the first
# word is a number, return it. Otherwise, assume doc number
# isn't listed for the item.
if first_word.isdigit():
return first_word
else:
return first_word
return None
else:
return None
else:
return None

def _get_description(self, cells):
if self.court_id != "txnb":
Expand All @@ -1620,7 +1719,7 @@ def _get_description(self, cells):
# combined. The field can have one of four formats. Attempt the most
# detailed first, then work our way down to just giving up and
# capturing it all.
ws = "[\\s\u00a0]" # Whitespace including nbsp
ws = self.WHITESPACE_WITH_NBSP
regexes = [
# 2 (23 pgs; 4 docs) Blab blah (happens when attachments exist and
# page numbers are on)
Expand Down
18 changes: 17 additions & 1 deletion tests/examples/pacer/dockets/bankruptcy/lawb_18072.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,14 @@
"pacer_doc_id": null,
"pacer_seq_no": null
},
{
"date_entered": "2001-03-01",
"date_filed": "2001-03-01",
"description": "Courts BNC Certificate of Service RE: [0-0] First Meeting . Notices sent: 16 (bnc) (Entered: 03/01/2001)",
"document_number": null,
"pacer_doc_id": null,
"pacer_seq_no": null
},
{
"date_entered": "2001-03-26",
"date_filed": "2001-03-26",
Expand Down Expand Up @@ -89,6 +97,14 @@
"pacer_doc_id": "0880432493",
"pacer_seq_no": null
},
{
"date_entered": "2001-07-13",
"date_filed": "2001-07-13",
"description": "Courts BNC Certificate of Service RE: [9-1] Discharge Order . Notices sent: 16 (bnc) (Entered: 07/13/2001)",
"document_number": null,
"pacer_doc_id": null,
"pacer_seq_no": null
},
{
"date_entered": "2001-07-16",
"date_filed": "2001-07-16",
Expand Down Expand Up @@ -177,4 +193,4 @@
}
],
"referred_to_str": ""
}
}
Loading