Skip to content

Commit eadca5f

Browse files
authored
Merge pull request #1299 from giancohs/1296-implement-2nd-circuit-lousiana
feat(lactapp_2): new scraper for Lousiana Court of Appeals Second Circuit
2 parents 922d291 + 8d42470 commit eadca5f

6 files changed

Lines changed: 8221 additions & 0 deletions

File tree

CHANGES.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,13 @@ words, they're the ones you'll want to watch, and the others are mostly noise.
1111
Releases are also tagged in git, if that's helpful.
1212

1313
## Coming up
14+
15+
- New scraper `lactapp_2` for Louisiana Court of Appeal, Second Circuit
16+
- Fix `me`: update Maine scraper and add backscraper
17+
- Update `sd` backscraper and extract from text
1418
- Fix `bia` scraper and add extract from text test cases
1519

20+
1621
## Current
1722

1823
**2.6.66 - 2025-04-29**

juriscraper/opinions/united_states/state/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
"kyctapp",
6262
"la",
6363
"lactapp_1",
64+
"lactapp_2",
6465
"lactapp_5",
6566
"mass",
6667
"massappct",
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
"""
2+
Scraper for the Louisiana Second Circuit Court of Appeal
3+
CourtID: lactapp_2
4+
Court Short Name: La. Ct. App. 2d Cir
5+
Author: Gianfranco Huaman
6+
History:
7+
- 2025-01-11, giancohs: created
8+
"""
9+
10+
import re
11+
from datetime import datetime
12+
from urllib.parse import urlencode, urljoin
13+
14+
from juriscraper.AbstractSite import logger
15+
from juriscraper.lib.html_utils import (
16+
get_row_column_links,
17+
get_row_column_text,
18+
)
19+
from juriscraper.lib.judge_parsers import normalize_judge_string
20+
from juriscraper.OpinionSiteLinear import OpinionSiteLinear
21+
22+
23+
class Site(OpinionSiteLinear):
24+
def __init__(self, *args, **kwargs):
    """Configure the scraper to request the current year's opinions.

    Besides the request URL, this stores the date used as the default
    backscrape start and builds the backscrape iterable from the keyword
    arguments received.
    """
    super().__init__(*args, **kwargs)
    self.court_id = self.__module__
    self.base_url = "https://www.la2nd.org/opinions/"

    # The listing is selected by year through the `opinion_year` query
    # parameter
    self.year = datetime.now().year
    query_string = urlencode({"opinion_year": self.year})
    self.url = urljoin(self.base_url, f"?{query_string}")

    # Fallback lower bound when no `backscrape_start` is passed
    self.first_opinion_date = datetime(2019, 7, 17).date()
    self.is_backscrape = False
    self.make_backscrape_iterable(kwargs)
34+
35+
def _process_html(self):
    """Turn each row of the opinions table into an entry of `self.cases`

    Columns read from every row: 1 date, 2 docket, 3 case name, 4 author,
    5 disposition, 7 publication status, 8 link to the opinion.
    When backscraping, rows dated outside the requested range are skipped.
    """
    for tr in self.html.xpath('//table[@id="datatable"]/tbody/tr'):
        judge = normalize_judge_string(get_row_column_text(tr, 4))[0]
        # Drop a trailing " J." left on the normalized author name
        if judge.endswith(" J."):
            judge = judge[:-3]

        if "Published" in get_row_column_text(tr, 7):
            status = "Published"
        else:
            status = "Unpublished"

        date_str = get_row_column_text(tr, 1)
        case_date = datetime.strptime(date_str, "%m/%d/%Y").date()

        # The page lists a whole year, so range filtering happens here
        out_of_range = (
            self.is_backscrape
            and not self.date_is_in_backscrape_range(case_date)
        )
        if out_of_range:
            continue

        case = {
            "date": date_str,
            "docket": get_row_column_text(tr, 2),
            "name": get_row_column_text(tr, 3),
            "author": judge,
            "disposition": get_row_column_text(tr, 5),
            "url": get_row_column_links(tr, 8),
            "status": status,
        }
        self.cases.append(case)
68+
69+
def make_backscrape_iterable(self, kwargs):
    """Build `self.back_scrape_iterable` from the caller's arguments

    Reads the optional `backscrape_start` and `backscrape_end` values
    (strings in `%Y/%m/%d` format). A missing start defaults to
    `self.first_opinion_date`; a missing end defaults to today.

    Louisiana's opinions page returns all opinions for a single year
    (pagination is not needed) and `_download_backwards` requests only the
    year of the start date it receives. The range is therefore split into
    one (start, end) tuple per calendar year, so that ranges spanning
    several years are fully covered. Opinions outside each tuple's dates
    are filtered out later, while processing the rows.

    :param kwargs: keyword arguments passed to the Site constructor
    :return None
    """
    start = kwargs.get("backscrape_start")
    end = kwargs.get("backscrape_end")

    start = (
        datetime.strptime(start, "%Y/%m/%d").date()
        if start
        else self.first_opinion_date
    )
    end = (
        datetime.strptime(end, "%Y/%m/%d").date()
        if end
        else datetime.now().date()
    )

    # Clamp every yearly chunk to the requested bounds; a range contained
    # in a single year yields exactly [(start, end)]
    self.back_scrape_iterable = [
        (
            max(start, datetime(year, 1, 1).date()),
            min(end, datetime(year, 12, 31).date()),
        )
        for year in range(start.year, end.year + 1)
    ]
91+
92+
def _download_backwards(self, dates):
    """Download and process the listing for one backscrape range

    Only the page for the start date's year is requested; rows outside
    the range are discarded while processing the HTML.

    :param dates: (start_date, end_date) tuple
    :return None
    """
    range_start, range_end = dates
    self.start_date = range_start
    self.end_date = range_end
    self.is_backscrape = True
    logger.info(
        "Backscraping for range %s %s", self.start_date, self.end_date
    )

    # Point the request at the year of the range start
    self.year = range_start.year
    query_string = urlencode({"opinion_year": self.year})
    self.url = urljoin(self.base_url, f"?{query_string}")

    self.html = self._download()
    self._process_html()
109+
110+
def date_is_in_backscrape_range(self, case_date):
    """Tell whether an opinion date belongs to the backscrape range

    Both ends of the range (`self.start_date`, `self.end_date`) are
    inclusive.

    :param case_date: date of the opinion, compared against the range
    :return: True if the date is in the backscrape range
    """
    not_before_start = self.start_date <= case_date
    return not_before_start and case_date <= self.end_date
118+
119+
def extract_from_text(self, scraped_text):
    """Extract the following values from the opinion's pdf text
    - appeal_from_str
    - judges

    :param scraped_text: The text content of the pdf
    :return: Dictionary with a "Docket" key (holding `appeal_from_str`
        when the lower court was found, otherwise empty) and, when the
        panel line was found, an "OpinionCluster" key holding `judges`
        as a "; " separated string
    """
    metadata = {"Docket": {}}

    appeal_from_match = re.search(
        r"Appealed from the\s*(.*?\s*),\s*Louisiana",
        scraped_text,
        re.DOTALL,
    )
    if appeal_from_match:
        # The court name may wrap across lines; `\s+` also matches the
        # newlines, so a single substitution collapses everything
        metadata["Docket"]["appeal_from_str"] = re.sub(
            r"\s+", " ", appeal_from_match.group(1)
        ).strip()

    # Judges are in the format "Before [Judge1], [Judge2], and [Judge3], JJ."
    # Sometimes there are more than 3 judges, and other edge cases like
    # "and" in uppercase, or no comma before the "and".
    # The last surname may contain lowercase letters, apostrophes or
    # hyphens (e.g. "McCALLUM", "O'BRIEN"), so it is not restricted to
    # uppercase letters only
    judges_match = re.search(
        r"Before\s+(.+?)(?:,\s*|\s+)?(?:and|AND)\s+"
        r"([A-Z][A-Za-z'\u2019\-]*),\s+JJ\.",
        scraped_text,
        re.DOTALL,
    )
    if judges_match:
        initial_judges, last_judge = judges_match.groups()
        names = [name.strip() for name in initial_judges.split(",")]
        names.append(last_judge)
        metadata["OpinionCluster"] = {
            # filter() drops empty pieces left by stray commas
            "judges": "; ".join(filter(None, names)),
        }
    return metadata

0 commit comments

Comments
 (0)