Skip to content

Commit adbd7e9

Browse files
committed
fix(nc): update to OpinionSiteLinear and new site
Solves #1373
1 parent 9f022ce commit adbd7e9

13 files changed

Lines changed: 4894 additions & 9480 deletions

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ Releases are also tagged in git, if that's helpful.
1414

1515
- Fix `me`: update Maine scraper and add backscraper
1616
- Update `sd` backscraper and extract from text
17+
- Update `nc` scraper to OpinionSiteLinear and new website #1373
1718

1819
## Current
1920

juriscraper/opinions/united_states/state/nc.py

Lines changed: 64 additions & 143 deletions
Original file line numberDiff line numberDiff line change
@@ -5,174 +5,95 @@
55
History:
66
2014-05-01: Created by Brian Carver
77
2014-08-04: Rewritten by Jon Andersen with complete backscraper
8+
2025-04-22: grossir, Update to OpinionSiteLinear
89
"""
910

1011
import re
11-
import traceback
12-
from datetime import date, datetime
12+
from datetime import date
1313

14-
from lxml import html
14+
from juriscraper.AbstractSite import logger
15+
from juriscraper.OpinionSiteLinear import OpinionSiteLinear
1516

16-
from juriscraper.lib.exceptions import InsanityException
17-
from juriscraper.OpinionSite import OpinionSite
1817

19-
20-
class Site(OpinionSite):
18+
class Site(OpinionSiteLinear):
2119
start_year = 1997
2220
current_year = date.today().year
2321
court = "sc"
24-
base_url = "http://appellate.nccourts.org/opinions/?c={}&year={}"
22+
base_url = "http://appellate.nccourts.org/opinion-filings/?c={}&year={}"
23+
row_xpath = "//span[span[@class='title']] | //td[span[@class='title']]"
24+
title_regex = r"\((?P<docket>[\dA-Z-]+)\s+- (?P<status>(Unp|P)ublished)"
25+
26+
# in the browser inspector the tr-td containers do not appear for `nc`
27+
# but they do exist in the source inspected as text
28+
date_xpath = (
29+
"../../preceding-sibling::tr//strong[contains(text()[1], 'Filed:')] "
30+
)
31+
date_regex = r"Filed: (?P<date>[\d\w ]+)"
32+
secondary_date_regex = None
33+
34+
# For `nc` opinions (last available in 2022, as of April 2025)
35+
state_cite_regex = r"\d+ NC \d+"
2536

2637
def __init__(self, *args, **kwargs):
2738
super().__init__(*args, **kwargs)
2839
self.court_id = self.__module__
2940
self.url = self.base_url.format(self.court, self.current_year)
30-
3141
self.make_backscrape_iterable(kwargs)
3242

33-
self.my_download_urls = []
34-
self.my_case_names = []
35-
self.my_docket_numbers = []
36-
self.my_summaries = []
37-
self.my_neutral_citations = []
38-
self.my_precedential_statuses = []
39-
40-
def _get_case_dates(self):
41-
case_dates = []
42-
case_date = None
43-
precedential_status = "Published"
44-
date_cleaner = r"\d+ \w+ [12][90]\d\d"
45-
path = "//table//tr"
46-
for row_el in self.html.xpath(path):
47-
# Examine each row. If it contains the date, we set that as
48-
# the current date. If it contains a case, we parse it.
49-
try:
50-
date_nodes = row_el.xpath(".//strong/text()")
51-
date_str = date_nodes[0]
52-
if date_nodes:
53-
date_str = re.search(
54-
date_cleaner, date_str, re.MULTILINE
55-
).group()
56-
case_date = datetime.strptime(date_str, "%d %B %Y").date()
57-
# When a new date header appears, switch to Precedential
58-
precedential_status = "Published"
59-
continue # Row contained just the date, move on
60-
except IndexError:
61-
# No matching nodes; not a date header
62-
pass
63-
64-
path = "./td[contains(., 'Unpublished Opinions - Rule 30e')]"
65-
if row_el.xpath(path):
66-
precedential_status = "Unpublished"
67-
# When this header appears, switch to Nonprecedential, then
68-
# press on to the following rows.
43+
def _process_html(self):
44+
for row in self.html.xpath(self.row_xpath):
45+
title = row.xpath("string(span[@class='title'])")
46+
47+
link = row.xpath("span[@class='title']/@onclick")
48+
if not link:
49+
# some opinions may be withdrawn
50+
logger.warning("No link for row %s", title)
6951
continue
7052

71-
if precedential_status == "Published":
72-
urls = row_el.xpath("./td/span/span[1]/@onclick")
73-
# Like: viewOpinion("http://appellate.nccourts.org/opinions/?c=1&amp;pdf=31511")
74-
if len(urls) != 1 or urls[0].find("viewOpinion") != 0:
75-
continue # Only interested in cases with a download link
53+
url = link[0].split('("')[1].strip('")')
7654

77-
# Pull the URL out of the javascript viewOpinion function.
78-
download_url = re.search(
79-
r'viewopinion\("(.*)"', urls[0], re.IGNORECASE
80-
).group(1)
55+
match = re.search(self.title_regex, title)
56+
name = title[: match.start()].strip(" ,")
8157

82-
path = "./td/span/span[contains(@class,'title')]"
83-
txt = html.tostring(
84-
row_el.xpath(path)[0], method="text", encoding="unicode"
85-
)
86-
case_name, neutral_cite, docket_number = self.parse_title(txt)
87-
88-
summary = ""
89-
path = "./td/span/span[contains(@class,'desc')]/text()"
90-
summaries = row_el.xpath(path)
91-
try:
92-
summary = summaries[0]
93-
except IndexError:
94-
# Not all cases have a summary
95-
pass
96-
if case_name.strip() == "":
97-
continue # A few cases are missing a name
98-
99-
case_dates.append(case_date)
100-
self.my_download_urls.append(download_url)
101-
self.my_case_names.append(case_name)
102-
self.my_docket_numbers.append(docket_number)
103-
self.my_summaries.append(summary)
104-
self.my_neutral_citations.append(neutral_cite)
105-
self.my_precedential_statuses.append(precedential_status)
106-
107-
elif precedential_status == "Unpublished":
108-
for span in row_el.xpath("./td/span"):
109-
if "onclick" not in span.attrib:
110-
continue
111-
download_url = re.search(
112-
r'viewopinion\("(.*)"',
113-
span.attrib["onclick"],
114-
re.IGNORECASE,
115-
).group(1)
116-
117-
txt = span.text_content().strip()
118-
(
119-
case_name,
120-
neutral_cite,
121-
docket_number,
122-
) = self.parse_title(txt)
123-
if case_name.strip() == "":
124-
continue # A few cases are missing a name
125-
case_dates.append(case_date)
126-
self.my_download_urls.append(download_url)
127-
self.my_case_names.append(case_name)
128-
self.my_docket_numbers.append(docket_number)
129-
self.my_summaries.append("")
130-
self.my_neutral_citations.append(neutral_cite)
131-
self.my_precedential_statuses.append(precedential_status)
132-
133-
return case_dates
134-
135-
# Parses case titles like:
136-
# Fields v. Harnett Cnty., 367 NC 12 (13-761)
137-
# Clark v. Clark, (13-612)
138-
@staticmethod
139-
def parse_title(txt):
140-
try:
141-
name_and_citation = txt.rsplit("(", 1)[0].strip()
142-
docket_number = (
143-
re.search(r"(.*\d).*?", txt.rsplit("(", 1)[1]).group(0).strip()
144-
)
145-
case_name = name_and_citation.rsplit(",", 1)[0].strip()
146-
try:
147-
neutral_cite = name_and_citation.rsplit(",", 1)[1].strip()
148-
if not re.search(r"^\d\d.*\d\d$", neutral_cite):
149-
neutral_cite = ""
150-
except IndexError:
151-
# Unable to find comma to split on. No neutral cite.
152-
neutral_cite = ""
153-
except:
154-
raise InsanityException(
155-
f"Unable to parse: {txt}\n{traceback.format_exc()}"
156-
)
157-
return case_name, neutral_cite, docket_number
158-
159-
def _get_download_urls(self):
160-
return self.my_download_urls
58+
state_cite = ""
59+
if cite_match := re.search(self.state_cite_regex, name):
60+
state_cite = cite_match.group(0)
61+
name = name[: cite_match.start()].strip(" ,")
16162

162-
def _get_case_names(self):
163-
return self.my_case_names
63+
docket = match.group("docket")
64+
status = match.group("status")
65+
summary = row.xpath("string(span[@class='desc'])")
16466

165-
def _get_docket_numbers(self):
166-
return self.my_docket_numbers
67+
author = row.xpath("string(span[@class='author']/i)").strip()
68+
per_curiam = False
69+
if author.lower() == "per curiam":
70+
per_curiam = True
71+
author = ""
16772

168-
def _get_summaries(self):
169-
return self.my_summaries
73+
# pick the last preceding-sibling, the most recent date block
74+
date_block = row.xpath(self.date_xpath)[-1].text_content()
17075

171-
def _get_citations(self):
172-
return self.my_neutral_citations
76+
if match := re.search(self.date_regex, date_block):
77+
date = match.group("date")
78+
else:
79+
# # for ncctapp unpublished opinions
80+
date = self.secondary_date_regex.search(date_block).group(
81+
"date"
82+
)
17383

174-
def _get_precedential_statuses(self):
175-
return self.my_precedential_statuses
84+
self.cases.append(
85+
{
86+
"author": author,
87+
"per_curiam": per_curiam,
88+
"summary": summary,
89+
"status": status,
90+
"docket": docket,
91+
"name": name,
92+
"url": url,
93+
"date": date,
94+
"citation": state_cite,
95+
}
96+
)
17697

17798
def _download_backwards(self, year: int) -> None:
17899
"""Build year URL and scrape

juriscraper/opinions/united_states/state/ncctapp.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,20 @@
66
2014-08-04: Created by Jon Andersen
77
"""
88

9-
from datetime import date
9+
import re
1010

1111
from juriscraper.opinions.united_states.state import nc
1212

1313

1414
class Site(nc.Site):
1515
court = "coa"
16+
unpub_date_xpath = (
17+
"../preceding-sibling::tr/td[contains(text(), 'Rule 30e')]"
18+
)
19+
date_xpath = f"{nc.Site.date_xpath} | {unpub_date_xpath}"
20+
secondary_date_regex = re.compile(
21+
r"(?P<date>\d[\d \w]+)[\t\xa0\n]+- Rule 30e", flags=re.MULTILINE
22+
)
1623

17-
def __init__(self, *args, **kwargs):
18-
super().__init__(*args, **kwargs)
19-
self.court_id = self.__module__
24+
# For `ncctapp` opinions (last available in 2012, as of April 2025)
25+
state_cite_regex = r"\d+ NC App \d+"

0 commit comments

Comments
 (0)