|
5 | 5 | History: |
6 | 6 | 2014-05-01: Created by Brian Carver |
7 | 7 | 2014-08-04: Rewritten by Jon Andersen with complete backscraper |
| 8 | + 2025-04-22: grossir, Update to OpinionSiteLinear |
8 | 9 | """ |
9 | 10 |
|
10 | 11 | import re |
11 | | -import traceback |
12 | | -from datetime import date, datetime |
| 12 | +from datetime import date |
13 | 13 |
|
14 | | -from lxml import html |
| 14 | +from juriscraper.AbstractSite import logger |
| 15 | +from juriscraper.OpinionSiteLinear import OpinionSiteLinear |
15 | 16 |
|
16 | | -from juriscraper.lib.exceptions import InsanityException |
17 | | -from juriscraper.OpinionSite import OpinionSite |
18 | 17 |
|
19 | | - |
20 | | -class Site(OpinionSite): |
| 18 | +class Site(OpinionSiteLinear): |
21 | 19 | start_year = 1997 |
22 | 20 | current_year = date.today().year |
23 | 21 | court = "sc" |
24 | | - base_url = "http://appellate.nccourts.org/opinions/?c={}&year={}" |
| 22 | + base_url = "http://appellate.nccourts.org/opinion-filings/?c={}&year={}" |
| 23 | + row_xpath = "//span[span[@class='title']] | //td[span[@class='title']]" |
| 24 | + title_regex = r"\((?P<docket>[\dA-Z-]+)\s+- (?P<status>(Unp|P)ublished)" |
| 25 | + |
| 26 | + # In the browser inspector the tr/td containers do not appear for `nc`, |
| 27 | + # but they do exist in the raw page source when inspected as text |
| 28 | + date_xpath = ( |
| 29 | + "../../preceding-sibling::tr//strong[contains(text()[1], 'Filed:')] " |
| 30 | + ) |
| 31 | + date_regex = r"Filed: (?P<date>[\d\w ]+)" |
| 32 | + secondary_date_regex = None |
| 33 | + |
| 34 | + # For `nc` opinions (last available in 2022, as of April 2025) |
| 35 | + state_cite_regex = r"\d+ NC \d+" |
25 | 36 |
|
26 | 37 | def __init__(self, *args, **kwargs): |
27 | 38 | super().__init__(*args, **kwargs) |
28 | 39 | self.court_id = self.__module__ |
29 | 40 | self.url = self.base_url.format(self.court, self.current_year) |
30 | | - |
31 | 41 | self.make_backscrape_iterable(kwargs) |
32 | 42 |
|
33 | | - self.my_download_urls = [] |
34 | | - self.my_case_names = [] |
35 | | - self.my_docket_numbers = [] |
36 | | - self.my_summaries = [] |
37 | | - self.my_neutral_citations = [] |
38 | | - self.my_precedential_statuses = [] |
39 | | - |
40 | | - def _get_case_dates(self): |
41 | | - case_dates = [] |
42 | | - case_date = None |
43 | | - precedential_status = "Published" |
44 | | - date_cleaner = r"\d+ \w+ [12][90]\d\d" |
45 | | - path = "//table//tr" |
46 | | - for row_el in self.html.xpath(path): |
47 | | - # Examine each row. If it contains the date, we set that as |
48 | | - # the current date. If it contains a case, we parse it. |
49 | | - try: |
50 | | - date_nodes = row_el.xpath(".//strong/text()") |
51 | | - date_str = date_nodes[0] |
52 | | - if date_nodes: |
53 | | - date_str = re.search( |
54 | | - date_cleaner, date_str, re.MULTILINE |
55 | | - ).group() |
56 | | - case_date = datetime.strptime(date_str, "%d %B %Y").date() |
57 | | - # When a new date header appears, switch to Precedential |
58 | | - precedential_status = "Published" |
59 | | - continue # Row contained just the date, move on |
60 | | - except IndexError: |
61 | | - # No matching nodes; not a date header |
62 | | - pass |
63 | | - |
64 | | - path = "./td[contains(., 'Unpublished Opinions - Rule 30e')]" |
65 | | - if row_el.xpath(path): |
66 | | - precedential_status = "Unpublished" |
67 | | - # When this header appears, switch to Nonprecedential, then |
68 | | - # press on to the following rows. |
| 43 | + def _process_html(self): |
| 44 | + for row in self.html.xpath(self.row_xpath): |
| 45 | + title = row.xpath("string(span[@class='title'])") |
| 46 | + |
| 47 | + link = row.xpath("span[@class='title']/@onclick") |
| 48 | + if not link: |
| 49 | + # some opinions may be withdrawn |
| 50 | + logger.warning("No link for row %s", title) |
69 | 51 | continue |
70 | 52 |
|
71 | | - if precedential_status == "Published": |
72 | | - urls = row_el.xpath("./td/span/span[1]/@onclick") |
73 | | - # Like: viewOpinion("http://appellate.nccourts.org/opinions/?c=1&pdf=31511") |
74 | | - if len(urls) != 1 or urls[0].find("viewOpinion") != 0: |
75 | | - continue # Only interested in cases with a download link |
| 53 | + url = link[0].split('("')[1].strip('")') |
76 | 54 |
|
77 | | - # Pull the URL out of the javascript viewOpinion function. |
78 | | - download_url = re.search( |
79 | | - r'viewopinion\("(.*)"', urls[0], re.IGNORECASE |
80 | | - ).group(1) |
| 55 | + match = re.search(self.title_regex, title) |
| 56 | + name = title[: match.start()].strip(" ,") |
81 | 57 |
|
82 | | - path = "./td/span/span[contains(@class,'title')]" |
83 | | - txt = html.tostring( |
84 | | - row_el.xpath(path)[0], method="text", encoding="unicode" |
85 | | - ) |
86 | | - case_name, neutral_cite, docket_number = self.parse_title(txt) |
87 | | - |
88 | | - summary = "" |
89 | | - path = "./td/span/span[contains(@class,'desc')]/text()" |
90 | | - summaries = row_el.xpath(path) |
91 | | - try: |
92 | | - summary = summaries[0] |
93 | | - except IndexError: |
94 | | - # Not all cases have a summary |
95 | | - pass |
96 | | - if case_name.strip() == "": |
97 | | - continue # A few cases are missing a name |
98 | | - |
99 | | - case_dates.append(case_date) |
100 | | - self.my_download_urls.append(download_url) |
101 | | - self.my_case_names.append(case_name) |
102 | | - self.my_docket_numbers.append(docket_number) |
103 | | - self.my_summaries.append(summary) |
104 | | - self.my_neutral_citations.append(neutral_cite) |
105 | | - self.my_precedential_statuses.append(precedential_status) |
106 | | - |
107 | | - elif precedential_status == "Unpublished": |
108 | | - for span in row_el.xpath("./td/span"): |
109 | | - if "onclick" not in span.attrib: |
110 | | - continue |
111 | | - download_url = re.search( |
112 | | - r'viewopinion\("(.*)"', |
113 | | - span.attrib["onclick"], |
114 | | - re.IGNORECASE, |
115 | | - ).group(1) |
116 | | - |
117 | | - txt = span.text_content().strip() |
118 | | - ( |
119 | | - case_name, |
120 | | - neutral_cite, |
121 | | - docket_number, |
122 | | - ) = self.parse_title(txt) |
123 | | - if case_name.strip() == "": |
124 | | - continue # A few cases are missing a name |
125 | | - case_dates.append(case_date) |
126 | | - self.my_download_urls.append(download_url) |
127 | | - self.my_case_names.append(case_name) |
128 | | - self.my_docket_numbers.append(docket_number) |
129 | | - self.my_summaries.append("") |
130 | | - self.my_neutral_citations.append(neutral_cite) |
131 | | - self.my_precedential_statuses.append(precedential_status) |
132 | | - |
133 | | - return case_dates |
134 | | - |
135 | | - # Parses case titles like: |
136 | | - # Fields v. Harnett Cnty., 367 NC 12 (13-761) |
137 | | - # Clark v. Clark, (13-612) |
138 | | - @staticmethod |
139 | | - def parse_title(txt): |
140 | | - try: |
141 | | - name_and_citation = txt.rsplit("(", 1)[0].strip() |
142 | | - docket_number = ( |
143 | | - re.search(r"(.*\d).*?", txt.rsplit("(", 1)[1]).group(0).strip() |
144 | | - ) |
145 | | - case_name = name_and_citation.rsplit(",", 1)[0].strip() |
146 | | - try: |
147 | | - neutral_cite = name_and_citation.rsplit(",", 1)[1].strip() |
148 | | - if not re.search(r"^\d\d.*\d\d$", neutral_cite): |
149 | | - neutral_cite = "" |
150 | | - except IndexError: |
151 | | - # Unable to find comma to split on. No neutral cite. |
152 | | - neutral_cite = "" |
153 | | - except: |
154 | | - raise InsanityException( |
155 | | - f"Unable to parse: {txt}\n{traceback.format_exc()}" |
156 | | - ) |
157 | | - return case_name, neutral_cite, docket_number |
158 | | - |
159 | | - def _get_download_urls(self): |
160 | | - return self.my_download_urls |
| 58 | + state_cite = "" |
| 59 | + if cite_match := re.search(self.state_cite_regex, name): |
| 60 | + state_cite = cite_match.group(0) |
| 61 | + name = name[: cite_match.start()].strip(" ,") |
161 | 62 |
|
162 | | - def _get_case_names(self): |
163 | | - return self.my_case_names |
| 63 | + docket = match.group("docket") |
| 64 | + status = match.group("status") |
| 65 | + summary = row.xpath("string(span[@class='desc'])") |
164 | 66 |
|
165 | | - def _get_docket_numbers(self): |
166 | | - return self.my_docket_numbers |
| 67 | + author = row.xpath("string(span[@class='author']/i)").strip() |
| 68 | + per_curiam = False |
| 69 | + if author.lower() == "per curiam": |
| 70 | + per_curiam = True |
| 71 | + author = "" |
167 | 72 |
|
168 | | - def _get_summaries(self): |
169 | | - return self.my_summaries |
| 73 | + # pick the last matching preceding-sibling row, i.e. the nearest date block before this row |
| 74 | + date_block = row.xpath(self.date_xpath)[-1].text_content() |
170 | 75 |
|
171 | | - def _get_citations(self): |
172 | | - return self.my_neutral_citations |
| 76 | + if match := re.search(self.date_regex, date_block): |
| 77 | + date = match.group("date") |
| 78 | + else: |
| 79 | + # for ncctapp unpublished opinions |
| 80 | + date = self.secondary_date_regex.search(date_block).group( |
| 81 | + "date" |
| 82 | + ) |
173 | 83 |
|
174 | | - def _get_precedential_statuses(self): |
175 | | - return self.my_precedential_statuses |
| 84 | + self.cases.append( |
| 85 | + { |
| 86 | + "author": author, |
| 87 | + "per_curiam": per_curiam, |
| 88 | + "summary": summary, |
| 89 | + "status": status, |
| 90 | + "docket": docket, |
| 91 | + "name": name, |
| 92 | + "url": url, |
| 93 | + "date": date, |
| 94 | + "citation": state_cite, |
| 95 | + } |
| 96 | + ) |
176 | 97 |
|
177 | 98 | def _download_backwards(self, year: int) -> None: |
178 | 99 | """Build year URL and scrape |
|
0 commit comments