freelawproject
diff --git a/‎CHANGES.md‎
Lines changed: 1 addition & 0 deletions b/‎CHANGES.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎juriscraper/opinions/united_states/state/nc.py‎
Lines changed: 64 additions & 143 deletions b/‎juriscraper/opinions/united_states/state/nc.py‎
Lines changed: 64 additions & 143 deletions
diff --git a/‎juriscraper/opinions/united_states/state/ncctapp.py‎
Lines changed: 10 additions & 4 deletions b/‎juriscraper/opinions/united_states/state/ncctapp.py‎
Lines changed: 10 additions & 4 deletions
@@ -14,6 +14,7 @@ Releases are also tagged in git, if that's helpful.
 
 - Fix `me` Update maine scraper and add backscraper
 - Update `sd` backscraper and extract from text
+- Update `nc` scraper to use OpinionSiteLinear and the new website #1373
 
 ## Current
 
 
@@ -5,174 +5,95 @@
 History:
     2014-05-01: Created by Brian Carver
     2014-08-04: Rewritten by Jon Andersen with complete backscraper
+    2025-04-22: grossir, Update to OpinionSiteLinear
 """
 
 import re
-import traceback
-from datetime import date, datetime
+from datetime import date
 
-from lxml import html
+from juriscraper.AbstractSite import logger
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
-from juriscraper.lib.exceptions import InsanityException
-from juriscraper.OpinionSite import OpinionSite
 
-
-class Site(OpinionSite):
+class Site(OpinionSiteLinear):
     start_year = 1997
     current_year = date.today().year
     court = "sc"
-    base_url = "http://appellate.nccourts.org/opinions/?c={}&year={}"
+    base_url = "http://appellate.nccourts.org/opinion-filings/?c={}&year={}"
+    row_xpath = "//span[span[@class='title']] | //td[span[@class='title']]"
+    title_regex = r"\((?P<docket>[\dA-Z-]+)\s+- (?P<status>(Unp|P)ublished)"
+
+    # In the browser inspector the tr/td containers do not appear for `nc`,
+    # but they do exist when the page source is inspected as plain text
+    date_xpath = (
+        "../../preceding-sibling::tr//strong[contains(text()[1], 'Filed:')] "
+    )
+    date_regex = r"Filed: (?P<date>[\d\w ]+)"
+    secondary_date_regex = None
+
+    # For `nc` opinions (last available in 2022, as of April 2025)
+    state_cite_regex = r"\d+ NC \d+"
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
         self.url = self.base_url.format(self.court, self.current_year)
-
         self.make_backscrape_iterable(kwargs)
 
-        self.my_download_urls = []
-        self.my_case_names = []
-        self.my_docket_numbers = []
-        self.my_summaries = []
-        self.my_neutral_citations = []
-        self.my_precedential_statuses = []
-
-    def _get_case_dates(self):
-        case_dates = []
-        case_date = None
-        precedential_status = "Published"
-        date_cleaner = r"\d+ \w+ [12][90]\d\d"
-        path = "//table//tr"
-        for row_el in self.html.xpath(path):
-            # Examine each row. If it contains the date, we set that as
-            # the current date. If it contains a case, we parse it.
-            try:
-                date_nodes = row_el.xpath(".//strong/text()")
-                date_str = date_nodes[0]
-                if date_nodes:
-                    date_str = re.search(
-                        date_cleaner, date_str, re.MULTILINE
-                    ).group()
-                    case_date = datetime.strptime(date_str, "%d %B %Y").date()
-                    # When a new date header appears, switch to Precedential
-                    precedential_status = "Published"
-                    continue  # Row contained just the date, move on
-            except IndexError:
-                # No matching nodes; not a date header
-                pass
-
-            path = "./td[contains(., 'Unpublished Opinions - Rule 30e')]"
-            if row_el.xpath(path):
-                precedential_status = "Unpublished"
-                # When this header appears, switch to Nonprecedential, then
-                # press on to the following rows.
+    def _process_html(self):
+        for row in self.html.xpath(self.row_xpath):
+            title = row.xpath("string(span[@class='title'])")
+
+            link = row.xpath("span[@class='title']/@onclick")
+            if not link:
+                # some opinions may be withdrawn
+                logger.warning("No link for row %s", title)
                 continue
 
-            if precedential_status == "Published":
-                urls = row_el.xpath("./td/span/span[1]/@onclick")
-                # Like: viewOpinion("http://appellate.nccourts.org/opinions/?c=1&amp;pdf=31511")
-                if len(urls) != 1 or urls[0].find("viewOpinion") != 0:
-                    continue  # Only interested in cases with a download link
+            url = link[0].split('("')[1].strip('")')
 
-                # Pull the URL out of the javascript viewOpinion function.
-                download_url = re.search(
-                    r'viewopinion\("(.*)"', urls[0], re.IGNORECASE
-                ).group(1)
+            match = re.search(self.title_regex, title)
+            name = title[: match.start()].strip(" ,")
 
-                path = "./td/span/span[contains(@class,'title')]"
-                txt = html.tostring(
-                    row_el.xpath(path)[0], method="text", encoding="unicode"
-                )
-                case_name, neutral_cite, docket_number = self.parse_title(txt)
-
-                summary = ""
-                path = "./td/span/span[contains(@class,'desc')]/text()"
-                summaries = row_el.xpath(path)
-                try:
-                    summary = summaries[0]
-                except IndexError:
-                    # Not all cases have a summary
-                    pass
-                if case_name.strip() == "":
-                    continue  # A few cases are missing a name
-
-                case_dates.append(case_date)
-                self.my_download_urls.append(download_url)
-                self.my_case_names.append(case_name)
-                self.my_docket_numbers.append(docket_number)
-                self.my_summaries.append(summary)
-                self.my_neutral_citations.append(neutral_cite)
-                self.my_precedential_statuses.append(precedential_status)
-
-            elif precedential_status == "Unpublished":
-                for span in row_el.xpath("./td/span"):
-                    if "onclick" not in span.attrib:
-                        continue
-                    download_url = re.search(
-                        r'viewopinion\("(.*)"',
-                        span.attrib["onclick"],
-                        re.IGNORECASE,
-                    ).group(1)
-
-                    txt = span.text_content().strip()
-                    (
-                        case_name,
-                        neutral_cite,
-                        docket_number,
-                    ) = self.parse_title(txt)
-                    if case_name.strip() == "":
-                        continue  # A few cases are missing a name
-                    case_dates.append(case_date)
-                    self.my_download_urls.append(download_url)
-                    self.my_case_names.append(case_name)
-                    self.my_docket_numbers.append(docket_number)
-                    self.my_summaries.append("")
-                    self.my_neutral_citations.append(neutral_cite)
-                    self.my_precedential_statuses.append(precedential_status)
-
-        return case_dates
-
-    # Parses case titles like:
-    # Fields v. Harnett Cnty., 367 NC 12 (13-761)
-    # Clark v. Clark,  (13-612)
-    @staticmethod
-    def parse_title(txt):
-        try:
-            name_and_citation = txt.rsplit("(", 1)[0].strip()
-            docket_number = (
-                re.search(r"(.*\d).*?", txt.rsplit("(", 1)[1]).group(0).strip()
-            )
-            case_name = name_and_citation.rsplit(",", 1)[0].strip()
-            try:
-                neutral_cite = name_and_citation.rsplit(",", 1)[1].strip()
-                if not re.search(r"^\d\d.*\d\d$", neutral_cite):
-                    neutral_cite = ""
-            except IndexError:
-                # Unable to find comma to split on. No neutral cite.
-                neutral_cite = ""
-        except:
-            raise InsanityException(
-                f"Unable to parse: {txt}\n{traceback.format_exc()}"
-            )
-        return case_name, neutral_cite, docket_number
-
-    def _get_download_urls(self):
-        return self.my_download_urls
+            state_cite = ""
+            if cite_match := re.search(self.state_cite_regex, name):
+                state_cite = cite_match.group(0)
+                name = name[: cite_match.start()].strip(" ,")
 
-    def _get_case_names(self):
-        return self.my_case_names
+            docket = match.group("docket")
+            status = match.group("status")
+            summary = row.xpath("string(span[@class='desc'])")
 
-    def _get_docket_numbers(self):
-        return self.my_docket_numbers
+            author = row.xpath("string(span[@class='author']/i)").strip()
+            per_curiam = False
+            if author.lower() == "per curiam":
+                per_curiam = True
+                author = ""
 
-    def _get_summaries(self):
-        return self.my_summaries
+            # pick the last preceding-sibling, the most recent date block
+            date_block = row.xpath(self.date_xpath)[-1].text_content()
 
-    def _get_citations(self):
-        return self.my_neutral_citations
+            if match := re.search(self.date_regex, date_block):
+                date = match.group("date")
+            else:
+                # for ncctapp unpublished opinions
+                date = self.secondary_date_regex.search(date_block).group(
+                    "date"
+                )
 
-    def _get_precedential_statuses(self):
-        return self.my_precedential_statuses
+            self.cases.append(
+                {
+                    "author": author,
+                    "per_curiam": per_curiam,
+                    "summary": summary,
+                    "status": status,
+                    "docket": docket,
+                    "name": name,
+                    "url": url,
+                    "date": date,
+                    "citation": state_cite,
+                }
+            )
 
     def _download_backwards(self, year: int) -> None:
         """Build year URL and scrape
 
@@ -6,14 +6,20 @@
     2014-08-04: Created by Jon Andersen
 """
 
-from datetime import date
+import re
 
 from juriscraper.opinions.united_states.state import nc
 
 
 class Site(nc.Site):
     court = "coa"
+    unpub_date_xpath = (
+        "../preceding-sibling::tr/td[contains(text(), 'Rule 30e')]"
+    )
+    date_xpath = f"{nc.Site.date_xpath} | {unpub_date_xpath}"
+    secondary_date_regex = re.compile(
+        r"(?P<date>\d[\d \w]+)[\t\xa0\n]+- Rule 30e", flags=re.MULTILINE
+    )
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.court_id = self.__module__
+    # For `ncctapp` opinions (last available in 2012, as of April 2025)
+    state_cite_regex = r"\d+ NC App \d+"