fix(nc): update to OpinionSiteLinear and new site#1381
Conversation
|
This is good, but the summaries data isn't quite the right category. It lists the metadata associated with the case but isn't a summary of the case. They actually have robust headnotes, but as far as I can tell there is no way to link the headnotes for a case from the search engine provided. I emailed the court asking if there is a way and will report back. If we keep the "summary" information, though, I think it should go into the headnotes field, but it should be cleaned up, or at least titled better. |
|
I will put this in "Blocked" until we get the court's answer. From your comment, I noticed that the field I was collecting as "summary" makes more sense in
This should be a
For
|
|
So I think headnotes is too nice of an opportunity not to include. Tell me what you think of this. Currently we aren't set up for headnotes, but that should be an easy addition in opinion site.
and in process_html |
# Conflicts: # CHANGES.md # juriscraper/opinions/united_states/state/nc.py # juriscraper/opinions/united_states/state/ncctapp.py
…hods in NC scraper
|
@Luis-manzur is this still a draft? |
flooie
left a comment
There was a problem hiding this comment.
In theory I like the idea of collecting headnotes, but in practice I don't think we should include them here. They appear to publish the headnotes digest months and months after they publish opinions. This means in reality we will never actually collect them during a scrape and have no mechanism right now to merge headnotes back into the system.
What it does do is overly complicate the tool.
| """Goes into OpinionCluster.attorneys, type: string""" | ||
| return self._get_optional_field_by_id("attorney") | ||
|
|
||
| def _get_headnotes(self): |
There was a problem hiding this comment.
drop headnotes from this PR ...
and we should add them in a different pr if we want to include headnotes
| # Like: viewOpinion("http://appellate.nccourts.org/opinions/?c=1&pdf=31511") | ||
| if len(urls) != 1 or urls[0].find("viewOpinion") != 0: | ||
| continue # Only interested in cases with a download link | ||
| def _process_html(self): |
There was a problem hiding this comment.
docstring format is not standard juriscraper
| Iterates over each row in the HTML, extracting the title, link, summary, headnote, | ||
| docket, status, author, per curiam status, date, and citation for each opinion. | ||
| Handles cases where opinions may be withdrawn (no link), and parses additional | ||
| information from the headnote HTML if available. | ||
|
|
||
| path = "./td/span/span[contains(@class,'title')]" | ||
| txt = html.tostring( | ||
| row_el.xpath(path)[0], method="text", encoding="unicode" | ||
| ) | ||
| case_name, neutral_cite, docket_number = self.parse_title(txt) | ||
|
|
||
| summary = "" | ||
| path = "./td/span/span[contains(@class,'desc')]/text()" | ||
| summaries = row_el.xpath(path) | ||
| try: | ||
| summary = summaries[0] | ||
| except IndexError: | ||
| # Not all cases have a summary | ||
| pass | ||
| if case_name.strip() == "": | ||
| continue # A few cases are missing a name | ||
|
|
||
| case_dates.append(case_date) | ||
| self.my_download_urls.append(download_url) | ||
| self.my_case_names.append(case_name) | ||
| self.my_docket_numbers.append(docket_number) | ||
| self.my_summaries.append(summary) | ||
| self.my_neutral_citations.append(neutral_cite) | ||
| self.my_precedential_statuses.append(precedential_status) | ||
|
|
||
| elif precedential_status == "Unpublished": | ||
| for span in row_el.xpath("./td/span"): | ||
| if "onclick" not in span.attrib: | ||
| continue | ||
| download_url = re.search( | ||
| r'viewopinion\("(.*)"', | ||
| span.attrib["onclick"], | ||
| re.IGNORECASE, | ||
| ).group(1) | ||
|
|
||
| txt = span.text_content().strip() | ||
| ( | ||
| case_name, | ||
| neutral_cite, | ||
| docket_number, | ||
| ) = self.parse_title(txt) | ||
| if case_name.strip() == "": | ||
| continue # A few cases are missing a name | ||
| case_dates.append(case_date) | ||
| self.my_download_urls.append(download_url) | ||
| self.my_case_names.append(case_name) | ||
| self.my_docket_numbers.append(docket_number) | ||
| self.my_summaries.append("") | ||
| self.my_neutral_citations.append(neutral_cite) | ||
| self.my_precedential_statuses.append(precedential_status) | ||
|
|
||
| return case_dates | ||
|
|
||
| # Parses case titles like: | ||
| # Fields v. Harnett Cnty., 367 NC 12 (13-761) | ||
| # Clark v. Clark, (13-612) | ||
| @staticmethod | ||
| def parse_title(txt): | ||
| try: | ||
| name_and_citation = txt.rsplit("(", 1)[0].strip() | ||
| docket_number = ( | ||
| re.search(r"(.*\d).*?", txt.rsplit("(", 1)[1]).group(0).strip() | ||
| ) | ||
| case_name = name_and_citation.rsplit(",", 1)[0].strip() | ||
| try: | ||
| neutral_cite = name_and_citation.rsplit(",", 1)[1].strip() | ||
| if not re.search(r"^\d\d.*\d\d$", neutral_cite): | ||
| neutral_cite = "" | ||
| except IndexError: | ||
| # Unable to find comma to split on. No neutral cite. | ||
| neutral_cite = "" | ||
| except Exception: | ||
| raise InsanityException( | ||
| f"Unable to parse: {txt}\n{traceback.format_exc()}" | ||
| ) | ||
| return case_name, neutral_cite, docket_number | ||
|
|
||
| def _get_download_urls(self): | ||
| return self.my_download_urls | ||
| Appends a dictionary of extracted case data to self.cases for each valid row. | ||
| """ | ||
| for row in self.html.xpath(self.row_xpath): | ||
| title = row.xpath("string(span[@class='title'])") | ||
|
|
||
| def _get_case_names(self): | ||
| return self.my_case_names | ||
| link = row.xpath("span[@class='title']/@onclick") | ||
| if not link: | ||
| # some opinions may be withdrawn | ||
| logger.warning("No link for row %s", title) | ||
| continue | ||
|
|
||
| def _get_docket_numbers(self): | ||
| return self.my_docket_numbers | ||
| url = link[0].split('("')[1].strip('")') | ||
|
|
||
| def _get_summaries(self): | ||
| return self.my_summaries | ||
| summary = ( | ||
| row.xpath("string(span[@class='desc'])") | ||
| if self.collect_summary | ||
| else "" | ||
| ) | ||
| headnote = ( | ||
| "" | ||
| if self.collect_summary | ||
| else row.xpath("string(span[@class='desc'])") | ||
| ) | ||
|
|
||
| def _get_citations(self): | ||
| return self.my_neutral_citations | ||
| url = url.replace("http:", "https:") | ||
| divs = self.headnote_html.xpath( | ||
| f'(//a[@href="{url}"]/ancestor::p)[1]' | ||
| ) | ||
| if divs: | ||
| p_elt = divs[0] | ||
| all_text = p_elt.xpath("text()") | ||
|
|
||
| summary = "".join( | ||
| text.replace("—", "") | ||
| for text in all_text | ||
| if not (text.startswith("<b>") or text.startswith("<a")) | ||
| ).strip() | ||
| headnote = p_elt.xpath("./b//text()")[0] | ||
|
|
||
| match = re.search(self.title_regex, title) | ||
| name = title[: match.start()].strip(" ,") | ||
|
|
||
| state_cite = "" | ||
| if cite_match := re.search(self.state_cite_regex, name): | ||
| state_cite = cite_match.group(0) | ||
| name = name[: cite_match.start()].strip(" ,") | ||
|
|
||
| docket = match.group("docket") | ||
| status = match.group("status") | ||
|
|
||
| author = row.xpath("string(span[@class='author']/i)").strip() | ||
| per_curiam = False | ||
| if author.lower() == "per curiam": | ||
| per_curiam = True | ||
| author = "" | ||
|
|
||
| # pick the last preceding-sibling, the most recent date block | ||
| date_block = row.xpath(self.date_xpath)[-1].text_content() | ||
|
|
||
| if match := re.search(self.date_regex, date_block): | ||
| date = match.group("date") | ||
| else: | ||
| # # for ncctapp unpublished opinions | ||
| date = self.secondary_date_regex.search(date_block).group( | ||
| "date" | ||
| ) | ||
|
|
||
| def _get_precedential_statuses(self): | ||
| return self.my_precedential_statuses | ||
| self.cases.append( | ||
| { | ||
| "author": author, | ||
| "per_curiam": per_curiam, | ||
| "summary": summary, | ||
| "headnote": headnote, | ||
| "status": status, | ||
| "docket": docket, | ||
| "name": name, | ||
| "url": url, | ||
| "date": date, | ||
| "citation": state_cite, | ||
| } | ||
| ) |
There was a problem hiding this comment.
I think we can simplify this entire function
def _process_html(self):
"""
"""
for row in self.html.xpath(self.row_xpath):
title = row.xpath("string(span[@class='title'])")
links = row.xpath("span[@class='title']/@onclick")
summaries = row.xpath("span[@class='desc']/text()")
summary = summaries[0] if summaries else ""
if not links:
logger.warning("No link for row %s", title)
continue
url = links[0][13:-2].replace("http:", "https:")
m = re.search(r"(?P<name>.*),?\s?(?P<cite>\d+ NC \d+)? \((?P<docket>.*) - (?P<status>.*)\)", title)
name, citation, docket, status = m.groups()
author = row.xpath("string(span[@class='author']/i)").strip()
per_curiam = True if author == "Per Curiam" else False
date = row.xpath(self.date_xpath)[-1]
if date == "Zip File of Published Opinions":
date_parent = "../../preceding-sibling::tr//a/../text()"
date = row.xpath(date_parent)[0].strip()[7:]
elif not isinstance(date, str):
date = date.xpath(".//text()")[1].split("\n")[0]
self.cases.append({
"per_curiam": per_curiam,
"author": author if not per_curiam else "",
"docket": docket,
"status": status,
"name": name,
"url": url,
"date": date,
"summary": summary,
"citation": citation if citation else "",
})
to something like this. with an improved date xpath.
date_xpath = "../../preceding-sibling::tr//a/text()"
There was a problem hiding this comment.
there is some wonkiness around dates for a few edge cases but I think this code is cleaner and works for both sc and coa
|
@Luis-manzur what is the status here? |
# Conflicts: # CHANGES.md
|
I updated the
I did all the recommended changes |
Update tests Move code to nc when not needed in ncctapp fix docstrings Move date parsing to its own function Enable summaries in both courts Fix citation extraction
Solves #1373