|
| 1 | +""" |
| 2 | +Scraper for the Louisiana Second Circuit Court of Appeal |
| 3 | +CourtID: lactapp_2 |
| 4 | +Court Short Name: La. Ct. App. 2d Cir. |
| 5 | +Author: Gianfranco Huaman |
| 6 | +History: |
| 7 | + - 2025-01-11, giancohs: created |
| 8 | +""" |
| 9 | + |
| 10 | +import re |
| 11 | +from datetime import datetime |
| 12 | +from urllib.parse import urlencode, urljoin |
| 13 | + |
| 14 | +from juriscraper.AbstractSite import logger |
| 15 | +from juriscraper.lib.html_utils import ( |
| 16 | + get_row_column_links, |
| 17 | + get_row_column_text, |
| 18 | +) |
| 19 | +from juriscraper.lib.judge_parsers import normalize_judge_string |
| 20 | +from juriscraper.OpinionSiteLinear import OpinionSiteLinear |
| 21 | + |
| 22 | + |
class Site(OpinionSiteLinear):
    """Scraper for the Louisiana Second Circuit Court of Appeal opinions.

    The opinions listing is requested one year at a time through the
    ``opinion_year`` query parameter. A regular scrape requests the current
    year; a backscrape requests every year touched by the requested date
    range and keeps only the rows whose date falls inside that range.
    """

    def __init__(self, *args, **kwargs):
        """Set up the current-year listing URL and the backscrape range.

        :param args: positional arguments forwarded to the parent class
        :param kwargs: keyword arguments forwarded to the parent class; may
            carry ``backscrape_start`` / ``backscrape_end``
        """
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        self.base_url = "https://www.la2nd.org/opinions/"
        self.year = datetime.now().year
        self.url = self._build_year_url(self.year)
        # Used as the default start of a backscrape when none is given
        self.first_opinion_date = datetime(2019, 7, 17).date()
        self.is_backscrape = False
        self.make_backscrape_iterable(kwargs)

    def _build_year_url(self, year):
        """Build the listing URL for a single year.

        :param year: year to put in the ``opinion_year`` query parameter
        :return: the listing URL as a string
        """
        query = urlencode({"opinion_year": year})
        return urljoin(self.base_url, f"?{query}")

    def _process_html(self):
        """Process the HTML and extract case information.

        Reads every row of the ``datatable`` table in ``self.html`` and
        appends one dict per opinion to ``self.cases``. While backscraping,
        rows dated outside the requested range are skipped.

        :return: None
        """
        rows = self.html.xpath('//table[@id="datatable"]/tbody/tr')

        for row in rows:
            # Column positions used below: 1 date, 2 docket, 3 name,
            # 4 author, 5 disposition, 7 status, 8 opinion link
            date_str = get_row_column_text(row, 1)
            case_date = datetime.strptime(date_str, "%m/%d/%Y").date()

            # Skip if not in date range
            if self.is_backscrape and not self.date_is_in_backscrape_range(
                case_date
            ):
                continue

            author_str = get_row_column_text(row, 4)
            # NOTE(review): presumably the first item returned is the
            # cleaned judge name - confirm against judge_parsers
            cleaned_author = normalize_judge_string(author_str)[0]
            if cleaned_author.endswith(" J."):
                # Drop the trailing " J." title, keeping only the name
                cleaned_author = cleaned_author[:-3]

            status_str = get_row_column_text(row, 7)
            status = (
                "Published" if "Published" in status_str else "Unpublished"
            )

            self.cases.append(
                {
                    "date": date_str,
                    "docket": get_row_column_text(row, 2),
                    "name": get_row_column_text(row, 3),
                    "author": cleaned_author,
                    "disposition": get_row_column_text(row, 5),
                    "url": get_row_column_links(row, 8),
                    "status": status,
                }
            )

    def make_backscrape_iterable(self, kwargs):
        """Checks if backscrape start and end arguments have been passed
        by caller, and parses them accordingly

        Louisiana's opinions page returns all opinions for a year
        (pagination is not needed), so we must filter out opinions not in
        the date range we are looking for

        :param kwargs: keyword arguments that may contain
            ``backscrape_start`` and ``backscrape_end`` as "%Y/%m/%d"
            strings. A missing start defaults to ``first_opinion_date``;
            a missing end defaults to today.
        :return: None
        """
        start = kwargs.get("backscrape_start")
        end = kwargs.get("backscrape_end")

        start = (
            datetime.strptime(start, "%Y/%m/%d").date()
            if start
            else self.first_opinion_date
        )
        end = (
            datetime.strptime(end, "%Y/%m/%d").date()
            if end
            else datetime.now().date()
        )

        # A single (start, end) tuple; years are expanded when downloading
        self.back_scrape_iterable = [(start, end)]

    def _download_backwards(self, dates):
        """Called when backscraping

        The listing only serves one year per request, so every year
        touched by the range is downloaded and processed in turn;
        ``_process_html`` drops the rows dated outside the range.

        :param dates: (start_date, end_date) tuple
        :return: None
        """
        self.start_date, self.end_date = dates
        self.is_backscrape = True
        logger.info(
            "Backscraping for range %s %s", self.start_date, self.end_date
        )

        for year in range(self.start_date.year, self.end_date.year + 1):
            self.year = year
            self.url = self._build_year_url(year)
            self.html = self._download()
            self._process_html()

    def date_is_in_backscrape_range(self, case_date):
        """When backscraping, check if the case date is in
        the backscraping range

        :param case_date: ``date`` object parsed from the HTML source
        :return: True if date is in backscrape range (bounds included)
        """
        return self.start_date <= case_date <= self.end_date

    def extract_from_text(self, scraped_text):
        """Extract the following values from the opinion's pdf text.
        The information we need is in the first page
        - appeal_from_str
        - judges

        :param scraped_text: The text content of the pdf
        :return: Dictionary containing the extracted values that matches
            the courtlistener model objects. "Docket" is always present
            (empty when no lower court is found); "OpinionCluster" is only
            present when the panel of judges is found.
        """
        metadata = {"Docket": {}}

        appeal_from_match = re.search(
            r"Appealed from the\s*(.*?\s*),\s*Louisiana",
            scraped_text,
            re.DOTALL,
        )
        # Judges are in the format
        # "Before [Judge1], [Judge2], and [Judge3], JJ."
        # Sometimes there are more than 3 judges, and other edge cases like
        # "and" is in uppercase or there is no comma between the last two
        # judges. The last name must start with a capital letter but may
        # contain lowercase letters, apostrophes or hyphens (e.g. McCALLUM)
        judges_match = re.findall(
            r"Before\s+(.+?)(?:,\s*|\s+)?(?:and|AND)\s+"
            r"([A-Z][A-Za-z'\-]*),\s+JJ\.",
            scraped_text,
            re.DOTALL,
        )
        if appeal_from_match:
            # The court name may wrap across lines in the pdf text;
            # collapse every run of whitespace into a single space
            appeal_from_result = re.sub(
                r"\s+", " ", appeal_from_match.group(1)
            ).strip()
            metadata["Docket"] = {
                "appeal_from_str": appeal_from_result,
            }
        if judges_match:
            # Only the first "Before ... JJ." occurrence is used
            initial_judges, last_judge = judges_match[0]
            all_judges = initial_judges.split(",") + [last_judge]
            metadata["OpinionCluster"] = {
                "judges": "; ".join(filter(None, map(str.strip, all_judges))),
            }
        return metadata
0 commit comments