Merge branch 'main' into 1373-update-nc

grossir · grossir · commit cb4725edb04d · 2025-04-30T16:23:14.000-05:00
diff --git a/CHANGES.md b/CHANGES.md
@@ -11,57 +11,62 @@ words, they're the ones you'll want to watch, and the others are mostly noise.
 Releases are also tagged in git, if that's helpful.
 
 ## Coming up
-
-- Fix `me` Update maine scraper and add backscraper
-- Update `sd` backscraper and extract from text
+- Fix `bia` scraper and add extract from text test cases
 - update `nc` scraper to OpinionSiteLinear and new website #1373
-- Fix `bia` scraper
 
 ## Current
 
-**2.6.65 - 2024-04-11**
+**2.6.66 - 2025-04-29**
+
+- Add backscraper for `dcd` #1336
+- Update `sd` backscraper and extract from text
+- Implement datestring format validation in test_ScraperExtractFromTextTest #838
+- Implement `or` extract_from_text to collect regional citations #1226
+- Fix `bia` scraper
+
+**2.6.65 - 2025-04-11**
 
 - `nh` was blocking; fixed by updating the user agent string #1370
 - Update `vtsuperct_*` scrapers to inherit `extract_from_text` from `vt` #1150
 
 ## Past
 
-**2.6.64 - 2024-04-10**
+**2.6.64 - 2025-04-10**
 
 - Fix `me` Update maine scraper and add backscraper #1360
 - Sites were blocking `cafc` scrapers. Fixed by passing a browser user agent #1366
 
 
-**2.6.63 - 2024-03-25**
+**2.6.63 - 2025-03-25**
 
 - Make `ga` backscraper take kwargs; fix a bug in 2018 #1349
 - Implement extract from text for `ga` #1349
 - Fix `ill` oral argument scraper #1356
 
-**2.6.62 - 2024-03-19**
+**2.6.62 - 2025-03-19**
 
 - Fix `uscgcoca` and `asbca` by replicating browser request headers #1352
 - Fix `uscgcoca` citation regex #1351
 
-**2.6.61 - 2024-03-06**
+**2.6.61 - 2025-03-06**
 
 - Fix `ca8` opinion scraper by setting `request.verify = False` #1346
 
-**2.6.60 - 2024-03-05**
+**2.6.60 - 2025-03-05**
 
 - Fix `ca7` scrapers url from http to https
 
-**2.6.59 - 2024-03-04**
+**2.6.59 - 2025-03-04**
 
 - Change `colo` user agent to prevent site block #1341
 
-**2.6.58 - 2024-02-26**
+**2.6.58 - 2025-02-26**
 
 - Fixes:
   - Add backscraper for `mesuperct` #1328
   - Fix `mont` cleanup_content, would fail when content was bytes #1323
 
-**2.6.57 - 2024-02-25**
+**2.6.57 - 2025-02-25**
 
 - Fixes:
   - fix cafc oral argument scraper PR (#1325)[https://github.com/freelawproject/juriscraper/pull/1325]
@@ -73,7 +78,7 @@ Releases are also tagged in git, if that's helpful.
   - Add workflow to check for new entries in CHANGES.md file
 
 
-**2.6.56 - 2024-02-19**
+**2.6.56 - 2025-02-19**
 
 - Fixes:
   - n/a
@@ -83,7 +88,7 @@ Releases are also tagged in git, if that's helpful.
   - Add citation extraction and author for MT
 
 
-**2.6.55 - 2024-02-10**
+**2.6.55 - 2025-02-10**
 
 - Fixes:
   - `cafc` opinion scraper now requests using `verify=False` #1314
@@ -94,27 +99,27 @@ Releases are also tagged in git, if that's helpful.
   - recap: improvement to the download_pdf method to handle cases where
   attachment pages are returned instead of the expected PDF documents. #1309
 
-**2.6.54 - 2024-01-24**
+**2.6.54 - 2025-01-24**
 
 - Fixes:
   - `ca6` oral argument scraper is no longer failing
   - update the pypi.yml github actions workflow to solve a bug with twine and
     packaging packages interaction. It now forces the update of packaging
   - due to that bug, we discarded the 2.6.53 version
 
-**2.6.52 - 2024-01-20**
+**2.6.52 - 2025-01-20**
 
 - Fixes:
   - `AppellateDocketReport.download_pdf` now returns a two-tuple containing the
     response object or None and a str. This aligns with the changes introduced
     in v 2.5.1.
 
-**2.6.51 - 2024-01-14**
+**2.6.51 - 2025-01-14**
 
 - Fixes:
   - `extract_from_text` now returns plain citation strings, instead of parsed dicts
 
-**2.6.50 - 2024-01-10**
+**2.6.50 - 2025-01-10**
 
 - Fixes:
   - add tests to ensure that `extract_from_text` does not fail
@@ -128,7 +133,7 @@ Releases are also tagged in git, if that's helpful.
 - Features
   - `pacer.email._parse_bankruptcy_short_description` now supports Multi Docket NEFs
 
-**2.6.49 - 2024-01-08**
+**2.6.49 - 2025-01-08**
 
 - Fixes:
   - `nh` scrapers no longer depend on harcoded year filter
diff --git a/juriscraper/opinions/united_states/administrative_agency/bia.py b/juriscraper/opinions/united_states/administrative_agency/bia.py
@@ -70,7 +70,7 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:
         :return: Metadata to be added to the case
         """
         date = re.findall(
-            r"Decided (?:(?:by (?:Acting\s)?Attorney General|as amended)\s)?(.*\d{4})",
+            r"Decided (?:(?:by (?:(?:Acting\s)?Attorney General|Board)|as amended)\s)?(.*\d{4})",
             scraped_text,
         )
         if not date:
diff --git a/juriscraper/opinions/united_states/federal_district/dcd.py b/juriscraper/opinions/united_states/federal_district/dcd.py
@@ -26,6 +26,7 @@ def __init__(self, *args, **kwargs):
         self.court_id = self.__module__
         self.url = f"https://ecf.dcd.uscourts.gov/cgi-bin/Opinions.pl?{date.today().year}"
         self.status = "Published"
+        self.make_backscrape_iterable(kwargs)
 
     def _process_html(self):
         """
@@ -72,3 +73,37 @@ def get_docket_document_number_from_url(self, url: str) -> Tuple[str, str]:
         doc_number = match.group(6) if match else url
 
         return doc_number
+
+    def _download_backwards(self, year: int) -> None:
+        """Point self.url at the year's opinions page, download and process it
+
+        :param year: year placed in the URL query string
+        :return: None
+        """
+        self.url = f"https://ecf.dcd.uscourts.gov/cgi-bin/Opinions.pl?{year}"
+        self.html = self._download()
+        self._process_html()
+
+    def make_backscrape_iterable(self, kwargs: dict) -> None:
+        """Set `self.back_scrape_iterable` to the range of years to backscrape
+
+        :param kwargs: scraper init kwargs; optional "backscrape_start" and
+            "backscrape_end" are "%m/%d/%Y" strings; only their year is used.
+            Start is floored at 2005; a missing end excludes the current year
+        :return: None
+        """
+        start_date = kwargs.get("backscrape_start")
+        end_date = kwargs.get("backscrape_end")
+
+        start = (
+            datetime.strptime(start_date, "%m/%d/%Y").year
+            if start_date
+            else date.today().year
+        )
+        end = (
+            datetime.strptime(end_date, "%m/%d/%Y").year + 1
+            if end_date
+            else date.today().year
+        )
+
+        self.back_scrape_iterable = range(max(2005, start), end)
diff --git a/juriscraper/opinions/united_states/state/or.py b/juriscraper/opinions/united_states/state/or.py
@@ -2,8 +2,10 @@
 History:
  - 2014-08-05: Adapted scraper to have year-based URLs.
  - 2023-11-18: Fixed and updated
+ - 2025-04-23: implement extract_from_text, grossir
 """
 
+import re
 from datetime import datetime, timedelta
 
 from juriscraper.AbstractSite import logger
@@ -129,3 +131,18 @@ def format_url(self, start_date: datetime, end_date: datetime) -> str:
         start = datetime.strftime(start_date, "%Y%m%d")
         end = datetime.strftime(end_date, "%Y%m%d")
         return self.base_url.format(self.court_code, start, end)
+
+    def extract_from_text(self, scraped_text: str) -> dict:
+        """Extract a "P3d" citation from the first 1000 chars of the text
+
+        Be careful with citations referring to other opinions that are
+        mentioned before the actual citation. See, for example:
+        https://ojd.contentdm.oclc.org/digital/api/collection/p17027coll5/id/28946/download
+
+        :return: {"Citation": "<digits> P3d <digits>"} on a match, else {}
+        """
+        regex = r"\n\s+(?P<cite>\d+ P3d \d+)\s+\n"
+        if match := re.search(regex, scraped_text[:1000]):
+            return {"Citation": match.group("cite")}
+
+        return {}
diff --git a/juriscraper/opinions/united_states/state/ortc.py b/juriscraper/opinions/united_states/state/ortc.py
@@ -1,9 +1,14 @@
 from importlib import import_module
 
+from juriscraper.OpinionSite import OpinionSite
+
 # `or` is a python reserved keyword; can't import the module as usual
 oregon_module = import_module("juriscraper.opinions.united_states.state.or")
 
 
 class Site(oregon_module.Site):
     court_code = "p17027coll6"
     days_interval = 120
+    # use the base OpinionSite.extract_from_text instead of the one inherited
+    # from `or`, which prevents a test_ScraperExtractFromTextTest failure
+    extract_from_text = OpinionSite.extract_from_text
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
 from setuptools import find_packages, setup
 from setuptools.command.install import install
 
-VERSION = "2.6.65"
+VERSION = "2.6.66"
 AUTHOR = "Free Law Project"
 EMAIL = "info@free.law"
 HERE = os.path.abspath(os.path.dirname(__file__))
diff --git a/tests/local/test_ScraperExtractFromTextTest.py b/tests/local/test_ScraperExtractFromTextTest.py

Original file line number	Diff line number	Diff line change
`@@ -70,7 +70,7 @@ def extract_from_text(self, scraped_text: str) -> Dict[str, Any]:`
`70`	`70`	`:return: Metadata to be added to the case`
`71`	`71`	`"""`
`72`	`72`	`date = re.findall(`
`73`		`- r"Decided (?:(?:by (?:Acting\s)?Attorney General\|as amended)\s)?(.*\d{4})",`
	`73`	`+ r"Decided (?:(?:by (?:(?:Acting\s)?Attorney General\|Board)\|as amended)\s)?(.*\d{4})",`
`74`	`74`	`scraped_text,`
`75`	`75`	`)`
`76`	`76`	`if not date:`