From 8a29e5cddc58ef26ab32799d65aafd5757e9e4d0 Mon Sep 17 00:00:00 2001 From: luism Date: Thu, 31 Jul 2025 13:14:17 -0400 Subject: [PATCH 1/2] feat(AbstractSite): add validations for case names, dates, and download URLs in _sanity_check --- CHANGES.md | 2 +- juriscraper/AbstractSite.py | 47 +++++++++++++++++++++++++------------ 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 7e89fece4..9f864d364 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -19,7 +19,7 @@ Features: - Changes: -- +- new validations for `_sanity_check` in `AbstractSite` Fixes: - diff --git a/juriscraper/AbstractSite.py b/juriscraper/AbstractSite.py index 26d2bb343..301d6b5c8 100644 --- a/juriscraper/AbstractSite.py +++ b/juriscraper/AbstractSite.py @@ -1,5 +1,6 @@ import hashlib import json +import re from datetime import date, datetime, timedelta import certifi @@ -234,16 +235,20 @@ def _check_sanity(self): If sanity is OK, no return value. If not, throw InsanityException or warnings, as appropriate. """ + SUSPICIOUS_EXTENSIONS = re.compile( + r"\.(js|py|rb|java|c|cpp|cs|go|sh|pl|swift|ts|css)$", re.IGNORECASE + ) + FORBIDDEN_CHARS = re.compile(r"[<>\"'\\|/?*:]") + lengths = {} for attr in self._all_attrs: if self.__getattribute__(attr) is not None: lengths[attr] = len(self.__getattribute__(attr)) values = list(lengths.values()) if values.count(values[0]) != len(values): - # Are all elements equal? raise InsanityException( - "%s: Scraped meta data fields have differing" - " lengths: %s" % (self.court_id, lengths) + "%s: Scraped meta data fields have differing lengths: %s" + % (self.court_id, lengths) ) if len(self.case_names) == 0: if self.should_have_results: @@ -263,8 +268,12 @@ def _check_sanity(self): for i, name in enumerate(self.case_names): if not name.strip(): raise InsanityException( - "Item with index %s has an empty case name. The prior " - "item had case name of: %s" % (i, prior_case_name) + "Item with index %s has an empty case name. The prior item had case name of: %s" + % (i, prior_case_name) + ) + if FORBIDDEN_CHARS.search(name): + logger.warning( + f"{self.court_id}: Forbidden character found in case name at index {i}: {name}" ) prior_case_name = name @@ -272,17 +281,14 @@ def _check_sanity(self): for index, case_date in enumerate(self.case_dates): if not isinstance(case_date, date): raise InsanityException( - "%s: member of case_dates list not a valid date object. " - "Instead it is: %s with value: %s" + "%s: member of case_dates list not a valid date object. Instead it is: %s with value: %s" % (self.court_id, type(case_date), case_date) ) - # Sanitize case date, fix typo of current year if present fixed_date = fix_future_year_typo(case_date) case_name = self.case_names[index] if fixed_date != case_date: logger.info( - "Date year typo detected. Converting %s to %s " - "for case '%s' in %s", + "Date year typo detected. Converting %s to %s for case '%s' in %s", case_date, fixed_date, case_name, @@ -291,31 +297,42 @@ def _check_sanity(self): case_date = fixed_date self.case_dates[index] = fixed_date - # If a date is approximate, then it may be set in the future until - # half of the year has passed. Ignore this case if hasattr(self, "date_filed_is_approximate"): date_is_approximate = self.date_filed_is_approximate[index] else: date_is_approximate = False - # dates should not be in the future. Tolerate a week if not date_is_approximate and case_date > ( date.today() + timedelta(days=7) ): future_date_count += 1 error = f"{self.court_id}: {case_date} date is in the future. Case '{case_name}'" logger.error(error) - - # Interrupt data ingestion if more than 1 record has a bad date if future_date_count > 1: raise InsanityException( f"More than 1 case has a date in the future. Last case: {error}" ) + if not date_is_approximate and case_date < date(1900, 1, 1): + error = f"{self.court_id}: {case_date} date is before 1900. Case '{case_name}'" + logger.error(error) + raise InsanityException( + f"Case has a date before 1900. Case: {error}" + ) + if not isinstance(self.cookies, dict): raise InsanityException( "self.cookies not set to be a dict by scraper." ) + + # Check for suspicious download_urls + if hasattr(self, "download_urls") and self.download_urls is not None: + for url in self.download_urls: + if SUSPICIOUS_EXTENSIONS.search(url.strip()): + raise InsanityException( + f"{self.court_id}: Suspicious download_url detected in sanity check: {url}" + ) + logger.info( "%s: Successfully found %s items." % (self.court_id, len(self.case_names)) From c2efb9ae0c567c393ac5185e4a316b2ba7d1b1d1 Mon Sep 17 00:00:00 2001 From: luism Date: Mon, 25 Aug 2025 10:31:20 -0400 Subject: [PATCH 2/2] refactor(AbstractSite): remove validations for case names and download URLs --- juriscraper/AbstractSite.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/juriscraper/AbstractSite.py b/juriscraper/AbstractSite.py index 301d6b5c8..3aa89455d 100644 --- a/juriscraper/AbstractSite.py +++ b/juriscraper/AbstractSite.py @@ -1,6 +1,5 @@ import hashlib import json -import re from datetime import date, datetime, timedelta import certifi @@ -235,10 +234,6 @@ def _check_sanity(self): If sanity is OK, no return value. If not, throw InsanityException or warnings, as appropriate. """ - SUSPICIOUS_EXTENSIONS = re.compile( - r"\.(js|py|rb|java|c|cpp|cs|go|sh|pl|swift|ts|css)$", re.IGNORECASE - ) - FORBIDDEN_CHARS = re.compile(r"[<>\"'\\|/?*:]") lengths = {} for attr in self._all_attrs: @@ -271,10 +266,6 @@ def _check_sanity(self): "Item with index %s has an empty case name. The prior item had case name of: %s" % (i, prior_case_name) ) - if FORBIDDEN_CHARS.search(name): - logger.warning( - f"{self.court_id}: Forbidden character found in case name at index {i}: {name}" - ) prior_case_name = name future_date_count = 0 @@ -325,14 +316,6 @@ def _check_sanity(self): "self.cookies not set to be a dict by scraper." ) - # Check for suspicious download_urls - if hasattr(self, "download_urls") and self.download_urls is not None: - for url in self.download_urls: - if SUSPICIOUS_EXTENSIONS.search(url.strip()): - raise InsanityException( - f"{self.court_id}: Suspicious download_url detected in sanity check: {url}" - ) - logger.info( "%s: Successfully found %s items." % (self.court_id, len(self.case_names))