Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Features:
- implement `neb` and `nebctapp` extract_from_text #1549

Changes:
-
- new validations for `_check_sanity` in `AbstractSite`

Fixes:
-
Expand Down
30 changes: 15 additions & 15 deletions juriscraper/AbstractSite.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,16 +234,16 @@ def _check_sanity(self):
If sanity is OK, no return value. If not, throw InsanityException or
warnings, as appropriate.
"""

lengths = {}
for attr in self._all_attrs:
if self.__getattribute__(attr) is not None:
lengths[attr] = len(self.__getattribute__(attr))
values = list(lengths.values())
if values.count(values[0]) != len(values):
# Are all elements equal?
raise InsanityException(
"%s: Scraped meta data fields have differing"
" lengths: %s" % (self.court_id, lengths)
"%s: Scraped meta data fields have differing lengths: %s"
% (self.court_id, lengths)
)
if len(self.case_names) == 0:
if self.should_have_results:
Expand All @@ -263,26 +263,23 @@ def _check_sanity(self):
for i, name in enumerate(self.case_names):
if not name.strip():
raise InsanityException(
"Item with index %s has an empty case name. The prior "
"item had case name of: %s" % (i, prior_case_name)
"Item with index %s has an empty case name. The prior item had case name of: %s"
% (i, prior_case_name)
)
prior_case_name = name

future_date_count = 0
for index, case_date in enumerate(self.case_dates):
if not isinstance(case_date, date):
raise InsanityException(
"%s: member of case_dates list not a valid date object. "
"Instead it is: %s with value: %s"
"%s: member of case_dates list not a valid date object. Instead it is: %s with value: %s"
% (self.court_id, type(case_date), case_date)
)
# Sanitize case date, fix typo of current year if present
fixed_date = fix_future_year_typo(case_date)
case_name = self.case_names[index]
if fixed_date != case_date:
logger.info(
"Date year typo detected. Converting %s to %s "
"for case '%s' in %s",
"Date year typo detected. Converting %s to %s for case '%s' in %s",
case_date,
fixed_date,
case_name,
Expand All @@ -291,31 +288,34 @@ def _check_sanity(self):
case_date = fixed_date
self.case_dates[index] = fixed_date

# If a date is approximate, then it may be set in the future until
# half of the year has passed. Ignore this case
if hasattr(self, "date_filed_is_approximate"):
date_is_approximate = self.date_filed_is_approximate[index]
else:
date_is_approximate = False

# dates should not be in the future. Tolerate a week
if not date_is_approximate and case_date > (
date.today() + timedelta(days=7)
):
future_date_count += 1
error = f"{self.court_id}: {case_date} date is in the future. Case '{case_name}'"
logger.error(error)

# Interrupt data ingestion if more than 1 record has a bad date
if future_date_count > 1:
raise InsanityException(
f"More than 1 case has a date in the future. Last case: {error}"
)

if not date_is_approximate and case_date < date(1900, 1, 1):
error = f"{self.court_id}: {case_date} date is before 1900. Case '{case_name}'"
logger.error(error)
raise InsanityException(
f"Case has a date before 1900. Case: {error}"
)

if not isinstance(self.cookies, dict):
raise InsanityException(
"self.cookies not set to be a dict by scraper."
)

logger.info(
"%s: Successfully found %s items."
% (self.court_id, len(self.case_names))
Expand Down
Loading