Skip to content

Commit 8146a57

Browse files
authored
Merge pull request #1336 from nielash/dcd-backscraper
feat(dcd): implement backscraper
2 parents 829c877 + c27419d commit 8146a57

2 files changed

Lines changed: 36 additions & 0 deletions

File tree

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ Releases are also tagged in git, if that's helpful.
1313
## Coming up
1414

1515
- Fix `me`: update Maine scraper and add backscraper
16+
- Add backscraper for `dcd` #1336
1617
- Update `sd` backscraper and extract from text
1718
- Implement datestring format validation in test_ScraperExtractFromTextTest #838
1819
- Implement `or` extract_from_text to collect regional citations #1226

juriscraper/opinions/united_states/federal_district/dcd.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def __init__(self, *args, **kwargs):
2626
self.court_id = self.__module__
2727
self.url = f"https://ecf.dcd.uscourts.gov/cgi-bin/Opinions.pl?{date.today().year}"
2828
self.status = "Published"
29+
self.make_backscrape_iterable(kwargs)
2930

3031
def _process_html(self):
3132
"""
@@ -72,3 +73,37 @@ def get_docket_document_number_from_url(self, url: str) -> Tuple[str, str]:
7273
doc_number = match.group(6) if match else url
7374

7475
return doc_number
76+
77+
def _download_backwards(self, year: int) -> None:
    """Scrape the opinions listing for a single year.

    Points ``self.url`` at the listing for ``year``, stores the result of
    ``self._download()`` in ``self.html`` and then runs
    ``self._process_html()``.

    :param year: year to scrape
    :return: None
    """
    yearly_listing = f"https://ecf.dcd.uscourts.gov/cgi-bin/Opinions.pl?{year}"
    self.url = yearly_listing
    self.html = self._download()
    self._process_html()
86+
87+
def make_backscrape_iterable(self, kwargs: dict) -> None:
    """Build the range of years the backscraper will iterate over.

    Reads the optional ``backscrape_start`` and ``backscrape_end`` entries
    of ``kwargs``. When present, each must be a ``%m/%d/%Y`` string; only
    its year is used. The resulting ``range`` is stored on
    ``self.back_scrape_iterable``.

    - A missing start defaults to the current year.
    - A supplied end year is included in the range. A missing end defaults
      to the current year as an exclusive bound, so calling without either
      argument produces an empty range.
    - The start year is never earlier than 2005.

    :param kwargs: passed when initializing the scraper, may or
        may not contain backscrape controlling arguments
    :return: None
    :raises ValueError: if a supplied date string is not ``%m/%d/%Y``
    """
    date_format = "%m/%d/%Y"
    # Lower bound for the start year; presumably the earliest year the
    # court's listing serves -- TODO confirm against the site.
    earliest_year = 2005
    # Read the clock once so both defaults agree on the year.
    current_year = date.today().year

    start_date = kwargs.get("backscrape_start")
    end_date = kwargs.get("backscrape_end")

    start = (
        datetime.strptime(start_date, date_format).year
        if start_date
        else current_year
    )
    # +1 because range() excludes its upper bound and the requested end
    # year must itself be scraped.
    end = (
        datetime.strptime(end_date, date_format).year + 1
        if end_date
        else current_year
    )

    self.back_scrape_iterable = range(max(earliest_year, start), end)

0 commit comments

Comments
 (0)