Commit ad3804b

Merge branch 'dev' into ca_ventura_sheriff

2 parents: d89c4d0 + 118ab93

20 files changed
Lines changed: 1901 additions & 1010 deletions

.github/disabled-dependabot.yml

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+  - package-ecosystem: "pip" # See documentation for possible values
+    directory: "/" # Location of package manifests
+    target-branch: "dev"
+    schedule:
+      interval: "weekly"
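A note on the file name: Dependabot only acts on a config located at .github/dependabot.yml, so under the "disabled-" name this file records intent without taking effect. A minimal sketch, assuming PyYAML is installed, that parses the file and confirms the fields above before anyone renames it into place:

import yaml  # assumption: PyYAML is available in the dev environment

with open(".github/disabled-dependabot.yml") as fh:
    config = yaml.safe_load(fh)

# Mirrors the structure above: one "pip" ecosystem entry aimed at dev, weekly.
assert config["version"] == 2
assert config["updates"][0]["package-ecosystem"] == "pip"
assert config["updates"][0]["target-branch"] == "dev"
assert config["updates"][0]["schedule"]["interval"] == "weekly"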

Pipfile

Lines changed: 14 additions & 10 deletions
@@ -11,14 +11,14 @@ twine = "*"
 flake8 = "*"
 coverage = "*"
 flake8-docstrings = "*"
+setuptools = "==79.0.0"
 setuptools-scm = "*"
 us = "*"
 jinja2 = "*"
 flake8-bugbear = "*"
 pre-commit = "*"
 types-requests = "*"
 mypy = "*"
-typing-extensions = "*"
 types-retry = "*"
 types-beautifulsoup4 = "*"
 types-openpyxl = "*"
@@ -27,22 +27,26 @@ exceptiongroup = "*"
 pytest-cov = "*"
 pytest-sugar = "*"
 pytest-stub = "*"
+typing-extensions = "==4.13.2"
 
 [packages]
-beautifulsoup4 = "*"
+click = "*"
 html5lib = "*"
+playwright = "*"
+python-dotenv = "*"
+pytubefix = "*"
 requests = "*"
-pdfplumber = "*"
+retry2 = "*"
 tenacity = "*"
-click = "*"
-retry = "*"
-urllib3 = "1.26.19" # pegged to avoid test issue
 typing-extensions = "*"
+urllib3 = "*"
 us = "*"
-pytube = "*"
-python-dotenv = "*"
-yt-dlp = "*"
-playwright = "*"
+cryptography = "==44.0.2"
+yt-dlp = "==2025.3.25"
+jellyfish = "==1.2.0"
+certifi = "==2025.1.31"
+pdfplumber = "==0.11.6"
+beautifulsoup4 = "==4.13.3"
 
 [pipenv]
 allow_prereleases = false
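The net effect on [packages]: six libraries move from floating versions to exact pins (cryptography, yt-dlp, jellyfish, certifi, pdfplumber, beautifulsoup4), retry is swapped for retry2, pytube for pytubefix, and the old urllib3 peg is released. A small sketch that lists the pinned runtime dependencies, relying only on the fact that a Pipfile is TOML (tomllib is stdlib in Python 3.11+):

import tomllib  # stdlib TOML parser, Python 3.11+

with open("Pipfile", "rb") as fh:
    pipfile = tomllib.load(fh)

# Print only the runtime packages pinned to exact versions.
for name, spec in pipfile["packages"].items():
    if isinstance(spec, str) and spec.startswith("=="):
        print(name, spec)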

Pipfile.lock

Lines changed: 966 additions & 988 deletions
Some generated files are not rendered by default.
clean/ca/california_highway_patrol.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+from .. import utils
+from ..cache import Cache
+from ..platforms.muckrock import process_muckrock
+
+# from ..utils import MetadataDict
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrape file metadata for the California Highway Patrol.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "California Highway Patrol"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.site_slug = "ca_california_highway_patrol"
+        self.base_url = "https://www.muckrock.com/foi/california-52/sb1421-records-86009"  # Embargoed
+        # Initial disclosure page (aka where they start complying with law) contains list of
+        # "detail"/child pages with links to the SB16/SB1421/AB748 videos and files, along with additional index pages
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
+        self.cache = Cache(cache_dir)
+        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
+            utils.create_directory(localdir)
+
+    def scrape_meta(self, throttle: int = 2) -> Path:
+        """Gather metadata on downloadable files (videos, etc.).
+
+        Args:
+            throttle (int): Number of seconds to wait between requests. Defaults to 2.
+
+        Returns:
+            Path: Local path of JSON file containing metadata on downloadable files
+        """
+        to_be_scraped: Dict = {
+            "https://www.muckrock.com/foi/california-52/sb1421-records-86009": True,
+            "https://www.muckrock.com/foi/california-52/sb1421-records-2022-122698": True,
+            "https://www.muckrock.com/foi/california-52/2023-sb1421sb16-request-california-highway-patrol-138613": True,
+        }
+
+        metadata: List = []
+
+        subpages_dir = self.subpages_dir
+
+        api_key = utils.get_credentials("MUCKROCK_CRP")
+
+        for start_url in to_be_scraped:
+            force = to_be_scraped[start_url]
+            local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
+            metadata.extend(local_metadata)
+
+        json_filename = self.data_dir / (self.site_slug + ".json")
+        self.cache.write_json(json_filename, metadata)
+
+        return json_filename
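All four new scrapers in this commit share this shape: a Site class whose scrape_meta() feeds one or more MuckRock request URLs through process_muckrock() and writes the combined metadata to <data_dir>/<site_slug>.json. A hedged usage sketch follows; the module path is an assumption based on the site_slug and the clean/ca/ layout seen elsewhere in the commit, and a MUCKROCK_CRP credential must be available to utils.get_credentials():

# Module path is assumed from the slug; adjust to the actual file location.
from clean.ca.california_highway_patrol import Site

site = Site()                   # default data and cache directories
json_path = site.scrape_meta()  # walks the MuckRock pages, caches subpages
print(json_path)                # <data_dir>/ca_california_highway_patrol.json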
clean/ca/california_state_personnel_board.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+from .. import utils
+from ..cache import Cache
+from ..platforms.muckrock import process_muckrock
+
+# from ..utils import MetadataDict
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrape file metadata for the California State Personnel Board.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "California State Personnel Board"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.site_slug = "ca_california_state_personnel_board"
+        self.base_url = "https://www.muckrock.com/foi/california-52/sb1421-records-86009"  # Embargoed
+        # Initial disclosure page (aka where they start complying with law) contains list of
+        # "detail"/child pages with links to the SB16/SB1421/AB748 videos and files, along with additional index pages
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
+        self.cache = Cache(cache_dir)
+        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
+            utils.create_directory(localdir)
+
+    def scrape_meta(self, throttle: int = 2) -> Path:
+        """Gather metadata on downloadable files (videos, etc.).
+
+        Args:
+            throttle (int): Number of seconds to wait between requests. Defaults to 2.
+
+        Returns:
+            Path: Local path of JSON file containing metadata on downloadable files
+        """
+        to_be_scraped: Dict = {
+            "https://www.muckrock.com/foi/california-52/sb1421-records-85998": True,
+            "https://www.muckrock.com/foi/california-52/2023-sb1421sb16-request-california-state-personnel-board-138660": True,
+        }
+
+        metadata: List = []
+
+        subpages_dir = self.subpages_dir
+
+        api_key = utils.get_credentials("MUCKROCK_CRP")
+
+        for start_url in to_be_scraped:
+            force = to_be_scraped[start_url]
+            local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
+            metadata.extend(local_metadata)
+
+        json_filename = self.data_dir / (self.site_slug + ".json")
+        self.cache.write_json(json_filename, metadata)
+
+        return json_filename
clean/ca/contra_costa_county_district_attorney.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+from .. import utils
+from ..cache import Cache
+from ..platforms.muckrock import process_muckrock
+
+# from ..utils import MetadataDict
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrape file metadata for the Contra Costa County District Attorney.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "Contra Costa County District Attorney"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.site_slug = "ca_contra_costa_county_district_attorney"
+        self.base_url = "https://www.muckrock.com/foi/martinez-3315/sb1421-records-85537"  # Embargoed
+        # Initial disclosure page (aka where they start complying with law) contains list of
+        # "detail"/child pages with links to the SB16/SB1421/AB748 videos and files, along with additional index pages
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
+        self.cache = Cache(cache_dir)
+        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
+            utils.create_directory(localdir)
+
+    def scrape_meta(self, throttle: int = 2) -> Path:
+        """Gather metadata on downloadable files (videos, etc.).
+
+        Args:
+            throttle (int): Number of seconds to wait between requests. Defaults to 2.
+
+        Returns:
+            Path: Local path of JSON file containing metadata on downloadable files
+        """
+        to_be_scraped: Dict = {
+            "https://www.muckrock.com/foi/martinez-3315/sb1421-records-85537": True,
+            "https://www.muckrock.com/foi/martinez-3315/sb1421-records-2022-122563": True,
+        }
+
+        metadata: List = []
+
+        subpages_dir = self.subpages_dir
+
+        api_key = utils.get_credentials("MUCKROCK_CRP")
+
+        for start_url in to_be_scraped:
+            force = to_be_scraped[start_url]
+            local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
+            metadata.extend(local_metadata)
+
+        json_filename = self.data_dir / (self.site_slug + ".json")
+        self.cache.write_json(json_filename, metadata)
+
+        return json_filename

clean/ca/desert_hot_springs_pd.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+from .. import utils
+from ..cache import Cache
+from ..platforms.muckrock import process_muckrock
+
+# from ..utils import MetadataDict
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrape file metadata for the Desert Hot Springs Police Department.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "Desert Hot Springs Police Department"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.site_slug = "ca_desert_hot_springs_pd"
+        self.base_url = "https://www.muckrock.com/foi/desert-hot-springs-3184/sb1421-records-2022-122787"  # Embargoed
+        # Initial disclosure page (aka where they start complying with law) contains list of
+        # "detail"/child pages with links to the SB16/SB1421/AB748 videos and files, along with additional index pages
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
+        self.cache = Cache(cache_dir)
+        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
+            utils.create_directory(localdir)
+
+    def scrape_meta(self, throttle: int = 2) -> Path:
+        """Gather metadata on downloadable files (videos, etc.).
+
+        Args:
+            throttle (int): Number of seconds to wait between requests. Defaults to 2.
+
+        Returns:
+            Path: Local path of JSON file containing metadata on downloadable files
+        """
+        to_be_scraped: Dict = {
+            "https://www.muckrock.com/foi/desert-hot-springs-3184/sb1421-records-2022-122787": True,
+        }
+
+        metadata: List = []
+
+        subpages_dir = self.subpages_dir
+
+        api_key = utils.get_credentials("MUCKROCK_CRP")
+
+        for start_url in to_be_scraped:
+            force = to_be_scraped[start_url]
+            local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
+            metadata.extend(local_metadata)
+
+        json_filename = self.data_dir / (self.site_slug + ".json")
+        self.cache.write_json(json_filename, metadata)
+
+        return json_filename

clean/ca/fremont_pd.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 
 from .. import utils
 from ..cache import Cache
-from ..config.fremont_pd import index_request_headers
+from .config.fremont_pd import index_request_headers
 
 
 class Site:
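The one-character change matters because the import is relative. A sketch of how Python resolves each form, assuming this module sits at clean/ca/fremont_pd.py as the header indicates:

# Inside clean/ca/fremont_pd.py:
# from ..config.fremont_pd import index_request_headers   # old: clean.config.fremont_pd
from .config.fremont_pd import index_request_headers      # new: clean.ca.config.fremont_pd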
