Commit ad3804b

Merge branch 'dev' into ca_ventura_sheriff

2 parents: d89c4d0 + 118ab93

20 files changed
Lines changed: 1901 additions & 1010 deletions

.github/disabled-dependabot.yml

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+  - package-ecosystem: "pip" # See documentation for possible values
+    directory: "/" # Location of package manifests
+    target-branch: "dev"
+    schedule:
+      interval: "weekly"
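A note on the file name: Dependabot only acts on a config located at .github/dependabot.yml, so under the "disabled-" name this file records intent without taking effect. A minimal sketch, assuming PyYAML is installed, that parses the file and confirms the fields above before anyone renames it into place:

import yaml  # assumption: PyYAML is available in the dev environment

with open(".github/disabled-dependabot.yml") as fh:
    config = yaml.safe_load(fh)

# Mirrors the structure above: one "pip" ecosystem entry aimed at dev, weekly.
assert config["version"] == 2
assert config["updates"][0]["package-ecosystem"] == "pip"
assert config["updates"][0]["target-branch"] == "dev"
assert config["updates"][0]["schedule"]["interval"] == "weekly"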

Pipfile

Lines changed: 14 additions & 10 deletions
@@ -11,14 +11,14 @@ twine = "*"
 flake8 = "*"
 coverage = "*"
 flake8-docstrings = "*"
+setuptools = "==79.0.0"
 setuptools-scm = "*"
 us = "*"
 jinja2 = "*"
 flake8-bugbear = "*"
 pre-commit = "*"
 types-requests = "*"
 mypy = "*"
-typing-extensions = "*"
 types-retry = "*"
 types-beautifulsoup4 = "*"
 types-openpyxl = "*"
@@ -27,22 +27,26 @@ exceptiongroup = "*"
 pytest-cov = "*"
 pytest-sugar = "*"
 pytest-stub = "*"
+typing-extensions = "==4.13.2"
 
 [packages]
-beautifulsoup4 = "*"
+click = "*"
 html5lib = "*"
+playwright = "*"
+python-dotenv = "*"
+pytubefix = "*"
 requests = "*"
-pdfplumber = "*"
+retry2 = "*"
 tenacity = "*"
-click = "*"
-retry = "*"
-urllib3 = "1.26.19" # pegged to avoid test issue
 typing-extensions = "*"
+urllib3 = "*"
 us = "*"
-pytube = "*"
-python-dotenv = "*"
-yt-dlp = "*"
-playwright = "*"
+cryptography = "==44.0.2"
+yt-dlp = "==2025.3.25"
+jellyfish = "==1.2.0"
+certifi = "==2025.1.31"
+pdfplumber = "==0.11.6"
+beautifulsoup4 = "==4.13.3"
 
 [pipenv]
 allow_prereleases = false
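The net effect on [packages]: six libraries move from floating versions to exact pins (cryptography, yt-dlp, jellyfish, certifi, pdfplumber, beautifulsoup4), retry is swapped for retry2, pytube for pytubefix, and the old urllib3 peg is released. A small sketch that lists the pinned runtime dependencies, relying only on the fact that a Pipfile is TOML (tomllib is stdlib in Python 3.11+):

import tomllib  # stdlib TOML parser, Python 3.11+

with open("Pipfile", "rb") as fh:
    pipfile = tomllib.load(fh)

# Print only the runtime packages pinned to exact versions.
for name, spec in pipfile["packages"].items():
    if isinstance(spec, str) and spec.startswith("=="):
        print(name, spec)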

Pipfile.lock

Lines changed: 966 additions & 988 deletions
Some generated files are not rendered by default.
clean/ca/california_highway_patrol.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+from .. import utils
+from ..cache import Cache
+from ..platforms.muckrock import process_muckrock
+
+# from ..utils import MetadataDict
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrape file metadata for the California Highway Patrol.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "California Highway Patrol"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.site_slug = "ca_california_highway_patrol"
+        self.base_url = "https://www.muckrock.com/foi/california-52/sb1421-records-86009"  # Embargoed
+        # Initial disclosure page (aka where they start complying with law) contains list of
+        # "detail"/child pages with links to the SB16/SB1421/AB748 videos and files, along with additional index pages
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
+        self.cache = Cache(cache_dir)
+        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
+            utils.create_directory(localdir)
+
+    def scrape_meta(self, throttle: int = 2) -> Path:
+        """Gather metadata on downloadable files (videos, etc.).
+
+        Args:
+            throttle (int): Number of seconds to wait between requests. Defaults to 2.
+
+        Returns:
+            Path: Local path of JSON file containing metadata on downloadable files
+        """
+        to_be_scraped: Dict = {
+            "https://www.muckrock.com/foi/california-52/sb1421-records-86009": True,
+            "https://www.muckrock.com/foi/california-52/sb1421-records-2022-122698": True,
+            "https://www.muckrock.com/foi/california-52/2023-sb1421sb16-request-california-highway-patrol-138613": True,
+        }
+
+        metadata: List = []
+
+        subpages_dir = self.subpages_dir
+
+        api_key = utils.get_credentials("MUCKROCK_CRP")
+
+        for start_url in to_be_scraped:
+            force = to_be_scraped[start_url]
+            local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
+            metadata.extend(local_metadata)
+
+        json_filename = self.data_dir / (self.site_slug + ".json")
+        self.cache.write_json(json_filename, metadata)
+
+        return json_filename
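All four new scrapers in this commit share this shape: a Site class whose scrape_meta() feeds one or more MuckRock request URLs through process_muckrock() and writes the combined metadata to <data_dir>/<site_slug>.json. A hedged usage sketch follows; the module path is an assumption based on the site_slug and the clean/ca/ layout seen elsewhere in the commit, and a MUCKROCK_CRP credential must be available to utils.get_credentials():

# Module path is assumed from the slug; adjust to the actual file location.
from clean.ca.california_highway_patrol import Site

site = Site()                   # default data and cache directories
json_path = site.scrape_meta()  # walks the MuckRock pages, caches subpages
print(json_path)                # <data_dir>/ca_california_highway_patrol.json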
clean/ca/california_state_personnel_board.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+from .. import utils
+from ..cache import Cache
+from ..platforms.muckrock import process_muckrock
+
+# from ..utils import MetadataDict
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrape file metadata for the California State Personnel Board.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "California State Personnel Board"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.site_slug = "ca_california_state_personnel_board"
+        self.base_url = "https://www.muckrock.com/foi/california-52/sb1421-records-86009"  # Embargoed
+        # Initial disclosure page (aka where they start complying with law) contains list of
+        # "detail"/child pages with links to the SB16/SB1421/AB748 videos and files, along with additional index pages
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
+        self.cache = Cache(cache_dir)
+        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
+            utils.create_directory(localdir)
+
+    def scrape_meta(self, throttle: int = 2) -> Path:
+        """Gather metadata on downloadable files (videos, etc.).
+
+        Args:
+            throttle (int): Number of seconds to wait between requests. Defaults to 2.
+
+        Returns:
+            Path: Local path of JSON file containing metadata on downloadable files
+        """
+        to_be_scraped: Dict = {
+            "https://www.muckrock.com/foi/california-52/sb1421-records-85998": True,
+            "https://www.muckrock.com/foi/california-52/2023-sb1421sb16-request-california-state-personnel-board-138660": True,
+        }
+
+        metadata: List = []
+
+        subpages_dir = self.subpages_dir
+
+        api_key = utils.get_credentials("MUCKROCK_CRP")
+
+        for start_url in to_be_scraped:
+            force = to_be_scraped[start_url]
+            local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
+            metadata.extend(local_metadata)
+
+        json_filename = self.data_dir / (self.site_slug + ".json")
+        self.cache.write_json(json_filename, metadata)
+
+        return json_filename
clean/ca/contra_costa_county_district_attorney.py

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+from .. import utils
+from ..cache import Cache
+from ..platforms.muckrock import process_muckrock
+
+# from ..utils import MetadataDict
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrape file metadata for the Contra Costa County District Attorney.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "Contra Costa County District Attorney"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.site_slug = "ca_contra_costa_county_district_attorney"
+        self.base_url = "https://www.muckrock.com/foi/martinez-3315/sb1421-records-85537"  # Embargoed
+        # Initial disclosure page (aka where they start complying with law) contains list of
+        # "detail"/child pages with links to the SB16/SB1421/AB748 videos and files, along with additional index pages
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
+        self.cache = Cache(cache_dir)
+        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
+            utils.create_directory(localdir)
+
+    def scrape_meta(self, throttle: int = 2) -> Path:
+        """Gather metadata on downloadable files (videos, etc.).
+
+        Args:
+            throttle (int): Number of seconds to wait between requests. Defaults to 2.
+
+        Returns:
+            Path: Local path of JSON file containing metadata on downloadable files
+        """
+        to_be_scraped: Dict = {
+            "https://www.muckrock.com/foi/martinez-3315/sb1421-records-85537": True,
+            "https://www.muckrock.com/foi/martinez-3315/sb1421-records-2022-122563": True,
+        }
+
+        metadata: List = []
+
+        subpages_dir = self.subpages_dir
+
+        api_key = utils.get_credentials("MUCKROCK_CRP")
+
+        for start_url in to_be_scraped:
+            force = to_be_scraped[start_url]
+            local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
+            metadata.extend(local_metadata)
+
+        json_filename = self.data_dir / (self.site_slug + ".json")
+        self.cache.write_json(json_filename, metadata)
+
+        return json_filename

clean/ca/desert_hot_springs_pd.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+import logging
+from pathlib import Path
+from typing import Dict, List
+
+from .. import utils
+from ..cache import Cache
+from ..platforms.muckrock import process_muckrock
+
+# from ..utils import MetadataDict
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrape file metadata for the Desert Hot Springs Police Department.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "Desert Hot Springs Police Department"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.site_slug = "ca_desert_hot_springs_pd"
+        self.base_url = "https://www.muckrock.com/foi/desert-hot-springs-3184/sb1421-records-2022-122787"  # Embargoed
+        # Initial disclosure page (aka where they start complying with law) contains list of
+        # "detail"/child pages with links to the SB16/SB1421/AB748 videos and files, along with additional index pages
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
+        self.cache = Cache(cache_dir)
+        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
+            utils.create_directory(localdir)
+
+    def scrape_meta(self, throttle: int = 2) -> Path:
+        """Gather metadata on downloadable files (videos, etc.).
+
+        Args:
+            throttle (int): Number of seconds to wait between requests. Defaults to 2.
+
+        Returns:
+            Path: Local path of JSON file containing metadata on downloadable files
+        """
+        to_be_scraped: Dict = {
+            "https://www.muckrock.com/foi/desert-hot-springs-3184/sb1421-records-2022-122787": True,
+        }
+
+        metadata: List = []
+
+        subpages_dir = self.subpages_dir
+
+        api_key = utils.get_credentials("MUCKROCK_CRP")
+
+        for start_url in to_be_scraped:
+            force = to_be_scraped[start_url]
+            local_metadata = process_muckrock(subpages_dir, start_url, api_key, force)
+            metadata.extend(local_metadata)
+
+        json_filename = self.data_dir / (self.site_slug + ".json")
+        self.cache.write_json(json_filename, metadata)
+
+        return json_filename

clean/ca/fremont_pd.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 
 from .. import utils
 from ..cache import Cache
-from ..config.fremont_pd import index_request_headers
+from .config.fremont_pd import index_request_headers
 
 
 class Site:
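The one-character change matters because the import is relative. A sketch of how Python resolves each form, assuming this module sits at clean/ca/fremont_pd.py as the header indicates:

# Inside clean/ca/fremont_pd.py:
# from ..config.fremont_pd import index_request_headers   # old: clean.config.fremont_pd
from .config.fremont_pd import index_request_headers      # new: clean.ca.config.fremont_pd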
