Commit d89c4d0

Merge branch 'dev' into ca_ventura_sheriff
2 parents 7bf34b2 + 791d39d commit d89c4d0

6 files changed

Lines changed: 382 additions & 8 deletions

clean/ca/lake_county_sheriff.py

Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
import os
import time
from pathlib import Path

from bs4 import BeautifulSoup
from dotenv import load_dotenv

from .. import utils
from ..cache import Cache


class Site:
    """Scrape file metadata and download files for the Lake County Sheriff.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Lake County Sheriff"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = "https://www.lakesheriff.com/969/Use-of-Force"
        self.zenrows_api_url = "https://api.zenrows.com/v1/"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)
        dotenv_path = "env/.env"
        load_dotenv(dotenv_path=dotenv_path)
        self.params = {
            "apikey": os.getenv("ZENROWS_KEY"),
            "url": "",  # Target website URL
            # Add any other ZenRows parameters here (optional)
        }

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_lake_county_sheriff

    def scrape_meta(self, throttle=0):
        # Construct a local filename relative to the cache directory, combining
        # the agency slug and the page name (ca_lake_county_sheriff/Use-of-Force.html),
        # then download the index page if it's not already cached
        base_name = f"{self.base_url.split('/')[-1]}.html"
        filename = f"{self.agency_slug}/{base_name}"
        self.params["url"] = self.base_url
        self.cache.download(filename, self.zenrows_api_url, params=self.params)
        metadata = []
        child_pages = []
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find("table", class_="fr-alternate-rows")
        child_links = body.find_all("a")
        # Pull incident details from each row of the use-of-force table and
        # cache the linked case detail page
        for link in child_links:
            tr_tag = link.find_parent("tr")
            td_tag = tr_tag.find_all("td")
            child_page_data = dict()
            child_page_data["date"] = td_tag[0].text
            child_page_data["location"] = td_tag[1].get_text(separator=", ")
            child_page_data["name"] = td_tag[2].text
            child_page_data["incident_type"] = td_tag[3].abbr.text
            child_page_data["case_number"] = link.text
            child_file_name = (
                f'{self.agency_slug}/{child_page_data["case_number"]}.html'
            )
            if link["href"]:
                link_url = f"https://www.lakesheriff.com{link['href']}"
                self.params["url"] = link_url
                self.cache.download(
                    child_file_name, self.zenrows_api_url, params=self.params
                )
                child_page_data["page_filename"] = child_file_name
                child_pages.append(child_page_data)
            time.sleep(throttle)
        # Walk each cached case page and collect YouTube videos, documents and
        # photo galleries as asset metadata
        for child_page in child_pages:
            html = self.cache.read(child_page["page_filename"])
            soup = BeautifulSoup(html, "html.parser")
            body = soup.find(attrs={"data-cprole": "mainContentContainer"})
            links = body.find_all("a")
            for link in links:
                link_href = link.get("href", None)
                if link_href:
                    if "youtu" in link_href:
                        payload = {
                            "asset_url": link_href,
                            "case_id": child_page["case_number"],
                            "name": link.text,
                            "title": link.text,
                            "parent_page": str(child_page["page_filename"]),
                            "details": {
                                "date": child_page["date"],
                                "location": child_page["location"],
                                "name": child_page["name"],
                                "incident_type": child_page["incident_type"],
                            },
                        }
                        metadata.append(payload)
                    elif "DocumentCenter" in link_href:
                        payload = {
                            "asset_url": f"https://www.lakesheriff.com{link_href}",
                            "case_id": child_page["case_number"],
                            "name": link.text,
                            "title": link.text,
                            "parent_page": str(child_page["page_filename"]),
                            "details": {
                                "date": child_page["date"],
                                "location": child_page["location"],
                                "name": child_page["name"],
                                "incident_type": child_page["incident_type"],
                            },
                        }
                        metadata.append(payload)
                    elif "gallery" in link_href:
                        gallery_id = link_href.split("=")[-1]
                        # NOTE: the AN (album name) parameter appears hardcoded to one
                        # case's album; the slideshow is selected by the AID value
                        gallery_link = f"https://www.lakesheriff.com/SlideShow.aspx?AID={gallery_id}&AN=Sheriff%20-%20Use%20of%20Force%20-%20Case%2014110123"
                        self.params["url"] = gallery_link
                        images_file_name = (
                            f"{self.agency_slug}/images_{gallery_id}.html"
                        )
                        self.cache.download(
                            images_file_name, self.zenrows_api_url, params=self.params
                        )
                        html = self.cache.read(images_file_name)
                        soup = BeautifulSoup(html, "html.parser")
                        body = soup.find("div", class_="slides")
                        a_tags = body.find_all("a")
                        for a_tag in a_tags:
                            img_tag = a_tag.find("img")
                            # Get the 'src' and 'alt' attributes
                            image_src = img_tag.get("src")
                            image_alt = img_tag.get("alt")
                            payload = {
                                "asset_url": f"https://www.lakesheriff.com{image_src}",
                                "case_id": child_page["case_number"],
                                "name": image_alt,
                                "title": link.text,
                                "parent_page": str(child_page["page_filename"]),
                                "details": {
                                    "date": child_page["date"],
                                    "location": child_page["location"],
                                    "name": child_page["name"],
                                    "incident_type": child_page["incident_type"],
                                },
                            }
                            metadata.append(payload)

        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile
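
A minimal usage sketch of the scraper above, assuming the repo's package layout and a valid `ZENROWS_KEY` in `env/.env`; the throttle value is an arbitrary example:

```python
# Usage sketch: run the Lake County metadata scrape directly (assumes
# ZENROWS_KEY is set in env/.env, which the class loads via python-dotenv)
from clean.ca.lake_county_sheriff import Site

site = Site()
outfile = site.scrape_meta(throttle=2)  # pause 2 seconds between requests
print(outfile)  # path to ca_lake_county_sheriff.json in the data directory
```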

clean/ca/san_francisco_pc.py

Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
import re
import time
from pathlib import Path
from typing import List
from urllib.parse import parse_qs, urlparse

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
from ..utils import MetadataDict


class Site:
    """Scrape file metadata for the San Francisco Police Commission."""

    name = "San Francisco Police Commission"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance."""
        self.base_url = "https://www.sf.gov"
        self.disclosure_url = f"{self.base_url}/resource/2022/records-released-pursuant-ca-penal-code-ss-8327"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # e.g., ca_san_francisco_pc

    def scrape_meta(self, throttle: int = 0) -> Path:
        """
        Gather metadata on downloadable files by following a two-step process.

        1. Extract links from main pages.
        2. Extract metadata from detail pages.

        Args:
            throttle (int): Number of seconds to wait between requests. Defaults to 0.

        Returns:
            Path: Local path of JSON file containing metadata.
        """
        # Step 1: Extract links from main pages
        main_links = self.get_main_page_links()

        # Step 2: Extract metadata from detail pages
        metadata = self.get_detail_page_links(main_links, throttle)

        # Write metadata to a JSON file
        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)

        return outfile

    def get_main_page_links(self) -> List[str]:
        """
        Retrieve links from the main page of the site.

        Returns:
            List[str]: A list of URLs for detail pages.
        """
        main_links = []

        cache_path = self._download_index_page(self.disclosure_url)
        html = self.cache.read(cache_path)
        soup = BeautifulSoup(html, "html.parser")

        for link in soup.find_all("a", href=True):
            if "RequestArchiveDetails" in link["href"]:
                main_links.append(
                    f"{self.base_url}/{link['href']}"
                    if not link["href"].startswith("http")
                    else link["href"]
                )

        return main_links

    def get_detail_page_links(
        self, main_links: List[str], throttle: int = 0
    ) -> List[MetadataDict]:
        """
        Extract detailed metadata from links on the main pages.

        Args:
            main_links (List[str]): A list of main page URLs.
            throttle (int): Number of seconds to wait between requests.

        Returns:
            List[MetadataDict]: A list of metadata dictionaries for downloadable resources.
        """
        metadata = []

        # Define a regex pattern to match input ids with the format 'rptAttachments_ctlXX_hdnAzureURL'
        id_pattern = re.compile(r"^rptAttachments_ctl\d+_hdnAzureURL$")

        for link in main_links:
            cache_path = self._download_index_page(link)
            html = self.cache.read(cache_path)
            soup = BeautifulSoup(html, "html.parser")

            # Extract the case_id from the reference number paragraph (<p>) tag
            case_id_tag = soup.find(
                "p", style="font-weight: 400; max-width: 75%; font-size: 0.875rem"
            )
            case_id = case_id_tag.text.strip() if case_id_tag else None

            # Ensure case_id is always a string
            case_id = str(case_id) if case_id else ""

            # Find all input tags where the id matches the pattern
            input_tags = soup.find_all("input", id=id_pattern)

            # Process each matching input tag
            for input_tag in input_tags:
                value = input_tag.get("value")
                if isinstance(value, str):
                    full_url = value.strip()
                    if full_url:
                        # Check if the URL starts with the base domain
                        if full_url.startswith(
                            "https://1sanfranciscopd.blob.core.usgovcloudapi.net/"
                        ):
                            asset_url = full_url
                        else:
                            asset_url = (
                                "https://1sanfranciscopd.blob.core.usgovcloudapi.net/"
                                + full_url.lstrip("/")
                            )

                        # Parse the URL and extract the filename from the query string
                        parsed_url = urlparse(asset_url)
                        query_params = parse_qs(parsed_url.query)

                        # Get the filename from the 'rscd' parameter
                        filename = query_params.get("rscd", [None])[0]

                        if filename:
                            # Extract the filename after the 'filename=' part
                            filename = filename.split("filename=")[-1]

                            # Generate a title by removing underscores and the .pdf extension
                            title = filename.replace("_", " ").replace(".pdf", "")
                        else:
                            # Fall back to the last path segment if 'rscd' is missing
                            filename = asset_url.split("?")[0].rsplit("/", 1)[-1]
                            title = filename.replace("_", " ").replace(".pdf", "")

                        # Use the filename as 'name'
                        name = (
                            filename
                            if filename
                            else asset_url.split("?")[0].rsplit("/", 1)[-1]
                        )

                        payload: MetadataDict = {
                            "asset_url": asset_url,
                            "case_id": case_id,  # Reference No. as it appears on the website
                            "name": name,
                            "title": title,  # Use the formatted title here
                            "parent_page": link,
                        }
                        metadata.append(payload)

            time.sleep(throttle)

        return metadata

    def _download_index_page(self, page_url: str) -> Path:
        """
        Download an index page of disclosures.

        These cover officer-involved shootings; use of force resulting in great
        bodily injury or death; and sustained complaints of sexual assault,
        dishonesty, excessive force, biased conduct, unlawful search or arrest,
        and failing to intervene against another officer using excessive force.

        Index pages link to child pages containing PDFs.

        Returns:
            Local path of downloaded file
        """
        split_url = page_url.split("/")
        # Create a unique filename from parts of the URL, combining the
        # directory and filename, with _index appended
        file_stem = f"{split_url[-4]}_{split_url[-1]}_index"
        # Download the content from page_url and store it locally under file_stem
        cache_path = self.cache.download(file_stem, page_url, "utf-8")
        return cache_path
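
As a sketch of what this scraper emits, each record written to the JSON file follows the `MetadataDict` shape assembled in `get_detail_page_links`; every value below is invented for illustration:

```python
# Hypothetical example of one metadata record; all values are invented
example_record = {
    "asset_url": "https://1sanfranciscopd.blob.core.usgovcloudapi.net/container/doc?rscd=attachment;filename=Incident_Report.pdf",
    "case_id": "Reference No. 12-345",  # as scraped from the detail page
    "name": "Incident_Report.pdf",      # filename from the 'rscd' query parameter
    "title": "Incident Report",         # underscores and .pdf extension stripped
    "parent_page": "https://www.sf.gov/.../RequestArchiveDetails?...",  # detail page URL
}
```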

docs/contributing.md

Lines changed: 9 additions & 3 deletions
@@ -116,8 +116,8 @@ When coding a new scraper, there are a few important conventions to follow:
 - If it's a new state folder, add an empty `__init__.py` to the folder
 - Create a `Site` class inside the agency's scraper module with the following attributes/methods:
   - `name` - Official name of the agency
-  - `scrape_meta` - generates a CSV with metadata about videos and other available files (file name, URL, and size at minimum)
-  - `scrape` - uses the CSV generated by `scrape_meta` to download videos and other files
+  - `scrape_meta` - generates a JSON file with metadata about videos and other available files (file name and URL at a minimum)
+  - `download_agency` - uses the JSON file generated by `scrape_meta` to download videos and other files

 Below is a pared down version of San Diego's [Site](https://github.com/biglocalnews/clean-scraper/blob/main/clean/ca/san_diego_pd.py) class to illustrate these conventions.

@@ -285,6 +285,7 @@
 Commands:
   list             List all available agencies and their slugs.
   scrape-meta      Command-line interface for generating metadata CSV about...
+  download_agency  Downloads assets retrieved in scrape-meta
 ```

 Running a state is as simple as passing arguments to the appropriate subcommand.

@@ -299,7 +300,7 @@ pipenv run python -m clean.cli list
 pipenv run python -m clean.cli scrape-meta ca_san_diego_pd

 # Trigger file downloads using agency slug
-pipenv run python -m clean.cli scrape ca_san_diego_pd
+pipenv run python -m clean.cli download_agency ca_san_diego_pd
 ```

 For more verbose logging, you can ask the system to show debugging information.

@@ -365,3 +366,8 @@ git push origin your-branch-name
 The final step is to submit a [pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) to the main repository, asking the maintainers to consider integrating your patch.

 GitHub has [a short guide](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) that can walk you through the process. You should tag your issue number in the request so that it gets linked in GitHub's system.
+
+## Zen Rows Use
+
+Some sites use the ZenRows API. Click [here](https://app.zenrows.com/login) to sign up for an account and get a `ZENROWS_KEY`, which you can add to your `.env` file.
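
A minimal sketch of verifying that setup, mirroring how the Lake County scraper above loads the key; the `env/.env` path and placeholder value are assumptions taken from that scraper:

```python
# Sanity check that the ZenRows key loads; assumes env/.env contains a line
# like ZENROWS_KEY=your_key_here (placeholder value)
import os

from dotenv import load_dotenv

load_dotenv(dotenv_path="env/.env")
assert os.getenv("ZENROWS_KEY"), "ZENROWS_KEY missing from env/.env"
```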
