Commit d89c4d0

Merge branch 'dev' into ca_ventura_sheriff
2 parents 7bf34b2 + 791d39d commit d89c4d0

6 files changed

Lines changed: 382 additions & 8 deletions

clean/ca/lake_county_sheriff.py

Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
import os
import time
from pathlib import Path

from bs4 import BeautifulSoup
from dotenv import load_dotenv

from .. import utils
from ..cache import Cache


class Site:
    """Scrape file metadata and download files for the Lake County Sheriff.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Lake County Sheriff"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = "https://www.lakesheriff.com/969/Use-of-Force"
        self.zenrows_api_url = "https://api.zenrows.com/v1/"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)
        dotenv_path = "env/.env"
        load_dotenv(dotenv_path=dotenv_path)
        self.params = {
            "apikey": os.getenv("ZENROWS_KEY"),
            "url": "",  # Target website URL
            # Add any other ZenRows parameters here (optional)
        }

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_lake_county_sheriff

    def scrape_meta(self, throttle=0):
        # Construct a local filename relative to the cache directory, combining
        # the agency slug and the page name (ca_lake_county_sheriff/Use-of-Force.html),
        # then download the index page if it's not already cached
        base_name = f"{self.base_url.split('/')[-1]}.html"
        filename = f"{self.agency_slug}/{base_name}"
        self.params["url"] = self.base_url
        self.cache.download(filename, self.zenrows_api_url, params=self.params)
        metadata = []
        child_pages = []
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find("table", class_="fr-alternate-rows")
        child_links = body.find_all("a")
        # Pull incident details from each row of the use-of-force table and
        # cache the linked case detail page
        for link in child_links:
            tr_tag = link.find_parent("tr")
            td_tag = tr_tag.find_all("td")
            child_page_data = dict()
            child_page_data["date"] = td_tag[0].text
            child_page_data["location"] = td_tag[1].get_text(separator=", ")
            child_page_data["name"] = td_tag[2].text
            child_page_data["incident_type"] = td_tag[3].abbr.text
            child_page_data["case_number"] = link.text
            child_file_name = (
                f'{self.agency_slug}/{child_page_data["case_number"]}.html'
            )
            if link["href"]:
                link_url = f"https://www.lakesheriff.com{link['href']}"
                self.params["url"] = link_url
                self.cache.download(
                    child_file_name, self.zenrows_api_url, params=self.params
                )
                child_page_data["page_filename"] = child_file_name
                child_pages.append(child_page_data)
            time.sleep(throttle)
        # Walk each cached case page and collect YouTube videos, documents and
        # photo galleries as asset metadata
        for child_page in child_pages:
            html = self.cache.read(child_page["page_filename"])
            soup = BeautifulSoup(html, "html.parser")
            body = soup.find(attrs={"data-cprole": "mainContentContainer"})
            links = body.find_all("a")
            for link in links:
                link_href = link.get("href", None)
                if link_href:
                    if "youtu" in link_href:
                        payload = {
                            "asset_url": link_href,
                            "case_id": child_page["case_number"],
                            "name": link.text,
                            "title": link.text,
                            "parent_page": str(child_page["page_filename"]),
                            "details": {
                                "date": child_page["date"],
                                "location": child_page["location"],
                                "name": child_page["name"],
                                "incident_type": child_page["incident_type"],
                            },
                        }
                        metadata.append(payload)
                    elif "DocumentCenter" in link_href:
                        payload = {
                            "asset_url": f"https://www.lakesheriff.com{link_href}",
                            "case_id": child_page["case_number"],
                            "name": link.text,
                            "title": link.text,
                            "parent_page": str(child_page["page_filename"]),
                            "details": {
                                "date": child_page["date"],
                                "location": child_page["location"],
                                "name": child_page["name"],
                                "incident_type": child_page["incident_type"],
                            },
                        }
                        metadata.append(payload)
                    elif "gallery" in link_href:
                        gallery_id = link_href.split("=")[-1]
                        # NOTE: the AN (album name) parameter appears hardcoded to one
                        # case's album; the slideshow is selected by the AID value
                        gallery_link = f"https://www.lakesheriff.com/SlideShow.aspx?AID={gallery_id}&AN=Sheriff%20-%20Use%20of%20Force%20-%20Case%2014110123"
                        self.params["url"] = gallery_link
                        images_file_name = (
                            f"{self.agency_slug}/images_{gallery_id}.html"
                        )
                        self.cache.download(
                            images_file_name, self.zenrows_api_url, params=self.params
                        )
                        html = self.cache.read(images_file_name)
                        soup = BeautifulSoup(html, "html.parser")
                        body = soup.find("div", class_="slides")
                        a_tags = body.find_all("a")
                        for a_tag in a_tags:
                            img_tag = a_tag.find("img")
                            # Get the 'src' and 'alt' attributes
                            image_src = img_tag.get("src")
                            image_alt = img_tag.get("alt")
                            payload = {
                                "asset_url": f"https://www.lakesheriff.com{image_src}",
                                "case_id": child_page["case_number"],
                                "name": image_alt,
                                "title": link.text,
                                "parent_page": str(child_page["page_filename"]),
                                "details": {
                                    "date": child_page["date"],
                                    "location": child_page["location"],
                                    "name": child_page["name"],
                                    "incident_type": child_page["incident_type"],
                                },
                            }
                            metadata.append(payload)

        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile
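
A minimal usage sketch of the scraper above, assuming the repo's package layout and a valid `ZENROWS_KEY` in `env/.env`; the throttle value is an arbitrary example:

```python
# Usage sketch: run the Lake County metadata scrape directly (assumes
# ZENROWS_KEY is set in env/.env, which the class loads via python-dotenv)
from clean.ca.lake_county_sheriff import Site

site = Site()
outfile = site.scrape_meta(throttle=2)  # pause 2 seconds between requests
print(outfile)  # path to ca_lake_county_sheriff.json in the data directory
```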

clean/ca/san_francisco_pc.py

Lines changed: 192 additions & 0 deletions
@@ -0,0 +1,192 @@
import re
import time
from pathlib import Path
from typing import List
from urllib.parse import parse_qs, urlparse

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
from ..utils import MetadataDict


class Site:
    """Scrape file metadata for the San Francisco Police Commission."""

    name = "San Francisco Police Commission"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance."""
        self.base_url = "https://www.sf.gov"
        self.disclosure_url = f"{self.base_url}/resource/2022/records-released-pursuant-ca-penal-code-ss-8327"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # e.g., ca_san_francisco_pc

    def scrape_meta(self, throttle: int = 0) -> Path:
        """
        Gather metadata on downloadable files by following a two-step process.

        1. Extract links from main pages.
        2. Extract metadata from detail pages.

        Args:
            throttle (int): Number of seconds to wait between requests. Defaults to 0.

        Returns:
            Path: Local path of JSON file containing metadata.
        """
        # Step 1: Extract links from main pages
        main_links = self.get_main_page_links()

        # Step 2: Extract metadata from detail pages
        metadata = self.get_detail_page_links(main_links, throttle)

        # Write metadata to a JSON file
        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)

        return outfile

    def get_main_page_links(self) -> List[str]:
        """
        Retrieve links from the main page of the site.

        Returns:
            List[str]: A list of URLs for detail pages.
        """
        main_links = []

        cache_path = self._download_index_page(self.disclosure_url)
        html = self.cache.read(cache_path)
        soup = BeautifulSoup(html, "html.parser")

        for link in soup.find_all("a", href=True):
            if "RequestArchiveDetails" in link["href"]:
                main_links.append(
                    f"{self.base_url}/{link['href']}"
                    if not link["href"].startswith("http")
                    else link["href"]
                )

        return main_links

    def get_detail_page_links(
        self, main_links: List[str], throttle: int = 0
    ) -> List[MetadataDict]:
        """
        Extract detailed metadata from links on the main pages.

        Args:
            main_links (List[str]): A list of main page URLs.
            throttle (int): Number of seconds to wait between requests.

        Returns:
            List[MetadataDict]: A list of metadata dictionaries for downloadable resources.
        """
        metadata = []

        # Define a regex pattern to match input ids with the format 'rptAttachments_ctlXX_hdnAzureURL'
        id_pattern = re.compile(r"^rptAttachments_ctl\d+_hdnAzureURL$")

        for link in main_links:
            cache_path = self._download_index_page(link)
            html = self.cache.read(cache_path)
            soup = BeautifulSoup(html, "html.parser")

            # Extract the case_id from the reference number paragraph (<p>) tag
            case_id_tag = soup.find(
                "p", style="font-weight: 400; max-width: 75%; font-size: 0.875rem"
            )
            case_id = case_id_tag.text.strip() if case_id_tag else None

            # Ensure case_id is always a string
            case_id = str(case_id) if case_id else ""

            # Find all input tags where the id matches the pattern
            input_tags = soup.find_all("input", id=id_pattern)

            # Process each matching input tag
            for input_tag in input_tags:
                value = input_tag.get("value")
                if isinstance(value, str):
                    full_url = value.strip()
                    if full_url:
                        # Check if the URL starts with the base domain
                        if full_url.startswith(
                            "https://1sanfranciscopd.blob.core.usgovcloudapi.net/"
                        ):
                            asset_url = full_url
                        else:
                            asset_url = (
                                "https://1sanfranciscopd.blob.core.usgovcloudapi.net/"
                                + full_url.lstrip("/")
                            )

                        # Parse the URL and extract the filename from the query string
                        parsed_url = urlparse(asset_url)
                        query_params = parse_qs(parsed_url.query)

                        # Get the filename from the 'rscd' parameter
                        filename = query_params.get("rscd", [None])[0]

                        if filename:
                            # Extract the filename after the 'filename=' part
                            filename = filename.split("filename=")[-1]

                            # Generate a title by removing underscores and the .pdf extension
                            title = filename.replace("_", " ").replace(".pdf", "")
                        else:
                            # Fall back to the last path segment if 'rscd' is missing
                            filename = asset_url.split("?")[0].rsplit("/", 1)[-1]
                            title = filename.replace("_", " ").replace(".pdf", "")

                        # Use the filename as 'name'
                        name = (
                            filename
                            if filename
                            else asset_url.split("?")[0].rsplit("/", 1)[-1]
                        )

                        payload: MetadataDict = {
                            "asset_url": asset_url,
                            "case_id": case_id,  # Reference No. as it appears on the website
                            "name": name,
                            "title": title,  # Use the formatted title here
                            "parent_page": link,
                        }
                        metadata.append(payload)

            time.sleep(throttle)

        return metadata

    def _download_index_page(self, page_url: str) -> Path:
        """
        Download an index page of disclosures.

        These cover officer-involved shootings; use of force resulting in great
        bodily injury or death; and sustained complaints of sexual assault,
        dishonesty, excessive force, biased conduct, unlawful search or arrest,
        and failing to intervene against another officer using excessive force.

        Index pages link to child pages containing PDFs.

        Returns:
            Local path of downloaded file
        """
        split_url = page_url.split("/")
        # Create a unique filename from parts of the URL, combining the
        # directory and filename, with _index appended
        file_stem = f"{split_url[-4]}_{split_url[-1]}_index"
        # Download the content from page_url and store it locally under file_stem
        cache_path = self.cache.download(file_stem, page_url, "utf-8")
        return cache_path
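
As a sketch of what this scraper emits, each record written to the JSON file follows the `MetadataDict` shape assembled in `get_detail_page_links`; every value below is invented for illustration:

```python
# Hypothetical example of one metadata record; all values are invented
example_record = {
    "asset_url": "https://1sanfranciscopd.blob.core.usgovcloudapi.net/container/doc?rscd=attachment;filename=Incident_Report.pdf",
    "case_id": "Reference No. 12-345",  # as scraped from the detail page
    "name": "Incident_Report.pdf",      # filename from the 'rscd' query parameter
    "title": "Incident Report",         # underscores and .pdf extension stripped
    "parent_page": "https://www.sf.gov/.../RequestArchiveDetails?...",  # detail page URL
}
```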

docs/contributing.md

Lines changed: 9 additions & 3 deletions
@@ -116,8 +116,8 @@ When coding a new scraper, there are a few important conventions to follow:
 - If it's a new state folder, add an empty `__init__.py` to the folder
 - Create a `Site` class inside the agency's scraper module with the following attributes/methods:
   - `name` - Official name of the agency
-  - `scrape_meta` - generates a CSV with metadata about videos and other available files (file name, URL, and size at minimum)
-  - `scrape` - uses the CSV generated by `scrape_meta` to download videos and other files
+  - `scrape_meta` - generates a JSON file with metadata about videos and other available files (file name and URL at a minimum)
+  - `download_agency` - uses the JSON file generated by `scrape_meta` to download videos and other files

 Below is a pared down version of San Diego's [Site](https://github.com/biglocalnews/clean-scraper/blob/main/clean/ca/san_diego_pd.py) class to illustrate these conventions.

@@ -285,6 +285,7 @@
 Commands:
   list             List all available agencies and their slugs.
   scrape-meta      Command-line interface for generating metadata CSV about...
+  download_agency  Downloads assets retrieved in scrape-meta
 ```

 Running a state is as simple as passing arguments to the appropriate subcommand.

@@ -299,7 +300,7 @@ pipenv run python -m clean.cli list
 pipenv run python -m clean.cli scrape-meta ca_san_diego_pd

 # Trigger file downloads using agency slug
-pipenv run python -m clean.cli scrape ca_san_diego_pd
+pipenv run python -m clean.cli download_agency ca_san_diego_pd
 ```

 For more verbose logging, you can ask the system to show debugging information.

@@ -365,3 +366,8 @@ git push origin your-branch-name
 The final step is to submit a [pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) to the main repository, asking the maintainers to consider integrating your patch.

 GitHub has [a short guide](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) that can walk you through the process. You should tag your issue number in the request so that it gets linked in GitHub's system.
+
+## Zen Rows Use
+
+Some sites use the ZenRows API. Click [here](https://app.zenrows.com/login) to sign up for an account and get a `ZENROWS_KEY`, which you can add to your `.env` file.
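
A minimal sketch of verifying that setup, mirroring how the Lake County scraper above loads the key; the `env/.env` path and placeholder value are assumptions taken from that scraper:

```python
# Sanity check that the ZenRows key loads; assumes env/.env contains a line
# like ZENROWS_KEY=your_key_here (placeholder value)
import os

from dotenv import load_dotenv

load_dotenv(dotenv_path="env/.env")
assert os.getenv("ZENROWS_KEY"), "ZENROWS_KEY missing from env/.env"
```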
