Skip to content

Commit 7bf34b2

Browse files
authored
Merge branch 'dev' into ca_ventura_sheriff
2 parents a3aa521 + b857661 commit 7bf34b2

19 files changed

Lines changed: 2826 additions & 1346 deletions

CONTRIBUTORS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
11
* Ocheze Amuzie ([@ochezems](https://github.com/ochezems))
2+
* Irene Casado Sánchez ([@irenecasado](https://github.com/irenecasado))
3+
* Dilcia Mercedes ([@dilcia19](https://github.com/dilcia19))
4+
* Lisa Pickoff-White ([@pickoffwhite](https://github.com/pickoffwhite))
5+
* Gerald Rich ([@newsroomdev](https://github.com/newsroomdev))
26
* Jordan Rynning ([@jrynning](https://github.com/jrynning))
7+
* Tarak Shah ([@tarakc02](https://github.com/tarakc02))
8+
* Nauman Sharif ([@naumansharifwork](https://github.com/naumansharifwork))
9+
* Mike Stucka ([@stucka](https://github.com/stucka))
310
* Serdar Tumgoren ([@zstumgoren](https://github.com/zstumgoren))

Pipfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,13 @@ pdfplumber = "*"
3636
tenacity = "*"
3737
click = "*"
3838
retry = "*"
39-
urllib3 = "1.26.18" # pegged to avoid test issue
39+
urllib3 = "1.26.19" # pegged to avoid test issue
4040
typing-extensions = "*"
4141
us = "*"
4242
pytube = "*"
43-
clean-scraper = {file = ".", editable = true}
4443
python-dotenv = "*"
4544
yt-dlp = "*"
45+
playwright = "*"
4646

4747
[pipenv]
4848
allow_prereleases = false

Pipfile.lock

Lines changed: 1198 additions & 1312 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clean/ca/config/palm_springs_pd.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Browser-impersonating request headers for fetching the agency index page.
# Mimics Chrome 127 on Windows (sec-ch-ua / sec-fetch-* client hints) —
# presumably so the site serves the normal HTML listing rather than
# blocking a non-browser client. NOTE(review): header values must be kept
# in sync as browser versions rotate; verify against the scraper that
# consumes them.
index_request_headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
    "priority": "u=0, i",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
}

clean/ca/fullerton_pd.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
import re
2+
import time
3+
from pathlib import Path
4+
5+
from .. import utils
6+
from ..cache import Cache
7+
8+
9+
class Site:
    """Scrape file metadata and download files for the fullerton_pd.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Fullerton Police Department"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = (
            "https://portal.laserfiche.com/Portal/Browse.aspx?id=726681&repo=r-3261686e"
        )
        self.folder_url = "https://portal.laserfiche.com/Portal/FolderListingService.aspx/GetFolderListing2"
        # Template POST body for the Laserfiche folder-listing endpoint.
        # Treated as read-only: per-request copies are made by
        # _folder_request_body_for so recursive calls cannot clobber each
        # other's folderId (the original mutated this dict in place).
        self.folder_request_body = {
            "repoName": "r-3261686e",
            "folderId": 726681,
            "getNewListing": True,
            "start": 0,
            "end": 36,
            "sortColumn": "",
            "sortAscending": True,
        }
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_fullerton

    def _folder_request_body_for(self, folder_id) -> dict:
        """Return a copy of the folder-listing POST body targeting *folder_id*."""
        body = dict(self.folder_request_body)
        body["folderId"] = folder_id
        return body

    def _download_folder_json(self, folder_id, output_json) -> None:
        """POST to the folder-listing API for *folder_id* and cache the JSON response."""
        with utils.post_url(
            self.folder_url, json=self._folder_request_body_for(folder_id)
        ) as r:
            self.cache.write_json(output_json, r.json())

    def _make_payload(self, result, title, parent_page, case_id) -> dict:
        """Build one asset-metadata payload for a file entry (type == -2).

        Documents without a mediaHandlerUrl (PDFs) are linked through
        DocView.aspx; media files (image/video/audio) are linked through
        their media-handler URL with the escaped ampersands restored.
        """
        if result.get("mediaHandlerUrl") is None:
            asset_url = f"https://portal.laserfiche.com/Portal/DocView.aspx?id={result.get('entryId')}&repo=r-3261686e"
        else:
            asset_url = f'https://portal.laserfiche.com/Portal/{result.get("mediaHandlerUrl").replace("/u0026", "&")}'
        return {
            "title": title,
            "parent_page": str(parent_page),
            # BUGFIX: the original omitted case_id for media files found at the
            # top level; every asset now carries it, matching the child-folder
            # payloads.
            "case_id": case_id,
            "asset_url": asset_url,
            "name": result.get("name"),
            "details": {"extension": result.get("extension", None)},
        }

    def scrape_meta(self, throttle=0):
        """Download folder listings and write the asset-metadata index.

        Args:
            throttle (int): Seconds to wait between folder-listing requests

        Returns:
            Path: JSON file of asset metadata written under data_dir
        """
        # Cache the top-level index listing (ca_fullerton/SB_1421.json).
        base_name = "SB_1421.json"
        filename = f"{self.agency_slug}/{base_name}"
        base_output_json = self.cache_dir.joinpath(filename)
        base_output_json.parent.mkdir(parents=True, exist_ok=True)
        self._download_folder_json(
            self.folder_request_body["folderId"], base_output_json
        )

        base_json = self.cache.read_json(base_output_json)
        results = base_json.get("data", {}).get("results", [])

        # Download one JSON listing per top-level case folder on the index page.
        local_index_json = []
        for result in results:
            filename = f"{self.agency_slug}/{result.get('name')}.json"
            output_json = self.cache_dir.joinpath(filename)
            self._download_folder_json(result.get("entryId"), output_json)
            local_index_json.append({"fileName": filename, "filePath": output_json})
            time.sleep(throttle)

        # Walk each cached folder listing and collect file metadata,
        # recursing into nested folders.
        metadata = []
        for download_json_path in local_index_json:
            download_dict = self.cache.read_json(download_json_path["filePath"])
            results = download_dict.get("data", {}).get("results", [])
            title = download_dict.get("data", {}).get("name", "")
            case_id = self._get_case_id(title)
            for result in results:
                if result.get("type") == -2:  # file entry (PDF or media)
                    metadata.append(
                        self._make_payload(
                            result, title, download_json_path["fileName"], case_id
                        )
                    )
                elif result.get("type") == 0:  # sub-folder: recurse
                    metadata.extend(
                        self._get_child_pages(
                            result, download_json_path["fileName"], title
                        )
                    )

        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile

    def _get_child_pages(self, result, parent_path, parent_title):
        """Recursively collect asset metadata from a nested folder entry.

        Args:
            result (dict): Folder entry (type == 0) from a parent listing
            parent_path: Cache-relative path of the parent listing JSON
            parent_title (str): Title of the top-level case folder

        Returns:
            list[dict]: Asset payloads for all files in this subtree
        """
        child_metadata = []
        filename = f"{str(parent_path).split('.json')[0]}/{result.get('name')}.json"
        output_json = self.cache_dir.joinpath(filename)
        self._download_folder_json(result.get("entryId"), output_json)

        download_dict = self.cache.read_json(output_json)
        results = download_dict.get("data", {}).get("results", [])
        case_id = self._get_case_id(parent_title)
        for child in results:
            if child.get("type") == -2:  # file entry (PDF or media)
                child_metadata.append(
                    self._make_payload(child, parent_title, filename, case_id)
                )
            elif child.get("type") == 0:
                # NOTE(review): the original recursed on *any* non-file type
                # here; narrowed to folders (type == 0) for consistency with
                # scrape_meta — confirm no other entry types need handling.
                child_metadata.extend(
                    self._get_child_pages(child, filename, parent_title)
                )
        return child_metadata

    def _get_case_id(self, title):
        """Extract an FPD#/FN# case identifier from *title*.

        Falls back to returning the full title when no case number matches.
        """
        case_id_pattern = r"\b(FPD# \d{2,5}-\d{3,5}|FN# \d{2}-\d{4})\b"
        case_ids = re.findall(case_id_pattern, title)
        if case_ids:
            return case_ids[0]
        return title

0 commit comments

Comments
 (0)