Skip to content

Commit 7bf34b2

Browse files
authored
Merge branch 'dev' into ca_ventura_sheriff
2 parents a3aa521 + b857661 commit 7bf34b2

19 files changed

Lines changed: 2826 additions & 1346 deletions

CONTRIBUTORS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
11
* Ocheze Amuzie ([@ochezems](https://github.com/ochezems))
2+
* Irene Casado Sánchez ([@irenecasado](https://github.com/irenecasado))
3+
* Dilcia Mercedes ([@dilcia19](https://github.com/dilcia19))
4+
* Lisa Pickoff-White ([@pickoffwhite](https://github.com/pickoffwhite))
5+
* Gerald Rich ([@newsroomdev](https://github.com/newsroomdev))
26
* Jordan Rynning ([@jrynning](https://github.com/jrynning))
7+
* Tarak Shah ([@tarakc02](https://github.com/tarakc02))
8+
* Nauman Sharif ([@naumansharifwork](https://github.com/naumansharifwork))
9+
* Mike Stucka ([@stucka](https://github.com/stucka))
310
* Serdar Tumgoren ([@zstumgoren](https://github.com/zstumgoren))

Pipfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,13 @@ pdfplumber = "*"
3636
tenacity = "*"
3737
click = "*"
3838
retry = "*"
39-
urllib3 = "1.26.18" # pegged to avoid test issue
39+
urllib3 = "1.26.19" # pegged to avoid test issue
4040
typing-extensions = "*"
4141
us = "*"
4242
pytube = "*"
43-
clean-scraper = {file = ".", editable = true}
4443
python-dotenv = "*"
4544
yt-dlp = "*"
45+
playwright = "*"
4646

4747
[pipenv]
4848
allow_prereleases = false

Pipfile.lock

Lines changed: 1198 additions & 1312 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

clean/ca/config/palm_springs_pd.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Browser-impersonating request headers for fetching the agency index page.
# Mimics Chrome 127 on Windows (sec-ch-ua / sec-fetch-* client hints) —
# presumably so the site serves the normal HTML listing rather than
# blocking a non-browser client. NOTE(review): header values must be kept
# in sync as browser versions rotate; verify against the scraper that
# consumes them.
index_request_headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
    "priority": "u=0, i",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
}

clean/ca/fullerton_pd.py

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
import re
2+
import time
3+
from pathlib import Path
4+
5+
from .. import utils
6+
from ..cache import Cache
7+
8+
9+
class Site:
    """Scrape file metadata and download files for the fullerton_pd.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Fullerton Police Department"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = (
            "https://portal.laserfiche.com/Portal/Browse.aspx?id=726681&repo=r-3261686e"
        )
        self.folder_url = "https://portal.laserfiche.com/Portal/FolderListingService.aspx/GetFolderListing2"
        # Template POST body for the Laserfiche folder-listing endpoint.
        # Treated as read-only: per-request copies are made by
        # _folder_request_body_for so recursive calls cannot clobber each
        # other's folderId (the original mutated this dict in place).
        self.folder_request_body = {
            "repoName": "r-3261686e",
            "folderId": 726681,
            "getNewListing": True,
            "start": 0,
            "end": 36,
            "sortColumn": "",
            "sortAscending": True,
        }
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_fullerton

    def _folder_request_body_for(self, folder_id) -> dict:
        """Return a copy of the folder-listing POST body targeting *folder_id*."""
        body = dict(self.folder_request_body)
        body["folderId"] = folder_id
        return body

    def _download_folder_json(self, folder_id, output_json) -> None:
        """POST to the folder-listing API for *folder_id* and cache the JSON response."""
        with utils.post_url(
            self.folder_url, json=self._folder_request_body_for(folder_id)
        ) as r:
            self.cache.write_json(output_json, r.json())

    def _make_payload(self, result, title, parent_page, case_id) -> dict:
        """Build one asset-metadata payload for a file entry (type == -2).

        Documents without a mediaHandlerUrl (PDFs) are linked through
        DocView.aspx; media files (image/video/audio) are linked through
        their media-handler URL with the escaped ampersands restored.
        """
        if result.get("mediaHandlerUrl") is None:
            asset_url = f"https://portal.laserfiche.com/Portal/DocView.aspx?id={result.get('entryId')}&repo=r-3261686e"
        else:
            asset_url = f'https://portal.laserfiche.com/Portal/{result.get("mediaHandlerUrl").replace("/u0026", "&")}'
        return {
            "title": title,
            "parent_page": str(parent_page),
            # BUGFIX: the original omitted case_id for media files found at the
            # top level; every asset now carries it, matching the child-folder
            # payloads.
            "case_id": case_id,
            "asset_url": asset_url,
            "name": result.get("name"),
            "details": {"extension": result.get("extension", None)},
        }

    def scrape_meta(self, throttle=0):
        """Download folder listings and write the asset-metadata index.

        Args:
            throttle (int): Seconds to wait between folder-listing requests

        Returns:
            Path: JSON file of asset metadata written under data_dir
        """
        # Cache the top-level index listing (ca_fullerton/SB_1421.json).
        base_name = "SB_1421.json"
        filename = f"{self.agency_slug}/{base_name}"
        base_output_json = self.cache_dir.joinpath(filename)
        base_output_json.parent.mkdir(parents=True, exist_ok=True)
        self._download_folder_json(
            self.folder_request_body["folderId"], base_output_json
        )

        base_json = self.cache.read_json(base_output_json)
        results = base_json.get("data", {}).get("results", [])

        # Download one JSON listing per top-level case folder on the index page.
        local_index_json = []
        for result in results:
            filename = f"{self.agency_slug}/{result.get('name')}.json"
            output_json = self.cache_dir.joinpath(filename)
            self._download_folder_json(result.get("entryId"), output_json)
            local_index_json.append({"fileName": filename, "filePath": output_json})
            time.sleep(throttle)

        # Walk each cached folder listing and collect file metadata,
        # recursing into nested folders.
        metadata = []
        for download_json_path in local_index_json:
            download_dict = self.cache.read_json(download_json_path["filePath"])
            results = download_dict.get("data", {}).get("results", [])
            title = download_dict.get("data", {}).get("name", "")
            case_id = self._get_case_id(title)
            for result in results:
                if result.get("type") == -2:  # file entry (PDF or media)
                    metadata.append(
                        self._make_payload(
                            result, title, download_json_path["fileName"], case_id
                        )
                    )
                elif result.get("type") == 0:  # sub-folder: recurse
                    metadata.extend(
                        self._get_child_pages(
                            result, download_json_path["fileName"], title
                        )
                    )

        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile

    def _get_child_pages(self, result, parent_path, parent_title):
        """Recursively collect asset metadata from a nested folder entry.

        Args:
            result (dict): Folder entry (type == 0) from a parent listing
            parent_path: Cache-relative path of the parent listing JSON
            parent_title (str): Title of the top-level case folder

        Returns:
            list[dict]: Asset payloads for all files in this subtree
        """
        child_metadata = []
        filename = f"{str(parent_path).split('.json')[0]}/{result.get('name')}.json"
        output_json = self.cache_dir.joinpath(filename)
        self._download_folder_json(result.get("entryId"), output_json)

        download_dict = self.cache.read_json(output_json)
        results = download_dict.get("data", {}).get("results", [])
        case_id = self._get_case_id(parent_title)
        for child in results:
            if child.get("type") == -2:  # file entry (PDF or media)
                child_metadata.append(
                    self._make_payload(child, parent_title, filename, case_id)
                )
            elif child.get("type") == 0:
                # NOTE(review): the original recursed on *any* non-file type
                # here; narrowed to folders (type == 0) for consistency with
                # scrape_meta — confirm no other entry types need handling.
                child_metadata.extend(
                    self._get_child_pages(child, filename, parent_title)
                )
        return child_metadata

    def _get_case_id(self, title):
        """Extract an FPD#/FN# case identifier from *title*.

        Falls back to returning the full title when no case number matches.
        """
        case_id_pattern = r"\b(FPD# \d{2,5}-\d{3,5}|FN# \d{2}-\d{4})\b"
        case_ids = re.findall(case_id_pattern, title)
        if case_ids:
            return case_ids[0]
        return title

0 commit comments

Comments
 (0)