|
| 1 | +import json |
| 2 | +import re |
| 3 | +from dataclasses import dataclass |
| 4 | +from typing import Any |
| 5 | + |
| 6 | +from bs4 import BeautifulSoup, Tag |
| 7 | +from httpx import Response |
| 8 | + |
| 9 | +from .constants import SITE_URL |
| 10 | + |
| 11 | + |
@dataclass(frozen=True)
class RecommerceAd:
    """Reference to a recommerce (for-sale) ad, identified by its numeric id."""

    id: int

    @property
    def url(self) -> str:
        """Return the item page URL for this ad, built from ``SITE_URL``."""
        return f"{SITE_URL}/recommerce/forsale/item/{self.id}"

    def parse(self, response: Response) -> Any:
        """Extract the JSON payload embedded in the ad page's hydration script.

        The page is searched for a ``<script>`` element whose text contains
        ``window.__staticRouterHydrationData``; the string literal passed to
        ``JSON.parse("...")`` inside it is then decoded.

        Args:
            response: HTTP response whose ``content`` is the ad page HTML.

        Returns:
            The decoded JSON value, or ``{}`` when the script element or the
            ``JSON.parse("...")`` call cannot be found.

        Raises:
            ValueError: If the literal is found but cannot be decoded
                (``json.JSONDecodeError`` and ``UnicodeError`` are both
                subclasses of ``ValueError``).
        """
        soup = BeautifulSoup(response.content, "html.parser")
        script_tag = soup.select_one(
            'script:-soup-contains("window.__staticRouterHydrationData")'
        )
        if not script_tag:
            return {}

        match = re.search(r'JSON\.parse\("(.+)"\)', script_tag.text)
        if not match:
            return {}
        return self._decode_js_string_json(match.group(1))

    @staticmethod
    def _decode_js_string_json(literal: str) -> Any:
        """Decode JSON that was embedded as a double-quoted JS string literal.

        Args:
            literal: The text between the quotes of ``JSON.parse("...")``,
                still carrying its string-literal escapes (``\\"``, ``\\\\``,
                ``\\uXXXX``, ...).

        Returns:
            The value obtained by un-escaping the literal and parsing the
            result as JSON.
        """
        try:
            # A double-quoted JS string literal is, in the common case, also a
            # valid JSON string: decode the literal first, then the payload.
            # This handles every ``\uXXXX`` escape, including code points
            # above U+00FF and surrogate pairs.
            return json.loads(json.loads(f'"{literal}"'))
        except ValueError:
            # Fallback for JS-only escapes that JSON rejects (``\x41``,
            # ``\'``): the original escape-based round trip. The latin1 step
            # restores the UTF-8 bytes that ``unicode_escape`` decoded one
            # byte per character; it raises UnicodeEncodeError when an escape
            # produced a character above U+00FF.
            unescaped = literal.encode("utf-8").decode("unicode_escape")
            return json.loads(unescaped.encode("latin1").decode("utf-8"))
| 34 | + |
| 35 | + |
@dataclass(frozen=True)
class CarAd:
    """Reference to a car (mobility) ad, identified by its numeric id.

    ``parse`` scrapes the ad page HTML into a flat dictionary. All element
    lookups rely on the page's CSS class strings; where a full class string
    is passed to ``find``/``find_all`` the class order in the string matters.
    """

    id: int

    # Quick-spec labels as they appear on the page -> keys used in the result.
    # Deliberately left unannotated so @dataclass does not treat it as a field.
    _QUICK_SPEC_KEYS = {
        "Modellår": "model_year",
        "Miltal": "mileage",
        "Växellåda": "transmission",
        "Drivmedel": "fuel",
    }

    @property
    def url(self) -> str:
        """Return the item page URL for this ad, built from ``SITE_URL``."""
        return f"{SITE_URL}/mobility/item/{self.id}"

    def parse(self, response: Response) -> dict[str, Any]:
        """Scrape the ad page into a dictionary.

        Args:
            response: HTTP response whose ``content`` is the ad page HTML.

        Returns:
            ``{}`` when the main content grid is not found. Otherwise a dict
            that always contains ``"url"`` and ``"seller_type"`` and, when the
            corresponding elements exist, ``"title"``, ``"subtitle"``, the
            quick-spec keys, ``"price"`` or ``"monthly_cost"``,
            ``"description"``, ``"specifications"``, ``"equipment"`` and
            ``"ad_id"``.
        """
        soup = BeautifulSoup(response.content, "html.parser")
        grid = soup.find("div", class_="grid grid-cols-1 md:grid-cols-3 md:gap-x-32")

        if not grid:
            return {}

        data: dict[str, Any] = {"url": self.url}

        # Extractors that take ``grid`` search only the main content grid;
        # the last two search the whole document.
        self._extract_title_and_subtitle(grid, data)
        self._extract_quick_specs(grid, data)
        self._extract_price(grid, data)
        self._extract_description(grid, data)
        self._extract_specifications(grid, data)
        self._extract_equipment(grid, data)
        self._extract_seller_type(soup, data)
        self._extract_ad_id(soup, data)

        return data

    def _extract_title_and_subtitle(self, grid: Tag, data: dict[str, Any]) -> None:
        """Store ``title`` and ``subtitle`` in *data* when present in *grid*."""
        if title := grid.find("h1", class_=lambda x: x and "t1" in x):
            data["title"] = title.get_text(strip=True)

        if subtitle := grid.find(
            "p", class_=lambda x: x and "s-text-subtle" in x and "mt-8" in x
        ):
            data["subtitle"] = subtitle.get_text(strip=True)

    def _extract_quick_specs(self, grid: Tag, data: dict[str, Any]) -> None:
        """Store the label/value pairs of the quick-spec grid in *data*.

        Labels listed in ``_QUICK_SPEC_KEYS`` are stored under their mapped
        key; any other label is lower-cased with spaces replaced by
        underscores and used as the key directly.
        """
        specs_grid = grid.find(
            "div", class_=lambda x: x and "grid" in x and "gap-24" in x
        )
        if not specs_grid:
            return

        for item in specs_grid.find_all("div", class_="flex gap-16 hyphens-auto"):
            label = item.find("span", class_="s-text-subtle")
            value = item.find("p", class_="m-0 font-bold")
            if not (label and value):
                continue

            label_text = label.get_text(strip=True)
            key = self._QUICK_SPEC_KEYS.get(
                label_text, label_text.lower().replace(" ", "_")
            )
            data[key] = value.get_text(strip=True)

    def _extract_price(self, grid: Tag, data: dict[str, Any]) -> None:
        """Store either ``price`` or ``monthly_cost`` in *data*.

        The labels of the price section are scanned in document order and
        the scan stops as soon as one value has been stored, so at most one
        of the two keys is written.
        """
        price_section = grid.find("div", class_="border-t pt-40 mt-40")
        if not price_section:
            return

        for price_label in price_section.find_all("p", class_="s-text-subtle mb-0"):
            label_text = price_label.get_text(strip=True).lower()
            if "pris" in label_text:
                # The value is looked up in the whole section, not relative
                # to the label that matched.
                price_elem = price_section.find("span", class_="t2")
                if price_elem:
                    data["price"] = price_elem.get_text(strip=True)
                    break
            elif "månadskostnad" in label_text:
                monthly_elem = price_section.find("h2", class_="t2")
                if monthly_elem:
                    data["monthly_cost"] = monthly_elem.get_text(strip=True)
                    break

    def _extract_description(self, grid: Tag, data: dict[str, Any]) -> None:
        """Store the text of the first description section found in *data*."""
        for section in grid.find_all("section", class_="border-t mt-40 pt-40"):
            heading = section.find("h2", class_="t3 mb-0")
            if not (heading and "beskrivning" in heading.get_text(strip=True).lower()):
                continue

            desc_div = section.find("div", class_="whitespace-pre-wrap")
            if desc_div:
                data["description"] = desc_div.get_text(strip=True)
                break

    def _extract_specifications(self, grid: Tag, data: dict[str, Any]) -> None:
        """Store the ``<dt>``/``<dd>`` pairs of the key-info section in *data*.

        The pairs are written as a ``dict[str, str]`` under
        ``"specifications"``; nothing is written when no pair is found.
        """
        specs_section = grid.find("section", class_="key-info-section")
        if not specs_section:
            return

        dl = specs_section.find("dl")
        if not dl:
            return

        specifications: dict[str, str] = {}
        for row in dl.find_all("div", style="break-inside:avoid-column"):
            term = row.find("dt")
            definition = row.find("dd")
            if term and definition:
                specifications[term.get_text(strip=True)] = definition.get_text(
                    strip=True
                )

        if specifications:
            data["specifications"] = specifications

    def _extract_equipment(self, grid: Tag, data: dict[str, Any]) -> None:
        """Store the list items of the equipment section in *data*.

        The first section whose heading contains "utrustning" is used; its
        first ``<ul>`` supplies the items. Nothing is written when the
        section, the list, or its items are missing.
        """
        equipment_section: Tag | None = None
        for section in grid.find_all("section", class_="border-t pt-40 mt-40"):
            heading = section.find("h2", class_="t3 mb-0")
            if heading and "utrustning" in heading.get_text(strip=True).lower():
                equipment_section = section
                break

        if not equipment_section:
            return

        equipment_list = equipment_section.find("ul")
        if not equipment_list:
            return

        equipment_items: list[str] = [
            li.get_text(strip=True) for li in equipment_list.find_all("li")
        ]
        if equipment_items:
            data["equipment"] = equipment_items

    def _extract_seller_type(self, soup: BeautifulSoup, data: dict[str, Any]) -> None:
        """Store ``seller_type`` in *data*: ``"dealer"`` or ``"private"``.

        The seller counts as a dealer when any ``<div>`` in the document has
        a class containing "dealer" (case-insensitive).
        """
        dealer_marker = soup.find(
            "div", class_=lambda x: x and "dealer" in str(x).lower()
        )
        data["seller_type"] = "dealer" if dealer_marker else "private"

    def _extract_ad_id(self, soup: BeautifulSoup, data: dict[str, Any]) -> None:
        """Store the ad id shown on the page in *data* as ``ad_id``.

        The value is the text of the ``<p>`` sibling following the first
        "Annons-ID" label that has one.
        """
        ad_info_divs = soup.find_all(
            "div", class_="text-m flex md:flex-row flex-col md:gap-x-56 gap-y-16"
        )
        for container in ad_info_divs:
            for label in container.find_all("p", class_="s-text-subtle mb-0"):
                if "Annons-ID" not in label.get_text(strip=True):
                    continue

                ad_id_elem = label.find_next_sibling("p")
                if ad_id_elem:
                    data["ad_id"] = ad_id_elem.get_text(strip=True)
                    # Return rather than break: a break would only leave the
                    # inner loop and let a later container overwrite the id.
                    return
0 commit comments