Skip to content

Commit 85a5cf5

Browse files
committed
feat: improve content_hash calculation
1 parent f1cd597 commit 85a5cf5

File tree

13 files changed

+359
-304
lines changed

13 files changed

+359
-304
lines changed

pyproject.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,7 @@ ignore = [
226226
"E231", # Don't add whitespace after colon (:) on type declaration
227227
"E251", # Don't remove whitespace around parameter '=' sign.
228228
"E401", # Don't put imports on separate lines
229+
"FIX002", # Line contains TODO, consider resolving the issue
229230
"PERF203", # `try`-`except` within a loop incurs performance overhead
230231
"RET504", # Unnecessary assignment to `...` before `return` statement
231232
"PLR6301", # Method `...` could be a function, class method, or static method
@@ -234,6 +235,8 @@ ignore = [
234235
"SIM105", # Use `contextlib.suppress(TimeoutError)` instead of `try`-`except`-`pass`
235236
"SIM114", # Combine `if` branches using logical `or` operator
236237
"TC006", # Add quotes to type expression in `typing.cast()`
238+
"TD002", # Missing author in TODO
239+
"TD003", # Missing issue link for this TODO
237240
]
238241

239242
[tool.ruff.lint.per-file-ignores]
@@ -263,12 +266,12 @@ min-file-size = 256
263266
# https://pylint.pycqa.org/en/latest/user_guide/configuration/all-options.html#design-checker
264267
# https://pylint.pycqa.org/en/latest/user_guide/checkers/features.html#design-checker-messages
265268
max-args = 6 # max. number of args for function / method (R0913)
266-
# max-attributes = 15 # max. number of instance attrs for a class (R0902)
269+
# max-attributes = 15 # TODO max. number of instance attrs for a class (R0902)
267270
max-branches = 40 # max. number of branch for function / method body (R0912)
268271
max-locals = 30 # max. number of local vars for function / method body (R0914)
269272
max-returns = 15 # max. number of return / yield for function / method body (R0911)
270273
max-statements = 150 # max. number of statements in function / method body (R0915)
271-
max-public-methods = 20 # max. number of public methods for a class (R0904)
274+
max-public-methods = 25 # max. number of public methods for a class (R0904)
272275
# max-positional-arguments = 5 # max. number of positional args for function / method (R0917)
273276

274277

schemas/ad.schema.json

Lines changed: 63 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -72,18 +72,32 @@
7272
},
7373
"properties": {
7474
"active": {
75-
"default": true,
76-
"title": "Active",
77-
"type": "boolean"
75+
"anyOf": [
76+
{
77+
"type": "boolean"
78+
},
79+
{
80+
"type": "null"
81+
}
82+
],
83+
"default": null,
84+
"title": "Active"
7885
},
7986
"type": {
80-
"default": "OFFER",
81-
"enum": [
82-
"OFFER",
83-
"WANTED"
87+
"anyOf": [
88+
{
89+
"enum": [
90+
"OFFER",
91+
"WANTED"
92+
],
93+
"type": "string"
94+
},
95+
{
96+
"type": "null"
97+
}
8498
],
85-
"title": "Type",
86-
"type": "string"
99+
"default": null,
100+
"title": "Type"
87101
},
88102
"title": {
89103
"minLength": 10,
@@ -150,25 +164,39 @@
150164
"title": "Price"
151165
},
152166
"price_type": {
153-
"default": "NEGOTIABLE",
154-
"enum": [
155-
"FIXED",
156-
"NEGOTIABLE",
157-
"GIVE_AWAY",
158-
"NOT_APPLICABLE"
167+
"anyOf": [
168+
{
169+
"enum": [
170+
"FIXED",
171+
"NEGOTIABLE",
172+
"GIVE_AWAY",
173+
"NOT_APPLICABLE"
174+
],
175+
"type": "string"
176+
},
177+
{
178+
"type": "null"
179+
}
159180
],
160-
"title": "Price Type",
161-
"type": "string"
181+
"default": null,
182+
"title": "Price Type"
162183
},
163184
"shipping_type": {
164-
"default": "SHIPPING",
165-
"enum": [
166-
"PICKUP",
167-
"SHIPPING",
168-
"NOT_APPLICABLE"
185+
"anyOf": [
186+
{
187+
"enum": [
188+
"PICKUP",
189+
"SHIPPING",
190+
"NOT_APPLICABLE"
191+
],
192+
"type": "string"
193+
},
194+
{
195+
"type": "null"
196+
}
169197
],
170-
"title": "Shipping Type",
171-
"type": "string"
198+
"default": null,
199+
"title": "Shipping Type"
172200
},
173201
"shipping_costs": {
174202
"anyOf": [
@@ -206,7 +234,7 @@
206234
"type": "null"
207235
}
208236
],
209-
"default": false,
237+
"default": null,
210238
"title": "Sell Directly"
211239
},
212240
"images": {
@@ -236,9 +264,16 @@
236264
"default": null
237265
},
238266
"republication_interval": {
239-
"default": 7,
240-
"title": "Republication Interval",
241-
"type": "integer"
267+
"anyOf": [
268+
{
269+
"type": "integer"
270+
},
271+
{
272+
"type": "null"
273+
}
274+
],
275+
"default": null,
276+
"title": "Republication Interval"
242277
},
243278
"id": {
244279
"anyOf": [

src/kleinanzeigen_bot/__init__.py

Lines changed: 60 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# SPDX-FileCopyrightText: © Sebastian Thomschke and contributors
22
# SPDX-License-Identifier: AGPL-3.0-or-later
33
# SPDX-ArtifactOfProjectHomePage: https://github.com/Second-Hand-Friends/kleinanzeigen-bot/
4-
import atexit, copy, json, os, re, signal, sys, textwrap # isort: skip
4+
import atexit, json, os, re, signal, sys, textwrap # isort: skip
55
import getopt # pylint: disable=deprecated-module
66
import urllib.parse as urllib_parse
77
from gettext import gettext as _
@@ -13,8 +13,7 @@
1313

1414
from . import extract, resources
1515
from ._version import __version__
16-
from .ads import calculate_content_hash, get_description_affixes
17-
from .model.ad_model import MAX_DESCRIPTION_LENGTH, Ad
16+
from .model.ad_model import MAX_DESCRIPTION_LENGTH, Ad, AdPartial
1817
from .model.config_model import Config
1918
from .utils import dicts, error_handlers, loggers, misc
2019
from .utils.exceptions import CaptchaEncountered
@@ -80,6 +79,16 @@ async def run(self, args:list[str]) -> None:
8079
LOG.info("############################################")
8180
LOG.info("DONE: No configuration errors found.")
8281
LOG.info("############################################")
82+
case "update-content-hash":
83+
self.configure_file_logging()
84+
self.load_config()
85+
self.ads_selector = "all"
86+
if ads := self.load_ads(exclude_ads_with_id = False):
87+
self.update_content_hashes(ads)
88+
else:
89+
LOG.info("############################################")
90+
LOG.info("DONE: No active ads found.")
91+
LOG.info("############################################")
8392
case "publish":
8493
self.configure_file_logging()
8594
self.load_config()
@@ -143,6 +152,9 @@ def show_help(self) -> None:
143152
verify - Überprüft die Konfigurationsdateien
144153
delete - Löscht Anzeigen
145154
download - Lädt eine oder mehrere Anzeigen herunter
155+
update-content-hash - Berechnet den content_hash aller Anzeigen anhand der aktuellen ad_defaults neu;
156+
nach Änderungen an den config.yaml/ad_defaults verhindert es, dass alle Anzeigen als
157+
"geändert" gelten und neu veröffentlicht werden.
146158
--
147159
help - Zeigt diese Hilfe an (Standardbefehl)
148160
version - Zeigt die Version der Anwendung an
@@ -178,6 +190,8 @@ def show_help(self) -> None:
178190
verify - verifies the configuration files
179191
delete - deletes ads
180192
download - downloads one or multiple ads
193+
update-content-hash – recalculates each ad’s content_hash based on the current ad_defaults;
194+
use this after changing config.yaml/ad_defaults to avoid every ad being marked "changed" and republished
181195
--
182196
help - displays this help (default command)
183197
version - displays the application version
@@ -269,9 +283,10 @@ def configure_file_logging(self) -> None:
269283
def __check_ad_republication(self, ad_cfg:Ad, ad_file_relative:str) -> bool:
270284
"""
271285
Check if an ad needs to be republished based on republication interval.
272-
Returns True if the ad should be republished based on the interval.
286+
Note: This method does not check for content changes. Use __check_ad_changed for that.
273287
274-
Note: This method no longer checks for content changes. Use __check_ad_changed for that.
288+
Returns:
289+
True if the ad should be republished based on the interval.
275290
"""
276291
if ad_cfg.updated_on:
277292
last_updated_on = ad_cfg.updated_on
@@ -299,14 +314,16 @@ def __check_ad_republication(self, ad_cfg:Ad, ad_file_relative:str) -> bool:
299314
def __check_ad_changed(self, ad_cfg:Ad, ad_cfg_orig:dict[str, Any], ad_file_relative:str) -> bool:
300315
"""
301316
Check if an ad has been changed since last publication.
302-
Returns True if the ad has been changed.
317+
318+
Returns:
319+
True if the ad has been changed.
303320
"""
304321
if not ad_cfg.id:
305322
# New ads are not considered "changed"
306323
return False
307324

308325
# Calculate hash on original config to match what was stored
309-
current_hash = calculate_content_hash(ad_cfg_orig)
326+
current_hash = AdPartial.model_validate(ad_cfg_orig).update_content_hash().content_hash
310327
stored_hash = ad_cfg_orig.get("content_hash")
311328

312329
LOG.debug("Hash comparison for [%s]:", ad_file_relative)
@@ -321,7 +338,20 @@ def __check_ad_changed(self, ad_cfg:Ad, ad_cfg_orig:dict[str, Any], ad_file_rela
321338

322339
return False
323340

324-
def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list[tuple[str, Ad, dict[str, Any]]]:
341+
def load_ads(self, *, ignore_inactive:bool = True, exclude_ads_with_id:bool = True) -> list[tuple[str, Ad, dict[str, Any]]]:
342+
"""
343+
Load and validate all ad config files, optionally filtering out inactive or already‐published ads.
344+
345+
Args:
346+
ignore_inactive (bool):
347+
Skip ads with `active=False`.
348+
exclude_ads_with_id (bool):
349+
Skip ads whose raw data already contains an `id`, i.e. was published before.
350+
351+
Returns:
352+
list[tuple[str, Ad, dict[str, Any]]]:
353+
Tuples of (file_path, validated Ad model, original raw data).
354+
"""
325355
LOG.info("Searching for ad config files...")
326356

327357
ad_files:dict[str, str] = {}
@@ -366,9 +396,9 @@ def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list
366396
should_include = True
367397

368398
# Check for 'new' selector
369-
if "new" in selectors and (not ad_cfg.id or not check_id):
399+
if "new" in selectors and (not ad_cfg.id or not exclude_ads_with_id):
370400
should_include = True
371-
elif "new" in selectors and ad_cfg.id and check_id:
401+
elif "new" in selectors and ad_cfg.id and exclude_ads_with_id:
372402
LOG.info(" -> SKIPPED: ad [%s] is not new. already has an id assigned.", ad_file_relative)
373403

374404
# Check for 'due' selector
@@ -427,13 +457,7 @@ def load_ads(self, *, ignore_inactive:bool = True, check_id:bool = True) -> list
427457
return ads
428458

429459
def load_ad(self, ad_cfg_orig:dict[str, Any]) -> Ad:
430-
ad_cfg_merged = dicts.apply_defaults(
431-
target = copy.deepcopy(ad_cfg_orig),
432-
defaults = self.config.ad_defaults.model_dump(),
433-
ignore = lambda k, _: k == "description",
434-
override = lambda _, v: v == "" # noqa: PLC1901 can be simplified to `not v` as an empty string is falsey
435-
)
436-
return Ad.model_validate(ad_cfg_merged)
460+
return AdPartial.model_validate(ad_cfg_orig).to_ad(self.config.ad_defaults)
437461

438462
def load_config(self) -> None:
439463
# write default config.yaml if config file does not exist
@@ -805,7 +829,7 @@ async def publish_ad(self, ad_file:str, ad_cfg:Ad, ad_cfg_orig:dict[str, Any], p
805829

806830
# Update content hash after successful publication
807831
# Calculate hash on original config to ensure consistent comparison on restart
808-
ad_cfg_orig["content_hash"] = calculate_content_hash(ad_cfg_orig)
832+
ad_cfg_orig["content_hash"] = AdPartial.model_validate(ad_cfg_orig).update_content_hash().content_hash
809833
ad_cfg_orig["updated_on"] = misc.now().isoformat(timespec = "seconds")
810834
if not ad_cfg.created_on and not ad_cfg.id:
811835
ad_cfg_orig["created_on"] = ad_cfg_orig["updated_on"]
@@ -1052,7 +1076,7 @@ async def download_ads(self) -> None:
10521076
elif self.ads_selector == "new": # download only unsaved ads
10531077
# check which ads already saved
10541078
saved_ad_ids = []
1055-
ads = self.load_ads(ignore_inactive = False, check_id = False) # do not skip because of existing IDs
1079+
ads = self.load_ads(ignore_inactive = False, exclude_ads_with_id = False) # do not skip because of existing IDs
10561080
for ad in ads:
10571081
ad_id = int(ad[2]["id"])
10581082
saved_ad_ids.append(ad_id)
@@ -1111,7 +1135,7 @@ def __get_description(self, ad_cfg:Ad, *, with_affixes:bool) -> str:
11111135
# 1. Direct ad-level prefix
11121136
ad_cfg.description_prefix if ad_cfg.description_prefix is not None
11131137
# 2. Global prefix from config
1114-
else get_description_affixes(self.config, prefix = True)
1138+
else self.config.ad_defaults.description_prefix
11151139
or "" # Default to empty string if all sources are None
11161140
)
11171141

@@ -1120,7 +1144,7 @@ def __get_description(self, ad_cfg:Ad, *, with_affixes:bool) -> str:
11201144
# 1. Direct ad-level suffix
11211145
ad_cfg.description_suffix if ad_cfg.description_suffix is not None
11221146
# 2. Global suffix from config
1123-
else get_description_affixes(self.config, prefix = False)
1147+
else self.config.ad_defaults.description_suffix
11241148
or "" # Default to empty string if all sources are None
11251149
)
11261150

@@ -1137,6 +1161,21 @@ def __get_description(self, ad_cfg:Ad, *, with_affixes:bool) -> str:
11371161

11381162
return final_description
11391163

1164+
def update_content_hashes(self, ads:list[tuple[str, Ad, dict[str, Any]]]) -> None:
1165+
count = 0
1166+
1167+
for (ad_file, ad_cfg, ad_cfg_orig) in ads:
1168+
LOG.info("Processing %s/%s: '%s' from [%s]...", count + 1, len(ads), ad_cfg.title, ad_file)
1169+
ad_cfg.update_content_hash()
1170+
if ad_cfg.content_hash != ad_cfg_orig["content_hash"]:
1171+
count += 1
1172+
ad_cfg_orig["content_hash"] = ad_cfg.content_hash
1173+
dicts.save_dict(ad_file, ad_cfg_orig)
1174+
1175+
LOG.info("############################################")
1176+
LOG.info("DONE: Updated [content_hash] in %s", pluralize("ad", count))
1177+
LOG.info("############################################")
1178+
11401179
#############################
11411180
# main entry point
11421181
#############################

0 commit comments

Comments
 (0)