Skip to content

Commit eaefbf0

Browse files
committed
feat: enhance repository search with relevance scoring and diversity
- Add relevance scoring system for search results (exact match: 100, prefix: 50, contains: 25, description: 5)
- Implement round-robin result interleaving for multi-repository diversity
- Add deduplication of packages within each repository (same name+version)
- Refactor search to apply limit at manager level for better efficiency
- Fix cache entry data access (packages -> data attribute)
- Improve search result quality and user experience
1 parent 92c501b commit eaefbf0

5 files changed

Lines changed: 117 additions & 32 deletions

File tree

CHANGELOG.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
88
## [Unreleased]
99

1010
### Added
11+
- **Repository Search Relevance Scoring**: Implemented intelligent search result ranking system
12+
- New `_calculate_relevance_score()` method in UniversalRepositoryDownloader for scoring search results
13+
- Exact name matches score 100, name prefix matches score 50, name contains query scores 25, description matches score 5
14+
- Search results automatically sorted by relevance score (highest first)
15+
- Improved search quality by prioritizing more relevant packages
16+
- **Repository Search Result Diversity**: Enhanced search to show results from multiple repositories
17+
- Round-robin interleaving of results from different repositories for better diversity
18+
- Configurable max results per repository (minimum 3) to ensure representation from multiple sources
19+
- Prevents single repository from dominating search results
20+
- Better user experience with varied package sources
21+
- **Repository Search Deduplication**: Added deduplication of packages within each repository
22+
- Removes duplicate packages with same name+version (e.g., different architecture variants)
23+
- Reduces noise in search results while maintaining unique packages
24+
- Improved logging to show both total and unique package counts
1125
- **RPM Repository Parser**: Complete implementation of RPM package metadata parser with comprehensive format support
1226
- New enhanced RPM parser for parsing repomd.xml and primary.xml metadata
1327
- Support for standard repomd.xml format (Rocky, AlmaLinux, CentOS Stream)
@@ -218,6 +232,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
218232
- **Security Enhancements**: File size limits for provider YAML files to prevent DoS attacks
219233

220234
### Changed
235+
- **Repository Search Implementation**: Refactored search logic for better performance and accuracy
236+
- Search now applies limit at manager level instead of CLI level for better efficiency
237+
- Removed redundant limit application in CLI after manager already limits results
238+
- Enhanced search to return pre-sorted, deduplicated, and limited results
239+
- **Repository Cache Access**: Fixed cache entry data access pattern
240+
- Changed from `cache_entry.packages` to `cache_entry.data` for correct attribute access
241+
- Ensures proper retrieval of cached package data
242+
- Consistent with cache entry data model structure
221243
- **LLM Provider Manager**: Enhanced to support multiple instances of the same provider type
222244
- Provider initialization now extracts base type from configuration or name
223245
- Improved error messages showing both provider name and type

saigen/cli/repositories.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -311,13 +311,13 @@ async def _search_packages(
311311
async with manager:
312312
click.echo(f"Searching for '{query}'...")
313313

314-
# Search packages
314+
# Search packages with limit for better diversity
315315
result = await manager.search_packages(
316-
query=query, platform=platform, repository_names=None
316+
query=query, platform=platform, repository_names=None, limit=limit
317317
)
318318

319-
# Apply limit
320-
packages = result.packages[:limit] if limit else result.packages
319+
# Packages are already limited by manager
320+
packages = result.packages
321321

322322
if output_format == "json":
323323
data = {

saigen/repositories/cache.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -399,8 +399,8 @@ async def get_all_packages(self, include_expired: bool = False) -> List[Reposito
399399
cache_key = meta_file.stem
400400
cache_entry = await self.get(cache_key)
401401

402-
if cache_entry and cache_entry.packages:
403-
all_packages.extend(cache_entry.packages)
402+
if cache_entry and cache_entry.data:
403+
all_packages.extend(cache_entry.data)
404404

405405
except Exception as e:
406406
# Log error but continue with other entries
@@ -443,8 +443,8 @@ async def get_packages_by_repository(self, repository_name: str) -> List[Reposit
443443
cache_key = meta_file.stem
444444
cache_entry = await self.get(cache_key)
445445

446-
if cache_entry and cache_entry.packages:
447-
packages.extend(cache_entry.packages)
446+
if cache_entry and cache_entry.data:
447+
packages.extend(cache_entry.data)
448448

449449
except Exception as e:
450450
# Log error but continue with other entries

saigen/repositories/downloaders/universal.py

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -306,7 +306,7 @@ def _decompress_content(self, content: bytes, headers: Dict[str, str]) -> bytes:
306306
return content
307307

308308
async def search_package(self, name: str) -> List[RepositoryPackage]:
309-
"""Search for specific package."""
309+
"""Search for specific package with relevance scoring."""
310310
search_url = self.endpoints.get("search")
311311

312312
if search_url:
@@ -316,16 +316,17 @@ async def search_package(self, name: str) -> List[RepositoryPackage]:
316316
url = search_url.replace("{query}", name).replace("{package}", name)
317317
packages = await self._download_and_parse(session, url)
318318

319-
# Filter results to match search query
319+
# Filter and score results
320320
name_lower = name.lower()
321-
matching_packages = []
321+
scored_packages = []
322322
for package in packages:
323-
if name_lower in package.name.lower() or (
324-
package.description and name_lower in package.description.lower()
325-
):
326-
matching_packages.append(package)
323+
score = self._calculate_relevance_score(package, name_lower)
324+
if score > 0:
325+
scored_packages.append((score, package))
327326

328-
return matching_packages
327+
# Sort by relevance score (highest first)
328+
scored_packages.sort(key=lambda x: x[0], reverse=True)
329+
return [pkg for _, pkg in scored_packages]
329330

330331
except Exception as e:
331332
logger.debug(f"Search endpoint failed for {name}: {e}")
@@ -336,20 +337,47 @@ async def search_package(self, name: str) -> List[RepositoryPackage]:
336337
all_packages = await self.download_package_list()
337338

338339
name_lower = name.lower()
339-
matching_packages = []
340+
scored_packages = []
340341

341342
for package in all_packages:
342-
if name_lower in package.name.lower() or (
343-
package.description and name_lower in package.description.lower()
344-
):
345-
matching_packages.append(package)
343+
score = self._calculate_relevance_score(package, name_lower)
344+
if score > 0:
345+
scored_packages.append((score, package))
346346

347-
return matching_packages
347+
# Sort by relevance score (highest first)
348+
scored_packages.sort(key=lambda x: x[0], reverse=True)
349+
return [pkg for _, pkg in scored_packages]
348350

349351
except Exception as e:
350352
logger.error(f"Failed to search packages in {self.repository_info.name}: {e}")
351353
return []
352354

355+
def _calculate_relevance_score(self, package: RepositoryPackage, query: str) -> float:
356+
"""Calculate relevance score for search results.
357+
358+
Scoring:
359+
- Exact name match: 100
360+
- Name starts with query: 50
361+
- Name contains query: 25
362+
- Description contains query: 5
363+
"""
364+
score = 0.0
365+
pkg_name_lower = package.name.lower()
366+
367+
# Name matching (highest priority)
368+
if pkg_name_lower == query:
369+
score += 100
370+
elif pkg_name_lower.startswith(query):
371+
score += 50
372+
elif query in pkg_name_lower:
373+
score += 25
374+
375+
# Description matching (lower priority)
376+
if package.description and query in package.description.lower():
377+
score += 5
378+
379+
return score
380+
353381
async def get_package_details(
354382
self, name: str, version: Optional[str] = None
355383
) -> Optional[RepositoryPackage]:

saigen/repositories/universal_manager.py

Lines changed: 45 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -420,24 +420,59 @@ async def search_packages(
420420
task = asyncio.create_task(downloader.search_package(query), name=f"search_{name}")
421421
tasks.append((name, task))
422422

423-
# Collect search results
423+
# Collect search results by repository
424+
results_by_repo = {}
424425
for name, task in tasks:
425426
try:
426427
packages = await task
427428
if packages:
428-
# Apply limit per repository if specified
429-
if limit:
430-
packages = packages[:limit]
431-
432-
all_packages.extend(packages)
429+
# Deduplicate packages by name+version within each repository
430+
# This handles cases where repos return multiple arch variants
431+
seen = set()
432+
deduped = []
433+
for pkg in packages:
434+
key = (pkg.name, pkg.version)
435+
if key not in seen:
436+
seen.add(key)
437+
deduped.append(pkg)
438+
439+
results_by_repo[name] = deduped
433440
repository_sources.append(name)
434-
logger.debug(f"Found {len(packages)} matches in {name}")
441+
if len(deduped) < len(packages):
442+
logger.debug(f"Found {len(packages)} matches ({len(deduped)} unique) in {name}")
443+
else:
444+
logger.debug(f"Found {len(packages)} matches in {name}")
435445
except Exception as e:
436446
logger.error(f"Search failed in {name}: {e}")
437447

438-
# Apply global limit if specified
439-
if limit and len(all_packages) > limit:
440-
all_packages = all_packages[:limit]
448+
# Interleave results from different repositories for diversity
449+
# This ensures we show results from multiple repos, not just the first one
450+
if limit and results_by_repo:
451+
# Calculate max results per repository to ensure diversity
452+
num_repos = len(results_by_repo)
453+
max_per_repo = max(3, limit // num_repos + 1) # At least 3 per repo
454+
455+
# Limit each repository's results
456+
limited_results = {
457+
name: pkgs[:max_per_repo]
458+
for name, pkgs in results_by_repo.items()
459+
}
460+
461+
# Round-robin through repositories
462+
repo_iterators = {name: iter(pkgs) for name, pkgs in limited_results.items()}
463+
while len(all_packages) < limit and repo_iterators:
464+
for name in list(repo_iterators.keys()):
465+
try:
466+
pkg = next(repo_iterators[name])
467+
all_packages.append(pkg)
468+
if len(all_packages) >= limit:
469+
break
470+
except StopIteration:
471+
del repo_iterators[name]
472+
else:
473+
# No limit, just concatenate all results
474+
for packages in results_by_repo.values():
475+
all_packages.extend(packages)
441476

442477
# Calculate search time
443478
search_time = (datetime.utcnow() - start_time).total_seconds()

0 commit comments

Comments (0)