# codec/codec_search.py at main · AVADSA25/codec · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""CODEC Search — DuckDuckGo (free, no API key) or Serper (better results, needs key)"""
import httpx
import json
import os
import re
import time
import threading

# Location of the user's JSON config file; search() looks up "serper_api_key" in it.
CONFIG_PATH = os.path.expanduser("~/.codec/config.json")

# --- TTL cache for search results ---
# Each value pairs the time.monotonic() reading taken at insertion with the
# cached results list; _cache_get/_cache_put compare it against _CACHE_TTL.
_cache: dict[str, tuple[float, list]] = {}  # key -> (timestamp, results)
_cache_lock = threading.Lock()  # held by _cache_get/_cache_put while touching _cache
_CACHE_TTL = 300  # 5 minutes
_CACHE_MAX = 100  # entry count at which _cache_put starts evicting


def _cache_get(key: str) -> list | None:
    """Look up *key* in the result cache.

    Returns the stored results list when an entry exists and is younger than
    ``_CACHE_TTL`` seconds. Otherwise any entry under *key* is discarded and
    ``None`` is returned.
    """
    with _cache_lock:
        hit = _cache.get(key)
        fresh = bool(hit) and (time.monotonic() - hit[0]) < _CACHE_TTL
        if not fresh:
            # Missing or expired: make sure no stale entry lingers.
            _cache.pop(key, None)
            return None
        return hit[1]


def _cache_put(key: str, value: list) -> None:
    """Store *value* under *key*, stamped with the current monotonic time.

    When inserting a new key while the cache already holds ``_CACHE_MAX``
    entries, expired entries are purged first; if the cache is still full, the
    entry with the oldest timestamp is dropped. Overwriting a key that is
    already present never evicts anything, since it does not grow the cache.

    Args:
        key: Cache key.
        value: Results list to remember.
    """
    with _cache_lock:
        now = time.monotonic()
        # Only a brand-new key can grow the cache, so only then make room.
        if key not in _cache and len(_cache) >= _CACHE_MAX:
            expired = [k for k, (ts, _) in _cache.items() if now - ts >= _CACHE_TTL]
            for k in expired:
                del _cache[k]
            # If still at capacity, drop oldest
            if len(_cache) >= _CACHE_MAX:
                oldest_key = min(_cache, key=lambda k: _cache[k][0])
                del _cache[oldest_key]
        _cache[key] = (now, value)


def search_ddg(query: str, max_results: int = 10) -> list:
    """Search DuckDuckGo — free, no API key needed.

    The Instant Answers JSON API is tried first. If that request fails, does
    not answer with status 200, or yields nothing usable, the HTML results
    page is scraped instead.

    Args:
        query: Search terms.
        max_results: Upper bound on the number of results returned. Values
            below 1 return an empty list without any network request.

    Returns:
        A list of dicts with "title", "link" and "snippet" keys. If the HTML
        fallback fails (including a non-success HTTP status), a single entry
        titled "Search error" is returned whose snippet is the exception text.
    """
    from html import unescape  # local import: decodes entities in scraped HTML

    limit = max(max_results, 0)
    if limit == 0:
        return []

    try:
        r = httpx.get(
            "https://api.duckduckgo.com/",
            params={"q": query, "format": "json", "no_html": 1, "skip_disambig": 1},
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
            follow_redirects=True,
        )
        if r.status_code == 200:
            data = r.json()
            results = []

            # Instant answer
            if data.get("AbstractText"):
                results.append({
                    "title": data.get("Heading", query),
                    "link": data.get("AbstractURL", ""),
                    "snippet": data["AbstractText"][:400],
                })

            # Related topics. The cap is applied while collecting rather than
            # by slicing up front, so entries that fail the check below do not
            # use up result slots and the bound can never go negative.
            for topic in data.get("RelatedTopics", []):
                if len(results) >= limit:
                    break
                if isinstance(topic, dict) and topic.get("Text"):
                    results.append({
                        "title": topic.get("Text", "")[:80],
                        "link": topic.get("FirstURL", ""),
                        "snippet": topic.get("Text", "")[:200],
                    })

            if results:
                return results
    except Exception:
        # Deliberate best effort: any failure here just means "use the
        # HTML fallback below".
        pass

    def _plain(fragment: str) -> str:
        # Drop tags first, then decode entities, so an escaped "&lt;b&gt;"
        # survives as literal text instead of being stripped as a tag.
        return unescape(re.sub(r"<[^>]+>", "", fragment)).strip()

    # Fallback: HTML scrape
    try:
        r = httpx.post(
            "https://html.duckduckgo.com/html/",
            data={"q": query},
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
            follow_redirects=True,
        )
        # Report HTTP failures through the "Search error" entry rather than
        # scraping an error page into an empty list.
        r.raise_for_status()
        links = re.findall(r'<a rel="nofollow" class="result__a" href="(.*?)">(.*?)</a>', r.text)
        snippets = re.findall(r'class="result__snippet"[^>]*>(.*?)</a>', r.text, re.DOTALL)
        results = []
        # NOTE: links and snippets are paired purely by position in the page.
        for i, (href, title) in enumerate(links[:limit]):
            results.append({
                "title": _plain(title),
                "link": unescape(href),
                "snippet": _plain(snippets[i]) if i < len(snippets) else "",
            })
        return results
    except Exception as e:
        return [{"title": "Search error", "link": "", "snippet": str(e)}]


def search_serper(query: str, api_key: str, max_results: int = 10) -> list:
    """Search via Serper.dev — better results, needs API key ($10 for 100k queries).

    Args:
        query: Search terms.
        api_key: Serper API key, sent in the ``X-API-KEY`` header.
        max_results: Upper bound on the number of results returned. Values
            below 1 return an empty list without any network request.

    Returns:
        A list of dicts with "title", "link" and "snippet" keys: the answer
        box (if any) first, then organic results. On any failure, including a
        non-success HTTP status, a single entry titled "Search error" is
        returned whose snippet is the exception text.
    """
    try:
        limit = max(max_results, 0)
        if limit == 0:
            return []

        r = httpx.post(
            "https://google.serper.dev/search",
            json={"q": query, "num": max_results},
            headers={"X-API-KEY": api_key, "Content-Type": "application/json"},
            timeout=10,
        )
        # Report HTTP failures through the "Search error" entry rather than
        # parsing an error payload into a silent empty list.
        r.raise_for_status()
        data = r.json()
        results = []

        # Answer box
        box = data.get("answerBox")
        if box:
            # Fall back to the snippet when the answer is missing or empty, and
            # coerce to str so a non-string answer cannot break the slice.
            answer = box.get("answer") or box.get("snippet") or ""
            results.append({
                "title": box.get("title", query),
                "link": box.get("link", ""),
                "snippet": str(answer)[:400],
            })

        # limit >= 1 here, so the remaining count is never negative.
        remaining = limit - len(results)
        for item in data.get("organic", [])[:remaining]:
            results.append({
                "title": item.get("title", ""),
                "link": item.get("link", ""),
                "snippet": item.get("snippet", ""),
            })

        return results
    except Exception as e:
        return [{"title": "Search error", "link": "", "snippet": str(e)}]


def search(query: str, max_results: int = 10) -> list:
    """Auto-select: use Serper if API key configured, otherwise DuckDuckGo.

    Results are cached for 5 minutes keyed on (query, max_results). A failed
    lookup (the single "Search error" entry produced by the backends) is
    returned to the caller but not cached, so the next call retries instead of
    replaying the failure until the entry expires.

    Args:
        query: Search terms.
        max_results: Upper bound on the number of results, passed to the backend.

    Returns:
        A list of dicts with "title", "link" and "snippet" keys.
    """
    cache_key = f"{query}||{max_results}"
    cached = _cache_get(cache_key)
    if cached is not None:
        return cached

    # Reading the config is best effort: a missing or unreadable file, invalid
    # JSON, or an unexpected value shape all mean "no Serper key".
    serper_key = ""
    try:
        with open(CONFIG_PATH, encoding="utf-8") as f:
            cfg = json.load(f)
        serper_key = cfg.get("serper_api_key", "").strip()
    except (OSError, ValueError, AttributeError):
        serper_key = ""

    if serper_key:
        results = search_serper(query, serper_key, max_results)
    else:
        results = search_ddg(query, max_results)

    # Both backends signal failure with exactly one entry of this shape.
    failed = (
        len(results) == 1
        and results[0].get("title") == "Search error"
        and not results[0].get("link")
    )
    if not failed:
        _cache_put(cache_key, results)
    return results


def format_results(results: list, max_snippets: int = 3) -> str:
    """Format search results into a readable string for LLM context.

    Entries with neither a title nor a snippet are dropped before numbering,
    so the output is always numbered 1, 2, 3, ... without gaps and blank
    entries do not use up ``max_snippets`` slots.

    Args:
        results: Dicts with optional "title", "snippet" and "link" keys.
        max_snippets: Maximum number of entries to include; values below 1
            produce an empty string when there is something printable.

    Returns:
        One numbered line per entry, followed by indented snippet and link
        lines when present, joined with newlines. "No results found." when
        *results* is empty or contains nothing printable.
    """
    if not results:
        return "No results found."

    # Collect the printable entries first; `or ""` also turns None into "".
    printable = []
    for r in results:
        title = r.get("title") or ""
        snippet = r.get("snippet") or ""
        if title or snippet:
            printable.append((title, snippet, r.get("link") or ""))
    if not printable:
        return "No results found."

    lines = []
    # Clamp so a negative max_snippets cannot slice from the end.
    for i, (title, snippet, link) in enumerate(printable[:max(max_snippets, 0)], 1):
        lines.append(f"{i}. {title}")
        if snippet:
            lines.append(f"   {snippet}")
        if link:
            lines.append(f"   {link}")
    return "\n".join(lines)