|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +"""ELM Web Scraping - DuxDistributedGlobalSearch""" |
| 3 | +import logging |
| 4 | + |
| 5 | +from ddgs import DDGS |
| 6 | + |
| 7 | +from elm.web.search.base import SearchEngineLinkSearch |
| 8 | + |
| 9 | + |
| 10 | +logger = logging.getLogger(__name__) |
| 11 | + |
| 12 | + |
class DuxDistributedGlobalSearch(SearchEngineLinkSearch):
    """Search the web for links using DuxDistributedGlobalSearch"""

    _SE_NAME = "DuxDistributedGlobalSearch"

    def __init__(self, region="us-en", safesearch="moderate", timelimit=None,
                 page=1, backend=("google", "bing", "yahoo", "duckduckgo"),
                 timeout=10, verify=False):
        """

        Parameters
        ----------
        region : str, optional
            DuxDistributedGlobalSearch search region param.
            By default, ``"us-en"``.
        safesearch : {on, moderate, off}, optional
            The `safesearch` setting for search engines.
            By default, ``"moderate"``.
        timelimit : {d, w, m, y}, optional
            The time limit used to bound the search results:

                - d: last day
                - w: last week
                - m: last month
                - y: last year

            By default, ``None``.
        page : int, default=1
            The page of results to return. By default, ``1``.
        backend : str or iter of str, optional
            Option for DuxDistributedGlobalSearch backend:

                - auto: Randomly select 3 search engines to use
                - all: All available search engines are used
                - wikipedia: Wikipedia
                - google: Google
                - bing: Bing
                - brave: Brave
                - mojeek: Mojeek
                - yahoo: Yahoo
                - yandex: Yandex
                - duckduckgo: Duckduckgo

            Can also be a list or tuple of a combination of these.
            By default, ``("google", "bing", "yahoo", "duckduckgo")``.
        timeout : int, optional
            Timeout for HTTP requests, in seconds. By default, ``10``.
        verify : bool, optional
            Apply SSL verification when making the request. Note that
            verification is *disabled* unless this is explicitly set
            to ``True``. By default, ``False``.
        """
        self.region = region
        self.safesearch = safesearch
        self.timelimit = timelimit
        self.page = page
        self.backend = backend
        self.timeout = timeout
        self.verify = verify

    async def _search(self, query, num_results=10):
        """Search web for links related to a query

        Parameters
        ----------
        query : str
            Search query to submit to the search backend(s).
        num_results : int, optional
            Maximum number of results to request. By default, ``10``.

        Returns
        -------
        list of str
            Non-empty result links, with any ``"+"`` characters
            replaced by ``"%20"``.
        """
        # ddgs documents `backend` as a single name or a comma-delimited
        # string, so collapse list/tuple inputs into that form.
        backend = self.backend
        if not isinstance(backend, str):
            backend = ",".join(backend)

        # NOTE(review): `ddgs.text` looks like a synchronous call, so it
        # presumably blocks the event loop while the request is in
        # flight - confirm whether this should run in a worker thread.
        ddgs = DDGS(timeout=self.timeout, verify=self.verify)
        results = ddgs.text(query, region=self.region,
                            safesearch=self.safesearch,
                            timelimit=self.timelimit,
                            max_results=num_results,
                            page=self.page,
                            backend=backend)

        # `or ""` also covers results that carry an explicit `href: None`
        links = ((info.get("href") or "").replace("+", "%20")
                 for info in results)
        return [link for link in links if link]
| 85 | + |
0 commit comments