Skip to content

Commit 7dacce0

Browse files
authored
Merge pull request #67 from NREL/pp/update_ddg_lib
Update ddg lib
2 parents 0020ee9 + e2800a4 commit 7dacce0

5 files changed

Lines changed: 101 additions & 19 deletions

File tree

elm/web/search/duckduckgo.py

Lines changed: 7 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import asyncio
55
import logging
66

7-
from duckduckgo_search import DDGS
7+
from ddgs import DDGS
88

99
from elm.web.search.base import (PlaywrightSearchEngineLinkSearch,
1010
SearchEngineLinkSearch)
@@ -52,28 +52,19 @@ class APIDuckDuckGoSearch(SearchEngineLinkSearch):
5252

5353
_SE_NAME = "DuckDuckGo API"
5454

55-
def __init__(self, region="wt-wt", backend="auto", timeout=10,
56-
verify=True, sleep_min_seconds=10, sleep_max_seconds=20):
55+
def __init__(self, region="us-en", timeout=10, verify=False,
56+
sleep_min_seconds=10, sleep_max_seconds=20):
5757
"""
5858
5959
Parameters
6060
----------
6161
region : str, optional
62-
DDG search region param. By default, ``"wt-wt"``, which
63-
signifies no region.
64-
backend : {auto, html, lite}, optional
65-
Option for DDG search type.
66-
67-
- auto: select randomly between HTML and Lite backends
68-
- html: collect data from https://html.duckduckgo.com
69-
- lite: collect data from https://lite.duckduckgo.com
70-
71-
By default, ``"auto"``.
62+
DDG search region param. By default, ``"us-en"``.
7263
timeout : int, optional
7364
Timeout for HTTP requests, in seconds. By default, ``10``.
7465
verify : bool, optional
7566
Apply SSL verification when making the request.
76-
By default, ``True``.
67+
By default, ``False``.
7768
sleep_min_seconds : int, optional
7869
Minimum number of seconds to sleep between queries. We
7970
recommend not setting this below ``5`` seconds to avoid
@@ -84,7 +75,6 @@ def __init__(self, region="wt-wt", backend="auto", timeout=10,
8475
By default, ``20``.
8576
"""
8677
self.region = region
87-
self.backend = backend
8878
self.timeout = timeout
8979
self.verify = verify
9080
self.sleep_min_seconds = sleep_min_seconds
@@ -95,8 +85,8 @@ async def _search(self, query, num_results=10):
9585

9686
ddgs = DDGS(timeout=self.timeout, verify=self.verify)
9787
results = ddgs.text(query, region=self.region,
98-
backend=self.backend,
99-
max_results=num_results)
88+
backend="duckduckgo",
89+
num_results=num_results)
10090

10191
return list(filter(None, (info.get('href', "").replace("+", "%20")
10292
for info in results)))

elm/web/search/dux.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# -*- coding: utf-8 -*-
2+
"""ELM Web Scraping - DuxDistributedGlobalSearch"""
3+
import logging
4+
5+
from ddgs import DDGS
6+
7+
from elm.web.search.base import SearchEngineLinkSearch
8+
9+
10+
logger = logging.getLogger(__name__)
11+
12+
13+
class DuxDistributedGlobalSearch(SearchEngineLinkSearch):
14+
"""Search the web for links using DuxDistributedGlobalSearch"""
15+
16+
_SE_NAME = "DuxDistributedGlobalSearch"
17+
18+
def __init__(self, region="us-en", safesearch="moderate", timelimit=None,
19+
page=1, backend=("google", "bing", "yahoo", "duckduckgo"),
20+
timeout=10, verify=False):
21+
"""
22+
23+
Parameters
24+
----------
25+
region : str, optional
26+
DuxDistributedGlobalSearch search region param.
27+
By default, ``"us-en"``.
28+
safesearch : {on, moderate, off}, optional
29+
The `safesearch` setting for search engines.
30+
By default, ``None``.
31+
timelimit : {d, w, m, y}, optional
32+
The time limit used to bound the search results:
33+
34+
-d: last day
35+
-w: last week
36+
-m: last month
37+
-y: last year
38+
39+
By default, ``None``.
40+
page : int, default=1
41+
The page of results to return. By default, ``1``.
42+
backend : str or iter of str, optional
43+
Option for DuxDistributedGlobalSearch backend:
44+
45+
- auto: Randomly select 3 search engines to use
46+
- all: All available search engines are used
47+
- wikipedia: Wikipedia
48+
- google: Google
49+
- bing: Bing
50+
- brave: Brave
51+
- mojeek: Mojeek
52+
- yahoo: Yahoo
53+
- yandex: Yandex
54+
- duckduckgo: Duckduckgo
55+
56+
Can also be a list or tuple of a combination of these.
57+
By default, ``("google", "bing", "yahoo", "duckduckgo")``.
58+
timeout : int, optional
59+
Timeout for HTTP requests, in seconds. By default, ``10``.
60+
verify : bool, optional
61+
Apply SSL verification when making the request.
62+
By default, ``False``.
63+
"""
64+
self.region = region
65+
self.safesearch = safesearch
66+
self.timelimit = timelimit
67+
self.page = page
68+
self.backend = backend
69+
self.timeout = timeout
70+
self.verify = verify
71+
72+
async def _search(self, query, num_results=10):
73+
"""Search web for links related to a query"""
74+
75+
ddgs = DDGS(timeout=self.timeout, verify=self.verify)
76+
results = ddgs.text(query, region=self.region,
77+
safesearch=self.safesearch,
78+
timelimit=self.timelimit,
79+
page=self.page,
80+
backend=self.backend,
81+
num_results=num_results)
82+
83+
return list(filter(None, (info.get('href', "").replace("+", "%20")
84+
for info in results)))
85+

elm/web/search/run.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from elm.web.search.bing import PlaywrightBingLinkSearch
1313
from elm.web.search.duckduckgo import (APIDuckDuckGoSearch,
1414
PlaywrightDuckDuckGoLinkSearch)
15+
from elm.web.search.dux import DuxDistributedGlobalSearch
1516
from elm.web.search.google import (APIGoogleCSESearch, APISerperSearch,
1617
PlaywrightGoogleCSELinkSearch,
1718
PlaywrightGoogleLinkSearch)
@@ -33,6 +34,8 @@
3334
"APISerperSearch": _SE_OPT(APISerperSearch, False,
3435
"google_serper_api_kwargs"),
3536
"APITavilySearch": _SE_OPT(APITavilySearch, False, "tavily_api_kwargs"),
37+
"DuxDistributedGlobalSearch": _SE_OPT(DuxDistributedGlobalSearch, False,
38+
"ddgs_kwargs"),
3639
"PlaywrightBingLinkSearch": _SE_OPT(PlaywrightBingLinkSearch, True,
3740
"pw_bing_se_kwargs"),
3841
"PlaywrightDuckDuckGoLinkSearch": _SE_OPT(PlaywrightDuckDuckGoLinkSearch,
@@ -46,7 +49,7 @@
4649
}
4750
"""Supported search engines"""
4851
_DEFAULT_SE = ("PlaywrightGoogleLinkSearch", "PlaywrightDuckDuckGoLinkSearch",
49-
"APIDuckDuckGoSearch")
52+
"DuxDistributedGlobalSearch")
5053

5154

5255
async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE,
@@ -113,6 +116,7 @@ async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE,
113116
- google_cse_api_kwargs
114117
- google_serper_api_kwargs
115118
- tavily_api_kwargs
119+
- ddgs_kwargs
116120
- pw_bing_se_kwargs
117121
- pw_ddg_se_kwargs
118122
- pw_google_cse_kwargs
@@ -202,6 +206,7 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE,
202206
- google_cse_api_kwargs
203207
- google_serper_api_kwargs
204208
- tavily_api_kwargs
209+
- ddgs_kwargs
205210
- pw_bing_se_kwargs
206211
- pw_ddg_se_kwargs
207212
- pw_google_cse_kwargs

elm/web/website_crawl.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
# -*- coding: utf-8 -*-
2+
# flake8: noqa
3+
# pylint: disable=no-member
24
"""ELM Document retrieval from a website"""
35

46
import logging

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ beautifulsoup4
44
camoufox
55
click
66
crawl4ai
7-
duckduckgo-search
7+
ddgs
88
fake_useragent>=2.0.3
99
google-api-python-client
1010
html2text

0 commit comments

Comments
 (0)