
Commit 206f8ef

Add support for WebConfig data sources for minds
1 parent b6ff4a7 commit 206f8ef

File tree

mindsdb_sdk/agents.py
mindsdb_sdk/connectors/rest_api.py
mindsdb_sdk/knowledge_bases.py
mindsdb_sdk/utils/mind.py

4 files changed: +82 -19

mindsdb_sdk/agents.py

Lines changed: 49 additions & 10 deletions
@@ -119,21 +119,41 @@ def add_file(self, file_path: str, description: str, knowledge_base: str = None)
         """
         self.collection.add_file(self.name, file_path, description, knowledge_base)

-    def add_webpages(self, urls: List[str], description: str, knowledge_base: str = None):
+    def add_webpages(
+            self,
+            urls: List[str],
+            description: str,
+            knowledge_base: str = None,
+            crawl_depth: int = 1,
+            filters: List[str] = None):
         """
-        Add a list of crawled URLs to the agent for retrieval.
+        Add a crawled URL to the agent for retrieval.

-        :param urls: List of URLs to be crawled and added.
+        :param urls: URLs of pages to be crawled and added.
+        :param description: Description of the webpages. Used by agent to know when to do retrieval.
+        :param knowledge_base: Name of an existing knowledge base to be used. Will create a default knowledge base if not given.
+        :param crawl_depth: How deep to crawl from each base URL. 0 = scrape given URLs only, -1 = default max
+        :param filters: Include only URLs that match these regex patterns
         """
-        self.collection.add_webpages(self.name, urls, description, knowledge_base)
+        self.collection.add_webpages(self.name, urls, description, knowledge_base=knowledge_base, crawl_depth=crawl_depth, filters=filters)

-    def add_webpage(self, url: str, description: str, knowledge_base: str = None):
+    def add_webpage(
+            self,
+            url: str,
+            description: str,
+            knowledge_base: str = None,
+            crawl_depth: int = 1,
+            filters: List[str] = None):
         """
         Add a crawled URL to the agent for retrieval.

         :param url: URL of the page to be crawled and added.
+        :param description: Description of the webpages. Used by agent to know when to do retrieval.
+        :param knowledge_base: Name of an existing knowledge base to be used. Will create a default knowledge base if not given.
+        :param crawl_depth: How deep to crawl from each base URL. 0 = scrape given URLs only, -1 = default max
+        :param filters: Include only URLs that match these regex patterns
         """
-        self.collection.add_webpage(self.name, url, description, knowledge_base)
+        self.collection.add_webpage(self.name, url, description, knowledge_base=knowledge_base, crawl_depth=crawl_depth, filters=filters)

     def add_database(self, database: str, tables: List[str], description: str):
         """
@@ -313,14 +333,24 @@ def add_file(self, name: str, file_path: str, description: str, knowledge_base:
         """
         self.add_files(name, [file_path], description, knowledge_base)

-    def add_webpages(self, name: str, urls: List[str], description: str, knowledge_base: str = None):
+    def add_webpages(
+            self,
+            name: str,
+            urls: List[str],
+            description: str,
+            knowledge_base: str = None,
+            crawl_depth: int = 1,
+            filters: List[str] = None
+    ):
         """
         Add a list of webpages to the agent for retrieval.

         :param name: Name of the agent
         :param urls: List of URLs of the webpages to be added.
         :param description: Description of the webpages. Used by agent to know when to do retrieval.
         :param knowledge_base: Name of an existing knowledge base to be used. Will create a default knowledge base if not given.
+        :param crawl_depth: How deep to crawl from each base URL. 0 = scrape given URLs only
+        :param filters: Include only URLs that match these regex patterns
         """
         if not urls:
             return
@@ -339,7 +369,7 @@ def add_webpages(self, name: str, urls: List[str], description: str, knowledge_b
             kb = self._create_default_knowledge_base(agent, kb_name)

         # Insert crawled webpage.
-        kb.insert_webpages(urls)
+        kb.insert_webpages(urls, crawl_depth=crawl_depth, filters=filters)

         # Make sure skill name is unique.
         skill_name = f'{domain}{path}_retrieval_skill_{uuid4()}'
@@ -351,16 +381,25 @@ def add_webpages(self, name: str, urls: List[str], description: str, knowledge_b
         agent.skills.append(webpage_retrieval_skill)
         self.update(agent.name, agent)

-    def add_webpage(self, name: str, url: str, description: str, knowledge_base: str = None):
+    def add_webpage(
+            self,
+            name: str,
+            url: str,
+            description: str,
+            knowledge_base: str = None,
+            crawl_depth: int = 1,
+            filters: List[str] = None):
         """
         Add a webpage to the agent for retrieval.

         :param name: Name of the agent
         :param file_path: URL of the webpage to be added, or name of existing webpage.
         :param description: Description of the webpage. Used by agent to know when to do retrieval.
         :param knowledge_base: Name of an existing knowledge base to be used. Will create a default knowledge base if not given.
+        :param crawl_depth: How deep to crawl from each base URL. 0 = scrape given URLs only
+        :param filters: Include only URLs that match these regex patterns
         """
-        self.add_webpages(name, [url], description, knowledge_base)
+        self.add_webpages(name, [url], description, knowledge_base=knowledge_base, crawl_depth=crawl_depth, filters=filters)

     def add_database(self, name: str, database: str, tables: List[str], description: str):
         """

mindsdb_sdk/connectors/rest_api.py

Lines changed: 7 additions & 4 deletions
@@ -413,13 +413,16 @@ def insert_files_into_knowledge_base(self, project: str, knowledge_base_name: st
         return r.json()

     @_try_relogin
-    def insert_webpages_into_knowledge_base(self, project: str, knowledge_base_name: str, urls: List[str]):
+    def insert_webpages_into_knowledge_base(self, project: str, knowledge_base_name: str, urls: List[str], crawl_depth: int = 1, filters: List[str] = None):
+        data = {
+            'urls': urls,
+            'crawl_depth': crawl_depth,
+            'filters': [] if filters is None else filters
+        }
         r = self.session.put(
             self.url + f'/api/projects/{project}/knowledge_bases/{knowledge_base_name}',
             json={
-                'knowledge_base': {
-                    'urls': urls
-                }
+                'knowledge_base': data
             }
         )
         _raise_for_status(r)
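
For reference, the method now issues a PUT to /api/projects/{project}/knowledge_bases/{knowledge_base_name} whose JSON body has roughly this shape (values are illustrative):

    {
        'knowledge_base': {
            'urls': ['https://docs.mindsdb.com'],
            'crawl_depth': 1,
            'filters': []
        }
    }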

mindsdb_sdk/knowledge_bases.py

Lines changed: 7 additions & 3 deletions
@@ -124,11 +124,15 @@ def insert_files(self, file_paths: List[str]):
         """
         self.api.insert_files_into_knowledge_base(self.project.name, self.name, file_paths)

-    def insert_webpages(self, urls: List[str]):
+    def insert_webpages(self, urls: List[str], crawl_depth: int = 1, filters: List[str] = None):
         """
-        Insert data from crawled URLs to knowledge base
+        Insert data from crawled URLs to knowledge base.
+
+        :param urls: URLs to be crawled and inserted.
+        :param crawl_depth: How deep to crawl from each base URL. 0 = scrape given URLs only
+        :param filters: Include only URLs that match these regex patterns
         """
-        self.api.insert_webpages_into_knowledge_base(self.project.name, self.name, urls)
+        self.api.insert_webpages_into_knowledge_base(self.project.name, self.name, urls, crawl_depth=crawl_depth, filters=filters)

     def insert(self, data: Union[pd.DataFrame, Query, dict]):
         """

mindsdb_sdk/utils/mind.py

Lines changed: 19 additions & 2 deletions
@@ -45,15 +45,32 @@ class DatabaseConfig(DataSourceConfig):

 class FileConfig(DataSourceConfig):
     """
-    Represents a colection of files that can be made available to a Mind.
+    Represents a collection of files that can be made available to a Mind.
     """

     # Local file paths and/or URLs.
-    paths: List[str] = []
+    paths: List[str]

     # TODO: Configure Vector storage. Use defaults for now.


+class WebConfig(DataSourceConfig):
+    """
+    Represents a collection of URLs that can be crawled and made available to a Mind.
+    """
+
+    # Base URLs to crawl from.
+    urls: List[str]
+
+    # Scrapes all URLs found in the starting page (default).
+    # 0 = scrape provided URLs only
+    # -1 = no limit (we should set our own sensible limit)
+    crawl_depth: int = 1
+
+    # Include only URLs that match regex patterns.
+    filters: List[str] = []
+
+
 # Create mind entity util function
 def create_mind(
     base_url: str,
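
Constructing the new config class might look like the following. Any fields inherited from DataSourceConfig (such as a description) are not visible in this hunk and are assumed here, as is the way the object is later passed to create_mind:

    from mindsdb_sdk.utils.mind import WebConfig

    # Illustrative values; `description` is assumed to be inherited from DataSourceConfig.
    web_source = WebConfig(
        description='MindsDB blog posts',
        urls=['https://mindsdb.com'],
        crawl_depth=1,
        filters=[r'.*\/blog\/.*'],
    )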
