Skip to content

Commit 4c8876f

Browse files
authored
feat: add method - detect format / data_type (#380)
1 parent f92e890 commit 4c8876f

18 files changed

+472
-121
lines changed

README.md

+5-5
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@ pip install embedchain
2828
zuck_bot = Llama2App()
2929

3030
# Embed your data
31-
zuck_bot.add("youtube_video", "https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
32-
zuck_bot.add("web_page", "https://en.wikipedia.org/wiki/Mark_Zuckerberg")
31+
zuck_bot.add("https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
32+
zuck_bot.add("https://en.wikipedia.org/wiki/Mark_Zuckerberg")
3333

3434
# Nice, your bot is ready now. Start asking questions to your bot.
3535
zuck_bot.query("Who is Mark Zuckerberg?")
@@ -64,9 +64,9 @@ os.environ["OPENAI_API_KEY"] = "YOUR API KEY"
6464
elon_bot = App()
6565

6666
# Embed online resources
67-
elon_bot.add("web_page", "https://en.wikipedia.org/wiki/Elon_Musk")
68-
elon_bot.add("web_page", "https://tesla.com/elon-musk")
69-
elon_bot.add("youtube_video", "https://www.youtube.com/watch?v=MxZpaJK74Y4")
67+
elon_bot.add("https://en.wikipedia.org/wiki/Elon_Musk")
68+
elon_bot.add("https://tesla.com/elon-musk")
69+
elon_bot.add("https://www.youtube.com/watch?v=MxZpaJK74Y4")
7070

7171
# Query the bot
7272
elon_bot.query("How many companies does Elon Musk run?")

docs/advanced/adding_data.mdx

+6-6
Original file line numberDiff line numberDiff line change
@@ -6,20 +6,20 @@ title: '➕ Adding Data'
66

77
- This step assumes that you have already created an `app` instance using either `App`, `OpenSourceApp` or `CustomApp`. We call our app instance `naval_chat_bot` 🤖
88

9-
- Now use `.add()` function to add any dataset.
9+
- Now use `.add` method to add any dataset.
1010

1111
```python
1212
# naval_chat_bot = App() or
1313
# naval_chat_bot = OpenSourceApp()
1414

1515
# Embed Online Resources
16-
naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44")
17-
naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
18-
naval_chat_bot.add("web_page", "https://nav.al/feedback")
19-
naval_chat_bot.add("web_page", "https://nav.al/agi")
16+
naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44")
17+
naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
18+
naval_chat_bot.add("https://nav.al/feedback")
19+
naval_chat_bot.add("https://nav.al/agi")
2020

2121
# Embed Local Resources
22-
naval_chat_bot.add_local("qna_pair", ("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
22+
naval_chat_bot.add(("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
2323
```
2424

2525
The possible formats to add data can be found on the [Supported Data Formats](/advanced/data_types) page.

docs/advanced/app_types.mdx

+2-2
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ os.environ['REPLICATE_API_TOKEN'] = "REPLICATE API TOKEN"
3535
zuck_bot = Llama2App()
3636

3737
# Embed your data
38-
zuck_bot.add("youtube_video", "https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
39-
zuck_bot.add("web_page", "https://en.wikipedia.org/wiki/Mark_Zuckerberg")
38+
zuck_bot.add("https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
39+
zuck_bot.add("https://en.wikipedia.org/wiki/Mark_Zuckerberg")
4040

4141
# Nice, your bot is ready now. Start asking questions to your bot.
4242
zuck_bot.query("Who is Mark Zuckerberg?")

docs/advanced/configuration.mdx

+8-8
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,17 @@ naval_chat_bot = App(config)
2626

2727
# Example: define your own chunker config for `youtube_video`
2828
chunker_config = ChunkerConfig(chunk_size=1000, chunk_overlap=100, length_function=len)
29-
naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44", AddConfig(chunker=chunker_config))
29+
naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44", config=AddConfig(chunker=chunker_config))
3030

3131
add_config = AddConfig()
32-
naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf", add_config)
33-
naval_chat_bot.add("web_page", "https://nav.al/feedback", add_config)
34-
naval_chat_bot.add("web_page", "https://nav.al/agi", add_config)
32+
naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf", config=add_config)
33+
naval_chat_bot.add("https://nav.al/feedback", config=add_config)
34+
naval_chat_bot.add("https://nav.al/agi", config=add_config)
3535

36-
naval_chat_bot.add_local("qna_pair", ("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."), add_config)
36+
naval_chat_bot.add(("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."), config=add_config)
3737

3838
query_config = QueryConfig()
39-
print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?", query_config))
39+
print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?", config=query_config))
4040
```
4141

4242
### Custom prompt template
@@ -53,7 +53,7 @@ einstein_chat_bot = App()
5353

5454
# Embed Wikipedia page
5555
page = wikipedia.page("Albert Einstein")
56-
einstein_chat_bot.add("text", page.content)
56+
einstein_chat_bot.add(page.content)
5757

5858
# Example: use your own custom template with `$context` and `$query`
5959
einstein_chat_template = Template("""
@@ -75,7 +75,7 @@ queries = [
7575
"Why did you divorce your first wife?",
7676
]
7777
for query in queries:
78-
response = einstein_chat_bot.query(query, query_config)
78+
response = einstein_chat_bot.query(query, config=query_config)
7979
print("Query: ", query)
8080
print("Response: ", response)
8181

docs/advanced/data_types.mdx

+47-18
Original file line numberDiff line numberDiff line change
@@ -2,22 +2,48 @@
22
title: '📋 Supported data formats'
33
---
44

5-
Embedchain supports following data formats:
5+
## Automatic data type detection
6+
The add method automatically tries to detect the data_type, based on your input for the source argument. So `app.add('https://www.youtube.com/watch?v=dQw4w9WgXcQ')` is enough to embed a YouTube video.
7+
8+
This detection is implemented for all formats. It is based on factors such as whether it's a URL, a local file, the source data type, etc.
9+
10+
### Debugging automatic detection
11+
12+
13+
Set `log_level=DEBUG` (in [AppConfig](/advanced/query_configuration#appconfig)) and make sure it's working as intended.
14+
15+
Otherwise, you will not know when, for instance, an invalid filepath is interpreted as raw text instead.
16+
17+
### Forcing a data type
18+
19+
To avoid any issues with the data type detection, you can **force** a data_type by passing it as an argument to the `add` method.
20+
The examples below show you the keyword to force the respective `data_type`.
21+
22+
Forcing can also be used for edge cases, such as interpreting a sitemap as a web_page, for reading its raw text instead of following links.
23+
24+
## Remote Data Types
25+
26+
<Tip>
27+
**Use local files in remote data types**
28+
29+
Some data_types are meant for remote content and only work with URLs.
30+
You can pass local files by formatting the path using the `file:` [URI scheme](https://en.wikipedia.org/wiki/File_URI_scheme), e.g. `file:///info.pdf`.
31+
</Tip>
632

733
### Youtube video
834

935
To add any youtube video to your app, use the data_type as `youtube_video`. Eg:
1036

1137
```python
12-
app.add('youtube_video', 'a_valid_youtube_url_here')
38+
app.add('a_valid_youtube_url_here', data_type='youtube_video')
1339
```
1440

1541
### PDF file
1642

1743
To add any pdf file, use the data_type as `pdf_file`. Eg:
1844

1945
```python
20-
app.add('pdf_file', 'a_valid_url_where_pdf_file_can_be_accessed')
46+
app.add('a_valid_url_where_pdf_file_can_be_accessed', data_type='pdf_file')
2147
```
2248

2349
Note that we do not support password protected pdfs.
@@ -27,51 +53,54 @@ Note that we do not support password protected pdfs.
2753
To add any web page, use the data_type as `web_page`. Eg:
2854

2955
```python
30-
app.add('web_page', 'a_valid_web_page_url')
56+
app.add('a_valid_web_page_url', data_type='web_page')
3157
```
3258

3359
### Sitemap
3460

3561
Add all web pages from an xml-sitemap. Filters non-text files. Use the data_type as `sitemap`. Eg:
3662

3763
```python
38-
app.add('sitemap', 'https://example.com/sitemap.xml')
64+
app.add('https://example.com/sitemap.xml', data_type='sitemap')
3965
```
4066

4167
### Doc file
4268

43-
To add any doc/docx file, use the data_type as `docx`. Eg:
69+
To add any doc/docx file, use the data_type as `docx`. `docx` allows remote urls and conventional file paths. Eg:
4470

4571
```python
46-
app.add('docx', 'a_local_docx_file_path')
72+
app.add('https://example.com/content/intro.docx', data_type="docx")
73+
app.add('content/intro.docx', data_type="docx")
4774
```
4875

4976
### Code documentation website loader
5077

5178
To add any code documentation website as a loader, use the data_type as `docs_site`. Eg:
5279

5380
```python
54-
app.add("docs_site", "https://docs.embedchain.ai/")
81+
app.add("https://docs.embedchain.ai/", data_type="docs_site")
5582
```
5683

5784
### Notion
5885
To use notion you must install the extra dependencies with `pip install embedchain[notion]`.
5986

60-
To load a notion page, use the data_type as `notion`.
87+
To load a notion page, use the data_type as `notion`. Since it is hard to automatically detect, forcing this is advised.
6188
The source argument must **end** with the `notion page id`. The id is a 32-character string. Eg:
6289

6390
```python
64-
app.add("notion", "cfbc134ca6464fc980d0391613959196")
65-
app.add("notion", "my-page-cfbc134ca6464fc980d0391613959196")
66-
app.add("notion", "https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196")
91+
app.add("cfbc134ca6464fc980d0391613959196", "notion")
92+
app.add("my-page-cfbc134ca6464fc980d0391613959196", "notion")
93+
app.add("https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196", "notion")
6794
```
6895

96+
## Local Data Types
97+
6998
### Text
7099

71100
To supply your own text, use the data_type as `text` and enter a string. The text is not processed, which makes this very versatile. Eg:
72101

73102
```python
74-
app.add_local('text', 'Seek wealth, not money or status. Wealth is having assets that earn while you sleep. Money is how we transfer time and wealth. Status is your place in the social hierarchy.')
103+
app.add('Seek wealth, not money or status. Wealth is having assets that earn while you sleep. Money is how we transfer time and wealth. Status is your place in the social hierarchy.', data_type='text')
75104
```
76105

77106
Note: This is not used in the examples because in most cases you will supply a whole paragraph or file, which did not fit.
@@ -81,7 +110,7 @@ Note: This is not used in the examples because in most cases you will supply a w
81110
To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg:
82111

83112
```python
84-
app.add_local('qna_pair', ("Question", "Answer"))
113+
app.add(("Question", "Answer"), data_type="qna_pair")
85114
```
86115

87116
## Reusing a vector database
@@ -94,8 +123,8 @@ Create a local index:
94123
from embedchain import App
95124

96125
naval_chat_bot = App()
97-
naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44")
98-
naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
126+
naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44")
127+
naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
99128
```
100129

101130
You can reuse the local index with the same code, but without adding new documents:
@@ -107,6 +136,6 @@ naval_chat_bot = App()
107136
print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?"))
108137
```
109138

110-
### More formats (coming soon!)
139+
## More formats (coming soon!)
111140

112-
- If you want to add any other format, please create an [issue](https://github.com/embedchain/embedchain/issues) and we will add it to the list of supported formats.
141+
- If you want to add any other format, please create an [issue](https://github.com/embedchain/embedchain/issues) and we will add it to the list of supported formats.

docs/advanced/query_configuration.mdx

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Yes, you are passing `ChunkerConfig` to `AddConfig`, like so:
2525
```python
2626
chunker_config = ChunkerConfig(chunk_size=100)
2727
add_config = AddConfig(chunker=chunker_config)
28-
app.add_local("text", "lorem ipsum", config=add_config)
28+
app.add("lorem ipsum", config=add_config)
2929
```
3030

3131
### ChunkerConfig

docs/introduction.mdx

+8-8
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ description: '📝 Embedchain is a framework to easily create LLM powered bots o
77

88
Embedchain abstracts the entire process of loading a dataset, chunking it, creating embeddings, and storing it in a vector database.
99

10-
You can add a single or multiple datasets using the .add and .add_local functions. Then, simply use the .query function to find answers from the added datasets.
10+
You can add a single or multiple datasets using the `.add` method. Then, simply use the `.query` method to find answers from the added datasets.
1111

1212
If you want to create a Naval Ravikant bot with a YouTube video, a book in PDF format, two blog posts, and a question and answer pair, all you need to do is add the respective links. Embedchain will take care of the rest, creating a bot for you.
1313

@@ -16,13 +16,13 @@ from embedchain import App
1616

1717
naval_chat_bot = App()
1818
# Embed Online Resources
19-
naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44")
20-
naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
21-
naval_chat_bot.add("web_page", "https://nav.al/feedback")
22-
naval_chat_bot.add("web_page", "https://nav.al/agi")
19+
naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44")
20+
naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
21+
naval_chat_bot.add("https://nav.al/feedback")
22+
naval_chat_bot.add("https://nav.al/agi")
2323

2424
# Embed Local Resources
25-
naval_chat_bot.add_local("qna_pair", ("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
25+
naval_chat_bot.add(("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
2626

2727
naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?")
2828
# Answer: Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality.
@@ -32,7 +32,7 @@ naval_chat_bot.query("What unique capacity does Naval argue humans possess when
3232

3333
Creating a chat bot over any dataset involves the following steps:
3434

35-
1. Load the data
35+
1. Detect the data type and load the data
3636
2. Create meaningful chunks
3737
3. Create embeddings for each chunk
3838
4. Store the chunks in a vector database
@@ -53,4 +53,4 @@ The process of loading the dataset and querying involves multiple steps, each wi
5353

5454
Embedchain takes care of all these nuances and provides a simple interface to create bots over any dataset.
5555

56-
In the first release, we make it easier for anyone to get a chatbot over any dataset up and running in less than a minute. Just create an app instance, add the datasets using the `.add()` function, and use the `.query()` function to get the relevant answers.
56+
In the first release, we make it easier for anyone to get a chatbot over any dataset up and running in less than a minute. Just create an app instance, add the datasets using the `.add` method, and use the `.query` method to get the relevant answers.

docs/mint.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
},
3333
{
3434
"group": "Advanced",
35-
"pages": ["advanced/app_types", "advanced/interface_types", "advanced/adding_data","advanced/data_types", "advanced/query_configuration", "advanced/configuration", "advanced/testing", "advanced/vector_database", "advanced/showcase"]
35+
"pages": ["advanced/app_types", "advanced/interface_types", "advanced/adding_data", "advanced/data_types", "advanced/query_configuration", "advanced/configuration", "advanced/testing", "advanced/vector_database", "advanced/showcase"]
3636
},
3737
{
3838
"group": "Examples",

docs/quickstart.mdx

+2-2
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ os.environ["OPENAI_API_KEY"] = "xxx"
2626
elon_musk_bot = App()
2727

2828
# Embed Online Resources
29-
elon_musk_bot.add("web_page", "https://en.wikipedia.org/wiki/Elon_Musk")
30-
elon_musk_bot.add("web_page", "https://www.tesla.com/elon-musk")
29+
elon_musk_bot.add("https://en.wikipedia.org/wiki/Elon_Musk")
30+
elon_musk_bot.add("https://www.tesla.com/elon-musk")
3131

3232
response = elon_musk_bot.query("How many companies does Elon Musk run?")
3333
print(response)

embedchain/chunkers/base_chunker.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import hashlib
22

3+
from embedchain.models.data_type import DataType
4+
35

46
class BaseChunker:
57
def __init__(self, text_splitter):
@@ -26,7 +28,7 @@ def create_chunks(self, loader, src):
2628

2729
meta_data = data["meta_data"]
2830
# add data type to meta data to allow query using data type
29-
meta_data["data_type"] = self.data_type
31+
meta_data["data_type"] = self.data_type.value
3032
url = meta_data["url"]
3133

3234
chunks = self.get_chunks(content)
@@ -52,8 +54,10 @@ def get_chunks(self, content):
5254
"""
5355
return self.text_splitter.split_text(content)
5456

55-
def set_data_type(self, data_type):
57+
def set_data_type(self, data_type: DataType):
5658
"""
5759
set the data type of chunker
5860
"""
5961
self.data_type = data_type
62+
63+
# TODO: This should be done during initialization. This means it has to be done in the child classes.

0 commit comments

Comments
 (0)