Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add notebook for Aryn blog post #380

Merged
merged 22 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
7250a4b
Create test
jonfritz Dec 17, 2024
88b192b
Add files via upload
jonfritz Dec 17, 2024
8b8c160
Rename aryn-elasticsearch-blog-demo-clean copy.ipynb to aryn-elastics…
jonfritz Dec 17, 2024
2e05581
Create README.md
jonfritz Dec 17, 2024
7c243f8
Delete notebooks/integrations/aryn/test
jonfritz Dec 17, 2024
5763770
Merge branch 'elastic:main' into main
jonfritz Jan 7, 2025
55bb17b
Add files via upload
jonfritz Jan 7, 2025
5ba1d88
Update aryn-elasticsearch-blog-dataprep.ipynb
jonfritz Jan 13, 2025
dd964a1
Update aryn-elasticsearch-blog-dataprep.ipynb
jonfritz Jan 15, 2025
35b1929
Update aryn-elasticsearch-blog-dataprep.ipynb
jonfritz Jan 15, 2025
58c586d
Update aryn-elasticsearch-blog-dataprep.ipynb
jonfritz Jan 15, 2025
ad20194
Update aryn-elasticsearch-blog-dataprep.ipynb
jonfritz Jan 15, 2025
4c0150f
Update aryn-elasticsearch-blog-dataprep.ipynb
jonfritz Jan 15, 2025
e226d1f
Update aryn-elasticsearch-blog-dataprep.ipynb
jonfritz Jan 15, 2025
da00b8d
fix formatting
HenryL27 Jan 16, 2025
e8c0d30
Merge pull request #1 from HenryL27/main
jonfritz Jan 16, 2025
efc88da
Create aryn-elasticsearch-RAG-data-preparation-demo
jonfritz Jan 17, 2025
36127f7
Rename supporting-blog-content/aryn-elasticsearch-RAG-data-preparatio…
jonfritz Jan 17, 2025
d45a97a
Add files via upload
jonfritz Jan 17, 2025
0e76471
Delete supporting-blog-content/Aryn-elasticsearch-RAG-data-preparatio…
jonfritz Jan 17, 2025
3159e61
Delete notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.i…
jonfritz Jan 17, 2025
6bff032
Update README.md
jonfritz Jan 17, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions notebooks/integrations/aryn/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder contains examples showing how to prepare data with Aryn Sycamore and load it into Elasticsearch for RAG and GenAI use cases.

The notebook for the Aryn Elasticsearch blog example is [here](https://github.com/elastic/elasticsearch-labs/blob/main/supporting-blog-content/Aryn-elasticsearch-RAG-data-preparation-demo/aryn-elasticsearch-blog-dataprep.ipynb).
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "a8f66d95-a9c4-40f1-8cf8-19795653c3f3",
"metadata": {},
"outputs": [],
"source": [
"!pip install sycamore-ai[elasticsearch]\n",
"# Install the Sycamore document ETL library: https://github.com/aryn-ai/sycamore"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60b49e1c-7055-4534-ac09-8b7ab45086d4",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import sycamore\n",
"from sycamore.context import ExecMode\n",
"from sycamore.transforms.partition import ArynPartitioner\n",
"from sycamore.transforms.extract_schema import LLMPropertyExtractor\n",
"from sycamore.transforms.summarize_images import SummarizeImages, LLMImageSummarizer\n",
"from sycamore.transforms.standardizer import (\n",
" USStateStandardizer,\n",
" DateTimeStandardizer,\n",
" ignore_errors,\n",
")\n",
"from sycamore.transforms.merge_elements import GreedySectionMerger\n",
"from sycamore.functions.tokenizer import HuggingFaceTokenizer\n",
"from sycamore.transforms.embed import SentenceTransformerEmbedder\n",
"from sycamore.llms import OpenAI, OpenAIModels\n",
"\n",
"import pyarrow.fs\n",
"\n",
"llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n",
"os.environ[\"ARYN_API_KEY\"] = \"<MY-ARYN-API-KEY>\"\n",
"\n",
"paths = [\"s3://aryn-public/ntsb/\"]\n",
"\n",
"context = sycamore.init()\n",
"# Add exec_mode=ExecMode.LOCAL to .init to run without Ray\n",
"docset = context.read.binary(paths=paths, binary_format=\"pdf\")\n",
"docset = docset.materialize(\n",
" path=\"./elasticsearch-tutorial/downloaded-docset\",\n",
" source_mode=sycamore.MATERIALIZE_USE_STORED,\n",
")\n",
"# Make sure your Aryn token is accessible in the environment variable ARYN_API_KEY\n",
"partitioned_docset = docset.partition(\n",
" partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True)\n",
").materialize(\n",
" path=\"./elasticsearch-tutorial/partitioned-docset\",\n",
" source_mode=sycamore.MATERIALIZE_USE_STORED,\n",
")\n",
"partitioned_docset.execute()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a755a09e-1622-400b-8b75-b3bad2981b5f",
"metadata": {},
"outputs": [],
"source": [
"schema = {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"accidentNumber\": {\"type\": \"string\"},\n",
" \"dateAndTime\": {\"type\": \"date\"},\n",
" \"location\": {\n",
" \"type\": \"string\",\n",
" \"description\": \"US State where the incident occured\",\n",
" },\n",
" \"aircraft\": {\"type\": \"string\"},\n",
" \"aircraftDamage\": {\"type\": \"string\"},\n",
" \"injuries\": {\"type\": \"string\"},\n",
" \"definingEvent\": {\"type\": \"string\"},\n",
" },\n",
" \"required\": [\"accidentNumber\", \"dateAndTime\", \"location\", \"aircraft\"],\n",
"}\n",
"\n",
"schema_name = \"FlightAccidentReport\"\n",
"property_extractor = LLMPropertyExtractor(\n",
" llm=llm, num_of_elements=20, schema_name=schema_name, schema=schema\n",
")\n",
"\n",
"enriched_docset = (\n",
" partitioned_docset\n",
" # Extracts the properties based on the schema defined\n",
" .extract_properties(property_extractor=property_extractor)\n",
" # Summarizes images that were extracted using an LLM\n",
" .transform(SummarizeImages, summarizer=LLMImageSummarizer(llm=llm))\n",
")\n",
"\n",
"formatted_docset = (\n",
" enriched_docset\n",
" # Converts state abbreviations to their full names.\n",
" .map(\n",
" lambda doc: ignore_errors(\n",
" doc, USStateStandardizer, [\"properties\", \"entity\", \"location\"]\n",
" )\n",
" )\n",
" # Converts datetime into a common format\n",
" .map(\n",
" lambda doc: ignore_errors(\n",
" doc, DateTimeStandardizer, [\"properties\", \"entity\", \"dateAndTime\"]\n",
" )\n",
" )\n",
")\n",
"\n",
"\n",
"merger = GreedySectionMerger(\n",
" tokenizer=HuggingFaceTokenizer(\"sentence-transformers/all-MiniLM-L6-v2\"),\n",
" max_tokens=512,\n",
")\n",
"chunked_docset = formatted_docset.merge(merger=merger)\n",
"\n",
"model_name = \"thenlper/gte-small\"\n",
"\n",
"embedded_docset = (\n",
" chunked_docset.spread_properties([\"entity\", \"path\"])\n",
" .explode()\n",
" .embed(\n",
" embedder=SentenceTransformerEmbedder(batch_size=10_000, model_name=model_name)\n",
" )\n",
")\n",
"\n",
"embedded_docset = embedded_docset.materialize(\n",
" path=\"./elasticsearch-tutorial/embedded-docset\",\n",
" source_mode=sycamore.MATERIALIZE_USE_STORED,\n",
")\n",
"embedded_docset.execute()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b9321d7e-e812-41ac-8030-3db80c2147ec",
"metadata": {},
"outputs": [],
"source": [
"# Write to a persistent Elasticsearch Index. Note: You must have a specified elasticsearch instance running for this to work.\n",
"# For more information on how to set one up, refer to https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html\n",
"\n",
"url = \"http://localhost:9200\"\n",
"index_name = \"aryn-demo\"\n",
"embedded_ds.write.elasticsearch(\n",
" url=url,\n",
" index_name=index_name,\n",
" es_client_args={\"basic_auth\": (\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\"))},\n",
" mappings={\n",
" \"properties\": {\n",
" \"embeddings\": {\n",
" \"type\": \"dense_vector\",\n",
" \"dims\": dimensions,\n",
" \"index\": True,\n",
" \"similarity\": \"cosine\",\n",
" },\n",
" \"properties\": {\"type\": \"object\"},\n",
" }\n",
" },\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52970be4-7bac-455b-bcd0-868130ac61fd",
"metadata": {},
"outputs": [],
"source": [
"# Verify data has been loaded using DocSet Query to retrieve chunks\n",
"query_params = {\"match_all\": {}}\n",
"query_docs = ctx.read.elasticsearch(\n",
" url=url,\n",
" index_name=index_name,\n",
" query=query_params,\n",
" es_client_args={\"basic_auth\": (\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\"))},\n",
")\n",
"query_docs.show(show_embedding=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading