Commit
Add notebook for Aryn blog post (#380)
* Create test
* Add files via upload
* Rename aryn-elasticsearch-blog-demo-clean copy.ipynb to aryn-elasticsearch-blog-dataprep.ipynb
* Create README.md
* Delete notebooks/integrations/aryn/test
* Add files via upload: update with pip install
* Update aryn-elasticsearch-blog-dataprep.ipynb: update from feedback
* Update aryn-elasticsearch-blog-dataprep.ipynb: add placeholder API key
* Update aryn-elasticsearch-blog-dataprep.ipynb: add comma
* Update aryn-elasticsearch-blog-dataprep.ipynb: formatting
* Update aryn-elasticsearch-blog-dataprep.ipynb: formatting
* Update aryn-elasticsearch-blog-dataprep.ipynb: add API key placeholder
* Update aryn-elasticsearch-blog-dataprep.ipynb: formatting
* Fix formatting
* Create aryn-elasticsearch-RAG-data-preparation-demo
* Rename supporting-blog-content/aryn-elasticsearch-RAG-data-preparation-demo to supporting-blog-content/Aryn-elasticsearch-RAG-data-preparation-demo/test.md
* Add files via upload
* Delete supporting-blog-content/Aryn-elasticsearch-RAG-data-preparation-demo/test.md
* Delete notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb
* Update README.md: add link

---------

Signed-off-by: Henry Lindeman <[email protected]>
Co-authored-by: Henry Lindeman <[email protected]>
Showing 2 changed files with 211 additions and 0 deletions.
3 changes: 3 additions & 0 deletions
supporting-blog-content/Aryn-elasticsearch-RAG-data-preparation-demo/README.md
@@ -0,0 +1,3 @@
This folder contains examples showing how to prepare data using Aryn Sycamore and load it into Elasticsearch for RAG and GenAI use cases.

The notebook for the Aryn Elasticsearch blog example is [here](https://github.com/elastic/elasticsearch-labs/blob/main/supporting-blog-content/Aryn-elasticsearch-RAG-data-preparation-demo/aryn-elasticsearch-blog-dataprep.ipynb).
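At a high level, the notebook reads PDFs, partitions them with the Aryn Partitioner, enriches and chunks the elements, embeds each chunk, and writes the result to Elasticsearch. Below is a condensed, untested sketch of that flow, assuming an Aryn API key in `ARYN_API_KEY` and an unsecured local Elasticsearch; the full notebook adds schema extraction, standardization, merging, and materialization steps:

```python
# Condensed sketch of the notebook's pipeline; see the notebook for the full version.
import sycamore
from sycamore.transforms.partition import ArynPartitioner
from sycamore.transforms.embed import SentenceTransformerEmbedder

context = sycamore.init()
(
    context.read.binary(paths=["s3://aryn-public/ntsb/"], binary_format="pdf")
    # Segment each PDF into elements (text, tables, images) with the Aryn Partitioner
    .partition(partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True))
    # Promote each element to its own document and embed it
    .explode()
    .embed(embedder=SentenceTransformerEmbedder(model_name="thenlper/gte-small"))
    # Index the embedded chunks
    .write.elasticsearch(url="http://localhost:9200", index_name="aryn-demo")
)
```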
208 changes: 208 additions & 0 deletions
supporting-blog-content/Aryn-elasticsearch-RAG-data-preparation-demo/aryn-elasticsearch-blog-dataprep.ipynb
@@ -0,0 +1,208 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a8f66d95-a9c4-40f1-8cf8-19795653c3f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the Sycamore document ETL library: https://github.com/aryn-ai/sycamore\n",
    "!pip install sycamore-ai[elasticsearch]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60b49e1c-7055-4534-ac09-8b7ab45086d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sycamore\n",
    "from sycamore.context import ExecMode\n",
    "from sycamore.transforms.partition import ArynPartitioner\n",
    "from sycamore.transforms.extract_schema import LLMPropertyExtractor\n",
    "from sycamore.transforms.summarize_images import SummarizeImages, LLMImageSummarizer\n",
    "from sycamore.transforms.standardizer import (\n",
    "    USStateStandardizer,\n",
    "    DateTimeStandardizer,\n",
    "    ignore_errors,\n",
    ")\n",
    "from sycamore.transforms.merge_elements import GreedySectionMerger\n",
    "from sycamore.functions.tokenizer import HuggingFaceTokenizer\n",
    "from sycamore.transforms.embed import SentenceTransformerEmbedder\n",
    "from sycamore.llms import OpenAI, OpenAIModels\n",
    "\n",
    "import pyarrow.fs\n",
    "\n",
    "llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n",
    "os.environ[\"ARYN_API_KEY\"] = \"<MY-ARYN-API-KEY>\"\n",
    "\n",
    "paths = [\"s3://aryn-public/ntsb/\"]\n",
"\n", | ||
"context = sycamore.init()\n", | ||
"# Add exec_mode=ExecMode.LOCAL to .init to run without Ray\n", | ||
"docset = context.read.binary(paths=paths, binary_format=\"pdf\")\n", | ||
"docset = docset.materialize(\n", | ||
" path=\"./elasticsearch-tutorial/downloaded-docset\",\n", | ||
" source_mode=sycamore.MATERIALIZE_USE_STORED,\n", | ||
")\n", | ||
"# Make sure your Aryn token is accessible in the environment variable ARYN_API_KEY\n", | ||
"partitioned_docset = docset.partition(\n", | ||
" partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True)\n", | ||
").materialize(\n", | ||
" path=\"./elasticsearch-tutorial/partitioned-docset\",\n", | ||
" source_mode=sycamore.MATERIALIZE_USE_STORED,\n", | ||
")\n", | ||
"partitioned_docset.execute()" | ||
] | ||
}, | ||
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a755a09e-1622-400b-8b75-b3bad2981b5f",
   "metadata": {},
   "outputs": [],
   "source": [
    "schema = {\n",
    "    \"type\": \"object\",\n",
    "    \"properties\": {\n",
    "        \"accidentNumber\": {\"type\": \"string\"},\n",
    "        \"dateAndTime\": {\"type\": \"date\"},\n",
    "        \"location\": {\n",
    "            \"type\": \"string\",\n",
    "            \"description\": \"US State where the incident occurred\",\n",
    "        },\n",
    "        \"aircraft\": {\"type\": \"string\"},\n",
    "        \"aircraftDamage\": {\"type\": \"string\"},\n",
    "        \"injuries\": {\"type\": \"string\"},\n",
    "        \"definingEvent\": {\"type\": \"string\"},\n",
    "    },\n",
    "    \"required\": [\"accidentNumber\", \"dateAndTime\", \"location\", \"aircraft\"],\n",
    "}\n",
    "\n",
    "schema_name = \"FlightAccidentReport\"\n",
    "property_extractor = LLMPropertyExtractor(\n",
    "    llm=llm, num_of_elements=20, schema_name=schema_name, schema=schema\n",
    ")\n",
    "\n",
    "enriched_docset = (\n",
    "    partitioned_docset\n",
    "    # Extract the properties defined in the schema above\n",
    "    .extract_properties(property_extractor=property_extractor)\n",
    "    # Summarize extracted images using an LLM\n",
    "    .transform(SummarizeImages, summarizer=LLMImageSummarizer(llm=llm))\n",
    ")\n",
    "\n",
    "formatted_docset = (\n",
    "    enriched_docset\n",
    "    # Convert state abbreviations to their full names\n",
    "    .map(\n",
    "        lambda doc: ignore_errors(\n",
    "            doc, USStateStandardizer, [\"properties\", \"entity\", \"location\"]\n",
    "        )\n",
    "    )\n",
    "    # Convert datetimes into a common format\n",
    "    .map(\n",
    "        lambda doc: ignore_errors(\n",
    "            doc, DateTimeStandardizer, [\"properties\", \"entity\", \"dateAndTime\"]\n",
    "        )\n",
    "    )\n",
    ")\n",
    "\n",
    "# Merge small elements into chunks of at most 512 tokens\n",
    "merger = GreedySectionMerger(\n",
    "    tokenizer=HuggingFaceTokenizer(\"sentence-transformers/all-MiniLM-L6-v2\"),\n",
    "    max_tokens=512,\n",
    ")\n",
    "chunked_docset = formatted_docset.merge(merger=merger)\n",
    "\n",
    "model_name = \"thenlper/gte-small\"\n",
    "\n",
"embedded_docset = (\n", | ||
" chunked_docset.spread_properties([\"entity\", \"path\"])\n", | ||
" .explode()\n", | ||
" .embed(\n", | ||
" embedder=SentenceTransformerEmbedder(batch_size=10_000, model_name=model_name)\n", | ||
" )\n", | ||
")\n", | ||
"\n", | ||
"embedded_docset = embedded_docset.materialize(\n", | ||
" path=\"./elasticsearch-tutorial/embedded-docset\",\n", | ||
" source_mode=sycamore.MATERIALIZE_USE_STORED,\n", | ||
")\n", | ||
"embedded_docset.execute()" | ||
] | ||
}, | ||
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9321d7e-e812-41ac-8030-3db80c2147ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write to a persistent Elasticsearch index. Note: you must have an Elasticsearch instance running for this to work.\n",
    "# For more information on how to set one up, refer to https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html\n",
    "\n",
    "url = \"http://localhost:9200\"\n",
    "index_name = \"aryn-demo\"\n",
    "dimensions = 384  # embedding size of the thenlper/gte-small model used above\n",
    "embedded_docset.write.elasticsearch(\n",
    "    url=url,\n",
    "    index_name=index_name,\n",
    "    es_client_args={\"basic_auth\": (\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\"))},\n",
    "    mappings={\n",
    "        \"properties\": {\n",
    "            \"embeddings\": {\n",
    "                \"type\": \"dense_vector\",\n",
    "                \"dims\": dimensions,\n",
    "                \"index\": True,\n",
    "                \"similarity\": \"cosine\",\n",
    "            },\n",
    "            \"properties\": {\"type\": \"object\"},\n",
    "        }\n",
    "    },\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52970be4-7bac-455b-bcd0-868130ac61fd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Verify the data was loaded by reading chunks back with a DocSet query\n",
    "query_params = {\"match_all\": {}}\n",
    "query_docs = context.read.elasticsearch(\n",
    "    url=url,\n",
    "    index_name=index_name,\n",
    "    query=query_params,\n",
    "    es_client_args={\"basic_auth\": (\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\"))},\n",
    ")\n",
    "query_docs.show(show_embedding=False)"
   ]
  }
 ],
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
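Once the notebook has run, the index can serve RAG retrieval directly. Below is a minimal sketch of a kNN query using the official `elasticsearch` Python client and the same embedding model; the `text_representation` and `properties` field names are assumptions based on how Sycamore structures documents, so adjust them to match your index:

```python
# Hedged sketch: retrieve the top chunks for a question via kNN search,
# assuming the index and dense_vector mapping created by the notebook above.
import os
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

es = Elasticsearch(
    "http://localhost:9200",
    basic_auth=("<YOUR-USERNAME>", os.getenv("ELASTIC_PASSWORD")),
)
model = SentenceTransformer("thenlper/gte-small")  # same model used for indexing

question = "Which incidents involved substantial aircraft damage?"
query_vector = model.encode(question).tolist()

resp = es.search(
    index="aryn-demo",
    knn={
        "field": "embeddings",  # dense_vector field from the mapping above
        "query_vector": query_vector,
        "k": 5,
        "num_candidates": 50,
    },
    source=["text_representation", "properties"],  # assumed Sycamore field names
)
for hit in resp["hits"]["hits"]:
    print(hit["_score"], hit["_source"].get("properties", {}).get("entity"))
```

The retrieved chunk text and extracted properties can then be passed to an LLM as context for answer generation.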