Add Aryn notebook under integrations #361

Closed · wants to merge 16 commits (changes shown from 7 commits)
1 change: 1 addition & 0 deletions notebooks/integrations/aryn/README.md
@@ -0,0 +1 @@
This folder contains examples showing how to prepare data using Aryn Sycamore and load it into Elasticsearch for RAG and GenAI use cases.
@justincastilla (Contributor) commented on Jan 7, 2025:
@jonfritz Can you include a link back to the blog when it is published as well as any requirements to run this notebook listed here as well?

The PR author (Contributor) replied:
I will after it's published.

173 changes: 173 additions & 0 deletions notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb
@@ -0,0 +1,173 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "a8f66d95-a9c4-40f1-8cf8-19795653c3f3",
"metadata": {},
"outputs": [],
"source": [
"!pip install sycamore-ai[elasticsearch]\n",
"# Install the Sycamore document ETL library: https://github.com/aryn-ai/sycamore "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "60b49e1c-7055-4534-ac09-8b7ab45086d4",
"metadata": {},
"outputs": [],
"source": [
"import sycamore\n",
"from sycamore.context import ExecMode\n",
"from sycamore.transforms.partition import ArynPartitioner\n",
"from sycamore.transforms.extract_schema import LLMPropertyExtractor\n",
"from sycamore.transforms.summarize_images import SummarizeImages, LLMImageSummarizer\n",
"from sycamore.transforms.standardizer import USStateStandardizer, DateTimeStandardizer, ignore_errors\n",
"from sycamore.transforms.merge_elements import GreedySectionMerger\n",
"from sycamore.functions.tokenizer import HuggingFaceTokenizer\n",
"from sycamore.transforms.embed import SentenceTransformerEmbedder\n",
"from sycamore.llms import OpenAI, OpenAIModels\n",
"\n",
"import pyarrow.fs\n",
"\n",
"llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n",
"\n",
"paths = [\"s3://aryn-public/ntsb/\"]\n",
"\n",
"context = sycamore.init()\n",
"# Add exec_mode=ExecMode.LOCAL to .init to run without Ray\n",
"docset = context.read.binary(paths=paths, binary_format=\"pdf\")\n",
"docset = docset.materialize(path=\"./opensearch-tutorial/downloaded-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n",
"# Make sure your Aryn token is accessible in the environment variable ARYN_API_KEY\n",
"partitioned_docset = (docset.partition(partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True))\n",
" .materialize(path=\"./opensearch-tutorial/partitioned-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n",
" )\n",
"partitioned_docset.execute()"
]
},
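{
"cell_type": "code",
"execution_count": null,
"id": "3f2b7c1e-8d4a-4b6f-a1c9-0e5d7f8a9b2c",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (added example): print a sample of the partitioned\n",
"# documents to inspect the extracted elements before enrichment.\n",
"partitioned_docset.show()"
]
},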
{
"cell_type": "code",
"execution_count": null,
"id": "a755a09e-1622-400b-8b75-b3bad2981b5f",
"metadata": {},
"outputs": [],
"source": [
"schema = {\n",
" 'type': 'object',\n",
" 'properties': {'accidentNumber': {'type': 'string'},\n",
" 'dateAndTime': {'type': 'date'},\n",
" 'location': {'type': 'string', 'description': 'US State where the incident occured'},\n",
" 'aircraft': {'type': 'string'},\n",
" 'aircraftDamage': {'type': 'string'},\n",
" 'injuries': {'type': 'string'},\n",
" 'definingEvent': {'type': 'string'}},\n",
" 'required': ['accidentNumber',\n",
" 'dateAndTime',\n",
" 'location',\n",
" 'aircraft']\n",
" }\n",
"\n",
"schema_name = 'FlightAccidentReport'\n",
"property_extractor=LLMPropertyExtractor(llm=llm, num_of_elements=20, schema_name=schema_name, schema=schema)\n",
"\n",
"enriched_docset = (\n",
" partitioned_docset\n",
" # Extracts the properties based on the schema defined \n",
" .extract_properties(property_extractor=property_extractor)\n",
"\n",
" # Summarizes images that were extracted using an LLM\n",
" .transform(SummarizeImages, summarizer=LLMImageSummarizer(llm=llm))\n",
")\n",
"\n",
"formatted_docset = (\n",
" enriched_docset\n",
" \n",
" # Converts state abbreviations to their full names.\n",
" .map( lambda doc: ignore_errors(doc, USStateStandardizer, [\"properties\",\"entity\",\"location\"]))\n",
"\n",
" # Converts datetime into a common format\n",
" .map( lambda doc: ignore_errors(doc, DateTimeStandardizer, [\"properties\",\"entity\",\"dateAndTime\"]))\n",
")\n",
"\n",
"\n",
"merger = GreedySectionMerger(tokenizer=HuggingFaceTokenizer(\"sentence-transformers/all-MiniLM-L6-v2\"), max_tokens=512)\n",
"chunked_docset = formatted_docset.merge(merger=merger)\n",
"\n",
"model_name = \"thenlper/gte-small\"\n",
"\n",
"embedded_docset = chunked_docset.spread_properties([\"entity\", \"path\"]).explode().embed(embedder=SentenceTransformerEmbedder(batch_size=10_000, model_name=model_name))\n",
"\n",
"embedded_docset = embedded_docset.materialize(path=\"./opensearch-tutorial/embedded-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n",
"embedded_docset.execute()"
]
},
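{
"cell_type": "code",
"execution_count": null,
"id": "9c8d7e6f-1a2b-4c3d-8e9f-5a6b7c8d9e0f",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check (added example; .take() and .embedding come from Sycamore's\n",
"# DocSet/Document API): confirm the embedding dimensionality matches the model,\n",
"# since thenlper/gte-small produces 384-dimensional vectors.\n",
"sample = embedded_docset.take(1)\n",
"print(len(sample[0].embedding))"
]
},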
{
"cell_type": "code",
"execution_count": null,
"id": "b9321d7e-e812-41ac-8030-3db80c2147ec",
"metadata": {},
"outputs": [],
"source": [
"# Write to a persistent Elasticsearch Index. Note: You must have a specified elasticsearch instance running for this to work.\n",
"# For more information on how to set one up, refer to https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html\n",
"\n",
"url = \"http://localhost:9200\"\n",
"index_name = \"aryn-demo\"\n",
"embedded_ds.write.elasticsearch(\n",
" url=url, \n",
" index_name=index_name,\n",
" es_client_args={\"basic_auth\": (“<YOUR-USERNAME>”, os.getenv(\"ELASTIC_PASSWORD\"))},\n",
" mappings={\n",
" \"properties\": {\n",
" \"embeddings\": {\n",
" \"type\": \"dense_vector\",\n",
" \"dims\": dimensions,\n",
" \"index\": True,\n",
" \"similarity\": \"cosine\",\n",
" },\n",
" \"properties\": {\"type\": \"object\"},\n",
" }\n",
" }\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52970be4-7bac-455b-bcd0-868130ac61fd",
"metadata": {},
"outputs": [],
"source": [
"# Verify data has been loaded using DocSet Query to retrieve chunks\n",
"query_params = {\"match_all\": {}}\n",
"query_docs = ctx.read.elasticsearch(url=url, \n",
" index_name=index_name, \n",
" query=query_params,\n",
" es_client_args={\"basic_auth\": (“<YOUR-USERNAME>”, os.getenv(\"ELASTIC_PASSWORD\"))}\n",
"query_docs.show(show_embedding=False)"
]
}
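,
{
"cell_type": "code",
"execution_count": null,
"id": "e7a9c1d2-5b3f-4a86-9c0d-2f1e8b7a6c54",
"metadata": {},
"outputs": [],
"source": [
"# Added example (not part of the original pipeline): a minimal semantic-search sketch\n",
"# against the index written above, using the official Elasticsearch Python client and\n",
"# the same embedding model. Assumes the elasticsearch and sentence-transformers\n",
"# packages are available from the earlier install; pip install them if not.\n",
"from elasticsearch import Elasticsearch\n",
"from sentence_transformers import SentenceTransformer\n",
"\n",
"client = Elasticsearch(url, basic_auth=(\"<YOUR-USERNAME>\", os.getenv(\"ELASTIC_PASSWORD\")))\n",
"query_vector = SentenceTransformer(model_name).encode(\"aircraft accidents in California\")\n",
"\n",
"# kNN search over the dense_vector field defined in the index mapping (\"embeddings\")\n",
"response = client.search(\n",
"    index=index_name,\n",
"    knn={\n",
"        \"field\": \"embeddings\",\n",
"        \"query_vector\": query_vector.tolist(),\n",
"        \"k\": 5,\n",
"        \"num_candidates\": 50,\n",
"    },\n",
")\n",
"for hit in response[\"hits\"][\"hits\"]:\n",
"    print(hit[\"_score\"], hit[\"_source\"].get(\"properties\", {}))"
]
}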
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}