From 7250a4b0feb92a24a7d3045cb472a029f453de70 Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:11:52 -0800 Subject: [PATCH 01/14] Create test --- notebooks/integrations/aryn/test | 1 + 1 file changed, 1 insertion(+) create mode 100644 notebooks/integrations/aryn/test diff --git a/notebooks/integrations/aryn/test b/notebooks/integrations/aryn/test new file mode 100644 index 00000000..9411c6de --- /dev/null +++ b/notebooks/integrations/aryn/test @@ -0,0 +1 @@ +asdfadsf From 88b192b061decfe04855c163e1a3537cdbaba032 Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:13:52 -0800 Subject: [PATCH 02/14] Add files via upload --- ...n-elasticsearch-blog-demo-clean copy.ipynb | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 notebooks/integrations/aryn/aryn-elasticsearch-blog-demo-clean copy.ipynb diff --git a/notebooks/integrations/aryn/aryn-elasticsearch-blog-demo-clean copy.ipynb b/notebooks/integrations/aryn/aryn-elasticsearch-blog-demo-clean copy.ipynb new file mode 100644 index 00000000..63675e00 --- /dev/null +++ b/notebooks/integrations/aryn/aryn-elasticsearch-blog-demo-clean copy.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "60b49e1c-7055-4534-ac09-8b7ab45086d4", + "metadata": {}, + "outputs": [], + "source": [ + "import sycamore\n", + "from sycamore.context import ExecMode\n", + "from sycamore.transforms.partition import ArynPartitioner\n", + "from sycamore.transforms.extract_schema import LLMPropertyExtractor\n", + "from sycamore.transforms.summarize_images import SummarizeImages, LLMImageSummarizer\n", + "from sycamore.transforms.standardizer import USStateStandardizer, DateTimeStandardizer, ignore_errors\n", + "from sycamore.transforms.merge_elements import GreedySectionMerger\n", + "from sycamore.functions.tokenizer import HuggingFaceTokenizer\n", + "from sycamore.transforms.embed import SentenceTransformerEmbedder\n", + "from sycamore.llms import OpenAI, OpenAIModels\n", + "\n", + "import pyarrow.fs\n", + "\n", + "llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n", + "\n", + "paths = [\"s3://aryn-public/ntsb/\"]\n", + "\n", + "context = sycamore.init()\n", + "# Add exec_mode=ExecMode.LOCAL to .init to run without Ray\n", + "docset = context.read.binary(paths=paths, binary_format=\"pdf\")\n", + "docset = docset.materialize(path=\"./opensearch-tutorial/downloaded-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", + "# Make sure your Aryn token is accessible in the environment variable ARYN_API_KEY\n", + "partitioned_docset = (docset.partition(partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True))\n", + " .materialize(path=\"./opensearch-tutorial/partitioned-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", + " )\n", + "partitioned_docset.execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a755a09e-1622-400b-8b75-b3bad2981b5f", + "metadata": {}, + "outputs": [], + "source": [ + "schema = {\n", + " 'type': 'object',\n", + " 'properties': {'accidentNumber': {'type': 'string'},\n", + " 'dateAndTime': {'type': 'date'},\n", + " 'location': {'type': 'string', 'description': 'US State where the incident occured'},\n", + " 'aircraft': {'type': 'string'},\n", + " 'aircraftDamage': {'type': 'string'},\n", + " 'injuries': {'type': 'string'},\n", + " 'definingEvent': {'type': 'string'}},\n", + " 'required': ['accidentNumber',\n", + " 'dateAndTime',\n", + " 'location',\n", + " 'aircraft']\n", + " }\n", + "\n", + "schema_name = 'FlightAccidentReport'\n", + "property_extractor=LLMPropertyExtractor(llm=llm, num_of_elements=20, schema_name=schema_name, schema=schema)\n", + "\n", + "enriched_docset = (\n", + " partitioned_docset\n", + " # Extracts the properties based on the schema defined \n", + " .extract_properties(property_extractor=property_extractor)\n", + "\n", + " # Summarizes images that were extracted using an LLM\n", + " .transform(SummarizeImages, summarizer=LLMImageSummarizer(llm=llm))\n", + ")\n", + "\n", + "formatted_docset = (\n", + " enriched_docset\n", + " \n", + " # Converts state abbreviations to their full names.\n", + " .map( lambda doc: ignore_errors(doc, USStateStandardizer, [\"properties\",\"entity\",\"location\"]))\n", + "\n", + " # Converts datetime into a common format\n", + " .map( lambda doc: ignore_errors(doc, DateTimeStandardizer, [\"properties\",\"entity\",\"dateAndTime\"]))\n", + ")\n", + "\n", + "\n", + "merger = GreedySectionMerger(tokenizer=HuggingFaceTokenizer(\"sentence-transformers/all-MiniLM-L6-v2\"), max_tokens=512)\n", + "chunked_docset = formatted_docset.merge(merger=merger)\n", + "\n", + "model_name = \"thenlper/gte-small\"\n", + "\n", + "embedded_docset = chunked_docset.spread_properties([\"entity\", \"path\"]).explode().embed(embedder=SentenceTransformerEmbedder(batch_size=10_000, model_name=model_name))\n", + "\n", + "embedded_docset = embedded_docset.materialize(path=\"./opensearch-tutorial/embedded-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", + "embedded_docset.execute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9321d7e-e812-41ac-8030-3db80c2147ec", + "metadata": {}, + "outputs": [], + "source": [ + "# Write to a persistent Elasticsearch Index. Note: You must have a specified elasticsearch instance running for this to work.\n", + "# For more information on how to set one up, refer to https://www.elastic.co/guide/en/elasticsearch/reference/current/install-elasticsearch.html\n", + "\n", + "url = \"http://localhost:9200\"\n", + "index_name = \"aryn-demo\"\n", + "embedded_ds.write.elasticsearch(\n", + " url=url, \n", + " index_name=index_name,\n", + " es_client_args={\"basic_auth\": (“”, os.getenv(\"ELASTIC_PASSWORD\"))},\n", + " mappings={\n", + " \"properties\": {\n", + " \"embeddings\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": dimensions,\n", + " \"index\": True,\n", + " \"similarity\": \"cosine\",\n", + " },\n", + " \"properties\": {\"type\": \"object\"},\n", + " }\n", + " }\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52970be4-7bac-455b-bcd0-868130ac61fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify data has been loaded using DocSet Query to retrieve chunks\n", + "query_params = {\"match_all\": {}}\n", + "query_docs = ctx.read.elasticsearch(url=url, \n", + " index_name=index_name, \n", + " query=query_params,\n", + " es_client_args={\"basic_auth\": (“”, os.getenv(\"ELASTIC_PASSWORD\"))}\n", + "query_docs.show(show_embedding=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 8b8c160503173f6538c847623a5cef7c887de7c8 Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:14:40 -0800 Subject: [PATCH 03/14] Rename aryn-elasticsearch-blog-demo-clean copy.ipynb to aryn-elasticsearch-blog-dataprep.ipynb --- ...mo-clean copy.ipynb => aryn-elasticsearch-blog-dataprep.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename notebooks/integrations/aryn/{aryn-elasticsearch-blog-demo-clean copy.ipynb => aryn-elasticsearch-blog-dataprep.ipynb} (100%) diff --git a/notebooks/integrations/aryn/aryn-elasticsearch-blog-demo-clean copy.ipynb b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb similarity index 100% rename from notebooks/integrations/aryn/aryn-elasticsearch-blog-demo-clean copy.ipynb rename to notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb From 2e055817f1e656065cbc63d75e44fe8ea46177f2 Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:17:10 -0800 Subject: [PATCH 04/14] Create README.md --- notebooks/integrations/aryn/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 notebooks/integrations/aryn/README.md diff --git a/notebooks/integrations/aryn/README.md b/notebooks/integrations/aryn/README.md new file mode 100644 index 00000000..e3b291c2 --- /dev/null +++ b/notebooks/integrations/aryn/README.md @@ -0,0 +1 @@ +This folder contains examples showing how to prepare data using Aryn Sycamore and load into Elasticsearch for RAG and GenAI use cases. From 7c243f814c5f46ed09ebd611c46e4dab8a04174e Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:17:26 -0800 Subject: [PATCH 05/14] Delete notebooks/integrations/aryn/test --- notebooks/integrations/aryn/test | 1 - 1 file changed, 1 deletion(-) delete mode 100644 notebooks/integrations/aryn/test diff --git a/notebooks/integrations/aryn/test b/notebooks/integrations/aryn/test deleted file mode 100644 index 9411c6de..00000000 --- a/notebooks/integrations/aryn/test +++ /dev/null @@ -1 +0,0 @@ -asdfadsf From 55bb17b669af659e7a8389473cef43db9ae94871 Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Tue, 7 Jan 2025 12:45:33 -0800 Subject: [PATCH 06/14] Add files via upload Update with pip install --- .../aryn/aryn-elasticsearch-blog-dataprep.ipynb | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb index 63675e00..d52a7bf8 100644 --- a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb +++ b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb @@ -1,5 +1,16 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "a8f66d95-a9c4-40f1-8cf8-19795653c3f3", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install sycamore-ai[elasticsearch]\n", + "# Install the Sycamore document ETL library: https://github.com/aryn-ai/sycamore " + ] + }, { "cell_type": "code", "execution_count": null, From 5ba1d8823b37ac42445d6d50b8d0592046c09dc4 Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:49:26 -0800 Subject: [PATCH 07/14] Update aryn-elasticsearch-blog-dataprep.ipynb Update from feedback --- .../aryn/aryn-elasticsearch-blog-dataprep.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb index d52a7bf8..ed248ff0 100644 --- a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb +++ b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb @@ -8,7 +8,7 @@ "outputs": [], "source": [ "!pip install sycamore-ai[elasticsearch]\n", - "# Install the Sycamore document ETL library: https://github.com/aryn-ai/sycamore " + "# Install the Sycamore document ETL library: https://github.com/aryn-ai/sycamore" ] }, { @@ -38,10 +38,10 @@ "context = sycamore.init()\n", "# Add exec_mode=ExecMode.LOCAL to .init to run without Ray\n", "docset = context.read.binary(paths=paths, binary_format=\"pdf\")\n", - "docset = docset.materialize(path=\"./opensearch-tutorial/downloaded-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", + "docset = docset.materialize(path=\"./elasticsearch-tutorial/downloaded-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", "# Make sure your Aryn token is accessible in the environment variable ARYN_API_KEY\n", "partitioned_docset = (docset.partition(partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True))\n", - " .materialize(path=\"./opensearch-tutorial/partitioned-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", + " .materialize(path=\"./elasticsearch-tutorial/partitioned-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", " )\n", "partitioned_docset.execute()" ] @@ -98,7 +98,7 @@ "\n", "embedded_docset = chunked_docset.spread_properties([\"entity\", \"path\"]).explode().embed(embedder=SentenceTransformerEmbedder(batch_size=10_000, model_name=model_name))\n", "\n", - "embedded_docset = embedded_docset.materialize(path=\"./opensearch-tutorial/embedded-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", + "embedded_docset = embedded_docset.materialize(path=\"./elasticsearch-tutorial/embedded-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", "embedded_docset.execute()" ] }, From dd964a1394519a72afcafb88c5fc3bf68bd810a8 Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:43:46 -0800 Subject: [PATCH 08/14] Update aryn-elasticsearch-blog-dataprep.ipynb Add placeholder API key --- .../integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb index ed248ff0..2ec0a44b 100644 --- a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb +++ b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb @@ -18,6 +18,7 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", "import sycamore\n", "from sycamore.context import ExecMode\n", "from sycamore.transforms.partition import ArynPartitioner\n", @@ -32,6 +33,7 @@ "import pyarrow.fs\n", "\n", "llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n", + "os.environ["ARYN_API_KEY"] = ""\n" "\n", "paths = [\"s3://aryn-public/ntsb/\"]\n", "\n", From 35b1929137e4faecc0b94975c09efe549eb38e1e Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:44:18 -0800 Subject: [PATCH 09/14] Update aryn-elasticsearch-blog-dataprep.ipynb Add comma --- .../integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb index 2ec0a44b..e2114fd3 100644 --- a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb +++ b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb @@ -33,7 +33,7 @@ "import pyarrow.fs\n", "\n", "llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n", - "os.environ["ARYN_API_KEY"] = ""\n" + "os.environ["ARYN_API_KEY"] = ""\n", "\n", "paths = [\"s3://aryn-public/ntsb/\"]\n", "\n", From 58c586db90a2a1fa1c28a992124fea28ecfdd4b3 Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:48:00 -0800 Subject: [PATCH 10/14] Update aryn-elasticsearch-blog-dataprep.ipynb Formatting --- .../integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb index e2114fd3..c300c4db 100644 --- a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb +++ b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb @@ -33,7 +33,7 @@ "import pyarrow.fs\n", "\n", "llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n", - "os.environ["ARYN_API_KEY"] = ""\n", + "os.environ["ARYN_API_KEY"] = [\""\"]\n", "\n", "paths = [\"s3://aryn-public/ntsb/\"]\n", "\n", From ad20194eeca6775423be687c907b092774f1b53d Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:50:01 -0800 Subject: [PATCH 11/14] Update aryn-elasticsearch-blog-dataprep.ipynb Formatting --- .../integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb index c300c4db..bf16410e 100644 --- a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb +++ b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb @@ -33,7 +33,6 @@ "import pyarrow.fs\n", "\n", "llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n", - "os.environ["ARYN_API_KEY"] = [\""\"]\n", "\n", "paths = [\"s3://aryn-public/ntsb/\"]\n", "\n", From 4c0150f376ea816d2bba4295cab299ab7f202237 Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:52:03 -0800 Subject: [PATCH 12/14] Update aryn-elasticsearch-blog-dataprep.ipynb Add API key placeholder --- .../integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb index bf16410e..148a4f79 100644 --- a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb +++ b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb @@ -33,6 +33,7 @@ "import pyarrow.fs\n", "\n", "llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n", + "os.environ[\"ARYN_API_KEY"\] = ""\n", "\n", "paths = [\"s3://aryn-public/ntsb/\"]\n", "\n", From e226d1f9678450d0d33261335289637f3daa83cd Mon Sep 17 00:00:00 2001 From: jonfritz <134336691+jonfritz@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:53:36 -0800 Subject: [PATCH 13/14] Update aryn-elasticsearch-blog-dataprep.ipynb Formatting --- .../integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb index 148a4f79..474b0032 100644 --- a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb +++ b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb @@ -33,7 +33,7 @@ "import pyarrow.fs\n", "\n", "llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n", - "os.environ[\"ARYN_API_KEY"\] = ""\n", + "os.environ[\"ARYN_API_KEY\"] = \"\"\n", "\n", "paths = [\"s3://aryn-public/ntsb/\"]\n", "\n", From da00b8db6623368ff32ac31f9b7ccc596221f46e Mon Sep 17 00:00:00 2001 From: Henry Lindeman Date: Thu, 16 Jan 2025 14:19:46 -0800 Subject: [PATCH 14/14] fix formatting Signed-off-by: Henry Lindeman --- .../aryn-elasticsearch-blog-dataprep.ipynb | 121 +++++++++++------- 1 file changed, 77 insertions(+), 44 deletions(-) diff --git a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb index 474b0032..4a3c0bc0 100644 --- a/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb +++ b/notebooks/integrations/aryn/aryn-elasticsearch-blog-dataprep.ipynb @@ -24,7 +24,11 @@ "from sycamore.transforms.partition import ArynPartitioner\n", "from sycamore.transforms.extract_schema import LLMPropertyExtractor\n", "from sycamore.transforms.summarize_images import SummarizeImages, LLMImageSummarizer\n", - "from sycamore.transforms.standardizer import USStateStandardizer, DateTimeStandardizer, ignore_errors\n", + "from sycamore.transforms.standardizer import (\n", + " USStateStandardizer,\n", + " DateTimeStandardizer,\n", + " ignore_errors,\n", + ")\n", "from sycamore.transforms.merge_elements import GreedySectionMerger\n", "from sycamore.functions.tokenizer import HuggingFaceTokenizer\n", "from sycamore.transforms.embed import SentenceTransformerEmbedder\n", @@ -33,18 +37,24 @@ "import pyarrow.fs\n", "\n", "llm = OpenAI(OpenAIModels.GPT_4O_MINI)\n", - "os.environ[\"ARYN_API_KEY\"] = \"\"\n", + "os.environ[\"ARYN_API_KEY\"] = \"\"\n", "\n", "paths = [\"s3://aryn-public/ntsb/\"]\n", "\n", "context = sycamore.init()\n", "# Add exec_mode=ExecMode.LOCAL to .init to run without Ray\n", "docset = context.read.binary(paths=paths, binary_format=\"pdf\")\n", - "docset = docset.materialize(path=\"./elasticsearch-tutorial/downloaded-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", + "docset = docset.materialize(\n", + " path=\"./elasticsearch-tutorial/downloaded-docset\",\n", + " source_mode=sycamore.MATERIALIZE_USE_STORED,\n", + ")\n", "# Make sure your Aryn token is accessible in the environment variable ARYN_API_KEY\n", - "partitioned_docset = (docset.partition(partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True))\n", - " .materialize(path=\"./elasticsearch-tutorial/partitioned-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", - " )\n", + "partitioned_docset = docset.partition(\n", + " partitioner=ArynPartitioner(extract_table_structure=True, extract_images=True)\n", + ").materialize(\n", + " path=\"./elasticsearch-tutorial/partitioned-docset\",\n", + " source_mode=sycamore.MATERIALIZE_USE_STORED,\n", + ")\n", "partitioned_docset.execute()" ] }, @@ -56,51 +66,72 @@ "outputs": [], "source": [ "schema = {\n", - " 'type': 'object',\n", - " 'properties': {'accidentNumber': {'type': 'string'},\n", - " 'dateAndTime': {'type': 'date'},\n", - " 'location': {'type': 'string', 'description': 'US State where the incident occured'},\n", - " 'aircraft': {'type': 'string'},\n", - " 'aircraftDamage': {'type': 'string'},\n", - " 'injuries': {'type': 'string'},\n", - " 'definingEvent': {'type': 'string'}},\n", - " 'required': ['accidentNumber',\n", - " 'dateAndTime',\n", - " 'location',\n", - " 'aircraft']\n", - " }\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"accidentNumber\": {\"type\": \"string\"},\n", + " \"dateAndTime\": {\"type\": \"date\"},\n", + " \"location\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"US State where the incident occured\",\n", + " },\n", + " \"aircraft\": {\"type\": \"string\"},\n", + " \"aircraftDamage\": {\"type\": \"string\"},\n", + " \"injuries\": {\"type\": \"string\"},\n", + " \"definingEvent\": {\"type\": \"string\"},\n", + " },\n", + " \"required\": [\"accidentNumber\", \"dateAndTime\", \"location\", \"aircraft\"],\n", + "}\n", "\n", - "schema_name = 'FlightAccidentReport'\n", - "property_extractor=LLMPropertyExtractor(llm=llm, num_of_elements=20, schema_name=schema_name, schema=schema)\n", + "schema_name = \"FlightAccidentReport\"\n", + "property_extractor = LLMPropertyExtractor(\n", + " llm=llm, num_of_elements=20, schema_name=schema_name, schema=schema\n", + ")\n", "\n", "enriched_docset = (\n", " partitioned_docset\n", - " # Extracts the properties based on the schema defined \n", - " .extract_properties(property_extractor=property_extractor)\n", - "\n", - " # Summarizes images that were extracted using an LLM\n", - " .transform(SummarizeImages, summarizer=LLMImageSummarizer(llm=llm))\n", + " # Extracts the properties based on the schema defined\n", + " .extract_properties(property_extractor=property_extractor)\n", + " # Summarizes images that were extracted using an LLM\n", + " .transform(SummarizeImages, summarizer=LLMImageSummarizer(llm=llm))\n", ")\n", "\n", "formatted_docset = (\n", " enriched_docset\n", - " \n", - " # Converts state abbreviations to their full names.\n", - " .map( lambda doc: ignore_errors(doc, USStateStandardizer, [\"properties\",\"entity\",\"location\"]))\n", - "\n", - " # Converts datetime into a common format\n", - " .map( lambda doc: ignore_errors(doc, DateTimeStandardizer, [\"properties\",\"entity\",\"dateAndTime\"]))\n", + " # Converts state abbreviations to their full names.\n", + " .map(\n", + " lambda doc: ignore_errors(\n", + " doc, USStateStandardizer, [\"properties\", \"entity\", \"location\"]\n", + " )\n", + " )\n", + " # Converts datetime into a common format\n", + " .map(\n", + " lambda doc: ignore_errors(\n", + " doc, DateTimeStandardizer, [\"properties\", \"entity\", \"dateAndTime\"]\n", + " )\n", + " )\n", ")\n", "\n", "\n", - "merger = GreedySectionMerger(tokenizer=HuggingFaceTokenizer(\"sentence-transformers/all-MiniLM-L6-v2\"), max_tokens=512)\n", + "merger = GreedySectionMerger(\n", + " tokenizer=HuggingFaceTokenizer(\"sentence-transformers/all-MiniLM-L6-v2\"),\n", + " max_tokens=512,\n", + ")\n", "chunked_docset = formatted_docset.merge(merger=merger)\n", "\n", "model_name = \"thenlper/gte-small\"\n", "\n", - "embedded_docset = chunked_docset.spread_properties([\"entity\", \"path\"]).explode().embed(embedder=SentenceTransformerEmbedder(batch_size=10_000, model_name=model_name))\n", + "embedded_docset = (\n", + " chunked_docset.spread_properties([\"entity\", \"path\"])\n", + " .explode()\n", + " .embed(\n", + " embedder=SentenceTransformerEmbedder(batch_size=10_000, model_name=model_name)\n", + " )\n", + ")\n", "\n", - "embedded_docset = embedded_docset.materialize(path=\"./elasticsearch-tutorial/embedded-docset\", source_mode=sycamore.MATERIALIZE_USE_STORED)\n", + "embedded_docset = embedded_docset.materialize(\n", + " path=\"./elasticsearch-tutorial/embedded-docset\",\n", + " source_mode=sycamore.MATERIALIZE_USE_STORED,\n", + ")\n", "embedded_docset.execute()" ] }, @@ -117,9 +148,9 @@ "url = \"http://localhost:9200\"\n", "index_name = \"aryn-demo\"\n", "embedded_ds.write.elasticsearch(\n", - " url=url, \n", + " url=url,\n", " index_name=index_name,\n", - " es_client_args={\"basic_auth\": (“”, os.getenv(\"ELASTIC_PASSWORD\"))},\n", + " es_client_args={\"basic_auth\": (\"\", os.getenv(\"ELASTIC_PASSWORD\"))},\n", " mappings={\n", " \"properties\": {\n", " \"embeddings\": {\n", @@ -127,11 +158,11 @@ " \"dims\": dimensions,\n", " \"index\": True,\n", " \"similarity\": \"cosine\",\n", - " },\n", + " },\n", " \"properties\": {\"type\": \"object\"},\n", - " }\n", " }\n", - " )" + " },\n", + ")" ] }, { @@ -143,10 +174,12 @@ "source": [ "# Verify data has been loaded using DocSet Query to retrieve chunks\n", "query_params = {\"match_all\": {}}\n", - "query_docs = ctx.read.elasticsearch(url=url, \n", - " index_name=index_name, \n", - " query=query_params,\n", - " es_client_args={\"basic_auth\": (“”, os.getenv(\"ELASTIC_PASSWORD\"))}\n", + "query_docs = ctx.read.elasticsearch(\n", + " url=url,\n", + " index_name=index_name,\n", + " query=query_params,\n", + " es_client_args={\"basic_auth\": (\"\", os.getenv(\"ELASTIC_PASSWORD\"))},\n", + ")\n", "query_docs.show(show_embedding=False)" ] }