Skip to content

Commit 9049e4e

Browse files
rbiseck3, ryannikolaidis, christinestraub
authored
feat/remove ingest code, use new dep for tests (#3595)
### Description

An alternative to #3572 that keeps all of the ingest tests and runs them against the latest version of unstructured-ingest, which is pulled in as a dependency.

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: rbiseck3 <[email protected]>
Co-authored-by: Christine Straub <[email protected]>
Co-authored-by: christinestraub <[email protected]>
1 parent ecf0267 commit 9049e4e

File tree

608 files changed

+943
-42409
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

608 files changed

+943
-42409
lines changed

.github/actions/base-ingest-cache/action.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ runs:
3939
python -m pip install --upgrade setuptools
4040
fi
4141
make install-ci
42-
make install-all-ingest
42+
make install-ingest
4343
- name: Save Ingest Cache
4444
if: steps.ingest-virtualenv-cache-restore.outputs.cache-hit != 'true'
4545
id: ingest-virtualenv-cache-save

.github/workflows/ci.yml

-131
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,6 @@ jobs:
7272
- name: Install all doc and test dependencies
7373
run: |
7474
make install-ci
75-
make install-all-ingest
7675
make check-licenses
7776
7877
lint:
@@ -273,37 +272,6 @@ jobs:
273272
python-version: ${{ matrix.python-version }}
274273
check-only: 'true'
275274

276-
test_ingest_unit:
277-
strategy:
278-
matrix:
279-
python-version: [ "3.9","3.10" ]
280-
runs-on: ubuntu-latest
281-
needs: [ setup_ingest, lint ]
282-
steps:
283-
# actions/checkout MUST come before auth
284-
- uses: 'actions/checkout@v4'
285-
- name: Set up Python ${{ matrix.python-version }}
286-
uses: actions/setup-python@v5
287-
with:
288-
python-version: ${{ matrix.python-version }}
289-
- name: Get full Python version
290-
id: full-python-version
291-
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
292-
- name: Setup virtual environment
293-
uses: ./.github/actions/base-ingest-cache
294-
with:
295-
python-version: ${{ matrix.python-version }}
296-
- name: Test Ingest (unit)
297-
env:
298-
NLTK_DATA: ${{ github.workspace }}/nltk_data
299-
PYTHON: python${{ matrix.python-version }}
300-
run: |
301-
source .venv/bin/activate
302-
make install-ci
303-
make install-all-ingest
304-
PYTHONPATH=. ${PYTHON} -m pytest test_unstructured_ingest/unit
305-
306-
307275
test_ingest_src:
308276
strategy:
309277
matrix:
@@ -378,8 +346,6 @@ jobs:
378346
PYTHON: python${{ matrix.python-version }}
379347
run: |
380348
source .venv/bin/activate
381-
make install-ci
382-
make install-all-ingest
383349
sudo apt-get update
384350
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
385351
make install-pandoc
@@ -392,103 +358,6 @@ jobs:
392358
./test_unstructured_ingest/test-ingest-src.sh
393359
394360
395-
test_ingest_dest:
396-
environment: ci
397-
strategy:
398-
matrix:
399-
python-version: ["3.9","3.10"]
400-
runs-on: ubuntu-latest-m
401-
needs: [setup_ingest, lint]
402-
steps:
403-
# actions/checkout MUST come before auth
404-
- uses: 'actions/checkout@v4'
405-
- name: Set up Python ${{ matrix.python-version }}
406-
uses: actions/setup-python@v5
407-
with:
408-
python-version: ${{ matrix.python-version }}
409-
- name: Get full Python version
410-
id: full-python-version
411-
run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> $GITHUB_OUTPUT
412-
- name: Setup virtual environment
413-
uses: ./.github/actions/base-ingest-cache
414-
with:
415-
python-version: ${{ matrix.python-version }}
416-
- name: Setup docker-compose
417-
uses: KengoTODA/actions-setup-docker-compose@v1
418-
with:
419-
version: '2.22.0'
420-
- name: Test (end-to-end)
421-
env:
422-
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
423-
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
424-
S3_INGEST_TEST_ACCESS_KEY: ${{ secrets.S3_INGEST_TEST_ACCESS_KEY }}
425-
S3_INGEST_TEST_SECRET_KEY: ${{ secrets.S3_INGEST_TEST_SECRET_KEY }}
426-
AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
427-
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
428-
BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
429-
DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
430-
DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
431-
DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
432-
GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
433-
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
434-
MONGODB_URI: ${{ secrets.MONGODB_URI }}
435-
MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
436-
AZURE_DEST_CONNECTION_STR: ${{ secrets.AZURE_DEST_CONNECTION_STR }}
437-
PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}}
438-
VECTARA_OAUTH_CLIENT_ID: ${{secrets.VECTARA_OAUTH_CLIENT_ID}}
439-
VECTARA_OAUTH_SECRET: ${{secrets.VECTARA_OAUTH_SECRET}}
440-
VECTARA_CUSTOMER_ID: ${{secrets.VECTARA_CUSTOMER_ID}}
441-
ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}}
442-
ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}}
443-
CLARIFAI_API_KEY: ${{secrets.CLARIFAI_API_KEY}}
444-
DATABRICKS_HOST: ${{secrets.DATABRICKS_HOST}}
445-
DATABRICKS_USERNAME: ${{secrets.DATABRICKS_USERNAME}}
446-
DATABRICKS_PASSWORD: ${{secrets.DATABRICKS_PASSWORD}}
447-
DATABRICKS_CATALOG: ${{secrets.DATABRICKS_CATALOG}}
448-
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
449-
CI: "true"
450-
NLTK_DATA: ${{ github.workspace }}/nltk_data
451-
PYTHON: python${{ matrix.python-version }}
452-
run: |
453-
source .venv/bin/activate
454-
make install-ci
455-
make install-all-ingest
456-
sudo apt-get update
457-
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
458-
make install-pandoc
459-
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
460-
sudo apt-get update
461-
sudo apt-get install -y tesseract-ocr
462-
sudo apt-get install -y tesseract-ocr-kor
463-
sudo apt-get install diffstat
464-
tesseract --version
465-
./test_unstructured_ingest/test-ingest-dest.sh
466-
467-
test_ingest_help:
468-
environment: ci
469-
strategy:
470-
matrix:
471-
python-version: ["3.9","3.10","3.11", "3.12"]
472-
runs-on: ubuntu-latest
473-
needs: [setup_ingest, lint]
474-
steps:
475-
- uses: 'actions/checkout@v4'
476-
- name: Set up Python ${{ matrix.python-version }}
477-
uses: actions/setup-python@v5
478-
with:
479-
python-version: ${{ matrix.python-version }}
480-
- name: Setup virtual environment
481-
uses: ./.github/actions/base-ingest-cache
482-
with:
483-
python-version: ${{ matrix.python-version }}
484-
- name: Validate --help
485-
run: |
486-
source .venv/bin/activate
487-
make install-ci
488-
make install-all-ingest
489-
./test_unstructured_ingest/test-help.sh
490-
491-
492361
test_unstructured_api_unit:
493362
strategy:
494363
matrix:

.github/workflows/ingest-test-fixtures-update-pr.yml

+1
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ jobs:
105105
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
106106
sudo apt-get install -y tesseract-ocr
107107
sudo apt-get install -y tesseract-ocr-kor
108+
sudo apt-get install diffstat
108109
tesseract --version
109110
./test_unstructured_ingest/test-ingest-src.sh
110111

CHANGELOG.md

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
1-
## 0.15.15-dev0
1+
## 0.16.0
22

33
### Enhancements
44

5+
* **Remove ingest implementation.** The deprecated ingest functionality has been removed, as it is now maintained in the separate [unstructured-ingest](https://github.com/Unstructured-IO/unstructured-ingest) repository.
6+
* Replace extras in `requirements/ingest` directory with a new `ingest.txt` extra for installing the `unstructured-ingest` library.
7+
* Remove the `unstructured.ingest` submodule.
8+
* Delete all shell scripts previously used for destination ingest tests.
9+
510
### Features
611

712
### Fixes

MANIFEST.in

-42
Original file line numberDiff line numberDiff line change
@@ -15,45 +15,3 @@ include requirements/extra-pptx.in
1515
include requirements/extra-xlsx.in
1616
include requirements/huggingface.in
1717

18-
# Ingest extras
19-
include requirements/ingest/airtable.in
20-
include requirements/ingest/astradb.in
21-
include requirements/ingest/azure-cognitive-search.in
22-
include requirements/ingest/azure.in
23-
include requirements/ingest/biomed.in
24-
include requirements/ingest/box.in
25-
include requirements/ingest/chroma.in
26-
include requirements/ingest/confluence.in
27-
include requirements/ingest/databricks-volumes.in
28-
include requirements/ingest/delta-table.in
29-
include requirements/ingest/discord.in
30-
include requirements/ingest/dropbox.in
31-
include requirements/ingest/elasticsearch.in
32-
include requirements/ingest/embed-aws-bedrock.in
33-
include requirements/ingest/embed-huggingface.in
34-
include requirements/ingest/embed-mixedbreadai.in
35-
include requirements/ingest/embed-openai.in
36-
include requirements/ingest/gcs.in
37-
include requirements/ingest/github.in
38-
include requirements/ingest/gitlab.in
39-
include requirements/ingest/google-drive.in
40-
include requirements/ingest/hubspot.in
41-
include requirements/ingest/jira.in
42-
include requirements/ingest/kafka.in
43-
include requirements/ingest/mongodb.in
44-
include requirements/ingest/notion.in
45-
include requirements/ingest/onedrive.in
46-
include requirements/ingest/opensearch.in
47-
include requirements/ingest/outlook.in
48-
include requirements/ingest/pinecone.in
49-
include requirements/ingest/postgres.in
50-
include requirements/ingest/qdrant.in
51-
include requirements/ingest/reddit.in
52-
include requirements/ingest/s3.in
53-
include requirements/ingest/salesforce.in
54-
include requirements/ingest/sftp.in
55-
include requirements/ingest/sharepoint.in
56-
include requirements/ingest/slack.in
57-
include requirements/ingest/singlestore.in
58-
include requirements/ingest/weaviate.in
59-
include requirements/ingest/wikipedia.in

0 commit comments

Comments
 (0)