Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# syntax=docker/dockerfile:experimental
# syntax=docker/dockerfile:1.4

FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base

# NOTE(crag): NB_USER ARG for mybinder.org compat:
Expand All @@ -19,8 +20,19 @@ USER ${NB_USER}
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
ENV PATH="/home/${NB_USER}/.local/bin:${PATH}"

USER root
COPY --from=unstructured_src . /tmp/unstructured-build/unstructured
RUN chown -R ${NB_USER}:${NB_USER} /tmp/unstructured-build/unstructured && \
chmod -R a+w /tmp/unstructured-build/unstructured
USER ${NB_USER}

FROM base as python-deps
COPY --chown=${NB_USER}:${NB_USER} requirements/base.txt requirements-base.txt

USER root
RUN apk add --no-cache git
USER ${NB_USER}

RUN ${PIP} install pip==${PIP_VERSION}
RUN ${PIP} install --no-cache -r requirements-base.txt

Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ install-pandoc:
# is mounted under /home/notebook-user/local/ when the image is started with
# docker-start-api or docker-start-jupyter

DOCKER_IMAGE ?= pipeline-family-${PIPELINE_FAMILY}-dev:latest
DOCKER_IMAGE ?= unstructured-api-custom

.PHONY: docker-build
docker-build:
Expand Down
13 changes: 12 additions & 1 deletion prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,8 @@ def pipeline_api(
unique_element_ids: Optional[bool] = False,
starting_page_number: Optional[int] = None,
include_slide_notes: Optional[bool] = True,
chunking_model_name: Optional[str] = None,
custom_metadata: Optional[str] = None,
) -> List[Dict[str, Any]] | str:
if filename.endswith(".msg"):
# Note(yuming): convert file type for msg files
Expand Down Expand Up @@ -286,6 +288,8 @@ def pipeline_api(
"overlap_all": overlap_all,
"starting_page_number": starting_page_number,
"include_slide_notes": include_slide_notes,
"chunking_model_name": chunking_model_name,
"custom_metadata": custom_metadata,
},
default=str,
)
Expand Down Expand Up @@ -343,6 +347,8 @@ def pipeline_api(
"extract_image_block_to_payload": extract_image_block_to_payload,
"unique_element_ids": unique_element_ids,
"include_slide_notes": include_slide_notes,
"chunking_model_name": chunking_model_name,
"custom_metadata": custom_metadata,
},
default=str,
)
Expand Down Expand Up @@ -374,6 +380,8 @@ def pipeline_api(
"unique_element_ids": unique_element_ids,
"starting_page_number": starting_page_number,
"include_slide_notes": include_slide_notes,
"chunking_model_name": chunking_model_name,
"custom_metadata": custom_metadata,
}

if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
Expand Down Expand Up @@ -406,6 +414,7 @@ def pipeline_api(
detail=str(e),
)
except ValueError as e:
logger.info(f"e error {e}")
if "Invalid file" in e.args[0]:
raise HTTPException(
status_code=400, detail=f"{file_content_type} not currently supported"
Expand Down Expand Up @@ -506,7 +515,7 @@ def _validate_chunking_strategy(chunking_strategy: Optional[str]) -> Optional[st
return None

chunking_strategy = chunking_strategy.lower()
available_strategies = ["basic", "by_title"]
available_strategies = ["basic", "by_title", "custom"]

if chunking_strategy not in available_strategies:
raise HTTPException(
Expand Down Expand Up @@ -701,6 +710,8 @@ def response_generator(is_multipart: bool):
overlap_all=form_params.overlap_all,
starting_page_number=form_params.starting_page_number,
include_slide_notes=form_params.include_slide_notes,
chunking_model_name=form_params.chunking_model_name,
custom_metadata=form_params.custom_metadata,
)

yield (
Expand Down
28 changes: 25 additions & 3 deletions prepline_general/api/models/form_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class GeneralFormParams(BaseModel):
encoding: str
content_type: Optional[str]
hi_res_model_name: Optional[str]
chunking_model_name: Optional[str]
include_page_breaks: bool
pdf_infer_table_structure: bool
strategy: str
Expand All @@ -37,6 +38,7 @@ class GeneralFormParams(BaseModel):
overlap_all: bool
starting_page_number: Optional[int] = None
include_slide_notes: bool
custom_metadata: Optional[str] = None
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a string?


@classmethod
def as_form(
Expand Down Expand Up @@ -129,6 +131,15 @@ def as_form(
),
BeforeValidator(SmartValueParser[str]().value_or_first_element),
] = None,
chunking_model_name: Annotated[
Optional[str],
Form(
title="Chunking Model Name",
description="The name of a custom model used when the chunking strategy is set to 'custom'.",
example="yolox",
),
BeforeValidator(SmartValueParser[str]().value_or_first_element),
] = None,
include_page_breaks: Annotated[
bool,
Form(
Expand Down Expand Up @@ -178,11 +189,11 @@ def as_form(
] = False,
# -- chunking options --
chunking_strategy: Annotated[
Optional[Literal["by_title"]],
Optional[Literal["by_title", "custom"]],
Form(
title="Chunking Strategy",
description="Use one of the supported strategies to chunk the returned elements. Currently supports: by_title",
examples=["by_title"],
description="Use one of the supported strategies to chunk the returned elements. Currently supports: by_title, custom",
examples=["by_title", "custom"],
),
] = None,
combine_under_n_chars: Annotated[
Expand Down Expand Up @@ -258,6 +269,15 @@ def as_form(
example=False,
),
] = True,
custom_metadata: Annotated[
Optional[str],
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be a dict, then?

Form(
title="Custom Metadata",
description="A dictionary containing custom metadata for the document or elements. ",
example="{'author': 'John Doe', 'keywords': ['AI', 'ML', 'data processing']}",
),
BeforeValidator(SmartValueParser[dict]().value_or_first_element),
] = None,
) -> "GeneralFormParams":
return cls(
xml_keep_tags=xml_keep_tags,
Expand All @@ -270,6 +290,7 @@ def as_form(
content_type=content_type,
encoding=encoding,
hi_res_model_name=hi_res_model_name,
chunking_model_name=chunking_model_name,
include_page_breaks=include_page_breaks,
pdf_infer_table_structure=pdf_infer_table_structure,
strategy=strategy,
Expand All @@ -286,4 +307,5 @@ def as_form(
unique_element_ids=unique_element_ids,
starting_page_number=starting_page_number,
include_slide_notes=include_slide_notes,
custom_metadata=custom_metadata,
)
2 changes: 1 addition & 1 deletion requirements/base.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-c constraints.in
unstructured[all-docs]

# Pinning click due to a unicode issue in black
# can remove after black drops support for Python 3.6
# ref: https://github.com/psf/black/issues/2964
Expand Down
6 changes: 5 additions & 1 deletion requirements/constraints.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
# extras. Putting a dependency here will only affect dependency sets that contain them -- in other
# words, if something does not require a constraint, it will not be installed.
####################################################################################################

numpy<2.0.0
# later versions of Starlette break middleware
starlette==0.41.2
starlette==0.41.2

# pin packaging to a version compatible with langchain-core and prefect
packaging==23.2
15 changes: 10 additions & 5 deletions scripts/app-start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,17 @@ if [[ -n $MAX_LIFETIME_SECONDS ]]; then
fi
fi

${OPTIONAL_TIMEOUT} \
uvicorn prepline_general.api.app:app \
CMD="uvicorn prepline_general.api.app:app \
--host $HOST \
--port $PORT \
--log-config logger_config.yaml \
--host "$HOST" \
--port "$PORT" \
--workers "$WORKERS" \
--workers $WORKERS"

if [[ -n "$OPTIONAL_TIMEOUT" ]]; then
$OPTIONAL_TIMEOUT $CMD
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we not be using eval here too?

else
eval $CMD
fi

echo "Server was shutdown"
[ -n "$MAX_LIFETIME_SECONDS" ] && echo "Reached timeout of $MAX_LIFETIME_SECONDS seconds"
4 changes: 3 additions & 1 deletion scripts/docker-build.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env bash
# syntax=docker/dockerfile:1.4

set -euo pipefail
DOCKER_REPOSITORY="${DOCKER_REPOSITORY:-quay.io/unstructured-io/unstructured-api}"
Expand All @@ -8,9 +9,10 @@ PIP_VERSION="${PIP_VERSION:-22.2.1}"
DOCKER_IMAGE="${DOCKER_IMAGE:-pipeline-family-${PIPELINE_FAMILY}-dev}"
DOCKER_PLATFORM="${DOCKER_PLATFORM:-}"


UNSTRUCTURED_LOCAL_PATH="path/to/be/added/from/pipeline"
DOCKER_BUILD_CMD=(
docker buildx build --load -f Dockerfile
--build-context unstructured_src="$UNSTRUCTURED_LOCAL_PATH"
--build-arg PIP_VERSION="$PIP_VERSION"
--build-arg BUILDKIT_INLINE_CACHE=1
--build-arg PIPELINE_PACKAGE="$PIPELINE_PACKAGE"
Expand Down
Loading