Remove the `languages` OCR parameter from the API and simplify Docker image tagging

leoguillaume · leoguillaume · commit f30cb0d368c1 · 2025-06-19T14:42:19.000Z
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     env:
       IMAGE_NAME: ghcr.io/${{ github.repository }}/server
-      IMAGE_TAG: ${{ github.event_name == 'release' && github.event.release.tag_name || github.sha }}
+      IMAGE_TAG: ${{ github.event_name == 'release' && github.event.release.tag_name || 'latest' }}
     outputs:
       commit_title: ${{ steps.get_head_commit_title.outputs.title }}
     steps:
@@ -41,7 +41,7 @@ jobs:
           context: .
           file: ./server/Dockerfile
           push: true
-          tags: ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }},${{ env.IMAGE_NAME }}:latest
+          tags: ${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }}
           cache-from: type=gha
           cache-to: type=gha,mode=max
       
diff --git a/README.md b/README.md
@@ -72,7 +72,6 @@ Process a document.
 **Parameters:**
 - `file`: The PDF file to process (required)
 - `page_range`: Page range to convert (e.g. "0,5-10,20")
-- `languages`: Comma separated list of languages for OCR
 - `force_ocr`: Force OCR on all pages (default: false)
 - `paginate_output`: Whether to paginate the output (default: false)
 - `output_format`: Output format - "markdown", "json", or "html" (default: "markdown")
diff --git a/server/main.py b/server/main.py
@@ -17,7 +17,7 @@
 
 from server.exceptions import FailedToConvertPDFException
 from server.logger import logger
-from server.schemas import Languages, OutputFormat, ParseResponse
+from server.schemas import OutputFormat, ParseResponse
 from server.security import check_api_key
 
 app_data = {}
@@ -52,7 +52,6 @@ def health() -> Response:
 
 
 page_range = Form(default=None, description="Page range to convert, specify comma separated page numbers or ranges. Example: '0,5-10,20'", example="0,5-10,20")  # fmt: off
-languages = Form(default=None, description="Comma separated list of languages to use for OCR. Must be either the names or codes from from https://github.com/VikParuchuri/surya/blob/master/surya/recognition/languages.py.", example=None)  # fmt: off
 force_ocr = Form(default=False, description="Force OCR on all pages of the PDF.  Defaults to False.  This can lead to worse results if you have good text in your PDFs (which is true in most cases).")  # fmt: off
 paginate_output = Form(default=False, description="Whether to paginate the output.  Defaults to False.  If set to True, each page of the output will be separated by a horizontal rule that contains the page number (2 newlines, {PAGE_NUMBER}, 48 - characters, 2 newlines).")  # fmt: off
 output_format = Form(default="markdown", description="The format to output the text in.  Can be 'markdown', 'json', or 'html'.  Defaults to 'markdown'.")  # fmt: off
@@ -63,7 +62,6 @@ def health() -> Response:
 @app.post("/marker/upload", tags=["Marker"], response_model=ParseResponse, dependencies=[Security(check_api_key)])
 async def convert_pdf_upload(
     page_range: Optional[str] = page_range,
-    languages: Optional[Languages] = languages,
     force_ocr: Optional[bool] = force_ocr,
     paginate_output: Optional[bool] = paginate_output,
     output_format: Optional[OutputFormat] = output_format,
@@ -88,7 +86,6 @@ async def convert_pdf_upload(
     options = {
         "filepath": filepath,
         "page_range": page_range,
-        "languages": languages,
         "force_ocr": force_ocr,
         "paginate_output": paginate_output,
         "output_format": output_format,
diff --git a/server/schemas.py b/server/schemas.py
@@ -2,13 +2,6 @@
 from typing import Any
 
 from pydantic import BaseModel
-from surya.recognition.languages import CODE_TO_LANGUAGE
-
-LANGUAGES = list(CODE_TO_LANGUAGE.keys()) + list(CODE_TO_LANGUAGE.values())
-LANGUAGES = {str(lang).upper(): str(lang) for lang in sorted(set(LANGUAGES))}
-
-Languages = Enum("Language", LANGUAGES, type=str)
-
 
 class OutputFormat(str, Enum):
     markdown = "markdown"