Skip to content

Commit 6851c0a

Browse files
authored
Merge pull request #15 from dataforgoodfr/test/ingestion_100_papers
Test/ingestion 100 papers
2 parents 9d4671b + 9eaa42d commit 6851c0a

File tree

102 files changed

+5615
-533
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

102 files changed

+5615
-533
lines changed

.env.example

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Example environment file: copy to `.env` and replace the placeholder values.

# PostgreSQL connection URL (credentials and host are masked in this example).
PG_DATABASE_URL = "postgresql+psycopg2://************:**********@*************-postgresql.services.clever-cloud.com:7327/************"

# LLM inference endpoint, model name and API key.
LLM_INFERENCE_URL = "http://172.17.0.1:11434/v1/"
LLM_INFERENCE_MODEL = "llama3.2:3b"
LLM_INFERENCE_API_KEY = "ollama"

# Embedding model endpoint, model name and API key.
EMBEDDING_MODEL_URL = "http://172.17.0.1:11434/v1/"
EMBEDDING_MODEL = "snowflake-arctic-embed2"
EMBEDDING_MODEL_API_KEY = "ollama"

# Collection and user identifiers (example values).
COLLECTION_ID = 1
USER_ID = "123456"
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# CI workflow: builds the multi-architecture Docker image with Buildx and
# pushes it to the GitHub Container Registry (ghcr.io).
name: main

on:
  push:
    # NOTE: the "*" pattern does not match names containing "/", so pushes to
    # branches such as "feature/x" do not trigger this workflow ("**" would).
    branches:
      - "*"
    tags:
      - "*"
  workflow_dispatch:

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

# The GITHUB_TOKEN must be allowed to write packages in order to push to
# ghcr.io; everything else is restricted to read access.
permissions:
  contents: read
  packages: write

jobs:
  main:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Log in to the Container registry
        uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Determine tag
        id: tag
        # The ref is read from the GITHUB_REF environment variable instead of
        # being expanded into the script text, so a crafted ref name cannot
        # inject shell code.
        # Outputs:
        #   tag   - "latest" on main, otherwise the tag or branch name with
        #           every character outside [A-Za-z0-9_.-] replaced by "-"
        #           (a "/" in a ref name is not valid inside a Docker tag).
        #   image - registry/repository in lowercase, as Docker requires.
        run: |
          if [[ "$GITHUB_REF" == "refs/heads/main" ]]; then
            tag="latest"
          elif [[ "$GITHUB_REF" == refs/tags/* ]]; then
            tag="${GITHUB_REF#refs/tags/}"
          else
            # For other branches, use branch name
            tag="${GITHUB_REF#refs/heads/}"
          fi
          tag="$(printf '%s' "$tag" | tr -c 'A-Za-z0-9_.-' '-')"
          echo "tag=$tag" >> "$GITHUB_OUTPUT"
          echo "image=${REGISTRY}/${IMAGE_NAME,,}" >> "$GITHUB_OUTPUT"
      - name: Extract metadata (tags, labels) for Docker
        id: meta
        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
      # NOTE(review): this step builds from rag_system/kotaemon/Dockerfile
      # while the tag-only step below builds from ./Dockerfile. On tag refs
      # both push the same image tag, so the second build overwrites the
      # first. Confirm which Dockerfile is intended.
      - name: Build and push Docker image
        run: |
          docker buildx build \
            --push \
            --platform=linux/amd64,linux/arm64,linux/armhf \
            -t "${{ steps.tag.outputs.image }}:${{ steps.tag.outputs.tag }}" -f rag_system/kotaemon/Dockerfile ./
      - name: Also create tagged image if tag exists on main branch
        if: startsWith(github.ref, 'refs/tags/')
        run: |
          docker buildx build \
            --push \
            --platform=linux/amd64,linux/arm64,linux/armhf \
            -t "${{ steps.tag.outputs.image }}:${{ steps.tag.outputs.tag }}" -f ./Dockerfile ./

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,3 +169,6 @@ cython_debug/
169169
# the flow stuff
170170
.theflow
171171
kotaemon-custom/kotaemon/ktem_app_data
172+
173+
# secret files
174+
secret*

Dockerfile

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
# Lite version: Python base image with the system packages and the project's
# editable Python packages installed. Started through /app/launch.sh.
FROM python:3.10-slim AS lite

# Common dependencies.
# The apt caches are removed in the same layer that creates them; deleting
# them in a later RUN would not reduce the image size.
RUN apt-get update -qqy \
    && apt-get install -y --no-install-recommends \
        cargo \
        curl \
        g++ \
        gcc \
        git \
        libpoppler-dev \
        poppler-utils \
        ssh \
        unzip \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Setup args
ARG TARGETPLATFORM
ARG TARGETARCH

# Set environment variables (TARGETARCH is also exposed at runtime)
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    TARGETARCH=${TARGETARCH}

# Create working directory
WORKDIR /app

# Download pdfjs.
# Done before the sources are copied so that this layer stays cached when only
# application code changes.
# NOTE(review): the sources are copied to /app/kotaemon/... below, whereas this
# directory lives under /app/libs/... - confirm the app reads
# PDFJS_PREBUILT_DIR and does not expect the files inside /app/kotaemon/libs.
COPY rag_system/kotaemon/scripts/download_pdfjs.sh /app/scripts/download_pdfjs.sh
ENV PDFJS_PREBUILT_DIR="/app/libs/ktem/ktem/assets/prebuilt/pdfjs-dist"
RUN chmod +x /app/scripts/download_pdfjs.sh \
    && bash /app/scripts/download_pdfjs.sh "$PDFJS_PREBUILT_DIR"

# Copy contents
COPY rag_system /app
COPY rag_system/kotaemon/launch.sh /app/launch.sh
# NOTE(review): the example env file is baked into the image as /app/.env;
# make sure it never contains real credentials.
COPY rag_system/kotaemon/.env.example /app/.env

WORKDIR /app/kotaemon

# Install pip packages (paths are relative to /app/kotaemon).
# --no-cache-dir and the final rm keep download caches out of the layer.
RUN pip install --no-cache-dir -e "libs/kotaemon" \
    && pip install --no-cache-dir -e "libs/ktem" \
    && pip install --no-cache-dir -e "libs/pipelineblocks" \
    && pip install --no-cache-dir "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements" \
    && rm -rf ~/.cache

ENTRYPOINT ["sh", "/app/launch.sh"]
57+
58+
# Full version: extends the lite stage with OCR, office/media tooling and the
# heavier Python packages. Inherits WORKDIR /app/kotaemon from the lite stage.
FROM lite AS full

# Additional dependencies for full version.
# The apt caches are removed in the same layer that creates them; deleting
# them in a later RUN would not reduce the image size.
RUN apt-get update -qqy \
    && apt-get install -y --no-install-recommends \
        ffmpeg \
        libmagic-dev \
        libreoffice \
        libsm6 \
        libxext6 \
        tesseract-ocr \
        tesseract-ocr-jpn \
    && apt-get autoremove -y \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install torch and torchvision for unstructured (wheels from the CPU index)
RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
RUN pip install --no-cache-dir psycopg2-binary logfire pydantic==2.10.6

# Install additional pip packages.
# The extras spec is quoted so the shell cannot treat "[...]" as a glob.
RUN pip install --no-cache-dir -e "libs/kotaemon[adv]" \
    && pip install --no-cache-dir "unstructured[all-docs]"

# lightRAG is switched off by default in this image
ENV USE_LIGHTRAG=false

RUN pip install --no-cache-dir "docling<=2.5.2" \
    && rm -rf ~/.cache

# The lite stage already defines this ENTRYPOINT. A CMD with the same value
# would be appended to it as arguments ("sh /app/launch.sh sh /app/launch.sh"),
# so the entrypoint is restated here instead of adding a CMD.
ENTRYPOINT ["sh", "/app/launch.sh"]

0 commit comments

Comments
 (0)