Skip to content

Commit 1e6d28a

Browse files
Feature/documentation rag (#5)
Creación de los endpoints para subir y listar documentación, y creación de dos archivos .md sobre los endpoints para subir "documentos" y la explicación del flujo del backend. --------- Co-authored-by: Martin Silva <martin.s@lightit.io>
1 parent e656da3 commit 1e6d28a

16 files changed

+676
-7
lines changed

.env.example

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,12 @@ POSTGRES_USER=postgres
66
POSTGRES_PASSWORD=postgres
77
POSTGRES_DB=ithaka
88

9+
# Auth para endpoints administrativos (documents create/upload/delete)
10+
# Opcion simple:
11+
# ADMIN_API_TOKEN=pon_un_token_largo_y_aleatorio
12+
# Opcion con roles (formato token:rol separados por coma):
13+
# AUTH_TOKENS=token_admin_1:admin,token_lector_1:reader
14+
915
# Other environment variables (add as needed)
1016
# TWILIO_ACCOUNT_SID=your_twilio_sid
1117
# TWILIO_AUTH_TOKEN=your_twilio_token

Dockerfile

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,20 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
1010
# Set work directory
1111
WORKDIR /app
1212

13+
# Pin uv to a trusted version; bump deliberately during dependency maintenance.
14+
ARG UV_VERSION=0.5.31
15+
1316
# Install system dependencies
1417
RUN apt-get update \
1518
&& apt-get install -y --no-install-recommends \
1619
build-essential \
1720
curl \
1821
&& rm -rf /var/lib/apt/lists/*
1922

20-
# Install Python dependencies
23+
# Install Python dependencies with uv
2124
COPY requirements.txt .
22-
RUN pip install --no-cache-dir --upgrade pip \
23-
&& pip install --no-cache-dir -r requirements.txt
25+
RUN pip install --no-cache-dir "uv==${UV_VERSION}" \
26+
&& uv pip install --system -r requirements.txt
2427

2528
# Copy project
2629
COPY . .

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ print(result)
203203

204204
### Error: "No module named 'langchain'"
205205
```bash
206-
pip install -r requirements.txt
206+
uv pip install -r requirements.txt
207207
```
208208

209209
### Error: "Extension vector does not exist"
@@ -248,7 +248,7 @@ Verificar `DATABASE_URL` en `.env`
248248
Asegúrate de tener las dependencias necesarias:
249249

250250
```
251-
pip install -r requirements.txt
251+
uv pip install -r requirements.txt
252252
```
253253

254254
## 2. Configurar variables de entorno

app/api/v1/documents.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile
2+
from sqlalchemy import select
3+
from sqlalchemy.ext.asyncio import AsyncSession
4+
5+
from app.api.v1.schemas.documents import (
6+
DocumentCreate,
7+
DocumentResponse,
8+
DocumentUploadResponse,
9+
)
10+
from app.db.config.database import get_async_session
11+
from app.db.models import FAQEmbedding
12+
from app.security.auth import AuthUser, require_admin_user
13+
from app.services.document_ingestion_service import document_ingestion_service
14+
from app.services.embedding_service import embedding_service
15+
16+
router = APIRouter()
17+
18+
19+
@router.post("/documents", response_model=DocumentResponse, status_code=201)
async def create_document(
    document: DocumentCreate,
    _current_user: AuthUser = Depends(require_admin_user),
    session: AsyncSession = Depends(get_async_session),
) -> DocumentResponse:
    """Create one question/answer document (admin only).

    The work is delegated to ``embedding_service.add_faq_embedding``; this
    handler only maps its result onto ``DocumentResponse``.

    Args:
        document: Validated request body with ``question`` and ``answer``.
        _current_user: Unused; present so ``require_admin_user`` runs.
        session: Async database session injected by FastAPI.

    Returns:
        The created document as a ``DocumentResponse``.

    Raises:
        HTTPException: 500 when the service returns ``None`` or when any
            unexpected error occurs.
    """
    try:
        created = await embedding_service.add_faq_embedding(
            question=document.question,
            answer=document.answer,
            session=session,
        )

        if created is None:
            raise HTTPException(
                status_code=500,
                detail="No se pudo crear el documento y su embedding",
            )

        return DocumentResponse(
            id=created.id,
            question=created.question,
            answer=created.answer,
            created_at=created.created_at,
        )
    except HTTPException:
        # Already a well-formed HTTP error; do not wrap it again.
        raise
    except Exception as exc:
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(status_code=500, detail="Error creando documento") from exc
48+
49+
50+
@router.post("/documents/upload", response_model=DocumentUploadResponse, status_code=201)
async def upload_document(
    file: UploadFile = File(...),
    chunk_size: int = Query(1200, ge=200, le=4000),
    chunk_overlap: int = Query(150, ge=0, le=1000),
    _current_user: AuthUser = Depends(require_admin_user),
    session: AsyncSession = Depends(get_async_session),
) -> DocumentUploadResponse:
    """Upload a file, split it into chunks and store one row per chunk (admin only).

    Each chunk is stored as a ``FAQEmbedding`` whose ``question`` is a
    generated title (``"<filename> - fragmento i/total"``) and whose
    ``answer`` is the chunk text. All rows are committed together; on any
    failure the transaction is rolled back.

    Args:
        file: The uploaded file.
        chunk_size: Chunk size passed to ``split_text`` (200-4000).
        chunk_overlap: Chunk overlap passed to ``split_text`` (0-1000); must
            be strictly less than half of ``chunk_size``.
        _current_user: Unused; present so ``require_admin_user`` runs.
        session: Async database session injected by FastAPI.

    Returns:
        Summary with the file name, detected file type and created row ids.

    Raises:
        HTTPException: 400 for a missing file name, invalid chunk settings,
            an empty or oversized file, or a file with no usable text;
            500 when embedding generation or persistence fails.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="Archivo sin nombre")

    if chunk_overlap >= chunk_size:
        raise HTTPException(
            status_code=400,
            detail="chunk_overlap debe ser menor que chunk_size",
        )
    if chunk_overlap * 2 >= chunk_size:
        raise HTTPException(
            status_code=400,
            detail="chunk_overlap debe ser menor al 50% de chunk_size",
        )

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large file into memory.
    max_bytes = document_ingestion_service.max_file_size_bytes
    content = await file.read(max_bytes + 1)
    if not content:
        raise HTTPException(status_code=400, detail="Archivo vacio")

    if len(content) > max_bytes:
        raise HTTPException(status_code=400, detail="Archivo demasiado grande (max 20MB)")

    file_type, text = document_ingestion_service.extract_file_text(file.filename, content)
    chunks = document_ingestion_service.split_text(
        text,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    if not chunks:
        raise HTTPException(status_code=400, detail="No se encontro texto util en el archivo")

    total = len(chunks)
    items: list[FAQEmbedding] = []

    try:
        for index, chunk in enumerate(chunks, start=1):
            title = f"{file.filename} - fragmento {index}/{total}"
            combined_text = f"Pregunta: {title}\nRespuesta: {chunk}"
            vector = await embedding_service.generate_embedding(combined_text)
            item = FAQEmbedding(question=title, answer=chunk, embedding=vector)
            session.add(item)
            items.append(item)

        # One flush for the whole batch assigns every primary key; the ids
        # are collected before commit, while the instances are still loaded.
        await session.flush()
        created_ids: list[int] = [item.id for item in items]

        await session.commit()
    except Exception as exc:
        await session.rollback()
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(status_code=500, detail="Error procesando el archivo") from exc

    return DocumentUploadResponse(
        filename=file.filename,
        file_type=file_type,
        chunks_created=len(created_ids),
        document_ids=created_ids,
    )
113+
114+
115+
@router.get("/documents", response_model=list[DocumentResponse])
async def list_documents(
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0),
    session: AsyncSession = Depends(get_async_session),
) -> list[DocumentResponse]:
    """Return stored documents, newest first, with limit/offset pagination.

    This endpoint declares no authentication dependency.

    Args:
        limit: Maximum number of rows to return (1-200, default 50).
        offset: Number of rows to skip (>= 0).
        session: Async database session injected by FastAPI.

    Returns:
        The page of documents ordered by ``created_at`` descending.

    Raises:
        HTTPException: 500 when the query or the response mapping fails.
    """
    try:
        stmt = (
            select(FAQEmbedding)
            .order_by(FAQEmbedding.created_at.desc())
            .limit(limit)
            .offset(offset)
        )

        result = await session.execute(stmt)
        documents = result.scalars().all()

        return [
            DocumentResponse(
                id=document.id,
                question=document.question,
                answer=document.answer,
                created_at=document.created_at,
            )
            for document in documents
        ]
    except Exception as exc:
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(status_code=500, detail="Error listando documentos") from exc
142+
143+
144+
@router.delete("/documents/{document_id}")
async def delete_document(
    document_id: int,
    _current_user: AuthUser = Depends(require_admin_user),
    session: AsyncSession = Depends(get_async_session),
) -> dict:
    """Delete the document with the given id (admin only).

    Args:
        document_id: Primary key of the ``FAQEmbedding`` row to delete.
        _current_user: Unused; present so ``require_admin_user`` runs.
        session: Async database session injected by FastAPI.

    Returns:
        A dict with a single ``"message"`` key confirming the deletion.

    Raises:
        HTTPException: 404 when no row has that id; 500 when the lookup or
            the delete fails (the transaction is rolled back first).
    """
    try:
        result = await session.execute(
            select(FAQEmbedding).where(FAQEmbedding.id == document_id)
        )
        document = result.scalar_one_or_none()

        if document is None:
            raise HTTPException(status_code=404, detail="Documento no encontrado")

        await session.delete(document)
        await session.commit()

        return {"message": f"Documento {document_id} eliminado correctamente"}
    except HTTPException:
        # The 404 above must reach the client unchanged.
        raise
    except Exception as exc:
        await session.rollback()
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(status_code=500, detail="Error eliminando documento") from exc

app/api/v1/schemas/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

app/api/v1/schemas/documents.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from datetime import datetime
2+
from typing import Optional
3+
4+
from pydantic import BaseModel, Field
5+
6+
7+
class DocumentCreate(BaseModel):
    """Request body for creating a single question/answer document."""

    # Both fields are required and must contain at least 3 characters.
    question: str = Field(..., min_length=3, description="Pregunta o titulo del documento")
    answer: str = Field(..., min_length=3, description="Respuesta o contenido del documento")
10+
11+
12+
class DocumentResponse(BaseModel):
    """Response representation of one stored document."""

    id: int
    question: str
    answer: str
    # May be None; no default is declared, so the field must still be passed.
    created_at: Optional[datetime]
17+
18+
19+
class DocumentUploadResponse(BaseModel):
    """Response summary returned after a file upload has been ingested."""

    filename: str
    file_type: str
    # Number of chunk rows created, and their ids.
    chunks_created: int
    document_ids: list[int]

app/db/config/create_tables.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
from sqlalchemy import text
55

66
from app.db.config.database import Base, engine
7+
from app.db import models # noqa: F401 - needed so SQLAlchemy registers model metadata
8+
9+
# Connection string read from the environment; empty string when unset.
# (The assignment was duplicated a few lines apart; one definition suffices.)
DATABASE_URL = os.getenv("DATABASE_URL", "")
912

@@ -16,4 +19,4 @@ async def create_tables():
1619
await conn.run_sync(Base.metadata.create_all)
1720

1821
if __name__ == "__main__":
19-
asyncio.run(create_tables())
22+
asyncio.run(create_tables())

app/main.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
from app.api.v1.conversations import router as conversations_router
1616
from app.api.v1.copilotkit_endpoint import router as copilotkit_router
17+
from app.api.v1.documents import router as documents_router
1718
from app.api.v1.scoring import router as scoring_router
1819

1920
v1 = '/api/v1'
@@ -30,7 +31,8 @@
3031
allow_headers=["*"],
3132
)
3233

33-
app.include_router(conversations_router)
34+
app.include_router(conversations_router, prefix=v1, tags=["Conversations"])
35+
app.include_router(documents_router, prefix=v1, tags=["Documents"])
3436
app.include_router(scoring_router, prefix=v1, tags=["Scoring"])
3537
app.include_router(copilotkit_router, prefix=v1, tags=["CopilotKit"])
3638

app/security/auth.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import os
2+
import secrets
3+
from dataclasses import dataclass
4+
5+
from fastapi import Depends, HTTPException, status
6+
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
7+
8+
bearer_scheme = HTTPBearer(auto_error=False)
9+
10+
11+
@dataclass(frozen=True)
class AuthUser:
    """Immutable identity of an authenticated caller."""

    # Role name associated with the presented token (e.g. "admin").
    role: str
14+
15+
16+
def _load_token_roles() -> list[tuple[str, str]]:
    """Read the configured ``(token, role)`` pairs from the environment.

    Supported formats:
    - ``ADMIN_API_TOKEN=<admin_token>`` yields one pair with role ``"admin"``.
    - ``AUTH_TOKENS=<token1>:<role1>,<token2>:<role2>`` yields one pair per
      well-formed entry; roles are lower-cased.

    Entries without a ``:``, or with an empty token or role after trimming,
    are ignored. The ``ADMIN_API_TOKEN`` pair, when present, comes first.
    """
    pairs: list[tuple[str, str]] = []

    single_admin = os.getenv("ADMIN_API_TOKEN", "").strip()
    if single_admin:
        pairs.append((single_admin, "admin"))

    for entry in os.getenv("AUTH_TOKENS", "").split(","):
        # Split on the first ":" only, so the role keeps any later colons.
        raw_token, separator, raw_role = entry.partition(":")
        token = raw_token.strip()
        role = raw_role.strip().lower()
        if separator and token and role:
            pairs.append((token, role))

    return pairs
42+
43+
44+
def _resolve_user_from_token(token: str) -> AuthUser | None:
    """Return the user for ``token``, or ``None`` when no configured token matches.

    The comparison is done on UTF-8 bytes: ``secrets.compare_digest`` raises
    ``TypeError`` for ``str`` arguments containing non-ASCII characters, which
    would turn a malformed bearer token into a server error instead of a
    plain "no match".
    """
    candidate = token.encode("utf-8")
    for expected_token, role in _load_token_roles():
        if secrets.compare_digest(candidate, expected_token.encode("utf-8")):
            return AuthUser(role=role)
    return None
49+
50+
51+
async def get_current_user(
    credentials: HTTPAuthorizationCredentials | None = Depends(bearer_scheme),
) -> AuthUser:
    """Authenticate the request from its ``Authorization: Bearer`` header.

    ``bearer_scheme`` is created with ``auto_error=False``, so missing
    credentials arrive here as ``None`` and are rejected explicitly.

    Args:
        credentials: Parsed authorization header, or ``None`` when absent.

    Returns:
        The ``AuthUser`` matching the presented token.

    Raises:
        HTTPException: 401 when credentials are missing, use a scheme other
            than bearer, or carry an unknown token. Both responses include
            the ``WWW-Authenticate: Bearer`` challenge header.
    """
    # A 401 response should tell the client which scheme to use.
    challenge = {"WWW-Authenticate": "Bearer"}

    if credentials is None or credentials.scheme.lower() != "bearer":
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="No autenticado",
            headers=challenge,
        )

    user = _resolve_user_from_token(credentials.credentials)
    if user is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Token invalido",
            headers=challenge,
        )

    return user
68+
69+
70+
async def require_admin_user(
    user: AuthUser = Depends(get_current_user),
) -> AuthUser:
    """Allow the request only when the authenticated user has the admin role.

    Returns:
        The same ``AuthUser`` that ``get_current_user`` resolved.

    Raises:
        HTTPException: 403 when the user's role is not ``"admin"``.
    """
    if user.role == "admin":
        return user

    raise HTTPException(
        status_code=status.HTTP_403_FORBIDDEN,
        detail="Permisos insuficientes. Se requiere rol admin",
    )

0 commit comments

Comments
 (0)