Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion RAGManager/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,21 @@ class Settings(BaseSettings):
# --- MinIO object-storage credentials (class header is above this view) ---
minio_access_key: str
minio_secret_key: str
minio_bucket: str
minio_secure: bool = True  # NOTE(review): appears superseded by minio_use_ssl in this change — confirm only one setting is kept
minio_use_ssl: bool = True  # whether the MinIO client connects over TLS

# OpenAI Configuration
openai_api_key: str

# Database Configuration
database_url: str  # SQLAlchemy-style connection URL — TODO confirm expected scheme

# RabbitMQ Configuration
rabbitmq_user: str
rabbitmq_password: str
rabbitmq_host: str = "localhost"
rabbitmq_port: int = 5672  # default AMQP port
rabbitmq_queue_name: str = "document.process"  # queue consumed by the PDF worker

# Chunking Configuration
chunk_size: int = 1000  # characters per chunk fed to the RAG pipeline
chunk_overlap: int = 200  # characters shared between adjacent chunks
Expand Down Expand Up @@ -67,6 +74,15 @@ class Settings(BaseSettings):
extra="ignore",
)

@property
def rabbitmq_url(self) -> str:
    """AMQP connection URL for RabbitMQ, with credentials URL-encoded.

    Encoding via quote_plus keeps the URL valid when the username or
    password contains reserved characters such as '@', ':' or '/'.
    """
    # Local import keeps the dependency next to its single use site;
    # module imports are cached, so repeated property access is cheap.
    from urllib.parse import quote_plus

    user = quote_plus(self.rabbitmq_user)
    password = quote_plus(self.rabbitmq_password)
    return (
        f"amqp://{user}:{password}"
        f"@{self.rabbitmq_host}:{self.rabbitmq_port}/"
    )


settings = Settings()

99 changes: 99 additions & 0 deletions RAGManager/app/core/rabbitmq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import json
Copy link

Copilot AI Dec 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Import of 'json' is not used.

Suggested change
import json

Copilot uses AI. Check for mistakes.
import logging
from typing import Callable, Optional

import pika
from pika.exceptions import AMQPConnectionError

from app.core.config import settings

logger = logging.getLogger(__name__)


class RabbitMQConnection:
    """Handles connection and operations with RabbitMQ.

    Wraps a single blocking pika connection/channel pair. Not thread-safe:
    each consumer thread should own its own instance.
    """

    def __init__(self):
        # Lazily established by connect(); None until then.
        self.connection: Optional[pika.BlockingConnection] = None
        self.channel: Optional[pika.channel.Channel] = None

    def connect(self):
        """Establishes connection with RabbitMQ.

        Raises:
            AMQPConnectionError: if the broker cannot be reached.
        """
        try:
            url = settings.rabbitmq_url
            logger.info(f"Connecting to RabbitMQ at {settings.rabbitmq_host}:{settings.rabbitmq_port}")
            # Redact BOTH username and password — credentials must never
            # reach logs, even at DEBUG level.
            logger.debug(
                f"RabbitMQ URL: amqp://***:***@{settings.rabbitmq_host}:{settings.rabbitmq_port}/"
            )

            self.connection = pika.BlockingConnection(pika.URLParameters(url))
            self.channel = self.connection.channel()
            logger.info("Connected to RabbitMQ")
        except AMQPConnectionError as e:
            logger.error(f"Failed to connect to RabbitMQ: {e}")
            logger.error(f"Configured host: {settings.rabbitmq_host}")
            logger.error(f"Configured port: {settings.rabbitmq_port}")
            raise
        except Exception as e:
            logger.error(f"Unexpected error connecting to RabbitMQ: {e}")
            logger.error(f"Error type: {type(e).__name__}")
            raise

    def close(self):
        """Closes the channel first, then the connection, if open."""
        if self.channel and not self.channel.is_closed:
            self.channel.close()
            logger.info("RabbitMQ channel closed")
        if self.connection and not self.connection.is_closed:
            self.connection.close()
            logger.info("RabbitMQ connection closed")

    def declare_queue(self, queue_name: str, durable: bool = True):
        """Declares a queue (idempotent).

        Note:
            A durable queue survives broker restarts, but that alone does not
            make MESSAGES persistent — publishers must also set
            delivery_mode=2 (e.g. via pika.BasicProperties) for messages that
            must survive a restart.
        """
        # Reconnect if we never connected OR the channel was closed since.
        if not self.channel or self.channel.is_closed:
            self.connect()
        self.channel.queue_declare(queue=queue_name, durable=durable)
        logger.info(f"Queue '{queue_name}' declared")

    def consume_messages(self, queue_name: str, callback: Callable):
        """
        Start consuming messages from the queue (blocks until stopped).

        Args:
            queue_name: Name of the queue to consume from
            callback: Callback function to process messages; it must
                ack/nack each delivery itself (auto_ack is disabled).
        """
        if not self.channel or self.channel.is_closed:
            self.connect()

        # Declare queue (idempotent operation)
        self.declare_queue(queue_name)

        # QoS: deliver at most one unacknowledged message at a time, so a
        # slow PDF job does not pile up deliveries on this consumer.
        self.channel.basic_qos(prefetch_count=1)

        # Start consuming
        self.channel.basic_consume(
            queue=queue_name,
            on_message_callback=callback,
            auto_ack=False,  # Manual acknowledgment
        )

        logger.info(f"Started consuming messages from queue '{queue_name}'")
        logger.info("Waiting for messages. To exit press CTRL+C")

        try:
            self.channel.start_consuming()
        except KeyboardInterrupt:
            logger.info("Interrupted by user")
            self.channel.stop_consuming()
        except Exception as e:
            logger.error(f"Error while consuming messages: {e}")
            raise
        finally:
            # Always release broker resources, even on error paths.
            self.close()


# Global instance
rabbitmq = RabbitMQConnection()

Comment on lines +97 to +99
Copy link

Copilot AI Dec 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The global RabbitMQConnection instance is created at module level but the start_consumer function creates a new instance. This could lead to confusion about connection management and makes it unclear which instance should be used. Consider either using the global instance or removing it if not needed.

Suggested change
# Global instance
rabbitmq = RabbitMQConnection()

Copilot uses AI. Check for mistakes.
18 changes: 16 additions & 2 deletions RAGManager/app/services/minio_client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# MinIO client configuration and utilities.

import logging
from urllib.parse import urlparse

import certifi
import urllib3
Expand All @@ -14,6 +15,19 @@

def get_minio_client() -> Minio:
"""Create a MinIO client with proper timeout and retry configuration."""
# Parse endpoint to extract host and port (urlparse strips the scheme automatically)
parsed = urlparse(settings.minio_endpoint)
endpoint = parsed.netloc or parsed.path

# Validate that the endpoint is not empty
if not endpoint or endpoint.strip() == "":
error_msg = (
f"Invalid MinIO endpoint configuration: '{settings.minio_endpoint}'. "
"Endpoint must be a valid host or host:port (e.g., 'localhost:9000')"
)
logger.error(error_msg)
raise ValueError(error_msg)

# Configure timeout: 10s connect, 30s read
timeout = UrllibTimeout(connect=10, read=30)

Expand All @@ -34,10 +48,10 @@ def get_minio_client() -> Minio:
)

return Minio(
endpoint=settings.minio_endpoint,
endpoint=endpoint,
access_key=settings.minio_access_key,
secret_key=settings.minio_secret_key,
secure=settings.minio_secure,
secure=settings.minio_use_ssl,
http_client=http_client,
)

Expand Down
2 changes: 2 additions & 0 deletions RAGManager/app/workers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""Workers package for background processing tasks."""

121 changes: 121 additions & 0 deletions RAGManager/app/workers/pdf_processor_consumer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
"""RabbitMQ consumer for processing PDFs from MinIO events."""

import json
import logging
from urllib.parse import unquote

from app.core.config import settings
from app.core.rabbitmq import RabbitMQConnection
from app.services.pipeline import process_pdf_pipeline

logger = logging.getLogger(__name__)


def extract_pdf_path(message_body: dict) -> str:
    """
    Pull the object path out of a MinIO bucket-notification event.

    Args:
        message_body: Parsed JSON message from MinIO

    Returns:
        str: Decoded object path (e.g., "rag-docs/file.pdf"); keys arrive
        URL-encoded ("rag-docs%2Ffile.pdf") and are decoded here.

    Raises:
        ValueError: If message structure is invalid
    """
    records = message_body.get("Records", [])
    if not records:
        raise ValueError("No Records found in message")

    # The object key lives at Records[0].s3.object.key in S3-style events.
    try:
        object_key = records[0]["s3"]["object"]["key"]
    except (KeyError, IndexError) as e:
        raise ValueError(f"Invalid message structure: {e}") from e

    return unquote(object_key)


def message_callback(ch, method, properties, body):
    """
    Callback function to process RabbitMQ messages.

    Parses a MinIO event, runs the PDF pipeline for .pdf objects, and
    acks/nacks the delivery explicitly (the consumer uses auto_ack=False).

    Args:
        ch: pika channel the delivery arrived on (used for ack/nack).
        method: pika delivery metadata; supplies delivery_tag.
        properties: pika message properties (unused here).
        body: Message body (bytes), expected to be a JSON document.
    """
    try:
        # Parse JSON message
        message = json.loads(body)
        logger.info("Received message from RabbitMQ")
        logger.debug(f"Message content: {message}")

        # Extract PDF path
        pdf_path = extract_pdf_path(message)
        logger.info(f"Extracted PDF path: {pdf_path}")

        # Only process PDFs; anything else is acked and dropped on purpose.
        if not pdf_path.lower().endswith('.pdf'):
            logger.info(f"Skipping non-PDF file: {pdf_path}")
            ch.basic_ack(delivery_tag=method.delivery_tag)
            return

        # Call the existing pipeline
        logger.info(f"Starting PDF processing pipeline for: {pdf_path}")
        document_id = process_pdf_pipeline(pdf_path)
        logger.info(f"PDF processed successfully: {pdf_path} -> Document ID: {document_id}")

        # Acknowledge the message only after the pipeline succeeded.
        ch.basic_ack(delivery_tag=method.delivery_tag)
        logger.info(f"Message acknowledged for: {pdf_path}")

    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse JSON message: {e}")
        # NACK without requeue - malformed messages won't be fixed by retrying
        ch.basic_nack(delivery_tag=method.delivery_tag, requeue=False)
    except ValueError as e:
        logger.error(f"Invalid message structure: {e}")
        # NACK without requeue - invalid structure won't be fixed by retrying
        ch.basic_nack(delivery_tag=method.delivery_tag, requeue=False)
    except Exception as e:
        logger.error(f"Error processing message: {e}", exc_info=True)
        # NACK without requeue to avoid infinite poison-message loops.
        # TODO(review): transient failures (e.g. DB blips) are discarded too —
        # consider a dead-letter queue or bounded retry before giving up.
        ch.basic_nack(delivery_tag=method.delivery_tag, requeue=False)
Comment on lines +86 to +90
Copy link

Copilot AI Dec 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The consumer lacks retry logic or circuit breaker patterns. If the process_pdf_pipeline encounters transient failures (e.g., temporary database unavailability), the message will be discarded (NACK without requeue). Consider implementing exponential backoff retries for transient errors before giving up, or using a dead-letter exchange for failed messages that can be retried later.

Copilot uses AI. Check for mistakes.


def start_consumer():
    """
    Start the RabbitMQ consumer to process PDF files.

    Blocks for the lifetime of the consumer, so it should be executed
    in a separate thread or process.
    """
    logger.info("Starting PDF processor consumer")

    try:
        # Own a fresh connection for this consumer rather than sharing one.
        connection = RabbitMQConnection()
        connection.connect()

        queue_name = settings.rabbitmq_queue_name
        logger.info(f"Consuming messages from queue: {queue_name}")

        connection.consume_messages(
            queue_name=queue_name,
            callback=message_callback,
        )

    except KeyboardInterrupt:
        logger.info("Consumer interrupted by user")
    except Exception as e:
        logger.error(f"Fatal error in consumer: {e}", exc_info=True)
        raise
Comment on lines +100 to +120
Copy link

Copilot AI Dec 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The start_consumer function does not handle connection failures gracefully after startup. If the RabbitMQ connection drops during operation, the consumer will crash and the exception will be raised in the daemon thread, but there's no mechanism to restart it. Consider implementing automatic reconnection logic with exponential backoff to make the consumer more resilient to network issues.

Copilot uses AI. Check for mistakes.

27 changes: 24 additions & 3 deletions RAGManager/main.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import logging
import threading

from fastapi import FastAPI

from app.api.routes import router as api_router
from app.api.routes.base import router as base_router
from app.core.database_connection import init_db
from app.workers.pdf_processor_consumer import start_consumer

# Configure logging
logging.basicConfig(
Expand All @@ -21,10 +24,28 @@

@app.on_event("startup")
async def startup_event():
    """Initialize database and start RabbitMQ consumer on startup."""
    try:
        init_db()
        logging.info("Database initialized successfully")

        # Start RabbitMQ consumer in a separate daemon thread
        # NOTE(review): daemon thread is killed abruptly at process exit and
        # has no restart-on-crash handling — confirm this is acceptable.
        consumer_thread = threading.Thread(target=start_consumer, daemon=True)
        consumer_thread.start()
        logging.info("RabbitMQ consumer started successfully")

    except Exception as e:
        logging.error(f"Failed to initialize: {e}")
        raise


@app.get("/")
async def root():
    """Root endpoint; returns a static greeting payload."""
    return {"message": "Hello World"}


@app.get("/health")
def health_check():
    """Liveness probe; response text kept byte-identical (Spanish)."""
    return {"message": "200 corriendo..."}
Copy link

Copilot AI Dec 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The health check endpoint returns a Spanish message "200 corriendo..." which is inconsistent with the English message in the root endpoint. This creates language inconsistency in the API responses. Consider using English consistently throughout the API or implementing proper internationalization.

Suggested change
return {"message": "200 corriendo..."}
return {"message": "200 running..."}

Copilot uses AI. Check for mistakes.


1 change: 1 addition & 0 deletions RAGManager/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ dependencies = [
"minio>=7.2.20",
"presidio-analyzer>=2.2.360",
"presidio-anonymizer>=2.2.360",
"pika>=1.3.0",
]

[project.optional-dependencies]
Expand Down