Skip to content

Commit 5b879b5

Browse files
feat: add multi-modal support for images and PDFs with OCR
- Add image attachment support via Discord URLs
- Add PDF/document text extraction using markitdown
- Add OCR fallback for scanned PDFs (ocrmypdf + tesseract)
- Support .env.local for dev environment (falls back to .env for production)
- Extend MessageBuilder to handle text, images, and documents
- Update ClaudeSessionPool to support structured prompts with multi-modal content

Supported formats: PDF, DOCX, XLSX, PPTX + images (JPEG, PNG, GIF, WebP)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 9aa6b6f commit 5b879b5

4 files changed

Lines changed: 327 additions & 10 deletions

File tree

discord-server/Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ RUN apt-get update && apt-get install -y \
3737
python3 \
3838
python3-pip \
3939
wget \
40+
ocrmypdf \
41+
tesseract-ocr \
42+
tesseract-ocr-deu \
4043
&& rm -rf /var/lib/apt/lists/* \
4144
&& apt-get clean
4245

@@ -63,6 +66,9 @@ RUN wget -q -O /tmp/claude-install.sh https://claude.ai/install.sh && \
6366
timeout 120 bash /tmp/claude-install.sh || exit 1 && \
6467
rm -f /tmp/claude-install.sh
6568

69+
# Install markitdown for PDF/document processing
70+
RUN /home/node/.local/bin/uv pip install --system markitdown
71+
6672
# Copy application code AFTER CLI install (so code changes don't rebuild Claude CLI)
6773
COPY . .
6874

discord-server/bot.js

Lines changed: 32 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
import dotenv from 'dotenv';
2-
dotenv.config();
2+
import { fileURLToPath } from 'url';
3+
import { dirname, join } from 'path';
4+
5+
const __dirname = dirname(fileURLToPath(import.meta.url));
6+
7+
// Load .env.local first (dev), fallback to .env (production)
8+
dotenv.config({ path: join(__dirname, '.env.local') });
9+
dotenv.config(); // Fallback for vars not in .env.local
310

411
import express from 'express';
512

@@ -31,6 +38,7 @@ import { ProgressReporter } from './lib/ProgressReporter.js';
3138
import ThreadNamer from './lib/ThreadNamer.js';
3239
import { PerformanceTracer } from './lib/PerformanceTracer.js';
3340
import ClaudeSessionPool from './lib/ClaudeSessionPool.js';
41+
import { buildStructuredPrompt } from './lib/MessageBuilder.js';
3442

3543
const client = new Client({
3644
intents: [
@@ -132,7 +140,16 @@ client.on('messageCreate', async (message) => {
132140

133141
if (!isInboxChannel && !isInboxThread) return;
134142

143+
// Build structured prompt with text, images, and documents
144+
const structuredPrompt = await buildStructuredPrompt(message);
145+
135146
console.log(`📝 Processing message from ${message.author.tag}: ${message.content}`);
147+
if (structuredPrompt.hasImages) {
148+
console.log(`🖼️ Found ${structuredPrompt.images.length} image(s): ${structuredPrompt.images.map(i => i.name).join(', ')}`);
149+
}
150+
if (structuredPrompt.hasDocuments) {
151+
console.log(`📄 Found ${structuredPrompt.documents.length} document(s): ${structuredPrompt.documents.map(d => d.name).join(', ')}`);
152+
}
136153

137154
// Initialize performance tracing
138155
const tracer = new PerformanceTracer();
@@ -168,25 +185,33 @@ client.on('messageCreate', async (message) => {
168185
await progressReporter.start();
169186
tracer.endPhase();
170187

171-
// Build conversation context
188+
// Record prompt size and attachment counts from the structured prompt (text, images, documents)
172189
// Skip conversation context building - sessions handle context
173-
const conversationPrompt = message.content;
174190
tracer.endPhase({
175-
promptLength: conversationPrompt.length,
176-
promptPreview: conversationPrompt.slice(0, 100) + '...'
191+
promptLength: structuredPrompt.text.length,
192+
promptPreview: structuredPrompt.text.slice(0, 100) + '...',
193+
hasImages: structuredPrompt.hasImages,
194+
imageCount: structuredPrompt.images.length,
195+
hasDocuments: structuredPrompt.hasDocuments,
196+
documentCount: structuredPrompt.documents.length
177197
});
178198

179199
// Process with Claude Code SDK with retry logic
180200
console.log('🤖 Calling Claude with conversation context...');
181201

182202
tracer.startPhase('claude_sdk_call', {
183-
promptLength: conversationPrompt.length,
203+
promptLength: structuredPrompt.text.length,
204+
hasImages: structuredPrompt.hasImages,
205+
imageCount: structuredPrompt.images.length,
206+
hasDocuments: structuredPrompt.hasDocuments,
207+
documentCount: structuredPrompt.documents.length,
184208
vaultPath: process.env.OBSIDIAN_VAULT_PATH || '/srv/claude-jobs/obsidian-vault'
185209
});
186210

187211
const claudeResult = await errorHandler.retryOperation(async () => {
188212
// Use session pool instead of direct query
189-
const stream = await sessionPool.processMessage(thread.id, conversationPrompt);
213+
// Pass structured prompt for image support
214+
const stream = await sessionPool.processMessage(thread.id, structuredPrompt);
190215

191216
// Register stream for /stop command
192217
activeStreams.set(thread.id, {

discord-server/lib/ClaudeSessionPool.js

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { query } from '@anthropic-ai/claude-agent-sdk';
2+
import { createMessageIterable } from './MessageBuilder.js';
23

34
class ClaudeSessionPool {
45
constructor(redisClient) {
@@ -17,6 +18,12 @@ class ClaudeSessionPool {
1718

1819
async getOrCreateSession(threadId, initialPrompt = null) {
1920
try {
21+
// Normalize prompt - support both string and structured format
22+
const isStructured = typeof initialPrompt === 'object' && initialPrompt?.hasImages !== undefined;
23+
const textPrompt = isStructured ? initialPrompt.text : initialPrompt;
24+
const hasImages = isStructured && initialPrompt.hasImages;
25+
const content = isStructured ? initialPrompt.content : null;
26+
2027
// Check memory cache first - if found, use it exclusively
2128
if (this.sessionCache.has(threadId)) {
2229
const metadata = this.sessionMetadata.get(threadId);
@@ -40,8 +47,13 @@ class ClaudeSessionPool {
4047
console.log(`🔄 Resuming session ${persistedSessionId} for thread ${threadId}`);
4148

4249
try {
50+
// Use AsyncIterable for messages with images
51+
const promptValue = hasImages
52+
? createMessageIterable(content, persistedSessionId)
53+
: (textPrompt || "Continue our conversation");
54+
4355
const session = query({
44-
prompt: initialPrompt || "Continue our conversation",
56+
prompt: promptValue,
4557
options: {
4658
resume: persistedSessionId,
4759
cwd: process.env.OBSIDIAN_VAULT_PATH || '/srv/claude-jobs/obsidian-vault',
@@ -75,8 +87,13 @@ class ClaudeSessionPool {
7587
// Create new session
7688
console.log(`🆕 Creating new Claude session for thread ${threadId}`);
7789

90+
// Use AsyncIterable for messages with images
91+
const promptValue = hasImages
92+
? createMessageIterable(content, '')
93+
: (textPrompt || "Start new Discord conversation");
94+
7895
const session = query({
79-
prompt: initialPrompt || "Start new Discord conversation",
96+
prompt: promptValue,
8097
options: {
8198
cwd: process.env.OBSIDIAN_VAULT_PATH || '/srv/claude-jobs/obsidian-vault',
8299
maxTurns: 100,
@@ -111,8 +128,24 @@ class ClaudeSessionPool {
111128
}
112129
}
113130

131+
/**
132+
* Process a message with optional image support
133+
* @param {string} threadId - Discord thread ID
134+
* @param {string|Object} prompt - Either a string or structured prompt from buildStructuredPrompt
135+
* @returns {Query} Claude session stream
136+
*/
114137
async processMessage(threadId, prompt) {
115138
try {
139+
// Normalize prompt - support both string and structured format
140+
const isStructured = typeof prompt === 'object' && prompt.hasImages !== undefined;
141+
const textPrompt = isStructured ? prompt.text : prompt;
142+
const hasImages = isStructured && prompt.hasImages;
143+
const content = isStructured ? prompt.content : null;
144+
145+
if (hasImages) {
146+
console.log(`🖼️ Processing message with ${prompt.images.length} image(s)`);
147+
}
148+
116149
// Check if this is a cached session (existing conversation)
117150
if (this.sessionCache.has(threadId)) {
118151
console.log(`🔄 Continuing existing session for thread ${threadId}`);
@@ -127,8 +160,14 @@ class ClaudeSessionPool {
127160

128161
// Resume specific session by ID with detailed logging
129162
console.log(`🔄 Resuming session ${metadata.sessionId} for thread ${threadId}`);
163+
164+
// Use AsyncIterable for messages with images
165+
const promptValue = hasImages
166+
? createMessageIterable(content, metadata.sessionId)
167+
: textPrompt;
168+
130169
const continueSession = query({
131-
prompt: prompt,
170+
prompt: promptValue,
132171
options: {
133172
resume: metadata.sessionId,
134173
cwd: process.env.OBSIDIAN_VAULT_PATH || '/srv/claude-jobs/obsidian-vault',

0 commit comments

Comments
 (0)