Skip to content

Commit fa7e78e

Browse files
authored
Merge pull request #752 from doerfli/feature/mistral-ai-ocr
use mistral ai (#748)
2 parents 08e6970 + 7c88852 commit fa7e78e

23 files changed

+697
-123
lines changed

Gemfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,3 +100,5 @@ gem "ruby-vips", "~> 2.2"
100100
gem "nokogiri", "~> 1.19"
101101

102102
gem "propshaft", "~> 1.3"
103+
104+
gem "omniai-mistral", "~> 3.0"

Gemfile.lock

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ GEM
147147
reline (>= 0.3.8)
148148
debug_inspector (1.2.0)
149149
diff-lcs (1.6.2)
150+
domain_name (0.6.20240107)
150151
dotenv (3.2.0)
151152
dotenv-rails (3.2.0)
152153
dotenv (= 3.2.0)
@@ -177,9 +178,20 @@ GEM
177178
ffi (1.17.3-x86_64-darwin)
178179
ffi (1.17.3-x86_64-linux-gnu)
179180
ffi (1.17.3-x86_64-linux-musl)
181+
ffi-compiler (1.3.2)
182+
ffi (>= 1.15.5)
183+
rake
180184
globalid (1.3.0)
181185
activesupport (>= 6.1)
182186
hashie (5.0.0)
187+
http (5.3.1)
188+
addressable (~> 2.8)
189+
http-cookie (~> 1.0)
190+
http-form_data (~> 2.2)
191+
llhttp-ffi (~> 0.5.0)
192+
http-cookie (1.1.0)
193+
domain_name (~> 0.5)
194+
http-form_data (2.3.0)
183195
i18n (1.14.8)
184196
concurrent-ruby (~> 1.0)
185197
image_processing (1.14.0)
@@ -222,6 +234,9 @@ GEM
222234
listen (3.9.0)
223235
rb-fsevent (~> 0.10, >= 0.10.3)
224236
rb-inotify (~> 0.9, >= 0.9.10)
237+
llhttp-ffi (0.5.1)
238+
ffi-compiler (~> 1.0)
239+
rake (~> 13.0)
225240
logger (1.7.0)
226241
loofah (2.25.0)
227242
crass (~> 1.0.2)
@@ -279,6 +294,16 @@ GEM
279294
snaky_hash (~> 2.0, >= 2.0.3)
280295
version_gem (~> 1.1, >= 1.1.9)
281296
observer (0.1.2)
297+
omniai (3.2.0)
298+
base64
299+
event_stream_parser
300+
http
301+
logger
302+
zeitwerk
303+
omniai-mistral (3.0.0)
304+
event_stream_parser
305+
omniai (~> 3.0)
306+
zeitwerk
282307
omniauth (2.1.4)
283308
hashie (>= 3.4.6)
284309
logger
@@ -536,6 +561,7 @@ DEPENDENCIES
536561
listen (>= 3.0.5)
537562
mini_magick (~> 5.3)
538563
nokogiri (~> 1.19)
564+
omniai-mistral (~> 3.0)
539565
omniauth-auth0 (~> 3.1)
540566
omniauth-rails_csrf_protection (~> 2.0)
541567
pg (>= 0.18)
@@ -603,6 +629,7 @@ CHECKSUMS
603629
debug (1.11.1) sha256=2e0b0ac6119f2207a6f8ac7d4a73ca8eb4e440f64da0a3136c30343146e952b6
604630
debug_inspector (1.2.0) sha256=9bdfa02eebc3da163833e6a89b154084232f5766087e59573b70521c77ea68a2
605631
diff-lcs (1.6.2) sha256=9ae0d2cba7d4df3075fe8cd8602a8604993efc0dfa934cff568969efb1909962
632+
domain_name (0.6.20240107) sha256=5f693b2215708476517479bf2b3802e49068ad82167bcd2286f899536a17d933
606633
dotenv (3.2.0) sha256=e375b83121ea7ca4ce20f214740076129ab8514cd81378161f11c03853fe619d
607634
dotenv-rails (3.2.0) sha256=657e25554ba622ffc95d8c4f1670286510f47f2edda9f68293c3f661b303beab
608635
drb (2.2.3) sha256=0b00d6fdb50995fe4a45dea13663493c841112e4068656854646f418fda13373
@@ -623,8 +650,12 @@ CHECKSUMS
623650
ffi (1.17.3-x86_64-darwin) sha256=1f211811eb5cfaa25998322cdd92ab104bfbd26d1c4c08471599c511f2c00bb5
624651
ffi (1.17.3-x86_64-linux-gnu) sha256=3746b01f677aae7b16dc1acb7cb3cc17b3e35bdae7676a3f568153fb0e2c887f
625652
ffi (1.17.3-x86_64-linux-musl) sha256=086b221c3a68320b7564066f46fed23449a44f7a1935f1fe5a245bd89d9aea56
653+
ffi-compiler (1.3.2) sha256=a94f3d81d12caf5c5d4ecf13980a70d0aeaa72268f3b9cc13358bcc6509184a0
626654
globalid (1.3.0) sha256=05c639ad6eb4594522a0b07983022f04aa7254626ab69445a0e493aa3786ff11
627655
hashie (5.0.0) sha256=9d6c4e51f2a36d4616cbc8a322d619a162d8f42815a792596039fc95595603da
656+
http (5.3.1) sha256=c50802d8e9be3926cb84ac3b36d1a31fbbac383bc4cbecdce9053cb604231d7d
657+
http-cookie (1.1.0) sha256=38a5e60d1527eebc396831b8c4b9455440509881219273a6c99943d29eadbb19
658+
http-form_data (2.3.0) sha256=cc4eeb1361d9876821e31d7b1cf0b68f1cf874b201d27903480479d86448a5f3
628659
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
629660
image_processing (1.14.0) sha256=754cc169c9c262980889bec6bfd325ed1dafad34f85242b5a07b60af004742fb
630661
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
@@ -645,6 +676,7 @@ CHECKSUMS
645676
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
646677
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
647678
listen (3.9.0) sha256=db9e4424e0e5834480385197c139cb6b0ae0ef28cc13310cfd1ca78377d59c67
679+
llhttp-ffi (0.5.1) sha256=9a25a7fc19311f691a78c9c0ac0fbf4675adbd0cca74310228fdf841018fa7bc
648680
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
649681
loofah (2.25.0) sha256=df5ed7ac3bac6a4ec802df3877ee5cc86d027299f8952e6243b3dac446b060e6
650682
mail (2.9.0) sha256=6fa6673ecd71c60c2d996260f9ee3dd387d4673b8169b502134659ece6d34941
@@ -672,6 +704,8 @@ CHECKSUMS
672704
nokogiri (1.19.0-x86_64-linux-musl) sha256=1c4ca6b381622420073ce6043443af1d321e8ed93cc18b08e2666e5bd02ffae4
673705
oauth2 (2.0.18) sha256=bacf11e470dfb963f17348666d0a75c7b29ca65bc48fd47be9057cf91a403287
674706
observer (0.1.2) sha256=d8a3107131ba661138d748e7be3dbafc0d82e732fffba9fccb3d7829880950ac
707+
omniai (3.2.0) sha256=cb052d4027a91d26af4fe5b3f0a9436080b1b1821d5d2a6a43ff0f7f76053358
708+
omniai-mistral (3.0.0) sha256=e4958b27d56d1dda82c5ae824422bdee403a7f2d0c2d3b747e4c8e755d007c7b
675709
omniauth (2.1.4) sha256=42a05b0496f0d22e1dd85d42aaf602f064e36bb47a6826a27ab55e5ba608763c
676710
omniauth-auth0 (3.1.1) sha256=3d9e83377b37394db077cf27082d29ccff93158f072d92fc59f1e88798c6c2b2
677711
omniauth-oauth2 (1.9.0) sha256=ed15f6d9d20991807ce114cc5b9c1453bce3645b64e51c68c90cff5ff153fee8

README.md

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@ Runtime docker images are available.
77
## Features
88

99
- Recipe management with images and OCR text recognition
10+
- **AI-powered recipe extraction with two methods:**
11+
- **Mistral + OpenAI**: Image → Mistral OCR (markdown) → OpenAI parsing (structured recipe)
12+
- **OpenAI Direct**: Image → OpenAI (structured recipe)
1013
- AI-powered OCR text cleanup using GPT-4 Mini
14+
- Detection of multiple recipes from a single image
1115
- Tagging and search functionality
1216
- Auth0 authentication
1317
- AWS S3 file storage
@@ -38,9 +42,9 @@ AWS_ACCESS_KEY_ID=AAAAAAA
3842
AWS_SECRET_ACCESS_KEY=BBBBB
3943
```
4044

41-
### OpenAI API Key (for OCR text cleanup)
45+
### OpenAI API Key (for AI recipe extraction and OCR cleanup)
4246

43-
For the GPT-powered OCR text cleanup feature, configure your OpenAI API key:
47+
For the AI-powered recipe extraction and OCR cleanup features, configure your OpenAI API key:
4448

4549
**Development:**
4650
```bash
@@ -64,8 +68,26 @@ dokku config:set your-app-name OPENAI_CLEANUP_PROMPT_DE="Your custom German prom
6468
**Optional Configuration:**
6569
- `OPENAI_CLEANUP_PROMPT_EN`: Override the default English cleanup prompt
6670
- `OPENAI_CLEANUP_PROMPT_DE`: Override the default German cleanup prompt
67-
- `OPENAI_PROMPT_ID`: Override the default OpenAI prompt ID for OCR (default: `pmpt_69389bf4c7a481909d47bcf85f423781063a569321686620`)
68-
- `OPENAI_PROMPT_VERSION`: Override the default OpenAI prompt version for OCR (default: `8`)
71+
- `OPENAI_PROMPT_OCR_ID`: Override the default OpenAI prompt ID for direct OCR (default: `pmpt_694514e453388194a1e4c121407ef02204bec5d20e21b070`)
72+
- `OPENAI_PROMPT_OCR_VERSION`: Override the default OpenAI prompt version for direct OCR (default: `2`)
73+
- `OPENAI_MARKDOWN_PROMPT_ID`: Override the default OpenAI prompt ID for markdown parsing (default: `pmpt_696554b87ef88190bbc1156b6c5fe84f0050d5451e60ae6c`)
74+
- `OPENAI_MARKDOWN_PROMPT_VERSION`: Override the default OpenAI prompt version for markdown parsing (default: `2`)
75+
76+
### Mistral AI API Key (for two-phase OCR)
77+
78+
For the Mistral + OpenAI two-phase recipe extraction, configure your Mistral AI API key:
79+
80+
**Development:**
81+
```bash
82+
export MISTRAL_API_KEY=your_mistral_api_key_here
83+
```
84+
85+
**Production/Dokku:**
86+
```bash
87+
dokku config:set your-app-name MISTRAL_API_KEY=your_actual_mistral_api_key_here
88+
```
89+
90+
**Note:** The Mistral API key is only required if you plan to use the "Mistral + OpenAI" recognition method. The "OpenAI Direct" method only requires the OpenAI API key.
6991

7092

7193
## Start development server
@@ -80,5 +102,20 @@ yarn build --watch
80102

81103
## Start production via docker
82104

83-
Use provided `docker-compose.prod.yml` file for startup of postgres db and container for the rails app. Don't forget to set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `OPENAI_API_KEY` and db passwords. To change name of S3 bucket (_reeper_) and region (_eu-central-1_) use `S3_BUCKET_NAME` and `S3_BUCKET_REGION` environment variables.
105+
Use the provided `docker-compose.prod.yml` file to start the Postgres database and the Rails app container. Don't forget to set the following environment variables:
106+
107+
**Required:**
108+
- `AWS_ACCESS_KEY_ID`: AWS access key
109+
- `AWS_SECRET_ACCESS_KEY`: AWS secret key
110+
- `OPENAI_API_KEY`: OpenAI API key for AI recipe extraction
111+
- Database passwords
112+
113+
**Optional:**
114+
- `MISTRAL_API_KEY`: Mistral AI API key (only needed for "Mistral + OpenAI" method)
115+
- `S3_BUCKET_NAME`: Override S3 bucket name (default: _reeper_)
116+
- `S3_BUCKET_REGION`: Override S3 region (default: _eu-central-1_)
117+
- `OPENAI_PROMPT_OCR_ID`: Custom OpenAI prompt ID for direct OCR
118+
- `OPENAI_PROMPT_OCR_VERSION`: Custom OpenAI prompt version
119+
- `OPENAI_MARKDOWN_PROMPT_ID`: Custom OpenAI prompt ID for markdown parsing
120+
- `OPENAI_MARKDOWN_PROMPT_VERSION`: Custom OpenAI prompt version
84121

app/controllers/ocr_controller.rb

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,32 @@ def save_text
5959
def scan
6060
# Process only the first uploaded file
6161
file = params[:files].first
62+
ai_method = params[:ai_method] || 'openai_direct'
6263

6364
begin
64-
magic_data_json = openai_service.ocr(file.tempfile, file.content_type)
65+
# Process based on selected AI method
66+
magic_data_json = if ai_method == 'mistral_openai'
67+
# Two-phase: Mistral OCR -> OpenAI parsing
68+
markdown = mistral_service.ocr_to_markdown(file.tempfile, file.content_type)
69+
70+
if markdown.blank?
71+
raise "Mistral OCR returned empty markdown"
72+
end
73+
74+
openai_service.parse_markdown_to_recipes(markdown)
75+
else
76+
# Direct OpenAI OCR
77+
openai_service.ocr(file.tempfile, file.content_type)
78+
end
79+
80+
if magic_data_json.empty?
81+
raise "No recipes extracted from image"
82+
end
83+
84+
logger.debug "OCR extracted recipes: #{magic_data_json}"
6585

6686
# Save full OCR result array to database and store id in flash to avoid flash size limits
67-
ocrresult = OcrResult.create(result: magic_data_json.to_json)
87+
ocrresult = OcrResult.create(result: magic_data_json.to_json, ai_method: ai_method)
6888
ocrresult.image.attach(file)
6989
ocrresult.save
7090

@@ -80,7 +100,7 @@ def scan
80100
render json: { success: true, redirect_url: new_recipe_path }
81101
end
82102
rescue JSON::ParserError => e
83-
logger.error "OCR JSON parse error: #{e.message}"
103+
logger.error "OCR JSON parse error: #{e}"
84104
render json: { success: false, error: I18n.t('ocr.errors.parse_failed') }
85105
rescue => e
86106
logger.error "OCR error: #{e.message}"
@@ -154,6 +174,7 @@ def select_recipe
154174
def reparse_image
155175
@recipe = Recipe.find(params[:id])
156176
attachment_id = params[:attachment_id]
177+
ai_method = params[:ai_method] || 'openai_direct'
157178

158179
begin
159180
# Find the selected image attachment
@@ -164,17 +185,33 @@ def reparse_image
164185
image_file = blob.download
165186
content_type = blob.content_type
166187

167-
# Create a temporary file for the OpenAI service
188+
# Create a temporary file for the AI service
168189
temp_file = Tempfile.new(['recipe_image', File.extname(blob.filename.to_s)])
169190
temp_file.binmode
170191
temp_file.write(image_file)
171192
temp_file.rewind
172193

173-
# Call OpenAI service to parse the image
174-
magic_data_json = openai_service.ocr(temp_file, content_type)
194+
# Process based on selected AI method
195+
magic_data_json = if ai_method == 'mistral_openai'
196+
# Two-phase: Mistral OCR -> OpenAI parsing
197+
markdown = mistral_service.ocr_to_markdown(temp_file, content_type)
198+
199+
if markdown.blank?
200+
raise "Mistral OCR returned empty markdown"
201+
end
202+
203+
openai_service.parse_markdown_to_recipes(markdown)
204+
else
205+
# Direct OpenAI OCR
206+
openai_service.ocr(temp_file, content_type)
207+
end
208+
209+
if magic_data_json.empty?
210+
raise "No recipes extracted from image"
211+
end
175212

176213
# Save full OCR result array to database
177-
ocrresult = OcrResult.create(result: magic_data_json.to_json)
214+
ocrresult = OcrResult.create(result: magic_data_json.to_json, ai_method: ai_method)
178215
ocrresult.image.attach(blob)
179216
ocrresult.save
180217

@@ -209,4 +246,8 @@ def reparse_image
209246
# Returns the OpenaiService instance for this controller, building it on
# first use and reusing the same object afterwards.
def openai_service
  @openai_service = OpenaiService.new unless @openai_service
  @openai_service
end
249+
250+
# Returns the MistralaiService instance for this controller, building it on
# first use and reusing the same object afterwards.
def mistral_service
  return @mistral_service if @mistral_service

  @mistral_service = MistralaiService.new
end
212253
end
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Debug endpoints for inspecting the raw markdown that Mistral AI OCR
# produces for an uploaded image.
class OcrDebugController < ApplicationController
  include Secured

  # Sets the page title for the debug upload page.
  def index
    @page_title = I18n.t('ocr_debug.title')
  end

  # Runs Mistral OCR on the first uploaded file, stores the markdown in a new
  # OcrResult with the image attached, and responds with JSON.
  #
  # Success: { success: true, redirect_url: <show page for the result> }
  # Failure: { success: false, error: <message> }
  def upload
    # params[:files] is absent when the request carries no upload; guard so
    # the client gets a JSON error instead of a NoMethodError / 500.
    file = params[:files]&.first
    if file.nil?
      render json: { success: false, error: I18n.t('ocr_debug.errors.no_file', default: 'No file uploaded') }
      return
    end

    # Call Mistral AI OCR
    markdown = mistralai_service.ocr_to_markdown(file.tempfile, file.content_type)

    # Create OcrResult and attach the image
    ocr_result = OcrResult.create(result: markdown)
    ocr_result.image.attach(file)
    ocr_result.save

    # Return JSON with redirect to show page
    render json: { success: true, redirect_url: ocr_debug_path(ocr_result.id) }
  rescue => e
    # Service construction (missing API key) and the OCR call can both raise;
    # report the failure as JSON rather than an unhandled exception.
    logger.error "OCR debug error: #{e.message}"
    render json: { success: false, error: e.message }
  end

  # Renders the stored OCR result as plain text.
  def show
    @ocr_result = OcrResult.find(params[:id])
    render plain: @ocr_result.result
  end

  private

  # Memoized MistralaiService instance; built on first use.
  def mistralai_service
    @mistralai_service ||= MistralaiService.new
  end
end

app/javascript/controllers/dropzone_controller.js

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,12 @@ export default class extends Controller {
9898
formData.append('files[]', files[i])
9999
}
100100

101+
// Add AI method selection if present
102+
const aiMethodSelect = document.querySelector('select[name="ai_method"]')
103+
if (aiMethodSelect) {
104+
formData.append('ai_method', aiMethodSelect.value)
105+
}
106+
101107
// Upload to server
102108
fetch(this.urlValue, {
103109
method: 'POST',

app/javascript/controllers/reparse_controller.js

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,19 @@
11
import { Controller } from "@hotwired/stimulus"
22

33
export default class extends Controller {
4-
static targets = ["button", "buttonText", "spinner"]
4+
static targets = ["button", "buttonText", "spinner", "aiMethod", "aiMethodField"]
55

66
onSubmit(event) {
77
const form = event.target
88
const button = form.querySelector('[data-reparse-target="button"]')
99

10+
// Get selected AI method and populate hidden field
11+
const selectedMethod = document.querySelector('select[name="ai_method"]')
12+
const aiMethodField = form.querySelector('[data-reparse-target="aiMethodField"]')
13+
if (selectedMethod && aiMethodField) {
14+
aiMethodField.value = selectedMethod.value
15+
}
16+
1017
if (button) {
1118
// Find the button text and spinner within this specific button
1219
const buttonText = button.querySelector('[data-reparse-target="buttonText"]')

app/services/mistralai_service.rb

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Wrapper around the OmniAI Mistral client that converts an image into
# markdown using Mistral OCR.
class MistralaiService
  # Builds the Mistral client.
  #
  # The API key is validated here, at construction time, so callers should
  # instantiate the service lazily (only when Mistral OCR is actually needed).
  #
  # Raises RuntimeError when MISTRAL_API_KEY is missing or blank.
  def initialize
    api_key = ENV['MISTRAL_API_KEY']
    raise "Mistral API key not configured. Set MISTRAL_API_KEY environment variable" if api_key.blank?

    @client = OmniAI::Mistral::Client.new
  end

  # Runs OCR on an image and returns the recognized markdown of the first page.
  #
  # image_file   - an IO-like object responding to #read (e.g. a Tempfile),
  #                or a String file path.
  # content_type - MIME type string used to pick the image format; unknown or
  #                nil values fall back to jpeg.
  #
  # Returns the markdown String, or nil when the OCR response has no pages
  # (or the first page has no markdown).
  # Raises ArgumentError when image_file is neither IO-like nor a String.
  def ocr_to_markdown(image_file, content_type)
    # Read and encode the image file as base64
    image_data = if image_file.respond_to?(:read)
      # Handle uploaded file (Tempfile)
      image_file.rewind
      data = Base64.strict_encode64(image_file.read)
      image_file.rewind # Reset for potential subsequent reads
      data
    elsif image_file.is_a?(String)
      # Handle file path
      File.open(image_file, 'rb') { |f| Base64.strict_encode64(f.read) }
    else
      raise ArgumentError, "Invalid image_file type"
    end

    # Determine the image format from the content type
    image_format = case content_type
    when /jpeg|jpg/ then 'jpeg'
    when /png/ then 'png'
    when /webp/ then 'webp'
    when /heic|heif/ then 'heic'
    else 'jpeg' # default
    end

    Rails.logger.debug "Image format detected: #{image_format} for content type: #{content_type}"
    filedata = "data:image/#{image_format};base64,#{image_data}"

    response = @client.ocr(filedata, kind: :image)
    # Nil-safe: an empty page list must yield nil (which callers check with
    # blank?) instead of raising NoMethodError.
    recognized_markdown = Array(response.pages).first&.markdown

    Rails.logger.debug "Mistral OCR result:\n#{recognized_markdown.to_s[0..100]}..."
    recognized_markdown
  end
end

0 commit comments

Comments
 (0)