Skip to content

Commit e0d5798

Browse files
committed
Add caption and alt text generation for media
1 parent 6a50a4d commit e0d5798

File tree

3 files changed

+406
-4
lines changed

3 files changed

+406
-4
lines changed

Modules/Sources/WordPressShared/Intelligence/IntelligenceService.swift

Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import Foundation
22
import FoundationModels
3+
import Vision
4+
import CoreImage
35

46
@available(iOS 26, *)
57
public actor IntelligenceService {
@@ -17,6 +19,66 @@ public actor IntelligenceService {
1719

1820
public init() {}
1921

22+
/// Analyzes an image using the Vision framework to extract visual information.
///
/// Three Vision requests are performed in a single pass over the image: scene
/// classification, animal recognition, and text recognition. The results that pass
/// the confidence thresholds below are condensed into one summary string.
///
/// - Parameter cgImage: The image to analyze.
/// - Returns: A summary whose sections are joined with `"; "`, in the form
///   `"Scenes: …; Animals: …; Text: …"`. Sections with no results are omitted, and
///   the result is an empty string when nothing was recognized.
/// - Throws: Any error thrown by `VNImageRequestHandler.perform(_:)`.
public func analyzeImage(_ cgImage: CGImage) async throws -> String {
    let startTime = CFAbsoluteTimeGetCurrent()

    // 1. Scene classification
    let sceneRequest = VNClassifyImageRequest()

    // 2. Animal recognition
    let animalRequest = VNRecognizeAnimalsRequest()

    // 3. Text detection
    let textRequest = VNRecognizeTextRequest()
    textRequest.recognitionLevel = .fast

    // Perform all requests against the same handler.
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    try handler.perform([sceneRequest, animalRequest, textRequest])

    var analysisResults: [String] = []

    // Scenes: consider the first three classifications and keep those above 30% confidence.
    let topScenes = (sceneRequest.results ?? [])
        .prefix(3)
        .filter { $0.confidence > 0.3 }
        .map { "\($0.identifier) (\(Int($0.confidence * 100))%)" }
    if !topScenes.isEmpty {
        analysisResults.append("Scenes: \(topScenes.joined(separator: ", "))")
    }

    // Animals: keep observations above 50% confidence and report each one's first label.
    let animals = (animalRequest.results ?? [])
        .filter { $0.confidence > 0.5 }
        .compactMap { $0.labels.first?.identifier }
    if !animals.isEmpty {
        analysisResults.append("Animals: \(animals.joined(separator: ", "))")
    }

    // Text: take the best candidate from the first five observations, dropping empty strings.
    let recognizedText = (textRequest.results ?? [])
        .prefix(5)
        .compactMap { $0.topCandidates(1).first?.string }
        .filter { !$0.isEmpty }
    if !recognizedText.isEmpty {
        analysisResults.append("Text: \(recognizedText.joined(separator: ", "))")
    }

    WPLogInfo("IntelligenceService.analyzeImage executed in \((CFAbsoluteTimeGetCurrent() - startTime) * 1000) ms")

    // Joining an empty array already produces "", so no special case is needed.
    return analysisResults.joined(separator: "; ")
}
81+
2082
/// Suggests tags for a WordPress post.
2183
///
2284
/// - Parameters:
@@ -155,6 +217,195 @@ public actor IntelligenceService {
155217
let postSizeLimit = Double(IntelligenceService.contextSizeLimit) * ratio
156218
return String((extract ?? post).prefix(Int(postSizeLimit)))
157219
}
220+
221+
/// Metadata for generating alt text and captions.
///
/// Every field is optional; `hasContent` reports whether at least one of them holds a
/// non-empty string. The type is `Sendable` (all members are optional strings) so values
/// can be passed into actor-isolated methods.
public struct MediaMetadata: Sendable {
    /// The media file name.
    public let filename: String?
    /// The media title.
    public let title: String?
    /// The media caption.
    public let caption: String?
    /// The media description.
    public let description: String?
    /// The media alt text.
    public let altText: String?
    /// The file type or extension.
    public let fileType: String?
    /// The image dimensions, as preformatted text.
    public let dimensions: String?
    /// A textual summary of the image content.
    public let imageAnalysis: String?

    /// Creates metadata from any combination of fields; omitted fields default to `nil`.
    public init(
        filename: String? = nil,
        title: String? = nil,
        caption: String? = nil,
        description: String? = nil,
        altText: String? = nil,
        fileType: String? = nil,
        dimensions: String? = nil,
        imageAnalysis: String? = nil
    ) {
        self.filename = filename
        self.title = title
        self.caption = caption
        self.description = description
        self.altText = altText
        self.fileType = fileType
        self.dimensions = dimensions
        self.imageAnalysis = imageAnalysis
    }

    /// `true` when at least one field is neither `nil` nor an empty string.
    var hasContent: Bool {
        [filename, title, caption, description, altText, fileType, dimensions, imageAnalysis]
            .contains { $0?.isEmpty == false }
    }
}
248+
249+
/// Produces alt text for a media item from whatever metadata is available.
///
/// Each non-empty metadata field is sent to the language model as a `LABEL: 'value'`
/// line, with the image analysis listed first.
///
/// - Parameter metadata: The media metadata to use for generation.
/// - Returns: The model's response with surrounding whitespace and newlines trimmed.
/// - Throws: An `NSError` (domain `IntelligenceService`, code `-1`) when `metadata` has
///   no content, or any error thrown by the language model session.
public func generateAltText(metadata: MediaMetadata) async throws -> String {
    if !metadata.hasContent {
        throw NSError(domain: "IntelligenceService", code: -1, userInfo: [
            NSLocalizedDescriptionKey: "Insufficient metadata to generate alt text. Please add a filename, title, or description first."
        ])
    }

    let startTime = CFAbsoluteTimeGetCurrent()

    let systemInstructions = """
        You are helping a WordPress user generate alt text for an image.
        Alt text should be concise, descriptive, and accessible for screen readers.

        **Parameters**
        - IMAGE_ANALYSIS: Visual analysis of the actual image content (MOST IMPORTANT)
        - FILENAME: the image filename
        - FILE_TYPE: the file type/extension
        - DIMENSIONS: the image dimensions
        - TITLE: the image title (if available)
        - CAPTION: the image caption (if available)
        - DESCRIPTION: the image description (if available)

        **Requirements**
        - Generate concise alt text (1-2 sentences, max 125 characters)
        - Prioritize IMAGE_ANALYSIS when describing what's in the image
        - Focus on what the image depicts, not decorative elements
        - Use simple, clear language
        - Do not include phrases like "image of" or "picture of"
        - Only output the alt text, nothing else
        """

    let session = LanguageModelSession(
        model: .init(guardrails: .permissiveContentTransformations),
        instructions: systemInstructions
    )

    // Label/value pairs in the order they appear in the prompt; nil or empty values are skipped.
    let labeledFields: [(label: String, value: String?)] = [
        ("IMAGE_ANALYSIS", metadata.imageAnalysis),
        ("FILENAME", metadata.filename),
        ("FILE_TYPE", metadata.fileType),
        ("DIMENSIONS", metadata.dimensions),
        ("TITLE", metadata.title),
        ("CAPTION", metadata.caption),
        ("DESCRIPTION", metadata.description)
    ]
    let contextLines = labeledFields.compactMap { field -> String? in
        guard let value = field.value, !value.isEmpty else { return nil }
        return "\(field.label): '\(value)'"
    }

    let prompt = """
        Generate alt text for an image with the following information:

        \(contextLines.joined(separator: "\n"))
        """

    WPLogInfo("IntelligenceService.generateAltText prompt:\n\(prompt)")

    let response = try await session.respond(
        to: prompt,
        options: GenerationOptions(temperature: 0.7)
    )

    let elapsedMilliseconds = (CFAbsoluteTimeGetCurrent() - startTime) * 1000
    WPLogInfo("IntelligenceService.generateAltText executed in \(elapsedMilliseconds) ms")

    return response.content.trimmingCharacters(in: .whitespacesAndNewlines)
}
329+
330+
/// Produces a caption for a media item from whatever metadata is available.
///
/// Each non-empty metadata field is sent to the language model as a `LABEL: 'value'`
/// line, with the image analysis listed first.
///
/// - Parameter metadata: The media metadata to use for generation.
/// - Returns: The model's response with surrounding whitespace and newlines trimmed.
/// - Throws: An `NSError` (domain `IntelligenceService`, code `-1`) when `metadata` has
///   no content, or any error thrown by the language model session.
public func generateCaption(metadata: MediaMetadata) async throws -> String {
    if !metadata.hasContent {
        throw NSError(domain: "IntelligenceService", code: -1, userInfo: [
            NSLocalizedDescriptionKey: "Insufficient metadata to generate caption. Please add a filename, title, or description first."
        ])
    }

    let startTime = CFAbsoluteTimeGetCurrent()

    let systemInstructions = """
        You are helping a WordPress user generate a caption for an image.
        Captions should be engaging, informative, and complement the image.

        **Parameters**
        - IMAGE_ANALYSIS: Visual analysis of the actual image content (MOST IMPORTANT)
        - FILENAME: the image filename
        - FILE_TYPE: the file type/extension
        - DIMENSIONS: the image dimensions
        - TITLE: the image title (if available)
        - ALT_TEXT: the image alt text (if available)
        - DESCRIPTION: the image description (if available)

        **Requirements**
        - Generate an engaging caption (1-2 sentences)
        - Prioritize IMAGE_ANALYSIS to understand what's actually in the image
        - Can be more creative and conversational than alt text
        - May include context, emotion, or storytelling elements
        - Only output the caption, nothing else
        """

    let session = LanguageModelSession(
        model: .init(guardrails: .permissiveContentTransformations),
        instructions: systemInstructions
    )

    // Label/value pairs in the order they appear in the prompt; nil or empty values are skipped.
    let labeledFields: [(label: String, value: String?)] = [
        ("IMAGE_ANALYSIS", metadata.imageAnalysis),
        ("FILENAME", metadata.filename),
        ("FILE_TYPE", metadata.fileType),
        ("DIMENSIONS", metadata.dimensions),
        ("TITLE", metadata.title),
        ("ALT_TEXT", metadata.altText),
        ("DESCRIPTION", metadata.description)
    ]
    let contextLines = labeledFields.compactMap { field -> String? in
        guard let value = field.value, !value.isEmpty else { return nil }
        return "\(field.label): '\(value)'"
    }

    let prompt = """
        Generate a caption for an image with the following information:

        \(contextLines.joined(separator: "\n"))
        """

    WPLogInfo("IntelligenceService.generateCaption prompt:\n\(prompt)")

    let response = try await session.respond(
        to: prompt,
        options: GenerationOptions(temperature: 0.8)
    )

    let elapsedMilliseconds = (CFAbsoluteTimeGetCurrent() - startTime) * 1000
    WPLogInfo("IntelligenceService.generateCaption executed in \(elapsedMilliseconds) ms")

    return response.content.trimmingCharacters(in: .whitespacesAndNewlines)
}
158409
}
159410

160411
private extension Array where Element: Hashable {

WordPress/Classes/ViewRelated/Media/MediaItemViewController.swift

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,24 @@ final class MediaItemViewController: UITableViewController {
2222

2323
private let headerView = MediaItemHeaderView()
2424
private lazy var headerMaxHeightConstraint = headerView.heightAnchor.constraint(lessThanOrEqualToConstant: 320)
25+
/// Type-erased backing storage for `textGenerationController`.
///
/// NOTE(review): presumably typed as `AnyObject` because stored properties cannot carry
/// `@available` and `MediaTextGenerationController` is only usable on iOS 26+ — confirm.
private var _textGenerationController: AnyObject?

/// The controller that feeds generated alt text and captions back into this screen.
///
/// Created on first access and cached in `_textGenerationController`. When it reports
/// generated text, the matching `mediaMetadata` field is updated and the view model is
/// reloaded.
@available(iOS 26, *)
private var textGenerationController: MediaTextGenerationController {
    // A conditional cast avoids the force cast and the trap it would cause on a type mismatch.
    if let existing = _textGenerationController as? MediaTextGenerationController {
        return existing
    }

    let controller = MediaTextGenerationController(media: media) { [weak self] type, generatedText in
        guard let self else { return }
        switch type {
        case .altText:
            self.mediaMetadata.alt = generatedText
        case .caption:
            self.mediaMetadata.caption = generatedText
        }
        self.reloadViewModel()
    }
    _textGenerationController = controller
    return controller
}
2543

2644
init(media: Media) {
2745
self.media = media
@@ -327,11 +345,14 @@ final class MediaItemViewController: UITableViewController {
327345
/// Builds the row action that opens the caption editor.
///
/// The returned closure pushes a text-editing controller for the tapped row, writes each
/// edited value to `mediaMetadata.caption`, and reloads the view model. On iOS 26 and
/// later the pushed controller is also passed to `textGenerationController`, configured
/// for `.caption`.
private func editCaption() -> ((ImmuTableRow) -> ()) {
    return { [weak self] row in
        // Cast before unwrapping `self` so an unexpected row type still traps as before.
        let editableRow = row as! EditableTextRow
        guard let self else { return }

        let settingsController = self.pushSettingsController(
            for: editableRow,
            hint: Strings.Hints.imageCaption,
            onValueChanged: { [weak self] value in
                self?.mediaMetadata.caption = value
                self?.reloadViewModel()
            }
        )

        if #available(iOS 26, *) {
            self.textGenerationController.configure(settingsController, for: .caption)
        }
    }
}
337358

@@ -349,15 +370,19 @@ final class MediaItemViewController: UITableViewController {
349370
/// Builds the row action that opens the alt text editor.
///
/// The returned closure pushes a text-editing controller for the tapped row, writes each
/// edited value to `mediaMetadata.alt`, and reloads the view model. On iOS 26 and later
/// the pushed controller is also passed to `textGenerationController`, configured for
/// `.altText`.
private func editAlt() -> ((ImmuTableRow) -> ()) {
    return { [weak self] row in
        // Cast before unwrapping `self` so an unexpected row type still traps as before.
        let editableRow = row as! EditableTextRow
        guard let self else { return }

        let settingsController = self.pushSettingsController(
            for: editableRow,
            hint: Strings.Hints.imageAlt,
            onValueChanged: { [weak self] value in
                self?.mediaMetadata.alt = value
                self?.reloadViewModel()
            }
        )

        if #available(iOS 26, *) {
            self.textGenerationController.configure(settingsController, for: .altText)
        }
    }
}
359383

360-
private func pushSettingsController(for row: EditableTextRow, hint: String? = nil, onValueChanged: @escaping SettingsTextChanged) {
384+
@discardableResult
385+
private func pushSettingsController(for row: EditableTextRow, hint: String? = nil, onValueChanged: @escaping SettingsTextChanged) -> SettingsTextViewController {
361386
let title = row.title
362387
let value = row.value
363388
let controller = SettingsTextViewController(text: value, placeholder: "\(title)...", hint: hint)
@@ -366,6 +391,7 @@ final class MediaItemViewController: UITableViewController {
366391
controller.onValueChanged = onValueChanged
367392

368393
navigationController?.pushViewController(controller, animated: true)
394+
return controller
369395
}
370396

371397
// MARK: - Sharing Logic
@@ -417,7 +443,7 @@ extension MediaItemViewController {
417443
/// Provides some extra formatting for a Media asset's metadata, used
418444
/// to present it in the MediaItemViewController
419445
///
420-
private struct MediaMetadataPresenter {
446+
struct MediaMetadataPresenter {
421447
let media: Media
422448

423449
/// A String containing the pixel size of the asset (width X height)

0 commit comments

Comments
 (0)