Skip to content

Commit 5ab43fd

Browse files
authored
fix: lang overflows, sst (#2314)
1 parent 5894e47 commit 5ab43fd

File tree

5 files changed

+194
-15
lines changed

5 files changed

+194
-15
lines changed

application/core/settings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ class Settings(BaseSettings):
159159
STT_PROVIDER: str = "openai" # openai or faster_whisper
160160
OPENAI_STT_MODEL: str = "gpt-4o-mini-transcribe"
161161
STT_LANGUAGE: Optional[str] = None
162-
STT_MAX_FILE_SIZE_MB: int = 25
162+
STT_MAX_FILE_SIZE_MB: int = 50
163163
STT_ENABLE_TIMESTAMPS: bool = False
164164
STT_ENABLE_DIARIZATION: bool = False
165165

docs/content/Deploying/DocsGPT-Settings.mdx

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ For an end-to-end walkthrough, see the [Speech and Audio Guide](/Guides/speech-a
111111
| `STT_PROVIDER` | Speech-to-text backend provider. | `openai`, `faster_whisper` |
112112
| `OPENAI_STT_MODEL` | OpenAI transcription model used when `STT_PROVIDER=openai`. | `gpt-4o-mini-transcribe` |
113113
| `STT_LANGUAGE` | Optional language hint passed to the provider. Leave unset for auto-detection when supported. | `en`, `es`, unset |
114-
| `STT_MAX_FILE_SIZE_MB` | Maximum file size accepted by the synchronous `/api/stt` endpoint. | `25` |
114+
| `STT_MAX_FILE_SIZE_MB` | Maximum file size accepted by the synchronous `/api/stt` endpoint. | `50` |
115115
| `STT_ENABLE_TIMESTAMPS` | Include timestamp segments in the normalized transcript response and stored parser metadata. | `true`, `false` |
116116
| `STT_ENABLE_DIARIZATION` | Reserved provider option for speaker diarization. Some providers may ignore it. | `true`, `false` |
117117

@@ -122,7 +122,7 @@ STT_PROVIDER=openai
122122
OPENAI_API_KEY=YOUR_OPENAI_API_KEY
123123
OPENAI_STT_MODEL=gpt-4o-mini-transcribe
124124
STT_LANGUAGE=
125-
STT_MAX_FILE_SIZE_MB=25
125+
STT_MAX_FILE_SIZE_MB=50
126126
STT_ENABLE_TIMESTAMPS=false
127127
STT_ENABLE_DIARIZATION=false
128128
```
@@ -224,4 +224,3 @@ These are just the basic settings to get you started. The `settings.py` file con
224224
- And many more!
225225

226226
For a complete list of available settings and their descriptions, refer to the `settings.py` file in `application/core`. Remember to restart your Docker containers after making changes to your `.env` file or `settings.py` for the changes to take effect.
227-

frontend/src/agents/NewAgent.tsx

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -711,7 +711,7 @@ export default function NewAgent({ mode }: { mode: 'new' | 'edit' | 'draft' }) {
711711
{modeConfig[effectiveMode].showSaveDraft && (
712712
<button
713713
disabled={isJsonSchemaInvalid()}
714-
className={`border-violets-are-blue text-violets-are-blue hover:bg-violets-are-blue w-28 rounded-3xl border border-solid py-2 text-sm font-medium transition-colors hover:text-white ${
714+
className={`border-violets-are-blue text-violets-are-blue hover:bg-violets-are-blue flex min-w-28 items-center justify-center rounded-3xl border border-solid px-5 py-2 text-sm font-medium whitespace-nowrap transition-colors hover:text-white ${
715715
isJsonSchemaInvalid() ? 'cursor-not-allowed opacity-30' : ''
716716
}`}
717717
onClick={handleSaveDraft}
@@ -744,7 +744,7 @@ export default function NewAgent({ mode }: { mode: 'new' | 'edit' | 'draft' }) {
744744
)}
745745
<button
746746
disabled={!isPublishable() || !hasChanges}
747-
className={`${!isPublishable() || !hasChanges ? 'cursor-not-allowed opacity-30' : ''} bg-purple-30 hover:bg-violets-are-blue flex w-28 items-center justify-center rounded-3xl py-2 text-sm font-medium text-white`}
747+
className={`${!isPublishable() || !hasChanges ? 'cursor-not-allowed opacity-30' : ''} bg-purple-30 hover:bg-violets-are-blue flex min-w-28 items-center justify-center rounded-3xl px-5 py-2 text-sm font-medium whitespace-nowrap text-white`}
748748
onClick={handlePublish}
749749
>
750750
<span className="flex items-center justify-center transition-all duration-200">
@@ -933,7 +933,7 @@ export default function NewAgent({ mode }: { mode: 'new' | 'edit' | 'draft' }) {
933933
/>
934934
</div>
935935
<button
936-
className="border-violets-are-blue text-violets-are-blue hover:bg-violets-are-blue w-20 shrink-0 basis-full rounded-3xl border-2 border-solid px-5 py-[11px] text-sm transition-colors hover:text-white sm:basis-auto"
936+
className="border-violets-are-blue text-violets-are-blue hover:bg-violets-are-blue min-w-20 shrink-0 basis-full rounded-3xl border-2 border-solid px-5 py-[11px] text-sm whitespace-nowrap transition-colors hover:text-white sm:basis-auto"
937937
onClick={() => setAddPromptModal('ACTIVE')}
938938
>
939939
{t('agents.form.buttons.add')}
@@ -1405,4 +1405,4 @@ function AddPromptModal({
14051405
handleAddPrompt={handleAddPrompt}
14061406
/>
14071407
);
1408-
}
1408+
}

frontend/src/components/MessageInput.tsx

Lines changed: 179 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import SourcesPopup from './SourcesPopup';
3434
import ToolsPopup from './ToolsPopup';
3535
import { handleAbort } from '../conversation/conversationSlice';
3636
import {
37+
AUDIO_FILE_ACCEPT_ATTR,
3738
FILE_UPLOAD_ACCEPT,
3839
FILE_UPLOAD_ACCEPT_ATTR,
3940
} from '../constants/fileUpload';
@@ -54,6 +55,24 @@ type AudioContextWindow = Window &
5455
webkitAudioContext?: typeof AudioContext;
5556
};
5657

58+
/**
 * Callback-based `getUserMedia` signature: the caller passes constraints plus
 * a success callback receiving the stream and an error callback.
 */
type LegacyGetUserMediaFn = (
  constraints: MediaStreamConstraints,
  successCallback: (stream: MediaStream) => void,
  errorCallback: (error: DOMException) => void,
) => void;

/**
 * `Navigator` extended with the optional unprefixed, `webkit`-prefixed and
 * `moz`-prefixed callback-based `getUserMedia` members. All three share the
 * same signature and any of them may be absent.
 */
type LegacyNavigator = Navigator & {
  getUserMedia?: LegacyGetUserMediaFn;
  webkitGetUserMedia?: LegacyGetUserMediaFn;
  mozGetUserMedia?: LegacyGetUserMediaFn;
};
75+
5776
type LiveAudioSnapshot = {
5877
blob: Blob;
5978
chunkIndex: number;
@@ -69,6 +88,90 @@ const getAudioContextConstructor = (): typeof AudioContext | null => {
6988
return audioWindow.AudioContext || audioWindow.webkitAudioContext || null;
7089
};
7190

91+
/**
 * Looks up a callback-based `getUserMedia` function on `navigator`.
 *
 * @returns The first member found among `getUserMedia`, `webkitGetUserMedia`
 * and `mozGetUserMedia` (in that order), or `null` when `navigator` is not
 * defined or none of them exists. The function is returned unbound.
 */
const getLegacyGetUserMedia = () => {
  if (typeof navigator !== 'undefined') {
    const nav = navigator as LegacyNavigator;
    const candidate =
      nav.getUserMedia || nav.webkitGetUserMedia || nav.mozGetUserMedia;
    return candidate || null;
  }

  return null;
};
104+
105+
/**
 * Checks whether live voice capture can be attempted in this environment.
 *
 * Checks run in order: browser globals present, secure context, some form of
 * `getUserMedia`, and an `AudioContext` constructor.
 *
 * @returns A user-facing message for the first failed check, or `null` when
 * every check passes.
 */
const getVoiceInputSupportError = (): string | null => {
  const hasBrowserGlobals =
    typeof window !== 'undefined' && typeof navigator !== 'undefined';
  if (!hasBrowserGlobals) {
    return 'Voice input is unavailable right now.';
  }

  if (!window.isSecureContext) {
    return 'Voice input requires a secure connection (HTTPS or localhost).';
  }

  // The legacy lookup only runs when the promise-based API is missing.
  const canRequestMicrophone =
    Boolean(navigator.mediaDevices?.getUserMedia) ||
    Boolean(getLegacyGetUserMedia());
  if (!canRequestMicrophone) {
    return 'Voice input is not available in this browser.';
  }

  const audioContextCtor = getAudioContextConstructor();
  return audioContextCtor
    ? null
    : 'Voice input requires Web Audio support in this browser.';
};
124+
125+
/**
 * Requests a media stream, preferring the promise-based
 * `navigator.mediaDevices.getUserMedia` and otherwise wrapping a
 * callback-based legacy variant in a promise.
 *
 * @param constraints - Media constraints forwarded to the underlying API.
 * @returns A promise for the granted stream. It rejects with the underlying
 * API's error, or with an `Error` when no `getUserMedia` variant exists.
 */
const getUserMediaStream = (
  constraints: MediaStreamConstraints,
): Promise<MediaStream> => {
  const modernDevices = navigator.mediaDevices;
  if (modernDevices?.getUserMedia) {
    return modernDevices.getUserMedia(constraints);
  }

  const legacyFn = getLegacyGetUserMedia();
  if (legacyFn) {
    return new Promise<MediaStream>((onStream, onFailure) => {
      // The legacy function is returned unbound, so invoke it with
      // `navigator` as its receiver.
      legacyFn.call(navigator, constraints, onStream, onFailure);
    });
  }

  return Promise.reject(
    new Error('Voice input is not available in this browser.'),
  );
};
143+
144+
/**
 * Maps a microphone-access failure to a user-facing message.
 *
 * The error's `name` is read structurally rather than through
 * `instanceof DOMException`: callback-based legacy `getUserMedia` variants can
 * reject with plain error objects that carry the same `name` values but are
 * not `DOMException` instances, and those should get the same specific
 * message instead of the generic fallback.
 *
 * @param error - Whatever the `getUserMedia` call rejected with.
 * @returns The insecure-context message when `window` exists and is not a
 * secure context; otherwise the message for a recognised error name;
 * otherwise the `message` of an `Error` instance; otherwise a generic
 * "denied" message.
 */
const getVoiceInputErrorMessage = (error: unknown): string => {
  if (typeof window !== 'undefined' && !window.isSecureContext) {
    return 'Voice input requires a secure connection (HTTPS or localhost).';
  }

  // `in` also sees prototype getters, which is how DOMException exposes `name`.
  const errorName =
    typeof error === 'object' &&
    error !== null &&
    'name' in error &&
    typeof (error as { name: unknown }).name === 'string'
      ? (error as { name: string }).name
      : '';

  switch (errorName) {
    case 'NotAllowedError':
    case 'PermissionDeniedError':
    case 'SecurityError':
      return 'Microphone access was blocked. Allow microphone permission and try again.';
    case 'NotFoundError':
    case 'DevicesNotFoundError':
      return 'No microphone was found on this device.';
    case 'NotReadableError':
    case 'TrackStartError':
      return 'The microphone is unavailable or already in use.';
    case 'AbortError':
      return 'Microphone access was interrupted before recording started.';
    default:
      break;
  }

  if (error instanceof Error && error.message) {
    return error.message;
  }

  return 'Microphone access was denied.';
};
174+
72175
const downsampleFloat32Buffer = (
73176
source: Float32Array,
74177
inputSampleRate: number,
@@ -197,6 +300,7 @@ export default function MessageInput({
197300
const { t } = useTranslation();
198301
const [value, setValue] = useState('');
199302
const inputRef = useRef<HTMLTextAreaElement>(null);
303+
const voiceFileInputRef = useRef<HTMLInputElement>(null);
200304
const sourceButtonRef = useRef<HTMLButtonElement>(null);
201305
const toolButtonRef = useRef<HTMLButtonElement>(null);
202306
const [isSourcesPopupOpen, setIsSourcesPopupOpen] = useState(false);
@@ -808,6 +912,48 @@ export default function MessageInput({
808912
}, 0);
809913
};
810914

915+
/**
 * Abandons live capture and falls back to picking an audio file.
 *
 * Resets the recording state to idle, shows `message` followed by a hint to
 * choose or record a file, then opens the hidden voice file input.
 *
 * @param message - Explanation of why live voice input is unavailable.
 */
const promptVoiceFileFallback = (message: string) => {
  const fallbackNotice = `${message} Choose or record an audio file instead.`;

  setRecordingState('idle');
  setVoiceError(fallbackNotice);

  // The picker is opened from a zero-delay timer, i.e. after the current
  // call stack has finished.
  setTimeout(() => {
    const picker = voiceFileInputRef.current;
    picker?.click();
  }, 0);
};
922+
923+
/**
 * Sends a user-selected audio file for transcription and applies the returned
 * text to the input.
 *
 * Clears any previous voice error, enters the `transcribing` state, snapshots
 * the current input value as the voice base value, and resets the live
 * transcript before calling the service. On success the transcript is applied
 * and the state returns to `idle`; on any failure the state becomes `error`
 * and a message is shown.
 *
 * @param file - The audio file chosen through the voice file input.
 */
const transcribeUploadedAudioFile = async (file: File) => {
  try {
    setVoiceError(null);
    setRecordingState('transcribing');
    voiceBaseValueRef.current = value;
    liveTranscriptRef.current = '';

    const response = await userService.transcribeAudio(file, token);
    // The body may not be JSON (for example an HTML error page produced by a
    // proxy for an oversized upload). Treat an unparseable body as a failed
    // payload so the user sees the transcription error below rather than a
    // raw JSON parse error.
    const data = await response.json().catch(() => null);

    if (!response.ok || !data?.success) {
      throw new Error(data?.message || 'Failed to transcribe audio.');
    }

    if (typeof data.text !== 'string' || !data.text.trim()) {
      throw new Error('No transcript was returned for this audio file.');
    }

    applyLiveTranscript(data.text);
    setRecordingState('idle');
    if (autoFocus) {
      // Focus is requested from a zero-delay timer, after the current call
      // stack has finished.
      setTimeout(() => {
        inputRef.current?.focus();
      }, 0);
    }
  } catch (error) {
    console.error('Uploaded audio transcription failed', error);
    setRecordingState('error');
    setVoiceError(
      error instanceof Error ? error.message : 'Failed to transcribe audio.',
    );
  }
};
956+
811957
const trimLivePcmBuffer = () => {
812958
const maxBufferedSamples =
813959
LIVE_CAPTURE_SAMPLE_RATE * LIVE_CAPTURE_MAX_BUFFER_SECONDS;
@@ -1024,24 +1170,29 @@ export default function MessageInput({
10241170
return;
10251171
}
10261172

1027-
if (!navigator.mediaDevices?.getUserMedia) {
1028-
setRecordingState('error');
1029-
setVoiceError('Voice input is not supported in this browser.');
1173+
const voiceInputSupportError = getVoiceInputSupportError();
1174+
if (voiceInputSupportError) {
1175+
promptVoiceFileFallback(voiceInputSupportError);
10301176
return;
10311177
}
10321178

10331179
const AudioContextConstructor = getAudioContextConstructor();
10341180
if (!AudioContextConstructor) {
10351181
setRecordingState('error');
1036-
setVoiceError('Voice input is not supported in this browser.');
1182+
setVoiceError('Voice input requires Web Audio support in this browser.');
10371183
return;
10381184
}
10391185

10401186
let stream: MediaStream | null = null;
10411187
try {
10421188
setVoiceError(null);
1043-
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
1189+
stream = await getUserMediaStream({ audio: true });
1190+
} catch (error) {
1191+
promptVoiceFileFallback(getVoiceInputErrorMessage(error));
1192+
return;
1193+
}
10441194

1195+
try {
10451196
const liveStartResponse = await userService.startLiveTranscription(token);
10461197
const liveStartData = await liveStartResponse.json();
10471198
if (!liveStartResponse.ok || !liveStartData?.success) {
@@ -1121,7 +1272,7 @@ export default function MessageInput({
11211272

11221273
setRecordingState('recording');
11231274
} catch (error) {
1124-
console.error('Microphone access failed', error);
1275+
console.error('Live voice transcription failed', error);
11251276
stream?.getTracks().forEach((track) => track.stop());
11261277
stopAudioProcessing();
11271278
await cleanupLiveSession();
@@ -1130,7 +1281,7 @@ export default function MessageInput({
11301281
setVoiceError(
11311282
error instanceof Error
11321283
? error.message
1133-
: 'Microphone access was denied.',
1284+
: 'Failed to start live transcription.',
11341285
);
11351286
}
11361287
};
@@ -1186,6 +1337,19 @@ export default function MessageInput({
11861337
}
11871338
};
11881339

1340+
/**
 * Change handler for the hidden voice file input.
 *
 * Takes the first selected file, clears the input's value, and starts
 * transcription when a file was actually chosen.
 *
 * @param e - Change event from the voice file input.
 */
const handleVoiceFileAttachment = (
  e: React.ChangeEvent<HTMLInputElement>,
) => {
  const input = e.target;
  // Read the file before the value is cleared below.
  const selectedFile = input.files?.[0];
  input.value = '';

  if (selectedFile) {
    // Fire-and-forget: the transcription routine handles its own errors.
    void transcribeUploadedAudioFile(selectedFile);
  }
};
1352+
11891353
const handlePostDocumentSelect = (_docs: Doc[] | null) => {
11901354
// SourcesPopup updates Redux selection directly; this preserves the prop contract.
11911355
void _docs;
@@ -1265,6 +1429,14 @@ export default function MessageInput({
12651429
<div {...getRootProps()} className="flex w-full flex-col">
12661430
{/* react-dropzone input (for drag/drop) */}
12671431
<input {...getInputProps()} />
1432+
<input
1433+
ref={voiceFileInputRef}
1434+
type="file"
1435+
className="hidden"
1436+
accept={AUDIO_FILE_ACCEPT_ATTR}
1437+
capture="user"
1438+
onChange={handleVoiceFileAttachment}
1439+
/>
12681440

12691441
<div className="border-dark-gray bg-lotion dark:border-grey relative flex w-full flex-col rounded-[23px] border dark:bg-transparent">
12701442
<div className="flex flex-wrap gap-1.5 px-2 py-2 sm:gap-2 sm:px-3">

frontend/src/constants/fileUpload.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,14 @@ export const FILE_UPLOAD_ACCEPT_ATTR = [
5555
'.webm',
5656
].join(',');
5757

58+
/**
 * Comma-separated list of audio file extensions, in the form used for a file
 * input's `accept` attribute.
 */
export const AUDIO_FILE_ACCEPT_ATTR = ['wav', 'mp3', 'm4a', 'ogg', 'webm']
  .map((extension) => `.${extension}`)
  .join(',');
65+
5866
export const SOURCE_FILE_TREE_ACCEPT_ATTR = [
5967
'.rst',
6068
'.md',

0 commit comments

Comments
 (0)