From 577d89a08e1e8245da8bf568db3a9388b680e3a4 Mon Sep 17 00:00:00 2001
From: damencho
Date: Fri, 7 Nov 2025 09:13:46 -0600
Subject: [PATCH] feat(transcriber): Move to Speech-to-Text v2.

---
 pom.xml                                       |   4 +-
 .../GoogleCloudTranscriptionService.java      | 204 ++++++++++++++----
 2 files changed, 159 insertions(+), 49 deletions(-)

diff --git a/pom.xml b/pom.xml
index 9c38b619..9f959df7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -63,7 +63,7 @@
         <dependency>
             <groupId>com.google.cloud</groupId>
             <artifactId>google-cloud-speech</artifactId>
-            <version>4.70.0</version>
+            <version>4.73.0</version>
         </dependency>
 
@@ -75,7 +75,7 @@
         <dependency>
            <groupId>com.google.cloud</groupId>
            <artifactId>google-cloud-translate</artifactId>
-           <version>2.75.0</version>
+           <version>2.78.0</version>
         </dependency>
 
diff --git a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
index cd61b1dc..5722c823 100644
--- a/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
+++ b/src/main/java/org/jitsi/jigasi/transcription/GoogleCloudTranscriptionService.java
@@ -20,15 +20,18 @@
 import com.fasterxml.uuid.*;
 import com.google.api.gax.rpc.*;
 import com.google.auth.oauth2.*;
-import com.google.cloud.speech.v1.*;
+import com.google.cloud.speech.v2.*;
 import com.google.protobuf.*;
 import org.jitsi.jigasi.*;
 import org.jitsi.jigasi.stats.*;
 import org.jitsi.jigasi.transcription.action.*;
 import org.jitsi.utils.logging.*;
+import org.json.simple.*;
+import org.json.simple.parser.*;
 
 import javax.media.format.*;
 import java.io.*;
+import java.nio.file.*;
 import java.time.*;
 import java.util.*;
 import java.util.concurrent.*;
@@ -184,6 +187,23 @@ public class GoogleCloudTranscriptionService
      */
     private final static String DEFAULT_VALUE_GOOGLE_MODEL = "latest_long";
 
+    /**
+     * Property name for the Google Cloud project ID
+     */
+    private final static String GOOGLE_PROJECT_ID
+        = "org.jitsi.jigasi.transcription.google_project_id";
+
+    /**
+     * Property name for the Google Cloud location
+     */
+    private final static String GOOGLE_LOCATION
+        = "org.jitsi.jigasi.transcription.google_location";
+
+    /**
+     * The default value for the property GOOGLE_LOCATION
+     */
+    private final static String DEFAULT_VALUE_GOOGLE_LOCATION = "global";
+
     /**
      * Check whether the given string contains a supported language tag
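
[A deployment can pin both new values explicitly instead of relying on the credentials file. A minimal sketch for jigasi's sip-communicator.properties; "my-gcp-project" and "us-central1" are placeholder examples, only "global" is a shipped default:

    org.jitsi.jigasi.transcription.google_project_id=my-gcp-project
    org.jitsi.jigasi.transcription.google_location=us-central1
]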
@@ -218,17 +238,26 @@ public boolean supportsLanguageRouting()
     }
 
     /**
-     * List of SpeechContexts to be inserted in
-     * the RecognitionConfig. This is a list of phrases to be used as
-     * a dictionary to assist the speech recognition.
+     * SpeechAdaptation to be inserted in the RecognitionConfig.
+     * This contains phrases to be used as a dictionary to assist the speech recognition.
      */
-    private List<SpeechContext> speechContexts = null;
+    private SpeechAdaptation speechAdaptation = null;
 
     /**
      * The model used for STT
      */
     private final String useModel;
 
+    /**
+     * The Google Cloud project ID
+     */
+    private final String projectId;
+
+    /**
+     * The Google Cloud location
+     */
+    private final String location;
+
     /**
      * Creates the RecognitionConfig the Google service uses based
      * on the TranscriptionRequest
@@ -243,13 +272,13 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
     {
         RecognitionConfig.Builder builder = RecognitionConfig.newBuilder();
 
-        // Set the sampling rate and encoding of the audio
+        // Set the sampling rate and encoding of the audio using ExplicitDecodingConfig
         AudioFormat format = request.getFormat();
-        builder.setSampleRateHertz(Double.valueOf(format.getSampleRate()).intValue());
+        ExplicitDecodingConfig.AudioEncoding encoding;
         switch(format.getEncoding())
         {
             case "LINEAR":
-                builder.setEncoding(RecognitionConfig.AudioEncoding.LINEAR16);
+                encoding = ExplicitDecodingConfig.AudioEncoding.LINEAR16;
                 break;
             default:
                 throw new IllegalArgumentException("Given AudioFormat" +
@@ -257,6 +286,12 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
                         "encoding");
         }
 
+        builder.setExplicitDecodingConfig(
+            ExplicitDecodingConfig.newBuilder()
+                .setEncoding(encoding)
+                .setSampleRateHertz(Double.valueOf(format.getSampleRate()).intValue())
+                .build());
+
         builder.setModel(useModel);
 
         if (logger.isDebugEnabled())
         {
@@ -266,16 +301,64 @@ private RecognitionConfig getRecognitionConfig(TranscriptionRequest request)
         // set the Language tag
         String languageTag = request.getLocale().toLanguageTag();
         validateLanguageTag(languageTag);
-        builder.setLanguageCode(languageTag);
+        builder.addLanguageCodes(languageTag);
 
-        addSpeechContexts(builder);
+        addSpeechAdaptation(builder);
 
-        // set the requested alternatives
-        builder.setMaxAlternatives(MAXIMUM_DESIRED_ALTERNATIVES);
+        // set the requested alternatives using RecognitionFeatures
+        builder.setFeatures(
+            RecognitionFeatures.newBuilder()
+                .setMaxAlternatives(MAXIMUM_DESIRED_ALTERNATIVES)
+                .build());
 
         return builder.build();
     }
 
+    /**
+     * Extracts the project_id from the Google Application Credentials JSON file.
+     *
+     * @return the project_id from the credentials file, or null if not found
+     */
+    private static String extractProjectIdFromCredentials()
+    {
+        String credentialsPath = System.getenv("GOOGLE_APPLICATION_CREDENTIALS");
+        if (credentialsPath == null || credentialsPath.isEmpty())
+        {
+            return null;
+        }
+
+        try
+        {
+            String jsonContent = new String(Files.readAllBytes(Paths.get(credentialsPath)));
+            JSONParser parser = new JSONParser();
+            JSONObject jsonObject = (JSONObject) parser.parse(jsonContent);
+
+            if (jsonObject.containsKey("project_id"))
+            {
+                String projectId = (String) jsonObject.get("project_id");
+                if (logger.isDebugEnabled())
+                {
+                    logger.debug("Extracted project_id from credentials file: " + projectId);
+                }
+                return projectId;
+            }
+        }
+        catch (IOException e)
+        {
+            logger.warn("Failed to read credentials file: " + credentialsPath, e);
+        }
+        catch (ParseException e)
+        {
+            logger.warn("Failed to parse credentials file: " + credentialsPath, e);
+        }
+        catch (Exception e)
+        {
+            logger.warn("Failed to extract project_id from credentials file", e);
+        }
+
+        return null;
+    }
+
     /**
      * Create a TranscriptionService which will send audio to the Google cloud
      * platform to get a transcription
@@ -284,6 +367,24 @@ public GoogleCloudTranscriptionService()
     {
         useModel = JigasiBundleActivator.getConfigurationService()
             .getString(GOOGLE_MODEL, DEFAULT_VALUE_GOOGLE_MODEL);
+
+        // First try to get project_id from config
+        String configProjectId = JigasiBundleActivator.getConfigurationService()
+            .getString(GOOGLE_PROJECT_ID);
+
+        // If not in config, try to extract from credentials file
+        if (configProjectId == null || configProjectId.isEmpty())
+        {
+            configProjectId = extractProjectIdFromCredentials();
+            if (configProjectId != null)
+            {
+                logger.info("Using project_id from credentials file: " + configProjectId);
+            }
+        }
+
+        projectId = configProjectId;
+        location = JigasiBundleActivator.getConfigurationService()
+            .getString(GOOGLE_LOCATION, DEFAULT_VALUE_GOOGLE_LOCATION);
     }
 
     /**
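
[When the project ID property is absent, the constructor falls back to extractProjectIdFromCredentials(), which reads the service-account file referenced by the GOOGLE_APPLICATION_CREDENTIALS environment variable. For reference, a trimmed sketch of such a file, with placeholder values; real files also carry private_key and related fields:

    {
      "type": "service_account",
      "project_id": "my-gcp-project",
      "client_email": "jigasi@my-gcp-project.iam.gserviceaccount.com"
    }
]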
@@ -326,12 +427,17 @@ public void sendSingleRequest(final TranscriptionRequest request,
             RecognitionConfig config = getRecognitionConfig(request);
 
             ByteString audioBytes = ByteString.copyFrom(request.getAudio());
-            RecognitionAudio audio = RecognitionAudio.newBuilder()
+
+            // Build the recognizer resource name
+            String recognizer = String.format("projects/%s/locations/%s/recognizers/_", projectId, location);
+
+            RecognizeRequest recognizeRequest = RecognizeRequest.newBuilder()
+                .setRecognizer(recognizer)
+                .setConfig(config)
                 .setContent(audioBytes)
                 .build();
 
-            RecognizeResponse recognizeResponse =
-                client.recognize(config, audio);
+            RecognizeResponse recognizeResponse = client.recognize(recognizeRequest);
 
             client.close();
 
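
[Taken together with getRecognitionConfig above, the one-shot path now reduces to the following self-contained sketch. The project ID, audio file, and 16 kHz mono LINEAR16 format are placeholder assumptions; "_" selects the default recognizer of the given project and location:

    import com.google.cloud.speech.v2.*;
    import com.google.protobuf.ByteString;
    import java.nio.file.*;

    public class RecognizeSketch
    {
        public static void main(String[] args) throws Exception
        {
            try (SpeechClient client = SpeechClient.create())
            {
                // Mirror of the config the patch builds: explicit decoding + language + model
                RecognitionConfig config = RecognitionConfig.newBuilder()
                    .setExplicitDecodingConfig(
                        ExplicitDecodingConfig.newBuilder()
                            .setEncoding(ExplicitDecodingConfig.AudioEncoding.LINEAR16)
                            .setSampleRateHertz(16000)
                            .setAudioChannelCount(1)
                            .build())
                    .addLanguageCodes("en-US")
                    .setModel("latest_long")
                    .build();

                // One-shot recognition against the default recognizer "_"
                RecognizeResponse response = client.recognize(
                    RecognizeRequest.newBuilder()
                        .setRecognizer("projects/my-gcp-project/locations/global/recognizers/_")
                        .setConfig(config)
                        .setContent(ByteString.copyFrom(Files.readAllBytes(Paths.get("audio.raw"))))
                        .build());

                for (SpeechRecognitionResult result : response.getResultsList())
                {
                    if (result.getAlternativesCount() > 0)
                    {
                        System.out.println(result.getAlternatives(0).getTranscript());
                    }
                }
            }
        }
    }
]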
@@ -382,22 +488,37 @@ public boolean supportsStreamRecognition()
     }
 
     /**
-     * Initialize speechContexts if needed, by getting all the phrases used
+     * Initialize speechAdaptation if needed, by getting all the phrases used
      * by the action handlers to detect commands to handle.
-     * Inserts all speechContexts to the RecognitionConfig.Builder.
-     * @param builder the builder where to add speech contexts.
+     * Inserts speechAdaptation to the RecognitionConfig.Builder.
+     * @param builder the builder where to add speech adaptation.
      */
-    private void addSpeechContexts(RecognitionConfig.Builder builder)
+    private void addSpeechAdaptation(RecognitionConfig.Builder builder)
     {
-        if (speechContexts == null)
+        if (speechAdaptation == null)
         {
-            speechContexts = new ArrayList<>();
-            ActionServicesHandler.getInstance().getPhrases()
-                .stream().map(ph -> speechContexts.add(
-                    SpeechContext.newBuilder().addPhrases(ph).build()));
+            List<String> phrases = ActionServicesHandler.getInstance().getPhrases();
+            if (!phrases.isEmpty())
+            {
+                PhraseSet.Builder phraseSetBuilder = PhraseSet.newBuilder();
+                for (String phrase : phrases)
+                {
+                    phraseSetBuilder.addPhrases(PhraseSet.Phrase.newBuilder().setValue(phrase).build());
+                }
+
+                speechAdaptation = SpeechAdaptation.newBuilder()
+                    .addPhraseSets(
+                        SpeechAdaptation.AdaptationPhraseSet.newBuilder()
+                            .setInlinePhraseSet(phraseSetBuilder.build())
+                            .build())
+                    .build();
+            }
         }
 
-        speechContexts.stream().map(ctx -> builder.addSpeechContexts(ctx));
+        if (speechAdaptation != null)
+        {
+            builder.setAdaptation(speechAdaptation);
+        }
     }
 
     /**
@@ -672,19 +793,23 @@ private ApiStreamObserver<StreamingRecognizeRequest> createObserver(
         {
             // Each observer gets its own responseObserver to be able to
             // get a unique ID
+            String languageCode = config.getLanguageCodesCount() > 0
+                ? config.getLanguageCodes(0) : "en-US";
             ResponseApiStreamingObserver<StreamingRecognizeResponse> responseObserver
                 = new ResponseApiStreamingObserver<StreamingRecognizeResponse>(
                     this,
-                    config.getLanguageCode(),
+                    languageCode,
                     debugName);
 
+            // Build the recognizer resource name
+            String recognizer = String.format("projects/%s/locations/%s/recognizers/_", projectId, location);
+
             // StreamingRecognitionConfig which will hold information
             // about the streaming session, including the RecognitionConfig
             StreamingRecognitionConfig streamingRecognitionConfig =
                 StreamingRecognitionConfig.newBuilder()
                     .setConfig(config)
-                    .setInterimResults(RETRIEVE_INTERIM_RESULTS)
                     .build();
 
             // StreamingCallable manages sending the audio and receiving
             // the results
@@ -700,9 +825,10 @@ private ApiStreamObserver<StreamingRecognizeRequest> createObserver(
            ApiStreamObserver<StreamingRecognizeRequest> requestObserver
                 = callable.bidiStreamingCall(responseObserver);
 
             // Sent the first request which needs to **only** contain the
-            // StreamingRecognitionConfig
+            // recognizer and StreamingRecognitionConfig
             requestObserver.onNext(
                 StreamingRecognizeRequest.newBuilder()
+                    .setRecognizer(recognizer)
                     .setStreamingConfig(streamingRecognitionConfig)
                     .build());
 
@@ -758,7 +884,7 @@ void sentRequest(TranscriptionRequest request)
 
             currentRequestObserver.onNext(
                 StreamingRecognizeRequest.newBuilder()
-                    .setAudioContent(audioBytes)
+                    .setAudio(audioBytes)
                     .build());
 
             terminatingSessionThread.interrupt();
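
[Condensed, the streaming handshake after this change works as below. A sketch only, with a placeholder recognizer name and audio chunk, assuming the ApiStreamObserver obtained from bidiStreamingCall as in the hunks above:

    import com.google.api.gax.rpc.ApiStreamObserver;
    import com.google.cloud.speech.v2.*;
    import com.google.protobuf.ByteString;

    class StreamingOrderSketch
    {
        static void openAndSend(ApiStreamObserver<StreamingRecognizeRequest> requestObserver,
                                RecognitionConfig config,
                                byte[] audioChunk)
        {
            // First request: only the recognizer and the streaming config, never audio
            requestObserver.onNext(
                StreamingRecognizeRequest.newBuilder()
                    .setRecognizer("projects/my-gcp-project/locations/global/recognizers/_")
                    .setStreamingConfig(
                        StreamingRecognitionConfig.newBuilder()
                            .setConfig(config)
                            .build())
                    .build());

            // Every later request: audio only (v2 renames setAudioContent to setAudio)
            requestObserver.onNext(
                StreamingRecognizeRequest.newBuilder()
                    .setAudio(ByteString.copyFrom(audioChunk))
                    .build());
        }
    }
]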
@@ -902,30 +1028,14 @@ public void onNext(StreamingRecognizeResponse message)
         {
             if (logger.isDebugEnabled())
                 logger.debug(debugName + ": received a StreamingRecognizeResponse");
-            if (message.hasError())
-            {
-                Statistics.incrementTotalTranscriberSendErrors();
-
-                // it is expected to get an error if the 60 seconds are exceeded
-                // without any speech in the audio OR if someone muted their mic
-                // and no new audio is coming in
-                // thus we cancel the current session and start a new one
-                // when new audio comes in
-                if (logger.isDebugEnabled())
-                    logger.debug(
-                        debugName + ": received error from StreamingRecognizeResponse: "
-                        + message.getError().getMessage());
-                requestManager.terminateCurrentSession();
-                return;
-            }
 
             if (message.getResultsCount() == 0)
             {
                 if (logger.isDebugEnabled())
                     logger.debug(
                         debugName + ": received a message with an empty results list");
-                Statistics.incrementTotalTranscriberNoResultErrors();
-                requestManager.terminateCurrentSession();
+                // In v2, empty results can indicate events like SPEECH_ACTIVITY_BEGIN
+                // These are not errors, just status updates
                 return;
             }
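
[A possible follow-up, not part of this patch: v2 reports voice-activity transitions via the response's speech event type, so the empty-results branch could log which event arrived. A sketch against the same message, logger and debugName used above:

    // Hypothetical refinement: distinguish activity events from bare keep-alives
    StreamingRecognizeResponse.SpeechEventType event = message.getSpeechEventType();
    if (event == StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_BEGIN
        || event == StreamingRecognizeResponse.SpeechEventType.SPEECH_ACTIVITY_END)
    {
        logger.debug(debugName + ": speech activity event: " + event);
    }
]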