@@ -3055,6 +3055,115 @@ def __init__(
30553055 self .cumulative_tool_call_index : int = 0
30563056 self .has_seen_tool_calls : bool = False
30573057
def _apply_stream_candidates(
    self,
    _candidates: List[Candidates],
    model_response: Any,
) -> Tuple[List[dict], List[dict], List[dict], List[dict]]:
    """Fold one stream chunk's candidates into ``model_response``.

    Delegates parsing to ``VertexGeminiConfig._process_candidates`` (which
    also advances ``self.cumulative_tool_call_index``), then patches up
    finish_reason handling for Gemini's split-chunk streaming, and attaches
    the vertex metadata lists onto ``model_response``.

    Returns the tuple
    ``(grounding_metadata, url_context_metadata, safety_ratings, citation_metadata)``.
    """
    (
        grounding_meta,
        url_context_meta,
        safety,
        citations,
        self.cumulative_tool_call_index,
    ) = VertexGeminiConfig._process_candidates(
        _candidates,
        model_response,
        self.logging_obj.optional_params,
        cumulative_tool_call_index=self.cumulative_tool_call_index,
    )

    # Gemini sends tool_calls and finishReason in separate chunks, so
    # remember across chunks whether any delta carried tool_calls — the
    # OpenAI spec requires finish_reason="tool_calls" at the end then.
    if not self.has_seen_tool_calls:
        self.has_seen_tool_calls = any(
            hasattr(c, "delta") and c.delta and c.delta.tool_calls
            for c in model_response.choices
        )

    # A final chunk may carry finishReason but no "content" key;
    # _process_candidates skips such candidates, so synthesize choices
    # here to keep the finish_reason from being lost.
    if _candidates and not model_response.choices:
        from litellm.types.utils import Delta, StreamingChoices

        for cand in _candidates:
            raw_reason = cand.get("finishReason")
            if raw_reason is None:
                continue
            mapped = (
                "tool_calls"
                if self.has_seen_tool_calls
                else VertexGeminiConfig._check_finish_reason(None, raw_reason)
            )
            model_response.choices.append(
                StreamingChoices(
                    finish_reason=mapped,
                    index=cand.get("index", 0),
                    delta=Delta(content=None, role=None),
                    logprobs=None,
                    enhancements=None,
                )
            )

    # The final chunk may instead carry EMPTY content (e.g. text:"")
    # together with finishReason="STOP"; _process_candidates then emits a
    # "stop" choice even though earlier chunks held tool_calls — override.
    if self.has_seen_tool_calls:
        for existing in model_response.choices:
            if existing.finish_reason == "stop":
                existing.finish_reason = "tool_calls"

    setattr(model_response, "vertex_ai_grounding_metadata", grounding_meta)  # type: ignore
    setattr(model_response, "vertex_ai_url_context_metadata", url_context_meta)  # type: ignore
    setattr(model_response, "vertex_ai_safety_ratings", safety)  # type: ignore
    setattr(model_response, "vertex_ai_citation_metadata", citations)  # type: ignore

    return grounding_meta, url_context_meta, safety, citations
3130+
def _apply_stream_usage_metadata(
    self,
    processed_chunk: Any,
    model_response: Any,
    grounding_metadata: List[dict],
) -> Optional[Usage]:
    """Extract usage/billing metadata from a stream chunk, if present.

    Returns the computed ``Usage`` (or ``None`` when the chunk carries no
    "usageMetadata" key). Side effects on ``model_response``: records
    web-search request counts on the usage's prompt token details, stashes
    the chunk's trafficType under hidden provider_specific_fields, and sets
    ``service_tier`` from the "x-gemini-service-tier" response header.
    """
    if "usageMetadata" not in processed_chunk:
        # Only some chunks (typically the last) carry usage data.
        return None

    usage = VertexGeminiConfig._calculate_usage(
        completion_response=processed_chunk,
    )

    searches = VertexGeminiConfig._calculate_web_search_requests(grounding_metadata)
    if searches is not None:
        # NOTE(review): assumes _calculate_usage populated
        # prompt_tokens_details — the cast does not check for None.
        details = cast(PromptTokensDetailsWrapper, usage.prompt_tokens_details)
        details.web_search_requests = searches

    traffic = processed_chunk.get("usageMetadata", {}).get("trafficType")
    if traffic:
        provider_fields = model_response._hidden_params.setdefault(
            "provider_specific_fields", {}
        )
        provider_fields["traffic_type"] = traffic

    tier = self.response_headers.get("x-gemini-service-tier")
    if tier:
        normalized = tier.lower()
        # Gemini reports "standard"; the OpenAI-compatible name is "default".
        model_response.service_tier = (
            "default" if normalized == "standard" else normalized
        )

    return usage
3166+
30583167 def chunk_parser (self , chunk : dict ) -> Optional ["ModelResponseStream" ]:
30593168 try :
30603169 verbose_logger .debug (f"RAW GEMINI CHUNK: { chunk } " )
@@ -3072,108 +3181,23 @@ def chunk_parser(self, chunk: dict) -> Optional["ModelResponseStream"]:
30723181 if blocked_response is not None :
30733182 model_response = blocked_response
30743183
3075- usage : Optional [Usage ] = None
3076- _candidates : Optional [List [Candidates ]] = processed_chunk .get ("candidates" )
30773184 grounding_metadata : List [dict ] = []
30783185 url_context_metadata : List [dict ] = []
30793186 safety_ratings : List [dict ] = []
30803187 citation_metadata : List [dict ] = []
3188+
3189+ _candidates : Optional [List [Candidates ]] = processed_chunk .get ("candidates" )
30813190 if _candidates :
30823191 (
30833192 grounding_metadata ,
30843193 url_context_metadata ,
30853194 safety_ratings ,
30863195 citation_metadata ,
3087- self .cumulative_tool_call_index ,
3088- ) = VertexGeminiConfig ._process_candidates (
3089- _candidates ,
3090- model_response ,
3091- self .logging_obj .optional_params ,
3092- cumulative_tool_call_index = self .cumulative_tool_call_index ,
3093- )
3196+ ) = self ._apply_stream_candidates (_candidates , model_response )
30943197
3095- # Track whether tool_calls have been seen across streaming chunks.
3096- # Gemini sends tool_calls and finishReason in separate chunks,
3097- # so we need to remember if earlier chunks contained tool_calls
3098- # to correctly set finish_reason="tool_calls" per the OpenAI spec.
3099- if not self .has_seen_tool_calls :
3100- for choice in model_response .choices :
3101- if (
3102- hasattr (choice , "delta" )
3103- and choice .delta
3104- and choice .delta .tool_calls
3105- ):
3106- self .has_seen_tool_calls = True
3107- break
3108-
3109- # Handle final chunk with finishReason but no content.
3110- # _process_candidates skips candidates without "content",
3111- # so the finish_reason from the final chunk is lost.
3112- if not model_response .choices and _candidates :
3113- from litellm .types .utils import Delta , StreamingChoices
3114-
3115- for candidate in _candidates :
3116- finish_reason_str = candidate .get ("finishReason" )
3117- if finish_reason_str is not None :
3118- if self .has_seen_tool_calls :
3119- mapped_finish_reason = "tool_calls"
3120- else :
3121- mapped_finish_reason = (
3122- VertexGeminiConfig ._check_finish_reason (
3123- None , finish_reason_str
3124- )
3125- )
3126- choice = StreamingChoices (
3127- finish_reason = mapped_finish_reason ,
3128- index = candidate .get ("index" , 0 ),
3129- delta = Delta (content = None , role = None ),
3130- logprobs = None ,
3131- enhancements = None ,
3132- )
3133- model_response .choices .append (choice )
3134-
3135- # Also handle the case where the final chunk has empty
3136- # content (e.g. text:"") WITH finishReason. In this case
3137- # _process_candidates DOES create a choice, but maps
3138- # finishReason="STOP" to "stop" because the current chunk
3139- # has no tool_calls. Override if we saw tool_calls earlier.
3140- if self .has_seen_tool_calls :
3141- for choice in model_response .choices :
3142- if choice .finish_reason == "stop" :
3143- choice .finish_reason = "tool_calls"
3144-
3145- setattr (model_response , "vertex_ai_grounding_metadata" , grounding_metadata ) # type: ignore
3146- setattr (model_response , "vertex_ai_url_context_metadata" , url_context_metadata ) # type: ignore
3147- setattr (model_response , "vertex_ai_safety_ratings" , safety_ratings ) # type: ignore
3148- setattr (model_response , "vertex_ai_citation_metadata" , citation_metadata ) # type: ignore
3149-
3150- if "usageMetadata" in processed_chunk :
3151- usage = VertexGeminiConfig ._calculate_usage (
3152- completion_response = processed_chunk ,
3153- )
3154-
3155- web_search_requests = VertexGeminiConfig ._calculate_web_search_requests (
3156- grounding_metadata
3157- )
3158- if web_search_requests is not None :
3159- cast (
3160- PromptTokensDetailsWrapper , usage .prompt_tokens_details
3161- ).web_search_requests = web_search_requests
3162-
3163- traffic_type = processed_chunk .get ("usageMetadata" , {}).get (
3164- "trafficType"
3165- )
3166- if traffic_type :
3167- model_response ._hidden_params .setdefault (
3168- "provider_specific_fields" , {}
3169- )["traffic_type" ] = traffic_type
3170-
3171- service_tier = self .response_headers .get ("x-gemini-service-tier" )
3172- if service_tier :
3173- if service_tier .lower () == "standard" :
3174- setattr (model_response , "service_tier" , "default" )
3175- else :
3176- setattr (model_response , "service_tier" , service_tier .lower ())
3198+ usage = self ._apply_stream_usage_metadata (
3199+ processed_chunk , model_response , grounding_metadata
3200+ )
31773201
31783202 setattr (model_response , "usage" , usage ) # type: ignore
31793203
0 commit comments