diff --git a/newrelic/hooks/external_botocore.py b/newrelic/hooks/external_botocore.py
index 216d38c9b..ac392d966 100644
--- a/newrelic/hooks/external_botocore.py
+++ b/newrelic/hooks/external_botocore.py
@@ -194,6 +194,7 @@ def create_chat_completion_message_event(
     request_model,
     request_id,
     llm_metadata_dict,
+    all_token_counts,
     response_id=None,
     request_timestamp=None,
 ):
@@ -227,6 +228,8 @@ def create_chat_completion_message_event(
             "vendor": "bedrock",
             "ingest_source": "Python",
         }
+        if all_token_counts:
+            chat_completion_message_dict["token_count"] = 0
         if settings.ai_monitoring.record_content.enabled:
             chat_completion_message_dict["content"] = content
@@ -268,6 +271,8 @@ def create_chat_completion_message_event(
             "ingest_source": "Python",
             "is_response": True,
         }
+        if all_token_counts:
+            chat_completion_message_dict["token_count"] = 0
         if settings.ai_monitoring.record_content.enabled:
             chat_completion_message_dict["content"] = content
@@ -277,24 +282,21 @@ def create_chat_completion_message_event(
         transaction.record_custom_event("LlmChatCompletionMessage", chat_completion_message_dict)
 
 
-def extract_bedrock_titan_text_model_request(request_body, bedrock_attrs):
+def extract_bedrock_titan_embedding_model_request(request_body, bedrock_attrs):
     request_body = json.loads(request_body)
-    request_config = request_body.get("textGenerationConfig", {})
-    input_message_list = [{"role": "user", "content": request_body.get("inputText")}]
-
-    bedrock_attrs["input_message_list"] = input_message_list
-    bedrock_attrs["request.max_tokens"] = request_config.get("maxTokenCount")
-    bedrock_attrs["request.temperature"] = request_config.get("temperature")
+    bedrock_attrs["input"] = request_body.get("inputText")
 
     return bedrock_attrs
 
 
-def extract_bedrock_mistral_text_model_request(request_body, bedrock_attrs):
-    request_body = json.loads(request_body)
-    bedrock_attrs["input_message_list"] = [{"role": "user", "content": request_body.get("prompt")}]
-    bedrock_attrs["request.max_tokens"] = request_body.get("max_tokens")
-    bedrock_attrs["request.temperature"] = request_body.get("temperature")
+def extract_bedrock_titan_embedding_model_response(response_body, bedrock_attrs):
+    if response_body:
+        response_body = json.loads(response_body)
+
+        input_tokens = response_body.get("inputTextTokenCount", 0)
+        bedrock_attrs["response.usage.total_tokens"] = input_tokens
+
     return bedrock_attrs
@@ -302,16 +304,31 @@ def extract_bedrock_titan_text_model_response(response_body, bedrock_attrs):
     if response_body:
         response_body = json.loads(response_body)
 
+        input_tokens = response_body.get("inputTextTokenCount", 0)
+        completion_tokens = sum(result.get("tokenCount", 0) for result in response_body.get("results", []))
+        total_tokens = input_tokens + completion_tokens
+
         output_message_list = [
-            {"role": "assistant", "content": result["outputText"]} for result in response_body.get("results", [])
+            {"role": "assistant", "content": result.get("outputText")} for result in response_body.get("results", [])
         ]
 
         bedrock_attrs["response.choices.finish_reason"] = response_body["results"][0]["completionReason"]
+        bedrock_attrs["response.usage.completion_tokens"] = completion_tokens
+        bedrock_attrs["response.usage.prompt_tokens"] = input_tokens
+        bedrock_attrs["response.usage.total_tokens"] = total_tokens
         bedrock_attrs["output_message_list"] = output_message_list
 
     return bedrock_attrs
 
 
+def extract_bedrock_mistral_text_model_request(request_body, bedrock_attrs):
+    request_body = json.loads(request_body)
+    bedrock_attrs["input_message_list"] = [{"role": "user", "content": request_body.get("prompt")}]
+    bedrock_attrs["request.max_tokens"] = request_body.get("max_tokens")
+    bedrock_attrs["request.temperature"] = request_body.get("temperature")
+    return bedrock_attrs
+
+
 def extract_bedrock_mistral_text_model_response(response_body, bedrock_attrs):
     if response_body:
         response_body = json.loads(response_body)
@@ -324,17 +341,6 @@ def extract_bedrock_mistral_text_model_response(response_body, bedrock_attrs):
     return bedrock_attrs
 
 
-def extract_bedrock_titan_text_model_streaming_response(response_body, bedrock_attrs):
-    if response_body:
-        if "outputText" in response_body:
-            bedrock_attrs["output_message_list"] = messages = bedrock_attrs.get("output_message_list", [])
-            messages.append({"role": "assistant", "content": response_body["outputText"]})
-
-        bedrock_attrs["response.choices.finish_reason"] = response_body.get("completionReason", None)
-
-    return bedrock_attrs
-
-
 def extract_bedrock_mistral_text_model_streaming_response(response_body, bedrock_attrs):
     if response_body:
         outputs = response_body.get("outputs")
@@ -343,14 +349,46 @@ def extract_bedrock_mistral_text_model_streaming_response(response_body, bedrock
             "output_message_list", [{"role": "assistant", "content": ""}]
         )
         bedrock_attrs["output_message_list"][0]["content"] += outputs[0].get("text", "")
-        bedrock_attrs["response.choices.finish_reason"] = outputs[0].get("stop_reason", None)
+        bedrock_attrs["response.choices.finish_reason"] = outputs[0].get("stop_reason")
 
     return bedrock_attrs
 
 
-def extract_bedrock_titan_embedding_model_request(request_body, bedrock_attrs):
+def extract_bedrock_titan_text_model_request(request_body, bedrock_attrs):
     request_body = json.loads(request_body)
+    request_config = request_body.get("textGenerationConfig", {})
+
+    input_message_list = [{"role": "user", "content": request_body.get("inputText")}]
+
+    bedrock_attrs["input_message_list"] = input_message_list
+    bedrock_attrs["request.max_tokens"] = request_config.get("maxTokenCount")
+    bedrock_attrs["request.temperature"] = request_config.get("temperature")
+
+    return bedrock_attrs
 
-    bedrock_attrs["input"] = request_body.get("inputText")
 
+def extract_bedrock_titan_text_model_streaming_response(response_body, bedrock_attrs):
+    if response_body:
+        if "outputText" in response_body:
+            bedrock_attrs["output_message_list"] = messages = bedrock_attrs.get("output_message_list", [])
+            messages.append({"role": "assistant", "content": response_body["outputText"]})
+
+        bedrock_attrs["response.choices.finish_reason"] = response_body.get("completionReason")
+
+        # Extract token information
+        invocation_metrics = response_body.get("amazon-bedrock-invocationMetrics", {})
+        prompt_tokens = invocation_metrics.get("inputTokenCount", 0)
+        completion_tokens = invocation_metrics.get("outputTokenCount", 0)
+        total_tokens = prompt_tokens + completion_tokens
+
+        bedrock_attrs["response.usage.completion_tokens"] = (
+            bedrock_attrs.get("response.usage.completion_tokens", 0) + completion_tokens
+        )
+        bedrock_attrs["response.usage.prompt_tokens"] = (
+            bedrock_attrs.get("response.usage.prompt_tokens", 0) + prompt_tokens
+        )
+        bedrock_attrs["response.usage.total_tokens"] = (
+            bedrock_attrs.get("response.usage.total_tokens", 0) + total_tokens
+        )
 
     return bedrock_attrs
@@ -421,6 +459,21 @@ def extract_bedrock_claude_model_response(response_body, bedrock_attrs):
         bedrock_attrs["response.choices.finish_reason"] = response_body.get("stop_reason")
         bedrock_attrs["output_message_list"] = output_message_list
 
+        # Only set response_id if it exists and is not None
+        response_id = response_body.get("id")
+        if response_id:
+            bedrock_attrs["response_id"] = str(response_id)
+
+        # Extract token information
+        token_usage = response_body.get("usage", {})
+        if token_usage:
+            prompt_tokens = token_usage.get("input_tokens", 0)
+            completion_tokens = token_usage.get("output_tokens", 0)
+            total_tokens = prompt_tokens + completion_tokens
+            bedrock_attrs["response.usage.prompt_tokens"] = prompt_tokens
+            bedrock_attrs["response.usage.completion_tokens"] = completion_tokens
+            bedrock_attrs["response.usage.total_tokens"] = total_tokens
+
     return bedrock_attrs
@@ -432,6 +485,22 @@ def extract_bedrock_claude_model_streaming_response(response_body, bedrock_attrs
         bedrock_attrs["output_message_list"][0]["content"] += content
         bedrock_attrs["response.choices.finish_reason"] = response_body.get("stop_reason")
 
+        # Extract token information
+        invocation_metrics = response_body.get("amazon-bedrock-invocationMetrics", {})
+        prompt_tokens = invocation_metrics.get("inputTokenCount", 0)
+        completion_tokens = invocation_metrics.get("outputTokenCount", 0)
+        total_tokens = prompt_tokens + completion_tokens
+
+        bedrock_attrs["response.usage.completion_tokens"] = (
+            bedrock_attrs.get("response.usage.completion_tokens", 0) + completion_tokens
+        )
+        bedrock_attrs["response.usage.prompt_tokens"] = (
+            bedrock_attrs.get("response.usage.prompt_tokens", 0) + prompt_tokens
+        )
+        bedrock_attrs["response.usage.total_tokens"] = (
+            bedrock_attrs.get("response.usage.total_tokens", 0) + total_tokens
+        )
+
     return bedrock_attrs
@@ -452,6 +521,13 @@ def extract_bedrock_llama_model_response(response_body, bedrock_attrs):
         response_body = json.loads(response_body)
 
         output_message_list = [{"role": "assistant", "content": response_body.get("generation")}]
+        prompt_tokens = response_body.get("prompt_token_count", 0)
+        completion_tokens = response_body.get("generation_token_count", 0)
+        total_tokens = prompt_tokens + completion_tokens
+
+        bedrock_attrs["response.usage.completion_tokens"] = completion_tokens
+        bedrock_attrs["response.usage.prompt_tokens"] = prompt_tokens
+        bedrock_attrs["response.usage.total_tokens"] = total_tokens
 
         bedrock_attrs["response.choices.finish_reason"] = response_body.get("stop_reason")
         bedrock_attrs["output_message_list"] = output_message_list
@@ -465,6 +541,22 @@ def extract_bedrock_llama_model_streaming_response(response_body, bedrock_attrs)
             bedrock_attrs["output_message_list"] = [{"role": "assistant", "content": ""}]
         bedrock_attrs["output_message_list"][0]["content"] += content
         bedrock_attrs["response.choices.finish_reason"] = response_body.get("stop_reason")
+
+        # Extract token information
+        invocation_metrics = response_body.get("amazon-bedrock-invocationMetrics", {})
+        prompt_tokens = invocation_metrics.get("inputTokenCount", 0)
+        completion_tokens = invocation_metrics.get("outputTokenCount", 0)
+        total_tokens = prompt_tokens + completion_tokens
+
+        bedrock_attrs["response.usage.completion_tokens"] = (
+            bedrock_attrs.get("response.usage.completion_tokens", 0) + completion_tokens
+        )
+        bedrock_attrs["response.usage.prompt_tokens"] = (
+            bedrock_attrs.get("response.usage.prompt_tokens", 0) + prompt_tokens
+        )
+        bedrock_attrs["response.usage.total_tokens"] = (
+            bedrock_attrs.get("response.usage.total_tokens", 0) + total_tokens
+        )
 
     return bedrock_attrs
@@ -505,12 +597,33 @@ def extract_bedrock_cohere_model_streaming_response(response_body, bedrock_attrs
         bedrock_attrs["response.choices.finish_reason"] = response_body["generations"][0]["finish_reason"]
         bedrock_attrs["response_id"] = str(response_body.get("id"))
 
+        # Extract token information
+        invocation_metrics = response_body.get("amazon-bedrock-invocationMetrics", {})
+        prompt_tokens = invocation_metrics.get("inputTokenCount", 0)
+        completion_tokens = invocation_metrics.get("outputTokenCount", 0)
+        total_tokens = prompt_tokens + completion_tokens
+
+        bedrock_attrs["response.usage.completion_tokens"] = (
+            bedrock_attrs.get("response.usage.completion_tokens", 0) + completion_tokens
+        )
+        bedrock_attrs["response.usage.prompt_tokens"] = (
+            bedrock_attrs.get("response.usage.prompt_tokens", 0) + prompt_tokens
+        )
+        bedrock_attrs["response.usage.total_tokens"] = (
+            bedrock_attrs.get("response.usage.total_tokens", 0) + total_tokens
+        )
+
     return bedrock_attrs
 
 
 NULL_EXTRACTOR = lambda *args: {}  # noqa: E731  # Empty extractor that returns nothing
 MODEL_EXTRACTORS = [  # Order is important here, avoiding dictionaries
-    ("amazon.titan-embed", extract_bedrock_titan_embedding_model_request, NULL_EXTRACTOR, NULL_EXTRACTOR),
+    (
+        "amazon.titan-embed",
+        extract_bedrock_titan_embedding_model_request,
+        extract_bedrock_titan_embedding_model_response,
+        NULL_EXTRACTOR,
+    ),
     ("cohere.embed", extract_bedrock_cohere_embedding_model_request, NULL_EXTRACTOR, NULL_EXTRACTOR),
     (
         "amazon.titan",
@@ -574,8 +687,8 @@ def handle_bedrock_exception(
             input_message_list = []
 
         bedrock_attrs["input_message_list"] = input_message_list
-        bedrock_attrs["request.max_tokens"] = kwargs.get("inferenceConfig", {}).get("maxTokens", None)
-        bedrock_attrs["request.temperature"] = kwargs.get("inferenceConfig", {}).get("temperature", None)
+        bedrock_attrs["request.max_tokens"] = kwargs.get("inferenceConfig", {}).get("maxTokens")
+        bedrock_attrs["request.temperature"] = kwargs.get("inferenceConfig", {}).get("temperature")
 
     try:
         request_extractor(request_body, bedrock_attrs)
@@ -843,6 +956,7 @@ def _wrap_bedrock_runtime_converse(wrapped, instance, args, kwargs):
     try:
         # For aioboto3 clients, this will call make_api_call instrumentation in external_aiobotocore
        response = wrapped(*args, **kwargs)
+
     except Exception as exc:
         handle_bedrock_exception(
             exc,
@@ -934,15 +1048,22 @@ def extract_bedrock_converse_attrs(kwargs, response, response_headers, model, sp
                 exc_info=True,
             )
 
+    response_prompt_tokens = response.get("usage", {}).get("inputTokens") if response else None
+    response_completion_tokens = response.get("usage", {}).get("outputTokens") if response else None
+    response_total_tokens = response.get("usage", {}).get("totalTokens") if response else None
+
     bedrock_attrs = {
         "request_id": response_headers.get("x-amzn-requestid"),
         "model": model,
         "span_id": span_id,
         "trace_id": trace_id,
         "response.choices.finish_reason": response.get("stopReason"),
-        "request.max_tokens": kwargs.get("inferenceConfig", {}).get("maxTokens", None),
-        "request.temperature": kwargs.get("inferenceConfig", {}).get("temperature", None),
+        "request.max_tokens": kwargs.get("inferenceConfig", {}).get("maxTokens"),
+        "request.temperature": kwargs.get("inferenceConfig", {}).get("temperature"),
         "input_message_list": input_message_list,
+        "response.usage.prompt_tokens": response_prompt_tokens,
+        "response.usage.completion_tokens": response_completion_tokens,
+        "response.usage.total_tokens": response_total_tokens,
     }
 
     if output_message_list is not None:
@@ -1142,29 +1263,34 @@ def handle_embedding_event(transaction, bedrock_attrs):
     custom_attrs_dict = transaction._custom_params
     llm_metadata_dict = {key: value for key, value in custom_attrs_dict.items() if key.startswith("llm.")}
 
-    span_id = bedrock_attrs.get("span_id", None)
-    trace_id = bedrock_attrs.get("trace_id", None)
-    request_id = bedrock_attrs.get("request_id", None)
-    model = bedrock_attrs.get("model", None)
+    span_id = bedrock_attrs.get("span_id")
+    trace_id = bedrock_attrs.get("trace_id")
+    request_id = bedrock_attrs.get("request_id")
+    model = bedrock_attrs.get("model")
     input_ = bedrock_attrs.get("input")
 
+    response_total_tokens = bedrock_attrs.get("response.usage.total_tokens")
+
+    total_tokens = (
+        settings.ai_monitoring.llm_token_count_callback(model, input_)
+        if settings.ai_monitoring.llm_token_count_callback and input_
+        else response_total_tokens
+    )
+
     embedding_dict = {
         "vendor": "bedrock",
         "ingest_source": "Python",
         "id": embedding_id,
         "span_id": span_id,
         "trace_id": trace_id,
-        "token_count": (
-            settings.ai_monitoring.llm_token_count_callback(model, input_)
-            if settings.ai_monitoring.llm_token_count_callback
-            else None
-        ),
         "request_id": request_id,
-        "duration": bedrock_attrs.get("duration", None),
+        "duration": bedrock_attrs.get("duration"),
         "request.model": model,
         "response.model": model,
-        "error": bedrock_attrs.get("error", None),
+        "response.usage.total_tokens": total_tokens,
+        "error": bedrock_attrs.get("error"),
     }
+
     embedding_dict.update(llm_metadata_dict)
 
     if settings.ai_monitoring.record_content.enabled:
@@ -1175,6 +1301,7 @@
 def handle_chat_completion_event(transaction, bedrock_attrs, request_timestamp=None):
+    settings = transaction.settings or global_settings()
     chat_completion_id = str(uuid.uuid4())
     # Grab LLM-related custom attributes off of the transaction to store as metadata on LLM events
     custom_attrs_dict = transaction._custom_params
@@ -1183,11 +1310,15 @@ def handle_chat_completion_event(transaction, bedrock_attrs, request_timestamp=N
     llm_context_attrs = getattr(transaction, "_llm_context_attrs", None)
     if llm_context_attrs:
         llm_metadata_dict.update(llm_context_attrs)
-    span_id = bedrock_attrs.get("span_id", None)
-    trace_id = bedrock_attrs.get("trace_id", None)
-    request_id = bedrock_attrs.get("request_id", None)
-    response_id = bedrock_attrs.get("response_id", None)
-    model = bedrock_attrs.get("model", None)
+    span_id = bedrock_attrs.get("span_id")
+    trace_id = bedrock_attrs.get("trace_id")
+    request_id = bedrock_attrs.get("request_id")
+    response_id = bedrock_attrs.get("response_id")
+    model = bedrock_attrs.get("model")
+
+    response_prompt_tokens = bedrock_attrs.get("response.usage.prompt_tokens")
+    response_completion_tokens = bedrock_attrs.get("response.usage.completion_tokens")
+    response_total_tokens = bedrock_attrs.get("response.usage.total_tokens")
 
     input_message_list = bedrock_attrs.get("input_message_list", [])
     output_message_list = bedrock_attrs.get("output_message_list", [])
@@ -1202,6 +1333,25 @@ def handle_chat_completion_event(transaction, bedrock_attrs, request_timestamp=N
         len(input_message_list) + len(output_message_list)
     ) or None  # If 0, attribute will be set to None and removed
 
+    input_message_content = " ".join([msg.get("content") for msg in input_message_list if msg.get("content")])
+    prompt_tokens = (
+        settings.ai_monitoring.llm_token_count_callback(model, input_message_content)
+        if settings.ai_monitoring.llm_token_count_callback and input_message_content
+        else response_prompt_tokens
+    )
+
+    output_message_content = " ".join([msg.get("content") for msg in output_message_list if msg.get("content")])
+    completion_tokens = (
+        settings.ai_monitoring.llm_token_count_callback(model, output_message_content)
+        if settings.ai_monitoring.llm_token_count_callback and output_message_content
+        else response_completion_tokens
+    )
+    total_tokens = (
+        prompt_tokens + completion_tokens if all([prompt_tokens, completion_tokens]) else response_total_tokens
+    )
+
+    all_token_counts = bool(prompt_tokens and completion_tokens and total_tokens)
+
     chat_completion_summary_dict = {
         "vendor": "bedrock",
         "ingest_source": "Python",
@@ -1210,9 +1360,9 @@ def handle_chat_completion_event(transaction, bedrock_attrs, request_timestamp=N
         "trace_id": trace_id,
         "request_id": request_id,
         "response_id": response_id,
-        "duration": bedrock_attrs.get("duration", None),
-        "request.max_tokens": bedrock_attrs.get("request.max_tokens", None),
-        "request.temperature": bedrock_attrs.get("request.temperature", None),
+        "duration": bedrock_attrs.get("duration"),
+        "request.max_tokens": bedrock_attrs.get("request.max_tokens"),
+        "request.temperature": bedrock_attrs.get("request.temperature"),
         "request.model": model,
         "response.model": model,  # Duplicate data required by the UI
         "response.number_of_messages": number_of_messages,
@@ -1221,6 +1371,12 @@ def handle_chat_completion_event(transaction, bedrock_attrs, request_timestamp=N
         "timestamp": request_timestamp or None,
         "time_to_first_token": bedrock_attrs.get("time_to_first_token", None),
     }
+
+    if all_token_counts:
+        chat_completion_summary_dict["response.usage.prompt_tokens"] = prompt_tokens
+        chat_completion_summary_dict["response.usage.completion_tokens"] = completion_tokens
+        chat_completion_summary_dict["response.usage.total_tokens"] = total_tokens
+
     chat_completion_summary_dict.update(llm_metadata_dict)
     chat_completion_summary_dict = {k: v for k, v in chat_completion_summary_dict.items() if v is not None}
     transaction.record_custom_event("LlmChatCompletionSummary", chat_completion_summary_dict)
@@ -1235,6 +1391,7 @@ def handle_chat_completion_event(transaction, bedrock_attrs, request_timestamp=N
         request_model=model,
         request_id=request_id,
         llm_metadata_dict=llm_metadata_dict,
+        all_token_counts=all_token_counts,
         response_id=response_id,
         request_timestamp=request_timestamp,
     )
diff --git a/newrelic/hooks/mlmodel_gemini.py b/newrelic/hooks/mlmodel_gemini.py
index f9de68798..49811ed2a 100644
--- a/newrelic/hooks/mlmodel_gemini.py
+++ b/newrelic/hooks/mlmodel_gemini.py
@@ -176,20 +176,24 @@ def _record_embedding_success(transaction, embedding_id, linking_metadata, kwarg
         embedding_content = str(embedding_content)
         request_model = kwargs.get("model")
 
+        embedding_token_count = (
+            settings.ai_monitoring.llm_token_count_callback(request_model, embedding_content)
+            if settings.ai_monitoring.llm_token_count_callback
+            else None
+        )
+
         full_embedding_response_dict = {
             "id": embedding_id,
             "span_id": span_id,
             "trace_id": trace_id,
-            "token_count": (
-                settings.ai_monitoring.llm_token_count_callback(request_model, embedding_content)
-                if settings.ai_monitoring.llm_token_count_callback
-                else None
-            ),
             "request.model": request_model,
             "duration": ft.duration * 1000,
             "vendor": "gemini",
             "ingest_source": "Python",
         }
+        if embedding_token_count:
+            full_embedding_response_dict["response.usage.total_tokens"] = embedding_token_count
+
         if settings.ai_monitoring.record_content.enabled:
             full_embedding_response_dict["input"] = embedding_content
@@ -303,15 +307,13 @@ def _record_generation_error(transaction, linking_metadata, completion_id, kwarg
                 "Unable to parse input message to Gemini LLM. Message content and role will be omitted from "
                 "corresponding LlmChatCompletionMessage event. "
             )
+    # Extract the input message content and role from the input message if it exists
+    input_message_content, input_role = _parse_input_message(input_message) if input_message else (None, None)
 
-    generation_config = kwargs.get("config")
-    if generation_config:
-        request_temperature = getattr(generation_config, "temperature", None)
-        request_max_tokens = getattr(generation_config, "max_output_tokens", None)
-    else:
-        request_temperature = None
-        request_max_tokens = None
+    # Extract data from generation config object
+    request_temperature, request_max_tokens = _extract_generation_config(kwargs)
 
+    # Prepare error attributes
     notice_error_attributes = {
         "http.statusCode": getattr(exc, "code", None),
         "error.message": getattr(exc, "message", None),
@@ -352,15 +354,17 @@ def _record_generation_error(transaction, linking_metadata, completion_id, kwarg
 
         create_chat_completion_message_event(
             transaction,
-            input_message,
+            input_message_content,
+            input_role,
             completion_id,
             span_id,
             trace_id,
             # Passing the request model as the response model here since we do not have access to a response model
             request_model,
-            request_model,
             llm_metadata,
             output_message_list,
+            # We do not record token counts in error cases, so set all_token_counts to True so the pipeline tokenizer does not run
+            True,
             request_timestamp,
         )
     except Exception:
@@ -388,6 +392,7 @@ def _handle_generation_success(
 def _record_generation_success(
     transaction, linking_metadata, completion_id, kwargs, ft, response, request_timestamp=None
 ):
+    settings = transaction.settings or global_settings()
     span_id = linking_metadata.get("span.id")
     trace_id = linking_metadata.get("trace.id")
     try:
@@ -396,12 +401,14 @@ def _record_generation_success(
             # finish_reason is an enum, so grab just the stringified value from it to report
             finish_reason = response.get("candidates")[0].get("finish_reason").value
             output_message_list = [response.get("candidates")[0].get("content")]
+            token_usage = response.get("usage_metadata") or {}
         else:
             # Set all values to NoneTypes since we cannot access them through kwargs or another method that doesn't
             # require the response object
             response_model = None
             output_message_list = []
             finish_reason = None
+            token_usage = {}
 
         request_model = kwargs.get("model")
@@ -423,13 +430,44 @@ def _record_generation_success(
                 "corresponding LlmChatCompletionMessage event. "
             )
 
-        generation_config = kwargs.get("config")
-        if generation_config:
-            request_temperature = getattr(generation_config, "temperature", None)
-            request_max_tokens = getattr(generation_config, "max_output_tokens", None)
+        input_message_content, input_role = _parse_input_message(input_message) if input_message else (None, None)
+
+        # Parse output message content
+        # This list should have a length of 1 to represent the output message
+        # Parse the message text out to pass to any registered token counting callback
+        output_message_content = output_message_list[0].get("parts")[0].get("text") if output_message_list else None
+
+        # Extract token counts from response object
+        if token_usage:
+            response_prompt_tokens = token_usage.get("prompt_token_count")
+            response_completion_tokens = token_usage.get("candidates_token_count")
+            response_total_tokens = token_usage.get("total_token_count")
+        else:
-            request_temperature = None
-            request_max_tokens = None
+            response_prompt_tokens = None
+            response_completion_tokens = None
+            response_total_tokens = None
+
+        # Calculate token counts by checking if a callback is registered and if we have the necessary content to pass
+        # to it. If not, then we use the token counts provided in the response object
+        prompt_tokens = (
+            settings.ai_monitoring.llm_token_count_callback(request_model, input_message_content)
+            if settings.ai_monitoring.llm_token_count_callback and input_message_content
+            else response_prompt_tokens
+        )
+        completion_tokens = (
+            settings.ai_monitoring.llm_token_count_callback(response_model, output_message_content)
+            if settings.ai_monitoring.llm_token_count_callback and output_message_content
+            else response_completion_tokens
+        )
+        total_tokens = (
+            prompt_tokens + completion_tokens if all([prompt_tokens, completion_tokens]) else response_total_tokens
+        )
+
+        all_token_counts = bool(prompt_tokens and completion_tokens and total_tokens)
+
+        # Extract generation config
+        request_temperature, request_max_tokens = _extract_generation_config(kwargs)
 
         full_chat_completion_summary_dict = {
             "id": completion_id,
@@ -450,68 +488,80 @@ def _record_generation_success(
             "timestamp": request_timestamp,
         }
 
+        if all_token_counts:
+            full_chat_completion_summary_dict["response.usage.prompt_tokens"] = prompt_tokens
+            full_chat_completion_summary_dict["response.usage.completion_tokens"] = completion_tokens
+            full_chat_completion_summary_dict["response.usage.total_tokens"] = total_tokens
+
         llm_metadata = _get_llm_attributes(transaction)
         full_chat_completion_summary_dict.update(llm_metadata)
         transaction.record_custom_event("LlmChatCompletionSummary", full_chat_completion_summary_dict)
 
         create_chat_completion_message_event(
             transaction,
-            input_message,
+            input_message_content,
+            input_role,
             completion_id,
             span_id,
             trace_id,
             response_model,
-            request_model,
             llm_metadata,
             output_message_list,
+            all_token_counts,
             request_timestamp,
         )
     except Exception:
         _logger.warning(RECORD_EVENTS_FAILURE_LOG_MESSAGE, exc_info=True)
 
 
+def _parse_input_message(input_message):
+    # The input_message will be a string if generate_content was called directly. In this case, we don't have
+    # access to the role, so we default to user since this was an input message
+    if isinstance(input_message, str):
+        return input_message, "user"
+    # The input_message will be a Google Content type if send_message was called, so we parse out the message
+    # text and role (which should be "user")
+    elif isinstance(input_message, google.genai.types.Content):
+        return input_message.parts[0].text, input_message.role
+    else:
+        return None, None
+
+
+def _extract_generation_config(kwargs):
+    generation_config = kwargs.get("config")
+    if generation_config:
+        request_temperature = getattr(generation_config, "temperature", None)
+        request_max_tokens = getattr(generation_config, "max_output_tokens", None)
+    else:
+        request_temperature = None
+        request_max_tokens = None
+
+    return request_temperature, request_max_tokens
+
+
 def create_chat_completion_message_event(
     transaction,
-    input_message,
+    input_message_content,
+    input_role,
     chat_completion_id,
     span_id,
     trace_id,
     response_model,
-    request_model,
     llm_metadata,
     output_message_list,
+    all_token_counts,
     request_timestamp=None,
 ):
     try:
         settings = transaction.settings or global_settings()
 
-        if input_message:
-            # The input_message will be a string if generate_content was called directly. In this case, we don't have
-            # access to the role, so we default to user since this was an input message
-            if isinstance(input_message, str):
-                input_message_content = input_message
-                input_role = "user"
-            # The input_message will be a Google Content type if send_message was called, so we parse out the message
-            # text and role (which should be "user")
-            elif isinstance(input_message, google.genai.types.Content):
-                input_message_content = input_message.parts[0].text
-                input_role = input_message.role
-            # Set input data to NoneTypes to ensure token_count callback is not called
-            else:
-                input_message_content = None
-                input_role = None
-
+        if input_message_content:
             message_id = str(uuid.uuid4())
 
             chat_completion_input_message_dict = {
                 "id": message_id,
                 "span_id": span_id,
                 "trace_id": trace_id,
-                "token_count": (
-                    settings.ai_monitoring.llm_token_count_callback(request_model, input_message_content)
-                    if settings.ai_monitoring.llm_token_count_callback and input_message_content
-                    else None
-                ),
                 "role": input_role,
                 "completion_id": chat_completion_id,
                 # The input message will always be the first message in our request/ response sequence so this will
@@ -521,6 +571,8 @@ def create_chat_completion_message_event(
                 "vendor": "gemini",
                 "ingest_source": "Python",
             }
+            if all_token_counts:
+                chat_completion_input_message_dict["token_count"] = 0
 
             if settings.ai_monitoring.record_content.enabled:
                 chat_completion_input_message_dict["content"] = input_message_content
@@ -539,7 +591,7 @@ def create_chat_completion_message_event(
 
             # Add one to the index to account for the single input message so our sequence value is accurate for
             # the output message
-            if input_message:
+            if input_message_content:
                 index += 1
 
             message_id = str(uuid.uuid4())
@@ -548,11 +600,6 @@ def create_chat_completion_message_event(
                 "id": message_id,
                 "span_id": span_id,
                 "trace_id": trace_id,
-                "token_count": (
-                    settings.ai_monitoring.llm_token_count_callback(response_model, message_content)
-                    if settings.ai_monitoring.llm_token_count_callback
-                    else None
-                ),
                 "role": message.get("role"),
                 "completion_id": chat_completion_id,
                 "sequence": index,
@@ -562,6 +609,9 @@ def create_chat_completion_message_event(
                 "is_response": True,
             }
 
+            if all_token_counts:
+                chat_completion_output_message_dict["token_count"] = 0
+
             if settings.ai_monitoring.record_content.enabled:
                 chat_completion_output_message_dict["content"] = message_content
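
The Gemini changes above settle on the same precedence rule as the Bedrock hook: a registered `llm_token_count_callback` wins whenever there is message content to hand it, and the provider-reported `usage_metadata` counts are only a fallback. A toy illustration of that rule, with an invented callback:

```python
# Toy illustration of the token-count precedence used above. `callback`
# stands in for settings.ai_monitoring.llm_token_count_callback.
def resolve_token_count(callback, model, content, response_count):
    return callback(model, content) if callback and content else response_count

word_count_callback = lambda model, content: len(content.split())

# Callback registered and content present: the callback result wins.
assert resolve_token_count(word_count_callback, "gemini-2.0-flash", "one two three", 99) == 3
# No callback registered: fall back to the count reported in the response.
assert resolve_token_count(None, "gemini-2.0-flash", "one two three", 99) == 99
# Content missing (e.g. content recording disabled upstream): fall back as well.
assert resolve_token_count(word_count_callback, "gemini-2.0-flash", "", 99) == 99
```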
chat_completion_output_message_dict["token_count"] = 0 + if settings.ai_monitoring.record_content.enabled: chat_completion_output_message_dict["content"] = message_content diff --git a/newrelic/hooks/mlmodel_openai.py b/newrelic/hooks/mlmodel_openai.py index ec1d77c8a..db48d36e3 100644 --- a/newrelic/hooks/mlmodel_openai.py +++ b/newrelic/hooks/mlmodel_openai.py @@ -133,11 +133,11 @@ def create_chat_completion_message_event( span_id, trace_id, response_model, - request_model, response_id, request_id, llm_metadata, output_message_list, + all_token_counts, request_timestamp=None, ): settings = transaction.settings if transaction.settings is not None else global_settings() @@ -158,11 +158,6 @@ def create_chat_completion_message_event( "request_id": request_id, "span_id": span_id, "trace_id": trace_id, - "token_count": ( - settings.ai_monitoring.llm_token_count_callback(request_model, message_content) - if settings.ai_monitoring.llm_token_count_callback - else None - ), "role": message.get("role"), "completion_id": chat_completion_id, "sequence": index, @@ -170,8 +165,13 @@ def create_chat_completion_message_event( "vendor": "openai", "ingest_source": "Python", } + if settings.ai_monitoring.record_content.enabled and message_content: chat_completion_input_message_dict["content"] = message_content + + if all_token_counts: + chat_completion_input_message_dict["token_count"] = 0 + if request_timestamp: chat_completion_input_message_dict["timestamp"] = request_timestamp @@ -199,11 +199,6 @@ def create_chat_completion_message_event( "request_id": request_id, "span_id": span_id, "trace_id": trace_id, - "token_count": ( - settings.ai_monitoring.llm_token_count_callback(response_model, message_content) - if settings.ai_monitoring.llm_token_count_callback - else None - ), "role": message.get("role"), "completion_id": chat_completion_id, "sequence": index, @@ -216,6 +211,9 @@ def create_chat_completion_message_event( if settings.ai_monitoring.record_content.enabled and message_content: chat_completion_output_message_dict["content"] = message_content + if all_token_counts: + chat_completion_output_message_dict["token_count"] = 0 + chat_completion_output_message_dict.update(llm_metadata) transaction.record_custom_event("LlmChatCompletionMessage", chat_completion_output_message_dict) @@ -286,15 +284,18 @@ def _record_embedding_success(transaction, embedding_id, linking_metadata, kwarg else getattr(attribute_response, "organization", None) ) + response_total_tokens = attribute_response.get("usage", {}).get("total_tokens") if response else None + + total_tokens = ( + settings.ai_monitoring.llm_token_count_callback(response_model, input_) + if settings.ai_monitoring.llm_token_count_callback and input_ + else response_total_tokens + ) + full_embedding_response_dict = { "id": embedding_id, "span_id": span_id, "trace_id": trace_id, - "token_count": ( - settings.ai_monitoring.llm_token_count_callback(response_model, input_) - if settings.ai_monitoring.llm_token_count_callback - else None - ), "request.model": kwargs.get("model") or kwargs.get("engine"), "request_id": request_id, "duration": ft.duration * 1000, @@ -319,6 +320,7 @@ def _record_embedding_success(transaction, embedding_id, linking_metadata, kwarg "response.headers.ratelimitRemainingRequests": check_rate_limit_header( response_headers, "x-ratelimit-remaining-requests", True ), + "response.usage.total_tokens": total_tokens, "vendor": "openai", "ingest_source": "Python", } @@ -489,6 +491,7 @@ def _handle_completion_success( def 
_record_completion_success( transaction, linking_metadata, completion_id, kwargs, ft, response_headers, response, request_timestamp=None ): + settings = transaction.settings if transaction.settings is not None else global_settings() span_id = linking_metadata.get("span.id") trace_id = linking_metadata.get("trace.id") @@ -496,6 +499,7 @@ def _record_completion_success( if response: response_model = response.get("model") response_id = response.get("id") + token_usage = response.get("usage") or {} output_message_list = [] finish_reason = None choices = response.get("choices") or [] @@ -509,6 +513,7 @@ def _record_completion_success( else: response_model = kwargs.get("response.model") response_id = kwargs.get("id") + token_usage = {} output_message_list = [] finish_reason = kwargs.get("finish_reason") if "content" in kwargs: @@ -520,10 +525,44 @@ def _record_completion_success( output_message_list = [] request_model = kwargs.get("model") or kwargs.get("engine") - request_id = response_headers.get("x-request-id") - organization = response_headers.get("openai-organization") or getattr(response, "organization", None) messages = kwargs.get("messages") or [{"content": kwargs.get("prompt"), "role": "user"}] input_message_list = list(messages) + + # Extract token counts from response object + if token_usage: + response_prompt_tokens = token_usage.get("prompt_tokens") + response_completion_tokens = token_usage.get("completion_tokens") + response_total_tokens = token_usage.get("total_tokens") + + else: + response_prompt_tokens = None + response_completion_tokens = None + response_total_tokens = None + + # Calculate token counts by checking if a callback is registered and if we have the necessary content to pass + # to it. If not, then we use the token counts provided in the response object + input_message_content = " ".join([msg.get("content", "") for msg in input_message_list if msg.get("content")]) + prompt_tokens = ( + settings.ai_monitoring.llm_token_count_callback(request_model, input_message_content) + if settings.ai_monitoring.llm_token_count_callback and input_message_content + else response_prompt_tokens + ) + output_message_content = " ".join([msg.get("content", "") for msg in output_message_list if msg.get("content")]) + completion_tokens = ( + settings.ai_monitoring.llm_token_count_callback(response_model, output_message_content) + if settings.ai_monitoring.llm_token_count_callback and output_message_content + else response_completion_tokens + ) + + total_tokens = ( + prompt_tokens + completion_tokens if all([prompt_tokens, completion_tokens]) else response_total_tokens + ) + + all_token_counts = bool(prompt_tokens and completion_tokens and total_tokens) + + request_id = response_headers.get("x-request-id") + organization = response_headers.get("openai-organization") or getattr(response, "organization", None) + full_chat_completion_summary_dict = { "id": completion_id, "span_id": span_id, @@ -570,6 +609,12 @@ def _record_completion_success( "response.number_of_messages": len(input_message_list) + len(output_message_list), "timestamp": request_timestamp, } + + if all_token_counts: + full_chat_completion_summary_dict["response.usage.prompt_tokens"] = prompt_tokens + full_chat_completion_summary_dict["response.usage.completion_tokens"] = completion_tokens + full_chat_completion_summary_dict["response.usage.total_tokens"] = total_tokens + llm_metadata = _get_llm_attributes(transaction) if "time_to_first_token" in kwargs: @@ -585,11 +630,11 @@ def _record_completion_success( span_id, trace_id, 
response_model, - request_model, response_id, request_id, llm_metadata, output_message_list, + all_token_counts, request_timestamp, ) except Exception: @@ -601,6 +646,7 @@ def _record_completion_error(transaction, linking_metadata, completion_id, kwarg trace_id = linking_metadata.get("trace.id") request_message_list = kwargs.get("messages", None) or [] notice_error_attributes = {} + try: if OPENAI_V1: response = getattr(exc, "response", None) @@ -666,6 +712,7 @@ def _record_completion_error(transaction, linking_metadata, completion_id, kwarg output_message_list = [] if "content" in kwargs: output_message_list = [{"content": kwargs.get("content"), "role": kwargs.get("role")}] + create_chat_completion_message_event( transaction, request_message_list, @@ -673,11 +720,12 @@ def _record_completion_error(transaction, linking_metadata, completion_id, kwarg span_id, trace_id, kwargs.get("response.model"), - request_model, response_id, request_id, llm_metadata, output_message_list, + # We do not record token counts in error cases, so set all_token_counts to True so the pipeline tokenizer does not run + True, request_timestamp, ) except Exception: diff --git a/tests/external_aiobotocore/test_bedrock_chat_completion_converse.py b/tests/external_aiobotocore/test_bedrock_chat_completion_converse.py index 55843b832..8d21e01ca 100644 --- a/tests/external_aiobotocore/test_bedrock_chat_completion_converse.py +++ b/tests/external_aiobotocore/test_bedrock_chat_completion_converse.py @@ -23,7 +23,8 @@ ) from testing_support.fixtures import override_llm_token_callback_settings, reset_core_stats_engine, validate_attributes from testing_support.ml_testing_utils import ( - add_token_count_to_events, + add_token_count_streaming_events, + add_token_counts_to_chat_events, disabled_ai_monitoring_record_content_settings, disabled_ai_monitoring_settings, events_sans_content, @@ -146,8 +147,14 @@ def _test(): @reset_core_stats_engine() @override_llm_token_callback_settings(llm_token_count_callback) -def test_bedrock_chat_completion_with_token_count(set_trace_info, exercise_model, expected_metric, expected_events): - @validate_custom_events(add_token_count_to_events(expected_events)) +def test_bedrock_chat_completion_with_token_count( + set_trace_info, exercise_model, expected_metric, expected_events, response_streaming +): + expected_events = add_token_counts_to_chat_events(expected_events) + if response_streaming: + expected_events = add_token_count_streaming_events(expected_events) + + @validate_custom_events(expected_events) # One summary event, one user message, and one response message from the assistant @validate_custom_event_count(count=4) @validate_transaction_metrics( @@ -278,49 +285,6 @@ def _test(): _test() -@reset_core_stats_engine() -@override_llm_token_callback_settings(llm_token_count_callback) -def test_bedrock_chat_completion_error_incorrect_access_key_with_token_count( - exercise_converse_incorrect_access_key, set_trace_info, expected_metric -): - """ - A request is made to the server with invalid credentials. botocore will reach out to the server and receive an - UnrecognizedClientException as a response. Information from the request will be parsed and reported in customer - events. The error response can also be parsed, and will be included as attributes on the recorded exception. 
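
All three hooks gate the new usage attributes behind the same `all_token_counts` flag: the summary event only carries `response.usage.*` when prompt, completion, and total counts are all truthy, and the per-message events then get `token_count: 0` so downstream token counting is skipped. A condensed sketch of the gate follows; note that a legitimate count of 0 also fails the `bool(...)` check and suppresses the attributes:

```python
# Condensed sketch of the all_token_counts gate used by the three hooks above.
def build_summary(prompt_tokens, completion_tokens, total_tokens):
    summary = {"vendor": "openai", "ingest_source": "Python"}
    all_token_counts = bool(prompt_tokens and completion_tokens and total_tokens)
    if all_token_counts:
        summary["response.usage.prompt_tokens"] = prompt_tokens
        summary["response.usage.completion_tokens"] = completion_tokens
        summary["response.usage.total_tokens"] = total_tokens
    return all_token_counts, summary

assert build_summary(21, 31, 52)[0] is True     # all counts known: attributes recorded
assert build_summary(None, 31, 52)[0] is False  # any missing count: attributes omitted
assert build_summary(0, 31, 31)[0] is False     # a zero count is treated as missing
```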
- """ - - @validate_custom_events(add_token_count_to_events(chat_completion_invalid_access_key_error_events)) - @validate_error_trace_attributes( - _client_error_name, - exact_attrs={ - "agent": {}, - "intrinsic": {}, - "user": { - "http.statusCode": 403, - "error.message": "The security token included in the request is invalid.", - "error.code": "UnrecognizedClientException", - }, - }, - ) - @validate_transaction_metrics( - name="test_bedrock_chat_completion_incorrect_access_key_with_token_count", - scoped_metrics=[expected_metric], - rollup_metrics=[expected_metric], - custom_metrics=[(f"Supportability/Python/ML/Bedrock/{BOTOCORE_VERSION}", 1)], - background_task=True, - ) - @background_task(name="test_bedrock_chat_completion_incorrect_access_key_with_token_count") - def _test(): - set_trace_info() - add_custom_attribute("llm.conversation_id", "my-awesome-id") - add_custom_attribute("llm.foo", "bar") - add_custom_attribute("non_llm_attr", "python-agent") - - exercise_converse_incorrect_access_key() - - _test() - - @pytest.fixture def exercise_converse_invalid_model(loop, bedrock_converse_server, response_streaming, monkeypatch): def _exercise_converse_invalid_model(): diff --git a/tests/external_aiobotocore/test_bedrock_chat_completion_invoke_model.py b/tests/external_aiobotocore/test_bedrock_chat_completion_invoke_model.py index 207db7e31..40c21c35e 100644 --- a/tests/external_aiobotocore/test_bedrock_chat_completion_invoke_model.py +++ b/tests/external_aiobotocore/test_bedrock_chat_completion_invoke_model.py @@ -34,7 +34,8 @@ ) from testing_support.fixtures import override_llm_token_callback_settings, reset_core_stats_engine, validate_attributes from testing_support.ml_testing_utils import ( - add_token_count_to_events, + add_token_count_streaming_events, + add_token_counts_to_chat_events, disabled_ai_monitoring_record_content_settings, disabled_ai_monitoring_settings, disabled_ai_monitoring_streaming_settings, @@ -207,7 +208,7 @@ def _test(): @reset_core_stats_engine() @override_llm_token_callback_settings(llm_token_count_callback) def test_bedrock_chat_completion_with_token_count(set_trace_info, exercise_model, expected_events, expected_metrics): - @validate_custom_events(add_token_count_to_events(expected_events)) + @validate_custom_events(add_token_counts_to_chat_events(add_token_count_streaming_events(expected_events))) # One summary event, one user message, and one response message from the assistant @validate_custom_event_count(count=3) @validate_transaction_metrics( @@ -456,51 +457,6 @@ def _test(): _test() -@reset_core_stats_engine() -@override_llm_token_callback_settings(llm_token_count_callback) -def test_bedrock_chat_completion_error_incorrect_access_key_with_token( - monkeypatch, - bedrock_server, - exercise_model, - set_trace_info, - expected_invalid_access_key_error_events, - expected_metrics, -): - @validate_custom_events(add_token_count_to_events(expected_invalid_access_key_error_events)) - @validate_error_trace_attributes( - _client_error_name, - exact_attrs={ - "agent": {}, - "intrinsic": {}, - "user": { - "http.statusCode": 403, - "error.message": "The security token included in the request is invalid.", - "error.code": "UnrecognizedClientException", - }, - }, - ) - @validate_transaction_metrics( - name="test_bedrock_chat_completion", - scoped_metrics=expected_metrics, - rollup_metrics=expected_metrics, - custom_metrics=[(f"Supportability/Python/ML/Bedrock/{BOTOCORE_VERSION}", 1)], - background_task=True, - ) - @background_task(name="test_bedrock_chat_completion") - 
def _test(): - monkeypatch.setattr(bedrock_server._request_signer._credentials, "access_key", "INVALID-ACCESS-KEY") - - with pytest.raises(_client_error): # not sure where this exception actually comes from - set_trace_info() - add_custom_attribute("llm.conversation_id", "my-awesome-id") - add_custom_attribute("llm.foo", "bar") - add_custom_attribute("non_llm_attr", "python-agent") - - exercise_model(prompt="Invalid Token", temperature=0.7, max_tokens=100) - - _test() - - def invoke_model_malformed_request_body(loop, bedrock_server, response_streaming): async def _coro(): with pytest.raises(_client_error): @@ -799,58 +755,6 @@ async def _test(): loop.run_until_complete(_test()) -@reset_core_stats_engine() -@override_llm_token_callback_settings(llm_token_count_callback) -@validate_custom_events(add_token_count_to_events(chat_completion_expected_streaming_error_events)) -@validate_custom_event_count(count=2) -@validate_error_trace_attributes( - _event_stream_error_name, - exact_attrs={ - "agent": {}, - "intrinsic": {}, - "user": { - "error.message": "Malformed input request, please reformat your input and try again.", - "error.code": "ValidationException", - }, - }, - forgone_params={"agent": (), "intrinsic": (), "user": ("http.statusCode")}, -) -@validate_transaction_metrics( - name="test_bedrock_chat_completion", - scoped_metrics=[("Llm/completion/Bedrock/invoke_model_with_response_stream", 1)], - rollup_metrics=[("Llm/completion/Bedrock/invoke_model_with_response_stream", 1)], - custom_metrics=[(f"Supportability/Python/ML/Bedrock/{BOTOCORE_VERSION}", 1)], - background_task=True, -) -@background_task(name="test_bedrock_chat_completion") -def test_bedrock_chat_completion_error_streaming_exception_with_token_count(loop, bedrock_server, set_trace_info): - """ - Duplicate of test_bedrock_chat_completion_error_streaming_exception, but with token callback being set. - - See the original test for a description of the error case. 
- """ - - async def _test(): - with pytest.raises(_event_stream_error): - model = "amazon.titan-text-express-v1" - body = (chat_completion_payload_templates[model] % ("Streaming Exception", 0.7, 100)).encode("utf-8") - - set_trace_info() - add_custom_attribute("llm.conversation_id", "my-awesome-id") - add_custom_attribute("llm.foo", "bar") - add_custom_attribute("non_llm_attr", "python-agent") - - response = await bedrock_server.invoke_model_with_response_stream( - body=body, modelId=model, accept="application/json", contentType="application/json" - ) - - body = response.get("body") - async for resp in body: - assert resp - - loop.run_until_complete(_test()) - - def test_bedrock_chat_completion_functions_marked_as_wrapped_for_sdk_compatibility(bedrock_server): assert bedrock_server._nr_wrapped diff --git a/tests/external_aiobotocore/test_bedrock_embeddings.py b/tests/external_aiobotocore/test_bedrock_embeddings.py index b96412229..1f9359934 100644 --- a/tests/external_aiobotocore/test_bedrock_embeddings.py +++ b/tests/external_aiobotocore/test_bedrock_embeddings.py @@ -28,7 +28,7 @@ ) from testing_support.fixtures import override_llm_token_callback_settings, reset_core_stats_engine, validate_attributes from testing_support.ml_testing_utils import ( - add_token_count_to_events, + add_token_count_to_embedding_events, disabled_ai_monitoring_record_content_settings, disabled_ai_monitoring_settings, events_sans_content, @@ -165,7 +165,7 @@ def _test(): @reset_core_stats_engine() @override_llm_token_callback_settings(llm_token_count_callback) def test_bedrock_embedding_with_token_count(set_trace_info, exercise_model, expected_events): - @validate_custom_events(add_token_count_to_events(expected_events)) + @validate_custom_events(add_token_count_to_embedding_events(expected_events)) @validate_custom_event_count(count=1) @validate_transaction_metrics( name="test_bedrock_embedding", @@ -290,45 +290,6 @@ def _test(): _test() -@reset_core_stats_engine() -@override_llm_token_callback_settings(llm_token_count_callback) -def test_bedrock_embedding_error_incorrect_access_key_with_token_count( - monkeypatch, bedrock_server, exercise_model, set_trace_info, expected_invalid_access_key_error_events -): - @validate_custom_events(add_token_count_to_events(expected_invalid_access_key_error_events)) - @validate_error_trace_attributes( - _client_error_name, - exact_attrs={ - "agent": {}, - "intrinsic": {}, - "user": { - "http.statusCode": 403, - "error.message": "The security token included in the request is invalid.", - "error.code": "UnrecognizedClientException", - }, - }, - ) - @validate_transaction_metrics( - name="test_bedrock_embedding", - scoped_metrics=[("Llm/embedding/Bedrock/invoke_model", 1)], - rollup_metrics=[("Llm/embedding/Bedrock/invoke_model", 1)], - background_task=True, - ) - @background_task(name="test_bedrock_embedding") - def _test(): - monkeypatch.setattr(bedrock_server._request_signer._credentials, "access_key", "INVALID-ACCESS-KEY") - - with pytest.raises(_client_error): # not sure where this exception actually comes from - set_trace_info() - add_custom_attribute("llm.conversation_id", "my-awesome-id") - add_custom_attribute("llm.foo", "bar") - add_custom_attribute("non_llm_attr", "python-agent") - - exercise_model(prompt="Invalid Token") - - _test() - - @reset_core_stats_engine() @validate_custom_events(embedding_expected_malformed_request_body_events) @validate_custom_event_count(count=1) diff --git a/tests/external_botocore/_test_bedrock_chat_completion_converse.py 
b/tests/external_botocore/_test_bedrock_chat_completion_converse.py index 5eb2e02f0..acd8a43a3 100644 --- a/tests/external_botocore/_test_bedrock_chat_completion_converse.py +++ b/tests/external_botocore/_test_bedrock_chat_completion_converse.py @@ -29,6 +29,9 @@ "duration": None, # Response time varies each test run "request.model": "anthropic.claude-3-sonnet-20240229-v1:0", "response.model": "anthropic.claude-3-sonnet-20240229-v1:0", + "response.usage.prompt_tokens": 26, + "response.usage.completion_tokens": 100, + "response.usage.total_tokens": 126, "request.temperature": 0.7, "request.max_tokens": 100, "response.choices.finish_reason": "max_tokens", @@ -51,6 +54,7 @@ "role": "system", "completion_id": None, "sequence": 0, + "token_count": 0, "response.model": "anthropic.claude-3-sonnet-20240229-v1:0", "vendor": "bedrock", "ingest_source": "Python", @@ -70,6 +74,7 @@ "role": "user", "completion_id": None, "sequence": 1, + "token_count": 0, "response.model": "anthropic.claude-3-sonnet-20240229-v1:0", "vendor": "bedrock", "ingest_source": "Python", @@ -88,6 +93,7 @@ "role": "assistant", "completion_id": None, "sequence": 2, + "token_count": 0, "response.model": "anthropic.claude-3-sonnet-20240229-v1:0", "vendor": "bedrock", "ingest_source": "Python", diff --git a/tests/external_botocore/_test_bedrock_chat_completion_invoke_model.py b/tests/external_botocore/_test_bedrock_chat_completion_invoke_model.py index 2f5a06070..04e05a0c1 100644 --- a/tests/external_botocore/_test_bedrock_chat_completion_invoke_model.py +++ b/tests/external_botocore/_test_bedrock_chat_completion_invoke_model.py @@ -101,6 +101,9 @@ "duration": None, # Response time varies each test run "request.model": "amazon.titan-text-express-v1", "response.model": "amazon.titan-text-express-v1", + "response.usage.completion_tokens": 32, + "response.usage.total_tokens": 44, + "response.usage.prompt_tokens": 12, "request.temperature": 0.7, "request.max_tokens": 100, "response.choices.finish_reason": "FINISH", @@ -123,6 +126,7 @@ "role": "user", "completion_id": None, "sequence": 0, + "token_count": 0, "response.model": "amazon.titan-text-express-v1", "vendor": "bedrock", "ingest_source": "Python", @@ -141,6 +145,7 @@ "role": "assistant", "completion_id": None, "sequence": 1, + "token_count": 0, "response.model": "amazon.titan-text-express-v1", "vendor": "bedrock", "ingest_source": "Python", @@ -282,9 +287,13 @@ "span_id": None, "trace_id": "trace-id", "request_id": "ab38295d-df9c-4141-8173-38221651bf46", + "response_id": None, # UUID that varies with each run "duration": None, # Response time varies each test run "request.model": "anthropic.claude-3-sonnet-20240229-v1:0", "response.model": "anthropic.claude-3-sonnet-20240229-v1:0", + "response.usage.completion_tokens": 31, + "response.usage.prompt_tokens": 21, + "response.usage.total_tokens": 52, "request.temperature": 0.7, "request.max_tokens": 100, "response.choices.finish_reason": "end_turn", @@ -307,6 +316,7 @@ "role": "user", "completion_id": None, "sequence": 0, + "token_count": 0, "response.model": "anthropic.claude-3-sonnet-20240229-v1:0", "vendor": "bedrock", "ingest_source": "Python", @@ -325,6 +335,7 @@ "role": "assistant", "completion_id": None, "sequence": 1, + "token_count": 0, "response.model": "anthropic.claude-3-sonnet-20240229-v1:0", "vendor": "bedrock", "ingest_source": "Python", @@ -408,6 +419,9 @@ "duration": None, # Response time varies each test run "request.model": "meta.llama2-13b-chat-v1", "response.model": "meta.llama2-13b-chat-v1", + 
"response.usage.prompt_tokens": 17, + "response.usage.completion_tokens": 69, + "response.usage.total_tokens": 86, "request.temperature": 0.7, "request.max_tokens": 100, "response.choices.finish_reason": "stop", @@ -430,6 +444,7 @@ "role": "user", "completion_id": None, "sequence": 0, + "token_count": 0, "response.model": "meta.llama2-13b-chat-v1", "vendor": "bedrock", "ingest_source": "Python", @@ -448,6 +463,7 @@ "role": "assistant", "completion_id": None, "sequence": 1, + "token_count": 0, "response.model": "meta.llama2-13b-chat-v1", "vendor": "bedrock", "ingest_source": "Python", @@ -646,6 +662,7 @@ "span_id": None, "trace_id": "trace-id", "request_id": "e8fc1dd7-3d1e-42c6-9c58-535cae563bff", + "response_id": None, # UUID that varies with each run "duration": None, # Response time varies each test run "request.model": "anthropic.claude-3-sonnet-20240229-v1:0", "response.model": "anthropic.claude-3-sonnet-20240229-v1:0", @@ -943,6 +960,7 @@ "span_id": None, "trace_id": "trace-id", "request_id": "96c7306d-2d60-4629-83e9-dbd6befb0e4e", + "response_id": None, # UUID that varies with each run "duration": None, # Response time varies each test run "request.model": "anthropic.claude-3-sonnet-20240229-v1:0", "response.model": "anthropic.claude-3-sonnet-20240229-v1:0", @@ -1129,6 +1147,9 @@ "duration": None, # Response time varies each test run "request.model": "amazon.titan-text-express-v1", "response.model": "amazon.titan-text-express-v1", + "response.usage.completion_tokens": 35, + "response.usage.total_tokens": 47, + "response.usage.prompt_tokens": 12, "request.temperature": 0.7, "request.max_tokens": 100, "response.choices.finish_reason": "FINISH", @@ -1152,6 +1173,7 @@ "role": "user", "completion_id": None, "sequence": 0, + "token_count": 0, "response.model": "amazon.titan-text-express-v1", "vendor": "bedrock", "ingest_source": "Python", @@ -1170,6 +1192,7 @@ "role": "assistant", "completion_id": None, "sequence": 1, + "token_count": 0, "response.model": "amazon.titan-text-express-v1", "vendor": "bedrock", "ingest_source": "Python", @@ -1191,6 +1214,9 @@ "duration": None, # Response time varies each test run "request.model": "anthropic.claude-instant-v1", "response.model": "anthropic.claude-instant-v1", + "response.usage.completion_tokens": 99, + "response.usage.prompt_tokens": 19, + "response.usage.total_tokens": 118, "request.temperature": 0.7, "request.max_tokens": 100, "response.choices.finish_reason": "stop_sequence", @@ -1214,6 +1240,7 @@ "role": "user", "completion_id": None, "sequence": 0, + "token_count": 0, "response.model": "anthropic.claude-instant-v1", "vendor": "bedrock", "ingest_source": "Python", @@ -1232,6 +1259,7 @@ "role": "assistant", "completion_id": None, "sequence": 1, + "token_count": 0, "response.model": "anthropic.claude-instant-v1", "vendor": "bedrock", "ingest_source": "Python", @@ -1315,6 +1343,9 @@ "duration": None, # Response time varies each test run "request.model": "cohere.command-text-v14", "response.model": "cohere.command-text-v14", + "response.usage.completion_tokens": 91, + "response.usage.total_tokens": 100, + "response.usage.prompt_tokens": 9, "request.temperature": 0.7, "request.max_tokens": 100, "response.choices.finish_reason": "COMPLETE", @@ -1338,6 +1369,7 @@ "role": "user", "completion_id": None, "sequence": 0, + "token_count": 0, "response.model": "cohere.command-text-v14", "vendor": "bedrock", "ingest_source": "Python", @@ -1356,6 +1388,7 @@ "role": "assistant", "completion_id": None, "sequence": 1, + "token_count": 0, "response.model": 
"cohere.command-text-v14", "vendor": "bedrock", "ingest_source": "Python", @@ -1377,6 +1410,9 @@ "duration": None, # Response time varies each test run "request.model": "meta.llama2-13b-chat-v1", "response.model": "meta.llama2-13b-chat-v1", + "response.usage.prompt_tokens": 17, + "response.usage.completion_tokens": 100, + "response.usage.total_tokens": 117, "request.temperature": 0.7, "request.max_tokens": 100, "response.choices.finish_reason": "length", @@ -1400,6 +1436,7 @@ "role": "user", "completion_id": None, "sequence": 0, + "token_count": 0, "response.model": "meta.llama2-13b-chat-v1", "vendor": "bedrock", "ingest_source": "Python", @@ -1418,6 +1455,7 @@ "role": "assistant", "completion_id": None, "sequence": 1, + "token_count": 0, "response.model": "meta.llama2-13b-chat-v1", "vendor": "bedrock", "ingest_source": "Python", diff --git a/tests/external_botocore/_test_bedrock_embeddings.py b/tests/external_botocore/_test_bedrock_embeddings.py index f5c227b9c..af544af00 100644 --- a/tests/external_botocore/_test_bedrock_embeddings.py +++ b/tests/external_botocore/_test_bedrock_embeddings.py @@ -33,6 +33,7 @@ "response.model": "amazon.titan-embed-text-v1", "request.model": "amazon.titan-embed-text-v1", "request_id": "11233989-07e8-4ecb-9ba6-79601ba6d8cc", + "response.usage.total_tokens": 6, "vendor": "bedrock", "ingest_source": "Python", }, @@ -52,6 +53,7 @@ "response.model": "amazon.titan-embed-g1-text-02", "request.model": "amazon.titan-embed-g1-text-02", "request_id": "b10ac895-eae3-4f07-b926-10b2866c55ed", + "response.usage.total_tokens": 6, "vendor": "bedrock", "ingest_source": "Python", }, diff --git a/tests/external_botocore/test_bedrock_chat_completion_converse.py b/tests/external_botocore/test_bedrock_chat_completion_converse.py index e365b5163..b613b6c3a 100644 --- a/tests/external_botocore/test_bedrock_chat_completion_converse.py +++ b/tests/external_botocore/test_bedrock_chat_completion_converse.py @@ -23,7 +23,8 @@ from conftest import BOTOCORE_VERSION from testing_support.fixtures import override_llm_token_callback_settings, reset_core_stats_engine, validate_attributes from testing_support.ml_testing_utils import ( - add_token_count_to_events, + add_token_count_streaming_events, + add_token_counts_to_chat_events, disabled_ai_monitoring_record_content_settings, disabled_ai_monitoring_settings, events_sans_content, @@ -140,8 +141,14 @@ def _test(): @reset_core_stats_engine() @override_llm_token_callback_settings(llm_token_count_callback) -def test_bedrock_chat_completion_with_token_count(set_trace_info, exercise_model, expected_metric, expected_events): - @validate_custom_events(add_token_count_to_events(expected_events)) +def test_bedrock_chat_completion_with_token_count( + set_trace_info, exercise_model, expected_metric, expected_events, response_streaming +): + expected_events = add_token_counts_to_chat_events(expected_events) + if response_streaming: + expected_events = add_token_count_streaming_events(expected_events) + + @validate_custom_events(expected_events) # One summary event, one user message, and one response message from the assistant @validate_custom_event_count(count=4) @validate_transaction_metrics( @@ -265,49 +272,6 @@ def _test(): _test() -@reset_core_stats_engine() -@override_llm_token_callback_settings(llm_token_count_callback) -def test_bedrock_chat_completion_error_incorrect_access_key_with_token_count( - exercise_converse_incorrect_access_key, set_trace_info, expected_metric -): - """ - A request is made to the server with invalid credentials. 
-    UnrecognizedClientException as a response. Information from the request will be parsed and reported in customer
-    events. The error response can also be parsed, and will be included as attributes on the recorded exception.
-    """
-
-    @validate_custom_events(add_token_count_to_events(chat_completion_invalid_access_key_error_events))
-    @validate_error_trace_attributes(
-        _client_error_name,
-        exact_attrs={
-            "agent": {},
-            "intrinsic": {},
-            "user": {
-                "http.statusCode": 403,
-                "error.message": "The security token included in the request is invalid.",
-                "error.code": "UnrecognizedClientException",
-            },
-        },
-    )
-    @validate_transaction_metrics(
-        name="test_bedrock_chat_completion_incorrect_access_key_with_token_count",
-        scoped_metrics=[expected_metric],
-        rollup_metrics=[expected_metric],
-        custom_metrics=[(f"Supportability/Python/ML/Bedrock/{BOTOCORE_VERSION}", 1)],
-        background_task=True,
-    )
-    @background_task(name="test_bedrock_chat_completion_incorrect_access_key_with_token_count")
-    def _test():
-        set_trace_info()
-        add_custom_attribute("llm.conversation_id", "my-awesome-id")
-        add_custom_attribute("llm.foo", "bar")
-        add_custom_attribute("non_llm_attr", "python-agent")
-
-        exercise_converse_incorrect_access_key()
-
-    _test()
-
-
 @pytest.fixture
 def exercise_converse_invalid_model(bedrock_converse_server, response_streaming, monkeypatch):
     def _exercise_converse_invalid_model():
diff --git a/tests/external_botocore/test_bedrock_chat_completion_invoke_model.py b/tests/external_botocore/test_bedrock_chat_completion_invoke_model.py
index 9acb0e8ed..ac72e458f 100644
--- a/tests/external_botocore/test_bedrock_chat_completion_invoke_model.py
+++ b/tests/external_botocore/test_bedrock_chat_completion_invoke_model.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import json
 import os
 from io import BytesIO
@@ -35,7 +36,8 @@
 from conftest import BOTOCORE_VERSION
 from testing_support.fixtures import override_llm_token_callback_settings, reset_core_stats_engine, validate_attributes
 from testing_support.ml_testing_utils import (
-    add_token_count_to_events,
+    add_token_count_streaming_events,
+    add_token_counts_to_chat_events,
     disabled_ai_monitoring_record_content_settings,
     disabled_ai_monitoring_settings,
     disabled_ai_monitoring_streaming_settings,
@@ -129,6 +131,14 @@ def expected_events(model_id, response_streaming):
     return chat_completion_expected_events[model_id]
 
 
+@pytest.fixture(scope="module")
+def expected_events(model_id, response_streaming):
+    if response_streaming:
+        return chat_completion_streaming_expected_events[model_id]
+    else:
+        return chat_completion_expected_events[model_id]
+
+
 @pytest.fixture(scope="module")
 def expected_metrics(response_streaming):
     if response_streaming:
@@ -200,7 +210,7 @@ def _test():
 @reset_core_stats_engine()
 @override_llm_token_callback_settings(llm_token_count_callback)
 def test_bedrock_chat_completion_with_token_count(set_trace_info, exercise_model, expected_events, expected_metrics):
-    @validate_custom_events(add_token_count_to_events(expected_events))
+    @validate_custom_events(add_token_counts_to_chat_events(add_token_count_streaming_events(expected_events)))
     # One summary event, one user message, and one response message from the assistant
     @validate_custom_event_count(count=3)
     @validate_transaction_metrics(
@@ -438,49 +448,50 @@ def _test():
     _test()
 
 
-@reset_core_stats_engine()
-@override_llm_token_callback_settings(llm_token_count_callback)
-def test_bedrock_chat_completion_error_incorrect_access_key_with_token(
-    monkeypatch,
-    bedrock_server,
-    exercise_model,
-    set_trace_info,
-    expected_invalid_access_key_error_events,
-    expected_metrics,
-):
-    @validate_custom_events(add_token_count_to_events(expected_invalid_access_key_error_events))
-    @validate_error_trace_attributes(
-        _client_error_name,
-        exact_attrs={
-            "agent": {},
-            "intrinsic": {},
-            "user": {
-                "http.statusCode": 403,
-                "error.message": "The security token included in the request is invalid.",
-                "error.code": "UnrecognizedClientException",
-            },
-        },
-    )
-    @validate_transaction_metrics(
-        name="test_bedrock_chat_completion",
-        scoped_metrics=expected_metrics,
-        rollup_metrics=expected_metrics,
-        custom_metrics=[(f"Supportability/Python/ML/Bedrock/{BOTOCORE_VERSION}", 1)],
-        background_task=True,
-    )
-    @background_task(name="test_bedrock_chat_completion")
-    def _test():
-        monkeypatch.setattr(bedrock_server._request_signer._credentials, "access_key", "INVALID-ACCESS-KEY")
-
-        with pytest.raises(_client_error):  # not sure where this exception actually comes from
-            set_trace_info()
-            add_custom_attribute("llm.conversation_id", "my-awesome-id")
-            add_custom_attribute("llm.foo", "bar")
-            add_custom_attribute("non_llm_attr", "python-agent")
-
-            exercise_model(prompt="Invalid Token", temperature=0.7, max_tokens=100)
-
-    _test()
+#
+# @reset_core_stats_engine()
+# @override_llm_token_callback_settings(llm_token_count_callback)
+# def test_bedrock_chat_completion_error_incorrect_access_key_with_token(
+#     monkeypatch,
+#     bedrock_server,
+#     exercise_model,
+#     set_trace_info,
+#     expected_invalid_access_key_error_events,
+#     expected_metrics,
+# ):
+#     @validate_custom_events(add_token_count_to_events(expected_invalid_access_key_error_events))
+#     @validate_error_trace_attributes(
+#         _client_error_name,
+#         exact_attrs={
+#             "agent": {},
+#             "intrinsic": {},
+# "user": { +# "http.statusCode": 403, +# "error.message": "The security token included in the request is invalid.", +# "error.code": "UnrecognizedClientException", +# }, +# }, +# ) +# @validate_transaction_metrics( +# name="test_bedrock_chat_completion", +# scoped_metrics=expected_metrics, +# rollup_metrics=expected_metrics, +# custom_metrics=[(f"Supportability/Python/ML/Bedrock/{BOTOCORE_VERSION}", 1)], +# background_task=True, +# ) +# @background_task(name="test_bedrock_chat_completion") +# def _test(): +# monkeypatch.setattr(bedrock_server._request_signer._credentials, "access_key", "INVALID-ACCESS-KEY") +# +# with pytest.raises(_client_error): # not sure where this exception actually comes from +# set_trace_info() +# add_custom_attribute("llm.conversation_id", "my-awesome-id") +# add_custom_attribute("llm.foo", "bar") +# add_custom_attribute("non_llm_attr", "python-agent") +# +# exercise_model(prompt="Invalid Token", temperature=0.7, max_tokens=100) +# +# _test() @reset_core_stats_engine() @@ -762,55 +773,6 @@ def _test(): _test() -@reset_core_stats_engine() -@override_llm_token_callback_settings(llm_token_count_callback) -def test_bedrock_chat_completion_error_streaming_exception_with_token_count(bedrock_server, set_trace_info): - """ - Duplicate of test_bedrock_chat_completion_error_streaming_exception, but with token callback being set. - - See the original test for a description of the error case. - """ - - @validate_custom_events(add_token_count_to_events(chat_completion_expected_streaming_error_events)) - @validate_custom_event_count(count=2) - @validate_error_trace_attributes( - _event_stream_error_name, - exact_attrs={ - "agent": {}, - "intrinsic": {}, - "user": { - "error.message": "Malformed input request, please reformat your input and try again.", - "error.code": "ValidationException", - }, - }, - forgone_params={"agent": (), "intrinsic": (), "user": ("http.statusCode")}, - ) - @validate_transaction_metrics( - name="test_bedrock_chat_completion", - scoped_metrics=[("Llm/completion/Bedrock/invoke_model_with_response_stream", 1)], - rollup_metrics=[("Llm/completion/Bedrock/invoke_model_with_response_stream", 1)], - custom_metrics=[(f"Supportability/Python/ML/Bedrock/{BOTOCORE_VERSION}", 1)], - background_task=True, - ) - @background_task(name="test_bedrock_chat_completion") - def _test(): - with pytest.raises(_event_stream_error): - model = "amazon.titan-text-express-v1" - body = (chat_completion_payload_templates[model] % ("Streaming Exception", 0.7, 100)).encode("utf-8") - - set_trace_info() - add_custom_attribute("llm.conversation_id", "my-awesome-id") - add_custom_attribute("llm.foo", "bar") - add_custom_attribute("non_llm_attr", "python-agent") - - response = bedrock_server.invoke_model_with_response_stream( - body=body, modelId=model, accept="application/json", contentType="application/json" - ) - list(response["body"]) # Iterate - - _test() - - def test_bedrock_chat_completion_functions_marked_as_wrapped_for_sdk_compatibility(bedrock_server): assert bedrock_server._nr_wrapped diff --git a/tests/external_botocore/test_bedrock_embeddings.py b/tests/external_botocore/test_bedrock_embeddings.py index 36a5db661..f28308354 100644 --- a/tests/external_botocore/test_bedrock_embeddings.py +++ b/tests/external_botocore/test_bedrock_embeddings.py @@ -29,7 +29,7 @@ from conftest import BOTOCORE_VERSION from testing_support.fixtures import override_llm_token_callback_settings, reset_core_stats_engine, validate_attributes from testing_support.ml_testing_utils import ( - 
+    add_token_count_to_embedding_events,
     disabled_ai_monitoring_record_content_settings,
     disabled_ai_monitoring_settings,
     events_sans_content,
@@ -162,7 +162,7 @@ def _test():
 
 @reset_core_stats_engine()
 @override_llm_token_callback_settings(llm_token_count_callback)
 def test_bedrock_embedding_with_token_count(set_trace_info, exercise_model, expected_events):
-    @validate_custom_events(add_token_count_to_events(expected_events))
+    @validate_custom_events(add_token_count_to_embedding_events(expected_events))
     @validate_custom_event_count(count=1)
     @validate_transaction_metrics(
         name="test_bedrock_embedding",
@@ -287,45 +287,6 @@ def _test():
     _test()
 
 
-@reset_core_stats_engine()
-@override_llm_token_callback_settings(llm_token_count_callback)
-def test_bedrock_embedding_error_incorrect_access_key_with_token_count(
-    monkeypatch, bedrock_server, exercise_model, set_trace_info, expected_invalid_access_key_error_events
-):
-    @validate_custom_events(add_token_count_to_events(expected_invalid_access_key_error_events))
-    @validate_error_trace_attributes(
-        _client_error_name,
-        exact_attrs={
-            "agent": {},
-            "intrinsic": {},
-            "user": {
-                "http.statusCode": 403,
-                "error.message": "The security token included in the request is invalid.",
-                "error.code": "UnrecognizedClientException",
-            },
-        },
-    )
-    @validate_transaction_metrics(
-        name="test_bedrock_embedding",
-        scoped_metrics=[("Llm/embedding/Bedrock/invoke_model", 1)],
-        rollup_metrics=[("Llm/embedding/Bedrock/invoke_model", 1)],
-        background_task=True,
-    )
-    @background_task(name="test_bedrock_embedding")
-    def _test():
-        monkeypatch.setattr(bedrock_server._request_signer._credentials, "access_key", "INVALID-ACCESS-KEY")
-
-        with pytest.raises(_client_error):  # not sure where this exception actually comes from
-            set_trace_info()
-            add_custom_attribute("llm.conversation_id", "my-awesome-id")
-            add_custom_attribute("llm.foo", "bar")
-            add_custom_attribute("non_llm_attr", "python-agent")
-
-            exercise_model(prompt="Invalid Token")
-
-    _test()
-
-
 @reset_core_stats_engine()
 def test_bedrock_embedding_error_malformed_request_body(bedrock_server, set_trace_info):
     """
diff --git a/tests/mlmodel_gemini/test_embeddings.py b/tests/mlmodel_gemini/test_embeddings.py
index 0fc92897b..5b4e30f86 100644
--- a/tests/mlmodel_gemini/test_embeddings.py
+++ b/tests/mlmodel_gemini/test_embeddings.py
@@ -15,7 +15,7 @@
 import google.genai
 from testing_support.fixtures import override_llm_token_callback_settings, reset_core_stats_engine, validate_attributes
 from testing_support.ml_testing_utils import (
-    add_token_count_to_events,
+    add_token_count_to_embedding_events,
     disabled_ai_monitoring_record_content_settings,
     disabled_ai_monitoring_settings,
     events_sans_content,
@@ -93,7 +93,7 @@ def test_gemini_embedding_sync_no_content(gemini_dev_client, set_trace_info):
 
 @reset_core_stats_engine()
 @override_llm_token_callback_settings(llm_token_count_callback)
-@validate_custom_events(add_token_count_to_events(embedding_recorded_events))
+@validate_custom_events(add_token_count_to_embedding_events(embedding_recorded_events))
 @validate_custom_event_count(count=1)
 @validate_transaction_metrics(
     name="test_embeddings:test_gemini_embedding_sync_with_token_count",
@@ -177,7 +177,7 @@ def test_gemini_embedding_async_no_content(gemini_dev_client, loop, set_trace_in
 
 @reset_core_stats_engine()
 @override_llm_token_callback_settings(llm_token_count_callback)
-@validate_custom_events(add_token_count_to_events(embedding_recorded_events))
+@validate_custom_events(add_token_count_to_embedding_events(embedding_recorded_events))
 @validate_custom_event_count(count=1)
 @validate_transaction_metrics(
     name="test_embeddings:test_gemini_embedding_async_with_token_count",
diff --git a/tests/mlmodel_gemini/test_embeddings_error.py b/tests/mlmodel_gemini/test_embeddings_error.py
index a65a6c2c6..f0e7aac58 100644
--- a/tests/mlmodel_gemini/test_embeddings_error.py
+++ b/tests/mlmodel_gemini/test_embeddings_error.py
@@ -16,12 +16,10 @@
 import google.genai
 import pytest
-from testing_support.fixtures import dt_enabled, override_llm_token_callback_settings, reset_core_stats_engine
+from testing_support.fixtures import dt_enabled, reset_core_stats_engine
 from testing_support.ml_testing_utils import (
-    add_token_count_to_events,
     disabled_ai_monitoring_record_content_settings,
     events_sans_content,
-    llm_token_count_callback,
     set_trace_info,
 )
 from testing_support.validators.validate_custom_event import validate_custom_event_count
@@ -159,34 +157,6 @@ def test_embeddings_invalid_request_error_invalid_model(gemini_dev_client, set_t
         gemini_dev_client.models.embed_content(contents="Embedded: Model does not exist.", model="does-not-exist")
 
 
-@dt_enabled
-@reset_core_stats_engine()
-@override_llm_token_callback_settings(llm_token_count_callback)
-@validate_error_trace_attributes(
-    callable_name(google.genai.errors.ClientError),
-    exact_attrs={"agent": {}, "intrinsic": {}, "user": {"error.code": "NOT_FOUND", "http.statusCode": 404}},
-)
-@validate_span_events(
-    exact_agents={
-        "error.message": "models/does-not-exist is not found for API version v1beta, or is not supported for embedContent. Call ListModels to see the list of available models and their supported methods."
-    }
-)
-@validate_transaction_metrics(
-    name="test_embeddings_error:test_embeddings_invalid_request_error_invalid_model_with_token_count",
-    scoped_metrics=[("Llm/embedding/Gemini/embed_content", 1)],
-    rollup_metrics=[("Llm/embedding/Gemini/embed_content", 1)],
-    custom_metrics=[(f"Supportability/Python/ML/Gemini/{google.genai.__version__}", 1)],
-    background_task=True,
-)
-@validate_custom_events(add_token_count_to_events(invalid_model_events))
-@validate_custom_event_count(count=1)
-@background_task()
-def test_embeddings_invalid_request_error_invalid_model_with_token_count(gemini_dev_client, set_trace_info):
-    with pytest.raises(google.genai.errors.ClientError):
-        set_trace_info()
-        gemini_dev_client.models.embed_content(contents="Embedded: Model does not exist.", model="does-not-exist")
-
-
 embedding_invalid_key_error_events = [
     (
         {"type": "LlmEmbedding"},
@@ -326,36 +296,6 @@ def test_embeddings_async_invalid_request_error_invalid_model(gemini_dev_client,
         )
 
 
-@dt_enabled
-@reset_core_stats_engine()
-@override_llm_token_callback_settings(llm_token_count_callback)
-@validate_error_trace_attributes(
-    callable_name(google.genai.errors.ClientError),
-    exact_attrs={"agent": {}, "intrinsic": {}, "user": {"error.code": "NOT_FOUND", "http.statusCode": 404}},
-)
-@validate_span_events(
-    exact_agents={
-        "error.message": "models/does-not-exist is not found for API version v1beta, or is not supported for embedContent. Call ListModels to see the list of available models and their supported methods."
-    }
-)
-@validate_transaction_metrics(
-    name="test_embeddings_error:test_embeddings_async_invalid_request_error_invalid_model_with_token_count",
-    scoped_metrics=[("Llm/embedding/Gemini/embed_content", 1)],
-    rollup_metrics=[("Llm/embedding/Gemini/embed_content", 1)],
-    custom_metrics=[(f"Supportability/Python/ML/Gemini/{google.genai.__version__}", 1)],
-    background_task=True,
-)
-@validate_custom_events(add_token_count_to_events(invalid_model_events))
-@validate_custom_event_count(count=1)
-@background_task()
-def test_embeddings_async_invalid_request_error_invalid_model_with_token_count(gemini_dev_client, loop, set_trace_info):
-    with pytest.raises(google.genai.errors.ClientError):
-        set_trace_info()
-        loop.run_until_complete(
-            gemini_dev_client.models.embed_content(contents="Embedded: Model does not exist.", model="does-not-exist")
-        )
-
-
 # Wrong api_key provided
 @dt_enabled
 @reset_core_stats_engine()
diff --git a/tests/mlmodel_gemini/test_text_generation.py b/tests/mlmodel_gemini/test_text_generation.py
index a01d10897..5d81deea6 100644
--- a/tests/mlmodel_gemini/test_text_generation.py
+++ b/tests/mlmodel_gemini/test_text_generation.py
@@ -15,7 +15,7 @@
 import google.genai
 from testing_support.fixtures import override_llm_token_callback_settings, reset_core_stats_engine, validate_attributes
 from testing_support.ml_testing_utils import (
-    add_token_count_to_events,
+    add_token_counts_to_chat_events,
     disabled_ai_monitoring_record_content_settings,
     disabled_ai_monitoring_settings,
     events_sans_content,
@@ -51,6 +51,9 @@
         "vendor": "gemini",
         "ingest_source": "Python",
         "response.number_of_messages": 2,
+        "response.usage.prompt_tokens": 9,
+        "response.usage.completion_tokens": 13,
+        "response.usage.total_tokens": 22,
     },
 ),
 (
@@ -62,6 +65,7 @@
         "llm.foo": "bar",
         "span_id": None,
         "trace_id": "trace-id",
+        "token_count": 0,
         "content": "How many letters are in the word Python?",
         "role": "user",
         "completion_id": None,
@@ -79,6 +83,7 @@
         "llm.foo": "bar",
         "span_id": None,
         "trace_id": "trace-id",
+        "token_count": 0,
         "content": 'There are **6** letters in the word "Python".\n',
         "role": "model",
         "completion_id": None,
@@ -185,7 +190,8 @@ def test_gemini_text_generation_sync_no_content(gemini_dev_client, set_trace_inf
 
 @reset_core_stats_engine()
 @override_llm_token_callback_settings(llm_token_count_callback)
-@validate_custom_events(add_token_count_to_events(text_generation_recorded_events))
+# Ensure LLM callback is invoked and response token counts are overridden
+@validate_custom_events(add_token_counts_to_chat_events(text_generation_recorded_events))
 @validate_custom_event_count(count=3)
 @validate_transaction_metrics(
     name="test_text_generation:test_gemini_text_generation_sync_with_token_count",
@@ -326,7 +332,7 @@ def test_gemini_text_generation_async_no_content(gemini_dev_client, loop, set_tr
 
 @reset_core_stats_engine()
 @override_llm_token_callback_settings(llm_token_count_callback)
-@validate_custom_events(add_token_count_to_events(text_generation_recorded_events))
+@validate_custom_events(add_token_counts_to_chat_events(text_generation_recorded_events))
 @validate_custom_event_count(count=3)
 @validate_transaction_metrics(
     name="test_text_generation:test_gemini_text_generation_async_with_token_count",
diff --git a/tests/mlmodel_gemini/test_text_generation_error.py b/tests/mlmodel_gemini/test_text_generation_error.py
index eb8aec950..37f5b0646 100644
--- a/tests/mlmodel_gemini/test_text_generation_error.py
+++ b/tests/mlmodel_gemini/test_text_generation_error.py
@@ -17,13 +17,11 @@
 import google.genai
 import pytest
-from testing_support.fixtures import dt_enabled, override_llm_token_callback_settings, reset_core_stats_engine
+from testing_support.fixtures import dt_enabled, reset_core_stats_engine
 from testing_support.ml_testing_utils import (
-    add_token_count_to_events,
     disabled_ai_monitoring_record_content_settings,
     events_sans_content,
     events_with_context_attrs,
-    llm_token_count_callback,
     set_trace_info,
 )
 from testing_support.validators.validate_custom_event import validate_custom_event_count
@@ -65,6 +63,7 @@
         "trace_id": "trace-id",
         "content": "How many letters are in the word Python?",
         "role": "user",
+        "token_count": 0,
         "completion_id": None,
         "sequence": 0,
         "vendor": "gemini",
@@ -171,6 +170,7 @@ def _test():
         "trace_id": "trace-id",
         "content": "Model does not exist.",
         "role": "user",
+        "token_count": 0,
         "completion_id": None,
         "response.model": "does-not-exist",
         "sequence": 0,
@@ -183,39 +183,6 @@
 
 @dt_enabled
 @reset_core_stats_engine()
-@override_llm_token_callback_settings(llm_token_count_callback)
-@validate_error_trace_attributes(
-    callable_name(google.genai.errors.ClientError),
-    exact_attrs={"agent": {}, "intrinsic": {}, "user": {"error.code": "NOT_FOUND", "http.statusCode": 404}},
-)
-@validate_span_events(
-    exact_agents={
-        "error.message": "models/does-not-exist is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods."
-    }
-)
-@validate_transaction_metrics(
-    "test_text_generation_error:test_text_generation_invalid_request_error_invalid_model_with_token_count",
-    scoped_metrics=[("Llm/completion/Gemini/generate_content", 1)],
-    rollup_metrics=[("Llm/completion/Gemini/generate_content", 1)],
-    background_task=True,
-)
-@validate_custom_events(add_token_count_to_events(expected_events_on_invalid_model_error))
-@validate_custom_event_count(count=2)
-@background_task()
-def test_text_generation_invalid_request_error_invalid_model_with_token_count(gemini_dev_client, set_trace_info):
-    with pytest.raises(google.genai.errors.ClientError):
-        set_trace_info()
-        add_custom_attribute("llm.conversation_id", "my-awesome-id")
-        gemini_dev_client.models.generate_content(
-            model="does-not-exist",
-            contents=["Model does not exist."],
-            config=google.genai.types.GenerateContentConfig(max_output_tokens=100, temperature=0.7),
-        )
-
-
-@dt_enabled
-@reset_core_stats_engine()
-@override_llm_token_callback_settings(llm_token_count_callback)
 @validate_error_trace_attributes(
     callable_name(google.genai.errors.ClientError),
     exact_attrs={"agent": {}, "intrinsic": {}, "user": {"error.code": "NOT_FOUND", "http.statusCode": 404}},
@@ -231,7 +198,7 @@ def test_text_generation_invalid_request_error_invalid_model_with_token_count(ge
     rollup_metrics=[("Llm/completion/Gemini/generate_content", 1)],
     background_task=True,
 )
-@validate_custom_events(add_token_count_to_events(expected_events_on_invalid_model_error))
+@validate_custom_events(expected_events_on_invalid_model_error)
 @validate_custom_event_count(count=2)
 @background_task()
 def test_text_generation_invalid_request_error_invalid_model_chat(gemini_dev_client, set_trace_info):
@@ -272,6 +239,7 @@ def test_text_generation_invalid_request_error_invalid_model_chat(gemini_dev_cli
         "trace_id": "trace-id",
         "content": "Invalid API key.",
         "role": "user",
+        "token_count": 0,
         "response.model": "gemini-flash-2.0",
         "completion_id": None,
         "sequence": 0,
@@ -383,43 +351,6 @@
 
 @dt_enabled
 @reset_core_stats_engine()
-@override_llm_token_callback_settings(llm_token_count_callback)
-@validate_error_trace_attributes(
-    callable_name(google.genai.errors.ClientError),
-    exact_attrs={"agent": {}, "intrinsic": {}, "user": {"error.code": "NOT_FOUND", "http.statusCode": 404}},
-)
-@validate_span_events(
-    exact_agents={
-        "error.message": "models/does-not-exist is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods."
-    }
-)
-@validate_transaction_metrics(
-    "test_text_generation_error:test_text_generation_async_invalid_request_error_invalid_model_with_token_count",
-    scoped_metrics=[("Llm/completion/Gemini/generate_content", 1)],
-    rollup_metrics=[("Llm/completion/Gemini/generate_content", 1)],
-    background_task=True,
-)
-@validate_custom_events(add_token_count_to_events(expected_events_on_invalid_model_error))
-@validate_custom_event_count(count=2)
-@background_task()
-def test_text_generation_async_invalid_request_error_invalid_model_with_token_count(
-    gemini_dev_client, loop, set_trace_info
-):
-    with pytest.raises(google.genai.errors.ClientError):
-        set_trace_info()
-        add_custom_attribute("llm.conversation_id", "my-awesome-id")
-        loop.run_until_complete(
-            gemini_dev_client.models.generate_content(
-                model="does-not-exist",
-                contents=["Model does not exist."],
-                config=google.genai.types.GenerateContentConfig(max_output_tokens=100, temperature=0.7),
-            )
-        )
-
-
-@dt_enabled
-@reset_core_stats_engine()
-@override_llm_token_callback_settings(llm_token_count_callback)
 @validate_error_trace_attributes(
     callable_name(google.genai.errors.ClientError),
     exact_attrs={"agent": {}, "intrinsic": {}, "user": {"error.code": "NOT_FOUND", "http.statusCode": 404}},
@@ -435,7 +366,7 @@ def test_text_generation_async_invalid_request_error_invalid_model_with_token_co
     rollup_metrics=[("Llm/completion/Gemini/generate_content", 1)],
     background_task=True,
 )
-@validate_custom_events(add_token_count_to_events(expected_events_on_invalid_model_error))
+@validate_custom_events(expected_events_on_invalid_model_error)
 @validate_custom_event_count(count=2)
 @background_task()
 def test_text_generation_async_invalid_request_error_invalid_model_chat(gemini_dev_client, loop, set_trace_info):
diff --git a/tests/mlmodel_langchain/test_chain.py b/tests/mlmodel_langchain/test_chain.py
index 30281843b..858c0e356 100644
--- a/tests/mlmodel_langchain/test_chain.py
+++ b/tests/mlmodel_langchain/test_chain.py
@@ -385,6 +385,7 @@
         "response.headers.ratelimitRemainingTokens": 9999992,
         "response.headers.ratelimitResetRequests": "6ms",
         "response.headers.ratelimitResetTokens": "0s",
+        "response.usage.total_tokens": 8,
         "vendor": "openai",
         "ingest_source": "Python",
         "input": "[[3923, 374, 220, 17, 489, 220, 19, 30]]",
@@ -408,6 +409,7 @@
         "response.headers.ratelimitRemainingTokens": 9999998,
         "response.headers.ratelimitResetRequests": "6ms",
         "response.headers.ratelimitResetTokens": "0s",
+        "response.usage.total_tokens": 1,
         "vendor": "openai",
         "ingest_source": "Python",
         "input": "[[10590]]",
@@ -480,6 +482,9 @@
         "response.headers.ratelimitRemainingTokens": 49999927,
         "response.headers.ratelimitResetRequests": "6ms",
         "response.headers.ratelimitResetTokens": "0s",
+        "response.usage.prompt_tokens": 73,
+        "response.usage.completion_tokens": 337,
+        "response.usage.total_tokens": 410,
         "response.number_of_messages": 3,
     },
 ],
@@ -496,6 +501,7 @@
         "sequence": 0,
         "response.model": "gpt-3.5-turbo-0125",
         "vendor": "openai",
+        "token_count": 0,
         "ingest_source": "Python",
"content": "You are a generator of quiz questions for a seminar. Use the following pieces of retrieved context to generate 5 multiple choice questions (A,B,C,D) on the subject matter. Use a three sentence maximum and keep the answer concise. Render the output as HTML\n\nWhat is 2 + 4?", }, @@ -513,6 +519,7 @@ "sequence": 1, "response.model": "gpt-3.5-turbo-0125", "vendor": "openai", + "token_count": 0, "ingest_source": "Python", "content": "math", }, @@ -529,6 +536,7 @@ "sequence": 2, "response.model": "gpt-3.5-turbo-0125", "vendor": "openai", + "token_count": 0, "ingest_source": "Python", "is_response": True, "content": "```html\n\n\n
\n