Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### Changed

- Improved LiteLLM GenAI util invocation mapping for positional arguments,
streaming time-to-first-token, multi-choice outputs, and tool-call deltas,
and added a real smoke example
([#191](https://github.com/alibaba/loongsuite-python-agent/pull/191)).

## Version 0.5.0 (2026-05-11)

There are no changelog entries for this release.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ Configuration
The instrumentation can be enabled/disabled using environment variables:

* ``ENABLE_LITELLM_INSTRUMENTOR``: Enable/disable instrumentation (default: true)
* ``OTEL_SEMCONV_STABILITY_OPT_IN``: Set to ``gen_ai_latest_experimental`` to enable GenAI semantic conventions
* ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT``: Set to ``NO_CONTENT``, ``SPAN_ONLY``, ``EVENT_ONLY``, or ``SPAN_AND_EVENT``

Usage
-----
Expand All @@ -43,6 +45,32 @@ Usage
messages=[{"role": "user", "content": "Hello!"}]
)

Local OTLP smoke
----------------

The ``examples/litellm_genai_smoke.py`` script sends real LiteLLM traffic for:

* non-streaming completion
* streaming completion
* concurrent async completion calls

Set ``LITELLM_SMOKE_MODE`` to ``non_streaming``, ``streaming``,
``concurrent``, or ``all`` (default) to run a subset.

Example with a local ``otel-gui`` OTLP endpoint:

.. code:: console

export DASHSCOPE_API_KEY=...
export OTEL_EXPORTER_OTLP_ENDPOINT=http://127.0.0.1:4318
export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
export OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental
export OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=SPAN_ONLY
export OTEL_SERVICE_NAME=loongsuite-litellm-smoke

loongsuite-instrument python \
instrumentation-loongsuite/loongsuite-instrumentation-litellm/examples/litellm_genai_smoke.py

Features
--------

Expand All @@ -53,6 +81,9 @@ This instrumentation automatically captures:
* Embedding calls
* Retry mechanisms
* Tool/function calls
* Provider inference from known OpenAI-compatible base URLs, custom providers, and model names
* Streaming time-to-first-token, including reasoning/thinking deltas
* Multi-choice streaming outputs and tool-call delta accumulation
* Request and response metadata
* Token usage
* Model information
Expand All @@ -65,4 +96,3 @@ References
* `OpenTelemetry LiteLLM Instrumentation <https://opentelemetry-python-contrib.readthedocs.io/en/latest/instrumentation/litellm/litellm.html>`_
* `OpenTelemetry Project <https://opentelemetry.io/>`_
* `LiteLLM Documentation <https://docs.litellm.ai/>`_

Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Real LiteLLM smoke traffic for LoongSuite GenAI telemetry.

Run this under ``loongsuite-instrument`` with OTLP configured. The script
exercises non-streaming, streaming, and concurrent async completion calls.
"""

from __future__ import annotations

import asyncio
import os

import litellm

# Model name passed to every LiteLLM call; override with LITELLM_MODEL.
MODEL = os.getenv("LITELLM_MODEL", "qwen-turbo")
# Base URL passed as ``api_base``; override with LITELLM_API_BASE. The default
# points at DashScope's compatible-mode endpoint.
API_BASE = os.getenv(
    "LITELLM_API_BASE",
    "https://dashscope.aliyuncs.com/compatible-mode/v1",
)
# Value passed as ``custom_llm_provider``; override with
# LITELLM_CUSTOM_LLM_PROVIDER.
CUSTOM_PROVIDER = os.getenv("LITELLM_CUSTOM_LLM_PROVIDER", "openai")


def _configure_provider() -> None:
    """Set the module-level ``litellm.telemetry`` flag to ``False``."""
    litellm.telemetry = False


def _provider_kwargs() -> dict[str, str]:
api_key = (
os.getenv("LITELLM_API_KEY")
or os.getenv("DASHSCOPE_API_KEY")
or os.getenv("OPENAI_API_KEY")
)
if not api_key:
raise SystemExit(
"Missing required API key: set LITELLM_API_KEY, "
"DASHSCOPE_API_KEY, or OPENAI_API_KEY"
)

return {
"custom_llm_provider": CUSTOM_PROVIDER,
"api_key": api_key,
"api_base": API_BASE,
}


def run_non_streaming() -> None:
    """Send one blocking completion request and print a preview of the reply.

    Prints ``non_streaming:`` followed by at most the first 80 characters of
    the first choice's message content.

    Raises:
        SystemExit: Propagated from ``_provider_kwargs`` when no API key is
            configured.
    """
    response = litellm.completion(
        model=MODEL,
        **_provider_kwargs(),
        messages=[
            {
                "role": "user",
                "content": "Reply with exactly one short sentence.",
            }
        ],
        temperature=0.1,
        max_tokens=64,
    )
    # ``content`` is nullable in chat-completion responses (for example when
    # a choice carries no text); print an empty preview rather than failing
    # with a TypeError from slicing ``None``.
    content = response.choices[0].message.content or ""
    print("non_streaming:", content[:80])


def run_streaming() -> None:
    """Stream one completion and print a preview of the concatenated text.

    Only the first choice of each chunk is read; chunks without choices and
    deltas without text content are ignored. Prints ``streaming:`` followed
    by at most the first 80 characters of the joined text.
    """
    prompt = [
        {
            "role": "user",
            "content": "Count from one to five, separated by commas.",
        }
    ]
    stream = litellm.completion(
        model=MODEL,
        **_provider_kwargs(),
        messages=prompt,
        stream=True,
        temperature=0.1,
        max_tokens=64,
    )

    pieces = []
    for event in stream:
        if not event.choices:
            continue
        text = getattr(event.choices[0].delta, "content", None)
        if text:
            pieces.append(text)

    preview = "".join(pieces)[:80]
    print("streaming:", preview)


async def run_concurrent() -> None:
    """Issue three async completion requests concurrently and print previews.

    Prints ``concurrent:`` followed by at most the first 24 characters of each
    response's first-choice content, comma-separated, in prompt order.

    Raises:
        SystemExit: Propagated from ``_provider_kwargs`` when no API key is
            configured.
    """
    prompts = [
        "Give one word for sky color.",
        "Give one word for ocean color.",
        "Give one word for grass color.",
    ]
    # Resolve credentials once, up front: a missing key then aborts before
    # any coroutine is scheduled instead of raising SystemExit inside the
    # tasks driven by ``asyncio.gather``.
    provider_kwargs = _provider_kwargs()

    async def call(prompt: str):
        return await litellm.acompletion(
            model=MODEL,
            **provider_kwargs,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.1,
            max_tokens=32,
        )

    responses = await asyncio.gather(*(call(prompt) for prompt in prompts))
    print(
        "concurrent:",
        ", ".join(
            # ``content`` may be ``None``; fall back to an empty preview
            # instead of raising TypeError on the slice.
            (response.choices[0].message.content or "")[:24]
            for response in responses
        ),
    )


def main() -> None:
    """Run the smoke scenarios selected by ``LITELLM_SMOKE_MODE``.

    The variable defaults to ``all`` and is matched case-insensitively after
    trimming surrounding whitespace. ``all`` runs the non-streaming,
    streaming and concurrent scenarios in that order; any other valid value
    runs just that scenario.

    Raises:
        SystemExit: If the mode is not one of the supported values. Without
            this check a mistyped mode would run nothing and exit
            successfully, which reads as a passing smoke run.
    """
    valid_modes = ("all", "non_streaming", "streaming", "concurrent")
    mode = os.getenv("LITELLM_SMOKE_MODE", "all").strip().lower()
    # Validate before touching the provider so a bad mode fails immediately.
    if mode not in valid_modes:
        raise SystemExit(
            f"Unsupported LITELLM_SMOKE_MODE {mode!r}: expected one of "
            + ", ".join(valid_modes)
        )

    _configure_provider()

    if mode in ("all", "non_streaming"):
        run_non_streaming()
    if mode in ("all", "streaming"):
        run_streaming()
    if mode in ("all", "concurrent"):
        asyncio.run(run_concurrent())


if __name__ == "__main__":
    main()
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,18 @@ instruments = [
litellm = "opentelemetry.instrumentation.litellm:LiteLLMInstrumentor"

[project.urls]
Homepage = "https://github.com/open-telemetry/opentelemetry-python-contrib/tree/main/instrumentation/opentelemetry-instrumentation-litellm"
Repository = "https://github.com/open-telemetry/opentelemetry-python-contrib"
Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-litellm"
Repository = "https://github.com/alibaba/loongsuite-python-agent"

[tool.hatch.version]
path = "src/opentelemetry/instrumentation/litellm/version.py"

[tool.hatch.build.targets.sdist]
include = [
"/examples",
"/src",
"/tests",
]

[tool.hatch.build.targets.wheel]
packages = ["src/opentelemetry"]

Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,18 @@
Embedding wrapper for LiteLLM instrumentation.
"""

import logging
import os
from typing import Callable

from opentelemetry import context
from opentelemetry.context import _SUPPRESS_INSTRUMENTATION_KEY
from opentelemetry.instrumentation.litellm._utils import (
apply_litellm_embedding_response_to_invocation,
create_embedding_invocation_from_litellm,
normalize_litellm_embedding_kwargs,
)
from opentelemetry.util.genai.types import Error

logger = logging.getLogger(__name__)


def _is_instrumentation_enabled() -> bool:
"""Check if instrumentation is enabled via environment variable."""
Expand All @@ -53,8 +52,10 @@ def __call__(self, *args, **kwargs):
if context.get_value(_SUPPRESS_INSTRUMENTATION_KEY):
return self.original_func(*args, **kwargs)

# Create invocation object
invocation = create_embedding_invocation_from_litellm(**kwargs)
request_kwargs = normalize_litellm_embedding_kwargs(
self.original_func, args, kwargs
)
invocation = create_embedding_invocation_from_litellm(**request_kwargs)

# Start Embedding invocation
self._handler.start_embedding(invocation)
Expand All @@ -63,43 +64,9 @@ def __call__(self, *args, **kwargs):
# Call original function
response = self.original_func(*args, **kwargs)

# Extract response metadata
if hasattr(response, "model"):
invocation.response_model_name = response.model

# Extract token usage if available
if hasattr(response, "usage") and response.usage:
invocation.input_tokens = getattr(
response.usage, "prompt_tokens", None
)
invocation.output_tokens = getattr(
response.usage, "total_tokens", None
)

# Extract embedding dimension count
if (
hasattr(response, "data")
and response.data
and len(response.data) > 0
):
try:
first_embedding = response.data[0]
# Handle dict response
if (
isinstance(first_embedding, dict)
and "embedding" in first_embedding
):
embedding_vector = first_embedding["embedding"]
if isinstance(embedding_vector, list):
invocation.dimension_count = len(embedding_vector)
# Handle object response
elif hasattr(first_embedding, "embedding"):
embedding_vector = first_embedding.embedding
if isinstance(embedding_vector, list):
invocation.dimension_count = len(embedding_vector)
except (IndexError, AttributeError, KeyError, TypeError):
# If we can't extract dimension, just skip it
pass
apply_litellm_embedding_response_to_invocation(
invocation, response
)

# End Embedding invocation successfully
self._handler.stop_embedding(invocation)
Expand Down Expand Up @@ -131,8 +98,10 @@ async def __call__(self, *args, **kwargs):
if context.get_value(_SUPPRESS_INSTRUMENTATION_KEY):
return await self.original_func(*args, **kwargs)

# Create invocation object
invocation = create_embedding_invocation_from_litellm(**kwargs)
request_kwargs = normalize_litellm_embedding_kwargs(
self.original_func, args, kwargs
)
invocation = create_embedding_invocation_from_litellm(**request_kwargs)

# Start Embedding invocation
self._handler.start_embedding(invocation)
Expand All @@ -141,43 +110,9 @@ async def __call__(self, *args, **kwargs):
# Call original function
response = await self.original_func(*args, **kwargs)

# Extract response metadata
if hasattr(response, "model"):
invocation.response_model_name = response.model

# Extract token usage if available
if hasattr(response, "usage") and response.usage:
invocation.input_tokens = getattr(
response.usage, "prompt_tokens", None
)
invocation.output_tokens = getattr(
response.usage, "total_tokens", None
)

# Extract embedding dimension count
if (
hasattr(response, "data")
and response.data
and len(response.data) > 0
):
try:
first_embedding = response.data[0]
# Handle dict response
if (
isinstance(first_embedding, dict)
and "embedding" in first_embedding
):
embedding_vector = first_embedding["embedding"]
if isinstance(embedding_vector, list):
invocation.dimension_count = len(embedding_vector)
# Handle object response
elif hasattr(first_embedding, "embedding"):
embedding_vector = first_embedding.embedding
if isinstance(embedding_vector, list):
invocation.dimension_count = len(embedding_vector)
except (IndexError, AttributeError, KeyError, TypeError):
# If we can't extract dimension, just skip it
pass
apply_litellm_embedding_response_to_invocation(
invocation, response
)

# End Embedding invocation successfully
self._handler.stop_embedding(invocation)
Expand Down
Loading
Loading