alibaba · sipercai · May 20, 2026 · May 21, 2026 · May 21, 2026
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-litellm/CHANGELOG.md b/instrumentation-loongsuite/loongsuite-instrumentation-litellm/CHANGELOG.md
@@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Changed
+
+- Improved LiteLLM GenAI util invocation mapping for positional arguments,
+  streaming time-to-first-token, multi-choice outputs, and tool-call deltas,
+  and added a real smoke example
+  ([#191](https://github.com/alibaba/loongsuite-python-agent/pull/191)).
+
 ## Version 0.5.0 (2026-05-11)
 
 There are no changelog entries for this release.

diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-litellm/README.rst b/instrumentation-loongsuite/loongsuite-instrumentation-litellm/README.rst
@@ -25,6 +25,8 @@ Configuration
 The instrumentation can be enabled/disabled using environment variables:
 
 * ``ENABLE_LITELLM_INSTRUMENTOR``: Enable/disable instrumentation (default: true)
+* ``OTEL_SEMCONV_STABILITY_OPT_IN``: Set to ``gen_ai_latest_experimental`` to enable GenAI semantic conventions
+* ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT``: Controls where message content is captured; set to ``NO_CONTENT``, ``SPAN_ONLY``, ``EVENT_ONLY``, or ``SPAN_AND_EVENT``
 
 Usage
 -----
@@ -43,6 +45,32 @@ Usage
         messages=[{"role": "user", "content": "Hello!"}]
     )
 
+Local OTLP smoke
+----------------
+
+The ``examples/litellm_genai_smoke.py`` script sends real LiteLLM traffic for:
+
+* non-streaming completion
+* streaming completion
+* concurrent async completion calls
+
+Set ``LITELLM_SMOKE_MODE`` to ``non_streaming``, ``streaming``,
+``concurrent``, or ``all`` (default) to run a subset.
+
+Example with a local ``otel-gui`` OTLP endpoint:
+
+.. code:: console
+
+    export DASHSCOPE_API_KEY=...
+    export OTEL_EXPORTER_OTLP_ENDPOINT=http://127.0.0.1:4318
+    export OTEL_EXPORTER_OTLP_PROTOCOL=http/protobuf
+    export OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental
+    export OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=SPAN_ONLY
+    export OTEL_SERVICE_NAME=loongsuite-litellm-smoke
+
+    loongsuite-instrument python \
+        instrumentation-loongsuite/loongsuite-instrumentation-litellm/examples/litellm_genai_smoke.py
+
 Features
 --------
 
@@ -53,6 +81,9 @@ This instrumentation automatically captures:
 * Embedding calls
 * Retry mechanisms
 * Tool/function calls
+* Provider inference from known OpenAI-compatible base URLs, custom providers, and model names
+* Streaming time-to-first-token, including reasoning/thinking deltas
+* Multi-choice streaming outputs and tool-call delta accumulation
 * Request and response metadata
 * Token usage
 * Model information
@@ -65,4 +96,3 @@ References
 * `OpenTelemetry LiteLLM Instrumentation <https://opentelemetry-python-contrib.readthedocs.io/en/latest/instrumentation/litellm/litellm.html>`_
 * `OpenTelemetry Project <https://opentelemetry.io/>`_
 * `LiteLLM Documentation <https://docs.litellm.ai/>`_
-
diff --git a/...rumentation-loongsuite/loongsuite-instrumentation-litellm/examples/litellm_genai_smoke.py b/...rumentation-loongsuite/loongsuite-instrumentation-litellm/examples/litellm_genai_smoke.py
@@ -0,0 +1,137 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Real LiteLLM smoke traffic for LoongSuite GenAI telemetry.
+
+Run this under ``loongsuite-instrument`` with OTLP configured. The script
+exercises non-streaming, streaming, and concurrent async completion calls.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import os
+
+import litellm
+
+MODEL = os.getenv("LITELLM_MODEL", "qwen-turbo")  # model name for each call
+API_BASE = os.getenv(
+    "LITELLM_API_BASE",
+    "https://dashscope.aliyuncs.com/compatible-mode/v1",  # default endpoint
+)
+CUSTOM_PROVIDER = os.getenv("LITELLM_CUSTOM_LLM_PROVIDER", "openai")
+
+
+def _configure_provider() -> None:  # module-level setup, called by main()
+    litellm.telemetry = False  # presumably opts out of LiteLLM telemetry
+
+
+def _provider_kwargs() -> dict[str, str]:
+    """Collect the provider keyword arguments shared by every request.
+
+    Raises ``SystemExit`` when no API key variable has a non-empty value.
+    """
+    names = ("LITELLM_API_KEY", "DASHSCOPE_API_KEY", "OPENAI_API_KEY")
+    token = next((value for value in map(os.getenv, names) if value), None)
+    if token is None:
+        raise SystemExit(
+            "Missing required API key: set LITELLM_API_KEY, "
+            "DASHSCOPE_API_KEY, or OPENAI_API_KEY"
+        )
+    return {
+        "custom_llm_provider": CUSTOM_PROVIDER,
+        "api_key": token,
+        "api_base": API_BASE,
+    }
+
+
+def run_non_streaming() -> None:
+    """Send one blocking completion request and print a reply preview."""
+    prompt = "Reply with exactly one short sentence."
+    response = litellm.completion(
+        model=MODEL,
+        **_provider_kwargs(),
+        messages=[{"role": "user", "content": prompt}],
+        temperature=0.1,
+        max_tokens=64,
+    )
+    # ``content`` may be None (e.g. a tool-call-only reply); slicing None
+    # would raise TypeError, so fall back to an empty string.
+    text = response.choices[0].message.content or ""
+    print("non_streaming:", text[:80])
+
+
+def run_streaming() -> None:
+    """Stream one completion and print a preview of the joined text."""
+    prompt = "Count from one to five, separated by commas."
+    events = litellm.completion(
+        model=MODEL,
+        **_provider_kwargs(),
+        messages=[{"role": "user", "content": prompt}],
+        stream=True,
+        temperature=0.1,
+        max_tokens=64,
+    )
+
+    # Only the first choice of each chunk is read; chunks without choices
+    # and deltas without truthy ``content`` contribute nothing.
+    pieces = []
+    for event in events:
+        if not event.choices:
+            continue
+        delta = event.choices[0].delta
+        if getattr(delta, "content", None):
+            pieces.append(delta.content)
+    print("streaming:", "".join(pieces)[:80])
+
+
+async def run_concurrent() -> None:
+    """Issue three async completions concurrently and print each reply."""
+    prompts = [
+        "Give one word for sky color.",
+        "Give one word for ocean color.",
+        "Give one word for grass color.",
+    ]
+
+    async def call(prompt: str):
+        return await litellm.acompletion(
+            model=MODEL,
+            **_provider_kwargs(),
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0.1,
+            max_tokens=32,
+        )
+
+    responses = await asyncio.gather(*(call(prompt) for prompt in prompts))
+    # ``content`` may be None; fall back to "" so the slice cannot fail.
+    previews = [
+        (reply.choices[0].message.content or "")[:24] for reply in responses
+    ]
+    print("concurrent:", ", ".join(previews))
+
+
+def main() -> None:
+    """Run scenarios chosen by LITELLM_SMOKE_MODE; unknown values run none."""
+    _configure_provider()
+    mode = os.getenv("LITELLM_SMOKE_MODE", "all").lower()
+    if mode in ("all", "non_streaming"):
+        run_non_streaming()
+    if mode in ("all", "streaming"):
+        run_streaming()
+    if mode in ("all", "concurrent"):
+        asyncio.run(run_concurrent())
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    main()
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-litellm/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-litellm/pyproject.toml
@@ -41,18 +41,18 @@ instruments = [
 litellm = "opentelemetry.instrumentation.litellm:LiteLLMInstrumentor"
 
 [project.urls]
-Homepage = "https://github.com/open-telemetry/opentelemetry-python-contrib/tree/main/instrumentation/opentelemetry-instrumentation-litellm"
-Repository = "https://github.com/open-telemetry/opentelemetry-python-contrib"
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-litellm"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
 
 [tool.hatch.version]
 path = "src/opentelemetry/instrumentation/litellm/version.py"
 
 [tool.hatch.build.targets.sdist]
 include = [
+  "/examples",
   "/src",
   "/tests",
 ]
 
 [tool.hatch.build.targets.wheel]
 packages = ["src/opentelemetry"]
-
diff --git a/...e-instrumentation-litellm/src/opentelemetry/instrumentation/litellm/_embedding_wrapper.py b/...e-instrumentation-litellm/src/opentelemetry/instrumentation/litellm/_embedding_wrapper.py
@@ -16,19 +16,18 @@
 Embedding wrapper for LiteLLM instrumentation.
 """
 
-import logging
 import os
 from typing import Callable
 
 from opentelemetry import context
 from opentelemetry.context import _SUPPRESS_INSTRUMENTATION_KEY
 from opentelemetry.instrumentation.litellm._utils import (
+    apply_litellm_embedding_response_to_invocation,
     create_embedding_invocation_from_litellm,
+    normalize_litellm_embedding_kwargs,
 )
 from opentelemetry.util.genai.types import Error
 
-logger = logging.getLogger(__name__)
-
 
 def _is_instrumentation_enabled() -> bool:
     """Check if instrumentation is enabled via environment variable."""
@@ -53,8 +52,10 @@ def __call__(self, *args, **kwargs):
         if context.get_value(_SUPPRESS_INSTRUMENTATION_KEY):
             return self.original_func(*args, **kwargs)
 
-        # Create invocation object
-        invocation = create_embedding_invocation_from_litellm(**kwargs)
+        request_kwargs = normalize_litellm_embedding_kwargs(
+            self.original_func, args, kwargs
+        )
+        invocation = create_embedding_invocation_from_litellm(**request_kwargs)
 
         # Start Embedding invocation
         self._handler.start_embedding(invocation)
@@ -63,43 +64,9 @@ def __call__(self, *args, **kwargs):
             # Call original function
             response = self.original_func(*args, **kwargs)
 
-            # Extract response metadata
-            if hasattr(response, "model"):
-                invocation.response_model_name = response.model
-
-            # Extract token usage if available
-            if hasattr(response, "usage") and response.usage:
-                invocation.input_tokens = getattr(
-                    response.usage, "prompt_tokens", None
-                )
-                invocation.output_tokens = getattr(
-                    response.usage, "total_tokens", None
-                )
-
-            # Extract embedding dimension count
-            if (
-                hasattr(response, "data")
-                and response.data
-                and len(response.data) > 0
-            ):
-                try:
-                    first_embedding = response.data[0]
-                    # Handle dict response
-                    if (
-                        isinstance(first_embedding, dict)
-                        and "embedding" in first_embedding
-                    ):
-                        embedding_vector = first_embedding["embedding"]
-                        if isinstance(embedding_vector, list):
-                            invocation.dimension_count = len(embedding_vector)
-                    # Handle object response
-                    elif hasattr(first_embedding, "embedding"):
-                        embedding_vector = first_embedding.embedding
-                        if isinstance(embedding_vector, list):
-                            invocation.dimension_count = len(embedding_vector)
-                except (IndexError, AttributeError, KeyError, TypeError):
-                    # If we can't extract dimension, just skip it
-                    pass
+            apply_litellm_embedding_response_to_invocation(
+                invocation, response
+            )
 
             # End Embedding invocation successfully
             self._handler.stop_embedding(invocation)
@@ -131,8 +98,10 @@ async def __call__(self, *args, **kwargs):
         if context.get_value(_SUPPRESS_INSTRUMENTATION_KEY):
             return await self.original_func(*args, **kwargs)
 
-        # Create invocation object
-        invocation = create_embedding_invocation_from_litellm(**kwargs)
+        request_kwargs = normalize_litellm_embedding_kwargs(
+            self.original_func, args, kwargs
+        )
+        invocation = create_embedding_invocation_from_litellm(**request_kwargs)
 
         # Start Embedding invocation
         self._handler.start_embedding(invocation)
@@ -141,43 +110,9 @@ async def __call__(self, *args, **kwargs):
             # Call original function
             response = await self.original_func(*args, **kwargs)
 
-            # Extract response metadata
-            if hasattr(response, "model"):
-                invocation.response_model_name = response.model
-
-            # Extract token usage if available
-            if hasattr(response, "usage") and response.usage:
-                invocation.input_tokens = getattr(
-                    response.usage, "prompt_tokens", None
-                )
-                invocation.output_tokens = getattr(
-                    response.usage, "total_tokens", None
-                )
-
-            # Extract embedding dimension count
-            if (
-                hasattr(response, "data")
-                and response.data
-                and len(response.data) > 0
-            ):
-                try:
-                    first_embedding = response.data[0]
-                    # Handle dict response
-                    if (
-                        isinstance(first_embedding, dict)
-                        and "embedding" in first_embedding
-                    ):
-                        embedding_vector = first_embedding["embedding"]
-                        if isinstance(embedding_vector, list):
-                            invocation.dimension_count = len(embedding_vector)
-                    # Handle object response
-                    elif hasattr(first_embedding, "embedding"):
-                        embedding_vector = first_embedding.embedding
-                        if isinstance(embedding_vector, list):
-                            invocation.dimension_count = len(embedding_vector)
-                except (IndexError, AttributeError, KeyError, TypeError):
-                    # If we can't extract dimension, just skip it
-                    pass
+            apply_litellm_embedding_response_to_invocation(
+                invocation, response
+            )
 
             # End Embedding invocation successfully
             self._handler.stop_embedding(invocation)