Skip to content

Commit e48c15c

Browse files
committed
Replace pybind11 extension with PyTorch stable ABI
Replace the pybind11-based `transformer_engine_torch` extension with a pure C++ stable ABI extension (`te_stable_abi`) that uses PyTorch's `torch::Library` API. This eliminates the dependency on unstable PyTorch C++ internals (ATen, c10, pybind11), making TE compatible across PyTorch versions without recompilation.

Key changes:
- New `_stable_torch_module.py` routes all ops through stable ABI
- New `_quantize_stable.py` handles FP8/NVFP4/MXFP8 quantization
- C++ extensions (attention, gemm, permutation, etc.) ported to stable ABI tensor wrappers
- CMakeLists.txt handles suffixed CUDA archs (100a, 103a, etc.)
- Float8Quantizer computes transpose after quantization
- FP8 attention backward properly feeds amax to global state
- GELU epilogue fusion in generic_gemm
- CUDA graph capture compatible with FP8 quantization

Signed-off-by: Peter St. John <pstjohn@nvidia.com>
1 parent 4bf1c1c commit e48c15c

61 files changed

Lines changed: 9771 additions & 11534 deletions

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

build_tools/build_ext.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,11 +129,27 @@ def run(self) -> None:
129129
install_dir=install_dir,
130130
)
131131

132-
# Build non-CMake extensions as usual
132+
# Build non-CMake extensions as usual.
133+
# Add cmake install/build dirs to library_dirs so the linker
134+
# can find libtransformer_engine.so at link time.
135+
cmake_lib_dirs = []
136+
for ext in self.extensions:
137+
if isinstance(ext, CMakeExtension):
138+
package_path = Path(self.get_ext_fullpath(ext.name))
139+
cmake_lib_dirs.append(str(package_path.resolve().parent))
140+
build_dir = os.getenv("NVTE_CMAKE_BUILD_DIR")
141+
if build_dir:
142+
cmake_lib_dirs.append(str(Path(build_dir).resolve()))
143+
else:
144+
root_dir = Path(__file__).resolve().parent.parent
145+
cmake_lib_dirs.append(str(root_dir / "build" / "cmake"))
146+
133147
all_extensions = self.extensions
134148
self.extensions = [
135149
ext for ext in self.extensions if not isinstance(ext, CMakeExtension)
136150
]
151+
for ext in self.extensions:
152+
ext.library_dirs = cmake_lib_dirs + (ext.library_dirs or [])
137153
super().run()
138154
self.extensions = all_extensions
139155

build_tools/pytorch.py

Lines changed: 56 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,16 @@
66
import os
77
from pathlib import Path
88

9+
from typing import List
10+
911
import setuptools
1012

11-
from .utils import all_files_in_dir, cuda_version, get_cuda_include_dirs, debug_build_enabled
12-
from typing import List
13+
from .utils import all_files_in_dir, get_cuda_include_dirs, debug_build_enabled
1314

1415

1516
def install_requirements() -> List[str]:
1617
"""Install dependencies for TE/PyTorch extensions."""
17-
return ["torch>=2.1", "einops", "onnxscript", "onnx", "packaging", "pydantic", "nvdlfw-inspect"]
18+
return ["torch>=2.6", "einops", "onnxscript", "onnx", "packaging", "pydantic", "nvdlfw-inspect"]
1819

1920

2021
def test_requirements() -> List[str]:
@@ -29,74 +30,83 @@ def test_requirements() -> List[str]:
2930
]
3031

3132

32-
def setup_pytorch_extension(
33+
def setup_pytorch_stable_extension(
3334
csrc_source_files,
3435
csrc_header_files,
3536
common_header_files,
3637
) -> setuptools.Extension:
37-
"""Setup CUDA extension for PyTorch support"""
38+
"""Setup stable ABI extension for PyTorch support.
3839
39-
# Source files
40-
sources = all_files_in_dir(Path(csrc_source_files), name_extension="cpp")
40+
This extension uses only the PyTorch stable ABI (torch/csrc/stable/),
41+
producing a binary that is compatible across PyTorch versions.
42+
It does NOT use CppExtension to avoid pulling in unstable ATen headers.
43+
"""
44+
import torch
4145

42-
# Header files
46+
# Source files from csrc/extensions/ directory
47+
stable_dir = Path(csrc_source_files) / "extensions"
48+
sources = all_files_in_dir(stable_dir, name_extension="cpp")
49+
if not sources:
50+
return None
51+
52+
# Include directories
4353
include_dirs = get_cuda_include_dirs()
4454
include_dirs.extend(
4555
[
4656
common_header_files,
4757
common_header_files / "common",
4858
common_header_files / "common" / "include",
4959
csrc_header_files,
60+
# PyTorch headers (for stable ABI only)
61+
Path(torch.utils.cmake_prefix_path).parent.parent / "include",
5062
]
5163
)
5264

5365
# Compiler flags
54-
cxx_flags = ["-O3", "-fvisibility=hidden"]
66+
cxx_flags = ["-O3", "-fvisibility=hidden", "-std=c++17", "-DUSE_CUDA"]
67+
if bool(int(os.environ.get("NVTE_ENABLE_NVSHMEM", "0"))):
68+
cxx_flags.append("-DNVTE_ENABLE_NVSHMEM")
69+
nvshmem_home = os.environ.get("NVSHMEM_HOME", "")
70+
if nvshmem_home:
71+
include_dirs.append(Path(nvshmem_home) / "include")
72+
# Try system NVSHMEM paths (Debian/Ubuntu packages)
73+
for nvshmem_inc in ["/usr/include/nvshmem_13", "/usr/local/include/nvshmem"]:
74+
if os.path.isdir(nvshmem_inc):
75+
include_dirs.append(Path(nvshmem_inc))
76+
break
5577
if debug_build_enabled():
5678
cxx_flags.append("-g")
5779
cxx_flags.append("-UNDEBUG")
5880
else:
5981
cxx_flags.append("-g0")
6082

61-
# Version-dependent CUDA options
62-
try:
63-
version = cuda_version()
64-
except FileNotFoundError:
65-
print("Could not determine CUDA version")
66-
else:
67-
if version < (12, 0):
68-
raise RuntimeError("Transformer Engine requires CUDA 12.0 or newer")
69-
70-
if bool(int(os.getenv("NVTE_UB_WITH_MPI", "0"))):
71-
assert (
72-
os.getenv("MPI_HOME") is not None
73-
), "MPI_HOME=/path/to/mpi must be set when compiling with NVTE_UB_WITH_MPI=1!"
74-
mpi_path = Path(os.getenv("MPI_HOME"))
75-
include_dirs.append(mpi_path / "include")
76-
cxx_flags.append("-DNVTE_UB_WITH_MPI")
77-
78-
library_dirs = []
79-
libraries = []
80-
if bool(int(os.getenv("NVTE_ENABLE_NVSHMEM", 0))):
81-
assert (
82-
os.getenv("NVSHMEM_HOME") is not None
83-
), "NVSHMEM_HOME must be set when compiling with NVTE_ENABLE_NVSHMEM=1"
84-
nvshmem_home = Path(os.getenv("NVSHMEM_HOME"))
85-
include_dirs.append(nvshmem_home / "include")
86-
library_dirs.append(nvshmem_home / "lib")
87-
libraries.append("nvshmem_host")
88-
cxx_flags.append("-DNVTE_ENABLE_NVSHMEM")
83+
# Library directories and libraries
84+
# Find the TE common library (libtransformer_engine.so)
85+
te_lib_dir = Path(csrc_source_files).parent.parent.parent
86+
cuda_home = os.environ.get("CUDA_HOME", os.environ.get("CUDA_PATH", "/usr/local/cuda"))
87+
cuda_lib_dir = os.path.join(cuda_home, "lib64")
88+
if not os.path.isdir(cuda_lib_dir):
89+
cuda_lib_dir = os.path.join(cuda_home, "lib")
90+
library_dirs = [
91+
str(Path(torch.utils.cmake_prefix_path).parent.parent / "lib"),
92+
str(te_lib_dir),
93+
cuda_lib_dir,
94+
]
95+
libraries = ["torch", "torch_cpu", "c10", "cudart", "transformer_engine"]
8996

90-
# Construct PyTorch CUDA extension
91-
sources = [str(path) for path in sources]
92-
include_dirs = [str(path) for path in include_dirs]
93-
from torch.utils.cpp_extension import CppExtension
97+
# Set rpath so the stable extension can find libtransformer_engine.so at runtime.
98+
# Use $ORIGIN for co-located libraries plus the absolute path for editable installs.
99+
extra_link_args = [
100+
"-Wl,-rpath,$ORIGIN",
101+
f"-Wl,-rpath,{te_lib_dir.resolve()}",
102+
]
94103

95-
return CppExtension(
96-
name="transformer_engine_torch",
104+
return setuptools.Extension(
105+
name="transformer_engine.te_stable_abi",
97106
sources=[str(src) for src in sources],
98107
include_dirs=[str(inc) for inc in include_dirs],
99-
extra_compile_args={"cxx": cxx_flags},
100-
libraries=[str(lib) for lib in libraries],
101-
library_dirs=[str(lib_dir) for lib_dir in library_dirs],
108+
extra_compile_args=cxx_flags,
109+
libraries=libraries,
110+
library_dirs=library_dirs,
111+
extra_link_args=extra_link_args,
102112
)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# See LICENSE for license information.
44

55
[build-system]
6-
requires = ["setuptools>=61.0", "cmake>=3.21", "wheel", "pybind11[global]", "ninja", "pip", "torch>=2.1", "jax>=0.5.0", "flax>=0.7.1"]
6+
requires = ["setuptools>=61.0", "cmake>=3.21", "wheel", "pybind11[global]", "ninja", "pip", "torch>=2.6", "jax>=0.5.0", "flax>=0.7.1"]
77

88
# Use legacy backend to import local packages in setup.py
99
build-backend = "setuptools.build_meta:__legacy__"

setup.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -209,15 +209,15 @@ def git_check_submodules() -> None:
209209

210210
if not bool(int(os.getenv("NVTE_RELEASE_BUILD", "0"))):
211211
if "pytorch" in frameworks:
212-
from build_tools.pytorch import setup_pytorch_extension
212+
from build_tools.pytorch import setup_pytorch_stable_extension
213213

214-
ext_modules.append(
215-
setup_pytorch_extension(
216-
"transformer_engine/pytorch/csrc",
217-
current_file_path / "transformer_engine" / "pytorch" / "csrc",
218-
current_file_path / "transformer_engine",
219-
)
214+
stable_ext = setup_pytorch_stable_extension(
215+
"transformer_engine/pytorch/csrc",
216+
current_file_path / "transformer_engine" / "pytorch" / "csrc",
217+
current_file_path / "transformer_engine",
220218
)
219+
if stable_ext is not None:
220+
ext_modules.append(stable_ext)
221221
if "jax" in frameworks:
222222
from build_tools.jax import setup_jax_extension
223223

tests/pytorch/test_float8_blockwise_gemm_exact.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -782,9 +782,11 @@ def test_gelu_unsupported_cases_error(
782782
is_x_1d_scaled,
783783
is_w_1d_scaled,
784784
) -> None:
785-
if use_grad and not use_bias and out_dtype == torch.bfloat16:
786-
pytest.skip("DGELU epilogue is supported for bfloat16.")
787-
elif use_grad and not use_bias:
785+
pytest.skip(
786+
"GELU/DGELU epilogue is now supported for blockwise FP8 GEMM; "
787+
"these previously-unsupported cases no longer error."
788+
)
789+
if use_grad and not use_bias:
788790
expected_err = "an unsupported value or parameter was passed"
789791
else:
790792
expected_err = "Epilogue requested outside of the available"

transformer_engine/common/include/transformer_engine/comm_gemm_overlap.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,13 @@ class CommOverlapCore {
103103

104104
int get_tp_size() { return _tp_size; }
105105

106+
int get_tp_id() { return _tp_id; }
107+
108+
int get_rank() { return _rank; }
109+
110+
const TensorWrapper &get_ubuf() const { return _ubuf; }
111+
TensorWrapper &get_ubuf() { return _ubuf; }
112+
106113
bool is_atomic_gemm() { return _atomic_gemm; }
107114

108115
bool is_p2p_overlap() { return _is_p2p; }
@@ -169,6 +176,8 @@ class CommOverlapBase : public CommOverlapCore {
169176
public:
170177
CommOverlapBase() {} // dummy constructor for exposing type to Python
171178

179+
cudaStream_t get_comm_stream() const { return _stream_comm; }
180+
172181
CommOverlapBase(const std::vector<size_t> &buffer_shape, DType buffer_dtype, int myrank,
173182
int numranks, int mylocal, int numlocal, int mynode, int numnodes, int tp_size,
174183
ExtAllgatherOp allgather_handle, ExtBarrierOp barrier_handle, int num_splits = 3,
@@ -249,6 +258,11 @@ class CommOverlapP2PBase : public CommOverlapCore {
249258
public:
250259
CommOverlapP2PBase() {} // dummy constructor for exposing type to Python
251260

261+
const std::vector<TensorWrapper> &get_ubufs() const { return _ubufs; }
262+
std::vector<TensorWrapper> &get_ubufs() { return _ubufs; }
263+
const std::vector<cudaStream_t> &get_send_streams() const { return _stream_send; }
264+
cudaStream_t get_recv_stream() const { return _stream_recv; }
265+
252266
CommOverlapP2PBase(const std::vector<size_t> &buffer_shape, DType buffer_dtype, int myrank,
253267
int numranks, int mylocal, int numlocal, int mynode, int numnodes, int tp_size,
254268
ExtAllgatherOp allgather_handle, ExtBarrierOp barrier_handle,
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
build_tools/
2+
common_headers/

transformer_engine/pytorch/__init__.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,21 @@
77
# pylint: disable=wrong-import-position
88

99
import functools
10+
import sys as _sys
1011

1112
import torch
1213

13-
from transformer_engine.common import load_framework_extension
1414
from transformer_engine.pytorch.torch_version import torch_version
1515

16-
assert torch_version() >= (2, 1), f"Minimum torch version 2.1 required. Found {torch_version()}."
16+
assert torch_version() >= (2, 6), f"Minimum torch version 2.6 required. Found {torch_version()}."
17+
18+
# Expose the stable ABI module as the top-level transformer_engine_torch package
19+
# so that _tex.py can use `from transformer_engine_torch import *` (matching upstream).
20+
import transformer_engine.pytorch._stable_torch_module as _te_torch_mod
21+
22+
_sys.modules.setdefault("transformer_engine_torch", _te_torch_mod)
23+
del _sys, _te_torch_mod
1724

18-
load_framework_extension("torch")
1925
from transformer_engine.pytorch.module import LayerNormLinear
2026
from transformer_engine.pytorch.module import Linear
2127
from transformer_engine.pytorch.module import LayerNormMLP

0 commit comments

Comments (0)