InfiniTensor
diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
Lines changed: 0 additions & 1 deletion
diff --git a/include/infinicore/ops/embedding.hpp b/include/infinicore/ops/embedding.hpp
Lines changed: 0 additions & 7 deletions
diff --git a/include/infiniop.h b/include/infiniop.h
Lines changed: 0 additions & 1 deletion
diff --git a/include/infiniop/ops/embedding.h b/include/infiniop/ops/embedding.h
Lines changed: 0 additions & 25 deletions
diff --git a/src/infinicore/ops/embedding/embedding.cc b/src/infinicore/ops/embedding/embedding.cc
Lines changed: 63 additions & 21 deletions
diff --git a/src/infinicore/ops/embedding/embedding_infiniop.cc b/src/infinicore/ops/embedding/embedding_infiniop.cc
Lines changed: 0 additions & 49 deletions
diff --git a/src/infiniop/ops/embedding/operator.cc b/src/infiniop/ops/embedding/operator.cc
Lines changed: 0 additions & 89 deletions
@@ -4,7 +4,6 @@
 #include "ops/add_rms_norm.hpp"
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
-#include "ops/embedding.hpp"
 #include "ops/flash_attention.hpp"
 #include "ops/kv_caching.hpp"
 #include "ops/matmul.hpp"
 
@@ -4,13 +4,6 @@
 
 namespace infinicore::op {
 
-class Embedding {
-public:
-    using schema = void (*)(Tensor, Tensor, Tensor);
-    static void execute(Tensor out, Tensor input, Tensor weight);
-    static common::OpDispatcher<schema> &dispatcher();
-};
-
+/// Out-of-place embedding lookup: returns a tensor holding the rows of
+/// `weight` selected by the indices in `input`. Per the comments in
+/// embedding.cc, `input` is an index tensor of arbitrary shape and `weight`
+/// is a (V, embedding_dim) matrix.
 Tensor embedding(Tensor input, Tensor weight);
+/// Embedding lookup into a caller-provided `out`: for each index in `input`,
+/// one row of `weight` is copied into the corresponding row of `out`.
+/// The implementation in embedding.cc requires `input` to hold I32 or I64
+/// indices and to reside on the CPU; `weight` may be on the CPU or a device.
+/// NOTE(review): `out` is presumably expected on the same device and with the
+/// same dtype as `weight` — confirm against callers.
 void embedding_(Tensor out, Tensor input, Tensor weight);
 } // namespace infinicore::op
@@ -9,7 +9,6 @@
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
 #include "infiniop/ops/dequantize_awq.h"
-#include "infiniop/ops/embedding.h"
 #include "infiniop/ops/flash_attention.h"
 #include "infiniop/ops/gelu.h"
 #include "infiniop/ops/gemm.h"
 
@@ -1,34 +1,15 @@
 #include "infinicore/ops/embedding.hpp"
-#include "../../utils.hpp"
 #include "infinicore/context/context.hpp"
+#include <cstdint>
 #include <cstring>
 #include <stdexcept>
+#include <string>
 
 namespace infinicore::op {
 
-common::OpDispatcher<Embedding::schema> &Embedding::dispatcher() {
-    static common::OpDispatcher<Embedding::schema> dispatcher_;
-    return dispatcher_;
-}
-
-void Embedding::execute(Tensor out, Tensor input, Tensor weight) {
-    // Check that all tensors are on the same device
-    // This is critical: if input is on CPU while out/weight are on GPU,
-    // passing CPU pointer to CUDA kernel will cause memory access errors
-    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, input, weight);
-
-    // Set device context
-    infinicore::context::setDevice(out->device());
-
-    // Use dispatcher to lookup kernel (infiniop implementation)
-    dispatcher().lookup(out->device().getType())(out, input, weight);
-}
-
 Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the indices to extract
                  Tensor weight // Weight: Embedding matrix of floating point type with shape (V, embedding_dim), where V = maximum index + 1
 ) {
     auto input_shape = input->shape();
     auto weight_shape = weight->shape();
+    // auto vocab_size = weight_shape[0];
     auto embedding_dim = weight_shape[1];
 
     // Assign memory to out variables
@@ -41,7 +22,68 @@ Tensor embedding(Tensor input, // LongTensor of arbitrary shape containing the i
 }
 
 void embedding_(Tensor out, Tensor input, Tensor weight) {
-    Embedding::execute(out, input, weight);
+    assert(infinicore::DataType::I64 == input->dtype() || (infinicore::DataType::I32 == input->dtype()));
+    assert(infinicore::Device::Type::CPU == input->device().getType());
+
+    auto input_shape = input->shape();
+    auto weight_shape = weight->shape();
+    auto embedding_dim = weight_shape[1];
+
+    // Calculate the number of token
+    Size counts = 1;
+    for (auto &v : input_shape) {
+        counts *= v;
+    }
+
+    // the bytes of one token
+    const Size bytes = dsize(weight->dtype()) * embedding_dim;
+    auto *weight_ptr = weight->data();
+    auto *out_ptr = out->data();
+
+    // copies
+    if (weight->device().getType() == Device::Type::CPU) {
+        if (infinicore::DataType::I64 == input->dtype()) {
+            const int64_t *input_arr = reinterpret_cast<const int64_t *>(input->data());
+            for (Size i = 0; i < counts; ++i) {
+                int64_t idx = input_arr[i];
+                assert((idx >= 0) && (idx < weight_shape[0]));
+                std::memcpy(out_ptr + i * bytes,
+                            weight_ptr + idx * bytes,
+                            bytes);
+            }
+        } else if (infinicore::DataType::I32 == input->dtype()) {
+            const int32_t *input_arr = reinterpret_cast<const int32_t *>(input->data());
+
+            for (Size i = 0; i < counts; ++i) {
+                int32_t idx = input_arr[i];
+                assert((idx >= 0) && (idx < weight_shape[0]));
+                std::memcpy(out_ptr + i * bytes,
+                            weight_ptr + idx * bytes,
+                            bytes);
+            }
+        }
+
+    } else {
+        if (infinicore::DataType::I64 == input->dtype()) {
+            const int64_t *input_arr = reinterpret_cast<const int64_t *>(input->data());
+            for (Size i = 0; i < counts; ++i) {
+                int64_t idx = input_arr[i];
+                assert((idx >= 0) && (idx < weight_shape[0]));
+                context::memcpyD2D(out_ptr + i * bytes,
+                                   weight_ptr + idx * bytes,
+                                   bytes);
+            }
+        } else if (infinicore::DataType::I32 == input->dtype()) {
+            const int32_t *input_arr = reinterpret_cast<const int32_t *>(input->data());
+            for (Size i = 0; i < counts; ++i) {
+                int32_t idx = input_arr[i];
+                assert((idx >= 0) && (idx < weight_shape[0]));
+                context::memcpyD2D(out_ptr + i * bytes,
+                                   weight_ptr + idx * bytes,
+                                   bytes);
+            }
+        }
+    }
 }
 
 } // namespace infinicore::op