
feat: flash attention support for hexagon-npu #45

Merged

merged 91 commits on Jun 18, 2025
Changes from all commits
Commits (91)
0bb7a5f
add flash attn op
chraac May 27, 2025
85156a4
expend src tensor size
chraac May 28, 2025
8744400
add flash attn sources
chraac May 28, 2025
60c79cf
add quantize row functions
chraac May 28, 2025
b908b9e
make a separated file for vec_dot
chraac May 28, 2025
2beda85
wip
chraac May 29, 2025
6a00c93
wip
chraac May 30, 2025
9846b78
refactor: rename quants.hpp includes and add vec_dot to type traits
chraac May 30, 2025
c357a15
add flash_attn impl
chraac May 30, 2025
f63b99e
split vec_scale_f32
chraac May 30, 2025
5cecdd7
move vec_reduction_qf32 to vec_ops
chraac May 31, 2025
0e4ea1e
add vec_scale_f16
chraac May 31, 2025
aae0d33
opt
chraac May 31, 2025
baa1b6b
add vec_mad
chraac May 31, 2025
62bf246
implement vec_mad_f16
chraac May 31, 2025
6f8dd34
opt
chraac May 31, 2025
f2ec1e8
add op template
chraac May 31, 2025
39a8c42
opt
chraac May 31, 2025
5b339d4
add align version
chraac May 31, 2025
c527f2a
enable flash attn
chraac May 31, 2025
0d40cfa
wip
chraac May 31, 2025
3a1c0b0
log print improve
chraac Jun 1, 2025
5e01705
add profiler log
chraac Jun 1, 2025
9175ec6
wip
chraac Jun 1, 2025
8bba769
wip
chraac Jun 1, 2025
300afef
add multi sub proc perf tracker
chraac Jun 1, 2025
3e7662b
increase log buffer
chraac Jun 2, 2025
9fc46be
remove sub prov pcycle
chraac Jun 2, 2025
56bcc76
wip
chraac Jun 2, 2025
cdeb534
wip
chraac Jun 2, 2025
0967eea
add prefetch for vec_dot
chraac Jun 2, 2025
f2582dc
wip
chraac Jun 2, 2025
24c6e86
wip
chraac Jun 2, 2025
ee5c19d
opt f16 vec dot
chraac Jun 2, 2025
93e721f
opt f16 vecdot
chraac Jun 3, 2025
8d4dad3
reuse vec_dot_product_impl in vec dot f32
chraac Jun 3, 2025
e7c9e2a
small opt to unblock pipeline
chraac Jun 3, 2025
27be1eb
opt on aligned address
chraac Jun 3, 2025
f075a4c
Revert "opt on aligned address"
chraac Jun 3, 2025
ea582fe
add profiler log at thread_pool
chraac Jun 3, 2025
594644d
wip
chraac Jun 3, 2025
3ccedc3
invalidate all...
chraac Jun 3, 2025
62d6790
Reapply "opt on aligned address"
chraac Jun 3, 2025
c1583b4
add is_constant for tensor config
chraac Jun 3, 2025
2ba39f2
disable align tensor opt in mul_mat
chraac Jun 3, 2025
b803a63
wip
chraac Jun 4, 2025
b8c6462
wip
chraac Jun 4, 2025
830e67a
vec_scale_impl: unrolling the loop
chraac Jun 4, 2025
18c975b
wip
chraac Jun 4, 2025
559e5b8
wip
chraac Jun 4, 2025
c9ee6c8
replace reinterpret_cast with direct pointer access for write/read bu…
chraac Jun 4, 2025
56d9f3a
add fetch
chraac Jun 4, 2025
9dece44
wip
chraac Jun 4, 2025
30dcf58
wip
chraac Jun 4, 2025
5701374
wip
chraac Jun 4, 2025
9ad1165
add log
chraac Jun 5, 2025
627b504
check tensor shape at flash_attn
chraac Jun 6, 2025
ec4ec34
wip
chraac Jun 6, 2025
c5ec077
wip
chraac Jun 6, 2025
bc6db4f
fix: update tensor type handling in flash_attn_impl
chraac Jun 6, 2025
a3ac36a
wip
chraac Jun 6, 2025
2e68f1c
fix: align cache size
chraac Jun 7, 2025
52d6ee1
fix: qf16->hf
chraac Jun 7, 2025
4231c5e
fix: swap order of elements in vector combine for correct scaling
chraac Jun 8, 2025
0734627
fix: opt f16 scale and mad
chraac Jun 8, 2025
ddac620
Merge branch 'dev-refactoring' into dev-flash-attn
chraac Jun 9, 2025
b676497
fix leftover fetch
chraac Jun 9, 2025
5182ba1
wip
chraac Jun 9, 2025
8320d7c
load into vector pair
chraac Jun 9, 2025
efd5254
opt cache size calculation in flash_attn_impl
chraac Jun 9, 2025
7db0816
refactoring: hold vtcm at thread local object
chraac Jun 10, 2025
8639ea4
wip
chraac Jun 10, 2025
7d49c49
add profiler log
chraac Jun 10, 2025
49a6c27
mark tensors as modified
chraac Jun 10, 2025
0a8ff2b
restrict tensor invalidation to the first thread in compute_impl
chraac Jun 10, 2025
8da9e8e
Revert "restrict tensor invalidation to the first thread in compute_i…
chraac Jun 10, 2025
e2ba224
invalidate last tensor in compute_impl
chraac Jun 10, 2025
faa47bd
invalidate last tensor in compute function
chraac Jun 10, 2025
0347ee7
wip
chraac Jun 10, 2025
8c6e298
refactor dequantize_row_q4_0 to simplify vector alignment
chraac Jun 10, 2025
54b3c2a
wip
chraac Jun 11, 2025
0809df6
refactoring: move VTCM quota calculation to thread pool
chraac Jun 11, 2025
e7a92ba
wip
chraac Jun 11, 2025
095c811
fix: correct condition check for HEXAGON_SDK_ROOT existence
chraac Jun 13, 2025
8793f0c
wip
chraac Jun 13, 2025
28ec32e
wip
chraac Jun 13, 2025
62b77a8
wip
chraac Jun 13, 2025
a651dcf
wip
chraac Jun 13, 2025
2265fe4
fix: update condition checks match the naming
chraac Jun 13, 2025
6384928
fix: improve tensor handling checks and logging in graph and operatio…
chraac Jun 14, 2025
37d97e5
wip
chraac Jun 17, 2025
2 changes: 2 additions & 0 deletions ggml/src/ggml-qnn/npu/CMakeLists.txt
@@ -3,6 +3,8 @@ cmake_policy(SET CMP0115 OLD)

if(DEFINED ENV{HEXAGON_SDK_ROOT})
set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT})
message("HEXAGON_SDK_ROOT (from environment): ${HEXAGON_SDK_ROOT}")
elseif(DEFINED HEXAGON_SDK_ROOT)
message("HEXAGON_SDK_ROOT: ${HEXAGON_SDK_ROOT}")
else()
message(FATAL_ERROR "HEXAGON_SDK_ROOT not defined")
28 changes: 14 additions & 14 deletions ggml/src/ggml-qnn/npu/device/device.cpp
@@ -9,10 +9,10 @@
#include "graph.hpp"
#include "hexagon_npu.h"
#include "op_impl.hpp"
#include "quants.hpp"
#include "remote.h"
#include "tensor.hpp"
#include "thread_pool.hpp"
#include "type_traits.hpp"
#include "util.hpp"

namespace {
@@ -124,21 +124,20 @@ int npu_device_close(remote_handle64 h) {

AEEResult npu_device_device_get_alignment(remote_handle64 _h, uint32_t * alignment) {
NPU_UNUSED(_h);
*alignment = sizeof(HVX_Vector);
*alignment = sizeof(HVX_VectorPair);
return AEE_SUCCESS;
}

AEEResult npu_device_device_support_op(remote_handle64 _h, const npu_device_tensor_spec * src0,
const npu_device_tensor_spec * src1, const npu_device_tensor_spec * dst,
npu_device_tensor_op op, boolean * is_supported) {
AEEResult npu_device_device_support_op(remote_handle64 _h, npu_device_tensor_op op, const npu_device_tensor_spec * dst,
const npu_device_tensor_spec * srcs, int srcsLen, boolean * is_supported) {
NPU_UNUSED(_h);

if (!src0 || !src1 || !dst || !is_supported) {
if (!srcs || srcsLen <= 0 || !dst || !is_supported) {
DEVICE_LOG_ERROR("npu_device_device_support_op: Invalid arguments");
return AEE_EINVARGS;
}

*is_supported = hexagon::support_op(*src0, *src1, *dst, op);
*is_supported = hexagon::support_op(op, dst, srcs, srcsLen);
return AEE_SUCCESS;
}
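As context for the alignment change above: HVX_VectorPair is simply two HVX vectors back to back, so reporting sizeof(HVX_VectorPair) doubles the advertised buffer alignment; in the 128-byte HVX mode this backend builds for (an assumption here), that is 256 bytes instead of 128. A minimal sketch, compilable only with the Hexagon toolchain:

#include <hexagon_types.h>  // Hexagon SDK header providing HVX_Vector / HVX_VectorPair

// A vector pair is always exactly two vectors, so the new alignment is twice the old one
// (256 vs. 128 bytes when built for 128-byte HVX vectors, which is an assumption about the build mode).
static_assert(sizeof(HVX_VectorPair) == 2 * sizeof(HVX_Vector), "a pair must be two HVX vectors");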

@@ -208,19 +207,20 @@ AEEResult npu_device_graph_set_tensor_with_param(remote_handle64 _h, npu_device_
int tensor_paramsLen) {
NPU_UNUSED(_h);
auto * graph = graph_from_handle(graph_handle);
if (!graph || !tensor_handles || tensor_handlesLen <= 0 || !tensor_params ||
tensor_handlesLen != tensor_paramsLen) {
if (!graph || tensor_handlesLen != tensor_paramsLen || tensor_handlesLen < 0) {
return AEE_EINVHANDLE;
}

graph->set_tensor(tensor_handles, tensor_handlesLen);
for (int i = 0; i < tensor_handlesLen; ++i) {
auto * tensor = tensor_from_handle(tensor_handles[i]);
if (tensor) {
tensor->update_config(tensor_params[i]);
if (tensor_params && tensor_handles) {
for (int i = 0; i < tensor_handlesLen; ++i) {
auto * tensor = tensor_from_handle(tensor_handles[i]);
if (tensor) {
tensor->update_config(tensor_params[i]);
}
}
}

graph->set_tensor(tensor_handles, tensor_handlesLen);
return AEE_SUCCESS;
}

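The device.cpp changes above also replace the fixed src0/src1 parameters of npu_device_device_support_op with an array of source specs plus a length, which is what lets flash attention pass more than two inputs. Below is a hedged host-side sketch of how a caller might use the new signature; the op enum name and the four-input (Q, K, V, mask) layout are illustrative assumptions, not code from this PR:

// Hypothetical caller; only the npu_device_device_support_op signature is taken from the diff above.
static bool flash_attn_supported(remote_handle64 h,
                                 const npu_device_tensor_spec & q, const npu_device_tensor_spec & k,
                                 const npu_device_tensor_spec & v, const npu_device_tensor_spec & mask,
                                 const npu_device_tensor_spec & dst) {
    const npu_device_tensor_spec srcs[] = { q, k, v, mask };  // all source tensors in one array
    const int srcs_len = (int) (sizeof(srcs) / sizeof(srcs[0]));
    boolean supported = 0;
    // NPU_DEVICE_OP_FLASH_ATTN is a placeholder name; the real enum value lives in hexagon_npu.h.
    if (npu_device_device_support_op(h, NPU_DEVICE_OP_FLASH_ATTN, &dst, srcs, srcs_len, &supported) != AEE_SUCCESS) {
        return false;
    }
    return supported != 0;
}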
29 changes: 17 additions & 12 deletions ggml/src/ggml-qnn/npu/device/graph.cpp
@@ -10,8 +10,7 @@
namespace hexagon {

graph::graph() noexcept {
_vtcm_quota_size = hexagon::vtcm_mem::get_avail_block_size(); // TODO: move to device init?
DEVICE_LOG_DEBUG("graph(%p) created: vtcm quota size: %zu\n", (void *) this, _vtcm_quota_size);
DEVICE_LOG_DEBUG("graph(%p) created\n", (void *) this);
}

graph::~graph() noexcept {
@@ -20,9 +19,10 @@ graph::~graph() noexcept {
}

void graph::set_tensor(const npu_device_tensor_handle_t * tensors, int tensor_count) {
if (tensor_count <= 0) {
if (tensor_count <= 0 || !tensors) {
_tensors.reset();
_tensor_count = 0;
DEVICE_LOG_DEBUG("graph(%p) set_tensor: no tensors to set\n", (void *) this);
return;
}

@@ -50,21 +50,27 @@ bool graph::compute(default_thread_pool * thread_pool, const float * f16_to_f32_
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]compute", (void *) this);
_f16_to_f32_table = f16_to_f32_table;
if (thread_pool) {
thread_pool->sync_execute(reinterpret_cast<default_thread_pool::task_type>(&graph::thread_pool_task), this);
thread_pool->sync_execute(&graph::thread_pool_task, this);
} else {
compute_impl(nullptr, 0, 1);
default_thread_pool::thread_params param = {
0, 1, nullptr, hexagon::vtcm_mem::get_avail_block_size()
}; // TODO: should have a better way to initialize thread_params

compute_impl(nullptr, &param);
}

_tensors[_tensor_count - 1]->invalidate();
_f16_to_f32_table = nullptr;
return true;
}

void graph::thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph) {
graph->compute_impl(pool, thread_idx, thread_count);
void graph::thread_pool_task(default_thread_pool * pool, default_thread_pool::thread_params * thread_params,
void * graph) {
reinterpret_cast<hexagon::graph *>(graph)->compute_impl(pool, thread_params);
}

void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count) {
hexagon::compute_params params = { thread_idx, thread_count, _vtcm_quota_size / thread_count, _f16_to_f32_table };
void graph::compute_impl(default_thread_pool * pool, default_thread_pool::thread_params * thread_params) {
hexagon::compute_params params = { thread_params, _f16_to_f32_table };

for (size_t i = 0; i < _tensor_count; ++i) {
auto * dst = _tensors[i];
@@ -78,13 +84,12 @@ void graph::compute_impl(default_thread_pool * pool, size_t thread_idx, size_t t
DEVICE_LOG_ERROR("graph(%p) tensor[%zu] op %d compute failed\n", (void *) this, i, op);
}

DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu", (void *) this, thread_idx);

const bool should_sync = requires_thread_barrier(op);
if (pool && should_sync && i < _tensor_count - 1) {
DEVICE_SCOPED_PERFORMANCE_TRACKER("[%p]sync_thread, tidx: %zu, tensor[%zu/%zu]", (void *) this,
params.get_thread_index(), i, _tensor_count);
pool->sync_thread();
}
dst->invalidate();
}
}

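The graph.cpp changes above fold the old (thread_idx, thread_count) pair and the per-thread VTCM quota into a single thread_params object handed to each worker (see the { 0, 1, nullptr, hexagon::vtcm_mem::get_avail_block_size() } initializer on the single-threaded path). Here is a minimal sketch of how a pool could pass such a per-worker block, assuming a plain std::thread pool; the field names and the void-pointer first argument are guesses, not the actual thread_pool.hpp:

#include <cstddef>
#include <thread>
#include <vector>

// Assumed shape of the per-thread parameter block, mirroring the 4-value initializer above.
struct thread_params_sketch {
    size_t tidx;              // this worker's index
    size_t tcnt;              // total number of workers
    void * pool;              // owning pool, or nullptr on the single-threaded path
    size_t vtcm_quota_bytes;  // share of VTCM this worker may use
};

using task_type = void (*)(void * pool, thread_params_sketch * params, void * user_data);

// Stand-in for sync_execute: give every worker its own params block, then wait for all of them.
inline void sync_execute_sketch(task_type task, void * user_data, size_t n_threads, size_t vtcm_total) {
    std::vector<std::thread> workers;
    for (size_t i = 0; i < n_threads; ++i) {
        workers.emplace_back([=]() {
            thread_params_sketch p = { i, n_threads, nullptr, vtcm_total / n_threads };
            task(nullptr, &p, user_data);
        });
    }
    for (auto & w : workers) {
        w.join();
    }
}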
6 changes: 3 additions & 3 deletions ggml/src/ggml-qnn/npu/device/graph.hpp
@@ -20,12 +20,12 @@ class graph {
bool compute(default_thread_pool * thread_pool, const float * f16_to_f32_table);

private:
static void thread_pool_task(default_thread_pool * pool, size_t thread_idx, size_t thread_count, graph * graph);
void compute_impl(default_thread_pool * pool, size_t thread_idx, size_t thread_count);
static void thread_pool_task(default_thread_pool * pool, default_thread_pool::thread_params * thread_params,
void * graph);
void compute_impl(default_thread_pool * pool, default_thread_pool::thread_params * thread_params);

std::unique_ptr<tensor *[]> _tensors;
size_t _tensor_count = 0;
size_t _vtcm_quota_size = 0;
const float * _f16_to_f32_table = nullptr;

DISABLE_COPY_AND_MOVE(graph);