
Commit a1ab674

[feat] add more op (#35)
* move op key generate function to kOpCaps
* fix op desc print
* try fix rms_norm
* Revert "try fix rms_norm" This reverts commit 33b2960.
* add quantization type support by converting them to float
* enable quantization tensor for mulmat in gpu/npu
* fix asan error
* add log and assert
* insert output convert operator after mulmat
* add log
* fix some error in running
* disable permute again
* add log
* add error function
* Revert "add error function" This reverts commit f92ff47.
* add log
* more log
* disable convert op in graph
* wip
* add f16 config for graph
* set f16 precision for f16 graph
* fix override data type
* add comment
* add config flag to enable quantize type
* add log
* more quantized type for cpu and gpu backend
* enable all quant types for cpu and gpu backend
* rename
* wip
* add log
* remove unused functions
* skip permute
* remove get_qnn_op_input_param_count
* fallback to generic_get_op_desc if no op_desc
* revert 'skip permute'
* Revert "revert 'skip permute'" This reverts commit 5761e31.
* wip
* add log
* print qnn tensor type
* add log
* limit the max size of tensor
* add log
* fix tensor size limiter
* small improve on tensor info printer
* disable sqrt and div to pass test-backend-ops for 8 gen 2
* remove debug log in release build
* add log
* skip permute in src
* wip
* disable reshape
* skip mul at decoder start
* wip
* add log
* add qnn_scoped_timer
* add perf tracker in graph
* add cmake options GGML_QNN_ENABLE_PERFORMANCE_TRACKING
* fix flag name
* use milli-second
* wip
* fix comment string
* add file for profiler
* change qnn-cpu to GGML_BACKEND_DEVICE_TYPE_ACCEL, so that we can run tests on cpu
* wip
* profiler: refactoring
* wip
* add implement for print_profile_events
* set-up profiler for graph
* set profiler to graph execute
* pretty print events
* unified log print prefix
* print event count
* enable optrace
* print duration at event end
* wip
* add more detailed soc information
* wip
* move device caps array into qnn-lib.cpp
* remove lib_name in device_context
* move get_graph_key_from_cgraph to graph.cpp
* add override type for tensor key
* use override_type instead of original data type for graph key
* append op type to tensor name to fix error in qwen
* remove todo
* wip
1 parent 525cd2d commit a1ab674

24 files changed (+1385, -704 lines)

ggml/include/ggml-qnn.h

Lines changed: 2 additions & 3 deletions
@@ -1,14 +1,13 @@
 #pragma once
 
-#include "ggml.h"
-
 #include "ggml-backend.h"
+#include "ggml.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define GGML_QNN_NAME "QNN"
+#define GGML_QNN_NAME "qnn"
 #define GGML_QNN_MAX_DEVICES QNN_BACKEND_COUNT
 
 enum QNNBackend {

ggml/src/ggml-qnn/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -42,4 +42,13 @@ target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${
 if(GGML_QNN_ENABLE_CPU_BACKEND)
     message("GGML_QNN_ENABLE_CPU_BACKEND is enabled")
     target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND)
+else()
+    message("GGML_QNN_ENABLE_CPU_BACKEND is disabled")
+endif()
+
+if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
+    message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled")
+    target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
+else()
+    message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled")
 endif()
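The new GGML_QNN_ENABLE_PERFORMANCE_TRACKING define gates the profiler code this commit adds: the commit message mentions qnn_scoped_timer, a per-graph perf tracker, and millisecond reporting, but the profiler sources are not rendered in this excerpt. Purely as a hypothetical illustration, not the actual implementation, an RAII timer compiled under that flag could look like this:

// Hypothetical sketch - the real qnn_scoped_timer lives in files not shown here;
// the class name, label format, and logging call below are assumptions.
#include <chrono>
#include <cstdio>
#include <string>
#include <utility>

#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
class scoped_timer_sketch {
  public:
    explicit scoped_timer_sketch(std::string label) :
        _label(std::move(label)), _start(std::chrono::steady_clock::now()) {}

    ~scoped_timer_sketch() {
        const auto   end = std::chrono::steady_clock::now();
        const double ms  = std::chrono::duration<double, std::milli>(end - _start).count();
        // the commit message says durations are reported in milliseconds
        printf("[profiler][%s] duration: %.3f ms\n", _label.c_str(), ms);
    }

  private:
    std::string                           _label;
    std::chrono::steady_clock::time_point _start;
};
#else
// compiles down to nothing when performance tracking is disabled
class scoped_timer_sketch {
  public:
    explicit scoped_timer_sketch(std::string) {}
};
#endif

A graph execute call would then create one of these at the top of its scope so the duration is printed when the call returns.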

ggml/src/ggml-qnn/backend-ops.cpp

Lines changed: 113 additions & 190 deletions
Large diffs are not rendered by default.

ggml/src/ggml-qnn/backend.hpp

Lines changed: 10 additions & 5 deletions
@@ -10,6 +10,7 @@
 #include <unordered_map>
 #include <unordered_set>
 
+#include "convert.hpp"
 #include "ggml-backend.h"
 #include "ggml-qnn.h"
 #include "ggml.h"
@@ -25,26 +26,30 @@ struct ggml_backend_qnn_device_context {
     QNNBackend device;
     size_t threads;
     std::string name;
-    std::string lib_name;
+    std::string description;
 
     // initialize in qnn init
     qnn::qcom_socinfo socinfo = {};
-    uint64_t supported_types;
+    size_t max_tensor_size_in_bytes;
     std::shared_ptr<qnn::qnn_instance> instance;
    std::shared_ptr<qnn::qnn_interface> qnn_interface;
 
-    qnn::qnn_graph_cache_t qnn_graph_cache;
+    qnn::qnn_graph_cache_t                      qnn_graph_cache;
+    std::shared_ptr<qnn::qnn_convert_context_t> convert_context = std::make_shared<qnn::qnn_convert_context_t>();
 
 #ifndef NDEBUG
     std::atomic_uint32_t supported_op_count = 0;
     std::atomic_uint32_t unsupported_op_count = 0;
 #endif
 
+    bool     enable_cpu_dequantize = false;
+    uint64_t supported_types;
+    uint64_t cpu_preprocess_types;
+
     explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char * name,
-                                             const char * lib_name, uint64_t supported_types) :
+                                             uint64_t supported_types) :
         device(device),
         threads(threads),
         name(name),
-        lib_name(lib_name),
         supported_types(supported_types) {}
 };
ggml/src/ggml-qnn/buffer.hpp

Lines changed: 6 additions & 6 deletions
@@ -69,8 +69,8 @@ using qnn_buffer_ptr = std::shared_ptr<qnn_buffer_interface>;
  */
 class qnn_rpc_buffer : public qnn_buffer_interface {
   public:
-    qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, const size_t size, const uint32_t rank,
-                   uint32_t * dimensions, Qnn_DataType_t data_type) :
+    qnn_rpc_buffer(qnn_instance_ptr qnn_instance, const size_t size, const uint32_t rank, uint32_t * dimensions,
+                   Qnn_DataType_t data_type) :
         _size(size),
         _qnn_instance(qnn_instance) {
         _qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *)));
@@ -105,10 +105,10 @@ class qnn_rpc_buffer : public qnn_buffer_interface {
     Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; }
 
   private:
-    size_t _size = 0;
-    uint8_t * _qnn_rpc_buffer = nullptr;
-    Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr;
-    std::shared_ptr<qnn_instance> _qnn_instance;
+    size_t           _size               = 0;
+    uint8_t *        _qnn_rpc_buffer     = nullptr;
+    Qnn_MemHandle_t  _qnn_rpc_mem_handle = nullptr;
+    qnn_instance_ptr _qnn_instance;
 
     DISABLE_COPY(qnn_rpc_buffer);
     DISABLE_MOVE(qnn_rpc_buffer);
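The constructor and the _qnn_instance member now use the qnn_instance_ptr alias instead of spelling out the smart-pointer type. The alias is defined elsewhere in the backend and is not part of this diff; given how it is used here, it is presumably along these lines:

// Assumed definition - not shown in this commit.
using qnn_instance_ptr = std::shared_ptr<qnn_instance>;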

ggml/src/ggml-qnn/convert.cpp

Lines changed: 155 additions & 0 deletions
#include "convert.hpp"

#include "logger.hpp"

namespace {

size_t get_convert_buffer_size(const qnn::ggml_dimension_array_t & dimensions, ggml_type dst_type) {
    GGML_ASSERT(ggml_blck_size(dst_type) == 1);
    size_t nbytes = ggml_type_size(dst_type);
    for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
        nbytes *= dimensions[i];  // tight packing
    }

    return nbytes;
}

// Adapted from ggml_backend_blas_mul_mat: use OpenMP when it is available, otherwise fall back to std::async.
// TODO: remove this when we can fall back the convert to the blas backend
#ifdef GGML_USE_OPENMP

void convert_tensor_impl(const ggml_tensor * src, int max_threads,
                         std::shared_ptr<qnn::qnn_mem_buffer_slice> & output_buffer) {
    const auto ne03                = src->ne[3];
    const auto ne02                = src->ne[2];
    const auto ne01                = src->ne[1];
    const auto ne00                = src->ne[0];
    const auto ne_plane            = ne01 * ne00;
    const auto nb03                = src->nb[3];
    const auto nb02                = src->nb[2];
    const auto nb01                = src->nb[1];
    const int  min_cols_per_thread = 4096;
    void *     wdata               = output_buffer->get_buffer();
    const auto to_float            = ggml_get_type_traits(src->type)->to_float;
    GGML_ASSERT(to_float);

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            const void *  x      = (char *) src->data + i02 * nb02 + i03 * nb03;
            float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane;

            const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1);
            const int n_threads           = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1);

#    pragma omp parallel for num_threads(n_threads)
            for (int64_t i01 = 0; i01 < ne01; i01++) {
                to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
            }
        }
    }
}

#else

void convert_tensor_impl(const ggml_tensor * src, int max_threads, std::vector<std::future<void>> & tasks,
                         std::shared_ptr<qnn::qnn_mem_buffer_slice> & output_buffer) {
    const auto ne03                = src->ne[3];
    const auto ne02                = src->ne[2];
    const auto ne01                = src->ne[1];
    const auto ne00                = src->ne[0];
    const auto ne_plane            = ne01 * ne00;
    const auto nb03                = src->nb[3];
    const auto nb02                = src->nb[2];
    const auto nb01                = src->nb[1];
    const int  min_cols_per_thread = 4096;
    void *     wdata               = output_buffer->get_buffer();
    const auto to_float            = ggml_get_type_traits(src->type)->to_float;
    GGML_ASSERT(to_float);

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            const void *  x      = (char *) src->data + i02 * nb02 + i03 * nb03;
            float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane;

            const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1);
            const int n_threads           = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1);

            for (int i = 1; i < n_threads; i++) {
                const int64_t start = i * ne01 / n_threads;
                const int64_t end   = (i + 1) * ne01 / n_threads;
                if (start < end) {
                    tasks.push_back(std::async(std::launch::async, [=]() {
                        for (int64_t i01 = start; i01 < end; i01++) {
                            to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
                        }
                    }));
                }
            }
            {
                // reuse the current thread for the first task
                const int64_t start = 0;
                const int64_t end   = ne01 / n_threads;
                for (int64_t i01 = start; i01 < end; i01++) {
                    to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
                }
            }
        }
    }

    // wait for all tasks to finish
    for (auto & task : tasks) {
        task.get();
    }
    tasks.clear();
}

#endif

}  // namespace

namespace qnn {

std::vector<qnn::qnn_buffer_ptr> convert(std::shared_ptr<qnn_convert_context_t> convert_context,
                                         const ggml_tensor_array_t & tensors, ggml_type target_data_type) {
    convert_context->buffers.resize(tensors.size());
    std::vector<qnn::qnn_buffer_ptr> output_buffers(tensors.size());
    for (size_t i = 0; i < tensors.size(); ++i) {
        const ggml_tensor * src = tensors[i];
        if (src->type == target_data_type) {
            continue;
        }

        auto &     data_buffer = convert_context->buffers[i];
        const auto dst_size    = get_convert_buffer_size(src->ne, target_data_type);
        if (!data_buffer || data_buffer->get_size() < dst_size) {
#ifndef NDEBUG
            auto old_size = data_buffer ? data_buffer->get_size() : 0;
            QNN_LOG_DEBUG("create buffer[%d] for tensor %s(%s), old_size: %d, new_size: %d\n", (int) i,
                          ggml_get_name(src), ggml_type_name(src->type), (int) old_size, (int) dst_size);
#endif
            data_buffer = std::make_shared<qnn::qnn_mem_buffer>(dst_size);
        }

        // TODO: add more restrictions to the buffer slice here
        std::shared_ptr<qnn::qnn_mem_buffer_slice> output_buffer =
            std::make_shared<qnn::qnn_mem_buffer_slice>(data_buffer->get_buffer(), dst_size);

        QNN_LOG_DEBUG("convert tensor(%s) from %s to %s, size: %d, n_threads: %d\n", ggml_get_name(src),
                      ggml_type_name(src->type), ggml_type_name(target_data_type), (int) dst_size,
                      convert_context->n_threads);

#ifdef GGML_USE_OPENMP
        convert_tensor_impl(src, convert_context->n_threads, output_buffer);
#else
        convert_tensor_impl(src, convert_context->n_threads, convert_context->tasks, output_buffer);
#endif
        output_buffers[i] = output_buffer;
    }

    return output_buffers;
}

}  // namespace qnn
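The actual call sites live in backend-ops.cpp, whose diff is not rendered above. Purely as a sketch of how the helper is meant to be used (the function below and the way the returned buffers are consumed are assumptions), a mul_mat path could dequantize its sources like this:

// Hypothetical caller - dequantize quantized mul_mat sources to F32 before
// handing them to QNN. ggml_tensor_array_t is assumed to be a std::vector-like
// container of ggml_tensor pointers.
#include "backend.hpp"
#include "convert.hpp"

static void dequantize_mul_mat_srcs(ggml_backend_qnn_device_context * ctx, ggml_tensor * op) {
    qnn::ggml_tensor_array_t srcs = { op->src[0], op->src[1] };

    // Sources already in F32 come back as empty entries; quantized ones are
    // expanded into reusable scratch buffers owned by ctx->convert_context.
    auto buffers = qnn::convert(ctx->convert_context, srcs, GGML_TYPE_F32);

    for (size_t i = 0; i < srcs.size(); ++i) {
        if (buffers[i]) {
            // bind buffers[i]->get_buffer() to the QNN input tensor
            // instead of srcs[i]->data
        }
    }
}

Because the scratch buffers are cached on the device context and only grow when a larger tensor arrives, repeated graph executions reuse the same allocations.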

ggml/src/ggml-qnn/convert.hpp

Lines changed: 26 additions & 0 deletions
#pragma once

#include <future>
#include <memory>
#include <thread>

#include "buffer.hpp"
#include "ggml-qnn.h"
#include "tensor.hpp"
#include "utils.hpp"

namespace qnn {

// see also: ggml_backend_blas_context
struct qnn_convert_context_t {
    int n_threads = std::thread::hardware_concurrency();
    std::vector<std::shared_ptr<qnn_mem_buffer>> buffers;
#ifndef GGML_USE_OPENMP
    std::vector<std::future<void>> tasks;
#endif
};

std::vector<qnn::qnn_buffer_ptr> convert(std::shared_ptr<qnn_convert_context_t> convert_context,
                                         const ggml_tensor_array_t & tensors, ggml_type target_data_type);

}  // namespace qnn
