
Commit a1ab674

[feat] add more op (#35)
* move op key generate function to kOpCaps
* fix op desc print
* try fix rms_norm
* Revert "try fix rms_norm" This reverts commit 33b2960.
* add quantization type support by converting them to float
* enable quantization tensor for mulmat in gpu/npu
* fix asan error
* add log and assert
* insert output convert operator after mulmat
* add log
* fix some error in running
* disable permute again
* add log
* add error function
* Revert "add error function" This reverts commit f92ff47.
* add log
* more log
* disable convert op in graph
* wip
* add f16 config for graph
* set f16 precision for f16 graph
* fix override data type
* add comment
* add config flag to enable quantize type
* add log
* more quantized type for cpu and gpu backend
* enable all quant types for cpu and gpu backend
* rename
* wip
* add log
* remove unused functions
* skip permute
* remove get_qnn_op_input_param_count
* fallback to generic_get_op_desc if no op_desc
* revert 'skip permute'
* Revert "revert 'skip permute'" This reverts commit 5761e31.
* wip
* add log
* print qnn tensor type
* add log
* limit the max size of tensor
* add log
* fix tensor size limiter
* small improve on tensor info printer
* disable sqrt and div to pass test-backend-ops for 8 gen 2
* remove debug log in release build
* add log
* skip permute in src
* wip
* disable reshape
* skip mul at decoder start
* wip
* add log
* add qnn_scoped_timer
* add perf tracker in graph
* add cmake options GGML_QNN_ENABLE_PERFORMANCE_TRACKING
* fix flag name
* use milli-second
* wip
* fix comment string
* add file for profiler
* change qnn-cpu to GGML_BACKEND_DEVICE_TYPE_ACCEL, so that we can run tests on cpu
* wip
* profiler: refactoring
* wip
* add implement for print_profile_events
* set-up profiler for graph
* set profiler to graph execute
* pretty print events
* unified log print prefix
* print event count
* enable optrace
* print duration at event end
* wip
* add more detailed soc information
* wip
* move device caps array into qnn-lib.cpp
* remove lib_name in device_context
* move get_graph_key_from_cgraph to graph.cpp
* add override type for tensor key
* use override_type instead of original data type for graph key
* append op type to tensor name to fix error in qwen
* remove todo
* wip
1 parent 525cd2d commit a1ab674

24 files changed (+1385, -704 lines)

ggml/include/ggml-qnn.h

Lines changed: 2 additions & 3 deletions
@@ -1,14 +1,13 @@
 #pragma once
 
-#include "ggml.h"
-
 #include "ggml-backend.h"
+#include "ggml.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define GGML_QNN_NAME "QNN"
+#define GGML_QNN_NAME "qnn"
 #define GGML_QNN_MAX_DEVICES QNN_BACKEND_COUNT
 
 enum QNNBackend {

ggml/src/ggml-qnn/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -42,4 +42,13 @@ target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${
 if(GGML_QNN_ENABLE_CPU_BACKEND)
     message("GGML_QNN_ENABLE_CPU_BACKEND is enabled")
     target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_CPU_BACKEND)
+else()
+    message("GGML_QNN_ENABLE_CPU_BACKEND is disabled")
+endif()
+
+if(GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
+    message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is enabled")
+    target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_ENABLE_PERFORMANCE_TRACKING)
+else()
+    message("GGML_QNN_ENABLE_PERFORMANCE_TRACKING is disabled")
 endif()
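The new GGML_QNN_ENABLE_PERFORMANCE_TRACKING define gates the profiler code this commit adds: the commit message mentions qnn_scoped_timer, a per-graph perf tracker, and millisecond reporting, but the profiler sources are not rendered in this excerpt. Purely as a hypothetical illustration, not the actual implementation, an RAII timer compiled under that flag could look like this:

// Hypothetical sketch - the real qnn_scoped_timer lives in files not shown here;
// the class name, label format, and logging call below are assumptions.
#include <chrono>
#include <cstdio>
#include <string>
#include <utility>

#ifdef GGML_QNN_ENABLE_PERFORMANCE_TRACKING
class scoped_timer_sketch {
  public:
    explicit scoped_timer_sketch(std::string label) :
        _label(std::move(label)), _start(std::chrono::steady_clock::now()) {}

    ~scoped_timer_sketch() {
        const auto   end = std::chrono::steady_clock::now();
        const double ms  = std::chrono::duration<double, std::milli>(end - _start).count();
        // the commit message says durations are reported in milliseconds
        printf("[profiler][%s] duration: %.3f ms\n", _label.c_str(), ms);
    }

  private:
    std::string                           _label;
    std::chrono::steady_clock::time_point _start;
};
#else
// compiles down to nothing when performance tracking is disabled
class scoped_timer_sketch {
  public:
    explicit scoped_timer_sketch(std::string) {}
};
#endif

A graph execute call would then create one of these at the top of its scope so the duration is printed when the call returns.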

ggml/src/ggml-qnn/backend-ops.cpp

Lines changed: 113 additions & 190 deletions
Large diffs are not rendered by default.

ggml/src/ggml-qnn/backend.hpp

Lines changed: 10 additions & 5 deletions
@@ -10,6 +10,7 @@
 #include <unordered_map>
 #include <unordered_set>
 
+#include "convert.hpp"
 #include "ggml-backend.h"
 #include "ggml-qnn.h"
 #include "ggml.h"
@@ -25,26 +26,30 @@ struct ggml_backend_qnn_device_context {
     QNNBackend device;
     size_t threads;
     std::string name;
-    std::string lib_name;
+    std::string description;
 
     // initialize in qnn init
     qnn::qcom_socinfo socinfo = {};
-    uint64_t supported_types;
+    size_t max_tensor_size_in_bytes;
     std::shared_ptr<qnn::qnn_instance> instance;
    std::shared_ptr<qnn::qnn_interface> qnn_interface;
 
-    qnn::qnn_graph_cache_t qnn_graph_cache;
+    qnn::qnn_graph_cache_t                      qnn_graph_cache;
+    std::shared_ptr<qnn::qnn_convert_context_t> convert_context = std::make_shared<qnn::qnn_convert_context_t>();
 
 #ifndef NDEBUG
     std::atomic_uint32_t supported_op_count = 0;
     std::atomic_uint32_t unsupported_op_count = 0;
 #endif
 
+    bool     enable_cpu_dequantize = false;
+    uint64_t supported_types;
+    uint64_t cpu_preprocess_types;
+
     explicit ggml_backend_qnn_device_context(QNNBackend device, size_t threads, const char * name,
-                                             const char * lib_name, uint64_t supported_types) :
+                                             uint64_t supported_types) :
         device(device),
         threads(threads),
         name(name),
-        lib_name(lib_name),
         supported_types(supported_types) {}
 };
ggml/src/ggml-qnn/buffer.hpp

Lines changed: 6 additions & 6 deletions
@@ -69,8 +69,8 @@ using qnn_buffer_ptr = std::shared_ptr<qnn_buffer_interface>;
  */
 class qnn_rpc_buffer : public qnn_buffer_interface {
   public:
-    qnn_rpc_buffer(std::shared_ptr<qnn_instance> qnn_instance, const size_t size, const uint32_t rank,
-                   uint32_t * dimensions, Qnn_DataType_t data_type) :
+    qnn_rpc_buffer(qnn_instance_ptr qnn_instance, const size_t size, const uint32_t rank, uint32_t * dimensions,
+                   Qnn_DataType_t data_type) :
         _size(size),
         _qnn_instance(qnn_instance) {
         _qnn_rpc_buffer = static_cast<uint8_t *>(qnn_instance->alloc_rpcmem(size, alignof(uint8_t *)));
@@ -105,10 +105,10 @@ class qnn_rpc_buffer : public qnn_buffer_interface {
     Qnn_MemHandle_t get_mem_handle() const override { return _qnn_rpc_mem_handle; }
 
   private:
-    size_t _size = 0;
-    uint8_t * _qnn_rpc_buffer = nullptr;
-    Qnn_MemHandle_t _qnn_rpc_mem_handle = nullptr;
-    std::shared_ptr<qnn_instance> _qnn_instance;
+    size_t           _size               = 0;
+    uint8_t *        _qnn_rpc_buffer     = nullptr;
+    Qnn_MemHandle_t  _qnn_rpc_mem_handle = nullptr;
+    qnn_instance_ptr _qnn_instance;
 
     DISABLE_COPY(qnn_rpc_buffer);
     DISABLE_MOVE(qnn_rpc_buffer);
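The constructor and the _qnn_instance member now use the qnn_instance_ptr alias instead of spelling out the smart-pointer type. The alias is defined elsewhere in the backend and is not part of this diff; given how it is used here, it is presumably along these lines:

// Assumed definition - not shown in this commit.
using qnn_instance_ptr = std::shared_ptr<qnn_instance>;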

ggml/src/ggml-qnn/convert.cpp

Lines changed: 155 additions & 0 deletions
#include "convert.hpp"

#include "logger.hpp"

namespace {

size_t get_convert_buffer_size(const qnn::ggml_dimension_array_t & dimensions, ggml_type dst_type) {
    GGML_ASSERT(ggml_blck_size(dst_type) == 1);
    size_t nbytes = ggml_type_size(dst_type);
    for (size_t i = 0; i < GGML_MAX_DIMS; ++i) {
        nbytes *= dimensions[i];  // tight packing
    }

    return nbytes;
}

// Adapted from ggml_backend_blas_mul_mat: use OpenMP when it is available, otherwise fall back to std::async.
// TODO: remove this when we can fall back the convert to the blas backend
#ifdef GGML_USE_OPENMP

void convert_tensor_impl(const ggml_tensor * src, int max_threads,
                         std::shared_ptr<qnn::qnn_mem_buffer_slice> & output_buffer) {
    const auto ne03                = src->ne[3];
    const auto ne02                = src->ne[2];
    const auto ne01                = src->ne[1];
    const auto ne00                = src->ne[0];
    const auto ne_plane            = ne01 * ne00;
    const auto nb03                = src->nb[3];
    const auto nb02                = src->nb[2];
    const auto nb01                = src->nb[1];
    const int  min_cols_per_thread = 4096;
    void *     wdata               = output_buffer->get_buffer();
    const auto to_float            = ggml_get_type_traits(src->type)->to_float;
    GGML_ASSERT(to_float);

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            const void *  x      = (char *) src->data + i02 * nb02 + i03 * nb03;
            float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane;

            const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1);
            const int n_threads           = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1);

#    pragma omp parallel for num_threads(n_threads)
            for (int64_t i01 = 0; i01 < ne01; i01++) {
                to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
            }
        }
    }
}

#else

void convert_tensor_impl(const ggml_tensor * src, int max_threads, std::vector<std::future<void>> & tasks,
                         std::shared_ptr<qnn::qnn_mem_buffer_slice> & output_buffer) {
    const auto ne03                = src->ne[3];
    const auto ne02                = src->ne[2];
    const auto ne01                = src->ne[1];
    const auto ne00                = src->ne[0];
    const auto ne_plane            = ne01 * ne00;
    const auto nb03                = src->nb[3];
    const auto nb02                = src->nb[2];
    const auto nb01                = src->nb[1];
    const int  min_cols_per_thread = 4096;
    void *     wdata               = output_buffer->get_buffer();
    const auto to_float            = ggml_get_type_traits(src->type)->to_float;
    GGML_ASSERT(to_float);

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
            const void *  x      = (char *) src->data + i02 * nb02 + i03 * nb03;
            float * const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane;

            const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1);
            const int n_threads           = std::max(std::min(max_threads, (int) (ne01 / min_rows_per_thread)), 1);

            for (int i = 1; i < n_threads; i++) {
                const int64_t start = i * ne01 / n_threads;
                const int64_t end   = (i + 1) * ne01 / n_threads;
                if (start < end) {
                    tasks.push_back(std::async(std::launch::async, [=]() {
                        for (int64_t i01 = start; i01 < end; i01++) {
                            to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
                        }
                    }));
                }
            }
            {
                // reuse the current thread for the first task
                const int64_t start = 0;
                const int64_t end   = ne01 / n_threads;
                for (int64_t i01 = start; i01 < end; i01++) {
                    to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00);
                }
            }
        }
    }

    // wait for all tasks to finish
    for (auto & task : tasks) {
        task.get();
    }
    tasks.clear();
}

#endif

}  // namespace

namespace qnn {

std::vector<qnn::qnn_buffer_ptr> convert(std::shared_ptr<qnn_convert_context_t> convert_context,
                                         const ggml_tensor_array_t & tensors, ggml_type target_data_type) {
    convert_context->buffers.resize(tensors.size());
    std::vector<qnn::qnn_buffer_ptr> output_buffers(tensors.size());
    for (size_t i = 0; i < tensors.size(); ++i) {
        const ggml_tensor * src = tensors[i];
        if (src->type == target_data_type) {
            continue;
        }

        auto &     data_buffer = convert_context->buffers[i];
        const auto dst_size    = get_convert_buffer_size(src->ne, target_data_type);
        if (!data_buffer || data_buffer->get_size() < dst_size) {
#ifndef NDEBUG
            auto old_size = data_buffer ? data_buffer->get_size() : 0;
            QNN_LOG_DEBUG("create buffer[%d] for tensor %s(%s), old_size: %d, new_size: %d\n", (int) i,
                          ggml_get_name(src), ggml_type_name(src->type), (int) old_size, (int) dst_size);
#endif
            data_buffer = std::make_shared<qnn::qnn_mem_buffer>(dst_size);
        }

        // TODO: add more restrictions to the buffer slice here
        std::shared_ptr<qnn::qnn_mem_buffer_slice> output_buffer =
            std::make_shared<qnn::qnn_mem_buffer_slice>(data_buffer->get_buffer(), dst_size);

        QNN_LOG_DEBUG("convert tensor(%s) from %s to %s, size: %d, n_threads: %d\n", ggml_get_name(src),
                      ggml_type_name(src->type), ggml_type_name(target_data_type), (int) dst_size,
                      convert_context->n_threads);

#ifdef GGML_USE_OPENMP
        convert_tensor_impl(src, convert_context->n_threads, output_buffer);
#else
        convert_tensor_impl(src, convert_context->n_threads, convert_context->tasks, output_buffer);
#endif
        output_buffers[i] = output_buffer;
    }

    return output_buffers;
}

}  // namespace qnn
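The actual call sites live in backend-ops.cpp, whose diff is not rendered above. Purely as a sketch of how the helper is meant to be used (the function below and the way the returned buffers are consumed are assumptions), a mul_mat path could dequantize its sources like this:

// Hypothetical caller - dequantize quantized mul_mat sources to F32 before
// handing them to QNN. ggml_tensor_array_t is assumed to be a std::vector-like
// container of ggml_tensor pointers.
#include "backend.hpp"
#include "convert.hpp"

static void dequantize_mul_mat_srcs(ggml_backend_qnn_device_context * ctx, ggml_tensor * op) {
    qnn::ggml_tensor_array_t srcs = { op->src[0], op->src[1] };

    // Sources already in F32 come back as empty entries; quantized ones are
    // expanded into reusable scratch buffers owned by ctx->convert_context.
    auto buffers = qnn::convert(ctx->convert_context, srcs, GGML_TYPE_F32);

    for (size_t i = 0; i < srcs.size(); ++i) {
        if (buffers[i]) {
            // bind buffers[i]->get_buffer() to the QNN input tensor
            // instead of srcs[i]->data
        }
    }
}

Because the scratch buffers are cached on the device context and only grow when a larger tensor arrives, repeated graph executions reuse the same allocations.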

ggml/src/ggml-qnn/convert.hpp

Lines changed: 26 additions & 0 deletions
#pragma once

#include <future>
#include <memory>
#include <thread>

#include "buffer.hpp"
#include "ggml-qnn.h"
#include "tensor.hpp"
#include "utils.hpp"

namespace qnn {

// see also: ggml_backend_blas_context
struct qnn_convert_context_t {
    int n_threads = std::thread::hardware_concurrency();
    std::vector<std::shared_ptr<qnn_mem_buffer>> buffers;
#ifndef GGML_USE_OPENMP
    std::vector<std::future<void>> tasks;
#endif
};

std::vector<qnn::qnn_buffer_ptr> convert(std::shared_ptr<qnn_convert_context_t> convert_context,
                                         const ggml_tensor_array_t & tensors, ggml_type target_data_type);

}  // namespace qnn
