PS D:\Python code> & C:/Users/johnx/AppData/Local/Programs/Python/Python312/python.exe d:/models/zhipu/chatglm-6b-int4/ChatGLMint4.py
Symbol cudaLaunchKernel not found in C:\Program Files (x86)\NVIDIA Corporation\PhysX\Common\cudart64_65.dll
No compiled kernel found.
Compiling kernels : C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels_parallel.c
Compiling gcc -O3 -fPIC -pthread -fopenmp -std=c99 C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels_parallel.c -shared -o C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels_parallel.so
Kernels compiled : C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels_parallel.so
Load kernel : C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels_parallel.so
Setting CPU quantization kernel threads to 8
Using quantization cache
Applying quantization to glm layers
No compiled kernel found.
Compiling kernels : C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels_parallel.c
Compiling gcc -O3 -fPIC -pthread -fopenmp -std=c99 C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels_parallel.c -shared -o C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels_parallel.so
C:/TDM-GCC-64/bin/../lib/gcc/x86_64-w64-mingw32/10.3.0/../../../../x86_64-w64-mingw32/bin/ld.exe: cannot open output file C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels_parallel.so: Permission denied
collect2.exe: error: ld returned 1 exit status
Compile failed, using default cpu kernel code.
Compiling gcc -O3 -fPIC -std=c99 C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels.c -shared -o C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels.so
Kernels compiled : C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels.so
Load kernel : C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\quantization_kernels.so
The dtype of attention mask (torch.int64) is not bool
Traceback (most recent call last):
  File "d:\models\zhipu\chatglm-6b-int4\ChatGLMint4.py", line 10, in <module>
    response, history = model.chat(tokenizer, message, history=[])
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\modeling_chatglm.py", line 1286, in chat
    outputs = self.generate(**inputs, **gen_kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\generation\utils.py", line 1914, in generate
    result = self._sample(
             ^^^^^^^^^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\generation\utils.py", line 2651, in _sample
    outputs = self(
              ^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\modeling_chatglm.py", line 1191, in forward
    transformer_outputs = self.transformer(
                          ^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\modeling_chatglm.py", line 997, in forward
    layer_ret = layer(
                ^^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\modeling_chatglm.py", line 628, in forward
    attention_outputs = self.attention(
                        ^^^^^^^^^^^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\AppData\Local\Programs\Python\Python312\Lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\modeling_chatglm.py", line 475, in forward
    context_layer, present, attention_probs = attention_fn(
                                              ^^^^^^^^^^^^^
  File "C:\Users\johnx\.cache\huggingface\modules\transformers_modules\chatglm-6b-int4\modeling_chatglm.py", line 257, in attention_fn
    key_layer = torch.cat((past_key, key_layer), dim=0)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: expected Tensor as element 0 in argument 0, but got tuple
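For reference, a minimal sketch of what ChatGLMint4.py presumably contains, reconstructed from the traceback above. Only the model path and the model.chat(tokenizer, message, history=[]) call at line 10 are taken from the log; the prompt text, the .float() cast for CPU inference, and the exact from_pretrained arguments are assumptions based on the usual chatglm-6b-int4 usage pattern, not confirmed by the output.

    # Sketch only: reconstructed from the traceback, not the actual ChatGLMint4.py.
    from transformers import AutoModel, AutoTokenizer

    MODEL_PATH = "d:/models/zhipu/chatglm-6b-int4"  # local copy of the int4 model (path taken from the log)

    # ChatGLM ships its own modeling code, so trust_remote_code=True is required.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    # .float() is the usual cast for CPU inference with this model (assumption; the log shows CPU kernels being used).
    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).float()
    model = model.eval()

    message = "Hello"  # assumed prompt; any string reaches the same code path
    # This is the call at line 10 in the traceback, where the TypeError is raised.
    response, history = model.chat(tokenizer, message, history=[])
    print(response)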