Skip to content

[Issue]: Alloca null check returns true even though memcpy is successful #281

Closed
@simeonschaub

Description

@simeonschaub

Problem Description

kernel.ll:

define amdgpu_kernel void @alloca_kernel(ptr addrspace(1) %C, ptr addrspace(1) %A, ptr addrspace(1) %nulls) {
conversion:
  %0 = alloca [2 x i64], align 8, addrspace(5)

  %1 = icmp eq ptr addrspace(5) %0, null
  %2 = zext i1 %1 to i64
  store i64 %2, ptr addrspace(1) %nulls, align 8

  call void @llvm.memcpy.p1.p5.i64(ptr addrspace(5) noundef align 8 dereferenceable(16) %0, ptr addrspace(1) noundef nonnull align 8 dereferenceable(16) %A, i64 16, i1 false)
  %3 = load double, ptr addrspace(5) %0, align 8
  %4 = getelementptr inbounds i8, ptr addrspace(5) %0, i32 8
  %5 = load double, ptr addrspace(5) %4, align 8
  store double %3, ptr addrspace(1) %C, align 8
  %6 = getelementptr inbounds [2 x double], ptr addrspace(1) %C, i64 0, i64 1
  store double %5, ptr addrspace(1) %6, align 8
  ret void
}

main.cpp:

#include <hip/hip_runtime.h>
#include <hip/hiprtc.h>
#include <iostream>
#include <vector>
#include <cassert>
#include <cstdint>

int main() {
    hipModule_t module;
    hipFunction_t kernel;

    const char* hsaco_path = "kernel.hsaco";
    hipError_t err = hipModuleLoad(&module, hsaco_path);
    assert(err == hipSuccess);

    err = hipModuleGetFunction(&kernel, module, "alloca_kernel");
    assert(err == hipSuccess);

    // Allocate GPU memory for A and C device arrays
    constexpr int64_t lenA = 2;
    constexpr int64_t lenC = 2;

    void* d_A;
    void* d_C;
    void* nulls;
    hipMalloc(&d_A, lenA * sizeof(double) * 1); // 1 complex double (2 doubles)
    hipMalloc(&d_C, lenC * sizeof(double) * 1); // 1 complex double (2 doubles)
    hipMalloc(&nulls, sizeof(int64_t)); // 1 i64

    double h_init[2] = {3.14, 2.71};
    hipMemcpy(d_A, h_init, sizeof(h_init), hipMemcpyHostToDevice);

    // Set up argument array
    void* args[] = {
        &d_C,
        &d_A,
        &nulls,
    };

    // Launch kernel
    dim3 gridDim(1);
    dim3 blockDim(1);

    err = hipModuleLaunchKernel(
        kernel,
        gridDim.x, gridDim.y, gridDim.z,
        blockDim.x, blockDim.y, blockDim.z,
        0, nullptr,
        args,
        nullptr
    );
    assert(err == hipSuccess);

    // Copy result back and inspect with hipMemcpy
    std::vector<double> result(2);
    hipMemcpy(result.data(), d_C, sizeof(double) * 2, hipMemcpyDeviceToHost);
    std::cout << "Result: (" << result[0] << ", " << result[1] << ")\n";
    int64_t nulls_host[1];
    hipMemcpy(nulls_host, nulls, sizeof(int64_t), hipMemcpyDeviceToHost);
    std::cout << "Nulls: " << *nulls_host << "\n";

    // Cleanup
    hipFree(d_A);
    hipFree(d_C);
    hipFree(nulls);
    hipModuleUnload(module);

    return 0;
}

Compiling and running with:

/usr/lib64/rocm/llvm/bin/clang --target=amdgcn-amd-amdhsa -mcpu=gfx1150 -O1 -x ir kernel.ll -c -o kernel.o &&                                                     ✔ 
/usr/lib64/rocm/llvm/bin/ld.lld -shared -o kernel.hsaco kernel.o &&
hipcc main.cpp -o main &&
./main

I get the following output:

Result: (3.14, 2.71)
Nulls: 1

So the result is copied correctly, but the icmp eq ptr addrspace(5) %0, null seems to return 1 which is confusing me. Note that if I change -O1 to -O0 I get Nulls: 0 as expected, could there be something wrong with one of the optimization passes?

Original issue is JuliaGPU/AMDGPU.jl#780, reduced to a self-contained HIP example with some help from ChatGPT.

Operating System

Fedora 42 (Workstation Edition)

CPU

AMD Ryzen AI 7 PRO 360 w/ Radeon 880M

GPU

Radeon 880M iGPU (gfx1150)

ROCm Version

ROCm 6.3.42133-0

ROCm Component

llvm-project

Steps to Reproduce

No response

(Optional for Linux users) Output of /opt/rocm/bin/rocminfo --support

ROCk module is loaded
=====================    
HSA System Attributes    
=====================    
Runtime Version:         1.1
Runtime Ext Version:     1.6
System Timestamp Freq.:  1000.000000MHz
Sig. Max Wait Duration:  18446744073709551615 (0xFFFFFFFFFFFFFFFF) (timestamp count)
Machine Model:           LARGE                              
System Endianness:       LITTLE                             
Mwaitx:                  DISABLED
DMAbuf Support:          YES

==========               
HSA Agents               
==========               
*******                  
Agent 1                  
*******                  
  Name:                    AMD Ryzen AI 7 PRO 360 w/ Radeon 880M
  Uuid:                    CPU-XX                             
  Marketing Name:          AMD Ryzen AI 7 PRO 360 w/ Radeon 880M
  Vendor Name:             CPU                                
  Feature:                 None specified                     
  Profile:                 FULL_PROFILE                       
  Float Round Mode:        NEAR                               
  Max Queue Number:        0(0x0)                             
  Queue Min Size:          0(0x0)                             
  Queue Max Size:          0(0x0)                             
  Queue Type:              MULTI                              
  Node:                    0                                  
  Device Type:             CPU                                
  Cache Info:              
    L1:                      49152(0xc000) KB                   
  Chip ID:                 0(0x0)                             
  ASIC Revision:           0(0x0)                             
  Cacheline Size:          64(0x40)                           
  Max Clock Freq. (MHz):   5090                               
  BDFID:                   0                                  
  Internal Node ID:        0                                  
  Compute Unit:            16                                 
  SIMDs per CU:            0                                  
  Shader Engines:          0                                  
  Shader Arrs. per Eng.:   0                                  
  WatchPts on Addr. Ranges:1                                  
  Memory Properties:       
  Features:                None
  Pool Info:               
    Pool 1                   
      Segment:                 GLOBAL; FLAGS: FINE GRAINED        
      Size:                    31707724(0x1e3d24c) KB             
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
    Pool 2                   
      Segment:                 GLOBAL; FLAGS: EXTENDED FINE GRAINED
      Size:                    31707724(0x1e3d24c) KB             
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
    Pool 3                   
      Segment:                 GLOBAL; FLAGS: KERNARG, FINE GRAINED
      Size:                    31707724(0x1e3d24c) KB             
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
    Pool 4                   
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED      
      Size:                    31707724(0x1e3d24c) KB             
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:4KB                                
      Alloc Alignment:         4KB                                
      Accessible by all:       TRUE                               
  ISA Info:                
*******                  
Agent 2                  
*******                  
  Name:                    gfx1150                            
  Uuid:                    GPU-XX                             
  Marketing Name:          AMD Radeon 890M Graphics           
  Vendor Name:             AMD                                
  Feature:                 KERNEL_DISPATCH                    
  Profile:                 BASE_PROFILE                       
  Float Round Mode:        NEAR                               
  Max Queue Number:        128(0x80)                          
  Queue Min Size:          64(0x40)                           
  Queue Max Size:          131072(0x20000)                    
  Queue Type:              MULTI                              
  Node:                    1                                  
  Device Type:             GPU                                
  Cache Info:              
    L1:                      32(0x20) KB                        
    L2:                      2048(0x800) KB                     
  Chip ID:                 5390(0x150e)                       
  ASIC Revision:           4(0x4)                             
  Cacheline Size:          128(0x80)                          
  Max Clock Freq. (MHz):   2799                               
  BDFID:                   49920                              
  Internal Node ID:        1                                  
  Compute Unit:            12                                 
  SIMDs per CU:            2                                  
  Shader Engines:          1                                  
  Shader Arrs. per Eng.:   2                                  
  WatchPts on Addr. Ranges:4                                  
  Coherent Host Access:    FALSE                              
  Memory Properties:       APU
  Features:                KERNEL_DISPATCH 
  Fast F16 Operation:      TRUE                               
  Wavefront Size:          32(0x20)                           
  Workgroup Max Size:      1024(0x400)                        
  Workgroup Max Size per Dimension:
    x                        1024(0x400)                        
    y                        1024(0x400)                        
    z                        1024(0x400)                        
  Max Waves Per CU:        32(0x20)                           
  Max Work-item Per CU:    1024(0x400)                        
  Grid Max Size:           4294967295(0xffffffff)             
  Grid Max Size per Dimension:
    x                        4294967295(0xffffffff)             
    y                        4294967295(0xffffffff)             
    z                        4294967295(0xffffffff)             
  Max fbarriers/Workgrp:   32                                 
  Packet Processor uCode:: 29                                 
  SDMA engine uCode::      11                                 
  IOMMU Support::          None                               
  Pool Info:               
    Pool 1                   
      Segment:                 GLOBAL; FLAGS: COARSE GRAINED      
      Size:                    15853860(0xf1e924) KB              
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:2048KB                             
      Alloc Alignment:         4KB                                
      Accessible by all:       FALSE                              
    Pool 2                   
      Segment:                 GLOBAL; FLAGS: EXTENDED FINE GRAINED
      Size:                    15853860(0xf1e924) KB              
      Allocatable:             TRUE                               
      Alloc Granule:           4KB                                
      Alloc Recommended Granule:2048KB                             
      Alloc Alignment:         4KB                                
      Accessible by all:       FALSE                              
    Pool 3                   
      Segment:                 GROUP                              
      Size:                    64(0x40) KB                        
      Allocatable:             FALSE                              
      Alloc Granule:           0KB                                
      Alloc Recommended Granule:0KB                                
      Alloc Alignment:         0KB                                
      Accessible by all:       FALSE                              
  ISA Info:                
    ISA 1                    
      Name:                    amdgcn-amd-amdhsa--gfx1150         
      Machine Models:          HSA_MACHINE_MODEL_LARGE            
      Profiles:                HSA_PROFILE_BASE                   
      Default Rounding Mode:   NEAR                               
      Default Rounding Mode:   NEAR                               
      Fast f16:                TRUE                               
      Workgroup Max Size:      1024(0x400)                        
      Workgroup Max Size per Dimension:
        x                        1024(0x400)                        
        y                        1024(0x400)                        
        z                        1024(0x400)                        
      Grid Max Size:           4294967295(0xffffffff)             
      Grid Max Size per Dimension:
        x                        4294967295(0xffffffff)             
        y                        4294967295(0xffffffff)             
        z                        4294967295(0xffffffff)             
      FBarrier Max Size:       32                                 
*** Done ***             

Additional Information

No response

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions