I'm also asking this in the JAX repo and a few Discord channels but haven't gotten an answer yet. fp8 has hardware support only on GPUs with sm >= 89, such as the RTX 4090 (Ada) or H100 (Hopper). I've seen people trying to run it in PyTorch (e.g., this script) on older GPUs and getting errors, but JAX can actually run it on older GPUs. I tried:

```python
import jax
import jax.numpy as jnp

def f(x, y):
    return x @ y

a = jnp.ones((3, 3), dtype=jnp.float8_e4m3fn)
print(jax.jit(f).lower(a, a).as_text())
```

and I can see the dtype is f8E4M3FN in the HLO IR. Then I compared the dumped unoptimized LLVM IR for the two dtypes:

- `module_0005.jit_f.ir-no-opt.ll` with `dtype=jnp.float32`
- `module_0005.jit_f.ir-no-opt.ll` with `dtype=jnp.float8_e4m3fn`

What is XLA actually doing here on GPUs without native fp8 support?
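Dump files with names like these typically come from XLA's `--xla_dump_to` flag; a minimal sketch to reproduce them (the dump directory is just an example path):

```python
import os
# Assumption: XLA_FLAGS must be set before JAX initializes its XLA backend.
os.environ["XLA_FLAGS"] = "--xla_dump_to=/tmp/xla_dump"

import jax
import jax.numpy as jnp

def f(x, y):
    return x @ y

a = jnp.ones((3, 3), dtype=jnp.float8_e4m3fn)
jax.jit(f)(a, a)
# /tmp/xla_dump should now contain HLO and LLVM IR dumps for jit_f,
# including *-no-opt.ll files like the ones referenced above.
```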
-
XLA falls back to a higher precision: it upcasts the operands to fp16 (if supported), performs the dot in that precision, and then downcasts the result back to fp8. I wouldn't call this fp8 emulation, since it does not try to match the corresponding fp8 semantics; it's more of a "higher precision" fallback. As far as I can tell, this happens through a mix of Gemm Rewriter and Float Normalization passes.
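Roughly, the rewritten computation behaves like the following sketch (the actual transformation happens inside XLA's HLO passes; the fp16 widening here is an assumption based on the fallback described above and is hardware-dependent):

```python
import jax
import jax.numpy as jnp

def fp8_dot_fallback(x, y):
    # Sketch of the fallback: widen the fp8 operands, do the dot in a
    # higher precision, then narrow the result back to the fp8 dtype.
    wide = jnp.float16  # assumed widened dtype; depends on GPU support
    out = jnp.dot(x.astype(wide), y.astype(wide))
    return out.astype(x.dtype)

a = jnp.ones((3, 3), dtype=jnp.float8_e4m3fn)
print(jax.jit(fp8_dot_fallback)(a, a))
```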
You can add `--xla_dump_hlo_pass_re=.*` to `XLA_FLAGS` to see how the IR changes through the compiler passes. Some backstory can be found in this discussion and git logs.
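For example (the flag values and the dump directory are illustrative):

```python
import os
# Assumption: XLA_FLAGS must be set before JAX initializes the XLA backend.
os.environ["XLA_FLAGS"] = (
    "--xla_dump_to=/tmp/xla_dump_passes "
    "--xla_dump_hlo_pass_re=.*"
)

import jax
import jax.numpy as jnp

a = jnp.ones((3, 3), dtype=jnp.float8_e4m3fn)
jax.jit(lambda x, y: x @ y)(a, a)
# The dump directory now contains one HLO snapshot per pass, which makes it
# easy to spot where the fp8 dot is rewritten into a higher-precision one.
```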