fixed Float16 from Float64 and BigFloat (#42837)

oscardssmith · web-flow · commit 707c57c4907f · 2021-11-04T10:22:22.000-04:00
* fixed Float16 from Float64 and BigFloat. Many thanks to Jamison.
diff --git a/base/mpfr.jl b/base/mpfr.jl
@@ -338,8 +338,23 @@ Float32(x::BigFloat, r::MPFRRoundingMode=ROUNDING_MODE[]) =
     _cpynansgn(ccall((:mpfr_get_flt,:libmpfr), Float32, (Ref{BigFloat}, MPFRRoundingMode), x, r), x)
 Float32(x::BigFloat, r::RoundingMode) = Float32(x, convert(MPFRRoundingMode, r))
 
-# TODO: avoid double rounding
-Float16(x::BigFloat) = Float16(Float64(x))
+function Float16(x::BigFloat) :: Float16
+    res = Float32(x)
+    resi = reinterpret(UInt32, res)
+    if (resi&0x7fffffff) < 0x38800000 # if Float16(res) is subnormal
+        #shift so that the mantissa lines up where it would for normal Float16
+        shift = 113-((resi & 0x7f800000)>>23)
+        if shift<23
+            resi |= 0x0080_0000 # set implicit bit
+            resi >>= shift
+        end
+    end
+    if (resi & 0x1fff == 0x1000) # if we are halfway between 2 Float16 values
+        # adjust the value by 1 ULP in the direction that will make Float16(res) give the right answer
+        res = nextfloat(res, cmp(x, res))
+    end
+    return res
+end
 
 promote_rule(::Type{BigFloat}, ::Type{<:Real}) = BigFloat
 promote_rule(::Type{BigInt}, ::Type{<:AbstractFloat}) = BigFloat
diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
@@ -205,7 +205,24 @@ JL_DLLEXPORT uint16_t __gnu_f2h_ieee(float param)
 
 JL_DLLEXPORT uint16_t __truncdfhf2(double param)
 {
-    return float_to_half((float)param);
+    float res = (float)param;
+    uint32_t resi;
+    memcpy(&resi, &res, sizeof(res));
+    if ((resi&0x7fffffffu) < 0x38800000u){ // if Float16(res) is subnormal
+        // shift so that the mantissa lines up where it would for normal Float16
+        uint32_t shift = 113u-((resi & 0x7f800000u)>>23u);
+        if (shift<23u) {
+            resi |= 0x00800000; // set implicit bit
+            resi >>= shift;
+        }
+    }
+    if ((resi & 0x1fffu) == 0x1000u) { // if we are halfway between 2 Float16 values
+        memcpy(&resi, &res, sizeof(res));
+        // adjust the value by 1 ULP in the direction that will make Float16(res) give the right answer
+        resi += (fabs(res) < fabs(param)) - (fabs(param) < fabs(res));
+        memcpy(&res, &resi, sizeof(res));
+    }
+    return float_to_half(res);
 }
 
 #endif
diff --git a/test/float16.jl b/test/float16.jl
@@ -202,3 +202,25 @@ const minsubf16_32 = Float32(minsubf16)
 
 # issues #33076
 @test Float16(1f5) == Inf16
+
+@testset "conversion to Float16 from" begin
+    for T in (Float32, Float64, BigFloat)
+        @testset "conversion from $T" begin
+            for i in 1:2^16
+                f = reinterpret(Float16, UInt16(i-1))
+                isfinite(f) || continue
+                if f < 0
+                    epsdown = T(eps(f))/2
+                    epsup   = issubnormal(f) ? epsdown : T(eps(nextfloat(f)))/2
+                else
+                    epsup   = T(eps(f))/2
+                    epsdown = issubnormal(f) ? epsup : T(eps(prevfloat(f)))/2
+                end
+                @test isequal(f*(-1)^(f === Float16(0)),  Float16(nextfloat(T(f) - epsdown)))
+                @test isequal(f*(-1)^(f === -Float16(0)), Float16(prevfloat(T(f) + epsup)))
+                @test isequal(prevfloat(f), Float16(prevfloat(T(f) - epsdown)))
+                @test isequal(nextfloat(f), Float16(nextfloat(T(f) + epsup)))
+            end
+        end
+    end
+end