Seed7 Algorithms: Float

Float

The type float uses IEEE 754 double precision floating point numbers. The standard defines exactly how floating point numbers are represented. There is one sign bit 11 exponent bits and 52 fraction bits. To access the bits of a float the value must be converted (with bin64(x)) to a bin64 value. This allows that float functions can be based on manipulating the binary representation instead of using CPU float instructions.

Absolute value

The function abs is part of the "float.s7i" library. This function computes the absolute value of a float by clearing the sign bit.

$ include "seed7_05.s7i";
  include "float.s7i";
  include "bin64.s7i";

const func float: fabs (in float: number) is
  return float(bin64(number) & bin64(16#7fffffffffffffff));

Decompose float into fraction and exponent

The function decompose is part of the "float.s7i" library. This function decomposes a float into a normalized fraction and an integral exponent for 2. If the argument is 0.0, -0.0, Infinity, -Infinity or NaN the fraction is set to the argument and the exponent is set to 0. For all other arguments the fraction is set to an absolute value between 0.5(included) and 1.0(excluded).

$ include "seed7_05.s7i";
  include "float.s7i";
  include "bin64.s7i";

const func floatElements: decompose (in float: number) is func

  result
    var floatElements: elements is floatElements.value;
  local
    const float: twoPower64 is 2.0 ** 64;
    var bin64: binNumber is bin64.value;
    var integer: exponent is 0;
  begin
    binNumber :=  bin64(number);
    exponent := ord(binNumber >> DBL_FRACBITS & bin64(16#7ff));
    if exponent = 0 then
      if number <> 0.0 then
        elements := decompose(number * twoPower64);
        elements.exponent -:= 64;
      else
        elements.fraction := number;
      end if;
    elsif exponent = 16#7ff then
      elements.fraction := number;
    else
      elements.fraction := float(binNumber & bin64(16#800fffffffffffff_) | bin64(16#3fe0000000000000));
      elements.exponent := exponent - 16#3fe;
    end if;
  end func;

Multiply or divide with a power of two

The type float defines a shift operator which can be called with 'number << shift'. The function below shows how the shift operator for float works.

$ include "seed7_05.s7i";
  include "float.s7i";
  include "bin64.s7i";

const integer: DBL_FRACBITS is 52;
const integer: DBL_EXPONENT_BIAS is 1023;
const integer: DBL_EXPONENT_MAX is 1023;

const func float: ldexp (in float: number, in integer: exp) is func

  result
    var float: res is 0.0;
  local
    var integer: oldexp is 0;
    var integer: newexp is 0;
    var bin64: binNumber is bin64.value;
    var integer: intNumber is 0;
    var bin64: sign is bin64.value;
    var float: factor is 0.0;
  begin
    binNumber := bin64(number);
    oldexp := ord(binNumber >> DBL_FRACBITS & bin64(16#7ff));
    if number = 0.0 or exp = 0 or oldexp = 16#7ff then
      # For zero, Infinity and NaN, or if there is no change return number.
      res := number;
    elsif exp >= 16#7fe + DBL_FRACBITS then
      # The result exponent is not representable: Overflow
      # Return infinite with original sign.
      res := float(binNumber & bin64(16#8000000000000000_) | bin64(16#7ff0000000000000));
    else
      # Compute new exponent and check for over- / underflow.
      newexp := oldexp + exp;
      if newexp <= 0 then
        # The result is denormal or there is an underflow.
        if newexp < -DBL_FRACBITS then
          # Underflow: Return zero with original sign.
          res := float(binNumber & bin64(16#8000000000000000_));
        elsif oldexp = 0 then
          # Number is denormal and the result stays denormal.
          sign := binNumber & bin64(16#8000000000000000_);
          intNumber := ord(binNumber & bin64(16#000fffffffffffff));
          # Exact middle values are rounded to even.
          if odd(intNumber >> -newexp) then
            res := float(sign | bin64((intNumber + (1 << (-1 - newexp))) >> -newexp));
          else
            res := float(sign | bin64((intNumber + (1 << (-1 - newexp)) - 1) >> -newexp));
          end if;
        else
          # Number is normal and the result becomes denormal.
          sign := binNumber & bin64(16#8000000000000000_);
          intNumber := ord(binNumber & bin64(16#000fffffffffffff) | bin64(16#0010000000000000));
          # Exact middle values are rounded to even.
          if odd(intNumber >> (1 - newexp)) then
            res := float(sign | bin64((intNumber + (1 << -newexp)) >> (1 - newexp)));
          else
            res := float(sign | bin64((intNumber + (1 << -newexp) - 1) >> (1 - newexp)));
          end if;
        end if;
      else
        # The result is normal or there is an overflow.
        if oldexp = 0 then
          # Number is denormal. Just adjusting the exponent will not work.
          if exp <= DBL_EXPONENT_MAX then
            # Multiply number with 2**exp and we are done.
            factor := float(bin64(DBL_EXPONENT_BIAS + exp) << DBL_FRACBITS);
            res := number * factor;
          else
            # Multiply number with 2**DBL_EXPONENT_MAX.
            # We cannot multiply by more, but we can adjust the exponent later.
            factor := float(bin64(DBL_EXPONENT_BIAS + DBL_EXPONENT_MAX) << DBL_FRACBITS);
            binNumber := bin64(number * factor);
            # Recalculate oldexp as if number would be normal.
            # This oldexp is negative but the new calculated newexp is still positive.
            oldexp := ord(binNumber >> DBL_FRACBITS & bin64(16#7ff)) - DBL_EXPONENT_MAX;
            newexp := oldexp + exp;
            if newexp >= 16#7ff then
              # The result exponent is not representable: Overflow
              # Return infinite with original sign.
              res := float(binNumber & bin64(16#8000000000000000_) | bin64(16#7ff0000000000000));
            else
              # Now binNumber is normalized so adjusting the exponent will work.
              res := float(binNumber & bin64(16#800fffffffffffff_) | bin64(newexp) << DBL_FRACBITS);
            end if;
          end if;
        else
          # Number is normal.
          if newexp >= 16#7ff then
            # The result exponent is not representable: Overflow
            # Return infinite with original sign.
            res := float(binNumber & bin64(16#8000000000000000_) | bin64(16#7ff0000000000000));
          else
            # Both number and result are normal: Adjusting the exponent works.
            # Replace the old exponent with the new one.
            res := float(binNumber & bin64(16#800fffffffffffff_) | bin64(newexp) << DBL_FRACBITS);
          end if;
        end if;
      end if;
    end if;
  end func;

Convert a float to Microsoft Binary Format

The function float2MbfBits is part of the "bin64.s7i" library. This function converts a float into a bin64 value in the MBF double-precision representation. Microsoft Binary Format (MBF) is a format for floating point numbers. The double-precision version of MBF has a 8 bit exponent, a sign bit and a 55 bit mantissa.

const func bin64: float2MbfBits (in float: number, DOUBLE) is func
  result
    var bin64: bits is bin64(0);
  local
    const integer: ieeeExponentBits is 11;
    const integer: ieeeMantissaBits is 52;
    const bin64: ieeeSignMask is bin64(1) << (ieeeExponentBits + ieeeMantissaBits);
    const bin64: ieeeMantissaMask is bin64(pred(1 << ieeeMantissaBits));
    const integer: mbfExponentBits is 8;
    const integer: mbfMantissaBits is 55;
    const integer: mbfMaxExponent is pred(2 ** mbfExponentBits);
    const integer: mbfExponentBias is 129;
    var floatElements: ieeeElements is floatElements.value;
    var bin64: fractionBits is bin64(0);
    var integer: mbfExponent is 0;
  begin
    if isNaN(number) or abs(number) = Infinity then
      raise RANGE_ERROR;
    elsif number <> 0.0 then
      ieeeElements := decompose(number);
      fractionBits := bin64(ieeeElements.fraction);
      mbfExponent := ieeeElements.exponent - 1 + mbfExponentBias;
      if mbfExponent > mbfMaxExponent then
        raise RANGE_ERROR;
      elsif mbfExponent > 0 then
        bits := (bin64(mbfExponent) << succ(mbfMantissaBits)) |
                ((fractionBits & ieeeSignMask) >> mbfExponentBits) |
                ((fractionBits & ieeeMantissaMask) << (mbfMantissaBits - ieeeMantissaBits));
      end if;
    end if;
  end func;

Get a float from bits in Microsoft Binary Format

The function mbfBits2Float is part of the "bin64.s7i" library. This function gets a float from a bin64 value in the MBF double-precision representation. Microsoft Binary Format (MBF) is a format for floating point numbers. The double-precision version of MBF has a 8 bit exponent, a sign bit and a 55 bit mantissa.

const func float: mbfBits2Float (in bin64: bits) is func
  result
    var float: aFloat is 0.0;
  local
    const integer: mantissaBits is 55;
    const bin64: mantissaMask is bin64(pred(1 << mantissaBits));
    const bin64: mantissaSign is bin64(1 << mantissaBits);
    const integer: exponentBias is 129;
    var integer: exponent is 0;
  begin
    exponent := ord(bits >> succ(mantissaBits));
    if exponent <> 0 then
      # Ignore sign bit and set implicit leading one bit of mantissa instead.
      aFloat := float(ord(mantissaSign | bits & mantissaMask));
      # Check sign bit.
      if ord(bits & mantissaSign) <> 0 then
        aFloat := -aFloat;
      end if;
      aFloat := aFloat * 2.0 ** (exponent - exponentBias - mantissaBits);
    end if;
  end func;