diff --git a/3B2/3b2_mau.c b/3B2/3b2_mau.c
index 8c721aeb..42cc4b89 100644
--- a/3B2/3b2_mau.c
+++ b/3B2/3b2_mau.c
@@ -103,26 +103,21 @@ static uint8 leading_zeros_64(t_int64 val);
 static void shift_right_32_jamming(uint32 val, int16 count, uint32 *result);
 static void shift_right_64_jamming(t_uint64 val, int16 count, t_uint64 *result);
 static void shift_right_extra_64_jamming(t_uint64 val_a, t_uint64 val_b, int16 count,
-                                         t_uint64 *result_a, t_uint64 *result_b);
+                                         t_uint64 *r_a, t_uint64 *r_b);
 static void shift_right_128_jamming(t_uint64 val_a, t_uint64 val_b, int16 count,
-                                    t_uint64 *result_a, t_uint64 *result_b);
+                                    t_uint64 *r_a, t_uint64 *r_b);
 static void short_shift_left_128(t_uint64 val_a, t_uint64 val_b, int16 count,
-                                 t_uint64 *result_a, t_uint64 *result_b);
+                                 t_uint64 *r_a, t_uint64 *r_b);
 static void shift_right_128(t_uint64 val_a, t_uint64 val_b, int16 count,
-                            t_uint64 *result_a, t_uint64 *result_b);
+                            t_uint64 *r_a, t_uint64 *r_b);
 static void add_128(t_uint64 a0, t_uint64 a1,
                     t_uint64 b0, t_uint64 b1,
                     t_uint64 *r_low, t_uint64 *r_high);
 static void sub_128(t_uint64 a0, t_uint64 a1,
                     t_uint64 b0, t_uint64 b1,
                     t_uint64 *r_low, t_uint64 *r_high);
-static void add_192(t_uint64 a0, t_uint64 a1, t_uint64 a2,
-                    t_uint64 b0, t_uint64 b1, t_uint64 b2,
-                    t_uint64 *z0, t_uint64 *z1, t_uint64 *z2);
-static void sub_192(t_uint64 a0, t_uint64 a1, t_uint64 a2,
-                    t_uint64 b0, t_uint64 b1, t_uint64 b2,
-                    t_uint64 *z0, t_uint64 *z1, t_uint64 *z2);
 static void mul_64_to_128(t_uint64 a, t_uint64 b, t_uint64 *r_low, t_uint64 *r_high);
+static void mul_64_by_shifted_32_to_128(t_uint64 a, uint32 b, t_mau_128 *result);
 static t_uint64 estimate_div_128_to_64(t_uint64 a0, t_uint64 a1, t_uint64 b);
 static uint32 estimate_sqrt_32(int16 a_exp, uint32 a);
 
@@ -139,6 +134,7 @@ static void round_pack_xfp(t_bool sign, int32 exp,
                            t_uint64 frac_a, t_uint64 frac_b,
                            RM rounding_mode, XFP *result);
 static void propagate_xfp_nan(XFP *a, XFP *b, XFP *result);
+static void propagate_xfp_nan_128(XFP* a, XFP* b, t_mau_128* result);
 static void normalize_round_pack_xfp(t_bool sign, int32 exp,
                                      t_uint64 frac_0, t_uint64 frac_1,
                                      RM rounding_mode, XFP *result);
@@ -146,12 +142,12 @@ static void normalize_sfp_subnormal(uint32 in_frac, int16 *out_exp, uint32 *out_
 static void normalize_dfp_subnormal(t_uint64 in_frac, int16 *out_exp, t_uint64 *out_frac);
 static void normalize_xfp_subnormal(t_uint64 in_frac, int32 *out_exp, t_uint64 *out_frac);
 
-static NAN_T sfp_to_common_nan(SFP val);
-static NAN_T dfp_to_common_nan(DFP val);
-static NAN_T xfp_to_common_nan(XFP *val);
-static SFP common_nan_to_sfp(NAN_T nan);
-static DFP common_nan_to_dfp(NAN_T nan);
-static void common_nan_to_xfp(NAN_T nan, XFP *result);
+static T_NAN sfp_to_common_nan(SFP val);
+static T_NAN dfp_to_common_nan(DFP val);
+static T_NAN xfp_to_common_nan(XFP *val);
+static SFP common_nan_to_sfp(T_NAN nan);
+static DFP common_nan_to_dfp(T_NAN nan);
+static void common_nan_to_xfp(T_NAN nan, XFP *result);
 
 static void sfp_to_xfp(SFP val, XFP *result);
 static void dfp_to_xfp(DFP val, XFP *result);
@@ -634,7 +630,7 @@ static void shift_right_64_jamming(t_uint64 val, int16 count, t_uint64 *result)
  * Derived from the SoftFloat 2c package (see copyright notice above)
  */
 static void shift_right_extra_64_jamming(t_uint64 val_a, t_uint64 val_b, int16 count,
-                                         t_uint64 *result_a, t_uint64 *result_b)
+                                         t_uint64 *r_a, t_uint64 *r_b)
 {
     t_uint64 a, b;
     int8 neg_count = (-count) & 63;
@@ -654,8 +650,8 @@ static void shift_right_extra_64_jamming(t_uint64 val_a, t_uint64 val_b, int16 c
         a = 0;
     }
 
-    *result_a = a;
-    *result_b = b;
+    *r_a = a;
+    *r_b = b;
 }
 
 /*
@@ -665,7 +661,7 @@ static void shift_right_extra_64_jamming(t_uint64 val_a, t_uint64 val_b, int16 c
  * Derived from the SoftFloat 2c package (see copyright notice above)
  */
 static void shift_right_128_jamming(t_uint64 val_a, t_uint64 val_b, int16 count,
-                                    t_uint64 *result_a, t_uint64 *result_b)
+                                    t_uint64 *r_a, t_uint64 *r_b)
 {
     t_uint64 tmp_a, tmp_b;
     int8 neg_count = (-count) & 63;
@@ -685,8 +681,8 @@ static void shift_right_128_jamming(t_uint64 val_a, t_uint64 val_b, int16 count,
         tmp_a = 0;
     }
 
-    *result_a = tmp_a;
-    *result_b = tmp_b;
+    *r_a = tmp_a;
+    *r_b = tmp_b;
 }
 
 /*
@@ -696,13 +692,13 @@ static void shift_right_128_jamming(t_uint64 val_a, t_uint64 val_b, int16 count,
  * Derived from the SoftFloat 2c package (see copyright notice above)
  */
 static void short_shift_left_128(t_uint64 val_a, t_uint64 val_b, int16 count,
-                                 t_uint64 *result_a, t_uint64 *result_b)
+                                 t_uint64 *r_a, t_uint64 *r_b)
 {
-    *result_b = val_b << count;
+    *r_b = val_b << count;
     if (count == 0) {
-        *result_a = val_a;
+        *r_a = val_a;
     } else {
-        *result_a = (val_a << count) | (val_b >> ((-count) & 63));
+        *r_a = (val_a << count) | (val_b >> ((-count) & 63));
     }
 }
 
@@ -713,7 +709,7 @@ static void short_shift_left_128(t_uint64 val_a, t_uint64 val_b, int16 count,
  * Derived from the SoftFloat 2c package (see copyright notice above)
  */
 static void shift_right_128(t_uint64 val_a, t_uint64 val_b, int16 count,
-                            t_uint64 *result_a, t_uint64 *result_b)
+                            t_uint64 *r_a, t_uint64 *r_b)
 {
     t_uint64 tmp_a, tmp_b;
     int8 neg_count;
@@ -731,8 +727,8 @@ static void shift_right_128(t_uint64 val_a, t_uint64 val_b, int16 count,
         tmp_b = (count < 128) ? (val_a >> (count & 63)) : 0;
     }
 
-    *result_a = tmp_a;
-    *result_b = tmp_b;
+    *r_a = tmp_a;
+    *r_b = tmp_b;
 }
 
 /*
@@ -764,48 +760,6 @@ static void sub_128(t_uint64 a0, t_uint64 a1,
     *r_low = a0 - b0 - (a1 < b1);
 }
 
-/*
- * Add two 192-bit values.
- *
- * Derived from the SoftFloat 2c package (see copyright notice above)
- */
-static void add_192(t_uint64 a0, t_uint64 a1, t_uint64 a2,
-                    t_uint64 b0, t_uint64 b1, t_uint64 b2,
-                    t_uint64 *z0, t_uint64 *z1, t_uint64 *z2)
-{
-    int8 carry_0, carry_1;
-
-    *z2 = a2 + b2;
-    carry_1 = (*z2 < a2);
-    *z1 = a1 + b1;
-    carry_0 = (*z1 < a1);
-    *z0 = a0 + b0;
-    *z1 += carry_1;
-    *z0 += (*z1 < (t_uint64) carry_1);
-    *z0 += carry_0;
-}
-
-/*
- * Subtract two 192-bit values.
- *
- * Derived from the SoftFloat 2c package (see copyright notice above)
- */
-static void sub_192(t_uint64 a0, t_uint64 a1, t_uint64 a2,
-                    t_uint64 b0, t_uint64 b1, t_uint64 b2,
-                    t_uint64 *z0, t_uint64 *z1, t_uint64 *z2)
-{
-    int8 borrow_0, borrow_1;
-
-    *z2 = a2 - b2;
-    borrow_1 = (a2 < b2);
-    *z1 = a1 - b1;
-    borrow_0 = (a1 < b1);
-    *z0 = a0 - b0;
-    *z0 -= (*z1 < (t_uint64) borrow_1);
-    *z1 -= borrow_1;
-    *z2 -= borrow_0;
-}
-
 /*
  * Multiplies a by b to obtain a 128-bit product. The product is
  * broken into two 64-bit pieces which are stored at r_low and r_high.
@@ -839,6 +793,18 @@ static void mul_64_to_128(t_uint64 a, t_uint64 b, t_uint64 *r_low, t_uint64 *r_h
     *r_low = rl;
 }
 
+/*
+ * Derived from the SoftFloat 2c package (see copyright notice above)
+ */
+static void mul_64_by_shifted_32_to_128(t_uint64 a, uint32 b, t_mau_128 *result)
+{
+    t_uint64 mid;
+
+    mid = (t_uint64)(uint32) a * b;
+    result->low = mid << 32;
+    result->high = (t_uint64)(uint32)(a >> 32) * b + (mid >> 32);
+}
+
 /*
  * Returns an approximation of the 64-bit integer value obtained by
  * dividing 'b' into the 128-bit value 'a0' and 'a1'.
@@ -912,6 +878,41 @@ static uint32 estimate_sqrt_32(int16 a_exp, uint32 a)
     return ((uint32) ((((t_uint64) a )<<31 ) / z)) + (z >> 1);
 }
 
+static uint32 approx_recip_sqrt_32(uint32 oddExpA, uint32 a)
+{
+    int index;
+    uint16 eps, r0;
+    uint32 ESqrR0;
+    uint32 sigma0;
+    uint32 r;
+    uint32 sqrSigma0;
+
+    static const uint16 softfloat_approxRecipSqrt_1k0s[16] = {
+        0xB4C9, 0xFFAB, 0xAA7D, 0xF11C, 0xA1C5, 0xE4C7, 0x9A43, 0xDA29,
+        0x93B5, 0xD0E5, 0x8DED, 0xC8B7, 0x88C6, 0xC16D, 0x8424, 0xBAE1
+    };
+    static const uint16 softfloat_approxRecipSqrt_1k1s[16] = {
+        0xA5A5, 0xEA42, 0x8C21, 0xC62D, 0x788F, 0xAA7F, 0x6928, 0x94B6,
+        0x5CC7, 0x8335, 0x52A6, 0x74E2, 0x4A3E, 0x68FE, 0x432B, 0x5EFD
+    };
+
+    index = (a>>27 & 0xE) + oddExpA;
+    eps = (uint16) (a>>12);
+    r0 = softfloat_approxRecipSqrt_1k0s[index]
+             - ((softfloat_approxRecipSqrt_1k1s[index] * (uint32) eps)
+                    >>20);
+    ESqrR0 = (uint32) r0 * r0;
+    if ( ! oddExpA ) ESqrR0 <<= 1;
+    sigma0 = ~(uint32) (((uint32) ESqrR0 * (t_uint64) a)>>23);
+    r = ((uint32) r0<<16) + ((r0 * (t_uint64) sigma0)>>25);
+    sqrSigma0 = ((t_uint64) sigma0 * sigma0)>>32;
+    r += ((uint32) ((r>>1) + (r>>3) - ((uint32) r0<<14))
+              * (t_uint64) sqrSigma0)
+             >>48;
+    if ( ! (r & 0x80000000) ) r = 0x80000000;
+    return r;
+}
+
 /*
  * Return the properly rounded 32-bit integer corresponding to 'sign'
  * and 'frac'.
@@ -1051,7 +1052,7 @@ static SFP round_pack_sfp(t_bool sign, int16 exp, uint32 frac, RM rounding_mode)
 
     round_bits = frac & 0x7f;
 
-    if (0xfd < (uint16) exp) {
+    if (0xfd <= (uint16) exp) {
         if ((0xfd < exp) ||
             (exp == 0xfd && (int32)(frac + round_increment) < 0)) {
             mau_exc(MAU_ASR_OS, MAU_ASR_OM);
@@ -1251,6 +1252,52 @@ static void propagate_xfp_nan(XFP *a, XFP *b, XFP *result)
     }
 }
 
+/*
+ * Derived from the SoftFloat 2c package (see copyright notice above)
+ */
+static void propagate_xfp_nan_128(XFP* a, XFP* b, t_mau_128* result)
+{
+    t_bool is_sig_nan_a, is_sig_nan_b;
+    t_uint64 non_frac_a_low, non_frac_b_low;
+    uint16 mag_a, mag_b;
+
+    is_sig_nan_a = XFP_IS_TRAPPING_NAN(a);
+    is_sig_nan_b = XFP_IS_TRAPPING_NAN(b);
+
+    non_frac_a_low = a->frac & 0xC000000000000000ull;
+    non_frac_b_low = b->frac & 0xC000000000000000ull;
+
+    if (is_sig_nan_a | is_sig_nan_b) {
+        /* Invalid */
+        mau_exc(MAU_ASR_IS, MAU_ASR_IM);
+        if (is_sig_nan_a) {
+            if (is_sig_nan_b) goto return_larger_mag;
+            if (XFP_IS_NAN(b)) goto return_b;
+            goto return_a;
+        } else {
+            if (XFP_IS_NAN(a)) goto return_a;
+            goto return_b;
+        }
+    }
+
+ return_larger_mag:
+    mag_a = a->frac & 0x7fff;
+    mag_b = b->frac & 0x7fff;
+    if (mag_a < mag_b) goto return_b;
+    if (mag_b < mag_a) goto return_a;
+    if (a->frac < b->frac) goto return_b;
+    if (b->frac < a->frac) goto return_a;
+    if (a->sign_exp < b->sign_exp) goto return_a;
+ return_b:
+    result->high = b->sign_exp;
+    result->low = non_frac_b_low;
+    return;
+ return_a:
+    result->high = a->sign_exp;
+    result->low = non_frac_a_low;
+    return;
+}
+
 /*
  * Normalize and round an extended-precision floating point value.
  *
@@ -1288,6 +1335,14 @@ static void normalize_sfp_subnormal(uint32 in_frac, int16 *out_exp, uint32 *out_
     int8 shift_count;
 
     shift_count = leading_zeros(in_frac) - 8;
+
+    if (shift_count < 0) {
+        /* There was invalid input, there's nothing we can do. */
+        *out_frac = in_frac;
+        *out_exp = 0;
+        return;
+    }
+
     *out_frac = in_frac << shift_count;
     *out_exp = (uint16)(1 - shift_count);
 }
@@ -1303,6 +1358,14 @@ static void normalize_dfp_subnormal(t_uint64 in_frac, int16 *out_exp, t_uint64 *
     int8 shift_count;
 
     shift_count = leading_zeros_64(in_frac) - 11;
+
+    if (shift_count < 0) {
+        /* There was invalid input, there's nothing we can do. */
+        *out_frac = in_frac;
+        *out_exp = 0;
+        return;
+    }
+
     *out_frac = in_frac << shift_count;
     *out_exp = 1 - shift_count;
 }
@@ -1328,9 +1391,9 @@ static void normalize_xfp_subnormal(t_uint64 in_frac, int32 *out_exp, t_uint64 *
  *
  * Derived from the SoftFloat 2c package (see copyright notice above)
  */
-static NAN_T sfp_to_common_nan(SFP val)
+static T_NAN sfp_to_common_nan(SFP val)
 {
-    NAN_T nan = {0};
+    T_NAN nan = {0};
 
     if (SFP_IS_TRAPPING_NAN(val)) {
         mau_state.trapping_nan = TRUE;
@@ -1349,9 +1412,9 @@ static NAN_T sfp_to_common_nan(SFP val)
  *
  * Derived from the SoftFloat 2c package (see copyright notice above)
  */
-static NAN_T dfp_to_common_nan(DFP val)
+static T_NAN dfp_to_common_nan(DFP val)
 {
-    NAN_T nan = {0};
+    T_NAN nan = {0};
 
     if (DFP_IS_TRAPPING_NAN(val)) {
         mau_state.trapping_nan = TRUE;
@@ -1370,9 +1433,9 @@ static NAN_T dfp_to_common_nan(DFP val)
  *
  * Derived from the SoftFloat 2c package (see copyright notice above)
  */
-static NAN_T xfp_to_common_nan(XFP *val)
+static T_NAN xfp_to_common_nan(XFP *val)
 {
-    NAN_T nan = {0};
+    T_NAN nan = {0};
 
     if (XFP_IS_TRAPPING_NAN(val)) {
         mau_state.trapping_nan = TRUE;
@@ -1391,7 +1454,7 @@ static NAN_T xfp_to_common_nan(XFP *val)
  *
  * Derived from the SoftFloat 2c package (see copyright notice above)
  */
-static SFP common_nan_to_sfp(NAN_T nan)
+static SFP common_nan_to_sfp(T_NAN nan)
 {
     return ((((uint32)nan.sign) << 31)
             | 0x7fc00000
@@ -1404,7 +1467,7 @@ static SFP common_nan_to_sfp(NAN_T nan)
  *
  * Derived from the SoftFloat 2c package (see copyright notice above)
  */
-static DFP common_nan_to_dfp(NAN_T nan)
+static DFP common_nan_to_dfp(T_NAN nan)
 {
     return ((((t_uint64)nan.sign) << 63)
             | 0x7ff8000000000000ull
@@ -1417,7 +1480,7 @@ static DFP common_nan_to_dfp(NAN_T nan)
  *
  * Derived from the SoftFloat 2c package (see copyright notice above)
  */
-static void common_nan_to_xfp(NAN_T nan, XFP *result)
+static void common_nan_to_xfp(T_NAN nan, XFP *result)
 {
     result->frac = 0xc000000000000000ull | (nan.high >> 1);
     result->sign_exp = (((uint16)nan.sign) << 15) | 0x7fff;
@@ -1662,6 +1725,8 @@ void mau_int_to_xfp(int32 val, XFP *result)
 
 /*
  * Convert a floating point value to a 64-bit integer.
+ *
+ * Derived from the SoftFloat 2c package (see copyright notice above)
  */
 t_int64 xfp_to_int64(XFP *val, RM rounding_mode)
 {
@@ -1762,7 +1827,7 @@ void xfp_to_decimal(XFP *a, DEC *d, RM rounding_mode)
 /*
  * Convert a decimal value to a float value.
  */
-void mau_decimal_to_xfp(DEC *d, XFP *a, RM rounding_mode)
+void mau_decimal_to_xfp(DEC *d, XFP *a)
 {
     int i;
     t_bool sign;
@@ -1771,7 +1836,7 @@ void mau_decimal_to_xfp(DEC *d, XFP *a, RM rounding_mode)
     t_uint64 tmp;
     t_int64 signed_tmp;
 
-        sim_debug(TRACE_DBG, &mau_dev,
+    sim_debug(TRACE_DBG, &mau_dev,
               "[%08x] [mau_decimal_to_xfp] DEC input: %08x %08x %08x\n",
               R[NUM_PC], d->h, (uint32)(d->l >> 32), (uint32)(d->l));
 
@@ -1836,6 +1901,8 @@ void mau_decimal_to_xfp(DEC *d, XFP *a, RM rounding_mode)
 
 /*
  * Convert a floating point value to a 32-bit integer.
+ *
+ * Derived from the SoftFloat 2c package (see copyright notice above)
  */
 uint32 xfp_to_int(XFP *val, RM rounding_mode)
 {
@@ -1915,7 +1982,7 @@ void mau_round_xfp_to_int(XFP *val, XFP *result, RM rounding_mode)
             }
             return;
         default:
-            // do nothing
+            /* Do nothing */
             break;
         }
         PACK_XFP(sign, 0, 0, result);
@@ -1955,10 +2022,13 @@ void mau_round_xfp_to_int(XFP *val, XFP *result, RM rounding_mode)
  * Math Functions
  ****************************************************************************/
 
+/*
+ * Derived from the SoftFloat 2c package (see copyright notice above)
+ */
 static void xfp_add_fracs(XFP *a, XFP *b, t_bool sign, XFP *result, RM rounding_mode)
 {
-    int32 a_exp, b_exp, result_exp;
-    t_uint64 a_frac, b_frac, result_frac_0, result_frac_1;
+    int32 a_exp, b_exp, r_exp;
+    t_uint64 a_frac, b_frac, r_frac_0, r_frac_1;
     int32 exp_diff;
 
     sim_debug(TRACE_DBG, &mau_dev,
@@ -1986,8 +2056,8 @@ static void xfp_add_fracs(XFP *a, XFP *b, t_bool sign, XFP *result, RM rounding_
         if (b_exp == 0) {
             --exp_diff;
         }
-        shift_right_extra_64_jamming(b_frac, 0, exp_diff, &b_frac, &result_frac_1);
-        result_exp = a_exp;
+        shift_right_extra_64_jamming(b_frac, 0, exp_diff, &b_frac, &r_frac_1);
+        r_exp = a_exp;
     } else if (exp_diff < 0) {
         if (b_exp == 0x7fff) {
             if ((t_uint64) (b_frac << 1)) {
@@ -2001,8 +2071,8 @@ static void xfp_add_fracs(XFP *a, XFP *b, t_bool sign, XFP *result, RM rounding_
             ++exp_diff;
         }
 
-        shift_right_extra_64_jamming(a_frac, 0, -exp_diff, &a_frac, &result_frac_1);
-        result_exp = b_exp;
+        shift_right_extra_64_jamming(a_frac, 0, -exp_diff, &a_frac, &r_frac_1);
+        r_exp = b_exp;
     } else {
         if (a_exp == 0x7fff) {
             if ((t_uint64)((a_frac | b_frac) << 1)) {
@@ -2013,37 +2083,40 @@ static void xfp_add_fracs(XFP *a, XFP *b, t_bool sign, XFP *result, RM rounding_
             result->frac = a->frac;
             return;
         }
-        result_frac_1 = 0;
-        result_frac_0 = a_frac + b_frac;
+        r_frac_1 = 0;
+        r_frac_0 = a_frac + b_frac;
         if (a_exp == 0) {
-            normalize_xfp_subnormal(result_frac_0, &result_exp, &result_frac_0);
+            normalize_xfp_subnormal(r_frac_0, &r_exp, &r_frac_0);
 
-            round_pack_xfp(sign, result_exp, result_frac_0, result_frac_1, rounding_mode, result);
+            round_pack_xfp(sign, r_exp, r_frac_0, r_frac_1, rounding_mode, result);
             return;
         }
-        result_exp = a_exp;
-        shift_right_extra_64_jamming(result_frac_0, result_frac_1, 1, &result_frac_0, &result_frac_1);
-        result_frac_0 |= 0x8000000000000000ull;
-        ++result_exp;
-        round_pack_xfp(sign, result_exp, result_frac_0, result_frac_1, rounding_mode, result);
+        r_exp = a_exp;
+        shift_right_extra_64_jamming(r_frac_0, r_frac_1, 1, &r_frac_0, &r_frac_1);
+        r_frac_0 |= 0x8000000000000000ull;
+        ++r_exp;
+        round_pack_xfp(sign, r_exp, r_frac_0, r_frac_1, rounding_mode, result);
         return;
     }
-    result_frac_0 = a_frac + b_frac;
-    if (((t_int64) result_frac_0) < 0) {
-        round_pack_xfp(sign, result_exp, result_frac_0, result_frac_1, rounding_mode, result);
+    r_frac_0 = a_frac + b_frac;
+    if (((t_int64) r_frac_0) < 0) {
+        round_pack_xfp(sign, r_exp, r_frac_0, r_frac_1, rounding_mode, result);
         return;
     }
-    shift_right_extra_64_jamming(result_frac_0, result_frac_1, 1, &result_frac_0, &result_frac_1);
-    result_frac_0 |= 0x8000000000000000ull;
-    ++result_exp;
-    round_pack_xfp(sign, result_exp, result_frac_0, result_frac_1, rounding_mode, result);
+    shift_right_extra_64_jamming(r_frac_0, r_frac_1, 1, &r_frac_0, &r_frac_1);
+    r_frac_0 |= 0x8000000000000000ull;
+    ++r_exp;
+    round_pack_xfp(sign, r_exp, r_frac_0, r_frac_1, rounding_mode, result);
     return;
 }
 
+/*
+ * Derived from the SoftFloat 2c package (see copyright notice above)
+ */
 static void xfp_sub_fracs(XFP *a, XFP *b, t_bool sign, XFP *result, RM rounding_mode)
 {
-    int32 a_exp, b_exp, result_exp;
-    t_uint64 a_frac, b_frac, result_frac_0, result_frac_1;
+    int32 a_exp, b_exp, r_exp;
+    t_uint64 a_frac, b_frac, r_frac_0, r_frac_1;
     int32 exp_diff;
 
     a_exp = XFP_EXP(a);
@@ -2066,12 +2139,12 @@ static void xfp_sub_fracs(XFP *a, XFP *b, t_bool sign, XFP *result, RM rounding_
         if (b_exp == 0) {
             --exp_diff;
         }
-        shift_right_128_jamming(b_frac, 0, exp_diff, &b_frac, &result_frac_1);
+        shift_right_128_jamming(b_frac, 0, exp_diff, &b_frac, &r_frac_1);
         /* aBigger */
-        sub_128(a_frac, 0, b_frac, result_frac_1, &result_frac_0, &result_frac_1);
-        result_exp = a_exp;
+        sub_128(a_frac, 0, b_frac, r_frac_1, &r_frac_0, &r_frac_1);
+        r_exp = a_exp;
         /* normalizeRoundAndPack */
-        normalize_round_pack_xfp(sign, result_exp, result_frac_0, result_frac_1, rounding_mode, result);
+        normalize_round_pack_xfp(sign, r_exp, r_frac_0, r_frac_1, rounding_mode, result);
         return;
     }
     if (exp_diff < 0) {
@@ -2087,14 +2160,14 @@ static void xfp_sub_fracs(XFP *a, XFP *b, t_bool sign, XFP *result, RM rounding_
         if (a_exp == 0) {
             ++exp_diff;
         }
-        shift_right_128_jamming(a_frac, 0, -exp_diff, &a_frac, &result_frac_1);
+        shift_right_128_jamming(a_frac, 0, -exp_diff, &a_frac, &r_frac_1);
         /* bBigger */
-        sub_128(b_frac, 0, a_frac, result_frac_1, &result_frac_0, &result_frac_1);
-        result_exp = b_exp;
+        sub_128(b_frac, 0, a_frac, r_frac_1, &r_frac_0, &r_frac_1);
+        r_exp = b_exp;
         sign = sign ? 0 : 1;
         /* normalizeRoundAndPack */
-        normalize_round_pack_xfp(sign, result_exp,
-                                 result_frac_0, result_frac_1,
+        normalize_round_pack_xfp(sign, r_exp,
+                                 r_frac_0, r_frac_1,
                                  rounding_mode, result);
         return;
     }
@@ -2112,26 +2185,26 @@ static void xfp_sub_fracs(XFP *a, XFP *b, t_bool sign, XFP *result, RM rounding_
         a_exp = 1;
         b_exp = 1;
     }
-    result_frac_1 = 0;
+    r_frac_1 = 0;
     if (b_frac < a_frac) {
         /* aBigger */
-        sub_128(a_frac, 0, b_frac, result_frac_1, &result_frac_0, &result_frac_1);
-        result_exp = a_exp;
+        sub_128(a_frac, 0, b_frac, r_frac_1, &r_frac_0, &r_frac_1);
+        r_exp = a_exp;
         /* normalizeRoundAndPack */
-        normalize_round_pack_xfp(sign, result_exp,
-                                 result_frac_0, result_frac_1,
+        normalize_round_pack_xfp(sign, r_exp,
+                                 r_frac_0, r_frac_1,
                                  rounding_mode, result);
         return;
     }
     if (a_frac < b_frac) {
         /* bBigger */
-        sub_128(b_frac, 0, a_frac, result_frac_1, &result_frac_0, &result_frac_1);
-        result_exp = b_exp;
+        sub_128(b_frac, 0, a_frac, r_frac_1, &r_frac_0, &r_frac_1);
+        r_exp = b_exp;
         sign ^= 1;
 
         /* normalizeRoundAndPack */
-        normalize_round_pack_xfp(sign, result_exp,
-                                 result_frac_0, result_frac_1,
+        normalize_round_pack_xfp(sign, r_exp,
+                                 r_frac_0, r_frac_1,
                                  rounding_mode, result);
         return;
     }
@@ -2147,6 +2220,8 @@ static void xfp_sub_fracs(XFP *a, XFP *b, t_bool sign, XFP *result, RM rounding_
 
 /*
  * Set condition flags based on comparison of the two values A and B.
+ *
+ * Derived from the SoftFloat 2c package (see copyright notice above)
  */
 static void xfp_cmp(XFP *a, XFP *b)
 {
@@ -2281,11 +2356,14 @@ static void xfp_sub(XFP *a, XFP *b, XFP *result, RM rounding_mode)
     }
 }
 
+/*
+ * Derived from the SoftFloat 2c package (see copyright notice above)
+ */
 static void xfp_mul(XFP *a, XFP *b, XFP *result, RM rounding_mode)
 {
-    uint32 a_sign, b_sign, result_sign;
-    int32 a_exp, b_exp, result_exp;
-    t_uint64 a_frac, b_frac, result_frac_0, result_frac_1;
+    uint32 a_sign, b_sign, r_sign;
+    int32 a_exp, b_exp, r_exp;
+    t_uint64 a_frac, b_frac, r_frac_0, r_frac_1;
 
     sim_debug(TRACE_DBG, &mau_dev,
               "[%08x] [MUL] op1=%04x%016llx  op2=%04x%016llx\n",
@@ -2300,7 +2378,7 @@ static void xfp_mul(XFP *a, XFP *b, XFP *result, RM rounding_mode)
     b_exp = XFP_EXP(b);
     b_frac = XFP_FRAC(b);
 
-    result_sign = a_sign ^ b_sign;
+    r_sign = a_sign ^ b_sign;
 
     if (a_exp == 0x7fff) {
         if ((t_uint64)(a_frac << 1) || ((b_exp == 0x7fff) && (t_uint64)(b_frac << 1))) {
@@ -2314,7 +2392,7 @@ static void xfp_mul(XFP *a, XFP *b, XFP *result, RM rounding_mode)
             result->frac = DEFAULT_XFP_NAN_FRAC;
             return;
         }
-        PACK_XFP(result_sign, 0x7fff, 0x8000000000000000ull, result);
+        PACK_XFP(r_sign, 0x7fff, 0x8000000000000000ull, result);
         return;
     }
 
@@ -2330,13 +2408,13 @@ static void xfp_mul(XFP *a, XFP *b, XFP *result, RM rounding_mode)
             result->frac = DEFAULT_XFP_NAN_FRAC;
             return;
         }
-        PACK_XFP(result_sign, 0x7fff, 0x8000000000000000ull, result);
+        PACK_XFP(r_sign, 0x7fff, 0x8000000000000000ull, result);
         return;
     }
 
     if (a_exp == 0) {
         if (a_frac == 0) {
-            PACK_XFP(result_sign, 0, 0, result);
+            PACK_XFP(r_sign, 0, 0, result);
             return;
         }
         normalize_xfp_subnormal(a_frac, &a_exp, &a_frac);
@@ -2344,24 +2422,27 @@ static void xfp_mul(XFP *a, XFP *b, XFP *result, RM rounding_mode)
 
     if (b_exp == 0) {
         if (b_frac == 0) {
-            PACK_XFP(result_sign, 0, 0, result);
+            PACK_XFP(r_sign, 0, 0, result);
             return;
         }
         normalize_xfp_subnormal(b_frac, &b_exp, &b_frac);
     }
 
-    result_exp = a_exp + b_exp - 0x3ffe;
-    mul_64_to_128(a_frac, b_frac, &result_frac_0, &result_frac_1);
-    if (0 < (t_int64)result_frac_0) {
-        short_shift_left_128(result_frac_0, result_frac_1, 1,
-                             &result_frac_0, &result_frac_1);
-        --result_exp;
+    r_exp = a_exp + b_exp - 0x3ffe;
+    mul_64_to_128(a_frac, b_frac, &r_frac_0, &r_frac_1);
+    if (0 < (t_int64)r_frac_0) {
+        short_shift_left_128(r_frac_0, r_frac_1, 1,
+                             &r_frac_0, &r_frac_1);
+        --r_exp;
     }
 
-    round_pack_xfp(result_sign, result_exp, result_frac_0,
-                   result_frac_1, rounding_mode, result);
+    round_pack_xfp(r_sign, r_exp, r_frac_0,
+                   r_frac_1, rounding_mode, result);
 }
 
+/*
+ * Derived from the SoftFloat 2c package (see copyright notice above)
+ */
 static void xfp_div(XFP *a, XFP *b, XFP *result, RM rounding_mode)
 {
     t_bool a_sign, b_sign, r_sign;
@@ -2471,30 +2552,38 @@ static void xfp_div(XFP *a, XFP *b, XFP *result, RM rounding_mode)
     round_pack_xfp(r_sign, r_exp, r_frac0, r_frac1, rounding_mode, result);
 }
 
+/*
+ * Derived from the SoftFloat 2c package (see copyright notice above)
+ */
 static void xfp_sqrt(XFP *a, XFP *result, RM rounding_mode)
 {
-    uint32 a_sign;
-    int32 a_exp, result_exp;
-    t_uint64 a_frac_0, a_frac_1, result_frac_0, result_frac_1, double_result_frac_0;
-    t_uint64 rem_0, rem_1, rem_2, rem_3, term_0, term_1, term_2, term_3;
+    XFP zero = {0, 0, 0};
+    t_bool a_sign;
+    int32 a_exp, norm_exp, r_exp;
+    uint32 a_frac_32, sqrt_recip_32, r_frac_32;
+    t_uint64 a_frac, norm_frac, q, x64, z_frac, z_frac_extra;
+    t_mau_128 nan_128, rem, y, term;
+
+    sim_debug(TRACE_DBG, &mau_dev,
+              "[%08x] [SQRT] op1=%04x%016llx\n",
+              R[NUM_PC], a->sign_exp, a->frac);
 
     a_sign = XFP_SIGN(a);
-    a_exp = XFP_EXP(a);
-    a_frac_0 = XFP_FRAC(a);
+    a_exp  = XFP_EXP(a);
+    a_frac  = XFP_FRAC(a);
 
     if (a_exp == 0x7fff) {
-        if ((t_uint64)(a_frac_0 << 1)) {
-            propagate_xfp_nan(a, a, result);
+        if ( a_frac & 0x7fffffffffffffffull ) {
+            propagate_xfp_nan_128(a, &zero, &nan_128);
+            result->sign_exp = nan_128.high;
+            result->frac = nan_128.low;
             return;
         }
-
-        if (!a_sign) {
+        if ( ! a_sign ) {
             result->sign_exp = a->sign_exp;
             result->frac = a->frac;
-            return;
         }
-
-        /* invalid */
+        /* Invalid */
         mau_exc(MAU_ASR_IS, MAU_ASR_IM);
         result->sign_exp = DEFAULT_XFP_NAN_SIGN_EXP;
         result->frac = DEFAULT_XFP_NAN_FRAC;
@@ -2502,74 +2591,98 @@ static void xfp_sqrt(XFP *a, XFP *result, RM rounding_mode)
     }
 
     if (a_sign) {
-        if ((a_exp | a_frac_0) == 0) {
-            result->sign_exp = a->sign_exp;
-            result->frac = a->frac;
+        if (!a_frac) {
+            PACK_XFP(a_sign, 0, 0, result);
             return;
         }
-
-        /* invalid */
         mau_exc(MAU_ASR_IS, MAU_ASR_IM);
         result->sign_exp = DEFAULT_XFP_NAN_SIGN_EXP;
         result->frac = DEFAULT_XFP_NAN_FRAC;
         return;
     }
 
-    if (a_exp == 0) {
-        if (a_frac_0 == 0) {
-            PACK_XFP(0, 0, 0, result);
+    if (!a_exp) {
+        a_exp = 1;
+    }
+
+    if (!(a_frac & 0x8000000000000000ull)) {
+        if (!a_frac) {
+            PACK_XFP(a_sign, 0, 0, result);
             return;
         }
-
-        normalize_xfp_subnormal(a_frac_0, &a_exp, &a_frac_0);
+        normalize_xfp_subnormal(a->frac, &norm_exp, &norm_frac);
+        a_exp += norm_exp;
+        a_frac = norm_frac;
     }
 
-    result_exp = ((a_exp - 0x3fff) >> 1) + 0x3fff;
-    result_frac_0 = estimate_sqrt_32(a_exp, a_frac_0 >> 32);
-    shift_right_128(a_frac_0, 0, 2 + (a_exp & 1), &a_frac_0, &a_frac_1);
-    result_frac_0 = estimate_div_128_to_64(a_frac_0, a_frac_1, result_frac_0 << 32)
-        + (result_frac_0 << 30);
-    double_result_frac_0 = result_frac_0 << 1;
-    mul_64_to_128(result_frac_0, result_frac_0, &term_0, &term_1);
-    sub_128(a_frac_0, a_frac_1, term_0, term_1, &rem_0, &rem_1);
-    while ((t_int64) rem_0 < 0) {
-        --result_frac_0;
-        double_result_frac_0 -= 2;
-        add_128(rem_0, rem_1, result_frac_0 >> 63,
-                double_result_frac_0 | 1, &rem_0, &rem_1);
+    /*
+     * r_frac_32 is guaranteed to be a lower bound on the square root of
+     * a_frac_32, which makes r_frac_32 also a lower bound on the square
+     * root of `a_frac'.
+     */
+    r_exp = ((a_exp - 0x3FFF) >> 1) + 0x3FFF;
+    a_exp &= 1;
+    a_frac_32 = a_frac >> 32;
+    sqrt_recip_32 = approx_recip_sqrt_32(a_exp, a_frac_32);
+    r_frac_32 = ((t_uint64) a_frac_32 * sqrt_recip_32) >> 32;
+
+    if (a_exp) {
+        r_frac_32 >>= 1;
+        short_shift_left_128(0, a_frac, 61, &rem.high, &rem.low);
+    } else {
+        short_shift_left_128(0, a_frac, 62, &rem.high, &rem.low);
     }
 
-    result_frac_1 = estimate_div_128_to_64(rem_0, 0, double_result_frac_0);
+    rem.high -= (t_uint64) r_frac_32 * r_frac_32;
 
-    if ((result_frac_1 & 0x3fffffffffffffff) <= 5) {
-        if (result_frac_1 == 0) {
-            result_frac_1 = 1;
+    q = ((uint32) (rem.high >> 2) * (t_uint64) sqrt_recip_32) >> 32;
+    x64 = (t_uint64) r_frac_32 << 32;
+    z_frac = x64 + (q<<3);
+    short_shift_left_128(rem.high, rem.low, 29, &y.high, &y.low);
+
+    /* Repeating this loop is a rare occurrence. */
+    while(1) {
+        mul_64_by_shifted_32_to_128(x64 + z_frac, q, &term);
+        sub_128(y.high, y.low, term.high, term.low, &rem.high, &rem.low);
+        if (!(rem.high & 0x8000000000000000ull)) {
+            break;
         }
-        mul_64_to_128(double_result_frac_0, result_frac_1, &term_1, &term_2);
-        sub_128(rem_1, 0, term_1, term_2, &rem_1, &rem_2);
-        mul_64_to_128(result_frac_1, result_frac_1, &term_2, &term_3);
-        sub_192(rem_1, rem_2, 0, 0, term_2, term_3, &rem_1, &rem_2, &rem_3);
-        while ((t_int64) rem_1 < 0) {
-            --result_frac_1;
-            short_shift_left_128(0, result_frac_1, 1, &term_2, &term_3);
-            term_3 |= 1;
-            term_2 |= double_result_frac_0;
-            add_192(rem_1, rem_2, rem_3,
-                    0, term_2, term_3,
-                    &rem_1, &rem_2, &rem_3);
-        }
-        result_frac_1 |= ((rem_1 | rem_2 | rem_3) != 0);
+        --q;
+        z_frac -= 1<<3;
     }
-    short_shift_left_128(0, result_frac_1, 1, &result_frac_0, &result_frac_1);
-    result_frac_0 |= double_result_frac_0;
 
-    round_pack_xfp(0, result_exp, result_frac_0, result_frac_1,
-                   rounding_mode, result);
+    q = (((rem.high>>2) * sqrt_recip_32)>>32) + 2;
+    x64 = z_frac;
+    z_frac = (z_frac<<1) + (q>>25);
+    z_frac_extra = (t_uint64) (q<<39);
+
+    if ( (q & 0xffffff) <= 2 ) {
+        q &= ~(t_uint64) 0xffff;
+        z_frac_extra = (t_uint64) (q<<39);
+        mul_64_by_shifted_32_to_128(x64 + (q >> 27), q, &term);
+        x64 = (uint32) (q<<5) * (t_uint64) (uint32) q;
+        add_128(term.high, term.low, 0, x64, &term.high, &term.low);
+        short_shift_left_128(rem.high, rem.low, 28, &rem.high, &rem.low);
+        sub_128(rem.high, rem.low, term.high, term.low, &rem.high, &rem.low);
+        if (rem.high & 0x8000000000000000ull) {
+            if (!z_frac_extra ) {
+                --z_frac;
+            }
+            --z_frac_extra;
+        } else {
+            if (rem.high | rem.low) {
+                z_frac_extra |= 1;
+            }
+        }
+    }
+
+    round_pack_xfp(0, r_exp, z_frac, z_frac_extra, rounding_mode,result);
+    return;
 }
 
 static void xfp_remainder(XFP *a, XFP *b, XFP *result, RM rounding_mode)
 {
-    uint32 a_sign, b_sign, result_sign;
+    uint32 a_sign, b_sign, r_sign;
     int32 a_exp, b_exp, exp_diff;
     t_uint64 a_frac_0, a_frac_1, b_frac;
     t_uint64 q, term_0, term_1, alt_a_frac_0, alt_a_frac_1;
@@ -2624,7 +2737,7 @@ static void xfp_remainder(XFP *a, XFP *b, XFP *result, RM rounding_mode)
     }
 
     b_frac |= 0x8000000000000000ull;
-    result_sign = a_sign;
+    r_sign = a_sign;
     exp_diff = a_exp - b_exp;
     a_frac_1 = 0;
     if (exp_diff < 0) {
@@ -2679,10 +2792,10 @@ static void xfp_remainder(XFP *a, XFP *b, XFP *result, RM rounding_mode)
          (q & 1))) {
         a_frac_0 = alt_a_frac_0;
         a_frac_1 = alt_a_frac_1;
-        result_sign = result_sign ? 0 : 1;
+        r_sign = r_sign ? 0 : 1;
     }
 
-    normalize_round_pack_xfp(result_sign, b_exp + exp_diff,
+    normalize_round_pack_xfp(r_sign, b_exp + exp_diff,
                              a_frac_0, a_frac_1,
                              rounding_mode, result);
 }
@@ -2810,6 +2923,57 @@ static void store_op3_decimal(DEC *d)
     mau_state.dr.frac = ((t_uint64)d->l >> 32) | (t_uint64)d->l;
 }
 
+static void store_op3_reg(XFP *xfp, XFP *reg)
+{
+    DFP dfp;
+    SFP sfp;
+    XFP xfp_r;
+
+    if (mau_state.ntnan) {
+        reg->sign_exp = GEN_NONTRAPPING_NAN.sign_exp;
+        reg->frac = GEN_NONTRAPPING_NAN.frac;
+    } else {
+        switch(mau_state.op3) {
+        case M_OP3_F0_SINGLE:
+        case M_OP3_F1_SINGLE:
+        case M_OP3_F2_SINGLE:
+        case M_OP3_F3_SINGLE:
+            sfp = xfp_to_sfp(xfp, MAU_RM);
+            sfp_to_xfp(sfp, &xfp_r);
+            reg->sign_exp = xfp_r.sign_exp;
+            reg->frac = xfp_r.frac;
+            reg->s = xfp_r.s;
+            break;
+        case M_OP3_F0_DOUBLE:
+        case M_OP3_F1_DOUBLE:
+        case M_OP3_F2_DOUBLE:
+        case M_OP3_F3_DOUBLE:
+            dfp = xfp_to_dfp(xfp, MAU_RM);
+            dfp_to_xfp(dfp, &xfp_r);
+            reg->sign_exp = xfp_r.sign_exp;
+            reg->frac = xfp_r.frac;
+            reg->s = xfp_r.s;
+            break;
+        case M_OP3_F0_TRIPLE:
+        case M_OP3_F1_TRIPLE:
+        case M_OP3_F2_TRIPLE:
+        case M_OP3_F3_TRIPLE:
+            reg->sign_exp = xfp->sign_exp;
+            reg->frac = xfp->frac;
+            reg->s = xfp->s;
+            break;
+        }
+    }
+    if (set_nz()) {
+        if (XFP_SIGN(xfp)) {
+            mau_state.asr |= MAU_ASR_N;
+        }
+        if (XFP_EXP(xfp) == 0 && XFP_FRAC(xfp) == 0) {
+            mau_state.asr |= MAU_ASR_Z;
+        }
+    }
+}
+
 static void store_op3(XFP *xfp)
 {
     DFP dfp;
@@ -2837,78 +3001,22 @@ static void store_op3(XFP *xfp)
     case M_OP3_F0_SINGLE:
     case M_OP3_F0_DOUBLE:
     case M_OP3_F0_TRIPLE:
-        if (mau_state.ntnan) {
-            mau_state.f0.sign_exp = GEN_NONTRAPPING_NAN.sign_exp;
-            mau_state.f0.frac = GEN_NONTRAPPING_NAN.frac;
-        } else {
-            mau_state.f0.sign_exp = xfp->sign_exp;
-            mau_state.f0.frac = xfp->frac;
-        }
-        if (set_nz()) {
-            if (XFP_SIGN(xfp)) {
-                mau_state.asr |= MAU_ASR_N;
-            }
-            if (XFP_EXP(xfp) == 0 && XFP_FRAC(xfp) == 0) {
-                mau_state.asr |= MAU_ASR_Z;
-            }
-        }
+        store_op3_reg(xfp, &mau_state.f0);
         break;
     case M_OP3_F1_SINGLE:
     case M_OP3_F1_DOUBLE:
     case M_OP3_F1_TRIPLE:
-        if (mau_state.ntnan) {
-            mau_state.f1.sign_exp = GEN_NONTRAPPING_NAN.sign_exp;
-            mau_state.f1.frac = GEN_NONTRAPPING_NAN.frac;
-        } else {
-            mau_state.f1.sign_exp = xfp->sign_exp;
-            mau_state.f1.frac = xfp->frac;
-        }
-        if (set_nz()) {
-            if (XFP_SIGN(xfp)) {
-                mau_state.asr |= MAU_ASR_N;
-            }
-            if (XFP_EXP(xfp) == 0 && XFP_FRAC(xfp) == 0) {
-                mau_state.asr |= MAU_ASR_Z;
-            }
-        }
+        store_op3_reg(xfp, &mau_state.f1);
         break;
     case M_OP3_F2_SINGLE:
     case M_OP3_F2_DOUBLE:
     case M_OP3_F2_TRIPLE:
-        if (mau_state.ntnan) {
-            mau_state.f2.sign_exp = GEN_NONTRAPPING_NAN.sign_exp;
-            mau_state.f2.frac = GEN_NONTRAPPING_NAN.frac;
-        } else {
-            mau_state.f2.sign_exp = xfp->sign_exp;
-            mau_state.f2.frac = xfp->frac;
-        }
-        if (set_nz()) {
-            if (XFP_SIGN(xfp)) {
-                mau_state.asr |= MAU_ASR_N;
-            }
-            if (XFP_EXP(xfp) == 0 && XFP_FRAC(xfp) == 0) {
-                mau_state.asr |= MAU_ASR_Z;
-            }
-        }
+        store_op3_reg(xfp, &mau_state.f2);
         break;
     case M_OP3_F3_SINGLE:
     case M_OP3_F3_DOUBLE:
     case M_OP3_F3_TRIPLE:
-        if (mau_state.ntnan) {
-            mau_state.f3.sign_exp = GEN_NONTRAPPING_NAN.sign_exp;
-            mau_state.f3.frac = GEN_NONTRAPPING_NAN.frac;
-        } else {
-            mau_state.f3.sign_exp = xfp->sign_exp;
-            mau_state.f3.frac = xfp->frac;
-        }
-        if (set_nz()) {
-            if (XFP_SIGN(xfp)) {
-                mau_state.asr |= MAU_ASR_N;
-            }
-            if (XFP_EXP(xfp) == 0 && XFP_FRAC(xfp) == 0) {
-                mau_state.asr |= MAU_ASR_Z;
-            }
-        }
+        store_op3_reg(xfp, &mau_state.f3);
         break;
     case M_OP3_MEM_SINGLE:
         if (mau_state.ntnan) {
@@ -3164,7 +3272,7 @@ static void mau_dtof()
     XFP result;
 
     load_op1_decimal(&d);
-    mau_decimal_to_xfp(&d, &result, MAU_RM);
+    mau_decimal_to_xfp(&d, &result);
     store_op3(&result);
 }
 
diff --git a/3B2/3b2_mau.h b/3B2/3b2_mau.h
index 53825fe1..e71b92ac 100644
--- a/3B2/3b2_mau.h
+++ b/3B2/3b2_mau.h
@@ -282,6 +282,14 @@ typedef enum {
     M_OP_NONE
 } op_spec;
 
+/*
+ * 128-bit value
+ */
+typedef struct {
+    t_uint64 low;
+    t_uint64 high;
+} t_mau_128;
+
 /*
  * Not-a-Number Type
  */
@@ -289,7 +297,7 @@ typedef struct {
     t_bool sign;
     t_uint64 high;
     t_uint64 low;
-} NAN_T;
+} T_NAN;
 
 /*
  * Extended Precision (80 bits).