849 if constexpr (modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
850 return montgomery_mul_big(*
this);
852#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
853 uint64_t carry_hi = 0;
855 auto [t0, carry_lo] = mul_wide(
data[0],
data[0]);
856 uint64_t t1 = square_accumulate(0,
data[1],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
857 uint64_t t2 = square_accumulate(0,
data[2],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
858 uint64_t t3 = square_accumulate(0,
data[3],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
860 uint64_t round_carry = carry_lo;
861 uint64_t k = t0 * T::r_inv;
862 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
863 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
864 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
865 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
866 t3 = carry_lo + round_carry;
868 t1 = mac_mini(t1,
data[1],
data[1], carry_lo);
870 t2 = square_accumulate(t2,
data[2],
data[1], carry_lo, carry_hi, carry_lo, carry_hi);
871 t3 = square_accumulate(t3,
data[3],
data[1], carry_lo, carry_hi, carry_lo, carry_hi);
872 round_carry = carry_lo;
874 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
875 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
876 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
877 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
878 t3 = carry_lo + round_carry;
880 t2 = mac_mini(t2,
data[2],
data[2], carry_lo);
882 t3 = square_accumulate(t3,
data[3],
data[2], carry_lo, carry_hi, carry_lo, carry_hi);
883 round_carry = carry_lo;
885 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
886 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
887 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
888 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
889 t3 = carry_lo + round_carry;
891 t3 = mac_mini(t3,
data[3],
data[3], carry_lo);
893 round_carry = carry_lo;
894 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
895 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
896 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
897 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
898 t3 = carry_lo + round_carry;
899 return { t0, t1, t2, t3 };
902 auto left = wasm_convert(
data);
903 constexpr uint64_t mask = 0x1fffffff;
914 uint64_t temp_10 = 0;
915 uint64_t temp_11 = 0;
916 uint64_t temp_12 = 0;
917 uint64_t temp_13 = 0;
918 uint64_t temp_14 = 0;
919 uint64_t temp_15 = 0;
920 uint64_t temp_16 = 0;
923 temp_0 += left[0] * left[0];
925 acc += left[0] * left[1];
926 temp_1 += (acc << 1);
928 acc += left[0] * left[2];
929 temp_2 += left[1] * left[1];
930 temp_2 += (acc << 1);
932 acc += left[0] * left[3];
933 acc += left[1] * left[2];
934 temp_3 += (acc << 1);
936 acc += left[0] * left[4];
937 acc += left[1] * left[3];
938 temp_4 += left[2] * left[2];
939 temp_4 += (acc << 1);
941 acc += left[0] * left[5];
942 acc += left[1] * left[4];
943 acc += left[2] * left[3];
944 temp_5 += (acc << 1);
946 acc += left[0] * left[6];
947 acc += left[1] * left[5];
948 acc += left[2] * left[4];
949 temp_6 += left[3] * left[3];
950 temp_6 += (acc << 1);
952 acc += left[0] * left[7];
953 acc += left[1] * left[6];
954 acc += left[2] * left[5];
955 acc += left[3] * left[4];
956 temp_7 += (acc << 1);
958 acc += left[0] * left[8];
959 acc += left[1] * left[7];
960 acc += left[2] * left[6];
961 acc += left[3] * left[5];
962 temp_8 += left[4] * left[4];
963 temp_8 += (acc << 1);
965 acc += left[1] * left[8];
966 acc += left[2] * left[7];
967 acc += left[3] * left[6];
968 acc += left[4] * left[5];
969 temp_9 += (acc << 1);
971 acc += left[2] * left[8];
972 acc += left[3] * left[7];
973 acc += left[4] * left[6];
974 temp_10 += left[5] * left[5];
975 temp_10 += (acc << 1);
977 acc += left[3] * left[8];
978 acc += left[4] * left[7];
979 acc += left[5] * left[6];
980 temp_11 += (acc << 1);
982 acc += left[4] * left[8];
983 acc += left[5] * left[7];
984 temp_12 += left[6] * left[6];
985 temp_12 += (acc << 1);
987 acc += left[5] * left[8];
988 acc += left[6] * left[7];
989 temp_13 += (acc << 1);
991 acc += left[6] * left[8];
992 temp_14 += left[7] * left[7];
993 temp_14 += (acc << 1);
995 acc += left[7] * left[8];
996 temp_15 += (acc << 1);
997 temp_16 += left[8] * left[8];
1001 wasm_reduce_yuval(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
1002 wasm_reduce_yuval(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
1003 wasm_reduce_yuval(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
1004 wasm_reduce_yuval(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
1005 wasm_reduce_yuval(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
1006 wasm_reduce_yuval(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
1007 wasm_reduce_yuval(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
1008 wasm_reduce_yuval(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
1026 wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
1044 return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
1045 (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
1046 (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
1047 (temp_15 >> 18) | (temp_16 << 11) };