diff --git a/bench/eth_bench.cpp b/bench/eth_bench.cpp
index 6e93967..16d5b5d 100644
--- a/bench/eth_bench.cpp
+++ b/bench/eth_bench.cpp
@@ -82,13 +82,13 @@ fp12 random_fe12()
 g1 random_g1()
 {
     array<uint64_t, 4> k = random_scalar();
-    return g1::one().mulScalar(k);
+    return g1::one().scale(k);
 }
 
 g2 random_g2()
 {
     array<uint64_t, 4> k = random_scalar();
-    return g2::one().mulScalar(k);
+    return g2::one().scale(k);
 }
 
 void benchG1Add() {
@@ -113,7 +113,20 @@ void benchG1Mul() {
     auto start = startStopwatch();
 
     for (int i = 0; i < numIters; i++) {
-        p.mulScalar(s);
+        p.scale(s);
+    }
+    endStopwatch(testName, start, numIters);
+}
+
+void benchG1WeightedSum() {
+    string testName = "G1 WeightedSum";
+    const int numIters = 10000;
+    g1 p = random_g1();
+    vector<g1> bases{8, p};
+    vector<array<uint64_t, 4>> scalars{8, {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}};
+    auto start = startStopwatch();
+    for (int i = 0; i < numIters; i++) {
+        g1::weightedSum(bases, scalars);
     }
     endStopwatch(testName, start, numIters);
 }
@@ -140,7 +153,29 @@ void benchG2Mul() {
     auto start = startStopwatch();
 
     for (int i = 0; i < numIters; i++) {
-        p.mulScalar(s);
+        p.scale(s);
+    }
+    endStopwatch(testName, start, numIters);
+}
+
+void benchG2WeightedSum() {
+    string testName = "G2 WeightedSum";
+    const int numIters = 10000;
+    g2 p = random_g2();
+    vector<g2> bases = {p,p,p,p,p,p,p,p};
+    vector<array<uint64_t, 4>> scalars = {
+        {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff},
+        {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff},
+        {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff},
+        {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff},
+        {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff},
+        {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff},
+        {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff},
+        {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}
+    };
+    auto start = startStopwatch();
+    for (int i = 0; i < numIters; i++) {
+        g2::weightedSum(bases, scalars);
     }
     endStopwatch(testName, start, numIters);
 }
@@ -161,11 +196,100 @@ void benchPairing() {
     endStopwatch(testName, start, numIters);
 }
 
+void benchG1Add2() {
+    string testName = "G1 Addition With different settings";
+    cout << endl << testName << endl;
+
+    const int numIters = 10000;
+    g1 pbak = random_g1();
+    
+   
+
+    array<uint64_t, 4> s = {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff};
+
+
+    auto performTest = [numIters, s, pbak](bool check, bool raw) {
+        array<uint8_t, 144> pRaw = {};
+        g1 p = pbak;
+        p.toJacobianBytesLE(pRaw, raw);
+
+        auto start = startStopwatch();
+
+        for (int i = 0; i < numIters; i++) {
+            p = *g1::fromJacobianBytesLE(pRaw, check, raw);
+            p.add(p);
+            p.toJacobianBytesLE(pRaw, raw);
+        }
+        endStopwatch(string("check=") + std::to_string(check) + string(", raw=") + std::to_string(raw), start, numIters);
+
+        start = startStopwatch();
+    };
+
+    performTest(true, true);
+    performTest(true, false);
+    performTest(false, true);
+    performTest(false, false);
+
+}
+
+void benchG2Add2() {
+    string testName = "G2 Addition With different settings";
+    cout << endl << testName << endl;
+
+    const int numIters = 10000;
+    g2 pbak = random_g2();
+    
+   
+
+    array<uint64_t, 4> s = {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff};
+
+
+    auto performTest = [numIters, s, pbak](bool check, bool raw) {
+        array<uint8_t, 288> pRaw = {};
+        g2 p = pbak;
+        p.toJacobianBytesLE(pRaw, raw);
+
+        auto start = startStopwatch();
+
+        for (int i = 0; i < numIters; i++) {
+            p = *g2::fromJacobianBytesLE(pRaw, check, raw);
+            p.add(p);
+            p.toJacobianBytesLE(pRaw, raw);
+        }
+        endStopwatch(string("check=") + std::to_string(check) + string(", raw=") + std::to_string(raw), start, numIters);
+
+        start = startStopwatch();
+    };
+
+    performTest(true, true);
+    performTest(true, false);
+    performTest(false, true);
+    performTest(false, false);
+
+}
+
+void benchInverse() {
+    string testName = "Inverse";
+    const int numIters = 10000;
+    fp a = random_fe();
+    auto start = startStopwatch();
+
+    for (int i = 0; i < numIters; i++) {
+        a.inverse();
+    }
+    endStopwatch(testName, start, numIters);
+}
+
 int main(int argc, char* argv[])
 {
     benchG1Add();
     benchG1Mul();
+    benchG1WeightedSum();
     benchG2Add();
     benchG2Mul();
+    benchG2WeightedSum();
     benchPairing();
+    benchG1Add2();
+    benchG2Add2();
+    benchInverse();
 }
\ No newline at end of file
diff --git a/include/bls12-381/arithmetic.hpp b/include/bls12-381/arithmetic.hpp
index eb4cb5b..1006b55 100644
--- a/include/bls12-381/arithmetic.hpp
+++ b/include/bls12-381/arithmetic.hpp
@@ -12,24 +12,27 @@ class fp;
 void init(bool cpu_features = true);
 
 void _add(fp* z, const fp* x, const fp* y);
-void _addAssign(fp* x, const fp* y);
-void _ladd(fp* z, const fp* x, const fp* y);
-void _laddAssign(fp* x, const fp* y);
 void _double(fp* z, const fp* x);
-void _doubleAssign(fp* z);
-void _ldouble(fp* z, const fp* x);
-void _sub(fp* z, const fp* x, const fp* y);
-void _subAssign(fp* z, const fp* x);
-void _lsubAssign(fp* z, const fp* x);
-void _neg(fp* z, const fp* x);
+void _subtract(fp* z, const fp* x, const fp* y);
+void _negate(fp* z, const fp* x);
 void _square(fp* z, const fp* x);
 
+// Those are low level calls that will not perform modular reduction after the operation 
+// Please use with caution. Overflow will happen.
+void _ladd(fp* z, const fp* x, const fp* y);
+void _ldouble(fp* z, const fp* x);
+void _lsubtract(fp* z, const fp* x, const fp* y);
+
+// Multiplication will work fine as long as both operands are smaller than around 4p. 
+// The "smaller than 4p" here means the montgomery form itself as number is less than 4p.
+// Therefore, at most ONE _ladd/_lsubstract/_ldouble is allowed before passing the result to _multiply,
+// unless the algorithm makes sure the number is small.
 #if defined(__x86_64__) && defined(__ELF__)
-extern void _mul(fp*, const fp*, const fp*);
+extern void _multiply(fp*, const fp*, const fp*);
 #elif defined(__x86_64__)
-extern void (*_mul)(fp*, const fp*, const fp*);
+extern void (*_multiply)(fp*, const fp*, const fp*);
 #else
-void _mul(fp*, const fp*, const fp*);
+void _multiply(fp*, const fp*, const fp*);
 #endif
 
 // Add64 returns the sum with carry of x, y and carry: sum = x + y + carry.
diff --git a/include/bls12-381/fp.hpp b/include/bls12-381/fp.hpp
index e6a3d91..6519647 100644
--- a/include/bls12-381/fp.hpp
+++ b/include/bls12-381/fp.hpp
@@ -19,10 +19,10 @@ class fp
     std::array<uint64_t, 6> d;
 
     fp();
-    fp(const std::array<uint64_t, 6>& d);
+    explicit fp(const std::array<uint64_t, 6>& d);
     fp(const fp& e);
-    static std::optional<fp> fromBytesBE(const std::span<const uint8_t, 48> in, const bool check = false, const bool raw = false);
-    static std::optional<fp> fromBytesLE(const std::span<const uint8_t, 48> in, const bool check = false, const bool raw = false);
+    static std::optional<fp> fromBytesBE(const std::span<const uint8_t, 48> in, const bool check = true, const bool raw = false);
+    static std::optional<fp> fromBytesLE(const std::span<const uint8_t, 48> in, const bool check = true, const bool raw = false);
     void toBytesBE(const std::span<uint8_t, 48> out, const bool raw = false) const;
     void toBytesLE(const std::span<uint8_t, 48> out, const bool raw = false) const;
     std::array<uint8_t, 48> toBytesBE(const bool raw = false) const;
@@ -34,9 +34,35 @@ class fp
     bool isEven() const;
     bool isZero() const;
     bool isOne() const;
-    std::strong_ordering cmp(const fp& e) const;
+    constexpr std::strong_ordering cmp(const fp& e) const {
+        for(int64_t i = 5; i >= 0; i--)
+        {
+            if(d[i] < e.d[i])
+            {
+                return std::strong_ordering::less;
+            }
+            if(d[i] > e.d[i])
+            {
+                return std::strong_ordering::greater;
+            }
+        }
+        return std::strong_ordering::equal;
+    };
     bool equal(const fp& e) const;
     bool sign() const;
+    
+    fp add(const fp& e) const;
+    void addAssign(const fp& e);
+    fp dbl() const;
+    void doubleAssign();
+    fp subtract(const fp& e) const;
+    void subtractAssign(const fp& e);
+    fp negate() const;
+    fp multiply(const fp& e) const;
+    void multiplyAssign(const fp& e);
+    fp square() const;
+    void squareAssign();
+
     void div2(const uint64_t& e);
     uint64_t mul2();
     fp toMont() const;
@@ -49,7 +75,11 @@ class fp
     bool isLexicographicallyLargest() const;
     template<size_t N> static fp modPrime(const std::array<uint64_t, N>& k);
 
-    std::strong_ordering operator<=>(const fp& e) const { return cmp(e); }
+    // Those operators are defined to support set and map.
+    // They are mathematically correctly in certain cases.
+    // However, there are still ambiguity there as the fp can be in Montgomery form or not.
+    // Please avoid using those operators.
+    constexpr std::strong_ordering operator<=>(const fp& e) const { return cmp(e); }
 
     static const fp MODULUS;                            // base field modulus: p = 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559787 or 0x1A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAB
     static const uint64_t INP;                          // INP = -(p^{-1} mod 2^64) mod 2^64
@@ -74,10 +104,10 @@ class fp2
     fp c1;
 
     fp2();
-    fp2(const std::array<fp, 2>& e2);
+    explicit fp2(const std::array<fp, 2>& e2);
     fp2(const fp2& e);
-    static std::optional<fp2> fromBytesBE(const std::span<const uint8_t, 96> in, const bool check = false, const bool raw = false);
-    static std::optional<fp2> fromBytesLE(const std::span<const uint8_t, 96> in, const bool check = false, const bool raw = false);
+    static std::optional<fp2> fromBytesBE(const std::span<const uint8_t, 96> in, const bool check = true, const bool raw = false);
+    static std::optional<fp2> fromBytesLE(const std::span<const uint8_t, 96> in, const bool check = true, const bool raw = false);
     void toBytesBE(const std::span<uint8_t, 96> out, const bool raw = false) const;
     void toBytesLE(const std::span<uint8_t, 96> out, const bool raw = false) const;
     std::array<uint8_t, 96> toBytesBE(const bool raw = false) const;
@@ -90,16 +120,14 @@ class fp2
     bool sign() const;
     fp2 add(const fp2& e) const;
     void addAssign(const fp2& e);
-    fp2 ladd(const fp2& e) const;
     fp2 dbl() const;
     void doubleAssign();
-    fp2 ldouble() const;
-    fp2 sub(const fp2& e) const;
-    void subAssign(const fp2& e);
-    fp2 neg() const;
-    fp2 conj() const;
-    fp2 mul(const fp2& e) const;
-    void mulAssign(const fp2& e);
+    fp2 subtract(const fp2& e) const;
+    void subtractAssign(const fp2& e);
+    fp2 negate() const;
+    fp2 conjugate() const;
+    fp2 multiply(const fp2& e) const;
+    void multiplyAssign(const fp2& e);
     fp2 square() const;
     void squareAssign();
     fp2 mulByNonResidue() const;
@@ -113,6 +141,9 @@ class fp2
     bool isQuadraticNonResidue() const;
     bool isLexicographicallyLargest() const;
 
+    // Those operators are defined to support set and map.
+    // They are not mathematically correct.
+    // DO NOT use them to compare fp2.
     auto operator<=>(const fp2&) const = default;
 
     static const fp2 negativeOne2;
@@ -132,10 +163,10 @@ class fp6
     fp2 c2;
 
     fp6();
-    fp6(const std::array<fp2, 3>& e3);
+    explicit fp6(const std::array<fp2, 3>& e3);
     fp6(const fp6& e);
-    static std::optional<fp6> fromBytesBE(const std::span<const uint8_t, 288> in, const bool check = false, const bool raw = false);
-    static std::optional<fp6> fromBytesLE(const std::span<const uint8_t, 288> in, const bool check = false, const bool raw = false);
+    static std::optional<fp6> fromBytesBE(const std::span<const uint8_t, 288> in, const bool check = true, const bool raw = false);
+    static std::optional<fp6> fromBytesLE(const std::span<const uint8_t, 288> in, const bool check = true, const bool raw = false);
     void toBytesBE(const std::span<uint8_t, 288> out, const bool raw = false) const;
     void toBytesLE(const std::span<uint8_t, 288> out, const bool raw = false) const;
     std::array<uint8_t, 288> toBytesBE(const bool raw = false) const;
@@ -149,12 +180,13 @@ class fp6
     void addAssign(const fp6& e);
     fp6 dbl() const;
     void doubleAssign();
-    fp6 sub(const fp6& e) const;
-    void subAssign(const fp6& e);
-    fp6 neg() const;
-    fp6 mul(const fp6& e) const;
-    void mulAssign(const fp6& e);
+    fp6 subtract(const fp6& e) const;
+    void subtractAssign(const fp6& e);
+    fp6 negate() const;
+    fp6 multiply(const fp6& e) const;
+    void multiplyAssign(const fp6& e);
     fp6 square() const;
+    void squareAssign();
     void mulBy01Assign(const fp2& e0, const fp2& e1);
     fp6 mulBy01(const fp2& e0, const fp2& e1) const;
     fp6 mulBy1(const fp2& e1) const;
@@ -179,10 +211,10 @@ class fp12
     fp6 c1;
 
     fp12();
-    fp12(const std::array<fp6, 2>& e2);
+    explicit fp12(const std::array<fp6, 2>& e2);
     fp12(const fp12& e);
-    static std::optional<fp12> fromBytesBE(const std::span<const uint8_t, 576> in, const bool check = false, const bool raw = false);
-    static std::optional<fp12> fromBytesLE(const std::span<const uint8_t, 576> in, const bool check = false, const bool raw = false);
+    static std::optional<fp12> fromBytesBE(const std::span<const uint8_t, 576> in, const bool check = true, const bool raw = false);
+    static std::optional<fp12> fromBytesLE(const std::span<const uint8_t, 576> in, const bool check = true, const bool raw = false);
     void toBytesBE(const std::span<uint8_t, 576> out, const bool raw = false) const;
     void toBytesLE(const std::span<uint8_t, 576> out, const bool raw = false) const;
     std::array<uint8_t, 576> toBytesBE(const bool raw = false) const;
@@ -194,14 +226,19 @@ class fp12
     bool isGtValid() const;
     bool equal(const fp12& e) const;
     fp12 add(const fp12& e) const;
+    void addAssign(const fp12& e);
     fp12 dbl() const;
-    fp12 sub(const fp12& e) const;
-    fp12 neg() const;
+    void doubleAssign();
+    fp12 subtract(const fp12& e) const;
+    void subtractAssign(const fp12& e);
+    fp12 negate() const;
     fp12 conjugate() const;
     fp12 square() const;
+    void squareAssign();
     fp12 cyclotomicSquare() const;
-    fp12 mul(const fp12& e) const;
-    void mulAssign(const fp12& e);
+    void cyclotomicSquareAssign();
+    fp12 multiply(const fp12& e) const;
+    void multiplyAssign(const fp12& e);
     static std::tuple<fp2, fp2> fp4Square(const fp2& e0, const fp2& e1);
     fp12 inverse() const;
     void mulBy014Assign(const fp2& e0, const fp2& e1, const fp2& e4);
diff --git a/include/bls12-381/g.hpp b/include/bls12-381/g.hpp
index b849183..9daad3a 100644
--- a/include/bls12-381/g.hpp
+++ b/include/bls12-381/g.hpp
@@ -25,7 +25,7 @@ class g1
     fp z;
 
     g1();
-    g1(const std::array<fp, 3>& e3);
+    explicit g1(const std::array<fp, 3>& e3);
     g1(const g1& e);
     static std::optional<g1> fromJacobianBytesBE(const std::span<const uint8_t, 144> in, const bool check = false, const bool raw = false);
     static std::optional<g1> fromJacobianBytesLE(const std::span<const uint8_t, 144> in, const bool check = false, const bool raw = false);
@@ -51,16 +51,22 @@ class g1
     bool isAffine() const;
     g1 affine() const;
     g1 add(const g1& e) const;
+    void addAssign(const g1& e);
     g1 dbl() const;
-    g1 neg() const;
-    g1 sub(const g1& e) const;
-    template<size_t N> g1 mulScalar(const std::array<uint64_t, N>& s) const;
+    void doubleAssign();
+    g1 negate() const;
+    g1 subtract(const g1& e) const;
+    void subtractAssign(const g1& e);
+    template<size_t N> g1 scale(const std::array<uint64_t, N>& s) const;
     g1 clearCofactor() const;
     g1 glvEndomorphism() const;
-
+    
+    // Those operators are defined to support set and map.
+    // They are not mathematically correct.
+    // DO NOT use them to compare g1.
     auto operator<=>(const g1&) const = default;
    
-    static std::optional<g1> multiExp(std::span<const g1> points, std::span<const std::array<uint64_t, 4>> scalars, std::function<void()> yield = std::function<void()>());
+    static g1 weightedSum(std::span<const g1> points, std::span<const std::array<uint64_t, 4>> scalars, const std::function<void()>& yield = std::function<void()>());
     static g1 mapToCurve(const fp& e);
     static std::tuple<fp, fp> swuMapG1(const fp& e);
     static void isogenyMapG1(fp& x, fp& y);
@@ -81,7 +87,7 @@ class g2
     fp2 z;
 
     g2();
-    g2(const std::array<fp2, 3>& e3);
+    explicit g2(const std::array<fp2, 3>& e3);
     g2(const g2& e);
     static std::optional<g2> fromJacobianBytesBE(const std::span<const uint8_t, 288> in, const bool check = false, const bool raw = false);
     static std::optional<g2> fromJacobianBytesLE(const std::span<const uint8_t, 288> in, const bool check = false, const bool raw = false);
@@ -107,17 +113,23 @@ class g2
     bool isAffine() const;
     g2 affine() const;
     g2 add(const g2& e) const;
+    void addAssign(const g2& e);
     g2 dbl() const;
-    g2 neg() const;
-    g2 sub(const g2& e) const;
+    void doubleAssign();
+    g2 negate() const;
+    g2 subtract(const g2& e) const;
+    void subtractAssign(const g2& e);
     g2 psi() const;
-    template<size_t N> g2 mulScalar(const std::array<uint64_t, N>& s) const;
+    template<size_t N> g2 scale(const std::array<uint64_t, N>& s) const;
     g2 clearCofactor() const;
     g2 frobeniusMap(int64_t power) const;
 
+    // Those operators are defined to support set and map.
+    // They are not mathematically correct.
+    // DO NOT use them to compare g2.
     auto operator<=>(const g2&) const = default;
-   
-    static std::optional<g2> multiExp(std::span<const g2> points, std::span<const std::array<uint64_t, 4>> scalars, std::function<void()> yield = std::function<void()>());
+
+    static g2 weightedSum(std::span<const g2> points, std::span<const std::array<uint64_t, 4>> scalars, const std::function<void()>& yield = std::function<void()>());
     static g2 mapToCurve(const fp2& e);
     static std::tuple<fp2, fp2> swuMapG2(const fp2& e);
     //static void isogenyMapG2(fp2& x, fp2& y);
diff --git a/include/bls12-381/scalar.hpp b/include/bls12-381/scalar.hpp
index 1448fdf..f53a259 100644
--- a/include/bls12-381/scalar.hpp
+++ b/include/bls12-381/scalar.hpp
@@ -125,7 +125,7 @@ std::array<uint64_t, NC> add(const std::array<uint64_t, NA>& a, const std::array
 
 // multiplies two std::arrays: calculates c = a * b
 template<size_t NC, size_t NA, size_t NB>
-std::array<uint64_t, NC> mul(const std::array<uint64_t, NA>& a, const std::array<uint64_t, NB>& b)
+std::array<uint64_t, NC> multiply(const std::array<uint64_t, NA>& a, const std::array<uint64_t, NB>& b)
 {
     std::array<uint64_t, NC> c;
     memset(c.data(), 0, NC * sizeof(uint64_t));
@@ -255,10 +255,10 @@ fp fp::exp(const std::array<uint64_t, N>& s) const
     uint64_t l = scalar::bitLength(s);
     for(int64_t i = l - 1; i >= 0; i--)
     {
-        _mul(&z, &z, &z);
+        z.squareAssign();
         if((s[i/64] >> (i%64) & 1) == 1)
         {
-            _mul(&z, &z, this);
+            z.multiplyAssign(*this);
         }
     }
     return z;
@@ -271,10 +271,10 @@ fp2 fp2::exp(const std::array<uint64_t, N>& s) const
     uint64_t l = scalar::bitLength(s);
     for(int64_t i = l - 1; i >= 0; i--)
     {
-        z = z.square();
+        z.squareAssign();
         if((s[i/64] >> (i%64) & 1) == 1)
         {
-            z = z.mul(*this);
+            z.multiplyAssign(*this);
         }
     }
     return z;
@@ -287,10 +287,10 @@ fp6 fp6::exp(const std::array<uint64_t, N>& s) const
     uint64_t l = scalar::bitLength(s);
     for(int64_t i = l - 1; i >= 0; i--)
     {
-        z = z.square();
+        z.squareAssign();
         if((s[i/64] >> (i%64) & 1) == 1)
         {
-            z = z.mul(*this);
+            z.multiplyAssign(*this);
         }
     }
     return z;
@@ -302,10 +302,10 @@ template<size_t N> fp12 fp12::exp(const std::array<uint64_t, N>& s) const
     uint64_t l = scalar::bitLength(s);
     for(int64_t i = l - 1; i >= 0; i--)
     {
-        z = z.square();
+        z.squareAssign();
         if((s[i/64] >> (i%64) & 1) == 1)
         {
-            z = z.mul(*this);
+            z.multiplyAssign(*this);
         }
     }
     return z;
@@ -318,17 +318,17 @@ fp12 fp12::cyclotomicExp(const std::array<uint64_t, N>& s) const
     uint64_t l = scalar::bitLength(s);
     for(int64_t i = l - 1; i >= 0; i--)
     {
-        z = z.cyclotomicSquare();
+        z.cyclotomicSquareAssign();
         if((s[i/64] >> (i%64) & 1) == 1)
         {
-            z = z.mul(*this);
+            z.multiplyAssign(*this);
         }
     }
     return z;
 }
 
 template<size_t N>
-g1 g1::mulScalar(const std::array<uint64_t, N>& s) const
+g1 g1::scale(const std::array<uint64_t, N>& s) const
 {
     g1 q = g1({fp::zero(), fp::zero(), fp::zero()});
     g1 n = *this;
@@ -345,7 +345,7 @@ g1 g1::mulScalar(const std::array<uint64_t, N>& s) const
 }
 
 template<size_t N>
-g2 g2::mulScalar(const std::array<uint64_t, N>& s) const
+g2 g2::scale(const std::array<uint64_t, N>& s) const
 {
     g2 q = g2({fp2::zero(), fp2::zero(), fp2::zero()});
     g2 n = *this;
diff --git a/src/arithmetic.cpp b/src/arithmetic.cpp
index 56fa924..d5f2650 100644
--- a/src/arithmetic.cpp
+++ b/src/arithmetic.cpp
@@ -96,93 +96,6 @@ void _add(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
-void _addAssign(fp* x, const fp* y)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // x => %rdi
-    // y => %rsi
-    // callee needs to restore registers r15, r14, r13, r12, rbx before returning
-    asm("push %r15;");
-    asm("push %r14;");
-    asm("push %r13;");
-    asm("push %r12;");
-    asm("push %rbx;");
-    asm("mov     (%rdi),%r8;");
-    asm("mov 0x08(%rdi),%r9;");
-    asm("mov 0x10(%rdi),%r10;");
-    asm("mov 0x18(%rdi),%r11;");
-    asm("mov 0x20(%rdi),%r12;");
-    asm("mov 0x28(%rdi),%r13;");
-    asm("add     (%rsi),%r8;");
-    asm("adc 0x08(%rsi),%r9;");
-    asm("adc 0x10(%rsi),%r10;");
-    asm("adc 0x18(%rsi),%r11;");
-    asm("adc 0x20(%rsi),%r12;");
-    asm("adc 0x28(%rsi),%r13;");
-    asm("mov %r8,%r14;");
-    asm("mov %r9,%r15;");
-    asm("mov %r10,%rcx;");
-    asm("mov %r11,%rdx;");
-    asm("mov %r12,%rsi;");
-    asm("mov %r13,%rbx;");
-    asm("mov $0xb9feffffffffaaab,%rax;");
-    asm("sub %rax,%r14;");
-    asm("mov $0x1eabfffeb153ffff,%rax;");
-    asm("sbb %rax,%r15;");
-    asm("mov $0x6730d2a0f6b0f624,%rax;");
-    asm("sbb %rax,%rcx;");
-    asm("mov $0x64774b84f38512bf,%rax;");
-    asm("sbb %rax,%rdx;");
-    asm("mov $0x4b1ba7b6434bacd7,%rax;");
-    asm("sbb %rax,%rsi;");
-    asm("mov $0x1a0111ea397fe69a,%rax;");
-    asm("sbb %rax,%rbx;");
-    asm("cmovae %r14,%r8;");
-    asm("cmovae %r15,%r9;");
-    asm("cmovae %rcx,%r10;");
-    asm("cmovae %rdx,%r11;");
-    asm("cmovae %rsi,%r12;");
-    asm("cmovae %rbx,%r13;");
-    asm("mov %r8,     (%rdi);");
-    asm("mov %r9, 0x08(%rdi);");
-    asm("mov %r10,0x10(%rdi);");
-    asm("mov %r11,0x18(%rdi);");
-    asm("mov %r12,0x20(%rdi);");
-    asm("mov %r13,0x28(%rdi);");
-    asm("pop %rbx;");
-    asm("pop %r12;");
-    asm("pop %r13;");
-    asm("pop %r14;");
-    asm("pop %r15;");
-}
-#else
-void _addAssign(fp* x, const fp* y)
-{
-    uint64_t carry, _;
-
-    tie(x->d[0], carry) = Add64(x->d[0], y->d[0], 0);
-    tie(x->d[1], carry) = Add64(x->d[1], y->d[1], carry);
-    tie(x->d[2], carry) = Add64(x->d[2], y->d[2], carry);
-    tie(x->d[3], carry) = Add64(x->d[3], y->d[3], carry);
-    tie(x->d[4], carry) = Add64(x->d[4], y->d[4], carry);
-    tie(x->d[5], _)     = Add64(x->d[5], y->d[5], carry);
-
-    // if z > q --> z -= q
-    // note: this is NOT constant time
-    if(!(x->d[5] < fp::MODULUS.d[5] || (x->d[5] == fp::MODULUS.d[5] && (x->d[4] < fp::MODULUS.d[4] || (x->d[4] == fp::MODULUS.d[4] && (x->d[3] < fp::MODULUS.d[3] || (x->d[3] == fp::MODULUS.d[3] && (x->d[2] < fp::MODULUS.d[2] || (x->d[2] == fp::MODULUS.d[2] && (x->d[1] < fp::MODULUS.d[1] || (x->d[1] == fp::MODULUS.d[1] && (x->d[0] < fp::MODULUS.d[0]))))))))))))
-    {
-        uint64_t b;
-        tie(x->d[0], b) = Sub64(x->d[0], fp::MODULUS.d[0], 0);
-        tie(x->d[1], b) = Sub64(x->d[1], fp::MODULUS.d[1], b);
-        tie(x->d[2], b) = Sub64(x->d[2], fp::MODULUS.d[2], b);
-        tie(x->d[3], b) = Sub64(x->d[3], fp::MODULUS.d[3], b);
-        tie(x->d[4], b) = Sub64(x->d[4], fp::MODULUS.d[4], b);
-        tie(x->d[5], _) = Sub64(x->d[5], fp::MODULUS.d[5], b);
-    }
-}
-#endif
-
 #ifdef __x86_64__
 void _ladd(fp* z, const fp* x, const fp* y)
 {
@@ -223,45 +136,6 @@ void _ladd(fp* z, const fp* x, const fp* y)
 }
 #endif
 
-#ifdef __x86_64__
-void _laddAssign(fp* x, const fp* y)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // x => %rdi
-    // y => %rsi
-    // callee needs to restore registers r15, r14, r13, r12, rbx (if clobbed) before returning
-    asm("mov     (%rdi),%r8;");
-    asm("mov 0x08(%rdi),%r9;");
-    asm("mov 0x10(%rdi),%r10;");
-    asm("mov 0x18(%rdi),%r11;");
-    asm("mov 0x20(%rdi),%rcx;");
-    asm("mov 0x28(%rdi),%rax;");
-    asm("add     (%rsi),%r8;");
-    asm("adc 0x08(%rsi),%r9;");
-    asm("adc 0x10(%rsi),%r10;");
-    asm("adc 0x18(%rsi),%r11;");
-    asm("adc 0x20(%rsi),%rcx;");
-    asm("adc 0x28(%rsi),%rax;");
-    asm("mov %r8,     (%rdi);");
-    asm("mov %r9, 0x08(%rdi);");
-    asm("mov %r10,0x10(%rdi);");
-    asm("mov %r11,0x18(%rdi);");
-    asm("mov %rcx,0x20(%rdi);");
-    asm("mov %rax,0x28(%rdi);");
-}
-#else
-void _laddAssign(fp* x, const fp* y)
-{
-    uint64_t carry, _;
-    tie(x->d[0], carry) = Add64(x->d[0], y->d[0], 0);
-    tie(x->d[1], carry) = Add64(x->d[1], y->d[1], carry);
-    tie(x->d[2], carry) = Add64(x->d[2], y->d[2], carry);
-    tie(x->d[3], carry) = Add64(x->d[3], y->d[3], carry);
-    tie(x->d[4], carry) = Add64(x->d[4], y->d[4], carry);
-    tie(x->d[5], _)     = Add64(x->d[5], y->d[5], carry);
-}
-#endif
-
 #ifdef __x86_64__
 void _double(fp* z, const fp* x)
 {
@@ -349,92 +223,6 @@ void _double(fp* z, const fp* x)
 }
 #endif
 
-#ifdef __x86_64__
-void _doubleAssign(fp* z)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // z => %rdi
-    // callee needs to restore registers r15, r14, r13, r12, rbx before returning
-    asm("push %r15;");
-    asm("push %r14;");
-    asm("push %r13;");
-    asm("push %r12;");
-    asm("push %rbx;");
-    asm("mov     (%rdi),%r8;");
-    asm("mov 0x08(%rdi),%r9;");
-    asm("mov 0x10(%rdi),%r10;");
-    asm("mov 0x18(%rdi),%r11;");
-    asm("mov 0x20(%rdi),%r12;");
-    asm("mov 0x28(%rdi),%r13;");
-    asm("add %r8,%r8;");
-    asm("adc %r9,%r9;");
-    asm("adc %r10,%r10;");
-    asm("adc %r11,%r11;");
-    asm("adc %r12,%r12;");
-    asm("adc %r13,%r13;");
-    asm("mov %r8,%r14;");
-    asm("mov %r9,%r15;");
-    asm("mov %r10,%rcx;");
-    asm("mov %r11,%rdx;");
-    asm("mov %r12,%rsi;");
-    asm("mov %r13,%rbx;");
-    asm("mov $0xb9feffffffffaaab,%rax;");
-    asm("sub %rax,%r14;");
-    asm("mov $0x1eabfffeb153ffff,%rax;");
-    asm("sbb %rax,%r15;");
-    asm("mov $0x6730d2a0f6b0f624,%rax;");
-    asm("sbb %rax,%rcx;");
-    asm("mov $0x64774b84f38512bf,%rax;");
-    asm("sbb %rax,%rdx;");
-    asm("mov $0x4b1ba7b6434bacd7,%rax;");
-    asm("sbb %rax,%rsi;");
-    asm("mov $0x1a0111ea397fe69a,%rax;");
-    asm("sbb %rax,%rbx;");
-    asm("cmovae %r14,%r8;");
-    asm("cmovae %r15,%r9;");
-    asm("cmovae %rcx,%r10;");
-    asm("cmovae %rdx,%r11;");
-    asm("cmovae %rsi,%r12;");
-    asm("cmovae %rbx,%r13;");
-    asm("mov %r8,     (%rdi);");
-    asm("mov %r9, 0x08(%rdi);");
-    asm("mov %r10,0x10(%rdi);");
-    asm("mov %r11,0x18(%rdi);");
-    asm("mov %r12,0x20(%rdi);");
-    asm("mov %r13,0x28(%rdi);");
-    asm("pop %rbx;");
-    asm("pop %r12;");
-    asm("pop %r13;");
-    asm("pop %r14;");
-    asm("pop %r15;");
-}
-#else
-void _doubleAssign(fp* z)
-{
-    uint64_t carry, _;
-
-    tie(z->d[0], carry) = Add64(z->d[0], z->d[0], 0);
-    tie(z->d[1], carry) = Add64(z->d[1], z->d[1], carry);
-    tie(z->d[2], carry) = Add64(z->d[2], z->d[2], carry);
-    tie(z->d[3], carry) = Add64(z->d[3], z->d[3], carry);
-    tie(z->d[4], carry) = Add64(z->d[4], z->d[4], carry);
-    tie(z->d[5], _)     = Add64(z->d[5], z->d[5], carry);
-
-    // if z > q --> z -= q
-    // note: this is NOT constant time
-    if(!(z->d[5] < fp::MODULUS.d[5] || (z->d[5] == fp::MODULUS.d[5] && (z->d[4] < fp::MODULUS.d[4] || (z->d[4] == fp::MODULUS.d[4] && (z->d[3] < fp::MODULUS.d[3] || (z->d[3] == fp::MODULUS.d[3] && (z->d[2] < fp::MODULUS.d[2] || (z->d[2] == fp::MODULUS.d[2] && (z->d[1] < fp::MODULUS.d[1] || (z->d[1] == fp::MODULUS.d[1] && (z->d[0] < fp::MODULUS.d[0]))))))))))))
-    {
-        uint64_t b;
-        tie(z->d[0], b) = Sub64(z->d[0], fp::MODULUS.d[0], 0);
-        tie(z->d[1], b) = Sub64(z->d[1], fp::MODULUS.d[1], b);
-        tie(z->d[2], b) = Sub64(z->d[2], fp::MODULUS.d[2], b);
-        tie(z->d[3], b) = Sub64(z->d[3], fp::MODULUS.d[3], b);
-        tie(z->d[4], b) = Sub64(z->d[4], fp::MODULUS.d[4], b);
-        tie(z->d[5], _) = Sub64(z->d[5], fp::MODULUS.d[5], b);
-    }
-}
-#endif
-
 #ifdef __x86_64__
 void _ldouble(fp* z, const fp* x)
 {
@@ -476,7 +264,7 @@ void _ldouble(fp* z, const fp* x)
 #endif
 
 #ifdef __x86_64__
-void _sub(fp* z, const fp* x, const fp* y)
+void _subtract(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
     // z => %rdi
@@ -532,7 +320,7 @@ void _sub(fp* z, const fp* x, const fp* y)
     asm("pop %r15;");
 }
 #else
-void _sub(fp* z, const fp* x, const fp* y)
+void _subtract(fp* z, const fp* x, const fp* y)
 {
     uint64_t b;
     tie(z->d[0], b) = Sub64(x->d[0], y->d[0], 0);
@@ -555,102 +343,25 @@ void _sub(fp* z, const fp* x, const fp* y)
 #endif
 
 #ifdef __x86_64__
-void _subAssign(fp* z, const fp* x)
-{
-    // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
-    // z => %rdi
-    // x => %rsi
-    // callee needs to restore registers r15, r14, r13, r12, rbx before returning
-    asm("push %r15;");
-    asm("push %r14;");
-    asm("push %r13;");
-    asm("push %r12;");
-    asm("push %rbx;");
-    asm("xor %rax,%rax;");
-    asm("mov     (%rdi),%r8;");
-    asm("mov 0x08(%rdi),%r9;");
-    asm("mov 0x10(%rdi),%r10;");
-    asm("mov 0x18(%rdi),%r11;");
-    asm("mov 0x20(%rdi),%r12;");
-    asm("mov 0x28(%rdi),%r13;");
-    asm("sub     (%rsi),%r8;");
-    asm("sbb 0x08(%rsi),%r9;");
-    asm("sbb 0x10(%rsi),%r10;");
-    asm("sbb 0x18(%rsi),%r11;");
-    asm("sbb 0x20(%rsi),%r12;");
-    asm("sbb 0x28(%rsi),%r13;");
-    asm("mov $0xb9feffffffffaaab,%r14;");
-    asm("mov $0x1eabfffeb153ffff,%r15;");
-    asm("mov $0x6730d2a0f6b0f624,%rcx;");
-    asm("mov $0x64774b84f38512bf,%rdx;");
-    asm("mov $0x4b1ba7b6434bacd7,%rsi;");
-    asm("mov $0x1a0111ea397fe69a,%rbx;");
-    asm("cmovae %rax,%r14;");
-    asm("cmovae %rax,%r15;");
-    asm("cmovae %rax,%rcx;");
-    asm("cmovae %rax,%rdx;");
-    asm("cmovae %rax,%rsi;");
-    asm("cmovae %rax,%rbx;");
-    asm("add %r14,%r8;");
-    asm("adc %r15,%r9;");
-    asm("adc %rcx,%r10;");
-    asm("adc %rdx,%r11;");
-    asm("adc %rsi,%r12;");
-    asm("adc %rbx,%r13;");
-    asm("mov %r8,     (%rdi);");
-    asm("mov %r9, 0x08(%rdi);");
-    asm("mov %r10,0x10(%rdi);");
-    asm("mov %r11,0x18(%rdi);");
-    asm("mov %r12,0x20(%rdi);");
-    asm("mov %r13,0x28(%rdi);");
-    asm("pop %rbx;");
-    asm("pop %r12;");
-    asm("pop %r13;");
-    asm("pop %r14;");
-    asm("pop %r15;");
-}
-#else
-void _subAssign(fp* z, const fp* x)
-{
-    uint64_t b;
-    tie(z->d[0], b) = Sub64(z->d[0], x->d[0], 0);
-    tie(z->d[1], b) = Sub64(z->d[1], x->d[1], b);
-    tie(z->d[2], b) = Sub64(z->d[2], x->d[2], b);
-    tie(z->d[3], b) = Sub64(z->d[3], x->d[3], b);
-    tie(z->d[4], b) = Sub64(z->d[4], x->d[4], b);
-    tie(z->d[5], b) = Sub64(z->d[5], x->d[5], b);
-    if(b != 0)
-    {
-        uint64_t c, _;
-        tie(z->d[0], c) = Add64(z->d[0], fp::MODULUS.d[0], 0);
-        tie(z->d[1], c) = Add64(z->d[1], fp::MODULUS.d[1], c);
-        tie(z->d[2], c) = Add64(z->d[2], fp::MODULUS.d[2], c);
-        tie(z->d[3], c) = Add64(z->d[3], fp::MODULUS.d[3], c);
-        tie(z->d[4], c) = Add64(z->d[4], fp::MODULUS.d[4], c);
-        tie(z->d[5], _) = Add64(z->d[5], fp::MODULUS.d[5], c);
-    }
-}
-#endif
-
-#ifdef __x86_64__
-void _lsubAssign(fp* z, const fp* x)
+void _lsubtract(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
     // z => %rdi
     // x => %rsi
+    // y => %rdx
     // callee needs to restore registers r15, r14, r13, r12, rbx before returning
-    asm("mov     (%rdi),%r8;");
-    asm("mov 0x08(%rdi),%r9;");
-    asm("mov 0x10(%rdi),%r10;");
-    asm("mov 0x18(%rdi),%r11;");
-    asm("mov 0x20(%rdi),%rcx;");
-    asm("mov 0x28(%rdi),%rax;");
-    asm("sub     (%rsi),%r8;");
-    asm("sbb 0x08(%rsi),%r9;");
-    asm("sbb 0x10(%rsi),%r10;");
-    asm("sbb 0x18(%rsi),%r11;");
-    asm("sbb 0x20(%rsi),%rcx;");
-    asm("sbb 0x28(%rsi),%rax;");
+    asm("mov     (%rsi),%r8;");
+    asm("mov 0x08(%rsi),%r9;");
+    asm("mov 0x10(%rsi),%r10;");
+    asm("mov 0x18(%rsi),%r11;");
+    asm("mov 0x20(%rsi),%rcx;");
+    asm("mov 0x28(%rsi),%rax;");
+    asm("sub     (%rdx),%r8;");
+    asm("sbb 0x08(%rdx),%r9;");
+    asm("sbb 0x10(%rdx),%r10;");
+    asm("sbb 0x18(%rdx),%r11;");
+    asm("sbb 0x20(%rdx),%rcx;");
+    asm("sbb 0x28(%rdx),%rax;");
     asm("mov %r8,     (%rdi);");
     asm("mov %r9, 0x08(%rdi);");
     asm("mov %r10,0x10(%rdi);");
@@ -659,20 +370,20 @@ void _lsubAssign(fp* z, const fp* x)
     asm("mov %rax,0x28(%rdi);");
 }
 #else
-void _lsubAssign(fp* z, const fp* x)
+void _lsubtract(fp* z, const fp* x, const fp* y)
 {
     uint64_t b, _;
-    tie(z->d[0], b) = Sub64(z->d[0], x->d[0], 0);
-    tie(z->d[1], b) = Sub64(z->d[1], x->d[1], b);
-    tie(z->d[2], b) = Sub64(z->d[2], x->d[2], b);
-    tie(z->d[3], b) = Sub64(z->d[3], x->d[3], b);
-    tie(z->d[4], b) = Sub64(z->d[4], x->d[4], b);
-    tie(z->d[5], _) = Sub64(z->d[5], x->d[5], b);
+    tie(z->d[0], b) = Sub64(x->d[0], y->d[0], 0);
+    tie(z->d[1], b) = Sub64(x->d[1], y->d[1], b);
+    tie(z->d[2], b) = Sub64(x->d[2], y->d[2], b);
+    tie(z->d[3], b) = Sub64(x->d[3], y->d[3], b);
+    tie(z->d[4], b) = Sub64(x->d[4], y->d[4], b);
+    tie(z->d[5], _) = Sub64(x->d[5], y->d[5], b);
 }
 #endif
 
 #ifdef __x86_64__
-void __neg(fp* z, const fp* x)
+void __negate(fp* z, const fp* x)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
     // z => %rdi
@@ -697,9 +408,9 @@ void __neg(fp* z, const fp* x)
     asm("mov %rcx,0x20(%rdi);");
     asm("mov %rax,0x28(%rdi);");
 }
-void _neg(fp* z, const fp* x)
+void _negate(fp* z, const fp* x)
 {
-    __neg(z, x);
+    __negate(z, x);
     // put zero check after __neg because gcc messes up %rdi in -O3 (doesn't restore it before inlining asm code)
     if(x->isZero())
     {
@@ -708,7 +419,7 @@ void _neg(fp* z, const fp* x)
     }
 }
 #else
-void _neg(fp* z, const fp* x)
+void _negate(fp* z, const fp* x)
 {
     if(x->isZero())
     {
@@ -726,7 +437,7 @@ void _neg(fp* z, const fp* x)
 #endif
 
 #ifdef __x86_64__
-void __mul(fp* z, const fp* x, const fp* y)
+void __multiply(fp* z, const fp* x, const fp* y)
 {
     // x86_64 calling convention (https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI):
     // z => %rdi (=> stack)
@@ -1788,29 +1499,29 @@ extern "C" blsmul_func_t __attribute__((no_sanitize_address)) resolve_blsmul() {
     while(*my_environ != nullptr) {
        const char disable_str[] = "BLS_DISABLE_BMI2";
         if(strncmp(*my_environ++, disable_str, strlen(disable_str)) == 0)
-           return __mul;
+           return __multiply;
     }
 
     if(cpu_has_bmi2_and_adx())
        return __mul_ex;
-    return __mul;
+    return __multiply;
 }
 
-void _mul(fp*, const fp*, const fp*) __attribute__((ifunc("resolve_blsmul")));
+void _multiply(fp*, const fp*, const fp*) __attribute__((ifunc("resolve_blsmul")));
 #else
-blsmul_func_t _mul = __mul;
+blsmul_func_t _multiply = __multiply;
 
 struct bls_mul_init {
    bls_mul_init() {
       if(cpu_has_bmi2_and_adx())
-         _mul = __mul_ex;
+         _multiply = __mul_ex;
    }
 };
 static bls_mul_init the_bls_mul_init;
 
 #endif //__ELF__
 #else
-void _mul(fp* z, const fp* x, const fp* y)
+void _multiply(fp* z, const fp* x, const fp* y)
 {
     array<uint64_t, 6> t;
     array<uint64_t, 3> c;
@@ -1936,16 +1647,16 @@ void _mul(fp* z, const fp* x, const fp* y)
 void _square(fp* z, const fp* x)
 {
     #ifdef __clang__
-    // The clang compiler completely optimizes out the _square() function and inlines __mul() wherever
+    // The clang compiler completely optimizes out the _square() function and inlines __multiply() wherever
     // it occurs. However, for some reason the compiler forgets that it has to move the third
-    // parameter ('y') of __mul() into %rdx according to the calling convention. The first two
+    // parameter ('y') of __multiply() into %rdx according to the calling convention. The first two
     // parameters, 'z' (%rdi) and 'x' (%rsi), are set properly because they are the exact same as for
     // __square(). But the third parameter (which sould be 'x' as well) is somehow ignored by the
-    // clang compiler. So we need to help out by moving it into %rdx before calling __mul().
+    // clang compiler. So we need to help out by moving it into %rdx before calling __multiply().
     // This is probably a bug in clang!
     asm("mov %rsi,%rdx;");
     #endif
-    __mul(z, x, x);
+    __multiply(z, x, x);
 }
 #else
 void _square(fp* z, const fp* x)
diff --git a/src/fp.cpp b/src/fp.cpp
index 1cf3f52..4379033 100644
--- a/src/fp.cpp
+++ b/src/fp.cpp
@@ -19,6 +19,7 @@ fp::fp(const fp& e) : d{e.d[0], e.d[1], e.d[2], e.d[3], e.d[4], e.d[5]}
 
 optional<fp> fp::fromBytesBE(const span<const uint8_t, 48> in, const bool check, const bool raw)
 {
+    // We decided to always validate the input here. But we reserve the flag.
     fp e = fp(scalar::fromBytesBE<6>(in));
     if(check && !e.isValid()) return {};
     if(raw) return e;
@@ -27,6 +28,7 @@ optional<fp> fp::fromBytesBE(const span<const uint8_t, 48> in, const bool check,
 
 optional<fp> fp::fromBytesLE(const span<const uint8_t, 48> in, const bool check, const bool raw)
 {
+    // We decided to always validate the input here. But we reserve the flag.
     fp e = fp(scalar::fromBytesLE<6>(in));
     if(check && !e.isValid()) return {};
     if(raw) return e;
@@ -92,11 +94,6 @@ bool fp::isOne() const
     return equal(R1);
 }
 
-std::strong_ordering fp::cmp(const fp& e) const
-{
-    return scalar::cmp<6>(d, e.d);
-}
-
 bool fp::equal(const fp& e) const
 {
     return d[0] == e.d[0] && d[1] == e.d[1] && d[2] == e.d[2] && d[3] == e.d[3] && d[4] == e.d[4] && d[5] == e.d[5];
@@ -107,6 +104,73 @@ bool fp::sign() const
     return (fromMont().d[0] & 1) == 0;
 }
 
+fp fp::add(const fp& e) const
+{
+    fp c;
+    _add(&c, this, &e);
+    return c;
+}
+
+void fp::addAssign(const fp& e)
+{
+    _add(this, this, &e);
+}
+
+fp fp::dbl() const
+{
+    fp c;
+    _double(&c, this);
+    return c;
+}
+
+void fp::doubleAssign()
+{
+    _double(this, this);
+}
+
+fp fp::subtract(const fp& e) const
+{
+    fp c;
+    _subtract(&c, this, &e);
+    return c;
+}
+
+void fp::subtractAssign(const fp& e)
+{
+    _subtract(this, this, &e);
+}
+
+fp fp::negate() const
+{
+    fp c;
+    _negate(&c, this);
+    return c;
+}
+
+fp fp::multiply(const fp& e) const
+{
+    fp c;
+    _multiply(&c, this, &e);
+    return c;
+}
+
+void fp::multiplyAssign(const fp& e)
+{
+    _multiply(this, this, &e);
+}
+
+fp fp::square() const
+{
+    fp c;
+    _square(&c, this);
+    return c;
+}
+
+void fp::squareAssign()
+{
+    _square(this, this);
+}
+
 void fp::div2(const uint64_t& e)
 {
     d[0] = d[0]>>1 | d[1]<<63;
@@ -132,24 +196,28 @@ uint64_t fp::mul2()
 fp fp::toMont() const
 {
     fp c;
-    _mul(&c, this, &R2);
+    _multiply(&c, this, &R2);
     return c;
 }
 
 fp fp::fromMont() const
 {
     fp c, b = fp({1, 0, 0, 0, 0, 0});
-    _mul(&c, this, &b);
+    _multiply(&c, this, &b);
     return c;
 }
 
 fp fp::phi() const
 {
     fp c;
-    _mul(&c, this, &glvPhi1);
+    _multiply(&c, this, &glvPhi1);
     return c;
 }
 
+// Origin algorithm comes from:
+// [1] B.S.Kaliski Jr. The Montgomery inverse and its applications. IEEE Transactions on Computers, 44(8):1064–1065, August 1995.
+// Modified according to:
+// [2] Savas, Erkay, and Cetin Kaya Koç. "The Montgomery modular inverse-revisited." IEEE transactions on computers 49, no. 7 (2000): 763-766.
 fp fp::inverse() const
 {
     if(isZero())
@@ -161,9 +229,16 @@ fp fp::inverse() const
     fp s({1, 0, 0, 0, 0, 0});
     fp r({0, 0, 0, 0, 0, 0});
     int64_t k = 0;
-    uint64_t z = 0;
     bool found = false;
+
     // Phase 1
+    // Input: a*2^384 mod p < p, p.
+    // Output: a^{-1} * 2^{k-384} mod p, where 381 <= k <= 2 * 381.
+    // 1. Input is in Montgomery form, so the input is a*2^384 mod p.
+    // 2. All Phase 1 operations will treat fp as integer in Z instead of elements in GF(p).
+    // Therefore, NO modular reduction shall happen in Phase 1.
+    // 3. As proved in [1], s,r,u,v < 2p. 
+    // Therefore, there shall be NO overflow in the process as p has 381 bits and we can hold 384 bits in fp.
     for(uint64_t i = 0; i < 768; i++)
     {
         if(v.isZero())
@@ -179,21 +254,21 @@ fp fp::inverse() const
         else if(v.isEven())
         {
             v.div2(0);
-            z += r.mul2();
+            r.mul2();
         }
         else if(u.cmp(v) > 0)
         {
-            _lsubAssign(&u, &v);
+            _lsubtract(&u, &u, &v);
             u.div2(0);
-            _laddAssign(&r, &s);
+            _ladd(&r, &r, &s);
             s.mul2();
         }
         else
         {
-            _lsubAssign(&v, &u);
+            _lsubtract(&v, &v, &u);
             v.div2(0);
-            _laddAssign(&s, &r);
-            z += r.mul2();
+            _ladd(&s, &s, &r);
+            r.mul2();
         }
         k += 1;
     }
@@ -203,23 +278,47 @@ fp fp::inverse() const
         return zero();
     }
 
-    if(k < 381 || k > 381+384)
+    if(r.cmp(MODULUS) >= 0)
     {
-        return zero();
-    }
-
-    if(r.cmp(MODULUS) >= 0 || z > 0)
-    {
-        _lsubAssign(&r, &MODULUS);
+        _lsubtract(&r, &r, &MODULUS);
     }
     u = MODULUS;
-    _lsubAssign(&u, &r);
+    _lsubtract(&u, &u, &r);
 
     // Phase 2
-    for(uint64_t i = k; i < 384*2; i++)
+    // Input: u = a^{-1} * 2^{k-384} mod p, k where 381 <= k <= 2*381
+    // Output: a^{-1} * 2^384 mod p
+
+    // Process for 381 <= k <= 384
+    // Generate new u, k pair that
+    // k' = k + 384
+    // u' = u * 2^{2*384} * 2^{-384} mod p
+    // u' = a^{-1} * 2^{k'-384} mod p will still hold.
+    // Will hit this case if and only if input is fp::one() for the current p.
+    if (k <= 384) {
+        u.multiplyAssign(fp::R2);
+        k += 384;
+    }
+
+    // u = u * 2^{2*384} * 2^{-384} mod p = a^{-1} * 2^k mod p
+    u.multiplyAssign(fp::R2);
+
+    // This case should not happen mathmatically.
+    // Leave it here as sanity check to guard against access vialotion later.
+    if(k > 2 * 384)
     {
-        _double(&u, &u);
+        return zero();
     }
+
+    // 384 < k <= 2 * 384
+    // 0 <= power < 384
+    uint64_t power = 2*384 - k;
+    fp fpPower;
+    fpPower.d[power/64] = ((uint64_t)1) << (power%64);
+   
+    // result = u * 2^(2*384-k) * 2^{-384} = a^{-1} * 2^k * 2^{384-k} mod p = a^{-1} * 2^384 mod p
+    u.multiplyAssign(fpPower);
+
     return u;
 }
 
@@ -370,16 +469,16 @@ optional<fp2> fp2::fromBytesBE(const span<const uint8_t, 96> in, const bool chec
 {
     optional<fp> c1 = fp::fromBytesBE(span<const uint8_t, 48>(&in[ 0], &in[48]), check, raw);
     optional<fp> c0 = fp::fromBytesBE(span<const uint8_t, 48>(&in[48], &in[96]), check, raw);
-    if(!c1.has_value() || !c0.has_value()) return {};
-    return fp2({c0.value(), c1.value()});
+    if(!c1 || !c0) return {};
+    return fp2({*c0, *c1});
 }
 
 optional<fp2> fp2::fromBytesLE(const span<const uint8_t, 96> in, const bool check, const bool raw)
 {
     optional<fp> c0 = fp::fromBytesLE(span<const uint8_t, 48>(&in[ 0], &in[48]), check, raw);
     optional<fp> c1 = fp::fromBytesLE(span<const uint8_t, 48>(&in[48], &in[96]), check, raw);
-    if(!c1.has_value() || !c0.has_value()) return {};
-    return fp2({c0.value(), c1.value()});
+    if(!c1 || !c0) return {};
+    return fp2({*c0, *c1});
 }
 
 void fp2::toBytesBE(const span<uint8_t, 96> out, const bool raw) const
@@ -445,115 +544,83 @@ bool fp2::sign() const
 
 fp2 fp2::add(const fp2& e) const
 {
-    fp2 c;
-    _add(&c.c0, &c0, &e.c0);
-    _add(&c.c1, &c1, &e.c1);
+    fp2 c(*this);
+    c.addAssign(e);
     return c;
 }
 
 void fp2::addAssign(const fp2& e)
 {
-    _addAssign(&c0, &e.c0);
-    _addAssign(&c1, &e.c1);
-}
-
-fp2 fp2::ladd(const fp2& e) const
-{
-    fp2 c;
-    _ladd(&c.c0, &c0, &e.c0);
-    _ladd(&c.c1, &c1, &e.c1);
-    return c;
+    _add(&c0, &c0, &e.c0);
+    _add(&c1, &c1, &e.c1);
 }
 
 fp2 fp2::dbl() const
 {
-    fp2 c;
-    _double(&c.c0, &c0);
-    _double(&c.c1, &c1);
+    fp2 c(*this);
+    c.doubleAssign();
     return c;
 }
 
 void fp2::doubleAssign()
 {
-    _doubleAssign(&c0);
-    _doubleAssign(&c1);
+    _double(&c0, &c0);
+    _double(&c1, &c1);
 }
 
-fp2 fp2::ldouble() const
+fp2 fp2::subtract(const fp2& e) const
 {
-    fp2 c;
-    _ldouble(&c.c0, &c0);
-    _ldouble(&c.c1, &c1);
+    fp2 c(*this);
+    c.subtractAssign(e);
     return c;
 }
 
-fp2 fp2::sub(const fp2& e) const
+void fp2::subtractAssign(const fp2& e)
 {
-    fp2 c;
-    _sub(&c.c0, &c0, &e.c0);
-    _sub(&c.c1, &c1, &e.c1);
-    return c;
-}
-
-void fp2::subAssign(const fp2& e)
-{
-    _subAssign(&c0, &e.c0);
-    _subAssign(&c1, &e.c1);
+    _subtract(&c0, &c0, &e.c0);
+    _subtract(&c1, &c1, &e.c1);
 }
 
-fp2 fp2::neg() const
+fp2 fp2::negate() const
 {
     fp2 c;
-    _neg(&c.c0, &c0);
-    _neg(&c.c1, &c1);
+    _negate(&c.c0, &c0);
+    _negate(&c.c1, &c1);
     return c;
 }
 
-fp2 fp2::conj() const
+fp2 fp2::conjugate() const
 {
     fp2 c;
     c.c0 = c0;
-    _neg(&c.c1, &c1);
+    _negate(&c.c1, &c1);
     return c;
 }
 
-fp2 fp2::mul(const fp2& e) const
+fp2 fp2::multiply(const fp2& e) const
 {
-    fp t[4];
-    fp2 c;
-    _mul(&t[1], &c0, &e.c0);
-    _mul(&t[2], &c1, &e.c1);
-    _add(&t[0], &c0, &c1);
-    _add(&t[3], &e.c0, &e.c1);
-    _sub(&c.c0, &t[1], &t[2]);
-    _addAssign(&t[1], &t[2]);
-    _mul(&t[0], &t[0], &t[3]);
-    _sub(&c.c1, &t[0], &t[1]);
+    fp2 c(*this);
+    c.multiplyAssign(e);
     return c;
 }
 
-void fp2::mulAssign(const fp2& e)
+void fp2::multiplyAssign(const fp2& e)
 {
     fp t[4];
-    _mul(&t[1], &c0, &e.c0);
-    _mul(&t[2], &c1, &e.c1);
+    _multiply(&t[1], &c0, &e.c0);
+    _multiply(&t[2], &c1, &e.c1);
     _add(&t[0], &c0, &c1);
     _add(&t[3], &e.c0, &e.c1);
-    _sub(&c0, &t[1], &t[2]);
-    _addAssign(&t[1], &t[2]);
-    _mul(&t[0], &t[0], &t[3]);
-    _sub(&c1, &t[0], &t[1]);
+    _subtract(&c0, &t[1], &t[2]);
+    _add(&t[1], &t[1], &t[2]);
+    _multiply(&t[0], &t[0], &t[3]);
+    _subtract(&c1, &t[0], &t[1]);
 }
 
 fp2 fp2::square() const
 {
-    fp t[3];
-    fp2 c;
-    _ladd(&t[0], &c0, &c1);
-    _sub(&t[1], &c0, &c1);
-    _ldouble(&t[2], &c0);
-    _mul(&c.c0, &t[0], &t[1]);
-    _mul(&c.c1, &t[2], &c1);
+    fp2 c(*this);
+    c.squareAssign();
     return c;
 }
 
@@ -561,17 +628,17 @@ void fp2::squareAssign()
 {
     fp t[3];
     _ladd(&t[0], &c0, &c1);
-    _sub(&t[1], &c0, &c1);
+    _subtract(&t[1], &c0, &c1);
     _ldouble(&t[2], &c0);
-    _mul(&c0, &t[0], &t[1]);
-    _mul(&c1, &t[2], &c1);
+    _multiply(&c0, &t[0], &t[1]);
+    _multiply(&c1, &t[2], &c1);
 }
 
 fp2 fp2::mulByNonResidue() const
 {
     fp t;
     fp2 c;
-    _sub(&t, &c0, &c1);
+    _subtract(&t, &c0, &c1);
     _add(&c.c1, &c0, &c1);
     c.c0 = t;
     return c;
@@ -583,9 +650,9 @@ fp2 fp2::mulByB() const
     fp2 c;
     _double(&t[0], &c0);
     _double(&t[1], &c1);
-    _doubleAssign(&t[0]);
-    _doubleAssign(&t[1]);
-    _sub(&c.c0, &t[0], &t[1]);
+    _double(&t[0], &t[0]);
+    _double(&t[1], &t[1]);
+    _subtract(&c.c0, &t[0], &t[1]);
     _add(&c.c1, &t[0], &t[1]);
     return c;
 }
@@ -596,19 +663,19 @@ fp2 fp2::inverse() const
     fp2 c;
     _square(&t[0], &c0);
     _square(&t[1], &c1);
-    _addAssign(&t[0], &t[1]);
+    _add(&t[0], &t[0], &t[1]);
     t[0] = t[0].inverse();
-    _mul(&c.c0, &c0, &t[0]);
-    _mul(&t[0], &t[0], &c1);
-    _neg(&c.c1, &t[0]);
+    _multiply(&c.c0, &c0, &t[0]);
+    _multiply(&t[0], &t[0], &c1);
+    _negate(&c.c1, &t[0]);
     return c;
 }
 
 fp2 fp2::mulByFq(const fp& e) const
 {
     fp2 c;
-    _mul(&c.c0, &c0, &e);
-    _mul(&c.c1, &c1, &e);
+    _multiply(&c.c0, &c0, &e);
+    _multiply(&c.c1, &c1, &e);
     return c;
 }
 
@@ -618,7 +685,7 @@ fp2 fp2::frobeniusMap(const uint64_t& power) const
     c.c0 = c0;
     if(power%2 == 1)
     {
-        _neg(&c.c1, &c1);
+        _negate(&c.c1, &c1);
         return c;
     }
     c.c1 = c1;
@@ -629,7 +696,7 @@ void fp2::frobeniusMapAssign(const uint64_t& power)
 {
     if(power%2 == 1)
     {
-        _neg(&c1, &c1);
+        _negate(&c1, &c1);
     }
 }
 
@@ -639,17 +706,17 @@ bool fp2::sqrt(fp2& c) const
     u = *this;
     a1 = this->exp(fp::pMinus3Over4);
     alpha = a1.square();
-    alpha = alpha.mul(*this);
-    x0 = a1.mul(*this);
+    alpha = alpha.multiply(*this);
+    x0 = a1.multiply(*this);
     if(alpha.equal(fp2::negativeOne2))
     {
-        _neg(&c.c0, &x0.c1);
+        _negate(&c.c0, &x0.c1);
         c.c1 = x0.c0;
         return true;
     }
     alpha = alpha.add(fp2::one());
     alpha = alpha.exp(fp::pMinus1Over2);
-    c = alpha.mul(x0);
+    c = alpha.multiply(x0);
     alpha = c.square();
     return alpha.equal(u);
 }
@@ -763,8 +830,8 @@ optional<fp6> fp6::fromBytesBE(const span<const uint8_t, 288> in, const bool che
     optional<fp2> c2 = fp2::fromBytesBE(span<const uint8_t, 96>(&in[  0], &in[ 96]), check, raw);
     optional<fp2> c1 = fp2::fromBytesBE(span<const uint8_t, 96>(&in[ 96], &in[192]), check, raw);
     optional<fp2> c0 = fp2::fromBytesBE(span<const uint8_t, 96>(&in[192], &in[288]), check, raw);
-    if(!c2.has_value() || !c1.has_value() || !c0.has_value()) return {};
-    return fp6({c0.value(), c1.value(), c2.value()});
+    if(!c2 || !c1 || !c0) return {};
+    return fp6({*c0, *c1, *c2});
 }
 
 optional<fp6> fp6::fromBytesLE(const span<const uint8_t, 288> in, const bool check, const bool raw)
@@ -772,8 +839,8 @@ optional<fp6> fp6::fromBytesLE(const span<const uint8_t, 288> in, const bool che
     optional<fp2> c0 = fp2::fromBytesLE(span<const uint8_t, 96>(&in[  0], &in[ 96]), check, raw);
     optional<fp2> c1 = fp2::fromBytesLE(span<const uint8_t, 96>(&in[ 96], &in[192]), check, raw);
     optional<fp2> c2 = fp2::fromBytesLE(span<const uint8_t, 96>(&in[192], &in[288]), check, raw);
-    if(!c2.has_value() || !c1.has_value() || !c0.has_value()) return {};
-    return fp6({c0.value(), c1.value(), c2.value()});
+    if(!c2 || !c1 || !c0) return {};
+    return fp6({*c0, *c1, *c2});
 }
 
 void fp6::toBytesBE(const span<uint8_t, 288> out, const bool raw) const
@@ -831,10 +898,8 @@ bool fp6::equal(const fp6& e) const
 
 fp6 fp6::add(const fp6& e) const
 {
-    fp6 c;
-    c.c0 = c0.add(e.c0);
-    c.c1 = c1.add(e.c1);
-    c.c2 = c2.add(e.c2);
+    fp6 c(*this);
+    c.addAssign(e);
     return c;
 }
 
@@ -847,10 +912,8 @@ void fp6::addAssign(const fp6& e)
 
 fp6 fp6::dbl() const
 {
-    fp6 c;
-    c.c0 = c0.dbl();
-    c.c1 = c1.dbl();
-    c.c2 = c2.dbl();
+    fp6 c(*this);
+    c.doubleAssign();
     return c;
 }
 
@@ -861,133 +924,113 @@ void fp6::doubleAssign()
     c2.doubleAssign();
 }
 
-fp6 fp6::sub(const fp6& e) const
+fp6 fp6::subtract(const fp6& e) const
 {
-    fp6 c;
-    c.c0 = c0.sub(e.c0);
-    c.c1 = c1.sub(e.c1);
-    c.c2 = c2.sub(e.c2);
+    fp6 c(*this);
+    c.subtractAssign(e);
     return c;
 }
 
-void fp6::subAssign(const fp6& e)
+void fp6::subtractAssign(const fp6& e)
 {
-    c0.subAssign(e.c0);
-    c1.subAssign(e.c1);
-    c2.subAssign(e.c2);
+    c0.subtractAssign(e.c0);
+    c1.subtractAssign(e.c1);
+    c2.subtractAssign(e.c2);
 }
 
-fp6 fp6::neg() const
+fp6 fp6::negate() const
 {
     fp6 c;
-    c.c0 = c0.neg();
-    c.c1 = c1.neg();
-    c.c2 = c2.neg();
+    c.c0 = c0.negate();
+    c.c1 = c1.negate();
+    c.c2 = c2.negate();
     return c;
 }
 
-fp6 fp6::mul(const fp6& e) const
+fp6 fp6::multiply(const fp6& e) const
 {
-    fp2 t[6];
-    fp6 c;
-    t[0] = c0.mul(e.c0);
-    t[1] = c1.mul(e.c1);
-    t[2] = c2.mul(e.c2);
-    t[3] = c1.add(c2);
-    t[4] = e.c1.add(e.c2);
-    t[3].mulAssign(t[4]);
-    t[4] = t[1].add(t[2]);
-    t[3].subAssign(t[4]);
-    t[3] = t[3].mulByNonResidue();
-    t[5] = t[0].add(t[3]);
-    t[3] = c0.add(c1);
-    t[4] = e.c0.add(e.c1);
-    t[3].mulAssign(t[4]);
-    t[4] = t[0].add(t[1]);
-    t[3].subAssign(t[4]);
-    t[4] = t[2].mulByNonResidue();
-    c.c1 = t[3].add(t[4]);
-    t[3] = c0.add(c2);
-    t[4] = e.c0.add(e.c2);
-    t[3].mulAssign(t[4]);
-    t[4] = t[0].add(t[2]);
-    t[3].subAssign(t[4]);
-    c.c2 = t[1].add(t[3]);
-    c.c0 = t[5];
+    fp6 c(*this);
+    c.multiplyAssign(e);
     return c;
 }
 
-void fp6::mulAssign(const fp6& e)
+void fp6::multiplyAssign(const fp6& e)
 {
     fp2 t[6];
-    t[0] = c0.mul(e.c0);
-    t[1] = c1.mul(e.c1);
-    t[2] = c2.mul(e.c2);
+    t[0] = c0.multiply(e.c0);
+    t[1] = c1.multiply(e.c1);
+    t[2] = c2.multiply(e.c2);
     t[3] = c1.add(c2);
     t[4] = e.c1.add(e.c2);
-    t[3].mulAssign(t[4]);
+    t[3].multiplyAssign(t[4]);
     t[4] = t[1].add(t[2]);
-    t[3].subAssign(t[4]);
+    t[3].subtractAssign(t[4]);
     t[3] = t[3].mulByNonResidue();
     t[5] = t[0].add(t[3]);
     t[3] = c0.add(c1);
     t[4] = e.c0.add(e.c1);
-    t[3].mulAssign(t[4]);
+    t[3].multiplyAssign(t[4]);
     t[4] = t[0].add(t[1]);
-    t[3].subAssign(t[4]);
+    t[3].subtractAssign(t[4]);
     t[4] = t[2].mulByNonResidue();
     c1 = t[3].add(t[4]);
     t[3] = c0.add(c2);
     t[4] = e.c0.add(e.c2);
-    t[3].mulAssign(t[4]);
+    t[3].multiplyAssign(t[4]);
     t[4] = t[0].add(t[2]);
-    t[3].subAssign(t[4]);
+    t[3].subtractAssign(t[4]);
     c2 = t[1].add(t[3]);
     c0 = t[5];
 }
 
 fp6 fp6::square() const
+{
+    fp6 c(*this);
+    c.squareAssign();
+    return c;
+}
+
+void fp6::squareAssign()
 {
     fp2 t[6];
     fp6 c;
     t[0] = c0.square();
-    t[1] = c0.mul(c1);
+    t[1] = c0.multiply(c1);
     t[1].doubleAssign();
-    t[2] = c0.sub(c1);
+    t[2] = c0.subtract(c1);
     t[2].addAssign(c2);
     t[2].squareAssign();
-    t[3] = c1.mul(c2);
+    t[3] = c1.multiply(c2);
     t[3].doubleAssign();
     t[4] = c2.square();
     t[5] = t[3].mulByNonResidue();
-    c.c0 = t[0].add(t[5]);
+    c0 = t[0].add(t[5]);
     t[5] = t[4].mulByNonResidue();
-    c.c1 = t[1].add(t[5]);
+    c1 = t[1].add(t[5]);
     t[1].addAssign(t[2]);
     t[1].addAssign(t[3]);
     t[0].addAssign(t[4]);
-    c.c2 = t[1].sub(t[0]);
-    return c;
+    c2 = t[1].subtract(t[0]);
 }
 
 void fp6::mulBy01Assign(const fp2& e0, const fp2& e1)
 {
     fp2 t[6];
-    t[0] = c0.mul(e0);
-    t[1] = c1.mul(e1);
+    t[0] = c0.multiply(e0);
+    t[1] = c1.multiply(e1);
     t[5] = c1.add(c2);
-    t[2] = e1.mul(t[5]);
-    t[2].subAssign(t[1]);
+    t[2] = e1.multiply(t[5]);
+    t[2].subtractAssign(t[1]);
     t[2] = t[2].mulByNonResidue();
     t[5] = c0.add(c2);
-    t[3] = e0.mul(t[5]);
-    t[3].subAssign(t[0]);
+    t[3] = e0.multiply(t[5]);
+    t[3].subtractAssign(t[0]);
     c2 = t[3].add(t[1]);
     t[4] = e0.add(e1);
     t[5] = c0.add(c1);
-    t[4].mulAssign(t[5]);
-    t[4].subAssign(t[0]);
-    c1 = t[4].sub(t[1]);
+    t[4].multiplyAssign(t[5]);
+    t[4].subtractAssign(t[0]);
+    c1 = t[4].subtract(t[1]);
     c0 = t[2].add(t[0]);
 }
 
@@ -995,21 +1038,21 @@ fp6 fp6::mulBy01(const fp2& e0, const fp2& e1) const
 {
     fp2 t[5];
     fp6 c;
-    t[0] = c0.mul(e0);
-    t[1] = c1.mul(e1);
+    t[0] = c0.multiply(e0);
+    t[1] = c1.multiply(e1);
     t[2] = c1.add(c2);
-    t[2].mulAssign(e1);
-    t[2].subAssign(t[1]);
+    t[2].multiplyAssign(e1);
+    t[2].subtractAssign(t[1]);
     t[2] = t[2].mulByNonResidue();
     t[3] = c0.add(c2);
-    t[3].mulAssign(e0);
-    t[3].subAssign(t[0]);
+    t[3].multiplyAssign(e0);
+    t[3].subtractAssign(t[0]);
     c.c2 = t[3].add(t[1]);
     t[4] = e0.add(e1);
     t[3] = c0.add(c1);
-    t[4].mulAssign(t[3]);
-    t[4].subAssign(t[0]);
-    c.c1 = t[4].sub(t[1]);
+    t[4].multiplyAssign(t[3]);
+    t[4].subtractAssign(t[0]);
+    c.c1 = t[4].subtract(t[1]);
     c.c0 = t[2].add(t[0]);
     return c;
 }
@@ -1018,9 +1061,9 @@ fp6 fp6::mulBy1(const fp2& e1) const
 {
     fp2 t;
     fp6 c;
-    t = c2.mul(e1);
-    c.c2 = c1.mul(e1);
-    c.c1 = c0.mul(e1);
+    t = c2.multiply(e1);
+    c.c2 = c1.multiply(e1);
+    c.c1 = c0.multiply(e1);
     c.c0 = t.mulByNonResidue();
     return c;
 }
@@ -1037,9 +1080,9 @@ fp6 fp6::mulByNonResidue() const
 fp6 fp6::mulByBaseField(const fp2& e) const
 {
     fp6 c;
-    c.c0 = c0.mul(e);
-    c.c1 = c1.mul(e);
-    c.c2 = c2.mul(e);
+    c.c0 = c0.multiply(e);
+    c.c1 = c1.multiply(e);
+    c.c2 = c2.multiply(e);
     return c;
 }
 
@@ -1048,26 +1091,26 @@ fp6 fp6::inverse() const
     fp2 t[5];
     fp6 c;
     t[0] = c0.square();
-    t[1] = c1.mul(c2);
+    t[1] = c1.multiply(c2);
     t[1] = t[1].mulByNonResidue();
-    t[0].subAssign(t[1]);
+    t[0].subtractAssign(t[1]);
     t[1] = c1.square();
-    t[2] = c0.mul(c2);
-    t[1].subAssign(t[2]);
+    t[2] = c0.multiply(c2);
+    t[1].subtractAssign(t[2]);
     t[2] = c2.square();
     t[2] = t[2].mulByNonResidue();
-    t[3] = c0.mul(c1);
-    t[2].subAssign(t[3]);
-    t[3] = c2.mul(t[2]);
-    t[4] = c1.mul(t[1]);
+    t[3] = c0.multiply(c1);
+    t[2].subtractAssign(t[3]);
+    t[3] = c2.multiply(t[2]);
+    t[4] = c1.multiply(t[1]);
     t[3].addAssign(t[4]);
     t[3] = t[3].mulByNonResidue();
-    t[4] = c0.mul(t[0]);
+    t[4] = c0.multiply(t[0]);
     t[3].addAssign(t[4]);
     t[3] = t[3].inverse();
-    c.c0 = t[0].mul(t[3]);
-    c.c1 = t[2].mul(t[3]);
-    c.c2 = t[1].mul(t[3]);
+    c.c0 = t[0].multiply(t[3]);
+    c.c1 = t[2].multiply(t[3]);
+    c.c2 = t[1].multiply(t[3]);
     return c;
 }
 
@@ -1085,15 +1128,15 @@ fp6 fp6::frobeniusMap(const uint64_t& power) const
         }
         case 3:
         {
-            _neg(&c.c0.c0, &c1.c1);
+            _negate(&c.c0.c0, &c1.c1);
             c.c1.c1 = c1.c0;
-            c.c2 = c2.neg();
+            c.c2 = c2.negate();
             break;
         }
         default:
         {
-            c.c1 = c.c1.mul(frobeniusCoeffs61[power%6]);
-            c.c2 = c.c2.mul(frobeniusCoeffs62[power%6]);
+            c.c1 = c.c1.multiply(frobeniusCoeffs61[power%6]);
+            c.c2 = c.c2.multiply(frobeniusCoeffs62[power%6]);
             break;
         }
     }
@@ -1114,16 +1157,16 @@ void fp6::frobeniusMapAssign(const uint64_t& power)
         }
         case 3:
         {
-            _neg(&t.c0, &c1.c1);
+            _negate(&t.c0, &c1.c1);
             c1.c1 = c1.c0;
             c1.c0 = t.c0;
-            c2 = c2.neg();
+            c2 = c2.negate();
             break;
         }
         default:
         {
-            c1.mulAssign(frobeniusCoeffs61[power%6]);
-            c2.mulAssign(frobeniusCoeffs62[power%6]);
+            c1.multiplyAssign(frobeniusCoeffs61[power%6]);
+            c2.multiplyAssign(frobeniusCoeffs62[power%6]);
             break;
         }
     }
@@ -1199,16 +1242,16 @@ optional<fp12> fp12::fromBytesBE(const span<const uint8_t, 576> in, const bool c
 {
     optional<fp6> c1 = fp6::fromBytesBE(span<const uint8_t, 288>(&in[  0], &in[288]), check, raw);
     optional<fp6> c0 = fp6::fromBytesBE(span<const uint8_t, 288>(&in[288], &in[576]), check, raw);
-    if(!c1.has_value() || !c0.has_value()) return {};
-    return fp12({c0.value(), c1.value()});
+    if(!c1 || !c0) return {};
+    return fp12({*c0, *c1});
 }
 
 optional<fp12> fp12::fromBytesLE(const span<const uint8_t, 576> in, const bool check, const bool raw)
 {
     optional<fp6> c0 = fp6::fromBytesLE(span<const uint8_t, 288>(&in[  0], &in[288]), check, raw);
     optional<fp6> c1 = fp6::fromBytesLE(span<const uint8_t, 288>(&in[288], &in[576]), check, raw);
-    if(!c1.has_value() || !c0.has_value()) return {};
-    return fp12({c0.value(), c1.value()});
+    if(!c1 || !c0) return {};
+    return fp12({*c0, *c1});
 }
 
 void fp12::toBytesBE(const span<uint8_t, 576> out, const bool raw) const
@@ -1270,33 +1313,45 @@ bool fp12::equal(const fp12& e) const
 
 fp12 fp12::add(const fp12& e) const
 {
-    fp12 c;
-    c.c0 = c0.add(e.c0);
-    c.c1 = c1.add(e.c1);
+    fp12 c(*this);
+    c.addAssign(e);
     return c;
 }
 
+void fp12::addAssign(const fp12& e) {
+    c0.addAssign(e.c0);
+    c1.addAssign(e.c1);
+}
+
 fp12 fp12::dbl() const
 {
-    fp12 c;
-    c.c0 = c0.dbl();
-    c.c1 = c1.dbl();
+    fp12 c(*this);
+    c.doubleAssign();
     return c;
 }
 
-fp12 fp12::sub(const fp12& e) const
+void fp12::doubleAssign() {
+    c0.doubleAssign();
+    c1.doubleAssign();
+}
+
+fp12 fp12::subtract(const fp12& e) const
 {
-    fp12 c;
-    c.c0 = c0.sub(e.c0);
-    c.c1 = c1.sub(e.c1);
+    fp12 c(*this);
+    c.subtractAssign(e);
     return c;
 }
 
-fp12 fp12::neg() const
+void fp12::subtractAssign(const fp12& e) {
+    c0.subtractAssign(e.c0);
+    c1.subtractAssign(e.c1);
+}
+
+fp12 fp12::negate() const
 {
     fp12 c;
-    c.c0 = c0.neg();
-    c.c1 = c1.neg();
+    c.c0 = c0.negate();
+    c.c1 = c1.negate();
     return c;
 }
 
@@ -1304,85 +1359,84 @@ fp12 fp12::conjugate() const
 {
     fp12 c;
     c.c0 = c0;
-    c.c1 = c1.neg();
+    c.c1 = c1.negate();
     return c;
 }
 
 fp12 fp12::square() const
 {
+    fp12 c(*this);
+    c.squareAssign();
+    return c;
+}
+
+void fp12::squareAssign() {
     fp6 t[4];
-    fp12 c;
+
     t[0] = c0.add(c1);
-    t[2] = c0.mul(c1);
+    t[2] = c0.multiply(c1);
     t[1] = c1.mulByNonResidue();
     t[1].addAssign(c0);
     t[3] = t[2].mulByNonResidue();
-    t[0].mulAssign(t[1]);
-    t[0].subAssign(t[2]);
-    c.c0 = t[0].sub(t[3]);
-    c.c1 = t[2].dbl();
-    return c;
+    t[0].multiplyAssign(t[1]);
+    t[0].subtractAssign(t[2]);
+    c0 = t[0].subtract(t[3]);
+    c1 = t[2].dbl();
 }
 
 fp12 fp12::cyclotomicSquare() const
 {
+    fp12 c(*this);
+    c.cyclotomicSquareAssign();
+    return c;
+}
+
+void fp12::cyclotomicSquareAssign() {
     fp2 t[7];
-    fp12 c;
     tie(t[3], t[4]) = fp4Square(c0.c0, c1.c1);
-    t[2] = t[3].sub(c0.c0);
+    t[2] = t[3].subtract(c0.c0);
     t[2].doubleAssign();
-    c.c0.c0 = t[2].add(t[3]);
+    c0.c0 = t[2].add(t[3]);
     t[2] = t[4].add(c1.c1);
     t[2].doubleAssign();
-    c.c1.c1 = t[2].add(t[4]);
+    c1.c1 = t[2].add(t[4]);
     tie(t[3], t[4]) = fp4Square(c1.c0, c0.c2);
     tie(t[5], t[6]) = fp4Square(c0.c1, c1.c2);
-    t[2] = t[3].sub(c0.c1);
+    t[2] = t[3].subtract(c0.c1);
     t[2].doubleAssign();
-    c.c0.c1 = t[2].add(t[3]);
+    c0.c1 = t[2].add(t[3]);
     t[2] = t[4].add(c1.c2);
     t[2].doubleAssign();
-    c.c1.c2 = t[2].add(t[4]);
+    c1.c2 = t[2].add(t[4]);
     t[3] = t[6].mulByNonResidue();
     t[2] = t[3].add(c1.c0);
     t[2].doubleAssign();
-    c.c1.c0 = t[2].add(t[3]);
-    t[2] = t[5].sub(c0.c2);
+    c1.c0 = t[2].add(t[3]);
+    t[2] = t[5].subtract(c0.c2);
     t[2].doubleAssign();
-    c.c0.c2 = t[2].add(t[5]);
-    return c;
+    c0.c2 = t[2].add(t[5]);
 }
 
-fp12 fp12::mul(const fp12& e) const
+fp12 fp12::multiply(const fp12& e) const
 {
-    fp6 t[4];
-    fp12 c;
-    t[1] = c0.mul(e.c0);
-    t[2] = c1.mul(e.c1);
-    t[0] = t[1].add(t[2]);
-    t[2] = t[2].mulByNonResidue();
-    t[3] = t[1].add(t[2]);
-    t[1] = c0.add(c1);
-    t[2] = e.c0.add(e.c1);
-    t[1].mulAssign(t[2]);
-    c.c0 = t[3];
-    c.c1 = t[1].sub(t[0]);
+    fp12 c(*this);
+    c.multiplyAssign(e);
     return c;
 }
 
-void fp12::mulAssign(const fp12& e)
+void fp12::multiplyAssign(const fp12& e)
 {
     fp6 t[4];
-    t[1] = c0.mul(e.c0);
-    t[2] = c1.mul(e.c1);
+    t[1] = c0.multiply(e.c0);
+    t[2] = c1.multiply(e.c1);
     t[0] = t[1].add(t[2]);
     t[2] = t[2].mulByNonResidue();
     t[3] = t[1].add(t[2]);
     t[1] = c0.add(c1);
     t[2] = e.c0.add(e.c1);
-    t[1].mulAssign(t[2]);
+    t[1].multiplyAssign(t[2]);
     c0 = t[3];
-    c1 = t[1].sub(t[0]);
+    c1 = t[1].subtract(t[0]);
 }
 
 tuple<fp2, fp2> fp12::fp4Square(const fp2& e0, const fp2& e1)
@@ -1395,8 +1449,8 @@ tuple<fp2, fp2> fp12::fp4Square(const fp2& e0, const fp2& e1)
     c0 = t[2].add(t[0]);
     t[2] = e0.add(e1);
     t[2].squareAssign();
-    t[2].subAssign(t[0]);
-    c1 = t[2].sub(t[1]);
+    t[2].subtractAssign(t[0]);
+    c1 = t[2].subtract(t[1]);
     return {c0, c1};
 }
 
@@ -1407,11 +1461,11 @@ fp12 fp12::inverse() const
     t[0] = c0.square();
     t[1] = c1.square();
     t[1] = t[1].mulByNonResidue();
-    t[1] = t[0].sub(t[1]);
+    t[1] = t[0].subtract(t[1]);
     t[0] = t[1].inverse();
-    c.c0 = c0.mul(t[0]);
-    t[0].mulAssign(c1);
-    c.c1 = t[0].neg();
+    c.c0 = c0.multiply(t[0]);
+    t[0].multiplyAssign(c1);
+    c.c1 = t[0].negate();
     return c;
 }
 
@@ -1424,8 +1478,8 @@ void fp12::mulBy014Assign(const fp2& e0, const fp2& e1, const fp2& e4)
     t2 = e1.add(e4);
     t[2] = c1.add(c0);
     t[2].mulBy01Assign(e0, t2);
-    t[2].subAssign(t[0]);
-    c1 = t[2].sub(t[1]);
+    t[2].subtractAssign(t[0]);
+    c1 = t[2].subtract(t[1]);
     t[1] = t[1].mulByNonResidue();
     c0 = t[1].add(t[0]);
 }
@@ -1443,7 +1497,7 @@ fp12 fp12::frobeniusMap(const uint64_t& power) const
         }
         case 6:
         {
-            c.c1 = c.c1.neg();
+            c.c1 = c.c1.negate();
             break;
         }
         default:
@@ -1467,7 +1521,7 @@ void fp12::frobeniusMapAssign(const uint64_t& power)
         }
         case 6:
         {
-            c1 = c1.neg();
+            c1 = c1.negate();
             break;
         }
         default:
diff --git a/src/g.cpp b/src/g.cpp
index e9d75c4..6aac590 100644
--- a/src/g.cpp
+++ b/src/g.cpp
@@ -19,11 +19,12 @@ g1::g1(const g1& e) : x(e.x), y(e.y), z(e.z)
 
 optional<g1> g1::fromJacobianBytesBE(const span<const uint8_t, 144> in, const bool check, const bool raw)
 {
-    optional<fp> x = fp::fromBytesBE(span<const uint8_t, 48>(&in[ 0], &in[ 48]), check, raw);
-    optional<fp> y = fp::fromBytesBE(span<const uint8_t, 48>(&in[48], &in[ 96]), check, raw);
-    optional<fp> z = fp::fromBytesBE(span<const uint8_t, 48>(&in[96], &in[144]), check, raw);
-    if(!x.has_value() || !y.has_value() || !z.has_value()) return {};
-    g1 p = g1({x.value(), y.value(), z.value()});
+    // We decided to always validate the input here. Check flag will only affect on-curve checks.
+    optional<fp> x = fp::fromBytesBE(span<const uint8_t, 48>(&in[ 0], &in[ 48]), true, raw);
+    optional<fp> y = fp::fromBytesBE(span<const uint8_t, 48>(&in[48], &in[ 96]), true, raw);
+    optional<fp> z = fp::fromBytesBE(span<const uint8_t, 48>(&in[96], &in[144]), true, raw);
+    if(!x || !y || !z) return {};
+    g1 p = g1({*x, *y, *z});
     if(check && !p.isOnCurve())
     {
         return {};
@@ -33,11 +34,12 @@ optional<g1> g1::fromJacobianBytesBE(const span<const uint8_t, 144> in, const bo
 
 optional<g1> g1::fromJacobianBytesLE(const span<const uint8_t, 144> in, const bool check, const bool raw)
 {
-    optional<fp> x = fp::fromBytesLE(span<const uint8_t, 48>(&in[ 0], &in[ 48]), check, raw);
-    optional<fp> y = fp::fromBytesLE(span<const uint8_t, 48>(&in[48], &in[ 96]), check, raw);
-    optional<fp> z = fp::fromBytesLE(span<const uint8_t, 48>(&in[96], &in[144]), check, raw);
-    if(!x.has_value() || !y.has_value() || !z.has_value()) return {};
-    g1 p = g1({x.value(), y.value(), z.value()});
+    // We decided to always validate the input here. Check flag will only affect on-curve checks.
+    optional<fp> x = fp::fromBytesLE(span<const uint8_t, 48>(&in[ 0], &in[ 48]), true, raw);
+    optional<fp> y = fp::fromBytesLE(span<const uint8_t, 48>(&in[48], &in[ 96]), true, raw);
+    optional<fp> z = fp::fromBytesLE(span<const uint8_t, 48>(&in[96], &in[144]), true, raw);
+    if(!x || !y || !z) return {};
+    g1 p = g1({*x, *y, *z});
     if(check && !p.isOnCurve())
     {
         return {};
@@ -47,16 +49,17 @@ optional<g1> g1::fromJacobianBytesLE(const span<const uint8_t, 144> in, const bo
 
 optional<g1> g1::fromAffineBytesBE(const span<const uint8_t, 96> in, const bool check, const bool raw)
 {
-    optional<fp> x = fp::fromBytesBE(span<const uint8_t, 48>(&in[ 0], &in[ 48]), check, raw);
-    optional<fp> y = fp::fromBytesBE(span<const uint8_t, 48>(&in[48], &in[ 96]), check, raw);
-    if(!x.has_value() || !y.has_value()) return {};
+    // We decided to always validate the input here. Check flag will only affect on-curve checks.
+    optional<fp> x = fp::fromBytesBE(span<const uint8_t, 48>(&in[ 0], &in[ 48]), true, raw);
+    optional<fp> y = fp::fromBytesBE(span<const uint8_t, 48>(&in[48], &in[ 96]), true, raw);
+    if(!x || !y) return {};
     // check if given input points to infinity
-    if(x.value().isZero() && y.value().isZero())
+    if(x->isZero() && y->isZero())
     {
         return zero();
     }
     fp z = fp::one();
-    g1 p = g1({x.value(), y.value(), z});
+    g1 p = g1({*x, *y, z});
     if(check && !p.isOnCurve())
     {
         return {};
@@ -66,15 +69,17 @@ optional<g1> g1::fromAffineBytesBE(const span<const uint8_t, 96> in, const bool
 
 optional<g1> g1::fromAffineBytesLE(const span<const uint8_t, 96> in, const bool check, const bool raw)
 {
-    optional<fp> x = fp::fromBytesLE(span<const uint8_t, 48>(&in[ 0], &in[ 48]), check, raw);
-    optional<fp> y = fp::fromBytesLE(span<const uint8_t, 48>(&in[48], &in[ 96]), check, raw);
+    // We decided to always validate the input here. Check flag will only affect on-curve checks.
+    optional<fp> x = fp::fromBytesLE(span<const uint8_t, 48>(&in[ 0], &in[ 48]), true, raw);
+    optional<fp> y = fp::fromBytesLE(span<const uint8_t, 48>(&in[48], &in[ 96]), true, raw);
+    if(!x || !y) return {};
     // check if given input points to infinity
-    if(x.value().isZero() && y.value().isZero())
+    if(x->isZero() && y->isZero())
     {
         return zero();
     }
     fp z = fp::one();
-    g1 p = g1({x.value(), y.value(), z});
+    g1 p = g1({*x, *y, z});
     if(check && !p.isOnCurve())
     {
         return {};
@@ -110,7 +115,7 @@ optional<g1> g1::fromCompressedBytesBE(const span<const uint8_t, 48> in)
     //  =>  y   = +/- sqrt(x^3 + B)
     fp b = fp::B;
     _square(&p.y, &p.x);        // y = x^2
-    _mul(&p.y, &p.y, &p.x);     // y = x^2 * x = x^3
+    _multiply(&p.y, &p.y, &p.x);     // y = x^2 * x = x^3
     _add(&p.y, &p.y, &b);       // y = x^3 + B
     if(!p.y.sqrt(p.y))
     {
@@ -118,7 +123,7 @@ optional<g1> g1::fromCompressedBytesBE(const span<const uint8_t, 48> in)
     }
     if(p.y.isLexicographicallyLargest() ^ ysign)
     {
-        _neg(&p.y, &p.y);
+        _negate(&p.y, &p.y);
     }
     p.z = fp::one();
     return p;
@@ -251,12 +256,12 @@ bool g1::equal(const g1& e) const
     fp t[4];
     _square(&t[0], &z);
     _square(&t[1], &b.z);
-    _mul(&t[2], &t[0], &b.x);
-    _mul(&t[3], &t[1], &x);
-    _mul(&t[0], &t[0], &z);
-    _mul(&t[1], &t[1], &b.z);
-    _mul(&t[1], &t[1], &y);
-    _mul(&t[0], &t[0], &b.y);
+    _multiply(&t[2], &t[0], &b.x);
+    _multiply(&t[3], &t[1], &x);
+    _multiply(&t[0], &t[0], &z);
+    _multiply(&t[1], &t[1], &b.z);
+    _multiply(&t[1], &t[1], &y);
+    _multiply(&t[0], &t[0], &b.y);
     return t[0].equal(t[1]) && t[2].equal(t[3]);
 }
 
@@ -269,10 +274,10 @@ bool g1::inCorrectSubgroup() const
     g1 t1 = t0;
     t0 = t0.glvEndomorphism();
     t1 = t1.dbl();
-    t1 = t1.sub(*this);
-    t1 = t1.sub(t0);
-    t1 = t1.mulScalar<2>({0x0000000055555555, 0x396c8c005555e156});
-    t1 = t1.sub(t0);
+    t1 = t1.subtract(*this);
+    t1 = t1.subtract(t0);
+    t1 = t1.scale<2>({0x0000000055555555, 0x396c8c005555e156});
+    t1 = t1.subtract(t0);
     return t1.isZero();
 }
 
@@ -285,11 +290,11 @@ bool g1::isOnCurve() const
     fp t[4], _b = fp::B;
     _square(&t[0], &y);
     _square(&t[1], &x);
-    _mul(&t[1], &t[1], &x);
+    _multiply(&t[1], &t[1], &x);
     _square(&t[2], &z);
     _square(&t[3], &t[2]);
-    _mul(&t[2], &t[2], &t[3]);
-    _mul(&t[2], &_b, &t[2]);
+    _multiply(&t[2], &t[2], &t[3]);
+    _multiply(&t[2], &_b, &t[2]);
     _add(&t[1], &t[1], &t[2]);
     return t[0].equal(t[1]);
 }
@@ -309,120 +314,136 @@ g1 g1::affine() const
     fp t[2];
     t[0] = r.z.inverse();
     _square(&t[1], &t[0]);
-    _mul(&r.x, &r.x, &t[1]);
-    _mul(&t[0], &t[0], &t[1]);
-    _mul(&r.y, &r.y, &t[0]);
+    _multiply(&r.x, &r.x, &t[1]);
+    _multiply(&t[0], &t[0], &t[1]);
+    _multiply(&r.y, &r.y, &t[0]);
     r.z = fp::one();
     return r;
 }
 
 g1 g1::add(const g1& e) const
 {
+    g1 r(*this);
+    r.addAssign(e);
+    return r;
+}
+
+void g1::addAssign(const g1& e) {
     // www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
     if(isZero())
     {
-        return e;
+        *this = e;
+        return;
     }
     if(e.isZero())
     {
-        return *this;
+        return;
     }
     fp t[9];
     _square(&t[7], &z);
-    _mul(&t[1], &e.x, &t[7]);
-    _mul(&t[2], &z, &t[7]);
-    _mul(&t[0], &e.y, &t[2]);
+    _multiply(&t[1], &e.x, &t[7]);
+    _multiply(&t[2], &z, &t[7]);
+    _multiply(&t[0], &e.y, &t[2]);
     _square(&t[8], &e.z);
-    _mul(&t[3], &x, &t[8]);
-    _mul(&t[4], &e.z, &t[8]);
-    _mul(&t[2], &y, &t[4]);
+    _multiply(&t[3], &x, &t[8]);
+    _multiply(&t[4], &e.z, &t[8]);
+    _multiply(&t[2], &y, &t[4]);
     if(t[1].equal(t[3]))
     {
         if(t[0].equal(t[2]))
         {
-            return dbl();
+            doubleAssign();
+            return;
         }
-        return zero();
+        *this = zero();
+        return;
     }
-    g1 r;
-    _sub(&t[1], &t[1], &t[3]);
+    
+    _subtract(&t[1], &t[1], &t[3]);
     _double(&t[4], &t[1]);
     _square(&t[4], &t[4]);
-    _mul(&t[5], &t[1], &t[4]);
-    _sub(&t[0], &t[0], &t[2]);
+    _multiply(&t[5], &t[1], &t[4]);
+    _subtract(&t[0], &t[0], &t[2]);
     _double(&t[0], &t[0]);
     _square(&t[6], &t[0]);
-    _sub(&t[6], &t[6], &t[5]);
-    _mul(&t[3], &t[3], &t[4]);
+    _subtract(&t[6], &t[6], &t[5]);
+    _multiply(&t[3], &t[3], &t[4]);
     _double(&t[4], &t[3]);
-    _sub(&r.x, &t[6], &t[4]);
-    _sub(&t[4], &t[3], &r.x);
-    _mul(&t[6], &t[2], &t[5]);
+    _subtract(&x, &t[6], &t[4]);
+    _subtract(&t[4], &t[3], &x);
+    _multiply(&t[6], &t[2], &t[5]);
     _double(&t[6], &t[6]);
-    _mul(&t[0], &t[0], &t[4]);
-    _sub(&r.y, &t[0], &t[6]);
+    _multiply(&t[0], &t[0], &t[4]);
+    _subtract(&y, &t[0], &t[6]);
     _add(&t[0], &z, &e.z);
     _square(&t[0], &t[0]);
-    _sub(&t[0], &t[0], &t[7]);
-    _sub(&t[0], &t[0], &t[8]);
-    _mul(&r.z, &t[0], &t[1]);
-    return r;
+    _subtract(&t[0], &t[0], &t[7]);
+    _subtract(&t[0], &t[0], &t[8]);
+    _multiply(&z, &t[0], &t[1]);
 }
 
 g1 g1::dbl() const
 {
+    g1 r(*this);
+    r.doubleAssign();
+    return r;
+}
+
+void g1::doubleAssign() {
     // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
     if(isZero())
     {
-        return *this;
+        return;
     }
     fp t[5];
-    g1 r;
+
     _square(&t[0], &x);
     _square(&t[1], &y);
     _square(&t[2], &t[1]);
     _add(&t[1], &x, &t[1]);
     _square(&t[1], &t[1]);
-    _sub(&t[1], &t[1], &t[0]);
-    _sub(&t[1], &t[1], &t[2]);
+    _subtract(&t[1], &t[1], &t[0]);
+    _subtract(&t[1], &t[1], &t[2]);
     _double(&t[1], &t[1]);
     _double(&t[3], &t[0]);
     _add(&t[0], &t[3], &t[0]);
     _square(&t[4], &t[0]);
     _double(&t[3], &t[1]);
-    _sub(&r.x, &t[4], &t[3]);
-    _sub(&t[1], &t[1], &r.x);
+    _subtract(&x, &t[4], &t[3]);
+    _subtract(&t[1], &t[1], &x);
     _double(&t[2], &t[2]);
     _double(&t[2], &t[2]);
     _double(&t[2], &t[2]);
-    _mul(&t[0], &t[0], &t[1]);
-    _sub(&t[1], &t[0], &t[2]);
-    _mul(&t[0], &y, &z);
-    r.y = t[1];
-    _double(&r.z, &t[0]);
-    return r;
+    _multiply(&t[0], &t[0], &t[1]);
+    _subtract(&t[1], &t[0], &t[2]);
+    _multiply(&t[0], &y, &z);
+    y = t[1];
+    _double(&z, &t[0]);
 }
 
-g1 g1::neg() const
+g1 g1::negate() const
 {
     g1 r;
     r.x = x;
     r.z = z;
-    _neg(&r.y, &y);
+    _negate(&r.y, &y);
     return r;
 }
 
-g1 g1::sub(const g1& e) const
+g1 g1::subtract(const g1& e) const
 {
-    g1 c, d;
-    d = e.neg();
-    c = this->add(d);
+    g1 c(*this);
+    c.subtractAssign(e);
     return c;
 }
 
+void g1::subtractAssign(const g1& e) {
+    addAssign(e.negate());
+}
+
 g1 g1::clearCofactor() const
 {
-    return this->mulScalar(cofactorEFF);
+    return this->scale(cofactorEFF);
 }
 
 g1 g1::glvEndomorphism() const
@@ -436,19 +457,17 @@ g1 g1::glvEndomorphism() const
     return t;
 }
 
-// MultiExp calculates multi exponentiation. Given pairs of G1 point and scalar values
+// Given pairs of G1 point and scalar values
 // (P_0, e_0), (P_1, e_1), ... (P_n, e_n) calculates r = e_0 * P_0 + e_1 * P_1 + ... + e_n * P_n
-// Length of points and scalars are expected to be equal, otherwise NONE is returned.
-optional<g1> g1::multiExp(std::span<const g1> points, std::span<const array<uint64_t, 4>> scalars, std::function<void()> yield)
+// If length of points and scalars are not the same, then missing points will be treated as the zero point 
+// and missing scalars will be treated as the zero scalar.
+g1 g1::weightedSum(std::span<const g1> points, std::span<const std::array<uint64_t, 4>> scalars, const std::function<void()>& yield)
 {
-    if(points.size() != scalars.size())
-    {
-        return {};
-    }
+    const size_t effective_size = min(scalars.size(), points.size());
     uint64_t c = 3;
-    if(scalars.size() >= 32)
+    if(effective_size >= 32)
     {
-        c = static_cast<uint64_t>(ceil(log10(static_cast<double>(scalars.size()))));
+        c = (std::numeric_limits<size_t>::digits - std::countl_zero(effective_size))/3 + 2;
     }
     uint64_t bucketSize = (1<<c)-1;
     uint64_t windowsSize = 255/c+1;
@@ -460,39 +479,45 @@ optional<g1> g1::multiExp(std::span<const g1> points, std::span<const array<uint
     {
         for(uint64_t i = 0; i < bucketSize; i++)
         {
+            if (yield && ((i & 255) == 0)) {
+                yield();
+            }
             bucket[i] = zero();
         }
-        for(uint64_t i = 0; i < scalars.size(); i++)
+        for(uint64_t i = 0; i < effective_size; i++)
         {
+            if (yield && ((i & 255) == 0)) {
+                yield();
+            }
             array<uint64_t, 4> shifted;
             scalar::rsh(shifted, scalars[i], c*j);
             uint64_t index = bucketSize & shifted[0];
             if(index != 0)
             {
-                bucket[index-1] = bucket[index-1].add(points[i]);
+                bucket[index-1].addAssign(points[i]);
             }
         }
         g1 acc = zero();
         g1 sum = zero();
         for(int64_t i = bucketSize-1; i >= 0; i--)
         {
-            sum = sum.add(bucket[i]);
-            acc = acc.add(sum);
+            if (yield && ((i & 255) == 0)) {
+                yield();
+            }
+            sum.addAssign(bucket[i]);
+            acc.addAssign(sum);
         }
         windows.push_back(acc);
     }
-    if(scalars.size() >= 32 && yield)
-    {
-        yield();
-    }
+
     g1 acc = zero();
     for(int64_t i = windows.size()-1; i >= 0; i--)
     {
         for(uint64_t j = 0; j < c; j++)
         {
-            acc = acc.dbl();
+            acc.doubleAssign();
         }
-        acc = acc.add(windows[i]);
+        acc.addAssign(windows[i]);
     }
     return acc;
 }
@@ -529,7 +554,7 @@ tuple<fp, fp> g1::swuMapG1(const fp& e)
     };
     fp tv[4];
     _square(&tv[0], &e);
-    _mul(&tv[0], &tv[0], &params.z);
+    _multiply(&tv[0], &tv[0], &params.z);
     _square(&tv[1], &tv[0]);
     fp x1;
     _add(&x1, &tv[0], &tv[1]);
@@ -541,17 +566,17 @@ tuple<fp, fp> g1::swuMapG1(const fp& e)
     {
         x1 = params.zInv;
     }
-    _mul(&x1, &x1, &params.minusBOverA);
+    _multiply(&x1, &x1, &params.minusBOverA);
     fp gx1;
     _square(&gx1, &x1);
     _add(&gx1, &gx1, &params.a);
-    _mul(&gx1, &gx1, &x1);
+    _multiply(&gx1, &gx1, &x1);
     _add(&gx1, &gx1, &params.b);
     fp x2;
-    _mul(&x2, &tv[0], &x1);
-    _mul(&tv[1], &tv[0], &tv[1]);
+    _multiply(&x2, &tv[0], &x1);
+    _multiply(&tv[1], &tv[0], &tv[1]);
     fp gx2;
-    _mul(&gx2, &gx1, &tv[1]);
+    _multiply(&gx2, &gx1, &tv[1]);
     bool e2 = !gx1.isQuadraticNonResidue();
     fp x, y2;
     if(e2)
@@ -568,7 +593,7 @@ tuple<fp, fp> g1::swuMapG1(const fp& e)
     y2.sqrt(y);
     if(y.sign() != e.sign())
     {
-        _neg(&y, &y);
+        _negate(&y, &y);
     }
     return {x, y};
 }
@@ -659,10 +684,10 @@ void g1::isogenyMapG1(fp& x, fp& y)
     yDen = params[3][degree];
     for(int64_t i = degree - 1; i >= 0; i--)
     {
-        _mul(&xNum, &xNum, &x);
-        _mul(&xDen, &xDen, &x);
-        _mul(&yNum, &yNum, &x);
-        _mul(&yDen, &yDen, &x);
+        _multiply(&xNum, &xNum, &x);
+        _multiply(&xDen, &xDen, &x);
+        _multiply(&yNum, &yNum, &x);
+        _multiply(&yDen, &yDen, &x);
         _add(&xNum, &xNum, &params[0][i]);
         _add(&xDen, &xDen, &params[1][i]);
         _add(&yNum, &yNum, &params[2][i]);
@@ -670,9 +695,9 @@ void g1::isogenyMapG1(fp& x, fp& y)
     }
     xDen = xDen.inverse();
     yDen = yDen.inverse();
-    _mul(&xNum, &xNum, &xDen);
-    _mul(&yNum, &yNum, &yDen);
-    _mul(&yNum, &yNum, &y);
+    _multiply(&xNum, &xNum, &xDen);
+    _multiply(&yNum, &yNum, &yDen);
+    _multiply(&yNum, &yNum, &y);
     x = xNum;
     y = yNum;
 }
@@ -699,11 +724,12 @@ g2::g2(const g2& e) : x(e.x), y(e.y), z(e.z)
 
 optional<g2> g2::fromJacobianBytesBE(const span<const uint8_t, 288> in, const bool check, const bool raw)
 {
-    optional<fp2> x = fp2::fromBytesBE(span<const uint8_t, 96>(&in[  0], &in[ 96]), check, raw);
-    optional<fp2> y = fp2::fromBytesBE(span<const uint8_t, 96>(&in[ 96], &in[192]), check, raw);
-    optional<fp2> z = fp2::fromBytesBE(span<const uint8_t, 96>(&in[192], &in[288]), check, raw);
-    if(!x.has_value() || !y.has_value() || !z.has_value()) return {};
-    g2 p = g2({x.value(), y.value(), z.value()});
+    // We decided to always validate the input here. Check flag will only affect on-curve checks.
+    optional<fp2> x = fp2::fromBytesBE(span<const uint8_t, 96>(&in[  0], &in[ 96]), true, raw);
+    optional<fp2> y = fp2::fromBytesBE(span<const uint8_t, 96>(&in[ 96], &in[192]), true, raw);
+    optional<fp2> z = fp2::fromBytesBE(span<const uint8_t, 96>(&in[192], &in[288]), true, raw);
+    if(!x || !y || !z) return {};
+    g2 p = g2({*x, *y, *z});
     if(check && !p.isOnCurve())
     {
         return {};
@@ -713,11 +739,12 @@ optional<g2> g2::fromJacobianBytesBE(const span<const uint8_t, 288> in, const bo
 
 optional<g2> g2::fromJacobianBytesLE(const span<const uint8_t, 288> in, const bool check, const bool raw)
 {
-    optional<fp2> x = fp2::fromBytesLE(span<const uint8_t, 96>(&in[  0], &in[ 96]), check, raw);
-    optional<fp2> y = fp2::fromBytesLE(span<const uint8_t, 96>(&in[ 96], &in[192]), check, raw);
-    optional<fp2> z = fp2::fromBytesLE(span<const uint8_t, 96>(&in[192], &in[288]), check, raw);
-    if(!x.has_value() || !y.has_value() || !z.has_value()) return {};
-    g2 p = g2({x.value(), y.value(), z.value()});
+    // We decided to always validate the input here. Check flag will only affect on-curve checks.
+    optional<fp2> x = fp2::fromBytesLE(span<const uint8_t, 96>(&in[  0], &in[ 96]), true, raw);
+    optional<fp2> y = fp2::fromBytesLE(span<const uint8_t, 96>(&in[ 96], &in[192]), true, raw);
+    optional<fp2> z = fp2::fromBytesLE(span<const uint8_t, 96>(&in[192], &in[288]), true, raw);
+    if(!x || !y || !z) return {};
+    g2 p = g2({*x, *y, *z});
     if(check && !p.isOnCurve())
     {
         return {};
@@ -727,16 +754,17 @@ optional<g2> g2::fromJacobianBytesLE(const span<const uint8_t, 288> in, const bo
 
 optional<g2> g2::fromAffineBytesBE(const span<const uint8_t, 192> in, const bool check, const bool raw)
 {
-    optional<fp2> x = fp2::fromBytesBE(span<const uint8_t, 96>(&in[  0], &in[ 96]), check, raw);
-    optional<fp2> y = fp2::fromBytesBE(span<const uint8_t, 96>(&in[ 96], &in[192]), check, raw);
-    if(!x.has_value() || !y.has_value()) return {};
+    // We decided to always validate the input here. Check flag will only affect on-curve checks.
+    optional<fp2> x = fp2::fromBytesBE(span<const uint8_t, 96>(&in[  0], &in[ 96]), true, raw);
+    optional<fp2> y = fp2::fromBytesBE(span<const uint8_t, 96>(&in[ 96], &in[192]), true, raw);
+    if(!x || !y) return {};
     // check if given input points to infinity
-    if(x.value().isZero() && y.value().isZero())
+    if(x->isZero() && y->isZero())
     {
         return zero();
     }
     fp2 z = fp2::one();
-    g2 p = g2({x.value(), y.value(), z});
+    g2 p = g2({*x, *y, z});
     if(check && !p.isOnCurve())
     {
         return {};
@@ -746,16 +774,17 @@ optional<g2> g2::fromAffineBytesBE(const span<const uint8_t, 192> in, const bool
 
 optional<g2> g2::fromAffineBytesLE(const span<const uint8_t, 192> in, const bool check, const bool raw)
 {
-    optional<fp2> x = fp2::fromBytesLE(span<const uint8_t, 96>(&in[  0], &in[ 96]), check, raw);
-    optional<fp2> y = fp2::fromBytesLE(span<const uint8_t, 96>(&in[ 96], &in[192]), check, raw);
-    if(!x.has_value() || !y.has_value()) return {};
+    // We decided to always validate the input here. Check flag will only affect on-curve checks.
+    optional<fp2> x = fp2::fromBytesLE(span<const uint8_t, 96>(&in[  0], &in[ 96]), true, raw);
+    optional<fp2> y = fp2::fromBytesLE(span<const uint8_t, 96>(&in[ 96], &in[192]), true, raw);
+    if(!x || !y) return {};
     // check if given input points to infinity
-    if(x.value().isZero() && y.value().isZero())
+    if(x->isZero() && y->isZero())
     {
         return zero();
     }
     fp2 z = fp2::one();
-    g2 p = g2({x.value(), y.value(), z});
+    g2 p = g2({*x, *y, z});
     if(check && !p.isOnCurve())
     {
         return {};
@@ -779,7 +808,11 @@ optional<g2> g2::fromCompressedBytesBE(const span<const uint8_t, 96> in)
     bool ysign = ((in[0] >> 5) & 1) == 1;
     g2 p;
     scalar::fromBytesBE(span<const uint8_t, 48>(&in[0], &in[48]), p.x.c1.d);
-    p.x.c0 = fp::fromBytesBE(span<const uint8_t, 48>(&in[48], &in[96])).value();
+    auto c0 = fp::fromBytesBE(span<const uint8_t, 48>(&in[48], &in[96]));
+    if (!c0) {
+        return {};
+    }
+    p.x.c0 = *c0;
     // erase 3 msbs from given input and perform validity check
     p.x.c1.d[5] &= 0x1FFFFFFFFFFFFFFF;
     p.x.c1 = p.x.c1.toMont();
@@ -792,7 +825,7 @@ optional<g2> g2::fromCompressedBytesBE(const span<const uint8_t, 96> in)
     //  =>  y   = +/- sqrt(x^3 + B)
     fp2 b = fp2::B;
     p.y = p.x.square();         // y = x^2
-    p.y = p.y.mul(p.x);         // y = x^2 * x = x^3
+    p.y = p.y.multiply(p.x);         // y = x^2 * x = x^3
     p.y = p.y.add(b);           // y = x^3 + B
     if(!p.y.sqrt(p.y))
     {
@@ -800,7 +833,7 @@ optional<g2> g2::fromCompressedBytesBE(const span<const uint8_t, 96> in)
     }
     if(p.y.isLexicographicallyLargest() ^ ysign)
     {
-        p.y = p.y.neg();
+        p.y = p.y.negate();
     }
     p.z = fp2::one();
     return p;
@@ -933,12 +966,12 @@ bool g2::equal(const g2& e) const
     fp2 t[9];
     t[0] = z.square();
     t[1] = e.z.square();
-    t[2] = t[0].mul(e.x);
-    t[3] = t[1].mul(x);
-    t[0] = t[0].mul(z);
-    t[1] = t[1].mul(e.z);
-    t[1] = t[1].mul(y);
-    t[0] = t[0].mul(e.y);
+    t[2] = t[0].multiply(e.x);
+    t[3] = t[1].multiply(x);
+    t[0] = t[0].multiply(z);
+    t[1] = t[1].multiply(e.z);
+    t[1] = t[1].multiply(y);
+    t[0] = t[0].multiply(e.y);
     return t[0].equal(t[1]) && t[2].equal(t[3]);
 }
 
@@ -950,10 +983,10 @@ bool g2::inCorrectSubgroup() const
     g2 t0, t1;
     t0 = this->psi();
     t0 = t0.psi();
-    t1 = t0.neg();                  // - ψ^2(P)
+    t1 = t0.negate();               // - ψ^2(P)
     t0 = t0.psi();                  // ψ^3(P)
-    t0 = t0.mulScalar(cofactorEFF); // - x ψ^3(P)
-    t0 = t0.neg();
+    t0 = t0.scale(cofactorEFF);     // - x ψ^3(P)
+    t0 = t0.negate();
 
     t0 = t0.add(t1);
     t0 = t0.add(*this);
@@ -970,11 +1003,11 @@ bool g2::isOnCurve() const
     fp2 t[9];
     t[0] = y.square();
     t[1] = x.square();
-    t[1] = t[1].mul(x);
+    t[1] = t[1].multiply(x);
     t[2] = z.square();
     t[3] = t[2].square();
-    t[2] = t[2].mul(t[3]);
-    t[2] = fp2::B.mul(t[2]);
+    t[2] = t[2].multiply(t[3]);
+    t[2] = fp2::B.multiply(t[2]);
     t[1] = t[1].add(t[2]);
     return t[0].equal(t[1]);
 }
@@ -994,125 +1027,141 @@ g2 g2::affine() const
     fp2 t[9];
     t[0] = r.z.inverse();
     t[1] = t[0].square();
-    r.x = r.x.mul(t[1]);
-    t[0] = t[0].mul(t[1]);
-    r.y = r.y.mul(t[0]);
+    r.x = r.x.multiply(t[1]);
+    t[0] = t[0].multiply(t[1]);
+    r.y = r.y.multiply(t[0]);
     r.z = fp2::one();
     return r;
 }
 
 g2 g2::add(const g2& e) const
 {
+    g2 r(*this);
+    r.addAssign(e);
+    return r;
+}
+
+void g2::addAssign(const g2& e) {
     // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl
     if(isZero())
     {
-        return e;
+        *this = e;
+        return;
     }
     if(e.isZero())
     {
-        return *this;
+        return;
     }
     fp2 t[9];
     t[7] = z.square();
-    t[1] = e.x.mul(t[7]);
-    t[2] = z.mul(t[7]);
-    t[0] = e.y.mul(t[2]);
+    t[1] = e.x.multiply(t[7]);
+    t[2] = z.multiply(t[7]);
+    t[0] = e.y.multiply(t[2]);
     t[8] = e.z.square();
-    t[3] = x.mul(t[8]);
-    t[4] = e.z.mul(t[8]);
-    t[2] = y.mul(t[4]);
+    t[3] = x.multiply(t[8]);
+    t[4] = e.z.multiply(t[8]);
+    t[2] = y.multiply(t[4]);
     if(t[1].equal(t[3]))
     {
         if(t[0].equal(t[2]))
         {
-            return dbl();
+            doubleAssign();
+            return;
         }
-        return zero();
+        *this = zero();
+        return;
     }
-    g2 r;
-    t[1] = t[1].sub(t[3]);
+
+    t[1] = t[1].subtract(t[3]);
     t[4] = t[1].dbl();
     t[4] = t[4].square();
-    t[5] = t[1].mul(t[4]);
-    t[0] = t[0].sub(t[2]);
+    t[5] = t[1].multiply(t[4]);
+    t[0] = t[0].subtract(t[2]);
     t[0] = t[0].dbl();
     t[6] = t[0].square();
-    t[6] = t[6].sub(t[5]);
-    t[3] = t[3].mul(t[4]);
+    t[6] = t[6].subtract(t[5]);
+    t[3] = t[3].multiply(t[4]);
     t[4] = t[3].dbl();
-    r.x = t[6].sub(t[4]);
-    t[4] = t[3].sub(r.x);
-    t[6] = t[2].mul(t[5]);
+    x = t[6].subtract(t[4]);
+    t[4] = t[3].subtract(x);
+    t[6] = t[2].multiply(t[5]);
     t[6] = t[6].dbl();
-    t[0] = t[0].mul(t[4]);
-    r.y = t[0].sub(t[6]);
+    t[0] = t[0].multiply(t[4]);
+    y = t[0].subtract(t[6]);
     t[0] = z.add(e.z);
     t[0] = t[0].square();
-    t[0] = t[0].sub(t[7]);
-    t[0] = t[0].sub(t[8]);
-    r.z = t[0].mul(t[1]);
-    return r;
+    t[0] = t[0].subtract(t[7]);
+    t[0] = t[0].subtract(t[8]);
+    z = t[0].multiply(t[1]);
 }
 
 g2 g2::dbl() const
 {
+    g2 r(*this);
+    r.doubleAssign();
+    return r;
+}
+
+void g2::doubleAssign() {
     // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l
     if(isZero())
     {
-        return *this;
+        return;
     }
     fp2 t[9];
-    g2 r;
+
     t[0] = x.square();
     t[1] = y.square();
     t[2] = t[1].square();
     t[1] = x.add(t[1]);
     t[1] = t[1].square();
-    t[1] = t[1].sub(t[0]);
-    t[1] = t[1].sub(t[2]);
+    t[1] = t[1].subtract(t[0]);
+    t[1] = t[1].subtract(t[2]);
     t[1] = t[1].dbl();
     t[3] = t[0].dbl();
     t[0] = t[3].add(t[0]);
     t[4] = t[0].square();
     t[3] = t[1].dbl();
-    r.x = t[4].sub(t[3]);
-    t[1] = t[1].sub(r.x);
+    x = t[4].subtract(t[3]);
+    t[1] = t[1].subtract(x);
     t[2] = t[2].dbl();
     t[2] = t[2].dbl();
     t[2] = t[2].dbl();
-    t[0] = t[0].mul(t[1]);
-    t[1] = t[0].sub(t[2]);
-    t[0] = y.mul(z);
-    r.y = t[1];
-    r.z = t[0].dbl();
-    return r;
+    t[0] = t[0].multiply(t[1]);
+    t[1] = t[0].subtract(t[2]);
+    t[0] = y.multiply(z);
+    y = t[1];
+    z = t[0].dbl();
 }
 
-g2 g2::neg() const
+g2 g2::negate() const
 {
     g2 r;
     r.x = x;
-    r.y = y.neg();
+    r.y = y.negate();
     r.z = z;
     return r;
 }
 
-g2 g2::sub(const g2& e) const
+g2 g2::subtract(const g2& e) const
 {
-    g2 c, d;
-    d = e.neg();
-    c = this->add(d);
+    g2 c(*this);
+    c.subtractAssign(e);
     return c;
 }
 
+void g2::subtractAssign(const g2& e) {
+    addAssign(e.negate());
+}
+
 g2 g2::psi() const
 {
     g2 p;
-    p.x = this->x.conj();
-    p.y = this->y.conj();
-    p.z = this->z.conj();
-    p.x = p.x.mul(fp2::psiX);
-    p.y = p.y.mul(fp2::psiY);
+    p.x = this->x.conjugate();
+    p.y = this->y.conjugate();
+    p.z = this->z.conjugate();
+    p.x = p.x.multiply(fp2::psiX);
+    p.y = p.y.multiply(fp2::psiY);
     return p;
 }
 
@@ -1120,19 +1169,19 @@ g2 g2::clearCofactor() const
 {
     g2 t0, t1, t2, t3;
     // Compute t0 = xP
-    t0 = mulScalar(cofactorEFF);
+    t0 = scale(cofactorEFF);
     // cofactorEFF has the MSB set, so relic has the sign bit set and negates the y coordinate
-    t0 = t0.neg();
+    t0 = t0.negate();
     // Compute t1 = [x^2]P
-    t1 = t0.mulScalar(cofactorEFF);
+    t1 = t0.scale(cofactorEFF);
     // cofactorEFF has the MSB set, so relic has the sign bit set and negates the y coordinate
-    t1 = t1.neg();
+    t1 = t1.negate();
 
     // t2 = (x^2 - x - 1)P = x^2P - x*P - P
-    t2 = t1.sub(t0);
-    t2 = t2.sub(*this);
+    t2 = t1.subtract(t0);
+    t2 = t2.subtract(*this);
     // t3 = \psi(x - 1)P
-    t3 = t0.sub(*this);
+    t3 = t0.subtract(*this);
     t3 = t3.frobeniusMap(1);
     t2 = t2.add(t3);
     // t3 = \psi^2(2P)
@@ -1158,25 +1207,23 @@ g2 g2::frobeniusMap(int64_t power) const
         r.x.frobeniusMapAssign(1);
         r.y.frobeniusMapAssign(1);
         r.z.frobeniusMapAssign(1);
-        r.x = r.x.mul(frb0);
-        r.y = r.y.mul(frb1);
+        r.x = r.x.multiply(frb0);
+        r.y = r.y.multiply(frb1);
     }
     return r;
 }
 
-// MultiExp calculates multi exponentiation. Given pairs of G2 point and scalar values
+// Given pairs of G2 point and scalar values
 // (P_0, e_0), (P_1, e_1), ... (P_n, e_n) calculates r = e_0 * P_0 + e_1 * P_1 + ... + e_n * P_n
-// Length of points and scalars are expected to be equal, otherwise NONE is returned.
-optional<g2> g2::multiExp(std::span<const g2> points, std::span<const std::array<uint64_t, 4>> scalars, std::function<void()> yield)
+// If length of points and scalars are not the same, then missing points will be treated as the zero point 
+// and missing scalars will be treated as the zero scalar.
+g2 g2::weightedSum(std::span<const g2> points, std::span<const std::array<uint64_t, 4>> scalars, const std::function<void()>& yield)
 {
-    if(points.size() != scalars.size())
-    {
-        return {};
-    }
+    const size_t effective_size = min(scalars.size(), points.size());
     uint64_t c = 3;
-    if(scalars.size() >= 32)
+    if(effective_size >= 32)
     {
-        c = static_cast<uint64_t>(ceil(log10(static_cast<double>(scalars.size()))));
+        c = (std::numeric_limits<size_t>::digits - std::countl_zero(effective_size))/3 + 2;
     }
     uint64_t bucketSize = (1<<c)-1;
     uint64_t windowsSize = 255/c+1;
@@ -1188,39 +1235,45 @@ optional<g2> g2::multiExp(std::span<const g2> points, std::span<const std::array
     {
         for(uint64_t i = 0; i < bucketSize; i++)
         {
+            if (yield && ((i & 255) == 0)) {
+                yield();
+            }
             bucket[i] = zero();
         }
-        for(uint64_t i = 0; i < scalars.size(); i++)
+        for(uint64_t i = 0; i < effective_size; i++)
         {
+            if (yield && ((i & 255) == 0)) {
+                yield();
+            }
             array<uint64_t, 4> shifted;
             scalar::rsh(shifted, scalars[i], c*j);
             uint64_t index = bucketSize & shifted[0];
             if(index != 0)
             {
-                bucket[index-1] = bucket[index-1].add(points[i]);
+                bucket[index-1].addAssign(points[i]);
             }
         }
         g2 acc = zero();
         g2 sum = zero();
         for(int64_t i = bucketSize-1; i >= 0; i--)
         {
-            sum = sum.add(bucket[i]);
-            acc = acc.add(sum);
+            if (yield && ((i & 255) == 0)) {
+                yield();
+            }
+            sum.addAssign(bucket[i]);
+            acc.addAssign(sum);
         }
         windows.push_back(acc);
     }
-    if(scalars.size() >= 32 && yield)
-    {
-        yield();
-    }
+
     g2 acc = zero();
     for(int64_t i = windows.size()-1; i >= 0; i--)
     {
         for(uint64_t j = 0; j < c; j++)
         {
-            acc = acc.dbl();
+            acc.doubleAssign();
         }
-        acc = acc.add(windows[i]);
+        acc.addAssign(windows[i]);
     }
     return acc;
 }
@@ -1272,7 +1325,7 @@ tuple<fp2, fp2> g2::swuMapG2(const fp2& e)
     };
     fp2 tv[4];
     tv[0] = e.square();
-    tv[0] = tv[0].mul(params.z);
+    tv[0] = tv[0].multiply(params.z);
     tv[1] = tv[0].square();
     fp2 x1;
     x1 = tv[0].add(tv[1]);
@@ -1283,17 +1336,17 @@ tuple<fp2, fp2> g2::swuMapG2(const fp2& e)
     {
         x1 = params.zInv;
     }
-    x1 = x1.mul(params.minusBOverA);
+    x1 = x1.multiply(params.minusBOverA);
     fp2 gx1;
     gx1 = x1.square();
     gx1 = gx1.add(params.a);
-    gx1 = gx1.mul(x1);
+    gx1 = gx1.multiply(x1);
     gx1 = gx1.add(params.b);
     fp2 x2;
-    x2 = tv[0].mul(x1);
-    tv[1] = tv[0].mul(tv[1]);
+    x2 = tv[0].multiply(x1);
+    tv[1] = tv[0].multiply(tv[1]);
     fp2 gx2;
-    gx2 = gx1.mul(tv[1]);
+    gx2 = gx1.multiply(tv[1]);
     bool e2 = !gx1.isQuadraticNonResidue();
     fp2 x, y2;
     if(e2)
@@ -1310,7 +1363,7 @@ tuple<fp2, fp2> g2::swuMapG2(const fp2& e)
     y2.sqrt(y);
     if(y.sign() != e.sign())
     {
-        y = y.neg();
+        y = y.negate();
     }
     return {x, y};
 }
@@ -1400,10 +1453,10 @@ void g2::isogenyMapG2(fp2& x, fp2& y)
     fp2 yDen = params[3][degree];
     for(int64_t i = degree - 1; i >= 0; i--)
     {
-        xNum = xNum.mul(x);
-        xDen = xDen.mul(x);
-        yNum = yNum.mul(x);
-        yDen = yDen.mul(x);
+        xNum = xNum.multiply(x);
+        xDen = xDen.multiply(x);
+        yNum = yNum.multiply(x);
+        yDen = yDen.multiply(x);
         xNum = xNum.add(params[0][i]);
         xDen = xDen.add(params[1][i]);
         yNum = yNum.add(params[2][i]);
@@ -1411,9 +1464,9 @@ void g2::isogenyMapG2(fp2& x, fp2& y)
     }
     xDen = xDen.inverse();
     yDen = yDen.inverse();
-    xNum = xNum.mul(xDen);
-    yNum = yNum.mul(yDen);
-    yNum = yNum.mul(y);
+    xNum = xNum.multiply(xDen);
+    yNum = yNum.multiply(yDen);
+    yNum = yNum.multiply(y);
     x = xNum;
     y = yNum;
 }
@@ -1502,38 +1555,38 @@ g2 g2::isogenyMap() const
     t0 = params[0][3];
     for(int i = 3; i > 0; --i)
     {
-        t0 = t0.mul(x);
+        t0 = t0.multiply(x);
         t0 = t0.add(params[0][i - 1]);
     }
     t1 = params[2][3];
     for(int i = 3; i > 0; --i)
     {
-        t1 = t1.mul(x);
+        t1 = t1.multiply(x);
         t1 = t1.add(params[2][i - 1]);
     }
     t2 = params[3][3];
     for(int i = 3; i > 0; --i)
     {
-        t2 = t2.mul(x);
+        t2 = t2.multiply(x);
         t2 = t2.add(params[3][i - 1]);
     }
     t3 = params[1][2];
     for(int i = 2; i > 0; --i)
     {
-        t3 = t3.mul(x);
+        t3 = t3.multiply(x);
         t3 = t3.add(params[1][i - 1]);
     }
 
     // Y = Ny * Dx * Z^2.
-    q.y = y.mul(t1);
-    q.y = q.y.mul(t3);
+    q.y = y.multiply(t1);
+    q.y = q.y.multiply(t3);
     // Z = Dx * Dy, t1 = Z^2.
-    q.z = t2.mul(t3);
+    q.z = t2.multiply(t3);
     t1 = q.z.square();
-    q.y = q.y.mul(t1);
+    q.y = q.y.multiply(t1);
     // X = Nx * Dy * Z.
-    q.x = t0.mul(t2);
-    q.x = q.x.mul(q.z);
+    q.x = t0.multiply(t2);
+    q.x = q.x.multiply(q.z);
 
     return q;
 }
diff --git a/src/pairing.cpp b/src/pairing.cpp
index d142edf..1f98083 100644
--- a/src/pairing.cpp
+++ b/src/pairing.cpp
@@ -11,7 +11,7 @@ void doubling_step(array<fp2, 3>& coeff, g2& r)
 {
     // Adaptation of Formula 3 in https://eprint.iacr.org/2010/526.pdf
     fp2 t[10];
-    t[0] = r.x.mul(r.y);
+    t[0] = r.x.multiply(r.y);
     t[0] = t[0].mulByFq(fp::twoInv);
     t[1] = r.y.square();
     t[2] = r.z.square();
@@ -25,50 +25,50 @@ void doubling_step(array<fp2, 3>& coeff, g2& r)
     t[6] = r.y.add(r.z);
     t[6] = t[6].square();
     t[7] = t[2].add(t[1]);
-    t[6] = t[6].sub(t[7]);
-    coeff[0] = t[3].sub(t[1]);
+    t[6] = t[6].subtract(t[7]);
+    coeff[0] = t[3].subtract(t[1]);
     t[7] = r.x.square();
-    t[4] = t[1].sub(t[4]);
-    r.x = t[4].mul(t[0]);
+    t[4] = t[1].subtract(t[4]);
+    r.x = t[4].multiply(t[0]);
     t[2] = t[3].square();
     t[3] = t[2].dbl();
     t[3] = t[3].add(t[2]);
     t[5] = t[5].square();
-    r.y = t[5].sub(t[3]);
-    r.z = t[1].mul(t[6]);
+    r.y = t[5].subtract(t[3]);
+    r.z = t[1].multiply(t[6]);
     t[0] = t[7].dbl();
     coeff[1] = t[0].add(t[7]);
-    coeff[2] = t[6].neg();
+    coeff[2] = t[6].negate();
 }
 
 void addition_step(array<fp2, 3>& coeff, g2& r, const g2& tp)
 {
     // Algorithm 12 in https://eprint.iacr.org/2010/526.pdf
     fp2 t[10];
-    t[0] = tp.y.mul(r.z);
-    t[0] = t[0].neg();
+    t[0] = tp.y.multiply(r.z);
+    t[0] = t[0].negate();
     t[0] = t[0].add(r.y);
-    t[1] = tp.x.mul(r.z);
-    t[1] = t[1].neg();
+    t[1] = tp.x.multiply(r.z);
+    t[1] = t[1].negate();
     t[1] = t[1].add(r.x);
     t[2] = t[0].square();
     t[3] = t[1].square();
-    t[4] = t[1].mul(t[3]);
-    t[2] = r.z.mul(t[2]);
-    t[3] = r.x.mul(t[3]);
+    t[4] = t[1].multiply(t[3]);
+    t[2] = r.z.multiply(t[2]);
+    t[3] = r.x.multiply(t[3]);
     t[5] = t[3].dbl();
-    t[5] = t[4].sub(t[5]);
+    t[5] = t[4].subtract(t[5]);
     t[5] = t[5].add(t[2]);
-    r.x = t[1].mul(t[5]);
-    t[2] = t[3].sub(t[5]);
-    t[2] = t[2].mul(t[0]);
-    t[3] = r.y.mul(t[4]);
-    r.y = t[2].sub(t[3]);
-    r.z = r.z.mul(t[4]);
-    t[2] = t[1].mul(tp.y);
-    t[3] = t[0].mul(tp.x);
-    coeff[0] = t[3].sub(t[2]);
-    coeff[1] = t[0].neg();
+    r.x = t[1].multiply(t[5]);
+    t[2] = t[3].subtract(t[5]);
+    t[2] = t[2].multiply(t[0]);
+    t[3] = r.y.multiply(t[4]);
+    r.y = t[2].subtract(t[3]);
+    r.z = r.z.multiply(t[4]);
+    t[2] = t[1].multiply(tp.y);
+    t[3] = t[0].multiply(tp.x);
+    coeff[0] = t[3].subtract(t[2]);
+    coeff[1] = t[0].negate();
     coeff[2] = t[1];
 }
 
@@ -142,39 +142,39 @@ void final_exponentiation(fp12& f)
     // easy part
     t[0] = f.frobeniusMap(6);
     t[1] = f.inverse();
-    t[2] = t[0].mul(t[1]);
+    t[2] = t[0].multiply(t[1]);
     t[1] = t[2];
     t[2].frobeniusMapAssign(2);
-    t[2].mulAssign(t[1]);
+    t[2].multiplyAssign(t[1]);
     t[1] = t[2].cyclotomicSquare();
     t[1] = t[1].conjugate();
     // hard part
     t[3] = t[2].cyclotomicExp(g2::cofactorEFF);
     t[3] = t[3].conjugate();
     t[4] = t[3].cyclotomicSquare();
-    t[5] = t[1].mul(t[3]);
+    t[5] = t[1].multiply(t[3]);
     t[1] = t[5].cyclotomicExp(g2::cofactorEFF);
     t[1] = t[1].conjugate();
     t[0] = t[1].cyclotomicExp(g2::cofactorEFF);
     t[0] = t[0].conjugate();
     t[6] = t[0].cyclotomicExp(g2::cofactorEFF);
     t[6] = t[6].conjugate();
-    t[6].mulAssign(t[4]);
+    t[6].multiplyAssign(t[4]);
     t[4] = t[6].cyclotomicExp(g2::cofactorEFF);
     t[4] = t[4].conjugate();
     t[5] = t[5].conjugate();
-    t[4].mulAssign(t[5]);
-    t[4].mulAssign(t[2]);
+    t[4].multiplyAssign(t[5]);
+    t[4].multiplyAssign(t[2]);
     t[5] = t[2].conjugate();
-    t[1].mulAssign(t[2]);
+    t[1].multiplyAssign(t[2]);
     t[1].frobeniusMapAssign(3);
-    t[6].mulAssign(t[5]);
+    t[6].multiplyAssign(t[5]);
     t[6].frobeniusMapAssign(1);
-    t[3].mulAssign(t[0]);
+    t[3].multiplyAssign(t[0]);
     t[3].frobeniusMapAssign(2);
-    t[3].mulAssign(t[1]);
-    t[3].mulAssign(t[6]);
-    f = t[3].mul(t[4]);
+    t[3].multiplyAssign(t[1]);
+    t[3].multiplyAssign(t[6]);
+    f = t[3].multiply(t[4]);
 }
 
 fp12 calculate(std::span<const std::tuple<g1, g2>> pairs, std::function<void()> yield)
diff --git a/src/signatures.cpp b/src/signatures.cpp
index 0751a76..3ad2d6a 100644
--- a/src/signatures.cpp
+++ b/src/signatures.cpp
@@ -352,7 +352,7 @@ g1 derive_child_g1_unhardened(
 
     bn_divn_safe(quotient, remainder, nonce, fp::Q);
 
-    return pk.add(g1::one().mulScalar(remainder));
+    return pk.add(g1::one().scale(remainder));
 }
 
 g2 derive_child_g2_unhardened(
@@ -378,7 +378,7 @@ g2 derive_child_g2_unhardened(
 
     bn_divn_safe(quotient, remainder, nonce, fp::Q);
 
-    return pk.add(g2::one().mulScalar(remainder));
+    return pk.add(g2::one().scale(remainder));
 }
 
 array<uint64_t, 4> aggregate_secret_keys(std::span<const std::array<uint64_t, 4>> sks)
@@ -437,7 +437,7 @@ array<uint64_t, 4> sk_from_bytes(
 
 g1 public_key(const array<uint64_t, 4>& sk)
 {
-    return g1::one().mulScalar(sk).affine();
+    return g1::one().scale(sk).affine();
 }
 
 // Construct an extensible-output function based on SHA256
@@ -532,7 +532,7 @@ g2 sign(
 )
 {
     g2 p = fromMessage(msg, CIPHERSUITE_ID);
-    return p.mulScalar(sk);
+    return p.scale(sk);
 }
 
 bool verify(
@@ -542,7 +542,7 @@ bool verify(
 )
 {
     vector<tuple<g1, g2>> v;
-    pairing::add_pair(v, g1::one().neg(), signature);
+    pairing::add_pair(v, g1::one().negate(), signature);
     const g2 hashedPoint = fromMessage(message, CIPHERSUITE_ID);
     pairing::add_pair(v, pubkey, hashedPoint);
 
@@ -587,7 +587,7 @@ bool aggregate_verify(
 )
 {
     vector<tuple<g1, g2>> v;
-    pairing::add_pair(v, g1::one().neg(), signature);
+    pairing::add_pair(v, g1::one().negate(), signature);
 
     if(!signature.isOnCurve() || !signature.inCorrectSubgroup())
     {
@@ -632,7 +632,7 @@ g2 pop_prove(const array<uint64_t, 4>& sk)
     g1 pk = public_key(sk);
     array<uint8_t, 96> msg = pk.toAffineBytesLE(true);
     g2 hashed_key = fromMessage(vector<uint8_t>(msg.begin(), msg.end()), POP_CIPHERSUITE_ID);
-    return hashed_key.mulScalar(sk);
+    return hashed_key.scale(sk);
 }
 
 bool pop_verify(
@@ -653,7 +653,7 @@ bool pop_verify(
     }
 
     vector<tuple<g1, g2>> v;
-    pairing::add_pair(v, g1::one().neg(), signature_proof);
+    pairing::add_pair(v, g1::one().negate(), signature_proof);
     pairing::add_pair(v, pubkey, hashedPoint);
 
     // 1 =? prod e(pubkey[i], hash[i]) * e(-g1, aggSig)
diff --git a/test/unittests.cpp b/test/unittests.cpp
index cb4b10b..a31eec3 100644
--- a/test/unittests.cpp
+++ b/test/unittests.cpp
@@ -66,13 +66,13 @@ fp12 random_fe12()
 g1 random_g1()
 {
     array<uint64_t, 4> k = random_scalar();
-    return g1::one().mulScalar(k);
+    return g1::one().scale(k);
 }
 
 g2 random_g2()
 {
     array<uint64_t, 4> k = random_scalar();
-    return g2::one().mulScalar(k);
+    return g2::one().scale(k);
 }
 
 void TestScalar()
@@ -204,6 +204,345 @@ void TestFieldElementEquality()
     }
 }
 
+void TestFieldElementArithmeticCornerCases() {
+    const char* testVectorInput[] = {
+        "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+        "1A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAA", // p-1
+        "1A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAB", // p
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"
+    };
+
+    const char* testVectorExpectedSquare[] = {
+        "NA",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+        "NA",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"
+    };
+
+    const char* testVectorExpectedAdd[] = {
+        "NA",
+        "1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9",
+        "NA",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000002",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"
+    };
+
+    auto testSqureMul = [](const char* in, const char* expectedSquare, const char* expectedAdd) {
+        // Input should be convert to Montgomery form, so "raw" = false
+        auto input = fp::fromBytesBE(hexToBytes<48>(in), true, false);
+
+        if (0 == strcmp("NA", expectedSquare)) {
+            if (input) {
+                throw invalid_argument("input should be invalid but not");
+            }
+            return;
+        }
+
+        // Expected result will be compared against numbers converted back from Montgomery form, so "raw" = true
+        auto fpExpectedSquare = fp::fromBytesBE(hexToBytes<48>(expectedSquare), false, true);
+        auto fpExpectedAdd = fp::fromBytesBE(hexToBytes<48>(expectedAdd), false, true);
+
+        fp s,m,a,d;
+        fp s1,m1,a1,d1;
+
+        _square(&s, &*input);
+        _multiply(&m, &*input, &*input);
+        _add(&a, &*input, &*input);
+        _double(&d, &*input);
+
+        s1 = input->square();
+        m1 = input->multiply(*input);
+        a1 = input->add(*input);
+        d1 = input->dbl();
+
+        if(!s.equal(s1)) {
+            throw invalid_argument("_square != fp::square");
+        }
+        if(!m.equal(m1)) {
+            throw invalid_argument("_multiply != fp::multiply");
+        }
+        if(!a.equal(a1)) {
+            throw invalid_argument("_add != fp::add");
+        }
+        if(!d.equal(d1)) {
+            throw invalid_argument("_double != fp::dbl");
+        }
+
+        s = s.fromMont();
+        m = m.fromMont();
+        a = a.fromMont();
+        d = d.fromMont();
+
+        if(!s.equal(m))
+        {
+            throw invalid_argument("square != mul self");
+        }
+
+        if(!s.equal(*fpExpectedSquare))
+        {
+            throw invalid_argument("square != expected");
+        }
+
+        if(!a.equal(d))
+        {
+            throw invalid_argument("double != add self");
+        }
+
+        if(!a.equal(*fpExpectedAdd))
+        {
+            throw invalid_argument("add != expected");
+        }
+
+    };
+
+    for (int i = 0; i < sizeof(testVectorInput) / sizeof(const char*); ++i) {
+        testSqureMul(testVectorInput[i], testVectorExpectedSquare[i], testVectorExpectedAdd[i]);
+    }
+}
+
+template<class T> void TestHelperMultiplySquare(const T input) {
+    T s,m;
+    s = input.square();
+    m = input.multiply(input);
+
+    T sa,ma;
+    sa = input;
+    sa.squareAssign();
+    ma = input;
+    ma.multiplyAssign(ma);
+
+    if(!s.equal(sa))
+    {
+        throw invalid_argument("square != squareAssign");
+    }
+
+    if(!m.equal(ma))
+    {
+        throw invalid_argument("multiply != multiplyAssign");
+    }
+
+    if(!s.equal(m))
+    {
+        throw invalid_argument("square != mul self");
+    }
+}
+
+template<class T> void TestHelperAddSubtractDouble(const T input) {
+    T a,d;
+    a = input.add(input);
+    d = input.dbl();
+
+    T aa,da;
+    aa = input;
+    aa.addAssign(aa);
+    da = input;
+    da.doubleAssign();
+
+    if(!a.equal(aa))
+    {
+        throw invalid_argument("add != addAssign");
+    }
+
+    if(!d.equal(da))
+    {
+        throw invalid_argument("dbl != doubleAssign");
+    }
+
+    if(!a.equal(d))
+    {
+        throw invalid_argument("double != add self");
+    }
+
+    T s = input.subtract(input);
+    T sa = input;
+    sa.subtractAssign(sa);
+
+    if(!s.equal(sa))
+    {
+        throw invalid_argument("subtract != subtractAssign");
+    }
+
+    if(!s.isZero())
+    {
+        throw invalid_argument("zero != sub self");
+    }
+}
+
+void TestArithmeticOpraters() {
+    for(int i = 0; i < 100; i++)
+    {
+        TestHelperMultiplySquare(random_fe());
+        TestHelperMultiplySquare(random_fe2());
+        TestHelperMultiplySquare(random_fe6());
+        TestHelperMultiplySquare(random_fe12());
+
+        TestHelperAddSubtractDouble(random_fe());
+        TestHelperAddSubtractDouble(random_fe2());
+        TestHelperAddSubtractDouble(random_fe6());
+        TestHelperAddSubtractDouble(random_fe12());
+
+        TestHelperAddSubtractDouble(random_g1());
+        TestHelperAddSubtractDouble(random_g2());
+    }
+}
+
+void TestSqrt() {
+    for (int i = 0; i < 100; ++ i) {
+        fp a = random_fe();
+        fp as = a.square();
+        fp asqrt;
+        if (as.sqrt(asqrt)) {
+            if(!as.equal(asqrt.square()))
+            {
+                throw invalid_argument("sqrt(fp).square != fp");
+            }
+            if(!a.equal(asqrt) && !a.equal(asqrt.negate()))
+            {
+                throw invalid_argument("fp!= sqrt(fp.square)");
+            }
+        } else {
+            throw invalid_argument("failed to find sqrt for fp");
+        }
+    }
+    
+    for (int i = 0; i < 100; ++ i) {
+        fp2 a = random_fe2();
+        fp2 as = a.square();
+        fp2 asqrt;
+        if (as.sqrt(asqrt)) {
+            if(!as.equal(asqrt.square()))
+            {
+                throw invalid_argument("sqrt(fp2).square != fp2");
+            }
+            if(!a.equal(asqrt) && !a.equal(asqrt.negate()))
+            {
+                throw invalid_argument("fp2 != sqrt(fp2.square)");
+            }
+        } else {
+            throw invalid_argument("failed to find sqrt for fp2");
+        }
+    }
+
+}
+
+void TestInverse() {
+    if (!fp::one().inverse().equal(fp::one())) {
+        throw invalid_argument("1^-1 != 1");
+    }
+
+    auto two = fp::one().dbl();
+    if (!two.multiply(two.inverse()).equal(fp::one())) {
+        throw invalid_argument("2 * 2^-1 != 1");
+    }
+
+    auto pminus1 = *fp::fromBytesBE(hexToBytes<48>("1A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAA"), false, true);
+    if (!pminus1.multiply(pminus1.inverse()).equal(fp::one())) {
+        throw invalid_argument("(p-1) * (p-1)^-1 != 1");
+    }
+
+    for (int i = 0; i < 100; ++ i) {
+        fp a = random_fe();
+        auto b = a.inverse();
+        if (!a.multiply(b).equal(fp::one())) {
+            throw invalid_argument("fp * fp^-1 != 1");
+        }
+    }
+
+    for (int i = 0; i < 100; ++ i) {
+        fp2 a = random_fe2();
+        auto b = a.inverse();
+        if (!a.multiply(b).equal(fp2::one())) {
+            throw invalid_argument("fp2 * fp2^-1 != 1");
+        }
+    }
+
+    for (int i = 0; i < 100; ++ i) {
+        fp6 a = random_fe6();
+        auto b = a.inverse();
+        if (!a.multiply(b).equal(fp6::one())) {
+            throw invalid_argument("fp * fp^-1 != 1");
+        }
+    }
+
+    for (int i = 0; i < 100; ++ i) {
+        fp12 a = random_fe12();
+        auto b = a.inverse();
+        if (!a.multiply(b).equal(fp12::one())) {
+            throw invalid_argument("fp * fp^-1 != 1");
+        }
+    }
+}
+
+void TestMod() {
+
+    const char* testVectorInput[] = {
+        "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+        "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+        "000000000000000000000000000000001A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAA", // p-1
+        "000000000000000000000000000000001A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAB", // p
+        "000000000000000000000000000000001A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAC", // p+1
+    };
+
+    const char* testVectorExpected[] = {
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+        "1A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAA",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+    };
+
+    for (int i = 0; i < sizeof(testVectorInput)/sizeof(const char*); ++ i) {
+        auto s = hexToBytes<64>(testVectorInput[i]);
+        auto k = scalar::fromBytesBE<8>(s);
+        fp r = fp::modPrime<8>(k);
+        auto fpExpected = fp::fromBytesBE(hexToBytes<48>(testVectorExpected[i]), false, false);
+        if(!fpExpected->equal(r))
+        {
+            throw invalid_argument("r != expected for Mod");
+        }
+        
+    }
+}
+
+void TestExp() {
+
+    const char* testVectorInput[] = {
+        "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+        "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+        "00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+        "000000000000000000000000000000001A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAA", // p-1
+    };
+
+    const char* testVectorInput2[] = {
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000",
+        "1A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAA", // p-1
+    };
+
+    const char* testVectorExpected[] = {
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+        "000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001",
+    };
+
+    for (int i = 0; i < sizeof(testVectorInput)/sizeof(const char*); ++ i) {
+        auto s = hexToBytes<64>(testVectorInput[i]);
+        auto b = fp::fromBytesBE(hexToBytes<48>(testVectorInput2[i]), false, false);
+        auto k = scalar::fromBytesBE<8>(s);
+        fp r = b->exp(k);
+        auto fpExpected = fp::fromBytesBE(hexToBytes<48>(testVectorExpected[i]), false, false);
+        if(!fpExpected->equal(r))
+        {
+            throw invalid_argument("r != expected for Exp");
+        }
+        
+    }
+}
+
 void TestFieldElementHelpers()
 {
     // fe
@@ -391,6 +730,36 @@ void TestG1Serialization()
     }
 }
 
+void TestG1SerializationGarbage() {
+    array<uint8_t, 144> buf;
+    buf.fill(0xff);
+    for (int i = 0 ; i < 4; ++i ) {
+        auto a = g1::fromJacobianBytesBE(buf, i < 2, i%2);
+        if(a)
+        {
+            throw invalid_argument("g1, jacobianBE: serialization not catching invalid input");
+        }
+        auto b = g1::fromJacobianBytesLE(buf, i < 2, i%2);
+        if(b)
+        {
+            throw invalid_argument("g1, jacobianLE: serialization not catching invalid input");
+        }
+    }
+       
+    for (int i = 0 ; i < 4; ++i ) {
+        auto a = g1::fromAffineBytesBE(std::span<const uint8_t, 96>{buf.begin(),96}, i < 2, i%2);
+        if(a)
+        {
+            throw invalid_argument("g1, affineBE: serialization not catching invalid input");
+        }
+        auto b = g1::fromAffineBytesLE(std::span<const uint8_t, 96>{buf.begin(),96}, i < 2, i%2);
+        if(b)
+        {
+            throw invalid_argument("g1, affineLE: serialization not catching invalid input");
+        }
+    }
+}
+
 void TestG1IsOnCurve()
 {
     g1 zero = g1::zero();
@@ -424,23 +793,23 @@ void TestG1AdditiveProperties()
         {
             throw invalid_argument("0 + 0 == 0");
         }
-        t0 = a.sub(zero);
+        t0 = a.subtract(zero);
         if(!t0.equal(a))
         {
             throw invalid_argument("a - 0 == a");
         }
-        t0 = zero.sub(zero);
+        t0 = zero.subtract(zero);
         if(!t0.equal(zero))
         {
             throw invalid_argument("0 - 0 == 0");
         }
-        t0 = zero.neg();
+        t0 = zero.negate();
         if(!t0.equal(zero))
         {
             throw invalid_argument("- 0 == 0");
         }
-        t0 = zero.sub(a);
-        t0 = t0.neg();
+        t0 = zero.subtract(a);
+        t0 = t0.negate();
         if(!t0.equal(a))
         {
             throw invalid_argument(" - (0 - a) == a");
@@ -451,7 +820,7 @@ void TestG1AdditiveProperties()
             throw invalid_argument("2 * 0 == 0");
         }
         t0 = a.dbl();
-        t0 = t0.sub(a);
+        t0 = t0.subtract(a);
         if(!t0.equal(a) || !t0.isOnCurve())
         {
             throw invalid_argument(" (2 * a) - a == a");
@@ -462,9 +831,9 @@ void TestG1AdditiveProperties()
         {
             throw invalid_argument("a + b == b + a");
         }
-        t0 = a.sub(b);
-        t1 = b.sub(a);
-        t1 = t1.neg();
+        t0 = a.subtract(b);
+        t1 = b.subtract(a);
+        t1 = t1.negate();
         if(!t0.equal(t1))
         {
             throw invalid_argument("a - b == - ( b - a )");
@@ -478,10 +847,10 @@ void TestG1AdditiveProperties()
         {
             throw invalid_argument("(a + b) + c == (a + c ) + b");
         }
-        t0 = a.sub(b);
-        t0 = t0.sub(c);
-        t1 = a.sub(c);
-        t1 = t1.sub(b);
+        t0 = a.subtract(b);
+        t0 = t0.subtract(c);
+        t1 = a.subtract(c);
+        t1 = t1.subtract(b);
         if(!t0.equal(t1))
         {
             throw invalid_argument("(a - b) - c == (a - c) -b");
@@ -517,34 +886,34 @@ void TestG1MultiplicativePropertiesExpected()
         array<uint64_t, 4> s2 = tv[i].s2;
         array<uint64_t, 10> s3;
         array<uint64_t, 4> sone = {1, 0, 0, 0};
-        t0 = zero.mulScalar(s1);
+        t0 = zero.scale(s1);
         if(!t0.equal(zero))
         {
             throw invalid_argument(" 0 ^ s == 0");
         }
-        t0 = a.mulScalar(sone);
+        t0 = a.scale(sone);
         if(!t0.equal(a))
         {
             throw invalid_argument(" a ^ 1 == a");
         }
-        t0 = zero.mulScalar(s1);
+        t0 = zero.scale(s1);
         if(!t0.equal(zero))
         {
             throw invalid_argument(" 0 ^ s == a");
         }
-        t0 = a.mulScalar(s1);
-        t0 = t0.mulScalar(s2);
-        s3 = scalar::mul<10, 4, 4>(s1, s2);
-        t1 = a.mulScalar(s3);
+        t0 = a.scale(s1);
+        t0 = t0.scale(s2);
+        s3 = scalar::multiply<10, 4, 4>(s1, s2);
+        t1 = a.scale(s3);
         if(!t0.equal(t1))
         {
             throw invalid_argument("G1: (a ^ s1) ^ s2 == a ^ (s1 * s2)");
         }
-        t0 = a.mulScalar(s1);
-        t1 = a.mulScalar(s2);
+        t0 = a.scale(s1);
+        t1 = a.scale(s2);
         t0 = t0.add(t1);
         s3 = scalar::add<10, 4, 4>(s1, s2);
-        t1 = a.mulScalar(s3);
+        t1 = a.scale(s3);
         if(!t0.equal(t1))
         {
             throw invalid_argument(" (a ^ s1) + (a ^ s2) == a ^ (s1 + s2)");
@@ -563,34 +932,34 @@ void TestG1MultiplicativeProperties()
         array<uint64_t, 4> s2 = random_scalar(); 
         array<uint64_t, 10> s3;
         array<uint64_t, 4> sone = {1, 0, 0, 0};
-        t0 = zero.mulScalar(s1);
+        t0 = zero.scale(s1);
         if(!t0.equal(zero))
         {
             throw invalid_argument(" 0 ^ s == 0");
         }
-        t0 = a.mulScalar(sone);
+        t0 = a.scale(sone);
         if(!t0.equal(a))
         {
             throw invalid_argument(" a ^ 1 == a");
         }
-        t0 = zero.mulScalar(s1);
+        t0 = zero.scale(s1);
         if(!t0.equal(zero))
         {
             throw invalid_argument(" 0 ^ s == a");
         }
-        t0 = a.mulScalar(s1);
-        t0 = t0.mulScalar(s2);
-        s3 = scalar::mul<10, 4, 4>(s1, s2);
-        t1 = a.mulScalar(s3);
+        t0 = a.scale(s1);
+        t0 = t0.scale(s2);
+        s3 = scalar::multiply<10, 4, 4>(s1, s2);
+        t1 = a.scale(s3);
         if(!t0.equal(t1))
         {
             throw invalid_argument("G1: (a ^ s1) ^ s2 == a ^ (s1 * s2)");
         }
-        t0 = a.mulScalar(s1);
-        t1 = a.mulScalar(s2);
+        t0 = a.scale(s1);
+        t1 = a.scale(s2);
         t0 = t0.add(t1);
         s3 = scalar::add<10, 4, 4>(s1, s2);
-        t1 = a.mulScalar(s3);
+        t1 = a.scale(s3);
         if(!t0.equal(t1))
         {
             throw invalid_argument(" (a ^ s1) + (a ^ s2) == a ^ (s1 + s2)");
@@ -598,7 +967,7 @@ void TestG1MultiplicativeProperties()
     }
 }
 
-void TestG1MultiExpExpected()
+void TestG1WeightedSumExpected()
 {
     g1 one = g1::one();
     vector<array<uint64_t, 4>> scalars = {
@@ -607,40 +976,52 @@ void TestG1MultiExpExpected()
     };
     vector<g1> bases = {one, one};
     g1 expected, result;
-    expected = one.mulScalar<1>({5});
-    result = g1::multiExp(bases, scalars).value();
+    expected = one.scale<1>({5});
+    result = g1::weightedSum(bases, scalars);
     if(!expected.equal(result))
     {
-        throw invalid_argument("TestG1MultiExpExpected: bad multi-exponentiation");
+        throw invalid_argument("TestG1WeightedSumExpected: bad multi-exponentiation");
     }
 }
 
-void TestG1MultiExpBatch()
+void TestG1WeightedSumBatch()
 {
-    g1 one = g1::one();
-    int64_t n = 1000;
-    vector<array<uint64_t, 4>> scalars;
-    vector<g1> bases;
-    // scalars: [s0,s1 ... s(n-1)]
-    // bases: [P0,P1,..P(n-1)] = [s(n-1)*G, s(n-2)*G ... s0*G]
-    for(int64_t i = 0, j = n-1; i < n; i++, j--)
-    {
-        scalars.insert(scalars.begin(), array<uint64_t, 4>{static_cast<uint64_t>(rand()%100000), 0, 0, 0});
-        bases.push_back(g1::zero());
-        bases[i] = one.mulScalar(scalars[0]);
-    }
-    // expected: s(n-1)*P0 + s(n-2)*P1 + s0*P(n-1)
-    g1 expected, tmp;
-    for(int64_t i = 0; i < n; i++)
-    {
-        tmp = bases[i].mulScalar(scalars[i]);
-        expected = expected.add(tmp);
-    }
-    g1 result = g1::multiExp(bases, scalars).value();
-    if(!expected.equal(result))
-    {
-        throw invalid_argument("bad multi-exponentiation");
-    }
+    const auto doTest = [](int64_t n) {
+        g1 one = g1::one();
+        vector<array<uint64_t, 4>> scalars;
+        vector<g1> bases;
+
+        for(int64_t i = 0; i < n; i++)
+        {
+            scalars.push_back(random_scalar());
+            bases.push_back(random_g1());
+        }
+
+        g1 expected, tmp;
+        for(int64_t i = 0; i < n; i++)
+        {
+            tmp = bases[i].scale(scalars[i]);
+            expected = expected.add(tmp);
+        }
+        g1 result = g1::weightedSum(bases, scalars);
+        if(!expected.equal(result))
+        {
+            throw invalid_argument("bad G1 weighted sum");
+        }
+    };
+
+    doTest(0);
+    doTest(1);
+    doTest(2);
+    doTest(31);
+    doTest(32);
+    doTest(33);
+    doTest(63);
+    doTest(64);
+    doTest(65);
+    doTest(511);
+    doTest(512);
+    doTest(513);
 }
 
 void TestG1MapToCurve()
@@ -738,6 +1119,35 @@ void TestG2Serialization()
     }
 }
 
+void TestG2SerializationGarbage() {
+    array<uint8_t, 288> buf;
+    buf.fill(0xff);
+    for (int i = 0 ; i < 4; ++i ) {
+        auto a = g2::fromJacobianBytesBE(buf, i < 2, i%2);
+        if(a)
+        {
+            throw invalid_argument("g2, jacobianBE: serialization not catching invalid input");
+        }
+        auto b = g2::fromJacobianBytesLE(buf, i < 2, i%2);
+        if(b)
+        {
+            throw invalid_argument("g2, jacobianLE: serialization not catching invalid input");
+        }
+    }
+    for (int i = 0 ; i < 4; ++i ) {
+        auto a = g2::fromAffineBytesBE(std::span<const uint8_t, 192>{buf.begin(),192}, i < 2, i%2);
+        if(a)
+        {
+            throw invalid_argument("g2, affineBE: serialization not catching invalid input");
+        }
+        auto b = g2::fromAffineBytesLE(std::span<const uint8_t, 192>{buf.begin(),192}, i < 2, i%2);
+        if(b)
+        {
+            throw invalid_argument("g2, affineLE: serialization not catching invalid input");
+        }
+    }
+}
+
 void TestG2IsOnCurve()
 {
     g2 zero = g2::zero();
@@ -771,23 +1181,23 @@ void TestG2AdditiveProperties()
         {
             throw invalid_argument("0 + 0 == 0");
         }
-        t0 = a.sub(zero);
+        t0 = a.subtract(zero);
         if(!t0.equal(a))
         {
             throw invalid_argument("a - 0 == a");
         }
-        t0 = zero.sub(zero);
+        t0 = zero.subtract(zero);
         if(!t0.equal(zero))
         {
             throw invalid_argument("0 - 0 == 0");
         }
-        t0 = zero.neg();
+        t0 = zero.negate();
         if(!t0.equal(zero))
         {
             throw invalid_argument("- 0 == 0");
         }
-        t0 = zero.sub(a);
-        t0 = t0.neg();
+        t0 = zero.subtract(a);
+        t0 = t0.negate();
         if(!t0.equal(a))
         {
             throw invalid_argument(" - (0 - a) == a");
@@ -798,7 +1208,7 @@ void TestG2AdditiveProperties()
             throw invalid_argument("2 * 0 == 0");
         }
         t0 = a.dbl();
-        t0 = t0.sub(a);
+        t0 = t0.subtract(a);
         if(!t0.equal(a) || !t0.isOnCurve())
         {
             throw invalid_argument(" (2 * a) - a == a");
@@ -809,9 +1219,9 @@ void TestG2AdditiveProperties()
         {
             throw invalid_argument("a + b == b + a");
         }
-        t0 = a.sub(b);
-        t1 = b.sub(a);
-        t1 = t1.neg();
+        t0 = a.subtract(b);
+        t1 = b.subtract(a);
+        t1 = t1.negate();
         if(!t0.equal(t1))
         {
             throw invalid_argument("a - b == - ( b - a )");
@@ -825,10 +1235,10 @@ void TestG2AdditiveProperties()
         {
             throw invalid_argument("(a + b) + c == (a + c ) + b");
         }
-        t0 = a.sub(b);
-        t0 = t0.sub(c);
-        t1 = a.sub(c);
-        t1 = t1.sub(b);
+        t0 = a.subtract(b);
+        t0 = t0.subtract(c);
+        t1 = a.subtract(c);
+        t1 = t1.subtract(b);
         if(!t0.equal(t1))
         {
             throw invalid_argument("(a - b) - c == (a - c) -b");
@@ -847,34 +1257,34 @@ void TestG2MultiplicativeProperties()
         array<uint64_t, 4> s2 = random_scalar(); 
         array<uint64_t, 10> s3;
         array<uint64_t, 4> sone = {1, 0, 0, 0};
-        t0 = zero.mulScalar(s1);
+        t0 = zero.scale(s1);
         if(!t0.equal(zero))
         {
             throw invalid_argument(" 0 ^ s == 0");
         }
-        t0 = a.mulScalar(sone);
+        t0 = a.scale(sone);
         if(!t0.equal(a))
         {
             throw invalid_argument(" a ^ 1 == a");
         }
-        t0 = zero.mulScalar(s1);
+        t0 = zero.scale(s1);
         if(!t0.equal(zero))
         {
             throw invalid_argument(" 0 ^ s == a");
         }
-        t0 = a.mulScalar(s1);
-        t0 = t0.mulScalar(s2);
-        s3 = scalar::mul<10, 4, 4>(s1, s2);
-        t1 = a.mulScalar(s3);
+        t0 = a.scale(s1);
+        t0 = t0.scale(s2);
+        s3 = scalar::multiply<10, 4, 4>(s1, s2);
+        t1 = a.scale(s3);
         if(!t0.equal(t1))
         {
             throw invalid_argument("G2: (a ^ s1) ^ s2 == a ^ (s1 * s2)");
         }
-        t0 = a.mulScalar(s1);
-        t1 = a.mulScalar(s2);
+        t0 = a.scale(s1);
+        t1 = a.scale(s2);
         t0 = t0.add(t1);
         s3 = scalar::add<10, 4, 4>(s1, s2);
-        t1 = a.mulScalar(s3);
+        t1 = a.scale(s3);
         if(!t0.equal(t1))
         {
             throw invalid_argument(" (a ^ s1) + (a ^ s2) == a ^ (s1 + s2)");
@@ -882,7 +1292,7 @@ void TestG2MultiplicativeProperties()
     }
 }
 
-void TestG2MultiExpExpected()
+void TestG2WeightedSumExpected()
 {
     g2 one = g2::one();
     vector<array<uint64_t, 4>> scalars = {
@@ -891,40 +1301,52 @@ void TestG2MultiExpExpected()
     };
     vector<g2> bases = {one, one};
     g2 expected, result;
-    expected = one.mulScalar<1>({5});
-    result = g2::multiExp(bases, scalars).value();
+    expected = one.scale<1>({5});
+    result = g2::weightedSum(bases, scalars);
     if(!expected.equal(result))
     {
         throw invalid_argument("bad multi-exponentiation");
     }
 }
 
-void TestG2MultiExpBatch()
+void TestG2WeightedSumBatch()
 {
-    g2 one = g2::one();
-    int64_t n = 1000;
-    vector<array<uint64_t, 4>> scalars;
-    vector<g2> bases;
-    // scalars: [s0,s1 ... s(n-1)]
-    // bases: [P0,P1,..P(n-1)] = [s(n-1)*G, s(n-2)*G ... s0*G]
-    for(int64_t i = 0, j = n-1; i < n; i++, j--)
-    {
-        scalars.insert(scalars.begin(), array<uint64_t, 4>{static_cast<uint64_t>(rand()%100000), 0, 0, 0});
-        bases.push_back(g2::zero());
-        bases[i] = one.mulScalar(scalars[0]);
-    }
-    // expected: s(n-1)*P0 + s(n-2)*P1 + s0*P(n-1)
-    g2 expected, tmp;
-    for(int64_t i = 0; i < n; i++)
-    {
-        tmp = bases[i].mulScalar(scalars[i]);
-        expected = expected.add(tmp);
-    }
-    g2 result = g2::multiExp(bases, scalars).value();
-    if(!expected.equal(result))
-    {
-        throw invalid_argument("bad multi-exponentiation");
-    }
+    const auto doTest = [](int64_t n) {
+        g2 one = g2::one();
+        vector<array<uint64_t, 4>> scalars;
+        vector<g2> bases;
+
+        for(int64_t i = 0; i < n; i++)
+        {
+            scalars.push_back(random_scalar());
+            bases.push_back(random_g2());
+        }
+
+        g2 expected, tmp;
+        for(int64_t i = 0; i < n; i++)
+        {
+            tmp = bases[i].scale(scalars[i]);
+            expected = expected.add(tmp);
+        }
+        g2 result = g2::weightedSum(bases, scalars);
+        if(!expected.equal(result))
+        {
+            throw invalid_argument("bad G2 weighted sum");
+        }
+    };
+    
+    doTest(0);
+    doTest(1);
+    doTest(2);
+    doTest(31);
+    doTest(32);
+    doTest(33);
+    doTest(63);
+    doTest(64);
+    doTest(65);
+    doTest(511);
+    doTest(512);
+    doTest(513);
 }
 
 void TestG2MapToCurve()
@@ -1093,8 +1515,8 @@ void TestPairingBilinearity()
         vector<tuple<g1, g2>> v;
         pairing::add_pair(v, G1, G2);
         fp12 e0 = pairing::calculate(v);
-        g1 P1 = G1.mulScalar(a);
-        g2 P2 = G2.mulScalar(b);
+        g1 P1 = G1.scale(a);
+        g2 P2 = G2.scale(b);
         v = {};
         pairing::add_pair(v, P1, P2);
         fp12 e1 = pairing::calculate(v);
@@ -1115,14 +1537,14 @@ void TestPairingBilinearity()
         // LHS
         g1 G1 = g1::one();
         g2 G2 = g2::one();
-        G1 = G1.mulScalar(c);
+        G1 = G1.scale(c);
         pairing::add_pair(v, G1, G2);
         // RHS
         g1 P1 = g1::one();
         g2 P2 = g2::one();
-        P1 = P1.mulScalar(a);
-        P2 = P2.mulScalar(b);
-        P1 = P1.neg();
+        P1 = P1.scale(a);
+        P2 = P2.scale(b);
+        P1 = P1.negate();
         pairing::add_pair(v, P1, P2);
         // should be one
         if(!pairing::calculate(v).isOne())
@@ -1141,14 +1563,14 @@ void TestPairingBilinearity()
         // LHS
         g1 G1 = g1::one();
         g2 G2 = g2::one();
-        G2 = G2.mulScalar(c);
+        G2 = G2.scale(c);
         pairing::add_pair(v, G1, G2);
         // RHS
         g1 H1 = g1::one();
         g2 H2 = g2::one();
-        H1 = H1.mulScalar(a);
-        H2 = H2.mulScalar(b);
-        H1 = H1.neg();
+        H1 = H1.scale(a);
+        H2 = H2.scale(b);
+        H1 = H1.negate();
         pairing::add_pair(v, H1, H2);
         // should be one
         if(!pairing::calculate(v).isOne())
@@ -1173,20 +1595,20 @@ void TestPairingMulti()
         array<uint64_t, 4> a2 = random_scalar();
         g1 P1 = g1::one();
         g2 P2 = g2::one();
-        P1 = P1.mulScalar(a1);
-        P2 = P2.mulScalar(a2);
+        P1 = P1.scale(a1);
+        P2 = P2.scale(a2);
         pairing::add_pair(v, P1, P2);
         // accumulate targetExp
         // t += (ai1 * ai2)
-        array<uint64_t, 10> tmp = scalar::mul<10, 4, 4>(a1, a2);
+        array<uint64_t, 10> tmp = scalar::multiply<10, 4, 4>(a1, a2);
         targetExp = scalar::add<10, 10, 10>(targetExp, tmp);
     }
     // LHS
     // e(t * G1, G2)
     g1 T1 = g1::one();
     g2 T2 = g2::one();
-    T1 = T1.mulScalar(targetExp);
-    T1 = T1.neg();
+    T1 = T1.scale(targetExp);
+    T1 = T1.negate();
     pairing::add_pair(v, T1, T2);
     if(!pairing::calculate(v).isOne())
     {
@@ -1787,31 +2209,115 @@ void TestExtraVectors()
     }
 }
 
+void TestOutOfRangeInputs() {
+    // This test is to make sure multiplication wiil not fail if the inputs is just slightly larger than p
+    // The 4(p-1) limit may be not that strict. But we should only relax this limit if we are absolutely sure this 
+    // will not cause problems all the methods calling _ladd/_lsubstract/_ldouble.
+    auto p = *fp::fromBytesBE(hexToBytes<48>("1A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAB"), false, true);
+    auto pminus1 = *fp::fromBytesBE(hexToBytes<48>("1A0111EA397FE69A4B1BA7B6434BACD764774B84F38512BF6730D2A0F6B0F6241EABFFFEB153FFFFB9FEFFFFFFFFAAAA"), false, true);
+    // 2^383, largest possible input to multiplication during the inverse().
+    auto two383 = *fp::fromBytesBE(hexToBytes<48>("400000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"), false, true);
+    // 4(p-1) * 4(p-1) will work
+    for (int i = 0 ; i < 3; ++ i) {
+        auto a = pminus1;
+        auto b = pminus1;
+        auto a2 = pminus1;
+        a2 = a2.fromMont().toMont();
+        auto b2 = pminus1;
+        b2 = b2.fromMont().toMont();
+        for (int j = 0; j < i; ++ j) {
+            _ldouble(&a,&a);
+            _ldouble(&b,&b);
+            a2 = a2.dbl();
+            b2 = b2.dbl();
+        }
+        auto c = a.multiply(b); 
+        auto c2 = a2.multiply(b2);
+
+        if (!c.equal(c2)) {
+            cout << "multiplication 2^i(p-1) * 2^i(p-1) failed: i = "<< i;
+            throw;
+        }
+        
+    }
+
+    // 2p * 2p will work
+    for (int i = 0 ; i < 2; ++ i) {
+        auto a = p;
+        auto b = p;
+        auto a2 = p;
+        a2 = a2.fromMont().toMont();
+        auto b2 = p;
+        b2 = b2.fromMont().toMont();
+        for (int j = 0; j < i; ++ j) {
+            _ldouble(&a,&a);
+            _ldouble(&b,&b);
+            a2 = a2.dbl();
+            b2 = b2.dbl();
+        }
+        auto c = a.multiply(b); 
+        auto c2 = a2.multiply(b2);
+
+        if (!c.equal(c2)) {
+            cout << "multiplication 2^i(p) * 2^i(p) failed: i = "<< i;
+            throw;
+        }
+        
+    }
+
+    {
+        auto a = two383;
+        auto b = two383;
+        auto a2 = two383;
+        a2 = a2.fromMont().toMont();
+        auto b2 = two383;
+        b2 = b2.fromMont().toMont();
+        
+        auto c = a.multiply(b); 
+        auto c2 = a2.multiply(b2);
+
+        if (!c.equal(c2)) {
+            cout << "multiplication 2^383 * 2^383 failed";
+            throw;
+        }
+    }
+}
+
 int main()
 {
     TestScalar();
 
     TestFieldElementValidation();
     TestFieldElementEquality();
+    TestFieldElementArithmeticCornerCases();
     TestFieldElementHelpers();
     TestFieldElementSerialization();
     TestFieldElementByteInputs();
 
+    TestArithmeticOpraters();
+
+    TestSqrt();
+    TestInverse();
+    TestMod();
+    TestExp();
+
     TestG1Serialization();
+    TestG1SerializationGarbage();
     TestG1IsOnCurve();
     TestG1AdditiveProperties();
     TestG1MultiplicativePropertiesExpected();
     TestG1MultiplicativeProperties();
-    TestG1MultiExpExpected();
-    TestG1MultiExpBatch();
+    TestG1WeightedSumExpected();
+    TestG1WeightedSumBatch();
     TestG1MapToCurve();
 
     TestG2Serialization();
+    TestG2SerializationGarbage();
     TestG2IsOnCurve();
     TestG2AdditiveProperties();
     TestG2MultiplicativeProperties();
-    TestG2MultiExpExpected();
-    TestG2MultiExpBatch();
+    TestG2WeightedSumExpected();
+    TestG2WeightedSumBatch();
     TestG2MapToCurve();
 
     TestPairingExpected();
@@ -1831,6 +2337,7 @@ int main()
     TestPopScheme();
 
     TestExtraVectors();
-    
+    TestOutOfRangeInputs();
+
     return 0;
 }