From 70c6c5ec3b509da6534c2121d53ddcb2ab8cf340 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Wed, 15 Sep 2021 16:58:06 +0200 Subject: [PATCH 01/11] feat(Fp12): implements the Karabina cyclotomic square in E12/E6 --- ecc/bls12-377/internal/fptower/e12.go | 125 +++++++++++++++++- ecc/bls12-377/internal/fptower/e12_test.go | 16 ++- ecc/bls12-381/internal/fptower/e12.go | 125 +++++++++++++++++- ecc/bls12-381/internal/fptower/e12_test.go | 16 ++- ecc/bn254/internal/fptower/e12.go | 125 +++++++++++++++++- ecc/bn254/internal/fptower/e12_test.go | 16 ++- .../template/fq12over6over2/fq12.go.tmpl | 125 +++++++++++++++++- .../fq12over6over2/tests/fq12.go.tmpl | 16 ++- 8 files changed, 556 insertions(+), 8 deletions(-) diff --git a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index 1749ef9e3..e402f40c0 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -134,7 +134,130 @@ func (z *E12) Square(x *E12) *E12 { return z } -// CyclotomicSquare https://eprint.iacr.org/2009/565.pdf, 3.2 +// Karabina's compressed cyclotomic square +// https://eprint.iacr.org/2010/542.pdf +// Th. 3.2 with minor modifications to fit our tower +func (z *E12) CyclotomicSquareCompressed(x *E12) *E12 { + + var t [7]E2 + + // t0 = g1^2 + t[0].Square(&x.C0.B1) + // t1 = g5^2 + t[1].Square(&x.C1.B2) + // t5 = g1 + g5 + t[5].Add(&x.C0.B1, &x.C1.B2) + // t2 = (g1 + g5)^2 + t[2].Square(&t[5]) + + // t3 = g1^2 + g5^2 + t[3].Add(&t[0], &t[1]) + // t5 = 2 * g1 * g5 + t[5].Sub(&t[2], &t[3]) + + // t6 = g3 + g2 + t[6].Add(&x.C1.B0, &x.C0.B2) + // t3 = (g3 + g2)^2 + t[3].Square(&t[6]) + // t2 = g3^2 + t[2].Square(&x.C1.B0) + + // t6 = 2 * nr * g1 * g5 + t[6].MulByNonResidue(&t[5]) + // t5 = 4 * nr * g1 * g5 + 2 * g3 + t[5].Add(&t[6], &x.C1.B0). + Double(&t[5]) + // z3 = 6 * nr * g1 * g5 + 2 * g3 + z.C1.B0.Add(&t[5], &t[6]) + + // t4 = nr * g5^2 + t[4].MulByNonResidue(&t[1]) + // t5 = nr * g5^2 + g1^2 + t[5].Add(&t[0], &t[4]) + // t6 = nr * g5^2 + g1^2 - g2 + t[6].Sub(&t[5], &x.C0.B2) + + // t1 = g2^2 + t[1].Square(&x.C0.B2) + + // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + t[6].Double(&t[6]) + // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + z.C0.B2.Add(&t[6], &t[5]) + + // t4 = nr * g2^2 + t[4].MulByNonResidue(&t[1]) + // t5 = g3^2 + nr * g2^2 + t[5].Add(&t[2], &t[4]) + // t6 = g3^2 + nr * g2^2 - g1 + t[6].Sub(&t[5], &x.C0.B1) + // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + t[6].Double(&t[6]) + // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + z.C0.B1.Add(&t[6], &t[5]) + + // t0 = g2^2 + g3^2 + t[0].Add(&t[2], &t[1]) + // t5 = 2 * g3 * g2 + t[5].Sub(&t[3], &t[0]) + // t6 = 2 * g3 * g2 + g5 + t[6].Add(&t[5], &x.C1.B2) + // t6 = 4 * g3 * g2 + 2 * g5 + t[6].Double(&t[6]) + // z5 = 6 * g3 * g2 + 2 * g5 + z.C1.B2.Add(&t[5], &t[6]) + + return z +} + +// Decompress Karabina's cyclotomic square result +func (z *E12) Decompress(x *E12) *E12 { + + var t [3]E2 + var one E2 + one.SetOne() + + // t0 = g1^2 + t[0].Square(&x.C0.B1) + // t1 = 3 * g1^2 - 2 * g2 + t[1].Sub(&t[0], &x.C0.B2). + Double(&t[1]). + Add(&t[1], &t[0]) + // t0 = E * g5^2 + t1 + t[2].Square(&x.C1.B2) + t[0].MulByNonResidue(&t[2]). + Add(&t[0], &t[1]) + // t1 = 1/(4 * g3) + t[1].Double(&x.C1.B0). + Double(&t[1]). + Inverse(&t[1]) // costly + // z4 = g4 + z.C1.B1.Mul(&t[0], &t[1]) + + // t1 = g2 * g1 + t[1].Mul(&x.C0.B2, &x.C0.B1) + // t2 = 2 * g4^2 - 3 * g2 * g1 + t[2].Square(&x.C1.B1). + Sub(&t[2], &t[1]). + Double(&t[2]). 
+ Sub(&t[2], &t[1]) + // t1 = g3 * g5 + t[1].Mul(&x.C1.B0, &x.C1.B2) + // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + t[2].Add(&t[2], &t[1]) + z.C0.B0.MulByNonResidue(&t[2]). + Add(&z.C0.B0, &one) + + z.C0.B1.Set(&x.C0.B1) + z.C0.B2.Set(&x.C0.B2) + z.C1.B0.Set(&x.C1.B0) + z.C1.B2.Set(&x.C1.B2) + + return z +} + +// Granger-Scott's cyclotomic square +// https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E12) CyclotomicSquare(x *E12) *E12 { // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E2^6 diff --git a/ecc/bls12-377/internal/fptower/e12_test.go b/ecc/bls12-377/internal/fptower/e12_test.go index c0fd0ffce..957b9cbaa 100644 --- a/ecc/bls12-377/internal/fptower/e12_test.go +++ b/ecc/bls12-377/internal/fptower/e12_test.go @@ -295,7 +295,7 @@ func TestE12Ops(t *testing.T) { genA, )) - properties.Property("[BLS12-377] cyclotomic square and square should be the same in the cyclotomic subgroup", prop.ForAll( + properties.Property("[BLS12-377] cyclotomic square (Granger-Scott) and square should be the same in the cyclotomic subgroup", prop.ForAll( func(a *E12) bool { var b, c, d E12 b.Conjugate(a) @@ -309,6 +309,20 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BLS12-377] compressed cyclotomic square (Karabina) and square should be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12) bool { + var b, c, d E12 + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + c.Square(a) + d.CyclotomicSquareCompressed(a).Decompress(&d) + return c.Equal(&d) + }, + genA, + )) + properties.Property("[BLS12-377] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index d53153f6e..6886c5762 100644 --- a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -134,7 +134,130 @@ func (z *E12) Square(x *E12) *E12 { return z } -// CyclotomicSquare https://eprint.iacr.org/2009/565.pdf, 3.2 +// Karabina's compressed cyclotomic square +// https://eprint.iacr.org/2010/542.pdf +// Th. 3.2 with minor modifications to fit our tower +func (z *E12) CyclotomicSquareCompressed(x *E12) *E12 { + + var t [7]E2 + + // t0 = g1^2 + t[0].Square(&x.C0.B1) + // t1 = g5^2 + t[1].Square(&x.C1.B2) + // t5 = g1 + g5 + t[5].Add(&x.C0.B1, &x.C1.B2) + // t2 = (g1 + g5)^2 + t[2].Square(&t[5]) + + // t3 = g1^2 + g5^2 + t[3].Add(&t[0], &t[1]) + // t5 = 2 * g1 * g5 + t[5].Sub(&t[2], &t[3]) + + // t6 = g3 + g2 + t[6].Add(&x.C1.B0, &x.C0.B2) + // t3 = (g3 + g2)^2 + t[3].Square(&t[6]) + // t2 = g3^2 + t[2].Square(&x.C1.B0) + + // t6 = 2 * nr * g1 * g5 + t[6].MulByNonResidue(&t[5]) + // t5 = 4 * nr * g1 * g5 + 2 * g3 + t[5].Add(&t[6], &x.C1.B0). 
+ Double(&t[5]) + // z3 = 6 * nr * g1 * g5 + 2 * g3 + z.C1.B0.Add(&t[5], &t[6]) + + // t4 = nr * g5^2 + t[4].MulByNonResidue(&t[1]) + // t5 = nr * g5^2 + g1^2 + t[5].Add(&t[0], &t[4]) + // t6 = nr * g5^2 + g1^2 - g2 + t[6].Sub(&t[5], &x.C0.B2) + + // t1 = g2^2 + t[1].Square(&x.C0.B2) + + // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + t[6].Double(&t[6]) + // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + z.C0.B2.Add(&t[6], &t[5]) + + // t4 = nr * g2^2 + t[4].MulByNonResidue(&t[1]) + // t5 = g3^2 + nr * g2^2 + t[5].Add(&t[2], &t[4]) + // t6 = g3^2 + nr * g2^2 - g1 + t[6].Sub(&t[5], &x.C0.B1) + // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + t[6].Double(&t[6]) + // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + z.C0.B1.Add(&t[6], &t[5]) + + // t0 = g2^2 + g3^2 + t[0].Add(&t[2], &t[1]) + // t5 = 2 * g3 * g2 + t[5].Sub(&t[3], &t[0]) + // t6 = 2 * g3 * g2 + g5 + t[6].Add(&t[5], &x.C1.B2) + // t6 = 4 * g3 * g2 + 2 * g5 + t[6].Double(&t[6]) + // z5 = 6 * g3 * g2 + 2 * g5 + z.C1.B2.Add(&t[5], &t[6]) + + return z +} + +// Decompress Karabina's cyclotomic square result +func (z *E12) Decompress(x *E12) *E12 { + + var t [3]E2 + var one E2 + one.SetOne() + + // t0 = g1^2 + t[0].Square(&x.C0.B1) + // t1 = 3 * g1^2 - 2 * g2 + t[1].Sub(&t[0], &x.C0.B2). + Double(&t[1]). + Add(&t[1], &t[0]) + // t0 = E * g5^2 + t1 + t[2].Square(&x.C1.B2) + t[0].MulByNonResidue(&t[2]). + Add(&t[0], &t[1]) + // t1 = 1/(4 * g3) + t[1].Double(&x.C1.B0). + Double(&t[1]). + Inverse(&t[1]) // costly + // z4 = g4 + z.C1.B1.Mul(&t[0], &t[1]) + + // t1 = g2 * g1 + t[1].Mul(&x.C0.B2, &x.C0.B1) + // t2 = 2 * g4^2 - 3 * g2 * g1 + t[2].Square(&x.C1.B1). + Sub(&t[2], &t[1]). + Double(&t[2]). + Sub(&t[2], &t[1]) + // t1 = g3 * g5 + t[1].Mul(&x.C1.B0, &x.C1.B2) + // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + t[2].Add(&t[2], &t[1]) + z.C0.B0.MulByNonResidue(&t[2]). 
+ Add(&z.C0.B0, &one) + + z.C0.B1.Set(&x.C0.B1) + z.C0.B2.Set(&x.C0.B2) + z.C1.B0.Set(&x.C1.B0) + z.C1.B2.Set(&x.C1.B2) + + return z +} + +// Granger-Scott's cyclotomic square +// https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E12) CyclotomicSquare(x *E12) *E12 { // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E2^6 diff --git a/ecc/bls12-381/internal/fptower/e12_test.go b/ecc/bls12-381/internal/fptower/e12_test.go index fca4570ae..e5cfd97f9 100644 --- a/ecc/bls12-381/internal/fptower/e12_test.go +++ b/ecc/bls12-381/internal/fptower/e12_test.go @@ -295,7 +295,7 @@ func TestE12Ops(t *testing.T) { genA, )) - properties.Property("[BLS12-381] cyclotomic square and square should be the same in the cyclotomic subgroup", prop.ForAll( + properties.Property("[BLS12-381] cyclotomic square (Granger-Scott) and square should be the same in the cyclotomic subgroup", prop.ForAll( func(a *E12) bool { var b, c, d E12 b.Conjugate(a) @@ -309,6 +309,20 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BLS12-381] compressed cyclotomic square (Karabina) and square should be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12) bool { + var b, c, d E12 + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + c.Square(a) + d.CyclotomicSquareCompressed(a).Decompress(&d) + return c.Equal(&d) + }, + genA, + )) + properties.Property("[BLS12-381] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index 6e8da4dc5..64f1e99f3 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -134,7 +134,130 @@ func (z *E12) Square(x *E12) *E12 { return z } -// CyclotomicSquare https://eprint.iacr.org/2009/565.pdf, 3.2 +// Karabina's compressed cyclotomic square +// https://eprint.iacr.org/2010/542.pdf +// Th. 3.2 with minor modifications to fit our tower +func (z *E12) CyclotomicSquareCompressed(x *E12) *E12 { + + var t [7]E2 + + // t0 = g1^2 + t[0].Square(&x.C0.B1) + // t1 = g5^2 + t[1].Square(&x.C1.B2) + // t5 = g1 + g5 + t[5].Add(&x.C0.B1, &x.C1.B2) + // t2 = (g1 + g5)^2 + t[2].Square(&t[5]) + + // t3 = g1^2 + g5^2 + t[3].Add(&t[0], &t[1]) + // t5 = 2 * g1 * g5 + t[5].Sub(&t[2], &t[3]) + + // t6 = g3 + g2 + t[6].Add(&x.C1.B0, &x.C0.B2) + // t3 = (g3 + g2)^2 + t[3].Square(&t[6]) + // t2 = g3^2 + t[2].Square(&x.C1.B0) + + // t6 = 2 * nr * g1 * g5 + t[6].MulByNonResidue(&t[5]) + // t5 = 4 * nr * g1 * g5 + 2 * g3 + t[5].Add(&t[6], &x.C1.B0). 
+ Double(&t[5]) + // z3 = 6 * nr * g1 * g5 + 2 * g3 + z.C1.B0.Add(&t[5], &t[6]) + + // t4 = nr * g5^2 + t[4].MulByNonResidue(&t[1]) + // t5 = nr * g5^2 + g1^2 + t[5].Add(&t[0], &t[4]) + // t6 = nr * g5^2 + g1^2 - g2 + t[6].Sub(&t[5], &x.C0.B2) + + // t1 = g2^2 + t[1].Square(&x.C0.B2) + + // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + t[6].Double(&t[6]) + // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + z.C0.B2.Add(&t[6], &t[5]) + + // t4 = nr * g2^2 + t[4].MulByNonResidue(&t[1]) + // t5 = g3^2 + nr * g2^2 + t[5].Add(&t[2], &t[4]) + // t6 = g3^2 + nr * g2^2 - g1 + t[6].Sub(&t[5], &x.C0.B1) + // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + t[6].Double(&t[6]) + // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + z.C0.B1.Add(&t[6], &t[5]) + + // t0 = g2^2 + g3^2 + t[0].Add(&t[2], &t[1]) + // t5 = 2 * g3 * g2 + t[5].Sub(&t[3], &t[0]) + // t6 = 2 * g3 * g2 + g5 + t[6].Add(&t[5], &x.C1.B2) + // t6 = 4 * g3 * g2 + 2 * g5 + t[6].Double(&t[6]) + // z5 = 6 * g3 * g2 + 2 * g5 + z.C1.B2.Add(&t[5], &t[6]) + + return z +} + +// Decompress Karabina's cyclotomic square result +func (z *E12) Decompress(x *E12) *E12 { + + var t [3]E2 + var one E2 + one.SetOne() + + // t0 = g1^2 + t[0].Square(&x.C0.B1) + // t1 = 3 * g1^2 - 2 * g2 + t[1].Sub(&t[0], &x.C0.B2). + Double(&t[1]). + Add(&t[1], &t[0]) + // t0 = E * g5^2 + t1 + t[2].Square(&x.C1.B2) + t[0].MulByNonResidue(&t[2]). + Add(&t[0], &t[1]) + // t1 = 1/(4 * g3) + t[1].Double(&x.C1.B0). + Double(&t[1]). + Inverse(&t[1]) // costly + // z4 = g4 + z.C1.B1.Mul(&t[0], &t[1]) + + // t1 = g2 * g1 + t[1].Mul(&x.C0.B2, &x.C0.B1) + // t2 = 2 * g4^2 - 3 * g2 * g1 + t[2].Square(&x.C1.B1). + Sub(&t[2], &t[1]). + Double(&t[2]). + Sub(&t[2], &t[1]) + // t1 = g3 * g5 + t[1].Mul(&x.C1.B0, &x.C1.B2) + // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + t[2].Add(&t[2], &t[1]) + z.C0.B0.MulByNonResidue(&t[2]). 
+ Add(&z.C0.B0, &one) + + z.C0.B1.Set(&x.C0.B1) + z.C0.B2.Set(&x.C0.B2) + z.C1.B0.Set(&x.C1.B0) + z.C1.B2.Set(&x.C1.B2) + + return z +} + +// Granger-Scott's cyclotomic square +// https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E12) CyclotomicSquare(x *E12) *E12 { // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E2^6 diff --git a/ecc/bn254/internal/fptower/e12_test.go b/ecc/bn254/internal/fptower/e12_test.go index 0e942ea95..14d126d2f 100644 --- a/ecc/bn254/internal/fptower/e12_test.go +++ b/ecc/bn254/internal/fptower/e12_test.go @@ -295,7 +295,7 @@ func TestE12Ops(t *testing.T) { genA, )) - properties.Property("[BN254] cyclotomic square and square should be the same in the cyclotomic subgroup", prop.ForAll( + properties.Property("[BN254] cyclotomic square (Granger-Scott) and square should be the same in the cyclotomic subgroup", prop.ForAll( func(a *E12) bool { var b, c, d E12 b.Conjugate(a) @@ -309,6 +309,20 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BN254] compressed cyclotomic square (Karabina) and square should be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12) bool { + var b, c, d E12 + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + c.Square(a) + d.CyclotomicSquareCompressed(a).Decompress(&d) + return c.Equal(&d) + }, + genA, + )) + properties.Property("[BN254] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl index 0fb8def65..c116e2cfc 100644 --- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl @@ -116,7 +116,130 @@ func (z *E12) Square(x *E12) *E12 { return z } -// CyclotomicSquare https://eprint.iacr.org/2009/565.pdf, 3.2 +// Karabina's compressed cyclotomic square +// https://eprint.iacr.org/2010/542.pdf +// Th. 3.2 with minor modifications to fit our tower +func (z *E12) CyclotomicSquareCompressed(x *E12) *E12 { + + var t [7]E2 + + // t0 = g1^2 + t[0].Square(&x.C0.B1) + // t1 = g5^2 + t[1].Square(&x.C1.B2) + // t5 = g1 + g5 + t[5].Add(&x.C0.B1, &x.C1.B2) + // t2 = (g1 + g5)^2 + t[2].Square(&t[5]) + + // t3 = g1^2 + g5^2 + t[3].Add(&t[0], &t[1]) + // t5 = 2 * g1 * g5 + t[5].Sub(&t[2], &t[3]) + + // t6 = g3 + g2 + t[6].Add(&x.C1.B0, &x.C0.B2) + // t3 = (g3 + g2)^2 + t[3].Square(&t[6]) + // t2 = g3^2 + t[2].Square(&x.C1.B0) + + // t6 = 2 * nr * g1 * g5 + t[6].MulByNonResidue(&t[5]) + // t5 = 4 * nr * g1 * g5 + 2 * g3 + t[5].Add(&t[6], &x.C1.B0). 
+	Double(&t[5])
+	// z3 = 6 * nr * g1 * g5 + 2 * g3
+	z.C1.B0.Add(&t[5], &t[6])
+
+	// t4 = nr * g5^2
+	t[4].MulByNonResidue(&t[1])
+	// t5 = nr * g5^2 + g1^2
+	t[5].Add(&t[0], &t[4])
+	// t6 = nr * g5^2 + g1^2 - g2
+	t[6].Sub(&t[5], &x.C0.B2)
+
+	// t1 = g2^2
+	t[1].Square(&x.C0.B2)
+
+	// t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2
+	t[6].Double(&t[6])
+	// z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2
+	z.C0.B2.Add(&t[6], &t[5])
+
+	// t4 = nr * g2^2
+	t[4].MulByNonResidue(&t[1])
+	// t5 = g3^2 + nr * g2^2
+	t[5].Add(&t[2], &t[4])
+	// t6 = g3^2 + nr * g2^2 - g1
+	t[6].Sub(&t[5], &x.C0.B1)
+	// t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1
+	t[6].Double(&t[6])
+	// z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1
+	z.C0.B1.Add(&t[6], &t[5])
+
+	// t0 = g2^2 + g3^2
+	t[0].Add(&t[2], &t[1])
+	// t5 = 2 * g3 * g2
+	t[5].Sub(&t[3], &t[0])
+	// t6 = 2 * g3 * g2 + g5
+	t[6].Add(&t[5], &x.C1.B2)
+	// t6 = 4 * g3 * g2 + 2 * g5
+	t[6].Double(&t[6])
+	// z5 = 6 * g3 * g2 + 2 * g5
+	z.C1.B2.Add(&t[5], &t[6])
+
+	return z
+}
+
+// Decompress Karabina's cyclotomic square result
+func (z *E12) Decompress(x *E12) *E12 {
+
+	var t [3]E2
+	var one E2
+	one.SetOne()
+
+	// t0 = g1^2
+	t[0].Square(&x.C0.B1)
+	// t1 = 3 * g1^2 - 2 * g2
+	t[1].Sub(&t[0], &x.C0.B2).
+		Double(&t[1]).
+		Add(&t[1], &t[0])
+	// t0 = E * g5^2 + t1
+	t[2].Square(&x.C1.B2)
+	t[0].MulByNonResidue(&t[2]).
+		Add(&t[0], &t[1])
+	// t1 = 1/(4 * g3)
+	t[1].Double(&x.C1.B0).
+		Double(&t[1]).
+		Inverse(&t[1]) // costly
+	// z4 = g4
+	z.C1.B1.Mul(&t[0], &t[1])
+
+	// t1 = g2 * g1
+	t[1].Mul(&x.C0.B2, &x.C0.B1)
+	// t2 = 2 * g4^2 - 3 * g2 * g1
+	t[2].Square(&x.C1.B1).
+		Sub(&t[2], &t[1]).
+		Double(&t[2]).
+		Sub(&t[2], &t[1])
+	// t1 = g3 * g5
+	t[1].Mul(&x.C1.B0, &x.C1.B2)
+	// c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1
+	t[2].Add(&t[2], &t[1])
+	z.C0.B0.MulByNonResidue(&t[2]).
+ Add(&z.C0.B0, &one) + + z.C0.B1.Set(&x.C0.B1) + z.C0.B2.Set(&x.C0.B2) + z.C1.B0.Set(&x.C1.B0) + z.C1.B2.Set(&x.C1.B2) + + return z +} + +// Granger-Scott's cyclotomic square +// https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E12) CyclotomicSquare(x *E12) *E12 { // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E2^6 diff --git a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl index 39896a07a..793d33124 100644 --- a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl @@ -277,7 +277,7 @@ func TestE12Ops(t *testing.T) { genA, )) - properties.Property("[{{ toUpper .Name }}] cyclotomic square and square should be the same in the cyclotomic subgroup", prop.ForAll( + properties.Property("[{{ toUpper .Name }}] cyclotomic square (Granger-Scott) and square should be the same in the cyclotomic subgroup", prop.ForAll( func(a *E12) bool { var b, c, d E12 b.Conjugate(a) @@ -291,6 +291,20 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[{{ toUpper .Name }}] compressed cyclotomic square (Karabina) and square should be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E12) bool { + var b, c, d E12 + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + c.Square(a) + d.CyclotomicSquareCompressed(a).Decompress(&d) + return c.Equal(&d) + }, + genA, + )) + properties.Property("[{{ toUpper .Name }}] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 From bdf5c6bcc3e99f69c2fc64325bd074523befd800 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Wed, 15 Sep 2021 16:58:33 +0200 Subject: [PATCH 02/11] feat(Fp24): implements the Karabina cyclotomic square in E24/E8 --- ecc/bls24-315/internal/fptower/e24.go | 125 ++++++++++++++++++++- ecc/bls24-315/internal/fptower/e24_test.go | 28 +++++ 2 files changed, 152 insertions(+), 1 deletion(-) diff --git a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go index 2707389d7..4e3159eef 100644 --- a/ecc/bls24-315/internal/fptower/e24.go +++ b/ecc/bls24-315/internal/fptower/e24.go @@ -168,7 +168,130 @@ func (z *E24) Square(x *E24) *E24 { return z } -// CyclotomicSquare https://eprint.iacr.org/2009/565.pdf, 3.2 +// Karabina's compressed cyclotomic square +// https://eprint.iacr.org/2010/542.pdf +// Th. 3.2 with minor modifications to fit our tower +func (z *E24) CyclotomicSquareCompressed(x *E24) *E24 { + + var t [7]E4 + + // t0 = g4^2 + t[0].Square(&x.D2.C0) + // t1 = g5^2 + t[1].Square(&x.D2.C1) + // t5 = g4 + g5 + t[5].Add(&x.D2.C0, &x.D2.C1) + // t2 = (g4 + g5)^2 + t[2].Square(&t[5]) + + // t3 = g4^2 + g5^2 + t[3].Add(&t[0], &t[1]) + // t5 = 2 * g4 * g5 + t[5].Sub(&t[2], &t[3]) + + // t6 = g3 + g2 + t[6].Add(&x.D1.C1, &x.D1.C0) + // t3 = (g3 + g2)^2 + t[3].Square(&t[6]) + // t2 = g2^2 + t[2].Square(&x.D1.C0) + + // t6 = 2 * nr * g4 * g5 + t[6].MulByNonResidue(&t[5]) + // t5 = 4 * nr * g4 * g5 + 2 * g2 + t[5].Add(&t[6], &x.D1.C0). 
+	Double(&t[5])
+	// z2 = 6 * nr * g4 * g5 + 2 * g2
+	z.D1.C0.Add(&t[5], &t[6])
+
+	// t4 = nr * g5^2
+	t[4].MulByNonResidue(&t[1])
+	// t5 = nr * g5^2 + g4^2
+	t[5].Add(&t[0], &t[4])
+	// t6 = nr * g5^2 + g4^2 - g3
+	t[6].Sub(&t[5], &x.D1.C1)
+
+	// t1 = g3^2
+	t[1].Square(&x.D1.C1)
+
+	// t6 = 2 * nr * g5^2 + 2 * g4^2 - 2*g3
+	t[6].Double(&t[6])
+	// z3 = 3 * nr * g5^2 + 3 * g4^2 - 2*g3
+	z.D1.C1.Add(&t[6], &t[5])
+
+	// t4 = nr * g3^2
+	t[4].MulByNonResidue(&t[1])
+	// t5 = g2^2 + nr * g3^2
+	t[5].Add(&t[2], &t[4])
+	// t6 = g2^2 + nr * g3^2 - g4
+	t[6].Sub(&t[5], &x.D2.C0)
+	// t6 = 2 * g2^2 + 2 * nr * g3^2 - 2 * g4
+	t[6].Double(&t[6])
+	// z4 = 3 * g2^2 + 3 * nr * g3^2 - 2 * g4
+	z.D2.C0.Add(&t[6], &t[5])
+
+	// t0 = g3^2 + g2^2
+	t[0].Add(&t[2], &t[1])
+	// t5 = 2 * g2 * g3
+	t[5].Sub(&t[3], &t[0])
+	// t6 = 2 * g2 * g3 + g5
+	t[6].Add(&t[5], &x.D2.C1)
+	// t6 = 4 * g2 * g3 + 2 * g5
+	t[6].Double(&t[6])
+	// z5 = 6 * g2 * g3 + 2 * g5
+	z.D2.C1.Add(&t[5], &t[6])
+
+	return z
+}
+
+// Decompress Karabina's cyclotomic square result
+func (z *E24) Decompress(x *E24) *E24 {
+
+	var t [3]E4
+	var one E4
+	one.SetOne()
+
+	// t0 = g4^2
+	t[0].Square(&x.D2.C0)
+	// t1 = 3 * g4^2 - 2 * g3
+	t[1].Sub(&t[0], &x.D1.C1).
+		Double(&t[1]).
+		Add(&t[1], &t[0])
+	// t0 = E * g5^2 + t1
+	t[2].Square(&x.D2.C1)
+	t[0].MulByNonResidue(&t[2]).
+		Add(&t[0], &t[1])
+	// t1 = 1/(4 * g2)
+	t[1].Double(&x.D1.C0).
+		Double(&t[1]).
+		Inverse(&t[1]) // costly
+	// z1 = g1
+	z.D0.C1.Mul(&t[0], &t[1])
+
+	// t1 = g3 * g4
+	t[1].Mul(&x.D1.C1, &x.D2.C0)
+	// t2 = 2 * g1^2 - 3 * g3 * g4
+	t[2].Square(&x.D0.C1).
+		Sub(&t[2], &t[1]).
+		Double(&t[2]).
+		Sub(&t[2], &t[1])
+	// t1 = g2 * g5
+	t[1].Mul(&x.D1.C0, &x.D2.C1)
+	// z0 = E * (2 * g1^2 + g2 * g5 - 3 * g3 * g4) + 1
+	t[2].Add(&t[2], &t[1])
+	z.D0.C0.MulByNonResidue(&t[2]).
+ Add(&z.D0.C0, &one) + + z.D1.C0.Set(&x.D1.C0) + z.D1.C1.Set(&x.D1.C1) + z.D2.C0.Set(&x.D2.C0) + z.D2.C1.Set(&x.D2.C1) + + return z +} + +// Granger-Scott's cyclotomic square +// https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E24) CyclotomicSquare(x *E24) *E24 { var A, B, C, D E8 diff --git a/ecc/bls24-315/internal/fptower/e24_test.go b/ecc/bls24-315/internal/fptower/e24_test.go index 0643d6402..6bd9360c1 100644 --- a/ecc/bls24-315/internal/fptower/e24_test.go +++ b/ecc/bls24-315/internal/fptower/e24_test.go @@ -303,6 +303,34 @@ func TestE24Ops(t *testing.T) { genA, )) + properties.Property("[BLS24-315] cyclotomic square (Granger-Scott) and square should be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E24) bool { + var b, c, d E24 + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusQuad(&b).Mul(a, &b) + c.Square(a) + d.CyclotomicSquare(a) + return c.Equal(&d) + }, + genA, + )) + + properties.Property("[BLS24-315] compressed cyclotomic square (Karabina) and square should be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E24) bool { + var b, c, d E24 + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusQuad(&b).Mul(a, &b) + c.Square(a) + d.CyclotomicSquareCompressed(a).Decompress(&d) + return c.Equal(&d) + }, + genA, + )) + properties.Property("[BLS24-315] Frobenius of x in E24 should be equal to x^q", prop.ForAll( func(a *E24) bool { var b, c E24 From fdc94fe25b6b36cdb270ed8923f0bcf079ec2e0c Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Wed, 15 Sep 2021 17:06:52 +0200 Subject: [PATCH 03/11] feat(Fp6): implements the Karabina cyclotomic square in E6/E3 --- ecc/bw6-633/internal/fptower/e6.go | 125 +++++++++++++++++++++++- ecc/bw6-633/internal/fptower/e6_test.go | 16 ++- ecc/bw6-761/internal/fptower/e6.go | 125 +++++++++++++++++++++++- ecc/bw6-761/internal/fptower/e6_test.go | 16 ++- 4 files changed, 278 insertions(+), 4 deletions(-) diff --git a/ecc/bw6-633/internal/fptower/e6.go b/ecc/bw6-633/internal/fptower/e6.go index e7e633593..19e6c860e 100644 --- a/ecc/bw6-633/internal/fptower/e6.go +++ b/ecc/bw6-633/internal/fptower/e6.go @@ -134,7 +134,130 @@ func (z *E6) Square(x *E6) *E6 { return z } -// CyclotomicSquare https://eprint.iacr.org/2009/565.pdf, 3.2 +// Karabina's compressed cyclotomic square +// https://eprint.iacr.org/2010/542.pdf +// Th. 3.2 with minor modifications to fit our tower +func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { + + var t [7]fp.Element + + // t0 = g1^2 + t[0].Square(&x.B0.A1) + // t1 = g5^2 + t[1].Square(&x.B1.A2) + // t5 = g1 + g5 + t[5].Add(&x.B0.A1, &x.B1.A2) + // t2 = (g1 + g5)^2 + t[2].Square(&t[5]) + + // t3 = g1^2 + g5^2 + t[3].Add(&t[0], &t[1]) + // t5 = 2 * g1 * g5 + t[5].Sub(&t[2], &t[3]) + + // t6 = g3 + g2 + t[6].Add(&x.B1.A0, &x.B0.A2) + // t3 = (g3 + g2)^2 + t[3].Square(&t[6]) + // t2 = g3^2 + t[2].Square(&x.B1.A0) + + // t6 = 2 * nr * g1 * g5 + t[6].MulByNonResidue(&t[5]) + // t5 = 4 * nr * g1 * g5 + 2 * g3 + t[5].Add(&t[6], &x.B1.A0). 
+ Double(&t[5]) + // z3 = 6 * nr * g1 * g5 + 2 * g3 + z.B1.A0.Add(&t[5], &t[6]) + + // t4 = nr * g5^2 + t[4].MulByNonResidue(&t[1]) + // t5 = nr * g5^2 + g1^2 + t[5].Add(&t[0], &t[4]) + // t6 = nr * g5^2 + g1^2 - g2 + t[6].Sub(&t[5], &x.B0.A2) + + // t1 = g2^2 + t[1].Square(&x.B0.A2) + + // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + t[6].Double(&t[6]) + // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + z.B0.A2.Add(&t[6], &t[5]) + + // t4 = nr * g2^2 + t[4].MulByNonResidue(&t[1]) + // t5 = g3^2 + nr * g2^2 + t[5].Add(&t[2], &t[4]) + // t6 = g3^2 + nr * g2^2 - g1 + t[6].Sub(&t[5], &x.B0.A1) + // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + t[6].Double(&t[6]) + // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + z.B0.A1.Add(&t[6], &t[5]) + + // t0 = g2^2 + g3^2 + t[0].Add(&t[2], &t[1]) + // t5 = 2 * g3 * g2 + t[5].Sub(&t[3], &t[0]) + // t6 = 2 * g3 * g2 + g5 + t[6].Add(&t[5], &x.B1.A2) + // t6 = 4 * g3 * g2 + 2 * g5 + t[6].Double(&t[6]) + // z5 = 6 * g3 * g2 + 2 * g5 + z.B1.A2.Add(&t[5], &t[6]) + + return z +} + +// Decompress Karabina's cyclotomic square result +func (z *E6) Decompress(x *E6) *E6 { + + var t [3]fp.Element + var one fp.Element + one.SetOne() + + // t0 = g1^2 + t[0].Square(&x.B0.A1) + // t1 = 3 * g1^2 - 2 * g2 + t[1].Sub(&t[0], &x.B0.A2). + Double(&t[1]). + Add(&t[1], &t[0]) + // t0 = E * g5^2 + t1 + t[2].Square(&x.B1.A2) + t[0].MulByNonResidue(&t[2]). + Add(&t[0], &t[1]) + // t1 = 1/(4 * g3) + t[1].Double(&x.B1.A0). + Double(&t[1]). + Inverse(&t[1]) // costly + // z4 = g4 + z.B1.A1.Mul(&t[0], &t[1]) + + // t1 = g2 * g1 + t[1].Mul(&x.B0.A2, &x.B0.A1) + // t2 = 2 * g4^2 - 3 * g2 * g1 + t[2].Square(&x.B1.A1). + Sub(&t[2], &t[1]). + Double(&t[2]). + Sub(&t[2], &t[1]) + // t1 = g3 * g5 + t[1].Mul(&x.B1.A0, &x.B1.A2) + // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + t[2].Add(&t[2], &t[1]) + z.B0.A0.MulByNonResidue(&t[2]). 
+ Add(&z.B0.A0, &one) + + z.B0.A1.Set(&x.B0.A1) + z.B0.A2.Set(&x.B0.A2) + z.B1.A0.Set(&x.B1.A0) + z.B1.A2.Set(&x.B1.A2) + + return z +} + +// Granger-Scott's cyclotomic square +// https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E6) CyclotomicSquare(x *E6) *E6 { // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3^6 // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, diff --git a/ecc/bw6-633/internal/fptower/e6_test.go b/ecc/bw6-633/internal/fptower/e6_test.go index 7a54a4ac9..49e6cb7bf 100644 --- a/ecc/bw6-633/internal/fptower/e6_test.go +++ b/ecc/bw6-633/internal/fptower/e6_test.go @@ -249,7 +249,7 @@ func TestE6Ops(t *testing.T) { genA, )) - properties.Property("[BW6-633] cyclotomic square and square should be the same in the cyclotomic subgroup", prop.ForAll( + properties.Property("[BW6-633] cyclotomic square (Granger-Scott) and square should be the same in the cyclotomic subgroup", prop.ForAll( func(a *E6) bool { var b, c, d E6 b.Conjugate(a) @@ -263,6 +263,20 @@ func TestE6Ops(t *testing.T) { genA, )) + properties.Property("[BW6-633] compressed cyclotomic square (Karabina) and square should be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E6) bool { + var b, c, d E6 + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.Frobenius(&b).Mul(a, &b) + c.Square(a) + d.CyclotomicSquareCompressed(a).Decompress(&d) + return c.Equal(&d) + }, + genA, + )) + properties.Property("[BW6-633] Frobenius of x in E6 should be equal to x^q", prop.ForAll( func(a *E6) bool { var b, c E6 diff --git a/ecc/bw6-761/internal/fptower/e6.go b/ecc/bw6-761/internal/fptower/e6.go index 2b5f8b358..b8027d9bc 100644 --- a/ecc/bw6-761/internal/fptower/e6.go +++ b/ecc/bw6-761/internal/fptower/e6.go @@ -133,7 +133,130 @@ func (z *E6) Square(x *E6) *E6 { return z } -// CyclotomicSquare https://eprint.iacr.org/2009/565.pdf, 3.2 +// Karabina's compressed cyclotomic square +// https://eprint.iacr.org/2010/542.pdf +// Th. 3.2 with minor modifications to fit our tower +func (z *E6) CyclotomicSquareCompressed(x *E6) *E6 { + + var t [7]fp.Element + + // t0 = g1^2 + t[0].Square(&x.B0.A1) + // t1 = g5^2 + t[1].Square(&x.B1.A2) + // t5 = g1 + g5 + t[5].Add(&x.B0.A1, &x.B1.A2) + // t2 = (g1 + g5)^2 + t[2].Square(&t[5]) + + // t3 = g1^2 + g5^2 + t[3].Add(&t[0], &t[1]) + // t5 = 2 * g1 * g5 + t[5].Sub(&t[2], &t[3]) + + // t6 = g3 + g2 + t[6].Add(&x.B1.A0, &x.B0.A2) + // t3 = (g3 + g2)^2 + t[3].Square(&t[6]) + // t2 = g3^2 + t[2].Square(&x.B1.A0) + + // t6 = 2 * nr * g1 * g5 + t[6].MulByNonResidue(&t[5]) + // t5 = 4 * nr * g1 * g5 + 2 * g3 + t[5].Add(&t[6], &x.B1.A0). 
+ Double(&t[5]) + // z3 = 6 * nr * g1 * g5 + 2 * g3 + z.B1.A0.Add(&t[5], &t[6]) + + // t4 = nr * g5^2 + t[4].MulByNonResidue(&t[1]) + // t5 = nr * g5^2 + g1^2 + t[5].Add(&t[0], &t[4]) + // t6 = nr * g5^2 + g1^2 - g2 + t[6].Sub(&t[5], &x.B0.A2) + + // t1 = g2^2 + t[1].Square(&x.B0.A2) + + // t6 = 2 * nr * g5^2 + 2 * g1^2 - 2*g2 + t[6].Double(&t[6]) + // z2 = 3 * nr * g5^2 + 3 * g1^2 - 2*g2 + z.B0.A2.Add(&t[6], &t[5]) + + // t4 = nr * g2^2 + t[4].MulByNonResidue(&t[1]) + // t5 = g3^2 + nr * g2^2 + t[5].Add(&t[2], &t[4]) + // t6 = g3^2 + nr * g2^2 - g1 + t[6].Sub(&t[5], &x.B0.A1) + // t6 = 2 * g3^2 + 2 * nr * g2^2 - 2 * g1 + t[6].Double(&t[6]) + // z1 = 3 * g3^2 + 3 * nr * g2^2 - 2 * g1 + z.B0.A1.Add(&t[6], &t[5]) + + // t0 = g2^2 + g3^2 + t[0].Add(&t[2], &t[1]) + // t5 = 2 * g3 * g2 + t[5].Sub(&t[3], &t[0]) + // t6 = 2 * g3 * g2 + g5 + t[6].Add(&t[5], &x.B1.A2) + // t6 = 4 * g3 * g2 + 2 * g5 + t[6].Double(&t[6]) + // z5 = 6 * g3 * g2 + 2 * g5 + z.B1.A2.Add(&t[5], &t[6]) + + return z +} + +// Decompress Karabina's cyclotomic square result +func (z *E6) Decompress(x *E6) *E6 { + + var t [3]fp.Element + var one fp.Element + one.SetOne() + + // t0 = g1^2 + t[0].Square(&x.B0.A1) + // t1 = 3 * g1^2 - 2 * g2 + t[1].Sub(&t[0], &x.B0.A2). + Double(&t[1]). + Add(&t[1], &t[0]) + // t0 = E * g5^2 + t1 + t[2].Square(&x.B1.A2) + t[0].MulByNonResidue(&t[2]). + Add(&t[0], &t[1]) + // t1 = 1/(4 * g3) + t[1].Double(&x.B1.A0). + Double(&t[1]). + Inverse(&t[1]) // costly + // z4 = g4 + z.B1.A1.Mul(&t[0], &t[1]) + + // t1 = g2 * g1 + t[1].Mul(&x.B0.A2, &x.B0.A1) + // t2 = 2 * g4^2 - 3 * g2 * g1 + t[2].Square(&x.B1.A1). + Sub(&t[2], &t[1]). + Double(&t[2]). + Sub(&t[2], &t[1]) + // t1 = g3 * g5 + t[1].Mul(&x.B1.A0, &x.B1.A2) + // c_0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + t[2].Add(&t[2], &t[1]) + z.B0.A0.MulByNonResidue(&t[2]). 
+ Add(&z.B0.A0, &one) + + z.B0.A1.Set(&x.B0.A1) + z.B0.A2.Set(&x.B0.A2) + z.B1.A0.Set(&x.B1.A0) + z.B1.A2.Set(&x.B1.A2) + + return z +} + +// Granger-Scott's cyclotomic square +// https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E6) CyclotomicSquare(x *E6) *E6 { // x=(x0,x1,x2,x3,x4,x5,x6,x7) in E3^6 // cyclosquare(x)=(3*x4^2*u + 3*x0^2 - 2*x0, diff --git a/ecc/bw6-761/internal/fptower/e6_test.go b/ecc/bw6-761/internal/fptower/e6_test.go index a7c3a1c3c..44aeb6618 100644 --- a/ecc/bw6-761/internal/fptower/e6_test.go +++ b/ecc/bw6-761/internal/fptower/e6_test.go @@ -249,7 +249,7 @@ func TestE6Ops(t *testing.T) { genA, )) - properties.Property("[BW6-761] cyclotomic square and square should be the same in the cyclotomic subgroup", prop.ForAll( + properties.Property("[BW6-761] cyclotomic square (Granger-Scott) and square should be the same in the cyclotomic subgroup", prop.ForAll( func(a *E6) bool { var b, c, d E6 b.Conjugate(a) @@ -263,6 +263,20 @@ func TestE6Ops(t *testing.T) { genA, )) + properties.Property("[BW6-761] compressed cyclotomic square (Karabina) and square should be the same in the cyclotomic subgroup", prop.ForAll( + func(a *E6) bool { + var b, c, d E6 + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.Frobenius(&b).Mul(a, &b) + c.Square(a) + d.CyclotomicSquareCompressed(a).Decompress(&d) + return c.Equal(&d) + }, + genA, + )) + properties.Property("[BW6-761] Frobenius of x in E6 should be equal to x^q", prop.ForAll( func(a *E6) bool { var b, c E6 From 70a662dbc1035242078b4292e96dd2fb96be755e Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Wed, 15 Sep 2021 11:04:55 -0500 Subject: [PATCH 04/11] feat: added x86 assembly impl for field.Inverse --- ecc/bls12-377/fp/element.go | 124 ++-- ecc/bls12-377/fp/element_ops_amd64.go | 3 + ecc/bls12-377/fp/element_ops_amd64.s | 353 ++++++++++ ecc/bls12-377/fp/element_ops_noasm.go | 4 + ecc/bls12-377/fr/element.go | 86 +-- ecc/bls12-377/fr/element_ops_amd64.go | 3 + ecc/bls12-377/fr/element_ops_amd64.s | 265 +++++++ ecc/bls12-377/fr/element_ops_noasm.go | 4 + ecc/bls12-381/fp/element.go | 124 ++-- ecc/bls12-381/fp/element_ops_amd64.go | 3 + ecc/bls12-381/fp/element_ops_amd64.s | 353 ++++++++++ ecc/bls12-381/fp/element_ops_noasm.go | 4 + ecc/bls12-381/fr/element.go | 86 +-- ecc/bls12-381/fr/element_ops_amd64.go | 3 + ecc/bls12-381/fr/element_ops_amd64.s | 265 +++++++ ecc/bls12-381/fr/element_ops_noasm.go | 4 + ecc/bls24-315/fp/element.go | 105 +-- ecc/bls24-315/fp/element_ops_amd64.go | 3 + ecc/bls24-315/fp/element_ops_amd64.s | 309 ++++++++ ecc/bls24-315/fp/element_ops_noasm.go | 4 + ecc/bls24-315/fr/element.go | 86 +-- ecc/bls24-315/fr/element_ops_amd64.go | 3 + ecc/bls24-315/fr/element_ops_amd64.s | 265 +++++++ ecc/bls24-315/fr/element_ops_noasm.go | 4 + ecc/bn254/fp/element.go | 86 +-- ecc/bn254/fp/element_ops_amd64.go | 3 + ecc/bn254/fp/element_ops_amd64.s | 265 +++++++ ecc/bn254/fp/element_ops_noasm.go | 4 + ecc/bn254/fr/element.go | 86 +-- ecc/bn254/fr/element_ops_amd64.go | 3 + ecc/bn254/fr/element_ops_amd64.s | 265 +++++++ ecc/bn254/fr/element_ops_noasm.go | 4 + ecc/bw6-633/fp/element.go | 200 +++--- ecc/bw6-633/fp/element_ops_amd64.go | 3 + ecc/bw6-633/fp/element_ops_amd64.s | 568 +++++++++++++++ ecc/bw6-633/fp/element_ops_noasm.go | 4 + ecc/bw6-633/fr/element.go | 105 +-- ecc/bw6-633/fr/element_ops_amd64.go | 3 + ecc/bw6-633/fr/element_ops_amd64.s | 309 ++++++++ ecc/bw6-633/fr/element_ops_noasm.go | 4 + ecc/bw6-761/fp/element.go | 238 +++---- ecc/bw6-761/fp/element_ops_amd64.go | 3 + ecc/bw6-761/fp/element_ops_amd64.s | 664 
++++++++++++++++++ ecc/bw6-761/fp/element_ops_noasm.go | 4 + ecc/bw6-761/fr/element.go | 124 ++-- ecc/bw6-761/fr/element_ops_amd64.go | 3 + ecc/bw6-761/fr/element_ops_amd64.s | 353 ++++++++++ ecc/bw6-761/fr/element_ops_noasm.go | 4 + field/asm/amd64/asm_macros.go | 7 + field/asm/amd64/build.go | 3 + field/asm/amd64/element_inverse.go | 246 +++++++ field/internal/templates/element/base.go | 11 + field/internal/templates/element/inverse.go | 52 +- field/internal/templates/element/ops.go | 3 + .../internal/templates/element/ops_generic.go | 4 + go.mod | 2 +- go.sum | 6 +- 57 files changed, 5335 insertions(+), 767 deletions(-) create mode 100644 field/asm/amd64/element_inverse.go diff --git a/ecc/bls12-377/fp/element.go b/ecc/bls12-377/fp/element.go index ed38d2520..ff802d3ae 100644 --- a/ecc/bls12-377/fp/element.go +++ b/ecc/bls12-377/fp/element.go @@ -699,6 +699,27 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[5] != 0 { + return 320 + bits.Len64(z[5]) + } + if z[4] != 0 { + return 256 + bits.Len64(z[4]) + } + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -962,8 +983,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -991,29 +1021,20 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[5] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 + v[3] = v[3]>>1 | v[4]<<63 + v[4] = v[4]>>1 | v[5]<<63 v[5] >>= 1 - t = t2 - t2 = v[4] << 63 - v[4] = (v[4] >> 1) | t - t = t2 - t2 = v[3] << 63 - v[3] = (v[3] >> 1) | t - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = (v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -1028,43 +1049,25 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[5] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 + s[3] = s[3]>>1 | s[4]<<63 + s[4] = s[4]>>1 | s[5]<<63 s[5] >>= 1 - t = t2 - t2 = s[4] << 63 - s[4] = (s[4] >> 1) | t - t = t2 - t2 = s[3] << 63 - s[3] = (s[3] >> 1) | t - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[5] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 + u[3] = u[3]>>1 | u[4]<<63 + u[4] = u[4]>>1 | u[5]<<63 u[5] >>= 1 - t = t2 - t2 = u[4] << 63 - u[4] = (u[4] >> 1) | t - t = t2 - t2 = u[3] << 63 - u[3] = (u[3] >> 1) | t - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 
= u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -1079,22 +1082,13 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[5] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 + r[3] = r[3]>>1 | r[4]<<63 + r[4] = r[4]>>1 | r[5]<<63 r[5] >>= 1 - t = t2 - t2 = r[4] << 63 - r[4] = (r[4] >> 1) | t - t = t2 - t2 = r[3] << 63 - r[3] = (r[3] >> 1) | t - t = t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = (r[0] >> 1) | t } @@ -1161,10 +1155,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[5]|u[4]|u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[5]|v[4]|v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bls12-377/fp/element_ops_amd64.go b/ecc/bls12-377/fp/element_ops_amd64.go index 73a3711ec..d61412bd6 100644 --- a/ecc/bls12-377/fp/element_ops_amd64.go +++ b/ecc/bls12-377/fp/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bls12-377/fp/element_ops_amd64.s b/ecc/bls12-377/fp/element_ops_amd64.s index 596b552dd..857431795 100644 --- a/ecc/bls12-377/fp/element_ops_amd64.s +++ b/ecc/bls12-377/fp/element_ops_amd64.s @@ -450,3 +450,356 @@ TEXT ·Butterfly(SB), $48-16 MOVQ R8, 32(AX) MOVQ R9, 40(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $136-16 + // u = q + // u[0] -> R9 + // u[1] -> R10 + // u[2] -> R11 + // u[3] -> R12 + // u[4] -> R13 + // u[5] -> R14 + MOVQ q<>+0(SB), R9 + MOVQ q<>+8(SB), R10 + MOVQ q<>+16(SB), R11 + MOVQ q<>+24(SB), R12 + MOVQ q<>+32(SB), R13 + MOVQ q<>+40(SB), R14 + + // s = r^2 + // s[0] -> s11-96(SP) + // s[1] -> s12-104(SP) + // s[2] -> s13-112(SP) + // s[3] -> s14-120(SP) + // s[4] -> s15-128(SP) + // s[5] -> s16-136(SP) + MOVQ $0xb786686c9400cd22, R8 + MOVQ R8, s11-96(SP) + MOVQ $0x0329fcaab00431b1, R8 + MOVQ R8, s12-104(SP) + MOVQ $0x22a5f11162d6b46d, R8 + MOVQ R8, s13-112(SP) + MOVQ $0xbfdf7d03827dc3ac, R8 + MOVQ R8, s14-120(SP) + MOVQ $0x837e92f041790bf9, R8 + MOVQ R8, s15-128(SP) + MOVQ $0x006dfccb1e914b88, R8 + MOVQ R8, s16-136(SP) + + // v = x + // v[0] -> R15 + // v[1] -> s0-8(SP) + // v[2] -> s1-16(SP) + // v[3] -> s2-24(SP) + // v[4] -> s3-32(SP) + // v[5] -> s4-40(SP) + MOVQ x+8(FP), R8 + MOVQ 0(R8), AX + MOVQ 8(R8), DX + MOVQ 16(R8), CX + MOVQ 24(R8), BX + MOVQ 32(R8), SI + MOVQ 40(R8), DI + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + MOVQ SI, s3-32(SP) + MOVQ DI, s4-40(SP) + + // if x is 0, returns 0 + MOVQ AX, R8 + ORQ DX, R8 + ORQ CX, R8 + ORQ BX, R8 + ORQ SI, R8 + ORQ DI, R8 + JEQ l7 + + // r = 0 + // r[0] -> s5-48(SP) + // r[1] -> s6-56(SP) + // r[2] -> s7-64(SP) + // r[3] -> s8-72(SP) + // r[4] -> s9-80(SP) + // r[5] -> s10-88(SP) + MOVQ $0, s5-48(SP) + MOVQ $0, s6-56(SP) + MOVQ $0, s7-64(SP) + MOVQ $0, s8-72(SP) + MOVQ $0, s9-80(SP) + MOVQ $0, s10-88(SP) + +l2: + BTQ $0, AX + JCS l8 + MOVQ $0, BP + XORQ R8, R8 + +l9: + INCQ BP + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + BTQ $0, AX + JCC l9 + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + MOVQ SI, s3-32(SP) + MOVQ DI, s4-40(SP) + MOVQ s11-96(SP), AX + MOVQ s12-104(SP), DX + MOVQ s13-112(SP), CX + MOVQ s14-120(SP), BX + MOVQ s15-128(SP), SI + MOVQ s16-136(SP), DI 
+ +l10: + BTQ $0, AX + JCC l11 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l11: + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + DECQ BP + JNE l10 + MOVQ AX, s11-96(SP) + MOVQ DX, s12-104(SP) + MOVQ CX, s13-112(SP) + MOVQ BX, s14-120(SP) + MOVQ SI, s15-128(SP) + MOVQ DI, s16-136(SP) + +l8: + MOVQ R9, AX + MOVQ R10, DX + MOVQ R11, CX + MOVQ R12, BX + MOVQ R13, SI + MOVQ R14, DI + BTQ $0, AX + JCS l12 + MOVQ $0, BP + XORQ R8, R8 + +l13: + INCQ BP + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + BTQ $0, AX + JCC l13 + MOVQ AX, R9 + MOVQ DX, R10 + MOVQ CX, R11 + MOVQ BX, R12 + MOVQ SI, R13 + MOVQ DI, R14 + MOVQ s5-48(SP), AX + MOVQ s6-56(SP), DX + MOVQ s7-64(SP), CX + MOVQ s8-72(SP), BX + MOVQ s9-80(SP), SI + MOVQ s10-88(SP), DI + +l14: + BTQ $0, AX + JCC l15 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l15: + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + DECQ BP + JNE l14 + MOVQ AX, s5-48(SP) + MOVQ DX, s6-56(SP) + MOVQ CX, s7-64(SP) + MOVQ BX, s8-72(SP) + MOVQ SI, s9-80(SP) + MOVQ DI, s10-88(SP) + +l12: + // v = v - u + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ s3-32(SP), SI + MOVQ s4-40(SP), DI + SUBQ R9, AX + SBBQ R10, DX + SBBQ R11, CX + SBBQ R12, BX + SBBQ R13, SI + SBBQ R14, DI + JCC l3 + SUBQ R15, R9 + SBBQ s0-8(SP), R10 + SBBQ s1-16(SP), R11 + SBBQ s2-24(SP), R12 + SBBQ s3-32(SP), R13 + SBBQ s4-40(SP), R14 + MOVQ s5-48(SP), AX + MOVQ s6-56(SP), DX + MOVQ s7-64(SP), CX + MOVQ s8-72(SP), BX + MOVQ s9-80(SP), SI + MOVQ s10-88(SP), DI + SUBQ s11-96(SP), AX + SBBQ s12-104(SP), DX + SBBQ s13-112(SP), CX + SBBQ s14-120(SP), BX + SBBQ s15-128(SP), SI + SBBQ s16-136(SP), DI + JCC l16 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l16: + MOVQ AX, s5-48(SP) + MOVQ DX, s6-56(SP) + MOVQ CX, s7-64(SP) + MOVQ BX, s8-72(SP) + MOVQ SI, s9-80(SP) + MOVQ DI, s10-88(SP) + JMP l4 + +l3: + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + MOVQ SI, s3-32(SP) + MOVQ DI, s4-40(SP) + MOVQ s11-96(SP), AX + MOVQ s12-104(SP), DX + MOVQ s13-112(SP), CX + MOVQ s14-120(SP), BX + MOVQ s15-128(SP), SI + MOVQ s16-136(SP), DI + SUBQ s5-48(SP), AX + SBBQ s6-56(SP), DX + SBBQ s7-64(SP), CX + SBBQ s8-72(SP), BX + SBBQ s9-80(SP), SI + SBBQ s10-88(SP), DI + JCC l17 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l17: + MOVQ AX, s11-96(SP) + MOVQ DX, s12-104(SP) + MOVQ CX, s13-112(SP) + MOVQ BX, s14-120(SP) + MOVQ SI, s15-128(SP) + MOVQ DI, s16-136(SP) + +l4: + MOVQ R9, R8 + SUBQ $1, R8 + ORQ R10, R8 + ORQ R11, R8 + ORQ R12, R8 + ORQ R13, R8 + ORQ R14, R8 + JEQ l5 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ s3-32(SP), SI + MOVQ s4-40(SP), DI + MOVQ AX, R8 + SUBQ $1, R8 + JNE l2 + ORQ DX, R8 + ORQ CX, R8 + ORQ BX, R8 + ORQ SI, R8 + ORQ DI, R8 + JEQ l6 + JMP l2 + +l5: + MOVQ res+0(FP), R8 + MOVQ s5-48(SP), AX + MOVQ s6-56(SP), DX + MOVQ s7-64(SP), CX + MOVQ s8-72(SP), BX + MOVQ s9-80(SP), SI + MOVQ s10-88(SP), DI + MOVQ AX, 0(R8) + MOVQ DX, 8(R8) + MOVQ CX, 
16(R8) + MOVQ BX, 24(R8) + MOVQ SI, 32(R8) + MOVQ DI, 40(R8) + RET + +l6: + MOVQ res+0(FP), R8 + MOVQ s11-96(SP), AX + MOVQ s12-104(SP), DX + MOVQ s13-112(SP), CX + MOVQ s14-120(SP), BX + MOVQ s15-128(SP), SI + MOVQ s16-136(SP), DI + MOVQ AX, 0(R8) + MOVQ DX, 8(R8) + MOVQ CX, 16(R8) + MOVQ BX, 24(R8) + MOVQ SI, 32(R8) + MOVQ DI, 40(R8) + RET + +l7: + MOVQ res+0(FP), R8 + MOVQ $0, 0(R8) + MOVQ $0, 8(R8) + MOVQ $0, 16(R8) + MOVQ $0, 24(R8) + MOVQ $0, 32(R8) + MOVQ $0, 40(R8) + RET diff --git a/ecc/bls12-377/fp/element_ops_noasm.go b/ecc/bls12-377/fp/element_ops_noasm.go index fec628918..48d55e2ea 100644 --- a/ecc/bls12-377/fp/element_ops_noasm.go +++ b/ecc/bls12-377/fp/element_ops_noasm.go @@ -51,6 +51,10 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } +func inverse(z, x *Element) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bls12-377/fr/element.go b/ecc/bls12-377/fr/element.go index 8447be1f8..014cc671b 100644 --- a/ecc/bls12-377/fr/element.go +++ b/ecc/bls12-377/fr/element.go @@ -573,6 +573,21 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -830,8 +845,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -855,23 +879,18 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[3] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 v[3] >>= 1 - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = (v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -884,31 +903,21 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[3] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 s[3] >>= 1 - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[3] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 u[3] >>= 1 - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 = u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -921,16 +930,11 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[3] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 r[3] >>= 1 - t = 
t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = (r[0] >> 1) | t } @@ -985,10 +989,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bls12-377/fr/element_ops_amd64.go b/ecc/bls12-377/fr/element_ops_amd64.go index 78022b3e6..9ebabc26a 100644 --- a/ecc/bls12-377/fr/element_ops_amd64.go +++ b/ecc/bls12-377/fr/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bls12-377/fr/element_ops_amd64.s b/ecc/bls12-377/fr/element_ops_amd64.s index 85a1c457f..21cd7c70f 100644 --- a/ecc/bls12-377/fr/element_ops_amd64.s +++ b/ecc/bls12-377/fr/element_ops_amd64.s @@ -338,3 +338,268 @@ TEXT ·Butterfly(SB), NOSPLIT, $0-16 MOVQ SI, 16(AX) MOVQ DI, 24(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $56-16 + // u = q + // u[0] -> DI + // u[1] -> R8 + // u[2] -> R9 + // u[3] -> R10 + MOVQ q<>+0(SB), DI + MOVQ q<>+8(SB), R8 + MOVQ q<>+16(SB), R9 + MOVQ q<>+24(SB), R10 + + // s = r^2 + // s[0] -> s3-32(SP) + // s[1] -> s4-40(SP) + // s[2] -> s5-48(SP) + // s[3] -> s6-56(SP) + MOVQ $0x25d577bab861857b, SI + MOVQ SI, s3-32(SP) + MOVQ $0xcc2c27b58860591f, SI + MOVQ SI, s4-40(SP) + MOVQ $0xa7cc008fe5dc8593, SI + MOVQ SI, s5-48(SP) + MOVQ $0x011fdae7eff1c939, SI + MOVQ SI, s6-56(SP) + + // v = x + // v[0] -> R11 + // v[1] -> R12 + // v[2] -> R13 + // v[3] -> R14 + MOVQ x+8(FP), SI + MOVQ 0(SI), AX + MOVQ 8(SI), DX + MOVQ 16(SI), CX + MOVQ 24(SI), BX + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + + // if x is 0, returns 0 + MOVQ AX, SI + ORQ DX, SI + ORQ CX, SI + ORQ BX, SI + JEQ l7 + + // r = 0 + // r[0] -> R15 + // r[1] -> s0-8(SP) + // r[2] -> s1-16(SP) + // r[3] -> s2-24(SP) + MOVQ $0, R15 + MOVQ $0, s0-8(SP) + MOVQ $0, s1-16(SP) + MOVQ $0, s2-24(SP) + +l2: + BTQ $0, AX + JCS l8 + MOVQ $0, BP + XORQ SI, SI + +l9: + INCQ BP + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + BTQ $0, AX + JCC l9 + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + +l10: + BTQ $0, AX + JCC l11 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l11: + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + DECQ BP + JNE l10 + MOVQ AX, s3-32(SP) + MOVQ DX, s4-40(SP) + MOVQ CX, s5-48(SP) + MOVQ BX, s6-56(SP) + +l8: + MOVQ DI, AX + MOVQ R8, DX + MOVQ R9, CX + MOVQ R10, BX + BTQ $0, AX + JCS l12 + MOVQ $0, BP + XORQ SI, SI + +l13: + INCQ BP + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + BTQ $0, AX + JCC l13 + MOVQ AX, DI + MOVQ DX, R8 + MOVQ CX, R9 + MOVQ BX, R10 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + +l14: + BTQ $0, AX + JCC l15 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l15: + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + DECQ BP + JNE l14 + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + +l12: + // v = v - u + MOVQ R11, AX + MOVQ R12, DX + MOVQ R13, CX + MOVQ R14, BX + SUBQ DI, AX + SBBQ R8, DX + SBBQ R9, CX + SBBQ R10, BX + JCC l3 + SUBQ R11, DI + 
SBBQ R12, R8 + SBBQ R13, R9 + SBBQ R14, R10 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + SUBQ s3-32(SP), AX + SBBQ s4-40(SP), DX + SBBQ s5-48(SP), CX + SBBQ s6-56(SP), BX + JCC l16 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l16: + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + JMP l4 + +l3: + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + SUBQ R15, AX + SBBQ s0-8(SP), DX + SBBQ s1-16(SP), CX + SBBQ s2-24(SP), BX + JCC l17 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l17: + MOVQ AX, s3-32(SP) + MOVQ DX, s4-40(SP) + MOVQ CX, s5-48(SP) + MOVQ BX, s6-56(SP) + +l4: + MOVQ DI, SI + SUBQ $1, SI + ORQ R8, SI + ORQ R9, SI + ORQ R10, SI + JEQ l5 + MOVQ R11, AX + MOVQ R12, DX + MOVQ R13, CX + MOVQ R14, BX + MOVQ AX, SI + SUBQ $1, SI + JNE l2 + ORQ DX, SI + ORQ CX, SI + ORQ BX, SI + JEQ l6 + JMP l2 + +l5: + MOVQ res+0(FP), SI + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ AX, 0(SI) + MOVQ DX, 8(SI) + MOVQ CX, 16(SI) + MOVQ BX, 24(SI) + RET + +l6: + MOVQ res+0(FP), SI + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + MOVQ AX, 0(SI) + MOVQ DX, 8(SI) + MOVQ CX, 16(SI) + MOVQ BX, 24(SI) + RET + +l7: + MOVQ res+0(FP), SI + MOVQ $0, 0(SI) + MOVQ $0, 8(SI) + MOVQ $0, 16(SI) + MOVQ $0, 24(SI) + RET diff --git a/ecc/bls12-377/fr/element_ops_noasm.go b/ecc/bls12-377/fr/element_ops_noasm.go index ec1fac18d..006365daa 100644 --- a/ecc/bls12-377/fr/element_ops_noasm.go +++ b/ecc/bls12-377/fr/element_ops_noasm.go @@ -51,6 +51,10 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } +func inverse(z, x *Element) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. 
mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bls12-381/fp/element.go b/ecc/bls12-381/fp/element.go index ce7cd405d..2dc0c7ce8 100644 --- a/ecc/bls12-381/fp/element.go +++ b/ecc/bls12-381/fp/element.go @@ -699,6 +699,27 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[5] != 0 { + return 320 + bits.Len64(z[5]) + } + if z[4] != 0 { + return 256 + bits.Len64(z[4]) + } + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -908,8 +929,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -937,29 +967,20 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[5] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 + v[3] = v[3]>>1 | v[4]<<63 + v[4] = v[4]>>1 | v[5]<<63 v[5] >>= 1 - t = t2 - t2 = v[4] << 63 - v[4] = (v[4] >> 1) | t - t = t2 - t2 = v[3] << 63 - v[3] = (v[3] >> 1) | t - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = (v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -974,43 +995,25 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[5] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 + s[3] = s[3]>>1 | s[4]<<63 + s[4] = s[4]>>1 | s[5]<<63 s[5] >>= 1 - t = t2 - t2 = s[4] << 63 - s[4] = (s[4] >> 1) | t - t = t2 - t2 = s[3] << 63 - s[3] = (s[3] >> 1) | t - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[5] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 + u[3] = u[3]>>1 | u[4]<<63 + u[4] = u[4]>>1 | u[5]<<63 u[5] >>= 1 - t = t2 - t2 = u[4] << 63 - u[4] = (u[4] >> 1) | t - t = t2 - t2 = u[3] << 63 - u[3] = (u[3] >> 1) | t - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 = u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -1025,22 +1028,13 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[5] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 + r[3] = r[3]>>1 | r[4]<<63 + r[4] = r[4]>>1 | r[5]<<63 r[5] >>= 1 - t = t2 - t2 = r[4] << 63 - r[4] = (r[4] >> 1) | t - t = t2 - t2 = r[3] << 63 - r[3] = (r[3] >> 1) | t - t = t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = 
(r[0] >> 1) | t } @@ -1107,10 +1101,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[5]|u[4]|u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[5]|v[4]|v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bls12-381/fp/element_ops_amd64.go b/ecc/bls12-381/fp/element_ops_amd64.go index 73a3711ec..d61412bd6 100644 --- a/ecc/bls12-381/fp/element_ops_amd64.go +++ b/ecc/bls12-381/fp/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bls12-381/fp/element_ops_amd64.s b/ecc/bls12-381/fp/element_ops_amd64.s index 099c9afca..482186d3e 100644 --- a/ecc/bls12-381/fp/element_ops_amd64.s +++ b/ecc/bls12-381/fp/element_ops_amd64.s @@ -450,3 +450,356 @@ TEXT ·Butterfly(SB), $48-16 MOVQ R8, 32(AX) MOVQ R9, 40(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $136-16 + // u = q + // u[0] -> R9 + // u[1] -> R10 + // u[2] -> R11 + // u[3] -> R12 + // u[4] -> R13 + // u[5] -> R14 + MOVQ q<>+0(SB), R9 + MOVQ q<>+8(SB), R10 + MOVQ q<>+16(SB), R11 + MOVQ q<>+24(SB), R12 + MOVQ q<>+32(SB), R13 + MOVQ q<>+40(SB), R14 + + // s = r^2 + // s[0] -> s11-96(SP) + // s[1] -> s12-104(SP) + // s[2] -> s13-112(SP) + // s[3] -> s14-120(SP) + // s[4] -> s15-128(SP) + // s[5] -> s16-136(SP) + MOVQ $0xf4df1f341c341746, R8 + MOVQ R8, s11-96(SP) + MOVQ $0x0a76e6a609d104f1, R8 + MOVQ R8, s12-104(SP) + MOVQ $0x8de5476c4c95b6d5, R8 + MOVQ R8, s13-112(SP) + MOVQ $0x67eb88a9939d83c0, R8 + MOVQ R8, s14-120(SP) + MOVQ $0x9a793e85b519952d, R8 + MOVQ R8, s15-128(SP) + MOVQ $0x11988fe592cae3aa, R8 + MOVQ R8, s16-136(SP) + + // v = x + // v[0] -> R15 + // v[1] -> s0-8(SP) + // v[2] -> s1-16(SP) + // v[3] -> s2-24(SP) + // v[4] -> s3-32(SP) + // v[5] -> s4-40(SP) + MOVQ x+8(FP), R8 + MOVQ 0(R8), AX + MOVQ 8(R8), DX + MOVQ 16(R8), CX + MOVQ 24(R8), BX + MOVQ 32(R8), SI + MOVQ 40(R8), DI + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + MOVQ SI, s3-32(SP) + MOVQ DI, s4-40(SP) + + // if x is 0, returns 0 + MOVQ AX, R8 + ORQ DX, R8 + ORQ CX, R8 + ORQ BX, R8 + ORQ SI, R8 + ORQ DI, R8 + JEQ l7 + + // r = 0 + // r[0] -> s5-48(SP) + // r[1] -> s6-56(SP) + // r[2] -> s7-64(SP) + // r[3] -> s8-72(SP) + // r[4] -> s9-80(SP) + // r[5] -> s10-88(SP) + MOVQ $0, s5-48(SP) + MOVQ $0, s6-56(SP) + MOVQ $0, s7-64(SP) + MOVQ $0, s8-72(SP) + MOVQ $0, s9-80(SP) + MOVQ $0, s10-88(SP) + +l2: + BTQ $0, AX + JCS l8 + MOVQ $0, BP + XORQ R8, R8 + +l9: + INCQ BP + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + BTQ $0, AX + JCC l9 + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + MOVQ SI, s3-32(SP) + MOVQ DI, s4-40(SP) + MOVQ s11-96(SP), AX + MOVQ s12-104(SP), DX + MOVQ s13-112(SP), CX + MOVQ s14-120(SP), BX + MOVQ s15-128(SP), SI + MOVQ s16-136(SP), DI + +l10: + BTQ $0, AX + JCC l11 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l11: + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + DECQ BP + JNE l10 + MOVQ AX, s11-96(SP) + MOVQ DX, s12-104(SP) + MOVQ CX, s13-112(SP) + MOVQ BX, s14-120(SP) + MOVQ SI, s15-128(SP) + MOVQ DI, s16-136(SP) + +l8: + MOVQ R9, AX + MOVQ R10, DX + MOVQ R11, CX + MOVQ R12, BX + MOVQ R13, SI + MOVQ R14, DI + BTQ $0, AX + JCS l12 + MOVQ 
$0, BP + XORQ R8, R8 + +l13: + INCQ BP + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + BTQ $0, AX + JCC l13 + MOVQ AX, R9 + MOVQ DX, R10 + MOVQ CX, R11 + MOVQ BX, R12 + MOVQ SI, R13 + MOVQ DI, R14 + MOVQ s5-48(SP), AX + MOVQ s6-56(SP), DX + MOVQ s7-64(SP), CX + MOVQ s8-72(SP), BX + MOVQ s9-80(SP), SI + MOVQ s10-88(SP), DI + +l14: + BTQ $0, AX + JCC l15 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l15: + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + DECQ BP + JNE l14 + MOVQ AX, s5-48(SP) + MOVQ DX, s6-56(SP) + MOVQ CX, s7-64(SP) + MOVQ BX, s8-72(SP) + MOVQ SI, s9-80(SP) + MOVQ DI, s10-88(SP) + +l12: + // v = v - u + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ s3-32(SP), SI + MOVQ s4-40(SP), DI + SUBQ R9, AX + SBBQ R10, DX + SBBQ R11, CX + SBBQ R12, BX + SBBQ R13, SI + SBBQ R14, DI + JCC l3 + SUBQ R15, R9 + SBBQ s0-8(SP), R10 + SBBQ s1-16(SP), R11 + SBBQ s2-24(SP), R12 + SBBQ s3-32(SP), R13 + SBBQ s4-40(SP), R14 + MOVQ s5-48(SP), AX + MOVQ s6-56(SP), DX + MOVQ s7-64(SP), CX + MOVQ s8-72(SP), BX + MOVQ s9-80(SP), SI + MOVQ s10-88(SP), DI + SUBQ s11-96(SP), AX + SBBQ s12-104(SP), DX + SBBQ s13-112(SP), CX + SBBQ s14-120(SP), BX + SBBQ s15-128(SP), SI + SBBQ s16-136(SP), DI + JCC l16 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l16: + MOVQ AX, s5-48(SP) + MOVQ DX, s6-56(SP) + MOVQ CX, s7-64(SP) + MOVQ BX, s8-72(SP) + MOVQ SI, s9-80(SP) + MOVQ DI, s10-88(SP) + JMP l4 + +l3: + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + MOVQ SI, s3-32(SP) + MOVQ DI, s4-40(SP) + MOVQ s11-96(SP), AX + MOVQ s12-104(SP), DX + MOVQ s13-112(SP), CX + MOVQ s14-120(SP), BX + MOVQ s15-128(SP), SI + MOVQ s16-136(SP), DI + SUBQ s5-48(SP), AX + SBBQ s6-56(SP), DX + SBBQ s7-64(SP), CX + SBBQ s8-72(SP), BX + SBBQ s9-80(SP), SI + SBBQ s10-88(SP), DI + JCC l17 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l17: + MOVQ AX, s11-96(SP) + MOVQ DX, s12-104(SP) + MOVQ CX, s13-112(SP) + MOVQ BX, s14-120(SP) + MOVQ SI, s15-128(SP) + MOVQ DI, s16-136(SP) + +l4: + MOVQ R9, R8 + SUBQ $1, R8 + ORQ R10, R8 + ORQ R11, R8 + ORQ R12, R8 + ORQ R13, R8 + ORQ R14, R8 + JEQ l5 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ s3-32(SP), SI + MOVQ s4-40(SP), DI + MOVQ AX, R8 + SUBQ $1, R8 + JNE l2 + ORQ DX, R8 + ORQ CX, R8 + ORQ BX, R8 + ORQ SI, R8 + ORQ DI, R8 + JEQ l6 + JMP l2 + +l5: + MOVQ res+0(FP), R8 + MOVQ s5-48(SP), AX + MOVQ s6-56(SP), DX + MOVQ s7-64(SP), CX + MOVQ s8-72(SP), BX + MOVQ s9-80(SP), SI + MOVQ s10-88(SP), DI + MOVQ AX, 0(R8) + MOVQ DX, 8(R8) + MOVQ CX, 16(R8) + MOVQ BX, 24(R8) + MOVQ SI, 32(R8) + MOVQ DI, 40(R8) + RET + +l6: + MOVQ res+0(FP), R8 + MOVQ s11-96(SP), AX + MOVQ s12-104(SP), DX + MOVQ s13-112(SP), CX + MOVQ s14-120(SP), BX + MOVQ s15-128(SP), SI + MOVQ s16-136(SP), DI + MOVQ AX, 0(R8) + MOVQ DX, 8(R8) + MOVQ CX, 16(R8) + MOVQ BX, 24(R8) + MOVQ SI, 32(R8) + MOVQ DI, 40(R8) + RET + +l7: + MOVQ res+0(FP), R8 + MOVQ $0, 0(R8) + MOVQ $0, 8(R8) + MOVQ $0, 16(R8) + MOVQ $0, 24(R8) + MOVQ $0, 32(R8) + MOVQ $0, 40(R8) + RET diff --git a/ecc/bls12-381/fp/element_ops_noasm.go b/ecc/bls12-381/fp/element_ops_noasm.go 
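(The generic fallback dispatched to below, `_inverseGeneric`, repeatedly halves u, v, r and s with the chained-shift pattern introduced above. For reference, a minimal standalone Go sketch of that limb-wise halving, shown for 4 limbs; the helper name `halve` is illustrative and not part of the patch:)

	// halve shifts a little-endian 4x64-bit value right by one bit:
	// each limb takes its new top bit from the low bit of the limb above it.
	func halve(z *[4]uint64) {
		z[0] = z[0]>>1 | z[1]<<63
		z[1] = z[1]>>1 | z[2]<<63
		z[2] = z[2]>>1 | z[3]<<63
		z[3] >>= 1
	}

(The 6-limb fp variants extend the same chain through z[5].)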
index fec628918..48d55e2ea 100644 --- a/ecc/bls12-381/fp/element_ops_noasm.go +++ b/ecc/bls12-381/fp/element_ops_noasm.go @@ -51,6 +51,10 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } +func inverse(z, x *Element) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bls12-381/fr/element.go b/ecc/bls12-381/fr/element.go index 9ac36b9a0..e077d84b8 100644 --- a/ecc/bls12-381/fr/element.go +++ b/ecc/bls12-381/fr/element.go @@ -573,6 +573,21 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -830,8 +845,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -855,23 +879,18 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[3] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 v[3] >>= 1 - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = (v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -884,31 +903,21 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[3] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 s[3] >>= 1 - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[3] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 u[3] >>= 1 - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 = u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -921,16 +930,11 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[3] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 r[3] >>= 1 - t = t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = (r[0] >> 1) | t } @@ -985,10 +989,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bls12-381/fr/element_ops_amd64.go b/ecc/bls12-381/fr/element_ops_amd64.go index 78022b3e6..9ebabc26a 100644 --- a/ecc/bls12-381/fr/element_ops_amd64.go +++ 
b/ecc/bls12-381/fr/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bls12-381/fr/element_ops_amd64.s b/ecc/bls12-381/fr/element_ops_amd64.s index d385629f1..e3b75120b 100644 --- a/ecc/bls12-381/fr/element_ops_amd64.s +++ b/ecc/bls12-381/fr/element_ops_amd64.s @@ -338,3 +338,268 @@ TEXT ·Butterfly(SB), NOSPLIT, $0-16 MOVQ SI, 16(AX) MOVQ DI, 24(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $56-16 + // u = q + // u[0] -> DI + // u[1] -> R8 + // u[2] -> R9 + // u[3] -> R10 + MOVQ q<>+0(SB), DI + MOVQ q<>+8(SB), R8 + MOVQ q<>+16(SB), R9 + MOVQ q<>+24(SB), R10 + + // s = r^2 + // s[0] -> s3-32(SP) + // s[1] -> s4-40(SP) + // s[2] -> s5-48(SP) + // s[3] -> s6-56(SP) + MOVQ $0xc999e990f3f29c6d, SI + MOVQ SI, s3-32(SP) + MOVQ $0x2b6cedcb87925c23, SI + MOVQ SI, s4-40(SP) + MOVQ $0x05d314967254398f, SI + MOVQ SI, s5-48(SP) + MOVQ $0x0748d9d99f59ff11, SI + MOVQ SI, s6-56(SP) + + // v = x + // v[0] -> R11 + // v[1] -> R12 + // v[2] -> R13 + // v[3] -> R14 + MOVQ x+8(FP), SI + MOVQ 0(SI), AX + MOVQ 8(SI), DX + MOVQ 16(SI), CX + MOVQ 24(SI), BX + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + + // if x is 0, returns 0 + MOVQ AX, SI + ORQ DX, SI + ORQ CX, SI + ORQ BX, SI + JEQ l7 + + // r = 0 + // r[0] -> R15 + // r[1] -> s0-8(SP) + // r[2] -> s1-16(SP) + // r[3] -> s2-24(SP) + MOVQ $0, R15 + MOVQ $0, s0-8(SP) + MOVQ $0, s1-16(SP) + MOVQ $0, s2-24(SP) + +l2: + BTQ $0, AX + JCS l8 + MOVQ $0, BP + XORQ SI, SI + +l9: + INCQ BP + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + BTQ $0, AX + JCC l9 + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + +l10: + BTQ $0, AX + JCC l11 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l11: + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + DECQ BP + JNE l10 + MOVQ AX, s3-32(SP) + MOVQ DX, s4-40(SP) + MOVQ CX, s5-48(SP) + MOVQ BX, s6-56(SP) + +l8: + MOVQ DI, AX + MOVQ R8, DX + MOVQ R9, CX + MOVQ R10, BX + BTQ $0, AX + JCS l12 + MOVQ $0, BP + XORQ SI, SI + +l13: + INCQ BP + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + BTQ $0, AX + JCC l13 + MOVQ AX, DI + MOVQ DX, R8 + MOVQ CX, R9 + MOVQ BX, R10 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + +l14: + BTQ $0, AX + JCC l15 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l15: + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + DECQ BP + JNE l14 + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + +l12: + // v = v - u + MOVQ R11, AX + MOVQ R12, DX + MOVQ R13, CX + MOVQ R14, BX + SUBQ DI, AX + SBBQ R8, DX + SBBQ R9, CX + SBBQ R10, BX + JCC l3 + SUBQ R11, DI + SBBQ R12, R8 + SBBQ R13, R9 + SBBQ R14, R10 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + SUBQ s3-32(SP), AX + SBBQ s4-40(SP), DX + SBBQ s5-48(SP), CX + SBBQ s6-56(SP), BX + JCC l16 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l16: + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + JMP l4 + +l3: + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + SUBQ 
R15, AX + SBBQ s0-8(SP), DX + SBBQ s1-16(SP), CX + SBBQ s2-24(SP), BX + JCC l17 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l17: + MOVQ AX, s3-32(SP) + MOVQ DX, s4-40(SP) + MOVQ CX, s5-48(SP) + MOVQ BX, s6-56(SP) + +l4: + MOVQ DI, SI + SUBQ $1, SI + ORQ R8, SI + ORQ R9, SI + ORQ R10, SI + JEQ l5 + MOVQ R11, AX + MOVQ R12, DX + MOVQ R13, CX + MOVQ R14, BX + MOVQ AX, SI + SUBQ $1, SI + JNE l2 + ORQ DX, SI + ORQ CX, SI + ORQ BX, SI + JEQ l6 + JMP l2 + +l5: + MOVQ res+0(FP), SI + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ AX, 0(SI) + MOVQ DX, 8(SI) + MOVQ CX, 16(SI) + MOVQ BX, 24(SI) + RET + +l6: + MOVQ res+0(FP), SI + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + MOVQ AX, 0(SI) + MOVQ DX, 8(SI) + MOVQ CX, 16(SI) + MOVQ BX, 24(SI) + RET + +l7: + MOVQ res+0(FP), SI + MOVQ $0, 0(SI) + MOVQ $0, 8(SI) + MOVQ $0, 16(SI) + MOVQ $0, 24(SI) + RET diff --git a/ecc/bls12-381/fr/element_ops_noasm.go b/ecc/bls12-381/fr/element_ops_noasm.go index ec1fac18d..006365daa 100644 --- a/ecc/bls12-381/fr/element_ops_noasm.go +++ b/ecc/bls12-381/fr/element_ops_noasm.go @@ -51,6 +51,10 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } +func inverse(z, x *Element) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bls24-315/fp/element.go b/ecc/bls24-315/fp/element.go index a30829da9..79c15ad72 100644 --- a/ecc/bls24-315/fp/element.go +++ b/ecc/bls24-315/fp/element.go @@ -633,6 +633,24 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[4] != 0 { + return 256 + bits.Len64(z[4]) + } + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -893,8 +911,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -920,26 +947,19 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[4] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 + v[3] = v[3]>>1 | v[4]<<63 v[4] >>= 1 - t = t2 - t2 = v[3] << 63 - v[3] = (v[3] >> 1) | t - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = (v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -953,37 +973,23 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[4] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 + s[3] = s[3]>>1 | s[4]<<63 s[4] >>= 1 - t = 
t2 - t2 = s[3] << 63 - s[3] = (s[3] >> 1) | t - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[4] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 + u[3] = u[3]>>1 | u[4]<<63 u[4] >>= 1 - t = t2 - t2 = u[3] << 63 - u[3] = (u[3] >> 1) | t - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 = u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -997,19 +1003,12 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[4] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 + r[3] = r[3]>>1 | r[4]<<63 r[4] >>= 1 - t = t2 - t2 = r[3] << 63 - r[3] = (r[3] >> 1) | t - t = t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = (r[0] >> 1) | t } @@ -1070,10 +1069,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[4]|u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[4]|v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bls24-315/fp/element_ops_amd64.go b/ecc/bls24-315/fp/element_ops_amd64.go index 73a3711ec..d61412bd6 100644 --- a/ecc/bls24-315/fp/element_ops_amd64.go +++ b/ecc/bls24-315/fp/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bls24-315/fp/element_ops_amd64.s b/ecc/bls24-315/fp/element_ops_amd64.s index ab0ced516..354d2294a 100644 --- a/ecc/bls24-315/fp/element_ops_amd64.s +++ b/ecc/bls24-315/fp/element_ops_amd64.s @@ -398,3 +398,312 @@ TEXT ·Butterfly(SB), $24-16 MOVQ DI, 24(AX) MOVQ R8, 32(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $96-16 + // u = q + // u[0] -> R8 + // u[1] -> R9 + // u[2] -> R10 + // u[3] -> R11 + // u[4] -> R12 + MOVQ q<>+0(SB), R8 + MOVQ q<>+8(SB), R9 + MOVQ q<>+16(SB), R10 + MOVQ q<>+24(SB), R11 + MOVQ q<>+32(SB), R12 + + // s = r^2 + // s[0] -> s7-64(SP) + // s[1] -> s8-72(SP) + // s[2] -> s9-80(SP) + // s[3] -> s10-88(SP) + // s[4] -> s11-96(SP) + MOVQ $0x6b817891fe329c16, DI + MOVQ DI, s7-64(SP) + MOVQ $0x599ce86eec6e2c35, DI + MOVQ DI, s8-72(SP) + MOVQ $0xc338890f540d5ad6, DI + MOVQ DI, s9-80(SP) + MOVQ $0xcc160f6924c81f32, DI + MOVQ DI, s10-88(SP) + MOVQ $0x0215d8d4607a88d5, DI + MOVQ DI, s11-96(SP) + + // v = x + // v[0] -> R13 + // v[1] -> R14 + // v[2] -> R15 + // v[3] -> s0-8(SP) + // v[4] -> s1-16(SP) + MOVQ x+8(FP), DI + MOVQ 0(DI), AX + MOVQ 8(DI), DX + MOVQ 16(DI), CX + MOVQ 24(DI), BX + MOVQ 32(DI), SI + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ CX, R15 + MOVQ BX, s0-8(SP) + MOVQ SI, s1-16(SP) + + // if x is 0, returns 0 + MOVQ AX, DI + ORQ DX, DI + ORQ CX, DI + ORQ BX, DI + ORQ SI, DI + JEQ l7 + + // r = 0 + // r[0] -> s2-24(SP) + // r[1] -> s3-32(SP) + // r[2] -> s4-40(SP) + // r[3] -> s5-48(SP) + // r[4] -> s6-56(SP) + MOVQ $0, s2-24(SP) + MOVQ $0, s3-32(SP) + MOVQ $0, s4-40(SP) + MOVQ $0, s5-48(SP) + MOVQ $0, s6-56(SP) + +l2: + BTQ $0, AX + JCS l8 + MOVQ $0, BP + XORQ DI, DI + +l9: + INCQ BP + SHRQ $1, AX, DI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, SI + BTQ $0, AX + JCC l9 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ CX, R15 + MOVQ BX, s0-8(SP) + MOVQ SI, s1-16(SP) + MOVQ s7-64(SP), AX + MOVQ s8-72(SP), DX + MOVQ s9-80(SP), CX + MOVQ s10-88(SP), 
BX + MOVQ s11-96(SP), SI + +l10: + BTQ $0, AX + JCC l11 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + +l11: + SHRQ $1, AX, DI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, SI + DECQ BP + JNE l10 + MOVQ AX, s7-64(SP) + MOVQ DX, s8-72(SP) + MOVQ CX, s9-80(SP) + MOVQ BX, s10-88(SP) + MOVQ SI, s11-96(SP) + +l8: + MOVQ R8, AX + MOVQ R9, DX + MOVQ R10, CX + MOVQ R11, BX + MOVQ R12, SI + BTQ $0, AX + JCS l12 + MOVQ $0, BP + XORQ DI, DI + +l13: + INCQ BP + SHRQ $1, AX, DI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, SI + BTQ $0, AX + JCC l13 + MOVQ AX, R8 + MOVQ DX, R9 + MOVQ CX, R10 + MOVQ BX, R11 + MOVQ SI, R12 + MOVQ s2-24(SP), AX + MOVQ s3-32(SP), DX + MOVQ s4-40(SP), CX + MOVQ s5-48(SP), BX + MOVQ s6-56(SP), SI + +l14: + BTQ $0, AX + JCC l15 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + +l15: + SHRQ $1, AX, DI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, SI + DECQ BP + JNE l14 + MOVQ AX, s2-24(SP) + MOVQ DX, s3-32(SP) + MOVQ CX, s4-40(SP) + MOVQ BX, s5-48(SP) + MOVQ SI, s6-56(SP) + +l12: + // v = v - u + MOVQ R13, AX + MOVQ R14, DX + MOVQ R15, CX + MOVQ s0-8(SP), BX + MOVQ s1-16(SP), SI + SUBQ R8, AX + SBBQ R9, DX + SBBQ R10, CX + SBBQ R11, BX + SBBQ R12, SI + JCC l3 + SUBQ R13, R8 + SBBQ R14, R9 + SBBQ R15, R10 + SBBQ s0-8(SP), R11 + SBBQ s1-16(SP), R12 + MOVQ s2-24(SP), AX + MOVQ s3-32(SP), DX + MOVQ s4-40(SP), CX + MOVQ s5-48(SP), BX + MOVQ s6-56(SP), SI + SUBQ s7-64(SP), AX + SBBQ s8-72(SP), DX + SBBQ s9-80(SP), CX + SBBQ s10-88(SP), BX + SBBQ s11-96(SP), SI + JCC l16 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + +l16: + MOVQ AX, s2-24(SP) + MOVQ DX, s3-32(SP) + MOVQ CX, s4-40(SP) + MOVQ BX, s5-48(SP) + MOVQ SI, s6-56(SP) + JMP l4 + +l3: + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ CX, R15 + MOVQ BX, s0-8(SP) + MOVQ SI, s1-16(SP) + MOVQ s7-64(SP), AX + MOVQ s8-72(SP), DX + MOVQ s9-80(SP), CX + MOVQ s10-88(SP), BX + MOVQ s11-96(SP), SI + SUBQ s2-24(SP), AX + SBBQ s3-32(SP), DX + SBBQ s4-40(SP), CX + SBBQ s5-48(SP), BX + SBBQ s6-56(SP), SI + JCC l17 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + +l17: + MOVQ AX, s7-64(SP) + MOVQ DX, s8-72(SP) + MOVQ CX, s9-80(SP) + MOVQ BX, s10-88(SP) + MOVQ SI, s11-96(SP) + +l4: + MOVQ R8, DI + SUBQ $1, DI + ORQ R9, DI + ORQ R10, DI + ORQ R11, DI + ORQ R12, DI + JEQ l5 + MOVQ R13, AX + MOVQ R14, DX + MOVQ R15, CX + MOVQ s0-8(SP), BX + MOVQ s1-16(SP), SI + MOVQ AX, DI + SUBQ $1, DI + JNE l2 + ORQ DX, DI + ORQ CX, DI + ORQ BX, DI + ORQ SI, DI + JEQ l6 + JMP l2 + +l5: + MOVQ res+0(FP), DI + MOVQ s2-24(SP), AX + MOVQ s3-32(SP), DX + MOVQ s4-40(SP), CX + MOVQ s5-48(SP), BX + MOVQ s6-56(SP), SI + MOVQ AX, 0(DI) + MOVQ DX, 8(DI) + MOVQ CX, 16(DI) + MOVQ BX, 24(DI) + MOVQ SI, 32(DI) + RET + +l6: + MOVQ res+0(FP), DI + MOVQ s7-64(SP), AX + MOVQ s8-72(SP), DX + MOVQ s9-80(SP), CX + MOVQ s10-88(SP), BX + MOVQ s11-96(SP), SI + MOVQ AX, 0(DI) + MOVQ DX, 8(DI) + MOVQ CX, 16(DI) + MOVQ BX, 24(DI) + MOVQ SI, 32(DI) + RET + +l7: + MOVQ res+0(FP), DI + MOVQ $0, 0(DI) + MOVQ $0, 8(DI) + MOVQ $0, 16(DI) + MOVQ $0, 24(DI) + MOVQ $0, 32(DI) + RET diff --git a/ecc/bls24-315/fp/element_ops_noasm.go b/ecc/bls24-315/fp/element_ops_noasm.go index fec628918..48d55e2ea 100644 --- a/ecc/bls24-315/fp/element_ops_noasm.go +++ 
b/ecc/bls24-315/fp/element_ops_noasm.go @@ -51,6 +51,10 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } +func inverse(z, x *Element) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bls24-315/fr/element.go b/ecc/bls24-315/fr/element.go index d5de5ca59..726a02f32 100644 --- a/ecc/bls24-315/fr/element.go +++ b/ecc/bls24-315/fr/element.go @@ -573,6 +573,21 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -830,8 +845,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -855,23 +879,18 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[3] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 v[3] >>= 1 - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = (v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -884,31 +903,21 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[3] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 s[3] >>= 1 - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[3] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 u[3] >>= 1 - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 = u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -921,16 +930,11 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[3] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 r[3] >>= 1 - t = t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = (r[0] >> 1) | t } @@ -985,10 +989,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bls24-315/fr/element_ops_amd64.go b/ecc/bls24-315/fr/element_ops_amd64.go index 78022b3e6..9ebabc26a 100644 --- a/ecc/bls24-315/fr/element_ops_amd64.go +++ b/ecc/bls24-315/fr/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func 
Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bls24-315/fr/element_ops_amd64.s b/ecc/bls24-315/fr/element_ops_amd64.s index 102b8883b..2b915f270 100644 --- a/ecc/bls24-315/fr/element_ops_amd64.s +++ b/ecc/bls24-315/fr/element_ops_amd64.s @@ -338,3 +338,268 @@ TEXT ·Butterfly(SB), NOSPLIT, $0-16 MOVQ SI, 16(AX) MOVQ DI, 24(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $56-16 + // u = q + // u[0] -> DI + // u[1] -> R8 + // u[2] -> R9 + // u[3] -> R10 + MOVQ q<>+0(SB), DI + MOVQ q<>+8(SB), R8 + MOVQ q<>+16(SB), R9 + MOVQ q<>+24(SB), R10 + + // s = r^2 + // s[0] -> s3-32(SP) + // s[1] -> s4-40(SP) + // s[2] -> s5-48(SP) + // s[3] -> s6-56(SP) + MOVQ $0x56a1ff2e50fc8851, SI + MOVQ SI, s3-32(SP) + MOVQ $0xeb3f198d55a12c3f, SI + MOVQ SI, s4-40(SP) + MOVQ $0x9799359271b08283, SI + MOVQ SI, s5-48(SP) + MOVQ $0x081d245007d35a5a, SI + MOVQ SI, s6-56(SP) + + // v = x + // v[0] -> R11 + // v[1] -> R12 + // v[2] -> R13 + // v[3] -> R14 + MOVQ x+8(FP), SI + MOVQ 0(SI), AX + MOVQ 8(SI), DX + MOVQ 16(SI), CX + MOVQ 24(SI), BX + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + + // if x is 0, returns 0 + MOVQ AX, SI + ORQ DX, SI + ORQ CX, SI + ORQ BX, SI + JEQ l7 + + // r = 0 + // r[0] -> R15 + // r[1] -> s0-8(SP) + // r[2] -> s1-16(SP) + // r[3] -> s2-24(SP) + MOVQ $0, R15 + MOVQ $0, s0-8(SP) + MOVQ $0, s1-16(SP) + MOVQ $0, s2-24(SP) + +l2: + BTQ $0, AX + JCS l8 + MOVQ $0, BP + XORQ SI, SI + +l9: + INCQ BP + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + BTQ $0, AX + JCC l9 + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + +l10: + BTQ $0, AX + JCC l11 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l11: + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + DECQ BP + JNE l10 + MOVQ AX, s3-32(SP) + MOVQ DX, s4-40(SP) + MOVQ CX, s5-48(SP) + MOVQ BX, s6-56(SP) + +l8: + MOVQ DI, AX + MOVQ R8, DX + MOVQ R9, CX + MOVQ R10, BX + BTQ $0, AX + JCS l12 + MOVQ $0, BP + XORQ SI, SI + +l13: + INCQ BP + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + BTQ $0, AX + JCC l13 + MOVQ AX, DI + MOVQ DX, R8 + MOVQ CX, R9 + MOVQ BX, R10 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + +l14: + BTQ $0, AX + JCC l15 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l15: + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + DECQ BP + JNE l14 + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + +l12: + // v = v - u + MOVQ R11, AX + MOVQ R12, DX + MOVQ R13, CX + MOVQ R14, BX + SUBQ DI, AX + SBBQ R8, DX + SBBQ R9, CX + SBBQ R10, BX + JCC l3 + SUBQ R11, DI + SBBQ R12, R8 + SBBQ R13, R9 + SBBQ R14, R10 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + SUBQ s3-32(SP), AX + SBBQ s4-40(SP), DX + SBBQ s5-48(SP), CX + SBBQ s6-56(SP), BX + JCC l16 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l16: + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + JMP l4 + +l3: + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + SUBQ R15, AX + SBBQ s0-8(SP), DX + SBBQ s1-16(SP), CX + SBBQ s2-24(SP), BX + JCC l17 + ADDQ q<>+0(SB), AX + 
ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l17: + MOVQ AX, s3-32(SP) + MOVQ DX, s4-40(SP) + MOVQ CX, s5-48(SP) + MOVQ BX, s6-56(SP) + +l4: + MOVQ DI, SI + SUBQ $1, SI + ORQ R8, SI + ORQ R9, SI + ORQ R10, SI + JEQ l5 + MOVQ R11, AX + MOVQ R12, DX + MOVQ R13, CX + MOVQ R14, BX + MOVQ AX, SI + SUBQ $1, SI + JNE l2 + ORQ DX, SI + ORQ CX, SI + ORQ BX, SI + JEQ l6 + JMP l2 + +l5: + MOVQ res+0(FP), SI + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ AX, 0(SI) + MOVQ DX, 8(SI) + MOVQ CX, 16(SI) + MOVQ BX, 24(SI) + RET + +l6: + MOVQ res+0(FP), SI + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + MOVQ AX, 0(SI) + MOVQ DX, 8(SI) + MOVQ CX, 16(SI) + MOVQ BX, 24(SI) + RET + +l7: + MOVQ res+0(FP), SI + MOVQ $0, 0(SI) + MOVQ $0, 8(SI) + MOVQ $0, 16(SI) + MOVQ $0, 24(SI) + RET diff --git a/ecc/bls24-315/fr/element_ops_noasm.go b/ecc/bls24-315/fr/element_ops_noasm.go index ec1fac18d..006365daa 100644 --- a/ecc/bls24-315/fr/element_ops_noasm.go +++ b/ecc/bls24-315/fr/element_ops_noasm.go @@ -51,6 +51,10 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } +func inverse(z, x *Element) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bn254/fp/element.go b/ecc/bn254/fp/element.go index 446f369f4..1aa44e1e4 100644 --- a/ecc/bn254/fp/element.go +++ b/ecc/bn254/fp/element.go @@ -573,6 +573,21 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -778,8 +793,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -803,23 +827,18 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[3] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 v[3] >>= 1 - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = (v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -832,31 +851,21 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[3] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 s[3] >>= 1 - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[3] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 u[3] >>= 
1 - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 = u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -869,16 +878,11 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[3] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 r[3] >>= 1 - t = t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = (r[0] >> 1) | t } @@ -933,10 +937,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bn254/fp/element_ops_amd64.go b/ecc/bn254/fp/element_ops_amd64.go index 73a3711ec..d61412bd6 100644 --- a/ecc/bn254/fp/element_ops_amd64.go +++ b/ecc/bn254/fp/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bn254/fp/element_ops_amd64.s b/ecc/bn254/fp/element_ops_amd64.s index 5b8b1f0e7..bbea3d8c6 100644 --- a/ecc/bn254/fp/element_ops_amd64.s +++ b/ecc/bn254/fp/element_ops_amd64.s @@ -338,3 +338,268 @@ TEXT ·Butterfly(SB), NOSPLIT, $0-16 MOVQ SI, 16(AX) MOVQ DI, 24(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $56-16 + // u = q + // u[0] -> DI + // u[1] -> R8 + // u[2] -> R9 + // u[3] -> R10 + MOVQ q<>+0(SB), DI + MOVQ q<>+8(SB), R8 + MOVQ q<>+16(SB), R9 + MOVQ q<>+24(SB), R10 + + // s = r^2 + // s[0] -> s3-32(SP) + // s[1] -> s4-40(SP) + // s[2] -> s5-48(SP) + // s[3] -> s6-56(SP) + MOVQ $0xf32cfc5b538afa89, SI + MOVQ SI, s3-32(SP) + MOVQ $0xb5e71911d44501fb, SI + MOVQ SI, s4-40(SP) + MOVQ $0x47ab1eff0a417ff6, SI + MOVQ SI, s5-48(SP) + MOVQ $0x06d89f71cab8351f, SI + MOVQ SI, s6-56(SP) + + // v = x + // v[0] -> R11 + // v[1] -> R12 + // v[2] -> R13 + // v[3] -> R14 + MOVQ x+8(FP), SI + MOVQ 0(SI), AX + MOVQ 8(SI), DX + MOVQ 16(SI), CX + MOVQ 24(SI), BX + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + + // if x is 0, returns 0 + MOVQ AX, SI + ORQ DX, SI + ORQ CX, SI + ORQ BX, SI + JEQ l7 + + // r = 0 + // r[0] -> R15 + // r[1] -> s0-8(SP) + // r[2] -> s1-16(SP) + // r[3] -> s2-24(SP) + MOVQ $0, R15 + MOVQ $0, s0-8(SP) + MOVQ $0, s1-16(SP) + MOVQ $0, s2-24(SP) + +l2: + BTQ $0, AX + JCS l8 + MOVQ $0, BP + XORQ SI, SI + +l9: + INCQ BP + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + BTQ $0, AX + JCC l9 + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + +l10: + BTQ $0, AX + JCC l11 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l11: + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + DECQ BP + JNE l10 + MOVQ AX, s3-32(SP) + MOVQ DX, s4-40(SP) + MOVQ CX, s5-48(SP) + MOVQ BX, s6-56(SP) + +l8: + MOVQ DI, AX + MOVQ R8, DX + MOVQ R9, CX + MOVQ R10, BX + BTQ $0, AX + JCS l12 + MOVQ $0, BP + XORQ SI, SI + +l13: + INCQ BP + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + BTQ $0, AX + JCC l13 + MOVQ AX, DI + MOVQ DX, R8 + MOVQ CX, R9 + MOVQ BX, R10 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + +l14: + BTQ $0, AX + JCC l15 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l15: + SHRQ $1, AX, SI 
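+	// note: the three-operand SHRQ form is a funnel shift: the last
+	// operand is shifted right by one and its vacated top bit is filled
+	// from the middle operand, so r held in AX:DX:CX:BX moves right one
+	// bit as a single 256-bit value (cf. r[i] = r[i]>>1 | r[i+1]<<63 above)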
+ SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + DECQ BP + JNE l14 + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + +l12: + // v = v - u + MOVQ R11, AX + MOVQ R12, DX + MOVQ R13, CX + MOVQ R14, BX + SUBQ DI, AX + SBBQ R8, DX + SBBQ R9, CX + SBBQ R10, BX + JCC l3 + SUBQ R11, DI + SBBQ R12, R8 + SBBQ R13, R9 + SBBQ R14, R10 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + SUBQ s3-32(SP), AX + SBBQ s4-40(SP), DX + SBBQ s5-48(SP), CX + SBBQ s6-56(SP), BX + JCC l16 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l16: + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + JMP l4 + +l3: + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + SUBQ R15, AX + SBBQ s0-8(SP), DX + SBBQ s1-16(SP), CX + SBBQ s2-24(SP), BX + JCC l17 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l17: + MOVQ AX, s3-32(SP) + MOVQ DX, s4-40(SP) + MOVQ CX, s5-48(SP) + MOVQ BX, s6-56(SP) + +l4: + MOVQ DI, SI + SUBQ $1, SI + ORQ R8, SI + ORQ R9, SI + ORQ R10, SI + JEQ l5 + MOVQ R11, AX + MOVQ R12, DX + MOVQ R13, CX + MOVQ R14, BX + MOVQ AX, SI + SUBQ $1, SI + JNE l2 + ORQ DX, SI + ORQ CX, SI + ORQ BX, SI + JEQ l6 + JMP l2 + +l5: + MOVQ res+0(FP), SI + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ AX, 0(SI) + MOVQ DX, 8(SI) + MOVQ CX, 16(SI) + MOVQ BX, 24(SI) + RET + +l6: + MOVQ res+0(FP), SI + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + MOVQ AX, 0(SI) + MOVQ DX, 8(SI) + MOVQ CX, 16(SI) + MOVQ BX, 24(SI) + RET + +l7: + MOVQ res+0(FP), SI + MOVQ $0, 0(SI) + MOVQ $0, 8(SI) + MOVQ $0, 16(SI) + MOVQ $0, 24(SI) + RET diff --git a/ecc/bn254/fp/element_ops_noasm.go b/ecc/bn254/fp/element_ops_noasm.go index fec628918..48d55e2ea 100644 --- a/ecc/bn254/fp/element_ops_noasm.go +++ b/ecc/bn254/fp/element_ops_noasm.go @@ -51,6 +51,10 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } +func inverse(z, x *Element) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. 
mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bn254/fr/element.go b/ecc/bn254/fr/element.go index f19d0eabf..e1666826b 100644 --- a/ecc/bn254/fr/element.go +++ b/ecc/bn254/fr/element.go @@ -573,6 +573,21 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -830,8 +845,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -855,23 +879,18 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[3] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 v[3] >>= 1 - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = (v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -884,31 +903,21 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[3] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 s[3] >>= 1 - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[3] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 u[3] >>= 1 - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 = u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -921,16 +930,11 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[3] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 r[3] >>= 1 - t = t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = (r[0] >> 1) | t } @@ -985,10 +989,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bn254/fr/element_ops_amd64.go b/ecc/bn254/fr/element_ops_amd64.go index 78022b3e6..9ebabc26a 100644 --- a/ecc/bn254/fr/element_ops_amd64.go +++ b/ecc/bn254/fr/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bn254/fr/element_ops_amd64.s b/ecc/bn254/fr/element_ops_amd64.s index d5dca83d2..318af3bc9 100644 --- a/ecc/bn254/fr/element_ops_amd64.s +++ 
b/ecc/bn254/fr/element_ops_amd64.s @@ -338,3 +338,268 @@ TEXT ·Butterfly(SB), NOSPLIT, $0-16 MOVQ SI, 16(AX) MOVQ DI, 24(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $56-16 + // u = q + // u[0] -> DI + // u[1] -> R8 + // u[2] -> R9 + // u[3] -> R10 + MOVQ q<>+0(SB), DI + MOVQ q<>+8(SB), R8 + MOVQ q<>+16(SB), R9 + MOVQ q<>+24(SB), R10 + + // s = r^2 + // s[0] -> s3-32(SP) + // s[1] -> s4-40(SP) + // s[2] -> s5-48(SP) + // s[3] -> s6-56(SP) + MOVQ $0x1bb8e645ae216da7, SI + MOVQ SI, s3-32(SP) + MOVQ $0x53fe3ab1e35c59e3, SI + MOVQ SI, s4-40(SP) + MOVQ $0x8c49833d53bb8085, SI + MOVQ SI, s5-48(SP) + MOVQ $0x0216d0b17f4e44a5, SI + MOVQ SI, s6-56(SP) + + // v = x + // v[0] -> R11 + // v[1] -> R12 + // v[2] -> R13 + // v[3] -> R14 + MOVQ x+8(FP), SI + MOVQ 0(SI), AX + MOVQ 8(SI), DX + MOVQ 16(SI), CX + MOVQ 24(SI), BX + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + + // if x is 0, returns 0 + MOVQ AX, SI + ORQ DX, SI + ORQ CX, SI + ORQ BX, SI + JEQ l7 + + // r = 0 + // r[0] -> R15 + // r[1] -> s0-8(SP) + // r[2] -> s1-16(SP) + // r[3] -> s2-24(SP) + MOVQ $0, R15 + MOVQ $0, s0-8(SP) + MOVQ $0, s1-16(SP) + MOVQ $0, s2-24(SP) + +l2: + BTQ $0, AX + JCS l8 + MOVQ $0, BP + XORQ SI, SI + +l9: + INCQ BP + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + BTQ $0, AX + JCC l9 + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + +l10: + BTQ $0, AX + JCC l11 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l11: + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + DECQ BP + JNE l10 + MOVQ AX, s3-32(SP) + MOVQ DX, s4-40(SP) + MOVQ CX, s5-48(SP) + MOVQ BX, s6-56(SP) + +l8: + MOVQ DI, AX + MOVQ R8, DX + MOVQ R9, CX + MOVQ R10, BX + BTQ $0, AX + JCS l12 + MOVQ $0, BP + XORQ SI, SI + +l13: + INCQ BP + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + BTQ $0, AX + JCC l13 + MOVQ AX, DI + MOVQ DX, R8 + MOVQ CX, R9 + MOVQ BX, R10 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + +l14: + BTQ $0, AX + JCC l15 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l15: + SHRQ $1, AX, SI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, BX + DECQ BP + JNE l14 + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + +l12: + // v = v - u + MOVQ R11, AX + MOVQ R12, DX + MOVQ R13, CX + MOVQ R14, BX + SUBQ DI, AX + SBBQ R8, DX + SBBQ R9, CX + SBBQ R10, BX + JCC l3 + SUBQ R11, DI + SBBQ R12, R8 + SBBQ R13, R9 + SBBQ R14, R10 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + SUBQ s3-32(SP), AX + SBBQ s4-40(SP), DX + SBBQ s5-48(SP), CX + SBBQ s6-56(SP), BX + JCC l16 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l16: + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + JMP l4 + +l3: + MOVQ AX, R11 + MOVQ DX, R12 + MOVQ CX, R13 + MOVQ BX, R14 + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + SUBQ R15, AX + SBBQ s0-8(SP), DX + SBBQ s1-16(SP), CX + SBBQ s2-24(SP), BX + JCC l17 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + +l17: + MOVQ AX, s3-32(SP) + MOVQ DX, s4-40(SP) + MOVQ CX, s5-48(SP) + MOVQ BX, s6-56(SP) + +l4: + MOVQ DI, SI + SUBQ $1, SI + ORQ R8, SI + ORQ R9, SI + ORQ R10, SI + JEQ l5 + MOVQ 
R11, AX + MOVQ R12, DX + MOVQ R13, CX + MOVQ R14, BX + MOVQ AX, SI + SUBQ $1, SI + JNE l2 + ORQ DX, SI + ORQ CX, SI + ORQ BX, SI + JEQ l6 + JMP l2 + +l5: + MOVQ res+0(FP), SI + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ AX, 0(SI) + MOVQ DX, 8(SI) + MOVQ CX, 16(SI) + MOVQ BX, 24(SI) + RET + +l6: + MOVQ res+0(FP), SI + MOVQ s3-32(SP), AX + MOVQ s4-40(SP), DX + MOVQ s5-48(SP), CX + MOVQ s6-56(SP), BX + MOVQ AX, 0(SI) + MOVQ DX, 8(SI) + MOVQ CX, 16(SI) + MOVQ BX, 24(SI) + RET + +l7: + MOVQ res+0(FP), SI + MOVQ $0, 0(SI) + MOVQ $0, 8(SI) + MOVQ $0, 16(SI) + MOVQ $0, 24(SI) + RET diff --git a/ecc/bn254/fr/element_ops_noasm.go b/ecc/bn254/fr/element_ops_noasm.go index ec1fac18d..006365daa 100644 --- a/ecc/bn254/fr/element_ops_noasm.go +++ b/ecc/bn254/fr/element_ops_noasm.go @@ -51,6 +51,10 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } +func inverse(z, x *Element) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bw6-633/fp/element.go b/ecc/bw6-633/fp/element.go index 0647ac87a..398f6b339 100644 --- a/ecc/bw6-633/fp/element.go +++ b/ecc/bw6-633/fp/element.go @@ -1023,6 +1023,39 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[9] != 0 { + return 576 + bits.Len64(z[9]) + } + if z[8] != 0 { + return 512 + bits.Len64(z[8]) + } + if z[7] != 0 { + return 448 + bits.Len64(z[7]) + } + if z[6] != 0 { + return 384 + bits.Len64(z[6]) + } + if z[5] != 0 { + return 320 + bits.Len64(z[5]) + } + if z[4] != 0 { + return 256 + bits.Len64(z[4]) + } + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -1248,8 +1281,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -1285,41 +1327,24 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[9] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 + v[3] = v[3]>>1 | v[4]<<63 + v[4] = v[4]>>1 | v[5]<<63 + v[5] = v[5]>>1 | v[6]<<63 + v[6] = v[6]>>1 | v[7]<<63 + v[7] = v[7]>>1 | v[8]<<63 + v[8] = v[8]>>1 | v[9]<<63 v[9] >>= 1 - t = t2 - t2 = v[8] << 63 - v[8] = (v[8] >> 1) | t - t = t2 - t2 = v[7] << 63 - v[7] = (v[7] >> 1) | t - t = t2 - t2 = v[6] << 63 - v[6] = (v[6] >> 1) | t - t = t2 - t2 = v[5] << 63 - v[5] = (v[5] >> 1) | t - t = t2 - t2 = v[4] << 63 - v[4] = (v[4] >> 1) | t - t = t2 - t2 = v[3] << 63 - v[3] = (v[3] >> 1) | t - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = 
(v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -1338,67 +1363,33 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[9] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 + s[3] = s[3]>>1 | s[4]<<63 + s[4] = s[4]>>1 | s[5]<<63 + s[5] = s[5]>>1 | s[6]<<63 + s[6] = s[6]>>1 | s[7]<<63 + s[7] = s[7]>>1 | s[8]<<63 + s[8] = s[8]>>1 | s[9]<<63 s[9] >>= 1 - t = t2 - t2 = s[8] << 63 - s[8] = (s[8] >> 1) | t - t = t2 - t2 = s[7] << 63 - s[7] = (s[7] >> 1) | t - t = t2 - t2 = s[6] << 63 - s[6] = (s[6] >> 1) | t - t = t2 - t2 = s[5] << 63 - s[5] = (s[5] >> 1) | t - t = t2 - t2 = s[4] << 63 - s[4] = (s[4] >> 1) | t - t = t2 - t2 = s[3] << 63 - s[3] = (s[3] >> 1) | t - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[9] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 + u[3] = u[3]>>1 | u[4]<<63 + u[4] = u[4]>>1 | u[5]<<63 + u[5] = u[5]>>1 | u[6]<<63 + u[6] = u[6]>>1 | u[7]<<63 + u[7] = u[7]>>1 | u[8]<<63 + u[8] = u[8]>>1 | u[9]<<63 u[9] >>= 1 - t = t2 - t2 = u[8] << 63 - u[8] = (u[8] >> 1) | t - t = t2 - t2 = u[7] << 63 - u[7] = (u[7] >> 1) | t - t = t2 - t2 = u[6] << 63 - u[6] = (u[6] >> 1) | t - t = t2 - t2 = u[5] << 63 - u[5] = (u[5] >> 1) | t - t = t2 - t2 = u[4] << 63 - u[4] = (u[4] >> 1) | t - t = t2 - t2 = u[3] << 63 - u[3] = (u[3] >> 1) | t - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 = u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -1417,34 +1408,17 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[9] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 + r[3] = r[3]>>1 | r[4]<<63 + r[4] = r[4]>>1 | r[5]<<63 + r[5] = r[5]>>1 | r[6]<<63 + r[6] = r[6]>>1 | r[7]<<63 + r[7] = r[7]>>1 | r[8]<<63 + r[8] = r[8]>>1 | r[9]<<63 r[9] >>= 1 - t = t2 - t2 = r[8] << 63 - r[8] = (r[8] >> 1) | t - t = t2 - t2 = r[7] << 63 - r[7] = (r[7] >> 1) | t - t = t2 - t2 = r[6] << 63 - r[6] = (r[6] >> 1) | t - t = t2 - t2 = r[5] << 63 - r[5] = (r[5] >> 1) | t - t = t2 - t2 = r[4] << 63 - r[4] = (r[4] >> 1) | t - t = t2 - t2 = r[3] << 63 - r[3] = (r[3] >> 1) | t - t = t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = (r[0] >> 1) | t } @@ -1535,10 +1509,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[9]|u[8]|u[7]|u[6]|u[5]|u[4]|u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[9]|v[8]|v[7]|v[6]|v[5]|v[4]|v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bw6-633/fp/element_ops_amd64.go b/ecc/bw6-633/fp/element_ops_amd64.go index 73a3711ec..d61412bd6 100644 --- a/ecc/bw6-633/fp/element_ops_amd64.go +++ b/ecc/bw6-633/fp/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bw6-633/fp/element_ops_amd64.s b/ecc/bw6-633/fp/element_ops_amd64.s index bb8d0be3b..78f5179d2 100644 --- a/ecc/bw6-633/fp/element_ops_amd64.s +++ b/ecc/bw6-633/fp/element_ops_amd64.s @@ -644,3 +644,571 @@ l3: MOVQ R11, 64(AX) MOVQ R12, 72(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $296-16 + // u = q + // u[0] -> R13 + // u[1] -> R14 + // u[2] -> R15 + // u[3] -> s0-8(SP) + 
// u[4] -> s1-16(SP) + // u[5] -> s2-24(SP) + // u[6] -> s3-32(SP) + // u[7] -> s4-40(SP) + // u[8] -> s5-48(SP) + // u[9] -> s6-56(SP) + MOVQ q<>+0(SB), R12 + MOVQ R12, R13 + MOVQ q<>+8(SB), R12 + MOVQ R12, R14 + MOVQ q<>+16(SB), R12 + MOVQ R12, R15 + MOVQ q<>+24(SB), R12 + MOVQ R12, s0-8(SP) + MOVQ q<>+32(SB), R12 + MOVQ R12, s1-16(SP) + MOVQ q<>+40(SB), R12 + MOVQ R12, s2-24(SP) + MOVQ q<>+48(SB), R12 + MOVQ R12, s3-32(SP) + MOVQ q<>+56(SB), R12 + MOVQ R12, s4-40(SP) + MOVQ q<>+64(SB), R12 + MOVQ R12, s5-48(SP) + MOVQ q<>+72(SB), R12 + MOVQ R12, s6-56(SP) + + // s = r^2 + // s[0] -> s27-224(SP) + // s[1] -> s28-232(SP) + // s[2] -> s29-240(SP) + // s[3] -> s30-248(SP) + // s[4] -> s31-256(SP) + // s[5] -> s32-264(SP) + // s[6] -> s33-272(SP) + // s[7] -> s34-280(SP) + // s[8] -> s35-288(SP) + // s[9] -> s36-296(SP) + MOVQ $0x661e804ca9d73f4c, R12 + MOVQ R12, s27-224(SP) + MOVQ $0xc8097534c70cdf8b, R12 + MOVQ R12, s28-232(SP) + MOVQ $0xe6a4436c7d9c2a0b, R12 + MOVQ R12, s29-240(SP) + MOVQ $0x0a8eade777742a9e, R12 + MOVQ R12, s30-248(SP) + MOVQ $0xb0fc02b996feedd8, R12 + MOVQ R12, s31-256(SP) + MOVQ $0xba4fdbddeb83543a, R12 + MOVQ R12, s32-264(SP) + MOVQ $0xaebec1921b2490f8, R12 + MOVQ R12, s33-272(SP) + MOVQ $0xd4af2c0e74212f40, R12 + MOVQ R12, s34-280(SP) + MOVQ $0x499179a8fa1cce12, R12 + MOVQ R12, s35-288(SP) + MOVQ $0x007da75a34ab397a, R12 + MOVQ R12, s36-296(SP) + + // v = x + // v[0] -> s7-64(SP) + // v[1] -> s8-72(SP) + // v[2] -> s9-80(SP) + // v[3] -> s10-88(SP) + // v[4] -> s11-96(SP) + // v[5] -> s12-104(SP) + // v[6] -> s13-112(SP) + // v[7] -> s14-120(SP) + // v[8] -> s15-128(SP) + // v[9] -> s16-136(SP) + MOVQ x+8(FP), R12 + MOVQ 0(R12), AX + MOVQ 8(R12), DX + MOVQ 16(R12), CX + MOVQ 24(R12), BX + MOVQ 32(R12), SI + MOVQ 40(R12), DI + MOVQ 48(R12), R8 + MOVQ 56(R12), R9 + MOVQ 64(R12), R10 + MOVQ 72(R12), R11 + MOVQ AX, s7-64(SP) + MOVQ DX, s8-72(SP) + MOVQ CX, s9-80(SP) + MOVQ BX, s10-88(SP) + MOVQ SI, s11-96(SP) + MOVQ DI, s12-104(SP) + MOVQ R8, s13-112(SP) + MOVQ R9, s14-120(SP) + MOVQ R10, s15-128(SP) + MOVQ R11, s16-136(SP) + + // if x is 0, returns 0 + MOVQ AX, R12 + ORQ DX, R12 + ORQ CX, R12 + ORQ BX, R12 + ORQ SI, R12 + ORQ DI, R12 + ORQ R8, R12 + ORQ R9, R12 + ORQ R10, R12 + ORQ R11, R12 + JEQ l9 + + // r = 0 + // r[0] -> s17-144(SP) + // r[1] -> s18-152(SP) + // r[2] -> s19-160(SP) + // r[3] -> s20-168(SP) + // r[4] -> s21-176(SP) + // r[5] -> s22-184(SP) + // r[6] -> s23-192(SP) + // r[7] -> s24-200(SP) + // r[8] -> s25-208(SP) + // r[9] -> s26-216(SP) + MOVQ $0, s17-144(SP) + MOVQ $0, s18-152(SP) + MOVQ $0, s19-160(SP) + MOVQ $0, s20-168(SP) + MOVQ $0, s21-176(SP) + MOVQ $0, s22-184(SP) + MOVQ $0, s23-192(SP) + MOVQ $0, s24-200(SP) + MOVQ $0, s25-208(SP) + MOVQ $0, s26-216(SP) + +l4: + BTQ $0, AX + JCS l10 + MOVQ $0, BP + XORQ R12, R12 + +l11: + INCQ BP + SHRQ $1, AX, R12 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, R8, DI + SHRQ $1, R9, R8 + SHRQ $1, R10, R9 + SHRQ $1, R11, R10 + SHRQ $1, R11 + BTQ $0, AX + JCC l11 + MOVQ AX, s7-64(SP) + MOVQ DX, s8-72(SP) + MOVQ CX, s9-80(SP) + MOVQ BX, s10-88(SP) + MOVQ SI, s11-96(SP) + MOVQ DI, s12-104(SP) + MOVQ R8, s13-112(SP) + MOVQ R9, s14-120(SP) + MOVQ R10, s15-128(SP) + MOVQ R11, s16-136(SP) + MOVQ s27-224(SP), AX + MOVQ s28-232(SP), DX + MOVQ s29-240(SP), CX + MOVQ s30-248(SP), BX + MOVQ s31-256(SP), SI + MOVQ s32-264(SP), DI + MOVQ s33-272(SP), R8 + MOVQ s34-280(SP), R9 + MOVQ s35-288(SP), R10 + MOVQ s36-296(SP), R11 + +l12: + BTQ $0, AX + JCC l13 + ADDQ 
q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + ADCQ q<>+48(SB), R8 + ADCQ q<>+56(SB), R9 + ADCQ q<>+64(SB), R10 + ADCQ q<>+72(SB), R11 + +l13: + SHRQ $1, AX, R12 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, R8, DI + SHRQ $1, R9, R8 + SHRQ $1, R10, R9 + SHRQ $1, R11, R10 + SHRQ $1, R11 + DECQ BP + JNE l12 + MOVQ AX, s27-224(SP) + MOVQ DX, s28-232(SP) + MOVQ CX, s29-240(SP) + MOVQ BX, s30-248(SP) + MOVQ SI, s31-256(SP) + MOVQ DI, s32-264(SP) + MOVQ R8, s33-272(SP) + MOVQ R9, s34-280(SP) + MOVQ R10, s35-288(SP) + MOVQ R11, s36-296(SP) + +l10: + MOVQ R13, AX + MOVQ R14, DX + MOVQ R15, CX + MOVQ s0-8(SP), BX + MOVQ s1-16(SP), SI + MOVQ s2-24(SP), DI + MOVQ s3-32(SP), R8 + MOVQ s4-40(SP), R9 + MOVQ s5-48(SP), R10 + MOVQ s6-56(SP), R11 + BTQ $0, AX + JCS l14 + MOVQ $0, BP + XORQ R12, R12 + +l15: + INCQ BP + SHRQ $1, AX, R12 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, R8, DI + SHRQ $1, R9, R8 + SHRQ $1, R10, R9 + SHRQ $1, R11, R10 + SHRQ $1, R11 + BTQ $0, AX + JCC l15 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ CX, R15 + MOVQ BX, s0-8(SP) + MOVQ SI, s1-16(SP) + MOVQ DI, s2-24(SP) + MOVQ R8, s3-32(SP) + MOVQ R9, s4-40(SP) + MOVQ R10, s5-48(SP) + MOVQ R11, s6-56(SP) + MOVQ s17-144(SP), AX + MOVQ s18-152(SP), DX + MOVQ s19-160(SP), CX + MOVQ s20-168(SP), BX + MOVQ s21-176(SP), SI + MOVQ s22-184(SP), DI + MOVQ s23-192(SP), R8 + MOVQ s24-200(SP), R9 + MOVQ s25-208(SP), R10 + MOVQ s26-216(SP), R11 + +l16: + BTQ $0, AX + JCC l17 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + ADCQ q<>+48(SB), R8 + ADCQ q<>+56(SB), R9 + ADCQ q<>+64(SB), R10 + ADCQ q<>+72(SB), R11 + +l17: + SHRQ $1, AX, R12 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, R8, DI + SHRQ $1, R9, R8 + SHRQ $1, R10, R9 + SHRQ $1, R11, R10 + SHRQ $1, R11 + DECQ BP + JNE l16 + MOVQ AX, s17-144(SP) + MOVQ DX, s18-152(SP) + MOVQ CX, s19-160(SP) + MOVQ BX, s20-168(SP) + MOVQ SI, s21-176(SP) + MOVQ DI, s22-184(SP) + MOVQ R8, s23-192(SP) + MOVQ R9, s24-200(SP) + MOVQ R10, s25-208(SP) + MOVQ R11, s26-216(SP) + +l14: + // v = v - u + MOVQ s7-64(SP), AX + MOVQ s8-72(SP), DX + MOVQ s9-80(SP), CX + MOVQ s10-88(SP), BX + MOVQ s11-96(SP), SI + MOVQ s12-104(SP), DI + MOVQ s13-112(SP), R8 + MOVQ s14-120(SP), R9 + MOVQ s15-128(SP), R10 + MOVQ s16-136(SP), R11 + SUBQ R13, AX + SBBQ R14, DX + SBBQ R15, CX + SBBQ s0-8(SP), BX + SBBQ s1-16(SP), SI + SBBQ s2-24(SP), DI + SBBQ s3-32(SP), R8 + SBBQ s4-40(SP), R9 + SBBQ s5-48(SP), R10 + SBBQ s6-56(SP), R11 + JCC l5 + MOVQ R13, AX + MOVQ R14, DX + MOVQ R15, CX + MOVQ s0-8(SP), BX + MOVQ s1-16(SP), SI + MOVQ s2-24(SP), DI + MOVQ s3-32(SP), R8 + MOVQ s4-40(SP), R9 + MOVQ s5-48(SP), R10 + MOVQ s6-56(SP), R11 + SUBQ s7-64(SP), AX + SBBQ s8-72(SP), DX + SBBQ s9-80(SP), CX + SBBQ s10-88(SP), BX + SBBQ s11-96(SP), SI + SBBQ s12-104(SP), DI + SBBQ s13-112(SP), R8 + SBBQ s14-120(SP), R9 + SBBQ s15-128(SP), R10 + SBBQ s16-136(SP), R11 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ CX, R15 + MOVQ BX, s0-8(SP) + MOVQ SI, s1-16(SP) + MOVQ DI, s2-24(SP) + MOVQ R8, s3-32(SP) + MOVQ R9, s4-40(SP) + MOVQ R10, s5-48(SP) + MOVQ R11, s6-56(SP) + MOVQ s17-144(SP), AX + MOVQ s18-152(SP), DX + MOVQ s19-160(SP), CX + MOVQ s20-168(SP), BX + MOVQ s21-176(SP), SI + MOVQ s22-184(SP), DI + MOVQ s23-192(SP), R8 + MOVQ s24-200(SP), R9 + MOVQ 
s25-208(SP), R10 + MOVQ s26-216(SP), R11 + SUBQ s27-224(SP), AX + SBBQ s28-232(SP), DX + SBBQ s29-240(SP), CX + SBBQ s30-248(SP), BX + SBBQ s31-256(SP), SI + SBBQ s32-264(SP), DI + SBBQ s33-272(SP), R8 + SBBQ s34-280(SP), R9 + SBBQ s35-288(SP), R10 + SBBQ s36-296(SP), R11 + JCC l18 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + ADCQ q<>+48(SB), R8 + ADCQ q<>+56(SB), R9 + ADCQ q<>+64(SB), R10 + ADCQ q<>+72(SB), R11 + +l18: + MOVQ AX, s17-144(SP) + MOVQ DX, s18-152(SP) + MOVQ CX, s19-160(SP) + MOVQ BX, s20-168(SP) + MOVQ SI, s21-176(SP) + MOVQ DI, s22-184(SP) + MOVQ R8, s23-192(SP) + MOVQ R9, s24-200(SP) + MOVQ R10, s25-208(SP) + MOVQ R11, s26-216(SP) + JMP l6 + +l5: + MOVQ AX, s7-64(SP) + MOVQ DX, s8-72(SP) + MOVQ CX, s9-80(SP) + MOVQ BX, s10-88(SP) + MOVQ SI, s11-96(SP) + MOVQ DI, s12-104(SP) + MOVQ R8, s13-112(SP) + MOVQ R9, s14-120(SP) + MOVQ R10, s15-128(SP) + MOVQ R11, s16-136(SP) + MOVQ s27-224(SP), AX + MOVQ s28-232(SP), DX + MOVQ s29-240(SP), CX + MOVQ s30-248(SP), BX + MOVQ s31-256(SP), SI + MOVQ s32-264(SP), DI + MOVQ s33-272(SP), R8 + MOVQ s34-280(SP), R9 + MOVQ s35-288(SP), R10 + MOVQ s36-296(SP), R11 + SUBQ s17-144(SP), AX + SBBQ s18-152(SP), DX + SBBQ s19-160(SP), CX + SBBQ s20-168(SP), BX + SBBQ s21-176(SP), SI + SBBQ s22-184(SP), DI + SBBQ s23-192(SP), R8 + SBBQ s24-200(SP), R9 + SBBQ s25-208(SP), R10 + SBBQ s26-216(SP), R11 + JCC l19 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + ADCQ q<>+48(SB), R8 + ADCQ q<>+56(SB), R9 + ADCQ q<>+64(SB), R10 + ADCQ q<>+72(SB), R11 + +l19: + MOVQ AX, s27-224(SP) + MOVQ DX, s28-232(SP) + MOVQ CX, s29-240(SP) + MOVQ BX, s30-248(SP) + MOVQ SI, s31-256(SP) + MOVQ DI, s32-264(SP) + MOVQ R8, s33-272(SP) + MOVQ R9, s34-280(SP) + MOVQ R10, s35-288(SP) + MOVQ R11, s36-296(SP) + +l6: + MOVQ R13, AX + MOVQ R14, DX + MOVQ R15, CX + MOVQ s0-8(SP), BX + MOVQ s1-16(SP), SI + MOVQ s2-24(SP), DI + MOVQ s3-32(SP), R8 + MOVQ s4-40(SP), R9 + MOVQ s5-48(SP), R10 + MOVQ s6-56(SP), R11 + SUBQ $1, AX + ORQ AX, R11 + ORQ DX, R11 + ORQ CX, R11 + ORQ BX, R11 + ORQ SI, R11 + ORQ DI, R11 + ORQ R8, R11 + ORQ R9, R11 + ORQ R10, R11 + JEQ l7 + MOVQ s7-64(SP), AX + MOVQ s8-72(SP), DX + MOVQ s9-80(SP), CX + MOVQ s10-88(SP), BX + MOVQ s11-96(SP), SI + MOVQ s12-104(SP), DI + MOVQ s13-112(SP), R8 + MOVQ s14-120(SP), R9 + MOVQ s15-128(SP), R10 + MOVQ s16-136(SP), R11 + MOVQ AX, R12 + SUBQ $1, R12 + JNE l4 + ORQ DX, R12 + ORQ CX, R12 + ORQ BX, R12 + ORQ SI, R12 + ORQ DI, R12 + ORQ R8, R12 + ORQ R9, R12 + ORQ R10, R12 + ORQ R11, R12 + JEQ l8 + JMP l4 + +l7: + MOVQ res+0(FP), R12 + MOVQ s17-144(SP), AX + MOVQ s18-152(SP), DX + MOVQ s19-160(SP), CX + MOVQ s20-168(SP), BX + MOVQ s21-176(SP), SI + MOVQ s22-184(SP), DI + MOVQ s23-192(SP), R8 + MOVQ s24-200(SP), R9 + MOVQ s25-208(SP), R10 + MOVQ s26-216(SP), R11 + MOVQ AX, 0(R12) + MOVQ DX, 8(R12) + MOVQ CX, 16(R12) + MOVQ BX, 24(R12) + MOVQ SI, 32(R12) + MOVQ DI, 40(R12) + MOVQ R8, 48(R12) + MOVQ R9, 56(R12) + MOVQ R10, 64(R12) + MOVQ R11, 72(R12) + RET + +l8: + MOVQ res+0(FP), R12 + MOVQ s27-224(SP), AX + MOVQ s28-232(SP), DX + MOVQ s29-240(SP), CX + MOVQ s30-248(SP), BX + MOVQ s31-256(SP), SI + MOVQ s32-264(SP), DI + MOVQ s33-272(SP), R8 + MOVQ s34-280(SP), R9 + MOVQ s35-288(SP), R10 + MOVQ s36-296(SP), R11 + MOVQ AX, 0(R12) + MOVQ DX, 8(R12) + MOVQ CX, 16(R12) + MOVQ BX, 24(R12) + MOVQ SI, 32(R12) + MOVQ DI, 40(R12) + MOVQ R8, 48(R12) + MOVQ R9, 56(R12) + MOVQ 
R10, 64(R12) + MOVQ R11, 72(R12) + RET + +l9: + MOVQ res+0(FP), R12 + MOVQ $0, 0(R12) + MOVQ $0, 8(R12) + MOVQ $0, 16(R12) + MOVQ $0, 24(R12) + MOVQ $0, 32(R12) + MOVQ $0, 40(R12) + MOVQ $0, 48(R12) + MOVQ $0, 56(R12) + MOVQ $0, 64(R12) + MOVQ $0, 72(R12) + RET diff --git a/ecc/bw6-633/fp/element_ops_noasm.go b/ecc/bw6-633/fp/element_ops_noasm.go index fec628918..48d55e2ea 100644 --- a/ecc/bw6-633/fp/element_ops_noasm.go +++ b/ecc/bw6-633/fp/element_ops_noasm.go @@ -51,6 +51,10 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } +func inverse(z, x *Element) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bw6-633/fr/element.go b/ecc/bw6-633/fr/element.go index b39a4aae2..019e344e9 100644 --- a/ecc/bw6-633/fr/element.go +++ b/ecc/bw6-633/fr/element.go @@ -633,6 +633,24 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[4] != 0 { + return 256 + bits.Len64(z[4]) + } + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -893,8 +911,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -920,26 +947,19 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[4] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 + v[3] = v[3]>>1 | v[4]<<63 v[4] >>= 1 - t = t2 - t2 = v[3] << 63 - v[3] = (v[3] >> 1) | t - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = (v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -953,37 +973,23 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[4] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 + s[3] = s[3]>>1 | s[4]<<63 s[4] >>= 1 - t = t2 - t2 = s[3] << 63 - s[3] = (s[3] >> 1) | t - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[4] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 + u[3] = u[3]>>1 | u[4]<<63 u[4] >>= 1 - t = t2 - t2 = u[3] << 63 - u[3] = (u[3] >> 1) | t - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 = u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -997,19 +1003,12 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[4] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | 
r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 + r[3] = r[3]>>1 | r[4]<<63 r[4] >>= 1 - t = t2 - t2 = r[3] << 63 - r[3] = (r[3] >> 1) | t - t = t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = (r[0] >> 1) | t } @@ -1070,10 +1069,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[4]|u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[4]|v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bw6-633/fr/element_ops_amd64.go b/ecc/bw6-633/fr/element_ops_amd64.go index 78022b3e6..9ebabc26a 100644 --- a/ecc/bw6-633/fr/element_ops_amd64.go +++ b/ecc/bw6-633/fr/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bw6-633/fr/element_ops_amd64.s b/ecc/bw6-633/fr/element_ops_amd64.s index ab0ced516..354d2294a 100644 --- a/ecc/bw6-633/fr/element_ops_amd64.s +++ b/ecc/bw6-633/fr/element_ops_amd64.s @@ -398,3 +398,312 @@ TEXT ·Butterfly(SB), $24-16 MOVQ DI, 24(AX) MOVQ R8, 32(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $96-16 + // u = q + // u[0] -> R8 + // u[1] -> R9 + // u[2] -> R10 + // u[3] -> R11 + // u[4] -> R12 + MOVQ q<>+0(SB), R8 + MOVQ q<>+8(SB), R9 + MOVQ q<>+16(SB), R10 + MOVQ q<>+24(SB), R11 + MOVQ q<>+32(SB), R12 + + // s = r^2 + // s[0] -> s7-64(SP) + // s[1] -> s8-72(SP) + // s[2] -> s9-80(SP) + // s[3] -> s10-88(SP) + // s[4] -> s11-96(SP) + MOVQ $0x6b817891fe329c16, DI + MOVQ DI, s7-64(SP) + MOVQ $0x599ce86eec6e2c35, DI + MOVQ DI, s8-72(SP) + MOVQ $0xc338890f540d5ad6, DI + MOVQ DI, s9-80(SP) + MOVQ $0xcc160f6924c81f32, DI + MOVQ DI, s10-88(SP) + MOVQ $0x0215d8d4607a88d5, DI + MOVQ DI, s11-96(SP) + + // v = x + // v[0] -> R13 + // v[1] -> R14 + // v[2] -> R15 + // v[3] -> s0-8(SP) + // v[4] -> s1-16(SP) + MOVQ x+8(FP), DI + MOVQ 0(DI), AX + MOVQ 8(DI), DX + MOVQ 16(DI), CX + MOVQ 24(DI), BX + MOVQ 32(DI), SI + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ CX, R15 + MOVQ BX, s0-8(SP) + MOVQ SI, s1-16(SP) + + // if x is 0, returns 0 + MOVQ AX, DI + ORQ DX, DI + ORQ CX, DI + ORQ BX, DI + ORQ SI, DI + JEQ l7 + + // r = 0 + // r[0] -> s2-24(SP) + // r[1] -> s3-32(SP) + // r[2] -> s4-40(SP) + // r[3] -> s5-48(SP) + // r[4] -> s6-56(SP) + MOVQ $0, s2-24(SP) + MOVQ $0, s3-32(SP) + MOVQ $0, s4-40(SP) + MOVQ $0, s5-48(SP) + MOVQ $0, s6-56(SP) + +l2: + BTQ $0, AX + JCS l8 + MOVQ $0, BP + XORQ DI, DI + +l9: + INCQ BP + SHRQ $1, AX, DI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, SI + BTQ $0, AX + JCC l9 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ CX, R15 + MOVQ BX, s0-8(SP) + MOVQ SI, s1-16(SP) + MOVQ s7-64(SP), AX + MOVQ s8-72(SP), DX + MOVQ s9-80(SP), CX + MOVQ s10-88(SP), BX + MOVQ s11-96(SP), SI + +l10: + BTQ $0, AX + JCC l11 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + +l11: + SHRQ $1, AX, DI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, SI + DECQ BP + JNE l10 + MOVQ AX, s7-64(SP) + MOVQ DX, s8-72(SP) + MOVQ CX, s9-80(SP) + MOVQ BX, s10-88(SP) + MOVQ SI, s11-96(SP) + +l8: + MOVQ R8, AX + MOVQ R9, DX + MOVQ R10, CX + MOVQ R11, BX + MOVQ R12, SI + BTQ $0, AX + JCS l12 + MOVQ $0, BP + XORQ DI, DI + +l13: + INCQ BP + SHRQ $1, AX, DI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, SI + BTQ $0, AX + JCC l13 + MOVQ AX, R8 + MOVQ DX, R9 + MOVQ CX, R10 + MOVQ BX, R11 + 
MOVQ SI, R12 + MOVQ s2-24(SP), AX + MOVQ s3-32(SP), DX + MOVQ s4-40(SP), CX + MOVQ s5-48(SP), BX + MOVQ s6-56(SP), SI + +l14: + BTQ $0, AX + JCC l15 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + +l15: + SHRQ $1, AX, DI + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, SI + DECQ BP + JNE l14 + MOVQ AX, s2-24(SP) + MOVQ DX, s3-32(SP) + MOVQ CX, s4-40(SP) + MOVQ BX, s5-48(SP) + MOVQ SI, s6-56(SP) + +l12: + // v = v - u + MOVQ R13, AX + MOVQ R14, DX + MOVQ R15, CX + MOVQ s0-8(SP), BX + MOVQ s1-16(SP), SI + SUBQ R8, AX + SBBQ R9, DX + SBBQ R10, CX + SBBQ R11, BX + SBBQ R12, SI + JCC l3 + SUBQ R13, R8 + SBBQ R14, R9 + SBBQ R15, R10 + SBBQ s0-8(SP), R11 + SBBQ s1-16(SP), R12 + MOVQ s2-24(SP), AX + MOVQ s3-32(SP), DX + MOVQ s4-40(SP), CX + MOVQ s5-48(SP), BX + MOVQ s6-56(SP), SI + SUBQ s7-64(SP), AX + SBBQ s8-72(SP), DX + SBBQ s9-80(SP), CX + SBBQ s10-88(SP), BX + SBBQ s11-96(SP), SI + JCC l16 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + +l16: + MOVQ AX, s2-24(SP) + MOVQ DX, s3-32(SP) + MOVQ CX, s4-40(SP) + MOVQ BX, s5-48(SP) + MOVQ SI, s6-56(SP) + JMP l4 + +l3: + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ CX, R15 + MOVQ BX, s0-8(SP) + MOVQ SI, s1-16(SP) + MOVQ s7-64(SP), AX + MOVQ s8-72(SP), DX + MOVQ s9-80(SP), CX + MOVQ s10-88(SP), BX + MOVQ s11-96(SP), SI + SUBQ s2-24(SP), AX + SBBQ s3-32(SP), DX + SBBQ s4-40(SP), CX + SBBQ s5-48(SP), BX + SBBQ s6-56(SP), SI + JCC l17 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + +l17: + MOVQ AX, s7-64(SP) + MOVQ DX, s8-72(SP) + MOVQ CX, s9-80(SP) + MOVQ BX, s10-88(SP) + MOVQ SI, s11-96(SP) + +l4: + MOVQ R8, DI + SUBQ $1, DI + ORQ R9, DI + ORQ R10, DI + ORQ R11, DI + ORQ R12, DI + JEQ l5 + MOVQ R13, AX + MOVQ R14, DX + MOVQ R15, CX + MOVQ s0-8(SP), BX + MOVQ s1-16(SP), SI + MOVQ AX, DI + SUBQ $1, DI + JNE l2 + ORQ DX, DI + ORQ CX, DI + ORQ BX, DI + ORQ SI, DI + JEQ l6 + JMP l2 + +l5: + MOVQ res+0(FP), DI + MOVQ s2-24(SP), AX + MOVQ s3-32(SP), DX + MOVQ s4-40(SP), CX + MOVQ s5-48(SP), BX + MOVQ s6-56(SP), SI + MOVQ AX, 0(DI) + MOVQ DX, 8(DI) + MOVQ CX, 16(DI) + MOVQ BX, 24(DI) + MOVQ SI, 32(DI) + RET + +l6: + MOVQ res+0(FP), DI + MOVQ s7-64(SP), AX + MOVQ s8-72(SP), DX + MOVQ s9-80(SP), CX + MOVQ s10-88(SP), BX + MOVQ s11-96(SP), SI + MOVQ AX, 0(DI) + MOVQ DX, 8(DI) + MOVQ CX, 16(DI) + MOVQ BX, 24(DI) + MOVQ SI, 32(DI) + RET + +l7: + MOVQ res+0(FP), DI + MOVQ $0, 0(DI) + MOVQ $0, 8(DI) + MOVQ $0, 16(DI) + MOVQ $0, 24(DI) + MOVQ $0, 32(DI) + RET diff --git a/ecc/bw6-633/fr/element_ops_noasm.go b/ecc/bw6-633/fr/element_ops_noasm.go index ec1fac18d..006365daa 100644 --- a/ecc/bw6-633/fr/element_ops_noasm.go +++ b/ecc/bw6-633/fr/element_ops_noasm.go @@ -51,6 +51,10 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } +func inverse(z, x *Element) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. 
mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bw6-761/fp/element.go b/ecc/bw6-761/fp/element.go index a9dff33e5..a40f89d6b 100644 --- a/ecc/bw6-761/fp/element.go +++ b/ecc/bw6-761/fp/element.go @@ -1221,6 +1221,45 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[11] != 0 { + return 704 + bits.Len64(z[11]) + } + if z[10] != 0 { + return 640 + bits.Len64(z[10]) + } + if z[9] != 0 { + return 576 + bits.Len64(z[9]) + } + if z[8] != 0 { + return 512 + bits.Len64(z[8]) + } + if z[7] != 0 { + return 448 + bits.Len64(z[7]) + } + if z[6] != 0 { + return 384 + bits.Len64(z[6]) + } + if z[5] != 0 { + return 320 + bits.Len64(z[5]) + } + if z[4] != 0 { + return 256 + bits.Len64(z[4]) + } + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -1442,8 +1481,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -1483,47 +1531,26 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[11] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 + v[3] = v[3]>>1 | v[4]<<63 + v[4] = v[4]>>1 | v[5]<<63 + v[5] = v[5]>>1 | v[6]<<63 + v[6] = v[6]>>1 | v[7]<<63 + v[7] = v[7]>>1 | v[8]<<63 + v[8] = v[8]>>1 | v[9]<<63 + v[9] = v[9]>>1 | v[10]<<63 + v[10] = v[10]>>1 | v[11]<<63 v[11] >>= 1 - t = t2 - t2 = v[10] << 63 - v[10] = (v[10] >> 1) | t - t = t2 - t2 = v[9] << 63 - v[9] = (v[9] >> 1) | t - t = t2 - t2 = v[8] << 63 - v[8] = (v[8] >> 1) | t - t = t2 - t2 = v[7] << 63 - v[7] = (v[7] >> 1) | t - t = t2 - t2 = v[6] << 63 - v[6] = (v[6] >> 1) | t - t = t2 - t2 = v[5] << 63 - v[5] = (v[5] >> 1) | t - t = t2 - t2 = v[4] << 63 - v[4] = (v[4] >> 1) | t - t = t2 - t2 = v[3] << 63 - v[3] = (v[3] >> 1) | t - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = (v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -1544,79 +1571,37 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[11] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 + s[3] = s[3]>>1 | s[4]<<63 + s[4] = s[4]>>1 | s[5]<<63 + s[5] = s[5]>>1 | s[6]<<63 + s[6] = s[6]>>1 | s[7]<<63 + s[7] = s[7]>>1 | s[8]<<63 + s[8] = s[8]>>1 | s[9]<<63 + s[9] = s[9]>>1 | s[10]<<63 + s[10] = s[10]>>1 | s[11]<<63 s[11] >>= 1 - t = t2 - t2 = s[10] << 63 - s[10] = (s[10] >> 1) | t - t = t2 - t2 = s[9] << 63 - s[9] = (s[9] >> 1) | t - t = t2 - t2 = s[8] << 63 - s[8] = (s[8] >> 1) | t - t = t2 - t2 = s[7] << 63 - s[7] = (s[7] >> 1) | t 
- t = t2 - t2 = s[6] << 63 - s[6] = (s[6] >> 1) | t - t = t2 - t2 = s[5] << 63 - s[5] = (s[5] >> 1) | t - t = t2 - t2 = s[4] << 63 - s[4] = (s[4] >> 1) | t - t = t2 - t2 = s[3] << 63 - s[3] = (s[3] >> 1) | t - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[11] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 + u[3] = u[3]>>1 | u[4]<<63 + u[4] = u[4]>>1 | u[5]<<63 + u[5] = u[5]>>1 | u[6]<<63 + u[6] = u[6]>>1 | u[7]<<63 + u[7] = u[7]>>1 | u[8]<<63 + u[8] = u[8]>>1 | u[9]<<63 + u[9] = u[9]>>1 | u[10]<<63 + u[10] = u[10]>>1 | u[11]<<63 u[11] >>= 1 - t = t2 - t2 = u[10] << 63 - u[10] = (u[10] >> 1) | t - t = t2 - t2 = u[9] << 63 - u[9] = (u[9] >> 1) | t - t = t2 - t2 = u[8] << 63 - u[8] = (u[8] >> 1) | t - t = t2 - t2 = u[7] << 63 - u[7] = (u[7] >> 1) | t - t = t2 - t2 = u[6] << 63 - u[6] = (u[6] >> 1) | t - t = t2 - t2 = u[5] << 63 - u[5] = (u[5] >> 1) | t - t = t2 - t2 = u[4] << 63 - u[4] = (u[4] >> 1) | t - t = t2 - t2 = u[3] << 63 - u[3] = (u[3] >> 1) | t - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 = u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -1637,40 +1622,19 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[11] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 + r[3] = r[3]>>1 | r[4]<<63 + r[4] = r[4]>>1 | r[5]<<63 + r[5] = r[5]>>1 | r[6]<<63 + r[6] = r[6]>>1 | r[7]<<63 + r[7] = r[7]>>1 | r[8]<<63 + r[8] = r[8]>>1 | r[9]<<63 + r[9] = r[9]>>1 | r[10]<<63 + r[10] = r[10]>>1 | r[11]<<63 r[11] >>= 1 - t = t2 - t2 = r[10] << 63 - r[10] = (r[10] >> 1) | t - t = t2 - t2 = r[9] << 63 - r[9] = (r[9] >> 1) | t - t = t2 - t2 = r[8] << 63 - r[8] = (r[8] >> 1) | t - t = t2 - t2 = r[7] << 63 - r[7] = (r[7] >> 1) | t - t = t2 - t2 = r[6] << 63 - r[6] = (r[6] >> 1) | t - t = t2 - t2 = r[5] << 63 - r[5] = (r[5] >> 1) | t - t = t2 - t2 = r[4] << 63 - r[4] = (r[4] >> 1) | t - t = t2 - t2 = r[3] << 63 - r[3] = (r[3] >> 1) | t - t = t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = (r[0] >> 1) | t } @@ -1773,10 +1737,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[11]|u[10]|u[9]|u[8]|u[7]|u[6]|u[5]|u[4]|u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[11]|v[10]|v[9]|v[8]|v[7]|v[6]|v[5]|v[4]|v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bw6-761/fp/element_ops_amd64.go b/ecc/bw6-761/fp/element_ops_amd64.go index 73a3711ec..d61412bd6 100644 --- a/ecc/bw6-761/fp/element_ops_amd64.go +++ b/ecc/bw6-761/fp/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bw6-761/fp/element_ops_amd64.s b/ecc/bw6-761/fp/element_ops_amd64.s index 242ba6a68..2012c2840 100644 --- a/ecc/bw6-761/fp/element_ops_amd64.s +++ b/ecc/bw6-761/fp/element_ops_amd64.s @@ -744,3 +744,667 @@ l3: MOVQ R13, 80(AX) MOVQ R14, 88(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $376-16 + // u = q + // u[0] -> R15 + // u[1] -> s0-8(SP) + // u[2] -> s1-16(SP) + // u[3] -> s2-24(SP) + // u[4] -> s3-32(SP) + // u[5] -> s4-40(SP) + // u[6] -> s5-48(SP) + // u[7] -> s6-56(SP) + // u[8] -> s7-64(SP) + // u[9] -> s8-72(SP) + // u[10] -> s9-80(SP) + // 
u[11] -> s10-88(SP) + MOVQ q<>+0(SB), R14 + MOVQ R14, R15 + MOVQ q<>+8(SB), R14 + MOVQ R14, s0-8(SP) + MOVQ q<>+16(SB), R14 + MOVQ R14, s1-16(SP) + MOVQ q<>+24(SB), R14 + MOVQ R14, s2-24(SP) + MOVQ q<>+32(SB), R14 + MOVQ R14, s3-32(SP) + MOVQ q<>+40(SB), R14 + MOVQ R14, s4-40(SP) + MOVQ q<>+48(SB), R14 + MOVQ R14, s5-48(SP) + MOVQ q<>+56(SB), R14 + MOVQ R14, s6-56(SP) + MOVQ q<>+64(SB), R14 + MOVQ R14, s7-64(SP) + MOVQ q<>+72(SB), R14 + MOVQ R14, s8-72(SP) + MOVQ q<>+80(SB), R14 + MOVQ R14, s9-80(SP) + MOVQ q<>+88(SB), R14 + MOVQ R14, s10-88(SP) + + // s = r^2 + // s[0] -> s35-288(SP) + // s[1] -> s36-296(SP) + // s[2] -> s37-304(SP) + // s[3] -> s38-312(SP) + // s[4] -> s39-320(SP) + // s[5] -> s40-328(SP) + // s[6] -> s41-336(SP) + // s[7] -> s42-344(SP) + // s[8] -> s43-352(SP) + // s[9] -> s44-360(SP) + // s[10] -> s45-368(SP) + // s[11] -> s46-376(SP) + MOVQ $0xc686392d2d1fa659, R14 + MOVQ R14, s35-288(SP) + MOVQ $0x7b14c9b2f79484ab, R14 + MOVQ R14, s36-296(SP) + MOVQ $0x7fa1e825c1d2b459, R14 + MOVQ R14, s37-304(SP) + MOVQ $0xd6ec28f848329d88, R14 + MOVQ R14, s38-312(SP) + MOVQ $0x4afb427b73a1ed40, R14 + MOVQ R14, s39-320(SP) + MOVQ $0x972c69400d5930ae, R14 + MOVQ R14, s40-328(SP) + MOVQ $0x2c7a26bf8c995976, R14 + MOVQ R14, s41-336(SP) + MOVQ $0xac52e458c6e57af9, R14 + MOVQ R14, s42-344(SP) + MOVQ $0xac731bfa0c536dfe, R14 + MOVQ R14, s43-352(SP) + MOVQ $0x121e5c630b103f50, R14 + MOVQ R14, s44-360(SP) + MOVQ $0x8f1b0953b886cda4, R14 + MOVQ R14, s45-368(SP) + MOVQ $0x00ad253c2da8d807, R14 + MOVQ R14, s46-376(SP) + + // v = x + // v[0] -> s11-96(SP) + // v[1] -> s12-104(SP) + // v[2] -> s13-112(SP) + // v[3] -> s14-120(SP) + // v[4] -> s15-128(SP) + // v[5] -> s16-136(SP) + // v[6] -> s17-144(SP) + // v[7] -> s18-152(SP) + // v[8] -> s19-160(SP) + // v[9] -> s20-168(SP) + // v[10] -> s21-176(SP) + // v[11] -> s22-184(SP) + MOVQ x+8(FP), R14 + MOVQ 0(R14), AX + MOVQ 8(R14), DX + MOVQ 16(R14), CX + MOVQ 24(R14), BX + MOVQ 32(R14), SI + MOVQ 40(R14), DI + MOVQ 48(R14), R8 + MOVQ 56(R14), R9 + MOVQ 64(R14), R10 + MOVQ 72(R14), R11 + MOVQ 80(R14), R12 + MOVQ 88(R14), R13 + MOVQ AX, s11-96(SP) + MOVQ DX, s12-104(SP) + MOVQ CX, s13-112(SP) + MOVQ BX, s14-120(SP) + MOVQ SI, s15-128(SP) + MOVQ DI, s16-136(SP) + MOVQ R8, s17-144(SP) + MOVQ R9, s18-152(SP) + MOVQ R10, s19-160(SP) + MOVQ R11, s20-168(SP) + MOVQ R12, s21-176(SP) + MOVQ R13, s22-184(SP) + + // if x is 0, returns 0 + MOVQ AX, R14 + ORQ DX, R14 + ORQ CX, R14 + ORQ BX, R14 + ORQ SI, R14 + ORQ DI, R14 + ORQ R8, R14 + ORQ R9, R14 + ORQ R10, R14 + ORQ R11, R14 + ORQ R12, R14 + ORQ R13, R14 + JEQ l9 + + // r = 0 + // r[0] -> s23-192(SP) + // r[1] -> s24-200(SP) + // r[2] -> s25-208(SP) + // r[3] -> s26-216(SP) + // r[4] -> s27-224(SP) + // r[5] -> s28-232(SP) + // r[6] -> s29-240(SP) + // r[7] -> s30-248(SP) + // r[8] -> s31-256(SP) + // r[9] -> s32-264(SP) + // r[10] -> s33-272(SP) + // r[11] -> s34-280(SP) + MOVQ $0, s23-192(SP) + MOVQ $0, s24-200(SP) + MOVQ $0, s25-208(SP) + MOVQ $0, s26-216(SP) + MOVQ $0, s27-224(SP) + MOVQ $0, s28-232(SP) + MOVQ $0, s29-240(SP) + MOVQ $0, s30-248(SP) + MOVQ $0, s31-256(SP) + MOVQ $0, s32-264(SP) + MOVQ $0, s33-272(SP) + MOVQ $0, s34-280(SP) + +l4: + BTQ $0, AX + JCS l10 + MOVQ $0, BP + XORQ R14, R14 + +l11: + INCQ BP + SHRQ $1, AX, R14 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, R8, DI + SHRQ $1, R9, R8 + SHRQ $1, R10, R9 + SHRQ $1, R11, R10 + SHRQ $1, R12, R11 + SHRQ $1, R13, R12 + SHRQ $1, R13 + BTQ $0, AX + JCC l11 + MOVQ AX, s11-96(SP) + MOVQ 
DX, s12-104(SP) + MOVQ CX, s13-112(SP) + MOVQ BX, s14-120(SP) + MOVQ SI, s15-128(SP) + MOVQ DI, s16-136(SP) + MOVQ R8, s17-144(SP) + MOVQ R9, s18-152(SP) + MOVQ R10, s19-160(SP) + MOVQ R11, s20-168(SP) + MOVQ R12, s21-176(SP) + MOVQ R13, s22-184(SP) + MOVQ s35-288(SP), AX + MOVQ s36-296(SP), DX + MOVQ s37-304(SP), CX + MOVQ s38-312(SP), BX + MOVQ s39-320(SP), SI + MOVQ s40-328(SP), DI + MOVQ s41-336(SP), R8 + MOVQ s42-344(SP), R9 + MOVQ s43-352(SP), R10 + MOVQ s44-360(SP), R11 + MOVQ s45-368(SP), R12 + MOVQ s46-376(SP), R13 + +l12: + BTQ $0, AX + JCC l13 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + ADCQ q<>+48(SB), R8 + ADCQ q<>+56(SB), R9 + ADCQ q<>+64(SB), R10 + ADCQ q<>+72(SB), R11 + ADCQ q<>+80(SB), R12 + ADCQ q<>+88(SB), R13 + +l13: + SHRQ $1, AX, R14 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, R8, DI + SHRQ $1, R9, R8 + SHRQ $1, R10, R9 + SHRQ $1, R11, R10 + SHRQ $1, R12, R11 + SHRQ $1, R13, R12 + SHRQ $1, R13 + DECQ BP + JNE l12 + MOVQ AX, s35-288(SP) + MOVQ DX, s36-296(SP) + MOVQ CX, s37-304(SP) + MOVQ BX, s38-312(SP) + MOVQ SI, s39-320(SP) + MOVQ DI, s40-328(SP) + MOVQ R8, s41-336(SP) + MOVQ R9, s42-344(SP) + MOVQ R10, s43-352(SP) + MOVQ R11, s44-360(SP) + MOVQ R12, s45-368(SP) + MOVQ R13, s46-376(SP) + +l10: + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ s3-32(SP), SI + MOVQ s4-40(SP), DI + MOVQ s5-48(SP), R8 + MOVQ s6-56(SP), R9 + MOVQ s7-64(SP), R10 + MOVQ s8-72(SP), R11 + MOVQ s9-80(SP), R12 + MOVQ s10-88(SP), R13 + BTQ $0, AX + JCS l14 + MOVQ $0, BP + XORQ R14, R14 + +l15: + INCQ BP + SHRQ $1, AX, R14 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, R8, DI + SHRQ $1, R9, R8 + SHRQ $1, R10, R9 + SHRQ $1, R11, R10 + SHRQ $1, R12, R11 + SHRQ $1, R13, R12 + SHRQ $1, R13 + BTQ $0, AX + JCC l15 + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + MOVQ SI, s3-32(SP) + MOVQ DI, s4-40(SP) + MOVQ R8, s5-48(SP) + MOVQ R9, s6-56(SP) + MOVQ R10, s7-64(SP) + MOVQ R11, s8-72(SP) + MOVQ R12, s9-80(SP) + MOVQ R13, s10-88(SP) + MOVQ s23-192(SP), AX + MOVQ s24-200(SP), DX + MOVQ s25-208(SP), CX + MOVQ s26-216(SP), BX + MOVQ s27-224(SP), SI + MOVQ s28-232(SP), DI + MOVQ s29-240(SP), R8 + MOVQ s30-248(SP), R9 + MOVQ s31-256(SP), R10 + MOVQ s32-264(SP), R11 + MOVQ s33-272(SP), R12 + MOVQ s34-280(SP), R13 + +l16: + BTQ $0, AX + JCC l17 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + ADCQ q<>+48(SB), R8 + ADCQ q<>+56(SB), R9 + ADCQ q<>+64(SB), R10 + ADCQ q<>+72(SB), R11 + ADCQ q<>+80(SB), R12 + ADCQ q<>+88(SB), R13 + +l17: + SHRQ $1, AX, R14 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, R8, DI + SHRQ $1, R9, R8 + SHRQ $1, R10, R9 + SHRQ $1, R11, R10 + SHRQ $1, R12, R11 + SHRQ $1, R13, R12 + SHRQ $1, R13 + DECQ BP + JNE l16 + MOVQ AX, s23-192(SP) + MOVQ DX, s24-200(SP) + MOVQ CX, s25-208(SP) + MOVQ BX, s26-216(SP) + MOVQ SI, s27-224(SP) + MOVQ DI, s28-232(SP) + MOVQ R8, s29-240(SP) + MOVQ R9, s30-248(SP) + MOVQ R10, s31-256(SP) + MOVQ R11, s32-264(SP) + MOVQ R12, s33-272(SP) + MOVQ R13, s34-280(SP) + +l14: + // v = v - u + MOVQ s11-96(SP), AX + MOVQ s12-104(SP), DX + MOVQ s13-112(SP), CX + MOVQ s14-120(SP), BX + MOVQ s15-128(SP), SI + MOVQ s16-136(SP), DI + MOVQ s17-144(SP), R8 + MOVQ s18-152(SP), R9 + MOVQ 
s19-160(SP), R10 + MOVQ s20-168(SP), R11 + MOVQ s21-176(SP), R12 + MOVQ s22-184(SP), R13 + SUBQ R15, AX + SBBQ s0-8(SP), DX + SBBQ s1-16(SP), CX + SBBQ s2-24(SP), BX + SBBQ s3-32(SP), SI + SBBQ s4-40(SP), DI + SBBQ s5-48(SP), R8 + SBBQ s6-56(SP), R9 + SBBQ s7-64(SP), R10 + SBBQ s8-72(SP), R11 + SBBQ s9-80(SP), R12 + SBBQ s10-88(SP), R13 + JCC l5 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ s3-32(SP), SI + MOVQ s4-40(SP), DI + MOVQ s5-48(SP), R8 + MOVQ s6-56(SP), R9 + MOVQ s7-64(SP), R10 + MOVQ s8-72(SP), R11 + MOVQ s9-80(SP), R12 + MOVQ s10-88(SP), R13 + SUBQ s11-96(SP), AX + SBBQ s12-104(SP), DX + SBBQ s13-112(SP), CX + SBBQ s14-120(SP), BX + SBBQ s15-128(SP), SI + SBBQ s16-136(SP), DI + SBBQ s17-144(SP), R8 + SBBQ s18-152(SP), R9 + SBBQ s19-160(SP), R10 + SBBQ s20-168(SP), R11 + SBBQ s21-176(SP), R12 + SBBQ s22-184(SP), R13 + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + MOVQ SI, s3-32(SP) + MOVQ DI, s4-40(SP) + MOVQ R8, s5-48(SP) + MOVQ R9, s6-56(SP) + MOVQ R10, s7-64(SP) + MOVQ R11, s8-72(SP) + MOVQ R12, s9-80(SP) + MOVQ R13, s10-88(SP) + MOVQ s23-192(SP), AX + MOVQ s24-200(SP), DX + MOVQ s25-208(SP), CX + MOVQ s26-216(SP), BX + MOVQ s27-224(SP), SI + MOVQ s28-232(SP), DI + MOVQ s29-240(SP), R8 + MOVQ s30-248(SP), R9 + MOVQ s31-256(SP), R10 + MOVQ s32-264(SP), R11 + MOVQ s33-272(SP), R12 + MOVQ s34-280(SP), R13 + SUBQ s35-288(SP), AX + SBBQ s36-296(SP), DX + SBBQ s37-304(SP), CX + SBBQ s38-312(SP), BX + SBBQ s39-320(SP), SI + SBBQ s40-328(SP), DI + SBBQ s41-336(SP), R8 + SBBQ s42-344(SP), R9 + SBBQ s43-352(SP), R10 + SBBQ s44-360(SP), R11 + SBBQ s45-368(SP), R12 + SBBQ s46-376(SP), R13 + JCC l18 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + ADCQ q<>+48(SB), R8 + ADCQ q<>+56(SB), R9 + ADCQ q<>+64(SB), R10 + ADCQ q<>+72(SB), R11 + ADCQ q<>+80(SB), R12 + ADCQ q<>+88(SB), R13 + +l18: + MOVQ AX, s23-192(SP) + MOVQ DX, s24-200(SP) + MOVQ CX, s25-208(SP) + MOVQ BX, s26-216(SP) + MOVQ SI, s27-224(SP) + MOVQ DI, s28-232(SP) + MOVQ R8, s29-240(SP) + MOVQ R9, s30-248(SP) + MOVQ R10, s31-256(SP) + MOVQ R11, s32-264(SP) + MOVQ R12, s33-272(SP) + MOVQ R13, s34-280(SP) + JMP l6 + +l5: + MOVQ AX, s11-96(SP) + MOVQ DX, s12-104(SP) + MOVQ CX, s13-112(SP) + MOVQ BX, s14-120(SP) + MOVQ SI, s15-128(SP) + MOVQ DI, s16-136(SP) + MOVQ R8, s17-144(SP) + MOVQ R9, s18-152(SP) + MOVQ R10, s19-160(SP) + MOVQ R11, s20-168(SP) + MOVQ R12, s21-176(SP) + MOVQ R13, s22-184(SP) + MOVQ s35-288(SP), AX + MOVQ s36-296(SP), DX + MOVQ s37-304(SP), CX + MOVQ s38-312(SP), BX + MOVQ s39-320(SP), SI + MOVQ s40-328(SP), DI + MOVQ s41-336(SP), R8 + MOVQ s42-344(SP), R9 + MOVQ s43-352(SP), R10 + MOVQ s44-360(SP), R11 + MOVQ s45-368(SP), R12 + MOVQ s46-376(SP), R13 + SUBQ s23-192(SP), AX + SBBQ s24-200(SP), DX + SBBQ s25-208(SP), CX + SBBQ s26-216(SP), BX + SBBQ s27-224(SP), SI + SBBQ s28-232(SP), DI + SBBQ s29-240(SP), R8 + SBBQ s30-248(SP), R9 + SBBQ s31-256(SP), R10 + SBBQ s32-264(SP), R11 + SBBQ s33-272(SP), R12 + SBBQ s34-280(SP), R13 + JCC l19 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + ADCQ q<>+48(SB), R8 + ADCQ q<>+56(SB), R9 + ADCQ q<>+64(SB), R10 + ADCQ q<>+72(SB), R11 + ADCQ q<>+80(SB), R12 + ADCQ q<>+88(SB), R13 + +l19: + MOVQ AX, s35-288(SP) + MOVQ DX, s36-296(SP) + MOVQ CX, s37-304(SP) + MOVQ BX, s38-312(SP) + MOVQ SI, s39-320(SP) + MOVQ DI, s40-328(SP) + MOVQ R8, s41-336(SP) + 
MOVQ R9, s42-344(SP) + MOVQ R10, s43-352(SP) + MOVQ R11, s44-360(SP) + MOVQ R12, s45-368(SP) + MOVQ R13, s46-376(SP) + +l6: + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ s3-32(SP), SI + MOVQ s4-40(SP), DI + MOVQ s5-48(SP), R8 + MOVQ s6-56(SP), R9 + MOVQ s7-64(SP), R10 + MOVQ s8-72(SP), R11 + MOVQ s9-80(SP), R12 + MOVQ s10-88(SP), R13 + SUBQ $1, AX + ORQ AX, R13 + ORQ DX, R13 + ORQ CX, R13 + ORQ BX, R13 + ORQ SI, R13 + ORQ DI, R13 + ORQ R8, R13 + ORQ R9, R13 + ORQ R10, R13 + ORQ R11, R13 + ORQ R12, R13 + JEQ l7 + MOVQ s11-96(SP), AX + MOVQ s12-104(SP), DX + MOVQ s13-112(SP), CX + MOVQ s14-120(SP), BX + MOVQ s15-128(SP), SI + MOVQ s16-136(SP), DI + MOVQ s17-144(SP), R8 + MOVQ s18-152(SP), R9 + MOVQ s19-160(SP), R10 + MOVQ s20-168(SP), R11 + MOVQ s21-176(SP), R12 + MOVQ s22-184(SP), R13 + MOVQ AX, R14 + SUBQ $1, R14 + JNE l4 + ORQ DX, R14 + ORQ CX, R14 + ORQ BX, R14 + ORQ SI, R14 + ORQ DI, R14 + ORQ R8, R14 + ORQ R9, R14 + ORQ R10, R14 + ORQ R11, R14 + ORQ R12, R14 + ORQ R13, R14 + JEQ l8 + JMP l4 + +l7: + MOVQ res+0(FP), R14 + MOVQ s23-192(SP), AX + MOVQ s24-200(SP), DX + MOVQ s25-208(SP), CX + MOVQ s26-216(SP), BX + MOVQ s27-224(SP), SI + MOVQ s28-232(SP), DI + MOVQ s29-240(SP), R8 + MOVQ s30-248(SP), R9 + MOVQ s31-256(SP), R10 + MOVQ s32-264(SP), R11 + MOVQ s33-272(SP), R12 + MOVQ s34-280(SP), R13 + MOVQ AX, 0(R14) + MOVQ DX, 8(R14) + MOVQ CX, 16(R14) + MOVQ BX, 24(R14) + MOVQ SI, 32(R14) + MOVQ DI, 40(R14) + MOVQ R8, 48(R14) + MOVQ R9, 56(R14) + MOVQ R10, 64(R14) + MOVQ R11, 72(R14) + MOVQ R12, 80(R14) + MOVQ R13, 88(R14) + RET + +l8: + MOVQ res+0(FP), R14 + MOVQ s35-288(SP), AX + MOVQ s36-296(SP), DX + MOVQ s37-304(SP), CX + MOVQ s38-312(SP), BX + MOVQ s39-320(SP), SI + MOVQ s40-328(SP), DI + MOVQ s41-336(SP), R8 + MOVQ s42-344(SP), R9 + MOVQ s43-352(SP), R10 + MOVQ s44-360(SP), R11 + MOVQ s45-368(SP), R12 + MOVQ s46-376(SP), R13 + MOVQ AX, 0(R14) + MOVQ DX, 8(R14) + MOVQ CX, 16(R14) + MOVQ BX, 24(R14) + MOVQ SI, 32(R14) + MOVQ DI, 40(R14) + MOVQ R8, 48(R14) + MOVQ R9, 56(R14) + MOVQ R10, 64(R14) + MOVQ R11, 72(R14) + MOVQ R12, 80(R14) + MOVQ R13, 88(R14) + RET + +l9: + MOVQ res+0(FP), R14 + MOVQ $0, 0(R14) + MOVQ $0, 8(R14) + MOVQ $0, 16(R14) + MOVQ $0, 24(R14) + MOVQ $0, 32(R14) + MOVQ $0, 40(R14) + MOVQ $0, 48(R14) + MOVQ $0, 56(R14) + MOVQ $0, 64(R14) + MOVQ $0, 72(R14) + MOVQ $0, 80(R14) + MOVQ $0, 88(R14) + RET diff --git a/ecc/bw6-761/fp/element_ops_noasm.go b/ecc/bw6-761/fp/element_ops_noasm.go index fec628918..48d55e2ea 100644 --- a/ecc/bw6-761/fp/element_ops_noasm.go +++ b/ecc/bw6-761/fp/element_ops_noasm.go @@ -51,6 +51,10 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } +func inverse(z, x *Element) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. 
mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bw6-761/fr/element.go b/ecc/bw6-761/fr/element.go index 657229a13..6199bcfb4 100644 --- a/ecc/bw6-761/fr/element.go +++ b/ecc/bw6-761/fr/element.go @@ -699,6 +699,27 @@ func _butterflyGeneric(a, b *Element) { b.Sub(&t, b) } +// BitLen returns the minimum number of bits needed to represent z +// returns 0 if z == 0 +func (z *Element) BitLen() int { + if z[5] != 0 { + return 320 + bits.Len64(z[5]) + } + if z[4] != 0 { + return 256 + bits.Len64(z[4]) + } + if z[3] != 0 { + return 192 + bits.Len64(z[3]) + } + if z[2] != 0 { + return 128 + bits.Len64(z[2]) + } + if z[1] != 0 { + return 64 + bits.Len64(z[1]) + } + return bits.Len64(z[0]) +} + // Exp z = x^exponent mod q func (z *Element) Exp(x Element, exponent *big.Int) *Element { var bZero big.Int @@ -962,8 +983,17 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { + inverse(z, x) + return z +} + +// _inverseGeneric z = x^-1 mod q +// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" +// if x == 0, sets and returns z = x +func _inverseGeneric(z, x *Element) { if x.IsZero() { - return z.Set(x) + z.SetZero() + return } // initialize u = q @@ -991,29 +1021,20 @@ func (z *Element) Inverse(x *Element) *Element { v := *x - var carry, borrow, t, t2 uint64 + var carry, borrow uint64 var bigger bool for { for v[0]&1 == 0 { // v = v >> 1 - t2 = v[5] << 63 + + v[0] = v[0]>>1 | v[1]<<63 + v[1] = v[1]>>1 | v[2]<<63 + v[2] = v[2]>>1 | v[3]<<63 + v[3] = v[3]>>1 | v[4]<<63 + v[4] = v[4]>>1 | v[5]<<63 v[5] >>= 1 - t = t2 - t2 = v[4] << 63 - v[4] = (v[4] >> 1) | t - t = t2 - t2 = v[3] << 63 - v[3] = (v[3] >> 1) | t - t = t2 - t2 = v[2] << 63 - v[2] = (v[2] >> 1) | t - t = t2 - t2 = v[1] << 63 - v[1] = (v[1] >> 1) | t - t = t2 - v[0] = (v[0] >> 1) | t if s[0]&1 == 1 { @@ -1028,43 +1049,25 @@ func (z *Element) Inverse(x *Element) *Element { } // s = s >> 1 - t2 = s[5] << 63 + + s[0] = s[0]>>1 | s[1]<<63 + s[1] = s[1]>>1 | s[2]<<63 + s[2] = s[2]>>1 | s[3]<<63 + s[3] = s[3]>>1 | s[4]<<63 + s[4] = s[4]>>1 | s[5]<<63 s[5] >>= 1 - t = t2 - t2 = s[4] << 63 - s[4] = (s[4] >> 1) | t - t = t2 - t2 = s[3] << 63 - s[3] = (s[3] >> 1) | t - t = t2 - t2 = s[2] << 63 - s[2] = (s[2] >> 1) | t - t = t2 - t2 = s[1] << 63 - s[1] = (s[1] >> 1) | t - t = t2 - s[0] = (s[0] >> 1) | t } for u[0]&1 == 0 { // u = u >> 1 - t2 = u[5] << 63 + + u[0] = u[0]>>1 | u[1]<<63 + u[1] = u[1]>>1 | u[2]<<63 + u[2] = u[2]>>1 | u[3]<<63 + u[3] = u[3]>>1 | u[4]<<63 + u[4] = u[4]>>1 | u[5]<<63 u[5] >>= 1 - t = t2 - t2 = u[4] << 63 - u[4] = (u[4] >> 1) | t - t = t2 - t2 = u[3] << 63 - u[3] = (u[3] >> 1) | t - t = t2 - t2 = u[2] << 63 - u[2] = (u[2] >> 1) | t - t = t2 - t2 = u[1] << 63 - u[1] = (u[1] >> 1) | t - t = t2 - u[0] = (u[0] >> 1) | t if r[0]&1 == 1 { @@ -1079,22 +1082,13 @@ func (z *Element) Inverse(x *Element) *Element { } // r = r >> 1 - t2 = r[5] << 63 + + r[0] = r[0]>>1 | r[1]<<63 + r[1] = r[1]>>1 | r[2]<<63 + r[2] = r[2]>>1 | r[3]<<63 + r[3] = r[3]>>1 | r[4]<<63 + r[4] = r[4]>>1 | r[5]<<63 r[5] >>= 1 - t = t2 - t2 = r[4] << 63 - r[4] = (r[4] >> 1) | t - t = t2 - t2 = r[3] << 63 - r[3] = (r[3] >> 1) | t - t = t2 - t2 = r[2] << 63 - r[2] = (r[2] >> 1) | t - t = t2 - t2 = r[1] << 63 - r[1] = (r[1] >> 1) | t - t = t2 - r[0] = (r[0] >> 
1) | t } @@ -1161,10 +1155,12 @@ func (z *Element) Inverse(x *Element) *Element { } } if (u[0] == 1) && (u[5]|u[4]|u[3]|u[2]|u[1]) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && (v[5]|v[4]|v[3]|v[2]|v[1]) == 0 { - return z.Set(&s) + z.Set(&s) + return } } diff --git a/ecc/bw6-761/fr/element_ops_amd64.go b/ecc/bw6-761/fr/element_ops_amd64.go index 78022b3e6..9ebabc26a 100644 --- a/ecc/bw6-761/fr/element_ops_amd64.go +++ b/ecc/bw6-761/fr/element_ops_amd64.go @@ -48,3 +48,6 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) + +//go:noescape +func inverse(res, x *Element) diff --git a/ecc/bw6-761/fr/element_ops_amd64.s b/ecc/bw6-761/fr/element_ops_amd64.s index 596b552dd..857431795 100644 --- a/ecc/bw6-761/fr/element_ops_amd64.s +++ b/ecc/bw6-761/fr/element_ops_amd64.s @@ -450,3 +450,356 @@ TEXT ·Butterfly(SB), $48-16 MOVQ R8, 32(AX) MOVQ R9, 40(AX) RET + +// inverse(res, x *Element) +TEXT ·inverse(SB), $136-16 + // u = q + // u[0] -> R9 + // u[1] -> R10 + // u[2] -> R11 + // u[3] -> R12 + // u[4] -> R13 + // u[5] -> R14 + MOVQ q<>+0(SB), R9 + MOVQ q<>+8(SB), R10 + MOVQ q<>+16(SB), R11 + MOVQ q<>+24(SB), R12 + MOVQ q<>+32(SB), R13 + MOVQ q<>+40(SB), R14 + + // s = r^2 + // s[0] -> s11-96(SP) + // s[1] -> s12-104(SP) + // s[2] -> s13-112(SP) + // s[3] -> s14-120(SP) + // s[4] -> s15-128(SP) + // s[5] -> s16-136(SP) + MOVQ $0xb786686c9400cd22, R8 + MOVQ R8, s11-96(SP) + MOVQ $0x0329fcaab00431b1, R8 + MOVQ R8, s12-104(SP) + MOVQ $0x22a5f11162d6b46d, R8 + MOVQ R8, s13-112(SP) + MOVQ $0xbfdf7d03827dc3ac, R8 + MOVQ R8, s14-120(SP) + MOVQ $0x837e92f041790bf9, R8 + MOVQ R8, s15-128(SP) + MOVQ $0x006dfccb1e914b88, R8 + MOVQ R8, s16-136(SP) + + // v = x + // v[0] -> R15 + // v[1] -> s0-8(SP) + // v[2] -> s1-16(SP) + // v[3] -> s2-24(SP) + // v[4] -> s3-32(SP) + // v[5] -> s4-40(SP) + MOVQ x+8(FP), R8 + MOVQ 0(R8), AX + MOVQ 8(R8), DX + MOVQ 16(R8), CX + MOVQ 24(R8), BX + MOVQ 32(R8), SI + MOVQ 40(R8), DI + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + MOVQ SI, s3-32(SP) + MOVQ DI, s4-40(SP) + + // if x is 0, returns 0 + MOVQ AX, R8 + ORQ DX, R8 + ORQ CX, R8 + ORQ BX, R8 + ORQ SI, R8 + ORQ DI, R8 + JEQ l7 + + // r = 0 + // r[0] -> s5-48(SP) + // r[1] -> s6-56(SP) + // r[2] -> s7-64(SP) + // r[3] -> s8-72(SP) + // r[4] -> s9-80(SP) + // r[5] -> s10-88(SP) + MOVQ $0, s5-48(SP) + MOVQ $0, s6-56(SP) + MOVQ $0, s7-64(SP) + MOVQ $0, s8-72(SP) + MOVQ $0, s9-80(SP) + MOVQ $0, s10-88(SP) + +l2: + BTQ $0, AX + JCS l8 + MOVQ $0, BP + XORQ R8, R8 + +l9: + INCQ BP + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + BTQ $0, AX + JCC l9 + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + MOVQ SI, s3-32(SP) + MOVQ DI, s4-40(SP) + MOVQ s11-96(SP), AX + MOVQ s12-104(SP), DX + MOVQ s13-112(SP), CX + MOVQ s14-120(SP), BX + MOVQ s15-128(SP), SI + MOVQ s16-136(SP), DI + +l10: + BTQ $0, AX + JCC l11 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l11: + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + DECQ BP + JNE l10 + MOVQ AX, s11-96(SP) + MOVQ DX, s12-104(SP) + MOVQ CX, s13-112(SP) + MOVQ BX, s14-120(SP) + MOVQ SI, s15-128(SP) + MOVQ DI, s16-136(SP) + +l8: + MOVQ R9, AX + MOVQ R10, DX + MOVQ R11, CX + MOVQ R12, BX + MOVQ R13, SI + MOVQ R14, DI + BTQ $0, AX + JCS l12 + MOVQ $0, BP + XORQ R8, R8 + 
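+	// the loop below (l13) halves u one bit per pass, counting passes in BP;
+	// l14/l15 then replay the same number of halvings on r, adding q first
+	// whenever r is odd so the division by two stays exact mod q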
+l13: + INCQ BP + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + BTQ $0, AX + JCC l13 + MOVQ AX, R9 + MOVQ DX, R10 + MOVQ CX, R11 + MOVQ BX, R12 + MOVQ SI, R13 + MOVQ DI, R14 + MOVQ s5-48(SP), AX + MOVQ s6-56(SP), DX + MOVQ s7-64(SP), CX + MOVQ s8-72(SP), BX + MOVQ s9-80(SP), SI + MOVQ s10-88(SP), DI + +l14: + BTQ $0, AX + JCC l15 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l15: + SHRQ $1, AX, R8 + SHRQ $1, DX, AX + SHRQ $1, CX, DX + SHRQ $1, BX, CX + SHRQ $1, SI, BX + SHRQ $1, DI, SI + SHRQ $1, DI + DECQ BP + JNE l14 + MOVQ AX, s5-48(SP) + MOVQ DX, s6-56(SP) + MOVQ CX, s7-64(SP) + MOVQ BX, s8-72(SP) + MOVQ SI, s9-80(SP) + MOVQ DI, s10-88(SP) + +l12: + // v = v - u + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ s3-32(SP), SI + MOVQ s4-40(SP), DI + SUBQ R9, AX + SBBQ R10, DX + SBBQ R11, CX + SBBQ R12, BX + SBBQ R13, SI + SBBQ R14, DI + JCC l3 + SUBQ R15, R9 + SBBQ s0-8(SP), R10 + SBBQ s1-16(SP), R11 + SBBQ s2-24(SP), R12 + SBBQ s3-32(SP), R13 + SBBQ s4-40(SP), R14 + MOVQ s5-48(SP), AX + MOVQ s6-56(SP), DX + MOVQ s7-64(SP), CX + MOVQ s8-72(SP), BX + MOVQ s9-80(SP), SI + MOVQ s10-88(SP), DI + SUBQ s11-96(SP), AX + SBBQ s12-104(SP), DX + SBBQ s13-112(SP), CX + SBBQ s14-120(SP), BX + SBBQ s15-128(SP), SI + SBBQ s16-136(SP), DI + JCC l16 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l16: + MOVQ AX, s5-48(SP) + MOVQ DX, s6-56(SP) + MOVQ CX, s7-64(SP) + MOVQ BX, s8-72(SP) + MOVQ SI, s9-80(SP) + MOVQ DI, s10-88(SP) + JMP l4 + +l3: + MOVQ AX, R15 + MOVQ DX, s0-8(SP) + MOVQ CX, s1-16(SP) + MOVQ BX, s2-24(SP) + MOVQ SI, s3-32(SP) + MOVQ DI, s4-40(SP) + MOVQ s11-96(SP), AX + MOVQ s12-104(SP), DX + MOVQ s13-112(SP), CX + MOVQ s14-120(SP), BX + MOVQ s15-128(SP), SI + MOVQ s16-136(SP), DI + SUBQ s5-48(SP), AX + SBBQ s6-56(SP), DX + SBBQ s7-64(SP), CX + SBBQ s8-72(SP), BX + SBBQ s9-80(SP), SI + SBBQ s10-88(SP), DI + JCC l17 + ADDQ q<>+0(SB), AX + ADCQ q<>+8(SB), DX + ADCQ q<>+16(SB), CX + ADCQ q<>+24(SB), BX + ADCQ q<>+32(SB), SI + ADCQ q<>+40(SB), DI + +l17: + MOVQ AX, s11-96(SP) + MOVQ DX, s12-104(SP) + MOVQ CX, s13-112(SP) + MOVQ BX, s14-120(SP) + MOVQ SI, s15-128(SP) + MOVQ DI, s16-136(SP) + +l4: + MOVQ R9, R8 + SUBQ $1, R8 + ORQ R10, R8 + ORQ R11, R8 + ORQ R12, R8 + ORQ R13, R8 + ORQ R14, R8 + JEQ l5 + MOVQ R15, AX + MOVQ s0-8(SP), DX + MOVQ s1-16(SP), CX + MOVQ s2-24(SP), BX + MOVQ s3-32(SP), SI + MOVQ s4-40(SP), DI + MOVQ AX, R8 + SUBQ $1, R8 + JNE l2 + ORQ DX, R8 + ORQ CX, R8 + ORQ BX, R8 + ORQ SI, R8 + ORQ DI, R8 + JEQ l6 + JMP l2 + +l5: + MOVQ res+0(FP), R8 + MOVQ s5-48(SP), AX + MOVQ s6-56(SP), DX + MOVQ s7-64(SP), CX + MOVQ s8-72(SP), BX + MOVQ s9-80(SP), SI + MOVQ s10-88(SP), DI + MOVQ AX, 0(R8) + MOVQ DX, 8(R8) + MOVQ CX, 16(R8) + MOVQ BX, 24(R8) + MOVQ SI, 32(R8) + MOVQ DI, 40(R8) + RET + +l6: + MOVQ res+0(FP), R8 + MOVQ s11-96(SP), AX + MOVQ s12-104(SP), DX + MOVQ s13-112(SP), CX + MOVQ s14-120(SP), BX + MOVQ s15-128(SP), SI + MOVQ s16-136(SP), DI + MOVQ AX, 0(R8) + MOVQ DX, 8(R8) + MOVQ CX, 16(R8) + MOVQ BX, 24(R8) + MOVQ SI, 32(R8) + MOVQ DI, 40(R8) + RET + +l7: + MOVQ res+0(FP), R8 + MOVQ $0, 0(R8) + MOVQ $0, 8(R8) + MOVQ $0, 16(R8) + MOVQ $0, 24(R8) + MOVQ $0, 32(R8) + MOVQ $0, 40(R8) + RET diff --git a/ecc/bw6-761/fr/element_ops_noasm.go b/ecc/bw6-761/fr/element_ops_noasm.go index ec1fac18d..006365daa 
100644
--- a/ecc/bw6-761/fr/element_ops_noasm.go
+++ b/ecc/bw6-761/fr/element_ops_noasm.go
@@ -51,6 +51,10 @@ func mul(z, x, y *Element) {
 	_mulGeneric(z, x, y)
 }
 
+func inverse(z, x *Element) {
+	_inverseGeneric(z, x)
+}
+
 // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation
 // sets and returns z = z * 1
 func fromMont(z *Element) {
diff --git a/field/asm/amd64/asm_macros.go b/field/asm/amd64/asm_macros.go
index bd27ab905..8a5eeee82 100644
--- a/field/asm/amd64/asm_macros.go
+++ b/field/asm/amd64/asm_macros.go
@@ -85,6 +85,8 @@ GLOBL qInv0<>(SB), (RODATA+NOPTR), $8
 {{- range $i := .NbWordsIndexesFull}}
 	CMOVQCS rb{{$i}}, ra{{$i}}; \
 {{- end}}
+
+
 `
 
 func (f *FFAmd64) GenerateDefines() {
@@ -240,9 +242,14 @@ func helpers() template.FuncMap {
 	return template.FuncMap{
 		"mul": mul,
 		"imm": imm,
+		"sub": sub,
 	}
 }
 
+func sub(a, b int) int {
+	return a - b
+}
+
 func mul(a, b int) int {
 	return a * b
 }
diff --git a/field/asm/amd64/build.go b/field/asm/amd64/build.go
index 86ed46a60..bf54b3e64 100644
--- a/field/asm/amd64/build.go
+++ b/field/asm/amd64/build.go
@@ -170,6 +170,9 @@ func Generate(w io.Writer, F *field.Field) error {
 	// fft butterflies
 	f.generateButterfly()
 
+	// inverse asm
+	f.generateInverse()
+
 	return nil
 }
diff --git a/field/asm/amd64/element_inverse.go b/field/asm/amd64/element_inverse.go
new file mode 100644
index 000000000..fade06285
--- /dev/null
+++ b/field/asm/amd64/element_inverse.go
@@ -0,0 +1,246 @@
+package amd64
+
+import "github.com/consensys/bavard/amd64"
+
+func (f *FFAmd64) generateInverse() {
+	f.Comment("inverse(res, x *Element)")
+
+	// we need r, s, u, v registers, plus one set for subs or reductions
+	stackSize := f.StackSize(f.NbWords*5+1, 0, 0)
+	registers := f.FnHeader("inverse", stackSize, 16)
+	defer f.AssertCleanStack(stackSize, 0)
+
+	t := f.PopN(&registers)
+	zero := f.Pop(&registers)
+	loopCounter := amd64.BP
+
+	// order is important here; for NbWords <= 6, u is going to fit into registers.
+	u := f.PopN(&registers)
+	v := f.PopN(&registers)
+	r := f.PopN(&registers)
+	s := f.PopN(&registers)
+
+	uOnStack := f.NbWords > 6
+
+	// labels
+	startLoop := f.NewLabel()
+	vBigger := f.NewLabel()
+	endLoop := f.NewLabel()
+	returnR := f.NewLabel()
+	returnS := f.NewLabel()
+	returnZero := f.NewLabel()
+
+	// u = q
+	f.Comment("u = q")
+	f.LabelRegisters("u", u...)
+	for i := 0; i < f.NbWords; i++ {
+		if !uOnStack {
+			// u is on registers
+			f.MOVQ(f.qAt(i), u[i])
+		} else {
+			f.MOVQ(f.qAt(i), zero)
+			f.MOVQ(zero, u[i])
+		}
+	}
+
+	// s = r^2
+	f.Comment("s = r^2")
+	f.LabelRegisters("s", s...)
+	for i := 0; i < f.NbWords; i++ {
+		f.MOVQ(f.RSquare[i], zero)
+		f.MOVQ(zero, s[i])
+	}
+
+	// v = x
+	f.Comment("v = x")
+	f.LabelRegisters("v", v...)
+	f.MOVQ("x+8(FP)", zero)
+	f.Mov(zero, t)
+	f.Mov(t, v)
+
+	f.Comment("if x is 0, returns 0")
+	f.MOVQ(t[0], zero)
+	for i := 1; i < len(t); i++ {
+		f.ORQ(t[i], zero)
+	}
+	f.JEQ(returnZero)
+
+	// r = 0
+	f.Comment("r = 0")
+	f.LabelRegisters("r", r...)
+	for i := 0; i < len(r); i++ {
+		f.MOVQ(0, r[i])
+	}
+
+	// rshOne updates a and b as follows:
+	// for a[0]&1 == 0 {
+	// 	a >>= 1
+	// 	if b[0]&1 == 1 {
+	// 		b += q
+	// 	}
+	// 	b >>= 1
+	// }
+	// t must be a set of registers.
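+	// As intuition, a rough scalar equivalent of the two loops rshOne emits
+	// (illustrative pseudo-Go only; a, b, q stand for the multi-word values):
+	//
+	//	k := 0
+	//	for a&1 == 0 {
+	//		a >>= 1
+	//		k++
+	//	}
+	//	for ; k > 0; k-- {
+	//		if b&1 == 1 {
+	//			b += q // keeps the halving exact mod q
+	//		}
+	//		b >>= 1
+	//	}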
+	rshOne := func(a, b, t []amd64.Register) {
+		end := f.NewLabel()
+		firstLoop := f.NewLabel()
+		secondLoop := f.NewLabel()
+
+		// this is done before by the caller
+		// f.Mov(a, t)
+		f.BTQ(0, t[0])
+		f.JCS(end)
+
+		f.MOVQ(0, loopCounter)
+		f.XORQ(zero, zero)
+		f.LABEL(firstLoop)
+		f.INCQ(loopCounter)
+
+		f.SHRQw(1, t[0], zero)
+		for i := 1; i < len(t); i++ {
+			f.SHRQw(1, t[i], t[i-1])
+		}
+		f.SHRQ(1, t[len(t)-1])
+
+		f.BTQ(0, t[0])
+		f.JCC(firstLoop)
+
+		// we need to save the result of the first loop
+		f.Mov(t, a)
+		f.Mov(b, t)
+		// we need to shift r (t) loopCounter times
+		f.LABEL(secondLoop)
+
+		f.BTQ(0, t[0]) // if r[0] is odd, we add modulus
+		f.reduceIfBorrow(t)
+		f.SHRQw(1, t[0], zero)
+		for i := 1; i < len(t); i++ {
+			f.SHRQw(1, t[i], t[i-1])
+		}
+		f.SHRQ(1, t[len(t)-1])
+
+		f.DECQ(loopCounter)
+		f.JNE(secondLoop)
+
+		// save result of second loop
+		f.Mov(t, b)
+
+		f.LABEL(end)
+	}
+
+	f.LABEL(startLoop)
+
+	// note: t always contains v here
+	rshOne(v, s, t)
+
+	f.Mov(u, t)
+	rshOne(u, r, t)
+
+	// f.Push(&registers, loopCounter)
+
+	// v = v - u
+	f.Comment("v = v - u")
+	f.Mov(v, t)
+
+	f.Sub(u, t)
+	f.JCC(vBigger)
+
+	// here v is smaller
+	// u = u - v
+	if !uOnStack {
+		f.Sub(v, u)
+	} else {
+		f.Mov(u, t)
+		f.Sub(v, t)
+		f.Mov(t, u)
+	}
+
+	// r = r - s
+	f.Mov(r, t)
+	f.Sub(s, t)
+	f.reduceIfBorrow(t)
+	f.Mov(t, r)
+	f.JMP(endLoop)
+
+	// here v is bigger
+	f.LABEL(vBigger)
+	// v = v - u
+	f.Mov(t, v)
+	// s = s - r
+	f.Mov(s, t)
+	f.Sub(r, t)
+	f.reduceIfBorrow(t)
+	f.Mov(t, s)
+	f.LABEL(endLoop)
+
+	// if (u[0] == 1) && (u[5]|u[4]|u[3]|u[2]|u[1]) == 0 {
+	// 	return z.Set(&r)
+	// }
+	// if (v[0] == 1) && (v[5]|v[4]|v[3]|v[2]|v[1]) == 0 {
+	// 	return z.Set(&s)
+	// }
+	if !uOnStack {
+		f.MOVQ(u[0], zero)
+		f.SUBQ(1, zero)
+		for i := 1; i < f.NbWords; i++ {
+			f.ORQ(u[i], zero)
+		}
+	} else {
+		f.Mov(u, t)
+		f.SUBQ(1, t[0])
+		last := len(t) - 1
+		for i := 0; i < f.NbWords-1; i++ {
+			f.ORQ(t[i], t[last])
+		}
+	}
+
+	f.JEQ(returnR)
+
+	f.Mov(v, t)
+	f.MOVQ(t[0], zero)
+	f.SUBQ(1, zero)
+	f.JNE(startLoop)
+	for i := 1; i < f.NbWords; i++ {
+		f.ORQ(t[i], zero)
+	}
+	f.JEQ(returnS)
+
+	f.JMP(startLoop)
+
+	f.LABEL(returnR)
+	f.MOVQ("res+0(FP)", zero)
+	f.Mov(r, t)
+	f.Mov(t, zero)
+	f.RET()
+
+	f.LABEL(returnS)
+	f.MOVQ("res+0(FP)", zero)
+	f.Mov(s, t)
+	f.Mov(t, zero)
+	f.RET()
+
+	f.LABEL(returnZero)
+	f.MOVQ("res+0(FP)", zero)
+	for i := 0; i < len(t); i++ {
+		f.MOVQ(0, zero.At(i))
+	}
+	f.RET()
+
+	// f.Push(&registers, flagBorrow)
+	f.Push(&registers, u...)
+	f.Push(&registers, r...)
+	f.Push(&registers, v...)
+	f.Push(&registers, s...)
+	f.Push(&registers, t...)
+	f.Push(&registers, zero)
+}
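+
+// The SHRQw calls above emit the two-operand SHRQ funnel shifts seen in the
+// generated .s files; word by word they compute the same right shift the
+// portable Go path now uses. A hedged sketch of that shift (illustrative
+// only, rsh1 is not a real helper in this package):
+//
+//	func rsh1(a []uint64) {
+//		for i := 0; i < len(a)-1; i++ {
+//			a[i] = a[i]>>1 | a[i+1]<<63
+//		}
+//		a[len(a)-1] >>= 1
+//	}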
+
+func (f *FFAmd64) reduceIfBorrow(t []amd64.Register) {
+	noReduce := f.NewLabel()
+	f.JCC(noReduce)
+	f.ADDQ(f.qAt(0), t[0])
+	for i := 1; i < f.NbWords; i++ {
+		f.ADCQ(f.qAt(i), t[i])
+	}
+	f.LABEL(noReduce)
+}
diff --git a/field/internal/templates/element/base.go b/field/internal/templates/element/base.go
index 4e8ffdc64..e4be477f6 100644
--- a/field/internal/templates/element/base.go
+++ b/field/internal/templates/element/base.go
@@ -475,4 +475,15 @@ func _butterflyGeneric(a, b *{{.ElementName}}) {
 	b.Sub(&t, b)
 }
 
+// BitLen returns the minimum number of bits needed to represent z
+// returns 0 if z == 0
+func (z *{{.ElementName}}) BitLen() int {
+	{{- range $i := reverse .NbWordsIndexesNoZero}}
+	if z[{{$i}}] != 0 {
+		return {{mul $i 64}} + bits.Len64(z[{{$i}}])
+	}
+	{{- end}}
+	return bits.Len64(z[0])
+}
+
 `
diff --git a/field/internal/templates/element/inverse.go b/field/internal/templates/element/inverse.go
index a93bfdef2..8ec60adc6 100644
--- a/field/internal/templates/element/inverse.go
+++ b/field/internal/templates/element/inverse.go
@@ -8,11 +8,15 @@ const Inverse = `
 // Inverse z = x^-1 mod q
 // note: allocates a big.Int (math/big)
 func (z *{{.ElementName}}) Inverse( x *{{.ElementName}}) *{{.ElementName}} {
+	inverse(z, x)
+	return z
+}
+
+func _inverseGeneric(z, x *{{.ElementName}}) {
 	var _xNonMont big.Int
 	x.ToBigIntRegular( &_xNonMont)
 	_xNonMont.ModInverse(&_xNonMont, Modulus())
 	z.SetBigInt(&_xNonMont)
-	return z
 }
 
 {{ else }}
@@ -21,8 +25,19 @@
 // Inverse z = x^-1 mod q
 // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography"
 // if x == 0, sets and returns z = x
 func (z *{{.ElementName}}) Inverse(x *{{.ElementName}}) *{{.ElementName}} {
+	inverse(z, x)
+	return z
+}
+
+
+
+// _inverseGeneric z = x^-1 mod q
+// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography"
+// if x == 0, sets z = 0
+func _inverseGeneric(z, x *{{.ElementName}}) {
 	if x.IsZero() {
-		return z.Set(x)
+		z.SetZero()
+		return
 	}
 
 	// initialize u = q
@@ -42,23 +57,23 @@
 
 	v := *x
 
-	var carry, borrow, t, t2 uint64
+	var carry, borrow uint64
 	var bigger bool
 
 	for {
 		for v[0]&1 == 0 {
-			{{ template "div2" dict "all" . "V" "v"}}
+			{{ rsh "v" .NbWords}}
 			if s[0]&1 == 1 {
 				{{ template "add_q" dict "all" . "V1" "s" }}
 			}
-			{{ template "div2" dict "all" . "V" "s"}}
+			{{ rsh "s" .NbWords}}
 		}
 		for u[0]&1 == 0 {
-			{{ template "div2" dict "all" . "V" "u"}}
+			{{ rsh "u" .NbWords}}
 			if r[0]&1 == 1 {
 				{{ template "add_q" dict "all" . "V1" "r" }}
 			}
-			{{ template "div2" dict "all" . "V" "r"}}
+			{{ rsh "r" .NbWords}}
 		}
 		{{ template "bigger" dict "all" . 
"V1" "v" "V2" "u"}} if bigger { @@ -75,10 +90,12 @@ func (z *{{.ElementName}}) Inverse(x *{{.ElementName}}) *{{.ElementName}} { } } if (u[0] == 1) && ({{- range $i := reverse .NbWordsIndexesNoZero}}u[{{$i}}] {{if eq $i 1}}{{else}} | {{end}}{{end}} ) == 0 { - return z.Set(&r) + z.Set(&r) + return } if (v[0] == 1) && ({{- range $i := reverse .NbWordsIndexesNoZero}}v[{{$i}}] {{if eq $i 1}}{{else}} | {{end}}{{end}} ) == 0 { - return z.Set(&s) + z.Set(&s) + return } } @@ -120,20 +137,15 @@ func (z *{{.ElementName}}) Inverse(x *{{.ElementName}}) *{{.ElementName}} { {{ end }} -{{ define "div2" }} +{{ define "rsh V nbWords" }} // {{$.V}} = {{$.V}} >> 1 - {{- range $i := reverse .all.NbWordsIndexesNoZero}} - {{- if eq $i $.all.NbWordsLastIndex}} - t2 = {{$.V}}[{{$i}}] << 63 - {{$.V}}[{{$i}}] >>= 1 - {{- else}} - t2 = {{$.V}}[{{$i}}] << 63 - {{$.V}}[{{$i}}] = ({{$.V}}[{{$i}}] >> 1) | t + {{$lastIndex := sub .nbWords 1}} + {{- range $i := iterate .nbWords}} + {{- if ne $i $lastIndex}} + {{$.V}}[{{$i}}] = {{$.V}}[{{$i}}] >> 1 | {{$.V}}[{{(add $i 1)}}] << 63 {{- end}} - t = t2 {{- end}} - {{$.V}}[0] = ({{$.V}}[0] >> 1) | t + {{$.V}}[{{$lastIndex}}] >>= 1 {{ end }} - ` diff --git a/field/internal/templates/element/ops.go b/field/internal/templates/element/ops.go index 7bb3744fb..e68b09574 100644 --- a/field/internal/templates/element/ops.go +++ b/field/internal/templates/element/ops.go @@ -38,6 +38,9 @@ func reduce(res *{{.ElementName}}) //go:noescape func Butterfly(a, b *{{.ElementName}}) +//go:noescape +func inverse(res, x *{{.ElementName}}) + {{end}} diff --git a/field/internal/templates/element/ops_generic.go b/field/internal/templates/element/ops_generic.go index 2d26465dc..ccbaa87e9 100644 --- a/field/internal/templates/element/ops_generic.go +++ b/field/internal/templates/element/ops_generic.go @@ -34,6 +34,10 @@ func mul(z, x, y *{{.ElementName}}) { _mulGeneric(z, x, y) } +func inverse(z, x *{{.ElementName}}) { + _inverseGeneric(z, x) +} + // FromMont converts z in place (i.e. 
mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *{{.ElementName}} ) { diff --git a/go.mod b/go.mod index e6fc70005..ceb094c34 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/consensys/gnark-crypto go 1.16 require ( - github.com/consensys/bavard v0.1.8-0.20210806153619-fcffe4ffd871 + github.com/consensys/bavard v0.1.8-0.20210915155054-088da2f7f54a github.com/leanovate/gopter v0.2.9 golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2 golang.org/x/sys v0.0.0-20210420205809-ac73e9fd8988 diff --git a/go.sum b/go.sum index 11e4907b0..558b048d0 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,5 @@ -github.com/consensys/bavard v0.1.8-0.20210806153619-fcffe4ffd871 h1:gfdz2r/E4uQhD8jDUv2SaWQClfzFuZioHGAzPw7oZng= -github.com/consensys/bavard v0.1.8-0.20210806153619-fcffe4ffd871/go.mod h1:Bpd0/3mZuaj6Sj+PqrmIquiOKy397AKGThQPaGzNXAQ= +github.com/consensys/bavard v0.1.8-0.20210915155054-088da2f7f54a h1:AEpwbXTjBGKoqxuQ6QAcBMEuK0+PtajQj0wJkhTnSd0= +github.com/consensys/bavard v0.1.8-0.20210915155054-088da2f7f54a/go.mod h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI= github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c= github.com/leanovate/gopter v0.2.9/go.mod h1:U2L/78B+KVFIx2VmW6onHJQzXtFb+p5y3y2Sh+Jxxv8= golang.org/x/crypto v0.0.0-20210322153248-0c34fe9e7dc2 h1:It14KIkyBFYkHkwZ7k45minvA9aorojkyjGk9KJ5B/w= @@ -11,3 +11,5 @@ golang.org/x/sys v0.0.0-20210420205809-ac73e9fd8988/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +rsc.io/tmplfunc v0.0.3 h1:53XFQh69AfOa8Tw0Jm7t+GV7KZhOi6jzsCzTtKbMvzU= +rsc.io/tmplfunc v0.0.3/go.mod h1:AG3sTPzElb1Io3Yg4voV9AGZJuleGAwaVRxL9M49PhA= From 67e5245639ec9765048286e913bddfeb3b3c3694 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Wed, 15 Sep 2021 18:27:46 +0200 Subject: [PATCH 05/11] perf(bls12,bls24): uses a mix of Granger-Scott and Karabina in Expt --- ecc/bls12-377/internal/fptower/e12_pairing.go | 50 +++++++++---------- ecc/bls12-381/internal/fptower/e12_pairing.go | 9 +++- ecc/bls24-315/internal/fptower/e24_pairing.go | 9 +++- 3 files changed, 40 insertions(+), 28 deletions(-) diff --git a/ecc/bls12-377/internal/fptower/e12_pairing.go b/ecc/bls12-377/internal/fptower/e12_pairing.go index 37fe796ca..55c932879 100644 --- a/ecc/bls12-377/internal/fptower/e12_pairing.go +++ b/ecc/bls12-377/internal/fptower/e12_pairing.go @@ -1,5 +1,17 @@ package fptower +func (z *E12) nSquare(n int) { + for i := 0; i < n; i++ { + z.CyclotomicSquare(z) + } +} + +func (z *E12) nSquareCompressed(n int) { + for i := 0; i < n; i++ { + z.CyclotomicSquareCompressed(z) + } +} + // Expt set z to x^t in E12 and return z func (z *E12) Expt(x *E12) *E12 { // const tAbsVal uint64 = 9586122913090633729 @@ -10,34 +22,20 @@ func (z *E12) Expt(x *E12) *E12 { var result, x33 E12 // a shortest addition chain for 136227 - result.Set(x) // 0 1 - result.CyclotomicSquare(&result) // 1( 0) 2 - result.CyclotomicSquare(&result) // 2( 1) 4 - result.CyclotomicSquare(&result) // 3( 2) 8 - result.CyclotomicSquare(&result) // 4( 3) 16 - result.CyclotomicSquare(&result) // 5( 4) 32 - result.Mul(&result, x) // 6( 5, 0) 33 - x33.Set(&result) // save x33 for step 14 - result.CyclotomicSquare(&result) // 7( 6) 66 - result.CyclotomicSquare(&result) 
// 8( 7) 132 - result.CyclotomicSquare(&result) // 9( 8) 264 - result.CyclotomicSquare(&result) // 10( 9) 528 - result.CyclotomicSquare(&result) // 11(10) 1056 - result.CyclotomicSquare(&result) // 12(11) 2112 - result.CyclotomicSquare(&result) // 13(12) 4224 - result.Mul(&result, &x33) // 14(13, 6) 4257 - result.CyclotomicSquare(&result) // 15(14) 8514 - result.CyclotomicSquare(&result) // 16(15) 17028 - result.CyclotomicSquare(&result) // 17(16) 34056 - result.CyclotomicSquare(&result) // 18(17) 68112 - result.Mul(&result, x) // 19(18, 0) 68113 - result.CyclotomicSquare(&result) // 20(19) 136226 - result.Mul(&result, x) // 21(20, 0) 136227 + result.Set(x) + result.nSquare(5) + result.Mul(&result, x) + x33.Set(&result) + result.nSquare(7) + result.Mul(&result, &x33) + result.nSquare(4) + result.Mul(&result, x) + result.CyclotomicSquare(&result) + result.Mul(&result, x) // the remaining 46 bits - for i := 0; i < 46; i++ { - result.CyclotomicSquare(&result) - } + result.nSquareCompressed(46) + result.Decompress(&result) result.Mul(&result, x) z.Set(&result) diff --git a/ecc/bls12-381/internal/fptower/e12_pairing.go b/ecc/bls12-381/internal/fptower/e12_pairing.go index 948ba0771..500e3bde2 100644 --- a/ecc/bls12-381/internal/fptower/e12_pairing.go +++ b/ecc/bls12-381/internal/fptower/e12_pairing.go @@ -6,6 +6,12 @@ func (z *E12) nSquare(n int) { } } +func (z *E12) nSquareCompressed(n int) { + for i := 0; i < n; i++ { + z.CyclotomicSquareCompressed(z) + } +} + // ExptHalf set z to x^(t/2) in E12 and return z // const t/2 uint64 = 7566188111470821376 // negative func (z *E12) ExptHalf(x *E12) *E12 { @@ -18,7 +24,8 @@ func (z *E12) ExptHalf(x *E12) *E12 { result.Mul(&result, x) result.nSquare(9) result.Mul(&result, x) - result.nSquare(32) + result.nSquareCompressed(32) + result.Decompress(&result) result.Mul(&result, x) result.nSquare(15) return z.Conjugate(&result) // because tAbsVal is negative diff --git a/ecc/bls24-315/internal/fptower/e24_pairing.go b/ecc/bls24-315/internal/fptower/e24_pairing.go index 04bf8275b..ac82fc805 100644 --- a/ecc/bls24-315/internal/fptower/e24_pairing.go +++ b/ecc/bls24-315/internal/fptower/e24_pairing.go @@ -6,6 +6,12 @@ func (z *E24) nSquare(n int) { } } +func (z *E24) nSquareCompressed(n int) { + for i := 0; i < n; i++ { + z.CyclotomicSquareCompressed(z) + } +} + // Expt set z to x^t in E24 and return z (t is the seed of the curve) func (z *E24) Expt(x *E24) *E24 { @@ -19,7 +25,8 @@ func (z *E24) Expt(x *E24) *E24 { result.Mul(&result, &xInv) result.nSquare(2) result.Mul(&result, x) - result.nSquare(20) + result.nSquareCompressed(20) + result.Decompress(&result) result.Mul(&result, &xInv) z.Conjugate(&result) From 4dd427d81398a41ec79ebeb434dcdc7384ff0bf4 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Sun, 19 Sep 2021 15:08:59 +0200 Subject: [PATCH 06/11] feat(e12): implements batch decompression for karabina cyclo square --- ecc/bls12-377/internal/fptower/e12.go | 56 +++++++++++ ecc/bls12-377/internal/fptower/e12_test.go | 26 +++++ ecc/bls12-377/internal/fptower/e2.go | 34 +++++++ ecc/bls12-377/internal/fptower/e2_test.go | 14 +++ ecc/bls12-381/internal/fptower/e12.go | 56 +++++++++++ ecc/bls12-381/internal/fptower/e12_test.go | 26 +++++ ecc/bls12-381/internal/fptower/e2.go | 34 +++++++ ecc/bls12-381/internal/fptower/e2_test.go | 14 +++ ecc/bn254/internal/fptower/e12.go | 56 +++++++++++ ecc/bn254/internal/fptower/e12_pairing.go | 98 +++++++------------ ecc/bn254/internal/fptower/e12_test.go | 26 +++++ ecc/bn254/internal/fptower/e2.go | 34 +++++++ 
ecc/bn254/internal/fptower/e2_test.go | 14 +++ .../template/fq12over6over2/fq12.go.tmpl | 56 +++++++++++ .../tower/template/fq12over6over2/fq2.go.tmpl | 50 ++++++++-- .../fq12over6over2/tests/fq12.go.tmpl | 26 +++++ .../template/fq12over6over2/tests/fq2.go.tmpl | 18 +++- 17 files changed, 566 insertions(+), 72 deletions(-) diff --git a/ecc/bls12-377/internal/fptower/e12.go b/ecc/bls12-377/internal/fptower/e12.go index e402f40c0..0f6b8b7e5 100644 --- a/ecc/bls12-377/internal/fptower/e12.go +++ b/ecc/bls12-377/internal/fptower/e12.go @@ -256,6 +256,62 @@ func (z *E12) Decompress(x *E12) *E12 { return z } +// BatchDecompress multiple Karabina's cyclotomic square results +func BatchDecompress(x []E12) []E12 { + + n := len(x) + if n == 0 { + return x + } + + t0 := make([]E2, n) + t1 := make([]E2, n) + t2 := make([]E2, n) + + var one E2 + one.SetOne() + + for i := 0; i < n; i++ { + // t0 = g1^2 + t0[i].Square(&x[i].C0.B1) + // t1 = 3 * g1^2 - 2 * g2 + t1[i].Sub(&t0[i], &x[i].C0.B2). + Double(&t1[i]). + Add(&t1[i], &t0[i]) + // t0 = E * g5^2 + t1 + t2[i].Square(&x[i].C1.B2) + t0[i].MulByNonResidue(&t2[i]). + Add(&t0[i], &t1[i]) + // t1 = 4 * g3 + t1[i].Double(&x[i].C1.B0). + Double(&t1[i]) + } + + t1 = BatchInvert(t1) // costs 1 inverse + + for i := 0; i < n; i++ { + // z4 = g4 + x[i].C1.B1.Mul(&t0[i], &t1[i]) + + // t1 = g2 * g1 + t1[i].Mul(&x[i].C0.B2, &x[i].C0.B1) + // t2 = 2 * g4^2 - 3 * g2 * g1 + t2[i].Square(&x[i].C1.B1) + t2[i].Sub(&t2[i], &t1[i]) + t2[i].Double(&t2[i]) + t2[i].Sub(&t2[i], &t1[i]) + + // t1 = g3 * g5 + t1[i].Mul(&x[i].C1.B0, &x[i].C1.B2) + // z0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + t2[i].Add(&t2[i], &t1[i]) + x[i].C0.B0.MulByNonResidue(&t2[i]). + Add(&x[i].C0.B0, &one) + } + + return x +} + // Granger-Scott's cyclotomic square // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E12) CyclotomicSquare(x *E12) *E12 { diff --git a/ecc/bls12-377/internal/fptower/e12_test.go b/ecc/bls12-377/internal/fptower/e12_test.go index 957b9cbaa..a7cfec865 100644 --- a/ecc/bls12-377/internal/fptower/e12_test.go +++ b/ecc/bls12-377/internal/fptower/e12_test.go @@ -323,6 +323,32 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BLS12-377] batch decompress and individual decompress (Karabina) should be the same", prop.ForAll( + func(a *E12) bool { + var b E12 + // put in the cyclotomic subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var a2, a4, a17 E12 + a2.Set(a) + a4.Set(a) + a17.Set(a) + a2.nSquareCompressed(2) + a4.nSquareCompressed(4) + a17.nSquareCompressed(17) + batch := BatchDecompress([]E12{a2, a4, a17}) + a2.Decompress(&a2) + a4.Decompress(&a4) + a17.Decompress(&a17) + + return a2.Equal(&batch[0]) && a4.Equal(&batch[1]) && a17.Equal(&batch[2]) + }, + genA, + )) + properties.Property("[BLS12-377] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bls12-377/internal/fptower/e2.go b/ecc/bls12-377/internal/fptower/e2.go index 6c364298b..78327f4c4 100644 --- a/ecc/bls12-377/internal/fptower/e2.go +++ b/ecc/bls12-377/internal/fptower/e2.go @@ -220,3 +220,37 @@ func (z *E2) Sqrt(x *E2) *E2 { return z } + +// BatchInvert returns a new slice with every element inverted. 
+// Uses Montgomery batch inversion trick +func BatchInvert(a []E2) []E2 { + res := make([]E2, len(a)) + if len(a) == 0 { + return res + } + + zeroes := make([]bool, len(a)) + var accumulator E2 + accumulator.SetOne() + + for i := 0; i < len(a); i++ { + if a[i].IsZero() { + zeroes[i] = true + continue + } + res[i].Set(&accumulator) + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := len(a) - 1; i >= 0; i-- { + if zeroes[i] { + continue + } + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } + + return res +} diff --git a/ecc/bls12-377/internal/fptower/e2_test.go b/ecc/bls12-377/internal/fptower/e2_test.go index bd079af86..ef16d7a4d 100644 --- a/ecc/bls12-377/internal/fptower/e2_test.go +++ b/ecc/bls12-377/internal/fptower/e2_test.go @@ -248,6 +248,20 @@ func TestE2Ops(t *testing.T) { genB, )) + properties.Property("[BLS12-377] BatchInvert should output the same result as Inverse", prop.ForAll( + func(a, b, c *E2) bool { + + batch := BatchInvert([]E2{*a, *b, *c}) + a.Inverse(a) + b.Inverse(b) + c.Inverse(c) + return a.Equal(&batch[0]) && b.Equal(&batch[1]) && c.Equal(&batch[2]) + }, + genA, + genA, + genA, + )) + properties.Property("[BLS12-377] inverse twice should leave an element invariant", prop.ForAll( func(a *E2) bool { var b E2 diff --git a/ecc/bls12-381/internal/fptower/e12.go b/ecc/bls12-381/internal/fptower/e12.go index 6886c5762..983000524 100644 --- a/ecc/bls12-381/internal/fptower/e12.go +++ b/ecc/bls12-381/internal/fptower/e12.go @@ -256,6 +256,62 @@ func (z *E12) Decompress(x *E12) *E12 { return z } +// BatchDecompress multiple Karabina's cyclotomic square results +func BatchDecompress(x []E12) []E12 { + + n := len(x) + if n == 0 { + return x + } + + t0 := make([]E2, n) + t1 := make([]E2, n) + t2 := make([]E2, n) + + var one E2 + one.SetOne() + + for i := 0; i < n; i++ { + // t0 = g1^2 + t0[i].Square(&x[i].C0.B1) + // t1 = 3 * g1^2 - 2 * g2 + t1[i].Sub(&t0[i], &x[i].C0.B2). + Double(&t1[i]). + Add(&t1[i], &t0[i]) + // t0 = E * g5^2 + t1 + t2[i].Square(&x[i].C1.B2) + t0[i].MulByNonResidue(&t2[i]). + Add(&t0[i], &t1[i]) + // t1 = 4 * g3 + t1[i].Double(&x[i].C1.B0). + Double(&t1[i]) + } + + t1 = BatchInvert(t1) // costs 1 inverse + + for i := 0; i < n; i++ { + // z4 = g4 + x[i].C1.B1.Mul(&t0[i], &t1[i]) + + // t1 = g2 * g1 + t1[i].Mul(&x[i].C0.B2, &x[i].C0.B1) + // t2 = 2 * g4^2 - 3 * g2 * g1 + t2[i].Square(&x[i].C1.B1) + t2[i].Sub(&t2[i], &t1[i]) + t2[i].Double(&t2[i]) + t2[i].Sub(&t2[i], &t1[i]) + + // t1 = g3 * g5 + t1[i].Mul(&x[i].C1.B0, &x[i].C1.B2) + // z0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + t2[i].Add(&t2[i], &t1[i]) + x[i].C0.B0.MulByNonResidue(&t2[i]). 
+ Add(&x[i].C0.B0, &one) + } + + return x +} + // Granger-Scott's cyclotomic square // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E12) CyclotomicSquare(x *E12) *E12 { diff --git a/ecc/bls12-381/internal/fptower/e12_test.go b/ecc/bls12-381/internal/fptower/e12_test.go index e5cfd97f9..6901a716e 100644 --- a/ecc/bls12-381/internal/fptower/e12_test.go +++ b/ecc/bls12-381/internal/fptower/e12_test.go @@ -323,6 +323,32 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BLS12-381] batch decompress and individual decompress (Karabina) should be the same", prop.ForAll( + func(a *E12) bool { + var b E12 + // put in the cyclotomic subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var a2, a4, a17 E12 + a2.Set(a) + a4.Set(a) + a17.Set(a) + a2.nSquareCompressed(2) + a4.nSquareCompressed(4) + a17.nSquareCompressed(17) + batch := BatchDecompress([]E12{a2, a4, a17}) + a2.Decompress(&a2) + a4.Decompress(&a4) + a17.Decompress(&a17) + + return a2.Equal(&batch[0]) && a4.Equal(&batch[1]) && a17.Equal(&batch[2]) + }, + genA, + )) + properties.Property("[BLS12-381] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bls12-381/internal/fptower/e2.go b/ecc/bls12-381/internal/fptower/e2.go index a7a17d467..74a18d560 100644 --- a/ecc/bls12-381/internal/fptower/e2.go +++ b/ecc/bls12-381/internal/fptower/e2.go @@ -221,3 +221,37 @@ func (z *E2) Sqrt(x *E2) *E2 { z.Set(&b) return z } + +// BatchInvert returns a new slice with every element inverted. +// Uses Montgomery batch inversion trick +func BatchInvert(a []E2) []E2 { + res := make([]E2, len(a)) + if len(a) == 0 { + return res + } + + zeroes := make([]bool, len(a)) + var accumulator E2 + accumulator.SetOne() + + for i := 0; i < len(a); i++ { + if a[i].IsZero() { + zeroes[i] = true + continue + } + res[i].Set(&accumulator) + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := len(a) - 1; i >= 0; i-- { + if zeroes[i] { + continue + } + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } + + return res +} diff --git a/ecc/bls12-381/internal/fptower/e2_test.go b/ecc/bls12-381/internal/fptower/e2_test.go index 68c6f218a..667da416f 100644 --- a/ecc/bls12-381/internal/fptower/e2_test.go +++ b/ecc/bls12-381/internal/fptower/e2_test.go @@ -248,6 +248,20 @@ func TestE2Ops(t *testing.T) { genB, )) + properties.Property("[BLS12-381] BatchInvert should output the same result as Inverse", prop.ForAll( + func(a, b, c *E2) bool { + + batch := BatchInvert([]E2{*a, *b, *c}) + a.Inverse(a) + b.Inverse(b) + c.Inverse(c) + return a.Equal(&batch[0]) && b.Equal(&batch[1]) && c.Equal(&batch[2]) + }, + genA, + genA, + genA, + )) + properties.Property("[BLS12-381] mulGeneric & mul should be equal", prop.ForAll( func(a, b *E2) bool { var c, d E2 diff --git a/ecc/bn254/internal/fptower/e12.go b/ecc/bn254/internal/fptower/e12.go index 64f1e99f3..572c9eade 100644 --- a/ecc/bn254/internal/fptower/e12.go +++ b/ecc/bn254/internal/fptower/e12.go @@ -256,6 +256,62 @@ func (z *E12) Decompress(x *E12) *E12 { return z } +// BatchDecompress multiple Karabina's cyclotomic square results +func BatchDecompress(x []E12) []E12 { + + n := len(x) + if n == 0 { + return x + } + + t0 := make([]E2, n) + t1 := make([]E2, n) + t2 := make([]E2, n) + + var one E2 + one.SetOne() + + for i := 0; i < n; i++ { + // t0 = g1^2 + t0[i].Square(&x[i].C0.B1) + // t1 = 3 * g1^2 - 2 * g2 + t1[i].Sub(&t0[i], &x[i].C0.B2). + Double(&t1[i]). 
+ Add(&t1[i], &t0[i]) + // t0 = E * g5^2 + t1 + t2[i].Square(&x[i].C1.B2) + t0[i].MulByNonResidue(&t2[i]). + Add(&t0[i], &t1[i]) + // t1 = 4 * g3 + t1[i].Double(&x[i].C1.B0). + Double(&t1[i]) + } + + t1 = BatchInvert(t1) // costs 1 inverse + + for i := 0; i < n; i++ { + // z4 = g4 + x[i].C1.B1.Mul(&t0[i], &t1[i]) + + // t1 = g2 * g1 + t1[i].Mul(&x[i].C0.B2, &x[i].C0.B1) + // t2 = 2 * g4^2 - 3 * g2 * g1 + t2[i].Square(&x[i].C1.B1) + t2[i].Sub(&t2[i], &t1[i]) + t2[i].Double(&t2[i]) + t2[i].Sub(&t2[i], &t1[i]) + + // t1 = g3 * g5 + t1[i].Mul(&x[i].C1.B0, &x[i].C1.B2) + // z0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + t2[i].Add(&t2[i], &t1[i]) + x[i].C0.B0.MulByNonResidue(&t2[i]). + Add(&x[i].C0.B0, &one) + } + + return x +} + // Granger-Scott's cyclotomic square // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E12) CyclotomicSquare(x *E12) *E12 { diff --git a/ecc/bn254/internal/fptower/e12_pairing.go b/ecc/bn254/internal/fptower/e12_pairing.go index 6db3e7716..eb7e90716 100644 --- a/ecc/bn254/internal/fptower/e12_pairing.go +++ b/ecc/bn254/internal/fptower/e12_pairing.go @@ -1,95 +1,69 @@ package fptower +func (z *E12) nSquare(n int) { + for i := 0; i < n; i++ { + z.CyclotomicSquare(z) + } +} + +func (z *E12) nSquareCompressed(n int) { + for i := 0; i < n; i++ { + z.CyclotomicSquareCompressed(z) + } +} + // Expt set z to x^t in E12 and return z (t is the generator of the BN curve) func (z *E12) Expt(x *E12) *E12 { var result, xInv E12 xInv.Conjugate(x) - result.CyclotomicSquare(x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.Set(x) + result.nSquare(4) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(3) result.Mul(&result, &xInv) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, &xInv) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(3) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, &xInv) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, &xInv) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, &xInv) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(4) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(3) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) 
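This hunk keeps the exact square-and-multiply schedule and only groups it: each run of doublings in the signed-binary expansion of the seed becomes a single nSquare(n), and each nonzero digit one Mul by x or by its conjugate. A hedged sketch of the underlying pattern, with the digit slice as an assumed input (exptSignedDigits is illustrative, not part of the patch):

// exptSignedDigits computes x^t for t given as signed digits, most
// significant first. In the cyclotomic subgroup Conjugate(x) = x^-1,
// so digit -1 multiplies by the conjugate instead of a true inverse.
func exptSignedDigits(z, x *E12, digits []int8) *E12 {
	var xInv E12
	xInv.Conjugate(x)
	var res E12
	res.SetOne()
	for _, d := range digits {
		res.CyclotomicSquare(&res)
		if d == 1 {
			res.Mul(&res, x)
		} else if d == -1 {
			res.Mul(&res, &xInv)
		}
	}
	return z.Set(&res)
}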
result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, &xInv) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(3) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(5) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(2) result.Mul(&result, x) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(5) result.Mul(&result, &xInv) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) - result.CyclotomicSquare(&result) + result.nSquare(4) z.Mul(&result, x) return z diff --git a/ecc/bn254/internal/fptower/e12_test.go b/ecc/bn254/internal/fptower/e12_test.go index 14d126d2f..6c56c8680 100644 --- a/ecc/bn254/internal/fptower/e12_test.go +++ b/ecc/bn254/internal/fptower/e12_test.go @@ -323,6 +323,32 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[BN254] batch decompress and individual decompress (Karabina) should be the same", prop.ForAll( + func(a *E12) bool { + var b E12 + // put in the cyclotomic subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var a2, a4, a17 E12 + a2.Set(a) + a4.Set(a) + a17.Set(a) + a2.nSquareCompressed(2) + a4.nSquareCompressed(4) + a17.nSquareCompressed(17) + batch := BatchDecompress([]E12{a2, a4, a17}) + a2.Decompress(&a2) + a4.Decompress(&a4) + a17.Decompress(&a17) + + return a2.Equal(&batch[0]) && a4.Equal(&batch[1]) && a17.Equal(&batch[2]) + }, + genA, + )) + properties.Property("[BN254] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/ecc/bn254/internal/fptower/e2.go b/ecc/bn254/internal/fptower/e2.go index 76b508bd3..03a20afe3 100644 --- a/ecc/bn254/internal/fptower/e2.go +++ b/ecc/bn254/internal/fptower/e2.go @@ -221,3 +221,37 @@ func (z *E2) Sqrt(x *E2) *E2 { z.Set(&b) return z } + +// BatchInvert returns a new slice with every element inverted. 
+// Uses Montgomery batch inversion trick +func BatchInvert(a []E2) []E2 { + res := make([]E2, len(a)) + if len(a) == 0 { + return res + } + + zeroes := make([]bool, len(a)) + var accumulator E2 + accumulator.SetOne() + + for i := 0; i < len(a); i++ { + if a[i].IsZero() { + zeroes[i] = true + continue + } + res[i].Set(&accumulator) + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := len(a) - 1; i >= 0; i-- { + if zeroes[i] { + continue + } + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } + + return res +} diff --git a/ecc/bn254/internal/fptower/e2_test.go b/ecc/bn254/internal/fptower/e2_test.go index c31c8806c..81b389e1f 100644 --- a/ecc/bn254/internal/fptower/e2_test.go +++ b/ecc/bn254/internal/fptower/e2_test.go @@ -246,6 +246,20 @@ func TestE2Ops(t *testing.T) { genB, )) + properties.Property("[BN254] BatchInvert should output the same result as Inverse", prop.ForAll( + func(a, b, c *E2) bool { + + batch := BatchInvert([]E2{*a, *b, *c}) + a.Inverse(a) + b.Inverse(b) + c.Inverse(c) + return a.Equal(&batch[0]) && b.Equal(&batch[1]) && c.Equal(&batch[2]) + }, + genA, + genA, + genA, + )) + properties.Property("[BN254] mulGeneric & mul should be equal", prop.ForAll( func(a, b *E2) bool { var c, d E2 diff --git a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl index c116e2cfc..6b336c45e 100644 --- a/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq12.go.tmpl @@ -238,6 +238,62 @@ func (z *E12) Decompress(x *E12) *E12 { return z } +// BatchDecompress multiple Karabina's cyclotomic square results +func BatchDecompress(x []E12) []E12 { + + n := len(x) + if n == 0 { + return x + } + + t0 := make([]E2, n) + t1 := make([]E2, n) + t2 := make([]E2, n) + + var one E2 + one.SetOne() + + for i := 0; i < n; i++ { + // t0 = g1^2 + t0[i].Square(&x[i].C0.B1) + // t1 = 3 * g1^2 - 2 * g2 + t1[i].Sub(&t0[i], &x[i].C0.B2). + Double(&t1[i]). + Add(&t1[i], &t0[i]) + // t0 = E * g5^2 + t1 + t2[i].Square(&x[i].C1.B2) + t0[i].MulByNonResidue(&t2[i]). + Add(&t0[i], &t1[i]) + // t1 = 4 * g3 + t1[i].Double(&x[i].C1.B0). + Double(&t1[i]) + } + + t1 = BatchInvert(t1) // costs 1 inverse + + for i := 0; i < n; i++ { + // z4 = g4 + x[i].C1.B1.Mul(&t0[i], &t1[i]) + + // t1 = g2 * g1 + t1[i].Mul(&x[i].C0.B2, &x[i].C0.B1) + // t2 = 2 * g4^2 - 3 * g2 * g1 + t2[i].Square(&x[i].C1.B1) + t2[i].Sub(&t2[i], &t1[i]) + t2[i].Double(&t2[i]) + t2[i].Sub(&t2[i], &t1[i]) + + // t1 = g3 * g5 + t1[i].Mul(&x[i].C1.B0, &x[i].C1.B2) + // z0 = E * (2 * g4^2 + g3 * g5 - 3 * g2 * g1) + 1 + t2[i].Add(&t2[i], &t1[i]) + x[i].C0.B0.MulByNonResidue(&t2[i]). 
+ Add(&x[i].C0.B0, &one) + } + + return x +} + // Granger-Scott's cyclotomic square // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E12) CyclotomicSquare(x *E12) *E12 { diff --git a/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl b/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl index 6963f2de6..ed4e5ef12 100644 --- a/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/fq2.go.tmpl @@ -24,7 +24,7 @@ func (z *E2) Equal(x *E2) bool { func (z *E2) Cmp(x *E2) int { if a1 := z.A1.Cmp(&x.A1); a1 != 0 { return a1 - } + } return z.A0.Cmp(&x.A0) } @@ -69,10 +69,10 @@ func (z *E2) SetOne() *E2 { // SetRandom sets a0 and a1 to random values func (z *E2) SetRandom() (*E2, error) { if _, err := z.A0.SetRandom(); err != nil { - return nil, err + return nil, err } if _, err := z.A1.SetRandom(); err != nil { - return nil, err + return nil, err } return z, nil } @@ -163,7 +163,7 @@ func (z *E2) Exp(x E2, exponent *big.Int) *E2 { } } } - + return z } @@ -189,7 +189,7 @@ func (z *E2) Exp(x E2, exponent *big.Int) *E2 { var a1, alpha, b, x0, minusone E2 minusone.SetOne().Neg(&minusone) - + a1.Exp(*x, &sqrtExp1) alpha.Square(&a1). Mul(&alpha, x) @@ -203,7 +203,7 @@ func (z *E2) Exp(x E2, exponent *big.Int) *E2 { } a1.SetOne() b.Add(&a1, &alpha) - + b.Exp(b, &sqrtExp2).Mul(&x0, &b) z.Set(&b) return z @@ -219,9 +219,9 @@ func (z *E2) Exp(x E2, exponent *big.Int) *E2 { // precomputation var b, c, d, e, f, x0 E2 var _b, o fp.Element - + // c must be a non square (works for p=1 mod 12 hence 1 mod 4, only bls377 has such a p currently) - c.A1.SetOne() + c.A1.SetOne() q := fp.Modulus() var exp, one big.Int @@ -249,3 +249,37 @@ func (z *E2) Exp(x E2, exponent *big.Int) *E2 { return z } {{end}} + +// BatchInvert returns a new slice with every element inverted. 
+// Uses Montgomery batch inversion trick +func BatchInvert(a []E2) []E2 { + res := make([]E2, len(a)) + if len(a) == 0 { + return res + } + + zeroes := make([]bool, len(a)) + var accumulator E2 + accumulator.SetOne() + + for i := 0; i < len(a); i++ { + if a[i].IsZero() { + zeroes[i] = true + continue + } + res[i].Set(&accumulator) + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := len(a) - 1; i >= 0; i-- { + if zeroes[i] { + continue + } + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } + + return res +} diff --git a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl index 793d33124..126132c0d 100644 --- a/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/tests/fq12.go.tmpl @@ -305,6 +305,32 @@ func TestE12Ops(t *testing.T) { genA, )) + properties.Property("[{{ toUpper .Name }}] batch decompress and individual decompress (Karabina) should be the same", prop.ForAll( + func(a *E12) bool { + var b E12 + // put in the cyclotomic subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusSquare(&b).Mul(a, &b) + + var a2, a4, a17 E12 + a2.Set(a) + a4.Set(a) + a17.Set(a) + a2.nSquareCompressed(2) + a4.nSquareCompressed(4) + a17.nSquareCompressed(17) + batch := BatchDecompress([]E12{a2, a4, a17}) + a2.Decompress(&a2) + a4.Decompress(&a4) + a17.Decompress(&a17) + + return a2.Equal(&batch[0]) && a4.Equal(&batch[1]) && a17.Equal(&batch[2]) + }, + genA, + )) + properties.Property("[{{ toUpper .Name }}] Frobenius of x in E12 should be equal to x^q", prop.ForAll( func(a *E12) bool { var b, c E12 diff --git a/internal/generator/tower/template/fq12over6over2/tests/fq2.go.tmpl b/internal/generator/tower/template/fq12over6over2/tests/fq2.go.tmpl index c0adec39e..aa174990c 100644 --- a/internal/generator/tower/template/fq12over6over2/tests/fq2.go.tmpl +++ b/internal/generator/tower/template/fq12over6over2/tests/fq2.go.tmpl @@ -183,7 +183,7 @@ func TestE2MulMaxed(t *testing.T) { b.A0 = fpMaxValue b.A1 = fpMaxValue - + var c, d E2 d.Inverse(&b) @@ -228,6 +228,20 @@ func TestE2Ops(t *testing.T) { genB, )) + properties.Property("[{{ toUpper .Name }}] BatchInvert should output the same result as Inverse", prop.ForAll( + func(a, b, c *E2) bool { + + batch := BatchInvert([]E2{*a, *b, *c}) + a.Inverse(a) + b.Inverse(b) + c.Inverse(c) + return a.Equal(&batch[0]) && b.Equal(&batch[1]) && c.Equal(&batch[2]) + }, + genA, + genA, + genA, + )) + {{if or (eq .Name "bn254") (eq .Name "bls12-381")}} properties.Property("[{{ toUpper .Name }}] mulGeneric & mul should be equal", prop.ForAll( @@ -358,7 +372,7 @@ func TestE2Ops(t *testing.T) { cmpResult := a.Cmp(&negA) lResult := a.LexicographicallyLargest() if lResult && cmpResult == 1 { - return true + return true } if !lResult && cmpResult !=1 { return true From 170f69b89cc78587f29a7c155f113afaec3eb8cc Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Sun, 19 Sep 2021 19:44:38 +0200 Subject: [PATCH 07/11] feat(e24): implements batch decompression for karabina cyclo square --- ecc/bls24-315/internal/fptower/e24.go | 56 +++++++++++++++++++ ecc/bls24-315/internal/fptower/e24_pairing.go | 28 ++++++---- ecc/bls24-315/internal/fptower/e24_test.go | 26 +++++++++ ecc/bls24-315/internal/fptower/e4.go | 34 +++++++++++ ecc/bls24-315/internal/fptower/e4_test.go | 14 +++++ 5 files changed, 146 insertions(+), 12 deletions(-) diff --git 
a/ecc/bls24-315/internal/fptower/e24.go b/ecc/bls24-315/internal/fptower/e24.go index 4e3159eef..10e0e7513 100644 --- a/ecc/bls24-315/internal/fptower/e24.go +++ b/ecc/bls24-315/internal/fptower/e24.go @@ -290,6 +290,62 @@ func (z *E24) Decompress(x *E24) *E24 { return z } +// BatchDecompress multiple Karabina's cyclotomic square results +func BatchDecompress(x []E24) []E24 { + + n := len(x) + if n == 0 { + return x + } + + t0 := make([]E4, n) + t1 := make([]E4, n) + t2 := make([]E4, n) + + var one E4 + one.SetOne() + + for i := 0; i < n; i++ { + // t0 = g4^2 + t0[i].Square(&x[i].D2.C0) + // t1 = 3 * g4^2 - 2 * g3 + t1[i].Sub(&t0[i], &x[i].D1.C1). + Double(&t1[i]). + Add(&t1[i], &t0[i]) + // t0 = E * g5^2 + t1 + t2[i].Square(&x[i].D2.C1) + t0[i].MulByNonResidue(&t2[i]). + Add(&t0[i], &t1[i]) + // t1 = 4 * g2 + t1[i].Double(&x[i].D1.C0). + Double(&t1[i]) + } + + t1 = BatchInvert(t1) // costs 1 inverse + + for i := 0; i < n; i++ { + // z4 = g1 + x[i].D0.C1.Mul(&t0[i], &t1[i]) + + // t1 = g3 * g1 + t1[i].Mul(&x[i].D1.C1, &x[i].D2.C0) + // t2 = 2 * g4^2 - 3 * g2 * g1 + t2[i].Square(&x[i].D0.C1). + Sub(&t2[i], &t1[i]). + Double(&t2[i]). + Sub(&t2[i], &t1[i]) + + // t1 = g2 * g5 + t1[i].Mul(&x[i].D1.C0, &x[i].D2.C1) + // z0 = E * (2 * g1^2 + g2 * g5 - 3 * g3 * g4) + 1 + t2[i].Add(&t2[i], &t1[i]) + x[i].D0.C0.MulByNonResidue(&t2[i]). + Add(&x[i].D0.C0, &one) + } + + return x +} + // Granger-Scott's cyclotomic square // https://eprint.iacr.org/2009/565.pdf, 3.2 func (z *E24) CyclotomicSquare(x *E24) *E24 { diff --git a/ecc/bls24-315/internal/fptower/e24_pairing.go b/ecc/bls24-315/internal/fptower/e24_pairing.go index ac82fc805..a149b86ce 100644 --- a/ecc/bls24-315/internal/fptower/e24_pairing.go +++ b/ecc/bls24-315/internal/fptower/e24_pairing.go @@ -13,23 +13,27 @@ func (z *E24) nSquareCompressed(n int) { } // Expt set z to x^t in E24 and return z (t is the seed of the curve) +// -2**32+2**30+2**22-2**20+1 func (z *E24) Expt(x *E24) *E24 { - var result, xInv E24 + var result, x20, x22, x30, x32 E24 result.Set(x) - xInv.Conjugate(x) - - result.nSquare(2) - result.Mul(&result, &xInv) - result.nSquare(8) - result.Mul(&result, &xInv) - result.nSquare(2) - result.Mul(&result, x) + result.nSquareCompressed(20) - result.Decompress(&result) - result.Mul(&result, &xInv) + x20.Conjugate(&result) + result.nSquareCompressed(2) + x22.Set(&result) + result.nSquareCompressed(8) + x30.Set(&result) + result.nSquareCompressed(2) + x32.Conjugate(&result) + + batch := BatchDecompress([]E24{x20, x22, x30, x32}) - z.Conjugate(&result) + z.Mul(x, &batch[0]). + Mul(z, &batch[1]). + Mul(z, &batch[2]). 
+ Mul(z, &batch[3]) return z } diff --git a/ecc/bls24-315/internal/fptower/e24_test.go b/ecc/bls24-315/internal/fptower/e24_test.go index 6bd9360c1..7ce13252f 100644 --- a/ecc/bls24-315/internal/fptower/e24_test.go +++ b/ecc/bls24-315/internal/fptower/e24_test.go @@ -331,6 +331,32 @@ func TestE24Ops(t *testing.T) { genA, )) + properties.Property("[BLS24-315] batch decompress and individual decompress (Karabina) should be the same", prop.ForAll( + func(a *E24) bool { + var b E24 + // put in the cyclotomic subgroup + b.Conjugate(a) + a.Inverse(a) + b.Mul(&b, a) + a.FrobeniusQuad(&b).Mul(a, &b) + + var a2, a4, a17 E24 + a2.Set(a) + a4.Set(a) + a17.Set(a) + a2.nSquareCompressed(2) + a4.nSquareCompressed(4) + a17.nSquareCompressed(17) + batch := BatchDecompress([]E24{a2, a4, a17}) + a2.Decompress(&a2) + a4.Decompress(&a4) + a17.Decompress(&a17) + + return a2.Equal(&batch[0]) && a4.Equal(&batch[1]) && a17.Equal(&batch[2]) + }, + genA, + )) + properties.Property("[BLS24-315] Frobenius of x in E24 should be equal to x^q", prop.ForAll( func(a *E24) bool { var b, c E24 diff --git a/ecc/bls24-315/internal/fptower/e4.go b/ecc/bls24-315/internal/fptower/e4.go index ab96d8340..40c91c846 100644 --- a/ecc/bls24-315/internal/fptower/e4.go +++ b/ecc/bls24-315/internal/fptower/e4.go @@ -297,3 +297,37 @@ func (z *E4) Sqrt(x *E4) *E4 { return z } + +// BatchInvert returns a new slice with every element inverted. +// Uses Montgomery batch inversion trick +func BatchInvert(a []E4) []E4 { + res := make([]E4, len(a)) + if len(a) == 0 { + return res + } + + zeroes := make([]bool, len(a)) + var accumulator E4 + accumulator.SetOne() + + for i := 0; i < len(a); i++ { + if a[i].IsZero() { + zeroes[i] = true + continue + } + res[i].Set(&accumulator) + accumulator.Mul(&accumulator, &a[i]) + } + + accumulator.Inverse(&accumulator) + + for i := len(a) - 1; i >= 0; i-- { + if zeroes[i] { + continue + } + res[i].Mul(&res[i], &accumulator) + accumulator.Mul(&accumulator, &a[i]) + } + + return res +} diff --git a/ecc/bls24-315/internal/fptower/e4_test.go b/ecc/bls24-315/internal/fptower/e4_test.go index 238ac17e7..92f23b0a6 100644 --- a/ecc/bls24-315/internal/fptower/e4_test.go +++ b/ecc/bls24-315/internal/fptower/e4_test.go @@ -188,6 +188,20 @@ func TestE4Ops(t *testing.T) { genB, )) + properties.Property("[BLS24-315] BatchInvert should output the same result as Inverse", prop.ForAll( + func(a, b, c *E4) bool { + + batch := BatchInvert([]E4{*a, *b, *c}) + a.Inverse(a) + b.Inverse(b) + c.Inverse(c) + return a.Equal(&batch[0]) && b.Equal(&batch[1]) && c.Equal(&batch[2]) + }, + genA, + genA, + genB, + )) + properties.Property("[BLS24-315] inverse twice should leave an element invariant", prop.ForAll( func(a *E4) bool { var b E4 From ba008f2d7be3412875931ef830279ec8092ee554 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Sun, 19 Sep 2021 20:30:29 +0200 Subject: [PATCH 08/11] perf(bls24): mix Karabina+GS+BatchInvert for faster FinalExp (Expt) --- ecc/bls24-315/internal/fptower/e24_pairing.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ecc/bls24-315/internal/fptower/e24_pairing.go b/ecc/bls24-315/internal/fptower/e24_pairing.go index a149b86ce..98622a5e5 100644 --- a/ecc/bls24-315/internal/fptower/e24_pairing.go +++ b/ecc/bls24-315/internal/fptower/e24_pairing.go @@ -25,15 +25,17 @@ func (z *E24) Expt(x *E24) *E24 { x22.Set(&result) result.nSquareCompressed(8) x30.Set(&result) - result.nSquareCompressed(2) - x32.Conjugate(&result) - batch := BatchDecompress([]E24{x20, x22, x30, x32}) + batch 
:= BatchDecompress([]E24{x20, x22, x30}) + + x32.CyclotomicSquare(&batch[2]). + CyclotomicSquare(&x32). + Conjugate(&x32) z.Mul(x, &batch[0]). Mul(z, &batch[1]). Mul(z, &batch[2]). - Mul(z, &batch[3]) + Mul(z, &x32) return z } From f61053579454a75b36c3a2734074a23781b14ad7 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 21 Sep 2021 11:11:37 -0500 Subject: [PATCH 09/11] feat: reverted to non-asm field inverse --- ecc/bls12-377/fp/element.go | 14 +- ecc/bls12-377/fp/element_ops_amd64.go | 3 - ecc/bls12-377/fp/element_ops_amd64.s | 353 ---------- ecc/bls12-377/fp/element_ops_noasm.go | 4 - ecc/bls12-377/fr/element.go | 14 +- ecc/bls12-377/fr/element_ops_amd64.go | 3 - ecc/bls12-377/fr/element_ops_amd64.s | 265 ------- ecc/bls12-377/fr/element_ops_noasm.go | 4 - ecc/bls12-381/fp/element.go | 14 +- ecc/bls12-381/fp/element_ops_amd64.go | 3 - ecc/bls12-381/fp/element_ops_amd64.s | 353 ---------- ecc/bls12-381/fp/element_ops_noasm.go | 4 - ecc/bls12-381/fr/element.go | 14 +- ecc/bls12-381/fr/element_ops_amd64.go | 3 - ecc/bls12-381/fr/element_ops_amd64.s | 265 ------- ecc/bls12-381/fr/element_ops_noasm.go | 4 - ecc/bls24-315/fp/element.go | 14 +- ecc/bls24-315/fp/element_ops_amd64.go | 3 - ecc/bls24-315/fp/element_ops_amd64.s | 309 -------- ecc/bls24-315/fp/element_ops_noasm.go | 4 - ecc/bls24-315/fr/element.go | 14 +- ecc/bls24-315/fr/element_ops_amd64.go | 3 - ecc/bls24-315/fr/element_ops_amd64.s | 265 ------- ecc/bls24-315/fr/element_ops_noasm.go | 4 - ecc/bn254/fp/element.go | 14 +- ecc/bn254/fp/element_ops_amd64.go | 3 - ecc/bn254/fp/element_ops_amd64.s | 265 ------- ecc/bn254/fp/element_ops_noasm.go | 4 - ecc/bn254/fr/element.go | 14 +- ecc/bn254/fr/element_ops_amd64.go | 3 - ecc/bn254/fr/element_ops_amd64.s | 265 ------- ecc/bn254/fr/element_ops_noasm.go | 4 - ecc/bw6-633/fp/element.go | 14 +- ecc/bw6-633/fp/element_ops_amd64.go | 3 - ecc/bw6-633/fp/element_ops_amd64.s | 568 --------------- ecc/bw6-633/fp/element_ops_noasm.go | 4 - ecc/bw6-633/fr/element.go | 14 +- ecc/bw6-633/fr/element_ops_amd64.go | 3 - ecc/bw6-633/fr/element_ops_amd64.s | 309 -------- ecc/bw6-633/fr/element_ops_noasm.go | 4 - ecc/bw6-761/fp/element.go | 14 +- ecc/bw6-761/fp/element_ops_amd64.go | 3 - ecc/bw6-761/fp/element_ops_amd64.s | 664 ------------------ ecc/bw6-761/fp/element_ops_noasm.go | 4 - ecc/bw6-761/fr/element.go | 14 +- ecc/bw6-761/fr/element_ops_amd64.go | 3 - ecc/bw6-761/fr/element_ops_amd64.s | 353 ---------- ecc/bw6-761/fr/element_ops_noasm.go | 4 - field/asm/amd64/build.go | 3 - field/asm/amd64/element_inverse.go | 246 ------- field/internal/templates/element/inverse.go | 23 +- field/internal/templates/element/ops.go | 3 - .../internal/templates/element/ops_generic.go | 3 - 53 files changed, 41 insertions(+), 4723 deletions(-) delete mode 100644 field/asm/amd64/element_inverse.go diff --git a/ecc/bls12-377/fp/element.go b/ecc/bls12-377/fp/element.go index ff802d3ae..084471c7e 100644 --- a/ecc/bls12-377/fp/element.go +++ b/ecc/bls12-377/fp/element.go @@ -983,17 +983,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + 
return z } // initialize u = q @@ -1156,11 +1148,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[5]|u[4]|u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[5]|v[4]|v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bls12-377/fp/element_ops_amd64.go b/ecc/bls12-377/fp/element_ops_amd64.go index d61412bd6..73a3711ec 100644 --- a/ecc/bls12-377/fp/element_ops_amd64.go +++ b/ecc/bls12-377/fp/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bls12-377/fp/element_ops_amd64.s b/ecc/bls12-377/fp/element_ops_amd64.s index 857431795..596b552dd 100644 --- a/ecc/bls12-377/fp/element_ops_amd64.s +++ b/ecc/bls12-377/fp/element_ops_amd64.s @@ -450,356 +450,3 @@ TEXT ·Butterfly(SB), $48-16 MOVQ R8, 32(AX) MOVQ R9, 40(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $136-16 - // u = q - // u[0] -> R9 - // u[1] -> R10 - // u[2] -> R11 - // u[3] -> R12 - // u[4] -> R13 - // u[5] -> R14 - MOVQ q<>+0(SB), R9 - MOVQ q<>+8(SB), R10 - MOVQ q<>+16(SB), R11 - MOVQ q<>+24(SB), R12 - MOVQ q<>+32(SB), R13 - MOVQ q<>+40(SB), R14 - - // s = r^2 - // s[0] -> s11-96(SP) - // s[1] -> s12-104(SP) - // s[2] -> s13-112(SP) - // s[3] -> s14-120(SP) - // s[4] -> s15-128(SP) - // s[5] -> s16-136(SP) - MOVQ $0xb786686c9400cd22, R8 - MOVQ R8, s11-96(SP) - MOVQ $0x0329fcaab00431b1, R8 - MOVQ R8, s12-104(SP) - MOVQ $0x22a5f11162d6b46d, R8 - MOVQ R8, s13-112(SP) - MOVQ $0xbfdf7d03827dc3ac, R8 - MOVQ R8, s14-120(SP) - MOVQ $0x837e92f041790bf9, R8 - MOVQ R8, s15-128(SP) - MOVQ $0x006dfccb1e914b88, R8 - MOVQ R8, s16-136(SP) - - // v = x - // v[0] -> R15 - // v[1] -> s0-8(SP) - // v[2] -> s1-16(SP) - // v[3] -> s2-24(SP) - // v[4] -> s3-32(SP) - // v[5] -> s4-40(SP) - MOVQ x+8(FP), R8 - MOVQ 0(R8), AX - MOVQ 8(R8), DX - MOVQ 16(R8), CX - MOVQ 24(R8), BX - MOVQ 32(R8), SI - MOVQ 40(R8), DI - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - MOVQ SI, s3-32(SP) - MOVQ DI, s4-40(SP) - - // if x is 0, returns 0 - MOVQ AX, R8 - ORQ DX, R8 - ORQ CX, R8 - ORQ BX, R8 - ORQ SI, R8 - ORQ DI, R8 - JEQ l7 - - // r = 0 - // r[0] -> s5-48(SP) - // r[1] -> s6-56(SP) - // r[2] -> s7-64(SP) - // r[3] -> s8-72(SP) - // r[4] -> s9-80(SP) - // r[5] -> s10-88(SP) - MOVQ $0, s5-48(SP) - MOVQ $0, s6-56(SP) - MOVQ $0, s7-64(SP) - MOVQ $0, s8-72(SP) - MOVQ $0, s9-80(SP) - MOVQ $0, s10-88(SP) - -l2: - BTQ $0, AX - JCS l8 - MOVQ $0, BP - XORQ R8, R8 - -l9: - INCQ BP - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - BTQ $0, AX - JCC l9 - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - MOVQ SI, s3-32(SP) - MOVQ DI, s4-40(SP) - MOVQ s11-96(SP), AX - MOVQ s12-104(SP), DX - MOVQ s13-112(SP), CX - MOVQ s14-120(SP), BX - MOVQ s15-128(SP), SI - MOVQ s16-136(SP), DI - -l10: - BTQ $0, AX - JCC l11 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l11: - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - DECQ BP - JNE l10 - MOVQ AX, s11-96(SP) - MOVQ DX, s12-104(SP) - MOVQ CX, s13-112(SP) - MOVQ BX, s14-120(SP) - MOVQ SI, s15-128(SP) - MOVQ DI, s16-136(SP) - -l8: - MOVQ R9, AX - MOVQ R10, DX - MOVQ R11, CX - MOVQ R12, BX - MOVQ R13, SI - MOVQ R14, DI - BTQ $0, AX - JCS l12 - MOVQ $0, BP - XORQ R8, R8 
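The assembly removed here (and its counterparts for the other curves below) implemented the same right-shift binary extended Euclidean algorithm as the generic Go path that remains. A condensed math/big model of that algorithm, as a sketch that tracks plain values rather than Montgomery limbs (the real code starts s at r^2 so it produces x^-1 in Montgomery form):

import "math/big"

// binaryInverse returns x^-1 mod q (q odd prime, 0 < x < q) by the
// right-shift binary extended Euclidean algorithm: keep u = r*x and
// v = s*x (mod q) while halving and subtracting until u or v is 1.
func binaryInverse(x, q *big.Int) *big.Int {
	u := new(big.Int).Set(q)
	v := new(big.Int).Set(x)
	r := new(big.Int)  // r = 0
	s := big.NewInt(1) // s = 1
	one := big.NewInt(1)
	for u.Cmp(one) != 0 && v.Cmp(one) != 0 {
		for v.Bit(0) == 0 {
			v.Rsh(v, 1)
			if s.Bit(0) == 1 {
				s.Add(s, q) // make s even so the halving stays exact mod q
			}
			s.Rsh(s, 1)
		}
		for u.Bit(0) == 0 {
			u.Rsh(u, 1)
			if r.Bit(0) == 1 {
				r.Add(r, q)
			}
			r.Rsh(r, 1)
		}
		if v.Cmp(u) >= 0 {
			v.Sub(v, u)
			s.Sub(s, r)
			if s.Sign() < 0 {
				s.Add(s, q)
			}
		} else {
			u.Sub(u, v)
			r.Sub(r, s)
			if r.Sign() < 0 {
				r.Add(r, q)
			}
		}
	}
	if u.Cmp(one) == 0 {
		return r
	}
	return s
}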
- -l13: - INCQ BP - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - BTQ $0, AX - JCC l13 - MOVQ AX, R9 - MOVQ DX, R10 - MOVQ CX, R11 - MOVQ BX, R12 - MOVQ SI, R13 - MOVQ DI, R14 - MOVQ s5-48(SP), AX - MOVQ s6-56(SP), DX - MOVQ s7-64(SP), CX - MOVQ s8-72(SP), BX - MOVQ s9-80(SP), SI - MOVQ s10-88(SP), DI - -l14: - BTQ $0, AX - JCC l15 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l15: - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - DECQ BP - JNE l14 - MOVQ AX, s5-48(SP) - MOVQ DX, s6-56(SP) - MOVQ CX, s7-64(SP) - MOVQ BX, s8-72(SP) - MOVQ SI, s9-80(SP) - MOVQ DI, s10-88(SP) - -l12: - // v = v - u - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ s3-32(SP), SI - MOVQ s4-40(SP), DI - SUBQ R9, AX - SBBQ R10, DX - SBBQ R11, CX - SBBQ R12, BX - SBBQ R13, SI - SBBQ R14, DI - JCC l3 - SUBQ R15, R9 - SBBQ s0-8(SP), R10 - SBBQ s1-16(SP), R11 - SBBQ s2-24(SP), R12 - SBBQ s3-32(SP), R13 - SBBQ s4-40(SP), R14 - MOVQ s5-48(SP), AX - MOVQ s6-56(SP), DX - MOVQ s7-64(SP), CX - MOVQ s8-72(SP), BX - MOVQ s9-80(SP), SI - MOVQ s10-88(SP), DI - SUBQ s11-96(SP), AX - SBBQ s12-104(SP), DX - SBBQ s13-112(SP), CX - SBBQ s14-120(SP), BX - SBBQ s15-128(SP), SI - SBBQ s16-136(SP), DI - JCC l16 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l16: - MOVQ AX, s5-48(SP) - MOVQ DX, s6-56(SP) - MOVQ CX, s7-64(SP) - MOVQ BX, s8-72(SP) - MOVQ SI, s9-80(SP) - MOVQ DI, s10-88(SP) - JMP l4 - -l3: - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - MOVQ SI, s3-32(SP) - MOVQ DI, s4-40(SP) - MOVQ s11-96(SP), AX - MOVQ s12-104(SP), DX - MOVQ s13-112(SP), CX - MOVQ s14-120(SP), BX - MOVQ s15-128(SP), SI - MOVQ s16-136(SP), DI - SUBQ s5-48(SP), AX - SBBQ s6-56(SP), DX - SBBQ s7-64(SP), CX - SBBQ s8-72(SP), BX - SBBQ s9-80(SP), SI - SBBQ s10-88(SP), DI - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l17: - MOVQ AX, s11-96(SP) - MOVQ DX, s12-104(SP) - MOVQ CX, s13-112(SP) - MOVQ BX, s14-120(SP) - MOVQ SI, s15-128(SP) - MOVQ DI, s16-136(SP) - -l4: - MOVQ R9, R8 - SUBQ $1, R8 - ORQ R10, R8 - ORQ R11, R8 - ORQ R12, R8 - ORQ R13, R8 - ORQ R14, R8 - JEQ l5 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ s3-32(SP), SI - MOVQ s4-40(SP), DI - MOVQ AX, R8 - SUBQ $1, R8 - JNE l2 - ORQ DX, R8 - ORQ CX, R8 - ORQ BX, R8 - ORQ SI, R8 - ORQ DI, R8 - JEQ l6 - JMP l2 - -l5: - MOVQ res+0(FP), R8 - MOVQ s5-48(SP), AX - MOVQ s6-56(SP), DX - MOVQ s7-64(SP), CX - MOVQ s8-72(SP), BX - MOVQ s9-80(SP), SI - MOVQ s10-88(SP), DI - MOVQ AX, 0(R8) - MOVQ DX, 8(R8) - MOVQ CX, 16(R8) - MOVQ BX, 24(R8) - MOVQ SI, 32(R8) - MOVQ DI, 40(R8) - RET - -l6: - MOVQ res+0(FP), R8 - MOVQ s11-96(SP), AX - MOVQ s12-104(SP), DX - MOVQ s13-112(SP), CX - MOVQ s14-120(SP), BX - MOVQ s15-128(SP), SI - MOVQ s16-136(SP), DI - MOVQ AX, 0(R8) - MOVQ DX, 8(R8) - MOVQ CX, 16(R8) - MOVQ BX, 24(R8) - MOVQ SI, 32(R8) - MOVQ DI, 40(R8) - RET - -l7: - MOVQ res+0(FP), R8 - MOVQ $0, 0(R8) - MOVQ $0, 8(R8) - MOVQ $0, 16(R8) - MOVQ $0, 24(R8) - MOVQ $0, 32(R8) - MOVQ $0, 40(R8) - RET diff --git a/ecc/bls12-377/fp/element_ops_noasm.go b/ecc/bls12-377/fp/element_ops_noasm.go index 
48d55e2ea..fec628918 100644 --- a/ecc/bls12-377/fp/element_ops_noasm.go +++ b/ecc/bls12-377/fp/element_ops_noasm.go @@ -51,10 +51,6 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bls12-377/fr/element.go b/ecc/bls12-377/fr/element.go index 014cc671b..daf96aadf 100644 --- a/ecc/bls12-377/fr/element.go +++ b/ecc/bls12-377/fr/element.go @@ -845,17 +845,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -990,11 +982,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bls12-377/fr/element_ops_amd64.go b/ecc/bls12-377/fr/element_ops_amd64.go index 9ebabc26a..78022b3e6 100644 --- a/ecc/bls12-377/fr/element_ops_amd64.go +++ b/ecc/bls12-377/fr/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bls12-377/fr/element_ops_amd64.s b/ecc/bls12-377/fr/element_ops_amd64.s index 21cd7c70f..85a1c457f 100644 --- a/ecc/bls12-377/fr/element_ops_amd64.s +++ b/ecc/bls12-377/fr/element_ops_amd64.s @@ -338,268 +338,3 @@ TEXT ·Butterfly(SB), NOSPLIT, $0-16 MOVQ SI, 16(AX) MOVQ DI, 24(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $56-16 - // u = q - // u[0] -> DI - // u[1] -> R8 - // u[2] -> R9 - // u[3] -> R10 - MOVQ q<>+0(SB), DI - MOVQ q<>+8(SB), R8 - MOVQ q<>+16(SB), R9 - MOVQ q<>+24(SB), R10 - - // s = r^2 - // s[0] -> s3-32(SP) - // s[1] -> s4-40(SP) - // s[2] -> s5-48(SP) - // s[3] -> s6-56(SP) - MOVQ $0x25d577bab861857b, SI - MOVQ SI, s3-32(SP) - MOVQ $0xcc2c27b58860591f, SI - MOVQ SI, s4-40(SP) - MOVQ $0xa7cc008fe5dc8593, SI - MOVQ SI, s5-48(SP) - MOVQ $0x011fdae7eff1c939, SI - MOVQ SI, s6-56(SP) - - // v = x - // v[0] -> R11 - // v[1] -> R12 - // v[2] -> R13 - // v[3] -> R14 - MOVQ x+8(FP), SI - MOVQ 0(SI), AX - MOVQ 8(SI), DX - MOVQ 16(SI), CX - MOVQ 24(SI), BX - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - - // if x is 0, returns 0 - MOVQ AX, SI - ORQ DX, SI - ORQ CX, SI - ORQ BX, SI - JEQ l7 - - // r = 0 - // r[0] -> R15 - // r[1] -> s0-8(SP) - // r[2] -> s1-16(SP) - // r[3] -> s2-24(SP) - MOVQ $0, R15 - MOVQ $0, s0-8(SP) - MOVQ $0, s1-16(SP) - MOVQ $0, s2-24(SP) - -l2: - BTQ $0, AX - JCS l8 - MOVQ $0, BP - XORQ SI, SI - -l9: - INCQ BP - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - BTQ $0, AX - JCC l9 - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - -l10: - BTQ $0, AX - JCC l11 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l11: - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ 
$1, BX, CX - SHRQ $1, BX - DECQ BP - JNE l10 - MOVQ AX, s3-32(SP) - MOVQ DX, s4-40(SP) - MOVQ CX, s5-48(SP) - MOVQ BX, s6-56(SP) - -l8: - MOVQ DI, AX - MOVQ R8, DX - MOVQ R9, CX - MOVQ R10, BX - BTQ $0, AX - JCS l12 - MOVQ $0, BP - XORQ SI, SI - -l13: - INCQ BP - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - BTQ $0, AX - JCC l13 - MOVQ AX, DI - MOVQ DX, R8 - MOVQ CX, R9 - MOVQ BX, R10 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - -l14: - BTQ $0, AX - JCC l15 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l15: - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - DECQ BP - JNE l14 - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - -l12: - // v = v - u - MOVQ R11, AX - MOVQ R12, DX - MOVQ R13, CX - MOVQ R14, BX - SUBQ DI, AX - SBBQ R8, DX - SBBQ R9, CX - SBBQ R10, BX - JCC l3 - SUBQ R11, DI - SBBQ R12, R8 - SBBQ R13, R9 - SBBQ R14, R10 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - SUBQ s3-32(SP), AX - SBBQ s4-40(SP), DX - SBBQ s5-48(SP), CX - SBBQ s6-56(SP), BX - JCC l16 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l16: - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - JMP l4 - -l3: - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - SUBQ R15, AX - SBBQ s0-8(SP), DX - SBBQ s1-16(SP), CX - SBBQ s2-24(SP), BX - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l17: - MOVQ AX, s3-32(SP) - MOVQ DX, s4-40(SP) - MOVQ CX, s5-48(SP) - MOVQ BX, s6-56(SP) - -l4: - MOVQ DI, SI - SUBQ $1, SI - ORQ R8, SI - ORQ R9, SI - ORQ R10, SI - JEQ l5 - MOVQ R11, AX - MOVQ R12, DX - MOVQ R13, CX - MOVQ R14, BX - MOVQ AX, SI - SUBQ $1, SI - JNE l2 - ORQ DX, SI - ORQ CX, SI - ORQ BX, SI - JEQ l6 - JMP l2 - -l5: - MOVQ res+0(FP), SI - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ AX, 0(SI) - MOVQ DX, 8(SI) - MOVQ CX, 16(SI) - MOVQ BX, 24(SI) - RET - -l6: - MOVQ res+0(FP), SI - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - MOVQ AX, 0(SI) - MOVQ DX, 8(SI) - MOVQ CX, 16(SI) - MOVQ BX, 24(SI) - RET - -l7: - MOVQ res+0(FP), SI - MOVQ $0, 0(SI) - MOVQ $0, 8(SI) - MOVQ $0, 16(SI) - MOVQ $0, 24(SI) - RET diff --git a/ecc/bls12-377/fr/element_ops_noasm.go b/ecc/bls12-377/fr/element_ops_noasm.go index 006365daa..ec1fac18d 100644 --- a/ecc/bls12-377/fr/element_ops_noasm.go +++ b/ecc/bls12-377/fr/element_ops_noasm.go @@ -51,10 +51,6 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in place (i.e. 
mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bls12-381/fp/element.go b/ecc/bls12-381/fp/element.go index 2dc0c7ce8..a1cfbd48f 100644 --- a/ecc/bls12-381/fp/element.go +++ b/ecc/bls12-381/fp/element.go @@ -929,17 +929,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -1102,11 +1094,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[5]|u[4]|u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[5]|v[4]|v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bls12-381/fp/element_ops_amd64.go b/ecc/bls12-381/fp/element_ops_amd64.go index d61412bd6..73a3711ec 100644 --- a/ecc/bls12-381/fp/element_ops_amd64.go +++ b/ecc/bls12-381/fp/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bls12-381/fp/element_ops_amd64.s b/ecc/bls12-381/fp/element_ops_amd64.s index 482186d3e..099c9afca 100644 --- a/ecc/bls12-381/fp/element_ops_amd64.s +++ b/ecc/bls12-381/fp/element_ops_amd64.s @@ -450,356 +450,3 @@ TEXT ·Butterfly(SB), $48-16 MOVQ R8, 32(AX) MOVQ R9, 40(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $136-16 - // u = q - // u[0] -> R9 - // u[1] -> R10 - // u[2] -> R11 - // u[3] -> R12 - // u[4] -> R13 - // u[5] -> R14 - MOVQ q<>+0(SB), R9 - MOVQ q<>+8(SB), R10 - MOVQ q<>+16(SB), R11 - MOVQ q<>+24(SB), R12 - MOVQ q<>+32(SB), R13 - MOVQ q<>+40(SB), R14 - - // s = r^2 - // s[0] -> s11-96(SP) - // s[1] -> s12-104(SP) - // s[2] -> s13-112(SP) - // s[3] -> s14-120(SP) - // s[4] -> s15-128(SP) - // s[5] -> s16-136(SP) - MOVQ $0xf4df1f341c341746, R8 - MOVQ R8, s11-96(SP) - MOVQ $0x0a76e6a609d104f1, R8 - MOVQ R8, s12-104(SP) - MOVQ $0x8de5476c4c95b6d5, R8 - MOVQ R8, s13-112(SP) - MOVQ $0x67eb88a9939d83c0, R8 - MOVQ R8, s14-120(SP) - MOVQ $0x9a793e85b519952d, R8 - MOVQ R8, s15-128(SP) - MOVQ $0x11988fe592cae3aa, R8 - MOVQ R8, s16-136(SP) - - // v = x - // v[0] -> R15 - // v[1] -> s0-8(SP) - // v[2] -> s1-16(SP) - // v[3] -> s2-24(SP) - // v[4] -> s3-32(SP) - // v[5] -> s4-40(SP) - MOVQ x+8(FP), R8 - MOVQ 0(R8), AX - MOVQ 8(R8), DX - MOVQ 16(R8), CX - MOVQ 24(R8), BX - MOVQ 32(R8), SI - MOVQ 40(R8), DI - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - MOVQ SI, s3-32(SP) - MOVQ DI, s4-40(SP) - - // if x is 0, returns 0 - MOVQ AX, R8 - ORQ DX, R8 - ORQ CX, R8 - ORQ BX, R8 - ORQ SI, R8 - ORQ DI, R8 - JEQ l7 - - // r = 0 - // r[0] -> s5-48(SP) - // r[1] -> s6-56(SP) - // r[2] -> s7-64(SP) - // r[3] -> s8-72(SP) - // r[4] -> s9-80(SP) - // r[5] -> s10-88(SP) - MOVQ $0, s5-48(SP) - MOVQ $0, s6-56(SP) - MOVQ $0, s7-64(SP) - MOVQ $0, s8-72(SP) - MOVQ $0, s9-80(SP) - MOVQ $0, s10-88(SP) - -l2: - BTQ $0, AX - JCS l8 - MOVQ $0, BP - XORQ R8, R8 - -l9: - INCQ BP - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - BTQ $0, AX - JCC l9 - MOVQ AX, R15 - 
MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - MOVQ SI, s3-32(SP) - MOVQ DI, s4-40(SP) - MOVQ s11-96(SP), AX - MOVQ s12-104(SP), DX - MOVQ s13-112(SP), CX - MOVQ s14-120(SP), BX - MOVQ s15-128(SP), SI - MOVQ s16-136(SP), DI - -l10: - BTQ $0, AX - JCC l11 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l11: - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - DECQ BP - JNE l10 - MOVQ AX, s11-96(SP) - MOVQ DX, s12-104(SP) - MOVQ CX, s13-112(SP) - MOVQ BX, s14-120(SP) - MOVQ SI, s15-128(SP) - MOVQ DI, s16-136(SP) - -l8: - MOVQ R9, AX - MOVQ R10, DX - MOVQ R11, CX - MOVQ R12, BX - MOVQ R13, SI - MOVQ R14, DI - BTQ $0, AX - JCS l12 - MOVQ $0, BP - XORQ R8, R8 - -l13: - INCQ BP - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - BTQ $0, AX - JCC l13 - MOVQ AX, R9 - MOVQ DX, R10 - MOVQ CX, R11 - MOVQ BX, R12 - MOVQ SI, R13 - MOVQ DI, R14 - MOVQ s5-48(SP), AX - MOVQ s6-56(SP), DX - MOVQ s7-64(SP), CX - MOVQ s8-72(SP), BX - MOVQ s9-80(SP), SI - MOVQ s10-88(SP), DI - -l14: - BTQ $0, AX - JCC l15 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l15: - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - DECQ BP - JNE l14 - MOVQ AX, s5-48(SP) - MOVQ DX, s6-56(SP) - MOVQ CX, s7-64(SP) - MOVQ BX, s8-72(SP) - MOVQ SI, s9-80(SP) - MOVQ DI, s10-88(SP) - -l12: - // v = v - u - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ s3-32(SP), SI - MOVQ s4-40(SP), DI - SUBQ R9, AX - SBBQ R10, DX - SBBQ R11, CX - SBBQ R12, BX - SBBQ R13, SI - SBBQ R14, DI - JCC l3 - SUBQ R15, R9 - SBBQ s0-8(SP), R10 - SBBQ s1-16(SP), R11 - SBBQ s2-24(SP), R12 - SBBQ s3-32(SP), R13 - SBBQ s4-40(SP), R14 - MOVQ s5-48(SP), AX - MOVQ s6-56(SP), DX - MOVQ s7-64(SP), CX - MOVQ s8-72(SP), BX - MOVQ s9-80(SP), SI - MOVQ s10-88(SP), DI - SUBQ s11-96(SP), AX - SBBQ s12-104(SP), DX - SBBQ s13-112(SP), CX - SBBQ s14-120(SP), BX - SBBQ s15-128(SP), SI - SBBQ s16-136(SP), DI - JCC l16 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l16: - MOVQ AX, s5-48(SP) - MOVQ DX, s6-56(SP) - MOVQ CX, s7-64(SP) - MOVQ BX, s8-72(SP) - MOVQ SI, s9-80(SP) - MOVQ DI, s10-88(SP) - JMP l4 - -l3: - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - MOVQ SI, s3-32(SP) - MOVQ DI, s4-40(SP) - MOVQ s11-96(SP), AX - MOVQ s12-104(SP), DX - MOVQ s13-112(SP), CX - MOVQ s14-120(SP), BX - MOVQ s15-128(SP), SI - MOVQ s16-136(SP), DI - SUBQ s5-48(SP), AX - SBBQ s6-56(SP), DX - SBBQ s7-64(SP), CX - SBBQ s8-72(SP), BX - SBBQ s9-80(SP), SI - SBBQ s10-88(SP), DI - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l17: - MOVQ AX, s11-96(SP) - MOVQ DX, s12-104(SP) - MOVQ CX, s13-112(SP) - MOVQ BX, s14-120(SP) - MOVQ SI, s15-128(SP) - MOVQ DI, s16-136(SP) - -l4: - MOVQ R9, R8 - SUBQ $1, R8 - ORQ R10, R8 - ORQ R11, R8 - ORQ R12, R8 - ORQ R13, R8 - ORQ R14, R8 - JEQ l5 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ s3-32(SP), SI - MOVQ s4-40(SP), DI - MOVQ AX, R8 - SUBQ $1, R8 - JNE l2 - ORQ DX, R8 - ORQ CX, R8 - ORQ BX, R8 - ORQ 
SI, R8 - ORQ DI, R8 - JEQ l6 - JMP l2 - -l5: - MOVQ res+0(FP), R8 - MOVQ s5-48(SP), AX - MOVQ s6-56(SP), DX - MOVQ s7-64(SP), CX - MOVQ s8-72(SP), BX - MOVQ s9-80(SP), SI - MOVQ s10-88(SP), DI - MOVQ AX, 0(R8) - MOVQ DX, 8(R8) - MOVQ CX, 16(R8) - MOVQ BX, 24(R8) - MOVQ SI, 32(R8) - MOVQ DI, 40(R8) - RET - -l6: - MOVQ res+0(FP), R8 - MOVQ s11-96(SP), AX - MOVQ s12-104(SP), DX - MOVQ s13-112(SP), CX - MOVQ s14-120(SP), BX - MOVQ s15-128(SP), SI - MOVQ s16-136(SP), DI - MOVQ AX, 0(R8) - MOVQ DX, 8(R8) - MOVQ CX, 16(R8) - MOVQ BX, 24(R8) - MOVQ SI, 32(R8) - MOVQ DI, 40(R8) - RET - -l7: - MOVQ res+0(FP), R8 - MOVQ $0, 0(R8) - MOVQ $0, 8(R8) - MOVQ $0, 16(R8) - MOVQ $0, 24(R8) - MOVQ $0, 32(R8) - MOVQ $0, 40(R8) - RET diff --git a/ecc/bls12-381/fp/element_ops_noasm.go b/ecc/bls12-381/fp/element_ops_noasm.go index 48d55e2ea..fec628918 100644 --- a/ecc/bls12-381/fp/element_ops_noasm.go +++ b/ecc/bls12-381/fp/element_ops_noasm.go @@ -51,10 +51,6 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bls12-381/fr/element.go b/ecc/bls12-381/fr/element.go index e077d84b8..53b00877e 100644 --- a/ecc/bls12-381/fr/element.go +++ b/ecc/bls12-381/fr/element.go @@ -845,17 +845,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -990,11 +982,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bls12-381/fr/element_ops_amd64.go b/ecc/bls12-381/fr/element_ops_amd64.go index 9ebabc26a..78022b3e6 100644 --- a/ecc/bls12-381/fr/element_ops_amd64.go +++ b/ecc/bls12-381/fr/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bls12-381/fr/element_ops_amd64.s b/ecc/bls12-381/fr/element_ops_amd64.s index e3b75120b..d385629f1 100644 --- a/ecc/bls12-381/fr/element_ops_amd64.s +++ b/ecc/bls12-381/fr/element_ops_amd64.s @@ -338,268 +338,3 @@ TEXT ·Butterfly(SB), NOSPLIT, $0-16 MOVQ SI, 16(AX) MOVQ DI, 24(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $56-16 - // u = q - // u[0] -> DI - // u[1] -> R8 - // u[2] -> R9 - // u[3] -> R10 - MOVQ q<>+0(SB), DI - MOVQ q<>+8(SB), R8 - MOVQ q<>+16(SB), R9 - MOVQ q<>+24(SB), R10 - - // s = r^2 - // s[0] -> s3-32(SP) - // s[1] -> s4-40(SP) - // s[2] -> s5-48(SP) - // s[3] -> s6-56(SP) - MOVQ $0xc999e990f3f29c6d, SI - MOVQ SI, s3-32(SP) - MOVQ $0x2b6cedcb87925c23, SI - MOVQ SI, s4-40(SP) - MOVQ $0x05d314967254398f, SI - MOVQ SI, s5-48(SP) - MOVQ $0x0748d9d99f59ff11, SI - MOVQ SI, s6-56(SP) - - // v = x - // v[0] -> R11 - // v[1] -> R12 - // v[2] -> R13 - // v[3] -> R14 - MOVQ x+8(FP), SI - MOVQ 0(SI), AX - MOVQ 8(SI), DX - MOVQ 16(SI), CX - MOVQ 24(SI), BX - MOVQ AX, R11 - 
MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - - // if x is 0, returns 0 - MOVQ AX, SI - ORQ DX, SI - ORQ CX, SI - ORQ BX, SI - JEQ l7 - - // r = 0 - // r[0] -> R15 - // r[1] -> s0-8(SP) - // r[2] -> s1-16(SP) - // r[3] -> s2-24(SP) - MOVQ $0, R15 - MOVQ $0, s0-8(SP) - MOVQ $0, s1-16(SP) - MOVQ $0, s2-24(SP) - -l2: - BTQ $0, AX - JCS l8 - MOVQ $0, BP - XORQ SI, SI - -l9: - INCQ BP - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - BTQ $0, AX - JCC l9 - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - -l10: - BTQ $0, AX - JCC l11 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l11: - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - DECQ BP - JNE l10 - MOVQ AX, s3-32(SP) - MOVQ DX, s4-40(SP) - MOVQ CX, s5-48(SP) - MOVQ BX, s6-56(SP) - -l8: - MOVQ DI, AX - MOVQ R8, DX - MOVQ R9, CX - MOVQ R10, BX - BTQ $0, AX - JCS l12 - MOVQ $0, BP - XORQ SI, SI - -l13: - INCQ BP - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - BTQ $0, AX - JCC l13 - MOVQ AX, DI - MOVQ DX, R8 - MOVQ CX, R9 - MOVQ BX, R10 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - -l14: - BTQ $0, AX - JCC l15 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l15: - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - DECQ BP - JNE l14 - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - -l12: - // v = v - u - MOVQ R11, AX - MOVQ R12, DX - MOVQ R13, CX - MOVQ R14, BX - SUBQ DI, AX - SBBQ R8, DX - SBBQ R9, CX - SBBQ R10, BX - JCC l3 - SUBQ R11, DI - SBBQ R12, R8 - SBBQ R13, R9 - SBBQ R14, R10 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - SUBQ s3-32(SP), AX - SBBQ s4-40(SP), DX - SBBQ s5-48(SP), CX - SBBQ s6-56(SP), BX - JCC l16 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l16: - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - JMP l4 - -l3: - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - SUBQ R15, AX - SBBQ s0-8(SP), DX - SBBQ s1-16(SP), CX - SBBQ s2-24(SP), BX - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l17: - MOVQ AX, s3-32(SP) - MOVQ DX, s4-40(SP) - MOVQ CX, s5-48(SP) - MOVQ BX, s6-56(SP) - -l4: - MOVQ DI, SI - SUBQ $1, SI - ORQ R8, SI - ORQ R9, SI - ORQ R10, SI - JEQ l5 - MOVQ R11, AX - MOVQ R12, DX - MOVQ R13, CX - MOVQ R14, BX - MOVQ AX, SI - SUBQ $1, SI - JNE l2 - ORQ DX, SI - ORQ CX, SI - ORQ BX, SI - JEQ l6 - JMP l2 - -l5: - MOVQ res+0(FP), SI - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ AX, 0(SI) - MOVQ DX, 8(SI) - MOVQ CX, 16(SI) - MOVQ BX, 24(SI) - RET - -l6: - MOVQ res+0(FP), SI - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - MOVQ AX, 0(SI) - MOVQ DX, 8(SI) - MOVQ CX, 16(SI) - MOVQ BX, 24(SI) - RET - -l7: - MOVQ res+0(FP), SI - MOVQ $0, 0(SI) - MOVQ $0, 8(SI) - MOVQ $0, 16(SI) - MOVQ $0, 24(SI) - RET diff --git a/ecc/bls12-381/fr/element_ops_noasm.go b/ecc/bls12-381/fr/element_ops_noasm.go index 006365daa..ec1fac18d 100644 --- a/ecc/bls12-381/fr/element_ops_noasm.go +++ b/ecc/bls12-381/fr/element_ops_noasm.go @@ -51,10 +51,6 @@ 
func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bls24-315/fp/element.go b/ecc/bls24-315/fp/element.go index 79c15ad72..03c0c6913 100644 --- a/ecc/bls24-315/fp/element.go +++ b/ecc/bls24-315/fp/element.go @@ -911,17 +911,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -1070,11 +1062,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[4]|u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[4]|v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bls24-315/fp/element_ops_amd64.go b/ecc/bls24-315/fp/element_ops_amd64.go index d61412bd6..73a3711ec 100644 --- a/ecc/bls24-315/fp/element_ops_amd64.go +++ b/ecc/bls24-315/fp/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bls24-315/fp/element_ops_amd64.s b/ecc/bls24-315/fp/element_ops_amd64.s index 354d2294a..ab0ced516 100644 --- a/ecc/bls24-315/fp/element_ops_amd64.s +++ b/ecc/bls24-315/fp/element_ops_amd64.s @@ -398,312 +398,3 @@ TEXT ·Butterfly(SB), $24-16 MOVQ DI, 24(AX) MOVQ R8, 32(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $96-16 - // u = q - // u[0] -> R8 - // u[1] -> R9 - // u[2] -> R10 - // u[3] -> R11 - // u[4] -> R12 - MOVQ q<>+0(SB), R8 - MOVQ q<>+8(SB), R9 - MOVQ q<>+16(SB), R10 - MOVQ q<>+24(SB), R11 - MOVQ q<>+32(SB), R12 - - // s = r^2 - // s[0] -> s7-64(SP) - // s[1] -> s8-72(SP) - // s[2] -> s9-80(SP) - // s[3] -> s10-88(SP) - // s[4] -> s11-96(SP) - MOVQ $0x6b817891fe329c16, DI - MOVQ DI, s7-64(SP) - MOVQ $0x599ce86eec6e2c35, DI - MOVQ DI, s8-72(SP) - MOVQ $0xc338890f540d5ad6, DI - MOVQ DI, s9-80(SP) - MOVQ $0xcc160f6924c81f32, DI - MOVQ DI, s10-88(SP) - MOVQ $0x0215d8d4607a88d5, DI - MOVQ DI, s11-96(SP) - - // v = x - // v[0] -> R13 - // v[1] -> R14 - // v[2] -> R15 - // v[3] -> s0-8(SP) - // v[4] -> s1-16(SP) - MOVQ x+8(FP), DI - MOVQ 0(DI), AX - MOVQ 8(DI), DX - MOVQ 16(DI), CX - MOVQ 24(DI), BX - MOVQ 32(DI), SI - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ CX, R15 - MOVQ BX, s0-8(SP) - MOVQ SI, s1-16(SP) - - // if x is 0, returns 0 - MOVQ AX, DI - ORQ DX, DI - ORQ CX, DI - ORQ BX, DI - ORQ SI, DI - JEQ l7 - - // r = 0 - // r[0] -> s2-24(SP) - // r[1] -> s3-32(SP) - // r[2] -> s4-40(SP) - // r[3] -> s5-48(SP) - // r[4] -> s6-56(SP) - MOVQ $0, s2-24(SP) - MOVQ $0, s3-32(SP) - MOVQ $0, s4-40(SP) - MOVQ $0, s5-48(SP) - MOVQ $0, s6-56(SP) - -l2: - BTQ $0, AX - JCS l8 - MOVQ $0, BP - XORQ DI, DI - -l9: - INCQ BP - SHRQ $1, AX, DI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, SI - BTQ $0, AX - JCC l9 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ CX, R15 - MOVQ BX, s0-8(SP) - MOVQ SI, s1-16(SP) - MOVQ s7-64(SP), AX - MOVQ s8-72(SP), DX - MOVQ s9-80(SP), CX - MOVQ s10-88(SP), BX - MOVQ 
s11-96(SP), SI - -l10: - BTQ $0, AX - JCC l11 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - -l11: - SHRQ $1, AX, DI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, SI - DECQ BP - JNE l10 - MOVQ AX, s7-64(SP) - MOVQ DX, s8-72(SP) - MOVQ CX, s9-80(SP) - MOVQ BX, s10-88(SP) - MOVQ SI, s11-96(SP) - -l8: - MOVQ R8, AX - MOVQ R9, DX - MOVQ R10, CX - MOVQ R11, BX - MOVQ R12, SI - BTQ $0, AX - JCS l12 - MOVQ $0, BP - XORQ DI, DI - -l13: - INCQ BP - SHRQ $1, AX, DI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, SI - BTQ $0, AX - JCC l13 - MOVQ AX, R8 - MOVQ DX, R9 - MOVQ CX, R10 - MOVQ BX, R11 - MOVQ SI, R12 - MOVQ s2-24(SP), AX - MOVQ s3-32(SP), DX - MOVQ s4-40(SP), CX - MOVQ s5-48(SP), BX - MOVQ s6-56(SP), SI - -l14: - BTQ $0, AX - JCC l15 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - -l15: - SHRQ $1, AX, DI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, SI - DECQ BP - JNE l14 - MOVQ AX, s2-24(SP) - MOVQ DX, s3-32(SP) - MOVQ CX, s4-40(SP) - MOVQ BX, s5-48(SP) - MOVQ SI, s6-56(SP) - -l12: - // v = v - u - MOVQ R13, AX - MOVQ R14, DX - MOVQ R15, CX - MOVQ s0-8(SP), BX - MOVQ s1-16(SP), SI - SUBQ R8, AX - SBBQ R9, DX - SBBQ R10, CX - SBBQ R11, BX - SBBQ R12, SI - JCC l3 - SUBQ R13, R8 - SBBQ R14, R9 - SBBQ R15, R10 - SBBQ s0-8(SP), R11 - SBBQ s1-16(SP), R12 - MOVQ s2-24(SP), AX - MOVQ s3-32(SP), DX - MOVQ s4-40(SP), CX - MOVQ s5-48(SP), BX - MOVQ s6-56(SP), SI - SUBQ s7-64(SP), AX - SBBQ s8-72(SP), DX - SBBQ s9-80(SP), CX - SBBQ s10-88(SP), BX - SBBQ s11-96(SP), SI - JCC l16 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - -l16: - MOVQ AX, s2-24(SP) - MOVQ DX, s3-32(SP) - MOVQ CX, s4-40(SP) - MOVQ BX, s5-48(SP) - MOVQ SI, s6-56(SP) - JMP l4 - -l3: - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ CX, R15 - MOVQ BX, s0-8(SP) - MOVQ SI, s1-16(SP) - MOVQ s7-64(SP), AX - MOVQ s8-72(SP), DX - MOVQ s9-80(SP), CX - MOVQ s10-88(SP), BX - MOVQ s11-96(SP), SI - SUBQ s2-24(SP), AX - SBBQ s3-32(SP), DX - SBBQ s4-40(SP), CX - SBBQ s5-48(SP), BX - SBBQ s6-56(SP), SI - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - -l17: - MOVQ AX, s7-64(SP) - MOVQ DX, s8-72(SP) - MOVQ CX, s9-80(SP) - MOVQ BX, s10-88(SP) - MOVQ SI, s11-96(SP) - -l4: - MOVQ R8, DI - SUBQ $1, DI - ORQ R9, DI - ORQ R10, DI - ORQ R11, DI - ORQ R12, DI - JEQ l5 - MOVQ R13, AX - MOVQ R14, DX - MOVQ R15, CX - MOVQ s0-8(SP), BX - MOVQ s1-16(SP), SI - MOVQ AX, DI - SUBQ $1, DI - JNE l2 - ORQ DX, DI - ORQ CX, DI - ORQ BX, DI - ORQ SI, DI - JEQ l6 - JMP l2 - -l5: - MOVQ res+0(FP), DI - MOVQ s2-24(SP), AX - MOVQ s3-32(SP), DX - MOVQ s4-40(SP), CX - MOVQ s5-48(SP), BX - MOVQ s6-56(SP), SI - MOVQ AX, 0(DI) - MOVQ DX, 8(DI) - MOVQ CX, 16(DI) - MOVQ BX, 24(DI) - MOVQ SI, 32(DI) - RET - -l6: - MOVQ res+0(FP), DI - MOVQ s7-64(SP), AX - MOVQ s8-72(SP), DX - MOVQ s9-80(SP), CX - MOVQ s10-88(SP), BX - MOVQ s11-96(SP), SI - MOVQ AX, 0(DI) - MOVQ DX, 8(DI) - MOVQ CX, 16(DI) - MOVQ BX, 24(DI) - MOVQ SI, 32(DI) - RET - -l7: - MOVQ res+0(FP), DI - MOVQ $0, 0(DI) - MOVQ $0, 8(DI) - MOVQ $0, 16(DI) - MOVQ $0, 24(DI) - MOVQ $0, 32(DI) - RET diff --git a/ecc/bls24-315/fp/element_ops_noasm.go b/ecc/bls24-315/fp/element_ops_noasm.go index 48d55e2ea..fec628918 100644 --- a/ecc/bls24-315/fp/element_ops_noasm.go +++ 
b/ecc/bls24-315/fp/element_ops_noasm.go @@ -51,10 +51,6 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bls24-315/fr/element.go b/ecc/bls24-315/fr/element.go index 726a02f32..ec20f3a0d 100644 --- a/ecc/bls24-315/fr/element.go +++ b/ecc/bls24-315/fr/element.go @@ -845,17 +845,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -990,11 +982,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bls24-315/fr/element_ops_amd64.go b/ecc/bls24-315/fr/element_ops_amd64.go index 9ebabc26a..78022b3e6 100644 --- a/ecc/bls24-315/fr/element_ops_amd64.go +++ b/ecc/bls24-315/fr/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bls24-315/fr/element_ops_amd64.s b/ecc/bls24-315/fr/element_ops_amd64.s index 2b915f270..102b8883b 100644 --- a/ecc/bls24-315/fr/element_ops_amd64.s +++ b/ecc/bls24-315/fr/element_ops_amd64.s @@ -338,268 +338,3 @@ TEXT ·Butterfly(SB), NOSPLIT, $0-16 MOVQ SI, 16(AX) MOVQ DI, 24(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $56-16 - // u = q - // u[0] -> DI - // u[1] -> R8 - // u[2] -> R9 - // u[3] -> R10 - MOVQ q<>+0(SB), DI - MOVQ q<>+8(SB), R8 - MOVQ q<>+16(SB), R9 - MOVQ q<>+24(SB), R10 - - // s = r^2 - // s[0] -> s3-32(SP) - // s[1] -> s4-40(SP) - // s[2] -> s5-48(SP) - // s[3] -> s6-56(SP) - MOVQ $0x56a1ff2e50fc8851, SI - MOVQ SI, s3-32(SP) - MOVQ $0xeb3f198d55a12c3f, SI - MOVQ SI, s4-40(SP) - MOVQ $0x9799359271b08283, SI - MOVQ SI, s5-48(SP) - MOVQ $0x081d245007d35a5a, SI - MOVQ SI, s6-56(SP) - - // v = x - // v[0] -> R11 - // v[1] -> R12 - // v[2] -> R13 - // v[3] -> R14 - MOVQ x+8(FP), SI - MOVQ 0(SI), AX - MOVQ 8(SI), DX - MOVQ 16(SI), CX - MOVQ 24(SI), BX - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - - // if x is 0, returns 0 - MOVQ AX, SI - ORQ DX, SI - ORQ CX, SI - ORQ BX, SI - JEQ l7 - - // r = 0 - // r[0] -> R15 - // r[1] -> s0-8(SP) - // r[2] -> s1-16(SP) - // r[3] -> s2-24(SP) - MOVQ $0, R15 - MOVQ $0, s0-8(SP) - MOVQ $0, s1-16(SP) - MOVQ $0, s2-24(SP) - -l2: - BTQ $0, AX - JCS l8 - MOVQ $0, BP - XORQ SI, SI - -l9: - INCQ BP - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - BTQ $0, AX - JCC l9 - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - -l10: - BTQ $0, AX - JCC l11 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l11: - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - DECQ BP - JNE l10 - MOVQ AX, s3-32(SP) - MOVQ DX, 
s4-40(SP) - MOVQ CX, s5-48(SP) - MOVQ BX, s6-56(SP) - -l8: - MOVQ DI, AX - MOVQ R8, DX - MOVQ R9, CX - MOVQ R10, BX - BTQ $0, AX - JCS l12 - MOVQ $0, BP - XORQ SI, SI - -l13: - INCQ BP - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - BTQ $0, AX - JCC l13 - MOVQ AX, DI - MOVQ DX, R8 - MOVQ CX, R9 - MOVQ BX, R10 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - -l14: - BTQ $0, AX - JCC l15 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l15: - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - DECQ BP - JNE l14 - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - -l12: - // v = v - u - MOVQ R11, AX - MOVQ R12, DX - MOVQ R13, CX - MOVQ R14, BX - SUBQ DI, AX - SBBQ R8, DX - SBBQ R9, CX - SBBQ R10, BX - JCC l3 - SUBQ R11, DI - SBBQ R12, R8 - SBBQ R13, R9 - SBBQ R14, R10 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - SUBQ s3-32(SP), AX - SBBQ s4-40(SP), DX - SBBQ s5-48(SP), CX - SBBQ s6-56(SP), BX - JCC l16 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l16: - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - JMP l4 - -l3: - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - SUBQ R15, AX - SBBQ s0-8(SP), DX - SBBQ s1-16(SP), CX - SBBQ s2-24(SP), BX - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l17: - MOVQ AX, s3-32(SP) - MOVQ DX, s4-40(SP) - MOVQ CX, s5-48(SP) - MOVQ BX, s6-56(SP) - -l4: - MOVQ DI, SI - SUBQ $1, SI - ORQ R8, SI - ORQ R9, SI - ORQ R10, SI - JEQ l5 - MOVQ R11, AX - MOVQ R12, DX - MOVQ R13, CX - MOVQ R14, BX - MOVQ AX, SI - SUBQ $1, SI - JNE l2 - ORQ DX, SI - ORQ CX, SI - ORQ BX, SI - JEQ l6 - JMP l2 - -l5: - MOVQ res+0(FP), SI - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ AX, 0(SI) - MOVQ DX, 8(SI) - MOVQ CX, 16(SI) - MOVQ BX, 24(SI) - RET - -l6: - MOVQ res+0(FP), SI - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - MOVQ AX, 0(SI) - MOVQ DX, 8(SI) - MOVQ CX, 16(SI) - MOVQ BX, 24(SI) - RET - -l7: - MOVQ res+0(FP), SI - MOVQ $0, 0(SI) - MOVQ $0, 8(SI) - MOVQ $0, 16(SI) - MOVQ $0, 24(SI) - RET diff --git a/ecc/bls24-315/fr/element_ops_noasm.go b/ecc/bls24-315/fr/element_ops_noasm.go index 006365daa..ec1fac18d 100644 --- a/ecc/bls24-315/fr/element_ops_noasm.go +++ b/ecc/bls24-315/fr/element_ops_noasm.go @@ -51,10 +51,6 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in place (i.e. 
mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bn254/fp/element.go b/ecc/bn254/fp/element.go index 1aa44e1e4..d5363aa4f 100644 --- a/ecc/bn254/fp/element.go +++ b/ecc/bn254/fp/element.go @@ -793,17 +793,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -938,11 +930,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bn254/fp/element_ops_amd64.go b/ecc/bn254/fp/element_ops_amd64.go index d61412bd6..73a3711ec 100644 --- a/ecc/bn254/fp/element_ops_amd64.go +++ b/ecc/bn254/fp/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bn254/fp/element_ops_amd64.s b/ecc/bn254/fp/element_ops_amd64.s index bbea3d8c6..5b8b1f0e7 100644 --- a/ecc/bn254/fp/element_ops_amd64.s +++ b/ecc/bn254/fp/element_ops_amd64.s @@ -338,268 +338,3 @@ TEXT ·Butterfly(SB), NOSPLIT, $0-16 MOVQ SI, 16(AX) MOVQ DI, 24(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $56-16 - // u = q - // u[0] -> DI - // u[1] -> R8 - // u[2] -> R9 - // u[3] -> R10 - MOVQ q<>+0(SB), DI - MOVQ q<>+8(SB), R8 - MOVQ q<>+16(SB), R9 - MOVQ q<>+24(SB), R10 - - // s = r^2 - // s[0] -> s3-32(SP) - // s[1] -> s4-40(SP) - // s[2] -> s5-48(SP) - // s[3] -> s6-56(SP) - MOVQ $0xf32cfc5b538afa89, SI - MOVQ SI, s3-32(SP) - MOVQ $0xb5e71911d44501fb, SI - MOVQ SI, s4-40(SP) - MOVQ $0x47ab1eff0a417ff6, SI - MOVQ SI, s5-48(SP) - MOVQ $0x06d89f71cab8351f, SI - MOVQ SI, s6-56(SP) - - // v = x - // v[0] -> R11 - // v[1] -> R12 - // v[2] -> R13 - // v[3] -> R14 - MOVQ x+8(FP), SI - MOVQ 0(SI), AX - MOVQ 8(SI), DX - MOVQ 16(SI), CX - MOVQ 24(SI), BX - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - - // if x is 0, returns 0 - MOVQ AX, SI - ORQ DX, SI - ORQ CX, SI - ORQ BX, SI - JEQ l7 - - // r = 0 - // r[0] -> R15 - // r[1] -> s0-8(SP) - // r[2] -> s1-16(SP) - // r[3] -> s2-24(SP) - MOVQ $0, R15 - MOVQ $0, s0-8(SP) - MOVQ $0, s1-16(SP) - MOVQ $0, s2-24(SP) - -l2: - BTQ $0, AX - JCS l8 - MOVQ $0, BP - XORQ SI, SI - -l9: - INCQ BP - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - BTQ $0, AX - JCC l9 - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - -l10: - BTQ $0, AX - JCC l11 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l11: - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - DECQ BP - JNE l10 - MOVQ AX, s3-32(SP) - MOVQ DX, s4-40(SP) - MOVQ CX, s5-48(SP) - MOVQ BX, s6-56(SP) - -l8: - MOVQ DI, AX - MOVQ R8, DX - MOVQ R9, CX - MOVQ R10, BX - BTQ $0, AX - JCS l12 - MOVQ $0, BP - XORQ SI, SI - -l13: - INCQ BP - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX 
- SHRQ $1, BX - BTQ $0, AX - JCC l13 - MOVQ AX, DI - MOVQ DX, R8 - MOVQ CX, R9 - MOVQ BX, R10 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - -l14: - BTQ $0, AX - JCC l15 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l15: - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - DECQ BP - JNE l14 - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - -l12: - // v = v - u - MOVQ R11, AX - MOVQ R12, DX - MOVQ R13, CX - MOVQ R14, BX - SUBQ DI, AX - SBBQ R8, DX - SBBQ R9, CX - SBBQ R10, BX - JCC l3 - SUBQ R11, DI - SBBQ R12, R8 - SBBQ R13, R9 - SBBQ R14, R10 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - SUBQ s3-32(SP), AX - SBBQ s4-40(SP), DX - SBBQ s5-48(SP), CX - SBBQ s6-56(SP), BX - JCC l16 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l16: - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - JMP l4 - -l3: - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - SUBQ R15, AX - SBBQ s0-8(SP), DX - SBBQ s1-16(SP), CX - SBBQ s2-24(SP), BX - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l17: - MOVQ AX, s3-32(SP) - MOVQ DX, s4-40(SP) - MOVQ CX, s5-48(SP) - MOVQ BX, s6-56(SP) - -l4: - MOVQ DI, SI - SUBQ $1, SI - ORQ R8, SI - ORQ R9, SI - ORQ R10, SI - JEQ l5 - MOVQ R11, AX - MOVQ R12, DX - MOVQ R13, CX - MOVQ R14, BX - MOVQ AX, SI - SUBQ $1, SI - JNE l2 - ORQ DX, SI - ORQ CX, SI - ORQ BX, SI - JEQ l6 - JMP l2 - -l5: - MOVQ res+0(FP), SI - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ AX, 0(SI) - MOVQ DX, 8(SI) - MOVQ CX, 16(SI) - MOVQ BX, 24(SI) - RET - -l6: - MOVQ res+0(FP), SI - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - MOVQ AX, 0(SI) - MOVQ DX, 8(SI) - MOVQ CX, 16(SI) - MOVQ BX, 24(SI) - RET - -l7: - MOVQ res+0(FP), SI - MOVQ $0, 0(SI) - MOVQ $0, 8(SI) - MOVQ $0, 16(SI) - MOVQ $0, 24(SI) - RET diff --git a/ecc/bn254/fp/element_ops_noasm.go b/ecc/bn254/fp/element_ops_noasm.go index 48d55e2ea..fec628918 100644 --- a/ecc/bn254/fp/element_ops_noasm.go +++ b/ecc/bn254/fp/element_ops_noasm.go @@ -51,10 +51,6 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in place (i.e. 
mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bn254/fr/element.go b/ecc/bn254/fr/element.go index e1666826b..9a32f847d 100644 --- a/ecc/bn254/fr/element.go +++ b/ecc/bn254/fr/element.go @@ -845,17 +845,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -990,11 +982,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bn254/fr/element_ops_amd64.go b/ecc/bn254/fr/element_ops_amd64.go index 9ebabc26a..78022b3e6 100644 --- a/ecc/bn254/fr/element_ops_amd64.go +++ b/ecc/bn254/fr/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bn254/fr/element_ops_amd64.s b/ecc/bn254/fr/element_ops_amd64.s index 318af3bc9..d5dca83d2 100644 --- a/ecc/bn254/fr/element_ops_amd64.s +++ b/ecc/bn254/fr/element_ops_amd64.s @@ -338,268 +338,3 @@ TEXT ·Butterfly(SB), NOSPLIT, $0-16 MOVQ SI, 16(AX) MOVQ DI, 24(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $56-16 - // u = q - // u[0] -> DI - // u[1] -> R8 - // u[2] -> R9 - // u[3] -> R10 - MOVQ q<>+0(SB), DI - MOVQ q<>+8(SB), R8 - MOVQ q<>+16(SB), R9 - MOVQ q<>+24(SB), R10 - - // s = r^2 - // s[0] -> s3-32(SP) - // s[1] -> s4-40(SP) - // s[2] -> s5-48(SP) - // s[3] -> s6-56(SP) - MOVQ $0x1bb8e645ae216da7, SI - MOVQ SI, s3-32(SP) - MOVQ $0x53fe3ab1e35c59e3, SI - MOVQ SI, s4-40(SP) - MOVQ $0x8c49833d53bb8085, SI - MOVQ SI, s5-48(SP) - MOVQ $0x0216d0b17f4e44a5, SI - MOVQ SI, s6-56(SP) - - // v = x - // v[0] -> R11 - // v[1] -> R12 - // v[2] -> R13 - // v[3] -> R14 - MOVQ x+8(FP), SI - MOVQ 0(SI), AX - MOVQ 8(SI), DX - MOVQ 16(SI), CX - MOVQ 24(SI), BX - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - - // if x is 0, returns 0 - MOVQ AX, SI - ORQ DX, SI - ORQ CX, SI - ORQ BX, SI - JEQ l7 - - // r = 0 - // r[0] -> R15 - // r[1] -> s0-8(SP) - // r[2] -> s1-16(SP) - // r[3] -> s2-24(SP) - MOVQ $0, R15 - MOVQ $0, s0-8(SP) - MOVQ $0, s1-16(SP) - MOVQ $0, s2-24(SP) - -l2: - BTQ $0, AX - JCS l8 - MOVQ $0, BP - XORQ SI, SI - -l9: - INCQ BP - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - BTQ $0, AX - JCC l9 - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - -l10: - BTQ $0, AX - JCC l11 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l11: - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - DECQ BP - JNE l10 - MOVQ AX, s3-32(SP) - MOVQ DX, s4-40(SP) - MOVQ CX, s5-48(SP) - MOVQ BX, s6-56(SP) - -l8: - MOVQ DI, AX - MOVQ R8, DX - MOVQ R9, CX - MOVQ R10, BX - BTQ $0, AX - JCS l12 - MOVQ $0, BP - XORQ SI, SI - -l13: - INCQ BP - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX 
- SHRQ $1, BX - BTQ $0, AX - JCC l13 - MOVQ AX, DI - MOVQ DX, R8 - MOVQ CX, R9 - MOVQ BX, R10 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - -l14: - BTQ $0, AX - JCC l15 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l15: - SHRQ $1, AX, SI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, BX - DECQ BP - JNE l14 - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - -l12: - // v = v - u - MOVQ R11, AX - MOVQ R12, DX - MOVQ R13, CX - MOVQ R14, BX - SUBQ DI, AX - SBBQ R8, DX - SBBQ R9, CX - SBBQ R10, BX - JCC l3 - SUBQ R11, DI - SBBQ R12, R8 - SBBQ R13, R9 - SBBQ R14, R10 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - SUBQ s3-32(SP), AX - SBBQ s4-40(SP), DX - SBBQ s5-48(SP), CX - SBBQ s6-56(SP), BX - JCC l16 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l16: - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - JMP l4 - -l3: - MOVQ AX, R11 - MOVQ DX, R12 - MOVQ CX, R13 - MOVQ BX, R14 - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - SUBQ R15, AX - SBBQ s0-8(SP), DX - SBBQ s1-16(SP), CX - SBBQ s2-24(SP), BX - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - -l17: - MOVQ AX, s3-32(SP) - MOVQ DX, s4-40(SP) - MOVQ CX, s5-48(SP) - MOVQ BX, s6-56(SP) - -l4: - MOVQ DI, SI - SUBQ $1, SI - ORQ R8, SI - ORQ R9, SI - ORQ R10, SI - JEQ l5 - MOVQ R11, AX - MOVQ R12, DX - MOVQ R13, CX - MOVQ R14, BX - MOVQ AX, SI - SUBQ $1, SI - JNE l2 - ORQ DX, SI - ORQ CX, SI - ORQ BX, SI - JEQ l6 - JMP l2 - -l5: - MOVQ res+0(FP), SI - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ AX, 0(SI) - MOVQ DX, 8(SI) - MOVQ CX, 16(SI) - MOVQ BX, 24(SI) - RET - -l6: - MOVQ res+0(FP), SI - MOVQ s3-32(SP), AX - MOVQ s4-40(SP), DX - MOVQ s5-48(SP), CX - MOVQ s6-56(SP), BX - MOVQ AX, 0(SI) - MOVQ DX, 8(SI) - MOVQ CX, 16(SI) - MOVQ BX, 24(SI) - RET - -l7: - MOVQ res+0(FP), SI - MOVQ $0, 0(SI) - MOVQ $0, 8(SI) - MOVQ $0, 16(SI) - MOVQ $0, 24(SI) - RET diff --git a/ecc/bn254/fr/element_ops_noasm.go b/ecc/bn254/fr/element_ops_noasm.go index 006365daa..ec1fac18d 100644 --- a/ecc/bn254/fr/element_ops_noasm.go +++ b/ecc/bn254/fr/element_ops_noasm.go @@ -51,10 +51,6 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in place (i.e. 
mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bw6-633/fp/element.go b/ecc/bw6-633/fp/element.go index 398f6b339..2af61465c 100644 --- a/ecc/bw6-633/fp/element.go +++ b/ecc/bw6-633/fp/element.go @@ -1281,17 +1281,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -1510,11 +1502,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[9]|u[8]|u[7]|u[6]|u[5]|u[4]|u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[9]|v[8]|v[7]|v[6]|v[5]|v[4]|v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bw6-633/fp/element_ops_amd64.go b/ecc/bw6-633/fp/element_ops_amd64.go index d61412bd6..73a3711ec 100644 --- a/ecc/bw6-633/fp/element_ops_amd64.go +++ b/ecc/bw6-633/fp/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bw6-633/fp/element_ops_amd64.s b/ecc/bw6-633/fp/element_ops_amd64.s index 78f5179d2..bb8d0be3b 100644 --- a/ecc/bw6-633/fp/element_ops_amd64.s +++ b/ecc/bw6-633/fp/element_ops_amd64.s @@ -644,571 +644,3 @@ l3: MOVQ R11, 64(AX) MOVQ R12, 72(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $296-16 - // u = q - // u[0] -> R13 - // u[1] -> R14 - // u[2] -> R15 - // u[3] -> s0-8(SP) - // u[4] -> s1-16(SP) - // u[5] -> s2-24(SP) - // u[6] -> s3-32(SP) - // u[7] -> s4-40(SP) - // u[8] -> s5-48(SP) - // u[9] -> s6-56(SP) - MOVQ q<>+0(SB), R12 - MOVQ R12, R13 - MOVQ q<>+8(SB), R12 - MOVQ R12, R14 - MOVQ q<>+16(SB), R12 - MOVQ R12, R15 - MOVQ q<>+24(SB), R12 - MOVQ R12, s0-8(SP) - MOVQ q<>+32(SB), R12 - MOVQ R12, s1-16(SP) - MOVQ q<>+40(SB), R12 - MOVQ R12, s2-24(SP) - MOVQ q<>+48(SB), R12 - MOVQ R12, s3-32(SP) - MOVQ q<>+56(SB), R12 - MOVQ R12, s4-40(SP) - MOVQ q<>+64(SB), R12 - MOVQ R12, s5-48(SP) - MOVQ q<>+72(SB), R12 - MOVQ R12, s6-56(SP) - - // s = r^2 - // s[0] -> s27-224(SP) - // s[1] -> s28-232(SP) - // s[2] -> s29-240(SP) - // s[3] -> s30-248(SP) - // s[4] -> s31-256(SP) - // s[5] -> s32-264(SP) - // s[6] -> s33-272(SP) - // s[7] -> s34-280(SP) - // s[8] -> s35-288(SP) - // s[9] -> s36-296(SP) - MOVQ $0x661e804ca9d73f4c, R12 - MOVQ R12, s27-224(SP) - MOVQ $0xc8097534c70cdf8b, R12 - MOVQ R12, s28-232(SP) - MOVQ $0xe6a4436c7d9c2a0b, R12 - MOVQ R12, s29-240(SP) - MOVQ $0x0a8eade777742a9e, R12 - MOVQ R12, s30-248(SP) - MOVQ $0xb0fc02b996feedd8, R12 - MOVQ R12, s31-256(SP) - MOVQ $0xba4fdbddeb83543a, R12 - MOVQ R12, s32-264(SP) - MOVQ $0xaebec1921b2490f8, R12 - MOVQ R12, s33-272(SP) - MOVQ $0xd4af2c0e74212f40, R12 - MOVQ R12, s34-280(SP) - MOVQ $0x499179a8fa1cce12, R12 - MOVQ R12, s35-288(SP) - MOVQ $0x007da75a34ab397a, R12 - MOVQ R12, s36-296(SP) - - // v = x - // v[0] -> s7-64(SP) - // v[1] -> s8-72(SP) - // v[2] -> s9-80(SP) - // v[3] -> s10-88(SP) - // v[4] -> s11-96(SP) - // v[5] -> s12-104(SP) - // v[6] -> s13-112(SP) - // v[7] -> s14-120(SP) - // v[8] -> s15-128(SP) - // v[9] -> s16-136(SP) - MOVQ x+8(FP), R12 - MOVQ 
0(R12), AX - MOVQ 8(R12), DX - MOVQ 16(R12), CX - MOVQ 24(R12), BX - MOVQ 32(R12), SI - MOVQ 40(R12), DI - MOVQ 48(R12), R8 - MOVQ 56(R12), R9 - MOVQ 64(R12), R10 - MOVQ 72(R12), R11 - MOVQ AX, s7-64(SP) - MOVQ DX, s8-72(SP) - MOVQ CX, s9-80(SP) - MOVQ BX, s10-88(SP) - MOVQ SI, s11-96(SP) - MOVQ DI, s12-104(SP) - MOVQ R8, s13-112(SP) - MOVQ R9, s14-120(SP) - MOVQ R10, s15-128(SP) - MOVQ R11, s16-136(SP) - - // if x is 0, returns 0 - MOVQ AX, R12 - ORQ DX, R12 - ORQ CX, R12 - ORQ BX, R12 - ORQ SI, R12 - ORQ DI, R12 - ORQ R8, R12 - ORQ R9, R12 - ORQ R10, R12 - ORQ R11, R12 - JEQ l9 - - // r = 0 - // r[0] -> s17-144(SP) - // r[1] -> s18-152(SP) - // r[2] -> s19-160(SP) - // r[3] -> s20-168(SP) - // r[4] -> s21-176(SP) - // r[5] -> s22-184(SP) - // r[6] -> s23-192(SP) - // r[7] -> s24-200(SP) - // r[8] -> s25-208(SP) - // r[9] -> s26-216(SP) - MOVQ $0, s17-144(SP) - MOVQ $0, s18-152(SP) - MOVQ $0, s19-160(SP) - MOVQ $0, s20-168(SP) - MOVQ $0, s21-176(SP) - MOVQ $0, s22-184(SP) - MOVQ $0, s23-192(SP) - MOVQ $0, s24-200(SP) - MOVQ $0, s25-208(SP) - MOVQ $0, s26-216(SP) - -l4: - BTQ $0, AX - JCS l10 - MOVQ $0, BP - XORQ R12, R12 - -l11: - INCQ BP - SHRQ $1, AX, R12 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, R8, DI - SHRQ $1, R9, R8 - SHRQ $1, R10, R9 - SHRQ $1, R11, R10 - SHRQ $1, R11 - BTQ $0, AX - JCC l11 - MOVQ AX, s7-64(SP) - MOVQ DX, s8-72(SP) - MOVQ CX, s9-80(SP) - MOVQ BX, s10-88(SP) - MOVQ SI, s11-96(SP) - MOVQ DI, s12-104(SP) - MOVQ R8, s13-112(SP) - MOVQ R9, s14-120(SP) - MOVQ R10, s15-128(SP) - MOVQ R11, s16-136(SP) - MOVQ s27-224(SP), AX - MOVQ s28-232(SP), DX - MOVQ s29-240(SP), CX - MOVQ s30-248(SP), BX - MOVQ s31-256(SP), SI - MOVQ s32-264(SP), DI - MOVQ s33-272(SP), R8 - MOVQ s34-280(SP), R9 - MOVQ s35-288(SP), R10 - MOVQ s36-296(SP), R11 - -l12: - BTQ $0, AX - JCC l13 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - ADCQ q<>+48(SB), R8 - ADCQ q<>+56(SB), R9 - ADCQ q<>+64(SB), R10 - ADCQ q<>+72(SB), R11 - -l13: - SHRQ $1, AX, R12 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, R8, DI - SHRQ $1, R9, R8 - SHRQ $1, R10, R9 - SHRQ $1, R11, R10 - SHRQ $1, R11 - DECQ BP - JNE l12 - MOVQ AX, s27-224(SP) - MOVQ DX, s28-232(SP) - MOVQ CX, s29-240(SP) - MOVQ BX, s30-248(SP) - MOVQ SI, s31-256(SP) - MOVQ DI, s32-264(SP) - MOVQ R8, s33-272(SP) - MOVQ R9, s34-280(SP) - MOVQ R10, s35-288(SP) - MOVQ R11, s36-296(SP) - -l10: - MOVQ R13, AX - MOVQ R14, DX - MOVQ R15, CX - MOVQ s0-8(SP), BX - MOVQ s1-16(SP), SI - MOVQ s2-24(SP), DI - MOVQ s3-32(SP), R8 - MOVQ s4-40(SP), R9 - MOVQ s5-48(SP), R10 - MOVQ s6-56(SP), R11 - BTQ $0, AX - JCS l14 - MOVQ $0, BP - XORQ R12, R12 - -l15: - INCQ BP - SHRQ $1, AX, R12 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, R8, DI - SHRQ $1, R9, R8 - SHRQ $1, R10, R9 - SHRQ $1, R11, R10 - SHRQ $1, R11 - BTQ $0, AX - JCC l15 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ CX, R15 - MOVQ BX, s0-8(SP) - MOVQ SI, s1-16(SP) - MOVQ DI, s2-24(SP) - MOVQ R8, s3-32(SP) - MOVQ R9, s4-40(SP) - MOVQ R10, s5-48(SP) - MOVQ R11, s6-56(SP) - MOVQ s17-144(SP), AX - MOVQ s18-152(SP), DX - MOVQ s19-160(SP), CX - MOVQ s20-168(SP), BX - MOVQ s21-176(SP), SI - MOVQ s22-184(SP), DI - MOVQ s23-192(SP), R8 - MOVQ s24-200(SP), R9 - MOVQ s25-208(SP), R10 - MOVQ s26-216(SP), R11 - -l16: - BTQ $0, AX - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ 
q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - ADCQ q<>+48(SB), R8 - ADCQ q<>+56(SB), R9 - ADCQ q<>+64(SB), R10 - ADCQ q<>+72(SB), R11 - -l17: - SHRQ $1, AX, R12 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, R8, DI - SHRQ $1, R9, R8 - SHRQ $1, R10, R9 - SHRQ $1, R11, R10 - SHRQ $1, R11 - DECQ BP - JNE l16 - MOVQ AX, s17-144(SP) - MOVQ DX, s18-152(SP) - MOVQ CX, s19-160(SP) - MOVQ BX, s20-168(SP) - MOVQ SI, s21-176(SP) - MOVQ DI, s22-184(SP) - MOVQ R8, s23-192(SP) - MOVQ R9, s24-200(SP) - MOVQ R10, s25-208(SP) - MOVQ R11, s26-216(SP) - -l14: - // v = v - u - MOVQ s7-64(SP), AX - MOVQ s8-72(SP), DX - MOVQ s9-80(SP), CX - MOVQ s10-88(SP), BX - MOVQ s11-96(SP), SI - MOVQ s12-104(SP), DI - MOVQ s13-112(SP), R8 - MOVQ s14-120(SP), R9 - MOVQ s15-128(SP), R10 - MOVQ s16-136(SP), R11 - SUBQ R13, AX - SBBQ R14, DX - SBBQ R15, CX - SBBQ s0-8(SP), BX - SBBQ s1-16(SP), SI - SBBQ s2-24(SP), DI - SBBQ s3-32(SP), R8 - SBBQ s4-40(SP), R9 - SBBQ s5-48(SP), R10 - SBBQ s6-56(SP), R11 - JCC l5 - MOVQ R13, AX - MOVQ R14, DX - MOVQ R15, CX - MOVQ s0-8(SP), BX - MOVQ s1-16(SP), SI - MOVQ s2-24(SP), DI - MOVQ s3-32(SP), R8 - MOVQ s4-40(SP), R9 - MOVQ s5-48(SP), R10 - MOVQ s6-56(SP), R11 - SUBQ s7-64(SP), AX - SBBQ s8-72(SP), DX - SBBQ s9-80(SP), CX - SBBQ s10-88(SP), BX - SBBQ s11-96(SP), SI - SBBQ s12-104(SP), DI - SBBQ s13-112(SP), R8 - SBBQ s14-120(SP), R9 - SBBQ s15-128(SP), R10 - SBBQ s16-136(SP), R11 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ CX, R15 - MOVQ BX, s0-8(SP) - MOVQ SI, s1-16(SP) - MOVQ DI, s2-24(SP) - MOVQ R8, s3-32(SP) - MOVQ R9, s4-40(SP) - MOVQ R10, s5-48(SP) - MOVQ R11, s6-56(SP) - MOVQ s17-144(SP), AX - MOVQ s18-152(SP), DX - MOVQ s19-160(SP), CX - MOVQ s20-168(SP), BX - MOVQ s21-176(SP), SI - MOVQ s22-184(SP), DI - MOVQ s23-192(SP), R8 - MOVQ s24-200(SP), R9 - MOVQ s25-208(SP), R10 - MOVQ s26-216(SP), R11 - SUBQ s27-224(SP), AX - SBBQ s28-232(SP), DX - SBBQ s29-240(SP), CX - SBBQ s30-248(SP), BX - SBBQ s31-256(SP), SI - SBBQ s32-264(SP), DI - SBBQ s33-272(SP), R8 - SBBQ s34-280(SP), R9 - SBBQ s35-288(SP), R10 - SBBQ s36-296(SP), R11 - JCC l18 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - ADCQ q<>+48(SB), R8 - ADCQ q<>+56(SB), R9 - ADCQ q<>+64(SB), R10 - ADCQ q<>+72(SB), R11 - -l18: - MOVQ AX, s17-144(SP) - MOVQ DX, s18-152(SP) - MOVQ CX, s19-160(SP) - MOVQ BX, s20-168(SP) - MOVQ SI, s21-176(SP) - MOVQ DI, s22-184(SP) - MOVQ R8, s23-192(SP) - MOVQ R9, s24-200(SP) - MOVQ R10, s25-208(SP) - MOVQ R11, s26-216(SP) - JMP l6 - -l5: - MOVQ AX, s7-64(SP) - MOVQ DX, s8-72(SP) - MOVQ CX, s9-80(SP) - MOVQ BX, s10-88(SP) - MOVQ SI, s11-96(SP) - MOVQ DI, s12-104(SP) - MOVQ R8, s13-112(SP) - MOVQ R9, s14-120(SP) - MOVQ R10, s15-128(SP) - MOVQ R11, s16-136(SP) - MOVQ s27-224(SP), AX - MOVQ s28-232(SP), DX - MOVQ s29-240(SP), CX - MOVQ s30-248(SP), BX - MOVQ s31-256(SP), SI - MOVQ s32-264(SP), DI - MOVQ s33-272(SP), R8 - MOVQ s34-280(SP), R9 - MOVQ s35-288(SP), R10 - MOVQ s36-296(SP), R11 - SUBQ s17-144(SP), AX - SBBQ s18-152(SP), DX - SBBQ s19-160(SP), CX - SBBQ s20-168(SP), BX - SBBQ s21-176(SP), SI - SBBQ s22-184(SP), DI - SBBQ s23-192(SP), R8 - SBBQ s24-200(SP), R9 - SBBQ s25-208(SP), R10 - SBBQ s26-216(SP), R11 - JCC l19 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - ADCQ q<>+48(SB), R8 - ADCQ q<>+56(SB), R9 - ADCQ q<>+64(SB), R10 - ADCQ q<>+72(SB), 
R11 - -l19: - MOVQ AX, s27-224(SP) - MOVQ DX, s28-232(SP) - MOVQ CX, s29-240(SP) - MOVQ BX, s30-248(SP) - MOVQ SI, s31-256(SP) - MOVQ DI, s32-264(SP) - MOVQ R8, s33-272(SP) - MOVQ R9, s34-280(SP) - MOVQ R10, s35-288(SP) - MOVQ R11, s36-296(SP) - -l6: - MOVQ R13, AX - MOVQ R14, DX - MOVQ R15, CX - MOVQ s0-8(SP), BX - MOVQ s1-16(SP), SI - MOVQ s2-24(SP), DI - MOVQ s3-32(SP), R8 - MOVQ s4-40(SP), R9 - MOVQ s5-48(SP), R10 - MOVQ s6-56(SP), R11 - SUBQ $1, AX - ORQ AX, R11 - ORQ DX, R11 - ORQ CX, R11 - ORQ BX, R11 - ORQ SI, R11 - ORQ DI, R11 - ORQ R8, R11 - ORQ R9, R11 - ORQ R10, R11 - JEQ l7 - MOVQ s7-64(SP), AX - MOVQ s8-72(SP), DX - MOVQ s9-80(SP), CX - MOVQ s10-88(SP), BX - MOVQ s11-96(SP), SI - MOVQ s12-104(SP), DI - MOVQ s13-112(SP), R8 - MOVQ s14-120(SP), R9 - MOVQ s15-128(SP), R10 - MOVQ s16-136(SP), R11 - MOVQ AX, R12 - SUBQ $1, R12 - JNE l4 - ORQ DX, R12 - ORQ CX, R12 - ORQ BX, R12 - ORQ SI, R12 - ORQ DI, R12 - ORQ R8, R12 - ORQ R9, R12 - ORQ R10, R12 - ORQ R11, R12 - JEQ l8 - JMP l4 - -l7: - MOVQ res+0(FP), R12 - MOVQ s17-144(SP), AX - MOVQ s18-152(SP), DX - MOVQ s19-160(SP), CX - MOVQ s20-168(SP), BX - MOVQ s21-176(SP), SI - MOVQ s22-184(SP), DI - MOVQ s23-192(SP), R8 - MOVQ s24-200(SP), R9 - MOVQ s25-208(SP), R10 - MOVQ s26-216(SP), R11 - MOVQ AX, 0(R12) - MOVQ DX, 8(R12) - MOVQ CX, 16(R12) - MOVQ BX, 24(R12) - MOVQ SI, 32(R12) - MOVQ DI, 40(R12) - MOVQ R8, 48(R12) - MOVQ R9, 56(R12) - MOVQ R10, 64(R12) - MOVQ R11, 72(R12) - RET - -l8: - MOVQ res+0(FP), R12 - MOVQ s27-224(SP), AX - MOVQ s28-232(SP), DX - MOVQ s29-240(SP), CX - MOVQ s30-248(SP), BX - MOVQ s31-256(SP), SI - MOVQ s32-264(SP), DI - MOVQ s33-272(SP), R8 - MOVQ s34-280(SP), R9 - MOVQ s35-288(SP), R10 - MOVQ s36-296(SP), R11 - MOVQ AX, 0(R12) - MOVQ DX, 8(R12) - MOVQ CX, 16(R12) - MOVQ BX, 24(R12) - MOVQ SI, 32(R12) - MOVQ DI, 40(R12) - MOVQ R8, 48(R12) - MOVQ R9, 56(R12) - MOVQ R10, 64(R12) - MOVQ R11, 72(R12) - RET - -l9: - MOVQ res+0(FP), R12 - MOVQ $0, 0(R12) - MOVQ $0, 8(R12) - MOVQ $0, 16(R12) - MOVQ $0, 24(R12) - MOVQ $0, 32(R12) - MOVQ $0, 40(R12) - MOVQ $0, 48(R12) - MOVQ $0, 56(R12) - MOVQ $0, 64(R12) - MOVQ $0, 72(R12) - RET diff --git a/ecc/bw6-633/fp/element_ops_noasm.go b/ecc/bw6-633/fp/element_ops_noasm.go index 48d55e2ea..fec628918 100644 --- a/ecc/bw6-633/fp/element_ops_noasm.go +++ b/ecc/bw6-633/fp/element_ops_noasm.go @@ -51,10 +51,6 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in place (i.e. 
mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bw6-633/fr/element.go b/ecc/bw6-633/fr/element.go index 019e344e9..de7de2f48 100644 --- a/ecc/bw6-633/fr/element.go +++ b/ecc/bw6-633/fr/element.go @@ -911,17 +911,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -1070,11 +1062,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[4]|u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[4]|v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bw6-633/fr/element_ops_amd64.go b/ecc/bw6-633/fr/element_ops_amd64.go index 9ebabc26a..78022b3e6 100644 --- a/ecc/bw6-633/fr/element_ops_amd64.go +++ b/ecc/bw6-633/fr/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bw6-633/fr/element_ops_amd64.s b/ecc/bw6-633/fr/element_ops_amd64.s index 354d2294a..ab0ced516 100644 --- a/ecc/bw6-633/fr/element_ops_amd64.s +++ b/ecc/bw6-633/fr/element_ops_amd64.s @@ -398,312 +398,3 @@ TEXT ·Butterfly(SB), $24-16 MOVQ DI, 24(AX) MOVQ R8, 32(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $96-16 - // u = q - // u[0] -> R8 - // u[1] -> R9 - // u[2] -> R10 - // u[3] -> R11 - // u[4] -> R12 - MOVQ q<>+0(SB), R8 - MOVQ q<>+8(SB), R9 - MOVQ q<>+16(SB), R10 - MOVQ q<>+24(SB), R11 - MOVQ q<>+32(SB), R12 - - // s = r^2 - // s[0] -> s7-64(SP) - // s[1] -> s8-72(SP) - // s[2] -> s9-80(SP) - // s[3] -> s10-88(SP) - // s[4] -> s11-96(SP) - MOVQ $0x6b817891fe329c16, DI - MOVQ DI, s7-64(SP) - MOVQ $0x599ce86eec6e2c35, DI - MOVQ DI, s8-72(SP) - MOVQ $0xc338890f540d5ad6, DI - MOVQ DI, s9-80(SP) - MOVQ $0xcc160f6924c81f32, DI - MOVQ DI, s10-88(SP) - MOVQ $0x0215d8d4607a88d5, DI - MOVQ DI, s11-96(SP) - - // v = x - // v[0] -> R13 - // v[1] -> R14 - // v[2] -> R15 - // v[3] -> s0-8(SP) - // v[4] -> s1-16(SP) - MOVQ x+8(FP), DI - MOVQ 0(DI), AX - MOVQ 8(DI), DX - MOVQ 16(DI), CX - MOVQ 24(DI), BX - MOVQ 32(DI), SI - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ CX, R15 - MOVQ BX, s0-8(SP) - MOVQ SI, s1-16(SP) - - // if x is 0, returns 0 - MOVQ AX, DI - ORQ DX, DI - ORQ CX, DI - ORQ BX, DI - ORQ SI, DI - JEQ l7 - - // r = 0 - // r[0] -> s2-24(SP) - // r[1] -> s3-32(SP) - // r[2] -> s4-40(SP) - // r[3] -> s5-48(SP) - // r[4] -> s6-56(SP) - MOVQ $0, s2-24(SP) - MOVQ $0, s3-32(SP) - MOVQ $0, s4-40(SP) - MOVQ $0, s5-48(SP) - MOVQ $0, s6-56(SP) - -l2: - BTQ $0, AX - JCS l8 - MOVQ $0, BP - XORQ DI, DI - -l9: - INCQ BP - SHRQ $1, AX, DI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, SI - BTQ $0, AX - JCC l9 - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ CX, R15 - MOVQ BX, s0-8(SP) - MOVQ SI, s1-16(SP) - MOVQ s7-64(SP), AX - MOVQ s8-72(SP), DX - MOVQ s9-80(SP), CX - MOVQ s10-88(SP), BX - MOVQ s11-96(SP), SI - -l10: - BTQ $0, AX - JCC l11 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - -l11: - SHRQ $1, 
AX, DI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, SI - DECQ BP - JNE l10 - MOVQ AX, s7-64(SP) - MOVQ DX, s8-72(SP) - MOVQ CX, s9-80(SP) - MOVQ BX, s10-88(SP) - MOVQ SI, s11-96(SP) - -l8: - MOVQ R8, AX - MOVQ R9, DX - MOVQ R10, CX - MOVQ R11, BX - MOVQ R12, SI - BTQ $0, AX - JCS l12 - MOVQ $0, BP - XORQ DI, DI - -l13: - INCQ BP - SHRQ $1, AX, DI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, SI - BTQ $0, AX - JCC l13 - MOVQ AX, R8 - MOVQ DX, R9 - MOVQ CX, R10 - MOVQ BX, R11 - MOVQ SI, R12 - MOVQ s2-24(SP), AX - MOVQ s3-32(SP), DX - MOVQ s4-40(SP), CX - MOVQ s5-48(SP), BX - MOVQ s6-56(SP), SI - -l14: - BTQ $0, AX - JCC l15 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - -l15: - SHRQ $1, AX, DI - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, SI - DECQ BP - JNE l14 - MOVQ AX, s2-24(SP) - MOVQ DX, s3-32(SP) - MOVQ CX, s4-40(SP) - MOVQ BX, s5-48(SP) - MOVQ SI, s6-56(SP) - -l12: - // v = v - u - MOVQ R13, AX - MOVQ R14, DX - MOVQ R15, CX - MOVQ s0-8(SP), BX - MOVQ s1-16(SP), SI - SUBQ R8, AX - SBBQ R9, DX - SBBQ R10, CX - SBBQ R11, BX - SBBQ R12, SI - JCC l3 - SUBQ R13, R8 - SBBQ R14, R9 - SBBQ R15, R10 - SBBQ s0-8(SP), R11 - SBBQ s1-16(SP), R12 - MOVQ s2-24(SP), AX - MOVQ s3-32(SP), DX - MOVQ s4-40(SP), CX - MOVQ s5-48(SP), BX - MOVQ s6-56(SP), SI - SUBQ s7-64(SP), AX - SBBQ s8-72(SP), DX - SBBQ s9-80(SP), CX - SBBQ s10-88(SP), BX - SBBQ s11-96(SP), SI - JCC l16 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - -l16: - MOVQ AX, s2-24(SP) - MOVQ DX, s3-32(SP) - MOVQ CX, s4-40(SP) - MOVQ BX, s5-48(SP) - MOVQ SI, s6-56(SP) - JMP l4 - -l3: - MOVQ AX, R13 - MOVQ DX, R14 - MOVQ CX, R15 - MOVQ BX, s0-8(SP) - MOVQ SI, s1-16(SP) - MOVQ s7-64(SP), AX - MOVQ s8-72(SP), DX - MOVQ s9-80(SP), CX - MOVQ s10-88(SP), BX - MOVQ s11-96(SP), SI - SUBQ s2-24(SP), AX - SBBQ s3-32(SP), DX - SBBQ s4-40(SP), CX - SBBQ s5-48(SP), BX - SBBQ s6-56(SP), SI - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - -l17: - MOVQ AX, s7-64(SP) - MOVQ DX, s8-72(SP) - MOVQ CX, s9-80(SP) - MOVQ BX, s10-88(SP) - MOVQ SI, s11-96(SP) - -l4: - MOVQ R8, DI - SUBQ $1, DI - ORQ R9, DI - ORQ R10, DI - ORQ R11, DI - ORQ R12, DI - JEQ l5 - MOVQ R13, AX - MOVQ R14, DX - MOVQ R15, CX - MOVQ s0-8(SP), BX - MOVQ s1-16(SP), SI - MOVQ AX, DI - SUBQ $1, DI - JNE l2 - ORQ DX, DI - ORQ CX, DI - ORQ BX, DI - ORQ SI, DI - JEQ l6 - JMP l2 - -l5: - MOVQ res+0(FP), DI - MOVQ s2-24(SP), AX - MOVQ s3-32(SP), DX - MOVQ s4-40(SP), CX - MOVQ s5-48(SP), BX - MOVQ s6-56(SP), SI - MOVQ AX, 0(DI) - MOVQ DX, 8(DI) - MOVQ CX, 16(DI) - MOVQ BX, 24(DI) - MOVQ SI, 32(DI) - RET - -l6: - MOVQ res+0(FP), DI - MOVQ s7-64(SP), AX - MOVQ s8-72(SP), DX - MOVQ s9-80(SP), CX - MOVQ s10-88(SP), BX - MOVQ s11-96(SP), SI - MOVQ AX, 0(DI) - MOVQ DX, 8(DI) - MOVQ CX, 16(DI) - MOVQ BX, 24(DI) - MOVQ SI, 32(DI) - RET - -l7: - MOVQ res+0(FP), DI - MOVQ $0, 0(DI) - MOVQ $0, 8(DI) - MOVQ $0, 16(DI) - MOVQ $0, 24(DI) - MOVQ $0, 32(DI) - RET diff --git a/ecc/bw6-633/fr/element_ops_noasm.go b/ecc/bw6-633/fr/element_ops_noasm.go index 006365daa..ec1fac18d 100644 --- a/ecc/bw6-633/fr/element_ops_noasm.go +++ b/ecc/bw6-633/fr/element_ops_noasm.go @@ -51,10 +51,6 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in 
place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bw6-761/fp/element.go b/ecc/bw6-761/fp/element.go index a40f89d6b..2c137eb98 100644 --- a/ecc/bw6-761/fp/element.go +++ b/ecc/bw6-761/fp/element.go @@ -1481,17 +1481,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -1738,11 +1730,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[11]|u[10]|u[9]|u[8]|u[7]|u[6]|u[5]|u[4]|u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[11]|v[10]|v[9]|v[8]|v[7]|v[6]|v[5]|v[4]|v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bw6-761/fp/element_ops_amd64.go b/ecc/bw6-761/fp/element_ops_amd64.go index d61412bd6..73a3711ec 100644 --- a/ecc/bw6-761/fp/element_ops_amd64.go +++ b/ecc/bw6-761/fp/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bw6-761/fp/element_ops_amd64.s b/ecc/bw6-761/fp/element_ops_amd64.s index 2012c2840..242ba6a68 100644 --- a/ecc/bw6-761/fp/element_ops_amd64.s +++ b/ecc/bw6-761/fp/element_ops_amd64.s @@ -744,667 +744,3 @@ l3: MOVQ R13, 80(AX) MOVQ R14, 88(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $376-16 - // u = q - // u[0] -> R15 - // u[1] -> s0-8(SP) - // u[2] -> s1-16(SP) - // u[3] -> s2-24(SP) - // u[4] -> s3-32(SP) - // u[5] -> s4-40(SP) - // u[6] -> s5-48(SP) - // u[7] -> s6-56(SP) - // u[8] -> s7-64(SP) - // u[9] -> s8-72(SP) - // u[10] -> s9-80(SP) - // u[11] -> s10-88(SP) - MOVQ q<>+0(SB), R14 - MOVQ R14, R15 - MOVQ q<>+8(SB), R14 - MOVQ R14, s0-8(SP) - MOVQ q<>+16(SB), R14 - MOVQ R14, s1-16(SP) - MOVQ q<>+24(SB), R14 - MOVQ R14, s2-24(SP) - MOVQ q<>+32(SB), R14 - MOVQ R14, s3-32(SP) - MOVQ q<>+40(SB), R14 - MOVQ R14, s4-40(SP) - MOVQ q<>+48(SB), R14 - MOVQ R14, s5-48(SP) - MOVQ q<>+56(SB), R14 - MOVQ R14, s6-56(SP) - MOVQ q<>+64(SB), R14 - MOVQ R14, s7-64(SP) - MOVQ q<>+72(SB), R14 - MOVQ R14, s8-72(SP) - MOVQ q<>+80(SB), R14 - MOVQ R14, s9-80(SP) - MOVQ q<>+88(SB), R14 - MOVQ R14, s10-88(SP) - - // s = r^2 - // s[0] -> s35-288(SP) - // s[1] -> s36-296(SP) - // s[2] -> s37-304(SP) - // s[3] -> s38-312(SP) - // s[4] -> s39-320(SP) - // s[5] -> s40-328(SP) - // s[6] -> s41-336(SP) - // s[7] -> s42-344(SP) - // s[8] -> s43-352(SP) - // s[9] -> s44-360(SP) - // s[10] -> s45-368(SP) - // s[11] -> s46-376(SP) - MOVQ $0xc686392d2d1fa659, R14 - MOVQ R14, s35-288(SP) - MOVQ $0x7b14c9b2f79484ab, R14 - MOVQ R14, s36-296(SP) - MOVQ $0x7fa1e825c1d2b459, R14 - MOVQ R14, s37-304(SP) - MOVQ $0xd6ec28f848329d88, R14 - MOVQ R14, s38-312(SP) - MOVQ $0x4afb427b73a1ed40, R14 - MOVQ R14, s39-320(SP) - MOVQ $0x972c69400d5930ae, R14 - MOVQ R14, s40-328(SP) - MOVQ $0x2c7a26bf8c995976, R14 - MOVQ R14, s41-336(SP) - MOVQ $0xac52e458c6e57af9, R14 - MOVQ R14, s42-344(SP) - MOVQ $0xac731bfa0c536dfe, R14 - MOVQ R14, s43-352(SP) - MOVQ $0x121e5c630b103f50, R14 - MOVQ R14, s44-360(SP) - MOVQ $0x8f1b0953b886cda4, R14 
- MOVQ R14, s45-368(SP) - MOVQ $0x00ad253c2da8d807, R14 - MOVQ R14, s46-376(SP) - - // v = x - // v[0] -> s11-96(SP) - // v[1] -> s12-104(SP) - // v[2] -> s13-112(SP) - // v[3] -> s14-120(SP) - // v[4] -> s15-128(SP) - // v[5] -> s16-136(SP) - // v[6] -> s17-144(SP) - // v[7] -> s18-152(SP) - // v[8] -> s19-160(SP) - // v[9] -> s20-168(SP) - // v[10] -> s21-176(SP) - // v[11] -> s22-184(SP) - MOVQ x+8(FP), R14 - MOVQ 0(R14), AX - MOVQ 8(R14), DX - MOVQ 16(R14), CX - MOVQ 24(R14), BX - MOVQ 32(R14), SI - MOVQ 40(R14), DI - MOVQ 48(R14), R8 - MOVQ 56(R14), R9 - MOVQ 64(R14), R10 - MOVQ 72(R14), R11 - MOVQ 80(R14), R12 - MOVQ 88(R14), R13 - MOVQ AX, s11-96(SP) - MOVQ DX, s12-104(SP) - MOVQ CX, s13-112(SP) - MOVQ BX, s14-120(SP) - MOVQ SI, s15-128(SP) - MOVQ DI, s16-136(SP) - MOVQ R8, s17-144(SP) - MOVQ R9, s18-152(SP) - MOVQ R10, s19-160(SP) - MOVQ R11, s20-168(SP) - MOVQ R12, s21-176(SP) - MOVQ R13, s22-184(SP) - - // if x is 0, returns 0 - MOVQ AX, R14 - ORQ DX, R14 - ORQ CX, R14 - ORQ BX, R14 - ORQ SI, R14 - ORQ DI, R14 - ORQ R8, R14 - ORQ R9, R14 - ORQ R10, R14 - ORQ R11, R14 - ORQ R12, R14 - ORQ R13, R14 - JEQ l9 - - // r = 0 - // r[0] -> s23-192(SP) - // r[1] -> s24-200(SP) - // r[2] -> s25-208(SP) - // r[3] -> s26-216(SP) - // r[4] -> s27-224(SP) - // r[5] -> s28-232(SP) - // r[6] -> s29-240(SP) - // r[7] -> s30-248(SP) - // r[8] -> s31-256(SP) - // r[9] -> s32-264(SP) - // r[10] -> s33-272(SP) - // r[11] -> s34-280(SP) - MOVQ $0, s23-192(SP) - MOVQ $0, s24-200(SP) - MOVQ $0, s25-208(SP) - MOVQ $0, s26-216(SP) - MOVQ $0, s27-224(SP) - MOVQ $0, s28-232(SP) - MOVQ $0, s29-240(SP) - MOVQ $0, s30-248(SP) - MOVQ $0, s31-256(SP) - MOVQ $0, s32-264(SP) - MOVQ $0, s33-272(SP) - MOVQ $0, s34-280(SP) - -l4: - BTQ $0, AX - JCS l10 - MOVQ $0, BP - XORQ R14, R14 - -l11: - INCQ BP - SHRQ $1, AX, R14 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, R8, DI - SHRQ $1, R9, R8 - SHRQ $1, R10, R9 - SHRQ $1, R11, R10 - SHRQ $1, R12, R11 - SHRQ $1, R13, R12 - SHRQ $1, R13 - BTQ $0, AX - JCC l11 - MOVQ AX, s11-96(SP) - MOVQ DX, s12-104(SP) - MOVQ CX, s13-112(SP) - MOVQ BX, s14-120(SP) - MOVQ SI, s15-128(SP) - MOVQ DI, s16-136(SP) - MOVQ R8, s17-144(SP) - MOVQ R9, s18-152(SP) - MOVQ R10, s19-160(SP) - MOVQ R11, s20-168(SP) - MOVQ R12, s21-176(SP) - MOVQ R13, s22-184(SP) - MOVQ s35-288(SP), AX - MOVQ s36-296(SP), DX - MOVQ s37-304(SP), CX - MOVQ s38-312(SP), BX - MOVQ s39-320(SP), SI - MOVQ s40-328(SP), DI - MOVQ s41-336(SP), R8 - MOVQ s42-344(SP), R9 - MOVQ s43-352(SP), R10 - MOVQ s44-360(SP), R11 - MOVQ s45-368(SP), R12 - MOVQ s46-376(SP), R13 - -l12: - BTQ $0, AX - JCC l13 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - ADCQ q<>+48(SB), R8 - ADCQ q<>+56(SB), R9 - ADCQ q<>+64(SB), R10 - ADCQ q<>+72(SB), R11 - ADCQ q<>+80(SB), R12 - ADCQ q<>+88(SB), R13 - -l13: - SHRQ $1, AX, R14 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, R8, DI - SHRQ $1, R9, R8 - SHRQ $1, R10, R9 - SHRQ $1, R11, R10 - SHRQ $1, R12, R11 - SHRQ $1, R13, R12 - SHRQ $1, R13 - DECQ BP - JNE l12 - MOVQ AX, s35-288(SP) - MOVQ DX, s36-296(SP) - MOVQ CX, s37-304(SP) - MOVQ BX, s38-312(SP) - MOVQ SI, s39-320(SP) - MOVQ DI, s40-328(SP) - MOVQ R8, s41-336(SP) - MOVQ R9, s42-344(SP) - MOVQ R10, s43-352(SP) - MOVQ R11, s44-360(SP) - MOVQ R12, s45-368(SP) - MOVQ R13, s46-376(SP) - -l10: - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - 
MOVQ s3-32(SP), SI - MOVQ s4-40(SP), DI - MOVQ s5-48(SP), R8 - MOVQ s6-56(SP), R9 - MOVQ s7-64(SP), R10 - MOVQ s8-72(SP), R11 - MOVQ s9-80(SP), R12 - MOVQ s10-88(SP), R13 - BTQ $0, AX - JCS l14 - MOVQ $0, BP - XORQ R14, R14 - -l15: - INCQ BP - SHRQ $1, AX, R14 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, R8, DI - SHRQ $1, R9, R8 - SHRQ $1, R10, R9 - SHRQ $1, R11, R10 - SHRQ $1, R12, R11 - SHRQ $1, R13, R12 - SHRQ $1, R13 - BTQ $0, AX - JCC l15 - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - MOVQ SI, s3-32(SP) - MOVQ DI, s4-40(SP) - MOVQ R8, s5-48(SP) - MOVQ R9, s6-56(SP) - MOVQ R10, s7-64(SP) - MOVQ R11, s8-72(SP) - MOVQ R12, s9-80(SP) - MOVQ R13, s10-88(SP) - MOVQ s23-192(SP), AX - MOVQ s24-200(SP), DX - MOVQ s25-208(SP), CX - MOVQ s26-216(SP), BX - MOVQ s27-224(SP), SI - MOVQ s28-232(SP), DI - MOVQ s29-240(SP), R8 - MOVQ s30-248(SP), R9 - MOVQ s31-256(SP), R10 - MOVQ s32-264(SP), R11 - MOVQ s33-272(SP), R12 - MOVQ s34-280(SP), R13 - -l16: - BTQ $0, AX - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - ADCQ q<>+48(SB), R8 - ADCQ q<>+56(SB), R9 - ADCQ q<>+64(SB), R10 - ADCQ q<>+72(SB), R11 - ADCQ q<>+80(SB), R12 - ADCQ q<>+88(SB), R13 - -l17: - SHRQ $1, AX, R14 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, R8, DI - SHRQ $1, R9, R8 - SHRQ $1, R10, R9 - SHRQ $1, R11, R10 - SHRQ $1, R12, R11 - SHRQ $1, R13, R12 - SHRQ $1, R13 - DECQ BP - JNE l16 - MOVQ AX, s23-192(SP) - MOVQ DX, s24-200(SP) - MOVQ CX, s25-208(SP) - MOVQ BX, s26-216(SP) - MOVQ SI, s27-224(SP) - MOVQ DI, s28-232(SP) - MOVQ R8, s29-240(SP) - MOVQ R9, s30-248(SP) - MOVQ R10, s31-256(SP) - MOVQ R11, s32-264(SP) - MOVQ R12, s33-272(SP) - MOVQ R13, s34-280(SP) - -l14: - // v = v - u - MOVQ s11-96(SP), AX - MOVQ s12-104(SP), DX - MOVQ s13-112(SP), CX - MOVQ s14-120(SP), BX - MOVQ s15-128(SP), SI - MOVQ s16-136(SP), DI - MOVQ s17-144(SP), R8 - MOVQ s18-152(SP), R9 - MOVQ s19-160(SP), R10 - MOVQ s20-168(SP), R11 - MOVQ s21-176(SP), R12 - MOVQ s22-184(SP), R13 - SUBQ R15, AX - SBBQ s0-8(SP), DX - SBBQ s1-16(SP), CX - SBBQ s2-24(SP), BX - SBBQ s3-32(SP), SI - SBBQ s4-40(SP), DI - SBBQ s5-48(SP), R8 - SBBQ s6-56(SP), R9 - SBBQ s7-64(SP), R10 - SBBQ s8-72(SP), R11 - SBBQ s9-80(SP), R12 - SBBQ s10-88(SP), R13 - JCC l5 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ s3-32(SP), SI - MOVQ s4-40(SP), DI - MOVQ s5-48(SP), R8 - MOVQ s6-56(SP), R9 - MOVQ s7-64(SP), R10 - MOVQ s8-72(SP), R11 - MOVQ s9-80(SP), R12 - MOVQ s10-88(SP), R13 - SUBQ s11-96(SP), AX - SBBQ s12-104(SP), DX - SBBQ s13-112(SP), CX - SBBQ s14-120(SP), BX - SBBQ s15-128(SP), SI - SBBQ s16-136(SP), DI - SBBQ s17-144(SP), R8 - SBBQ s18-152(SP), R9 - SBBQ s19-160(SP), R10 - SBBQ s20-168(SP), R11 - SBBQ s21-176(SP), R12 - SBBQ s22-184(SP), R13 - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - MOVQ SI, s3-32(SP) - MOVQ DI, s4-40(SP) - MOVQ R8, s5-48(SP) - MOVQ R9, s6-56(SP) - MOVQ R10, s7-64(SP) - MOVQ R11, s8-72(SP) - MOVQ R12, s9-80(SP) - MOVQ R13, s10-88(SP) - MOVQ s23-192(SP), AX - MOVQ s24-200(SP), DX - MOVQ s25-208(SP), CX - MOVQ s26-216(SP), BX - MOVQ s27-224(SP), SI - MOVQ s28-232(SP), DI - MOVQ s29-240(SP), R8 - MOVQ s30-248(SP), R9 - MOVQ s31-256(SP), R10 - MOVQ s32-264(SP), R11 - MOVQ s33-272(SP), R12 - MOVQ s34-280(SP), R13 - SUBQ s35-288(SP), AX - SBBQ s36-296(SP), DX - SBBQ 
s37-304(SP), CX - SBBQ s38-312(SP), BX - SBBQ s39-320(SP), SI - SBBQ s40-328(SP), DI - SBBQ s41-336(SP), R8 - SBBQ s42-344(SP), R9 - SBBQ s43-352(SP), R10 - SBBQ s44-360(SP), R11 - SBBQ s45-368(SP), R12 - SBBQ s46-376(SP), R13 - JCC l18 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - ADCQ q<>+48(SB), R8 - ADCQ q<>+56(SB), R9 - ADCQ q<>+64(SB), R10 - ADCQ q<>+72(SB), R11 - ADCQ q<>+80(SB), R12 - ADCQ q<>+88(SB), R13 - -l18: - MOVQ AX, s23-192(SP) - MOVQ DX, s24-200(SP) - MOVQ CX, s25-208(SP) - MOVQ BX, s26-216(SP) - MOVQ SI, s27-224(SP) - MOVQ DI, s28-232(SP) - MOVQ R8, s29-240(SP) - MOVQ R9, s30-248(SP) - MOVQ R10, s31-256(SP) - MOVQ R11, s32-264(SP) - MOVQ R12, s33-272(SP) - MOVQ R13, s34-280(SP) - JMP l6 - -l5: - MOVQ AX, s11-96(SP) - MOVQ DX, s12-104(SP) - MOVQ CX, s13-112(SP) - MOVQ BX, s14-120(SP) - MOVQ SI, s15-128(SP) - MOVQ DI, s16-136(SP) - MOVQ R8, s17-144(SP) - MOVQ R9, s18-152(SP) - MOVQ R10, s19-160(SP) - MOVQ R11, s20-168(SP) - MOVQ R12, s21-176(SP) - MOVQ R13, s22-184(SP) - MOVQ s35-288(SP), AX - MOVQ s36-296(SP), DX - MOVQ s37-304(SP), CX - MOVQ s38-312(SP), BX - MOVQ s39-320(SP), SI - MOVQ s40-328(SP), DI - MOVQ s41-336(SP), R8 - MOVQ s42-344(SP), R9 - MOVQ s43-352(SP), R10 - MOVQ s44-360(SP), R11 - MOVQ s45-368(SP), R12 - MOVQ s46-376(SP), R13 - SUBQ s23-192(SP), AX - SBBQ s24-200(SP), DX - SBBQ s25-208(SP), CX - SBBQ s26-216(SP), BX - SBBQ s27-224(SP), SI - SBBQ s28-232(SP), DI - SBBQ s29-240(SP), R8 - SBBQ s30-248(SP), R9 - SBBQ s31-256(SP), R10 - SBBQ s32-264(SP), R11 - SBBQ s33-272(SP), R12 - SBBQ s34-280(SP), R13 - JCC l19 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - ADCQ q<>+48(SB), R8 - ADCQ q<>+56(SB), R9 - ADCQ q<>+64(SB), R10 - ADCQ q<>+72(SB), R11 - ADCQ q<>+80(SB), R12 - ADCQ q<>+88(SB), R13 - -l19: - MOVQ AX, s35-288(SP) - MOVQ DX, s36-296(SP) - MOVQ CX, s37-304(SP) - MOVQ BX, s38-312(SP) - MOVQ SI, s39-320(SP) - MOVQ DI, s40-328(SP) - MOVQ R8, s41-336(SP) - MOVQ R9, s42-344(SP) - MOVQ R10, s43-352(SP) - MOVQ R11, s44-360(SP) - MOVQ R12, s45-368(SP) - MOVQ R13, s46-376(SP) - -l6: - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ s3-32(SP), SI - MOVQ s4-40(SP), DI - MOVQ s5-48(SP), R8 - MOVQ s6-56(SP), R9 - MOVQ s7-64(SP), R10 - MOVQ s8-72(SP), R11 - MOVQ s9-80(SP), R12 - MOVQ s10-88(SP), R13 - SUBQ $1, AX - ORQ AX, R13 - ORQ DX, R13 - ORQ CX, R13 - ORQ BX, R13 - ORQ SI, R13 - ORQ DI, R13 - ORQ R8, R13 - ORQ R9, R13 - ORQ R10, R13 - ORQ R11, R13 - ORQ R12, R13 - JEQ l7 - MOVQ s11-96(SP), AX - MOVQ s12-104(SP), DX - MOVQ s13-112(SP), CX - MOVQ s14-120(SP), BX - MOVQ s15-128(SP), SI - MOVQ s16-136(SP), DI - MOVQ s17-144(SP), R8 - MOVQ s18-152(SP), R9 - MOVQ s19-160(SP), R10 - MOVQ s20-168(SP), R11 - MOVQ s21-176(SP), R12 - MOVQ s22-184(SP), R13 - MOVQ AX, R14 - SUBQ $1, R14 - JNE l4 - ORQ DX, R14 - ORQ CX, R14 - ORQ BX, R14 - ORQ SI, R14 - ORQ DI, R14 - ORQ R8, R14 - ORQ R9, R14 - ORQ R10, R14 - ORQ R11, R14 - ORQ R12, R14 - ORQ R13, R14 - JEQ l8 - JMP l4 - -l7: - MOVQ res+0(FP), R14 - MOVQ s23-192(SP), AX - MOVQ s24-200(SP), DX - MOVQ s25-208(SP), CX - MOVQ s26-216(SP), BX - MOVQ s27-224(SP), SI - MOVQ s28-232(SP), DI - MOVQ s29-240(SP), R8 - MOVQ s30-248(SP), R9 - MOVQ s31-256(SP), R10 - MOVQ s32-264(SP), R11 - MOVQ s33-272(SP), R12 - MOVQ s34-280(SP), R13 - MOVQ AX, 0(R14) - MOVQ DX, 8(R14) - MOVQ CX, 16(R14) - MOVQ BX, 24(R14) - MOVQ SI, 32(R14) - MOVQ DI, 
40(R14) - MOVQ R8, 48(R14) - MOVQ R9, 56(R14) - MOVQ R10, 64(R14) - MOVQ R11, 72(R14) - MOVQ R12, 80(R14) - MOVQ R13, 88(R14) - RET - -l8: - MOVQ res+0(FP), R14 - MOVQ s35-288(SP), AX - MOVQ s36-296(SP), DX - MOVQ s37-304(SP), CX - MOVQ s38-312(SP), BX - MOVQ s39-320(SP), SI - MOVQ s40-328(SP), DI - MOVQ s41-336(SP), R8 - MOVQ s42-344(SP), R9 - MOVQ s43-352(SP), R10 - MOVQ s44-360(SP), R11 - MOVQ s45-368(SP), R12 - MOVQ s46-376(SP), R13 - MOVQ AX, 0(R14) - MOVQ DX, 8(R14) - MOVQ CX, 16(R14) - MOVQ BX, 24(R14) - MOVQ SI, 32(R14) - MOVQ DI, 40(R14) - MOVQ R8, 48(R14) - MOVQ R9, 56(R14) - MOVQ R10, 64(R14) - MOVQ R11, 72(R14) - MOVQ R12, 80(R14) - MOVQ R13, 88(R14) - RET - -l9: - MOVQ res+0(FP), R14 - MOVQ $0, 0(R14) - MOVQ $0, 8(R14) - MOVQ $0, 16(R14) - MOVQ $0, 24(R14) - MOVQ $0, 32(R14) - MOVQ $0, 40(R14) - MOVQ $0, 48(R14) - MOVQ $0, 56(R14) - MOVQ $0, 64(R14) - MOVQ $0, 72(R14) - MOVQ $0, 80(R14) - MOVQ $0, 88(R14) - RET diff --git a/ecc/bw6-761/fp/element_ops_noasm.go b/ecc/bw6-761/fp/element_ops_noasm.go index 48d55e2ea..fec628918 100644 --- a/ecc/bw6-761/fp/element_ops_noasm.go +++ b/ecc/bw6-761/fp/element_ops_noasm.go @@ -51,10 +51,6 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/ecc/bw6-761/fr/element.go b/ecc/bw6-761/fr/element.go index 6199bcfb4..af0a16e1e 100644 --- a/ecc/bw6-761/fr/element.go +++ b/ecc/bw6-761/fr/element.go @@ -983,17 +983,9 @@ func (z *Element) Sqrt(x *Element) *Element { // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *Element) Inverse(x *Element) *Element { - inverse(z, x) - return z -} - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *Element) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -1156,11 +1148,11 @@ func _inverseGeneric(z, x *Element) { } if (u[0] == 1) && (u[5]|u[4]|u[3]|u[2]|u[1]) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && (v[5]|v[4]|v[3]|v[2]|v[1]) == 0 { z.Set(&s) - return + return z } } diff --git a/ecc/bw6-761/fr/element_ops_amd64.go b/ecc/bw6-761/fr/element_ops_amd64.go index 9ebabc26a..78022b3e6 100644 --- a/ecc/bw6-761/fr/element_ops_amd64.go +++ b/ecc/bw6-761/fr/element_ops_amd64.go @@ -48,6 +48,3 @@ func reduce(res *Element) //go:noescape func Butterfly(a, b *Element) - -//go:noescape -func inverse(res, x *Element) diff --git a/ecc/bw6-761/fr/element_ops_amd64.s b/ecc/bw6-761/fr/element_ops_amd64.s index 857431795..596b552dd 100644 --- a/ecc/bw6-761/fr/element_ops_amd64.s +++ b/ecc/bw6-761/fr/element_ops_amd64.s @@ -450,356 +450,3 @@ TEXT ·Butterfly(SB), $48-16 MOVQ R8, 32(AX) MOVQ R9, 40(AX) RET - -// inverse(res, x *Element) -TEXT ·inverse(SB), $136-16 - // u = q - // u[0] -> R9 - // u[1] -> R10 - // u[2] -> R11 - // u[3] -> R12 - // u[4] -> R13 - // u[5] -> R14 - MOVQ q<>+0(SB), R9 - MOVQ q<>+8(SB), R10 - MOVQ q<>+16(SB), R11 - MOVQ q<>+24(SB), R12 - MOVQ q<>+32(SB), R13 - MOVQ q<>+40(SB), R14 - - // s = r^2 - // s[0] -> s11-96(SP) - // s[1] -> s12-104(SP) - // s[2] -> s13-112(SP) - // s[3] -> s14-120(SP) - // s[4] -> s15-128(SP) - // s[5] -> s16-136(SP) - MOVQ $0xb786686c9400cd22, R8 - MOVQ R8, s11-96(SP) - 
MOVQ $0x0329fcaab00431b1, R8 - MOVQ R8, s12-104(SP) - MOVQ $0x22a5f11162d6b46d, R8 - MOVQ R8, s13-112(SP) - MOVQ $0xbfdf7d03827dc3ac, R8 - MOVQ R8, s14-120(SP) - MOVQ $0x837e92f041790bf9, R8 - MOVQ R8, s15-128(SP) - MOVQ $0x006dfccb1e914b88, R8 - MOVQ R8, s16-136(SP) - - // v = x - // v[0] -> R15 - // v[1] -> s0-8(SP) - // v[2] -> s1-16(SP) - // v[3] -> s2-24(SP) - // v[4] -> s3-32(SP) - // v[5] -> s4-40(SP) - MOVQ x+8(FP), R8 - MOVQ 0(R8), AX - MOVQ 8(R8), DX - MOVQ 16(R8), CX - MOVQ 24(R8), BX - MOVQ 32(R8), SI - MOVQ 40(R8), DI - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - MOVQ SI, s3-32(SP) - MOVQ DI, s4-40(SP) - - // if x is 0, returns 0 - MOVQ AX, R8 - ORQ DX, R8 - ORQ CX, R8 - ORQ BX, R8 - ORQ SI, R8 - ORQ DI, R8 - JEQ l7 - - // r = 0 - // r[0] -> s5-48(SP) - // r[1] -> s6-56(SP) - // r[2] -> s7-64(SP) - // r[3] -> s8-72(SP) - // r[4] -> s9-80(SP) - // r[5] -> s10-88(SP) - MOVQ $0, s5-48(SP) - MOVQ $0, s6-56(SP) - MOVQ $0, s7-64(SP) - MOVQ $0, s8-72(SP) - MOVQ $0, s9-80(SP) - MOVQ $0, s10-88(SP) - -l2: - BTQ $0, AX - JCS l8 - MOVQ $0, BP - XORQ R8, R8 - -l9: - INCQ BP - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - BTQ $0, AX - JCC l9 - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - MOVQ SI, s3-32(SP) - MOVQ DI, s4-40(SP) - MOVQ s11-96(SP), AX - MOVQ s12-104(SP), DX - MOVQ s13-112(SP), CX - MOVQ s14-120(SP), BX - MOVQ s15-128(SP), SI - MOVQ s16-136(SP), DI - -l10: - BTQ $0, AX - JCC l11 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l11: - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - DECQ BP - JNE l10 - MOVQ AX, s11-96(SP) - MOVQ DX, s12-104(SP) - MOVQ CX, s13-112(SP) - MOVQ BX, s14-120(SP) - MOVQ SI, s15-128(SP) - MOVQ DI, s16-136(SP) - -l8: - MOVQ R9, AX - MOVQ R10, DX - MOVQ R11, CX - MOVQ R12, BX - MOVQ R13, SI - MOVQ R14, DI - BTQ $0, AX - JCS l12 - MOVQ $0, BP - XORQ R8, R8 - -l13: - INCQ BP - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - BTQ $0, AX - JCC l13 - MOVQ AX, R9 - MOVQ DX, R10 - MOVQ CX, R11 - MOVQ BX, R12 - MOVQ SI, R13 - MOVQ DI, R14 - MOVQ s5-48(SP), AX - MOVQ s6-56(SP), DX - MOVQ s7-64(SP), CX - MOVQ s8-72(SP), BX - MOVQ s9-80(SP), SI - MOVQ s10-88(SP), DI - -l14: - BTQ $0, AX - JCC l15 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l15: - SHRQ $1, AX, R8 - SHRQ $1, DX, AX - SHRQ $1, CX, DX - SHRQ $1, BX, CX - SHRQ $1, SI, BX - SHRQ $1, DI, SI - SHRQ $1, DI - DECQ BP - JNE l14 - MOVQ AX, s5-48(SP) - MOVQ DX, s6-56(SP) - MOVQ CX, s7-64(SP) - MOVQ BX, s8-72(SP) - MOVQ SI, s9-80(SP) - MOVQ DI, s10-88(SP) - -l12: - // v = v - u - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ s3-32(SP), SI - MOVQ s4-40(SP), DI - SUBQ R9, AX - SBBQ R10, DX - SBBQ R11, CX - SBBQ R12, BX - SBBQ R13, SI - SBBQ R14, DI - JCC l3 - SUBQ R15, R9 - SBBQ s0-8(SP), R10 - SBBQ s1-16(SP), R11 - SBBQ s2-24(SP), R12 - SBBQ s3-32(SP), R13 - SBBQ s4-40(SP), R14 - MOVQ s5-48(SP), AX - MOVQ s6-56(SP), DX - MOVQ s7-64(SP), CX - MOVQ s8-72(SP), BX - MOVQ s9-80(SP), SI - MOVQ s10-88(SP), DI - SUBQ s11-96(SP), AX - SBBQ s12-104(SP), DX - SBBQ s13-112(SP), CX - SBBQ s14-120(SP), BX - SBBQ s15-128(SP), SI - SBBQ 
s16-136(SP), DI - JCC l16 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l16: - MOVQ AX, s5-48(SP) - MOVQ DX, s6-56(SP) - MOVQ CX, s7-64(SP) - MOVQ BX, s8-72(SP) - MOVQ SI, s9-80(SP) - MOVQ DI, s10-88(SP) - JMP l4 - -l3: - MOVQ AX, R15 - MOVQ DX, s0-8(SP) - MOVQ CX, s1-16(SP) - MOVQ BX, s2-24(SP) - MOVQ SI, s3-32(SP) - MOVQ DI, s4-40(SP) - MOVQ s11-96(SP), AX - MOVQ s12-104(SP), DX - MOVQ s13-112(SP), CX - MOVQ s14-120(SP), BX - MOVQ s15-128(SP), SI - MOVQ s16-136(SP), DI - SUBQ s5-48(SP), AX - SBBQ s6-56(SP), DX - SBBQ s7-64(SP), CX - SBBQ s8-72(SP), BX - SBBQ s9-80(SP), SI - SBBQ s10-88(SP), DI - JCC l17 - ADDQ q<>+0(SB), AX - ADCQ q<>+8(SB), DX - ADCQ q<>+16(SB), CX - ADCQ q<>+24(SB), BX - ADCQ q<>+32(SB), SI - ADCQ q<>+40(SB), DI - -l17: - MOVQ AX, s11-96(SP) - MOVQ DX, s12-104(SP) - MOVQ CX, s13-112(SP) - MOVQ BX, s14-120(SP) - MOVQ SI, s15-128(SP) - MOVQ DI, s16-136(SP) - -l4: - MOVQ R9, R8 - SUBQ $1, R8 - ORQ R10, R8 - ORQ R11, R8 - ORQ R12, R8 - ORQ R13, R8 - ORQ R14, R8 - JEQ l5 - MOVQ R15, AX - MOVQ s0-8(SP), DX - MOVQ s1-16(SP), CX - MOVQ s2-24(SP), BX - MOVQ s3-32(SP), SI - MOVQ s4-40(SP), DI - MOVQ AX, R8 - SUBQ $1, R8 - JNE l2 - ORQ DX, R8 - ORQ CX, R8 - ORQ BX, R8 - ORQ SI, R8 - ORQ DI, R8 - JEQ l6 - JMP l2 - -l5: - MOVQ res+0(FP), R8 - MOVQ s5-48(SP), AX - MOVQ s6-56(SP), DX - MOVQ s7-64(SP), CX - MOVQ s8-72(SP), BX - MOVQ s9-80(SP), SI - MOVQ s10-88(SP), DI - MOVQ AX, 0(R8) - MOVQ DX, 8(R8) - MOVQ CX, 16(R8) - MOVQ BX, 24(R8) - MOVQ SI, 32(R8) - MOVQ DI, 40(R8) - RET - -l6: - MOVQ res+0(FP), R8 - MOVQ s11-96(SP), AX - MOVQ s12-104(SP), DX - MOVQ s13-112(SP), CX - MOVQ s14-120(SP), BX - MOVQ s15-128(SP), SI - MOVQ s16-136(SP), DI - MOVQ AX, 0(R8) - MOVQ DX, 8(R8) - MOVQ CX, 16(R8) - MOVQ BX, 24(R8) - MOVQ SI, 32(R8) - MOVQ DI, 40(R8) - RET - -l7: - MOVQ res+0(FP), R8 - MOVQ $0, 0(R8) - MOVQ $0, 8(R8) - MOVQ $0, 16(R8) - MOVQ $0, 24(R8) - MOVQ $0, 32(R8) - MOVQ $0, 40(R8) - RET diff --git a/ecc/bw6-761/fr/element_ops_noasm.go b/ecc/bw6-761/fr/element_ops_noasm.go index 006365daa..ec1fac18d 100644 --- a/ecc/bw6-761/fr/element_ops_noasm.go +++ b/ecc/bw6-761/fr/element_ops_noasm.go @@ -51,10 +51,6 @@ func mul(z, x, y *Element) { _mulGeneric(z, x, y) } -func inverse(z, x *Element) { - _inverseGeneric(z, x) -} - // FromMont converts z in place (i.e. mutates) from Montgomery to regular representation // sets and returns z = z * 1 func fromMont(z *Element) { diff --git a/field/asm/amd64/build.go b/field/asm/amd64/build.go index bf54b3e64..86ed46a60 100644 --- a/field/asm/amd64/build.go +++ b/field/asm/amd64/build.go @@ -170,9 +170,6 @@ func Generate(w io.Writer, F *field.Field) error { // fft butterflies f.generateButterfly() - // inverse asm - f.generateInverse() - return nil } diff --git a/field/asm/amd64/element_inverse.go b/field/asm/amd64/element_inverse.go deleted file mode 100644 index fade06285..000000000 --- a/field/asm/amd64/element_inverse.go +++ /dev/null @@ -1,246 +0,0 @@ -package amd64 - -import "github.com/consensys/bavard/amd64" - -func (f *FFAmd64) generateInverse() { - f.Comment("inverse(res, x *Element)") - - // we need r, s, u, v registers, + one set for subs or reductions - stackSize := f.StackSize(f.NbWords*5+1, 0, 0) - registers := f.FnHeader("inverse", stackSize, 16) - defer f.AssertCleanStack(stackSize, 0) - - t := f.PopN(®isters) - zero := f.Pop(®isters) - loopCounter := amd64.BP - - // order is important here; for NbWords <= 6, u is going to fit into registers. 
- u := f.PopN(®isters) - v := f.PopN(®isters) - r := f.PopN(®isters) - s := f.PopN(®isters) - - uOnStack := f.NbWords > 6 - - // labels - startLoop := f.NewLabel() - vBigger := f.NewLabel() - endLoop := f.NewLabel() - returnR := f.NewLabel() - returnS := f.NewLabel() - returnZero := f.NewLabel() - - // u = q - f.Comment("u = q") - f.LabelRegisters("u", u...) - for i := 0; i < f.NbWords; i++ { - if !uOnStack { - // u is on registers - f.MOVQ(f.qAt(i), u[i]) - } else { - f.MOVQ(f.qAt(i), zero) - f.MOVQ(zero, u[i]) - } - } - - // s = r^2 - f.Comment("s = r^2") - f.LabelRegisters("s", s...) - for i := 0; i < f.NbWords; i++ { - f.MOVQ(f.RSquare[i], zero) - f.MOVQ(zero, s[i]) - } - - // v = x - f.Comment("v = x") - f.LabelRegisters("v", v...) - f.MOVQ("x+8(FP)", zero) - f.Mov(zero, t) - f.Mov(t, v) - - f.Comment("if x is 0, returns 0") - f.MOVQ(t[0], zero) - for i := 1; i < len(t); i++ { - f.ORQ(t[i], zero) - } - f.JEQ(returnZero) - - //r = 0 - f.Comment("r = 0") - f.LabelRegisters("r", r...) - for i := 0; i < len(r); i++ { - f.MOVQ(0, r[i]) - } - - // rshOne set a and b such that - // for a[0]&1 == 0 { - // a <<= 1 - // if b[0] & 1 == 1 { - // b += q - // } - // b <<= 1 - // } - // t must be a set of registers. - rshOne := func(a, b, t []amd64.Register) { - end := f.NewLabel() - firstLoop := f.NewLabel() - secondLoop := f.NewLabel() - - // this is done before by the caller - // f.Mov(a, t) - f.BTQ(0, t[0]) - f.JCS(end) - - f.MOVQ(0, loopCounter) - f.XORQ(zero, zero) - f.LABEL(firstLoop) - f.INCQ(loopCounter) - - f.SHRQw(1, t[0], zero) - for i := 1; i < len(t); i++ { - f.SHRQw(1, t[i], t[i-1]) - } - f.SHRQ(1, t[len(t)-1]) - - f.BTQ(0, t[0]) - f.JCC(firstLoop) - - // we need to save the result of the first loop - f.Mov(t, a) - f.Mov(b, t) - // we need to shift r (t) loopCOunter times - f.LABEL(secondLoop) - - f.BTQ(0, t[0]) // if r[0] is odd, we add modulus - f.reduceIfBorrow(t) - f.SHRQw(1, t[0], zero) - for i := 1; i < len(t); i++ { - f.SHRQw(1, t[i], t[i-1]) - } - f.SHRQ(1, t[len(t)-1]) - - f.DECQ(loopCounter) - f.JNE(secondLoop) - - // save result of second loop - f.Mov(t, b) - - f.LABEL(end) - } - - f.LABEL(startLoop) - - // note: t always contains v here - rshOne(v, s, t) - - f.Mov(u, t) - rshOne(u, r, t) - - // f.Push(®isters, loopCounter) - - // v = v - u - f.Comment("v = v - u") - f.Mov(v, t) - - f.Sub(u, t) - f.JCC(vBigger) - - // here v is smaller - // u = u - v - if !uOnStack { - f.Sub(v, u) - } else { - f.Mov(u, t) - f.Sub(v, t) - f.Mov(t, u) - } - - // r = r - s - f.Mov(r, t) - f.Sub(s, t) - f.reduceIfBorrow(t) - f.Mov(t, r) - f.JMP(endLoop) - - // here v is bigger - f.LABEL(vBigger) - // v = v - u - f.Mov(t, v) - // s = s - r - f.Mov(s, t) - f.Sub(r, t) - f.reduceIfBorrow(t) - f.Mov(t, s) - f.LABEL(endLoop) - - // if (u[0] == 1) && (u[5]|u[4]|u[3]|u[2]|u[1]) == 0 { - // return z.Set(&r) - // } - // if (v[0] == 1) && (v[5]|v[4]|v[3]|v[2]|v[1]) == 0 { - // return z.Set(&s) - // } - if !uOnStack { - f.MOVQ(u[0], zero) - f.SUBQ(1, zero) - for i := 1; i < f.NbWords; i++ { - f.ORQ(u[i], zero) - } - } else { - f.Mov(u, t) - f.SUBQ(1, t[0]) - last := len(t) - 1 - for i := 0; i < f.NbWords-1; i++ { - f.ORQ(t[i], t[last]) - } - } - - f.JEQ(returnR) - - f.Mov(v, t) - f.MOVQ(t[0], zero) - f.SUBQ(1, zero) - f.JNE(startLoop) - for i := 1; i < f.NbWords; i++ { - f.ORQ(t[i], zero) - } - f.JEQ(returnS) - - f.JMP(startLoop) - - f.LABEL(returnR) - f.MOVQ("res+0(FP)", zero) - f.Mov(r, t) - f.Mov(t, zero) - f.RET() - - f.LABEL(returnS) - f.MOVQ("res+0(FP)", zero) - f.Mov(s, t) - f.Mov(t, zero) - 
f.RET() - - f.LABEL(returnZero) - f.MOVQ("res+0(FP)", zero) - for i := 0; i < len(t); i++ { - f.MOVQ(0, zero.At(i)) - } - f.RET() - - // f.Push(®isters, flagBorrow) - f.Push(®isters, u...) - f.Push(®isters, r...) - f.Push(®isters, v...) - f.Push(®isters, s...) - f.Push(®isters, t...) - f.Push(®isters, zero) -} - -func (f *FFAmd64) reduceIfBorrow(t []amd64.Register) { - noReduce := f.NewLabel() - f.JCC(noReduce) - f.ADDQ(f.qAt(0), t[0]) - for i := 1; i < f.NbWords; i++ { - f.ADCQ(f.qAt(i), t[i]) - } - f.LABEL(noReduce) -} diff --git a/field/internal/templates/element/inverse.go b/field/internal/templates/element/inverse.go index 8ec60adc6..6341d5454 100644 --- a/field/internal/templates/element/inverse.go +++ b/field/internal/templates/element/inverse.go @@ -8,36 +8,23 @@ const Inverse = ` // Inverse z = x^-1 mod q // note: allocates a big.Int (math/big) func (z *{{.ElementName}}) Inverse( x *{{.ElementName}}) *{{.ElementName}} { - inverse(z, x) - return z -} - -func _inverseGeneric(z, x *{{.ElementName}}) { var _xNonMont big.Int x.ToBigIntRegular( &_xNonMont) _xNonMont.ModInverse(&_xNonMont, Modulus()) z.SetBigInt(&_xNonMont) + return z } + {{ else }} // Inverse z = x^-1 mod q // Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" // if x == 0, sets and returns z = x func (z *{{.ElementName}}) Inverse(x *{{.ElementName}}) *{{.ElementName}} { - inverse(z, x) - return z -} - - - -// _inverseGeneric z = x^-1 mod q -// Algorithm 16 in "Efficient Software-Implementation of Finite Fields with Applications to Cryptography" -// if x == 0, sets and returns z = x -func _inverseGeneric(z, x *{{.ElementName}}) { if x.IsZero() { z.SetZero() - return + return z } // initialize u = q @@ -91,11 +78,11 @@ func _inverseGeneric(z, x *{{.ElementName}}) { } if (u[0] == 1) && ({{- range $i := reverse .NbWordsIndexesNoZero}}u[{{$i}}] {{if eq $i 1}}{{else}} | {{end}}{{end}} ) == 0 { z.Set(&r) - return + return z } if (v[0] == 1) && ({{- range $i := reverse .NbWordsIndexesNoZero}}v[{{$i}}] {{if eq $i 1}}{{else}} | {{end}}{{end}} ) == 0 { z.Set(&s) - return + return z } } diff --git a/field/internal/templates/element/ops.go b/field/internal/templates/element/ops.go index e68b09574..7bb3744fb 100644 --- a/field/internal/templates/element/ops.go +++ b/field/internal/templates/element/ops.go @@ -38,9 +38,6 @@ func reduce(res *{{.ElementName}}) //go:noescape func Butterfly(a, b *{{.ElementName}}) -//go:noescape -func inverse(res, x *{{.ElementName}}) - {{end}} diff --git a/field/internal/templates/element/ops_generic.go b/field/internal/templates/element/ops_generic.go index ccbaa87e9..73f19ac29 100644 --- a/field/internal/templates/element/ops_generic.go +++ b/field/internal/templates/element/ops_generic.go @@ -34,9 +34,6 @@ func mul(z, x, y *{{.ElementName}}) { _mulGeneric(z, x, y) } -func inverse(z, x *{{.ElementName}}) { - _inverseGeneric(z, x) -} // FromMont converts z in place (i.e. 
mutates) from Montgomery to regular representation // sets and returns z = z * 1 From 47fc8ad97708516791c63a0908f4bfb15cf1cf36 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 21 Sep 2021 11:16:51 -0500 Subject: [PATCH 10/11] build: replace go get by go install in CI workflow --- .github/workflows/develop.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/develop.yml b/.github/workflows/develop.yml index 3b5965b59..0e9d3cf4a 100644 --- a/.github/workflows/develop.yml +++ b/.github/workflows/develop.yml @@ -31,18 +31,18 @@ jobs: restore-keys: | ${{ runner.os }}-go- - name: install deps - run: go get golang.org/x/tools/cmd/goimports && go get github.com/klauspost/asmfmt/cmd/asmfmt + run: go install golang.org/x/tools/cmd/goimports@latest && go install github.com/klauspost/asmfmt/cmd/asmfmt@latest - name: gofmt run: if [[ -n $(gofmt -l .) ]]; then echo "please run gofmt"; exit 1; fi - name: go vet run: go vet ./... - name: staticcheck run: | - go get -u honnef.co/go/tools/cmd/staticcheck + go install honnef.co/go/tools/cmd/staticcheck@latest staticcheck ./... - name: gosec run: | - go get -u github.com/securego/gosec/cmd/gosec + go install github.com/securego/gosec/cmd/gosec@latest gosec -exclude G204 ./... - name: generated files should not be modified run: | @@ -77,7 +77,7 @@ jobs: restore-keys: | ${{ runner.os }}-go- - name: install deps - run: go get golang.org/x/tools/cmd/goimports && go get github.com/klauspost/asmfmt/cmd/asmfmt + run: go install golang.org/x/tools/cmd/goimports@latest && go install github.com/klauspost/asmfmt/cmd/asmfmt@latest - name: Test run: | go test -v -short ./... From 6c20886c068d6975a709bf518c9ceb9979d4c235 Mon Sep 17 00:00:00 2001 From: Youssef El Housni Date: Tue, 21 Sep 2021 18:19:51 +0200 Subject: [PATCH 11/11] build: remove unused code (nSquare Fp24) --- ecc/bls24-315/internal/fptower/e24_pairing.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ecc/bls24-315/internal/fptower/e24_pairing.go b/ecc/bls24-315/internal/fptower/e24_pairing.go index 98622a5e5..3dc1204d3 100644 --- a/ecc/bls24-315/internal/fptower/e24_pairing.go +++ b/ecc/bls24-315/internal/fptower/e24_pairing.go @@ -1,11 +1,5 @@ package fptower -func (z *E24) nSquare(n int) { - for i := 0; i < n; i++ { - z.CyclotomicSquare(z) - } -} - func (z *E24) nSquareCompressed(n int) { for i := 0; i < n; i++ { z.CyclotomicSquareCompressed(z)
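// For context on why the plain nSquare loop above is now unused -- a
// minimal sketch, not the library's actual final-exponentiation code.
// Assumptions: x lies in the cyclotomic subgroup (a precondition for
// cyclotomic squaring), E24 carries a Decompress mirroring the E12 one
// introduced earlier in this series, and the name pow2nSketch is purely
// illustrative.
func (z *E24) pow2nSketch(x *E24, n int) *E24 {
	z.Set(x)
	// every interior squaring stays in compressed (Karabina) form,
	// touching only four of the six coordinates
	z.nSquareCompressed(n)
	// one decompression at the end of the chain restores the full
	// element (this is where the single costly inversion happens),
	// matching the aliased d.Decompress(&d) pattern used in the tests
	z.Decompress(z)
	return z
}
// Because callers square via nSquareCompressed and decompress once at the
// end of the chain, nothing references nSquare any more, hence its removal.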