// scalar_p.sx


.section .rodata

// Constants of the secp256k1 field prime p = 2**256 - c.
//   first quad : least significant 64-bit limb of p (0xfffffffefffffc2f)
//   second quad: c = 2**256 - p = 2**32 + 977       (0x1000003d1)
// The three upper limbs of p are all ones and are not stored here.
// NOTE(review): presumably read by INIT_PRIME_P (defined outside this file) - confirm.
        .p2align 3      // keep both quads naturally aligned for 64-bit loads
.global prime_p_and_c
prime_p_and_c: .quad 0xfffffffefffffc2f,  0x1000003d1


.text

.include "aliases"
.include "commonmacros"
.include "u256macros"


/**
 Internal wrapper function around the macro IS_A_ZERO.
 Tests whether the scalar a in sa0...sa3 is zero.
 Result:
 io0: 0 if a is zero, otherwise 1
 (IS_A_ZERO is defined outside this file.)
*/
BEGIN_GLOBAL_FUNCTION a_is_zero
      IS_A_ZERO
END_GLOBAL_FUNCTION a_is_zero


/**
 Public C function secp256k1_is_zero
 Tests whether a scalar in memory is zero.
 Input:
 io0: address of the scalar (loaded into sa0...sa3 by LOAD_A)
 Result:
 io0: 0 if the scalar is zero, otherwise 1
*/
BEGIN_C_FUNCTION secp256k1_is_zero
     LOAD_A
     IS_A_ZERO
END_C_FUNCTION secp256k1_is_zero

/**
 Internal wrapper function around the macro IS_R_LO_ZERO.
 Tests whether the low half of r (sr0...sr3) is zero.
 Result:
 io0: 0 if sr0...sr3 is zero, otherwise 1
 (IS_R_LO_ZERO is defined outside this file.)
*/
BEGIN_GLOBAL_FUNCTION r_is_zero
      IS_R_LO_ZERO
END_GLOBAL_FUNCTION r_is_zero


/**
The macro REDUCE_512_MOD_P
reduces the 512-bit value r in sr0...sr7 (sr0 least significant)
to a 256-bit value mod p, left in sr0...sr3.

p = 2**256 - c, with c = ppc = 0x1000003d1
==> 2**256 = p + c = c (mod p)
So every limb above bit 255 can be removed by multiplying it with c and
adding the product 256 bits (four limbs) further down.

Expects: pp0, pp1 and ppc to be initialised.
Clobbers: tmp4, tmp5 and the flags; sr4...sr7 are zero afterwards.
*/

/*
Helper FOLD_LIMB_MOD_P
Adds limb_hi * c to the value starting at acc0 (acc0 is four limbs below
limb_hi), lets the carry ripple through acc1...acc3 and stores the carry
out of acc3 back into limb_hi. This repeats until limb_hi is zero.
*/
.macro FOLD_LIMB_MOD_P limb_hi, acc0, acc1, acc2, acc3
    .Lfold_limb_\@:
        mul     tmp4, ppc, \limb_hi             // low  64 bits of c * limb
        umulh   tmp5, ppc, \limb_hi             // high 64 bits of c * limb
        adds    \acc0, \acc0, tmp4
        adcs    \acc1, \acc1, tmp5
        adcs    \acc2, \acc2, xzr
        adcs    \acc3, \acc3, xzr
        adc     \limb_hi, xzr, xzr              // limb now holds only the new carry
        cbnz    \limb_hi, .Lfold_limb_\@        // a carry must be folded again
.endm

.macro REDUCE_512_MOD_P
        // fold from the most significant limb downwards
        FOLD_LIMB_MOD_P sr7, sr3, sr4, sr5, sr6
        FOLD_LIMB_MOD_P sr6, sr2, sr3, sr4, sr5
        FOLD_LIMB_MOD_P sr5, sr1, sr2, sr3, sr4
        FOLD_LIMB_MOD_P sr4, sr0, sr1, sr2, sr3

        // sr0...sr3 is below 2**256 now, one conditional subtraction finishes
        REDUCE_256_MOD_P
.endm

/*
The macro REDUCE_256_MOD_P
subtracts p once from a 256-bit value x if x >= p and writes the result
back to x; a value below p is left untouched.

Parameters:
reg0...reg3: the registers holding x, reg0 least significant
             (default sr0...sr3)
Expects:
pp0: least significant limb of p
pp1: the value of each of the three upper limbs of p
Clobbers the flags.
*/
.macro REDUCE_256_MOD_P  reg0=sr0 reg1=sr1 reg2=sr2 reg3=sr3
        // compare x with p limb by limb, most significant limb first
        cmp     \reg3, pp1
        b.lo    .Lx_below_p_\@
        b.hi    .Lx_subtract_p_\@
        cmp     \reg2, pp1
        b.lo    .Lx_below_p_\@
        b.hi    .Lx_subtract_p_\@
        cmp     \reg1, pp1
        b.lo    .Lx_below_p_\@
        b.hi    .Lx_subtract_p_\@
        cmp     \reg0, pp0
        b.lo    .Lx_below_p_\@
        // falling through: upper limbs equal and lowest limb >= pp0, so x >= p
    .Lx_subtract_p_\@:
        subs    \reg0, \reg0, pp0
        sbcs    \reg1, \reg1, pp1
        sbcs    \reg2, \reg2, pp1
        sbc     \reg3, \reg3, pp1
    .Lx_below_p_\@:                             // x < p, nothing to do
.endm


/*
Macro ADD_A_B_MOD_P
Modular addition modulo p
r = (a + b) mod p
a in sa0...sa3
b in sb0...sb3
r in sr0...sr3
The prime is expected in pp0 and pp1.
Clobbers sr4 (zero afterwards) and the flags.
*/
.macro ADD_A_B_MOD_P
        adds    sr0, sa0, sb0
        adcs    sr1, sa1, sb1
        adcs    sr2, sa2, sb2
        adcs    sr3, sa3, sb3
        adc     sr4, xzr, xzr                   // sr4 = carry out of bit 255
        cbz     sr4, .Ladd_final_reduce_\@
    .Ladd_drop_carry_\@:
        // The sum is 2**256 + sr. Subtracting p with a borrow uses up the
        // pending 2**256; without a borrow it is still pending.
        subs    sr0, sr0, pp0
        sbcs    sr1, sr1, pp1
        sbcs    sr2, sr2, pp1
        sbcs    sr3, sr3, pp1
        adc     sr4, xzr, xzr                   // carry flag set = no borrow
        cbnz    sr4, .Ladd_drop_carry_\@        // only possible for inputs >= p
    .Ladd_final_reduce_\@:                      // subtract p once more if sum >= p
        REDUCE_256_MOD_P
.endm

/**
 Internal wrapper function around the macro ADD_A_B_MOD_P.
 r = (a + b) mod p with a in sa0...sa3, b in sb0...sb3, r in sr0...sr3.
 The prime is expected in pp0 and pp1.
*/
BEGIN_GLOBAL_FUNCTION add_a_b_mod_p
      ADD_A_B_MOD_P
END_GLOBAL_FUNCTION add_a_b_mod_p

/*
 The public C compatible function secp256k1_add_mod_p adds two scalars
 modulo p.
 r = (a + b) mod p
 Input:
 io0: address of scalar a
 io1: address of scalar b
 io2: address the result r is stored to
 NOTE(review): INIT_PRIME_P is defined outside this file; presumably it
 loads the prime into pp0/pp1 (and c into ppc) - confirm.
*/
BEGIN_C_FUNCTION secp256k1_add_mod_p  
        INIT_PRIME_P   
        LOAD_A_B
        ADD_A_B_MOD_P
        STORE_R_LO io2
END_C_FUNCTION secp256k1_add_mod_p   

/*
Macro ADD_A_B_MOD_P_ASIGN_A
Modular addition modulo p with the result assigned to a
r = (a + b) mod p
a = r
a in sa0...sa3 (overwritten with r)
b in sb0...sb3
r in sr0...sr3
The prime is expected in pp0 and pp1
*/
.macro ADD_A_B_MOD_P_ASIGN_A
      ADD_A_B_MOD_P
      COPY_R_LO_TO_A
.endm

/**
 Internal wrapper function around the macro ADD_A_B_MOD_P_ASIGN_A.
 a = r = (a + b) mod p
*/
BEGIN_GLOBAL_FUNCTION add_a_b_mod_p_asign_a
      ADD_A_B_MOD_P_ASIGN_A
END_GLOBAL_FUNCTION add_a_b_mod_p_asign_a


 /*
Macro ADD_A_B_MOD_P_ASIGN_B
Modular addition modulo p with the result assigned to b
r = (a + b) mod p
b = r
a in sa0...sa3
b in sb0...sb3 (overwritten with r)
r in sr0...sr3
The prime is expected in pp0 and pp1
*/
.macro ADD_A_B_MOD_P_ASIGN_B
      ADD_A_B_MOD_P
      COPY_R_LO_TO_B
.endm

/**
 Internal wrapper function around the macro ADD_A_B_MOD_P_ASIGN_B.
 b = r = (a + b) mod p
*/
BEGIN_GLOBAL_FUNCTION add_a_b_mod_p_asign_b
      ADD_A_B_MOD_P_ASIGN_B
END_GLOBAL_FUNCTION add_a_b_mod_p_asign_b


/*
Macro DOUBLE_A_MOD_P
Modular doubling modulo p
r = (a + a) mod p
a in sa0...sa3
r in sr0...sr3
The prime is expected in pp0 and pp1.
Clobbers sr4 (zero afterwards) and the flags.
*/
.macro DOUBLE_A_MOD_P
        adds    sr0, sa0, sa0
        adcs    sr1, sa1, sa1
        adcs    sr2, sa2, sa2
        adcs    sr3, sa3, sa3
        adc     sr4, xzr, xzr                   // sr4 = carry out of bit 255
        cbz     sr4, .Ldbl_final_reduce_\@
    .Ldbl_drop_carry_\@:
        // The sum is 2**256 + sr. Subtracting p with a borrow uses up the
        // pending 2**256; without a borrow it is still pending.
        subs    sr0, sr0, pp0
        sbcs    sr1, sr1, pp1
        sbcs    sr2, sr2, pp1
        sbcs    sr3, sr3, pp1
        adc     sr4, xzr, xzr                   // carry flag set = no borrow
        cbnz    sr4, .Ldbl_drop_carry_\@        // only possible for a >= p
    .Ldbl_final_reduce_\@:                      // subtract p once more if sum >= p
        REDUCE_256_MOD_P
.endm

/**
 Internal wrapper function around the macro DOUBLE_A_MOD_P.
 r = (a + a) mod p with a in sa0...sa3 and r in sr0...sr3.
*/
BEGIN_GLOBAL_FUNCTION double_a_mod_p
		DOUBLE_A_MOD_P
END_GLOBAL_FUNCTION double_a_mod_p

/**
 Public C function secp256k1_double_mod_p
 r = (a + a) mod p
 Input:
 io0: address of scalar a
 io1: address the result r is stored to
*/
BEGIN_C_FUNCTION secp256k1_double_mod_p
        INIT_PRIME_P   
        LOAD_A
        DOUBLE_A_MOD_P
        STORE_R_LO io1
END_C_FUNCTION secp256k1_double_mod_p

/**
 Public C function secp256k1_double_mod_p_asign
 In-place variant: a = (a + a) mod p
 Input:
 io0: address of scalar a; the result is stored to the same address
*/
BEGIN_C_FUNCTION secp256k1_double_mod_p_asign
        INIT_PRIME_P   
        LOAD_A
        DOUBLE_A_MOD_P
        STORE_R_LO io0
END_C_FUNCTION secp256k1_double_mod_p_asign


/*
Macro DOUBLE_A_MOD_P_ASIGN_A
Modular doubling modulo p with the result assigned to a
r = (a + a) mod p
a = r
a in sa0...sa3 (overwritten with r)
r in sr0...sr3
The prime is expected in pp0 and pp1
*/
.macro DOUBLE_A_MOD_P_ASIGN_A
		DOUBLE_A_MOD_P
		COPY_R_LO_TO_A
.endm

/**
 Internal wrapper function around the macro DOUBLE_A_MOD_P_ASIGN_A.
 a = r = (a + a) mod p
*/
BEGIN_GLOBAL_FUNCTION double_a_mod_p_asign_a
		DOUBLE_A_MOD_P_ASIGN_A
END_GLOBAL_FUNCTION double_a_mod_p_asign_a


/*
Macro DOUBLE_A_MOD_P_ASIGN_B
Modular doubling modulo p with the result assigned to b
r = (a + a) mod p
b = r
a in sa0...sa3
b in sb0...sb3 (overwritten with r)
r in sr0...sr3
The prime is expected in pp0 and pp1
*/
.macro DOUBLE_A_MOD_P_ASIGN_B
		DOUBLE_A_MOD_P
		COPY_R_LO_TO_B
.endm

/**
 Internal wrapper function around the macro DOUBLE_A_MOD_P_ASIGN_B.
 b = r = (a + a) mod p
*/
BEGIN_GLOBAL_FUNCTION double_a_mod_p_asign_b
		DOUBLE_A_MOD_P_ASIGN_B
END_GLOBAL_FUNCTION double_a_mod_p_asign_b


/*
Macro TRIPLE_A_MOD_P
Modular tripling modulo p
r = (a + a + a) mod p
a in sa0...sa3
r in sr0...sr3
The prime is expected in pp0 and pp1, the constant c = 2**256 - p in ppc.
Clobbers sr4 (zero afterwards), sr5 and the flags.
*/
.macro TRIPLE_A_MOD_P
        // r = a + a, sr4 collects the carry
        adds    sr0, sa0, sa0
        adcs    sr1, sa1, sa1
        adcs    sr2, sa2, sa2
        adcs    sr3, sa3, sa3
        adc     sr4, xzr, xzr
        // r = r + a, sr4 now counts 0, 1 or 2 carries
        adds    sr0, sr0, sa0
        adcs    sr1, sr1, sa1
        adcs    sr2, sr2, sa2
        adcs    sr3, sr3, sa3
        adc     sr4, sr4, xzr
        cbz     sr4, .Ltpl_final_reduce_\@
    .Ltpl_fold_carry_\@:
        // every carry stands for 2**256, which equals c modulo p
        mul     sr5, sr4, ppc
        adds    sr0, sr0, sr5
        adcs    sr1, sr1, xzr
        adcs    sr2, sr2, xzr
        adcs    sr3, sr3, xzr
        adc     sr4, xzr, xzr                   // did the fold overflow again?
        cbnz    sr4, .Ltpl_fold_carry_\@
    .Ltpl_final_reduce_\@:                      // subtract p once if sum >= p
        REDUCE_256_MOD_P
.endm

/**
 Internal wrapper function around the macro TRIPLE_A_MOD_P.
 r = (a + a + a) mod p with a in sa0...sa3 and r in sr0...sr3.
*/
BEGIN_GLOBAL_FUNCTION triple_a_mod_p
		TRIPLE_A_MOD_P
END_GLOBAL_FUNCTION triple_a_mod_p

/**
 Public C function secp256k1_triple_mod_p
 r = (a + a + a) mod p
 Input:
 io0: address of scalar a
 io1: address the result r is stored to
*/
BEGIN_C_FUNCTION secp256k1_triple_mod_p
        INIT_PRIME_P   
        LOAD_A
        TRIPLE_A_MOD_P
        STORE_R_LO io1 
END_C_FUNCTION secp256k1_triple_mod_p

/**
 Public C function secp256k1_triple_mod_p_asign
 In-place variant: a = (a + a + a) mod p
 Input:
 io0: address of scalar a; the result is stored to the same address
*/
BEGIN_C_FUNCTION secp256k1_triple_mod_p_asign
        INIT_PRIME_P   
        LOAD_A
        TRIPLE_A_MOD_P
        STORE_R_LO io0 
END_C_FUNCTION secp256k1_triple_mod_p_asign

/*
Macro TRIPLE_A_MOD_P_ASIGN_A
Modular tripling modulo p with the result assigned to a
r = (a + a + a) mod p
a = r
a in sa0...sa3 (overwritten with r)
r in sr0...sr3
The prime is expected in pp0 and pp1, the constant c in ppc
*/ 
.macro TRIPLE_A_MOD_P_ASIGN_A
		TRIPLE_A_MOD_P
		COPY_R_LO_TO_A
.endm

/**
 Internal wrapper function around the macro TRIPLE_A_MOD_P_ASIGN_A.
 a = r = (a + a + a) mod p
*/
BEGIN_GLOBAL_FUNCTION triple_a_mod_p_asign_a
		TRIPLE_A_MOD_P_ASIGN_A
END_GLOBAL_FUNCTION triple_a_mod_p_asign_a


/*
Macro TRIPLE_A_MOD_P_ASIGN_B
Modular tripling modulo p with the result assigned to b
r = (a + a + a) mod p
b = r
a in sa0...sa3
b in sb0...sb3 (overwritten with r)
r in sr0...sr3
The prime is expected in pp0 and pp1, the constant c in ppc
*/
.macro TRIPLE_A_MOD_P_ASIGN_B
		TRIPLE_A_MOD_P
		COPY_R_LO_TO_B
.endm

/**
 Internal wrapper function around the macro TRIPLE_A_MOD_P_ASIGN_B.
 b = r = (a + a + a) mod p
*/
BEGIN_GLOBAL_FUNCTION triple_a_mod_p_asign_b
		TRIPLE_A_MOD_P_ASIGN_B
END_GLOBAL_FUNCTION triple_a_mod_p_asign_b


//**********************subtraction modulo p********************************


/*
Macro SUB_A_B_MOD_P
Modular subtraction modulo p
r = (a - b) mod p
a in sa0...sa3
b in sb0...sb3
r in sr0...sr3
The prime is expected in pp0 and pp1.
Clobbers sr4 and the flags.
*/
.macro SUB_A_B_MOD_P
        subs    sr0, sa0, sb0
        sbcs    sr1, sa1, sb1
        sbcs    sr2, sa2, sb2
        sbcs    sr3, sa3, sb3
        sbc     sr4, xzr, xzr                   // 0 = no borrow, all ones = borrow
        cbz     sr4, .Lsub_no_borrow_\@
        // a < b: the difference wrapped around, add p and drop the carry
        adds    sr0, sr0, pp0
        adcs    sr1, sr1, pp1
        adcs    sr2, sr2, pp1
        adc     sr3, sr3, pp1
        b       .Lsub_done_\@
    .Lsub_no_borrow_\@:
        // a >= b: the difference can still be >= p if a was not reduced
        REDUCE_256_MOD_P
    .Lsub_done_\@:
.endm

/**
 Internal wrapper function around the macro SUB_A_B_MOD_P.
 r = (a - b) mod p with a in sa0...sa3, b in sb0...sb3, r in sr0...sr3.
*/
BEGIN_GLOBAL_FUNCTION sub_a_b_mod_p
		SUB_A_B_MOD_P
END_GLOBAL_FUNCTION sub_a_b_mod_p

/**
 Internal function sub_a_b_mod_p_asign_a
 r = (a - b) mod p, then a = r
 a in sa0...sa3 (overwritten), b in sb0...sb3, r in sr0...sr3
*/
BEGIN_GLOBAL_FUNCTION sub_a_b_mod_p_asign_a
        SUB_A_B_MOD_P
        // copy the result into a
        mov     sa0, sr0
        mov     sa1, sr1
        mov     sa2, sr2
        mov     sa3, sr3
END_GLOBAL_FUNCTION sub_a_b_mod_p_asign_a

/**
 Internal function sub_a_b_mod_p_asign_b
 r = (a - b) mod p, then b = r
 a in sa0...sa3, b in sb0...sb3 (overwritten), r in sr0...sr3
*/
BEGIN_GLOBAL_FUNCTION sub_a_b_mod_p_asign_b
        SUB_A_B_MOD_P
        // copy the result into b
        mov     sb0, sr0
        mov     sb1, sr1
        mov     sb2, sr2
        mov     sb3, sr3
END_GLOBAL_FUNCTION sub_a_b_mod_p_asign_b

/**
 Public C function secp256k1_sub_mod_p
 r = (a - b) mod p
 Input:
 io0: address of scalar a
 io1: address of scalar b
 io2: address the result r is stored to
*/
BEGIN_C_FUNCTION secp256k1_sub_mod_p  
        INIT_PRIME_P   
        LOAD_A_B
        SUB_A_B_MOD_P
        STORE_R_LO io2
END_C_FUNCTION secp256k1_sub_mod_p

/**
 Macro negate_a_mod_p
 Modular negation modulo p
 r = (0 - a) mod p
 a in sa0...sa3 (left unchanged)
 r in sr0...sr3
 The prime is expected in pp0 and pp1.
 The result is fully reduced only if a < p; the public functions in this
 file reduce a before using the macro.
 Clobbers sr4 and the flags.
*/
.macro negate_a_mod_p
        subs    sr0, xzr, sa0
        sbcs    sr1, xzr, sa1
        sbcs    sr2, xzr, sa2
        sbcs    sr3, xzr, sa3
        sbc     sr4, xzr, xzr                   // 0 = no borrow, all ones = borrow
        cbz     sr4, .Lneg_no_borrow_\@
        // a != 0: 0 - a wrapped around, add p and drop the carry
        adds    sr0, sr0, pp0
        adcs    sr1, sr1, pp1
        adcs    sr2, sr2, pp1
        adc     sr3, sr3, pp1
        b       .Lneg_done_\@
    .Lneg_no_borrow_\@:
        // no borrow happens only for a = 0, so r is 0 here
        REDUCE_256_MOD_P
    .Lneg_done_\@:
.endm

/**
 Public C function secp256k1_negate_mod_p
 r = -a mod p
 Input:
 io0: address of scalar a
 io1: address the result r is stored to
 a is reduced below p first so that a single addition of p inside
 negate_a_mod_p is enough.
*/
BEGIN_C_FUNCTION secp256k1_negate_mod_p  
        INIT_PRIME_P   
        LOAD_A
        REDUCE_256_MOD_P sa0 sa1 sa2 sa3
        negate_a_mod_p
        STORE_R_LO io1
END_C_FUNCTION secp256k1_negate_mod_p

/**
 Public C function secp256k1_negate_mod_p_asign
 In-place variant: a = -a mod p
 Input:
 io0: address of scalar a; the result is stored to the same address
*/
BEGIN_C_FUNCTION secp256k1_negate_mod_p_asign  
        INIT_PRIME_P   
        LOAD_A
        REDUCE_256_MOD_P sa0 sa1 sa2 sa3
        negate_a_mod_p
        STORE_R_LO io0
END_C_FUNCTION secp256k1_negate_mod_p_asign


/**
 Internal function square_a_mod_p
 r = (a * a) mod p, result in sr0...sr3
 NOTE(review): SQUARE_U256 is defined outside this file; presumably it
 leaves the 512-bit square of sa0...sa3 in sr0...sr7, which is what
 REDUCE_512_MOD_P consumes - confirm.
*/
BEGIN_GLOBAL_FUNCTION square_a_mod_p
        SQUARE_U256
        REDUCE_512_MOD_P
END_GLOBAL_FUNCTION square_a_mod_p

/**
 Internal function square_a_mod_p_asign_a
 r = (a * a) mod p, then a = r
*/
BEGIN_GLOBAL_FUNCTION square_a_mod_p_asign_a
        SQUARE_U256
        REDUCE_512_MOD_P
        COPY_R_LO_TO_A
END_GLOBAL_FUNCTION square_a_mod_p_asign_a

/**
 Internal function square_a_mod_p_asign_b
 r = (a * a) mod p, then b = r
*/
BEGIN_GLOBAL_FUNCTION square_a_mod_p_asign_b
        SQUARE_U256
        REDUCE_512_MOD_P
        COPY_R_LO_TO_B
END_GLOBAL_FUNCTION square_a_mod_p_asign_b

/**
 Public C function secp256k1_square_mod_p
 r = (a * a) mod p
 Input:
 io0: address of scalar a
 io1: address the result r is stored to
*/
BEGIN_C_FUNCTION secp256k1_square_mod_p
        INIT_PRIME_P
        LOAD_A
        SQUARE_U256
        REDUCE_512_MOD_P
        STORE_R_LO io1
END_C_FUNCTION secp256k1_square_mod_p

/**
 Public C function secp256k1_square_mod_p_asign
 In-place variant: a = (a * a) mod p
 Input:
 io0: address of scalar a; the result is stored to the same address
*/
BEGIN_C_FUNCTION secp256k1_square_mod_p_asign
        INIT_PRIME_P
        LOAD_A
        SQUARE_U256
        REDUCE_512_MOD_P
        STORE_R_LO io0
END_C_FUNCTION secp256k1_square_mod_p_asign

/**
  Macro MUL_A_B_MOD_P
  multiplies two scalars modulo p
  r = (a * b) mod p
  Input: 
  a: in sa0-sa3
  b: in sb0-sb3
  Output:
  r: in sr0-sr3
  NOTE(review): MUL_U256 is defined outside this file; presumably it
  leaves the 512-bit product in sr0...sr7, which is what
  REDUCE_512_MOD_P consumes - confirm.
*/
.macro MUL_A_B_MOD_P
        MUL_U256
        REDUCE_512_MOD_P
.endm

/**
 Internal wrapper function around the macro MUL_A_B_MOD_P.
 r = (a * b) mod p with a in sa0...sa3, b in sb0...sb3, r in sr0...sr3.
*/
BEGIN_GLOBAL_FUNCTION mul_a_b_mod_p
        MUL_A_B_MOD_P
END_GLOBAL_FUNCTION mul_a_b_mod_p

/**
 Internal function mul_a_b_mod_p_asign_b
 r = (a * b) mod p, then b = r
*/
BEGIN_GLOBAL_FUNCTION mul_a_b_mod_p_asign_b
		MUL_A_B_MOD_P
        COPY_R_LO_TO_B
END_GLOBAL_FUNCTION mul_a_b_mod_p_asign_b

/**
 Internal function mul_a_b_mod_p_asign_a
 r = (a * b) mod p, then a = r
*/
BEGIN_GLOBAL_FUNCTION mul_a_b_mod_p_asign_a
        MUL_A_B_MOD_P
        COPY_R_LO_TO_A
END_GLOBAL_FUNCTION mul_a_b_mod_p_asign_a

/** 
Public C function secp256k1_mul_mod_p
r = (a * b) mod p
Input:
io0: address of scalar a
io1: address of scalar b
io2: address the result r is stored to
*/
BEGIN_C_FUNCTION secp256k1_mul_mod_p
		INIT_PRIME_P
        LOAD_A_B
        MUL_A_B_MOD_P
        STORE_R_LO io2
END_C_FUNCTION secp256k1_mul_mod_p

/**
Macro CUBE_A_MOD_P cubes scalar a modulo p
r = a^3 mod p
Input:  a in sa0...sa3
Result: r in sr0...sr3
Side effects: b (sb0...sb3) receives the original a and a is replaced by
a^2 mod p. The macro calls functions with bl, so the link register is
overwritten.
NOTE(review): assumes the enclosing function saves the link register in
its prologue - confirm in BEGIN_GLOBAL_FUNCTION / BEGIN_C_FUNCTION.
*/
.macro CUBE_A_MOD_P
        // b = a
        mov     sb0, sa0
        mov     sb1, sa1
        mov     sb2, sa2
        mov     sb3, sa3
        bl      square_a_mod_p_asign_a          // a = a^2 mod p
        bl      mul_a_b_mod_p                   // r = a^2 * b = a^3 mod p
.endm

/**
 Internal wrapper function around the macro CUBE_A_MOD_P.
 r = a^3 mod p with a in sa0...sa3 and r in sr0...sr3.
 a and b are overwritten by the macro.
*/
BEGIN_GLOBAL_FUNCTION cube_a_mod_p
        CUBE_A_MOD_P
END_GLOBAL_FUNCTION cube_a_mod_p

/**
 Internal function cube_a_mod_p_asign_a
 r = a^3 mod p, then a = r
 b is overwritten with the original a by CUBE_A_MOD_P.
*/
BEGIN_GLOBAL_FUNCTION cube_a_mod_p_asign_a
        CUBE_A_MOD_P
        COPY_R_LO_TO_A
END_GLOBAL_FUNCTION cube_a_mod_p_asign_a

/**
 Internal function cube_a_mod_p_asign_b
 r = a^3 mod p, then b = r
 a is replaced by a^2 mod p as a side effect of CUBE_A_MOD_P.
*/
BEGIN_GLOBAL_FUNCTION cube_a_mod_p_asign_b
        CUBE_A_MOD_P
        COPY_R_LO_TO_B
        // the END name must match the BEGIN name of this function
END_GLOBAL_FUNCTION cube_a_mod_p_asign_b

/**
 Public C function secp256k1_cube_mod_p
 r = a^3 mod p
 Input:
 io0: address of scalar a
 io1: address the result r is stored to
*/
BEGIN_C_FUNCTION secp256k1_cube_mod_p
        INIT_PRIME_P
        LOAD_A io0
        CUBE_A_MOD_P
        STORE_R_LO io1
END_C_FUNCTION secp256k1_cube_mod_p

/**
 Public C function secp256k1_cube_mod_p_asign
 In-place variant: a = a^3 mod p
 Input:
 io0: address of scalar a; the result is stored to the same address
*/
BEGIN_C_FUNCTION secp256k1_cube_mod_p_asign
        INIT_PRIME_P
        LOAD_A io0  
        CUBE_A_MOD_P
        STORE_R_LO io0
END_C_FUNCTION secp256k1_cube_mod_p_asign


/*
Macros for the functions that use the same exponentiation algorithm but
with different exponents. The exponent is fixed at assembly time and is
spelled out bit by bit, least significant bit first.
NOTE(review): the macros call functions with bl; this assumes the
enclosing function saves the link register - confirm.
*/
// One exponent bit that is 0: only square the running power a.
.macro BIT0   //(exp & 1 != 1)
        bl square_a_mod_p_asign_a
.endm
// One exponent bit that is 1: multiply the result b by the running
// power a (b = a*b mod p), then square a.
.macro BIT1   //(exp & 1 == 1)
        bl mul_a_b_mod_p_asign_b
        bl square_a_mod_p_asign_a  
.endm
// Exponent nibble 0x0 = 0b0000: four zero bits.
.macro NIB0   //0x0
        .rept 4
        BIT0
        .endr
.endm
// Exponent nibble 0x1 = 0b0001, least significant bit first.
.macro NIB1   //0x1
        BIT1
        BIT0
        BIT0
        BIT0
.endm
// Exponent nibble 0x2 = 0b0010, least significant bit first.
.macro NIB2   //0x2
        BIT0
        BIT1
        BIT0
        BIT0
.endm
// Exponent nibble 0x3 = 0b0011, least significant bit first.
.macro NIB3   //0x3
        BIT1
        BIT1
        BIT0
        BIT0
.endm
// Exponent nibble 0x7 = 0b0111, least significant bit first.
.macro NIB7   //0x7
        BIT1
        BIT1
        BIT1
        BIT0
.endm

// Exponent nibble 0xb = 0b1011, least significant bit first.
.macro NIBb  //0xb
        BIT1
        BIT1
        BIT0
        BIT1
.endm
// Exponent nibble 0xc = 0b1100, least significant bit first.
.macro NIBc  //0xc
        BIT0
        BIT0
        BIT1
        BIT1
.endm
// Exponent nibble 0xd = 0b1101, least significant bit first.
.macro NIBd  //0xd
        BIT1
        BIT0
        BIT1
        BIT1
.endm
// Exponent nibble 0xe = 0b1110, least significant bit first.
.macro NIBe //0xe
        BIT0
        BIT1
        BIT1
        BIT1
.endm
// Exponent nibble 0xf = 0b1111: four one bits.
.macro NIBf   //0xf
        .rept 4
        BIT1
        .endr
.endm
// Exponent byte 0xff: two nibbles of 0xf, i.e. eight one bits.
.macro BYTEff //ff
        NIBf
        NIBf
.endm
// Least significant 64 bits of p-2 (0xfffffffefffffc2d), spelled out
// nibble by nibble starting with the least significant nibble.
.macro QUADX  //fffffffefffffc2d
        NIBd
        NIB2
        NIBc
        NIBf
        BYTEff
        BYTEff
        NIBe
        NIBf
        BYTEff
        BYTEff        
        BYTEff        
.endm

// Least significant 64 bits of (p+1)/4 (0xffffffffbfffff0c), spelled out
// nibble by nibble starting with the least significant nibble.
.macro QUADY //0xFFFFFFFFBFFFFF0C
        NIBc
        NIB0
        BYTEff
        BYTEff
        NIBf
        NIBb
        BYTEff
        BYTEff
        BYTEff
        BYTEff        
.endm 

// Least significant 64 bits of (p-1)/2 (0xffffffff7ffffe17), spelled out
// nibble by nibble starting with the least significant nibble.
.macro QUADZ //ffffffff7ffffe17
        NIB7
        NIB1
        NIBe
        BYTEff
        BYTEff
        NIB7
        BYTEff
        BYTEff
        BYTEff
        BYTEff                
.endm


// 64 exponent bits that are all one (0xffffffffffffffff): eight 0xff bytes.
.macro QUADff
        .rept 8
        BYTEff
        .endr
.endm
// Most significant 64 exponent bits 0x3fffffffffffffff, least significant
// nibble first: seven 0xff bytes, one 0xf nibble and the top nibble 0x3.
.macro QUAD3f
        .rept 7
        BYTEff
        .endr
        NIBf
        NIB3
.endm
// Most significant 64 exponent bits 0x7fffffffffffffff, least significant
// nibble first: seven 0xff bytes, one 0xf nibble and the top nibble 0x7.
.macro QUAD7f
        .rept 7
        BYTEff
        .endr
        NIBf
        NIB7
.endm

/*
Macro INVERT_A_MOD_P_INTO_B
calculates the inverse of scalar a modulo p 
b = inv a = a^(-1) = a^(p-2) (mod p)
--> a*b = a*(inv a) = 1 (mod p)
Input
a: sa0 sa1 sa2 sa3 (overwritten by the repeated squaring)
Result
b: sb0 sb1 sb2 sb3  

Algorithm (all operations are modulo p): 
exp=0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffefffffc2d //p-2
b = 1 //The result
for(int i = 0; i<256; i++){
   if (exp & 1 == 1){
            b = b * a mod p;    
   }
   a = a * a mod p;             
   exp = exp >> 1;
}
In INVERT_A_MOD_P_INTO_B the for-loop and the anding and shifting of the
exponent (p-2) are unrolled using the macros above: QUADX is the least
significant 64 bits of the exponent, the three QUADff are the rest.
*/

.macro INVERT_A_MOD_P_INTO_B
        SET_B_TO_1
        QUADX
        QUADff
        QUADff
        QUADff
.endm

/**
 Internal wrapper function around the macro INVERT_A_MOD_P_INTO_B.
 b = a^(p-2) mod p with a in sa0...sa3 and b in sb0...sb3.
 a is overwritten.
*/
BEGIN_GLOBAL_FUNCTION invert_a_mod_p_into_b
        INVERT_A_MOD_P_INTO_B
END_GLOBAL_FUNCTION invert_a_mod_p_into_b

/**
 Public C function secp256k1_invert_mod_p
 r = a^(p-2) mod p, the modular inverse of a
 Input:
 io0: address of scalar a
 io1: address the result is stored to
 NOTE(review): io1 must survive the functions called with bl inside
 INVERT_A_MOD_P_INTO_B - confirm against the alias definitions.
*/
BEGIN_C_FUNCTION secp256k1_invert_mod_p
        INIT_PRIME_P
        LOAD_A
        INVERT_A_MOD_P_INTO_B
        STORE_B io1
END_C_FUNCTION secp256k1_invert_mod_p

/**
 Public C function secp256k1_invert_mod_p_asign
 In-place variant: a = a^(p-2) mod p
 Input:
 io0: address of scalar a; the result is stored to the same address
 NOTE(review): io0 must survive the functions called with bl inside
 INVERT_A_MOD_P_INTO_B - confirm against the alias definitions.
*/
BEGIN_C_FUNCTION secp256k1_invert_mod_p_asign
        INIT_PRIME_P
        LOAD_A
        INVERT_A_MOD_P_INTO_B
        STORE_B io0
END_C_FUNCTION secp256k1_invert_mod_p_asign


/*
Function exponentiate_a_e_mod_p
raises the base a to the power of the exponent e modulo the prime p.
b = a^e  (mod p)
Input
a:  sa0...sa3
e:  sk0...sk3 (sk0 is consumed first, one bit at a time starting with
    its least significant bit)
Result
b: sb0 sb1 sb2 sb3
Both a and e are overwritten while the function runs.

Algorithm (all operations are modulo p): 
b = 1 //The result
for(int i = 0; i<256; i++){ //four runs of LOOP_EXPONENT, 64 bits each
   if (e & 1 == 1){
            b = a * b mod p;   //function mul_a_b_mod_p_asign_b
   }
   a = a * a mod p;            //function square_a_mod_p_asign_a
   e = e >> 1;
}
*/
// Macro LOOP_EXPONENT
// Processes the 64 exponent bits held in the given register, least
// significant bit first: for a set bit b = a*b mod p, and after every bit
// a = a*a mod p. The register is shifted right until it is used up.
// Uses io0 as the bit counter and io3 as scratch.
// NOTE(review): io0 and io3 must survive the two functions called with
// bl - confirm against the alias definitions.
.macro LOOP_EXPONENT register
        mov     io0, #64                        // bits left in this limb
    .Lexp_next_bit_\@:
        cbz     io0, .Lexp_limb_done_\@
        and     io3, \register, #1              // isolate the current bit
        cbz     io3, .Lexp_square_\@
        bl      mul_a_b_mod_p_asign_b           // bit set: b = a*b mod p
    .Lexp_square_\@:
        bl      square_a_mod_p_asign_a          // a = a*a mod p
        lsr     \register, \register, #1
        sub     io0, io0, #1
        b       .Lexp_next_bit_\@
    .Lexp_limb_done_\@:
.endm

/**
 Internal function exponentiate_a_e_mod_p
 b = a^e mod p
 a in sa0...sa3, e in sk0...sk3, result b in sb0...sb3.
 The exponent limbs are processed from sk0 to sk3; a and e are
 overwritten, io0 and io3 are used as scratch by LOOP_EXPONENT.
*/
BEGIN_GLOBAL_FUNCTION exponentiate_a_e_mod_p 
      SET_B_TO_1
      LOOP_EXPONENT sk0
      LOOP_EXPONENT sk1
      LOOP_EXPONENT sk2
      LOOP_EXPONENT sk3
END_GLOBAL_FUNCTION exponentiate_a_e_mod_p

/**
 Public C function secp256k1_exponentiate_mod_p
 r = a^e mod p
 Input:
 io0: address of the base a
 io1: address of the exponent e (loaded by LOAD_K)
 io2: address the result is stored to
 NOTE(review): io2 must survive the call to exponentiate_a_e_mod_p, which
 itself uses io0 and io3 as scratch - confirm against the alias definitions.
*/
BEGIN_C_FUNCTION secp256k1_exponentiate_mod_p
        INIT_PRIME_P
        LOAD_A 
        LOAD_K io1
        bl exponentiate_a_e_mod_p
        STORE_B io2
END_C_FUNCTION secp256k1_exponentiate_mod_p


/**
 Internal function squareroot_a_mod_p_asign_b
 b = a^((p+1)/4) mod p, which is a square root of a if a has one.
 a in sa0...sa3 (overwritten), result b in sb0...sb3.
 The exponent 0x3fff...ffffbfffff0c is unrolled with the macros above:
 QUADY is its least significant 64 bits, QUAD3f its most significant.
*/
BEGIN_GLOBAL_FUNCTION squareroot_a_mod_p_asign_b
        SET_B_TO_1
        QUADY
        QUADff
        QUADff
        QUAD3f
END_GLOBAL_FUNCTION squareroot_a_mod_p_asign_b

/*
Function secp256k1_squareroot_mod_p
calculates the square root b of a scalar a modulo p.
e = (p+1)/4 = 0x3FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFFFF0C
b = a^e = square root of a
a = b^2 mod p
The result is undefined if a has no square root.
Input
io0: address of a (loaded into sa0 sa1 sa2 sa3)
io1: address the result is stored to
Result
b: sb0 sb1 sb2 sb3, stored to the address in io1
Algorithm (all operations are modulo p): 
exp=0x3FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFFFFF0C //(p+1)/4
b = 1 //The result
for(int i = 0; i<256; i++){
   if (exp & 1 == 1){
            b = b * a mod p;   //function mul_a_b_mod_p_asign_b
   }
   a = a * a mod p;            //function square_a_mod_p_asign_a
   exp = exp >> 1;
}
In squareroot_a_mod_p_asign_b the for-loop and the anding and shifting of
the exponent are unrolled using the macros above.
*/
BEGIN_C_FUNCTION  secp256k1_squareroot_mod_p
        INIT_PRIME_P
        LOAD_A 
        bl squareroot_a_mod_p_asign_b
        STORE_B io1
END_C_FUNCTION secp256k1_squareroot_mod_p


/**
 Public C function secp256k1_squareroot_mod_p_asign
 In-place variant: a = a^((p+1)/4) mod p
 Input:
 io0: address of scalar a; the result is stored to the same address
 The result is undefined if a has no square root.
*/
BEGIN_C_FUNCTION  secp256k1_squareroot_mod_p_asign
        INIT_PRIME_P
        LOAD_A 
        bl squareroot_a_mod_p_asign_b
        STORE_B io0
END_C_FUNCTION secp256k1_squareroot_mod_p_asign

/**
Function secp256k1_has_squareroot_mod_p
calculates Eulers criterion b = a^((p-1)/2) = a^((p-1)>>1) (mod p).
b is 1 if a has a square root, otherwise p-1.
Input
io0: address of a
Result
io0: the least significant 64-bit limb of b (sb0), not a normalised flag.
     By the criterion above this is 1 when a has a square root and the
     low limb of p-1 (a non-zero value) when it has none.
     NOTE(review): callers have to compare the result with 1; testing it
     for non-zero accepts non-residues as well - confirm the C prototype.
Algorithm (all operations are modulo p): 
exp=0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffff7ffffe17 //(p-1)/2
b = 1  
for(int i = 0; i<256; i++){
   if (exp & 1 == 1){
            b = b * a mod p;   
   }
   a = a * a mod p;            
   exp = exp >> 1;
}
In this function the for-loop and the anding and shifting of the
exponent are unrolled using the macros above.
*/


BEGIN_C_FUNCTION secp256k1_has_squareroot_mod_p
        INIT_PRIME_P
        LOAD_A 
        SET_B_TO_1
        QUADZ
        QUADff
        QUADff
        QUAD7f
        mov io0, sb0

END_C_FUNCTION secp256k1_has_squareroot_mod_p