369 lines
9.0 KiB
C
369 lines
9.0 KiB
C
/* TomsFastMath, a fast ISO C bignum library.
|
|
*
|
|
* This project is meant to fill in where LibTomMath
|
|
* falls short. That is speed ;-)
|
|
*
|
|
* This project is public domain and free for all purposes.
|
|
*
|
|
* Tom St Denis, tomstdenis@gmail.com
|
|
*/
|
|
|
|
/* About this file...
|
|
|
|
*/
|
|
|
|
#include "bignum_fast.h"
|
|
|
|
#if defined(TFM_PRESCOTT) && defined(TFM_SSE2)
|
|
#undef TFM_SSE2
|
|
#define TFM_X86
|
|
#endif
|
|
|
|
/* these are the combas. Worship them. */
|
|
#if defined(TFM_X86)
|
|
/* Generic x86 optimized code */
|
|
|
|
/* anything you need at the start */
|
|
#define COMBA_START
|
|
|
|
/* clear the chaining variables */
|
|
#define COMBA_CLEAR \
|
|
c0 = c1 = c2 = 0;
|
|
|
|
/* forward the carry to the next digit */
|
|
#define COMBA_FORWARD \
|
|
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
|
|
|
/* store the first sum */
|
|
#define COMBA_STORE(x) \
|
|
x = c0;
|
|
|
|
/* store the second sum [carry] */
|
|
#define COMBA_STORE2(x) \
|
|
x = c1;
|
|
|
|
/* anything you need at the end */
|
|
#define COMBA_FINI
|
|
|
|
/* this should multiply i and j */
|
|
#define MULADD(i, j) \
|
|
asm( \
|
|
"movl %6,%%eax \n\t" \
|
|
"mull %7 \n\t" \
|
|
"addl %%eax,%0 \n\t" \
|
|
"adcl %%edx,%1 \n\t" \
|
|
"adcl $0,%2 \n\t" \
|
|
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","%edx","cc");
|
|
|
|
#elif defined(TFM_X86_64)
|
|
/* x86-64 optimized */
|
|
|
|
/* anything you need at the start */
|
|
#define COMBA_START
|
|
|
|
/* clear the chaining variables */
|
|
#define COMBA_CLEAR \
|
|
c0 = c1 = c2 = 0;
|
|
|
|
/* forward the carry to the next digit */
|
|
#define COMBA_FORWARD \
|
|
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
|
|
|
/* store the first sum */
|
|
#define COMBA_STORE(x) \
|
|
x = c0;
|
|
|
|
/* store the second sum [carry] */
|
|
#define COMBA_STORE2(x) \
|
|
x = c1;
|
|
|
|
/* anything you need at the end */
|
|
#define COMBA_FINI
|
|
|
|
/* this should multiply i and j */
|
|
#define MULADD(i, j) \
|
|
asm ( \
|
|
"movq %6,%%rax \n\t" \
|
|
"mulq %7 \n\t" \
|
|
"addq %%rax,%0 \n\t" \
|
|
"adcq %%rdx,%1 \n\t" \
|
|
"adcq $0,%2 \n\t" \
|
|
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "g"(i), "g"(j) :"%rax","%rdx","cc");
|
|
|
|
#elif defined(TFM_SSE2)
|
|
/* use SSE2 optimizations */
|
|
|
|
/* anything you need at the start */
|
|
#define COMBA_START
|
|
|
|
/* clear the chaining variables */
|
|
#define COMBA_CLEAR \
|
|
c0 = c1 = c2 = 0;
|
|
|
|
/* forward the carry to the next digit */
|
|
#define COMBA_FORWARD \
|
|
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
|
|
|
/* store the first sum */
|
|
#define COMBA_STORE(x) \
|
|
x = c0;
|
|
|
|
/* store the second sum [carry] */
|
|
#define COMBA_STORE2(x) \
|
|
x = c1;
|
|
|
|
/* anything you need at the end */
|
|
#define COMBA_FINI \
|
|
asm("emms");
|
|
|
|
/* this should multiply i and j */
|
|
#define MULADD(i, j) \
|
|
asm( \
|
|
"movd %6,%%mm0 \n\t" \
|
|
"movd %7,%%mm1 \n\t" \
|
|
"pmuludq %%mm1,%%mm0\n\t" \
|
|
"movd %%mm0,%%eax \n\t" \
|
|
"psrlq $32,%%mm0 \n\t" \
|
|
"addl %%eax,%0 \n\t" \
|
|
"movd %%mm0,%%eax \n\t" \
|
|
"adcl %%eax,%1 \n\t" \
|
|
"adcl $0,%2 \n\t" \
|
|
:"=r"(c0), "=r"(c1), "=r"(c2): "0"(c0), "1"(c1), "2"(c2), "m"(i), "m"(j) :"%eax","cc");
|
|
|
|
#elif defined(TFM_ARM)
|
|
/* ARM code */
|
|
|
|
#define COMBA_START
|
|
|
|
#define COMBA_CLEAR \
|
|
c0 = c1 = c2 = 0;
|
|
|
|
#define COMBA_FORWARD \
|
|
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
|
|
|
#define COMBA_STORE(x) \
|
|
x = c0;
|
|
|
|
#define COMBA_STORE2(x) \
|
|
x = c1;
|
|
|
|
#define COMBA_FINI
|
|
|
|
#define MULADD(i, j) \
|
|
asm( \
|
|
" UMULL r0,r1,%6,%7 \n\t" \
|
|
" ADDS %0,%0,r0 \n\t" \
|
|
" ADCS %1,%1,r1 \n\t" \
|
|
" ADC %2,%2,#0 \n\t" \
|
|
:"=r"(c0), "=r"(c1), "=r"(c2) : "0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j) : "r0", "r1", "cc");
|
|
|
|
#elif defined(TFM_PPC32)
|
|
/* For 32-bit PPC */
|
|
|
|
#define COMBA_START
|
|
|
|
#define COMBA_CLEAR \
|
|
c0 = c1 = c2 = 0;
|
|
|
|
#define COMBA_FORWARD \
|
|
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
|
|
|
#define COMBA_STORE(x) \
|
|
x = c0;
|
|
|
|
#define COMBA_STORE2(x) \
|
|
x = c1;
|
|
|
|
#define COMBA_FINI
|
|
|
|
/* untested: will mulhwu change the flags? Docs say no */
|
|
#define MULADD(i, j) \
|
|
asm( \
|
|
" mullw 16,%6,%7 \n\t" \
|
|
" addc %0,%0,16 \n\t" \
|
|
" mulhwu 16,%6,%7 \n\t" \
|
|
" adde %1,%1,16 \n\t" \
|
|
" addze %2,%2 \n\t" \
|
|
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"16");
|
|
|
|
#elif defined(TFM_PPC64)
|
|
/* For 64-bit PPC */
|
|
|
|
#define COMBA_START
|
|
|
|
#define COMBA_CLEAR \
|
|
c0 = c1 = c2 = 0;
|
|
|
|
#define COMBA_FORWARD \
|
|
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
|
|
|
#define COMBA_STORE(x) \
|
|
x = c0;
|
|
|
|
#define COMBA_STORE2(x) \
|
|
x = c1;
|
|
|
|
#define COMBA_FINI
|
|
|
|
/* untested: will mulhdu change the flags? Docs say no */
|
|
#define MULADD(i, j) \
|
|
asm( \
|
|
" mulld r16,%6,%7 \n\t" \
|
|
" addc %0,%0,16 \n\t" \
|
|
" mulhdu r16,%6,%7 \n\t" \
|
|
" adde %1,%1,16 \n\t" \
|
|
" addze %2,%2 \n\t" \
|
|
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r16");
|
|
|
|
#elif defined(TFM_AVR32)
|
|
|
|
/* ISO C code */
|
|
|
|
#define COMBA_START
|
|
|
|
#define COMBA_CLEAR \
|
|
c0 = c1 = c2 = 0;
|
|
|
|
#define COMBA_FORWARD \
|
|
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
|
|
|
#define COMBA_STORE(x) \
|
|
x = c0;
|
|
|
|
#define COMBA_STORE2(x) \
|
|
x = c1;
|
|
|
|
#define COMBA_FINI
|
|
|
|
#define MULADD(i, j) \
|
|
asm( \
|
|
" mulu.d r2,%6,%7 \n\t"\
|
|
" add %0,r2 \n\t"\
|
|
" adc %1,%1,r3 \n\t"\
|
|
" acr %2 \n\t"\
|
|
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"r2","r3");
|
|
|
|
#elif defined(TFM_MIPS)
|
|
|
|
#define COMBA_START
|
|
|
|
#define COMBA_CLEAR \
|
|
c0 = c1 = c2 = 0;
|
|
|
|
#define COMBA_FORWARD \
|
|
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
|
|
|
#define COMBA_STORE(x) \
|
|
x = c0;
|
|
|
|
#define COMBA_STORE2(x) \
|
|
x = c1;
|
|
|
|
#define COMBA_FINI
|
|
|
|
#define MULADD(i, j) \
|
|
asm( \
|
|
" multu %6,%7 \n\t" \
|
|
" mflo $12 \n\t" \
|
|
" mfhi $13 \n\t" \
|
|
" addu %0,%0,$12 \n\t" \
|
|
" sltu $12,%0,$12 \n\t" \
|
|
" addu %1,%1,$13 \n\t" \
|
|
" sltu $13,%1,$13 \n\t" \
|
|
" addu %1,%1,$12 \n\t" \
|
|
" sltu $12,%1,$12 \n\t" \
|
|
" addu %2,%2,$13 \n\t" \
|
|
" addu %2,%2,$12 \n\t" \
|
|
:"=r"(c0), "=r"(c1), "=r"(c2):"0"(c0), "1"(c1), "2"(c2), "r"(i), "r"(j):"$12","$13");
|
|
|
|
#else
|
|
/* ISO C code */
|
|
|
|
#define COMBA_START
|
|
|
|
#define COMBA_CLEAR \
|
|
c0 = c1 = c2 = 0;
|
|
|
|
#define COMBA_FORWARD \
|
|
do { c0 = c1; c1 = c2; c2 = 0; } while (0);
|
|
|
|
#define COMBA_STORE(x) \
|
|
x = c0;
|
|
|
|
#define COMBA_STORE2(x) \
|
|
x = c1;
|
|
|
|
#define COMBA_FINI
|
|
|
|
#define MULADD(i, j) \
|
|
do { fp_word t; \
|
|
t = (fp_word)c0 + ((fp_word)i) * ((fp_word)j); c0 = t; \
|
|
t = (fp_word)c1 + (t >> DIGIT_BIT); c1 = t; c2 += t >> DIGIT_BIT; \
|
|
} while (0);
|
|
|
|
#endif
|
|
|
|
#ifndef TFM_DEFINES
|
|
|
|
/* generic PxQ multiplier */
|
|
void fp_mul_comba(fp_int *A, fp_int *B, fp_int *C)
|
|
{
|
|
int ix, iy, iz, tx, ty, pa;
|
|
fp_digit c0, c1, c2, *tmpx, *tmpy;
|
|
fp_int tmp, *dst;
|
|
|
|
COMBA_START;
|
|
COMBA_CLEAR;
|
|
|
|
/* get size of output and trim */
|
|
pa = A->used + B->used;
|
|
if (pa >= FP_SIZE) {
|
|
pa = FP_SIZE-1;
|
|
}
|
|
|
|
if (A == C || B == C) {
|
|
fp_zero(&tmp);
|
|
dst = &tmp;
|
|
} else {
|
|
fp_zero(C);
|
|
dst = C;
|
|
}
|
|
|
|
for (ix = 0; ix < pa; ix++) {
|
|
/* get offsets into the two bignums */
|
|
ty = MIN(ix, B->used-1);
|
|
tx = ix - ty;
|
|
|
|
/* setup temp aliases */
|
|
tmpx = A->dp + tx;
|
|
tmpy = B->dp + ty;
|
|
|
|
/* this is the number of times the loop will iterrate, essentially its
|
|
while (tx++ < a->used && ty-- >= 0) { ... }
|
|
*/
|
|
iy = MIN(A->used-tx, ty+1);
|
|
|
|
/* execute loop */
|
|
COMBA_FORWARD;
|
|
for (iz = 0; iz < iy; ++iz) {
|
|
MULADD(*tmpx++, *tmpy--);
|
|
}
|
|
|
|
/* store term */
|
|
COMBA_STORE(dst->dp[ix]);
|
|
}
|
|
COMBA_FINI;
|
|
|
|
dst->used = pa;
|
|
dst->sign = A->sign ^ B->sign;
|
|
fp_clamp(dst);
|
|
fp_copy(dst, C);
|
|
}
|
|
|
|
#endif
|
|
|
|
/* $Source: /cvs/libtom/tomsfastmath/src/mul/fp_mul_comba.c,v $ */
|
|
/* $Revision: 1.4 $ */
|
|
/* $Date: 2007/03/14 23:47:42 $ */
|
|
|