1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
/* LibTomMath, multiple-precision integer library -- Tom St Denis
*
* LibTomMath is library that provides for multiple-precision
* integer arithmetic as well as number theoretic functionality.
*
* The library is designed directly after the MPI library by
* Michael Fromberger but has been written from scratch with
* additional optimizations in place.
*
* The library is free for all purposes without any express
* guarantee it works.
*
* Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
*/
#include <tommath.h>
/* fast squaring
*
* This is the comba method where the columns of the product are computed first
* then the carries are computed. This has the effect of making a very simple
* inner loop that is executed the most
*
* W2 represents the outer products and W the inner.
*
* A further optimizations is made because the inner products are of the form
* "A * B * 2". The *2 part does not need to be computed until the end which is
* good because 64-bit shifts are slow!
*
* Based on Algorithm 14.16 on pp.597 of HAC.
*
*/
int
fast_s_mp_sqr (mp_int * a, mp_int * b)
{
int olduse, newused, res, ix, pa;
mp_word W2[512], W[512];
/* calculate size of product and allocate as required */
pa = a->used;
newused = pa + pa + 1;
if (b->alloc < newused) {
if ((res = mp_grow (b, newused)) != MP_OKAY) {
return res;
}
}
/* zero temp buffer (columns)
* Note that there are two buffers. Since squaring requires
* a outter and inner product and the inner product requires
* computing a product and doubling it (a relatively expensive
* op to perform n^2 times if you don't have to) the inner and
* outer products are computed in different buffers. This way
* the inner product can be doubled using n doublings instead of
* n^2
*/
memset (W, 0, newused * sizeof (mp_word));
memset (W2, 0, newused * sizeof (mp_word));
/* note optimization
* values in W2 are only written in even locations which means
* we can collapse the array to 256 words [and fixup the memset above]
* provided we also fix up the summations below. Ideally
* the fixup loop should be unrolled twice to handle the even/odd
* cases, and then a final step to handle odd cases [e.g. newused == odd]
*
* This will not only save ~8*256 = 2KB of stack but lower the number of
* operations required to finally fix up the columns
*/
/* This computes the inner product. To simplify the inner N^2 loop
* the multiplication by two is done afterwards in the N loop.
*/
for (ix = 0; ix < pa; ix++) {
/* compute the outer product
*
* Note that every outer product is computed
* for a particular column only once which means that
* there is no need todo a double precision addition
*/
W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
{
register mp_digit tmpx, *tmpy;
register mp_word *_W;
register int iy;
/* copy of left side */
tmpx = a->dp[ix];
/* alias for right side */
tmpy = a->dp + (ix + 1);
/* the column to store the result in */
_W = W + (ix + ix + 1);
/* inner products */
for (iy = ix + 1; iy < pa; iy++) {
*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
}
}
}
/* setup dest */
olduse = b->used;
b->used = newused;
/* double first value, since the inner products are half of what they should be */
W[0] += W[0] + W2[0];
/* now compute digits */
{
register mp_digit *tmpb;
tmpb = b->dp;
for (ix = 1; ix < newused; ix++) {
/* double/add next digit */
W[ix] += W[ix] + W2[ix];
W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
*tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
}
*tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
/* clear high */
for (; ix < olduse; ix++) {
*tmpb++ = 0;
}
}
mp_clamp (b);
return MP_OKAY;
}