1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139
/* LibTomMath, multiple-precision integer library -- Tom St Denis
*
* LibTomMath is a library that provides multiple-precision
* integer arithmetic as well as number theoretic functionality.
*
* The library was designed directly after the MPI library by
* Michael Fromberger but has been written from scratch with
* additional optimizations in place.
*
* The library is free for all purposes without any express
* guarantee it works.
*
* Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
*/
#include <tommath.h>
/* fast squaring
*
* This is the comba method where the columns of the product
* are computed first then the carries are computed. This
* has the effect of making a very simple inner loop that
* is executed the most
*
* W2 represents the outer products and W the inner.
*
* A further optimizations is made because the inner
* products are of the form "A * B * 2". The *2 part does
* not need to be computed until the end which is good
* because 64-bit shifts are slow!
*
* Based on Algorithm 14.16 on pp.597 of HAC.
*
*/
int fast_s_mp_sqr (mp_int * a, mp_int * b)
{
int olduse, newused, res, ix, pa;
mp_word W2[MP_WARRAY], W[MP_WARRAY];
/* calculate size of product and allocate as required */
pa = a->used;
newused = pa + pa + 1;
if (b->alloc < newused) {
if ((res = mp_grow (b, newused)) != MP_OKAY) {
return res;
}
}
/* zero temp buffer (columns)
* Note that there are two buffers. Since squaring requires
* a outer and inner product and the inner product requires
* computing a product and doubling it (a relatively expensive
* op to perform n**2 times if you don't have to) the inner and
* outer products are computed in different buffers. This way
* the inner product can be doubled using n doublings instead of
* n**2
*/
memset (W, 0, newused * sizeof (mp_word));
memset (W2, 0, newused * sizeof (mp_word));
/* This computes the inner product. To simplify the inner N**2 loop
* the multiplication by two is done afterwards in the N loop.
*/
for (ix = 0; ix < pa; ix++) {
/* compute the outer product
*
* Note that every outer product is computed
* for a particular column only once which means that
* there is no need todo a double precision addition
* into the W2[] array.
*/
W2[ix + ix] = ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
{
register mp_digit tmpx, *tmpy;
register mp_word *_W;
register int iy;
/* copy of left side */
tmpx = a->dp[ix];
/* alias for right side */
tmpy = a->dp + (ix + 1);
/* the column to store the result in */
_W = W + (ix + ix + 1);
/* inner products */
for (iy = ix + 1; iy < pa; iy++) {
*_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
}
}
}
/* setup dest */
olduse = b->used;
b->used = newused;
/* now compute digits
*
* We have to double the inner product sums, add in the
* outer product sums, propagate carries and convert
* to single precision.
*/
{
register mp_digit *tmpb;
/* double first value, since the inner products are
* half of what they should be
*/
W[0] += W[0] + W2[0];
tmpb = b->dp;
for (ix = 1; ix < newused; ix++) {
/* double/add next digit */
W[ix] += W[ix] + W2[ix];
/* propagate carry forwards [from the previous digit] */
W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
/* store the current digit now that the carry isn't
* needed
*/
*tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
}
/* set the last value. Note even if the carry is zero
* this is required since the next step will not zero
* it if b originally had a value at b->dp[2*a.used]
*/
*tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
/* clear high digits of b if there were any originally */
for (; ix < olduse; ix++) {
*tmpb++ = 0;
}
}
mp_clamp (b);
return MP_OKAY;
}