kmx git

Commit b66471f74f1f16ebd823f71c5602cc6b68e69574

2003-02-28T16:09:08
added libtommath-0.13
diff --git a/bn.pdf b/bn.pdf
index df69417..6b5a2d1 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index b5f0227..5c40174 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
 \documentclass{article}
 \begin{document}
 
-\title{LibTomMath v0.12 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.13 \\ A Free Multiple Precision Integer Library}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@@ -409,7 +409,6 @@ of $c$ is the maximum length of the two inputs.
 Computes $c = a \land b$, pseudo-extends with zeroes whichever of $a$ or $b$ is shorter such that the length
 of $c$ is the maximum length of the two inputs.
 
-
 \subsection{Basic Arithmetic}
 
 \subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
@@ -440,19 +439,18 @@ This function requires no additional memory and $O(N)$ time.
 Computes $c = a \cdot b$ using signed arithmetic.  Handles the sign of the numbers correctly which means it will 
 correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$.
 
-For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used.  It
-requires no additional memory and $O(N^2)$ time.  The comba-baseline multiplier is only used if it can safely be used
-without losing carry digits.  The comba method is faster than the baseline method but cannot always be used which is why
-both are provided.  The code will automatically determine when it can be used.  If the digit count is higher
-than 80 for the inputs than a Karatsuba multiplier is used which requires approximately $O(6 \cdot N)$ memory and 
-$O(N^{lg(3)})$ time.  
+This function requires $O(N^2)$ time for small inputs and $O(N^{1.584})$ time for relatively large 
+inputs (\textit{above the }KARATSUBA\_MUL\_CUTOFF \textit{value defined in bncore.c.}).  There is 
+considerable overhead in the Karatsuba method which only pays off when the digit count is fairly high
+(\textit{typically around 80}).  For small inputs the function requires $O(2N)$ memory, otherwise it
+requires $O(6 \cdot \mbox{lg}(N) \cdot N)$ memory.
+
 
 \subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)}
-Computes $b = a^2$. 
-For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used.  It
-requires no additional memory and $O((N^2 + N)/2)$ time.  The comba-squaring method is used only if it can be safely used
-without losing carry digits.  After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately 
-$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time.
+Computes $b = a^2$ and fixes the sign of $b$ to be positive.
+
+This function has a running time and memory requirement profile very similar to that of the 
+mp\_mul function.  It is always faster and uses less memory for the larger inputs.
 
 \subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
 Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$.  The division is signed which means the sign
@@ -482,7 +480,8 @@ Also note that these functions use mp\_mod which means the result are guaranteed
 
 \subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)}
 This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$.  When
-$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.  
+$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.  If no inverse is found (e.g. $(a, b) \ne 1$) then
+the function returns \textbf{MP\_VAL} and the result in $c$ is undefined.
 
 \subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)}
 Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$.  Will work with either positive
@@ -497,13 +496,13 @@ both.
 
 Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time.
 
-\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)}
+\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int *c)}
 Finds the $b$'th root of $a$ and stores it in $b$.  The roots are found such that $\vert c \vert^b \le \vert a \vert$.  
 Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result.  Each iteration
 requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$.
 
-If the input $a$ is negative and $b$ is even the function returns an error.  Otherwise the function will return a root
-that has a sign that agrees with the sign of $a$.
+If the input $a$ is negative and $b$ is even the function returns \textbf{MP\_VAL}.  Otherwise the function will 
+return a root that has a sign that agrees with the sign of $a$.
 
 \subsubsection{mp\_jacobi(mp\_int *a, mp\_int *n, int *c)}
 Computes $c = \left ( {a \over n} \right )$ or the Jacobi function of $(a, n)$ and stores the result in an integer addressed
diff --git a/bn_fast_mp_invmod.c b/bn_fast_mp_invmod.c
index e1dcce3..249ff43 100644
--- a/bn_fast_mp_invmod.c
+++ b/bn_fast_mp_invmod.c
@@ -14,13 +14,17 @@
  */
 #include <tommath.h>
 
-/* computes the modular inverse via binary extended euclidean algorithm, that is c = 1/a mod b */
+/* computes the modular inverse via binary extended euclidean algorithm, 
+ * that is c = 1/a mod b 
+ *
+ * Based on mp_invmod except this is optimized for the case where b is 
+ * odd as per HAC Note 14.64 on pp. 610
+ */
 int
 fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int    x, y, u, v, B, D;
-  int       res, neg;
-
+  mp_int  x, y, u, v, B, D;
+  int     res, neg;
 
   if ((res = mp_init (&x)) != MP_OKAY) {
     goto __ERR;
@@ -58,7 +62,10 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
     goto __D;
   }
 
-  /* 2. [modified] if x,y are both even then return an error! */
+  /* 2. [modified] if x,y are both even then return an error! 
+   * 
+   * That is if gcd(x,y) = 2 * k then obviously there is no inverse.
+   */
   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
     res = MP_VAL;
     goto __D;
@@ -135,8 +142,9 @@ top:
   }
 
   /* if not zero goto step 4 */
-  if (mp_iszero (&u) == 0)
+  if (mp_iszero (&u) == 0) {
     goto top;
+  }
 
   /* now a = C, b = D, gcd == g*v */
 
diff --git a/bn_fast_mp_montgomery_reduce.c b/bn_fast_mp_montgomery_reduce.c
index dbc3478..2e03936 100644
--- a/bn_fast_mp_montgomery_reduce.c
+++ b/bn_fast_mp_montgomery_reduce.c
@@ -14,12 +14,19 @@
  */
 #include <tommath.h>
 
-/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
+/* computes xR^-1 == x (mod N) via Montgomery Reduction 
+ * 
+ * This is an optimized implementation of mp_montgomery_reduce 
+ * which uses the comba method to quickly calculate the columns of the
+ * reduction.  
+ *
+ * Based on Algorithm 14.32 on pp.601 of HAC.
+*/
 int
 fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 {
-  int       ix, res, olduse;
-  mp_word   W[512];
+  int     ix, res, olduse;
+  mp_word W[512];
 
   /* get old used count */
   olduse = a->used;
@@ -31,14 +38,22 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
     }
   }
 
-  /* copy the digits of a */
-  for (ix = 0; ix < a->used; ix++) {
-    W[ix] = a->dp[ix];
-  }
+  {
+    register mp_word *_W;
+    register mp_digit *tmpa;
+
+    _W = W;
+    tmpa = a->dp;
+
+    /* copy the digits of a */
+    for (ix = 0; ix < a->used; ix++) {
+      *_W++ = *tmpa++;
+    }
 
-  /* zero the high words */
-  for (; ix < m->used * 2 + 1; ix++) {
-    W[ix] = 0;
+    /* zero the high words */
+    for (; ix < m->used * 2 + 1; ix++) {
+      *_W++ = 0;
+    }
   }
 
   for (ix = 0; ix < m->used; ix++) {
@@ -69,8 +84,10 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
       register mp_digit *tmpx;
       register mp_word *_W;
 
-      /* aliases */
+      /* alias for the digits of the modulus */
       tmpx = m->dp;
+
+      /* Alias for the columns set by an offset of ix */
       _W = W + ix;
 
       /* inner loop */
@@ -88,24 +105,32 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
     W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
   }
 
-  /* copy out, A = A/b^n
-   *
-   * The result is A/b^n but instead of converting from an array of mp_word
-   * to mp_digit than calling mp_rshd we just copy them in the right
-   * order
-   */
-  for (ix = 0; ix < m->used + 1; ix++) {
-    a->dp[ix] = W[ix + m->used] & ((mp_word) MP_MASK);
-  }
+  {
+    register mp_digit *tmpa;
+    register mp_word *_W;
 
-  /* set the max used */
-  a->used = m->used + 1;
+    /* copy out, A = A/b^n
+     *
+     * The result is A/b^n but instead of converting from an array of mp_word
+     * to mp_digit than calling mp_rshd we just copy them in the right
+     * order
+     */
+    tmpa = a->dp;
+    _W = W + m->used;
+
+    for (ix = 0; ix < m->used + 1; ix++) {
+      *tmpa++ = *_W++ & ((mp_word) MP_MASK);
+    }
 
-  /* zero oldused digits, if the input a was larger than
-   * m->used+1 we'll have to clear the digits */
-  for (; ix < olduse; ix++) {
-    a->dp[ix] = 0;
+    /* zero oldused digits, if the input a was larger than
+     * m->used+1 we'll have to clear the digits */
+    for (; ix < olduse; ix++) {
+      *tmpa++ = 0;
+    }
   }
+
+  /* set the max used and clamp */
+  a->used = m->used + 1;
   mp_clamp (a);
 
   /* if A >= m then A = A - m */
diff --git a/bn_fast_s_mp_mul_digs.c b/bn_fast_s_mp_mul_digs.c
index c433dd7..dc0c33e 100644
--- a/bn_fast_s_mp_mul_digs.c
+++ b/bn_fast_s_mp_mul_digs.c
@@ -21,13 +21,20 @@
  * has the effect of making the nested loops that compute the columns very
  * simple and schedulable on super-scalar processors.
  *
+ * This has been modified to produce a variable number of digits of output so
+ * if say only a half-product is required you don't have to compute the upper half
+ * (a feature required for fast Barrett reduction).
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
+ *
  */
 int
 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  int       olduse, res, pa, ix;
-  mp_word   W[512];
+  int     olduse, res, pa, ix;
+  mp_word W[512];
 
+  /* grow the destination as required */
   if (c->alloc < digs) {
     if ((res = mp_grow (c, digs)) != MP_OKAY) {
       return res;
@@ -43,11 +50,9 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
     /* this multiplier has been modified to allow you to control how many digits 
      * of output are produced.  So at most we want to make upto "digs" digits
-     * of output
-     */
-
-
-    /* this adds products to distinct columns (at ix+iy) of W
+     * of output.
+     *
+     * this adds products to distinct columns (at ix+iy) of W
      * note that each step through the loop is not dependent on
      * the previous which means the compiler can easily unroll
      * the loop without scheduling problems
@@ -85,27 +90,30 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   olduse = c->used;
   c->used = digs;
 
+  {
+    register mp_digit *tmpc;
+
+    /* At this point W[] contains the sums of each column.  To get the
+     * correct result we must take the extra bits from each column and
+     * carry them down
+     *
+     * Note that while this adds extra code to the multiplier it saves time
+     * since the carry propagation is removed from the above nested loop.
+     * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
+     * N^2 + N*c where c is the cost of the shifting.  On very small numbers
+     * this is slower but on most cryptographic size numbers it is faster.
+     */
+    tmpc = c->dp;
+    for (ix = 1; ix < digs; ix++) {
+      W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+      *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+    }
+    *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
 
-  /* At this point W[] contains the sums of each column.  To get the
-   * correct result we must take the extra bits from each column and
-   * carry them down
-   *
-   * Note that while this adds extra code to the multiplier it saves time
-   * since the carry propagation is removed from the above nested loop.
-   * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
-   * N^2 + N*c where c is the cost of the shifting.  On very small numbers
-   * this is slower but on most cryptographic size numbers it is faster.
-   */
-
-  for (ix = 1; ix < digs; ix++) {
-    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-    c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-  }
-  c->dp[digs - 1] = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
-
-  /* clear unused */
-  for (; ix < olduse; ix++) {
-    c->dp[ix] = 0;
+    /* clear unused */
+    for (; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
   }
 
   mp_clamp (c);
diff --git a/bn_fast_s_mp_mul_high_digs.c b/bn_fast_s_mp_mul_high_digs.c
index cc30386..3458d96 100644
--- a/bn_fast_s_mp_mul_high_digs.c
+++ b/bn_fast_s_mp_mul_high_digs.c
@@ -20,14 +20,16 @@
  *
  * This is used in the Barrett reduction since for one of the multiplications
  * only the higher digits were needed.  This essentially halves the work.
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
  */
 int
 fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  int       oldused, newused, res, pa, pb, ix;
-  mp_word   W[512];
-
+  int     oldused, newused, res, pa, pb, ix;
+  mp_word W[512];
 
+  /* calculate size of product and allocate more space if required */
   newused = a->used + b->used + 1;
   if (c->alloc < newused) {
     if ((res = mp_grow (c, newused)) != MP_OKAY) {
@@ -38,7 +40,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   /* like the other comba method we compute the columns first */
   pa = a->used;
   pb = b->used;
-  memset (&W[digs], 0, (pa + pb + 1 - digs) * sizeof (mp_word));
+  memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word));
   for (ix = 0; ix < pa; ix++) {
     {
       register mp_digit tmpx, *tmpy;
@@ -75,8 +77,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
     W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
     c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
   }
-  c->dp[(pa + pb + 1) - 1] =
-    (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
+  c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
 
   for (; ix < oldused; ix++) {
     c->dp[ix] = 0;
diff --git a/bn_fast_s_mp_sqr.c b/bn_fast_s_mp_sqr.c
index c4f786d..2b945ba 100644
--- a/bn_fast_s_mp_sqr.c
+++ b/bn_fast_s_mp_sqr.c
@@ -26,14 +26,16 @@
  * "A * B * 2".  The *2 part does not need to be computed until the end which is
  * good because 64-bit shifts are slow!
  *
+ * Based on Algorithm 14.16 on pp.597 of HAC.
  *
  */
 int
 fast_s_mp_sqr (mp_int * a, mp_int * b)
 {
-  int       olduse, newused, res, ix, pa;
-  mp_word   W2[512], W[512];
+  int     olduse, newused, res, ix, pa;
+  mp_word W2[512], W[512];
 
+  /* calculate size of product and allocate as required */
   pa = a->used;
   newused = pa + pa + 1;
   if (b->alloc < newused) {
@@ -51,15 +53,31 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
    * the inner product can be doubled using n doublings instead of
    * n^2
    */
-  memset (W, 0,  newused * sizeof (mp_word));
+  memset (W, 0, newused * sizeof (mp_word));
   memset (W2, 0, newused * sizeof (mp_word));
 
+/* note optimization
+ * values in W2 are only written in even locations which means
+ * we can collapse the array to 256 words [and fixup the memset above]
+ * provided we also fix up the summations below.  Ideally
+ * the fixup loop should be unrolled twice to handle the even/odd 
+ * cases, and then a final step to handle odd cases [e.g. newused == odd]
+ *
+ * This will not only save ~8*256 = 2KB of stack but lower the number of
+ * operations required to finally fix up the columns
+ */
+
   /* This computes the inner product.  To simplify the inner N^2 loop
    * the multiplication by two is done afterwards in the N loop.
    */
   for (ix = 0; ix < pa; ix++) {
-     /* compute the outer product */
-    W2[ix + ix] += ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+    /* compute the outer product 
+     *
+     * Note that every outer product is computed 
+     * for a particular column only once which means that 
+     * there is no need todo a double precision addition
+     */
+    W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
 
     {
       register mp_digit tmpx, *tmpy;
@@ -90,22 +108,25 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
   W[0] += W[0] + W2[0];
 
   /* now compute digits */
-  for (ix = 1; ix < newused; ix++) {
-    /* double/add next digit */
-    W[ix] += W[ix] + W2[ix];
+  {
+    register mp_digit *tmpb;
 
-    W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-    b->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-  }
-  b->dp[(newused) - 1] = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
+    tmpb = b->dp;
 
-  /* clear high */
-  for (; ix < olduse; ix++) {
-    b->dp[ix] = 0;
-  }
+    for (ix = 1; ix < newused; ix++) {
+      /* double/add next digit */
+      W[ix] += W[ix] + W2[ix];
 
-  /* fix the sign (since we no longer make a fresh temp) */
-  b->sign = MP_ZPOS;
+      W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+      *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+    }
+    *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
+
+    /* clear high */
+    for (; ix < olduse; ix++) {
+      *tmpb++ = 0;
+    }
+  }
 
   mp_clamp (b);
   return MP_OKAY;
diff --git a/bn_mp_2expt.c b/bn_mp_2expt.c
index ace1b99..71d04e9 100644
--- a/bn_mp_2expt.c
+++ b/bn_mp_2expt.c
@@ -14,11 +14,15 @@
  */
 #include <tommath.h>
 
-/* computes a = 2^b */
+/* computes a = 2^b 
+ *
+ * Simple algorithm which zeroes the int, grows it then just sets one bit
+ * as required.
+ */
 int
 mp_2expt (mp_int * a, int b)
 {
-  int       res;
+  int     res;
 
   mp_zero (a);
   if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
diff --git a/bn_mp_abs.c b/bn_mp_abs.c
index 299d3b6..9e6956e 100644
--- a/bn_mp_abs.c
+++ b/bn_mp_abs.c
@@ -14,11 +14,14 @@
  */
 #include <tommath.h>
 
-/* b = |a| */
+/* b = |a| 
+ *
+ * Simple function copies the input and fixes the sign to positive
+ */
 int
 mp_abs (mp_int * a, mp_int * b)
 {
-  int       res;
+  int     res;
   if ((res = mp_copy (a, b)) != MP_OKAY) {
     return res;
   }
diff --git a/bn_mp_add.c b/bn_mp_add.c
index a2ad4fd..a8addfb 100644
--- a/bn_mp_add.c
+++ b/bn_mp_add.c
@@ -18,9 +18,9 @@
 int
 mp_add (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       sa, sb, res;
-
+  int     sa, sb, res;
 
+  /* get sign of both inputs */
   sa = a->sign;
   sb = b->sign;
 
diff --git a/bn_mp_add_d.c b/bn_mp_add_d.c
index 0391bc1..1b30fa4 100644
--- a/bn_mp_add_d.c
+++ b/bn_mp_add_d.c
@@ -18,9 +18,8 @@
 int
 mp_add_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  mp_int    t;
-  int       res;
-
+  mp_int  t;
+  int     res;
 
   if ((res = mp_init (&t)) != MP_OKAY) {
     return res;
diff --git a/bn_mp_addmod.c b/bn_mp_addmod.c
index adce5d5..abc3719 100644
--- a/bn_mp_addmod.c
+++ b/bn_mp_addmod.c
@@ -18,9 +18,8 @@
 int
 mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
-  int       res;
-  mp_int    t;
-
+  int     res;
+  mp_int  t;
 
   if ((res = mp_init (&t)) != MP_OKAY) {
     return res;
diff --git a/bn_mp_and.c b/bn_mp_and.c
index d153c44..6c05d68 100644
--- a/bn_mp_and.c
+++ b/bn_mp_and.c
@@ -18,8 +18,8 @@
 int
 mp_and (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       res, ix, px;
-  mp_int    t, *x;
+  int     res, ix, px;
+  mp_int  t, *x;
 
   if (a->used > b->used) {
     if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
diff --git a/bn_mp_clamp.c b/bn_mp_clamp.c
index 77f3e1a..3741f62 100644
--- a/bn_mp_clamp.c
+++ b/bn_mp_clamp.c
@@ -14,7 +14,13 @@
  */
 #include <tommath.h>
 
-/* trim unused digits */
+/* trim unused digits 
+ *
+ * This is used to ensure that leading zero digits are
+ * trimed and the leading "used" digit will be non-zero
+ * Typically very fast.  Also fixes the sign if there
+ * are no more leading digits
+ */
 void
 mp_clamp (mp_int * a)
 {
diff --git a/bn_mp_cmp.c b/bn_mp_cmp.c
index 527d35c..ca0c463 100644
--- a/bn_mp_cmp.c
+++ b/bn_mp_cmp.c
@@ -18,13 +18,11 @@
 int
 mp_cmp (mp_int * a, mp_int * b)
 {
-  int       res;
   /* compare based on sign */
   if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
     return MP_LT;
   } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
     return MP_GT;
   }
-  res = mp_cmp_mag (a, b);
-  return res;
+  return mp_cmp_mag (a, b);
 }
diff --git a/bn_mp_cmp_mag.c b/bn_mp_cmp_mag.c
index efe5b2b..6d4a02d 100644
--- a/bn_mp_cmp_mag.c
+++ b/bn_mp_cmp_mag.c
@@ -18,8 +18,7 @@
 int
 mp_cmp_mag (mp_int * a, mp_int * b)
 {
-  int       n;
-
+  int     n;
 
   /* compare based on # of non-zero digits */
   if (a->used > b->used) {
diff --git a/bn_mp_copy.c b/bn_mp_copy.c
index a502620..68705a4 100644
--- a/bn_mp_copy.c
+++ b/bn_mp_copy.c
@@ -18,8 +18,7 @@
 int
 mp_copy (mp_int * a, mp_int * b)
 {
-  int       res, n;
-
+  int     res, n;
 
   /* if dst == src do nothing */
   if (a == b || a->dp == b->dp) {
@@ -35,14 +34,21 @@ mp_copy (mp_int * a, mp_int * b)
   b->used = a->used;
   b->sign = a->sign;
 
-  /* copy all the digits */
-  for (n = 0; n < a->used; n++) {
-    b->dp[n] = a->dp[n];
-  }
+  {
+    register mp_digit *tmpa, *tmpb;
+
+    tmpa = a->dp;
+    tmpb = b->dp;
+
+    /* copy all the digits */
+    for (n = 0; n < a->used; n++) {
+      *tmpb++ = *tmpa++;
+    }
 
-  /* clear high digits */
-  for (n = b->used; n < b->alloc; n++) {
-    b->dp[n] = 0;
+    /* clear high digits */
+    for (; n < b->alloc; n++) {
+      *tmpb++ = 0;
+    }
   }
   return MP_OKAY;
 }
diff --git a/bn_mp_count_bits.c b/bn_mp_count_bits.c
index 3f6ed28..09992d4 100644
--- a/bn_mp_count_bits.c
+++ b/bn_mp_count_bits.c
@@ -18,8 +18,8 @@
 int
 mp_count_bits (mp_int * a)
 {
-  int       r;
-  mp_digit  q;
+  int     r;
+  mp_digit q;
 
   if (a->used == 0) {
     return 0;
diff --git a/bn_mp_div.c b/bn_mp_div.c
index b324d2d..3954c9f 100644
--- a/bn_mp_div.c
+++ b/bn_mp_div.c
@@ -26,8 +26,8 @@
 int
 mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
-  mp_int    q, x, y, t1, t2;
-  int       res, n, t, i, norm, neg;
+  mp_int  q, x, y, t1, t2;
+  int     res, n, t, i, norm, neg;
 
 
   /* is divisor zero ? */
@@ -75,13 +75,12 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 
   /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
   norm = 0;
-  while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) ==
-	 ((mp_digit) 0)) {
+  while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) == ((mp_digit) 0)) {
     ++norm;
-    if ((res = mp_mul_2d (&x, 1, &x)) != MP_OKAY) {
+    if ((res = mp_mul_2 (&x, &x)) != MP_OKAY) {
       goto __Y;
     }
-    if ((res = mp_mul_2d (&y, 1, &y)) != MP_OKAY) {
+    if ((res = mp_mul_2 (&y, &y)) != MP_OKAY) {
       goto __Y;
     }
   }
@@ -114,7 +113,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
     if (x.dp[i] == y.dp[t]) {
       q.dp[i - t - 1] = ((1UL << DIGIT_BIT) - 1UL);
     } else {
-      mp_word   tmp;
+      mp_word tmp;
       tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
       tmp |= ((mp_word) x.dp[i - 1]);
       tmp /= ((mp_word) y.dp[t]);
@@ -142,8 +141,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
       t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
       t2.dp[2] = x.dp[i];
       t2.used = 3;
-    }
-    while (mp_cmp (&t1, &t2) == MP_GT);
+    } while (mp_cmp (&t1, &t2) == MP_GT);
 
     /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
     if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
diff --git a/bn_mp_div_2.c b/bn_mp_div_2.c
index bc7dc28..284f330 100644
--- a/bn_mp_div_2.c
+++ b/bn_mp_div_2.c
@@ -18,20 +18,33 @@
 int
 mp_div_2 (mp_int * a, mp_int * b)
 {
-  mp_digit  r, rr;
-  int       x, res;
-
+  int     x, res, oldused;
 
   /* copy */
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
+  if (b->alloc < a->used) {
+    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+      return res;
+    }
   }
 
-  r = 0;
-  for (x = b->used - 1; x >= 0; x--) {
-    rr = b->dp[x] & 1;
-    b->dp[x] = (b->dp[x] >> 1) | (r << (DIGIT_BIT - 1));
-    r = rr;
+  oldused = b->used;
+  b->used = a->used;
+  {
+    register mp_digit r, rr, *tmpa, *tmpb;
+
+    tmpa = a->dp + b->used - 1;
+    tmpb = b->dp + b->used - 1;
+    r = 0;
+    for (x = b->used - 1; x >= 0; x--) {
+      rr = *tmpa & 1;
+      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+      r = rr;
+    }
+
+    tmpb = b->dp + b->used;
+    for (x = b->used; x < oldused; x++) {
+      *tmpb++ = 0;
+    }
   }
   mp_clamp (b);
   return MP_OKAY;
diff --git a/bn_mp_div_2d.c b/bn_mp_div_2d.c
index 3bfa5aa..9d24e10 100644
--- a/bn_mp_div_2d.c
+++ b/bn_mp_div_2d.c
@@ -18,9 +18,9 @@
 int
 mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
 {
-  mp_digit  D, r, rr;
-  int       x, res;
-  mp_int    t;
+  mp_digit D, r, rr;
+  int     x, res;
+  mp_int  t;
 
 
   /* if the shift count is <= 0 then we do no work */
diff --git a/bn_mp_div_d.c b/bn_mp_div_d.c
index 321f030..e43802a 100644
--- a/bn_mp_div_d.c
+++ b/bn_mp_div_d.c
@@ -18,8 +18,8 @@
 int
 mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
 {
-  mp_int    t, t2;
-  int       res;
+  mp_int  t, t2;
+  int     res;
 
 
   if ((res = mp_init (&t)) != MP_OKAY) {
diff --git a/bn_mp_exch.c b/bn_mp_exch.c
index 44e7087..2ccaf9e 100644
--- a/bn_mp_exch.c
+++ b/bn_mp_exch.c
@@ -17,7 +17,7 @@
 void
 mp_exch (mp_int * a, mp_int * b)
 {
-  mp_int    t;
+  mp_int  t;
 
   t = *a;
   *a = *b;
diff --git a/bn_mp_expt_d.c b/bn_mp_expt_d.c
index 80f9578..e5106be 100644
--- a/bn_mp_expt_d.c
+++ b/bn_mp_expt_d.c
@@ -17,8 +17,8 @@
 int
 mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  int       res, x;
-  mp_int    g;
+  int     res, x;
+  mp_int  g;
 
 
   if ((res = mp_init_copy (&g, a)) != MP_OKAY) {
diff --git a/bn_mp_exptmod.c b/bn_mp_exptmod.c
index ce81953..ea58a4f 100644
--- a/bn_mp_exptmod.c
+++ b/bn_mp_exptmod.c
@@ -14,19 +14,30 @@
  */
 #include <tommath.h>
 
+static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
+
+/* this is a shell function that calls either the normal or Montgomery
+ * exptmod functions.  Originally the call to the montgomery code was 
+ * embedded in the normal function but that wasted alot of stack space
+ * for nothing (since 99% of the time the Montgomery code would be called)
+ */
 int
 mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 {
-  mp_int    M[256], res, mu;
-  mp_digit  buf;
-  int       err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-
-
   /* if the modulus is odd use the fast method */
   if (mp_isodd (P) == 1 && P->used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) {
-    err = mp_exptmod_fast (G, X, P, Y);
-    return err;
+    return mp_exptmod_fast (G, X, P, Y);
+  } else {
+    return f_mp_exptmod (G, X, P, Y);
   }
+}
+
+static int
+f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  mp_int  M[256], res, mu;
+  mp_digit buf;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
 
   /* find window size */
   x = mp_count_bits (X);
@@ -80,9 +91,7 @@ mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
   }
 
   for (x = 0; x < (winsize - 1); x++) {
-    if ((err =
-	 mp_sqr (&M[1 << (winsize - 1)],
-		 &M[1 << (winsize - 1)])) != MP_OKAY) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
       goto __MU;
     }
     if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
diff --git a/bn_mp_exptmod_fast.c b/bn_mp_exptmod_fast.c
index bd25b3e..fe0dfcb 100644
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@@ -24,9 +24,9 @@
 int
 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 {
-  mp_int    M[256], res;
-  mp_digit  buf, mp;
-  int       err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  mp_int  M[256], res;
+  mp_digit buf, mp;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
 
   /* find window size */
   x = mp_count_bits (X);
@@ -48,7 +48,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 
   /* init G array */
   for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
+    if ((err = mp_init (&M[x])) != MP_OKAY) {
       for (y = 0; y < x; y++) {
 	mp_clear (&M[y]);
       }
@@ -66,44 +66,32 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
     goto __RES;
   }
 
-  /* now we need R mod m */
-  if ((err = mp_2expt (&res, P->used * DIGIT_BIT)) != MP_OKAY) {
-    goto __RES;
-  }
-
-  /* res = R mod m (can use modified double/subtract ...) */
-  if ((err = mp_mod (&res, P, &res)) != MP_OKAY) {
-    goto __RES;
-  }
-
   /* create M table
    *
    * The M table contains powers of the input base, e.g. M[x] = G^x mod P
    *
    * The first half of the table is not computed though accept for M[0] and M[1]
    */
-  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+
+  /* now we need R mod m */
+  if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
     goto __RES;
   }
 
   /* now set M[1] to G * R mod m */
-  if ((err = mp_mulmod (&M[1], &res, P, &M[1])) != MP_OKAY) {
+  if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
     goto __RES;
   }
-
   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
     goto __RES;
   }
 
   for (x = 0; x < (winsize - 1); x++) {
-    if ((err =
-	 mp_sqr (&M[1 << (winsize - 1)],
-		 &M[1 << (winsize - 1)])) != MP_OKAY) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
       goto __RES;
     }
-    if ((err =
-	 mp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
+    if ((err = mp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
       goto __RES;
     }
   }
diff --git a/bn_mp_gcd.c b/bn_mp_gcd.c
index e2594b5..35b287e 100644
--- a/bn_mp_gcd.c
+++ b/bn_mp_gcd.c
@@ -19,8 +19,8 @@
 int
 mp_gcd (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int    u, v, t;
-  int       k, res, neg;
+  mp_int  u, v, t;
+  int     k, res, neg;
 
 
   /* either zero than gcd is the largest */
@@ -57,10 +57,10 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
   k = 0;
   while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) {
     ++k;
-    if ((res = mp_div_2d (&u, 1, &u, NULL)) != MP_OKAY) {
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
       goto __T;
     }
-    if ((res = mp_div_2d (&v, 1, &v, NULL)) != MP_OKAY) {
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
       goto __T;
     }
   }
@@ -80,7 +80,7 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
   do {
     /* B3 (and B4).  Halve t, if even */
     while (t.used != 0 && (t.dp[0] & 1) == 0) {
-      if ((res = mp_div_2d (&t, 1, &t, NULL)) != MP_OKAY) {
+      if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
 	goto __T;
       }
     }
diff --git a/bn_mp_grow.c b/bn_mp_grow.c
index 2a12369..91c1867 100644
--- a/bn_mp_grow.c
+++ b/bn_mp_grow.c
@@ -18,8 +18,7 @@
 int
 mp_grow (mp_int * a, int size)
 {
-  int       i, n;
-
+  int     i, n;
 
   /* if the alloc size is smaller alloc more ram */
   if (a->alloc < size) {
diff --git a/bn_mp_init_copy.c b/bn_mp_init_copy.c
index b3f25ee..f79d2b1 100644
--- a/bn_mp_init_copy.c
+++ b/bn_mp_init_copy.c
@@ -18,11 +18,10 @@
 int
 mp_init_copy (mp_int * a, mp_int * b)
 {
-  int       res;
+  int     res;
 
   if ((res = mp_init (a)) != MP_OKAY) {
     return res;
   }
-  res = mp_copy (b, a);
-  return res;
+  return mp_copy (b, a);
 }
diff --git a/bn_mp_invmod.c b/bn_mp_invmod.c
index 1051eb0..006efd2 100644
--- a/bn_mp_invmod.c
+++ b/bn_mp_invmod.c
@@ -17,9 +17,8 @@
 int
 mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int    x, y, u, v, A, B, C, D;
-  int       res;
-
+  mp_int  x, y, u, v, A, B, C, D;
+  int     res;
 
   /* b cannot be negative */
   if (b->sign == MP_NEG) {
@@ -28,8 +27,7 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 
   /* if the modulus is odd we can use a faster routine instead */
   if (mp_iseven (b) == 0) {
-    res = fast_mp_invmod (a, b, c);
-    return res;
+    return fast_mp_invmod (a, b, c);
   }
 
   if ((res = mp_init (&x)) != MP_OKAY) {
diff --git a/bn_mp_jacobi.c b/bn_mp_jacobi.c
index b97d5f3..95aee42 100644
--- a/bn_mp_jacobi.c
+++ b/bn_mp_jacobi.c
@@ -20,9 +20,9 @@
 int
 mp_jacobi (mp_int * a, mp_int * n, int *c)
 {
-  mp_int    a1, n1, e;
-  int       s, r, res;
-  mp_digit  residue;
+  mp_int  a1, n1, e;
+  int     s, r, res;
+  mp_digit residue;
 
   /* step 1.  if a == 0, return 0 */
   if (mp_iszero (a) == 1) {
diff --git a/bn_mp_karatsuba_mul.c b/bn_mp_karatsuba_mul.c
index 0a8c34e..bee8eaa 100644
--- a/bn_mp_karatsuba_mul.c
+++ b/bn_mp_karatsuba_mul.c
@@ -36,8 +36,8 @@
 int
 mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int    x0, x1, y0, y1, t1, t2, x0y0, x1y1;
-  int       B, err, x;
+  mp_int  x0, x1, y0, y1, t1, t2, x0y0, x1y1;
+  int     B, err, x;
 
 
   err = MP_MEM;
diff --git a/bn_mp_karatsuba_sqr.c b/bn_mp_karatsuba_sqr.c
index 29396ba..3078588 100644
--- a/bn_mp_karatsuba_sqr.c
+++ b/bn_mp_karatsuba_sqr.c
@@ -22,8 +22,8 @@
 int
 mp_karatsuba_sqr (mp_int * a, mp_int * b)
 {
-  mp_int    x0, x1, t1, t2, x0x0, x1x1;
-  int       B, err, x;
+  mp_int  x0, x1, t1, t2, x0x0, x1x1;
+  int     B, err, x;
 
 
   err = MP_MEM;
diff --git a/bn_mp_lcm.c b/bn_mp_lcm.c
index 7d38135..60d5461 100644
--- a/bn_mp_lcm.c
+++ b/bn_mp_lcm.c
@@ -18,8 +18,8 @@
 int
 mp_lcm (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       res;
-  mp_int    t;
+  int     res;
+  mp_int  t;
 
 
   if ((res = mp_init (&t)) != MP_OKAY) {
diff --git a/bn_mp_lshd.c b/bn_mp_lshd.c
index ea02409..44b0588 100644
--- a/bn_mp_lshd.c
+++ b/bn_mp_lshd.c
@@ -18,7 +18,7 @@
 int
 mp_lshd (mp_int * a, int b)
 {
-  int       x, res;
+  int     x, res;
 
 
   /* if its less than zero return */
diff --git a/bn_mp_mod.c b/bn_mp_mod.c
index 5b208a6..c4a7374 100644
--- a/bn_mp_mod.c
+++ b/bn_mp_mod.c
@@ -18,8 +18,8 @@
 int
 mp_mod (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int    t;
-  int       res;
+  mp_int  t;
+  int     res;
 
 
   if ((res = mp_init (&t)) != MP_OKAY) {
diff --git a/bn_mp_mod_2d.c b/bn_mp_mod_2d.c
index df73612..4c6f1f1 100644
--- a/bn_mp_mod_2d.c
+++ b/bn_mp_mod_2d.c
@@ -18,7 +18,7 @@
 int
 mp_mod_2d (mp_int * a, int b, mp_int * c)
 {
-  int       x, res;
+  int     x, res;
 
 
   /* if b is <= 0 then zero the int */
@@ -44,8 +44,7 @@ mp_mod_2d (mp_int * a, int b, mp_int * c)
   }
   /* clear the digit that is not completely outside/inside the modulus */
   c->dp[b / DIGIT_BIT] &=
-    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) -
-		((mp_digit) 1));
+    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
   mp_clamp (c);
   return MP_OKAY;
 }
diff --git a/bn_mp_mod_d.c b/bn_mp_mod_d.c
index 7b08d23..b557381 100644
--- a/bn_mp_mod_d.c
+++ b/bn_mp_mod_d.c
@@ -17,8 +17,8 @@
 int
 mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
 {
-  mp_int    t, t2;
-  int       res;
+  mp_int  t, t2;
+  int     res;
 
 
   if ((res = mp_init (&t)) != MP_OKAY) {
diff --git a/bn_mp_montgomery_calc_normalization.c b/bn_mp_montgomery_calc_normalization.c
new file mode 100644
index 0000000..06252de
--- /dev/null
+++ b/bn_mp_montgomery_calc_normalization.c
@@ -0,0 +1,53 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* calculates a = B^n mod b for Montgomery reduction
+ * Where B is the base [e.g. 2^DIGIT_BIT].  
+ * B^n mod b is computed by first computing
+ * A = B^(n-1) which doesn't require a reduction but a simple OR.
+ * then C = A * B = B^n is computed by performing upto DIGIT_BIT 
+ * shifts with subtractions when the result is greater than b.
+ *
+ * The method is slightly modified to shift B unconditionally upto just under
+ * the leading bit of b.  This saves alot of multiple precision shifting.
+ */
+int
+mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
+{
+  int     x, bits, res;
+
+  /* how many bits of last digit does b use */
+  bits = mp_count_bits (b) % DIGIT_BIT;
+
+  /* compute A = B^(n-1) * 2^(bits-1) */
+  if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
+    return res;
+  }
+
+  /* now compute C = A * B mod b */
+  for (x = bits - 1; x < DIGIT_BIT; x++) {
+    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
+      return res;
+    }
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
+	return res;
+      }
+    }
+  }
+
+  return MP_OKAY;
+}
diff --git a/bn_mp_montgomery_reduce.c b/bn_mp_montgomery_reduce.c
index aeb2cde..586142a 100644
--- a/bn_mp_montgomery_reduce.c
+++ b/bn_mp_montgomery_reduce.c
@@ -18,14 +18,13 @@
 int
 mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 {
-  int       ix, res, digs;
-  mp_digit  ui;
+  int     ix, res, digs;
+  mp_digit ui;
 
   digs = m->used * 2 + 1;
   if ((digs < 512)
       && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    res = fast_mp_montgomery_reduce (a, m, mp);
-    return res;
+    return fast_mp_montgomery_reduce (a, m, mp);
   }
 
   if (a->alloc < m->used * 2 + 1) {
@@ -51,9 +50,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 
       mu = 0;
       for (iy = 0; iy < m->used; iy++) {
-	r =
-	  ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) +
-	  ((mp_word) * tmpy);
+	r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
 	mu = (r >> ((mp_word) DIGIT_BIT));
 	*tmpy++ = (r & ((mp_word) MP_MASK));
       }
@@ -71,9 +68,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 
   /* if A >= m then A = A - m */
   if (mp_cmp_mag (a, m) != MP_LT) {
-    if ((res = s_mp_sub (a, m, a)) != MP_OKAY) {
-      return res;
-    }
+    return s_mp_sub (a, m, a);
   }
 
   return MP_OKAY;
diff --git a/bn_mp_montgomery_setup.c b/bn_mp_montgomery_setup.c
index 601e74c..c739895 100644
--- a/bn_mp_montgomery_setup.c
+++ b/bn_mp_montgomery_setup.c
@@ -18,8 +18,8 @@
 int
 mp_montgomery_setup (mp_int * a, mp_digit * mp)
 {
-  mp_int    t, tt;
-  int       res;
+  mp_int  t, tt;
+  int     res;
 
   if ((res = mp_init (&t)) != MP_OKAY) {
     return res;
diff --git a/bn_mp_mul.c b/bn_mp_mul.c
index cfa1467..064fc85 100644
--- a/bn_mp_mul.c
+++ b/bn_mp_mul.c
@@ -18,12 +18,26 @@
 int
 mp_mul (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       res, neg;
+  int     res, neg;
   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
   if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
     res = mp_karatsuba_mul (a, b, c);
   } else {
-    res = s_mp_mul (a, b, c);
+
+    /* can we use the fast multiplier? 
+     *
+     * The fast multiplier can be used if the output will have less than 
+     * 512 digits and the number of digits won't affect carry propagation
+     */
+    int     digs = a->used + b->used + 1;
+
+    if ((digs < 512)
+	&& digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+      res = fast_s_mp_mul_digs (a, b, c, digs);
+    } else {
+      res = s_mp_mul (a, b, c);
+    }
+
   }
   c->sign = neg;
   return res;
diff --git a/bn_mp_mul_2.c b/bn_mp_mul_2.c
index dd5ecca..8174d0a 100644
--- a/bn_mp_mul_2.c
+++ b/bn_mp_mul_2.c
@@ -18,27 +18,48 @@
 int
 mp_mul_2 (mp_int * a, mp_int * b)
 {
-  mp_digit  r, rr;
-  int       x, res;
+  int     x, res, oldused;
 
+  /* Optimization: should copy and shift at the same time */
 
-  /* copy */
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
+  if (b->alloc < a->used) {
+    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+      return res;
+    }
   }
 
-  if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
-    return res;
-  }
-  ++b->used;
+  oldused = b->used;
+  b->used = a->used;
 
   /* shift any bit count < DIGIT_BIT */
-  r = 0;
-  for (x = 0; x < b->used; x++) {
-    rr = (b->dp[x] >> (DIGIT_BIT - 1)) & 1;
-    b->dp[x] = ((b->dp[x] << 1) | r) & MP_MASK;
-    r = rr;
+  {
+    register mp_digit r, rr, *tmpa, *tmpb;
+
+    r = 0;
+    tmpa = a->dp;
+    tmpb = b->dp;
+    for (x = 0; x < b->used; x++) {
+      rr = *tmpa >> (DIGIT_BIT - 1);
+      *tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK;
+      r = rr;
+    }
+
+    /* new leading digit? */
+    if (r != 0) {
+      if (b->alloc == b->used) {
+	if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
+	  return res;
+	}
+      }
+      /* add a MSB of 1 */
+      *tmpb = 1;
+      ++b->used;
+    }
+
+    tmpb = b->dp + b->used;
+    for (x = b->used; x < oldused; x++) {
+      *tmpb++ = 0;
+    }
   }
-  mp_clamp (b);
   return MP_OKAY;
 }
diff --git a/bn_mp_mul_2d.c b/bn_mp_mul_2d.c
index 7823eb9..97ee26c 100644
--- a/bn_mp_mul_2d.c
+++ b/bn_mp_mul_2d.c
@@ -18,8 +18,8 @@
 int
 mp_mul_2d (mp_int * a, int b, mp_int * c)
 {
-  mp_digit  d, r, rr;
-  int       x, res;
+  mp_digit d, r, rr;
+  int     x, res;
 
 
   /* copy */
diff --git a/bn_mp_mul_d.c b/bn_mp_mul_d.c
index bced9b7..164bcac 100644
--- a/bn_mp_mul_d.c
+++ b/bn_mp_mul_d.c
@@ -18,29 +18,40 @@
 int
 mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  int       res, pa, ix;
-  mp_word   r;
-  mp_digit  u;
-  mp_int    t;
-
+  int     res, pa, olduse;
 
   pa = a->used;
-  if ((res = mp_init_size (&t, pa + 2)) != MP_OKAY) {
-    return res;
+  if (c->alloc < pa + 1) {
+    if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
+      return res;
+    }
   }
-  t.used = pa + 2;
 
-  u = 0;
-  for (ix = 0; ix < pa; ix++) {
-    r = ((mp_word) u) + ((mp_word) a->dp[ix]) * ((mp_word) b);
-    t.dp[ix] = (mp_digit) (r & ((mp_word) MP_MASK));
-    u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+  olduse = c->used;
+  c->used = pa + 1;
+
+  {
+    register mp_digit u, *tmpa, *tmpc;
+    register mp_word r;
+    register int ix;
+
+    tmpc = c->dp + c->used;
+    for (ix = c->used; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
+
+    tmpa = a->dp;
+    tmpc = c->dp;
+
+    u = 0;
+    for (ix = 0; ix < pa; ix++) {
+      r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
+      *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    *tmpc = u;
   }
-  t.dp[ix] = u;
 
-  t.sign = a->sign;
-  mp_clamp (&t);
-  mp_exch (&t, c);
-  mp_clear (&t);
+  mp_clamp (c);
   return MP_OKAY;
 }
diff --git a/bn_mp_mulmod.c b/bn_mp_mulmod.c
index 2cdbdda..abdf77b 100644
--- a/bn_mp_mulmod.c
+++ b/bn_mp_mulmod.c
@@ -18,8 +18,8 @@
 int
 mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
-  int       res;
-  mp_int    t;
+  int     res;
+  mp_int  t;
 
 
   if ((res = mp_init (&t)) != MP_OKAY) {
diff --git a/bn_mp_n_root.c b/bn_mp_n_root.c
index 72b3a8c..eb49b3f 100644
--- a/bn_mp_n_root.c
+++ b/bn_mp_n_root.c
@@ -17,12 +17,16 @@
 /* find the n'th root of an integer 
  *
  * Result found such that (c)^b <= a and (c+1)^b > a 
+ *
+ * This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i]) 
+ * which will find the root in log(N) time where each step involves a fair bit.  This
+ * is not meant to find huge roots [square and cube at most].
  */
 int
 mp_n_root (mp_int * a, mp_digit b, mp_int * c)
 {
-  mp_int    t1, t2, t3;
-  int       res, neg;
+  mp_int  t1, t2, t3;
+  int     res, neg;
 
   /* input must be positive if b is even */
   if ((b & 1) == 0 && a->sign == MP_NEG) {
diff --git a/bn_mp_neg.c b/bn_mp_neg.c
index a2bb7c4..fd2e497 100644
--- a/bn_mp_neg.c
+++ b/bn_mp_neg.c
@@ -18,7 +18,7 @@
 int
 mp_neg (mp_int * a, mp_int * b)
 {
-  int       res;
+  int     res;
   if ((res = mp_copy (a, b)) != MP_OKAY) {
     return res;
   }
diff --git a/bn_mp_or.c b/bn_mp_or.c
index a3843d9..e821bac 100644
--- a/bn_mp_or.c
+++ b/bn_mp_or.c
@@ -18,8 +18,8 @@
 int
 mp_or (mp_int * a, mp_int * b, mp_int * c)
 {
-  int res, ix, px;
-  mp_int t, *x;
+  int     res, ix, px;
+  mp_int  t, *x;
 
   if (a->used > b->used) {
     if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
diff --git a/bn_mp_rand.c b/bn_mp_rand.c
index c72dec9..dc13534 100644
--- a/bn_mp_rand.c
+++ b/bn_mp_rand.c
@@ -18,8 +18,8 @@
 int
 mp_rand (mp_int * a, int digits)
 {
-  int       res;
-  mp_digit  d;
+  int     res;
+  mp_digit d;
 
   mp_zero (a);
   if (digits <= 0) {
@@ -27,19 +27,20 @@ mp_rand (mp_int * a, int digits)
   }
 
   /* first place a random non-zero digit */
-  d = ((mp_digit) abs (rand ()));
-  d = d == 0 ? 1 : d;
+  do {
+    d = ((mp_digit) abs (rand ()));
+  } while (d == 0);
 
   if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
     return res;
   }
 
-
   while (digits-- > 0) {
-    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
+    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
       return res;
     }
-    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
+
+    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
       return res;
     }
   }
diff --git a/bn_mp_read_signed_bin.c b/bn_mp_read_signed_bin.c
index b5af4c0..8a9df88 100644
--- a/bn_mp_read_signed_bin.c
+++ b/bn_mp_read_signed_bin.c
@@ -18,7 +18,7 @@
 int
 mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
 {
-  int       res;
+  int     res;
 
   if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) {
     return res;
diff --git a/bn_mp_read_unsigned_bin.c b/bn_mp_read_unsigned_bin.c
index 726b574..16e2f29 100644
--- a/bn_mp_read_unsigned_bin.c
+++ b/bn_mp_read_unsigned_bin.c
@@ -18,7 +18,7 @@
 int
 mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
 {
-  int       res;
+  int     res;
   mp_zero (a);
   while (c-- > 0) {
     if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
diff --git a/bn_mp_reduce.c b/bn_mp_reduce.c
index be5d18e..8f15458 100644
--- a/bn_mp_reduce.c
+++ b/bn_mp_reduce.c
@@ -20,7 +20,7 @@
 int
 mp_reduce_setup (mp_int * a, mp_int * b)
 {
-  int       res;
+  int     res;
 
 
   if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
@@ -36,8 +36,8 @@ mp_reduce_setup (mp_int * a, mp_int * b)
 int
 mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
 {
-  mp_int    q;
-  int       res, um = m->used;
+  mp_int  q;
+  int     res, um = m->used;
 
 
   if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
diff --git a/bn_mp_rshd.c b/bn_mp_rshd.c
index 8b37d09..39e631e 100644
--- a/bn_mp_rshd.c
+++ b/bn_mp_rshd.c
@@ -18,7 +18,7 @@
 void
 mp_rshd (mp_int * a, int b)
 {
-  int       x;
+  int     x;
 
 
   /* if b <= 0 then ignore it */
diff --git a/bn_mp_set_int.c b/bn_mp_set_int.c
index a690bb4..f22ab69 100644
--- a/bn_mp_set_int.c
+++ b/bn_mp_set_int.c
@@ -18,7 +18,7 @@
 int
 mp_set_int (mp_int * a, unsigned long b)
 {
-  int       x, res;
+  int     x, res;
 
   mp_zero (a);
 
diff --git a/bn_mp_sqr.c b/bn_mp_sqr.c
index 2ba877c..c8b5cb7 100644
--- a/bn_mp_sqr.c
+++ b/bn_mp_sqr.c
@@ -18,11 +18,18 @@
 int
 mp_sqr (mp_int * a, mp_int * b)
 {
-  int       res;
+  int     res;
   if (a->used > KARATSUBA_SQR_CUTOFF) {
     res = mp_karatsuba_sqr (a, b);
   } else {
-    res = s_mp_sqr (a, b);
+
+    /* can we use the fast multiplier? */
+    if (((a->used * 2 + 1) < 512)
+	&& a->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
+      res = fast_s_mp_sqr (a, b);
+    } else {
+      res = s_mp_sqr (a, b);
+    }
   }
   b->sign = MP_ZPOS;
   return res;
diff --git a/bn_mp_sqrmod.c b/bn_mp_sqrmod.c
index 66135d4..44f608f 100644
--- a/bn_mp_sqrmod.c
+++ b/bn_mp_sqrmod.c
@@ -18,8 +18,8 @@
 int
 mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       res;
-  mp_int    t;
+  int     res;
+  mp_int  t;
 
 
   if ((res = mp_init (&t)) != MP_OKAY) {
diff --git a/bn_mp_sub.c b/bn_mp_sub.c
index 045dee5..1366c55 100644
--- a/bn_mp_sub.c
+++ b/bn_mp_sub.c
@@ -18,7 +18,7 @@
 int
 mp_sub (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       sa, sb, res;
+  int     sa, sb, res;
 
 
   sa = a->sign;
diff --git a/bn_mp_sub_d.c b/bn_mp_sub_d.c
index 9839d5e..aebc414 100644
--- a/bn_mp_sub_d.c
+++ b/bn_mp_sub_d.c
@@ -18,8 +18,8 @@
 int
 mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  mp_int    t;
-  int       res;
+  mp_int  t;
+  int     res;
 
 
   if ((res = mp_init (&t)) != MP_OKAY) {
diff --git a/bn_mp_submod.c b/bn_mp_submod.c
index b56d921..16fee71 100644
--- a/bn_mp_submod.c
+++ b/bn_mp_submod.c
@@ -18,8 +18,8 @@
 int
 mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
-  int       res;
-  mp_int    t;
+  int     res;
+  mp_int  t;
 
 
   if ((res = mp_init (&t)) != MP_OKAY) {
diff --git a/bn_mp_to_signed_bin.c b/bn_mp_to_signed_bin.c
index b00cf8f..41abac1 100644
--- a/bn_mp_to_signed_bin.c
+++ b/bn_mp_to_signed_bin.c
@@ -18,7 +18,7 @@
 int
 mp_to_signed_bin (mp_int * a, unsigned char *b)
 {
-  int       res;
+  int     res;
 
   if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) {
     return res;
diff --git a/bn_mp_to_unsigned_bin.c b/bn_mp_to_unsigned_bin.c
index b122555..eec9f75 100644
--- a/bn_mp_to_unsigned_bin.c
+++ b/bn_mp_to_unsigned_bin.c
@@ -18,8 +18,8 @@
 int
 mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 {
-  int       x, res;
-  mp_int    t;
+  int     x, res;
+  mp_int  t;
 
   if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
     return res;
diff --git a/bn_mp_unsigned_bin_size.c b/bn_mp_unsigned_bin_size.c
index 1c92b5c..bee88e6 100644
--- a/bn_mp_unsigned_bin_size.c
+++ b/bn_mp_unsigned_bin_size.c
@@ -18,6 +18,6 @@
 int
 mp_unsigned_bin_size (mp_int * a)
 {
-  int       size = mp_count_bits (a);
+  int     size = mp_count_bits (a);
   return (size / 8 + ((size & 7) != 0 ? 1 : 0));
 }
diff --git a/bn_mp_xor.c b/bn_mp_xor.c
index c8e9f43..4a2ff9b 100644
--- a/bn_mp_xor.c
+++ b/bn_mp_xor.c
@@ -18,8 +18,8 @@
 int
 mp_xor (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       res, ix, px;
-  mp_int    t, *x;
+  int     res, ix, px;
+  mp_int  t, *x;
 
   if (a->used > b->used) {
     if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
diff --git a/bn_radix.c b/bn_radix.c
index 1fc4b35..205c148 100644
--- a/bn_radix.c
+++ b/bn_radix.c
@@ -15,16 +15,15 @@
 #include <tommath.h>
 
 /* chars used in radix conversions */
-static const char *s_rmap =
-  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
 
 
 /* read a string [ASCII] in a given radix */
 int
 mp_read_radix (mp_int * a, char *str, int radix)
 {
-  int       y, res, neg;
-  char      ch;
+  int     y, res, neg;
+  char    ch;
 
   if (radix < 2 || radix > 64) {
     return MP_VAL;
@@ -66,10 +65,10 @@ mp_read_radix (mp_int * a, char *str, int radix)
 int
 mp_toradix (mp_int * a, char *str, int radix)
 {
-  int       res, digs;
-  mp_int    t;
-  mp_digit  d;
-  char     *_s = str;
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
 
   if (radix < 2 || radix > 64) {
     return MP_VAL;
@@ -104,9 +103,9 @@ mp_toradix (mp_int * a, char *str, int radix)
 int
 mp_radix_size (mp_int * a, int radix)
 {
-  int       res, digs;
-  mp_int    t;
-  mp_digit  d;
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
 
   /* special case for binary */
   if (radix == 2) {
diff --git a/bn_reverse.c b/bn_reverse.c
index 10c2375..50109d7 100644
--- a/bn_reverse.c
+++ b/bn_reverse.c
@@ -18,7 +18,7 @@
 void
 bn_reverse (unsigned char *s, int len)
 {
-  int       ix, iy;
+  int     ix, iy;
   unsigned char t;
 
   ix = 0;
diff --git a/bn_s_mp_add.c b/bn_s_mp_add.c
index 369a0e1..328ec06 100644
--- a/bn_s_mp_add.c
+++ b/bn_s_mp_add.c
@@ -18,10 +18,8 @@
 int
 s_mp_add (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int   *x;
-  int       olduse, res, min, max, i;
-  mp_digit  u;
-
+  mp_int *x;
+  int     olduse, res, min, max;
 
   /* find sizes, we let |a| <= |b| which means we have to sort
    * them.  "x" will point to the input with the most digits
@@ -52,38 +50,48 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
   /* add digits from lower part */
 
   /* set the carry to zero */
-  u = 0;
-  for (i = 0; i < min; i++) {
-    /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
-    c->dp[i] = a->dp[i] + b->dp[i] + u;
-
-    /* U = carry bit of T[i] */
-    u = (c->dp[i] >> DIGIT_BIT) & 1;
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;
 
-    /* take away carry bit from T[i] */
-    c->dp[i] &= MP_MASK;
-  }
+    /* alias for digit pointers */
+    tmpa = a->dp;
+    tmpb = b->dp;
+    tmpc = c->dp;
 
-  /* now copy higher words if any, that is in A+B if A or B has more digits add those in */
-  if (min != max) {
-    for (; i < max; i++) {
-      /* T[i] = X[i] + U */
-      c->dp[i] = x->dp[i] + u;
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
+      *tmpc = *tmpa++ + *tmpb++ + u;
 
       /* U = carry bit of T[i] */
-      u = (c->dp[i] >> DIGIT_BIT) & 1;
+      u = *tmpc >> DIGIT_BIT;
 
       /* take away carry bit from T[i] */
-      c->dp[i] &= MP_MASK;
+      *tmpc++ &= MP_MASK;
     }
-  }
 
-  /* add carry */
-  c->dp[i] = u;
+    /* now copy higher words if any, that is in A+B if A or B has more digits add those in */
+    if (min != max) {
+      for (; i < max; i++) {
+	/* T[i] = X[i] + U */
+	*tmpc = x->dp[i] + u;
+
+	/* U = carry bit of T[i] */
+	u = *tmpc >> DIGIT_BIT;
 
-  /* clear digits above used (since we may not have grown result above) */
-  for (i = c->used; i < olduse; i++) {
-    c->dp[i] = 0;
+	/* take away carry bit from T[i] */
+	*tmpc++ &= MP_MASK;
+      }
+    }
+
+    /* add carry */
+    *tmpc++ = u;
+
+    /* clear digits above used (since we may not have grown result above) */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
   }
 
   mp_clamp (c);
diff --git a/bn_s_mp_mul_digs.c b/bn_s_mp_mul_digs.c
index 55522a1..f2b0d13 100644
--- a/bn_s_mp_mul_digs.c
+++ b/bn_s_mp_mul_digs.c
@@ -21,23 +21,11 @@
 int
 s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  mp_int    t;
-  int       res, pa, pb, ix, iy;
-  mp_digit  u;
-  mp_word   r;
-  mp_digit  tmpx, *tmpt, *tmpy;
-
-
-  /* can we use the fast multiplier? 
-   *
-   * The fast multiplier can be used if the output will have less than 
-   * 512 digits and the number of digits won't affect carry propagation
-   */
-  if ((digs < 512)
-      && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    res = fast_s_mp_mul_digs (a, b, c, digs);
-    return res;
-  }
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;
 
   if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
     return res;
@@ -61,9 +49,7 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
     /* compute the columns of the output and propagate the carry */
     for (iy = 0; iy < pb; iy++) {
       /* compute the column as a mp_word */
-      r =
-	((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) +
-	((mp_word) u);
+      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
 
       /* the new column is the lower part of the result */
       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
diff --git a/bn_s_mp_mul_high_digs.c b/bn_s_mp_mul_high_digs.c
index ff2530f..a43a593 100644
--- a/bn_s_mp_mul_high_digs.c
+++ b/bn_s_mp_mul_high_digs.c
@@ -20,20 +20,17 @@
 int
 s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  mp_int    t;
-  int       res, pa, pb, ix, iy;
-  mp_digit  u;
-  mp_word   r;
-  mp_digit  tmpx, *tmpt, *tmpy;
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;
 
 
   /* can we use the fast multiplier? */
   if (((a->used + b->used + 1) < 512)
-      && MAX (a->used,
-	      b->used) <
-      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    res = fast_s_mp_mul_high_digs (a, b, c, digs);
-    return res;
+      && MAX (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_high_digs (a, b, c, digs);
   }
 
   if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
@@ -58,9 +55,7 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
     for (iy = digs - ix; iy < pb; iy++) {
       /* calculate the double precision result */
-      r =
-	((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) +
-	((mp_word) u);
+      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
 
       /* get the lower part */
       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
diff --git a/bn_s_mp_sqr.c b/bn_s_mp_sqr.c
index 94449a3..a0ec38b 100644
--- a/bn_s_mp_sqr.c
+++ b/bn_s_mp_sqr.c
@@ -18,18 +18,10 @@
 int
 s_mp_sqr (mp_int * a, mp_int * b)
 {
-  mp_int    t;
-  int       res, ix, iy, pa;
-  mp_word   r, u;
-  mp_digit  tmpx, *tmpt;
-
-  /* can we use the fast multiplier? */
-  if (((a->used * 2 + 1) < 512)
-      && a->used <
-      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
-    res = fast_s_mp_sqr (a, b);
-    return res;
-  }
+  mp_int  t;
+  int     res, ix, iy, pa;
+  mp_word r, u;
+  mp_digit tmpx, *tmpt;
 
   pa = a->used;
   if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) {
@@ -40,9 +32,7 @@ s_mp_sqr (mp_int * a, mp_int * b)
   for (ix = 0; ix < pa; ix++) {
     /* first calculate the digit at 2*ix */
     /* calculate double precision result */
-    r =
-      ((mp_word) t.dp[ix + ix]) +
-      ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+    r = ((mp_word) t.dp[ix + ix]) + ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
 
     /* store lower part in result */
     t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK));
diff --git a/bn_s_mp_sub.c b/bn_s_mp_sub.c
index f6da162..fe15d23 100644
--- a/bn_s_mp_sub.c
+++ b/bn_s_mp_sub.c
@@ -18,9 +18,7 @@
 int
 s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       olduse, res, min, max, i;
-  mp_digit  u;
-
+  int     olduse, res, min, max;
 
   /* find sizes */
   min = b->used;
@@ -37,36 +35,48 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
 
   /* sub digits from lower part */
 
-  /* set carry to zero */
-  u = 0;
-  for (i = 0; i < min; i++) {
-    /* T[i] = A[i] - B[i] - U */
-    c->dp[i] = a->dp[i] - (b->dp[i] + u);
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;
 
-    /* U = carry bit of T[i] */
-    u = (c->dp[i] >> DIGIT_BIT) & 1;
+    /* alias for digit pointers */
+    tmpa = a->dp;
+    tmpb = b->dp;
+    tmpc = c->dp;
 
-    /* Clear carry from T[i] */
-    c->dp[i] &= MP_MASK;
-  }
+    /* set carry to zero */
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* T[i] = A[i] - B[i] - U */
+      *tmpc = *tmpa++ - *tmpb++ - u;
+
+      /* U = carry bit of T[i] 
+       * Note this saves performing an AND operation since 
+       * if a carry does occur it will propagate all the way to the
+       * MSB.  As a result a single shift is required to get the carry
+       */
+      u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
+
+      /* Clear carry from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
 
-  /* now copy higher words if any, e.g. if A has more digits than B  */
-  if (min != max) {
+    /* now copy higher words if any, e.g. if A has more digits than B  */
     for (; i < max; i++) {
       /* T[i] = A[i] - U */
-      c->dp[i] = a->dp[i] - u;
+      *tmpc = *tmpa++ - u;
 
       /* U = carry bit of T[i] */
-      u = (c->dp[i] >> DIGIT_BIT) & 1;
+      u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
 
       /* Clear carry from T[i] */
-      c->dp[i] &= MP_MASK;
+      *tmpc++ &= MP_MASK;
     }
-  }
 
-  /* clear digits above used (since we may not have grown result above) */
-  for (i = c->used; i < olduse; i++) {
-    c->dp[i] = 0;
+    /* clear digits above used (since we may not have grown result above) */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
   }
 
   mp_clamp (c);
diff --git a/bncore.c b/bncore.c
index 5c7d098..8863935 100644
--- a/bncore.c
+++ b/bncore.c
@@ -14,6 +14,6 @@
  */
 #include <tommath.h>
 
-int       KARATSUBA_MUL_CUTOFF = 80,	/* Min. number of digits before Karatsuba multiplication is used. */
-          KARATSUBA_SQR_CUTOFF = 80,	/* Min. number of digits before Karatsuba squaring is used. */
-          MONTGOMERY_EXPT_CUTOFF = 40;	/* max. number of digits that montgomery reductions will help for */
+int     KARATSUBA_MUL_CUTOFF = 80,	/* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 80,	/* Min. number of digits before Karatsuba squaring is used. */
+        MONTGOMERY_EXPT_CUTOFF = 74;	/* max. number of digits that montgomery reductions will help for */
diff --git a/changes.txt b/changes.txt
index e2c9903..c31526e 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,9 @@
+Feb 13th, 2003
+v0.13  -- tons of minor speed-ups in low level add, sub, mul_2 and div_2 which propagate 
+          to other functions like mp_invmod, mp_div, etc...
+       -- Sped up mp_exptmod_fast by using new code to find R mod m [e.g. B^n mod m]
+       -- minor fixes
+
 Jan 17th, 2003
 v0.12  -- re-wrote the majority of the makefile so its more portable and will
           install via "make install" on most *nix platforms
diff --git a/demo/demo.c b/demo/demo.c
index 3c3ef07..8bf9acd 100644
--- a/demo/demo.c
+++ b/demo/demo.c
@@ -76,7 +76,7 @@ int main(void)
 {
    mp_int a, b, c, d, e, f;
    unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
-   int rr;
+   unsigned rr;
 
 #ifdef TIMER
    int n;
@@ -90,42 +90,20 @@ int main(void)
    mp_init(&e);
    mp_init(&f);
 
-#ifdef DEBUG
-   mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10);
-   mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10);
-   mp_set(&c, 1);
-   reset_timings();
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   dump_timings();
-   return 0;
-#endif
 
 #ifdef TIMER
+goto multtime;
+
       printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC);
       mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
       mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
       while (a.used * DIGIT_BIT < 8192) {
          reset();
-         for (rr = 0; rr < 1000000; rr++) {
+         for (rr = 0; rr < 10000000; rr++) {
              mp_add(&a, &b, &c);
          }
          tt = rdtsc();
-         printf("Adding %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
+         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
          mp_sqr(&a, &a);
          mp_sqr(&b, &b);
       }
@@ -134,37 +112,40 @@ int main(void)
       mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
       while (a.used * DIGIT_BIT < 8192) {
          reset();
-         for (rr = 0; rr < 1000000; rr++) {
+         for (rr = 0; rr < 10000000; rr++) {
              mp_sub(&a, &b, &c);
          }
          tt = rdtsc();
-         printf("Subtracting %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
+         printf("Subtracting\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
          mp_sqr(&a, &a);
          mp_sqr(&b, &b);
       }
+      
+multtime:      
 
    mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
    while (a.used * DIGIT_BIT < 8192) {
       reset();
-      for (rr = 0; rr < 1000000; rr++) {
+      for (rr = 0; rr < 250000; rr++) {
           mp_sqr(&a, &b);
       }
       tt = rdtsc();
-      printf("Squaring %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
+      printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
       mp_copy(&b, &a);
    }
    
    mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
    while (a.used * DIGIT_BIT < 8192) {
       reset();
-      for (rr = 0; rr < 1000000; rr++) {
+      for (rr = 0; rr < 250000; rr++) {
           mp_mul(&a, &a, &b);
       }
       tt = rdtsc();
-      printf("Multiplying %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
+      printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
       mp_copy(&b, &a);
    }
-   
+
+expttime:  
    {
       char *primes[] = {
          "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
@@ -180,7 +161,7 @@ int main(void)
       mp_read_radix(&a, primes[n], 10);
       mp_zero(&b);
       for (rr = 0; rr < mp_count_bits(&a); rr++) {
-         mp_mul_2d(&b, 1, &b);
+         mp_mul_2(&b, &b);
          b.dp[0] |= lbit();
          b.used  += 1;
       }
@@ -188,7 +169,7 @@ int main(void)
       mp_mod(&b, &c, &b);
       mp_set(&c, 3);
       reset();
-      for (rr = 0; rr < 100; rr++) {
+      for (rr = 0; rr < 50; rr++) {
           mp_exptmod(&c, &b, &a, &d);
       }
       tt = rdtsc();
@@ -201,16 +182,15 @@ int main(void)
          draw(&d);
          exit(0);
       }
-      printf("Exponentiating %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100));
+      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
    }
    }   
 
-
    mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
    mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
    while (a.used * DIGIT_BIT < 8192) {
       reset();
-      for (rr = 0; rr < 100; rr++) {
+      for (rr = 0; rr < 10000; rr++) {
           mp_invmod(&b, &a, &c);
       }
       tt = rdtsc();
@@ -219,7 +199,7 @@ int main(void)
          printf("Failed to invert\n");
          return 0;
       }
-      printf("Inverting mod %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100));
+      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
       mp_sqr(&a, &a);
       mp_sqr(&b, &b);
    }
diff --git a/etc/makefile b/etc/makefile
index 7a98e09..bf4befb 100644
--- a/etc/makefile
+++ b/etc/makefile
@@ -1,15 +1,20 @@
-CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
+CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -I../
+
+
+# default lib name (requires install with root)
+# LIBNAME=-ltommath
+
+# libname when you can't install the lib with install
+LIBNAME=../libtommath.a
 
 pprime: pprime.o
-	$(CC) pprime.o -ltommath -o pprime
+	$(CC) pprime.o $(LIBNAME) -o pprime
 
 tune: tune.o
-	$(CC) tune.o -ltommath -o tune
+	$(CC) tune.o $(LIBNAME) -o tune
         
 mersenne: mersenne.o
-	$(CC) mersenne.o -ltommath -o mersenne
+	$(CC) mersenne.o $(LIBNAME) -o mersenne
         
 clean:
-	rm -f *.o pprime tune mersenne
-
-
+	rm -f *.o *.exe pprime tune mersenne 
\ No newline at end of file
diff --git a/etc/pprime.c b/etc/pprime.c
index c136901..6fea3da 100644
--- a/etc/pprime.c
+++ b/etc/pprime.c
@@ -5,7 +5,7 @@
  * Tom St Denis, tomstdenis@iahu.ca, http://tom.iahu.ca
  */
 #include <time.h>
-#include "bn.h"
+#include "tommath.h"
 
 /* fast square root */
 static    mp_digit
diff --git a/etc/tune.c b/etc/tune.c
index 0d03e77..73a44b7 100644
--- a/etc/tune.c
+++ b/etc/tune.c
@@ -18,9 +18,9 @@ time_mult (void)
 
   t1 = clock ();
   for (x = 8; x <= 128; x += 8) {
-    mp_rand (&a, x);
-    mp_rand (&b, x);
-    for (y = 0; y < 10000; y++) {
+    for (y = 0; y < 1000; y++) {
+      mp_rand (&a, x);
+      mp_rand (&b, x);
       mp_mul (&a, &b, &c);
     }
   }
@@ -42,8 +42,8 @@ time_sqr (void)
 
   t1 = clock ();
   for (x = 8; x <= 128; x += 8) {
-    mp_rand (&a, x);
-    for (y = 0; y < 10000; y++) {
+    for (y = 0; y < 1000; y++) {
+      mp_rand (&a, x);
       mp_sqr (&a, &b);
     }
   }
diff --git a/gen.pl b/gen.pl
new file mode 100644
index 0000000..fcfd57d
--- /dev/null
+++ b/gen.pl
@@ -0,0 +1,27 @@
+#!/usr/bin/perl
+#
+#Generates a "single file" you can use to quickly add the whole source 
+#without any makefile troubles
+#
+
+opendir(DIR,".");
+@files = readdir(DIR);
+closedir(DIR);
+
+open(OUT,">mpi.c");
+print OUT "/* File Generated Automatically by gen.pl */\n\n";
+for (@files) {
+   if ($_ =~ /\.c/ && !($_ =~ /mpi\.c/)) {
+      $fname = $_;
+      open(SRC,"<$fname");
+      print OUT "/* Start: $fname */\n";
+      while (<SRC>) {
+         print OUT $_;
+      }
+      close(SRC);
+      print OUT "\n/* End: $fname */\n\n";
+   }
+}
+print OUT "\n/* EOF */\n";
+close(OUT);
+   
\ No newline at end of file
diff --git a/ltmtest.exe b/ltmtest.exe
deleted file mode 100644
index cc5c1f5..0000000
Binary files a/ltmtest.exe and /dev/null differ
diff --git a/makefile b/makefile
index 5f6bcc6..9e6127d 100644
--- a/makefile
+++ b/makefile
@@ -1,6 +1,6 @@
 CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
 
-VERSION=0.12
+VERSION=0.13
 
 default: libtommath.a
 
@@ -30,7 +30,7 @@ bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_mon
 bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
 bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
 bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix.o \
-bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o
+bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o
 
 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
@@ -60,8 +60,8 @@ docs:	docdvi
 	rm -f bn.log bn.aux bn.dvi
 	
 clean:
-	rm -f *.pdf *.o *.a etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest \
-        bn.log bn.aux bn.dvi *.log *.s
+	rm -f *.pdf *.o *.a *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
+        bn.log bn.aux bn.dvi *.log *.s mpi.c 
 	cd etc ; make clean
 
 zipup: clean docs
diff --git a/mpitest.exe b/mpitest.exe
deleted file mode 100644
index d32553a..0000000
Binary files a/mpitest.exe and /dev/null differ
diff --git a/mtest/mtest.c b/mtest/mtest.c
index 17aef8d..3759d15 100644
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@@ -89,7 +89,7 @@ int main(void)
    }
 
    for (;;) {
-       n = fgetc(rng) % 11;
+       n = 4; // fgetc(rng) % 11;
 
    if (n == 0) {
        /* add tests */
diff --git a/timings.txt b/timings.txt
index 117b9c5..128649e 100644
--- a/timings.txt
+++ b/timings.txt
@@ -1,39 +1,36 @@
-CLOCKS_PER_SEC == 1000000
-Adding 128-bit took 0.070000 ticks
-Adding 256-bit took 0.100000 ticks
-Adding 512-bit took 0.140000 ticks
-Adding 1024-bit took 0.210000 ticks
-Adding 2048-bit took 0.360000 ticks
-Adding 4096-bit took 0.670000 ticks
-Subtracting 128-bit took 0.090000 ticks
-Subtracting 256-bit took 0.120000 ticks
-Subtracting 512-bit took 0.140000 ticks
-Subtracting 1024-bit took 0.210000 ticks
-Subtracting 2048-bit took 0.330000 ticks
-Subtracting 4096-bit took 0.580000 ticks
-Squaring 128-bit took 0.320000 ticks
-Squaring 256-bit took 0.620000 ticks
-Squaring 512-bit took 1.410000 ticks
-Squaring 1024-bit took 3.730000 ticks
-Squaring 2048-bit took 11.580000 ticks
-Squaring 4096-bit took 44.540000 ticks
-Multiplying 128-bit took 0.270000 ticks
-Multiplying 256-bit took 0.650000 ticks
-Multiplying 512-bit took 1.630000 ticks
-Multiplying 1024-bit took 5.180000 ticks
-Multiplying 2048-bit took 19.210000 ticks
-Multiplying 4096-bit took 67.500000 ticks
-Exponentiating 513-bit took 2000.000000 ticks
-Exponentiating 769-bit took 5200.000000 ticks
-Exponentiating 1025-bit took 11400.000000 ticks
-Exponentiating 2049-bit took 75100.000000 ticks
-Exponentiating 2561-bit took 150000.000000 ticks
-Exponentiating 3073-bit took 237800.000000 ticks
-Exponentiating 4097-bit took 510600.000000 ticks
-Inverting mod 128-bit took 0.000000 ticks
-Inverting mod 256-bit took 200.000000 ticks
-Inverting mod 512-bit took 300.000000 ticks
-Inverting mod 1024-bit took 800.000000 ticks
-Inverting mod 2048-bit took 2500.000000 ticks
-Inverting mod 4096-bit took 8400.000000 ticks
-
+CLOCKS_PER_SEC == 1000
+Adding           128-bit =>  14534883/sec,       688 ticks
+Adding           256-bit =>  11037527/sec,       906 ticks
+Adding           512-bit =>   8650519/sec,      1156 ticks
+Adding          1024-bit =>   5871990/sec,      1703 ticks
+Adding          2048-bit =>   3575259/sec,      2797 ticks
+Adding          4096-bit =>   2018978/sec,      4953 ticks
+Subtracting      128-bit =>  11025358/sec,       907 ticks
+Subtracting      256-bit =>   9149130/sec,      1093 ticks
+Subtracting      512-bit =>   7440476/sec,      1344 ticks
+Subtracting     1024-bit =>   5078720/sec,      1969 ticks
+Subtracting     2048-bit =>   3168567/sec,      3156 ticks
+Subtracting     4096-bit =>   1833852/sec,      5453 ticks
+Squaring         128-bit =>   3205128/sec,        78 ticks
+Squaring         256-bit =>   1592356/sec,       157 ticks
+Squaring         512-bit =>    696378/sec,       359 ticks
+Squaring        1024-bit =>    266808/sec,       937 ticks
+Squaring        2048-bit =>     85999/sec,      2907 ticks
+Squaring        4096-bit =>     21949/sec,     11390 ticks
+Multiplying      128-bit =>   3205128/sec,        78 ticks
+Multiplying      256-bit =>   1592356/sec,       157 ticks
+Multiplying      512-bit =>    615763/sec,       406 ticks
+Multiplying     1024-bit =>    192752/sec,      1297 ticks
+Multiplying     2048-bit =>     53510/sec,      4672 ticks
+Multiplying     4096-bit =>     14801/sec,     16890 ticks
+Exponentiating   513-bit =>       531/sec,        47 ticks
+Exponentiating   769-bit =>       177/sec,       141 ticks
+Exponentiating  1025-bit =>        88/sec,       282 ticks
+Exponentiating  2049-bit =>        13/sec,      1890 ticks
+Exponentiating  2561-bit =>         6/sec,      3812 ticks
+Exponentiating  3073-bit =>         4/sec,      6031 ticks
+Exponentiating  4097-bit =>         1/sec,     12843 ticks
+Inverting mod    128-bit =>     19160/sec,      5219 ticks
+Inverting mod    256-bit =>      8290/sec,     12062 ticks
+Inverting mod    512-bit =>      3565/sec,     28047 ticks
+Inverting mod   1024-bit =>      1305/sec,     76594 ticks
\ No newline at end of file
diff --git a/timings2.txt b/timings2.txt
new file mode 100644
index 0000000..0b87e21
--- /dev/null
+++ b/timings2.txt
@@ -0,0 +1,36 @@
+CLOCKS_PER_SEC == 1000
+Adding           128-bit =>  15600624/sec,       641 ticks
+Adding           256-bit =>  12804097/sec,       781 ticks
+Adding           512-bit =>  10000000/sec,      1000 ticks
+Adding          1024-bit =>   7032348/sec,      1422 ticks
+Adding          2048-bit =>   4076640/sec,      2453 ticks
+Adding          4096-bit =>   2424242/sec,      4125 ticks
+Subtracting      128-bit =>  10845986/sec,       922 ticks
+Subtracting      256-bit =>   9416195/sec,      1062 ticks
+Subtracting      512-bit =>   7710100/sec,      1297 ticks
+Subtracting     1024-bit =>   5159958/sec,      1938 ticks
+Subtracting     2048-bit =>   3299241/sec,      3031 ticks
+Subtracting     4096-bit =>   1987676/sec,      5031 ticks
+Squaring         128-bit =>   3205128/sec,        78 ticks
+Squaring         256-bit =>   1592356/sec,       157 ticks
+Squaring         512-bit =>    696378/sec,       359 ticks
+Squaring        1024-bit =>    266524/sec,       938 ticks
+Squaring        2048-bit =>     86505/sec,      2890 ticks
+Squaring        4096-bit =>     22471/sec,     11125 ticks
+Multiplying      128-bit =>   3205128/sec,        78 ticks
+Multiplying      256-bit =>   1592356/sec,       157 ticks
+Multiplying      512-bit =>    615763/sec,       406 ticks
+Multiplying     1024-bit =>    190548/sec,      1312 ticks
+Multiplying     2048-bit =>     54418/sec,      4594 ticks
+Multiplying     4096-bit =>     14897/sec,     16781 ticks
+Exponentiating   513-bit =>       531/sec,        47 ticks
+Exponentiating   769-bit =>       177/sec,       141 ticks
+Exponentiating  1025-bit =>        84/sec,       297 ticks
+Exponentiating  2049-bit =>        13/sec,      1875 ticks
+Exponentiating  2561-bit =>         6/sec,      3766 ticks
+Exponentiating  3073-bit =>         4/sec,      6000 ticks
+Exponentiating  4097-bit =>         1/sec,     12750 ticks
+Inverting mod    128-bit =>     17301/sec,       578 ticks
+Inverting mod    256-bit =>      8103/sec,      1234 ticks
+Inverting mod    512-bit =>      3422/sec,      2922 ticks
+Inverting mod   1024-bit =>      1330/sec,      7516 ticks
\ No newline at end of file
diff --git a/timings3.txt b/timings3.txt
new file mode 100644
index 0000000..f269c2b
--- /dev/null
+++ b/timings3.txt
@@ -0,0 +1,5 @@
+Exponentiating   513-bit =>       531/sec,        94 ticks
+Exponentiating   769-bit =>       187/sec,       266 ticks
+Exponentiating  1025-bit =>        88/sec,       562 ticks
+Exponentiating  2049-bit =>        13/sec,      3719 ticks
+
diff --git a/tommath.h b/tommath.h
index 4ac6173..9db1781 100644
--- a/tommath.h
+++ b/tommath.h
@@ -289,6 +289,11 @@ int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
 /* setups the montgomery reduction */
 int mp_montgomery_setup(mp_int *a, mp_digit *mp);
 
+/* computes a = B^n mod b without division or multiplication useful for 
+ * normalizing numbers in a Montgomery system.
+ */
+int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
+
 /* computes xR^-1 == x (mod N) via Montgomery Reduction */
 int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
 
@@ -343,6 +348,5 @@ void bn_reverse(unsigned char *s, int len);
    }
 #endif
 
-
 #endif
c3-lang/libtommath

Commit b66471f74f1f16ebd823f71c5602cc6b68e69574