Commit 16c6ccc62c7cfcbca7853ad34388e8d006e861e7

Tom St Denis 2003-02-28T16:05:26

added libtommath-0.06

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
diff --git a/b.bat b/b.bat
index 38c1120..1d0b900 100644
--- a/b.bat
+++ b/b.bat
@@ -1,3 +1,3 @@
 nasm -f coff timer.asm
-gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o demo
+gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
 gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo
diff --git a/bn.c b/bn.c
index 4a0f4f9..cf8a391 100644
--- a/bn.c
+++ b/bn.c
@@ -700,8 +700,8 @@ int mp_mul_2(mp_int *a, mp_int *b)
 /* low level addition */
 static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
 {
-   mp_int t, *x;
-   int res, min, max, i;
+   mp_int *x;
+   int olduse, res, min, max, i;
    mp_digit u;
    
    REGFUNC("s_mp_add");
@@ -724,45 +724,52 @@ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
    }
    
    /* init result */
-   if ((res = mp_init_size(&t, max+1)) != MP_OKAY) {
-      DECFUNC();
-      return res;
+   if (c->alloc < max+1) {
+      if ((res = mp_grow(c, max+1)) != MP_OKAY) {
+         DECFUNC();
+         return res;
+      }
    }
-   t.used = max+1;
+ 
+   olduse  = c->used;
+   c->used = max + 1;
 
    /* add digits from lower part */
    u = 0;
    for (i = 0; i < min; i++) {
        /* T[i] = A[i] + B[i] + U */   
-       t.dp[i] = a->dp[i] + b->dp[i] + u;
+       c->dp[i] = a->dp[i] + b->dp[i] + u;
        
        /* U = carry bit of T[i] */       
-       u = (t.dp[i] >> DIGIT_BIT) & 1;
+       u = (c->dp[i] >> DIGIT_BIT) & 1;
        
        /* take away carry bit from T[i] */
-       t.dp[i] &= MP_MASK;
+       c->dp[i] &= MP_MASK;
    }
    
    /* now copy higher words if any, that is in A+B if A or B has more digits add those in */
    if (min != max) {
       for (; i < max; i++) { 
          /* T[i] = X[i] + U */
-         t.dp[i] = x->dp[i] + u;
+         c->dp[i] = x->dp[i] + u;
          
          /* U = carry bit of T[i] */
-         u = (t.dp[i] >> DIGIT_BIT) & 1;
+         u = (c->dp[i] >> DIGIT_BIT) & 1;
          
          /* take away carry bit from T[i] */
-         t.dp[i] &= MP_MASK;
+         c->dp[i] &= MP_MASK;
       }
    }
    
    /* add carry */
-   t.dp[i] = u;
-
-   mp_clamp(&t);
-   mp_exch(&t, c);
-   mp_clear(&t);
+   c->dp[i] = u;
+   
+   /* clear digits above used (since we may not have grown result above) */
+   for (i = c->used; i < olduse; i++) {
+      c->dp[i] = 0;
+   }
+   
+   mp_clamp(c);
    DECFUNC();
    return MP_OKAY;       
 }
@@ -770,8 +777,7 @@ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
 /* low level subtraction (assumes a > b) */
 static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
 {
-   mp_int t;
-   int res, min, max, i;
+   int olduse, res, min, max, i;
    mp_digit u;
    
    REGFUNC("s_mp_sub");
@@ -784,42 +790,48 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
    max = a->used;
    
    /* init result */
-   if ((res = mp_init_size(&t, max)) != MP_OKAY) {
-      DECFUNC();
-      return res;
+   if (c->alloc < max) {
+      if ((res = mp_grow(c, max)) != MP_OKAY) {
+         DECFUNC();
+         return res;
+      }
    }
-   t.used = max;
+   olduse  = c->used;
+   c->used = max;
    
    /* sub digits from lower part */
    u = 0;
    for (i = 0; i < min; i++) {
        /* T[i] = A[i] - B[i] - U */
-       t.dp[i] = a->dp[i] - (b->dp[i] + u);
+       c->dp[i] = a->dp[i] - (b->dp[i] + u);
        
        /* U = carry bit of T[i] */
-       u = (t.dp[i] >> DIGIT_BIT) & 1;
+       u = (c->dp[i] >> DIGIT_BIT) & 1;
        
        /* Clear carry from T[i] */
-       t.dp[i] &= MP_MASK;
+       c->dp[i] &= MP_MASK;
    }
    
    /* now copy higher words if any, e.g. if A has more digits than B  */
    if (min != max) {
       for (; i < max; i++) { 
          /* T[i] = A[i] - U */
-         t.dp[i] = a->dp[i] - u;
+         c->dp[i] = a->dp[i] - u;
 
          /* U = carry bit of T[i] */
-         u = (t.dp[i] >> DIGIT_BIT) & 1;
+         u = (c->dp[i] >> DIGIT_BIT) & 1;
 
          /* Clear carry from T[i] */
-         t.dp[i] &= MP_MASK;
+         c->dp[i] &= MP_MASK;
       }
    }
    
-   mp_clamp(&t);
-   mp_exch(&t, c);
-   mp_clear(&t);
+   /* clear digits above used (since we may not have grown result above) */
+   for (i = c->used; i < olduse; i++) {
+      c->dp[i] = 0;
+   }
+
+   mp_clamp(c);
    DECFUNC();
    return MP_OKAY;       
 }
@@ -2941,7 +2953,7 @@ int mp_toradix(mp_int *a, char *str, int radix)
    int res, digs;
    mp_int t;
    mp_digit d;
-   unsigned char *_s = str;
+   char *_s = str;
    
    if (radix < 2 || radix > 64) {
       return MP_VAL;
@@ -2966,7 +2978,7 @@ int mp_toradix(mp_int *a, char *str, int radix)
        *str++ = s_rmap[d];
        ++digs;
    }
-   reverse(_s, digs);
+   reverse((unsigned char *)_s, digs);
    *str++ = '\0';
    mp_clear(&t);
    return MP_OKAY;
diff --git a/bn.h b/bn.h
index a87de8a..624c50d 100644
--- a/bn.h
+++ b/bn.h
@@ -12,7 +12,6 @@
  *
  * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
  */
-
 #ifndef BN_H_
 #define BN_H_
 
@@ -39,13 +38,18 @@
 #else
 #ifndef CRYPT
    #ifdef _MSC_VER
-      typedef __int64            ulong64;
+      typedef unsigned __int64   ulong64;
+      typedef signed __int64     long64;
    #else
       typedef unsigned long long ulong64;
+      typedef signed long long   long64;
    #endif   
 #endif   
+
+   /* default case */
    typedef unsigned long      mp_digit;
    typedef ulong64            mp_word;
+  
    #define DIGIT_BIT          28U
 #endif  
 
@@ -72,8 +76,14 @@
 
 typedef int           mp_err;
 
-#define KARATSUBA_MUL_CUTOFF    80                /* Min. number of digits before Karatsuba multiplication is used. */
-#define KARATSUBA_SQR_CUTOFF    80                /* Min. number of digits before Karatsuba squaring is used. */
+/* you'll have to tune these... */
+#ifdef FAST_FPU
+   #define KARATSUBA_MUL_CUTOFF    100     /* Min. number of digits before Karatsuba multiplication is used. */
+   #define KARATSUBA_SQR_CUTOFF    100     /* Min. number of digits before Karatsuba squaring is used. */
+#else
+   #define KARATSUBA_MUL_CUTOFF    80      /* Min. number of digits before Karatsuba multiplication is used. */
+   #define KARATSUBA_SQR_CUTOFF    80      /* Min. number of digits before Karatsuba squaring is used. */
+#endif
 
 typedef struct  {
     int used, alloc, sign;
diff --git a/bn.pdf b/bn.pdf
index 34156bc..d517dc4 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index 9808a73..069f76b 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
 \documentclass{article}
 \begin{document}
 
-\title{LibTomMath v0.05 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.06 \\ A Free Multiple Precision Integer Library}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@@ -460,34 +460,34 @@ were observed.
 \begin{tabular}{c|c|c|c}
 \hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\
 \hline
-Inversion & 128 & 264,083  & 159,194  \\
-Inversion & 256 & 549,370  & 355,914  \\
-Inversion & 512 & 1,675,975  & 842,538  \\
-Inversion & 1024 & 5,237,957  & 2,170,804  \\
-Inversion & 2048 & 17,871,944  & 6,250,876  \\
-Inversion & 4096 & 66,610,468  & 18,161,612  \\
+Inversion & 128 & 264,083  & 59,782   \\
+Inversion & 256 & 549,370  & 146,915   \\
+Inversion & 512 & 1,675,975  & 367,172   \\
+Inversion & 1024 & 5,237,957  & 1,054,158   \\
+Inversion & 2048 & 17,871,944  & 3,459,683   \\
+Inversion & 4096 & 66,610,468  & 11,834,556   \\
 \hline
 Multiply & 128 & 1,426   & 828    \\
 Multiply & 256 & 2,551   & 1,393    \\
 Multiply & 512 & 7,913   & 2,926    \\
 Multiply & 1024 & 28,496   & 8,620  \\
 Multiply & 2048 & 109,897   & 28,967    \\
-Multiply & 4096 & 469,970   & 106,855    \\
+Multiply & 4096 & 469,970   & 105,387    \\
 \hline 
 Square & 128 & 1,319   & 869    \\
 Square & 256 & 1,776   & 1,362    \\
 Square & 512 & 5,399  & 2,571   \\
 Square & 1024 & 18,991  & 6,332    \\
 Square & 2048 & 72,126  & 18,426   \\
-Square & 4096 & 306,269  & 76,305  \\
+Square & 4096 & 306,269  & 74,908 \\
 \hline 
-Exptmod & 512 & 32,021,586  & 5,709,468 \\
-Exptmod & 768 & 97,595,492  & 12,473,526   \\
-Exptmod & 1024 & 223,302,532  & 23,593,075    \\
-Exptmod & 2048 & 1,682,223,369   & 121,992,481    \\
-Exptmod & 2560 & 3,268,615,571   & 258,155,605    \\
-Exptmod & 3072 & 5,597,240,141   & 399,800,504   \\
-Exptmod & 4096 & 13,347,270,891   & 826,013,375   
+Exptmod & 512 & 32,021,586  & 5,696,459  \\
+Exptmod & 768 & 97,595,492  & 12,428,274   \\
+Exptmod & 1024 & 223,302,532  & 22,834,316   \\
+Exptmod & 2048 & 1,682,223,369   & 119,888,049    \\
+Exptmod & 2560 & 3,268,615,571   & 250,901,263     \\
+Exptmod & 3072 & 5,597,240,141   & 391,716,431    \\
+Exptmod & 4096 & 13,347,270,891   & 814,429,647    
 
 \end{tabular}
 \end{center}
diff --git a/changes.txt b/changes.txt
index 7900121..e5b6294 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,7 @@
+Dec 31st, 2002
+v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
+       -- Cleaned up the header a bit more
+       
 Dec 30th, 2002
 v0.05  -- Builds with MSVC out of the box
        -- Fixed a bug in mp_invmod w.r.t. even moduli
diff --git a/demo.c b/demo.c
index 5e29d1f..0a1943f 100644
--- a/demo.c
+++ b/demo.c
@@ -1,5 +1,6 @@
 #include <time.h>
 
+
 #ifdef U_MPI
 #include <stdio.h>
 #include <string.h>
@@ -12,7 +13,6 @@
    #else
       typedef unsigned long long ulong64;
    #endif   
-   
 #else   
    #include "bn.h"
 #endif
@@ -22,6 +22,8 @@
 extern ulong64 rdtsc(void);
 extern void reset(void);
 #else 
+
+
 ulong64 _tt;
 void reset(void) { _tt = clock(); }
 ulong64 rdtsc(void) { return clock() - _tt; }
@@ -73,11 +75,11 @@ int mp_reduce_setup(mp_int *a, mp_int *b)
 }
 #endif
 
+   char cmd[4096], buf[4096];
 int main(void)
 {
    mp_int a, b, c, d, e, f;
    unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
-   unsigned char cmd[4096], buf[4096];
    int rr;
    
 #ifdef TIMER
@@ -92,6 +94,7 @@ int main(void)
    mp_init(&e);
    mp_init(&f);
    
+ 
    mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64);
    mp_reduce_setup(&b, &a);
    printf("\n\n----\n\n");
@@ -102,12 +105,11 @@ int main(void)
    mp_sub_d(&a, 1, &c);
    mp_exptmod(&b, &c, &a, &d);
    mp_toradix(&d, buf, 10);
-   printf("b^p-1 == %s\n", buf);
-   
+   printf("b^p-1 == %s\n", buf); 
 
 #ifdef TIMER   
 
-   mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
+      mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
       mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
       while (a.used * DIGIT_BIT < 8192) {
          reset();
@@ -132,30 +134,31 @@ int main(void)
          mp_sqr(&a, &a);
          mp_sqr(&b, &b);
       }
-   
 
    mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
    while (a.used * DIGIT_BIT < 8192) {
       reset();
-      for (rr = 0; rr < 10000; rr++) {
+      for (rr = 0; rr < 100000; rr++) {
           mp_sqr(&a, &b);
       }
       tt = rdtsc();
-      printf("Squaring %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)10000));
+      printf("Squaring %d-bit took %lu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
       mp_copy(&b, &a);
    }
    
    mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
    while (a.used * DIGIT_BIT < 8192) {
       reset();
-      for (rr = 0; rr < 10000; rr++) {
+      for (rr = 0; rr < 100000; rr++) {
           mp_mul(&a, &a, &b);
       }
       tt = rdtsc();
-      printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)10000));
+      printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
       mp_copy(&b, &a);
    }
-   
+
+  
+ 
    
    {
       char *primes[] = {
@@ -213,6 +216,8 @@ int main(void)
       mp_sqr(&a, &a);
       mp_sqr(&b, &b);
    }
+   
+   return 0;
   
 #endif   
 
@@ -264,9 +269,9 @@ draw(&a);draw(&b);draw(&c);draw(&d);
           /* test the sign/unsigned storage functions */
           
           rr = mp_signed_bin_size(&c);
-          mp_to_signed_bin(&c, cmd);
+          mp_to_signed_bin(&c, (unsigned char *)cmd);
           memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
-          mp_read_signed_bin(&d, cmd, rr);
+          mp_read_signed_bin(&d, (unsigned char *)cmd, rr);
           if (mp_cmp(&c, &d) != MP_EQ) {
              printf("mp_signed_bin failure!\n");
              draw(&c);
@@ -276,9 +281,9 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                     
           
           rr = mp_unsigned_bin_size(&c);
-          mp_to_unsigned_bin(&c, cmd);
+          mp_to_unsigned_bin(&c, (unsigned char *)cmd);
           memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
-          mp_read_unsigned_bin(&d, cmd, rr);
+          mp_read_unsigned_bin(&d, (unsigned char *)cmd, rr);
           if (mp_cmp_mag(&c, &d) != MP_EQ) {
              printf("mp_unsigned_bin failure!\n");
              draw(&c);
diff --git a/makefile b/makefile
index 58b57ad..ec94b70 100644
--- a/makefile
+++ b/makefile
@@ -1,7 +1,7 @@
 CC = gcc
 CFLAGS  += -DDEBUG -Wall -W -O3 -fomit-frame-pointer -funroll-loops 
 
-VERSION=0.05
+VERSION=0.06
 
 default: test