-
Notifications
You must be signed in to change notification settings - Fork 41
/
Copy pathnlz.c.txt
385 lines (321 loc) · 11.1 KB
/
nlz.c.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
// This has the programs for computing the number of leading zeros
// in a word.
// Max line length is 57, to fit in hacker.book.
// Compile with g++, not gcc.
#include <stdio.h>
#include <stdlib.h> // To define "exit", req'd by XLC.
#define LE 1 // 1 for little-endian, 0 for big-endian.
int pop(unsigned x) {
x = x - ((x >> 1) & 0x55555555);
x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
x = (x + (x >> 4)) & 0x0F0F0F0F;
x = x + (x << 8);
x = x + (x << 16);
return x >> 24;
}
int nlz1(unsigned x) {
int n;
if (x == 0) return(32);
n = 0;
if (x <= 0x0000FFFF) {n = n +16; x = x <<16;}
if (x <= 0x00FFFFFF) {n = n + 8; x = x << 8;}
if (x <= 0x0FFFFFFF) {n = n + 4; x = x << 4;}
if (x <= 0x3FFFFFFF) {n = n + 2; x = x << 2;}
if (x <= 0x7FFFFFFF) {n = n + 1;}
return n;
}
int nlz1a(unsigned x) {
int n;
/* if (x == 0) return(32); */
if ((int)x <= 0) return (~x >> 26) & 32;
n = 1;
if ((x >> 16) == 0) {n = n +16; x = x <<16;}
if ((x >> 24) == 0) {n = n + 8; x = x << 8;}
if ((x >> 28) == 0) {n = n + 4; x = x << 4;}
if ((x >> 30) == 0) {n = n + 2; x = x << 2;}
n = n - (x >> 31);
return n;
}
// On basic Risc, 12 to 20 instructions.
int nlz2(unsigned x) {
unsigned y;
int n;
n = 32;
y = x >>16; if (y != 0) {n = n -16; x = y;}
y = x >> 8; if (y != 0) {n = n - 8; x = y;}
y = x >> 4; if (y != 0) {n = n - 4; x = y;}
y = x >> 2; if (y != 0) {n = n - 2; x = y;}
y = x >> 1; if (y != 0) return n - 2;
return n - x;
}
// As above but coded as a loop for compactness:
// 23 to 33 basic Risc instructions.
int nlz2a(unsigned x) {
unsigned y;
int n, c;
n = 32;
c = 16;
do {
y = x >> c; if (y != 0) {n = n - c; x = y;}
c = c >> 1;
} while (c != 0);
return n - x;
}
int nlz3(int x) {
int y, n;
n = 0;
y = x;
L: if (x < 0) return n;
if (y == 0) return 32 - n;
n = n + 1;
x = x << 1;
y = y >> 1;
goto L;
}
int nlz4(unsigned x) {
int y, m, n;
y = -(x >> 16); // If left half of x is 0,
m = (y >> 16) & 16; // set n = 16. If left half
n = 16 - m; // is nonzero, set n = 0 and
x = x >> m; // shift x right 16.
// Now x is of the form 0000xxxx.
y = x - 0x100; // If positions 8-15 are 0,
m = (y >> 16) & 8; // add 8 to n and shift x left 8.
n = n + m;
x = x << m;
y = x - 0x1000; // If positions 12-15 are 0,
m = (y >> 16) & 4; // add 4 to n and shift x left 4.
n = n + m;
x = x << m;
y = x - 0x4000; // If positions 14-15 are 0,
m = (y >> 16) & 2; // add 2 to n and shift x left 2.
n = n + m;
x = x << m;
y = x >> 14; // Set y = 0, 1, 2, or 3.
m = y & ~(y >> 1); // Set m = 0, 1, 2, or 2 resp.
return n + 2 - m;
}
int nlz5(unsigned x) {
int pop(unsigned x);
x = x | (x >> 1);
x = x | (x >> 2);
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >>16);
return pop(~x);
}
/* The four programs below are not valid ANSI C programs. This is
because they refer to the same storage locations as two different types.
However, they work with xlc/AIX, gcc/AIX, and gcc/NT. If you try to
code them more compactly by declaring a variable xx to be "double," and
then using
n = 1054 - (*((unsigned *)&xx + LE) >> 20);
then you are violating not only the rule above, but also the ANSI C
rule that pointer arithmetic can be performed only on pointers to
array elements.
When coded with the above statement, the program fails with xlc,
gcc/AIX, and gcc/NT, at some optimization levels.
BTW, these programs use the "anonymous union" feature of C++, not
available in C. */
int nlz6(unsigned k) {
union {
unsigned asInt[2];
double asDouble;
};
int n;
asDouble = (double)k + 0.5;
n = 1054 - (asInt[LE] >> 20);
return n;
}
int nlz7(unsigned k) {
union {
unsigned asInt[2];
double asDouble;
};
int n;
asDouble = (double)k;
n = 1054 - (asInt[LE] >> 20);
n = (n & 31) + (n >> 9);
return n;
}
/* In single precision, round-to-nearest mode, the basic method fails for:
k = 0, k = 01FFFFFF, 03FFFFFE <= k <= 03FFFFFF,
07FFFFFC <= k <= 07FFFFFF,
0FFFFFF8 <= k <= 0FFFFFFF,
...
7FFFFFC0 <= k <= 7FFFFFFF.
FFFFFF80 <= k <= FFFFFFFF.
For k = 0 it gives 158, and for the other values it is too low by 1. */
int nlz8(unsigned k) {
union {
unsigned asInt;
float asFloat;
};
int n;
k = k & ~(k >> 1); /* Fix problem with rounding. */
asFloat = (float)k + 0.5f;
n = 158 - (asInt >> 23);
return n;
}
/* The example below shows how to make a macro for nlz. It uses an
extension to the C and C++ languages that is provided by the GNU C/C++
compiler, namely, that of allowing statements and declarations in
expressions (see "Using and Porting GNU CC", by Richard M. Stallman
(1998). The underscores are necessary to protect against the
possibility that the macro argument will conflict with one of its local
variables, e.g., NLZ(k). */
#define NLZ(kp) \
({union {unsigned _asInt; float _asFloat;}; \
unsigned _k = (kp), _kk = _k & ~(_k >> 1); \
_asFloat = (float)_kk + 0.5f; \
158 - (_asInt >> 23);})
int nlz8a(unsigned k) {
return NLZ(k) + NLZ(-5 + 1);
}
int nlz9(unsigned k) {
union {
unsigned asInt;
float asFloat;
};
int n;
k = k & ~(k >> 1); /* Fix problem with rounding. */
asFloat = (float)k;
n = 158 - (asInt >> 23);
n = (n & 31) + (n >> 6); /* Fix problem with k = 0. */
return n;
}
/* Below are three nearly equivalent programs for computing the number
of leading zeros in a word. This material is not in HD, but may be in a
future edition.
Immediately below is Robert Harley's algorithm, found at the
comp.arch newsgroup entry dated 7/12/96, pointed out to me by Norbert
Juffa.
Table entries marked "u" are unused. 14 ops including a multiply,
plus an indexed load.
The smallest multiplier that works is 0x045BCED1 = 17*65*129*513 (all
of form 2**k + 1). There are no multipliers of three terms of the form
2**k +- 1 that work, with a table size of 64 or 128. There are some,
with a table size of 64, if you precede the multiplication with x = x -
(x >> 1), but that seems less elegant. There are also some if you use a
table size of 256, the smallest is 0x01033CBF = 65*255*1025 (this would
save two instructions in the form of this algorithm with the
multiplication expanded into shifts and adds, but the table size is
getting a bit large). */
#define u 99
int nlz10(unsigned x) {
static char table[64] =
{32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u,
u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u,
17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18,
5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u};
x = x | (x >> 1); // Propagate leftmost
x = x | (x >> 2); // 1-bit to the right.
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >>16);
x = x*0x06EB14F9; // Multiplier is 7*255**3.
return table[x >> 26];
}
/* Harley's algorithm with multiply expanded.
19 elementary ops plus an indexed load. */
int nlz10a(unsigned x) {
static char table[64] =
{32,31, u,16, u,30, 3, u, 15, u, u, u,29,10, 2, u,
u, u,12,14,21, u,19, u, u,28, u,25, u, 9, 1, u,
17, u, 4, u, u, u,11, u, 13,22,20, u,26, u, u,18,
5, u, u,23, u,27, u, 6, u,24, 7, u, 8, u, 0, u};
x = x | (x >> 1); // Propagate leftmost
x = x | (x >> 2); // 1-bit to the right.
x = x | (x >> 4);
x = x | (x >> 8);
x = x | (x >> 16);
x = (x << 3) - x; // Multiply by 7.
x = (x << 8) - x; // Multiply by 255.
x = (x << 8) - x; // Again.
x = (x << 8) - x; // Again.
return table[x >> 26];
}
/* Julius Goryavsky's version of Harley's algorithm.
17 elementary ops plus an indexed load, if the machine
has "and not." */
int nlz10b(unsigned x) {
static char table[64] =
{32,20,19, u, u,18, u, 7, 10,17, u, u,14, u, 6, u,
u, 9, u,16, u, u, 1,26, u,13, u, u,24, 5, u, u,
u,21, u, 8,11, u,15, u, u, u, u, 2,27, 0,25, u,
22, u,12, u, u, 3,28, u, 23, u, 4,29, u, u,30,31};
x = x | (x >> 1); // Propagate leftmost
x = x | (x >> 2); // 1-bit to the right.
x = x | (x >> 4);
x = x | (x >> 8);
x = x & ~(x >> 16);
x = x*0xFD7049FF; // Activate this line or the following 3.
// x = (x << 9) - x; // Multiply by 511.
// x = (x << 11) - x; // Multiply by 2047.
// x = (x << 14) - x; // Multiply by 16383.
return table[x >> 26];
}
int errors;
void error(int x, int y) {
errors = errors + 1;
printf("Error for x = %08x, got %d\n", x, y);
}
int main() {
int i, n;
static unsigned test[] = {0,32, 1,31, 2,30, 3,30, 4,29, 5,29, 6,29,
7,29, 8,28, 9,28, 16,27, 32,26, 64,25, 128,24, 255,24, 256,23,
512,22, 1024,21, 2048,20, 4096,19, 8192,18, 16384,17, 32768,16,
65536,15, 0x20000,14, 0x40000,13, 0x80000,12, 0x100000,11,
0x200000,10, 0x400000,9, 0x800000,8, 0x1000000,7, 0x2000000,6,
0x4000000,5, 0x8000000,4, 0x0FFFFFFF,4, 0x10000000,3,
0x3000FFFF,2, 0x50003333,1, 0x7FFFFFFF,1, 0x80000000,0,
0xFFFFFFFF,0};
n = sizeof(test)/4;
printf("nlz1:\n");
for (i = 0; i < n; i += 2) {
if (nlz1(test[i]) != test[i+1]) error(test[i], nlz1(test[i]));}
printf("nlz1a:\n");
for (i = 0; i < n; i += 2) {
if (nlz1a(test[i]) != test[i+1]) error(test[i], nlz1a(test[i]));}
printf("nlz2:\n");
for (i = 0; i < n; i += 2) {
if (nlz2(test[i]) != test[i+1]) error(test[i], nlz2(test[i]));}
printf("nlz2a:\n");
for (i = 0; i < n; i += 2) {
if (nlz2a(test[i]) != test[i+1]) error(test[i], nlz2a(test[i]));}
printf("nlz3:\n");
for (i = 0; i < n; i += 2) {
if (nlz3(test[i]) != test[i+1]) error(test[i], nlz3(test[i]));}
printf("nlz4:\n");
for (i = 0; i < n; i += 2) {
if (nlz4(test[i]) != test[i+1]) error(test[i], nlz4(test[i]));}
printf("nlz5:\n");
for (i = 0; i < n; i += 2) {
if (nlz5(test[i]) != test[i+1]) error(test[i], nlz5(test[i]));}
printf("nlz6:\n");
for (i = 0; i < n; i += 2) {
if (nlz6(test[i]) != test[i+1]) error(test[i], nlz6(test[i]));}
printf("nlz7:\n");
for (i = 0; i < n; i += 2) {
if (nlz7(test[i]) != test[i+1]) error(test[i], nlz7(test[i]));}
printf("nlz8:\n");
for (i = 0; i < n; i += 2) {
if (nlz8(test[i]) != test[i+1]) error(test[i], nlz8(test[i]));}
printf("nlz8a:\n");
for (i = 0; i < n; i += 2) {
if (nlz8a(test[i]) != test[i+1]) error(test[i], nlz8a(test[i]));}
printf("nlz9:\n");
for (i = 0; i < n; i += 2) {
if (nlz9(test[i]) != test[i+1]) error(test[i], nlz9(test[i]));}
printf("nlz10:\n");
for (i = 0; i < n; i += 2) {
if (nlz10(test[i]) != test[i+1]) error(test[i], nlz10(test[i]));}
printf("nlz10a:\n");
for (i = 0; i < n; i += 2) {
if (nlz10a(test[i]) != test[i+1]) error(test[i], nlz10a(test[i]));}
printf("nlz10b:\n");
for (i = 0; i < n; i += 2) {
if (nlz10b(test[i]) != test[i+1]) error(test[i], nlz10b(test[i]));}
if (errors == 0)
printf("Passed all %d cases.\n", sizeof(test)/8);
}