|
1 /* |
|
2 --------------------------------------------------------------------------- |
|
3 Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved. |
|
4 |
|
5 LICENSE TERMS |
|
6 |
|
7 The free distribution and use of this software in both source and binary |
|
8 form is allowed (with or without changes) provided that: |
|
9 |
|
10 1. distributions of this source code include the above copyright |
|
11 notice, this list of conditions and the following disclaimer; |
|
12 |
|
13 2. distributions in binary form include the above copyright |
|
14 notice, this list of conditions and the following disclaimer |
|
15 in the documentation and/or other associated materials; |
|
16 |
|
17 3. the copyright holder's name is not used to endorse products |
|
18 built using this software without specific written permission. |
|
19 |
|
20 ALTERNATIVELY, provided that this notice is retained in full, this product |
|
21 may be distributed under the terms of the GNU General Public License (GPL), |
|
22 in which case the provisions of the GPL apply INSTEAD OF those given above. |
|
23 |
|
24 DISCLAIMER |
|
25 |
|
26 This software is provided 'as is' with no explicit or implied warranties |
|
27 in respect of its properties, including, but not limited to, correctness |
|
28 and/or fitness for purpose. |
|
29 --------------------------------------------------------------------------- |
|
30 Issue Date: 13/10/2006 |
|
31 |
|
32 An implementation of field multiplication in Galois Field GF(128) |
|
33 */ |
|
34 |
|
35 #ifndef GF128MUL_H |
|
36 #define GF128MUL_H |
|
37 |
|
38 #include <stdlib.h> |
|
39 #include <string.h> |
|
40 |
|
41 #include "mode_hdr.h" |
|
42 |
|
/* Table sizes for GF(128) Multiply. Normally larger tables give
   higher speed but cache loading might change this. Normally only
   one table size (or none at all) will be specified here
*/

#if 0
#  define TABLES_64K
#endif
#if 1
#  define TABLES_8K
#endif
#if 0
#  define TABLES_4K
#endif
#if 0
#  define TABLES_256
#endif

/* Use of inlines is preferred but code blocks can also be expanded inline
   using 'defines'. But the latter approach will typically generate a LOT
   of code and is not recommended.
*/
#if 0
#  define USE_INLINES
#endif

/* Speed critical loops can be unrolled to gain speed but consume more
   memory
*/
#if 0
#  define UNROLL_LOOPS
#endif
|
75 |
|
/* Multiply a GF128 field element by x. Field elements are held in arrays
   of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower
   indexed bits placed in the more numerically significant bit positions
   within bytes.

   On little endian machines the bit indexes translate into the bit
   positions within four 32-bit words in the following way

   MS            x[0]           LS    MS            x[1]           LS
   ms   ls ms   ls ms   ls ms   ls    ms   ls ms   ls ms   ls ms   ls
   24...31 16...23 08...15 00...07    56...63 48...55 40...47 32...39

   MS            x[2]           LS    MS            x[3]           LS
   ms   ls ms   ls ms   ls ms   ls    ms   ls ms   ls ms   ls ms   ls
   88...95 80...87 72...79 64...71    120.127 112.119 104.111 96..103

   On big endian machines the bit indexes translate into the bit
   positions within four 32-bit words in the following way

   MS            x[0]           LS    MS            x[1]           LS
   ms   ls ms   ls ms   ls ms   ls    ms   ls ms   ls ms   ls ms   ls
   00...07 08...15 16...23 24...31    32...39 40...47 48...55 56...63

   MS            x[2]           LS    MS            x[3]           LS
   ms   ls ms   ls ms   ls ms   ls    ms   ls ms   ls ms   ls ms   ls
   64...71 72...79 80...87 88...95    96..103 104.111 112.119 120.127
*/
|
103 |
|
/* length of a GF(128) field element in bytes */
#define GF_BYTE_LEN 16

#if defined( USE_INLINES )
#  if defined( _MSC_VER )
     /* NOTE(review): the MSVC variant has no 'static', unlike the GNU and
        default variants below, so each inline may gain external linkage in
        several translation units — confirm against the build before changing */
#    define gf_inline __inline
#  elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
#    define gf_inline static inline
#  else
#    define gf_inline static
#  endif
#endif

#if defined(__cplusplus)
extern "C"
{
#endif

/* These functions multiply a field element x, by x^4 and by x^8 in the
   polynomial field representation. It uses 32-bit word operations to
   gain speed but compensates for machine endianess and hence works
   correctly on both styles of machine.
*/
extern const unsigned short gf_tab[256];
|
127 |
|
128 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN |
|
129 |
|
130 /* This section is not needed as GF(128) multiplication is now implemented |
|
131 but is left in place as it provides a template for an alternative little |
|
132 endian implementation approach based on conversion to and from big endian |
|
133 format |
|
134 */ |
|
135 #if 0 |
|
136 |
|
137 /* This is a template for mul_x. The mul_x4 and mul_x8 little endian |
|
138 alternative implementations (and their defined versions) follow the |
|
139 big endian functions below in the same way. |
|
140 */ |
|
141 |
|
142 gf_inline void mul_x(void *r, const void *x) |
|
143 { uint_32t _tt; |
|
144 bswap32_block(r, x, 4); |
|
145 _tt = gf_tab[(ui32_ptr(r)[3] << 7) & 0xff]; |
|
146 ui32_ptr(r)[3] = (ui32_ptr(r)[3] >> 1) | (ui32_ptr(r)[2] << 31); |
|
147 ui32_ptr(r)[2] = (ui32_ptr(r)[2] >> 1) | (ui32_ptr(r)[1] << 31); |
|
148 ui32_ptr(r)[1] = (ui32_ptr(r)[1] >> 1) | (ui32_ptr(r)[0] << 31); |
|
149 ui32_ptr(r)[0] = (ui32_ptr(r)[0] >> 1) ^ bswap_32(_tt); |
|
150 bswap32_block(r, r, 4); |
|
151 } |
|
152 |
|
153 #endif |
|
154 |
|
#define VERSION_1

/* Masks that repeat the byte value 0x80 (resp. 0xf0) in every byte of a
   buffer unit: unit_cast(BFR_UNIT,-1) / 0xff is 0x0101..01 for the unit
   width in use, so MSK_80 is 0x8080..80 and MSK_F0 is 0xf0f0..f0
*/
#define MSK_80 (0x80 * (unit_cast(BFR_UNIT,-1) / 0xff))
#define MSK_F0 (0xf0 * (unit_cast(BFR_UNIT,-1) / 0xff))
|
159 |
|
160 #if defined( USE_INLINES ) |
|
161 |
|
162 #if BFR_UNIT == 64 |
|
163 |
|
164 gf_inline void mul_x(void *r, const void *x) |
|
165 { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80]; |
|
166 |
|
167 ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) & ~MSK_80 | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80; |
|
168 ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80 | (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt; |
|
169 } |
|
170 |
|
171 #if defined( VERSION_1 ) |
|
172 |
|
173 gf_inline void mul_x4(void *x) |
|
174 { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0]; |
|
175 |
|
176 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12) | (ui64_ptr(x)[0] >> 52)) & MSK_F0; |
|
177 ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0 | (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt; |
|
178 } |
|
179 |
|
180 #else |
|
181 |
|
182 gf_inline void mul_x4(void *x) |
|
183 { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0]; |
|
184 bswap64_block(x, x, 2); |
|
185 ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60)); |
|
186 ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt; |
|
187 } |
|
188 |
|
189 #endif |
|
190 |
|
191 gf_inline void mul_x8(void *x) |
|
192 { uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56]; |
|
193 ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56); |
|
194 ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt; |
|
195 } |
|
196 |
|
197 #elif BFR_UNIT == 32 |
|
198 |
|
199 gf_inline void mul_x(void *r, const void *x) |
|
200 { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80]; |
|
201 |
|
202 ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15) | (ui32_ptr(x)[2] >> 17)) & MSK_80; |
|
203 ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15) | (ui32_ptr(x)[1] >> 17)) & MSK_80; |
|
204 ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15) | (ui32_ptr(x)[0] >> 17)) & MSK_80; |
|
205 ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80 | (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt; |
|
206 } |
|
207 |
|
208 #if defined( VERSION_1 ) |
|
209 |
|
210 gf_inline void mul_x4(void *x) |
|
211 { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0]; |
|
212 |
|
213 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12) | (ui32_ptr(x)[2] >> 20)) & MSK_F0; |
|
214 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12) | (ui32_ptr(x)[1] >> 20)) & MSK_F0; |
|
215 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12) | (ui32_ptr(x)[0] >> 20)) & MSK_F0; |
|
216 ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0 | (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt; |
|
217 } |
|
218 |
|
219 #else |
|
220 |
|
221 gf_inline void mul_x4(void *x) |
|
222 { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0]; |
|
223 bswap32_block(x, x, 4); |
|
224 ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28)); |
|
225 ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28)); |
|
226 ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28)); |
|
227 ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt; |
|
228 } |
|
229 |
|
230 #endif |
|
231 |
|
232 gf_inline void mul_x8(void *x) |
|
233 { uint_32t _tt = gf_tab[ui32_ptr(x)[3] >> 24]; |
|
234 |
|
235 ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24); |
|
236 ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24); |
|
237 ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24); |
|
238 ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt; |
|
239 } |
|
240 |
|
241 #else |
|
242 |
|
243 gf_inline void mul_x(void *r, const void *x) |
|
244 { uint_8t _tt = ui8_ptr(x)[15] & 1; |
|
245 ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); |
|
246 ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); |
|
247 ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); |
|
248 ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); |
|
249 ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); |
|
250 ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); |
|
251 ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); |
|
252 ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); |
|
253 ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); |
|
254 ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); |
|
255 ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); |
|
256 ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); |
|
257 ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); |
|
258 ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); |
|
259 ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); |
|
260 ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00); |
|
261 } |
|
262 |
|
263 gf_inline void mul_x4(void *x) |
|
264 { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; |
|
265 ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); |
|
266 ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); |
|
267 ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); |
|
268 ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); |
|
269 ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); |
|
270 ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); |
|
271 ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); |
|
272 ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); |
|
273 ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); |
|
274 ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); |
|
275 ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); |
|
276 ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); |
|
277 ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); |
|
278 ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); |
|
279 ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8); |
|
280 ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff); |
|
281 } |
|
282 |
|
283 gf_inline void mul_x8(void *x) |
|
284 { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; |
|
285 memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); |
|
286 ui8_ptr(x)[1] ^= (_tt >> 8); |
|
287 ui8_ptr(x)[0] = (_tt & 0xff); |
|
288 } |
|
289 |
|
290 #endif |
|
291 |
|
292 #else /* DEFINES */ |
|
293 |
|
294 #if BFR_UNIT == 64 |
|
295 |
|
296 #define mul_x(r, x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80]; \ |
|
297 ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) & ~MSK_80 \ |
|
298 | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80; \ |
|
299 ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80 \ |
|
300 | (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt; \ |
|
301 } while(0) |
|
302 |
|
303 #if defined( VERSION_1 ) |
|
304 |
|
305 #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0]; \ |
|
306 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12) \ |
|
307 | (ui64_ptr(x)[0] >> 52)) & MSK_F0; \ |
|
308 ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0 \ |
|
309 | (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt; \ |
|
310 } while(0) |
|
311 |
|
312 #else |
|
313 |
|
314 #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0]; \ |
|
315 bswap64_block(x, x, 2); \ |
|
316 ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60)); \ |
|
317 ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt; \ |
|
318 } while(0) |
|
319 |
|
320 #endif |
|
321 |
|
322 #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56]; \ |
|
323 ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56); \ |
|
324 ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt; \ |
|
325 } while(0) |
|
326 |
|
327 #elif BFR_UNIT == 32 |
|
328 |
|
329 #define mul_x(r, x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80]; \ |
|
330 ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15) \ |
|
331 | (ui32_ptr(x)[2] >> 17)) & MSK_80; \ |
|
332 ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15) \ |
|
333 | (ui32_ptr(x)[1] >> 17)) & MSK_80; \ |
|
334 ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15) \ |
|
335 | (ui32_ptr(x)[0] >> 17)) & MSK_80; \ |
|
336 ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80 \ |
|
337 | (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt; \ |
|
338 } while(0) |
|
339 |
|
340 #if defined( VERSION_1 ) |
|
341 |
|
342 #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0]; \ |
|
343 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12) \ |
|
344 | (ui32_ptr(x)[2] >> 20)) & MSK_F0; \ |
|
345 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12) \ |
|
346 | (ui32_ptr(x)[1] >> 20)) & MSK_F0; \ |
|
347 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12) \ |
|
348 | (ui32_ptr(x)[0] >> 20)) & MSK_F0; \ |
|
349 ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0 \ |
|
350 | (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt; \ |
|
351 } while(0) |
|
352 |
|
353 #else |
|
354 |
|
355 #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0]; \ |
|
356 bswap32_block(x, x, 4); \ |
|
357 ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28)); \ |
|
358 ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28)); \ |
|
359 ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28)); \ |
|
360 ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt; \ |
|
361 } while(0) |
|
362 |
|
363 #endif |
|
364 |
|
365 #define mul_x8(x) do { uint_32t _tt = gf_tab[ui32_ptr(x)[3] >> 24]; \ |
|
366 ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24); \ |
|
367 ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24); \ |
|
368 ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24); \ |
|
369 ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt; \ |
|
370 } while(0) |
|
371 |
|
372 #else |
|
373 |
|
374 #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1; \ |
|
375 ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \ |
|
376 ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \ |
|
377 ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \ |
|
378 ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \ |
|
379 ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \ |
|
380 ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \ |
|
381 ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \ |
|
382 ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \ |
|
383 ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \ |
|
384 ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \ |
|
385 ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \ |
|
386 ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \ |
|
387 ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \ |
|
388 ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \ |
|
389 ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \ |
|
390 ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00); \ |
|
391 } while(0) |
|
392 |
|
393 #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; \ |
|
394 ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); \ |
|
395 ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); \ |
|
396 ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); \ |
|
397 ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); \ |
|
398 ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); \ |
|
399 ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); \ |
|
400 ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); \ |
|
401 ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); \ |
|
402 ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); \ |
|
403 ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); \ |
|
404 ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); \ |
|
405 ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); \ |
|
406 ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); \ |
|
407 ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); \ |
|
408 ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8); \ |
|
409 ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff); \ |
|
410 } while(0) |
|
411 |
|
412 #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; \ |
|
413 memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); \ |
|
414 ui8_ptr(x)[1] ^= (_tt >> 8); \ |
|
415 ui8_ptr(x)[0] = (_tt & 0xff); \ |
|
416 } while(0) |
|
417 |
|
418 #endif |
|
419 |
|
420 #endif |
|
421 |
|
422 #elif PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN |
|
423 |
|
424 #if defined( USE_INLINES ) |
|
425 |
|
426 #if BFR_UNIT == 64 |
|
427 |
|
428 gf_inline void mul_x(void *r, const void *x) |
|
429 { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff]; |
|
430 ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63); |
|
431 ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48); |
|
432 } |
|
433 |
|
434 gf_inline void mul_x4(void *x) |
|
435 { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff]; |
|
436 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60); |
|
437 ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48); |
|
438 } |
|
439 |
|
440 gf_inline void mul_x8(void *x) |
|
441 { uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff]; |
|
442 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56); |
|
443 ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48); |
|
444 } |
|
445 |
|
446 #elif BFR_UNIT == 32 |
|
447 |
|
448 gf_inline void mul_x(void *r, const void *x) |
|
449 { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff]; |
|
450 ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31); |
|
451 ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31); |
|
452 ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31); |
|
453 ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16); |
|
454 } |
|
455 |
|
456 gf_inline void mul_x4(void *x) |
|
457 { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff]; |
|
458 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28); |
|
459 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28); |
|
460 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28); |
|
461 ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16); |
|
462 } |
|
463 |
|
464 gf_inline void mul_x8(void *x) |
|
465 { uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff]; |
|
466 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24); |
|
467 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24); |
|
468 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24); |
|
469 ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16); |
|
470 } |
|
471 |
|
472 #else |
|
473 |
|
474 gf_inline void mul_x(void *r, const void *x) |
|
475 { uint_8t _tt = ui8_ptr(x)[15] & 1; |
|
476 ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); |
|
477 ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); |
|
478 ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); |
|
479 ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); |
|
480 ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); |
|
481 ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); |
|
482 ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); |
|
483 ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); |
|
484 ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); |
|
485 ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); |
|
486 ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); |
|
487 ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); |
|
488 ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); |
|
489 ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); |
|
490 ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); |
|
491 ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00); |
|
492 } |
|
493 |
|
494 gf_inline void mul_x4(void *x) |
|
495 { |
|
496 uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; |
|
497 ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); |
|
498 ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); |
|
499 ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); |
|
500 ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); |
|
501 ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); |
|
502 ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); |
|
503 ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); |
|
504 ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); |
|
505 ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); |
|
506 ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); |
|
507 ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); |
|
508 ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); |
|
509 ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); |
|
510 ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); |
|
511 ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff); |
|
512 ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8); |
|
513 } |
|
514 |
|
515 gf_inline void mul_x8(void *x) |
|
516 { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; |
|
517 memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); |
|
518 ui8_ptr(x)[1] ^= (_tt & 0xff); |
|
519 ui8_ptr(x)[0] = (_tt >> 8); |
|
520 } |
|
521 |
|
522 #endif |
|
523 |
|
524 #else /* DEFINES */ |
|
525 |
|
526 #if BFR_UNIT == 64 |
|
527 |
|
528 #define mul_x(r, x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff]; \ |
|
529 ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63); \ |
|
530 ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48); \ |
|
531 } while(0) |
|
532 |
|
533 #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff]; \ |
|
534 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60); \ |
|
535 ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48); \ |
|
536 } while(0) |
|
537 |
|
538 #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff]; \ |
|
539 ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56); \ |
|
540 ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48); \ |
|
541 } while(0) |
|
542 |
|
543 #elif BFR_UNIT == 32 |
|
544 |
|
545 #define mul_x(r, x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff]; \ |
|
546 ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31); \ |
|
547 ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31); \ |
|
548 ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31); \ |
|
549 ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16); \ |
|
550 } while(0) |
|
551 |
|
552 #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff]; \ |
|
553 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28); \ |
|
554 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28); \ |
|
555 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28); \ |
|
556 ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16); \ |
|
557 } while(0) |
|
558 |
|
559 #define mul_x8(x) do { uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff]; \ |
|
560 ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24); \ |
|
561 ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24); \ |
|
562 ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24); \ |
|
563 ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16); \ |
|
564 } while(0) |
|
565 |
|
566 #else |
|
567 |
|
568 #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1; \ |
|
569 ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \ |
|
570 ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \ |
|
571 ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \ |
|
572 ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \ |
|
573 ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \ |
|
574 ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \ |
|
575 ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \ |
|
576 ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \ |
|
577 ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \ |
|
578 ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \ |
|
579 ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \ |
|
580 ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \ |
|
581 ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \ |
|
582 ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \ |
|
583 ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \ |
|
584 ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00); \ |
|
585 } while(0) |
|
586 |
|
587 #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; \ |
|
588 ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); \ |
|
589 ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); \ |
|
590 ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); \ |
|
591 ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); \ |
|
592 ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); \ |
|
593 ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); \ |
|
594 ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); \ |
|
595 ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); \ |
|
596 ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); \ |
|
597 ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); \ |
|
598 ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); \ |
|
599 ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); \ |
|
600 ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); \ |
|
601 ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); \ |
|
602 ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff); \ |
|
603 ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8); \ |
|
604 } while(0) |
|
605 |
|
606 #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; \ |
|
607 memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); \ |
|
608 ui8_ptr(x)[1] ^= (_tt & 0xff); \ |
|
609 ui8_ptr(x)[0] = (_tt >> 8); \ |
|
610 } while(0) |
|
611 |
|
612 #endif |
|
613 |
|
614 #endif |
|
615 |
|
616 #else |
|
617 # error Platform byte order has not been set. |
|
618 #endif |
|
619 |
|
/* A slow generic version of gf_mul (a = a * b) */

void gf_mul(void *a, const void* b);
|
623 |
|
/* This version uses 64k bytes of table space on the stack.
   A 16 byte buffer has to be multiplied by a 16 byte key
   value in GF(128). If we consider a GF(128) value in
   the buffer's lowest byte, we can construct a table of
   the 256 16 byte values that result from the 256 values
   of this byte. This requires 4096 bytes. But we also
   need tables for each of the 16 higher bytes in the
   buffer as well, which makes 64 kbytes in total.
*/
|
633 |
|
634 void init_64k_table(unsigned char g[], void *t); |
|
635 typedef uint_32t (*gf_t64k)[256][GF_BYTE_LEN >> 2]; |
|
636 #define tab64k(x) ((gf_t64k)x) |
|
637 #define xor_64k(i,a,t,r) xor_block_aligned(r, tab64k(t)[i][a[i]]) |
|
638 |
|
639 #if defined( USE_INLINES ) |
|
640 |
|
641 #if defined( UNROLL_LOOPS ) |
|
642 |
|
643 gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r) |
|
644 { |
|
645 move_block_aligned(r, tab64k(t)[0][a[0]]); xor_64k( 1, a, t, r); |
|
646 xor_64k( 2, a, t, r); xor_64k( 3, a, t, r); |
|
647 xor_64k( 4, a, t, r); xor_64k( 5, a, t, r); |
|
648 xor_64k( 6, a, t, r); xor_64k( 7, a, t, r); |
|
649 xor_64k( 8, a, t, r); xor_64k( 9, a, t, r); |
|
650 xor_64k(10, a, t, r); xor_64k(11, a, t, r); |
|
651 xor_64k(12, a, t, r); xor_64k(13, a, t, r); |
|
652 xor_64k(14, a, t, r); xor_64k(15, a, t, r); |
|
653 move_block_aligned(a, r); |
|
654 } |
|
655 |
|
656 #else |
|
657 |
|
658 gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r) |
|
659 { int i; |
|
660 move_block_aligned(r, tab64k(t)[0][a[0]]); |
|
661 for(i = 1; i < GF_BYTE_LEN; ++i) |
|
662 xor_64k(i, a, t, r); |
|
663 move_block_aligned(a, r); |
|
664 } |
|
665 |
|
666 #endif |
|
667 |
|
668 #else |
|
669 |
|
670 #if !defined( UNROLL_LOOPS ) |
|
671 |
|
672 #define gf_mul_64k(a, t, r) do { \ |
|
673 move_block_aligned(r, tab64k(t)[0][a[0]]); \ |
|
674 xor_64k( 1, a, t, r); \ |
|
675 xor_64k( 2, a, t, r); xor_64k( 3, a, t, r); \ |
|
676 xor_64k( 4, a, t, r); xor_64k( 5, a, t, r); \ |
|
677 xor_64k( 6, a, t, r); xor_64k( 7, a, t, r); \ |
|
678 xor_64k( 8, a, t, r); xor_64k( 9, a, t, r); \ |
|
679 xor_64k(10, a, t, r); xor_64k(11, a, t, r); \ |
|
680 xor_64k(12, a, t, r); xor_64k(13, a, t, r); \ |
|
681 xor_64k(14, a, t, r); xor_64k(15, a, t, r); \ |
|
682 move_block_aligned(a, r); \ |
|
683 } while(0) |
|
684 |
|
685 #else |
|
686 |
|
687 #define gf_mul_64k(a, t, r) do { int i; \ |
|
688 move_block_aligned(r, tab64k(t)[0][a[0]]); \ |
|
689 for(i = 1; i < GF_BYTE_LEN; ++i) \ |
|
690 { xor_64k(i, a, t, r); \ |
|
691 } \ |
|
692 move_block_aligned(a, r); \ |
|
693 } while(0) |
|
694 |
|
695 #endif |
|
696 |
|
697 #endif |
|
698 |
|
/* This version uses 8k bytes of table space on the stack.
   A 16 byte buffer has to be multiplied by a 16 byte key
   value in GF(128). If we consider a GF(128) value in
   the buffer's lowest 4-bits, we can construct a table of
   the 16 16 byte values that result from the 16 values
   of these 4 bits. This requires 256 bytes. But we also
   need tables for each of the 32 higher 4 bit groups,
   which makes 8 kbytes in total.
*/
|
708 |
|
709 void init_8k_table(unsigned char g[], void *t); |
|
710 |
|
711 typedef uint_32t (*gf_t8k)[16][GF_BYTE_LEN >> 2]; |
|
712 #define tab8k(x) ((gf_t8k)x) |
|
713 #define xor_8k(i,a,t,r) \ |
|
714 xor_block_aligned(r, tab8k(t)[i + i][a[i] & 15]); \ |
|
715 xor_block_aligned(r, tab8k(t)[i + i + 1][a[i] >> 4]) |
|
716 |
|
717 #if defined( USE_INLINES ) |
|
718 |
|
719 #if defined( UNROLL_LOOPS ) |
|
720 |
|
/* GF(128) multiply of the 16 byte buffer a[] by the key whose 8k byte
   table (see init_8k_table) is at t.  r is a 16 byte work buffer
   (assumed suitable for the *_aligned block operations); the product is
   written back into a[].  Fully unrolled variant.
*/
gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r)
{
    /* seed the accumulator from the two nibble tables for byte 0 */
    move_block_aligned(r, tab8k(t)[0][a[0] & 15]);
    xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);
    /* xor in the contributions of bytes 1..15 */
    xor_8k( 1, a, t, r); xor_8k( 2, a, t, r); xor_8k( 3, a, t, r);
    xor_8k( 4, a, t, r); xor_8k( 5, a, t, r); xor_8k( 6, a, t, r); xor_8k( 7, a, t, r);
    xor_8k( 8, a, t, r); xor_8k( 9, a, t, r); xor_8k(10, a, t, r); xor_8k(11, a, t, r);
    xor_8k(12, a, t, r); xor_8k(13, a, t, r); xor_8k(14, a, t, r); xor_8k(15, a, t, r);
    /* copy the product back into the caller's buffer */
    move_block_aligned(a, r);
}
|
731 |
|
732 #else |
|
733 |
|
734 gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r) |
|
735 { int i; |
|
736 memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN); |
|
737 xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]); |
|
738 for(i = 1; i < GF_BYTE_LEN; ++i) |
|
739 { xor_8k(i, a, t, r); |
|
740 } |
|
741 memcpy(a, r, GF_BYTE_LEN); |
|
742 } |
|
743 |
|
744 #endif |
|
745 |
|
746 #else |
|
747 |
|
748 #if defined( UNROLL_LOOPS ) |
|
749 |
|
/* GF(128) multiply of the 16 byte buffer a[] by the key whose 8k byte
   table is at t, accumulating in the work buffer r; the product is
   written back into a[] (fully unrolled variant)
*/
#define gf_mul_8k(a, t, r) do { \
    move_block_aligned(r, tab8k(t)[0][a[0] & 15]); \
    xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]); \
    xor_8k( 1, a, t, r); xor_8k( 2, a, t, r); \
    xor_8k( 3, a, t, r); xor_8k( 4, a, t, r); \
    xor_8k( 5, a, t, r); xor_8k( 6, a, t, r); \
    xor_8k( 7, a, t, r); xor_8k( 8, a, t, r); \
    xor_8k( 9, a, t, r); xor_8k(10, a, t, r); \
    xor_8k(11, a, t, r); xor_8k(12, a, t, r); \
    xor_8k(13, a, t, r); xor_8k(14, a, t, r); \
    xor_8k(15, a, t, r); move_block_aligned(a, r); \
} while(0)
|
762 |
|
763 #else |
|
764 |
|
/* GF(128) multiply of the 16 byte buffer a[] by the key whose 8k byte
   table is at t, accumulating in the work buffer r; the product is
   written back into a[] (loop variant).  The two memcpy calls have been
   replaced with move_block_aligned for consistency with the unrolled
   variant, which performs exactly the same copies with it.
*/
#define gf_mul_8k(a, t, r) do { int i; \
    move_block_aligned(r, tab8k(t)[0][a[0] & 15]); \
    xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]); \
    for(i = 1; i < GF_BYTE_LEN; ++i) \
    { xor_8k(i, a, t, r); \
    } \
    move_block_aligned(a, r); \
} while(0)
|
773 |
|
774 #endif |
|
775 |
|
776 #endif |
|
777 |
|
778 /* This version uses 4k bytes of table space on the stack. |
|
779 A 16 byte buffer has to be multiplied by a 16 byte key |
|
780 value in GF(128). If we consider a GF(128) value in a |
|
781 single byte, we can construct a table of the 256 16 byte |
|
782 values that result from the 256 values of this byte. |
|
783 This requires 4096 bytes. If we take the highest byte in |
|
784 the buffer and use this table to get the result, we then |
|
785 have to multiply by x^120 to get the final value. For the |
|
786 next highest byte the result has to be multiplied by x^112 |
|
787 and so on. But we can do this by accumulating the result |
|
788 in an accumulator starting with the result for the top |
|
789 byte. We repeatedly multiply the accumulator value by |
|
790 x^8 and then add in (i.e. xor) the 16 bytes of the next |
|
791 lower byte in the buffer, stopping when we reach the |
|
792 lowest byte. This requires a 4096 byte table. |
|
793 */ |
|
794 |
|
795 void init_4k_table(unsigned char g[], void *t); |
|
796 |
|
797 typedef uint_32t (*gf_t4k)[GF_BYTE_LEN >> 2]; |
|
798 #define tab4k(x) ((gf_t4k)x) |
|
799 #define xor_4k(i,a,t,r) mul_x8(r); xor_block_aligned(r, tab4k(t)[a[i]]) |
|
800 |
|
801 #if defined( USE_INLINES ) |
|
802 |
|
803 #if defined( UNROLL_LOOPS ) |
|
804 |
|
/* GF(128) multiply of the 16 byte buffer a[] by the key whose 4k byte
   table (see init_4k_table) is at t.  r is a 16 byte work buffer; the
   product is written back into a[].  Accumulation starts with the table
   entry for a[15]; each xor_4k step multiplies the accumulator by x^8
   and xors in the entry for the next lower byte.  Unrolled variant.
*/
gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r)
{
    move_block_aligned(r,tab4k(t)[a[15]]);
    xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r);
    xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r);
    xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r);
    xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r);
    xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r);
    /* copy the product back into the caller's buffer */
    move_block_aligned(a, r);
}
|
815 |
|
816 #else |
|
817 |
|
818 gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r) |
|
819 { int i = 15; |
|
820 move_block_aligned(r,tab4k(t)[a[15]]); |
|
821 while(i--) |
|
822 { |
|
823 xor_4k(i, a, t, r); |
|
824 } |
|
825 move_block_aligned(a, r); |
|
826 } |
|
827 |
|
828 #endif |
|
829 |
|
830 #else |
|
831 |
|
832 #if defined( UNROLL_LOOPS ) |
|
833 |
|
/* GF(128) multiply of the 16 byte buffer a[] by the key whose 4k byte
   table is at t, accumulating in the work buffer r; the product is
   written back into a[] (fully unrolled variant)
*/
#define gf_mul_4k(a, t, r) do { \
    move_block_aligned(r,tab4k(t)[a[15]]); \
    xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r); \
    xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r); \
    xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r); \
    xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r); \
    xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r); \
    move_block_aligned(a, r); \
} while(0)
|
843 |
|
844 #else |
|
845 |
|
/* GF(128) multiply of the 16 byte buffer a[] by the key whose 4k byte
   table is at t, accumulating in the work buffer r; the product is
   written back into a[] (loop variant)
*/
#define gf_mul_4k(a, t, r) do { int i; \
    move_block_aligned(r, tab4k(t)[a[15]]); \
    for(i = 14; i >= 0; --i) \
    { xor_4k(i, a, t, r); \
    } \
    move_block_aligned(a, r); \
} while(0)
|
853 |
|
854 #endif |
|
855 |
|
856 #endif |
|
857 |
|
858 /* This version uses 256 bytes of table space on the stack. |
|
859 A 16 byte buffer has to be multiplied by a 16 byte key |
|
860 value in GF(128). If we consider a GF(128) value in a |
|
861 single 4-bit nibble, we can construct a table of the 16 |
|
16 byte values that result from the 16 values of this
nibble. This requires 256 bytes. If we take the highest
|
864 4-bit nibble in the buffer and use this table to get the |
|
865 result, we then have to multiply by x^124 to get the |
|
final value. For the next highest nibble the result has to
be multiplied by x^120 and so on. But we can do this by
|
868 accumulating the result in an accumulator starting with |
|
869 the result for the top nibble. We repeatedly multiply |
|
870 the accumulator value by x^4 and then add in (i.e. xor) |
|
871 the 16 bytes of the next lower nibble in the buffer, |
|
stopping when we reach the lowest nibble. This uses
|
873 a 256 byte table. |
|
874 */ |
|
875 |
|
876 void init_256_table(unsigned char g[], void *t); |
|
877 |
|
878 typedef uint_32t (*gf_t256)[GF_BYTE_LEN >> 2]; |
|
879 #define tab256(t) ((gf_t256)t) |
|
880 #define xor_256(i,a,t,r) \ |
|
881 mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] & 15]); \ |
|
882 mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] >> 4]) |
|
883 |
|
884 #if defined( USE_INLINES ) |
|
885 |
|
886 #if defined( UNROLL_LOOPS ) |
|
887 |
|
/* GF(128) multiply of the 16 byte buffer a[] by the key whose 256 byte
   table (see init_256_table) is at t.  r is a 16 byte work buffer; the
   product is written back into a[].  Accumulation starts from the two
   nibbles of a[15]; each xor_256 step multiplies the accumulator by x^4
   per nibble and xors in that nibble's entry.  Unrolled variant.
*/
gf_inline void gf_mul_256(unsigned char a[], void *t, void *r)
{
    /* seed the accumulator from the two nibbles of the top byte */
    move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r);
    xor_block_aligned(r, tab256(t)[a[15] >> 4]);
    /* fold in the contributions of bytes 14..0 */
    xor_256(14, a, t, r); xor_256(13, a, t, r);
    xor_256(12, a, t, r); xor_256(11, a, t, r);
    xor_256(10, a, t, r); xor_256( 9, a, t, r);
    xor_256( 8, a, t, r); xor_256( 7, a, t, r);
    xor_256( 6, a, t, r); xor_256( 5, a, t, r);
    xor_256( 4, a, t, r); xor_256( 3, a, t, r);
    xor_256( 2, a, t, r); xor_256( 1, a, t, r);
    xor_256( 0, a, t, r); move_block_aligned(a, r);
}
|
901 |
|
902 #else |
|
903 |
|
904 gf_inline void gf_mul_256(unsigned char a[], void *t, void *r) |
|
905 { int i = 15; |
|
906 move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); |
|
907 xor_block_aligned(r, tab256(t)[a[15] >> 4]); |
|
908 while(i--) |
|
909 { xor_256(i, a, t, r); |
|
910 } |
|
911 move_block_aligned(a, r); |
|
912 } |
|
913 |
|
914 #endif |
|
915 |
|
916 #else |
|
917 |
|
918 #if defined( UNROLL_LOOPS ) |
|
919 |
|
/* GF(128) multiply of the 16 byte buffer a[] by the key whose 256 byte
   table is at t, accumulating in the work buffer r; the product is
   written back into a[] (fully unrolled variant)
*/
#define gf_mul_256(a, t, r) do { \
    move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); \
    xor_block_aligned(r, tab256(t)[a[15] >> 4]); \
    xor_256(14, a, t, r); xor_256(13, a, t, r); \
    xor_256(12, a, t, r); xor_256(11, a, t, r); \
    xor_256(10, a, t, r); xor_256( 9, a, t, r); \
    xor_256( 8, a, t, r); xor_256( 7, a, t, r); \
    xor_256( 6, a, t, r); xor_256( 5, a, t, r); \
    xor_256( 4, a, t, r); xor_256( 3, a, t, r); \
    xor_256( 2, a, t, r); xor_256( 1, a, t, r); \
    xor_256( 0, a, t, r); move_block_aligned(a, r); \
} while(0)
|
932 |
|
933 #else |
|
934 |
|
/* GF(128) multiply of the 16 byte buffer a[] by the key whose 256 byte
   table is at t, accumulating in the work buffer r; the product is
   written back into a[] (loop variant)
*/
#define gf_mul_256(a, t, r) do { int i; \
    move_block_aligned(r, tab256(t)[a[15] & 15]); \
    mul_x4(r); \
    xor_block_aligned(r, tab256(t)[a[15] >> 4]); \
    for(i = 14; i >= 0; --i) \
    { xor_256(i, a, t, r); \
    } \
    move_block_aligned(a, r); \
} while(0)
|
943 |
|
944 #endif |
|
945 |
|
946 #endif |
|
947 |
|
948 #if defined(__cplusplus) |
|
949 } |
|
950 #endif |
|
951 |
|
952 #endif |