--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/servlib/gcm/gf128mul.h Thu Apr 21 14:57:45 2011 +0100
@@ -0,0 +1,952 @@
+/*
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software in both source and binary
+ form is allowed (with or without changes) provided that:
+
+ 1. distributions of this source code include the above copyright
+ notice, this list of conditions and the following disclaimer;
+
+ 2. distributions in binary form include the above copyright
+ notice, this list of conditions and the following disclaimer
+ in the documentation and/or other associated materials;
+
+ 3. the copyright holder's name is not used to endorse products
+ built using this software without specific written permission.
+
+ ALTERNATIVELY, provided that this notice is retained in full, this product
+ may be distributed under the terms of the GNU General Public License (GPL),
+ in which case the provisions of the GPL apply INSTEAD OF those given above.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
+ Issue Date: 13/10/2006
+
+ An implementation of field multiplication in Galois Field GF(128)
+*/
+
+#ifndef GF128MUL_H
+#define GF128MUL_H
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "mode_hdr.h"
+
+/* Table sizes for GF(128) Multiply. Normally larger tables give
+ higher speed but cache loading might change this. Normally only
+ one table size (or none at all) will be specified here
+*/
+
+#if 0
+# define TABLES_64K
+#endif
+#if 1
+# define TABLES_8K
+#endif
+#if 0
+# define TABLES_4K
+#endif
+#if 0
+# define TABLES_256
+#endif
+
+/* Use of inlines is preferred but code blocks can also be expanded inline
+ using 'defines'. But the latter approach will typically generate a LOT
+ of code and is not recommended.
+*/
+#if 0
+# define USE_INLINES
+#endif
+
+/* Speed critical loops can be unrolled to gain speed but consume more
+ memory
+*/
+#if 0
+# define UNROLL_LOOPS
+#endif
+
+/* Multiply a GF128 field element by x. Field elements are held in arrays
+ of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower
+ indexed bits placed in the more numerically significant bit positions
+ within bytes.
+
+ On little endian machines the bit indexes translate into the bit
+ positions within four 32-bit words in the following way
+
+ MS x[0] LS MS x[1] LS
+ ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls
+ 24...31 16...23 08...15 00...07 56...63 48...55 40...47 32...39
+
+ MS x[2] LS MS x[3] LS
+ ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls
+ 88...95 80...87 72...79 64...71 120.127 112.119 104.111 96..103
+
+ On big endian machines the bit indexes translate into the bit
+ positions within four 32-bit words in the following way
+
+ MS x[0] LS MS x[1] LS
+ ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls
+ 00...07 08...15 16...23 24...31 32...39 40...47 48...55 56...63
+
+ MS x[2] LS MS x[3] LS
+ ms ls ms ls ms ls ms ls ms ls ms ls ms ls ms ls
+ 64...71 72...79 80...87 88...95 96..103 104.111 112.119 120.127
+*/
+
+#define GF_BYTE_LEN 16
+
+#if defined( USE_INLINES )
+# if defined( _MSC_VER )
+# define gf_inline __inline
+# elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+# define gf_inline static inline
+# else
+# define gf_inline static
+# endif
+#endif
+
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+
+/* These functions multiply a field element x, by x^4 and by x^8 in the
+ polynomial field representation. It uses 32-bit word operations to
+ gain speed but compensates for machine endianess and hence works
+ correctly on both styles of machine.
+*/
+extern const unsigned short gf_tab[256];
+
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+
+/* This section is not needed as GF(128) multiplication is now implemented
+ but is left in place as it provides a template for an alternative little
+ endian implementation approach based on conversion to and from big endian
+ format
+*/
+#if 0
+
+/* This is a template for mul_x. The mul_x4 and mul_x8 little endian
+ alternative implementations (and their defined versions) follow the
+ big endian functions below in the same way.
+*/
+
+gf_inline void mul_x(void *r, const void *x)
+{ uint_32t _tt;
+ bswap32_block(r, x, 4);
+ _tt = gf_tab[(ui32_ptr(r)[3] << 7) & 0xff];
+ ui32_ptr(r)[3] = (ui32_ptr(r)[3] >> 1) | (ui32_ptr(r)[2] << 31);
+ ui32_ptr(r)[2] = (ui32_ptr(r)[2] >> 1) | (ui32_ptr(r)[1] << 31);
+ ui32_ptr(r)[1] = (ui32_ptr(r)[1] >> 1) | (ui32_ptr(r)[0] << 31);
+ ui32_ptr(r)[0] = (ui32_ptr(r)[0] >> 1) ^ bswap_32(_tt);
+ bswap32_block(r, r, 4);
+}
+
+#endif
+
+#define VERSION_1
+
+#define MSK_80 (0x80 * (unit_cast(BFR_UNIT,-1) / 0xff))
+#define MSK_F0 (0xf0 * (unit_cast(BFR_UNIT,-1) / 0xff))
+
+#if defined( USE_INLINES )
+
+#if BFR_UNIT == 64
+
+ gf_inline void mul_x(void *r, const void *x)
+ { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80];
+
+ ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) & ~MSK_80 | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80;
+ ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80 | (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt;
+ }
+
+ #if defined( VERSION_1 )
+
+ gf_inline void mul_x4(void *x)
+ { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0];
+
+ ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12) | (ui64_ptr(x)[0] >> 52)) & MSK_F0;
+ ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0 | (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt;
+ }
+
+ #else
+
+ gf_inline void mul_x4(void *x)
+ { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0];
+ bswap64_block(x, x, 2);
+ ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60));
+ ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt;
+ }
+
+ #endif
+
+ gf_inline void mul_x8(void *x)
+ { uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56];
+ ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56);
+ ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt;
+ }
+
+#elif BFR_UNIT == 32
+
+ gf_inline void mul_x(void *r, const void *x)
+ { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80];
+
+ ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15) | (ui32_ptr(x)[2] >> 17)) & MSK_80;
+ ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15) | (ui32_ptr(x)[1] >> 17)) & MSK_80;
+ ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15) | (ui32_ptr(x)[0] >> 17)) & MSK_80;
+ ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80 | (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt;
+ }
+
+ #if defined( VERSION_1 )
+
+ gf_inline void mul_x4(void *x)
+ { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0];
+
+ ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12) | (ui32_ptr(x)[2] >> 20)) & MSK_F0;
+ ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12) | (ui32_ptr(x)[1] >> 20)) & MSK_F0;
+ ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12) | (ui32_ptr(x)[0] >> 20)) & MSK_F0;
+ ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0 | (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt;
+ }
+
+ #else
+
+ gf_inline void mul_x4(void *x)
+ { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0];
+ bswap32_block(x, x, 4);
+ ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28));
+ ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28));
+ ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28));
+ ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt;
+ }
+
+ #endif
+
+ gf_inline void mul_x8(void *x)
+ { uint_32t _tt = gf_tab[ui32_ptr(x)[3] >> 24];
+
+ ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24);
+ ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24);
+ ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24);
+ ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt;
+ }
+
+#else
+
+ gf_inline void mul_x(void *r, const void *x)
+ { uint_8t _tt = ui8_ptr(x)[15] & 1;
+ ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7);
+ ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7);
+ ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7);
+ ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7);
+ ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7);
+ ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7);
+ ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7);
+ ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7);
+ ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7);
+ ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7);
+ ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7);
+ ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7);
+ ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7);
+ ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7);
+ ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7);
+ ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);
+ }
+
+ gf_inline void mul_x4(void *x)
+ { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];
+ ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);
+ ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);
+ ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);
+ ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);
+ ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);
+ ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);
+ ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);
+ ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);
+ ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);
+ ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);
+ ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);
+ ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);
+ ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);
+ ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);
+ ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8);
+ ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff);
+ }
+
+ gf_inline void mul_x8(void *x)
+ { uint_16t _tt = gf_tab[ui8_ptr(x)[15]];
+ memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);
+ ui8_ptr(x)[1] ^= (_tt >> 8);
+ ui8_ptr(x)[0] = (_tt & 0xff);
+ }
+
+#endif
+
+#else /* DEFINES */
+
+#if BFR_UNIT == 64
+
+ #define mul_x(r, x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80]; \
+ ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) & ~MSK_80 \
+ | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80; \
+ ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80 \
+ | (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt; \
+ } while(0)
+
+ #if defined( VERSION_1 )
+
+ #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0]; \
+ ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12) \
+ | (ui64_ptr(x)[0] >> 52)) & MSK_F0; \
+ ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0 \
+ | (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt; \
+ } while(0)
+
+ #else
+
+ #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0]; \
+ bswap64_block(x, x, 2); \
+ ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60)); \
+ ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt; \
+ } while(0)
+
+ #endif
+
+ #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56]; \
+ ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56); \
+ ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt; \
+ } while(0)
+
+#elif BFR_UNIT == 32
+
+ #define mul_x(r, x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80]; \
+ ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15) \
+ | (ui32_ptr(x)[2] >> 17)) & MSK_80; \
+ ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15) \
+ | (ui32_ptr(x)[1] >> 17)) & MSK_80; \
+ ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15) \
+ | (ui32_ptr(x)[0] >> 17)) & MSK_80; \
+ ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80 \
+ | (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt; \
+ } while(0)
+
+ #if defined( VERSION_1 )
+
+ #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0]; \
+ ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12) \
+ | (ui32_ptr(x)[2] >> 20)) & MSK_F0; \
+ ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12) \
+ | (ui32_ptr(x)[1] >> 20)) & MSK_F0; \
+ ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12) \
+ | (ui32_ptr(x)[0] >> 20)) & MSK_F0; \
+ ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0 \
+ | (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt; \
+ } while(0)
+
+ #else
+
+ #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0]; \
+ bswap32_block(x, x, 4); \
+ ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28)); \
+ ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28)); \
+ ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28)); \
+ ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt; \
+ } while(0)
+
+ #endif
+
+#define mul_x8(x) do { uint_32t _tt = gf_tab[ui32_ptr(x)[3] >> 24]; \
+ ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24); \
+ ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24); \
+ ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24); \
+ ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt; \
+ } while(0)
+
+#else
+
+ #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1; \
+ ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \
+ ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \
+ ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \
+ ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \
+ ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \
+ ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \
+ ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \
+ ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \
+ ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \
+ ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \
+ ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \
+ ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \
+ ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \
+ ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \
+ ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \
+ ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00); \
+ } while(0)
+
+ #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; \
+ ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); \
+ ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); \
+ ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); \
+ ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); \
+ ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); \
+ ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); \
+ ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); \
+ ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); \
+ ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); \
+ ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); \
+ ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); \
+ ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); \
+ ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); \
+ ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); \
+ ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8); \
+ ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff); \
+ } while(0)
+
+ #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; \
+ memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); \
+ ui8_ptr(x)[1] ^= (_tt >> 8); \
+ ui8_ptr(x)[0] = (_tt & 0xff); \
+ } while(0)
+
+#endif
+
+#endif
+
+#elif PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+
+#if defined( USE_INLINES )
+
+#if BFR_UNIT == 64
+
+ gf_inline void mul_x(void *r, const void *x)
+ { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff];
+ ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63);
+ ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48);
+ }
+
+ gf_inline void mul_x4(void *x)
+ { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff];
+ ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60);
+ ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48);
+ }
+
+ gf_inline void mul_x8(void *x)
+ { uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff];
+ ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56);
+ ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48);
+ }
+
+#elif BFR_UNIT == 32
+
+ gf_inline void mul_x(void *r, const void *x)
+ { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff];
+ ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31);
+ ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31);
+ ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31);
+ ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16);
+ }
+
+ gf_inline void mul_x4(void *x)
+ { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff];
+ ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28);
+ ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28);
+ ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28);
+ ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16);
+ }
+
+ gf_inline void mul_x8(void *x)
+ { uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff];
+ ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24);
+ ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24);
+ ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24);
+ ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16);
+ }
+
+#else
+
+ gf_inline void mul_x(void *r, const void *x)
+ { uint_8t _tt = ui8_ptr(x)[15] & 1;
+ ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7);
+ ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7);
+ ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7);
+ ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7);
+ ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7);
+ ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7);
+ ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7);
+ ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7);
+ ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7);
+ ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7);
+ ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7);
+ ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7);
+ ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7);
+ ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7);
+ ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7);
+ ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);
+ }
+
+ gf_inline void mul_x4(void *x)
+ {
+ uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];
+ ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);
+ ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);
+ ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);
+ ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);
+ ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);
+ ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);
+ ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);
+ ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);
+ ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);
+ ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);
+ ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);
+ ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);
+ ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);
+ ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);
+ ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff);
+ ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8);
+ }
+
+ gf_inline void mul_x8(void *x)
+ { uint_16t _tt = gf_tab[ui8_ptr(x)[15]];
+ memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);
+ ui8_ptr(x)[1] ^= (_tt & 0xff);
+ ui8_ptr(x)[0] = (_tt >> 8);
+ }
+
+#endif
+
+#else /* DEFINES */
+
+#if BFR_UNIT == 64
+
+ #define mul_x(r, x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff]; \
+ ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63); \
+ ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48); \
+ } while(0)
+
+ #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff]; \
+ ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60); \
+ ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48); \
+ } while(0)
+
+ #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff]; \
+ ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56); \
+ ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48); \
+ } while(0)
+
+#elif BFR_UNIT == 32
+
+ #define mul_x(r, x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff]; \
+ ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31); \
+ ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31); \
+ ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31); \
+ ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16); \
+ } while(0)
+
+ #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff]; \
+ ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28); \
+ ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28); \
+ ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28); \
+ ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16); \
+ } while(0)
+
+ #define mul_x8(x) do { uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff]; \
+ ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24); \
+ ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24); \
+ ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24); \
+ ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16); \
+ } while(0)
+
+#else
+
+ #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1; \
+ ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \
+ ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \
+ ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \
+ ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \
+ ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \
+ ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \
+ ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \
+ ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \
+ ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \
+ ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \
+ ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \
+ ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \
+ ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \
+ ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \
+ ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \
+ ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00); \
+ } while(0)
+
+ #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; \
+ ui8_ptr(x)[15] = (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4); \
+ ui8_ptr(x)[14] = (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4); \
+ ui8_ptr(x)[13] = (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4); \
+ ui8_ptr(x)[12] = (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4); \
+ ui8_ptr(x)[11] = (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4); \
+ ui8_ptr(x)[10] = (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4); \
+ ui8_ptr(x)[ 9] = (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4); \
+ ui8_ptr(x)[ 8] = (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4); \
+ ui8_ptr(x)[ 7] = (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4); \
+ ui8_ptr(x)[ 6] = (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4); \
+ ui8_ptr(x)[ 5] = (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4); \
+ ui8_ptr(x)[ 4] = (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4); \
+ ui8_ptr(x)[ 3] = (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4); \
+ ui8_ptr(x)[ 2] = (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4); \
+ ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff); \
+ ui8_ptr(x)[ 0] = (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8); \
+ } while(0)
+
+ #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]]; \
+ memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15); \
+ ui8_ptr(x)[1] ^= (_tt & 0xff); \
+ ui8_ptr(x)[0] = (_tt >> 8); \
+ } while(0)
+
+#endif
+
+#endif
+
+#else
+# error Platform byte order has not been set.
+#endif
+
+/* A slow generic version of gf_mul (a = a * b) */
+
+void gf_mul(void *a, const void* b);
+
+/* This version uses 64k bytes of table space on the stack.
+ A 16 byte buffer has to be multiplied by a 16 byte key
+ value in GF(128). If we consider a GF(128) value in
+ the buffer's lowest byte, we can construct a table of
+ the 256 16 byte values that result from the 256 values
+ of this byte. This requires 4096 bytes. But we also
+ need tables for each of the 16 higher bytes in the
+ buffer as well, which makes 64 kbytes in total.
+*/
+
+void init_64k_table(unsigned char g[], void *t);
+typedef uint_32t (*gf_t64k)[256][GF_BYTE_LEN >> 2];
+#define tab64k(x) ((gf_t64k)x)
+#define xor_64k(i,a,t,r) xor_block_aligned(r, tab64k(t)[i][a[i]])
+
+#if defined( USE_INLINES )
+
+#if defined( UNROLL_LOOPS )
+
+gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r)
+{
+ move_block_aligned(r, tab64k(t)[0][a[0]]); xor_64k( 1, a, t, r);
+ xor_64k( 2, a, t, r); xor_64k( 3, a, t, r);
+ xor_64k( 4, a, t, r); xor_64k( 5, a, t, r);
+ xor_64k( 6, a, t, r); xor_64k( 7, a, t, r);
+ xor_64k( 8, a, t, r); xor_64k( 9, a, t, r);
+ xor_64k(10, a, t, r); xor_64k(11, a, t, r);
+ xor_64k(12, a, t, r); xor_64k(13, a, t, r);
+ xor_64k(14, a, t, r); xor_64k(15, a, t, r);
+ move_block_aligned(a, r);
+}
+
+#else
+
+gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r)
+{ int i;
+ move_block_aligned(r, tab64k(t)[0][a[0]]);
+ for(i = 1; i < GF_BYTE_LEN; ++i)
+ xor_64k(i, a, t, r);
+ move_block_aligned(a, r);
+}
+
+#endif
+
+#else
+
+#if !defined( UNROLL_LOOPS )
+
+#define gf_mul_64k(a, t, r) do { \
+ move_block_aligned(r, tab64k(t)[0][a[0]]); \
+ xor_64k( 1, a, t, r); \
+ xor_64k( 2, a, t, r); xor_64k( 3, a, t, r); \
+ xor_64k( 4, a, t, r); xor_64k( 5, a, t, r); \
+ xor_64k( 6, a, t, r); xor_64k( 7, a, t, r); \
+ xor_64k( 8, a, t, r); xor_64k( 9, a, t, r); \
+ xor_64k(10, a, t, r); xor_64k(11, a, t, r); \
+ xor_64k(12, a, t, r); xor_64k(13, a, t, r); \
+ xor_64k(14, a, t, r); xor_64k(15, a, t, r); \
+ move_block_aligned(a, r); \
+} while(0)
+
+#else
+
+#define gf_mul_64k(a, t, r) do { int i; \
+ move_block_aligned(r, tab64k(t)[0][a[0]]); \
+ for(i = 1; i < GF_BYTE_LEN; ++i) \
+ { xor_64k(i, a, t, r); \
+ } \
+ move_block_aligned(a, r); \
+} while(0)
+
+#endif
+
+#endif
+
+/* This version uses 8k bytes of table space on the stack.
+ A 16 byte buffer has to be multiplied by a 16 byte key
+ value in GF(128). If we consider a GF(128) value in
+ the buffer's lowest 4-bits, we can construct a table of
+ the 16 16 byte values that result from the 16 values
+ of these 4 bits. This requires 256 bytes. But we also
+ need tables for each of the 32 higher 4 bit groups,
+ which makes 8 kbytes in total.
+*/
+
+void init_8k_table(unsigned char g[], void *t);
+
+typedef uint_32t (*gf_t8k)[16][GF_BYTE_LEN >> 2];
+#define tab8k(x) ((gf_t8k)x)
+#define xor_8k(i,a,t,r) \
+ xor_block_aligned(r, tab8k(t)[i + i][a[i] & 15]); \
+ xor_block_aligned(r, tab8k(t)[i + i + 1][a[i] >> 4])
+
+#if defined( USE_INLINES )
+
+#if defined( UNROLL_LOOPS )
+
+gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r)
+{
+ move_block_aligned(r, tab8k(t)[0][a[0] & 15]);
+ xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);
+ xor_8k( 1, a, t, r); xor_8k( 2, a, t, r); xor_8k( 3, a, t, r);
+ xor_8k( 4, a, t, r); xor_8k( 5, a, t, r); xor_8k( 6, a, t, r); xor_8k( 7, a, t, r);
+ xor_8k( 8, a, t, r); xor_8k( 9, a, t, r); xor_8k(10, a, t, r); xor_8k(11, a, t, r);
+ xor_8k(12, a, t, r); xor_8k(13, a, t, r); xor_8k(14, a, t, r); xor_8k(15, a, t, r);
+ move_block_aligned(a, r);
+}
+
+#else
+
+gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r)
+{ int i;
+ memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN);
+ xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);
+ for(i = 1; i < GF_BYTE_LEN; ++i)
+ { xor_8k(i, a, t, r);
+ }
+ memcpy(a, r, GF_BYTE_LEN);
+}
+
+#endif
+
+#else
+
+#if defined( UNROLL_LOOPS )
+
+#define gf_mul_8k(a, t, r) do { \
+ move_block_aligned(r, tab8k(t)[0][a[0] & 15]); \
+ xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]); \
+ xor_8k( 1, a, t, r); xor_8k( 2, a, t, r); \
+ xor_8k( 3, a, t, r); xor_8k( 4, a, t, r); \
+ xor_8k( 5, a, t, r); xor_8k( 6, a, t, r); \
+ xor_8k( 7, a, t, r); xor_8k( 8, a, t, r); \
+ xor_8k( 9, a, t, r); xor_8k(10, a, t, r); \
+ xor_8k(11, a, t, r); xor_8k(12, a, t, r); \
+ xor_8k(13, a, t, r); xor_8k(14, a, t, r); \
+ xor_8k(15, a, t, r); move_block_aligned(a, r); \
+} while(0)
+
+#else
+
+#define gf_mul_8k(a, t, r) do { int i; \
+ memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN); \
+ xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]); \
+ for(i = 1; i < GF_BYTE_LEN; ++i) \
+ { xor_8k(i, a, t, r); \
+ } \
+ memcpy(a, r, GF_BYTE_LEN); \
+} while(0)
+
+#endif
+
+#endif
+
+/* This version uses 4k bytes of table space on the stack.
+ A 16 byte buffer has to be multiplied by a 16 byte key
+ value in GF(128). If we consider a GF(128) value in a
+ single byte, we can construct a table of the 256 16 byte
+ values that result from the 256 values of this byte.
+ This requires 4096 bytes. If we take the highest byte in
+ the buffer and use this table to get the result, we then
+ have to multiply by x^120 to get the final value. For the
+ next highest byte the result has to be multiplied by x^112
+ and so on. But we can do this by accumulating the result
+ in an accumulator starting with the result for the top
+ byte. We repeatedly multiply the accumulator value by
+ x^8 and then add in (i.e. xor) the 16 bytes of the next
+ lower byte in the buffer, stopping when we reach the
+ lowest byte. This requires a 4096 byte table.
+*/
+
+void init_4k_table(unsigned char g[], void *t);
+
+typedef uint_32t (*gf_t4k)[GF_BYTE_LEN >> 2];
+#define tab4k(x) ((gf_t4k)x)
+#define xor_4k(i,a,t,r) mul_x8(r); xor_block_aligned(r, tab4k(t)[a[i]])
+
+#if defined( USE_INLINES )
+
+#if defined( UNROLL_LOOPS )
+
+gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r)
+{
+ move_block_aligned(r,tab4k(t)[a[15]]);
+ xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r);
+ xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r);
+ xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r);
+ xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r);
+ xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r);
+ move_block_aligned(a, r);
+}
+
+#else
+
+gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r)
+{ int i = 15;
+ move_block_aligned(r,tab4k(t)[a[15]]);
+ while(i--)
+ {
+ xor_4k(i, a, t, r);
+ }
+ move_block_aligned(a, r);
+}
+
+#endif
+
+#else
+
+#if defined( UNROLL_LOOPS )
+
+#define gf_mul_4k(a, t, r) do { \
+ move_block_aligned(r,tab4k(t)[a[15]]); \
+ xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r); \
+ xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r); \
+ xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r); \
+ xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r); \
+ xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r); \
+ move_block_aligned(a, r); \
+} while(0)
+
+#else
+
+#define gf_mul_4k(a, t, r) do { int i = 15; \
+ move_block_aligned(r,tab4k(t)[a[15]]); \
+ while(i--) \
+ { xor_4k(i, a, t, r); \
+ } \
+ move_block_aligned(a, r); \
+} while(0)
+
+#endif
+
+#endif
+
+/* This version uses 256 bytes of table space on the stack.
+ A 16 byte buffer has to be multiplied by a 16 byte key
+ value in GF(128). If we consider a GF(128) value in a
+ single 4-bit nibble, we can construct a table of the 16
+ 16 byte values that result from the 16 values of this
+ byte. This requires 256 bytes. If we take the highest
+ 4-bit nibble in the buffer and use this table to get the
+ result, we then have to multiply by x^124 to get the
+ final value. For the next highest byte the result has to
+ be multiplied by x^120 and so on. But we can do this by
+ accumulating the result in an accumulator starting with
+ the result for the top nibble. We repeatedly multiply
+ the accumulator value by x^4 and then add in (i.e. xor)
+ the 16 bytes of the next lower nibble in the buffer,
+ stopping when we reach the lowest nibblebyte. This uses
+ a 256 byte table.
+*/
+
+void init_256_table(unsigned char g[], void *t);
+
+typedef uint_32t (*gf_t256)[GF_BYTE_LEN >> 2];
+#define tab256(t) ((gf_t256)t)
+#define xor_256(i,a,t,r) \
+ mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] & 15]); \
+ mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] >> 4])
+
+#if defined( USE_INLINES )
+
+#if defined( UNROLL_LOOPS )
+
+gf_inline void gf_mul_256(unsigned char a[], void *t, void *r)
+{
+ move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r);
+ xor_block_aligned(r, tab256(t)[a[15] >> 4]);
+ xor_256(14, a, t, r); xor_256(13, a, t, r);
+ xor_256(12, a, t, r); xor_256(11, a, t, r);
+ xor_256(10, a, t, r); xor_256( 9, a, t, r);
+ xor_256( 8, a, t, r); xor_256( 7, a, t, r);
+ xor_256( 6, a, t, r); xor_256( 5, a, t, r);
+ xor_256( 4, a, t, r); xor_256( 3, a, t, r);
+ xor_256( 2, a, t, r); xor_256( 1, a, t, r);
+ xor_256( 0, a, t, r); move_block_aligned(a, r);
+}
+
+#else
+
+gf_inline void gf_mul_256(unsigned char a[], void *t, void *r)
+{ int i = 15;
+ move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r);
+ xor_block_aligned(r, tab256(t)[a[15] >> 4]);
+ while(i--)
+ { xor_256(i, a, t, r);
+ }
+ move_block_aligned(a, r);
+}
+
+#endif
+
+#else
+
+#if defined( UNROLL_LOOPS )
+
+#define gf_mul_256(a, t, r) do { \
+ move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); \
+ xor_block_aligned(r, tab256(t)[a[15] >> 4]); \
+ xor_256(14, a, t, r); xor_256(13, a, t, r); \
+ xor_256(12, a, t, r); xor_256(11, a, t, r); \
+ xor_256(10, a, t, r); xor_256( 9, a, t, r); \
+ xor_256( 8, a, t, r); xor_256( 7, a, t, r); \
+ xor_256( 6, a, t, r); xor_256( 5, a, t, r); \
+ xor_256( 4, a, t, r); xor_256( 3, a, t, r); \
+ xor_256( 2, a, t, r); xor_256( 1, a, t, r); \
+ xor_256( 0, a, t, r); move_block_aligned(a, r); \
+} while(0)
+
+#else
+
+#define gf_mul_256(a, t, r) do { int i = 15; \
+ move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); \
+ xor_block_aligned(r, tab256(t)[a[15] >> 4]); \
+ while(i--) \
+ { xor_256(i, a, t, r); \
+ } \
+ move_block_aligned(a, r); \
+} while(0)
+
+#endif
+
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif