DTN2-BPQ: comparison servlib/gcm/gf128mul.h

equal deleted inserted replaced

--1:000000000000
+:2b3e5ec03512
+/*
+---------------------------------------------------------------------------
+Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
+LICENSE TERMS
+The free distribution and use of this software in both source and binary
+form is allowed (with or without changes) provided that:
+1. distributions of this source code include the above copyright
+notice, this list of conditions and the following disclaimer;
+2. distributions in binary form include the above copyright
+notice, this list of conditions and the following disclaimer
+in the documentation and/or other associated materials;
+3. the copyright holder's name is not used to endorse products
+built using this software without specific written permission.
+ALTERNATIVELY, provided that this notice is retained in full, this product
+may be distributed under the terms of the GNU General Public License (GPL),
+in which case the provisions of the GPL apply INSTEAD OF those given above.
+DISCLAIMER
+This software is provided 'as is' with no explicit or implied warranties
+in respect of its properties, including, but not limited to, correctness
+and/or fitness for purpose.
+---------------------------------------------------------------------------
+Issue Date: 13/10/2006
+An implementation of field multiplication in the Galois field GF(2^128)
+*/
+#ifndef GF128MUL_H
+#define GF128MUL_H
+#include <stdlib.h>
+#include <string.h>
+#include "mode_hdr.h"
+/*  Table sizes for GF(128) Multiply.  Normally larger tables give
+higher speed but cache loading might change this. Normally only
+one table size (or none at all) will be specified here
+*/
+#if 0
+#  define TABLES_64K
+#endif
+#if 1
+#  define TABLES_8K   /* the one table size switched on in this copy of the file */
+#endif
+#if 0
+#  define TABLES_4K
+#endif
+#if 0
+#  define TABLES_256
+#endif
+/*  Use of inlines is preferred but code blocks can also be expanded inline
+using 'defines'.  But the latter approach will typically generate a LOT
+of code and is not recommended.
+*/
+#if 0
+#  define USE_INLINES   /* left undefined by this file, so the macro ('DEFINES') branches below are the ones compiled */
+#endif
+/*  Speed critical loops can be unrolled to gain speed but consume more
+memory
+*/
+#if 0
+#  define UNROLL_LOOPS   /* left undefined by this file; tested by the gf_mul_64k, gf_mul_8k and gf_mul_4k definitions below */
+#endif
+/*  Multiply a GF128 field element by x. Field elements are held in arrays
+of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower
+indexed bits placed in the more numerically significant bit positions
+within bytes.
+On little endian machines the bit indexes translate into the bit
+positions within four 32-bit words in the following way
+MS            x[0]           LS  MS            x[1]           LS
+ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+24...31 16...23 08...15 00...07  56...63 48...55 40...47 32...39
+MS            x[2]           LS  MS            x[3]           LS
+ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+88...95 80...87 72...79 64...71  120.127 112.119 104.111 96..103
+On big endian machines the bit indexes translate into the bit
+positions within four 32-bit words in the following way
+MS            x[0]           LS  MS            x[1]           LS
+ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+00...07 08...15 16...23 24...31  32...39 40...47 48...55 56...63
+MS            x[2]           LS  MS            x[3]           LS
+ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
+64...71 72...79 80...87 88...95  96..103 104.111 112.119 120.127
+*/
+#define GF_BYTE_LEN 16   /* size of one field element in bytes (16 * 8 = 128 bits) */
+#if defined( USE_INLINES )   /* gf_inline is only defined when the inline function forms are selected */
+#  if defined( _MSC_VER )
+#    define gf_inline __inline   /* Microsoft compiler spelling */
+#  elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
+#    define gf_inline static inline   /* GNU compiler / GNU C library */
+#  else
+#    define gf_inline static   /* fallback: an ordinary static function */
+#  endif
+#endif
+#if defined(__cplusplus)
+extern "C"
+{
+#endif
+/*  These functions multiply a field element by x, by x^4 and by x^8 in the
+polynomial field representation. They use word operations (64-bit, 32-bit
+or byte wide, as selected by BFR_UNIT) to gain speed but compensate for
+machine endianness and hence work correctly on both styles of machine.
+*/
+extern const unsigned short gf_tab[256];   /* defined elsewhere; the table driven routines below
+    index it with the bits shifted out of a field element and XOR the 16-bit entry into
+    the first two bytes of the result (presumably the reduction terms for those bits -
+    TODO confirm against the file that defines the table) */
+#if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
+/*  This section is not needed as GF(128) multiplication is now implemented
+but is left in place as it provides a template for an alternative little
+endian implementation approach based on conversion to and from big endian
+format
+*/
+#if 0   /* never compiled: kept only as a template, as the comment that follows explains */
+/*  This is a template for mul_x.  The mul_x4 and mul_x8 little endian
+alternative implementations (and their defined versions) follow the
+big endian functions below in the same way.
+*/
+gf_inline void mul_x(void *r, const void *x)
+{   uint_32t _tt;
+bswap32_block(r, x, 4);   /* NOTE(review): presumably copies x to r with each of the four 32-bit words byte swapped - bswap32_block is declared elsewhere */
+_tt = gf_tab[(ui32_ptr(r)[3] << 7) & 0xff];   /* low bit of word 3 moved up to bit 7 selects the table entry (index 0x00 or 0x80) */
+ui32_ptr(r)[3] = (ui32_ptr(r)[3] >> 1) | (ui32_ptr(r)[2] << 31);   /* shift right one bit, carrying the low bit of word 2 into the top of word 3 */
+ui32_ptr(r)[2] = (ui32_ptr(r)[2] >> 1) | (ui32_ptr(r)[1] << 31);
+ui32_ptr(r)[1] = (ui32_ptr(r)[1] >> 1) | (ui32_ptr(r)[0] << 31);
+ui32_ptr(r)[0] = (ui32_ptr(r)[0] >> 1) ^ bswap_32(_tt);   /* word 0 has no carry in; the table entry is XORed in after bswap_32 */
+bswap32_block(r, r, 4);   /* presumably converts r back to the original byte order */
+}
+#endif
+#define VERSION_1   /* selects the mask based mul_x4 variants below in preference to the byte swapping ones */
+#define MSK_80   (0x80 * (unit_cast(BFR_UNIT,-1) / 0xff))   /* 0x80 repeated in every byte of a BFR_UNIT-bit word, assuming unit_cast(BFR_UNIT,-1) yields an all-ones word - TODO confirm in mode_hdr.h */
+#define MSK_F0   (0xf0 * (unit_cast(BFR_UNIT,-1) / 0xff))   /* 0xf0 repeated in every byte, under the same assumption */
+#if defined( USE_INLINES )
+#if BFR_UNIT == 64
+gf_inline void mul_x(void *r, const void *x)   /* little endian, 64-bit words: r = x moved on by one bit position */
+{   uint_64t  _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80];   /* bit 56 of word 1 (low bit of its top byte) lands on bit 7, so the index is 0x00 or 0x80 */
+ui64_ptr(r)[1] =  (ui64_ptr(x)[1] >> 1) & ~MSK_80 | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80;   /* & binds tighter than |: every byte is shifted right by one and its top bit is refilled from the low bit of the next lower byte (the top byte of word 0 feeds the bottom byte of word 1) */
+ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80 |  (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt;   /* same per-byte shift for word 0, then the table entry is XORed into its low 16 bits */
+}
+#if defined( VERSION_1 )   /* mask based mul_x4 - this is the variant compiled, as VERSION_1 is defined above */
+gf_inline void mul_x4(void *x)   /* in place: x moved on by four bit positions */
+{   uint_64t   _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0];   /* bits 56..59 of word 1 (low nibble of its top byte) become the high nibble of the index */
+ui64_ptr(x)[1] =  (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12) | (ui64_ptr(x)[0] >> 52)) & MSK_F0;   /* every byte >> 4; its high nibble is refilled from the low nibble of the next lower byte */
+ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0 |  (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt;   /* word 0 likewise, then the table entry is XORed into its low 16 bits */
+}
+#else   /* alternative mul_x4 built on bswap64_block / bswap_64 (declared elsewhere) - not compiled while VERSION_1 is defined */
+gf_inline void mul_x4(void *x)
+{   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0];
+bswap64_block(x, x, 2);
+ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60));
+ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt;
+}
+#endif
+gf_inline void mul_x8(void *x)   /* in place: x moved on by one whole byte */
+{   uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56];   /* the top byte of word 1 is shifted out and used as the table index */
+ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56);   /* word 1 moves up a byte and takes the top byte of word 0 as its bottom byte */
+ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt;   /* the vacated low byte and the byte above it receive the table entry */
+}
+#elif BFR_UNIT == 32
+gf_inline void mul_x(void *r, const void *x)   /* little endian, 32-bit words: r = x moved on by one bit position */
+{   uint_32t  _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80];   /* bit 24 of word 3 (low bit of its top byte) lands on bit 7, so the index is 0x00 or 0x80 */
+ui32_ptr(r)[3] =  (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15) | (ui32_ptr(x)[2] >> 17)) & MSK_80;   /* & binds tighter than |: every byte >> 1, top bit refilled from the low bit of the next lower byte (word 2's top byte feeds word 3's bottom byte) */
+ui32_ptr(r)[2] =  (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15) | (ui32_ptr(x)[1] >> 17)) & MSK_80;
+ui32_ptr(r)[1] =  (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15) | (ui32_ptr(x)[0] >> 17)) & MSK_80;
+ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80 |  (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt;   /* word 0 has no carry in; the table entry is XORed into its low 16 bits */
+}
+#if defined( VERSION_1 )   /* mask based mul_x4 - this is the variant compiled, as VERSION_1 is defined above */
+gf_inline void mul_x4(void *x)   /* in place: x moved on by four bit positions */
+{   uint_32t   _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0];   /* bits 24..27 of word 3 (low nibble of its top byte) become the high nibble of the index */
+ui32_ptr(x)[3] =  (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12) | (ui32_ptr(x)[2] >> 20)) & MSK_F0;   /* every byte >> 4; its high nibble is refilled from the low nibble of the next lower byte */
+ui32_ptr(x)[2] =  (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12) | (ui32_ptr(x)[1] >> 20)) & MSK_F0;
+ui32_ptr(x)[1] =  (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12) | (ui32_ptr(x)[0] >> 20)) & MSK_F0;
+ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0 |  (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt;   /* word 0 likewise, then the table entry is XORed into its low 16 bits */
+}
+#else   /* alternative mul_x4 built on bswap32_block / bswap_32 (declared elsewhere) - not compiled while VERSION_1 is defined */
+gf_inline void mul_x4(void *x)
+{   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0];
+bswap32_block(x, x, 4);
+ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28));
+ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28));
+ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28));
+ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt;
+}
+#endif
+gf_inline void mul_x8(void *x)   /* in place: x moved on by one whole byte */
+{   uint_32t   _tt = gf_tab[ui32_ptr(x)[3] >> 24];   /* the top byte of word 3 is shifted out and used as the table index */
+ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24);   /* each word moves up a byte and takes the top byte of the word below */
+ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24);
+ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24);
+ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt;   /* the vacated low byte and the byte above it receive the table entry */
+}
+#else
+gf_inline void mul_x(void *r, const void *x)   /* byte-at-a-time form: r = x moved on by one bit position */
+{   uint_8t _tt = ui8_ptr(x)[15] & 1;   /* the bit that is shifted out of the last byte */
+ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7);   /* bytes are processed from 15 down to 0, each taking the low bit of the preceding byte as its new top bit */
+ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7);
+ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7);
+ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7);
+ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7);
+ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7);
+ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7);
+ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7);
+ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7);
+ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7);
+ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7);
+ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7);
+ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7);
+ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7);
+ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7);
+ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);   /* byte 0 has no carry in; 0xe1 is XORed in when a one bit was shifted out */
+}
+gf_inline void mul_x4(void *x)   /* in place: x moved on by four bit positions */
+{   uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];   /* low nibble of byte 15 becomes the high nibble of the table index */
+ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);   /* each byte takes the low nibble of the preceding byte as its new high nibble */
+ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);
+ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);
+ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);
+ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);
+ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);
+ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);
+ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);
+ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);
+ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);
+ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);
+ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);
+ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);
+ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);
+ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8);   /* little endian branch: the high half of the table entry goes into byte 1 ... */
+ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff);   /* ... and the low half into byte 0 */
+}
+gf_inline void mul_x8(void *x)   /* in place: x moved on by one whole byte */
+{   uint_16t _tt = gf_tab[ui8_ptr(x)[15]];   /* byte 15 is about to be overwritten; it selects the table entry */
+memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);   /* bytes 0..14 move to 1..15; the regions overlap, hence memmove */
+ui8_ptr(x)[1] ^= (_tt >> 8);   /* little endian branch: high half of the entry into byte 1 */
+ui8_ptr(x)[0] = (_tt & 0xff);   /* byte 0 is replaced outright by the low half */
+}
+#endif
+#else   /* DEFINES */
+#if BFR_UNIT == 64
+#define mul_x(r, x) do { uint_64t  _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80]; \
+ui64_ptr(r)[1] =  (ui64_ptr(x)[1] >> 1) & ~MSK_80                             \
+| ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80;             \
+ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80                             \
+|  (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt;                                \
+} while(0)   /* mul_x(r, x), little endian 64-bit macro form: every byte >> 1 with its top bit refilled from the low bit of the next lower byte, then the gf_tab entry selected by the bit shifted out is XORed into word 0; r and x are expanded several times, so pass expressions without side effects */
+#if defined( VERSION_1 )   /* mask based mul_x4 - the variant compiled, as VERSION_1 is defined above */
+#define mul_x4(x) do { uint_64t   _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0];  \
+ui64_ptr(x)[1] =  (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12)   \
+| (ui64_ptr(x)[0] >> 52)) & MSK_F0;                                       \
+ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0                             \
+|  (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt;                                \
+} while(0)   /* mul_x4(x): in place four bit shift; the low nibble of word 1's top byte selects the table entry; x is expanded several times */
+#else   /* alternative using bswap64_block / bswap_64 (declared elsewhere) - not compiled while VERSION_1 is defined */
+#define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0];        \
+bswap64_block(x, x, 2);                                                         \
+ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60));      \
+ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt;                         \
+} while(0)
+#endif
+#define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56];     \
+ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56);    \
+ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt;                       \
+} while(0)   /* mul_x8(x): in place one byte shift; the top byte of word 1 selects the table entry that is XORed into the low 16 bits of word 0; x is expanded several times */
+#elif BFR_UNIT == 32
+#define mul_x(r, x) do { uint_32t  _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80]; \
+ui32_ptr(r)[3] =  (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15)   \
+| (ui32_ptr(x)[2] >> 17)) & MSK_80;                                       \
+ui32_ptr(r)[2] =  (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15)   \
+| (ui32_ptr(x)[1] >> 17)) & MSK_80;                                       \
+ui32_ptr(r)[1] =  (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15)   \
+| (ui32_ptr(x)[0] >> 17)) & MSK_80;                                       \
+ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80                             \
+| (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt;                                 \
+} while(0)   /* mul_x(r, x), little endian 32-bit macro form: every byte >> 1 with its top bit refilled from the low bit of the next lower byte, then the gf_tab entry selected by the bit shifted out of word 3 is XORed into word 0; r and x are expanded several times, so pass expressions without side effects */
+#if defined( VERSION_1 )   /* mask based mul_x4 - the variant compiled, as VERSION_1 is defined above */
+#define mul_x4(x) do { uint_32t   _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0];  \
+ui32_ptr(x)[3] =  (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12)   \
+| (ui32_ptr(x)[2] >> 20)) & MSK_F0;                                       \
+ui32_ptr(x)[2] =  (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12)   \
+| (ui32_ptr(x)[1] >> 20)) & MSK_F0;                                       \
+ui32_ptr(x)[1] =  (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12)   \
+| (ui32_ptr(x)[0] >> 20)) & MSK_F0;                                       \
+ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0                             \
+|  (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt;                                \
+} while(0)   /* mul_x4(x): in place four bit shift; the low nibble of word 3's top byte selects the table entry; x is expanded several times */
+#else   /* alternative using bswap32_block / bswap_32 (declared elsewhere) - not compiled while VERSION_1 is defined */
+#define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0];    \
+bswap32_block(x, x, 4);                                                     \
+ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28));  \
+ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28));  \
+ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28));  \
+ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt;                     \
+} while(0)
+#endif
+#define mul_x8(x) do { uint_32t   _tt = gf_tab[ui32_ptr(x)[3] >> 24];       \
+ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24);    \
+ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24);    \
+ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24);    \
+ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt;                       \
+} while(0)   /* mul_x8(x): in place one byte shift; the top byte of word 3 selects the table entry that is XORed into the low 16 bits of word 0; x is expanded several times */
+#else
+#define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1;          \
+ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \
+ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \
+ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \
+ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \
+ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \
+ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \
+ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \
+ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \
+ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \
+ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \
+ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \
+ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \
+ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \
+ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \
+ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \
+ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);   \
+} while(0)   /* mul_x(r, x), byte-at-a-time macro form: bytes 15 down to 0 are each shifted right by one, taking the low bit of the preceding byte as the new top bit; 0xe1 is XORed into byte 0 when a one bit was shifted out of byte 15; r and x are expanded many times, so pass expressions without side effects */
+#define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];         \
+ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);                \
+ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);                \
+ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);                \
+ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);                \
+ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);                \
+ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);                \
+ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);                \
+ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);                \
+ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);                \
+ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);                \
+ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);                \
+ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);                \
+ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);                \
+ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);                \
+ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8);  \
+ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff);                         \
+} while(0)   /* mul_x4(x): in place four bit shift; the low nibble of byte 15 selects the table entry, whose high half goes into byte 1 and low half into byte 0 in this little endian branch; x is expanded many times */
+#define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]];   \
+memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);                    \
+ui8_ptr(x)[1] ^= (_tt >> 8);                                \
+ui8_ptr(x)[0] = (_tt & 0xff);                               \
+} while(0)   /* mul_x8(x): bytes 0..14 move to 1..15 (overlapping, hence memmove); the entry selected by the old byte 15 is XORed into byte 1 (high half) and replaces byte 0 (low half) */
+#endif
+#endif
+#elif PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
+#if defined( USE_INLINES )
+#if BFR_UNIT == 64
+gf_inline void mul_x(void *r, const void *x)   /* big endian, 64-bit words: r = x shifted right by one bit across both words */
+{   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff];   /* the low bit of word 1 lands on bit 7, so the index is 0x00 or 0x80 */
+ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63);   /* the low bit of word 0 carries into the top of word 1 */
+ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48);   /* the 16-bit table entry is XORed into the top 16 bits of word 0 */
+}
+gf_inline void mul_x4(void *x)   /* in place: shifted right by four bits */
+{   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff];   /* the low nibble of word 1 becomes the high nibble of the index */
+ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60);
+ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48);
+}
+gf_inline void mul_x8(void *x)   /* in place: shifted right by one byte */
+{   uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff];   /* the low byte of word 1 is shifted out and used as the index */
+ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56);
+ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48);
+}
+#elif BFR_UNIT == 32
+gf_inline void mul_x(void *r, const void *x)   /* big endian, 32-bit words: r = x shifted right by one bit across all four words */
+{   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff];   /* the low bit of word 3 lands on bit 7, so the index is 0x00 or 0x80 */
+ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31);   /* each word takes the low bit of the word before it as its new top bit */
+ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31);
+ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31);
+ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16);   /* the 16-bit table entry is XORed into the top 16 bits of word 0 */
+}
+gf_inline void mul_x4(void *x)   /* in place: shifted right by four bits */
+{   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff];   /* the low nibble of word 3 becomes the high nibble of the index */
+ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28);
+ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28);
+ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28);
+ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16);
+}
+gf_inline void mul_x8(void *x)   /* in place: shifted right by one byte */
+{   uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff];   /* the low byte of word 3 is shifted out and used as the index */
+ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24);
+ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24);
+ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24);
+ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16);
+}
+#else
+gf_inline void mul_x(void *r, const void *x)   /* byte-at-a-time form (big endian branch): r = x moved on by one bit position */
+{   uint_8t _tt = ui8_ptr(x)[15] & 1;   /* the bit that is shifted out of the last byte */
+ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7);   /* bytes are processed from 15 down to 0, each taking the low bit of the preceding byte as its new top bit */
+ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7);
+ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7);
+ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7);
+ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7);
+ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7);
+ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7);
+ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7);
+ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7);
+ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7);
+ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7);
+ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7);
+ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7);
+ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7);
+ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7);
+ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);   /* byte 0 has no carry in; 0xe1 is XORed in when a one bit was shifted out */
+}
+gf_inline void mul_x4(void *x)   /* in place: x moved on by four bit positions */
+{
+uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];   /* low nibble of byte 15 becomes the high nibble of the table index */
+ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);   /* each byte takes the low nibble of the preceding byte as its new high nibble */
+ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);
+ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);
+ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);
+ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);
+ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);
+ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);
+ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);
+ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);
+ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);
+ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);
+ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);
+ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);
+ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);
+ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff);   /* big endian branch: the low half of the table entry goes into byte 1 ... */
+ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8);   /* ... and the high half into byte 0 (the reverse of the little endian branch) */
+}
+gf_inline void mul_x8(void *x)   /* in place: x moved on by one whole byte */
+{   uint_16t _tt = gf_tab[ui8_ptr(x)[15]];   /* byte 15 is about to be overwritten; it selects the table entry */
+memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);   /* bytes 0..14 move to 1..15; the regions overlap, hence memmove */
+ui8_ptr(x)[1] ^= (_tt & 0xff);   /* big endian branch: low half of the entry into byte 1 */
+ui8_ptr(x)[0] = (_tt >> 8);   /* byte 0 is replaced outright by the high half */
+}
+#endif
+#else   /* DEFINES */
+#if BFR_UNIT == 64
+#define mul_x(r, x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff];   \
+ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63);            \
+ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48);                       \
+} while(0)   /* mul_x(r, x), big endian 64-bit macro form: one bit right shift across both words; the entry selected by the low bit of word 1 is XORed into the top 16 bits of word 0; r and x are expanded several times, so pass expressions without side effects */
+#define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff]; \
+ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60);        \
+ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48);                   \
+} while(0)   /* mul_x4(x): in place four bit right shift; the low nibble of word 1 selects the table entry; x is expanded several times */
+#define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff];    \
+ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56);    \
+ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48);               \
+} while(0)   /* mul_x8(x): in place one byte right shift; the low byte of word 1 selects the table entry; x is expanded several times */
+#elif BFR_UNIT == 32
+#define mul_x(r, x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff];   \
+ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31);            \
+ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31);            \
+ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31);            \
+ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16);                       \
+} while(0)   /* mul_x(r, x), big endian 32-bit macro form: one bit right shift across all four words; the entry selected by the low bit of word 3 is XORed into the top 16 bits of word 0; r and x are expanded several times, so pass expressions without side effects */
+#define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff]; \
+ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28);        \
+ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28);        \
+ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28);        \
+ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16);                   \
+} while(0)   /* mul_x4(x): in place four bit right shift; the low nibble of word 3 selects the table entry; x is expanded several times */
+#define mul_x8(x) do { uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff];    \
+ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24);    \
+ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24);    \
+ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24);    \
+ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16);               \
+} while(0)   /* mul_x8(x): in place one byte right shift; the low byte of word 3 selects the table entry; x is expanded several times */
+#else
+#define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1;          \
+ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \
+ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \
+ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \
+ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \
+ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \
+ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \
+ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \
+ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \
+ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \
+ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \
+ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \
+ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \
+ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \
+ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \
+ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \
+ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);   \
+} while(0)   /* mul_x(r, x), byte-at-a-time macro form (big endian branch): bytes 15 down to 0 are each shifted right by one, taking the low bit of the preceding byte as the new top bit; 0xe1 is XORed into byte 0 when a one bit was shifted out of byte 15; r and x are expanded many times, so pass expressions without side effects */
+#define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; \
+ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);        \
+ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);        \
+ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);        \
+ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);        \
+ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);        \
+ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);        \
+ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);        \
+ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);        \
+ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);        \
+ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);        \
+ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);        \
+ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);        \
+ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);        \
+ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);        \
+ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff);    \
+ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8);                   \
+} while(0)   /* mul_x4(x): in place four bit shift; the low nibble of byte 15 selects the table entry, whose low half goes into byte 1 and high half into byte 0 in this big endian branch; x is expanded many times */
+#define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]];   \
+memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);                    \
+ui8_ptr(x)[1] ^= (_tt & 0xff);                              \
+ui8_ptr(x)[0] = (_tt >> 8);                                 \
+} while(0)   /* mul_x8(x): bytes 0..14 move to 1..15 (overlapping, hence memmove); the entry selected by the old byte 15 is XORed into byte 1 (low half) and replaces byte 0 (high half) */
+#endif
+#endif
+#else
+#  error Platform byte order has not been set.
+#endif
+/*  A slow generic version of gf_mul (a = a * b).  Only the prototype is
+given here; the function is defined elsewhere.  a is both an input and the
+output, b is only read (it is const qualified).
+*/
+void gf_mul(void *a, const void* b);
+/*  This version uses 64k bytes of table space on the stack.
+A 16 byte buffer has to be multiplied by a 16 byte key
+value in GF(128).  If we consider a GF(128) value in
+the buffer's lowest byte, we can construct a table of
+the 256 16 byte values that result from the 256 values
+of this byte.  This requires 4096 bytes. But we also
+need tables for each of the 15 higher bytes in the
+buffer as well, which makes 64 kbytes in total.
+*/
+/*  Prototype only; the function is defined elsewhere.  Presumably fills the
+table at t from the 16 byte field element g - confirm against the definition.
+*/
+void init_64k_table(unsigned char g[], void *t);
+/*  Pointer to an array of tables, each with 256 entries of GF_BYTE_LEN bytes
+held as GF_BYTE_LEN / 4 32-bit words
+*/
+typedef uint_32t            (*gf_t64k)[256][GF_BYTE_LEN >> 2];
+/*  View an untyped table pointer as a gf_t64k.  The argument is parenthesised
+so that an expression argument is cast as a whole rather than only its
+first operand.
+*/
+#define tab64k(x)           ((gf_t64k)(x))
+/*  Combine into r (with xor_block_aligned) the entry of table i that is
+selected by byte i of a.  i and a are parenthesised for the same reason.
+*/
+#define xor_64k(i,a,t,r)    xor_block_aligned(r, tab64k(t)[(i)][(a)[(i)]])
+#if defined( USE_INLINES )
+#if defined( UNROLL_LOOPS )
+gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r)   /* fully unrolled form: a is replaced by the value built up in r from the tables at t */
+{
+move_block_aligned(r, tab64k(t)[0][a[0]]); xor_64k( 1, a, t, r);   /* r starts as the table 0 entry for byte 0 of a; bytes 1..15 are then folded in, one table per byte */
+xor_64k( 2, a, t, r); xor_64k( 3, a, t, r);
+xor_64k( 4, a, t, r); xor_64k( 5, a, t, r);
+xor_64k( 6, a, t, r); xor_64k( 7, a, t, r);
+xor_64k( 8, a, t, r); xor_64k( 9, a, t, r);
+xor_64k(10, a, t, r); xor_64k(11, a, t, r);
+xor_64k(12, a, t, r); xor_64k(13, a, t, r);
+xor_64k(14, a, t, r); xor_64k(15, a, t, r);
+move_block_aligned(a, r);   /* r is scratch space; the result is handed back in a */
+}
+#else
+/*  Loop form of the 64k table multiply.  r receives the table 0 entry for
+byte 0 of a, the entries for bytes 1 to GF_BYTE_LEN - 1 are then folded into
+r with xor_64k (one table per byte), and r is finally moved back into a.
+*/
+gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r)
+{
+    int pos = 1;
+    move_block_aligned(r, tab64k(t)[0][a[0]]);
+    while(pos < GF_BYTE_LEN)
+    {
+        xor_64k(pos, a, t, r);
+        pos += 1;
+    }
+    move_block_aligned(a, r);
+}
+#endif
+#else
+/*  Macro forms of gf_mul_64k(a, t, r).  The fully unrolled expansion is used
+when UNROLL_LOOPS is defined and the loop otherwise, which is the same sense
+as the inline versions above and the 8k and 4k macro forms below (the test
+here was previously inverted).  Both forms compute the same value: r starts
+as the table 0 entry for byte 0 of a, bytes 1 to 15 are folded in with
+xor_64k, and r is moved back into a.  a, t and r are expanded several times,
+so pass expressions without side effects.
+*/
+#if defined( UNROLL_LOOPS )
+#define gf_mul_64k(a, t, r) do {                \
+move_block_aligned(r, tab64k(t)[0][a[0]]);  \
+xor_64k( 1, a, t, r);                       \
+xor_64k( 2, a, t, r); xor_64k( 3, a, t, r); \
+xor_64k( 4, a, t, r); xor_64k( 5, a, t, r); \
+xor_64k( 6, a, t, r); xor_64k( 7, a, t, r); \
+xor_64k( 8, a, t, r); xor_64k( 9, a, t, r); \
+xor_64k(10, a, t, r); xor_64k(11, a, t, r); \
+xor_64k(12, a, t, r); xor_64k(13, a, t, r); \
+xor_64k(14, a, t, r); xor_64k(15, a, t, r); \
+move_block_aligned(a, r);                   \
+} while(0)
+#else
+#define gf_mul_64k(a, t, r) do { int i;         \
+move_block_aligned(r, tab64k(t)[0][a[0]]);  \
+for(i = 1; i < GF_BYTE_LEN; ++i)            \
+{   xor_64k(i, a, t, r);                    \
+}                                           \
+move_block_aligned(a, r);                   \
+} while(0)
+#endif
+#endif
+/*  This version uses 8k bytes of table space on the stack.
+A 16 byte buffer has to be multiplied by a 16 byte key
+value in GF(128).  If we consider a GF(128) value in
+the buffer's lowest 4-bits, we can construct a table of
+the 16 16-byte values that result from the 16 values
+of these 4 bits. This requires 256 bytes. But we also
+need tables for each of the 31 higher 4 bit groups,
+which makes 8 kbytes in total.
+*/
+void init_8k_table(unsigned char g[], void *t);
+typedef uint_32t    (*gf_t8k)[16][GF_BYTE_LEN >> 2];
+#define tab8k(x)    ((gf_t8k)x)
+#define xor_8k(i,a,t,r)   \
+xor_block_aligned(r, tab8k(t)[i + i][a[i] & 15]); \
+xor_block_aligned(r, tab8k(t)[i + i + 1][a[i] >> 4])
+#if defined( USE_INLINES )
+#if defined( UNROLL_LOOPS )
+gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r)   /* fully unrolled form: a is replaced by the value built up in r from the 32 nibble tables at t */
+{
+move_block_aligned(r, tab8k(t)[0][a[0] & 15]);   /* r starts as the table 0 entry for the low nibble of byte 0 ... */
+xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);   /* ... combined with the table 1 entry for its high nibble */
+xor_8k( 1, a, t, r); xor_8k( 2, a, t, r); xor_8k( 3, a, t, r);   /* bytes 1..15 each contribute two table entries */
+xor_8k( 4, a, t, r); xor_8k( 5, a, t, r); xor_8k( 6, a, t, r); xor_8k( 7, a, t, r);
+xor_8k( 8, a, t, r); xor_8k( 9, a, t, r); xor_8k(10, a, t, r); xor_8k(11, a, t, r);
+xor_8k(12, a, t, r); xor_8k(13, a, t, r); xor_8k(14, a, t, r); xor_8k(15, a, t, r);
+move_block_aligned(a, r);   /* r is scratch space; the result is handed back in a */
+}
+#else
+/*  Loop form of the 8k table multiply.  r is seeded from the two table
+entries for byte 0 of a (low nibble from table 0, high nibble from table 1),
+bytes 1 to GF_BYTE_LEN - 1 are folded in with xor_8k, and the GF_BYTE_LEN
+bytes of r are finally copied back into a.
+*/
+gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r)
+{
+    int byte_ix = 0;
+    memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN);
+    xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);
+    while(++byte_ix < GF_BYTE_LEN)
+    {
+        xor_8k(byte_ix, a, t, r);
+    }
+    memcpy(a, r, GF_BYTE_LEN);
+}
+#endif
+#else
+#if defined( UNROLL_LOOPS )   /* macro forms of gf_mul_8k(a, t, r): unrolled when UNROLL_LOOPS is defined, loop otherwise */
+#define gf_mul_8k(a, t, r) do {                     \
+move_block_aligned(r, tab8k(t)[0][a[0] & 15]);  \
+xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);   \
+xor_8k( 1, a, t, r); xor_8k( 2, a, t, r);       \
+xor_8k( 3, a, t, r); xor_8k( 4, a, t, r);       \
+xor_8k( 5, a, t, r); xor_8k( 6, a, t, r);       \
+xor_8k( 7, a, t, r); xor_8k( 8, a, t, r);       \
+xor_8k( 9, a, t, r); xor_8k(10, a, t, r);       \
+xor_8k(11, a, t, r); xor_8k(12, a, t, r);       \
+xor_8k(13, a, t, r); xor_8k(14, a, t, r);       \
+xor_8k(15, a, t, r); move_block_aligned(a, r);  \
+} while(0)   /* r is seeded from the two nibble tables for byte 0, bytes 1..15 are folded in, and r is moved back to a; a, t and r are expanded many times */
+#else
+#define gf_mul_8k(a, t, r) do { int i;              \
+memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN); \
+xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);   \
+for(i = 1; i < GF_BYTE_LEN; ++i)                \
+{   xor_8k(i, a, t, r);                         \
+}                                               \
+memcpy(a, r, GF_BYTE_LEN);                      \
+} while(0)   /* loop form of the same steps, using memcpy for the two copies; declares a local named i, so arguments whose text mentions a caller variable i would be captured */
+#endif
+#endif
+/*  This version uses 4k bytes of table space on the stack.
+A 16 byte buffer has to be multiplied by a 16 byte key
+value in GF(128).  If we consider a GF(128) value in a
+single byte, we can construct a table of the 256 16 byte
+values that result from the 256 values of this byte.
+This requires 4096 bytes. If we take the highest byte in
+the buffer and use this table to get the result, we then
+have to multiply by x^120 to get the final value. For the
+next highest byte the result has to be multiplied by x^112
+and so on. But we can do this by accumulating the result
+in an accumulator starting with the result for the top
+byte.  We repeatedly multiply the accumulator value by
+x^8 and then add in (i.e. xor) the 16 bytes of the next
+lower byte in the buffer, stopping when we reach the
+lowest byte. This requires a 4096 byte table.
+*/
+void init_4k_table(unsigned char g[], void *t);
+typedef uint_32t        (*gf_t4k)[GF_BYTE_LEN >> 2];
+#define tab4k(x)        ((gf_t4k)x)
+#define xor_4k(i,a,t,r) mul_x8(r); xor_block_aligned(r, tab4k(t)[a[i]])
+#if defined( USE_INLINES )
+#if defined( UNROLL_LOOPS )
+/*  Fully unrolled 4k-table multiply (inline function variant).
+    a : 16 byte buffer; read as the operand and overwritten with the
+        result
+    t : table pointer, viewed through tab4k()
+    r : work buffer used as the accumulator; also holds the result
+*/
+gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r)
+{
+    /* the table row for the top byte seeds the accumulator */
+    move_block_aligned(r, tab4k(t)[a[15]]);
+
+    /* each step multiplies r by x^8 and xors in the next lower byte's row */
+    xor_4k(14, a, t, r);
+    xor_4k(13, a, t, r);
+    xor_4k(12, a, t, r);
+    xor_4k(11, a, t, r);
+    xor_4k(10, a, t, r);
+    xor_4k( 9, a, t, r);
+    xor_4k( 8, a, t, r);
+    xor_4k( 7, a, t, r);
+    xor_4k( 6, a, t, r);
+    xor_4k( 5, a, t, r);
+    xor_4k( 4, a, t, r);
+    xor_4k( 3, a, t, r);
+    xor_4k( 2, a, t, r);
+    xor_4k( 1, a, t, r);
+    xor_4k( 0, a, t, r);
+
+    /* put the accumulated value back into the caller's buffer */
+    move_block_aligned(a, r);
+}
+#else
+/*  Loop form of the 4k-table multiply (inline function variant).
+    a : 16 byte buffer; read as the operand and overwritten with the
+        result
+    t : table pointer, viewed through tab4k()
+    r : work buffer used as the accumulator; also holds the result
+*/
+gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r)
+{
+    int idx;
+
+    /* the table row for the top byte seeds the accumulator */
+    move_block_aligned(r, tab4k(t)[a[15]]);
+
+    /* walk the remaining bytes from 14 down to 0 */
+    for(idx = 14; idx >= 0; --idx)
+    {
+        xor_4k(idx, a, t, r);
+    }
+
+    /* put the accumulated value back into the caller's buffer */
+    move_block_aligned(a, r);
+}
+#endif
+#else
+#if defined( UNROLL_LOOPS )
+/*  Unrolled macro form of the 4k-table multiply.
+    a : 16 byte buffer; read as the operand and overwritten with the
+        result
+    t : table pointer, viewed through tab4k()
+    r : work buffer used as the accumulator; also holds the result
+    Every parameter is parenthesized on use so that expression
+    arguments such as 'p + 16' index and cast correctly.  Arguments
+    are still expanded many times, so they must be free of side
+    effects.
+*/
+#define gf_mul_4k(a, t, r) do {                                     \
+    move_block_aligned((r), tab4k((t))[(a)[15]]);                   \
+    xor_4k(14, (a), (t), (r)); xor_4k(13, (a), (t), (r));           \
+    xor_4k(12, (a), (t), (r)); xor_4k(11, (a), (t), (r));           \
+    xor_4k(10, (a), (t), (r)); xor_4k( 9, (a), (t), (r));           \
+    xor_4k( 8, (a), (t), (r)); xor_4k( 7, (a), (t), (r));           \
+    xor_4k( 6, (a), (t), (r)); xor_4k( 5, (a), (t), (r));           \
+    xor_4k( 4, (a), (t), (r)); xor_4k( 3, (a), (t), (r));           \
+    xor_4k( 2, (a), (t), (r)); xor_4k( 1, (a), (t), (r));           \
+    xor_4k( 0, (a), (t), (r));                                      \
+    move_block_aligned((a), (r));                                   \
+} while(0)
+#else
+/*  Loop macro form of the 4k-table multiply.
+    a : 16 byte buffer; read as the operand and overwritten with the
+        result
+    t : table pointer, viewed through tab4k()
+    r : work buffer used as the accumulator; also holds the result
+    The loop counter has a macro-private name so that an argument
+    mentioning the caller's own 'i' (e.g. blk[i]) is not captured by
+    the counter declared here.  Every parameter is parenthesized on
+    use.  Arguments are expanded more than once, so they must be free
+    of side effects.
+*/
+#define gf_mul_4k(a, t, r) do { int gf_i_ = 15;     \
+    move_block_aligned((r), tab4k((t))[(a)[15]]);   \
+    while(gf_i_--)                                  \
+    {   xor_4k(gf_i_, (a), (t), (r));               \
+    }                                               \
+    move_block_aligned((a), (r));                   \
+} while(0)
+#endif
+#endif
+/*  This version uses 256 bytes of table space on the stack.
+A 16 byte buffer has to be multiplied by a 16 byte key
+value in GF(128).  If we consider a GF(128) value in a
+single 4-bit nibble, we can construct a table of the 16
+16 byte values that result from the 16 values of this
+nibble.  This requires 256 bytes. If we take the highest
+4-bit nibble in the buffer and use this table to get the
+result, we then have to multiply by x^124 to get the
+final value. For the next highest nibble the result has to
+be multiplied by x^120 and so on. But we can do this by
+accumulating the result in an accumulator starting with
+the result for the top nibble.  We repeatedly multiply
+the accumulator value by x^4 and then add in (i.e. xor)
+the 16 bytes of the next lower nibble in the buffer,
+stopping when we reach the lowest nibble. This uses
+a 256 byte table.
+*/
+void init_256_table(unsigned char g[], void *t);
+typedef uint_32t    (*gf_t256)[GF_BYTE_LEN >> 2];
+#define tab256(t)   ((gf_t256)t)
+#define xor_256(i,a,t,r)    \
+mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] & 15]);  \
+mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] >> 4])
+#if defined( USE_INLINES )
+#if defined( UNROLL_LOOPS )
+/*  Fully unrolled 256 byte table multiply (inline function variant).
+    a : 16 byte buffer; read as the operand and overwritten with the
+        result
+    t : table pointer, viewed through tab256()
+    r : work buffer used as the accumulator; also holds the result
+*/
+gf_inline void gf_mul_256(unsigned char a[], void *t, void *r)
+{
+    /* byte 15: low nibble seeds r, then r * x^4, then xor the high nibble */
+    move_block_aligned(r, tab256(t)[a[15] & 15]);
+    mul_x4(r);
+    xor_block_aligned(r, tab256(t)[a[15] >> 4]);
+
+    /* bytes 14 down to 0, two nibble steps per byte */
+    xor_256(14, a, t, r);
+    xor_256(13, a, t, r);
+    xor_256(12, a, t, r);
+    xor_256(11, a, t, r);
+    xor_256(10, a, t, r);
+    xor_256( 9, a, t, r);
+    xor_256( 8, a, t, r);
+    xor_256( 7, a, t, r);
+    xor_256( 6, a, t, r);
+    xor_256( 5, a, t, r);
+    xor_256( 4, a, t, r);
+    xor_256( 3, a, t, r);
+    xor_256( 2, a, t, r);
+    xor_256( 1, a, t, r);
+    xor_256( 0, a, t, r);
+
+    /* put the accumulated value back into the caller's buffer */
+    move_block_aligned(a, r);
+}
+#else
+/*  Loop form of the 256 byte table multiply (inline function variant).
+    a : 16 byte buffer; read as the operand and overwritten with the
+        result
+    t : table pointer, viewed through tab256()
+    r : work buffer used as the accumulator; also holds the result
+*/
+gf_inline void gf_mul_256(unsigned char a[], void *t, void *r)
+{
+    int idx;
+
+    /* byte 15: low nibble seeds r, then r * x^4, then xor the high nibble */
+    move_block_aligned(r, tab256(t)[a[15] & 15]);
+    mul_x4(r);
+    xor_block_aligned(r, tab256(t)[a[15] >> 4]);
+
+    /* walk the remaining bytes from 14 down to 0 */
+    for(idx = 14; idx >= 0; --idx)
+    {
+        xor_256(idx, a, t, r);
+    }
+
+    /* put the accumulated value back into the caller's buffer */
+    move_block_aligned(a, r);
+}
+#endif
+#else
+#if defined( UNROLL_LOOPS )
+/*  Unrolled macro form of the 256 byte table multiply.
+    a : 16 byte buffer; read as the operand and overwritten with the
+        result
+    t : table pointer, viewed through tab256()
+    r : work buffer used as the accumulator; also holds the result
+    Every parameter is parenthesized on use so that expression
+    arguments such as 'p + 16' index and cast correctly.  Arguments
+    are still expanded many times, so they must be free of side
+    effects.
+*/
+#define gf_mul_256(a, t, r) do {                                        \
+    move_block_aligned((r), tab256((t))[(a)[15] & 15]); mul_x4((r));    \
+    xor_block_aligned((r), tab256((t))[(a)[15] >> 4]);                  \
+    xor_256(14, (a), (t), (r)); xor_256(13, (a), (t), (r));             \
+    xor_256(12, (a), (t), (r)); xor_256(11, (a), (t), (r));             \
+    xor_256(10, (a), (t), (r)); xor_256( 9, (a), (t), (r));             \
+    xor_256( 8, (a), (t), (r)); xor_256( 7, (a), (t), (r));             \
+    xor_256( 6, (a), (t), (r)); xor_256( 5, (a), (t), (r));             \
+    xor_256( 4, (a), (t), (r)); xor_256( 3, (a), (t), (r));             \
+    xor_256( 2, (a), (t), (r)); xor_256( 1, (a), (t), (r));             \
+    xor_256( 0, (a), (t), (r)); move_block_aligned((a), (r));           \
+} while(0)
+#else
+/*  Loop macro form of the 256 byte table multiply.
+    a : 16 byte buffer; read as the operand and overwritten with the
+        result
+    t : table pointer, viewed through tab256()
+    r : work buffer used as the accumulator; also holds the result
+    The loop counter has a macro-private name so that an argument
+    mentioning the caller's own 'i' (e.g. blk[i]) is not captured by
+    the counter declared here.  Every parameter is parenthesized on
+    use.  Arguments are expanded more than once, so they must be free
+    of side effects.
+*/
+#define gf_mul_256(a, t, r) do { int gf_i_ = 15;                        \
+    move_block_aligned((r), tab256((t))[(a)[15] & 15]); mul_x4((r));    \
+    xor_block_aligned((r), tab256((t))[(a)[15] >> 4]);                  \
+    while(gf_i_--)                                                      \
+    {   xor_256(gf_i_, (a), (t), (r));                                  \
+    }                                                                   \
+    move_block_aligned((a), (r));                                       \
+} while(0)
+#endif
+#endif
+#if defined(__cplusplus)
+}
+#endif
+#endif