servlib/gcm/gf128mul.h
changeset 0 2b3e5ec03512
equal deleted inserted replaced
-1:000000000000 0:2b3e5ec03512
       
     1 /*
       
     2  ---------------------------------------------------------------------------
       
     3  Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved.
       
     4 
       
     5  LICENSE TERMS
       
     6 
       
     7  The free distribution and use of this software in both source and binary
       
     8  form is allowed (with or without changes) provided that:
       
     9 
       
    10    1. distributions of this source code include the above copyright
       
    11       notice, this list of conditions and the following disclaimer;
       
    12 
       
    13    2. distributions in binary form include the above copyright
       
    14       notice, this list of conditions and the following disclaimer
       
    15       in the documentation and/or other associated materials;
       
    16 
       
    17    3. the copyright holder's name is not used to endorse products
       
    18       built using this software without specific written permission.
       
    19 
       
    20  ALTERNATIVELY, provided that this notice is retained in full, this product
       
    21  may be distributed under the terms of the GNU General Public License (GPL),
       
    22  in which case the provisions of the GPL apply INSTEAD OF those given above.
       
    23 
       
    24  DISCLAIMER
       
    25 
       
    26  This software is provided 'as is' with no explicit or implied warranties
       
    27  in respect of its properties, including, but not limited to, correctness
       
    28  and/or fitness for purpose.
       
    29  ---------------------------------------------------------------------------
       
    30  Issue Date: 13/10/2006
       
    31 
       
    32  An implementation of field multiplication in the Galois Field GF(2^128)
       
    33 */
       
    34 
       
    35 #ifndef GF128MUL_H
       
    36 #define GF128MUL_H
       
    37 
       
    38 #include <stdlib.h>
       
    39 #include <string.h>
       
    40 
       
    41 #include "mode_hdr.h"
       
    42 
       
    43 /*  Table sizes for GF(2^128) Multiply.  Normally larger tables give 
       
    44     higher speed but cache loading might change this. Normally only 
       
    45     one table size (or none at all) will be specified here
       
    46 */
       
    47 
       
    48 #if 0
       /* Table-size selection: each switch below is an '#if 0' / '#if 1' toggle.
          Only TABLES_8K is enabled.  None of these macros is referenced in the
          part of the header shown here; presumably they are consumed by the
          table-driven multiply code elsewhere (confirm). */
       
    49 #  define TABLES_64K
       
    50 #endif
       
    51 #if 1
       
    52 #  define TABLES_8K
       
    53 #endif
       
    54 #if 0
       
    55 #  define TABLES_4K
       
    56 #endif
       
    57 #if 0
       
    58 #  define TABLES_256
       
    59 #endif
       
    60 
       
    61 /*  Use of inlines is preferred but code blocks can also be expanded inline
       
    62     using 'defines'.  But the latter approach will typically generate a LOT
       
    63     of code and is not recommended. 
       
    64 */
       
    65 #if 0
       /* Disabled: with USE_INLINES undefined the mul_x / mul_x4 / mul_x8
          operations below are provided as macros rather than gf_inline
          functions. */
       
    66 #  define USE_INLINES
       
    67 #endif
       
    68 
       
    69 /*  Speed critical loops can be unrolled to gain speed but consume more
       
    70     memory
       
    71 */
       
    72 #if 0
       /* Disabled.  UNROLL_LOOPS is not referenced in the part of the header
          shown here; presumably it is tested by code elsewhere (confirm). */
       
    73 #  define UNROLL_LOOPS
       
    74 #endif
       
    75 
       
    76 /*  Multiply a GF128 field element by x. Field elements are held in arrays
       
    77     of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower
       
    78     indexed bits placed in the more numerically significant bit positions
       
    79     within bytes.
       
    80 
       
    81     On little endian machines the bit indexes translate into the bit
       
    82     positions within four 32-bit words in the following way
       
    83 
       
    84     MS            x[0]           LS  MS            x[1]           LS
       
    85     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
       
    86     24...31 16...23 08...15 00...07  56...63 48...55 40...47 32...39
       
    87 
       
    88     MS            x[2]           LS  MS            x[3]           LS
       
    89     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
       
    90     88...95 80...87 72...79 64...71  120.127 112.119 104.111 96..103
       
    91 
       
    92     On big endian machines the bit indexes translate into the bit
       
    93     positions within four 32-bit words in the following way
       
    94 
       
    95     MS            x[0]           LS  MS            x[1]           LS
       
    96     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
       
    97     00...07 08...15 16...23 24...31  32...39 40...47 48...55 56...63
       
    98 
       
    99     MS            x[2]           LS  MS            x[3]           LS
       
   100     ms   ls ms   ls ms   ls ms   ls  ms   ls ms   ls ms   ls ms   ls
       
   101     64...71 72...79 80...87 88...95  96..103 104.111 112.119 120.127
       
   102 */
       
   103 
       
   104 #define GF_BYTE_LEN 16  /* presumably the size in bytes of one field element (16 * 8 = 128 bits) - confirm at use sites */
       
   105 
       
   106 #if defined( USE_INLINES )
       /* gf_inline is the storage/inline qualifier put in front of the function
          forms of mul_x, mul_x4 and mul_x8.  It is only defined when USE_INLINES
          is defined: '__inline' for MSVC, 'static inline' for GNU toolchains and
          plain 'static' for anything else. */
       
   107 #  if defined( _MSC_VER )
       
   108 #    define gf_inline __inline
       
   109 #  elif defined( __GNUC__ ) || defined( __GNU_LIBRARY__ )
       
   110 #    define gf_inline static inline
       
   111 #  else
       
   112 #    define gf_inline static
       
   113 #  endif
       
   114 #endif
       
   115 
       
   116 #if defined(__cplusplus)
       
   117 extern "C"
       
   118 {
       
   119 #endif
       
   120 
       
   121 /*  These functions multiply a field element by x, by x^4 and by x^8 in the 
       
   122     polynomial field representation. They use word operations (64, 32 or 8
       
   123     bits wide, as selected by BFR_UNIT) to gain speed but compensate for 
       
   124     machine endianness and hence work correctly on both styles of machine.
       
   125 */
       
   126 extern const unsigned short gf_tab[256];  /* 256-entry table defined elsewhere; indexed below by the bits shifted out of the top of an element (presumably holds the matching reduction terms - confirm) */
       
   127 
       
   128 #if PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN
       
   129 
       
   130 /*  This section is not needed as GF(128) multiplication is now implemented
       
   131     but is left in place as it provides a template for an alternative little
       
   132     endian implementation approach based on conversion to and from big endian
       
   133     format
       
   134 */
       
   135 #if 0
       
   136 
       
   137 /*  This is a template for mul_x.  The mul_x4 and mul_x8 little endian
       
   138     alternative implementations (and their defined versions) follow the 
       
   139     big endian functions below in the same way.
       
   140 */
       
   141 
       
   142 gf_inline void mul_x(void *r, const void *x)
       /* Disabled ('#if 0') template for a little-endian mul_x that works via big
          endian form: bswap32_block (defined elsewhere, presumably a byte swap of
          4 words - confirm) copies x into r, the four 32-bit words are shifted
          right one bit with each word taking its top bit from the next lower
          word, bswap_32 of the gf_tab entry picked by the bit leaving word 3 is
          XORed into word 0, and r is swapped back in place. */
       
   143 {   uint_32t _tt;
       
   144     bswap32_block(r, x, 4); 
       
   145     _tt = gf_tab[(ui32_ptr(r)[3] << 7) & 0xff];
       
   146     ui32_ptr(r)[3] = (ui32_ptr(r)[3] >> 1) | (ui32_ptr(r)[2] << 31);
       
   147     ui32_ptr(r)[2] = (ui32_ptr(r)[2] >> 1) | (ui32_ptr(r)[1] << 31);
       
   148     ui32_ptr(r)[1] = (ui32_ptr(r)[1] >> 1) | (ui32_ptr(r)[0] << 31);
       
   149     ui32_ptr(r)[0] = (ui32_ptr(r)[0] >> 1) ^ bswap_32(_tt);
       
   150     bswap32_block(r, r, 4);
       
   151 }
       
   152 
       
   153 #endif
       
   154 
       
   155 #define VERSION_1  /* selects the masked-shift mul_x4 variants below; the bswap-based '#else' variants are then not compiled */
       
   156 
       
   157 #define MSK_80   (0x80 * (unit_cast(BFR_UNIT,-1) / 0xff))  /* 0x80 repeated in every byte of a BFR_UNIT-bit word, assuming unit_cast(BFR_UNIT,-1) is the all-ones value - confirm */
       
   158 #define MSK_F0   (0xf0 * (unit_cast(BFR_UNIT,-1) / 0xff))  /* 0xf0 repeated in every byte, under the same assumption */
       
   159 
       
   160 #if defined( USE_INLINES )
       
   161 
       
   162 #if BFR_UNIT == 64
       
   163 
       
   164     gf_inline void mul_x(void *r, const void *x)
           /* Little endian, 64-bit units: r = x multiplied by the field element x.
              Every byte is shifted right one bit (the '& ~MSK_80' term) and the
              bit leaving byte n becomes the top bit of byte n + 1 (the '<< 15' and
              '>> 49' terms under MSK_80).  The bit leaving byte 15 lands on bit 7
              of the index, so gf_tab[0x80] or gf_tab[0] is XORed into word 0.
              Word 1 is written before word 0 and only reads the inputs, so r may
              be the same buffer as x.  NOTE(review): presumably both pointers must
              be aligned for 64-bit access - confirm against ui64_ptr. */
       
   165     {   uint_64t  _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80];
       
   166 
       
   167         ui64_ptr(r)[1] =  (ui64_ptr(x)[1] >> 1) & ~MSK_80 | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80;
       
   168         ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80 |  (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt;
       
   169     }
       
   170 
       
   171   #if defined( VERSION_1 )
       
   172 
       
   173     gf_inline void mul_x4(void *x)
           /* Little endian, 64-bit units, VERSION_1: multiplies x by x^4 in place.
              Every byte is shifted right by a nibble ('& ~MSK_F0') and the low
              nibble of byte n becomes the high nibble of byte n + 1 (the '<< 12'
              and '>> 52' terms under MSK_F0).  The low nibble of byte 15, moved to
              bits 4..7, indexes gf_tab and that entry is XORed into word 0.  The
              table index is taken before either word is modified and word 1 is
              updated before word 0, whose old value it needs. */
       
   174     {   uint_64t   _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0];
       
   175 
       
   176         ui64_ptr(x)[1] =  (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12) | (ui64_ptr(x)[0] >> 52)) & MSK_F0;
       
   177         ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0 |  (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt;
       
   178     }
       
   179 
       
   180   #else
       
   181 
       
   182     gf_inline void mul_x4(void *x)
           /* Alternative little endian 64-bit mul_x4, compiled only when VERSION_1
              is not defined.  It picks the gf_tab entry from the low nibble of byte
              15, converts the block with bswap64_block, does a plain 4-bit right
              shift across the two words and converts each word back with bswap_64
              before XORing the table entry into word 0.  bswap64_block / bswap_64
              are defined elsewhere; presumably they reverse byte order (confirm). */
       
   183     {   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0];
       
   184         bswap64_block(x, x, 2);
       
   185         ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60));
       
   186         ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt;
       
   187     }
       
   188 
       
   189   #endif
       
   190 
       
   191     gf_inline void mul_x8(void *x)
           /* Little endian, 64-bit units: multiplies x by x^8 in place.  The '<< 8'
              shifts move every byte to the next higher address, with byte 7 of
              word 0 carried into byte 8 via '>> 56'.  The old byte 15 indexes
              gf_tab and the 16-bit entry is XORed into the two lowest bytes of
              word 0. */
       
   192     {   uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56];
       
   193         ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56); 
       
   194         ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt;
       
   195     }
       
   196 
       
   197 #elif BFR_UNIT == 32
       
   198 
       
   199     gf_inline void mul_x(void *r, const void *x)
           /* Little endian, 32-bit units: r = x multiplied by the field element x.
              Each of the four words has every byte shifted right one bit, with the
              bit leaving a byte carried into the top bit of the next byte ('<< 15'
              inside a word, '>> 17' from the next lower word, both under MSK_80).
              The bit leaving byte 15 selects gf_tab[0x80] or gf_tab[0], XORed into
              word 0.  Words are written from index 3 down to 0 and each only reads
              inputs at its own or a lower index, so r may be the same buffer as x. */
       
   200     {   uint_32t  _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80];
       
   201 
       
   202         ui32_ptr(r)[3] =  (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15) | (ui32_ptr(x)[2] >> 17)) & MSK_80;
       
   203         ui32_ptr(r)[2] =  (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15) | (ui32_ptr(x)[1] >> 17)) & MSK_80;
       
   204         ui32_ptr(r)[1] =  (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15) | (ui32_ptr(x)[0] >> 17)) & MSK_80;
       
   205         ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80 |  (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt;
       
   206     }
       
   207 
       
   208   #if defined( VERSION_1 )
       
   209 
       
   210     gf_inline void mul_x4(void *x)
           /* Little endian, 32-bit units, VERSION_1: multiplies x by x^4 in place.
              Every byte is shifted right by a nibble and the low nibble of a byte
              becomes the high nibble of the next byte ('<< 12' inside a word,
              '>> 20' from the next lower word, both under MSK_F0).  The low nibble
              of byte 15 indexes gf_tab (in bits 4..7) and the entry is XORed into
              word 0.  Words are updated from index 3 down so each still sees the
              old value of the word below it. */
       
   211     {   uint_32t   _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0];
       
   212 
       
   213         ui32_ptr(x)[3] =  (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12) | (ui32_ptr(x)[2] >> 20)) & MSK_F0;
       
   214         ui32_ptr(x)[2] =  (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12) | (ui32_ptr(x)[1] >> 20)) & MSK_F0;
       
   215         ui32_ptr(x)[1] =  (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12) | (ui32_ptr(x)[0] >> 20)) & MSK_F0;
       
   216         ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0 |  (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt;
       
   217     }
       
   218 
       
   219   #else
       
   220 
       
   221     gf_inline void mul_x4(void *x)
           /* Alternative little endian 32-bit mul_x4, compiled only when VERSION_1
              is not defined.  It picks the gf_tab entry from the low nibble of byte
              15, converts the block with bswap32_block, shifts the four words right
              by 4 bits with carries from the next lower word, and converts each
              word back with bswap_32 before XORing the table entry into word 0.
              bswap32_block / bswap_32 are defined elsewhere; presumably they
              reverse byte order (confirm). */
       
   222     {   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0];
       
   223         bswap32_block(x, x, 4);
       
   224         ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28));
       
   225         ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28));
       
   226         ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28));
       
   227         ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt;
       
   228     }
       
   229 
       
   230   #endif
       
   231 
       
   232     gf_inline void mul_x8(void *x)
           /* Little endian, 32-bit units: multiplies x by x^8 in place.  Each word
              is shifted left 8 bits (one byte towards higher addresses) and takes
              its lowest byte from the top byte of the next lower word.  The old
              byte 15 indexes gf_tab and the 16-bit entry is XORed into the two
              lowest bytes of word 0. */
       
   233     {   uint_32t   _tt = gf_tab[ui32_ptr(x)[3] >> 24];
       
   234 
       
   235         ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24);
       
   236         ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24);
       
   237         ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24);
       
   238         ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt;
       
   239     }
       
   240 
       
   241 #else
       
   242 
       
   243     gf_inline void mul_x(void *r, const void *x)
           /* Byte-wise fallback (BFR_UNIT neither 64 nor 32): r = x multiplied by
              the field element x.  Bytes are processed from index 15 down to 0;
              each is shifted right one bit and takes its top bit from the low bit
              of the byte below it.  If the low bit of byte 15 was set, 0xe1 is
              XORed into byte 0.  Because each step only reads bytes at its own or
              a lower index, r may be the same buffer as x. */
       
   244     {   uint_8t _tt = ui8_ptr(x)[15] & 1;
       
   245         ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7);
       
   246         ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7);
       
   247         ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7);
       
   248         ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7);
       
   249         ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7);
       
   250         ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7);
       
   251         ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7);
       
   252         ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7);
       
   253         ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7);
       
   254         ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7);
       
   255         ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7);
       
   256         ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7);
       
   257         ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7);
       
   258         ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7);
       
   259         ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7);
       
   260         ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);
       
   261     }
       
   262 
       
   263     gf_inline void mul_x4(void *x)
           /* Byte-wise fallback, little endian: multiplies x by x^4 in place.
              Bytes are processed from index 15 down to 0; each is shifted right by
              a nibble and takes its high nibble from the low nibble of the byte
              below it.  The low nibble of the old byte 15, moved to bits 4..7,
              indexes gf_tab; the entry's high byte is XORed into byte 1 and its
              low byte into byte 0. */
       
   264     {   uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];
       
   265         ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);
       
   266         ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);
       
   267         ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);
       
   268         ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);
       
   269         ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);
       
   270         ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);
       
   271         ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);
       
   272         ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);
       
   273         ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);
       
   274         ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);
       
   275         ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);
       
   276         ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);
       
   277         ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);
       
   278         ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);
       
   279         ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8);
       
   280         ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff);
       
   281     }
       
   282 
       
   283     gf_inline void mul_x8(void *x)
           /* Byte-wise fallback, little endian: multiplies x by x^8 in place.  The
              old byte 15 indexes gf_tab, memmove shifts bytes 0..14 up to 1..15,
              then the entry's high byte is XORed into byte 1 and its low byte
              replaces byte 0. */
       
   284     {   uint_16t _tt = gf_tab[ui8_ptr(x)[15]];
       
   285         memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);
       
   286         ui8_ptr(x)[1] ^= (_tt >> 8);
       
   287         ui8_ptr(x)[0] = (_tt & 0xff);
       
   288     }
       
   289 
       
   290 #endif
       
   291 
       
   292 #else   /* DEFINES */
       
   293 
       
   294 #if BFR_UNIT == 64
       
   295 
       
   296     #define mul_x(r, x) do { uint_64t  _tt = gf_tab[(ui64_ptr(x)[1] >> 49) & MSK_80]; \
       
   297         ui64_ptr(r)[1] =  (ui64_ptr(x)[1] >> 1) & ~MSK_80                             \
       
   298             | ((ui64_ptr(x)[1] << 15) | (ui64_ptr(x)[0] >> 49)) & MSK_80;             \
       
   299         ui64_ptr(r)[0] = ((ui64_ptr(x)[0] >> 1) & ~MSK_80                             \
       
   300             |  (ui64_ptr(x)[0] << 15) & MSK_80) ^ _tt;                                \
       
   301     } while(0)  /* mul_x(r, x), little endian, 64-bit units: r = x times the field
           element x.  Bytes shift right one bit with the carry going into the top
           bit of the next byte (MSK_80 terms); the bit leaving byte 15 picks
           gf_tab[0x80] or gf_tab[0] for word 0.  Macro form: r and x are expanded
           several times and a local named _tt is declared. */
       
   302 
       
   303   #if defined( VERSION_1 )
       
   304 
       
   305     #define mul_x4(x) do { uint_64t   _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & MSK_F0];  \
       
   306         ui64_ptr(x)[1] =  (ui64_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui64_ptr(x)[1] << 12)   \
       
   307             | (ui64_ptr(x)[0] >> 52)) & MSK_F0;                                       \
       
   308         ui64_ptr(x)[0] = ((ui64_ptr(x)[0] >> 4) & ~MSK_F0                             \
       
   309             |  (ui64_ptr(x)[0] << 12) & MSK_F0) ^ _tt;                                \
       
   310     } while(0)  /* mul_x4(x), little endian, 64-bit units, VERSION_1: x times x^4
           in place.  Bytes shift right by a nibble with the low nibble carried
           into the high nibble of the next byte (MSK_F0 terms); the low nibble of
           byte 15 indexes gf_tab for word 0.  Macro form: x is expanded several
           times and a local named _tt is declared. */
       
   311 
       
   312   #else
       
   313 
       
   314     #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] >> 52) & 0xf0];        \
       
   315         bswap64_block(x, x, 2);                                                         \
       
   316         ui64_ptr(x)[1] = bswap_64((ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60));      \
       
   317         ui64_ptr(x)[0] = bswap_64((ui64_ptr(x)[0] >> 4)) ^ _tt;                         \
       
   318     } while(0)  /* Alternative mul_x4(x) for little endian 64-bit units, used only
           when VERSION_1 is not defined: converts with bswap64_block, shifts the
           two words right by 4 bits and converts back with bswap_64.  The bswap
           helpers are defined elsewhere; presumably byte reversal (confirm). */
       
   319 
       
   320   #endif
       
   321 
       
   322     #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] >> 56];     \
       
   323         ui64_ptr(x)[1] = (ui64_ptr(x)[1] << 8) | (ui64_ptr(x)[0] >> 56);    \
       
   324         ui64_ptr(x)[0] = (ui64_ptr(x)[0] << 8) ^ _tt;                       \
       
   325     } while(0)  /* mul_x8(x), little endian, 64-bit units: x times x^8 in place.
           Every byte moves to the next higher address; the old byte 15 indexes
           gf_tab and the entry is XORed into the two lowest bytes of word 0. */
       
   326 
       
   327 #elif BFR_UNIT == 32
       
   328 
       
   329     #define mul_x(r, x) do { uint_32t  _tt = gf_tab[(ui32_ptr(x)[3] >> 17) & MSK_80]; \
       
   330         ui32_ptr(r)[3] =  (ui32_ptr(x)[3] >> 1) & ~MSK_80 | ((ui32_ptr(x)[3] << 15)   \
       
   331             | (ui32_ptr(x)[2] >> 17)) & MSK_80;                                       \
       
   332         ui32_ptr(r)[2] =  (ui32_ptr(x)[2] >> 1) & ~MSK_80 | ((ui32_ptr(x)[2] << 15)   \
       
   333             | (ui32_ptr(x)[1] >> 17)) & MSK_80;                                       \
       
   334         ui32_ptr(r)[1] =  (ui32_ptr(x)[1] >> 1) & ~MSK_80 | ((ui32_ptr(x)[1] << 15)   \
       
   335             | (ui32_ptr(x)[0] >> 17)) & MSK_80;                                       \
       
   336         ui32_ptr(r)[0] = ((ui32_ptr(x)[0] >> 1) & ~MSK_80                             \
       
   337             | (ui32_ptr(x)[0] << 15) & MSK_80) ^ _tt;                                 \
       
   338     } while(0)  /* mul_x(r, x), little endian, 32-bit units: r = x times the field
           element x.  Bytes shift right one bit with carries into the top bit of
           the next byte ('<< 15' in a word, '>> 17' from the word below); the bit
           leaving byte 15 picks gf_tab[0x80] or gf_tab[0] for word 0.  Macro
           form: r and x are expanded several times; declares a local _tt. */
       
   339 
       
   340   #if defined( VERSION_1 )
       
   341 
       
   342     #define mul_x4(x) do { uint_32t   _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & MSK_F0];  \
       
   343         ui32_ptr(x)[3] =  (ui32_ptr(x)[3] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[3] << 12)   \
       
   344             | (ui32_ptr(x)[2] >> 20)) & MSK_F0;                                       \
       
   345         ui32_ptr(x)[2] =  (ui32_ptr(x)[2] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[2] << 12)   \
       
   346             | (ui32_ptr(x)[1] >> 20)) & MSK_F0;                                       \
       
   347         ui32_ptr(x)[1] =  (ui32_ptr(x)[1] >> 4) & ~MSK_F0 | ((ui32_ptr(x)[1] << 12)   \
       
   348             | (ui32_ptr(x)[0] >> 20)) & MSK_F0;                                       \
       
   349         ui32_ptr(x)[0] = ((ui32_ptr(x)[0] >> 4) & ~MSK_F0                             \
       
   350             |  (ui32_ptr(x)[0] << 12) & MSK_F0) ^ _tt;                                \
       
   351     } while(0)  /* mul_x4(x), little endian, 32-bit units, VERSION_1: x times x^4
           in place.  Bytes shift right by a nibble with the low nibble carried
           into the high nibble of the next byte ('<< 12' in a word, '>> 20' from
           the word below); the low nibble of byte 15 indexes gf_tab for word 0.
           Words are updated from index 3 down so each sees the old word below. */
       
   352 
       
   353   #else
       
   354 
       
   355     #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] >> 20) & 0xf0];    \
       
   356         bswap32_block(x, x, 4);                                                     \
       
   357         ui32_ptr(x)[3] = bswap_32((ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28));  \
       
   358         ui32_ptr(x)[2] = bswap_32((ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28));  \
       
   359         ui32_ptr(x)[1] = bswap_32((ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28));  \
       
   360         ui32_ptr(x)[0] = bswap_32((ui32_ptr(x)[0] >> 4)) ^ _tt;                     \
       
   361     } while(0)  /* Alternative mul_x4(x) for little endian 32-bit units, used only
           when VERSION_1 is not defined: converts with bswap32_block, shifts the
           four words right by 4 bits with carries from the word below and
           converts back with bswap_32.  The bswap helpers are defined elsewhere;
           presumably byte reversal (confirm). */
       
   362 
       
   363   #endif
       
   364 
       
   365 #define mul_x8(x) do { uint_32t   _tt = gf_tab[ui32_ptr(x)[3] >> 24];       \
       
   366         ui32_ptr(x)[3] = (ui32_ptr(x)[3] << 8) | (ui32_ptr(x)[2] >> 24);    \
       
   367         ui32_ptr(x)[2] = (ui32_ptr(x)[2] << 8) | (ui32_ptr(x)[1] >> 24);    \
       
   368         ui32_ptr(x)[1] = (ui32_ptr(x)[1] << 8) | (ui32_ptr(x)[0] >> 24);    \
       
   369         ui32_ptr(x)[0] = (ui32_ptr(x)[0] << 8) ^ _tt;                       \
       
   370     } while(0)  /* mul_x8(x), little endian, 32-bit units: x times x^8 in place.
           Every byte moves to the next higher address, each word taking its
           lowest byte from the top byte of the word below; the old byte 15
           indexes gf_tab and the entry is XORed into word 0. */
       
   371 
       
   372 #else
       
   373 
       
   374     #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1;          \
       
   375         ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \
       
   376         ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \
       
   377         ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \
       
   378         ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \
       
   379         ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \
       
   380         ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \
       
   381         ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \
       
   382         ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \
       
   383         ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \
       
   384         ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \
       
   385         ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \
       
   386         ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \
       
   387         ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \
       
   388         ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \
       
   389         ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \
       
   390         ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);   \
       
   391     } while(0)  /* mul_x(r, x), byte-wise fallback: r = x times the field element
           x.  Bytes 15..0 are each shifted right one bit, taking the top bit from
           the low bit of the byte below; 0xe1 is XORed into byte 0 when the low
           bit of byte 15 was set.  Macro form: r and x are expanded many times. */
       
   392 
       
   393     #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];         \
       
   394         ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);                \
       
   395         ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);                \
       
   396         ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);                \
       
   397         ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);                \
       
   398         ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);                \
       
   399         ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);                \
       
   400         ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);                \
       
   401         ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);                \
       
   402         ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);                \
       
   403         ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);                \
       
   404         ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);                \
       
   405         ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);                \
       
   406         ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);                \
       
   407         ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);                \
       
   408         ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt >> 8);  \
       
   409         ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt & 0xff);                         \
       
   410     } while(0)  /* mul_x4(x), byte-wise fallback, little endian: x times x^4 in
           place.  Bytes 15..0 are each shifted right by a nibble, taking the high
           nibble from the byte below; the old low nibble of byte 15 indexes
           gf_tab, whose high byte goes into byte 1 and low byte into byte 0. */
       
   411 
       
   412     #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]];   \
       
   413         memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);                    \
       
   414         ui8_ptr(x)[1] ^= (_tt >> 8);                                \
       
   415         ui8_ptr(x)[0] = (_tt & 0xff);                               \
       
   416     } while(0)  /* mul_x8(x), byte-wise fallback, little endian: x times x^8 in
           place.  The old byte 15 indexes gf_tab, memmove shifts bytes 0..14 up
           to 1..15, then the entry's high byte is XORed into byte 1 and its low
           byte replaces byte 0. */
       
   417 
       
   418 #endif 
       
   419 
       
   420 #endif
       
   421 
       
   422 #elif PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN
       
   423 
       
   424 #if defined( USE_INLINES )
       
   425 
       
   426 #if BFR_UNIT == 64
       
   427 
       
   428     gf_inline void mul_x(void *r, const void *x)
           /* Big endian, 64-bit units: r = x multiplied by the field element x.
              The two words are shifted right one bit as a single 128-bit value,
              word 1 taking its top bit from the low bit of word 0.  The low bit of
              word 1, moved to bit 7 of the index, selects gf_tab[0x80] or
              gf_tab[0]; the entry is shifted into the top 16 bits of word 0.
              Word 1 is written before word 0, so r may be the same buffer as x. */
       
   429     {   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff];
       
   430         ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63);
       
   431         ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48);
       
   432     }
       
   433 
       
   434     gf_inline void mul_x4(void *x)
           /* Big endian, 64-bit units: multiplies x by x^4 in place.  The low four
              bits of word 1, moved to bits 4..7, index gf_tab; the 128-bit value
              is shifted right 4 bits and the entry is XORed into the top 16 bits
              of word 0. */
       
   435     {   uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff];
       
   436         ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60);
       
   437         ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48);
       
   438     }
       
   439 
       
   440     gf_inline void mul_x8(void *x)
           /* Big endian, 64-bit units: multiplies x by x^8 in place.  The low byte
              of word 1 indexes gf_tab; the 128-bit value is shifted right 8 bits
              and the entry is XORed into the top 16 bits of word 0. */
       
   441     {   uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff];
       
   442         ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56);
       
   443         ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48);
       
   444     }
       
   445 
       
   446 #elif BFR_UNIT == 32
       
   447 
       
   448     gf_inline void mul_x(void *r, const void *x)
           /* Big endian, 32-bit units: r = x multiplied by the field element x.
              The four words are shifted right one bit as a single 128-bit value,
              each word taking its top bit from the low bit of the word below.  The
              low bit of word 3 selects gf_tab[0x80] or gf_tab[0], shifted into the
              top 16 bits of word 0.  Words are written from index 3 down, so r may
              be the same buffer as x. */
       
   449     {   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff];
       
   450         ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31);
       
   451         ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31);
       
   452         ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31);
       
   453         ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16);
       
   454     }
       
   455 
       
   456     gf_inline void mul_x4(void *x)
           /* Big endian, 32-bit units: multiplies x by x^4 in place.  The low four
              bits of word 3, moved to bits 4..7, index gf_tab; the four words are
              shifted right 4 bits with carries from the word below and the entry
              is XORed into the top 16 bits of word 0. */
       
   457     {   uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff];
       
   458         ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28);
       
   459         ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28);
       
   460         ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28);
       
   461         ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16);
       
   462     }
       
   463 
       
   464     gf_inline void mul_x8(void *x)
           /* Big endian, 32-bit units: multiplies x by x^8 in place.  The low byte
              of word 3 indexes gf_tab; the four words are shifted right 8 bits
              with carries from the word below and the entry is XORed into the top
              16 bits of word 0. */
       
   465     {   uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff];
       
   466         ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24);
       
   467         ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24);
       
   468         ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24);
       
   469         ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16);
       
   470     }
       
   471 
       
   472 #else
       
   473 
       
   474     gf_inline void mul_x(void *r, const void *x)
           /* Byte-wise fallback in the big endian branch (BFR_UNIT neither 64 nor
              32): r = x multiplied by the field element x.  Bytes are processed
              from index 15 down to 0; each is shifted right one bit and takes its
              top bit from the low bit of the byte below it.  If the low bit of
              byte 15 was set, 0xe1 is XORed into byte 0.  r may be the same buffer
              as x because each step only reads bytes at its own or a lower index. */
       
   475     {   uint_8t _tt = ui8_ptr(x)[15] & 1;
       
   476         ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7);
       
   477         ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7);
       
   478         ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7);
       
   479         ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7);
       
   480         ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7);
       
   481         ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7);
       
   482         ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7);
       
   483         ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7);
       
   484         ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7);
       
   485         ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7);
       
   486         ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7);
       
   487         ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7);
       
   488         ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7);
       
   489         ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7);
       
   490         ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7);
       
   491         ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);
       
   492     }
       
   493 
       
   494     gf_inline void mul_x4(void *x)
           /* Byte-wise fallback, big endian: multiplies x by x^4 in place.  Bytes
              are processed from index 15 down to 0; each is shifted right by a
              nibble and takes its high nibble from the low nibble of the byte
              below it.  The old low nibble of byte 15, moved to bits 4..7, indexes
              gf_tab; here the entry's low byte is XORed into byte 1 and its high
              byte into byte 0 (the reverse of the little endian version). */
       
   495     {
       
   496         uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff];
       
   497         ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);
       
   498         ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);
       
   499         ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);
       
   500         ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);
       
   501         ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);
       
   502         ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);
       
   503         ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);
       
   504         ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);
       
   505         ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);
       
   506         ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);
       
   507         ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);
       
   508         ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);
       
   509         ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);
       
   510         ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);
       
   511         ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff);
       
   512         ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8);
       
   513     }
       
   514 
       
   515     gf_inline void mul_x8(void *x)
           /* Byte-wise fallback, big endian: multiplies x by x^8 in place.  The old
              byte 15 indexes gf_tab, memmove shifts bytes 0..14 up to 1..15, then
              the entry's low byte is XORed into byte 1 and its high byte replaces
              byte 0 (the reverse of the little endian version). */
       
   516     {   uint_16t _tt = gf_tab[ui8_ptr(x)[15]];
       
   517         memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);
       
   518         ui8_ptr(x)[1] ^= (_tt & 0xff);
       
   519         ui8_ptr(x)[0] = (_tt >> 8);
       
   520     }
       
   521 
       
   522 #endif
       
   523 
       
   524 #else   /* DEFINES */
       
   525 
       
   526 #if BFR_UNIT == 64
       
   527 
       
   528     #define mul_x(r, x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 7) & 0xff];   \
       
   529         ui64_ptr(r)[1] = (ui64_ptr(x)[1] >> 1) | (ui64_ptr(x)[0] << 63);            \
       
   530         ui64_ptr(r)[0] = (ui64_ptr(x)[0] >> 1) ^ (_tt << 48);                       \
       
   531     } while(0)  /* mul_x(r, x), big endian, 64-bit units: r = x times the field
           element x.  A one-bit right shift of the 128-bit value; the low bit of
           word 1 picks gf_tab[0x80] or gf_tab[0], placed in the top 16 bits of
           word 0.  Macro form: r and x are expanded several times and a local
           named _tt is declared. */
       
   532 
       
   533     #define mul_x4(x) do { uint_64t _tt = gf_tab[(ui64_ptr(x)[1] << 4) & 0xff]; \
       
   534         ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 4) | (ui64_ptr(x)[0] << 60);        \
       
   535         ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 4) ^ (_tt << 48);                   \
       
   536     } while(0)  /* mul_x4(x), big endian, 64-bit units: x times x^4 in place.  A
           4-bit right shift of the 128-bit value; the low four bits of word 1
           index gf_tab (in bits 4..7) and the entry goes into the top 16 bits of
           word 0. */
       
   537 
       
   538     #define mul_x8(x) do { uint_64t _tt = gf_tab[ui64_ptr(x)[1] & 0xff];    \
       
   539         ui64_ptr(x)[1] = (ui64_ptr(x)[1] >> 8) | (ui64_ptr(x)[0] << 56);    \
       
   540         ui64_ptr(x)[0] = (ui64_ptr(x)[0] >> 8) ^ (_tt << 48);               \
       
   541     } while(0)
       
   542 
       
   543 #elif BFR_UNIT == 32
       
   544 
       
   545     #define mul_x(r, x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 7) & 0xff];   \
       
   546         ui32_ptr(r)[3] = (ui32_ptr(x)[3] >> 1) | (ui32_ptr(x)[2] << 31);            \
       
   547         ui32_ptr(r)[2] = (ui32_ptr(x)[2] >> 1) | (ui32_ptr(x)[1] << 31);            \
       
   548         ui32_ptr(r)[1] = (ui32_ptr(x)[1] >> 1) | (ui32_ptr(x)[0] << 31);            \
       
   549         ui32_ptr(r)[0] = (ui32_ptr(x)[0] >> 1) ^ (_tt << 16);                       \
       
   550     } while(0)
       
   551 
       
   552     #define mul_x4(x) do { uint_32t _tt = gf_tab[(ui32_ptr(x)[3] << 4) & 0xff]; \
       
   553         ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 4) | (ui32_ptr(x)[2] << 28);        \
       
   554         ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 4) | (ui32_ptr(x)[1] << 28);        \
       
   555         ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 4) | (ui32_ptr(x)[0] << 28);        \
       
   556         ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 4) ^ (_tt << 16);                   \
       
   557     } while(0)
       
   558 
       
   559     #define mul_x8(x) do { uint_32t _tt = gf_tab[ui32_ptr(x)[3] & 0xff];    \
       
   560         ui32_ptr(x)[3] = (ui32_ptr(x)[3] >> 8) | (ui32_ptr(x)[2] << 24);    \
       
   561         ui32_ptr(x)[2] = (ui32_ptr(x)[2] >> 8) | (ui32_ptr(x)[1] << 24);    \
       
   562         ui32_ptr(x)[1] = (ui32_ptr(x)[1] >> 8) | (ui32_ptr(x)[0] << 24);    \
       
   563         ui32_ptr(x)[0] = (ui32_ptr(x)[0] >> 8) ^ (_tt << 16);               \
       
   564     } while(0)
       
   565 
       
   566 #else
       
   567 
       
   568     #define mul_x(r, x) do { uint_8t _tt = ui8_ptr(x)[15] & 1;          \
       
   569         ui8_ptr(r)[15] = (ui8_ptr(x)[15] >> 1) | (ui8_ptr(x)[14] << 7); \
       
   570         ui8_ptr(r)[14] = (ui8_ptr(x)[14] >> 1) | (ui8_ptr(x)[13] << 7); \
       
   571         ui8_ptr(r)[13] = (ui8_ptr(x)[13] >> 1) | (ui8_ptr(x)[12] << 7); \
       
   572         ui8_ptr(r)[12] = (ui8_ptr(x)[12] >> 1) | (ui8_ptr(x)[11] << 7); \
       
   573         ui8_ptr(r)[11] = (ui8_ptr(x)[11] >> 1) | (ui8_ptr(x)[10] << 7); \
       
   574         ui8_ptr(r)[10] = (ui8_ptr(x)[10] >> 1) | (ui8_ptr(x)[ 9] << 7); \
       
   575         ui8_ptr(r)[ 9] = (ui8_ptr(x)[ 9] >> 1) | (ui8_ptr(x)[ 8] << 7); \
       
   576         ui8_ptr(r)[ 8] = (ui8_ptr(x)[ 8] >> 1) | (ui8_ptr(x)[ 7] << 7); \
       
   577         ui8_ptr(r)[ 7] = (ui8_ptr(x)[ 7] >> 1) | (ui8_ptr(x)[ 6] << 7); \
       
   578         ui8_ptr(r)[ 6] = (ui8_ptr(x)[ 6] >> 1) | (ui8_ptr(x)[ 5] << 7); \
       
   579         ui8_ptr(r)[ 5] = (ui8_ptr(x)[ 5] >> 1) | (ui8_ptr(x)[ 4] << 7); \
       
   580         ui8_ptr(r)[ 4] = (ui8_ptr(x)[ 4] >> 1) | (ui8_ptr(x)[ 3] << 7); \
       
   581         ui8_ptr(r)[ 3] = (ui8_ptr(x)[ 3] >> 1) | (ui8_ptr(x)[ 2] << 7); \
       
   582         ui8_ptr(r)[ 2] = (ui8_ptr(x)[ 2] >> 1) | (ui8_ptr(x)[ 1] << 7); \
       
   583         ui8_ptr(r)[ 1] = (ui8_ptr(x)[ 1] >> 1) | (ui8_ptr(x)[ 0] << 7); \
       
   584         ui8_ptr(r)[ 0] = (ui8_ptr(x)[ 0] >> 1) ^ (_tt ? 0xe1 : 0x00);   \
       
   585     } while(0)
       
   586 
       
   587     #define mul_x4(x) do { uint_16t _tt = gf_tab[(ui8_ptr(x)[15] << 4) & 0xff]; \
       
   588         ui8_ptr(x)[15] =  (ui8_ptr(x)[15] >> 4) | (ui8_ptr(x)[14] << 4);        \
       
   589         ui8_ptr(x)[14] =  (ui8_ptr(x)[14] >> 4) | (ui8_ptr(x)[13] << 4);        \
       
   590         ui8_ptr(x)[13] =  (ui8_ptr(x)[13] >> 4) | (ui8_ptr(x)[12] << 4);        \
       
   591         ui8_ptr(x)[12] =  (ui8_ptr(x)[12] >> 4) | (ui8_ptr(x)[11] << 4);        \
       
   592         ui8_ptr(x)[11] =  (ui8_ptr(x)[11] >> 4) | (ui8_ptr(x)[10] << 4);        \
       
   593         ui8_ptr(x)[10] =  (ui8_ptr(x)[10] >> 4) | (ui8_ptr(x)[ 9] << 4);        \
       
   594         ui8_ptr(x)[ 9] =  (ui8_ptr(x)[ 9] >> 4) | (ui8_ptr(x)[ 8] << 4);        \
       
   595         ui8_ptr(x)[ 8] =  (ui8_ptr(x)[ 8] >> 4) | (ui8_ptr(x)[ 7] << 4);        \
       
   596         ui8_ptr(x)[ 7] =  (ui8_ptr(x)[ 7] >> 4) | (ui8_ptr(x)[ 6] << 4);        \
       
   597         ui8_ptr(x)[ 6] =  (ui8_ptr(x)[ 6] >> 4) | (ui8_ptr(x)[ 5] << 4);        \
       
   598         ui8_ptr(x)[ 5] =  (ui8_ptr(x)[ 5] >> 4) | (ui8_ptr(x)[ 4] << 4);        \
       
   599         ui8_ptr(x)[ 4] =  (ui8_ptr(x)[ 4] >> 4) | (ui8_ptr(x)[ 3] << 4);        \
       
   600         ui8_ptr(x)[ 3] =  (ui8_ptr(x)[ 3] >> 4) | (ui8_ptr(x)[ 2] << 4);        \
       
   601         ui8_ptr(x)[ 2] =  (ui8_ptr(x)[ 2] >> 4) | (ui8_ptr(x)[ 1] << 4);        \
       
   602         ui8_ptr(x)[ 1] = ((ui8_ptr(x)[ 1] >> 4) | (ui8_ptr(x)[ 0] << 4)) ^ (_tt & 0xff);    \
       
   603         ui8_ptr(x)[ 0] =  (ui8_ptr(x)[ 0] >> 4) ^ (_tt >> 8);                   \
       
   604     } while(0)
       
   605 
       
   606     #define mul_x8(x) do { uint_16t _tt = gf_tab[ui8_ptr(x)[15]];   \
       
   607         memmove(ui8_ptr(x) + 1, ui8_ptr(x), 15);                    \
       
   608         ui8_ptr(x)[1] ^= (_tt & 0xff);                              \
       
   609         ui8_ptr(x)[0] = (_tt >> 8);                                 \
       
   610     } while(0)
       
   611 
       
   612 #endif
       
   613 
       
   614 #endif
       
   615 
       
   616 #else
       
   617 #  error Platform byte order has not been set. 
       
   618 #endif
       
   619 
       
   620 /*  A slow generic version of gf_mul (a = a * b) */
       
   621 
       
   622 void gf_mul(void *a, const void* b);
       
   623 
       
   624 /*  This version uses 64k bytes of table space on the stack.
       
   625     A 16 byte buffer has to be multiplied by a 16 byte key
       
   626     value in GF(128).  If we consider a GF(128) value in
       
   627     the buffer's lowest byte, we can construct a table of
       
   628     the 256 16 byte values that result from the 256 values
       
   629     of this byte.  This requires 4096 bytes. But we also
       
   630     need tables for each of the 15 higher bytes in the
       
   631     buffer as well, which makes 64 kbytes in total.
       
   632 */
       
   633 
       
   634 void init_64k_table(unsigned char g[], void *t);
       
   635 typedef uint_32t            (*gf_t64k)[256][GF_BYTE_LEN >> 2];  /* pointer to 256-row tables, one table per byte position of the buffer */
       
   636 #define tab64k(x)           ((gf_t64k)(x))  /* argument parenthesized so the cast covers the whole expression */
       
   637 #define xor_64k(i,a,t,r)    xor_block_aligned(r, tab64k(t)[i][(a)[i]])  /* fold the row of table i selected by byte a[i] into r */
       
   638 
       
   639 #if defined( USE_INLINES )
       
   640 
       
   641 #if defined( UNROLL_LOOPS )
       
   642 
       
   643 gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r)  /* unrolled table multiply: a is the 16-byte operand and receives the result, t is the table set (cast via tab64k), r is a work block */
       
   644 {
       
   645     move_block_aligned(r, tab64k(t)[0][a[0]]); xor_64k( 1, a, t, r);  /* r starts from table 0's row for a[0]; every later step folds in table i's row for a[i] */
       
   646     xor_64k( 2, a, t, r); xor_64k( 3, a, t, r);
       
   647     xor_64k( 4, a, t, r); xor_64k( 5, a, t, r);
       
   648     xor_64k( 6, a, t, r); xor_64k( 7, a, t, r);
       
   649     xor_64k( 8, a, t, r); xor_64k( 9, a, t, r);
       
   650     xor_64k(10, a, t, r); xor_64k(11, a, t, r);
       
   651     xor_64k(12, a, t, r); xor_64k(13, a, t, r);
       
   652     xor_64k(14, a, t, r); xor_64k(15, a, t, r);
       
   653     move_block_aligned(a, r);  /* presumably copies the accumulated block back over a (helper defined elsewhere) */
       
   654 }
       
   655 
       
   656 #else
       
   657 
       
   658 gf_inline void gf_mul_64k(unsigned char a[], void *t, void *r)  /* loop form of the table multiply: a is operand and result, t the table set, r a work block */
       
   659 {   int i;
       
   660     move_block_aligned(r, tab64k(t)[0][a[0]]);  /* r starts from table 0's row for byte a[0] */
       
   661     for(i = 1; i < GF_BYTE_LEN; ++i)  /* remaining byte positions, each with its own table */
       
   662         xor_64k(i, a, t, r);
       
   663     move_block_aligned(a, r);  /* presumably copies the accumulated block back over a (helper defined elsewhere) */
       
   664 }
       
   665 
       
   666 #endif
       
   667 
       
   668 #else
       
   669 
       
   670 #if defined( UNROLL_LOOPS )  /* unrolled macro only when unrolling is requested, matching the inline gf_mul_64k and the gf_mul_8k macros */
       
   671 
       
   672 #define gf_mul_64k(a, t, r) do {  /* a = table product of a; t: table set, r: work block */ \
       
   673     move_block_aligned(r, tab64k(t)[0][a[0]]);  \
       
   674     xor_64k( 1, a, t, r);                       \
       
   675     xor_64k( 2, a, t, r); xor_64k( 3, a, t, r); \
       
   676     xor_64k( 4, a, t, r); xor_64k( 5, a, t, r); \
       
   677     xor_64k( 6, a, t, r); xor_64k( 7, a, t, r); \
       
   678     xor_64k( 8, a, t, r); xor_64k( 9, a, t, r); \
       
   679     xor_64k(10, a, t, r); xor_64k(11, a, t, r); \
       
   680     xor_64k(12, a, t, r); xor_64k(13, a, t, r); \
       
   681     xor_64k(14, a, t, r); xor_64k(15, a, t, r); \
       
   682     move_block_aligned(a, r);                   \
       
   683 } while(0)
       
   684 
       
   685 #else
       
   686 
       
   687 #define gf_mul_64k(a, t, r) do { int _i;  /* _i rather than i: cannot capture an i used in the caller's arguments */ \
       
   688     move_block_aligned(r, tab64k(t)[0][a[0]]);  \
       
   689     for(_i = 1; _i < GF_BYTE_LEN; ++_i)         \
       
   690     {   xor_64k(_i, a, t, r);                   \
       
   691     }                                           \
       
   692     move_block_aligned(a, r);                   \
       
   693 } while(0)
       
   694 
       
   695 #endif
       
   696 
       
   697 #endif
       
   698 
       
   699 /*  This version uses 8k bytes of table space on the stack.
       
   700     A 16 byte buffer has to be multiplied by a 16 byte key
       
   701     value in GF(128).  If we consider a GF(128) value in
       
   702     the buffer's lowest 4-bits, we can construct a table of
       
   703     the 16 16 byte values that result from the 16 values
       
   704     of these 4 bits. This requires 256 bytes. But we also
       
   705     need tables for each of the 31 higher 4 bit groups,
       
   706     which makes 8 kbytes in total.
       
   707 */
       
   708 
       
   709 void init_8k_table(unsigned char g[], void *t);
       
   710 
       
   711 typedef uint_32t    (*gf_t8k)[16][GF_BYTE_LEN >> 2];
       
   712 #define tab8k(x)    ((gf_t8k)x)
       
   713 #define xor_8k(i,a,t,r)   \
       
   714     xor_block_aligned(r, tab8k(t)[i + i][a[i] & 15]); \
       
   715     xor_block_aligned(r, tab8k(t)[i + i + 1][a[i] >> 4])
       
   716 
       
   717 #if defined( USE_INLINES )
       
   718 
       
   719 #if defined( UNROLL_LOOPS )
       
   720 
       
   721 gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r)
       
   722 {
       
   723     move_block_aligned(r, tab8k(t)[0][a[0] & 15]);
       
   724     xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);
       
   725                 xor_8k( 1, a, t, r); xor_8k( 2, a, t, r); xor_8k( 3, a, t, r);
       
   726     xor_8k( 4, a, t, r); xor_8k( 5, a, t, r); xor_8k( 6, a, t, r); xor_8k( 7, a, t, r);
       
   727     xor_8k( 8, a, t, r); xor_8k( 9, a, t, r); xor_8k(10, a, t, r); xor_8k(11, a, t, r);
       
   728     xor_8k(12, a, t, r); xor_8k(13, a, t, r); xor_8k(14, a, t, r); xor_8k(15, a, t, r);
       
   729     move_block_aligned(a, r);
       
   730 }
       
   731 
       
   732 #else
       
   733 
       
   734 gf_inline void gf_mul_8k(unsigned char a[], void *t, void *r)
       
   735 {   int i;
       
   736     memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN);
       
   737     xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);
       
   738     for(i = 1; i < GF_BYTE_LEN; ++i)
       
   739     {   xor_8k(i, a, t, r);
       
   740     }
       
   741     memcpy(a, r, GF_BYTE_LEN);
       
   742 }
       
   743 
       
   744 #endif
       
   745 
       
   746 #else
       
   747 
       
   748 #if defined( UNROLL_LOOPS )
       
   749 
       
   750 #define gf_mul_8k(a, t, r) do {                     \
       
   751     move_block_aligned(r, tab8k(t)[0][a[0] & 15]);  \
       
   752     xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);   \
       
   753     xor_8k( 1, a, t, r); xor_8k( 2, a, t, r);       \
       
   754     xor_8k( 3, a, t, r); xor_8k( 4, a, t, r);       \
       
   755     xor_8k( 5, a, t, r); xor_8k( 6, a, t, r);       \
       
   756     xor_8k( 7, a, t, r); xor_8k( 8, a, t, r);       \
       
   757     xor_8k( 9, a, t, r); xor_8k(10, a, t, r);       \
       
   758     xor_8k(11, a, t, r); xor_8k(12, a, t, r);       \
       
   759     xor_8k(13, a, t, r); xor_8k(14, a, t, r);       \
       
   760     xor_8k(15, a, t, r); move_block_aligned(a, r);  \
       
   761 } while(0)
       
   762 
       
   763 #else
       
   764 
       
   765 #define gf_mul_8k(a, t, r) do { int i;              \
       
   766     memcpy(r, tab8k(t)[0][a[0] & 15], GF_BYTE_LEN); \
       
   767     xor_block_aligned(r, tab8k(t)[1][a[0] >> 4]);   \
       
   768     for(i = 1; i < GF_BYTE_LEN; ++i)                \
       
   769     {   xor_8k(i, a, t, r);                         \
       
   770     }                                               \
       
   771     memcpy(a, r, GF_BYTE_LEN);                      \
       
   772 } while(0)
       
   773 
       
   774 #endif
       
   775 
       
   776 #endif
       
   777 
       
   778 /*  This version uses 4k bytes of table space on the stack.
       
   779     A 16 byte buffer has to be multiplied by a 16 byte key
       
   780     value in GF(128).  If we consider a GF(128) value in a
       
   781     single byte, we can construct a table of the 256 16 byte
       
   782     values that result from the 256 values of this byte.
       
   783     This requires 4096 bytes. If we take the highest byte in
       
   784     the buffer and use this table to get the result, we then
       
   785     have to multiply by x^120 to get the final value. For the
       
   786     next highest byte the result has to be multiplied by x^112
       
   787     and so on. But we can do this by accumulating the result
       
   788     in an accumulator starting with the result for the top
       
   789     byte.  We repeatedly multiply the accumulator value by
       
   790     x^8 and then add in (i.e. xor) the 16 bytes of the next
       
   791     lower byte in the buffer, stopping when we reach the
       
   792     lowest byte. This requires a 4096 byte table.
       
   793 */
       
   794 
       
   795 void init_4k_table(unsigned char g[], void *t);
       
   796 
       
   797 typedef uint_32t        (*gf_t4k)[GF_BYTE_LEN >> 2];
       
   798 #define tab4k(x)        ((gf_t4k)x)
       
   799 #define xor_4k(i,a,t,r) mul_x8(r); xor_block_aligned(r, tab4k(t)[a[i]])
       
   800 
       
   801 #if defined( USE_INLINES )
       
   802 
       
   803 #if defined( UNROLL_LOOPS )
       
   804 
       
   805 gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r)
       
   806 {
       
   807     move_block_aligned(r,tab4k(t)[a[15]]);
       
   808     xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r);
       
   809     xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r);
       
   810     xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r);
       
   811     xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r);
       
   812     xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r);
       
   813     move_block_aligned(a, r);
       
   814 }
       
   815 
       
   816 #else
       
   817 
       
   818 gf_inline void gf_mul_4k(unsigned char a[], void *t, void *r)  /* loop form of the single-table multiply: a is operand and result, t the table (cast via tab4k), r a work block */
       
   819 {   int i = 15;
       
   820     move_block_aligned(r,tab4k(t)[a[15]]);  /* accumulator starts from the table row for the top byte a[15] */
       
   821     while(i--)  /* post-decrement: the body sees i = 14 down to 0 */
       
   822     {
       
   823         xor_4k(i, a, t, r);  /* multiply r by x^8, then fold in the row for a[i] */
       
   824     }
       
   825     move_block_aligned(a, r);  /* presumably copies the accumulated block back over a (helper defined elsewhere) */
       
   826 }
       
   827 
       
   828 #endif
       
   829 
       
   830 #else
       
   831 
       
   832 #if defined( UNROLL_LOOPS )
       
   833 
       
   834 #define gf_mul_4k(a, t, r) do {                                     \
       
   835     move_block_aligned(r,tab4k(t)[a[15]]);                          \
       
   836     xor_4k(14, a, t, r); xor_4k(13, a, t, r); xor_4k(12, a, t, r);  \
       
   837     xor_4k(11, a, t, r); xor_4k(10, a, t, r); xor_4k( 9, a, t, r);  \
       
   838     xor_4k( 8, a, t, r); xor_4k( 7, a, t, r); xor_4k( 6, a, t, r);  \
       
   839     xor_4k( 5, a, t, r); xor_4k( 4, a, t, r); xor_4k( 3, a, t, r);  \
       
   840     xor_4k( 2, a, t, r); xor_4k( 1, a, t, r); xor_4k( 0, a, t, r);  \
       
   841     move_block_aligned(a, r);                                       \
       
   842 } while(0)
       
   843 
       
   844 #else
       
   845 
       
   846 #define gf_mul_4k(a, t, r) do { int i = 15; \
       
   847     move_block_aligned(r,tab4k(t)[a[15]]);  \
       
   848     while(i--)                              \
       
   849     {   xor_4k(i, a, t, r);                 \
       
   850     }                                       \
       
   851     move_block_aligned(a, r);               \
       
   852 } while(0)
       
   853 
       
   854 #endif
       
   855 
       
   856 #endif
       
   857 
       
   858 /*  This version uses 256 bytes of table space on the stack.
       
   859     A 16 byte buffer has to be multiplied by a 16 byte key
       
   860     value in GF(128).  If we consider a GF(128) value in a
       
   861     single 4-bit nibble, we can construct a table of the 16
       
   862     16 byte values that result from the 16 values of this
       
   863     nibble.  This requires 256 bytes. If we take the highest
       
   864     4-bit nibble in the buffer and use this table to get the
       
   865     result, we then have to multiply by x^124 to get the
       
   866     final value. For the next highest nibble the result has to
       
   867     be multiplied by x^120 and so on. But we can do this by
       
   868     accumulating the result in an accumulator starting with
       
   869     the result for the top nibble.  We repeatedly multiply
       
   870     the accumulator value by x^4 and then add in (i.e. xor)
       
   871     the 16 byte table value for the next lower nibble in the buffer,
       
   872     stopping when we reach the lowest nibble. This uses
       
   873     a 256 byte table.
       
   874 */
       
   875 
       
   876 void init_256_table(unsigned char g[], void *t);
       
   877 
       
   878 typedef uint_32t    (*gf_t256)[GF_BYTE_LEN >> 2];
       
   879 #define tab256(t)   ((gf_t256)t)
       
   880 #define xor_256(i,a,t,r)    \
       
   881     mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] & 15]);  \
       
   882     mul_x4(r); xor_block_aligned(r, tab256(t)[a[i] >> 4])
       
   883 
       
   884 #if defined( USE_INLINES )
       
   885 
       
   886 #if defined( UNROLL_LOOPS )
       
   887 
       
   888 gf_inline void gf_mul_256(unsigned char a[], void *t, void *r)
       
   889 {
       
   890     move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r);
       
   891     xor_block_aligned(r, tab256(t)[a[15] >> 4]);
       
   892     xor_256(14, a, t, r); xor_256(13, a, t, r);
       
   893     xor_256(12, a, t, r); xor_256(11, a, t, r);
       
   894     xor_256(10, a, t, r); xor_256( 9, a, t, r);
       
   895     xor_256( 8, a, t, r); xor_256( 7, a, t, r);
       
   896     xor_256( 6, a, t, r); xor_256( 5, a, t, r);
       
   897     xor_256( 4, a, t, r); xor_256( 3, a, t, r);
       
   898     xor_256( 2, a, t, r); xor_256( 1, a, t, r);
       
   899     xor_256( 0, a, t, r); move_block_aligned(a, r);
       
   900 }
       
   901 
       
   902 #else
       
   903 
       
   904 gf_inline void gf_mul_256(unsigned char a[], void *t, void *r)  /* loop form of the nibble-table multiply: a is operand and result, t the table (cast via tab256), r a work block */
       
   905 {   int i = 15;
       
   906     move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r);  /* start from the row for the low nibble of a[15], then multiply by x^4 */
       
   907     xor_block_aligned(r, tab256(t)[a[15] >> 4]);  /* fold in the row for the high nibble of a[15] */
       
   908     while(i--)  /* post-decrement: the body sees i = 14 down to 0 */
       
   909     {   xor_256(i, a, t, r);  /* two x^4 steps per byte, low nibble first */
       
   910     }
       
   911     move_block_aligned(a, r);  /* presumably copies the accumulated block back over a (helper defined elsewhere) */
       
   912 }
       
   913 
       
   914 #endif
       
   915 
       
   916 #else
       
   917 
       
   918 #if defined( UNROLL_LOOPS )
       
   919 
       
   920 #define gf_mul_256(a, t, r) do {                            \
       
   921     move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); \
       
   922     xor_block_aligned(r, tab256(t)[a[15] >> 4]);            \
       
   923     xor_256(14, a, t, r); xor_256(13, a, t, r);             \
       
   924     xor_256(12, a, t, r); xor_256(11, a, t, r);             \
       
   925     xor_256(10, a, t, r); xor_256( 9, a, t, r);             \
       
   926     xor_256( 8, a, t, r); xor_256( 7, a, t, r);             \
       
   927     xor_256( 6, a, t, r); xor_256( 5, a, t, r);             \
       
   928     xor_256( 4, a, t, r); xor_256( 3, a, t, r);             \
       
   929     xor_256( 2, a, t, r); xor_256( 1, a, t, r);             \
       
   930     xor_256( 0, a, t, r); move_block_aligned(a, r);         \
       
   931 } while(0)
       
   932 
       
   933 #else
       
   934 
       
   935 #define gf_mul_256(a, t, r) do { int i = 15;                \
       
   936     move_block_aligned(r,tab256(t)[a[15] & 15]); mul_x4(r); \
       
   937     xor_block_aligned(r, tab256(t)[a[15] >> 4]);            \
       
   938     while(i--)                                              \
       
   939     {   xor_256(i, a, t, r);                                \
       
   940     }                                                       \
       
   941     move_block_aligned(a, r);                               \
       
   942 } while(0)
       
   943 
       
   944 #endif
       
   945 
       
   946 #endif
       
   947 
       
   948 #if defined(__cplusplus)
       
   949 }
       
   950 #endif
       
   951 
       
   952 #endif