Mailing List Archive

[PATCH 4/5] rijndael/ppc: re-implement single-block mode, and implement OCB block cipher
Impressive speed wins over the cryptogams version:

Also, easier to maintain than an assembly version.

8x was only marginally faster than 6x. Probably could be sped up
with a vectorgather instruction.

Before:
ECB enc | 2.84 ns/B 336.1 MiB/s 5.38 c/B 1895
ECB dec | 2.89 ns/B 330.6 MiB/s 5.47 c/B 1895
CBC enc | 1.05 ns/B 908.3 MiB/s 1.99 c/B 1895
CBC dec | 0.221 ns/B 4315 MiB/s 0.419 c/B 1895
CFB enc | 4.41 ns/B 216.4 MiB/s 8.35 c/B 1895
CFB dec | 4.88 ns/B 195.3 MiB/s 9.26 c/B 1895
OFB enc | 5.06 ns/B 188.4 MiB/s 9.59 c/B 1895
OFB dec | 5.07 ns/B 188.2 MiB/s 9.60 c/B 1895
CTR enc | 0.218 ns/B 4374 MiB/s 0.413 c/B 1895
CTR dec | 0.219 ns/B 4349 MiB/s 0.416 c/B 1895
XTS enc | 0.681 ns/B 1400 MiB/s 1.29 c/B 1895
XTS dec | 0.687 ns/B 1387 MiB/s 1.30 c/B 1895
CCM enc | 4.21 ns/B 226.4 MiB/s 5.32 c/B 1264
CCM dec | 4.21 ns/B 226.7 MiB/s 5.32 c/B 1264
CCM auth | 3.99 ns/B 239.2 MiB/s 5.04 c/B 1264
EAX enc | 4.20 ns/B 227.2 MiB/s 5.30 c/B 1264
EAX dec | 4.21 ns/B 226.5 MiB/s 5.32 c/B 1264
EAX auth | 3.97 ns/B 239.9 MiB/s 5.02 c/B 1264
GCM enc | 19.81 ns/B 48.14 MiB/s 25.03 c/B 1264
GCM dec | 19.79 ns/B 48.18 MiB/s 25.01 c/B 1264
GCM auth | 19.55 ns/B 48.78 MiB/s 24.71 c/B 1264
OCB enc | 17.53 ns/B 54.41 MiB/s 14.77 c/B 842.4
OCB dec | 13.89 ns/B 68.67 MiB/s 17.55 c/B 1263
OCB auth | 9.14 ns/B 104.4 MiB/s 11.54 c/B 1264

After:
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 1.98 ns/B 482.6 MiB/s 3.75 c/B 1895 <=====
ECB dec | 1.80 ns/B 529.3 MiB/s 3.42 c/B 1895 <=====
CBC enc | 1.05 ns/B 907.7 MiB/s 1.99 c/B 1895
CBC dec | 0.221 ns/B 4317 MiB/s 0.419 c/B 1895
CFB enc | 1.65 ns/B 578.5 MiB/s 3.12 c/B 1895
CFB dec | 1.03 ns/B 925.9 MiB/s 1.95 c/B 1895
OFB enc | 2.34 ns/B 408.2 MiB/s 3.83 c/B 1638
OFB dec | 2.33 ns/B 410.1 MiB/s 3.81 c/B 1638
CTR enc | 0.216 ns/B 4416 MiB/s 0.409 c/B 1895
CTR dec | 0.216 ns/B 4422 MiB/s 0.409 c/B 1895
XTS enc | 0.557 ns/B 1712 MiB/s 1.06 c/B 1895
XTS dec | 0.561 ns/B 1701 MiB/s 1.06 c/B 1895
CCM enc | 1.87 ns/B 509.9 MiB/s 3.54 c/B 1895
CCM dec | 1.87 ns/B 509.8 MiB/s 3.55 c/B 1895
CCM auth | 1.65 ns/B 576.4 MiB/s 3.14 c/B 1895
EAX enc | 1.87 ns/B 510.3 MiB/s 3.54 c/B 1895
EAX dec | 1.87 ns/B 510.0 MiB/s 3.54 c/B 1895
EAX auth | 1.65 ns/B 576.9 MiB/s 3.13 c/B 1895
GCM enc | 3.55 ns/B 268.7 MiB/s 6.73 c/B 1895
GCM dec | 3.55 ns/B 268.7 MiB/s 6.73 c/B 1895
GCM auth | 3.33 ns/B 286.2 MiB/s 6.32 c/B 1895
OCB enc | 0.426 ns/B 2241 MiB/s 0.807 c/B 1895 <====
OCB dec | 0.409 ns/B 2333 MiB/s 0.775 c/B 1895 <====
OCB auth | 1.23 ns/B 772.7 MiB/s 2.34 c/B 1895

v2: changelog
v3: GNU coding standards (whoops)
Separate out altivec stuff into its own compiled file with its own CFLAGS,
and wrangle autocrap to manage to do this.

2019-07-09 Shawn Landden <shawn@git.icu>
* cipher/Makefile.am: Build rijndael-ppc.c with altivec CFLAGS.
* configure.ac: Link powerpc targets to altivec rijndael-ppc.c.
* cipher/rijndael-ppc.c: New implementation of single-block mode, and implementation of OCB mode.
* cipher/rijndael-ppc.h: Header file for above.
* cipher/rijndael.c: Glue. (Note: this now #includes rijndael-ppc.h; rijndael-ppc.c is built separately.)
---
cipher/Makefile.am | 8 +
cipher/rijndael-ppc.c | 720 ++++++++++++++++++++++++++++++++++++++++++
cipher/rijndael-ppc.h | 38 +++
cipher/rijndael.c | 33 +-
configure.ac | 3 +
5 files changed, 778 insertions(+), 24 deletions(-)
create mode 100644 cipher/rijndael-ppc.c
create mode 100644 cipher/rijndael-ppc.h

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index fe98fa3d..bb7cc145 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -99,10 +99,11 @@ EXTRA_libcipher_la_SOURCES = \
rijndael-ppc8.pl \
rijndael-ppc8.S \
rijndael-ppc8be.S \
rijndael-ppc832.S \
rmd160.c \
+ rijndael-ppc.c \
rsa.c \
salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
scrypt.c \
seed.c \
serpent.c serpent-sse2-amd64.S \
@@ -166,10 +167,17 @@ instrumentation_munging = sed \
-e 's/-fcoverage[=,\-][=,a-z,A-Z,0-9,\,,\-]*//g'
else
instrumentation_munging = cat
endif

+# rijndael-ppc.c needs AltiVec/VSX/POWER8 vector CFLAGS that the rest of
+# the library is not compiled with, so it gets explicit rules for both the
+# plain object and the libtool object (same pattern as rijndael-aesni below).
+rijndael-ppc.o: $(srcdir)/rijndael-ppc.c Makefile
+	`echo $(COMPILE) -mabi=altivec -maltivec -mvsx -mpower8-vector -c $< | $(instrumentation_munging) `
+
+rijndael-ppc.lo: $(srcdir)/rijndael-ppc.c Makefile
+	`echo $(LTCOMPILE) -mabi=altivec -maltivec -mvsx -mpower8-vector -c $< | $(instrumentation_munging) `
+
rijndael-aesni.o: $(srcdir)/rijndael-aesni.c Makefile
`echo $(COMPILE) -c $< | $(instrumentation_munging) `

rijndael-aesni.lo: $(srcdir)/rijndael-aesni.c Makefile
`echo $(LTCOMPILE) -c $< | $(instrumentation_munging) `
diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c
new file mode 100644
index 00000000..76930610
--- /dev/null
+++ b/cipher/rijndael-ppc.c
@@ -0,0 +1,720 @@
+/* Rijndael (AES) for GnuPG - PowerPC ISA 2.07 (POWER 8)
+ * Copyright (C) 2019 Shawn Landden
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
+ * or both projects, is thereafter dual-licensed under the license said project
+ * is released under.
+ */
+
+#include <config.h>
+
+/* POWER 8 AES extensions */
+#include <altivec.h>
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+typedef vector unsigned char block;
+vector unsigned char backwards = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+
+#ifdef __LITTLE_ENDIAN__
+#define swap_if_le(a) \
+ vec_perm(a, a, backwards)
+#elif __BIG_ENDIAN__
+#define swap_if_le(a) (a)
+#else
+#error "What endianness?"
+#endif
+
+/* Encrypt one block that is passed and returned in an AltiVec register
+ * (big-endian byte order).
+ *
+ * A is XORed with round key 0, put through the "vcipher" instruction for
+ * rounds 1 .. ctx->rounds - 1 and finished with "vcipherlast" using round
+ * key ctx->rounds.
+ *
+ * Author's note: compilers do not unroll an outer loop into this inner
+ * loop to use more registers, even for a static function, so code that
+ * wants several blocks in flight for out-of-order multi-issue has to be
+ * unrolled by hand.
+ */
+static block _gcry_aes_ppc8_encrypt_altivec (const RIJNDAEL_context *ctx,
+                                             block a)
+{
+  block *round_keys = (block*)ctx->keyschenc;
+  int last_round = ctx->rounds;
+  int round = 1;
+
+  /* Initial AddRoundKey.  */
+  a = round_keys[0] ^ a;
+
+  while (round < last_round)
+    {
+      __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                        :"+v" (a)
+                        :"v" (round_keys[round])
+                       );
+      round++;
+    }
+
+  /* ROUND now indexes the final round key.  */
+  __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+                    :"+v" (a)
+                    :"v" (round_keys[round])
+                   );
+  return a;
+}
+
+
+/* Decrypt one block that is passed and returned in an AltiVec register.
+ *
+ * Mirror image of the encrypt helper: A is XORed with entry 0 of the
+ * decryption key schedule, put through "vncipher" for rounds
+ * 1 .. ctx->rounds - 1 and finished with "vncipherlast" using entry
+ * ctx->rounds.
+ */
+static block _gcry_aes_ppc8_decrypt_altivec (const RIJNDAEL_context *ctx,
+                                             block a)
+{
+  block *round_keys = (block*)ctx->keyschdec;
+  int last_round = ctx->rounds;
+  int round = 1;
+
+  /* Initial AddRoundKey.  */
+  a = round_keys[0] ^ a;
+
+  while (round < last_round)
+    {
+      __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                        :"+v" (a)
+                        :"v" (round_keys[round])
+                       );
+      round++;
+    }
+
+  /* ROUND now indexes the final round key.  */
+  __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+                    :"+v" (a)
+                    :"v" (round_keys[round])
+                   );
+  return a;
+}
+
+/* Encrypt the single 16-byte block at A with the key schedule in CTX and
+ * write the 16-byte result to B.  Neither A nor B needs to be aligned.
+ *
+ * Always returns 0; per the original comment this is because the function
+ * does not use the stack.
+ */
+unsigned int _gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx,
+                                     unsigned char *b,
+                                     const unsigned char *a)
+{
+  block sa;
+
+  if ((uintptr_t)a % 16 == 0)
+    {
+      sa = vec_ld (0, a);
+    }
+  else
+    {
+      /* vec_ld only fetches aligned quadwords, so load the two quadwords
+         the block straddles and splice them with the lvsl permute.  */
+      block unalignedprev, unalignedcur;
+      unalignedprev = vec_ld (0, a);
+      unalignedcur = vec_ld (16, a);
+      sa = vec_perm (unalignedprev, unalignedcur, vec_lvsl (0, a));
+    }
+
+  /* The cipher helper works on big-endian byte order.  */
+  sa = swap_if_le (sa);
+  sa = _gcry_aes_ppc8_encrypt_altivec (ctx, sa);
+
+  /* Swap back and store.  vec_vsx_st accepts unaligned addresses and is
+     available on POWER8; the previous inline "stxvb16x" is a Power ISA 3.0
+     (POWER9) instruction and its asm statement lacked a memory clobber.  */
+  vec_vsx_st (swap_if_le (sa), 0, b);
+
+  return 0; /* does not use stack */
+}
+
+/* Decrypt the single 16-byte block at A with the key schedule in CTX and
+ * write the 16-byte result to B.  Neither A nor B needs to be aligned.
+ *
+ * Always returns 0; per the original comment this is because the function
+ * does not use the stack.
+ */
+unsigned int _gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx,
+                                     unsigned char *b,
+                                     const unsigned char *a)
+{
+  block sa;
+
+  if ((uintptr_t)a % 16 == 0)
+    {
+      sa = vec_ld (0, a);
+    }
+  else
+    {
+      /* vec_ld only fetches aligned quadwords, so load the two quadwords
+         the block straddles and splice them with the lvsl permute.  */
+      block unalignedprev, unalignedcur;
+      unalignedprev = vec_ld (0, a);
+      unalignedcur = vec_ld (16, a);
+      sa = vec_perm (unalignedprev, unalignedcur, vec_lvsl (0, a));
+    }
+
+  /* The cipher helper works on big-endian byte order.  */
+  sa = swap_if_le (sa);
+  sa = _gcry_aes_ppc8_decrypt_altivec (ctx, sa);
+
+  /* Swap back and store.  vec_vsx_st accepts unaligned addresses, so one
+     store covers both the aligned and the unaligned case; the previous
+     unaligned path used "stxvb16x", a Power ISA 3.0 (POWER9) instruction,
+     in an asm statement without a memory clobber.  */
+  vec_vsx_st (swap_if_le (sa), 0, b);
+
+  return 0; /* does not use stack */
+}
+
+/* Bulk OCB encryption or decryption of NBLOCKS 16-byte blocks.
+ *
+ * C           Cipher handle.  Supplies the AES context (c->context.c) and
+ *             the OCB state that is read and updated here: the running
+ *             offset in c->u_iv.iv, the running checksum in c->u_ctr.ctr
+ *             and the block counter c->u_mode.ocb.data_nblocks.
+ * OUTBUF_ARG  Destination, NBLOCKS * 16 bytes; may be unaligned.
+ * INBUF_ARG   Source, NBLOCKS * 16 bytes; may be unaligned.
+ * NBLOCKS     Number of whole blocks to process.
+ * ENCRYPT     Non-zero to encrypt, zero to decrypt.
+ *
+ * Blocks are handled eight at a time with the AES rounds of the eight
+ * blocks interleaved, then one at a time for the remainder.  The checksum
+ * is always taken over the plaintext (input when encrypting, output when
+ * decrypting).  Always returns 0.
+ *
+ * All stores use vec_vsx_st, which accepts unaligned addresses and is
+ * available on POWER8.  Earlier revisions used inline "stxvb16x", which is
+ * a Power ISA 3.0 (POWER9) instruction, in asm statements without a memory
+ * clobber.
+ */
+size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+                                 const void *inbuf_arg, size_t nblocks,
+                                 int encrypt)
+{
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+
+  block *in = (block*)inbuf;
+  block *out = (block*)outbuf;
+  int r;
+  int rounds = ctx->rounds;
+
+  if (encrypt)
+    {
+      const int unroll = 8;
+      block unalignedprev, ctr, iv;
+
+      /* Unaligned input is assembled from pairs of aligned quadwords;
+         prime the pipeline with the first one.  */
+      if (((uintptr_t)inbuf % 16) != 0)
+        {
+          unalignedprev = vec_ld (0, in++);
+        }
+
+      iv = vec_ld (0, (block*)&c->u_iv.iv);
+      ctr = vec_ld (0, (block*)&c->u_ctr.ctr);
+
+      for ( ;nblocks >= unroll; nblocks -= unroll)
+        {
+          u64 i = c->u_mode.ocb.data_nblocks + 1;
+          block l0, l1, l2, l3, l4, l5, l6, l7;
+          block b0, b1, b2, b3, b4, b5, b6, b7;
+          block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
+          const block *rk = (block*)&ctx->keyschenc;
+
+          c->u_mode.ocb.data_nblocks += unroll;
+
+          iv0 = iv;
+          if ((uintptr_t)inbuf % 16 == 0)
+            {
+              b0 = vec_ld (0, in++);
+              b1 = vec_ld (0, in++);
+              b2 = vec_ld (0, in++);
+              b3 = vec_ld (0, in++);
+              b4 = vec_ld (0, in++);
+              b5 = vec_ld (0, in++);
+              b6 = vec_ld (0, in++);
+              b7 = vec_ld (0, in++);
+            }
+          else
+            {
+              block unaligned0, unaligned1, unaligned2,
+                    unaligned3, unaligned4, unaligned5, unaligned6;
+              unaligned0 = vec_ld (0, in++);
+              unaligned1 = vec_ld (0, in++);
+              unaligned2 = vec_ld (0, in++);
+              unaligned3 = vec_ld (0, in++);
+              unaligned4 = vec_ld (0, in++);
+              unaligned5 = vec_ld (0, in++);
+              unaligned6 = vec_ld (0, in++);
+              b0 = vec_perm (unalignedprev, unaligned0, vec_lvsl (0, inbuf));
+              /* The eighth quadword also seeds the next iteration.  */
+              unalignedprev = vec_ld (0, in++);
+              b1 = vec_perm (unaligned0, unaligned1, vec_lvsl (0, inbuf));
+              b2 = vec_perm (unaligned1, unaligned2, vec_lvsl (0, inbuf));
+              b3 = vec_perm (unaligned2, unaligned3, vec_lvsl (0, inbuf));
+              b4 = vec_perm (unaligned3, unaligned4, vec_lvsl (0, inbuf));
+              b5 = vec_perm (unaligned4, unaligned5, vec_lvsl (0, inbuf));
+              b6 = vec_perm (unaligned5, unaligned6, vec_lvsl (0, inbuf));
+              b7 = vec_perm (unaligned6, unalignedprev, vec_lvsl (0, inbuf));
+            }
+
+          l0 = *(block*)ocb_get_l (c, i++);
+          l1 = *(block*)ocb_get_l (c, i++);
+          l2 = *(block*)ocb_get_l (c, i++);
+          l3 = *(block*)ocb_get_l (c, i++);
+          l4 = *(block*)ocb_get_l (c, i++);
+          l5 = *(block*)ocb_get_l (c, i++);
+          l6 = *(block*)ocb_get_l (c, i++);
+          l7 = *(block*)ocb_get_l (c, i++);
+
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)}, and whiten the input
+             with it.  */
+          iv0 ^= l0;
+          b0 ^= iv0;
+          iv1 = iv0 ^ l1;
+          b1 ^= iv1;
+          iv2 = iv1 ^ l2;
+          b2 ^= iv2;
+          iv3 = iv2 ^ l3;
+          b3 ^= iv3;
+          iv4 = iv3 ^ l4;
+          b4 ^= iv4;
+          iv5 = iv4 ^ l5;
+          b5 ^= iv5;
+          iv6 = iv5 ^ l6;
+          b6 ^= iv6;
+          iv7 = iv6 ^ l7;
+          b7 ^= iv7;
+
+          /* The cipher instructions work on big-endian byte order.  */
+          b0 = swap_if_le (b0);
+          b1 = swap_if_le (b1);
+          b2 = swap_if_le (b2);
+          b3 = swap_if_le (b3);
+          b4 = swap_if_le (b4);
+          b5 = swap_if_le (b5);
+          b6 = swap_if_le (b6);
+          b7 = swap_if_le (b7);
+
+          /* Initial AddRoundKey.  */
+          b0 ^= rk[0];
+          b1 ^= rk[0];
+          b2 ^= rk[0];
+          b3 ^= rk[0];
+          b4 ^= rk[0];
+          b5 ^= rk[0];
+          b6 ^= rk[0];
+          b7 ^= rk[0];
+
+          for (r = 1;r < rounds;r++)
+            {
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                                :"+v" (b0)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                                :"+v" (b1)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                                :"+v" (b2)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                                :"+v" (b3)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                                :"+v" (b4)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                                :"+v" (b5)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                                :"+v" (b6)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t"
+                                :"+v" (b7)
+                                :"v" (rk[r]));
+            }
+          /* R now indexes the final round key.  */
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+                            :"+v" (b0)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+                            :"+v" (b1)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+                            :"+v" (b2)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+                            :"+v" (b3)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+                            :"+v" (b4)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+                            :"+v" (b5)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+                            :"+v" (b6)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t"
+                            :"+v" (b7)
+                            :"v" (rk[r]));
+
+          /* Carry the last offset into the next iteration.  */
+          iv = iv7;
+
+          /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i).  Swap back
+             to memory byte order, unwhiten and store; vec_vsx_st handles
+             aligned and unaligned destinations alike.  */
+          vec_vsx_st (swap_if_le (b0) ^ iv0, 0, out++);
+          vec_vsx_st (swap_if_le (b1) ^ iv1, 0, out++);
+          vec_vsx_st (swap_if_le (b2) ^ iv2, 0, out++);
+          vec_vsx_st (swap_if_le (b3) ^ iv3, 0, out++);
+          vec_vsx_st (swap_if_le (b4) ^ iv4, 0, out++);
+          vec_vsx_st (swap_if_le (b5) ^ iv5, 0, out++);
+          vec_vsx_st (swap_if_le (b6) ^ iv6, 0, out++);
+          vec_vsx_st (swap_if_le (b7) ^ iv7, 0, out++);
+        }
+
+      for ( ;nblocks; nblocks-- )
+        {
+          block b;
+          u64 i = ++c->u_mode.ocb.data_nblocks;
+          const block l = *(block*)ocb_get_l (c, i);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          iv ^= l;
+          if ((uintptr_t)in % 16 == 0)
+            {
+              b = vec_ld (0, in++);
+            }
+          else
+            {
+              block unalignedprevprev;
+              unalignedprevprev = unalignedprev;
+              unalignedprev = vec_ld (0, in++);
+              b = vec_perm (unalignedprevprev, unalignedprev,
+                            vec_lvsl (0, inbuf));
+            }
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          ctr ^= b;
+          /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+          b ^= iv;
+          b = swap_if_le (b);
+          b = _gcry_aes_ppc8_encrypt_altivec (ctx, b);
+          vec_vsx_st (swap_if_le (b) ^ iv, 0, out++);
+        }
+
+      /* Write the offset and checksum back in the same byte order they
+         were loaded in.  */
+      vec_vsx_st (iv, 0, (block*)&c->u_iv.iv);
+      vec_vsx_st (ctr, 0, (block*)&c->u_ctr.ctr);
+    }
+  else
+    {
+      const int unroll = 8;
+      block unalignedprev, ctr, iv;
+
+      /* Unaligned input is assembled from pairs of aligned quadwords;
+         prime the pipeline with the first one.  */
+      if (((uintptr_t)inbuf % 16) != 0)
+        {
+          unalignedprev = vec_ld (0, in++);
+        }
+
+      iv = vec_ld (0, (block*)&c->u_iv.iv);
+      ctr = vec_ld (0, (block*)&c->u_ctr.ctr);
+
+      for ( ;nblocks >= unroll; nblocks -= unroll)
+        {
+          u64 i = c->u_mode.ocb.data_nblocks + 1;
+          block l0, l1, l2, l3, l4, l5, l6, l7;
+          block b0, b1, b2, b3, b4, b5, b6, b7;
+          block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
+          const block *rk = (block*)&ctx->keyschdec;
+
+          c->u_mode.ocb.data_nblocks += unroll;
+
+          iv0 = iv;
+          if ((uintptr_t)inbuf % 16 == 0)
+            {
+              b0 = vec_ld (0, in++);
+              b1 = vec_ld (0, in++);
+              b2 = vec_ld (0, in++);
+              b3 = vec_ld (0, in++);
+              b4 = vec_ld (0, in++);
+              b5 = vec_ld (0, in++);
+              b6 = vec_ld (0, in++);
+              b7 = vec_ld (0, in++);
+            }
+          else
+            {
+              block unaligned0, unaligned1, unaligned2,
+                    unaligned3, unaligned4, unaligned5, unaligned6;
+              unaligned0 = vec_ld (0, in++);
+              unaligned1 = vec_ld (0, in++);
+              unaligned2 = vec_ld (0, in++);
+              unaligned3 = vec_ld (0, in++);
+              unaligned4 = vec_ld (0, in++);
+              unaligned5 = vec_ld (0, in++);
+              unaligned6 = vec_ld (0, in++);
+              b0 = vec_perm (unalignedprev, unaligned0, vec_lvsl (0, inbuf));
+              /* The eighth quadword also seeds the next iteration.  */
+              unalignedprev = vec_ld (0, in++);
+              b1 = vec_perm (unaligned0, unaligned1, vec_lvsl (0, inbuf));
+              b2 = vec_perm (unaligned1, unaligned2, vec_lvsl (0, inbuf));
+              b3 = vec_perm (unaligned2, unaligned3, vec_lvsl (0, inbuf));
+              b4 = vec_perm (unaligned3, unaligned4, vec_lvsl (0, inbuf));
+              b5 = vec_perm (unaligned4, unaligned5, vec_lvsl (0, inbuf));
+              b6 = vec_perm (unaligned5, unaligned6, vec_lvsl (0, inbuf));
+              b7 = vec_perm (unaligned6, unalignedprev, vec_lvsl (0, inbuf));
+            }
+
+          l0 = *(block*)ocb_get_l (c, i++);
+          l1 = *(block*)ocb_get_l (c, i++);
+          l2 = *(block*)ocb_get_l (c, i++);
+          l3 = *(block*)ocb_get_l (c, i++);
+          l4 = *(block*)ocb_get_l (c, i++);
+          l5 = *(block*)ocb_get_l (c, i++);
+          l6 = *(block*)ocb_get_l (c, i++);
+          l7 = *(block*)ocb_get_l (c, i++);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)}, and whiten the input
+             with it.  */
+          iv0 ^= l0;
+          b0 ^= iv0;
+          iv1 = iv0 ^ l1;
+          b1 ^= iv1;
+          iv2 = iv1 ^ l2;
+          b2 ^= iv2;
+          iv3 = iv2 ^ l3;
+          b3 ^= iv3;
+          iv4 = iv3 ^ l4;
+          b4 ^= iv4;
+          iv5 = iv4 ^ l5;
+          b5 ^= iv5;
+          iv6 = iv5 ^ l6;
+          b6 ^= iv6;
+          iv7 = iv6 ^ l7;
+          b7 ^= iv7;
+
+          /* The cipher instructions work on big-endian byte order.  */
+          b0 = swap_if_le (b0);
+          b1 = swap_if_le (b1);
+          b2 = swap_if_le (b2);
+          b3 = swap_if_le (b3);
+          b4 = swap_if_le (b4);
+          b5 = swap_if_le (b5);
+          b6 = swap_if_le (b6);
+          b7 = swap_if_le (b7);
+
+          /* Initial AddRoundKey.  */
+          b0 ^= rk[0];
+          b1 ^= rk[0];
+          b2 ^= rk[0];
+          b3 ^= rk[0];
+          b4 ^= rk[0];
+          b5 ^= rk[0];
+          b6 ^= rk[0];
+          b7 ^= rk[0];
+
+          for (r = 1;r < rounds;r++)
+            {
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                                :"+v" (b0)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                                :"+v" (b1)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                                :"+v" (b2)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                                :"+v" (b3)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                                :"+v" (b4)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                                :"+v" (b5)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                                :"+v" (b6)
+                                :"v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t"
+                                :"+v" (b7)
+                                :"v" (rk[r]));
+            }
+          /* R now indexes the final round key.  */
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+                            :"+v" (b0)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+                            :"+v" (b1)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+                            :"+v" (b2)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+                            :"+v" (b3)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+                            :"+v" (b4)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+                            :"+v" (b5)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+                            :"+v" (b6)
+                            :"v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t"
+                            :"+v" (b7)
+                            :"v" (rk[r]));
+
+          /* Carry the last offset into the next iteration.  */
+          iv = iv7;
+
+          /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i): swap back to
+             memory byte order and unwhiten.  */
+          b0 = swap_if_le (b0) ^ iv0;
+          b1 = swap_if_le (b1) ^ iv1;
+          b2 = swap_if_le (b2) ^ iv2;
+          b3 = swap_if_le (b3) ^ iv3;
+          b4 = swap_if_le (b4) ^ iv4;
+          b5 = swap_if_le (b5) ^ iv5;
+          b6 = swap_if_le (b6) ^ iv6;
+          b7 = swap_if_le (b7) ^ iv7;
+
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+          /* vec_vsx_st handles aligned and unaligned destinations alike.  */
+          vec_vsx_st (b0, 0, out++);
+          vec_vsx_st (b1, 0, out++);
+          vec_vsx_st (b2, 0, out++);
+          vec_vsx_st (b3, 0, out++);
+          vec_vsx_st (b4, 0, out++);
+          vec_vsx_st (b5, 0, out++);
+          vec_vsx_st (b6, 0, out++);
+          vec_vsx_st (b7, 0, out++);
+        }
+
+      for ( ;nblocks; nblocks-- )
+        {
+          block b;
+          u64 i = ++c->u_mode.ocb.data_nblocks;
+          const block l = *(block*)ocb_get_l (c, i);
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          iv ^= l;
+          if ((uintptr_t)in % 16 == 0)
+            {
+              b = vec_ld (0, in++);
+            }
+          else
+            {
+              block unalignedprevprev;
+              unalignedprevprev = unalignedprev;
+              unalignedprev = vec_ld (0, in++);
+              b = vec_perm (unalignedprevprev, unalignedprev,
+                            vec_lvsl (0, inbuf));
+            }
+          /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+          b ^= iv;
+          b = swap_if_le (b);
+          b = _gcry_aes_ppc8_decrypt_altivec (ctx, b);
+          b = swap_if_le (b) ^ iv;
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          ctr ^= b;
+          vec_vsx_st (b, 0, out++);
+        }
+
+      /* Write the offset and checksum back in the same byte order they
+         were loaded in.  */
+      vec_vsx_st (iv, 0, (block*)&c->u_iv.iv);
+      vec_vsx_st (ctr, 0, (block*)&c->u_ctr.ctr);
+    }
+  return 0;
+}
+
diff --git a/cipher/rijndael-ppc.h b/cipher/rijndael-ppc.h
new file mode 100644
index 00000000..95415a8e
--- /dev/null
+++ b/cipher/rijndael-ppc.h
@@ -0,0 +1,38 @@
+/* Rijndael (AES) for GnuPG - PowerPC ISA 2.07 (POWER 8)
+ * Copyright (C) 2019 Shawn Landden
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
+ * or both projects, is thereafter dual-licensed under the license said project
+ * is released under.
+ */
+
+#ifndef GCRY_RIJNDAEL_PPC_H
+#define GCRY_RIJNDAEL_PPC_H
+
+#include <config.h>
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+/* Bulk OCB: encrypt (ENCRYPT != 0) or decrypt NBLOCKS 16-byte blocks from
+   INBUF_ARG to OUTBUF_ARG, updating the OCB state held in C.  Returns 0.  */
+size_t _gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+                                 const void *inbuf_arg, size_t nblocks,
+                                 int encrypt);
+
+/* Encrypt the single 16-byte block at A into B.  Returns 0.  */
+unsigned int _gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx,
+                                     unsigned char *b,
+                                     const unsigned char *a);
+
+/* Decrypt the single 16-byte block at A into B.  Returns 0.  */
+unsigned int _gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx,
+                                     unsigned char *b,
+                                     const unsigned char *a);
+
+#endif /* GCRY_RIJNDAEL_PPC_H */
+
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 2d1c38bf..ecfc0ac0 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -198,33 +198,11 @@ extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak,
const void *inbuf_arg,
size_t nblocks, int encrypt);
#endif /*USE_ARM_ASM*/

#ifdef USE_PPC_ASM
-/* POWER 8 AES extensions */
-extern void aes_p8_encrypt (const unsigned char *in,
- unsigned char *out,
- const RIJNDAEL_context *ctx);
-static unsigned int _gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx,
- unsigned char *out,
- const unsigned char *in)
-{
- /* When I tried to switch these registers in the assembly it broke. */
- aes_p8_encrypt (in, out, ctx);
- return 0; /* does not use stack */
-}
- /* this is the decryption key part of context */
-extern void aes_p8_decrypt (const unsigned char *in,
- unsigned char *out,
- const void *sboxes);
-static unsigned int _gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx,
- unsigned char *out,
- const unsigned char *in)
-{
- aes_p8_decrypt (in, out, &ctx->u2);
- return 0; /* does not use stack */
-}
+#include "rijndael-ppc.h"
extern int aes_p8_set_encrypt_key (const unsigned char *userKey, const int bits,
RIJNDAEL_context *key);
extern int aes_p8_set_decrypt_key (const unsigned char *userKey, const int bits,
/* this is the decryption key part of context */
const unsigned (*)[15][4]);
@@ -295,11 +273,11 @@ static void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr,
const unsigned char *inbuf = inbuf_arg;
unsigned char *outbuf = outbuf_arg;
const RIJNDAEL_context *ctx = context;
const uint64_t two32 = 1ULL << 32;
int overflow;
- u64 s[2], e[2];
+ u64 s[2];
s[0] = buf_get_be64(ctr + 8);
overflow = two32 - (s[0] % two32) < nblocks;
#ifdef __builtin_expect
__builtin_expect(overflow, 0);
#endif
@@ -564,10 +542,11 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
if (hd) {
hd->bulk.cbc_dec = _gcry_aes_ppc8_cbc_dec;
hd->bulk.cbc_enc = _gcry_aes_ppc8_cbc_enc;
hd->bulk.xts_crypt = _gcry_aes_ppc8_xts_crypt;
hd->bulk.ctr_enc = _gcry_aes_ppc8_ctr_enc;
+ hd->bulk.ocb_crypt = _gcry_aes_ppc8_ocb_crypt;
}
}
#endif
else
{
@@ -1539,10 +1518,16 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
else if (ctx->use_arm_ce)
{
return _gcry_aes_armv8_ce_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
}
#endif /*USE_ARM_CE*/
+#ifdef USE_PPC_ASM
+ else if (ctx->use_ppc_asm)
+ {
+ return _gcry_aes_ppc8_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
+ }
+#endif /*USE_PPC_ASM*/
else if (encrypt)
{
union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
rijndael_cryptfn_t encrypt_fn = ctx->encrypt_fn;

diff --git a/configure.ac b/configure.ac
index 2d8503ac..28458c68 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2240,22 +2240,25 @@ if test "$found" = "1" ; then
;;
powerpc64le-*-*)
# Build with the crypto extension implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc8.lo"
GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc8.lo sha256-ppc8.lo"
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
;;
powerpc64-*-*)
# Big-Endian.
# Build with the crypto extension implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc8be.lo"
GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc8be.lo sha256-ppc8be.lo"
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
;;
powerpc-*-*)
# Big-Endian.
# Build with the crypto extension implementation
GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc832.lo"
GCRYPT_CIPHERS="$GCRYPT_CIPHERS sha512-ppc832.lo sha256-ppc832.lo"
+ GCRYPT_CIPHERS="$GCRYPT_CIPHERS rijndael-ppc.lo"
;;
esac

case "$mpi_cpu_arch" in
x86)
--
2.20.1


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@gnupg.org
http://lists.gnupg.org/mailman/listinfo/gcrypt-devel
[PATCH 4/5] rijndael/ppc: re-implement single-block mode, and implement OCB block cipher [ In reply to ]
Impressive speed wins over the cryptogams version:

Also, easier to maintain than an assembly version.

8x was only marginally faster than 6x. Probably could be sped up
with a vectorgather instruction.

Before:
ECB enc | 2.84 ns/B 336.1 MiB/s 5.38 c/B 1895
ECB dec | 2.89 ns/B 330.6 MiB/s 5.47 c/B 1895
CBC enc | 1.05 ns/B 908.3 MiB/s 1.99 c/B 1895
CBC dec | 0.221 ns/B 4315 MiB/s 0.419 c/B 1895
CFB enc | 4.41 ns/B 216.4 MiB/s 8.35 c/B 1895
CFB dec | 4.88 ns/B 195.3 MiB/s 9.26 c/B 1895
OFB enc | 5.06 ns/B 188.4 MiB/s 9.59 c/B 1895
OFB dec | 5.07 ns/B 188.2 MiB/s 9.60 c/B 1895
CTR enc | 0.218 ns/B 4374 MiB/s 0.413 c/B 1895
CTR dec | 0.219 ns/B 4349 MiB/s 0.416 c/B 1895
XTS enc | 0.681 ns/B 1400 MiB/s 1.29 c/B 1895
XTS dec | 0.687 ns/B 1387 MiB/s 1.30 c/B 1895
CCM enc | 4.21 ns/B 226.4 MiB/s 5.32 c/B 1264
CCM dec | 4.21 ns/B 226.7 MiB/s 5.32 c/B 1264
CCM auth | 3.99 ns/B 239.2 MiB/s 5.04 c/B 1264
EAX enc | 4.20 ns/B 227.2 MiB/s 5.30 c/B 1264
EAX dec | 4.21 ns/B 226.5 MiB/s 5.32 c/B 1264
EAX auth | 3.97 ns/B 239.9 MiB/s 5.02 c/B 1264
GCM enc | 19.81 ns/B 48.14 MiB/s 25.03 c/B 1264
GCM dec | 19.79 ns/B 48.18 MiB/s 25.01 c/B 1264
GCM auth | 19.55 ns/B 48.78 MiB/s 24.71 c/B 1264
OCB enc | 17.53 ns/B 54.41 MiB/s 14.77 c/B 842.4
OCB dec | 13.89 ns/B 68.67 MiB/s 17.55 c/B 1263
OCB auth | 9.14 ns/B 104.4 MiB/s 11.54 c/B 1264

After:
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 1.98 ns/B 482.6 MiB/s 3.75 c/B 1895 <=====
ECB dec | 1.80 ns/B 529.3 MiB/s 3.42 c/B 1895 <=====
CBC enc | 1.05 ns/B 907.7 MiB/s 1.99 c/B 1895
CBC dec | 0.221 ns/B 4317 MiB/s 0.419 c/B 1895
CFB enc | 1.65 ns/B 578.5 MiB/s 3.12 c/B 1895
CFB dec | 1.03 ns/B 925.9 MiB/s 1.95 c/B 1895
OFB enc | 2.34 ns/B 408.2 MiB/s 3.83 c/B 1638
OFB dec | 2.33 ns/B 410.1 MiB/s 3.81 c/B 1638
CTR enc | 0.216 ns/B 4416 MiB/s 0.409 c/B 1895
CTR dec | 0.216 ns/B 4422 MiB/s 0.409 c/B 1895
XTS enc | 0.557 ns/B 1712 MiB/s 1.06 c/B 1895
XTS dec | 0.561 ns/B 1701 MiB/s 1.06 c/B 1895
CCM enc | 1.87 ns/B 509.9 MiB/s 3.54 c/B 1895
CCM dec | 1.87 ns/B 509.8 MiB/s 3.55 c/B 1895
CCM auth | 1.65 ns/B 576.4 MiB/s 3.14 c/B 1895
EAX enc | 1.87 ns/B 510.3 MiB/s 3.54 c/B 1895
EAX dec | 1.87 ns/B 510.0 MiB/s 3.54 c/B 1895
EAX auth | 1.65 ns/B 576.9 MiB/s 3.13 c/B 1895
GCM enc | 3.55 ns/B 268.7 MiB/s 6.73 c/B 1895
GCM dec | 3.55 ns/B 268.7 MiB/s 6.73 c/B 1895
GCM auth | 3.33 ns/B 286.2 MiB/s 6.32 c/B 1895
OCB enc | 0.426 ns/B 2241 MiB/s 0.807 c/B 1895 <====
OCB dec | 0.409 ns/B 2333 MiB/s 0.775 c/B 1895 <====
OCB auth | 1.23 ns/B 772.7 MiB/s 2.34 c/B 1895

This patch is NOT correct, because the altivec stuff needs to be compiled separately
and with the following CFLAGS:
-mabi=altivec -maltivec -mvsx -mpower8-vector
which should not be enabled for any code that is not feature-gated on Power 8
(even though modern ppc64le user-land will no longer run on Power 7).
However, I have tried to do this 3 times with autotools, and each time I get cryptic error
messages from a system I frankly don't consider worth studying.

Putting the cryptogams stuff in its own folder also requires some autotools wizardry,
as the straightforward method I tried requires enabling a feature, "subdir-objects", that
other parts of the libgcrypt code base say caused problems.

2019-07-09 Shawn Landden <shawn@git.icu>
* cipher/Makefile.am: Note new source file.
* cipher/rijndael-ppc.c: New implementation of single-block mode, and implementation of OCB mode.
* cipher/rijndael.c: Glue. (Note: this just #includes the above file, and does not build it separately.)
---
cipher/Makefile.am | 1 +
cipher/rijndael-ppc.c | 737 ++++++++++++++++++++++++++++++++++++++++++
cipher/rijndael.c | 33 +-
3 files changed, 747 insertions(+), 24 deletions(-)
create mode 100644 cipher/rijndael-ppc.c

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index fe98fa3d..9505b302 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -101,6 +101,7 @@ EXTRA_libcipher_la_SOURCES = \
rijndael-ppc8be.S \
rijndael-ppc832.S \
rmd160.c \
+ rijndael-ppc.c \
rsa.c \
salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
scrypt.c \
diff --git a/cipher/rijndael-ppc.c b/cipher/rijndael-ppc.c
new file mode 100644
index 00000000..59e674b4
--- /dev/null
+++ b/cipher/rijndael-ppc.c
@@ -0,0 +1,737 @@
+/* Rijndael (AES) for GnuPG - PowerPC ISA 2.07 (POWER 8)
+ * Copyright (C) 2019 Shawn Landden
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Alternatively, this code may be used in OpenSSL from The OpenSSL Project,
+ * and Cryptogams by Andy Polyakov, and if made part of a release of either
+ * or both projects, is thereafter dual-licensed under the license said project
+ * is released under.
+ */
+
+#include <config.h>
+#ifdef USE_PPC_ASM
+
+/* POWER 8 AES extensions */
+#include <altivec.h>
+#include "rijndael-internal.h"
+#include "./cipher-internal.h"
+
+/* One 128-bit AES block held in an AltiVec/VSX register. */
+typedef vector unsigned char block;
+
+/* Permute control vector that reverses the sixteen bytes of a block.
+ * It is static const because this file is #included into rijndael.c and
+ * must not export a writable global with such a generic name. */
+static const vector unsigned char backwards =
+  {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+
+/* swap_if_le(a): reverse the byte order of A on little-endian targets and
+ * return A unchanged on big-endian ones.  A is evaluated twice on
+ * little-endian, so it must not have side effects. */
+#if defined(__LITTLE_ENDIAN__)
+#define swap_if_le(a) \
+  vec_perm((a), (a), backwards)
+#elif defined(__BIG_ENDIAN__)
+#define swap_if_le(a) (a)
+#else
+#error "What endianness?"
+#endif
+
+/* Encrypt one block that is passed and returned in an AltiVec register.
+ *
+ *   ctx  AES context; ctx->keyschenc is read as an array of
+ *        ctx->rounds + 1 round keys.
+ *   a    the block, already in the byte order the callers in this file
+ *        produce with swap_if_le() after loading it from memory.
+ *
+ * Returns the encrypted block in the same register byte order.
+ *
+ * Sadly compilers don't know how to unroll outer loops into inner loops
+ * with more registers on static functions, so callers that want several
+ * blocks in flight at once have to hand-unroll instead of calling this.
+ */
+static block
+_gcry_aes_ppc8_encrypt_altivec (const RIJNDAEL_context *ctx, block a)
+{
+  const block *rk = (const block *) ctx->keyschenc;
+  const int rounds = ctx->rounds;
+  int r;
+
+  /* Initial round-key addition. */
+  a ^= rk[0];
+
+  /* Rounds 1 .. rounds-1. */
+  for (r = 1; r < rounds; r++)
+    __asm__ volatile ("vcipher %0, %0, %1\n\t" : "+v" (a) : "v" (rk[r]));
+
+  /* The last round uses round key number `rounds'. */
+  __asm__ volatile ("vcipherlast %0, %0, %1\n\t" : "+v" (a) : "v" (rk[rounds]));
+
+  return a;
+}
+
+
+/* Decrypt one block that is passed and returned in an AltiVec register.
+ *
+ *   ctx  AES context; ctx->keyschdec is read as an array of
+ *        ctx->rounds + 1 round keys.
+ *   a    the block, already in the byte order the callers in this file
+ *        produce with swap_if_le() after loading it from memory.
+ *
+ * Returns the decrypted block in the same register byte order.
+ */
+static block
+_gcry_aes_ppc8_decrypt_altivec (const RIJNDAEL_context *ctx, block a)
+{
+  const block *rk = (const block *) ctx->keyschdec;
+  const int rounds = ctx->rounds;
+  int r;
+
+  /* Initial round-key addition. */
+  a ^= rk[0];
+
+  /* Rounds 1 .. rounds-1. */
+  for (r = 1; r < rounds; r++)
+    __asm__ volatile ("vncipher %0, %0, %1\n\t" : "+v" (a) : "v" (rk[r]));
+
+  /* The last round uses round key number `rounds'. */
+  __asm__ volatile ("vncipherlast %0, %0, %1\n\t" : "+v" (a) : "v" (rk[rounds]));
+
+  return a;
+}
+
+/* Encrypt the single 16-byte block at A into B.
+ *
+ *   ctx  AES context holding the encryption key schedule.
+ *   b    destination, 16 bytes, any alignment.
+ *   a    source, 16 bytes, any alignment.
+ *
+ * Always returns 0 (see the note on the return statement).
+ *
+ * vec_vsx_ld/vec_vsx_st are used for both accesses: they accept unaligned
+ * addresses, are visible to the compiler as memory accesses, and exist on
+ * POWER8.  The stxvb16x instruction used previously is only available
+ * from Power ISA 3.0 (POWER9) on, and the inline asm that issued it did
+ * not tell the compiler that memory was written.
+ */
+unsigned int
+_gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx,
+                        unsigned char *b,
+                        const unsigned char *a)
+{
+  block sa;
+
+  sa = vec_vsx_ld (0, a);
+
+  /* The cipher helper wants the bytes reversed on little-endian. */
+  sa = swap_if_le (sa);
+  sa = _gcry_aes_ppc8_encrypt_altivec (ctx, sa);
+
+  /* Undo the reversal so the result lands in memory order. */
+  vec_vsx_st (swap_if_le (sa), 0, b);
+
+  return 0; /* does not use stack */
+}
+
+/* Decrypt the single 16-byte block at A into B.
+ *
+ *   ctx  AES context holding the decryption key schedule.
+ *   b    destination, 16 bytes, any alignment.
+ *   a    source, 16 bytes, any alignment.
+ *
+ * Always returns 0 (see the note on the return statement).
+ *
+ * vec_vsx_ld/vec_vsx_st accept unaligned addresses, so no separate
+ * aligned/unaligned paths are needed, and no Power ISA 3.0 instruction
+ * (stxvb16x) is required.
+ */
+unsigned int
+_gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx,
+                        unsigned char *b,
+                        const unsigned char *a)
+{
+  block sa;
+
+  sa = vec_vsx_ld (0, a);
+
+  /* The cipher helper wants the bytes reversed on little-endian. */
+  sa = swap_if_le (sa);
+  sa = _gcry_aes_ppc8_decrypt_altivec (ctx, sa);
+
+  /* Undo the reversal so the result lands in memory order. */
+  vec_vsx_st (swap_if_le (sa), 0, b);
+
+  return 0; /* does not use stack */
+}
+
+/* Bulk OCB encryption/decryption of NBLOCKS whole 16-byte blocks.
+ *
+ *   c           cipher handle.  The running offset is read from and
+ *               written back to c->u_iv.iv, the running checksum is read
+ *               from and written back to c->u_ctr.ctr, and
+ *               c->u_mode.ocb.data_nblocks is advanced by NBLOCKS.
+ *   outbuf_arg  destination, NBLOCKS * 16 bytes, any alignment.
+ *   inbuf_arg   source, NBLOCKS * 16 bytes, any alignment.
+ *   nblocks     number of 16-byte blocks to process.
+ *   encrypt     non-zero to encrypt, zero to decrypt.
+ *
+ * Always returns 0.
+ * NOTE(review): presumably this is the number of blocks left unprocessed,
+ * as for the other bulk OCB back ends -- confirm against the caller.
+ *
+ * Blocks are handled eight at a time with the round instructions of the
+ * eight blocks interleaved by hand; the remaining blocks go through the
+ * single-block helpers.
+ *
+ * All memory accesses use vec_vsx_ld/vec_vsx_st, which work for any
+ * alignment on both endiannesses and are available on POWER8.  Earlier
+ * revisions stored through inline "stxvb16x", which needs Power ISA 3.0
+ * and carried no memory clobber, and loaded the offset/checksum with
+ * vec_ld, which silently ignores the low four address bits.
+ *
+ * Byte order: blocks are kept in memory order except while they pass
+ * through the cipher instructions, where swap_if_le() reverses them on
+ * little-endian targets.  Offsets, L values and the checksum always stay
+ * in memory order.
+ */
+size_t
+_gcry_aes_ppc8_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+                          const void *inbuf_arg, size_t nblocks,
+                          int encrypt)
+{
+  RIJNDAEL_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  const int rounds = ctx->rounds;
+  block iv, ctr;
+  int r;
+
+  iv = vec_vsx_ld (0, (const unsigned char *)&c->u_iv.iv);
+  ctr = vec_vsx_ld (0, (const unsigned char *)&c->u_ctr.ctr);
+
+  if (encrypt)
+    {
+      const block *rk = (const block *)&ctx->keyschenc;
+
+      for (; nblocks >= 8; nblocks -= 8, inbuf += 8 * 16, outbuf += 8 * 16)
+        {
+          u64 i = c->u_mode.ocb.data_nblocks + 1;
+          block l0, l1, l2, l3, l4, l5, l6, l7;
+          block b0, b1, b2, b3, b4, b5, b6, b7;
+          block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
+
+          c->u_mode.ocb.data_nblocks += 8;
+
+          /* Load all eight plaintext blocks before anything is stored. */
+          b0 = vec_vsx_ld (0, inbuf);
+          b1 = vec_vsx_ld (16, inbuf);
+          b2 = vec_vsx_ld (32, inbuf);
+          b3 = vec_vsx_ld (48, inbuf);
+          b4 = vec_vsx_ld (64, inbuf);
+          b5 = vec_vsx_ld (80, inbuf);
+          b6 = vec_vsx_ld (96, inbuf);
+          b7 = vec_vsx_ld (112, inbuf);
+
+          l0 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 0));
+          l1 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 1));
+          l2 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 2));
+          l3 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 3));
+          l4 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 4));
+          l5 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 5));
+          l6 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 6));
+          l7 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 7));
+
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          iv0 = iv ^ l0;
+          iv1 = iv0 ^ l1;
+          iv2 = iv1 ^ l2;
+          iv3 = iv2 ^ l3;
+          iv4 = iv3 ^ l4;
+          iv5 = iv4 ^ l5;
+          iv6 = iv5 ^ l6;
+          iv7 = iv6 ^ l7;
+          iv = iv7;
+
+          /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+          b0 ^= iv0;
+          b1 ^= iv1;
+          b2 ^= iv2;
+          b3 ^= iv3;
+          b4 ^= iv4;
+          b5 ^= iv5;
+          b6 ^= iv6;
+          b7 ^= iv7;
+
+          b0 = swap_if_le (b0);
+          b1 = swap_if_le (b1);
+          b2 = swap_if_le (b2);
+          b3 = swap_if_le (b3);
+          b4 = swap_if_le (b4);
+          b5 = swap_if_le (b5);
+          b6 = swap_if_le (b6);
+          b7 = swap_if_le (b7);
+
+          b0 ^= rk[0];
+          b1 ^= rk[0];
+          b2 ^= rk[0];
+          b3 ^= rk[0];
+          b4 ^= rk[0];
+          b5 ^= rk[0];
+          b6 ^= rk[0];
+          b7 ^= rk[0];
+
+          for (r = 1; r < rounds; r++)
+            {
+              __asm__ volatile ("vcipher %0, %0, %1\n\t" : "+v" (b0) : "v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t" : "+v" (b1) : "v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t" : "+v" (b2) : "v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t" : "+v" (b3) : "v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t" : "+v" (b4) : "v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t" : "+v" (b5) : "v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t" : "+v" (b6) : "v" (rk[r]));
+              __asm__ volatile ("vcipher %0, %0, %1\n\t" : "+v" (b7) : "v" (rk[r]));
+            }
+          /* Here r == rounds: the final round key. */
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t" : "+v" (b0) : "v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t" : "+v" (b1) : "v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t" : "+v" (b2) : "v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t" : "+v" (b3) : "v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t" : "+v" (b4) : "v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t" : "+v" (b5) : "v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t" : "+v" (b6) : "v" (rk[r]));
+          __asm__ volatile ("vcipherlast %0, %0, %1\n\t" : "+v" (b7) : "v" (rk[r]));
+
+          /* Back to memory order, then apply the offset and store. */
+          vec_vsx_st (swap_if_le (b0) ^ iv0, 0, outbuf);
+          vec_vsx_st (swap_if_le (b1) ^ iv1, 16, outbuf);
+          vec_vsx_st (swap_if_le (b2) ^ iv2, 32, outbuf);
+          vec_vsx_st (swap_if_le (b3) ^ iv3, 48, outbuf);
+          vec_vsx_st (swap_if_le (b4) ^ iv4, 64, outbuf);
+          vec_vsx_st (swap_if_le (b5) ^ iv5, 80, outbuf);
+          vec_vsx_st (swap_if_le (b6) ^ iv6, 96, outbuf);
+          vec_vsx_st (swap_if_le (b7) ^ iv7, 112, outbuf);
+        }
+
+      for (; nblocks; nblocks--, inbuf += 16, outbuf += 16)
+        {
+          u64 i = ++c->u_mode.ocb.data_nblocks;
+          block b;
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          iv ^= vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i));
+
+          b = vec_vsx_ld (0, inbuf);
+
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          ctr ^= b;
+
+          /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+          b ^= iv;
+          b = swap_if_le (b);
+          b = _gcry_aes_ppc8_encrypt_altivec (ctx, b);
+          vec_vsx_st (swap_if_le (b) ^ iv, 0, outbuf);
+        }
+    }
+  else
+    {
+      const block *rk = (const block *)&ctx->keyschdec;
+
+      for (; nblocks >= 8; nblocks -= 8, inbuf += 8 * 16, outbuf += 8 * 16)
+        {
+          u64 i = c->u_mode.ocb.data_nblocks + 1;
+          block l0, l1, l2, l3, l4, l5, l6, l7;
+          block b0, b1, b2, b3, b4, b5, b6, b7;
+          block iv0, iv1, iv2, iv3, iv4, iv5, iv6, iv7;
+
+          c->u_mode.ocb.data_nblocks += 8;
+
+          /* Load all eight ciphertext blocks before anything is stored. */
+          b0 = vec_vsx_ld (0, inbuf);
+          b1 = vec_vsx_ld (16, inbuf);
+          b2 = vec_vsx_ld (32, inbuf);
+          b3 = vec_vsx_ld (48, inbuf);
+          b4 = vec_vsx_ld (64, inbuf);
+          b5 = vec_vsx_ld (80, inbuf);
+          b6 = vec_vsx_ld (96, inbuf);
+          b7 = vec_vsx_ld (112, inbuf);
+
+          l0 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 0));
+          l1 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 1));
+          l2 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 2));
+          l3 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 3));
+          l4 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 4));
+          l5 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 5));
+          l6 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 6));
+          l7 = vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i + 7));
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          iv0 = iv ^ l0;
+          iv1 = iv0 ^ l1;
+          iv2 = iv1 ^ l2;
+          iv3 = iv2 ^ l3;
+          iv4 = iv3 ^ l4;
+          iv5 = iv4 ^ l5;
+          iv6 = iv5 ^ l6;
+          iv7 = iv6 ^ l7;
+          iv = iv7;
+
+          /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+          b0 ^= iv0;
+          b1 ^= iv1;
+          b2 ^= iv2;
+          b3 ^= iv3;
+          b4 ^= iv4;
+          b5 ^= iv5;
+          b6 ^= iv6;
+          b7 ^= iv7;
+
+          b0 = swap_if_le (b0);
+          b1 = swap_if_le (b1);
+          b2 = swap_if_le (b2);
+          b3 = swap_if_le (b3);
+          b4 = swap_if_le (b4);
+          b5 = swap_if_le (b5);
+          b6 = swap_if_le (b6);
+          b7 = swap_if_le (b7);
+
+          b0 ^= rk[0];
+          b1 ^= rk[0];
+          b2 ^= rk[0];
+          b3 ^= rk[0];
+          b4 ^= rk[0];
+          b5 ^= rk[0];
+          b6 ^= rk[0];
+          b7 ^= rk[0];
+
+          for (r = 1; r < rounds; r++)
+            {
+              __asm__ volatile ("vncipher %0, %0, %1\n\t" : "+v" (b0) : "v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t" : "+v" (b1) : "v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t" : "+v" (b2) : "v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t" : "+v" (b3) : "v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t" : "+v" (b4) : "v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t" : "+v" (b5) : "v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t" : "+v" (b6) : "v" (rk[r]));
+              __asm__ volatile ("vncipher %0, %0, %1\n\t" : "+v" (b7) : "v" (rk[r]));
+            }
+          /* Here r == rounds: the final round key. */
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t" : "+v" (b0) : "v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t" : "+v" (b1) : "v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t" : "+v" (b2) : "v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t" : "+v" (b3) : "v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t" : "+v" (b4) : "v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t" : "+v" (b5) : "v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t" : "+v" (b6) : "v" (rk[r]));
+          __asm__ volatile ("vncipherlast %0, %0, %1\n\t" : "+v" (b7) : "v" (rk[r]));
+
+          /* Back to memory order and apply the offset: plaintext. */
+          b0 = swap_if_le (b0) ^ iv0;
+          b1 = swap_if_le (b1) ^ iv1;
+          b2 = swap_if_le (b2) ^ iv2;
+          b3 = swap_if_le (b3) ^ iv3;
+          b4 = swap_if_le (b4) ^ iv4;
+          b5 = swap_if_le (b5) ^ iv5;
+          b6 = swap_if_le (b6) ^ iv6;
+          b7 = swap_if_le (b7) ^ iv7;
+
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          ctr ^= b0 ^ b1 ^ b2 ^ b3 ^ b4 ^ b5 ^ b6 ^ b7;
+
+          vec_vsx_st (b0, 0, outbuf);
+          vec_vsx_st (b1, 16, outbuf);
+          vec_vsx_st (b2, 32, outbuf);
+          vec_vsx_st (b3, 48, outbuf);
+          vec_vsx_st (b4, 64, outbuf);
+          vec_vsx_st (b5, 80, outbuf);
+          vec_vsx_st (b6, 96, outbuf);
+          vec_vsx_st (b7, 112, outbuf);
+        }
+
+      for (; nblocks; nblocks--, inbuf += 16, outbuf += 16)
+        {
+          u64 i = ++c->u_mode.ocb.data_nblocks;
+          block b;
+
+          /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+          iv ^= vec_vsx_ld (0, (const unsigned char *)ocb_get_l (c, i));
+
+          b = vec_vsx_ld (0, inbuf);
+
+          /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+          b ^= iv;
+          b = swap_if_le (b);
+          b = _gcry_aes_ppc8_decrypt_altivec (ctx, b);
+          b = swap_if_le (b) ^ iv;
+
+          /* Checksum_i = Checksum_{i-1} xor P_i */
+          ctr ^= b;
+
+          vec_vsx_st (b, 0, outbuf);
+        }
+    }
+
+  /* Offset and checksum were kept in memory order throughout, so they
+   * are written back without any byte swap. */
+  vec_vsx_st (iv, 0, (unsigned char *)&c->u_iv.iv);
+  vec_vsx_st (ctr, 0, (unsigned char *)&c->u_ctr.ctr);
+
+  return 0;
+}
+
+#endif /* USE_PPC_ASM */
\ No newline at end of file
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 2d1c38bf..99850105 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -200,29 +200,7 @@ extern void _gcry_aes_armv8_ce_xts_crypt (void *context, unsigned char *tweak,
#endif /*USE_ARM_ASM*/

#ifdef USE_PPC_ASM
-/* POWER 8 AES extensions */
-extern void aes_p8_encrypt (const unsigned char *in,
- unsigned char *out,
- const RIJNDAEL_context *ctx);
-static unsigned int _gcry_aes_ppc8_encrypt (const RIJNDAEL_context *ctx,
- unsigned char *out,
- const unsigned char *in)
-{
- /* When I tried to switch these registers in the assembly it broke. */
- aes_p8_encrypt (in, out, ctx);
- return 0; /* does not use stack */
-}
- /* this is the decryption key part of context */
-extern void aes_p8_decrypt (const unsigned char *in,
- unsigned char *out,
- const void *sboxes);
-static unsigned int _gcry_aes_ppc8_decrypt (const RIJNDAEL_context *ctx,
- unsigned char *out,
- const unsigned char *in)
-{
- aes_p8_decrypt (in, out, &ctx->u2);
- return 0; /* does not use stack */
-}
+#include "rijndael-ppc.c"
extern int aes_p8_set_encrypt_key (const unsigned char *userKey, const int bits,
RIJNDAEL_context *key);
extern int aes_p8_set_decrypt_key (const unsigned char *userKey, const int bits,
@@ -297,7 +275,7 @@ static void _gcry_aes_ppc8_ctr_enc (void *context, unsigned char *ctr,
const RIJNDAEL_context *ctx = context;
const uint64_t two32 = 1ULL << 32;
int overflow;
- u64 s[2], e[2];
+ u64 s[2];
s[0] = buf_get_be64(ctr + 8);
overflow = two32 - (s[0] % two32) < nblocks;
#ifdef __builtin_expect
@@ -566,6 +544,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
hd->bulk.cbc_enc = _gcry_aes_ppc8_cbc_enc;
hd->bulk.xts_crypt = _gcry_aes_ppc8_xts_crypt;
hd->bulk.ctr_enc = _gcry_aes_ppc8_ctr_enc;
+ hd->bulk.ocb_crypt = _gcry_aes_ppc8_ocb_crypt;
}
}
#endif
@@ -1541,6 +1520,12 @@ _gcry_aes_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
return _gcry_aes_armv8_ce_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
}
#endif /*USE_ARM_CE*/
+#ifdef USE_PPC_ASM
+ else if (ctx->use_ppc_asm)
+ {
+ return _gcry_aes_ppc8_ocb_crypt (c, outbuf, inbuf, nblocks, encrypt);
+ }
+#endif /*USE_PPC_ASM*/
else if (encrypt)
{
union { unsigned char x1[16] ATTR_ALIGNED_16; u32 x32[4]; } l_tmp;
--
2.20.1


_______________________________________________
Gcrypt-devel mailing list
Gcrypt-devel@gnupg.org
http://lists.gnupg.org/mailman/listinfo/gcrypt-devel