Skip to content

Commit 5b87646

Browse files
zx2c4 authored and intel-lab-lkp committed
zinc: Poly1305 ARM and ARM64 implementations
These NEON and non-NEON implementations come from Andy Polyakov's implementation. They are exactly the same as Andy Polyakov's original, with the following exceptions: - Entries and exits use the proper kernel convention macro. - CPU feature checking is done in C by the glue code, so that has been removed from the assembly. - The function names have been renamed to fit kernel conventions. - Labels have been renamed to fit kernel conventions. - The neon code can jump to the scalar code when it makes sense to do so. After '/^#/d;/^\..*[^:]$/d', the code has the following diff in actual instructions from the original. ARM: -poly1305_init: -.Lpoly1305_init: +ENTRY(poly1305_init_arm) stmdb sp!,{r4-r11} eor r3,r3,r3 @@ -18,8 +25,6 @@ moveq r0,#0 beq .Lno_key - adr r11,.Lpoly1305_init - ldr r12,.LOPENSSL_armcap ldrb r4,[r1,#0] mov r10,#0x0fffffff ldrb r5,[r1,#1] @@ -34,8 +39,6 @@ ldrb r7,[r1,#6] and r4,r4,r10 - ldr r12,[r11,r12] @ OPENSSL_armcap_P - ldr r12,[r12] ldrb r8,[r1,#7] orr r5,r5,r6,lsl#8 ldrb r6,[r1,#8] @@ -45,22 +48,6 @@ ldrb r8,[r1,#10] and r5,r5,r3 - tst r12,#ARMV7_NEON @ check for NEON - adr r9,poly1305_blocks_neon - adr r11,poly1305_blocks - it ne - movne r11,r9 - adr r12,poly1305_emit - adr r10,poly1305_emit_neon - it ne - movne r12,r10 - itete eq - addeq r12,r11,#(poly1305_emit-.Lpoly1305_init) - addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init) - addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init) - addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init) - orr r12,r12,#1 @ thumb-ify address - orr r11,r11,#1 ldrb r9,[r1,#11] orr r6,r6,r7,lsl#8 ldrb r7,[r1,#12] @@ -79,17 +66,16 @@ str r6,[r0,#8] and r7,r7,r3 str r7,[r0,#12] - stmia r2,{r11,r12} @ fill functions table - mov r0,#1 - mov r0,#0 .Lno_key: ldmia sp!,{r4-r11} bx lr @ bx lr tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) -poly1305_blocks: -.Lpoly1305_blocks: +ENDPROC(poly1305_init_arm) 
+ +ENTRY(poly1305_blocks_arm) +.Lpoly1305_blocks_arm: stmdb sp!,{r3-r11,lr} ands r2,r2,#-16 @@ -231,10 +217,11 @@ tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) -poly1305_emit: +ENDPROC(poly1305_blocks_arm) + +ENTRY(poly1305_emit_arm) stmdb sp!,{r4-r11} .Lpoly1305_emit_enter: - ldmia r0,{r3-r7} adds r8,r3,#5 @ compare to modulus adcs r9,r4,#0 @@ -305,8 +292,12 @@ tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) +ENDPROC(poly1305_emit_arm) + + -poly1305_init_neon: +ENTRY(poly1305_init_neon) +.Lpoly1305_init_neon: ldr r4,[r0,#20] @ load key base 2^32 ldr r5,[r0,#24] ldr r6,[r0,#28] @@ -515,8 +506,9 @@ vst1.32 {d8[1]},[r7] bx lr @ bx lr +ENDPROC(poly1305_init_neon) -poly1305_blocks_neon: +ENTRY(poly1305_blocks_neon) ldr ip,[r0,#36] @ is_base2_26 ands r2,r2,#-16 beq .Lno_data_neon @@ -524,7 +516,7 @@ cmp r2,#64 bhs .Lenter_neon tst ip,ip @ is_base2_26? 
- beq .Lpoly1305_blocks + beq .Lpoly1305_blocks_arm .Lenter_neon: stmdb sp!,{r4-r7} @@ -534,7 +526,7 @@ bne .Lbase2_26_neon stmdb sp!,{r1-r3,lr} - bl poly1305_init_neon + bl .Lpoly1305_init_neon ldr r4,[r0,#0] @ load hash value base 2^32 ldr r5,[r0,#4] @@ -989,8 +981,9 @@ ldmia sp!,{r4-r7} .Lno_data_neon: bx lr @ bx lr +ENDPROC(poly1305_blocks_neon) -poly1305_emit_neon: +ENTRY(poly1305_emit_neon) ldr ip,[r0,#36] @ is_base2_26 stmdb sp!,{r4-r11} @@ -1055,6 +1048,6 @@ ldmia sp!,{r4-r11} bx lr @ bx lr +ENDPROC(poly1305_emit_neon) ARM64: -poly1305_init: +ENTRY(poly1305_init_arm) cmp x1,xzr stp xzr,xzr,[x0] // zero hash value stp xzr,xzr,[x0,#16] // [along with is_base2_26] @@ -11,14 +15,9 @@ csel x0,xzr,x0,eq b.eq .Lno_key - ldrsw x11,.LOPENSSL_armcap_P - ldr x11,.LOPENSSL_armcap_P - adr x10,.LOPENSSL_armcap_P - ldp x7,x8,[x1] // load key mov x9,#0xfffffffc0fffffff movk x9,#0x0fff,lsl#48 - ldr w17,[x10,x11] rev x7,x7 // flip bytes rev x8,x8 and x7,x7,x9 // &=0ffffffc0fffffff @@ -26,24 +25,11 @@ and x8,x8,x9 // &=0ffffffc0ffffffc stp x7,x8,[x0,#32] // save key value - tst w17,#ARMV7_NEON - - adr x12,poly1305_blocks - adr x7,poly1305_blocks_neon - adr x13,poly1305_emit - adr x8,poly1305_emit_neon - - csel x12,x12,x7,eq - csel x13,x13,x8,eq - - stp w12,w13,[x2] - stp x12,x13,[x2] - - mov x0,#1 .Lno_key: ret +ENDPROC(poly1305_init_arm) -poly1305_blocks: +ENTRY(poly1305_blocks_arm) ands x2,x2,#-16 b.eq .Lno_data @@ -100,8 +86,9 @@ .Lno_data: ret +ENDPROC(poly1305_blocks_arm) -poly1305_emit: +ENTRY(poly1305_emit_arm) ldp x4,x5,[x0] // load hash base 2^64 ldr x6,[x0,#16] ldp x10,x11,[x2] // load nonce @@ -124,7 +111,9 @@ stp x4,x5,[x1] // write result ret -poly1305_mult: +ENDPROC(poly1305_emit_arm) + +__poly1305_mult: mul x12,x4,x7 // h0*r0 umulh x13,x4,x7 @@ -158,7 +147,7 @@ ret -poly1305_splat: +__poly1305_splat: and x12,x4,#0x03ffffff // base 2^64 -> base 2^26 ubfx x13,x4,#26,#26 extr x14,x5,x4,#52 @@ -182,11 +171,11 @@ ret 
-poly1305_blocks_neon: +ENTRY(poly1305_blocks_neon) ldr x17,[x0,#24] cmp x2,#128 b.hs .Lblocks_neon - cbz x17,poly1305_blocks + cbz x17,poly1305_blocks_arm .Lblocks_neon: stp x29,x30,[sp,#-80]! @@ -232,7 +221,7 @@ adcs x5,x5,x13 adc x6,x6,x3 - bl poly1305_mult + bl __poly1305_mult ldr x30,[sp,#8] cbz x3,.Lstore_base2_64_neon @@ -274,7 +263,7 @@ adcs x5,x5,x13 adc x6,x6,x3 - bl poly1305_mult + bl __poly1305_mult .Linit_neon: and x10,x4,#0x03ffffff // base 2^64 -> base 2^26 @@ -301,19 +290,19 @@ mov x5,x8 mov x6,xzr add x0,x0,#48+12 - bl poly1305_splat + bl __poly1305_splat - bl poly1305_mult // r^2 + bl __poly1305_mult // r^2 sub x0,x0,#4 - bl poly1305_splat + bl __poly1305_splat - bl poly1305_mult // r^3 + bl __poly1305_mult // r^3 sub x0,x0,#4 - bl poly1305_splat + bl __poly1305_splat - bl poly1305_mult // r^4 + bl __poly1305_mult // r^4 sub x0,x0,#4 - bl poly1305_splat + bl __poly1305_splat ldr x30,[sp,#8] add x16,x1,#32 @@ -743,10 +732,11 @@ .Lno_data_neon: ldr x29,[sp],#80 ret +ENDPROC(poly1305_blocks_neon) -poly1305_emit_neon: +ENTRY(poly1305_emit_neon) ldr x17,[x0,#24] - cbz x17,poly1305_emit + cbz x17,poly1305_emit_arm ldp w10,w11,[x0] // load hash value base 2^26 ldp w12,w13,[x0,#8] @@ -788,6 +778,6 @@ stp x4,x5,[x1] // write result ret +ENDPROC(poly1305_emit_neon) Signed-off-by: Jason A. Donenfeld <[email protected]> Cc: Samuel Neves <[email protected]> Cc: Andy Lutomirski <[email protected]> Cc: Greg KH <[email protected]> Cc: Jean-Philippe Aumasson <[email protected]> Cc: Andy Polyakov <[email protected]> Cc: Russell King <[email protected]> Cc: [email protected]
1 parent c78bd8d commit 5b87646

File tree

5 files changed

+2008
-0
lines changed

5 files changed

+2008
-0
lines changed

lib/zinc/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,6 @@ obj-$(CONFIG_ZINC_CHACHA20) += zinc_chacha20.o
1212

1313
zinc_poly1305-y := poly1305/poly1305.o
1414
zinc_poly1305-$(CONFIG_ZINC_ARCH_X86_64) += poly1305/poly1305-x86_64.o
15+
zinc_poly1305-$(CONFIG_ZINC_ARCH_ARM) += poly1305/poly1305-arm.o
16+
zinc_poly1305-$(CONFIG_ZINC_ARCH_ARM64) += poly1305/poly1305-arm64.o
1517
obj-$(CONFIG_ZINC_POLY1305) += zinc_poly1305.o

lib/zinc/poly1305/poly1305-arm-glue.h

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/* SPDX-License-Identifier: MIT
2+
*
3+
* Copyright (C) 2015-2018 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
4+
*/
5+
6+
#include <asm/hwcap.h>
7+
#include <asm/neon.h>
8+
9+
/*
 * Scalar (non-NEON) Poly1305 entry points. Only the prototypes live in this
 * header; the routines themselves are defined elsewhere.
 */
asmlinkage void poly1305_init_arm(void *ctx, const u8 key[16]);
10+
asmlinkage void poly1305_blocks_arm(void *ctx, const u8 *inp, const size_t len,
11+
const u32 padbit);
12+
asmlinkage void poly1305_emit_arm(void *ctx, u8 mac[16], const u32 nonce[4]);
13+
/*
 * The NEON entry points are declared only when kernel-mode NEON is enabled
 * and the build is either 64-bit or targets ARMv7 or newer. ARM_USE_NEON
 * records that decision so the dispatch helpers in this header can compile
 * their NEON branches out when it is not defined.
 */
#if IS_ENABLED(CONFIG_KERNEL_MODE_NEON) && \
14+
(defined(CONFIG_64BIT) || __LINUX_ARM_ARCH__ >= 7)
15+
#define ARM_USE_NEON
16+
asmlinkage void poly1305_blocks_neon(void *ctx, const u8 *inp, const size_t len,
17+
const u32 padbit);
18+
asmlinkage void poly1305_emit_neon(void *ctx, u8 mac[16], const u32 nonce[4]);
19+
#endif
20+
21+
/*
 * Runtime NEON switch: written by poly1305_fpu_init() and read by the
 * blocks/emit dispatch helpers. Marked __ro_after_init.
 */
static bool poly1305_use_neon __ro_after_init;
22+
23+
/*
 * Record whether the CPU advertises the SIMD unit needed by the NEON code.
 *
 * Reads elf_hwcap and stores the result in poly1305_use_neon: the ASIMD bit
 * is tested on CONFIG_ARM64 builds and the NEON bit on CONFIG_ARM builds.
 * When neither option is defined the flag is left untouched.
 */
static void __init poly1305_fpu_init(void)
{
#ifdef CONFIG_ARM64
	poly1305_use_neon = (elf_hwcap & HWCAP_ASIMD) != 0;
#elif defined(CONFIG_ARM)
	poly1305_use_neon = (elf_hwcap & HWCAP_NEON) != 0;
#endif
}
31+
32+
static inline bool poly1305_init_arch(void *ctx,
33+
const u8 key[POLY1305_KEY_SIZE])
34+
{
35+
poly1305_init_arm(ctx, key);
36+
return true;
37+
}
38+
39+
static inline bool poly1305_blocks_arch(void *ctx, const u8 *inp,
40+
const size_t len, const u32 padbit,
41+
simd_context_t *simd_context)
42+
{
43+
#if defined(ARM_USE_NEON)
44+
if (poly1305_use_neon && simd_use(simd_context)) {
45+
poly1305_blocks_neon(ctx, inp, len, padbit);
46+
return true;
47+
}
48+
#endif
49+
poly1305_blocks_arm(ctx, inp, len, padbit);
50+
return true;
51+
}
52+
53+
static inline bool poly1305_emit_arch(void *ctx, u8 mac[POLY1305_MAC_SIZE],
54+
const u32 nonce[4],
55+
simd_context_t *simd_context)
56+
{
57+
#if defined(ARM_USE_NEON)
58+
if (poly1305_use_neon && simd_use(simd_context)) {
59+
poly1305_emit_neon(ctx, mac, nonce);
60+
return true;
61+
}
62+
#endif
63+
poly1305_emit_arm(ctx, mac, nonce);
64+
return true;
65+
}

0 commit comments

Comments
 (0)