~mpldr/go-yenc

eff50c3f50f9b3d647f683cdbc35bd847d62616c — Moritz Poldrack 1 year, 7 months ago 29b5d24
add unrolled assembler version
A testdata/benchmarks/asm-unrolled.go => testdata/benchmarks/asm-unrolled.go +6 -0
@@ 0,0 1,6 @@
// Code generated by command: go run asm-unrolled_gen.go -pkg yenc -stubs asm-unrolled.go -out asm-unrolled.s. DO NOT EDIT.

package yenc

// AssemblerUnrolled yEnc-encodes input into output using x86 assembler
// instructions and an unrolled loop.
//
// Every input byte has 42 added to it (modulo 256). If the result is 0x00,
// 0x0a, 0x0d or 0x3d, the escape character '=' (0x3d) is written first and the
// byte is stored with a further 64 added.
//
// The implementation only reads output's base pointer: it performs no bounds
// checks on output and does not report how many bytes were written. Since one
// input byte can expand to two output bytes, the caller has to supply an
// output slice with room for up to 2*len(input) bytes.
func AssemblerUnrolled(input []byte, output []byte)

A testdata/benchmarks/asm-unrolled.s => testdata/benchmarks/asm-unrolled.s +535 -0
@@ 0,0 1,535 @@
// Code generated by command: go run asm-unrolled_gen.go -pkg yenc -stubs asm-unrolled.go -out asm-unrolled.s. DO NOT EDIT.

#include "textflag.h"

// consts<> is a 256-entry, one-byte-per-entry lookup table. It is indexed by
// the byte value after 42 has been added to it. An entry is 0x01 when that
// value has to be escaped (offsets 0, 10, 13 and 61) and 0x00 otherwise.
DATA consts<>+0(SB)/1, $0x01 // 0x00
DATA consts<>+1(SB)/1, $0x00
DATA consts<>+2(SB)/1, $0x00
DATA consts<>+3(SB)/1, $0x00
DATA consts<>+4(SB)/1, $0x00
DATA consts<>+5(SB)/1, $0x00
DATA consts<>+6(SB)/1, $0x00
DATA consts<>+7(SB)/1, $0x00
DATA consts<>+8(SB)/1, $0x00
DATA consts<>+9(SB)/1, $0x00
DATA consts<>+10(SB)/1, $0x01 // 0x0a
DATA consts<>+11(SB)/1, $0x00
DATA consts<>+12(SB)/1, $0x00
DATA consts<>+13(SB)/1, $0x01 // 0x0d
DATA consts<>+14(SB)/1, $0x00
DATA consts<>+15(SB)/1, $0x00
DATA consts<>+16(SB)/1, $0x00
DATA consts<>+17(SB)/1, $0x00
DATA consts<>+18(SB)/1, $0x00
DATA consts<>+19(SB)/1, $0x00
DATA consts<>+20(SB)/1, $0x00
DATA consts<>+21(SB)/1, $0x00
DATA consts<>+22(SB)/1, $0x00
DATA consts<>+23(SB)/1, $0x00
DATA consts<>+24(SB)/1, $0x00
DATA consts<>+25(SB)/1, $0x00
DATA consts<>+26(SB)/1, $0x00
DATA consts<>+27(SB)/1, $0x00
DATA consts<>+28(SB)/1, $0x00
DATA consts<>+29(SB)/1, $0x00
DATA consts<>+30(SB)/1, $0x00
DATA consts<>+31(SB)/1, $0x00
DATA consts<>+32(SB)/1, $0x00
DATA consts<>+33(SB)/1, $0x00
DATA consts<>+34(SB)/1, $0x00
DATA consts<>+35(SB)/1, $0x00
DATA consts<>+36(SB)/1, $0x00
DATA consts<>+37(SB)/1, $0x00
DATA consts<>+38(SB)/1, $0x00
DATA consts<>+39(SB)/1, $0x00
DATA consts<>+40(SB)/1, $0x00
DATA consts<>+41(SB)/1, $0x00
DATA consts<>+42(SB)/1, $0x00
DATA consts<>+43(SB)/1, $0x00
DATA consts<>+44(SB)/1, $0x00
DATA consts<>+45(SB)/1, $0x00
DATA consts<>+46(SB)/1, $0x00
DATA consts<>+47(SB)/1, $0x00
DATA consts<>+48(SB)/1, $0x00
DATA consts<>+49(SB)/1, $0x00
DATA consts<>+50(SB)/1, $0x00
DATA consts<>+51(SB)/1, $0x00
DATA consts<>+52(SB)/1, $0x00
DATA consts<>+53(SB)/1, $0x00
DATA consts<>+54(SB)/1, $0x00
DATA consts<>+55(SB)/1, $0x00
DATA consts<>+56(SB)/1, $0x00
DATA consts<>+57(SB)/1, $0x00
DATA consts<>+58(SB)/1, $0x00
DATA consts<>+59(SB)/1, $0x00
DATA consts<>+60(SB)/1, $0x00
DATA consts<>+61(SB)/1, $0x01 // 0x3d
DATA consts<>+62(SB)/1, $0x00
DATA consts<>+63(SB)/1, $0x00
DATA consts<>+64(SB)/1, $0x00
DATA consts<>+65(SB)/1, $0x00
DATA consts<>+66(SB)/1, $0x00
DATA consts<>+67(SB)/1, $0x00
DATA consts<>+68(SB)/1, $0x00
DATA consts<>+69(SB)/1, $0x00
DATA consts<>+70(SB)/1, $0x00
DATA consts<>+71(SB)/1, $0x00
DATA consts<>+72(SB)/1, $0x00
DATA consts<>+73(SB)/1, $0x00
DATA consts<>+74(SB)/1, $0x00
DATA consts<>+75(SB)/1, $0x00
DATA consts<>+76(SB)/1, $0x00
DATA consts<>+77(SB)/1, $0x00
DATA consts<>+78(SB)/1, $0x00
DATA consts<>+79(SB)/1, $0x00
DATA consts<>+80(SB)/1, $0x00
DATA consts<>+81(SB)/1, $0x00
DATA consts<>+82(SB)/1, $0x00
DATA consts<>+83(SB)/1, $0x00
DATA consts<>+84(SB)/1, $0x00
DATA consts<>+85(SB)/1, $0x00
DATA consts<>+86(SB)/1, $0x00
DATA consts<>+87(SB)/1, $0x00
DATA consts<>+88(SB)/1, $0x00
DATA consts<>+89(SB)/1, $0x00
DATA consts<>+90(SB)/1, $0x00
DATA consts<>+91(SB)/1, $0x00
DATA consts<>+92(SB)/1, $0x00
DATA consts<>+93(SB)/1, $0x00
DATA consts<>+94(SB)/1, $0x00
DATA consts<>+95(SB)/1, $0x00
DATA consts<>+96(SB)/1, $0x00
DATA consts<>+97(SB)/1, $0x00
DATA consts<>+98(SB)/1, $0x00
DATA consts<>+99(SB)/1, $0x00
DATA consts<>+100(SB)/1, $0x00
DATA consts<>+101(SB)/1, $0x00
DATA consts<>+102(SB)/1, $0x00
DATA consts<>+103(SB)/1, $0x00
DATA consts<>+104(SB)/1, $0x00
DATA consts<>+105(SB)/1, $0x00
DATA consts<>+106(SB)/1, $0x00
DATA consts<>+107(SB)/1, $0x00
DATA consts<>+108(SB)/1, $0x00
DATA consts<>+109(SB)/1, $0x00
DATA consts<>+110(SB)/1, $0x00
DATA consts<>+111(SB)/1, $0x00
DATA consts<>+112(SB)/1, $0x00
DATA consts<>+113(SB)/1, $0x00
DATA consts<>+114(SB)/1, $0x00
DATA consts<>+115(SB)/1, $0x00
DATA consts<>+116(SB)/1, $0x00
DATA consts<>+117(SB)/1, $0x00
DATA consts<>+118(SB)/1, $0x00
DATA consts<>+119(SB)/1, $0x00
DATA consts<>+120(SB)/1, $0x00
DATA consts<>+121(SB)/1, $0x00
DATA consts<>+122(SB)/1, $0x00
DATA consts<>+123(SB)/1, $0x00
DATA consts<>+124(SB)/1, $0x00
DATA consts<>+125(SB)/1, $0x00
DATA consts<>+126(SB)/1, $0x00
DATA consts<>+127(SB)/1, $0x00
DATA consts<>+128(SB)/1, $0x00
DATA consts<>+129(SB)/1, $0x00
DATA consts<>+130(SB)/1, $0x00
DATA consts<>+131(SB)/1, $0x00
DATA consts<>+132(SB)/1, $0x00
DATA consts<>+133(SB)/1, $0x00
DATA consts<>+134(SB)/1, $0x00
DATA consts<>+135(SB)/1, $0x00
DATA consts<>+136(SB)/1, $0x00
DATA consts<>+137(SB)/1, $0x00
DATA consts<>+138(SB)/1, $0x00
DATA consts<>+139(SB)/1, $0x00
DATA consts<>+140(SB)/1, $0x00
DATA consts<>+141(SB)/1, $0x00
DATA consts<>+142(SB)/1, $0x00
DATA consts<>+143(SB)/1, $0x00
DATA consts<>+144(SB)/1, $0x00
DATA consts<>+145(SB)/1, $0x00
DATA consts<>+146(SB)/1, $0x00
DATA consts<>+147(SB)/1, $0x00
DATA consts<>+148(SB)/1, $0x00
DATA consts<>+149(SB)/1, $0x00
DATA consts<>+150(SB)/1, $0x00
DATA consts<>+151(SB)/1, $0x00
DATA consts<>+152(SB)/1, $0x00
DATA consts<>+153(SB)/1, $0x00
DATA consts<>+154(SB)/1, $0x00
DATA consts<>+155(SB)/1, $0x00
DATA consts<>+156(SB)/1, $0x00
DATA consts<>+157(SB)/1, $0x00
DATA consts<>+158(SB)/1, $0x00
DATA consts<>+159(SB)/1, $0x00
DATA consts<>+160(SB)/1, $0x00
DATA consts<>+161(SB)/1, $0x00
DATA consts<>+162(SB)/1, $0x00
DATA consts<>+163(SB)/1, $0x00
DATA consts<>+164(SB)/1, $0x00
DATA consts<>+165(SB)/1, $0x00
DATA consts<>+166(SB)/1, $0x00
DATA consts<>+167(SB)/1, $0x00
DATA consts<>+168(SB)/1, $0x00
DATA consts<>+169(SB)/1, $0x00
DATA consts<>+170(SB)/1, $0x00
DATA consts<>+171(SB)/1, $0x00
DATA consts<>+172(SB)/1, $0x00
DATA consts<>+173(SB)/1, $0x00
DATA consts<>+174(SB)/1, $0x00
DATA consts<>+175(SB)/1, $0x00
DATA consts<>+176(SB)/1, $0x00
DATA consts<>+177(SB)/1, $0x00
DATA consts<>+178(SB)/1, $0x00
DATA consts<>+179(SB)/1, $0x00
DATA consts<>+180(SB)/1, $0x00
DATA consts<>+181(SB)/1, $0x00
DATA consts<>+182(SB)/1, $0x00
DATA consts<>+183(SB)/1, $0x00
DATA consts<>+184(SB)/1, $0x00
DATA consts<>+185(SB)/1, $0x00
DATA consts<>+186(SB)/1, $0x00
DATA consts<>+187(SB)/1, $0x00
DATA consts<>+188(SB)/1, $0x00
DATA consts<>+189(SB)/1, $0x00
DATA consts<>+190(SB)/1, $0x00
DATA consts<>+191(SB)/1, $0x00
DATA consts<>+192(SB)/1, $0x00
DATA consts<>+193(SB)/1, $0x00
DATA consts<>+194(SB)/1, $0x00
DATA consts<>+195(SB)/1, $0x00
DATA consts<>+196(SB)/1, $0x00
DATA consts<>+197(SB)/1, $0x00
DATA consts<>+198(SB)/1, $0x00
DATA consts<>+199(SB)/1, $0x00
DATA consts<>+200(SB)/1, $0x00
DATA consts<>+201(SB)/1, $0x00
DATA consts<>+202(SB)/1, $0x00
DATA consts<>+203(SB)/1, $0x00
DATA consts<>+204(SB)/1, $0x00
DATA consts<>+205(SB)/1, $0x00
DATA consts<>+206(SB)/1, $0x00
DATA consts<>+207(SB)/1, $0x00
DATA consts<>+208(SB)/1, $0x00
DATA consts<>+209(SB)/1, $0x00
DATA consts<>+210(SB)/1, $0x00
DATA consts<>+211(SB)/1, $0x00
DATA consts<>+212(SB)/1, $0x00
DATA consts<>+213(SB)/1, $0x00
DATA consts<>+214(SB)/1, $0x00
DATA consts<>+215(SB)/1, $0x00
DATA consts<>+216(SB)/1, $0x00
DATA consts<>+217(SB)/1, $0x00
DATA consts<>+218(SB)/1, $0x00
DATA consts<>+219(SB)/1, $0x00
DATA consts<>+220(SB)/1, $0x00
DATA consts<>+221(SB)/1, $0x00
DATA consts<>+222(SB)/1, $0x00
DATA consts<>+223(SB)/1, $0x00
DATA consts<>+224(SB)/1, $0x00
DATA consts<>+225(SB)/1, $0x00
DATA consts<>+226(SB)/1, $0x00
DATA consts<>+227(SB)/1, $0x00
DATA consts<>+228(SB)/1, $0x00
DATA consts<>+229(SB)/1, $0x00
DATA consts<>+230(SB)/1, $0x00
DATA consts<>+231(SB)/1, $0x00
DATA consts<>+232(SB)/1, $0x00
DATA consts<>+233(SB)/1, $0x00
DATA consts<>+234(SB)/1, $0x00
DATA consts<>+235(SB)/1, $0x00
DATA consts<>+236(SB)/1, $0x00
DATA consts<>+237(SB)/1, $0x00
DATA consts<>+238(SB)/1, $0x00
DATA consts<>+239(SB)/1, $0x00
DATA consts<>+240(SB)/1, $0x00
DATA consts<>+241(SB)/1, $0x00
DATA consts<>+242(SB)/1, $0x00
DATA consts<>+243(SB)/1, $0x00
DATA consts<>+244(SB)/1, $0x00
DATA consts<>+245(SB)/1, $0x00
DATA consts<>+246(SB)/1, $0x00
DATA consts<>+247(SB)/1, $0x00
DATA consts<>+248(SB)/1, $0x00
DATA consts<>+249(SB)/1, $0x00
DATA consts<>+250(SB)/1, $0x00
DATA consts<>+251(SB)/1, $0x00
DATA consts<>+252(SB)/1, $0x00
DATA consts<>+253(SB)/1, $0x00
DATA consts<>+254(SB)/1, $0x00
DATA consts<>+255(SB)/1, $0x00
GLOBL consts<>(SB), RODATA|NOPTR, $256

// func AssemblerUnrolled(input []byte, output []byte)
//
// Encodes len(input) bytes from input into output. Each byte has 42 added to
// it; if the result is flagged in consts<> (0, 10, 13 or 61), a '=' (0x3d) is
// written first and a further 64 is added to the byte before it is stored.
//
// Register use:
//   AX  pointer to the next input byte
//   CX  number of input bytes still to be encoded
//   DX  pointer to the next output byte
//   SI  base address of the consts<> lookup table
//   BX  byte being encoded (zero-extended, so it can index consts<>)
//
// While at least 8 bytes remain, unroll_start processes 8 bytes per pass
// without re-checking CX in between; the remaining 0-7 bytes are handled one
// at a time by encode_loop.
//
// Only output's base pointer is loaded: nothing here checks the length or
// capacity of output, and the number of bytes written is not returned.
TEXT ·AssemblerUnrolled(SB), NOSPLIT, $0-48
	// load input, input length, and output
	MOVQ input_base+0(FP), AX
	MOVQ input_len+8(FP), CX
	MOVQ output_base+24(FP), DX
	LEAQ consts<>+0(SB), SI

unroll_start:
	// fewer than 8 bytes left: finish with the byte-at-a-time loop
	CMPQ CX, $0x08
	JL   encode_loop

	// unrolled step 0: load the next input value, zero-extended into BX
	MOVBQZX (AX), BX

	// decrement number of bytes to be loaded and move the input pointer to the next byte
	DECQ CX
	INCQ AX

	// add 42; the 8-bit add wraps modulo 256 and leaves the upper bits of BX zero
	ADDB $0x2a, BL

	// check if it's 0, 10, 13, or 61 (table entry is 1) and escape if so
	CMPB (SI)(BX*1), $0x01
	JE   escape_unroll_0

escape_done_unroll_0:
	// store the encoded byte and advance the output pointer
	MOVB BL, (DX)
	ADDQ $0x01, DX

	// unrolled step 1: load the next input value, zero-extended into BX
	MOVBQZX (AX), BX

	// decrement number of bytes to be loaded and move the input pointer to the next byte
	DECQ CX
	INCQ AX

	// add 42; the 8-bit add wraps modulo 256 and leaves the upper bits of BX zero
	ADDB $0x2a, BL

	// check if it's 0, 10, 13, or 61 (table entry is 1) and escape if so
	CMPB (SI)(BX*1), $0x01
	JE   escape_unroll_1

escape_done_unroll_1:
	// store the encoded byte and advance the output pointer
	MOVB BL, (DX)
	ADDQ $0x01, DX

	// unrolled step 2: load the next input value, zero-extended into BX
	MOVBQZX (AX), BX

	// decrement number of bytes to be loaded and move the input pointer to the next byte
	DECQ CX
	INCQ AX

	// add 42; the 8-bit add wraps modulo 256 and leaves the upper bits of BX zero
	ADDB $0x2a, BL

	// check if it's 0, 10, 13, or 61 (table entry is 1) and escape if so
	CMPB (SI)(BX*1), $0x01
	JE   escape_unroll_2

escape_done_unroll_2:
	// store the encoded byte and advance the output pointer
	MOVB BL, (DX)
	ADDQ $0x01, DX

	// unrolled step 3: load the next input value, zero-extended into BX
	MOVBQZX (AX), BX

	// decrement number of bytes to be loaded and move the input pointer to the next byte
	DECQ CX
	INCQ AX

	// add 42; the 8-bit add wraps modulo 256 and leaves the upper bits of BX zero
	ADDB $0x2a, BL

	// check if it's 0, 10, 13, or 61 (table entry is 1) and escape if so
	CMPB (SI)(BX*1), $0x01
	JE   escape_unroll_3

escape_done_unroll_3:
	// store the encoded byte and advance the output pointer
	MOVB BL, (DX)
	ADDQ $0x01, DX

	// unrolled step 4: load the next input value, zero-extended into BX
	MOVBQZX (AX), BX

	// decrement number of bytes to be loaded and move the input pointer to the next byte
	DECQ CX
	INCQ AX

	// add 42; the 8-bit add wraps modulo 256 and leaves the upper bits of BX zero
	ADDB $0x2a, BL

	// check if it's 0, 10, 13, or 61 (table entry is 1) and escape if so
	CMPB (SI)(BX*1), $0x01
	JE   escape_unroll_4

escape_done_unroll_4:
	// store the encoded byte and advance the output pointer
	MOVB BL, (DX)
	ADDQ $0x01, DX

	// unrolled step 5: load the next input value, zero-extended into BX
	MOVBQZX (AX), BX

	// decrement number of bytes to be loaded and move the input pointer to the next byte
	DECQ CX
	INCQ AX

	// add 42; the 8-bit add wraps modulo 256 and leaves the upper bits of BX zero
	ADDB $0x2a, BL

	// check if it's 0, 10, 13, or 61 (table entry is 1) and escape if so
	CMPB (SI)(BX*1), $0x01
	JE   escape_unroll_5

escape_done_unroll_5:
	// store the encoded byte and advance the output pointer
	MOVB BL, (DX)
	ADDQ $0x01, DX

	// unrolled step 6: load the next input value, zero-extended into BX
	MOVBQZX (AX), BX

	// decrement number of bytes to be loaded and move the input pointer to the next byte
	DECQ CX
	INCQ AX

	// add 42; the 8-bit add wraps modulo 256 and leaves the upper bits of BX zero
	ADDB $0x2a, BL

	// check if it's 0, 10, 13, or 61 (table entry is 1) and escape if so
	CMPB (SI)(BX*1), $0x01
	JE   escape_unroll_6

escape_done_unroll_6:
	// store the encoded byte and advance the output pointer
	MOVB BL, (DX)
	ADDQ $0x01, DX

	// unrolled step 7: load the next input value, zero-extended into BX
	MOVBQZX (AX), BX

	// decrement number of bytes to be loaded and move the input pointer to the next byte
	DECQ CX
	INCQ AX

	// add 42; the 8-bit add wraps modulo 256 and leaves the upper bits of BX zero
	ADDB $0x2a, BL

	// check if it's 0, 10, 13, or 61 (table entry is 1) and escape if so
	CMPB (SI)(BX*1), $0x01
	JE   escape_unroll_7

escape_done_unroll_7:
	// store the encoded byte and advance the output pointer, then start the next pass
	MOVB BL, (DX)
	ADDQ $0x01, DX
	JMP  unroll_start

	// Out-of-line escape handlers, one per unrolled step. Each one writes the
	// escape character, adjusts the byte, and jumps back to the store of the
	// step it was entered from.
escape_unroll_0:
	// add 64
	ADDB $0x40, BL

	// write escape character ('=') to output
	MOVB $0x3d, (DX)
	ADDQ $0x01, DX
	JMP  escape_done_unroll_0

escape_unroll_1:
	// add 64
	ADDB $0x40, BL

	// write escape character ('=') to output
	MOVB $0x3d, (DX)
	ADDQ $0x01, DX
	JMP  escape_done_unroll_1

escape_unroll_2:
	// add 64
	ADDB $0x40, BL

	// write escape character ('=') to output
	MOVB $0x3d, (DX)
	ADDQ $0x01, DX
	JMP  escape_done_unroll_2

escape_unroll_3:
	// add 64
	ADDB $0x40, BL

	// write escape character ('=') to output
	MOVB $0x3d, (DX)
	ADDQ $0x01, DX
	JMP  escape_done_unroll_3

escape_unroll_4:
	// add 64
	ADDB $0x40, BL

	// write escape character ('=') to output
	MOVB $0x3d, (DX)
	ADDQ $0x01, DX
	JMP  escape_done_unroll_4

escape_unroll_5:
	// add 64
	ADDB $0x40, BL

	// write escape character ('=') to output
	MOVB $0x3d, (DX)
	ADDQ $0x01, DX
	JMP  escape_done_unroll_5

escape_unroll_6:
	// add 64
	ADDB $0x40, BL

	// write escape character ('=') to output
	MOVB $0x3d, (DX)
	ADDQ $0x01, DX
	JMP  escape_done_unroll_6

escape_unroll_7:
	// add 64
	ADDB $0x40, BL

	// write escape character ('=') to output
	MOVB $0x3d, (DX)
	ADDQ $0x01, DX
	JMP  escape_done_unroll_7

	// Tail loop: same per-byte work as one unrolled step, but CX is checked
	// before every byte.
encode_loop:
	// when there are 0 bytes left to encode, we're done
	CMPQ CX, $0x00
	JZ   done

	// load the next input value, zero-extended into BX
	MOVBQZX (AX), BX

	// decrement number of bytes to be loaded and move the input pointer to the next byte
	DECQ CX
	INCQ AX

	// add 42; the 8-bit add wraps modulo 256 and leaves the upper bits of BX zero
	ADDB $0x2a, BL

	// check if it's 0, 10, 13, or 61 (table entry is 1) and escape if so
	CMPB (SI)(BX*1), $0x01
	JE   escape

escape_done:
	// store the encoded byte and advance the output pointer
	MOVB BL, (DX)
	ADDQ $0x01, DX
	JMP  encode_loop

escape:
	// add 64
	ADDB $0x40, BL

	// write escape character ('=') to output
	MOVB $0x3d, (DX)
	ADDQ $0x01, DX
	JMP  escape_done

done:
	RET

A testdata/benchmarks/asm-unrolled_gen.go => testdata/benchmarks/asm-unrolled_gen.go +108 -0
@@ 0,0 1,108 @@
//go:build ignore

package main

import (
	"fmt"

	. "github.com/mmcloughlin/avo/build"
	. "github.com/mmcloughlin/avo/operand"
)

// main emits the avo program for AssemblerUnrolled: a 256-byte lookup table
// flagging the values that need escaping, an encode loop unrolled
// unrollFactor times, and a byte-at-a-time tail loop for the remainder.
func main() {
	// unrollFactor is both the number of encode steps emitted per pass of the
	// unrolled loop and the minimum number of remaining input bytes needed to
	// enter it.
	const unrollFactor = 8

	// Lookup table indexed by the byte value after adding 42: 1 marks a value
	// that has to be escaped, 0 everything else.
	table := GLOBL("consts", RODATA|NOPTR)
	for b := 0; b < 256; b++ {
		var flag int
		switch b {
		case 0x00, 0x0a, 0x0d, 0x3d:
			flag = 1
		}
		DATA(b, U8(flag))
	}

	TEXT("AssemblerUnrolled", NOSPLIT, "func(input, output []byte)")
	Doc("yenc encode using x86 assembler instructions using unrolled loops")

	// The order of the GP64() calls below is kept stable on purpose so the
	// generated register assignment does not change.
	Comment("load input, input length, and output")
	src := Load(Param("input").Base(), GP64())
	remaining := Load(Param("input").Len(), GP64())
	dst := Load(Param("output").Base(), GP64())
	cur := GP64()
	lut := Mem{Base: GP64()}
	LEAQ(table, lut.Base)

	// encodeStep emits the code that encodes a single byte. It branches to
	// escapeLabel when the byte needs escaping and defines resumeLabel at the
	// store the escape handler jumps back to.
	encodeStep := func(escapeLabel, resumeLabel string) {
		Comment("load the next input value")
		MOVBQZX(Mem{Base: src}, cur)

		Comment("decrement number of bytes to be loaded and move the input pointer to the next byte")
		DECQ(remaining)
		INCQ(src)

		Comment("add 42")
		ADDB(U8(0x2a), cur.As8())

		Comment("check if it's 0, 10, 13, or 61 and escape if so")
		CMPB(lut.Idx(cur, 1), U8(1))
		JE(LabelRef(escapeLabel))

		// not escaped: fall through to the store
		Label(resumeLabel)
		Comment("return encoded character")
		MOVB(cur.As8(), Mem{Base: dst})
		ADDQ(Imm(1), dst)
	}

	// escapeHandler emits the out-of-line escape path for one encodeStep: it
	// writes the escape character, adds 64 to the byte and jumps back to
	// resumeLabel.
	escapeHandler := func(escapeLabel, resumeLabel string) {
		Label(escapeLabel)
		Comment("add 64")
		ADDB(U8(64), cur.As8())
		Comment("write escape character to output")
		MOVB(Imm(0x3d), Mem{Base: dst})
		ADDQ(Imm(1), dst)
		JMP(LabelRef(resumeLabel))
	}

	escapeName := func(i int) string { return fmt.Sprintf("escape_unroll_%d", i) }
	resumeName := func(i int) string { return fmt.Sprintf("escape_done_unroll_%d", i) }

	// Unrolled loop: taken only while at least unrollFactor bytes remain.
	Label("unroll_start")
	CMPQ(remaining, Imm(unrollFactor))
	JL(LabelRef("encode_loop"))

	for step := 0; step < unrollFactor; step++ {
		encodeStep(escapeName(step), resumeName(step))
	}
	JMP(LabelRef("unroll_start"))

	for step := 0; step < unrollFactor; step++ {
		escapeHandler(escapeName(step), resumeName(step))
	}

	// Tail loop: handles the remaining bytes one at a time.
	Label("encode_loop")
	Comment("when there are 0 byte left to encode, we're done")
	CMPQ(remaining, Imm(0))
	JZ(LabelRef("done"))

	encodeStep("escape", "escape_done")
	JMP(LabelRef("encode_loop"))

	escapeHandler("escape", "escape_done")

	Label("done")
	RET()

	Generate()
}

M testdata/benchmarks/yenc_test.go => testdata/benchmarks/yenc_test.go +10 -0
@@ 56,6 56,11 @@ func TestEncodingMultibyte(t *testing.T) {
			AssemblerPlain(in[:], out)
			return out
		}},
		{"assembler-unrolled", func(in [8]byte) []byte {
			out := make([]byte, 16)
			AssemblerUnrolled(in[:], out)
			return out
		}},
	}
	for _, enc := range encoder8byte {
		t.Run(enc.name, func(t *testing.T) {


@@ 403,6 408,11 @@ func BenchmarkEncoding64Byte(b *testing.B) {
			AssemblerPlain(in[:], out)
			return out
		}},
		{"assembler-unrolled", func(in [64]byte) []byte {
			out := make([]byte, 128)
			AssemblerUnrolled(in[:], out)
			return out
		}},
	}
	for _, enc := range simdEncoder {
		b.Run(enc.name, func(b *testing.B) {