Skip to content

Commit 207c000

Browse files
committed
Add assembly converter
1 parent 0751108 commit 207c000

File tree

6 files changed

+654
-31
lines changed

6 files changed

+654
-31
lines changed

s2/_generate/gen.go

Lines changed: 208 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ import (
88
"flag"
99
"fmt"
1010
"math"
11+
"math/rand"
1112
"runtime"
1213

1314
. "github.com/mmcloughlin/avo/build"
@@ -93,6 +94,8 @@ func main() {
9394
o.genEmitCopyNoRepeat()
9495
o.snappy = false
9596
o.genMatchLen()
97+
o.cvtLZ4BlockAsm()
98+
9699
Generate()
97100
}
98101

@@ -1679,7 +1682,7 @@ func (o options) genEmitLiteral() {
16791682
// stack must have at least 32 bytes.
16801683
// retval will contain emitted bytes, but can be nil if this is not interesting.
16811684
// dstBase and litBase are updated.
1682-
// Uses 2 GP registers. With AVX 4 registers.
1685+
// Uses 2 GP registers.
16831686
// If updateDst is true dstBase will have the updated end pointer and an additional register will be used.
16841687
func (o options) emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, updateDst bool) {
16851688
n := GP32()
@@ -2168,8 +2171,9 @@ func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVir
21682171
// Inline call to emitRepeat. Will jump to end
21692172
if !o.snappy {
21702173
o.emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end, false)
2174+
} else {
2175+
JMP(LabelRef("two_byte_offset_" + name))
21712176
}
2172-
JMP(LabelRef("two_byte_offset_" + name))
21732177

21742178
Label("two_byte_offset_short_" + name)
21752179

@@ -2771,3 +2775,205 @@ func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef)
27712775
JMP(end)
27722776
return matched
27732777
}
2778+
2779+
func (o options) cvtLZ4BlockAsm() {
2780+
TEXT("cvtLZ4BlockAsm", NOSPLIT, "func(dst, src []byte) (uncompressed int, dstUsed int)")
2781+
Doc("cvtLZ4BlockAsm converts an LZ4 block to S2", "")
2782+
Pragma("noescape")
2783+
o.outputMargin = 8
2784+
2785+
const (
2786+
errCorrupt = -1
2787+
errDstTooSmall = -2
2788+
)
2789+
dst, dstLen, src, srcLen, retval := GP64(), GP64(), GP64(), GP64(), GP64()
2790+
2791+
// retval = 0
2792+
XORQ(retval, retval)
2793+
2794+
Load(Param("dst").Base(), dst)
2795+
Load(Param("dst").Len(), dstLen)
2796+
Load(Param("src").Base(), src)
2797+
Load(Param("src").Len(), srcLen)
2798+
srcEnd, dstEnd := GP64(), GP64()
2799+
LEAQ(Mem{Base: src, Index: srcLen, Scale: 1, Disp: 0}, srcEnd)
2800+
LEAQ(Mem{Base: dst, Index: dstLen, Scale: 1, Disp: -o.outputMargin}, dstEnd)
2801+
lastOffset := GP64()
2802+
XORQ(lastOffset, lastOffset)
2803+
2804+
checkSrc := func(reg reg.GPVirtual) {
2805+
if debug {
2806+
name := fmt.Sprintf("lz4_s2_ok_%d", rand.Int31())
2807+
2808+
CMPQ(reg, srcEnd)
2809+
JB(LabelRef(name))
2810+
JMP(LabelRef("lz4_s2_corrupt"))
2811+
Label(name)
2812+
} else {
2813+
CMPQ(reg, srcEnd)
2814+
JAE(LabelRef("lz4_s2_corrupt"))
2815+
}
2816+
}
2817+
checkDst := func(reg reg.GPVirtual) {
2818+
CMPQ(reg, dstEnd)
2819+
JAE(LabelRef("lz4_s2_dstfull"))
2820+
}
2821+
2822+
const lz4MinMatch = 4
2823+
2824+
Label("lz4_s2_loop")
2825+
checkSrc(src)
2826+
checkDst(dst)
2827+
token := GP64()
2828+
MOVBQZX(Mem{Base: src}, token)
2829+
ll, ml := GP64(), GP64()
2830+
MOVQ(token, ll)
2831+
MOVQ(token, ml)
2832+
SHRQ(U8(4), ll)
2833+
ANDQ(U8(0xf), ml)
2834+
2835+
// If upper nibble is 15, literal length is extended
2836+
{
2837+
CMPQ(token, U8(0xf0))
2838+
JB(LabelRef("lz4_s2_ll_end"))
2839+
Label("lz4_s2_ll_loop")
2840+
INCQ(src) // s++
2841+
checkSrc(src)
2842+
val := GP64()
2843+
MOVBQZX(Mem{Base: src}, val)
2844+
ADDQ(val, ll)
2845+
CMPQ(val, U8(255))
2846+
JEQ(LabelRef("lz4_s2_ll_loop"))
2847+
Label("lz4_s2_ll_end")
2848+
}
2849+
2850+
// if s+ll >= len(src)
2851+
endLits := GP64()
2852+
LEAQ(Mem{Base: src, Index: ll, Scale: 1}, endLits)
2853+
ADDQ(U8(lz4MinMatch), ml)
2854+
checkSrc(endLits)
2855+
INCQ(src) // s++
2856+
INCQ(endLits)
2857+
TESTQ(ll, ll)
2858+
JZ(LabelRef("lz4_s2_lits_done"))
2859+
{
2860+
dstEnd := GP64()
2861+
LEAQ(Mem{Base: dst, Index: ll, Scale: 1}, dstEnd)
2862+
checkDst(dstEnd)
2863+
o.outputMargin++
2864+
ADDQ(ll, retval)
2865+
o.emitLiteral("lz4_s2", ll, nil, dst, src, LabelRef("lz4_s2_lits_emit_done"), true)
2866+
o.outputMargin--
2867+
Label("lz4_s2_lits_emit_done")
2868+
MOVQ(endLits, src)
2869+
}
2870+
Label("lz4_s2_lits_done")
2871+
// if s == len(src) && ml == lz4MinMatch
2872+
CMPQ(src, srcEnd)
2873+
JNE(LabelRef("lz4_s2_match"))
2874+
CMPQ(ml, U8(lz4MinMatch))
2875+
JEQ(LabelRef("lz4_s2_done"))
2876+
JMP(LabelRef("lz4_s2_corrupt"))
2877+
2878+
Label("lz4_s2_match")
2879+
// if s >= len(src)-2 {
2880+
end := GP64()
2881+
LEAQ(Mem{Base: src, Disp: 2}, end)
2882+
checkSrc(end)
2883+
offset := GP64()
2884+
MOVWQZX(Mem{Base: src}, offset)
2885+
MOVQ(end, src) // s = s + 2
2886+
2887+
if debug {
2888+
// if offset == 0 {
2889+
TESTQ(offset, offset)
2890+
JNZ(LabelRef("lz4_s2_c1"))
2891+
JMP(LabelRef("lz4_s2_corrupt"))
2892+
2893+
Label("lz4_s2_c1")
2894+
2895+
// if int(offset) > uncompressed {
2896+
CMPQ(offset, retval)
2897+
JB(LabelRef("lz4_s2_c2"))
2898+
JMP(LabelRef("lz4_s2_corrupt"))
2899+
2900+
Label("lz4_s2_c2")
2901+
2902+
} else {
2903+
// if offset == 0 {
2904+
TESTQ(offset, offset)
2905+
JZ(LabelRef("lz4_s2_corrupt"))
2906+
2907+
// if int(offset) > uncompressed {
2908+
CMPQ(offset, retval)
2909+
JA(LabelRef("lz4_s2_corrupt"))
2910+
}
2911+
2912+
// if ml == lz4MinMatch+15 {
2913+
{
2914+
CMPQ(ml, U8(lz4MinMatch+15))
2915+
JNE(LabelRef("lz4_s2_ml_done"))
2916+
2917+
Label("lz4_s2_ml_loop")
2918+
val := GP64()
2919+
MOVBQZX(Mem{Base: src}, val)
2920+
INCQ(src) // s++
2921+
ADDQ(val, ml) // ml += val
2922+
checkSrc(src)
2923+
CMPQ(val, U8(255))
2924+
JEQ(LabelRef("lz4_s2_ml_loop"))
2925+
}
2926+
Label("lz4_s2_ml_done")
2927+
2928+
// uncompressed += ml
2929+
ADDQ(ml, retval)
2930+
CMPQ(offset, lastOffset)
2931+
JNE(LabelRef("lz4_s2_docopy"))
2932+
// Offsets can only be 16 bits
2933+
maxLength := o.maxLen
2934+
o.maxLen = 65535
2935+
{
2936+
// emitRepeat16(dst[d:], offset, ml)
2937+
o.emitRepeat("lz4_s2", ml, offset, nil, dst, LabelRef("lz4_s2_loop"), false)
2938+
}
2939+
Label("lz4_s2_docopy")
2940+
{
2941+
// emitCopy16(dst[d:], offset, ml)
2942+
MOVQ(offset, lastOffset)
2943+
o.emitCopy("lz4_s2", ml, offset, nil, dst, LabelRef("lz4_s2_loop"))
2944+
}
2945+
o.maxLen = maxLength
2946+
2947+
Label("lz4_s2_done")
2948+
{
2949+
tmp := GP64()
2950+
Load(Param("dst").Base(), tmp)
2951+
SUBQ(tmp, dst)
2952+
Store(retval, ReturnIndex(0))
2953+
Store(dst, ReturnIndex(1))
2954+
RET()
2955+
}
2956+
Label("lz4_s2_corrupt")
2957+
{
2958+
tmp := GP64()
2959+
if debug {
2960+
tmp := GP64()
2961+
Load(Param("dst").Base(), tmp)
2962+
SUBQ(tmp, dst)
2963+
Store(dst, ReturnIndex(1))
2964+
}
2965+
XORQ(tmp, tmp)
2966+
LEAQ(Mem{Base: tmp, Disp: errCorrupt}, retval)
2967+
Store(retval, ReturnIndex(0))
2968+
RET()
2969+
}
2970+
2971+
Label("lz4_s2_dstfull")
2972+
{
2973+
tmp := GP64()
2974+
XORQ(tmp, tmp)
2975+
LEAQ(Mem{Base: tmp, Disp: errDstTooSmall}, retval)
2976+
Store(retval, ReturnIndex(0))
2977+
RET()
2978+
}
2979+
}

s2/encode_amd64.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
package s2
55

6+
const hasAmd64Asm = true
7+
68
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
79
// assumes that the varint-encoded length of the decompressed bytes has already
810
// been written.

s2/encode_go.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
package s2
55

6+
const hasAmd64Asm = false
7+
68
import (
79
"math/bits"
810
)

s2/encodeblock_amd64.go

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)