|
8 | 8 | "flag"
|
9 | 9 | "fmt"
|
10 | 10 | "math"
|
| 11 | + "math/rand" |
11 | 12 | "runtime"
|
12 | 13 |
|
13 | 14 | . "github.com/mmcloughlin/avo/build"
|
@@ -93,6 +94,8 @@ func main() {
|
93 | 94 | o.genEmitCopyNoRepeat()
|
94 | 95 | o.snappy = false
|
95 | 96 | o.genMatchLen()
|
| 97 | + o.cvtLZ4BlockAsm() |
| 98 | + |
96 | 99 | Generate()
|
97 | 100 | }
|
98 | 101 |
|
@@ -1679,7 +1682,7 @@ func (o options) genEmitLiteral() {
|
1679 | 1682 | // stack must have at least 32 bytes.
|
1680 | 1683 | // retval will contain emitted bytes, but can be nil if this is not interesting.
|
1681 | 1684 | // dstBase and litBase are updated.
|
1682 |
| -// Uses 2 GP registers. With AVX 4 registers. |
| 1685 | +// Uses 2 GP registers. |
1683 | 1686 | // If updateDst is true dstBase will have the updated end pointer and an additional register will be used.
|
1684 | 1687 | func (o options) emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, updateDst bool) {
|
1685 | 1688 | n := GP32()
|
@@ -2168,8 +2171,9 @@ func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVir
|
2168 | 2171 | // Inline call to emitRepeat. Will jump to end
|
2169 | 2172 | if !o.snappy {
|
2170 | 2173 | o.emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end, false)
|
| 2174 | + } else { |
| 2175 | + JMP(LabelRef("two_byte_offset_" + name)) |
2171 | 2176 | }
|
2172 |
| - JMP(LabelRef("two_byte_offset_" + name)) |
2173 | 2177 |
|
2174 | 2178 | Label("two_byte_offset_short_" + name)
|
2175 | 2179 |
|
@@ -2771,3 +2775,205 @@ func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef)
|
2771 | 2775 | JMP(end)
|
2772 | 2776 | return matched
|
2773 | 2777 | }
|
| 2778 | + |
| 2779 | +func (o options) cvtLZ4BlockAsm() { |
| 2780 | + TEXT("cvtLZ4BlockAsm", NOSPLIT, "func(dst, src []byte) (uncompressed int, dstUsed int)") |
| 2781 | + Doc("cvtLZ4BlockAsm converts an LZ4 block to S2", "") |
| 2782 | + Pragma("noescape") |
| 2783 | + o.outputMargin = 8 |
| 2784 | + |
| 2785 | + const ( |
| 2786 | + errCorrupt = -1 |
| 2787 | + errDstTooSmall = -2 |
| 2788 | + ) |
| 2789 | + dst, dstLen, src, srcLen, retval := GP64(), GP64(), GP64(), GP64(), GP64() |
| 2790 | + |
| 2791 | + // retval = 0 |
| 2792 | + XORQ(retval, retval) |
| 2793 | + |
| 2794 | + Load(Param("dst").Base(), dst) |
| 2795 | + Load(Param("dst").Len(), dstLen) |
| 2796 | + Load(Param("src").Base(), src) |
| 2797 | + Load(Param("src").Len(), srcLen) |
| 2798 | + srcEnd, dstEnd := GP64(), GP64() |
| 2799 | + LEAQ(Mem{Base: src, Index: srcLen, Scale: 1, Disp: 0}, srcEnd) |
| 2800 | + LEAQ(Mem{Base: dst, Index: dstLen, Scale: 1, Disp: -o.outputMargin}, dstEnd) |
| 2801 | + lastOffset := GP64() |
| 2802 | + XORQ(lastOffset, lastOffset) |
| 2803 | + |
| 2804 | + checkSrc := func(reg reg.GPVirtual) { |
| 2805 | + if debug { |
| 2806 | + name := fmt.Sprintf("lz4_s2_ok_%d", rand.Int31()) |
| 2807 | + |
| 2808 | + CMPQ(reg, srcEnd) |
| 2809 | + JB(LabelRef(name)) |
| 2810 | + JMP(LabelRef("lz4_s2_corrupt")) |
| 2811 | + Label(name) |
| 2812 | + } else { |
| 2813 | + CMPQ(reg, srcEnd) |
| 2814 | + JAE(LabelRef("lz4_s2_corrupt")) |
| 2815 | + } |
| 2816 | + } |
| 2817 | + checkDst := func(reg reg.GPVirtual) { |
| 2818 | + CMPQ(reg, dstEnd) |
| 2819 | + JAE(LabelRef("lz4_s2_dstfull")) |
| 2820 | + } |
| 2821 | + |
| 2822 | + const lz4MinMatch = 4 |
| 2823 | + |
| 2824 | + Label("lz4_s2_loop") |
| 2825 | + checkSrc(src) |
| 2826 | + checkDst(dst) |
| 2827 | + token := GP64() |
| 2828 | + MOVBQZX(Mem{Base: src}, token) |
| 2829 | + ll, ml := GP64(), GP64() |
| 2830 | + MOVQ(token, ll) |
| 2831 | + MOVQ(token, ml) |
| 2832 | + SHRQ(U8(4), ll) |
| 2833 | + ANDQ(U8(0xf), ml) |
| 2834 | + |
| 2835 | + // If upper nibble is 15, literal length is extended |
| 2836 | + { |
| 2837 | + CMPQ(token, U8(0xf0)) |
| 2838 | + JB(LabelRef("lz4_s2_ll_end")) |
| 2839 | + Label("lz4_s2_ll_loop") |
| 2840 | + INCQ(src) // s++ |
| 2841 | + checkSrc(src) |
| 2842 | + val := GP64() |
| 2843 | + MOVBQZX(Mem{Base: src}, val) |
| 2844 | + ADDQ(val, ll) |
| 2845 | + CMPQ(val, U8(255)) |
| 2846 | + JEQ(LabelRef("lz4_s2_ll_loop")) |
| 2847 | + Label("lz4_s2_ll_end") |
| 2848 | + } |
| 2849 | + |
| 2850 | + // if s+ll >= len(src) |
| 2851 | + endLits := GP64() |
| 2852 | + LEAQ(Mem{Base: src, Index: ll, Scale: 1}, endLits) |
| 2853 | + ADDQ(U8(lz4MinMatch), ml) |
| 2854 | + checkSrc(endLits) |
| 2855 | + INCQ(src) // s++ |
| 2856 | + INCQ(endLits) |
| 2857 | + TESTQ(ll, ll) |
| 2858 | + JZ(LabelRef("lz4_s2_lits_done")) |
| 2859 | + { |
| 2860 | + dstEnd := GP64() |
| 2861 | + LEAQ(Mem{Base: dst, Index: ll, Scale: 1}, dstEnd) |
| 2862 | + checkDst(dstEnd) |
| 2863 | + o.outputMargin++ |
| 2864 | + ADDQ(ll, retval) |
| 2865 | + o.emitLiteral("lz4_s2", ll, nil, dst, src, LabelRef("lz4_s2_lits_emit_done"), true) |
| 2866 | + o.outputMargin-- |
| 2867 | + Label("lz4_s2_lits_emit_done") |
| 2868 | + MOVQ(endLits, src) |
| 2869 | + } |
| 2870 | + Label("lz4_s2_lits_done") |
| 2871 | + // if s == len(src) && ml == lz4MinMatch |
| 2872 | + CMPQ(src, srcEnd) |
| 2873 | + JNE(LabelRef("lz4_s2_match")) |
| 2874 | + CMPQ(ml, U8(lz4MinMatch)) |
| 2875 | + JEQ(LabelRef("lz4_s2_done")) |
| 2876 | + JMP(LabelRef("lz4_s2_corrupt")) |
| 2877 | + |
| 2878 | + Label("lz4_s2_match") |
| 2879 | + // if s >= len(src)-2 { |
| 2880 | + end := GP64() |
| 2881 | + LEAQ(Mem{Base: src, Disp: 2}, end) |
| 2882 | + checkSrc(end) |
| 2883 | + offset := GP64() |
| 2884 | + MOVWQZX(Mem{Base: src}, offset) |
| 2885 | + MOVQ(end, src) // s = s + 2 |
| 2886 | + |
| 2887 | + if debug { |
| 2888 | + // if offset == 0 { |
| 2889 | + TESTQ(offset, offset) |
| 2890 | + JNZ(LabelRef("lz4_s2_c1")) |
| 2891 | + JMP(LabelRef("lz4_s2_corrupt")) |
| 2892 | + |
| 2893 | + Label("lz4_s2_c1") |
| 2894 | + |
| 2895 | + // if int(offset) > uncompressed { |
| 2896 | + CMPQ(offset, retval) |
| 2897 | + JB(LabelRef("lz4_s2_c2")) |
| 2898 | + JMP(LabelRef("lz4_s2_corrupt")) |
| 2899 | + |
| 2900 | + Label("lz4_s2_c2") |
| 2901 | + |
| 2902 | + } else { |
| 2903 | + // if offset == 0 { |
| 2904 | + TESTQ(offset, offset) |
| 2905 | + JZ(LabelRef("lz4_s2_corrupt")) |
| 2906 | + |
| 2907 | + // if int(offset) > uncompressed { |
| 2908 | + CMPQ(offset, retval) |
| 2909 | + JA(LabelRef("lz4_s2_corrupt")) |
| 2910 | + } |
| 2911 | + |
| 2912 | + // if ml == lz4MinMatch+15 { |
| 2913 | + { |
| 2914 | + CMPQ(ml, U8(lz4MinMatch+15)) |
| 2915 | + JNE(LabelRef("lz4_s2_ml_done")) |
| 2916 | + |
| 2917 | + Label("lz4_s2_ml_loop") |
| 2918 | + val := GP64() |
| 2919 | + MOVBQZX(Mem{Base: src}, val) |
| 2920 | + INCQ(src) // s++ |
| 2921 | + ADDQ(val, ml) // ml += val |
| 2922 | + checkSrc(src) |
| 2923 | + CMPQ(val, U8(255)) |
| 2924 | + JEQ(LabelRef("lz4_s2_ml_loop")) |
| 2925 | + } |
| 2926 | + Label("lz4_s2_ml_done") |
| 2927 | + |
| 2928 | + // uncompressed += ml |
| 2929 | + ADDQ(ml, retval) |
| 2930 | + CMPQ(offset, lastOffset) |
| 2931 | + JNE(LabelRef("lz4_s2_docopy")) |
| 2932 | + // Offsets can only be 16 bits |
| 2933 | + maxLength := o.maxLen |
| 2934 | + o.maxLen = 65535 |
| 2935 | + { |
| 2936 | + // emitRepeat16(dst[d:], offset, ml) |
| 2937 | + o.emitRepeat("lz4_s2", ml, offset, nil, dst, LabelRef("lz4_s2_loop"), false) |
| 2938 | + } |
| 2939 | + Label("lz4_s2_docopy") |
| 2940 | + { |
| 2941 | + // emitCopy16(dst[d:], offset, ml) |
| 2942 | + MOVQ(offset, lastOffset) |
| 2943 | + o.emitCopy("lz4_s2", ml, offset, nil, dst, LabelRef("lz4_s2_loop")) |
| 2944 | + } |
| 2945 | + o.maxLen = maxLength |
| 2946 | + |
| 2947 | + Label("lz4_s2_done") |
| 2948 | + { |
| 2949 | + tmp := GP64() |
| 2950 | + Load(Param("dst").Base(), tmp) |
| 2951 | + SUBQ(tmp, dst) |
| 2952 | + Store(retval, ReturnIndex(0)) |
| 2953 | + Store(dst, ReturnIndex(1)) |
| 2954 | + RET() |
| 2955 | + } |
| 2956 | + Label("lz4_s2_corrupt") |
| 2957 | + { |
| 2958 | + tmp := GP64() |
| 2959 | + if debug { |
| 2960 | + tmp := GP64() |
| 2961 | + Load(Param("dst").Base(), tmp) |
| 2962 | + SUBQ(tmp, dst) |
| 2963 | + Store(dst, ReturnIndex(1)) |
| 2964 | + } |
| 2965 | + XORQ(tmp, tmp) |
| 2966 | + LEAQ(Mem{Base: tmp, Disp: errCorrupt}, retval) |
| 2967 | + Store(retval, ReturnIndex(0)) |
| 2968 | + RET() |
| 2969 | + } |
| 2970 | + |
| 2971 | + Label("lz4_s2_dstfull") |
| 2972 | + { |
| 2973 | + tmp := GP64() |
| 2974 | + XORQ(tmp, tmp) |
| 2975 | + LEAQ(Mem{Base: tmp, Disp: errDstTooSmall}, retval) |
| 2976 | + Store(retval, ReturnIndex(0)) |
| 2977 | + RET() |
| 2978 | + } |
| 2979 | +} |
0 commit comments