
tune for better branch prediction by avoiding more than 3 branches in a 16-byte block

amd_d_nrv2b.S amd_d_nrv2e.S l_lx_elf64amd.S

committer: jreiser <jreiser> 2005-11-14 18:54:31 +0000
John Reiser 2005-11-14 18:54:31 +00:00
parent 2558243c83
commit beb4319d1e
3 changed files with 26 additions and 13 deletions
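All three decompressors pull flag bits from a 32-bit queue kept in a register and refilled 32 bits at a time; the hunks below re-tune how that queue is consulted. As an orientation aid, here is a C sketch of the queue's semantics (BitIn and getnextb are names invented for this sketch, not the stub's identifiers): a sentinel 1 planted at the bottom of the queue during refill lets "queue empty" be detected as "register became zero", with no separate bit counter.

#include <stdint.h>
#include <string.h>

/* Sketch of the stub's bit queue.  'src' plays %rsi, 'bits' the queue. */
typedef struct {
    const uint8_t *src;   /* compressed input cursor (%rsi) */
    uint32_t bits;        /* bit queue; 0 == empty, forcing a refill */
} BitIn;

/* Return the next bit.  Mirrors 'addl bits,bits' (shift the top bit into
 * Carry) plus the refill path 'movl (%rsi),bits; subq $-4,%rsi;
 * adcl bits,bits': load 32 little-endian bits, advance the cursor (the
 * subtraction of -4 also sets Carry), and fold that Carry in as the
 * bottom sentinel while the old top bit falls out as the result. */
static int getnextb(BitIn *in)
{
    int bit = (int)(in->bits >> 31);
    in->bits <<= 1;
    if (in->bits == 0) {              /* only the sentinel was left */
        uint32_t w;
        memcpy(&w, in->src, 4);       /* movl (%rsi),bits (little-endian) */
        in->src += 4;                 /* subq $-4,%rsi; sets Carry */
        bit = (int)(w >> 31);         /* adcl: CarryOut = next payload bit */
        in->bits = (w << 1) | 1;      /* CarryIn = 1 becomes the sentinel */
    }
    return bit;
}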

amd_d_nrv2b.S

@@ -34,7 +34,7 @@ lit_n2b:
         incq %rsi; movb %dl,(%rdi)
         incq %rdi
 top_n2b:
-        movzbl (%rsi),%edx  # speculate: literal, or bottom 8 bits of offset
+        movb (%rsi),%dl  # speculate: literal, or bottom 8 bits of offset
         jnextb1y lit_n2b
         lea 1(lenq),off  # [len= 0] off= 1
 offmore_n2b:
@@ -42,7 +42,7 @@ offmore_n2b:
         jnextb0n offmore_n2b
         subl $ 3,off; jc len_n2b  # use previous offset
-        shll $ 8,off
+        shll $ 8,off; movzbl %dl,%edx
         orl %edx,off; incq %rsi
         xorl $~0,off; jz eof
         movslq off,disp  # XXX: 2GB
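Both hunks above revolve around the byte that top_n2b speculatively loads before the flag bit says whether it is a literal or the low 8 bits of a match offset: movb (%rsi),%dl is 2 bytes against 3 for movzbl (%rsi),%edx, and the zero-extension is deferred to the one offset-path use (the new movzbl %dl,%edx), which plausibly helps keep the hot loop compact now that the speculative reload is also inlined into GETBIT (third file below). A hedged C sketch of this dispatch, continuing the translation unit above and following the shape of UCL's NRV2B reference decoder (decode_n2b, dst, and last_off are names for this sketch):

/* The byte at *in->src is exactly the stub's speculated %dl. */
static void decode_n2b(BitIn *in, uint8_t *dst)
{
    uint32_t last_off = (uint32_t)-1;     /* orq $~0,disp */
    for (;;) {
        while (getnextb(in))              /* jnextb1y lit_n2b: literal flag */
            *dst++ = *in->src++;          /* speculated byte is a literal */
        uint32_t off = 1;                 /* lea 1(lenq),off */
        do                                /* offmore_n2b: gamma-coded part */
            off = 2*off + (uint32_t)getnextb(in);
        while (!getnextb(in));            /* jnextb0n offmore_n2b */
        if (off == 2) {                   /* subl $ 3,off; jc len_n2b */
            off = last_off;               /* reuse previous offset */
        } else {
            off = ((off - 3) << 8) | *in->src++;  /* shll; movzbl; orl */
            if (off == 0xFFFFFFFFu)
                return;                   /* xorl $~0,off; jz eof */
            last_off = ++off;
        }
        /* length decode and the match copy are elided here;
         * see the copy hunk in the third file below. */
        (void)off;
    }
}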

amd_d_nrv2e.S

@@ -34,7 +34,7 @@ lit_n2e:
         incq %rsi; movb %dl,(%rdi)
         incq %rdi
 top_n2e:
-        movzbl (%rsi),%edx  # speculate: literal, or bottom 8 bits of offset
+        movb (%rsi),%dl  # speculate: literal, or bottom 8 bits of offset
         jnextb1y lit_n2e
         lea 1(lenq),off  # [len= 0] off= 1
         jmp getoff_n2e
@@ -47,7 +47,7 @@ getoff_n2e:
         jnextb0n off_n2e
         subl $ 3,off; jc offprev_n2e
-        shll $ 8,off
+        shll $ 8,off; movzbl %dl,%edx
         orl %edx,off; incq %rsi
         xorl $~0,off; jz eof
         sarl off  # Carry= original low bit

l_lx_elf64amd.S

@@ -91,17 +91,29 @@ decompress:  # (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint method)
         xorl bits,bits  # empty; force refill
         xorl len,len  # create loop invariant
         orq $~0,disp  # -1: initial displacement
-        jmp setup
+        call setup  # push &getbit [TUNED]
+ra_setup:
+/* AMD64 branch prediction is much worse if there are more than 3 branches
+   per 16-byte block.  The jnextb would suffer unless inlined.  getnextb is OK
+   using closed subroutine to save space, and should be OK on cycles because
+   CALL+RET should be predicted.  getnextb could partially expand, using closed
+   subroutine only for refill.
+*/
 /* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
 /* Prediction omitted for now. */
 #define jnextb0n jnextb0y
-#define jnextb0y addl bits,bits; jnz 0f; call refill; 0: jnc
+#define jnextb0y GETBIT; jnc
 #define jnextb1n jnextb1y
-#define jnextb1y addl bits,bits; jnz 0f; call refill; 0: jc
+#define jnextb1y GETBIT; jc
+#define GETBIT \
+        addl bits,bits; jnz 0f; \
+        movl (%rsi),bits; subq $-4,%rsi; \
+        adcl bits,bits; movb (%rsi),%dl; \
+0:
 /* rotate next bit into bottom bit of reg */
-#define getnextb(reg) addl bits,bits; jnz 0f; call refill; 0: adcl reg,reg
+#define getnextb(reg) call *%r11; adcl reg,reg
 ALIGN(1<<3)
 getbit:
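The new block comment states the policy this hunk implements: the hot flag tests (jnextb) expand GETBIT in place so each is a predictable in-line branch, while getnextb stays a closed subroutine reached through the 3-byte call *%r11, betting that the return-stack predictor handles the CALL+RET pair. A rough C analogue, continuing the sketch above (GETBIT, getbit_fn, getbit_ptr, and getnextb_into are again names for this sketch):

/* In-line expansion of one bit read, mirroring the asm GETBIT macro:
 * the refill branch is duplicated at every jnextb use site. */
#define GETBIT(in, bit) do {                                \
        (bit) = (int)((in)->bits >> 31);                    \
        (in)->bits <<= 1;                                   \
        if ((in)->bits == 0) {          /* refill path */   \
            uint32_t w_;                                    \
            memcpy(&w_, (in)->src, 4); (in)->src += 4;      \
            (bit) = (int)(w_ >> 31);                        \
            (in)->bits = (w_ << 1) | 1; /* sentinel */      \
        }                                                   \
    } while (0)

/* getnextb stays outlined to save space: one short call per use site,
 * the C stand-in for 'call *%r11; adcl reg,reg'. */
typedef int (*getbit_fn)(BitIn *);
static getbit_fn getbit_ptr = getnextb;     /* the stub's %r11 */
#define getnextb_into(in, reg)  ((reg) = 2*(reg) + (uint32_t)getbit_ptr(in))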
@@ -110,13 +122,13 @@ getbit:
 refill:
         movl (%rsi),bits; subq $-4,%rsi  # next 32 bits; set Carry
         adcl bits,bits  # LSB= 1 (CarryIn); CarryOut= next bit
-        movzbl (%rsi),%edx  # speculate: literal, or bottom 8 bits of offset
+        movb (%rsi),%dl  # speculate: literal, or bottom 8 bits of offset
         rep; ret
 copy:  # In: len, %rdi, disp;  Out: 0==len, %rdi, disp;  trashes %rax, %rdx
-        leaq (%rdi,disp),%rax; movb (%rax),%dl
-        cmpl $ 3,len; jbe copy1  # perhaps extend this to length 5 or less?
-        cmpq $-4,disp; ja copy1  # 4-byte chunks would overlap
+        leaq (%rdi,disp),%rax; cmpl $5,len  # <=3 is forced
+        movb (%rax),%dl; jbe copy1  # <=5 for better branch predict
+        cmpq $-4,disp; ja copy1  # 4-byte chunks would overlap
         subl $4,len  # adjust for termination cases
 copy4:
         movl (%rax),%edx; addq $4, %rax; subl $4,len
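The copy hunk folds the %dl load into the compare pair and raises the byte-loop cutoff from len <= 3 to len <= 5 ("<=5 for better branch predict"); the overlap guard is unchanged, since with disp > -4 a 4-byte chunk would read bytes that same chunk is about to write. A C sketch of the tuned copy (match_copy is a name for this sketch; the asm's subl $4,len pre-adjustment is replaced by an explicit tail loop here):

/* In: dst (%rdi), negative displacement disp, length len (> 0). */
static void match_copy(uint8_t *dst, int64_t disp, uint32_t len)
{
    const uint8_t *src = dst + disp;       /* leaq (%rdi,disp),%rax */
    if (len <= 5 || disp > -4) {           /* cmpl $5,len / cmpq $-4,disp */
        do {                               /* copy1: byte at a time; this */
            *dst++ = *src++;               /* also replicates short runs */
        } while (--len);
        return;
    }
    while (len >= 4) {                     /* copy4: safe, since disp <= -4 */
        memcpy(dst, src, 4);               /* movl (%rax),%edx / movl %edx,(%rdi) */
        src += 4; dst += 4; len -= 4;
    }
    while (len--)                          /* 0..3 tail bytes */
        *dst++ = *src++;
}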
@@ -134,7 +146,8 @@ copy0:
 setup:
         cld
-        cmpl $ M_NRV2E_LE32,meth; je bot_n2e
+        pop %r11  # addq $ getbit - ra_setup,%r11  # &getbit
+        cmpl $ M_NRV2E_LE32,meth; je top_n2e
         cmpl $ M_NRV2B_LE32,meth; je top_n2b
 eof:
         pop %rcx  # &input_eof
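setup now opens with pop %r11, recovering the return address that the earlier call setup pushed (the commented-out addq records the ra_setup-to-getbit bias), so every getnextb site can use the 3-byte call *%r11 instead of a 5-byte direct call. In C the same arrangement is a function pointer recorded once before dispatching on the method id; a sketch continuing the unit above (the enum values are placeholders for this sketch; the stub takes the real M_NRV2B_LE32/M_NRV2E_LE32 constants from its build headers):

enum {                      /* placeholder ids; see the stub's headers */
    M_NRV2B_LE32 = 2,
    M_NRV2E_LE32 = 8
};

static void decompress_dispatch(BitIn *in, uint8_t *dst, unsigned meth)
{
    getbit_ptr = getnextb;          /* 'call setup' / 'pop %r11' */
    switch (meth) {
    case M_NRV2B_LE32:              /* cmpl $ M_NRV2B_LE32,meth; je top_n2b */
        decode_n2b(in, dst);
        break;
    case M_NRV2E_LE32:              /* cmpl $ M_NRV2E_LE32,meth; je top_n2e */
        /* NRV2E differs in its offset/length coding; elided here. */
        break;
    default:                        /* unknown method: falls through to eof */
        break;
    }
}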