From 25e6a3100499a634d4243766f9b540d23dafa345 Mon Sep 17 00:00:00 2001
From: John Reiser
Date: Sun, 28 Jul 2024 15:08:45 -0700
Subject: [PATCH] i386 stub: prefer movzbl over movb fetch

32-bit mode on x86_64 hardware can fail a movb fetch into a register,
randomly writing 0x00 instead of the fetched byte.
Note that most CPUs can write only 32 or 64 bits at a time to the register
file, so writing just 8 or 16 bits incurs a 1-cycle penalty in order to
form the full 32 bits by a Read-Modify-Write of the destination register.

	modified:   stub/src/arch/i386/nrv2b_d32-easy.S
	modified:   stub/src/i386-expand.S
---
 src/stub/src/arch/i386/nrv2b_d32-easy.S |  4 ++--
 src/stub/src/i386-expand.S              | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/stub/src/arch/i386/nrv2b_d32-easy.S b/src/stub/src/arch/i386/nrv2b_d32-easy.S
index 6b37583f..cd3569d0 100644
--- a/src/stub/src/arch/i386/nrv2b_d32-easy.S
+++ b/src/stub/src/arch/i386/nrv2b_d32-easy.S
@@ -38,7 +38,7 @@ lit_n2b:
         incl %esi; movb %dl,(%edi)
         incl %edi
 top_n2b:
-        movb (%esi),%dl  # prefetch: literal, or bottom 8 bits of offset
+        movzbl (%esi),%edx  # prefetch: literal, or bottom 8 bits of offset
         jnextb1yp lit_n2b
         push $1; pop off
 offmore_n2b:
@@ -46,7 +46,7 @@ offmore_n2b:
         jnextb0np offmore_n2b
 
         subl $ 3,off; jc len_n2b  # use previous offset
-        shll $ 8,off; movzbl %dl,%edx
+        shll $ 8,off
         orl %edx,off; incl %esi
         xorl $~0,off; jz eof
         movl off,disp
diff --git a/src/stub/src/i386-expand.S b/src/stub/src/i386-expand.S
index 0098fab7..c6c780dc 100644
--- a/src/stub/src/i386-expand.S
+++ b/src/stub/src/i386-expand.S
@@ -51,7 +51,7 @@ NBPW= 4
 #define GETBITp \
         addl bits,bits; jnz 0f; \
         movl (%esi),bits; sub $-4,%esi; \
-        adcl bits,bits; movb (%esi),%dl; \
+        adcl bits,bits; movzbl (%esi),%edx; \
 0:
 /* Same, but without prefetch (not useful for length of match.) */
 #define jnextb0n jnextb0y
@@ -125,7 +125,7 @@ __clear_cache: .globl __clear_cache
 refill:
         movl (%esi),bits; sub $-4,%esi  // next 32 bits; set Carry
         adcl bits,bits  // LSB= 1 (CarryIn); CarryOut= next bit
-        movb (%esi),%dl  // pre-fetch: literal, or bottom 8 bits of offset
+        movzbl (%esi),%edx  // pre-fetch: literal, or bottom 8 bits of offset
         rep; ret
 getbit:
         addl bits,bits; jz refill  // Carry= next bit
@@ -133,16 +133,16 @@ getbit:
 
 copy:  // In: len, %edi, dispq;  Out: 0==len, %edi, dispq;  trashes %eax, %edx
         lea (%edi,dispq),%eax; cmpl $5,len  // <=3 is forced
-        movb (%eax),%dl; jbe copy1  // <=5 for better branch predict
+        movzbl (%eax),%edx; jbe copy1  // <=5 for better branch predict
         cmpl $-4,displ; ja copy1  // 4-byte chunks would overlap
         subl $4,len  // adjust for termination cases
 copy4:
         movl (%eax),%edx; add $4, %eax; subl $4,len
         movl %edx,(%edi); lea 4(%edi),%edi; jnc copy4
-        addl $4,len; movb (%eax),%dl; jz copy0
+        addl $4,len; movzbl (%eax),%edx; jz copy0
 copy1:
         inc %eax; movb %dl,(%edi); dec len
-        movb (%eax),%dl
+        movzbl (%eax),%edx
         lea 1(%edi),%edi; jnz copy1
 copy0:
         rep; ret
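
Note (not part of the patch): a minimal standalone sketch, in GNU as / AT&T
syntax, of the partial-register write the commit message describes. The labels
are invented for illustration and do not exist in the UPX stub sources; as in
the stub code, the byte pointer is assumed to be in %esi.

        .text
fetch_byte_merge:
        movb    (%esi),%dl      # writes only %dl; the CPU must merge the byte
                                # into the old %edx (Read-Modify-Write of the
                                # destination register: the 1-cycle penalty)
        movzbl  %dl,%eax        # a second instruction is still needed to
        ret                     # widen the byte to 32 bits
fetch_byte_zero_extend:
        movzbl  (%esi),%eax     # writes all 32 bits of %eax, zero-extending
        ret                     # the byte; no dependence on the old %eax

The first sequence mirrors the old code path in nrv2b_d32-easy.S, where the
byte prefetched into %dl later needed a separate movzbl %dl,%edx. The
zero-extending load is one byte longer to encode, but it removes both the
follow-up widening instruction and the dependence on the destination
register's previous value, consistent with the rationale in the commit
message.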