From 25e6a3100499a634d4243766f9b540d23dafa345 Mon Sep 17 00:00:00 2001
From: John Reiser
Date: Sun, 28 Jul 2024 15:08:45 -0700
Subject: [PATCH] i386 stub: prefer movzbl over movb fetch

32-bit mode on x86_64 hardware can fail a movb fetch into a register,
randomly writing 0x00 instead of the fetched byte.
Note that most CPUs can write only 32 or 64 bits at a time to the register
file, so writing just 8 or 16 bits incurs a 1-cycle penalty in order to
form the full 32 bits by a Read-Modify-Write of the destination register.

	modified:   stub/src/arch/i386/nrv2b_d32-easy.S
	modified:   stub/src/i386-expand.S
---
 src/stub/src/arch/i386/nrv2b_d32-easy.S |  4 ++--
 src/stub/src/i386-expand.S              | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/stub/src/arch/i386/nrv2b_d32-easy.S b/src/stub/src/arch/i386/nrv2b_d32-easy.S
index 6b37583f..cd3569d0 100644
--- a/src/stub/src/arch/i386/nrv2b_d32-easy.S
+++ b/src/stub/src/arch/i386/nrv2b_d32-easy.S
@@ -38,7 +38,7 @@ lit_n2b:
         incl %esi; movb %dl,(%edi)
         incl %edi
 top_n2b:
-        movb (%esi),%dl  # prefetch: literal, or bottom 8 bits of offset
+        movzbl (%esi),%edx  # prefetch: literal, or bottom 8 bits of offset
         jnextb1yp lit_n2b
         push $1; pop off
 offmore_n2b:
@@ -46,7 +46,7 @@ offmore_n2b:
         jnextb0np offmore_n2b
 
         subl $ 3,off; jc len_n2b  # use previous offset
-        shll $ 8,off; movzbl %dl,%edx
+        shll $ 8,off
         orl %edx,off; incl %esi
         xorl $~0,off; jz eof
         movl off,disp
diff --git a/src/stub/src/i386-expand.S b/src/stub/src/i386-expand.S
index 0098fab7..c6c780dc 100644
--- a/src/stub/src/i386-expand.S
+++ b/src/stub/src/i386-expand.S
@@ -51,7 +51,7 @@ NBPW= 4
 #define GETBITp \
         addl bits,bits; jnz 0f; \
         movl (%esi),bits; sub $-4,%esi; \
-        adcl bits,bits; movb (%esi),%dl; \
+        adcl bits,bits; movzbl (%esi),%edx; \
 0:
 /* Same, but without prefetch (not useful for length of match.) */
 #define jnextb0n jnextb0y
@@ -125,7 +125,7 @@ __clear_cache: .globl __clear_cache
 refill:
         movl (%esi),bits; sub $-4,%esi  // next 32 bits; set Carry
         adcl bits,bits  // LSB= 1 (CarryIn); CarryOut= next bit
-        movb (%esi),%dl  // pre-fetch: literal, or bottom 8 bits of offset
+        movzbl (%esi),%edx  // pre-fetch: literal, or bottom 8 bits of offset
         rep; ret
 getbit:
         addl bits,bits; jz refill  // Carry= next bit
@@ -133,16 +133,16 @@ getbit:
 
 copy:  // In: len, %edi, dispq;  Out: 0==len, %edi, dispq;  trashes %eax, %edx
         lea (%edi,dispq),%eax; cmpl $5,len  // <=3 is forced
-        movb (%eax),%dl; jbe copy1  // <=5 for better branch predict
+        movzbl (%eax),%edx; jbe copy1  // <=5 for better branch predict
         cmpl $-4,displ; ja copy1  // 4-byte chunks would overlap
         subl $4,len  // adjust for termination cases
 copy4:
         movl (%eax),%edx; add $4, %eax; subl $4,len
         movl %edx,(%edi); lea 4(%edi),%edi; jnc copy4
-        addl $4,len; movb (%eax),%dl; jz copy0
+        addl $4,len; movzbl (%eax),%edx; jz copy0
 copy1:
         inc %eax; movb %dl,(%edi); dec len
-        movb (%eax),%dl
+        movzbl (%eax),%edx
         lea 1(%edi),%edi; jnz copy1
 copy0:
         rep; ret
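
Note (not part of the patch): a minimal standalone sketch, in GNU as / AT&T
syntax, of the partial-register write the commit message describes. The labels
are invented for illustration and do not exist in the UPX stub sources; as in
the stub code, the byte pointer is assumed to be in %esi.

        .text
fetch_byte_merge:
        movb    (%esi),%dl      # writes only %dl; the CPU must merge the byte
                                # into the old %edx (Read-Modify-Write of the
                                # destination register: the 1-cycle penalty)
        movzbl  %dl,%eax        # a second instruction is still needed to
        ret                     # widen the byte to 32 bits
fetch_byte_zero_extend:
        movzbl  (%esi),%eax     # writes all 32 bits of %eax, zero-extending
        ret                     # the byte; no dependence on the old %eax

The first sequence mirrors the old code path in nrv2b_d32-easy.S, where the
byte prefetched into %dl later needed a separate movzbl %dl,%edx. The
zero-extending load is one byte longer to encode, but it removes both the
follow-up widening instruction and the dependence on the destination
register's previous value, consistent with the rationale in the commit
message.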