
i386 stub: prefer movzbl over movb fetch

In 32-bit mode on x86_64 hardware, a movb fetch into a register can fail,
randomly writing 0x00 instead of the fetched byte.
Note that most CPUs can write only 32 or 64 bits to the register file,
so writing just 8 or 16 bits incurs a 1-cycle penalty in order to
form 32 bits by a Read-Modify-Write of the destination register.
	modified:   stub/src/arch/i386/nrv2b_d32-easy.S
	modified:   stub/src/i386-expand.S
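A minimal sketch of the pattern being replaced, condensed from the first hunk below (for illustration only, not part of the commit): the old code fetches one byte into %dl, leaving the upper 24 bits of %edx stale, and must widen it later; the new code zero-extends at the load, writing the full 32-bit register in a single instruction.

	# old pattern: byte load writes only %dl; a later movzbl
	# performs a read-modify-write merge to form all 32 bits
	movb	(%esi),%dl
	movzbl	%dl,%edx

	# new pattern: the zero-extending load writes the full
	# 32-bit %edx in one instruction, avoiding the merge
	movzbl	(%esi),%edx

Either form reads the same byte; the difference is only in how the destination register is written.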
John Reiser 2024-07-28 15:08:45 -07:00
parent 3d58035b41
commit 25e6a31004
2 changed files with 7 additions and 7 deletions

stub/src/arch/i386/nrv2b_d32-easy.S

@@ -38,7 +38,7 @@ lit_n2b:
incl %esi; movb %dl,(%edi)
incl %edi
top_n2b:
- movb (%esi),%dl # prefetch: literal, or bottom 8 bits of offset
+ movzbl (%esi),%edx # prefetch: literal, or bottom 8 bits of offset
jnextb1yp lit_n2b
push $1; pop off
offmore_n2b:
@@ -46,7 +46,7 @@ offmore_n2b:
jnextb0np offmore_n2b
subl $ 3,off; jc len_n2b # use previous offset
- shll $ 8,off; movzbl %dl,%edx
+ shll $ 8,off
orl %edx,off; incl %esi
xorl $~0,off; jz eof
movl off,disp

stub/src/i386-expand.S

@@ -51,7 +51,7 @@ NBPW= 4
#define GETBITp \
addl bits,bits; jnz 0f; \
movl (%esi),bits; sub $-4,%esi; \
- adcl bits,bits; movb (%esi),%dl; \
+ adcl bits,bits; movzbl (%esi),%edx; \
0:
/* Same, but without prefetch (not useful for length of match.) */
#define jnextb0n jnextb0y
@@ -125,7 +125,7 @@ __clear_cache: .globl __clear_cache
refill:
movl (%esi),bits; sub $-4,%esi // next 32 bits; set Carry
adcl bits,bits // LSB= 1 (CarryIn); CarryOut= next bit
- movb (%esi),%dl // pre-fetch: literal, or bottom 8 bits of offset
+ movzbl (%esi),%edx // pre-fetch: literal, or bottom 8 bits of offset
rep; ret
getbit:
addl bits,bits; jz refill // Carry= next bit
@@ -133,16 +133,16 @@ getbit:
copy: // In: len, %edi, dispq; Out: 0==len, %edi, dispq; trashes %eax, %edx
lea (%edi,dispq),%eax; cmpl $5,len // <=3 is forced
- movb (%eax),%dl; jbe copy1 // <=5 for better branch predict
+ movzbl (%eax),%edx; jbe copy1 // <=5 for better branch predict
cmpl $-4,displ; ja copy1 // 4-byte chunks would overlap
subl $4,len // adjust for termination cases
copy4:
movl (%eax),%edx; add $4, %eax; subl $4,len
movl %edx,(%edi); lea 4(%edi),%edi; jnc copy4
- addl $4,len; movb (%eax),%dl; jz copy0
+ addl $4,len; movzbl (%eax),%edx; jz copy0
copy1:
inc %eax; movb %dl,(%edi); dec len
- movb (%eax),%dl
+ movzbl (%eax),%edx
lea 1(%edi),%edi; jnz copy1
copy0:
rep; ret