mirror of
https://github.com/upx/upx
synced 2025-09-28 19:06:07 +08:00
de-tabify; also speculate 1-byte loads to save a cycle or two
amd_d_nrv2b.S amd_d_nrv2e.S l_lx_elf64amd.S committer: jreiser <jreiser> 1131943590 +0000
This commit is contained in:
parent
b670e8d1ac
commit
2558243c83
|
@ -29,35 +29,40 @@
|
|||
<jreiser@users.sourceforge.net>
|
||||
*/
|
||||
|
||||
ALIGN(1<<3)
|
||||
ALIGN(1<<3)
|
||||
/*
 * lit_n2b / top_n2b: literal path and head of the main decode loop.
 *
 * top_n2b tests the next control bit with jnextb1y and branches to lit_n2b;
 * otherwise it falls through with off = 1 (lea 1(lenq),off while len == 0,
 * per the existing end-of-line comment).
 * lit_n2b copies one byte from (%rsi) to (%rdi), advances both pointers by
 * one, and falls through into top_n2b again.
 *
 * Two forms of the literal copy appear below: one loads the byte into %al
 * inside lit_n2b; the other stores %dl, which top_n2b has already loaded
 * with movzbl (%rsi),%edx before the bit test.
 *
 * NOTE(review): this text is a rendered diff; instruction groups that appear
 * twice are presumably the removed and added sides of the hunk - confirm
 * against the real amd_d_nrv2b.S before relying on either form.
 * NOTE(review): jnextb1y and lenq are defined outside this excerpt;
 * presumably "jump if next bit is 1, predicted likely" and the 64-bit name
 * of len - confirm.
 */
lit_n2b:
|
||||
movb (%rsi),%al; addq $1,%rsi
|
||||
movb %al,(%rdi); addq $1,%rdi
|
||||
incq %rsi; movb %dl,(%rdi)
|
||||
incq %rdi
|
||||
top_n2b:
|
||||
jnextb1y lit_n2b
|
||||
lea 1(lenq),off # [len= 0] off= 1
|
||||
movzbl (%rsi),%edx # speculate: literal, or bottom 8 bits of offset
|
||||
jnextb1y lit_n2b
|
||||
lea 1(lenq),off # [len= 0] off= 1
|
||||
/*
 * offmore_n2b: read the variable-length high part of the match offset, then
 * form the displacement.
 *
 * Loop: getnextb(off) rotates the next input bit into the bottom of off;
 * the loop repeats while jnextb0n takes its branch.
 * subl $3,off; jc  - when off was below 3 the borrow sends control to
 *                    len_n2b with disp left unchanged (previous offset).
 * shll/orl/incq    - otherwise off = ((off - 3) << 8) | %edx, and %rsi is
 *                    advanced past the byte that %edx was loaded from.
 * xorl $~0,off     - complements off; a zero result (combined value was all
 *                    ones) jumps to eof.
 * movslq off,disp  - disp = sign-extended complemented offset; the existing
 *                    XXX note flags the resulting 2GB limit.
 *
 * NOTE(review): one form below reloads %edx with movzbl (%rsi),%edx right
 * before the subl; the other has no reload here and so depends on %edx
 * having been loaded from (%rsi) earlier (top_n2b and refill both carry a
 * "speculate" load elsewhere in this page) - confirm that every path into
 * the orl has a current %edx.
 * NOTE(review): this text is a rendered diff; instruction groups that appear
 * twice are presumably the removed and added sides of the hunk.
 * NOTE(review): jnextb0n is defined outside this excerpt; presumably "jump
 * if next bit is 0, predicted unlikely" - confirm.
 */
offmore_n2b:
|
||||
getnextb(off)
|
||||
jnextb0n offmore_n2b
|
||||
getnextb(off)
|
||||
jnextb0n offmore_n2b
|
||||
|
||||
movzbl (%rsi),%edx
|
||||
subl $ 3,off; jc len_n2b # use previous offset
|
||||
shll $ 8,off
|
||||
orl %edx,off; incq %rsi
|
||||
xorl $~0,off; jz eof
|
||||
movslq off,disp # XXX: 2GB
|
||||
subl $ 3,off; jc len_n2b # use previous offset
|
||||
shll $ 8,off
|
||||
orl %edx,off; incq %rsi
|
||||
xorl $~0,off; jz eof
|
||||
movslq off,disp # XXX: 2GB
|
||||
/*
 * len_n2b / lenmore_n2b: decode the match length into len.
 *
 * off is reused here as a scratch addend: it is first set to 1 (len is 0 at
 * this point, per the existing end-of-line comment).
 * Two bits are rotated into len; the flags of the second adcl decide:
 *   nonzero - jump to gotlen_n2b with len in 1..3 and off == 1;
 *   zero    - len = 1 (copied from off), off becomes 3, and lenmore_n2b
 *             keeps rotating bits into len while jnextb0n takes its branch.
 * Control then falls through to gotlen_n2b, which adds off to len.
 *
 * NOTE(review): this text is a rendered diff; instruction groups that appear
 * twice are presumably the removed and added sides of the hunk.
 * NOTE(review): jnextb0n is defined outside this excerpt; presumably "jump
 * if next bit is 0, predicted unlikely" - confirm.
 */
len_n2b:
|
||||
lea 1(lenq),off # [len= 0] off= 1
|
||||
getnextb(len); getnextb(len) # two bits; cc set on result
|
||||
jnz gotlen_n2b # raw 1,2,3 ==> 2,3,4
|
||||
movl off,len # len= 1, the msb
|
||||
addl $3-1,off # raw 2.. ==> 5..
|
||||
lea 1(lenq),off # [len= 0] off= 1
|
||||
getnextb(len); getnextb(len) # two bits; cc set on result
|
||||
jnz gotlen_n2b # raw 1,2,3 ==> 2,3,4
|
||||
movl off,len # len= 1, the msb
|
||||
addl $3-1,off # raw 2.. ==> 5..
|
||||
lenmore_n2b:
|
||||
getnextb(len)
|
||||
jnextb0n lenmore_n2b
|
||||
getnextb(len)
|
||||
jnextb0n lenmore_n2b
|
||||
/*
 * gotlen_n2b / bot_n2b: finish the length, perform the match copy, loop.
 *
 * cmpq $-0xd00,disp sets Carry when disp is below -0xd00 (unsigned compare);
 * adcl then computes len = len + off + Carry, so a displacement beyond that
 * threshold costs one extra byte of length.
 * copy moves len bytes from (%rdi + disp) to (%rdi); its header comment says
 * it returns with len == 0, which is the precondition noted on bot_n2b.
 * bot_n2b jumps back to top_n2b for the next token.
 *
 * NOTE(review): this text is a rendered diff; instruction groups that appear
 * twice are presumably the removed and added sides of the hunk.
 */
gotlen_n2b:
|
||||
cmpq $-0xd00,disp
|
||||
adcl off,len # len += off + (disp < -0xd00)
|
||||
call copy
|
||||
cmpq $-0xd00,disp
|
||||
adcl off,len # len += off + (disp < -0xd00)
|
||||
call copy
|
||||
bot_n2b: # In: 0==len
|
||||
jmp top_n2b
|
||||
|
||||
/*
|
||||
vi:ts=8:et:nowrap
|
||||
*/
|
||||
|
||||
|
|
|
@ -29,13 +29,14 @@
|
|||
<jreiser@users.sourceforge.net>
|
||||
*/
|
||||
|
||||
ALIGN(1<<3)
|
||||
ALIGN(1<<3)
|
||||
/*
 * lit_n2e / top_n2e: literal path and head of the main decode loop.
 *
 * top_n2e tests the next control bit with jnextb1y and branches to lit_n2e;
 * otherwise it sets off = 1 (lea 1(lenq),off while len == 0, per the
 * existing end-of-line comment) and jumps to getoff_n2e to decode an offset.
 * lit_n2e copies one byte from (%rsi) to (%rdi), advances both pointers by
 * one, and falls through into top_n2e again.
 *
 * Two forms of the literal copy appear below: one loads the byte into %al
 * inside lit_n2e; the other stores %dl, which top_n2e has already loaded
 * with movzbl (%rsi),%edx before the bit test.
 *
 * NOTE(review): this text is a rendered diff; instruction groups that appear
 * twice are presumably the removed and added sides of the hunk.
 * NOTE(review): jnextb1y and lenq are defined outside this excerpt;
 * presumably "jump if next bit is 1, predicted likely" and the 64-bit name
 * of len - confirm.
 */
lit_n2e:
|
||||
movb (%rsi),%al; addq $1,%rsi
|
||||
movb %al,(%rdi); addq $1,%rdi
|
||||
incq %rsi; movb %dl,(%rdi)
|
||||
incq %rdi
|
||||
top_n2e:
|
||||
jnextb1y lit_n2e
|
||||
lea 1(lenq),off # [len= 0] off= 1
|
||||
movzbl (%rsi),%edx # speculate: literal, or bottom 8 bits of offset
|
||||
jnextb1y lit_n2e
|
||||
lea 1(lenq),off # [len= 0] off= 1
|
||||
jmp getoff_n2e
|
||||
|
||||
off_n2e:
|
||||
|
@ -45,15 +46,14 @@ getoff_n2e:
|
|||
getnextb(off)
|
||||
jnextb0n off_n2e
|
||||
|
||||
movzbl (%rsi),%edx #; xorl len,len # len= 0
|
||||
subl $ 3,off; jc offprev_n2e
|
||||
shll $ 8,off
|
||||
orl %edx,off; incq %rsi
|
||||
xorl $~0,off; jz eof
|
||||
sarl off # Carry= original low bit
|
||||
movslq off,disp # XXX: 2GB
|
||||
jc lenlast_n2e
|
||||
jmp lenmore_n2e
|
||||
subl $ 3,off; jc offprev_n2e
|
||||
shll $ 8,off
|
||||
orl %edx,off; incq %rsi
|
||||
xorl $~0,off; jz eof
|
||||
sarl off # Carry= original low bit
|
||||
movslq off,disp # XXX: 2GB
|
||||
jc lenlast_n2e
|
||||
jmp lenmore_n2e
|
||||
|
||||
offprev_n2e:
|
||||
jnextb1y lenlast_n2e
|
||||
|
@ -69,8 +69,13 @@ len_n2e:
|
|||
lenlast_n2e:
|
||||
getnextb(len) # 0,1,2,3
|
||||
/*
 * gotlen_n2e / bot_n2e: finish the length, perform the match copy, loop.
 *
 * cmpq $-0x500,disp sets Carry when disp is below -0x500 (unsigned compare);
 * adcl then computes len = len + 2 + Carry.
 * copy moves len bytes from (%rdi + disp) to (%rdi); its header comment says
 * it returns with len == 0, which is the precondition noted on bot_n2e.
 * bot_n2e jumps back to top_n2e; setup also enters the decoder here.
 *
 * NOTE(review): this text is a rendered diff; the cmpq and call lines appear
 * twice, presumably as the removed and added sides of a whitespace-only
 * change, while the adcl line is shown once as unchanged context.
 */
gotlen_n2e:
|
||||
cmpq $-0x500,disp
|
||||
cmpq $-0x500,disp
|
||||
adcl $2,len # len += 2+ (disp < -0x500);
|
||||
call copy
|
||||
call copy
|
||||
bot_n2e: # In: 0==len
|
||||
jmp top_n2e
|
||||
|
||||
/*
|
||||
vi:ts=8:et:nowrap
|
||||
*/
|
||||
|
||||
|
|
|
@ -81,17 +81,17 @@ decompress: # (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint metho
|
|||
#define bits %ebx
|
||||
#define disp %rbp
|
||||
|
||||
push %rbp; push %rbx # C callable
|
||||
push ldst
|
||||
push dst
|
||||
push %rbp; push %rbx # C callable
|
||||
push ldst
|
||||
push dst
|
||||
addq src,lsrc; push lsrc # &input_eof
|
||||
|
||||
movq src,%rsi # hardware src for movsb, lodsb
|
||||
movq dst,%rdi # hardware dst for movsb
|
||||
xorl bits,bits # empty; force refill
|
||||
xorl len,len # create loop invariant
|
||||
orq $~0,disp # -1: initial displacement
|
||||
jmp setup
|
||||
movq src,%rsi # hardware src for movsb, lodsb
|
||||
movq dst,%rdi # hardware dst for movsb
|
||||
xorl bits,bits # empty; force refill
|
||||
xorl len,len # create loop invariant
|
||||
orq $~0,disp # -1: initial displacement
|
||||
jmp setup
|
||||
|
||||
/* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
|
||||
/* Prediction omitted for now. */
|
||||
|
@ -103,44 +103,46 @@ decompress: # (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint metho
|
|||
/* rotate next bit into bottom bit of reg */
|
||||
#define getnextb(reg) addl bits,bits; jnz 0f; call refill; 0: adcl reg,reg
|
||||
|
||||
ALIGN(1<<3)
|
||||
ALIGN(1<<3)
|
||||
/*
 * getbit / refill: fetch the next bit of the compressed stream into Carry.
 *
 * bits is a 32-bit shift register holding unread stream bits above a single
 * marker 1 bit. getbit shifts it left by one (addl bits,bits), leaving the
 * bit that fell off the top in Carry. A zero result means only the marker
 * was left, so control transfers to refill.
 *
 * refill loads the next 32 bits from (%rsi) and advances %rsi by 4.
 * Subtracting -4 rather than adding 4 leaves Carry set, so the following
 * adcl shifts in a fresh marker as the new LSB and shifts the first stream
 * bit out into Carry.
 * refill is also called directly by the getnextb() macro.
 *
 * One form of refill below additionally loads the byte at the advanced %rsi
 * into %edx. movzbl does not change the flags, so Carry still holds the
 * stream bit on return.
 * NOTE(review): that load happens whether or not the byte is later consumed,
 * so it can touch one byte past the last 32-bit word - confirm the input
 * buffer is always readable there.
 * NOTE(review): "rep; ret" is presumably the two-byte return used as a
 * branch-predictor workaround on AMD K8 - confirm.
 * NOTE(review): this text is a rendered diff; instruction groups that appear
 * twice are presumably the removed and added sides of the hunk.
 */
getbit:
|
||||
addl bits,bits; jz refill # Carry= next bit
|
||||
rep; ret
|
||||
addl bits,bits; jz refill # Carry= next bit
|
||||
rep; ret
|
||||
refill:
|
||||
movl (%rsi),bits; subq $-4,%rsi # next 32 bits; set Carry
|
||||
adcl bits,bits # LSB= 1 (CarryIn); CarryOut= next bit
|
||||
rep; ret
|
||||
movl (%rsi),bits; subq $-4,%rsi # next 32 bits; set Carry
|
||||
adcl bits,bits # LSB= 1 (CarryIn); CarryOut= next bit
|
||||
movzbl (%rsi),%edx # speculate: literal, or bottom 8 bits of offset
|
||||
rep; ret
|
||||
|
||||
/*
 * copy: copy len bytes of an earlier match from (%rdi + disp) to (%rdi).
 *
 * The source pointer is %rdi + disp; the decoders keep disp negative, so the
 * source lies in output that has already been written.
 *
 * Path selection:
 *   len <= 3             - byte loop copy1.
 *   disp above -4 (unsigned compare, i.e. -3..-1) - byte loop copy1, because
 *                          a 4-byte load would read bytes not yet stored.
 *   otherwise            - copy4 moves 4 bytes per iteration. len is
 *                          pre-biased by -4 so that the borrow from
 *                          subl $4,len ends the loop; addl $4,len then
 *                          restores the 0..3 leftover bytes, which copy1
 *                          finishes (or copy0 returns if none remain).
 *
 * Flag dependencies: the jnc / jz / jnz tests consume flags set by the
 * preceding subl / addl; the mov and lea instructions placed between them do
 * not modify flags.
 *
 * Two forms appear below. One walks the source with %rdx and uses %al/%eax
 * as the data register. The other walks the source with %rax and keeps the
 * next source byte preloaded in %dl, so copy1 stores %dl first and then
 * loads the following byte.
 * NOTE(review): in the preloading form the final load reads one byte beyond
 * the last byte copied, and %edx no longer holds a byte from (%rsi) on
 * return - callers must reload %edx before using it (top_n2b and top_n2e do
 * so in the forms shown elsewhere in this page).
 * NOTE(review): this text is a rendered diff; instruction groups that appear
 * twice are presumably the removed and added sides of the hunk.
 */
copy: # In: len, %rdi, disp; Out: 0==len, %rdi, disp; trashes %rax, %rdx
|
||||
leaq (%rdi,disp),%rdx
|
||||
cmpl $ 3,len; jbe copy1 # perhaps extend this to length 5 or less?
|
||||
cmpq $-4,disp; ja copy1 # 4-byte chunks would overlap
|
||||
subl $4,len # adjust for termination cases
|
||||
leaq (%rdi,disp),%rax; movb (%rax),%dl
|
||||
cmpl $ 3,len; jbe copy1 # perhaps extend this to length 5 or less?
|
||||
cmpq $-4,disp; ja copy1 # 4-byte chunks would overlap
|
||||
subl $4,len # adjust for termination cases
|
||||
copy4:
|
||||
movl (%rdx),%eax; leaq 4(%rdx),%rdx; subl $4,len
|
||||
movl %eax,(%rdi); leaq 4(%rdi),%rdi; jnc copy4
|
||||
addl $4,len; jz copy0
|
||||
movl (%rax),%edx; addq $4, %rax; subl $4,len
|
||||
movl %edx,(%rdi); leaq 4(%rdi),%rdi; jnc copy4
|
||||
addl $4,len; movb (%rax),%dl; jz copy0
|
||||
copy1:
|
||||
movb (%rdx), %al; leaq 1(%rdx),%rdx; subl $1,len
|
||||
movb %al,(%rdi); leaq 1(%rdi),%rdi; jnz copy1
|
||||
incq %rax; movb %dl,(%rdi); subl $1,len
|
||||
movb (%rax),%dl
|
||||
leaq 1(%rdi),%rdi; jnz copy1
|
||||
copy0:
|
||||
rep; ret
|
||||
rep; ret
|
||||
|
||||
#include "amd_d_nrv2e.S"
|
||||
#include "amd_d_nrv2b.S"
|
||||
|
||||
/*
 * setup: select the decoder for the requested method and enter its loop.
 *
 * Reached by the jmp at the end of the decompress prologue, after %rsi,
 * %rdi, bits, len and disp have been initialised.
 * cld clears the direction flag. meth is then compared against the two
 * supported method codes: M_NRV2E_LE32 enters at bot_n2e and M_NRV2B_LE32
 * enters at top_n2b.
 * If neither matches, execution falls through into eof, which follows
 * immediately. %rsi still equals src at that point, so eof returns
 * src - (src + lsrc), which is nonzero ("bad") whenever lsrc is nonzero.
 *
 * NOTE(review): this text is a rendered diff; instruction groups that appear
 * twice are presumably the removed and added sides of the hunk.
 */
setup:
|
||||
cld
|
||||
cmpl $ M_NRV2E_LE32,meth; je bot_n2e
|
||||
cmpl $ M_NRV2B_LE32,meth; je top_n2b
|
||||
cld
|
||||
cmpl $ M_NRV2E_LE32,meth; je bot_n2e
|
||||
cmpl $ M_NRV2B_LE32,meth; je top_n2b
|
||||
/*
 * eof: common exit of decompress; computes the results and returns to the
 * C caller.
 *
 * The pops undo the prologue pushes in reverse order:
 *   &input_eof (src + lsrc) - %rax = %rsi - &input_eof, the return value;
 *                             0 means exactly all input was consumed.
 *   original dst            - %rdi becomes the number of bytes written.
 *   ldst pointer            - the low 32 bits of that count are stored
 *                             through it (hence the existing 4GB note).
 *   %rbx, %rbp              - callee-saved registers restored.
 *
 * Reached from the decoders when the end-of-stream offset is seen, and by
 * fall-through from setup when meth matches neither supported method.
 *
 * NOTE(review): this text is a rendered diff; instruction groups that appear
 * twice are presumably the removed and added sides of the hunk.
 */
eof:
|
||||
pop %rcx # &input_eof
|
||||
movq %rsi,%rax; subq %rcx,%rax # src -= eof; // return 0: good; else: bad
|
||||
pop %rdx; subq %rdx,%rdi # dst -= original dst
|
||||
pop %rcx; movl %edi,(%rcx) # actual length used at dst XXX: 4GB
|
||||
pop %rbx; pop %rbp
|
||||
ret
|
||||
pop %rcx # &input_eof
|
||||
movq %rsi,%rax; subq %rcx,%rax # src -= eof; // return 0: good; else: bad
|
||||
pop %rdx; subq %rdx,%rdi # dst -= original dst
|
||||
pop %rcx; movl %edi,(%rcx) # actual length used at dst XXX: 4GB
|
||||
pop %rbx; pop %rbp
|
||||
ret
|
||||
|
||||
/* Decompress the rest of this loader, and jump to it.
|
||||
Map a page to hold the decompressed bytes. Logically this could
|
||||
|
|
Loading…
Reference in New Issue
Block a user