mirror of
https://github.com/upx/upx
synced 2025-09-28 19:06:07 +08:00
tuning
amd_bxx.S amd_d_nrv2b.S amd_d_nrv2e.S l_lx_elf64amd.S committer: jreiser <jreiser> 1131939243 +0000
This commit is contained in:
parent
68f8358f06
commit
b670e8d1ac
|
@ -56,9 +56,9 @@ ckmark:
|
||||||
ckstart:
|
ckstart:
|
||||||
subq $4,%rcx
|
subq $4,%rcx
|
||||||
movb (%rdi),%al; addq $1,%rdi
|
movb (%rdi),%al; addq $1,%rdi
|
||||||
loop ckloop2 # prefix cannot overlap previous displacement
|
decq %rcx; jnz ckloop2 # prefix cannot overlap previous displacement
|
||||||
jrcxz ckend
|
jmp ckend
|
||||||
ckcount:
|
ckcount:
|
||||||
loop ckloop3
|
decq %rcx; jnz ckloop3
|
||||||
ckend:
|
ckend:
|
||||||
ret
|
ret
|
||||||
|
|
63
src/stub/amd_d_nrv2b.S
Normal file
63
src/stub/amd_d_nrv2b.S
Normal file
|
@ -0,0 +1,63 @@
|
||||||
|
/* amd_d_nrv2b.S -- AMD64 decompressor for NRV2B
|
||||||
|
|
||||||
|
This file is part of the UPX executable compressor.
|
||||||
|
|
||||||
|
Copyright (C) 1996-2004 Markus Franz Xaver Johannes Oberhumer
|
||||||
|
Copyright (C) 1996-2004 Laszlo Molnar
|
||||||
|
Copyright (C) 2000-2005 John F. Reiser
|
||||||
|
All Rights Reserved.
|
||||||
|
|
||||||
|
UPX and the UCL library are free software; you can redistribute them
|
||||||
|
and/or modify them under the terms of the GNU General Public License as
|
||||||
|
published by the Free Software Foundation; either version 2 of
|
||||||
|
the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program; see the file COPYING.
|
||||||
|
If not, write to the Free Software Foundation, Inc.,
|
||||||
|
59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
||||||
|
|
||||||
|
Markus F.X.J. Oberhumer Laszlo Molnar
|
||||||
|
<mfx@users.sourceforge.net> <ml1050@users.sourceforge.net>
|
||||||
|
|
||||||
|
John F. Reiser
|
||||||
|
<jreiser@users.sourceforge.net>
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* NRV2B decode loop.  Entered at top_n2b with len == 0.
 * A 1 bit copies one literal byte from input to output.  A 0 bit starts a
 * match: a variable-length offset code, then a length code, then "call copy".
 * The register names (off, len, lenq, disp) and the getnextb/jnextb* macros
 * are not defined in this file; they are expected from the including file.
 */
ALIGN(1<<3)
|
||||||
|
lit_n2b:
|
||||||
|
movb (%rsi),%al; addq $1,%rsi
|
||||||
|
movb %al,(%rdi); addq $1,%rdi
|
||||||
|
top_n2b:
|
||||||
|
jnextb1y lit_n2b
|
||||||
|
/* len is 0 here, so this lea loads the constant 1 without touching flags */
lea 1(lenq),off # [len= 0] off= 1
|
||||||
|
offmore_n2b:
|
||||||
|
/* shift one more bit into off, then a 0 continuation bit repeats the loop */
getnextb(off)
|
||||||
|
jnextb0n offmore_n2b
|
||||||
|
|
||||||
|
/* fetch the candidate low offset byte early; %rsi is advanced only if it is used */
movzbl (%rsi),%edx
|
||||||
|
subl $ 3,off; jc len_n2b # use previous offset
|
||||||
|
shll $ 8,off
|
||||||
|
orl %edx,off; incq %rsi
|
||||||
|
/* off= ~off; a zero result (off was all ones) is the end-of-stream marker */
xorl $~0,off; jz eof
|
||||||
|
/* disp= sign-extended ~off; it is left unchanged when a later match takes the "previous offset" branch */
movslq off,disp # XXX: 2GB
|
||||||
|
len_n2b:
|
||||||
|
lea 1(lenq),off # [len= 0] off= 1
|
||||||
|
getnextb(len); getnextb(len) # two bits; cc set on result
|
||||||
|
jnz gotlen_n2b # raw 1,2,3 ==> 2,3,4
|
||||||
|
/* both bits were 0: long length, read as 1 followed by more bits */
movl off,len # len= 1, the msb
|
||||||
|
addl $3-1,off # raw 2.. ==> 5..
|
||||||
|
lenmore_n2b:
|
||||||
|
getnextb(len)
|
||||||
|
jnextb0n lenmore_n2b
|
||||||
|
gotlen_n2b:
|
||||||
|
/* Carry from this compare is consumed by the adcl on the next line */
cmpq $-0xd00,disp
|
||||||
|
adcl off,len # len += off + (disp < -0xd00)
|
||||||
|
call copy
|
||||||
|
bot_n2b: # In: 0==len
|
||||||
|
jmp top_n2b
|
|
@ -29,62 +29,6 @@
|
||||||
<jreiser@users.sourceforge.net>
|
<jreiser@users.sourceforge.net>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "amd_regs.h"
|
|
||||||
|
|
||||||
M_NRV2B_LE32=2 # ../conf.h
|
|
||||||
M_NRV2E_LE32=8
|
|
||||||
|
|
||||||
#define ALIGN(n) .align n
|
|
||||||
|
|
||||||
/* Returns 0 on success; non-zero on failure. */
|
|
||||||
decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst, uint method)
|
|
||||||
|
|
||||||
/* Arguments according to calling convention */
|
|
||||||
#define src %arg1
|
|
||||||
#define lsrc %arg2
|
|
||||||
#define dst %arg3
|
|
||||||
#define ldst %arg4 /* Out: actually a reference: &len_dst */
|
|
||||||
#define meth %arg5l
|
|
||||||
|
|
||||||
/* Working registers */
|
|
||||||
#define off %eax /* XXX: 2GB */
|
|
||||||
#define len %ecx /* XXX: 2GB */
|
|
||||||
#define lenq %rcx
|
|
||||||
#define bits %ebx
|
|
||||||
#define disp %rbp
|
|
||||||
|
|
||||||
push %rbp; push %rbx
|
|
||||||
push ldst
|
|
||||||
push dst
|
|
||||||
addq src,lsrc; push lsrc # &input_eof
|
|
||||||
|
|
||||||
movq src,%rsi # hardware src for movsb, lodsb
|
|
||||||
movq dst,%rdi # hardware dst for movsb
|
|
||||||
xorl bits,bits # empty; force refill
|
|
||||||
xorl len,len # create loop invariant
|
|
||||||
orq $~0,disp # -1: initial displacement
|
|
||||||
call setup_rdx
|
|
||||||
ra_setup_rdx:
|
|
||||||
|
|
||||||
/* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
|
|
||||||
/* Prediction omitted for now. */
|
|
||||||
#define jnextb0n jnextb0y
|
|
||||||
#define jnextb0y addl bits,bits; jnz 0f; call refill; 0: jnc
|
|
||||||
#define jnextb1n jnextb1y
|
|
||||||
#define jnextb1y addl bits,bits; jnz 0f; call refill; 0: jc
|
|
||||||
|
|
||||||
/* rotate next bit into bottom bit of reg */
|
|
||||||
#define getnextb(reg) addl bits,bits; jnz 0f; call refill; 0: adcl reg,reg
|
|
||||||
|
|
||||||
ALIGN(1<<3)
|
|
||||||
getbit:
|
|
||||||
addl bits,bits; jz refill # Carry= next bit
|
|
||||||
rep; ret
|
|
||||||
refill:
|
|
||||||
movl (%rsi),bits; subq $-4,%rsi # next 32 bits; set Carry
|
|
||||||
adcl bits,bits # LSB= 1 (CarryIn); CarryOut= next bit
|
|
||||||
rep; ret
|
|
||||||
|
|
||||||
ALIGN(1<<3)
|
ALIGN(1<<3)
|
||||||
lit_n2e:
|
lit_n2e:
|
||||||
movb (%rsi),%al; addq $1,%rsi
|
movb (%rsi),%al; addq $1,%rsi
|
||||||
|
@ -101,11 +45,11 @@ getoff_n2e:
|
||||||
getnextb(off)
|
getnextb(off)
|
||||||
jnextb0n off_n2e
|
jnextb0n off_n2e
|
||||||
|
|
||||||
/*xorl len,len # len= 0*/
|
movzbl (%rsi),%edx #; xorl len,len # len= 0
|
||||||
subl $3,off; jc offprev_n2e
|
subl $ 3,off; jc offprev_n2e
|
||||||
shll $8,off
|
shll $ 8,off
|
||||||
lodsb # requires off===%eax
|
orl %edx,off; incq %rsi
|
||||||
xorl $~0,off; jz eof_n2e
|
xorl $~0,off; jz eof
|
||||||
sarl off # Carry= original low bit
|
sarl off # Carry= original low bit
|
||||||
movslq off,disp # XXX: 2GB
|
movslq off,disp # XXX: 2GB
|
||||||
jc lenlast_n2e
|
jc lenlast_n2e
|
||||||
|
@ -127,62 +71,6 @@ lenlast_n2e:
|
||||||
gotlen_n2e:
|
gotlen_n2e:
|
||||||
cmpq $-0x500,disp
|
cmpq $-0x500,disp
|
||||||
adcl $2,len # len += 2+ (disp < -0x500);
|
adcl $2,len # len += 2+ (disp < -0x500);
|
||||||
push %rsi
|
call copy
|
||||||
leaq (%rdi,disp),%rsi
|
|
||||||
rep; movsb # len
|
|
||||||
pop %rsi
|
|
||||||
bot_n2e: # In: 0==len
|
bot_n2e: # In: 0==len
|
||||||
prefetch 0x7f(%rdi) # em64t has no prefetchw
|
|
||||||
jmp top_n2e
|
jmp top_n2e
|
||||||
|
|
||||||
|
|
||||||
ALIGN(1<<3)
|
|
||||||
lit_n2b:
|
|
||||||
movb (%rsi),%al; addq $1,%rsi
|
|
||||||
movb %al,(%rdi); addq $1,%rdi
|
|
||||||
top_n2b:
|
|
||||||
jnextb1y lit_n2b
|
|
||||||
lea 1(lenq),off # [len= 0] off= 1
|
|
||||||
offmore_n2b:
|
|
||||||
getnextb(off)
|
|
||||||
jnextb0n offmore_n2b
|
|
||||||
|
|
||||||
subl $3,off; jc len_n2b # use previous offset
|
|
||||||
shll $8,off
|
|
||||||
lodsb # requires off===%eax
|
|
||||||
xorl $~0,off; jz eof_n2b
|
|
||||||
movslq off,disp # XXX: 2GB
|
|
||||||
len_n2b:
|
|
||||||
lea 1(lenq),off # [len= 0] off= 1
|
|
||||||
getnextb(len); getnextb(len) # two bits; cc set on result
|
|
||||||
jnz gotlen_n2b # raw 1,2,3 ==> 2,3,4
|
|
||||||
movl off,len # len= 1, the msb
|
|
||||||
addl $3-1,off # raw 2.. ==> 5..
|
|
||||||
lenmore_n2b:
|
|
||||||
getnextb(len)
|
|
||||||
jnextb0n lenmore_n2b
|
|
||||||
gotlen_n2b:
|
|
||||||
cmpq $-0xd00,disp
|
|
||||||
adcl off,len # len += off + (disp < -0xd00)
|
|
||||||
push %rsi
|
|
||||||
leaq (%rdi,disp),%rsi
|
|
||||||
rep; movsb # len
|
|
||||||
pop %rsi
|
|
||||||
bot_n2b: # In: 0==len
|
|
||||||
prefetch 0x7f(%rdi) # em64t has no prefetchw
|
|
||||||
jmp top_n2b
|
|
||||||
|
|
||||||
setup_rdx:
|
|
||||||
cld
|
|
||||||
pop %rdx; addq $ getbit - ra_setup_rdx,%rdx # %rdx= &getbit
|
|
||||||
cmpl $ M_NRV2E_LE32,meth; je bot_n2e
|
|
||||||
cmpl $ M_NRV2B_LE32,meth; je bot_n2b
|
|
||||||
eof_n2e:
|
|
||||||
eof_n2b:
|
|
||||||
pop %rcx # &input_eof
|
|
||||||
movq %rsi,%rax; subq %rcx,%rax # src -= eof; // return 0: good; else: bad
|
|
||||||
pop %rdx; subq %rdx,%rdi # dst -= original dst
|
|
||||||
pop %rcx; movl %edi,(%rcx) # actual length used at dst XXX: 4GB
|
|
||||||
pop %rbx; pop %rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
|
|
|
@ -29,6 +29,8 @@
|
||||||
* <jreiser@users.sourceforge.net>
|
* <jreiser@users.sourceforge.net>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include "amd_regs.h"
|
||||||
|
|
||||||
sz_l_info= 12
|
sz_l_info= 12
|
||||||
l_lsize= 8
|
l_lsize= 8
|
||||||
|
|
||||||
|
@ -53,12 +55,92 @@ PAGE_SHIFT= 12
|
||||||
PAGE_MASK= (~0<<PAGE_SHIFT)
|
PAGE_MASK= (~0<<PAGE_SHIFT)
|
||||||
PAGE_SIZE= -PAGE_MASK
|
PAGE_SIZE= -PAGE_MASK
|
||||||
|
|
||||||
|
M_NRV2B_LE32=2 # ../conf.h
|
||||||
|
M_NRV2E_LE32=8
|
||||||
|
|
||||||
|
#define ALIGN(n) .align n
|
||||||
|
|
||||||
/*__LEXEC000__*/
|
/*__LEXEC000__*/
|
||||||
_start: .globl _start
|
_start: .globl _start
|
||||||
call main # push &decompress
|
call main # push &decompress
|
||||||
#include "amd_d_nrv2e.S"
|
|
||||||
|
|
||||||
#include "amd_regs.h"
|
/* Returns 0 on success; non-zero on failure. */
/* decompress -- common entry for the NRV2E and NRV2B decoders.
 * Saves %rbp and %rbx, then pushes ldst, dst and src+lsrc (the input end
 * address); the code at 'eof' pops these three in reverse order to compute
 * the return value and the output length.
 * Decoder state on leaving this prologue: %rsi= src, %rdi= dst, bits= 0,
 * len= 0, disp= -1.  Control then goes to 'setup'.
 * The %argN names are presumably supplied by amd_regs.h -- TODO confirm.
 * NOTE(review): the NRV2B path and the copy routine visible here use explicit
 * loads and stores; the movsb/lodsb remarks below may be historical -- confirm
 * against amd_d_nrv2e.S.
 */
|
||||||
|
decompress: # (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint method)
|
||||||
|
|
||||||
|
/* Arguments according to calling convention */
|
||||||
|
#define src %arg1
|
||||||
|
#define lsrc %arg2
|
||||||
|
#define dst %arg3
|
||||||
|
#define ldst %arg4 /* Out: actually a reference: &len_dst */
|
||||||
|
#define meth %arg5l
|
||||||
|
|
||||||
|
/* Working registers */
|
||||||
|
#define off %eax /* XXX: 2GB */
|
||||||
|
#define len %ecx /* XXX: 2GB */
|
||||||
|
#define lenq %rcx
|
||||||
|
#define bits %ebx
|
||||||
|
#define disp %rbp
|
||||||
|
|
||||||
|
push %rbp; push %rbx # C callable
|
||||||
|
push ldst
|
||||||
|
push dst
|
||||||
|
/* lsrc becomes src+lsrc, the address just past the input; eof compares %rsi with it */
addq src,lsrc; push lsrc # &input_eof
|
||||||
|
|
||||||
|
movq src,%rsi # hardware src for movsb, lodsb
|
||||||
|
movq dst,%rdi # hardware dst for movsb
|
||||||
|
xorl bits,bits # empty; force refill
|
||||||
|
/* the decoders rely on len == 0 whenever they return to the top of their loop */
xorl len,len # create loop invariant
|
||||||
|
orq $~0,disp # -1: initial displacement
|
||||||
|
jmp setup
|
||||||
|
|
||||||
|
/* Bit-input macros.
 * 'bits' holds the unread input bits followed by a single 1 marker bit, which
 * refill shifts in.  "addl bits,bits" shifts the next bit out into Carry.  A
 * zero result means only the marker was left, so refill is called and returns
 * with the next real bit in Carry.
 * The jnextb* macros end in a bare jnc/jc, so the branch target is written
 * right after the macro name at the point of use.
 * "0:" / "0f" is a numeric local label, so the macros may be expanded
 * repeatedly, even twice on one line.
 */
/* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
|
||||||
|
/* Prediction omitted for now. */
|
||||||
|
#define jnextb0n jnextb0y
|
||||||
|
#define jnextb0y addl bits,bits; jnz 0f; call refill; 0: jnc
|
||||||
|
#define jnextb1n jnextb1y
|
||||||
|
#define jnextb1y addl bits,bits; jnz 0f; call refill; 0: jc
|
||||||
|
|
||||||
|
/* rotate next bit into bottom bit of reg */
/* the final adcl also sets the condition codes from the new value of reg */
|
||||||
|
#define getnextb(reg) addl bits,bits; jnz 0f; call refill; 0: adcl reg,reg
|
||||||
|
|
||||||
|
/* getbit -- Out: Carry= next input bit.
 * Shifts 'bits' left by one.  If the result is zero the buffer is empty and
 * control continues in refill, which then returns to getbit's caller.
 * NOTE(review): no caller of getbit is visible in this part of the file --
 * confirm it is still needed.
 */
ALIGN(1<<3)
|
||||||
|
getbit:
|
||||||
|
addl bits,bits; jz refill # Carry= next bit
|
||||||
|
/* "rep; ret": ret with a rep prefix; presumably the two-byte return form recommended for AMD branch prediction -- TODO confirm */
rep; ret
|
||||||
|
/* refill -- In: %rsi= address of the next 4 input bytes.
 * Out: bits reloaded, %rsi advanced by 4, Carry= first bit of the new word.
 * The pointer is advanced with "subq $-4" so that Carry is set; the adcl then
 * shifts that 1 in as the end-of-buffer marker while shifting the first data
 * bit out into Carry.
 */
refill:
|
||||||
|
movl (%rsi),bits; subq $-4,%rsi # next 32 bits; set Carry
|
||||||
|
adcl bits,bits # LSB= 1 (CarryIn); CarryOut= next bit
|
||||||
|
rep; ret
|
||||||
|
|
||||||
|
/* copy -- copy len bytes from earlier output to the current output position.
 * The source address is %rdi + disp, with disp as set up by the decoders.
 * Uses 4-byte moves when len > 3 and disp is not -1, -2 or -3; otherwise,
 * and for the 0..3 byte tail, copies one byte at a time.
 * Pointers are advanced with leaq, which leaves the flags from subl intact
 * for the jnc/jnz that follows.
 * copy1 stores a byte before testing len, so len must be non-zero on entry.
 */
copy: # In: len, %rdi, disp; Out: 0==len, %rdi, disp; trashes %rax, %rdx
|
||||||
|
leaq (%rdi,disp),%rdx
|
||||||
|
cmpl $ 3,len; jbe copy1 # perhaps extend this to length 5 or less?
|
||||||
|
/* unsigned compare: a disp of -1, -2 or -3 takes the byte loop */
cmpq $-4,disp; ja copy1 # 4-byte chunks would overlap
|
||||||
|
/* inside copy4 len is kept 4 below the true remaining count, so a borrow from the subl means fewer than 4 bytes will remain */
subl $4,len # adjust for termination cases
|
||||||
|
copy4:
|
||||||
|
movl (%rdx),%eax; leaq 4(%rdx),%rdx; subl $4,len
|
||||||
|
movl %eax,(%rdi); leaq 4(%rdi),%rdi; jnc copy4
|
||||||
|
/* undo the bias: len= bytes still to copy (0..3) */
addl $4,len; jz copy0
|
||||||
|
copy1:
|
||||||
|
movb (%rdx), %al; leaq 1(%rdx),%rdx; subl $1,len
|
||||||
|
movb %al,(%rdi); leaq 1(%rdi),%rdi; jnz copy1
|
||||||
|
copy0:
|
||||||
|
rep; ret
|
||||||
|
|
||||||
|
#include "amd_d_nrv2e.S"
|
||||||
|
#include "amd_d_nrv2b.S"
|
||||||
|
|
||||||
|
/* setup -- clear the direction flag and enter the decoder selected by meth.
 * Any other method value falls through to eof with nothing decoded; %rsi
 * still equals src in that case.
 */
setup:
|
||||||
|
cld
|
||||||
|
cmpl $ M_NRV2E_LE32,meth; je bot_n2e
|
||||||
|
cmpl $ M_NRV2B_LE32,meth; je top_n2b
|
||||||
|
/* eof -- common exit.
 * Pops the three values pushed by decompress (input end, original dst, &ldst)
 * in reverse order, stores the 32-bit output byte count through &ldst,
 * restores %rbx and %rbp, and returns %rsi - input_end in %rax.
 */
eof:
|
||||||
|
pop %rcx # &input_eof
|
||||||
|
movq %rsi,%rax; subq %rcx,%rax # src -= eof; // return 0: good; else: bad
|
||||||
|
pop %rdx; subq %rdx,%rdi # dst -= original dst
|
||||||
|
pop %rcx; movl %edi,(%rcx) # actual length used at dst XXX: 4GB
|
||||||
|
pop %rbx; pop %rbp
|
||||||
|
ret
|
||||||
|
|
||||||
/* Decompress the rest of this loader, and jump to it.
|
/* Decompress the rest of this loader, and jump to it.
|
||||||
Map a page to hold the decompressed bytes. Logically this could
|
Map a page to hold the decompressed bytes. Logically this could
|
||||||
|
@ -91,7 +173,7 @@ unfold:
|
||||||
/* Load the addresses and lengths that ::pack3() patched in.
|
/* Load the addresses and lengths that ::pack3() patched in.
|
||||||
XXX: 2GB Note that PUSH $imm32 sign-extends to 64 bits.
|
XXX: 2GB Note that PUSH $imm32 sign-extends to 64 bits.
|
||||||
XXX: 4GB Note that MOVL $imm32,reg zero-extends to 64-bits.
|
XXX: 4GB Note that MOVL $imm32,reg zero-extends to 64-bits.
|
||||||
If desired, then use an temporary register to extend the 2GB PUSH to 4GB.
|
(Use a temporary register to obtain 4GB range on PUSH constant.)
|
||||||
*/
|
*/
|
||||||
.byte 0x68; .ascii "JMPU" # push $'JMPU' # for unmap in fold
|
.byte 0x68; .ascii "JMPU" # push $'JMPU' # for unmap in fold
|
||||||
.byte 0x68; .ascii "ADRU" # push $'ADRU' # for unmap in fold
|
.byte 0x68; .ascii "ADRU" # push $'ADRU' # for unmap in fold
|
||||||
|
@ -129,7 +211,6 @@ main:
|
||||||
call unfold # push &b_info
|
call unfold # push &b_info
|
||||||
/* { b_info={sz_unc, sz_cpr, {4 char}}, folded_loader...} */
|
/* { b_info={sz_unc, sz_cpr, {4 char}}, folded_loader...} */
|
||||||
|
|
||||||
eof:
|
|
||||||
/*__XTHEENDX__*/
|
/*__XTHEENDX__*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
Loading…
Reference in New Issue
Block a user