From b670e8d1acf564526277f74d0bd4b0a41d5ba925 Mon Sep 17 00:00:00 2001 From: John Reiser Date: Mon, 14 Nov 2005 03:34:03 +0000 Subject: [PATCH] tuning amd_bxx.S amd_d_nrv2b.S amd_d_nrv2e.S l_lx_elf64amd.S committer: jreiser 1131939243 +0000 --- src/stub/amd_bxx.S | 6 +- src/stub/amd_d_nrv2b.S | 63 ++++++++++++++++++++ src/stub/amd_d_nrv2e.S | 124 ++------------------------------------- src/stub/l_lx_elf64amd.S | 89 ++++++++++++++++++++++++++-- 4 files changed, 157 insertions(+), 125 deletions(-) create mode 100644 src/stub/amd_d_nrv2b.S diff --git a/src/stub/amd_bxx.S b/src/stub/amd_bxx.S index de8a70d5..6ce42c6b 100644 --- a/src/stub/amd_bxx.S +++ b/src/stub/amd_bxx.S @@ -56,9 +56,9 @@ ckmark: ckstart: subq $4,%rcx movb (%rdi),%al; addq $1,%rdi - loop ckloop2 # prefix cannot overlap previous displacement - jrcxz ckend + decq %rcx; jnz ckloop2 # prefix cannot overlap previous displacement + jmp ckend ckcount: - loop ckloop3 + decq %rcx; jnz ckloop3 ckend: ret diff --git a/src/stub/amd_d_nrv2b.S b/src/stub/amd_d_nrv2b.S new file mode 100644 index 00000000..570dfc2e --- /dev/null +++ b/src/stub/amd_d_nrv2b.S @@ -0,0 +1,63 @@ +/* amd_d_nrv2b.S -- AMD64 decompressor for NRV2B + + This file is part of the UPX executable compressor. + + Copyright (C) 1996-2004 Markus Franz Xaver Johannes Oberhumer + Copyright (C) 1996-2004 Laszlo Molnar + Copyright (C) 2000-2005 John F. Reiser + All Rights Reserved. + + UPX and the UCL library are free software; you can redistribute them + and/or modify them under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of + the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; see the file COPYING. + If not, write to the Free Software Foundation, Inc., + 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + Markus F.X.J. Oberhumer Laszlo Molnar + + + John F. Reiser + +*/ + + ALIGN(1<<3) +lit_n2b: + movb (%rsi),%al; addq $1,%rsi + movb %al,(%rdi); addq $1,%rdi +top_n2b: + jnextb1y lit_n2b + lea 1(lenq),off # [len= 0] off= 1 +offmore_n2b: + getnextb(off) + jnextb0n offmore_n2b + + movzbl (%rsi),%edx + subl $ 3,off; jc len_n2b # use previous offset + shll $ 8,off + orl %edx,off; incq %rsi + xorl $~0,off; jz eof + movslq off,disp # XXX: 2GB +len_n2b: + lea 1(lenq),off # [len= 0] off= 1 + getnextb(len); getnextb(len) # two bits; cc set on result + jnz gotlen_n2b # raw 1,2,3 ==> 2,3,4 + movl off,len # len= 1, the msb + addl $3-1,off # raw 2.. ==> 5.. +lenmore_n2b: + getnextb(len) + jnextb0n lenmore_n2b +gotlen_n2b: + cmpq $-0xd00,disp + adcl off,len # len += off + (disp < -0xd00) + call copy +bot_n2b: # In: 0==len + jmp top_n2b diff --git a/src/stub/amd_d_nrv2e.S b/src/stub/amd_d_nrv2e.S index c3129e7d..39f45b86 100644 --- a/src/stub/amd_d_nrv2e.S +++ b/src/stub/amd_d_nrv2e.S @@ -29,62 +29,6 @@ */ -#include "amd_regs.h" - -M_NRV2B_LE32=2 # ../conf.h -M_NRV2E_LE32=8 - -#define ALIGN(n) .align n - -/* Returns 0 on success; non-zero on failure. 
*/ -decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst, uint method) - -/* Arguments according to calling convention */ -#define src %arg1 -#define lsrc %arg2 -#define dst %arg3 -#define ldst %arg4 /* Out: actually a reference: &len_dst */ -#define meth %arg5l - -/* Working registers */ -#define off %eax /* XXX: 2GB */ -#define len %ecx /* XXX: 2GB */ -#define lenq %rcx -#define bits %ebx -#define disp %rbp - - push %rbp; push %rbx - push ldst - push dst - addq src,lsrc; push lsrc # &input_eof - - movq src,%rsi # hardware src for movsb, lodsb - movq dst,%rdi # hardware dst for movsb - xorl bits,bits # empty; force refill - xorl len,len # create loop invariant - orq $~0,disp # -1: initial displacement - call setup_rdx -ra_setup_rdx: - -/* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */ -/* Prediction omitted for now. */ -#define jnextb0n jnextb0y -#define jnextb0y addl bits,bits; jnz 0f; call refill; 0: jnc -#define jnextb1n jnextb1y -#define jnextb1y addl bits,bits; jnz 0f; call refill; 0: jc - -/* rotate next bit into bottom bit of reg */ -#define getnextb(reg) addl bits,bits; jnz 0f; call refill; 0: adcl reg,reg - - ALIGN(1<<3) -getbit: - addl bits,bits; jz refill # Carry= next bit - rep; ret -refill: - movl (%rsi),bits; subq $-4,%rsi # next 32 bits; set Carry - adcl bits,bits # LSB= 1 (CarryIn); CarryOut= next bit - rep; ret - ALIGN(1<<3) lit_n2e: movb (%rsi),%al; addq $1,%rsi @@ -101,11 +45,11 @@ getoff_n2e: getnextb(off) jnextb0n off_n2e - /*xorl len,len # len= 0*/ - subl $3,off; jc offprev_n2e - shll $8,off - lodsb # requires off===%eax - xorl $~0,off; jz eof_n2e + movzbl (%rsi),%edx #; xorl len,len # len= 0 + subl $ 3,off; jc offprev_n2e + shll $ 8,off + orl %edx,off; incq %rsi + xorl $~0,off; jz eof sarl off # Carry= original low bit movslq off,disp # XXX: 2GB jc lenlast_n2e @@ -127,62 +71,6 @@ lenlast_n2e: gotlen_n2e: cmpq $-0x500,disp adcl $2,len # len += 2+ (disp < -0x500); - push %rsi - leaq (%rdi,disp),%rsi - 
rep; movsb # len - pop %rsi + call copy bot_n2e: # In: 0==len - prefetch 0x7f(%rdi) # em64t has no prefetchw jmp top_n2e - - - ALIGN(1<<3) -lit_n2b: - movb (%rsi),%al; addq $1,%rsi - movb %al,(%rdi); addq $1,%rdi -top_n2b: - jnextb1y lit_n2b - lea 1(lenq),off # [len= 0] off= 1 -offmore_n2b: - getnextb(off) - jnextb0n offmore_n2b - - subl $3,off; jc len_n2b # use previous offset - shll $8,off - lodsb # requires off===%eax - xorl $~0,off; jz eof_n2b - movslq off,disp # XXX: 2GB -len_n2b: - lea 1(lenq),off # [len= 0] off= 1 - getnextb(len); getnextb(len) # two bits; cc set on result - jnz gotlen_n2b # raw 1,2,3 ==> 2,3,4 - movl off,len # len= 1, the msb - addl $3-1,off # raw 2.. ==> 5.. -lenmore_n2b: - getnextb(len) - jnextb0n lenmore_n2b -gotlen_n2b: - cmpq $-0xd00,disp - adcl off,len # len += off + (disp < -0xd00) - push %rsi - leaq (%rdi,disp),%rsi - rep; movsb # len - pop %rsi -bot_n2b: # In: 0==len - prefetch 0x7f(%rdi) # em64t has no prefetchw - jmp top_n2b - -setup_rdx: - cld - pop %rdx; addq $ getbit - ra_setup_rdx,%rdx # %rdx= &getbit - cmpl $ M_NRV2E_LE32,meth; je bot_n2e - cmpl $ M_NRV2B_LE32,meth; je bot_n2b -eof_n2e: -eof_n2b: - pop %rcx # &input_eof - movq %rsi,%rax; subq %rcx,%rax # src -= eof; // return 0: good; else: bad - pop %rdx; subq %rdx,%rdi # dst -= original dst - pop %rcx; movl %edi,(%rcx) # actual length used at dst XXX: 4GB - pop %rbx; pop %rbp - ret - diff --git a/src/stub/l_lx_elf64amd.S b/src/stub/l_lx_elf64amd.S index d10a4c82..43c27729 100644 --- a/src/stub/l_lx_elf64amd.S +++ b/src/stub/l_lx_elf64amd.S @@ -29,6 +29,8 @@ * */ +#include "amd_regs.h" + sz_l_info= 12 l_lsize= 8 @@ -53,12 +55,92 @@ PAGE_SHIFT= 12 PAGE_MASK= (~0<likely, n==>unlikely} */ +/* Prediction omitted for now. 
*/ +#define jnextb0n jnextb0y +#define jnextb0y addl bits,bits; jnz 0f; call refill; 0: jnc +#define jnextb1n jnextb1y +#define jnextb1y addl bits,bits; jnz 0f; call refill; 0: jc + +/* rotate next bit into bottom bit of reg */ +#define getnextb(reg) addl bits,bits; jnz 0f; call refill; 0: adcl reg,reg + + ALIGN(1<<3) +getbit: + addl bits,bits; jz refill # Carry= next bit + rep; ret +refill: + movl (%rsi),bits; subq $-4,%rsi # next 32 bits; set Carry + adcl bits,bits # LSB= 1 (CarryIn); CarryOut= next bit + rep; ret + +copy: # In: len, %rdi, disp; Out: 0==len, %rdi, disp; trashes %rax, %rdx + leaq (%rdi,disp),%rdx + cmpl $ 3,len; jbe copy1 # perhaps extend this to length 5 or less? + cmpq $-4,disp; ja copy1 # 4-byte chunks would overlap + subl $4,len # adjust for termination cases +copy4: + movl (%rdx),%eax; leaq 4(%rdx),%rdx; subl $4,len + movl %eax,(%rdi); leaq 4(%rdi),%rdi; jnc copy4 + addl $4,len; jz copy0 +copy1: + movb (%rdx), %al; leaq 1(%rdx),%rdx; subl $1,len + movb %al,(%rdi); leaq 1(%rdi),%rdi; jnz copy1 +copy0: + rep; ret + +#include "amd_d_nrv2e.S" +#include "amd_d_nrv2b.S" + +setup: + cld + cmpl $ M_NRV2E_LE32,meth; je bot_n2e + cmpl $ M_NRV2B_LE32,meth; je top_n2b +eof: + pop %rcx # &input_eof + movq %rsi,%rax; subq %rcx,%rax # src -= eof; // return 0: good; else: bad + pop %rdx; subq %rdx,%rdi # dst -= original dst + pop %rcx; movl %edi,(%rcx) # actual length used at dst XXX: 4GB + pop %rbx; pop %rbp + ret /* Decompress the rest of this loader, and jump to it. Map a page to hold the decompressed bytes. Logically this could @@ -91,7 +173,7 @@ unfold: /* Load the addresses and lengths that ::pack3() patched in. XXX: 2GB Note that PUSH $imm32 sign-extends to 64 bits. XXX: 4GB Note that MOVL $imm32,reg zero-extends to 64-bits. - If desired, then use an temporary register to extend the 2GB PUSH to 4GB. + (Use a temporary register to obtain 4GB range on PUSH constant.) 
*/ .byte 0x68; .ascii "JMPU" # push $'JMPU' # for unmap in fold .byte 0x68; .ascii "ADRU" # push $'ADRU' # for unmap in fold @@ -129,7 +211,6 @@ main: call unfold # push &b_info /* { b_info={sz_unc, sz_cpr, {4 char}}, folded_loader...} */ -eof: /*__XTHEENDX__*/ /*