From b670e8d1acf564526277f74d0bd4b0a41d5ba925 Mon Sep 17 00:00:00 2001
From: John Reiser <jreiser@BitWagon.com>
Date: Mon, 14 Nov 2005 03:34:03 +0000
Subject: [PATCH] tuning 	amd_bxx.S amd_d_nrv2b.S amd_d_nrv2e.S
 l_lx_elf64amd.S

committer: jreiser <jreiser> 1131939243 +0000
---
 src/stub/amd_bxx.S       |   6 +-
 src/stub/amd_d_nrv2b.S   |  63 ++++++++++++++++++++
 src/stub/amd_d_nrv2e.S   | 124 ++-------------------------------------
 src/stub/l_lx_elf64amd.S |  89 ++++++++++++++++++++++++++--
 4 files changed, 157 insertions(+), 125 deletions(-)
 create mode 100644 src/stub/amd_d_nrv2b.S
diff --git a/src/stub/amd_bxx.S b/src/stub/amd_bxx.S
index de8a70d5..6ce42c6b 100644
--- a/src/stub/amd_bxx.S
+++ b/src/stub/amd_bxx.S
@@ -56,9 +56,9 @@ ckmark:
 ckstart:
 	subq $4,%rcx
 	movb (%rdi),%al; addq $1,%rdi
-	loop ckloop2  # prefix cannot overlap previous displacement
-	jrcxz ckend
+	decq %rcx; jnz ckloop2  # prefix cannot overlap previous displacement
+	jmp ckend
 ckcount:
-	loop ckloop3
+	decq %rcx; jnz ckloop3
 ckend:
 	ret
diff --git a/src/stub/amd_d_nrv2b.S b/src/stub/amd_d_nrv2b.S
new file mode 100644
index 00000000..570dfc2e
--- /dev/null
+++ b/src/stub/amd_d_nrv2b.S
@@ -0,0 +1,63 @@
+/* amd_d_nrv2b.S -- AMD64 decompressor for NRV2B
+
+   This file is part of the UPX executable compressor.
+
+   Copyright (C) 1996-2004 Markus Franz Xaver Johannes Oberhumer
+   Copyright (C) 1996-2004 Laszlo Molnar
+   Copyright (C) 2000-2005 John F. Reiser
+   All Rights Reserved.
+
+   UPX and the UCL library are free software; you can redistribute them
+   and/or modify them under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of
+   the License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.
+   If not, write to the Free Software Foundation, Inc.,
+   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+   Markus F.X.J. Oberhumer              Laszlo Molnar
+   <mfx@users.sourceforge.net>          <ml1050@users.sourceforge.net>
+
+   John F. Reiser
+   <jreiser@users.sourceforge.net>
+*/
+
+	ALIGN(1<<3)
+lit_n2b:  # literal: copy one byte from input to output
+	movb (%rsi),%al; addq $1,%rsi
+	movb %al,(%rdi); addq $1,%rdi
+top_n2b:  # In: 0==len
+	jnextb1y lit_n2b  # next bit 1 ==> literal byte
+	lea 1(lenq),off  # [len= 0] off= 1
+offmore_n2b:
+	getnextb(off)  # off= 2*off + next bit
+	jnextb0n offmore_n2b  # next bit 0 ==> more offset bits follow
+
+	movzbl (%rsi),%edx  # peek at low byte of offset; consumed below only if needed
+	subl $ 3,off; jc len_n2b  # use previous offset
+	shll $ 8,off
+	orl %edx,off; incq %rsi  # merge the low byte (low 8 bits are 0 after shll) and consume it
+	xorl $~0,off; jz eof  # complement; all ones before the complement marks end of stream
+	movslq off,disp  # XXX: 2GB
+len_n2b:
+	lea 1(lenq),off  # [len= 0] off= 1
+	getnextb(len); getnextb(len)  # two bits; cc set on result
+	jnz gotlen_n2b  # raw 1,2,3 ==> 2,3,4
+	movl off,len  # len= 1, the msb
+	addl $3-1,off  # raw 2.. ==> 5..
+lenmore_n2b:
+	getnextb(len)  # len= 2*len + next bit
+	jnextb0n lenmore_n2b  # next bit 0 ==> more length bits follow
+gotlen_n2b:
+	cmpq $-0xd00,disp  # Carry= (disp < -0xd00)
+	adcl off,len  # len += off + (disp < -0xd00)
+	call copy  # copy len bytes starting at disp(%rdi); Out: 0==len; trashes %rax, %rdx
+bot_n2b:  # In: 0==len
+        jmp top_n2b
diff --git a/src/stub/amd_d_nrv2e.S b/src/stub/amd_d_nrv2e.S
index c3129e7d..39f45b86 100644
--- a/src/stub/amd_d_nrv2e.S
+++ b/src/stub/amd_d_nrv2e.S
@@ -29,62 +29,6 @@
    <jreiser@users.sourceforge.net>
 */
 
-#include "amd_regs.h"
-
-M_NRV2B_LE32=2  # ../conf.h
-M_NRV2E_LE32=8
-
-#define ALIGN(n)  .align n
-
-/* Returns 0 on success; non-zero on failure. */
-decompress:  # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst, uint method)
-
-/* Arguments according to calling convention */
-#define src  %arg1
-#define lsrc %arg2
-#define dst  %arg3
-#define ldst %arg4  /* Out: actually a reference: &len_dst */
-#define meth %arg5l
-
-/* Working registers */
-#define off  %eax  /* XXX: 2GB */
-#define len  %ecx  /* XXX: 2GB */
-#define lenq %rcx
-#define bits %ebx
-#define disp %rbp
-
-	push %rbp; push %rbx
-	push ldst
-	push dst
-        addq src,lsrc; push lsrc  # &input_eof
-
-	movq src,%rsi  # hardware src for movsb, lodsb
-	movq dst,%rdi  # hardware dst for movsb
-	xorl bits,bits  # empty; force refill
-	xorl len,len  # create loop invariant
-	orq $~0,disp  # -1: initial displacement
-	call setup_rdx
-ra_setup_rdx:
-
-/* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
-/* Prediction omitted for now. */
-#define jnextb0n jnextb0y
-#define jnextb0y addl bits,bits; jnz 0f; call refill; 0: jnc
-#define jnextb1n jnextb1y
-#define jnextb1y addl bits,bits; jnz 0f; call refill; 0: jc
-
-/* rotate next bit into bottom bit of reg */
-#define getnextb(reg) addl bits,bits; jnz 0f; call refill; 0: adcl reg,reg
-
-	ALIGN(1<<3)
-getbit:
-	addl bits,bits; jz refill  # Carry= next bit
-	rep; ret
-refill:
-	movl (%rsi),bits; subq $-4,%rsi  # next 32 bits; set Carry
-	adcl bits,bits  # LSB= 1 (CarryIn); CarryOut= next bit
-	rep; ret
-
 	ALIGN(1<<3)
 lit_n2e:
 	movb (%rsi),%al; addq $1,%rsi
@@ -101,11 +45,11 @@ getoff_n2e:
         getnextb(off)
         jnextb0n off_n2e
 
-	/*xorl len,len  # len= 0*/
-	subl $3,off; jc offprev_n2e
-	shll $8,off
-	lodsb  # requires off===%eax
-	xorl $~0,off; jz eof_n2e
+	movzbl (%rsi),%edx  #; xorl len,len  # len= 0
+	subl $ 3,off; jc offprev_n2e
+	shll $ 8,off
+	orl %edx,off; incq %rsi
+	xorl $~0,off; jz eof
 	sarl off  # Carry= original low bit
 	movslq off,disp  # XXX: 2GB
 	jc lenlast_n2e
@@ -127,62 +71,6 @@ lenlast_n2e:
 gotlen_n2e:
 	cmpq $-0x500,disp
         adcl $2,len  # len += 2+ (disp < -0x500);
-	push %rsi
-	leaq (%rdi,disp),%rsi
-	rep; movsb  # len
-	pop %rsi
+	call copy
 bot_n2e:  # In: 0==len
-	prefetch 0x7f(%rdi)  # em64t has no prefetchw
         jmp top_n2e
-
-
-	ALIGN(1<<3)
-lit_n2b:
-	movb (%rsi),%al; addq $1,%rsi
-	movb %al,(%rdi); addq $1,%rdi
-top_n2b:
-	jnextb1y lit_n2b
-	lea 1(lenq),off  # [len= 0] off= 1
-offmore_n2b:
-	getnextb(off)
-	jnextb0n offmore_n2b
-
-	subl $3,off; jc len_n2b  # use previous offset
-	shll $8,off
-	lodsb  # requires off===%eax
-	xorl $~0,off; jz eof_n2b
-	movslq off,disp  # XXX: 2GB
-len_n2b:
-	lea 1(lenq),off  # [len= 0] off= 1
-	getnextb(len); getnextb(len)  # two bits; cc set on result
-	jnz gotlen_n2b  # raw 1,2,3 ==> 2,3,4
-	movl off,len  # len= 1, the msb
-	addl $3-1,off  # raw 2.. ==> 5..
-lenmore_n2b:
-	getnextb(len)
-	jnextb0n lenmore_n2b
-gotlen_n2b:
-	cmpq $-0xd00,disp
-	adcl off,len  # len += off + (disp < -0xd00)
-	push %rsi
-	leaq (%rdi,disp),%rsi
-	rep; movsb  # len
-	pop %rsi
-bot_n2b:  # In: 0==len
-	prefetch 0x7f(%rdi)  # em64t has no prefetchw
-        jmp top_n2b
-
-setup_rdx:
-	cld
-	pop %rdx; addq $ getbit - ra_setup_rdx,%rdx  # %rdx= &getbit
-	cmpl $ M_NRV2E_LE32,meth; je bot_n2e
-	cmpl $ M_NRV2B_LE32,meth; je bot_n2b
-eof_n2e:
-eof_n2b:
-	pop %rcx  # &input_eof
-	movq %rsi,%rax; subq %rcx,%rax  # src -= eof;  // return 0: good; else: bad
-	pop %rdx;       subq %rdx,%rdi  # dst -= original dst
-	pop %rcx;            movl %edi,(%rcx)  # actual length used at dst  XXX: 4GB
-	pop %rbx; pop %rbp
-	ret
-
diff --git a/src/stub/l_lx_elf64amd.S b/src/stub/l_lx_elf64amd.S
index d10a4c82..43c27729 100644
--- a/src/stub/l_lx_elf64amd.S
+++ b/src/stub/l_lx_elf64amd.S
@@ -29,6 +29,8 @@
 *  <jreiser@users.sourceforge.net>
 */
 
+#include "amd_regs.h"
+
 sz_l_info= 12
   l_lsize= 8
 
@@ -53,12 +55,92 @@ PAGE_SHIFT= 12
 PAGE_MASK= (~0<<PAGE_SHIFT)
 PAGE_SIZE= -PAGE_MASK
 
+M_NRV2B_LE32=2  # ../conf.h
+M_NRV2E_LE32=8
+
+#define ALIGN(n)  .align n
+
 /*__LEXEC000__*/
 _start: .globl _start
         call main  # push &decompress
-#include "amd_d_nrv2e.S"
 
-#include "amd_regs.h"
+/* Returns 0 on success; non-zero on failure.  The value is (final input pointer) - (src + lsrc); the output length is stored as 32 bits through ldst. */
+decompress:  # (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint method)
+
+/* Arguments according to calling convention */
+#define src  %arg1
+#define lsrc %arg2
+#define dst  %arg3
+#define ldst %arg4  /* Out: actually a reference: &len_dst */
+#define meth %arg5l
+
+/* Working registers */
+#define off  %eax  /* XXX: 2GB */
+#define len  %ecx  /* XXX: 2GB */
+#define lenq %rcx  /* 64-bit view of len, used as the base register in lea */
+#define bits %ebx  /* bit buffer; refilled 32 bits at a time from (%rsi) */
+#define disp %rbp  /* match displacement, sign-extended to 64 bits; starts at -1 */
+
+	push %rbp; push %rbx  # C callable: both are popped again at eof
+	push ldst  # popped at eof to store the output length
+	push dst  # original dst; subtracted from %rdi at eof
+        addq src,lsrc; push lsrc  # &input_eof
+
+	movq src,%rsi  # %rsi= next input byte (formerly the implicit operand of lodsb, movsb)
+	movq dst,%rdi  # %rdi= next output byte (formerly the implicit operand of movsb)
+	xorl bits,bits  # empty; force refill
+	xorl len,len  # loop invariant: 0==len at the top of each decoder loop
+	orq $~0,disp  # -1: initial displacement
+	jmp setup  # select the decoder; setup is placed after the included decoders
+
+/* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
+/* Prediction omitted for now. */
+#define jnextb0n jnextb0y
+#define jnextb0y addl bits,bits; jnz 0f; call refill; 0: jnc
+#define jnextb1n jnextb1y
+#define jnextb1y addl bits,bits; jnz 0f; call refill; 0: jc
+
+/* rotate next bit into bottom bit of reg */
+#define getnextb(reg) addl bits,bits; jnz 0f; call refill; 0: adcl reg,reg
+
+	ALIGN(1<<3)
+getbit:  # NOTE(review): no caller of getbit is visible in this patch now that setup_rdx is gone -- confirm it is still needed
+	addl bits,bits; jz refill  # Carry= next bit
+	rep; ret
+refill:  # called when shifting left made bits zero (only the sentinel was left)
+	movl (%rsi),bits; subq $-4,%rsi  # next 32 bits; set Carry (the subtract of -4 borrows; an add of 4 would not)
+	adcl bits,bits  # LSB= 1 (CarryIn) is the sentinel that later empties the buffer; CarryOut= next bit
+	rep; ret
+
+copy:  # In: len (must be nonzero), %rdi, disp;  Out: 0==len, %rdi advanced by the bytes copied, disp;  trashes %rax, %rdx
+	leaq (%rdi,disp),%rdx  # %rdx= source of the match: output pointer plus displacement
+	cmpl $ 3,len;  jbe copy1  # 3 bytes or fewer: go bytewise.  perhaps extend this to length 5 or less?
+	cmpq $-4,disp; ja  copy1  # disp is -1,-2,-3 (unsigned above -4): 4-byte chunks would overlap
+	subl $4,len  # bias by -4 so that the borrow from the loop subtract marks the last full chunk
+copy4:
+	movl (%rdx),%eax; leaq 4(%rdx),%rdx; subl $4,len  # load 4 bytes; lea advances without touching flags
+	movl %eax,(%rdi); leaq 4(%rdi),%rdi; jnc copy4  # store 4 bytes; the Carry tested is still the one from subl
+	addl $4,len; jz copy0  # remove the bias: 0..3 bytes remain; done if none
+copy1:
+	movb (%rdx), %al; leaq 1(%rdx),%rdx; subl $1,len  # load 1 byte; lea advances without touching flags
+	movb  %al,(%rdi); leaq 1(%rdi),%rdi; jnz copy1  # store 1 byte; loop until the subl result is 0
+copy0:
+	rep; ret  # 0==len on every path to here
+
+#include "amd_d_nrv2e.S"
+#include "amd_d_nrv2b.S"
+
+setup:  # reached by jmp from the end of the decompress prologue
+	cld  # direction flag= 0 for any string instructions used later
+	cmpl $ M_NRV2E_LE32,meth; je bot_n2e  # bot_n2e jumps straight to top_n2e
+	cmpl $ M_NRV2B_LE32,meth; je top_n2b
+eof:  # end-of-stream exit for the decoders; also the fall-through when the method matched neither constant
+	pop %rcx  # &input_eof
+	movq %rsi,%rax; subq %rcx,%rax  # src -= eof;  // return 0: good; else: bad
+	pop %rdx;       subq %rdx,%rdi  # dst -= original dst
+	pop %rcx;            movl %edi,(%rcx)  # actual length used at dst  XXX: 4GB
+	pop %rbx; pop %rbp  # restore in reverse order of the pushes at entry
+	ret
 
 /* Decompress the rest of this loader, and jump to it.
    Map a page to hold the decompressed bytes.  Logically this could
@@ -91,7 +173,7 @@ unfold:
 /* Load the addresses and lengths that ::pack3() patched in.
    XXX: 2GB  Note that  PUSH $imm32      sign-extends to 64 bits.
    XXX: 4GB  Note that  MOVL $imm32,reg  zero-extends to 64-bits.
-   If desired, then use an temporary register to extend the 2GB PUSH to 4GB.
+   (Use a temporary register to obtain 4GB range on PUSH constant.)
 */
         .byte   0x68; .ascii "JMPU"  # push $'JMPU'  # for unmap in fold
         .byte   0x68; .ascii "ADRU"  # push $'ADRU'  # for unmap in fold
@@ -129,7 +211,6 @@ main:
         call unfold  # push &b_info
         /* { b_info={sz_unc, sz_cpr, {4 char}}, folded_loader...} */
 
-eof:
 /*__XTHEENDX__*/
 
 /*