/* l_lx_elf64amd.S -- Linux program entry point & decompressor (Elf binary)
*
* This file is part of the UPX executable compressor.
*
* Copyright (C) 1996-2004 Markus Franz Xaver Johannes Oberhumer
* Copyright (C) 1996-2004 Laszlo Molnar
* Copyright (C) 2000-2005 John F. Reiser
* All Rights Reserved.
*
* UPX and the UCL library are free software; you can redistribute them
* and/or modify them under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING.
* If not, write to the Free Software Foundation, Inc.,
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Markus F.X.J. Oberhumer Laszlo Molnar
* <mfx@users.sourceforge.net> <ml1050@users.sourceforge.net>
*
* John F. Reiser
* <jreiser@users.sourceforge.net>
*/
#include "amd_regs.h"
sz_l_info= 12
l_lsize= 8
sz_p_info= 12
sz_b_info= 12
sz_unc= 0
sz_cpr= 4
b_method= 8
PROT_READ= 1
PROT_WRITE= 2
PROT_EXEC= 4
MAP_PRIVATE= 2
MAP_FIXED= 0x10
MAP_ANONYMOUS= 0x20
SYS_mmap= 9 # 64-bit mode only!
PAGE_SHIFT= 12
PAGE_MASK= (~0<<PAGE_SHIFT)
PAGE_SIZE= -PAGE_MASK
M_NRV2B_LE32=2 # ../conf.h
M_NRV2E_LE32=8
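/* These offsets describe the 12-byte header that the packer writes in front of
each compressed block.  A C-level sketch of the only header this stub actually
reads (illustrative; field names past b_method are taken from the packer
sources, see conf.h for the authoritative definitions):
    struct b_info {
        u32 sz_unc;     // uncompressed size   (offset sz_unc  = 0)
        u32 sz_cpr;     // compressed size     (offset sz_cpr  = 4)
        unsigned char b_method, b_ftid, b_cto8, unused;   // (offset b_method = 8)
    };
*/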
#define ALIGN(n) .align n
/*__LEXEC000__*/
_start: .globl _start
call main # push &decompress
/* Returns 0 on success; non-zero on failure. */
decompress: # (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint method)
/* Arguments according to calling convention */
#define src %arg1
#define lsrc %arg2
#define dst %arg3
#define ldst %arg4 /* Out: actually a reference: &len_dst */
#define meth %arg5l
/* Working registers */
#define off %eax /* XXX: 2GB */
#define len %ecx /* XXX: 2GB */
#define lenq %rcx
#define bits %ebx
#define disp %rbp
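/* A C-level view of the contract (a sketch only; the real callers live in the
packer sources):
    int decompress(uchar const *src, size_t lsrc, uchar *dst,
                   u32 *ldst, unsigned method);
Returns 0 when exactly lsrc input bytes were consumed, non-zero otherwise;
*ldst receives the number of bytes actually written at dst (see eof: below).
*/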
push %rbp; push %rbx # C callable
push ldst
push dst
addq src,lsrc; push lsrc # &input_eof
movq src,%rsi # hardware src for movsb, lodsb
movq dst,%rdi # hardware dst for movsb
xorl bits,bits # empty; force refill
xorl len,len # create loop invariant
orq $~0,disp # -1: initial displacement
call setup # push &getbit [TUNED]
ra_setup:
/* AMD64 branch prediction is much worse if there are more than 3 branches
per 16-byte block. The jnextb would suffer unless inlined. getnextb is OK
using closed subroutine to save space, and should be OK on cycles because
CALL+RET should be predicted. getnextb could partially expand, using closed
subroutine only for refill.
*/
/* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
/* Prediction omitted for now. */
/* On refill: prefetch next byte, for latency reduction on literals and offsets. */
#define jnextb0np jnextb0yp
#define jnextb0yp GETBITp; jnc
#define jnextb1np jnextb1yp
#define jnextb1yp GETBITp; jc
#define GETBITp \
addl bits,bits; jnz 0f; \
movl (%rsi),bits; subq $-4,%rsi; \
adcl bits,bits; movb (%rsi),%dl; \
0:
/* Same, but without the prefetch (prefetching does not help when decoding the length of a match). */
#define jnextb0n jnextb0y
#define jnextb0y GETBIT; jnc
#define jnextb1n jnextb1y
#define jnextb1y GETBIT; jc
#define GETBIT \
addl bits,bits; jnz 0f; \
movl (%rsi),bits; subq $-4,%rsi; \
adcl bits,bits; \
0:
/* rotate next bit into bottom bit of reg */
#define getnextbp(reg) call *%r11; adcl reg,reg
#define getnextb(reg) getnextbp(reg)
ALIGN(1<<3)
getbit:
addl bits,bits; jz refill # Carry= next bit
rep; ret
refill:
movl (%rsi),bits; subq $-4,%rsi # next 32 bits; set Carry
adcl bits,bits # LSB= 1 (CarryIn); CarryOut= next bit
movb (%rsi),%dl # speculate: literal, or bottom 8 bits of offset
rep; ret
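/* What getbit/refill compute, in C terms (a sketch; the carry flag is modeled
explicitly, and 'bits' carries a sentinel 1 so that reaching zero means "refill"):
    unsigned getbit(void) {
        unsigned bit = bits >> 31;
        bits <<= 1;
        if (bits == 0) {                     // only the sentinel was left: refill
            u32 w = *(u32 const *)src; src += 4;   // little-endian 32-bit load
            bit  = w >> 31;
            bits = (w << 1) | 1;             // plant the new sentinel at the bottom
        }
        return bit;                          // bits are consumed MSB-first
    }
*/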
copy: # In: len, %rdi, disp; Out: 0==len, %rdi, disp; trashes %rax, %rdx
leaq (%rdi,disp),%rax; cmpl $5,len # <=3 is forced
movb (%rax),%dl; jbe copy1 # <=5 for better branch predict
cmpq $-4,disp; ja copy1 # 4-byte chunks would overlap
subl $4,len # adjust for termination cases
copy4:
movl (%rax),%edx; addq $4, %rax; subl $4,len
movl %edx,(%rdi); leaq 4(%rdi),%rdi; jnc copy4
addl $4,len; movb (%rax),%dl; jz copy0
copy1:
incq %rax; movb %dl,(%rdi); subl $1,len
movb (%rax),%dl
leaq 1(%rdi),%rdi; jnz copy1
copy0:
rep; ret
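/* The LZ match copy, in C terms (a sketch): copy len bytes of already-written
output from dst+disp (disp < 0) back to dst.  The 4-byte chunk loop is used only
when len is large enough and |disp| >= 4; the byte loop handles overlapping
copies (e.g. disp == -1 replicates the previous byte len times):
    unsigned char *from = dst + disp;
    while (len--) *dst++ = *from++;
*/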
#include "amd_d_nrv2e.S"
#include "amd_d_nrv2b.S"
setup:
cld
pop %r11 # addq $ getbit - ra_setup,%r11 # &getbit
cmpl $ M_NRV2E_LE32,meth; je top_n2e
cmpl $ M_NRV2B_LE32,meth; je top_n2b
eof:
pop %rcx # &input_eof
movq %rsi,%rax; subq %rcx,%rax # src -= eof; // return 0: good; else: bad
pop %rdx; subq %rdx,%rdi # dst -= original dst
pop %rcx; movl %edi,(%rcx) # actual length used at dst XXX: 4GB
pop %rbx; pop %rbp
ret
/* Decompress the rest of this loader, and jump to it.
Map a page to hold the decompressed bytes. Logically this could
be done by setting .p_memsz for our first PT_LOAD. But as of 2005-11-09,
linux 2.6.14 only does ".bss expansion" on the PT_LOAD that describes the
highest address. [I regard this as a bug, and it makes the kernel's
fs/binfmt_elf.c complicated, buggy, and insecure.] For us, that is the 2nd
PT_LOAD, which is the only way that linux allows to set the brk() for the
uncompressed program. [This is a significant kernel misfeature.]
*/
unfold:
pop %rbx # &b_info
/* Get some pages. If small, then get 1 page located just after the end
of the first PT_LOAD of the compressed program. This will still be below
all of the uncompressed program. If large (>=3MB compressed), then get enough
to duplicate the entire compressed PT_LOAD, plus 1 page, located just after
the brk() of the _un_compressed program. The address and length are pre-
calculated by PackLinuxElf64amd::pack3(), and patched in at compress time.
*/
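/* The ".byte 0xB8+reg; .ascii "XXXX"" idiom below hand-assembles
"movl $imm32,%reg" with the 4-byte immediate spelled as a 4-character
placeholder (e.g. 7+0xB8 == 0xBF == movl $imm32,%edi); 0x68 is "push $imm32".
The placeholders ADRM, LENM, ADRC, etc. are the patch slots that
PackLinuxElf64amd::pack3() fills in at compress time.
*/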
.byte 7+0xB8; .ascii "ADRM" # movl $'ADRM',%edi XXX: 4GB
push $ PROT_READ | PROT_WRITE | PROT_EXEC; pop %arg3
.byte 6+0xB8; .ascii "LENM" # movl $'LENM',%esi XXX: 4GB
push $ MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS; pop %sys4
subl %arg5l,%arg5l #; subl %arg6l,%arg6l # MAP_ANON ==> ignore offset
push $ SYS_mmap; pop %rax
syscall # %rax= result; trashes %rcx,%r11 only
cmpl %eax,%edi; je 0f; hlt; 0: # XXX: 4GB
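/* Roughly, in C (ADRM and LENM stand for the patched-in values; die() is
illustrative):
    void *p = mmap((void *)ADRM, LENM, PROT_READ|PROT_WRITE|PROT_EXEC,
                   MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, 0, 0);
    if ((unsigned long)p != ADRM) die();   // hlt: fault on purpose
The fd and offset arguments are ignored because of MAP_ANONYMOUS.
*/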
/* Load the addresses and lengths that ::pack3() patched in.
XXX: 2GB Note that PUSH $imm32 sign-extends to 64 bits.
XXX: 4GB Note that MOVL $imm32,reg zero-extends to 64 bits.
(Use a temporary register to obtain 4GB range on a PUSH constant.)
*/
.byte 0x68; .ascii "JMPU" # push $'JMPU' # for unmap in fold
.byte 0x68; .ascii "ADRU" # push $'ADRU' # for unmap in fold
.byte 6+0xB8; .ascii "ADRC" # movl $'ADRC',%esi
.byte 0x68; .ascii "LENU" # push $'LENU' # for unmap in fold
.byte 1+0xB8; .ascii "CNTC" # movl $'CNTC',%ecx
.byte 0x68; .ascii "ADRX" # push $'ADRX' # for upx_main
.byte 0x68; .ascii "LENX" # push $'LENX' # for upx_main
/* Move and relocate if compressed overlaps uncompressed.
Move by 0 when total compressed executable is < 3MB.
*/
movl %edi,%edx # ADRM
subl %esi,%edx # (ADRM - ADRC) == relocation amount
addl %edx,%ebp # update &decompress
addl %edx,%ebx # update &b_info
cld
rep; movsq
xchgl %eax,%edi
/* Decompress the folded part of this stub, then execute it. */
movl %ebx,%esi # %arg2l= &b_info (relocated)
push %rax # ret_addr after decompression
xchgl %eax,%arg3l # %arg3= dst for unfolding XXX: 4GB
lodsl; movl %esi,%arg4l # &len_dst ==> &do_not_care XXX: 4GB
lodsl; xchgl %eax,%arg1l # sz_cpr XXX: 4GB
lodsl; movzbl %al,%arg5l # b_method
xchg %arg1l,%arg2l # XXX: 4GB
jmp *%rbp # goto decompress; return to unfolded loader
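/* The tail of unfold, roughly in C (identifiers are illustrative; ADRM, ADRC
and CNTC are patch slots, and CNTC == 0 when no move is needed):
    long reloc = ADRM - ADRC;                           // relocation amount
    memcpy((void *)ADRM, (void const *)ADRC, 8*CNTC);   // rep movsq
    b_info     = (void *)(reloc + (char *)b_info);      // %rbx: relocate &b_info
    decompress = (void *)(reloc + (char *)decompress);  // %rbp: relocate, too
    uchar *dst = (uchar *)ADRM + 8*CNTC;                // destination for the folded part
    u32 dontcare;
    decompress((uchar const *)(1 + b_info), b_info->sz_cpr, dst, &dontcare,
               b_info->b_method);
    goto *dst;                                          // enter the freshly unfolded loader
*/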
main:
# int3 # uncomment for debugging
pop %rbp # &decompress
call unfold # push &b_info
/* { b_info={sz_unc, sz_cpr, {4 char}}, folded_loader...} */
/*__XTHEENDX__*/
/*
vi:ts=8:et:nowrap
*/