mirror of
https://github.com/upx/upx
synced 2025-09-28 19:06:07 +08:00

l_lx_elf64amd.S amd_d_nrv2b.S amd_d_nrv2e.S committer: jreiser <jreiser> 1132290120 +0000
247 lines
8.5 KiB
ArmAsm
247 lines
8.5 KiB
ArmAsm
/* l_lx_elf64amd.S -- Linux program entry point & decompressor (Elf binary)
|
|
*
|
|
* This file is part of the UPX executable compressor.
|
|
*
|
|
* Copyright (C) 1996-2004 Markus Franz Xaver Johannes Oberhumer
|
|
* Copyright (C) 1996-2004 Laszlo Molnar
|
|
* Copyright (C) 2000-2005 John F. Reiser
|
|
* All Rights Reserved.
|
|
*
|
|
* UPX and the UCL library are free software; you can redistribute them
|
|
* and/or modify them under the terms of the GNU General Public License as
|
|
* published by the Free Software Foundation; either version 2 of
|
|
* the License, or (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; see the file COPYING.
|
|
* If not, write to the Free Software Foundation, Inc.,
|
|
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*
|
|
* Markus F.X.J. Oberhumer Laszlo Molnar
|
|
* <mfx@users.sourceforge.net> <ml1050@users.sourceforge.net>
|
|
*
|
|
* John F. Reiser
|
|
* <jreiser@users.sourceforge.net>
|
|
*/
|
|
|
|
#include "amd_regs.h"
|
|
|
|
/* Sizes and field offsets of the headers that the packer stores in front
   of the compressed data.  For b_info the layout is {sz_unc, sz_cpr,
   {4 char}}; unfold reads those three 32-bit words in exactly this order.
   NOTE(review): l_lsize is presumably a field offset inside l_info --
   confirm against the packer's structure definitions.
*/
sz_l_info= 12
  l_lsize= 8

sz_p_info= 12

sz_b_info= 12
  sz_unc= 0
  sz_cpr= 4
  b_method= 8

/* Protection bits for the mmap() call made in unfold. */
PROT_READ=  1
PROT_WRITE= 2
PROT_EXEC=  4

/* Flag bits for the mmap() call made in unfold. */
MAP_PRIVATE= 2
MAP_FIXED=     0x10
MAP_ANONYMOUS= 0x20

SYS_mmap= 9  # 64-bit mode only!

/* Page geometry: PAGE_MASK clears the offset-within-page bits, and
   PAGE_SIZE is its two's complement negation, i.e. (1<<PAGE_SHIFT).
*/
PAGE_SHIFT= 12
PAGE_MASK= (~0<<PAGE_SHIFT)
PAGE_SIZE= -PAGE_MASK

/* Compression method codes compared against the 'method' argument
   of decompress (see setup).
*/
M_NRV2B_LE32=2  # ../conf.h
M_NRV2E_LE32=8

/* Alignment request, spelled as a macro so that every use goes through
   one place.
*/
#define ALIGN(n) .align n

/*__LEXEC000__*/
/* Program entry point.
   The CALL is used for its side effect only: it pushes the address of the
   instruction that follows it, which is the label decompress.  main pops
   that value into %rbp to obtain a run-time pointer to the decompressor,
   so control never returns here through a RET.
*/
_start: .globl _start
        call main  # push &decompress

/* Returns 0 on success; non-zero on failure. */
/* Common prologue of the decompressor.
   It saves the two callee-saved registers that are used as working
   registers, records the values that eof needs, loads the string
   registers, and then calls setup, which dispatches on the method.

   Stack after the pushes below (top first), as unwound by eof:
       &input_eof  (src + lsrc)
       dst         (original output pointer)
       ldst        (address that receives the output length)
       saved %rbx
       saved %rbp
   The return address pushed by "call setup" is not returned through;
   setup pops it into %r11.

   NOTE(review): %arg1..%arg5l come from amd_regs.h, which is not visible
   here; presumably they are the System V argument registers -- confirm.
*/
decompress:  # (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint method)

/* Arguments according to calling convention */
#define src  %arg1
#define lsrc %arg2
#define dst  %arg3
#define ldst %arg4  /* Out: actually a reference: &len_dst */
#define meth %arg5l

/* Working registers */
#define off  %eax  /* XXX: 2GB */
#define len  %ecx  /* XXX: 2GB */
#define lenq %rcx
#define bits %ebx
#define disp %rbp

        push %rbp; push %rbx  # C callable
        push ldst
        push dst
        addq src,lsrc; push lsrc  # &input_eof

        movq src,%rsi  # hardware src for movsb, lodsb
        movq dst,%rdi  # hardware dst for movsb
        xorl bits,bits  # empty; force refill
        xorl len,len  # create loop invariant
        orq $~0,disp  # -1: initial displacement
        call setup  # push &getbit [TUNED]
ra_setup:

/* AMD64 branch prediction is much worse if there are more than 3 branches
   per 16-byte block.  The jnextb would suffer unless inlined.  getnextb is OK
   using closed subroutine to save space, and should be OK on cycles because
   CALL+RET should be predicted.  getnextb could partially expand, using closed
   subroutine only for refill.
*/

/* How to use the jnextb* macros: each one expands to an inline bit fetch
   followed by a bare "jnc" (branch when the bit is 0) or "jc" (branch when
   the bit is 1); the invoking line supplies the branch target.  Every
   expansion defines the numeric local label 0, so a use site must not rely
   on another label 0 of its own across the expansion.
   The multi-line bodies depend on each backslash being the last character
   of its line.
*/

/* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */
/* Prediction omitted for now. */
/* On refill: prefetch next byte, for latency reduction on literals and offsets. */
#define jnextb0np jnextb0yp
#define jnextb0yp GETBITp; jnc
#define jnextb1np jnextb1yp
#define jnextb1yp GETBITp; jc
#define GETBITp \
        addl bits,bits; jnz 0f; \
        movl (%rsi),bits; subq $-4,%rsi; \
        adcl bits,bits; movb (%rsi),%dl; \
0:
/* Same, but without prefetch (not useful for length of match.) */
#define jnextb0n jnextb0y
#define jnextb0y GETBIT; jnc
#define jnextb1n jnextb1y
#define jnextb1y GETBIT; jc
#define GETBIT \
        addl bits,bits; jnz 0f; \
        movl (%rsi),bits; subq $-4,%rsi; \
        adcl bits,bits; \
0:

/* rotate next bit into bottom bit of reg */
/* The bit is obtained by an indirect call through %r11, which setup loads,
   and arrives in the Carry flag; adcl then shifts reg left by one and
   inserts that bit at the bottom.
*/
#define getnextbp(reg) call *%r11; adcl reg,reg
#define getnextb(reg) getnextbp(reg)

/* getbit -- closed subroutine that delivers the next input bit in Carry.
   In:    bits = unconsumed bits, most significant first, followed by a
                 single 1 bit that marks the end; 0 means empty.
          %rsi = address of the next 32-bit input word.
   Out:   Carry = next bit; bits and %rsi updated.
   Trash: %dl, but only on the refill path.

   Shifting left by one moves the next bit into Carry.  If the register
   becomes zero, the bit just shifted out was the end marker rather than
   data, so a new word is loaded.  "subq $-4" advances %rsi by 4 and,
   unlike "addq $4", leaves Carry set; the following adcl shifts that 1 in
   as the new end marker while shifting the first data bit out into Carry.

   NOTE(review): "rep; ret" is presumably the two-byte return form that
   AMD recommends for a RET that is a branch target or directly follows a
   conditional branch -- confirm.
*/
        ALIGN(1<<3)
getbit:
        addl bits,bits; jz refill  # Carry= next bit
        rep; ret
refill:
        movl (%rsi),bits; subq $-4,%rsi  # next 32 bits; set Carry
        adcl bits,bits  # LSB= 1 (CarryIn); CarryOut= next bit
        movb (%rsi),%dl  # speculate: literal, or bottom 8 bits of offset
        rep; ret

/* copy -- copy a match of len bytes from %rdi+disp to %rdi.
   disp is a negative displacement, so the source lies behind the output
   pointer and may overlap the bytes being written.

   Short matches (len <= 5) and close matches (disp is -1, -2 or -3, where
   a 4-byte load would read bytes not yet written) go through the
   byte-at-a-time loop copy1.  All others use the 4-byte loop copy4 and
   finish the remaining 0..3 bytes in copy1.

   copy4 termination: len is reduced by 4 before the loop, so the "subl"
   inside the loop borrows (sets Carry) on the iteration that copies the
   last whole 4-byte chunk.  The movl and leaq that follow it do not alter
   flags, so "jnc" still tests that subtraction.  "addl $4" then restores
   the true remainder, and the movb before "jz" likewise leaves its Zero
   flag intact.

   copy1 always holds the next source byte in %dl before storing it, so it
   reads one byte beyond the last byte that it copies.
*/
copy:  # In: len, %rdi, disp;  Out: 0==len, %rdi, disp;  trashes %rax, %rdx
        leaq (%rdi,disp),%rax; cmpl $5,len  # <=3 is forced
        movb (%rax),%dl; jbe copy1  # <=5 for better branch predict
        cmpq $-4,disp;   ja  copy1  # 4-byte chunks would overlap
        subl $4,len  # adjust for termination cases
copy4:
        movl (%rax),%edx; addq $4,      %rax; subl $4,len
        movl %edx,(%rdi); leaq  4(%rdi),%rdi; jnc copy4
        addl $4,len; movb (%rax),%dl; jz copy0
copy1:
        incq %rax; movb %dl,(%rdi); subl $1,len
                   movb (%rax),%dl
        leaq 1(%rdi),%rdi; jnz copy1
copy0:
        rep; ret

#include "amd_d_nrv2e.S"
|
|
#include "amd_d_nrv2b.S"
|
|
|
|
/* setup -- select the decompression algorithm.
   Reached only by the "call setup" at the end of the decompress prologue.
   The return address (ra_setup) is popped into %r11, the register through
   which getnextb makes its indirect call.  The adjustment that would turn
   it into &getbit is commented out.
   NOTE(review): this presumably relies on ra_setup leading straight into
   getbit, with nothing but ALIGN padding in between (the [TUNED] remark
   at the call site) -- confirm.

   top_n2e and top_n2b are not defined in this file's visible text;
   presumably they come from the two #include'd decompressors.
   A method that matches neither constant falls through into eof.
*/
setup:
        cld
        pop %r11  # addq $ getbit - ra_setup,%r11  # &getbit
        cmpl $ M_NRV2E_LE32,meth; je top_n2e
        cmpl $ M_NRV2B_LE32,meth; je top_n2b

/* eof -- common exit of decompress.
   Unwinds the values pushed by the prologue, in reverse order.
   Out:  %rax = (current input pointer) - &input_eof; 0 exactly when all
                of the input, and no more, was consumed.
         32-bit store through ldst = number of bytes written at dst.
   Restores %rbx and %rbp, then returns to the caller of decompress.
*/
eof:
        pop %rcx  # &input_eof
        movq %rsi,%rax; subq %rcx,%rax  # src -= eof;  // return 0: good; else: bad
        pop %rdx;       subq %rdx,%rdi  # dst -= original dst
        pop %rcx;            movl %edi,(%rcx)  # actual length used at dst  XXX: 4GB
        pop %rbx; pop %rbp
        ret

/* Decompress the rest of this loader, and jump to it.
   Map a page to hold the decompressed bytes.  Logically this could
   be done by setting .p_memsz for our first PT_LOAD.  But as of 2005-11-09,
   linux 2.6.14 only does ".bss expansion" on the PT_LOAD that describes the
   highest address.  [I regard this as a bug, and it makes the kernel's
   fs/binfmt_elf.c complicated, buggy, and insecure.]  For us, that is the 2nd
   PT_LOAD, which is the only way that linux allows to set the brk() for the
   uncompressed program.  [This is a significant kernel misfeature.]
*/
/* On entry: %rbp = &decompress (popped by main); top of stack = &b_info
   (the return address of main's "call unfold").  Does not return.

   The 4-character strings "ADRM", "LENM", "JMPU", "ADRU", "ADRC", "LENU",
   "CNTC", "ADRX" and "LENX" are placeholders for 32-bit immediates that
   are patched in at compress time.  Each is emitted as a raw opcode byte
   followed by the string, so the byte sequences below must stay exactly
   as written.
*/
unfold:
        pop %rbx  # &b_info

/* Get some pages.  If small, then get 1 page located just after the end
   of the first PT_LOAD of the compressed program.  This will still be below
   all of the uncompressed program.  If large (>=3MB compressed), then get enough
   to duplicate the entire compressed PT_LOAD, plus 1 page, located just after
   the brk() of the _un_compressed program.  The address and length are pre-
   calculated by PackLinuxElf64amd::pack3(), and patched in at compress time.
*/
        .byte 7+0xB8; .ascii "ADRM"  # movl $'ADRM',%edi  XXX: 4GB
        push $ PROT_READ | PROT_WRITE | PROT_EXEC; pop %arg3
        .byte 6+0xB8; .ascii "LENM"  # movl $'LENM',%esi  XXX: 4GB
        push $ MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS; pop %sys4
        subl %arg5l,%arg5l  #; subl %arg6l,%arg6l  # MAP_ANON ==> ignore offset
        push $ SYS_mmap; pop %rax
        syscall  # %rax= result; trashes %rcx,%r11 only
        /* Stop dead unless the kernel mapped exactly the requested address. */
        cmpl %eax,%edi; je 0f; hlt; 0:  # XXX: 4GB

/* Load the addresses and lengths that ::pack3() patched in.
   XXX: 2GB  Note that PUSH $imm32 sign-extends to 64 bits.
   XXX: 4GB  Note that MOVL $imm32,reg zero-extends to 64-bits.
   (Use a temporary register to obtain 4GB range on PUSH constant.)
*/
        .byte 0x68; .ascii "JMPU"  # push $'JMPU'  # for unmap in fold
        .byte 0x68; .ascii "ADRU"  # push $'ADRU'  # for unmap in fold
        .byte 6+0xB8; .ascii "ADRC"  # movl $'ADRC',%esi
        .byte 0x68; .ascii "LENU"  # push $'LENU'  # for unmap in fold
        .byte 1+0xB8; .ascii "CNTC"  # movl $'CNTC',%ecx
        .byte 0x68; .ascii "ADRX"  # push $'ADRX'  # for upx_main
        .byte 0x68; .ascii "LENX"  # push $'LENX'  # for upx_main

/* Move and relocate if compressed overlaps uncompressed.
   Move by 0 when total compressed executable is < 3MB.
*/
        movl %edi,%edx  # ADRM
        subl %esi,%edx  # (ADRM - ADRC) == relocation amount
        addl %edx,%ebp  # update &decompress
        addl %edx,%ebx  # update &b_info

        /* Copy CNTC quadwords from ADRC to ADRM.  Afterwards %edi points
           just past the copy and %eax still holds ADRM from the mmap
           result, so the exchange leaves: %eax = end of the copy,
           %edi = ADRM.
        */
        cld
        rep; movsq
        xchgl %eax,%edi

/* Decompress the folded part of this stub, then execute it. */
/* The sequence below arranges a call of
       decompress(src, sz_cpr, dst, &len_dst, b_method)
   where dst is the address just past the moved copy.  That same address
   is pushed as the return address, so the RET at the end of decompress
   enters the freshly unfolded code.  Each lodsl reads one 32-bit b_info
   field through %rsi and advances it: sz_unc (discarded), sz_cpr, then
   the word whose low byte is b_method.  &len_dst is pointed at the sz_cpr
   field itself because the reported length is not needed.  After the
   third lodsl %esi addresses the compressed data that follows b_info.
*/
        movl %ebx,%esi  # %arg2l= &b_info (relocated)
        push %rax  # ret_addr after decompression
        xchgl %eax,%arg3l  # %arg3= dst for unfolding  XXX: 4GB
        lodsl; movl %esi,%arg4l  # &len_dst ==> &do_not_care  XXX: 4GB
        lodsl; xchgl %eax,%arg1l  # sz_cpr  XXX: 4GB
        lodsl; movzbl %al,%arg5l  # b_method
        xchg %arg1l,%arg2l  # XXX: 4GB
        jmp *%rbp  # goto decompress; return to unfolded loader

/* main -- reached only through the "call main" at _start.
   Pops the return address of that call, which is the address of
   decompress, into %rbp.  It then uses the same trick for the data that
   follows: "call unfold" pushes the address of the bytes right after the
   CALL, where the packer places b_info and the compressed (folded) part
   of the loader.  Nothing returns here, so no code follows the CALL.
*/
main:
# int3 # uncomment for debugging
        pop %rbp  # &decompress
        call unfold  # push &b_info
        /* { b_info={sz_unc, sz_cpr, {4 char}}, folded_loader...} */

/*__XTHEENDX__*/

/*
|
|
vi:ts=8:et:nowrap
|
|
*/
|
|
|