diff --git a/src/p_lx_elf.cpp b/src/p_lx_elf.cpp index b6bf950b..08fe4b06 100644 --- a/src/p_lx_elf.cpp +++ b/src/p_lx_elf.cpp @@ -190,13 +190,11 @@ int const * PackLinuxElf64amd::getCompressionMethods(int method, int level) const { // No real dependency on LE32. - static const int m_nrv2e[] = { M_NRV2E_LE32, -1 }; - static const int m_nrv2b[] = { M_NRV2B_LE32, -1 }; + static const int l_method[] = { M_NRV2E_LE32, M_NRV2B_LE32, -1 }; /*return Packer::getDefaultCompressionMethods_le32(method, level);*/ - // 2005-04-23 FIXME: stub/l_lx_elfppc32.S hardwires ppc_d_nrv2e.S - UNUSED(method); UNUSED(level); UNUSED(m_nrv2b); - return m_nrv2e; + UNUSED(method); UNUSED(level); + return l_method; } int const * @@ -887,16 +885,34 @@ void PackLinuxElf32ppc::pack3(OutputFile *fo, Filter &ft) void PackLinuxElf64amd::pack3(OutputFile *fo, Filter &ft) { - unsigned disp; // 32 bits wide - unsigned const zero = 0; - unsigned len = fo->getBytesWritten(); - fo->write(&zero, 3& -len); // align to 0 mod 4 - len += (3& -len) + sizeof(disp); + char zero[-(~0<<4)]; + unsigned const hlen = sz_elf_hdrs + sizeof(l_info) + sizeof(p_info); + unsigned const len0 = fo->getBytesWritten(); + unsigned len = len0; + unsigned const frag = ~(~0<<4) & -len; // align to 0 mod 16 + memset(zero, 0, sizeof(zero)); + fo->write(&zero, frag); + len += frag; - // 5: sizeof(CALL instruction at _start which precedes f_decompress - set_native32(&disp, 5+ len - sz_elf_hdrs); +#define PAGE_MASK (~0u<<12) +#define PAGE_SIZE (-PAGE_MASK) + acc_uint64l_t const brk = getbrk(phdri, ehdri.e_phnum); + upx_byte *const p = const_cast(getLoader()); + lsize = getLoaderSize(); + // patch in order of descending address - fo->write(&disp, sizeof(disp)); + // compressed input for eXpansion + patch_le32(p,lsize,"LENX", len0 - hlen); + patch_le32(p,lsize,"ADRX", elfout.phdr[0].p_vaddr + hlen); + + patch_le32(p,lsize,"CNTC", 0>>3); // count for copy + patch_le32(p,lsize,"LENU", PAGE_SIZE + len); // len for unmap + patch_le32(p,lsize,"ADRC", PAGE_MASK & (~PAGE_MASK + brk)); // addr for copy + patch_le32(p,lsize,"ADRU", elfout.phdr[0].p_vaddr); // addr for unmap + patch_le32(p,lsize,"JMPU", 12+0x400000); // XXX trampoline for unmap + patch_le32(p,lsize,"LENM", PAGE_SIZE); // len for map + patch_le32(p,lsize,"ADRM", PAGE_MASK & (~PAGE_MASK + brk)); // addr for map +#undef PAGE_MASK super::pack3(fo, ft); } diff --git a/src/stub/a_lx_elf64.c b/src/stub/a_lx_elf64.c index 22255f3e..df2d70c6 100644 --- a/src/stub/a_lx_elf64.c +++ b/src/stub/a_lx_elf64.c @@ -161,14 +161,7 @@ ERR_LAB } } -// Create (or find) an escape hatch to use when munmapping ourselves the stub. -// Called by do_xmap to create it, and by assembler code to find it. -static void * -make_hatch(Elf64_Phdr const *const phdr) -{ - return 0; -} - +#if 0 /*{*/ static void upx_bzero(char *p, size_t len) { @@ -177,12 +170,12 @@ upx_bzero(char *p, size_t len) } while (--len); } #define bzero upx_bzero - +#endif /*}*/ static void auxv_up(Elf64_auxv_t *av, unsigned const type, uint64_t const value) { - if (av && 0==(1&(uint64_t)av)) /* PT_INTERP usually inhibits, except for hatch */ + if (av) for (;; ++av) { if (av->a_type==type || (av->a_type==AT_IGNORE && type!=AT_NULL)) { av->a_type = type; @@ -259,13 +252,11 @@ do_xmap( unsigned const prot = PF_TO_PROT(phdr->p_flags); Extent xo; size_t mlen = xo.size = phdr->p_filesz; - char *addr = xo.buf = (char *)phdr->p_vaddr; + char *addr = xo.buf = reloc + (char *)phdr->p_vaddr; char *haddr = phdr->p_memsz + addr; size_t frag = (long)addr &~ PAGE_MASK; mlen += frag; addr -= frag; - addr += reloc; - haddr += reloc; if (addr != mmap(addr, mlen, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_PRIVATE | (xi ? MAP_ANONYMOUS : 0), @@ -278,6 +269,10 @@ do_xmap( bzero(addr, frag); // fragment at lo end frag = (-mlen) &~ PAGE_MASK; // distance to next page boundary bzero(mlen+addr, frag); // fragment at hi end + if (xi && 0==phdr->p_offset) { + Elf64_Ehdr *const ehdr = (Elf64_Ehdr *)addr; + *(int *)&ehdr->e_ident[12] = 0x90c3050f; // syscall; ret; nop + } if (0!=mprotect(addr, mlen, prot)) { err_exit(10); ERR_LAB @@ -308,27 +303,25 @@ ERR_LAB void * upx_main( // returns entry address - struct l_info const *const li, + struct b_info const *const bi, // 1st block header size_t const sz_compressed, // total length Elf64_Ehdr *const ehdr, // temp char[sz_ehdr] for decompressing - size_t const sz_ehdr, + Elf64_auxv_t *const av, f_expand *const f_decompress, - f_unfilter *const f_unf, - Elf64_auxv_t *const av + f_unfilter *const f_unf ) { Elf64_Phdr const *phdr = (Elf64_Phdr const *)(1+ ehdr); Elf64_Addr entry; - Extent xi, xo, xi0; - xi.buf = (char *)(1+ (struct p_info const *)(1+ li)); // &b_info - xi.size = sz_compressed - (sizeof(struct l_info) + sizeof(struct p_info)); + Extent xo, xi1, xi2; xo.buf = (char *)ehdr; - xo.size = ((struct b_info const *)xi.buf)->sz_unc; - xi0 = xi; + xo.size = bi->sz_unc; + xi2.buf = (char *)bi; xi2.size = sz_compressed; + xi1.buf = (char *)bi; xi1.size = sz_compressed; // ehdr = Uncompress Ehdr and Phdrs - unpackExtent(&xi, &xo, f_decompress, 0); // never filtered? + unpackExtent(&xi2, &xo, f_decompress, 0); // never filtered? // AT_PHDR.a_un.a_val is set again by do_xmap if PT_PHDR is present. auxv_up(av, AT_PHDR , (unsigned long)(1+(Elf64_Ehdr *)phdr->p_vaddr)); @@ -337,7 +330,7 @@ upx_main( // returns entry address //auxv_up(av, AT_PHENT , ehdr->e_phentsize); /* this can never change */ //auxv_up(av, AT_PAGESZ, PAGE_SIZE); /* ld-linux.so.2 does not need this */ - entry = do_xmap(ehdr, &xi0, 0, av, f_decompress, f_unf); + entry = do_xmap(ehdr, &xi1, 0, av, f_decompress, f_unf); // "rewind" { // Map PT_INTERP program interpreter int j; diff --git a/src/stub/amd_d_nrv2e.S b/src/stub/amd_d_nrv2e.S index eeffef6f..bf6afb98 100644 --- a/src/stub/amd_d_nrv2e.S +++ b/src/stub/amd_d_nrv2e.S @@ -34,7 +34,7 @@ M_NRV2B_LE32=2 # ../conf.h M_NRV2E_LE32=8 -SZ_DLINE=128 # size of data cache line in Apple G5 +#define ALIGN(n) .align n /* Returns 0 on success; non-zero on failure. */ decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst, uint method) @@ -49,6 +49,7 @@ decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst, uint me /* Working registers */ #define off %eax /* XXX: 2GB */ #define len %ecx /* XXX: 2GB */ +#define lenq %rcx #define bits %ebx #define disp %rbp @@ -57,11 +58,12 @@ decompress: # (uchar const *src, size_t lsrc, uchar *dst, size_t &ldst, uint me push dst addq src,lsrc; push lsrc # &input_eof - movq src,%rsi # hardware src - movl $1<<31,%ebx # force refill - movq dst,%rdi # hardware dst + movq src,%rsi # hardware src for movsb, lodsb + movq dst,%rdi # hardware dst for movsb + subl bits,bits # empty; force refill + subl len,len # create loop invariant orq $~0,disp # -1: initial displacement - call setup_rdx_n2e + call setup_rdx ra_setup_rdx: /* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */ @@ -74,24 +76,23 @@ ra_setup_rdx: /* rotate next bit into bottom bit of reg */ #define getnextb(reg) call *%rdx; adcl reg,reg - /*.align 1<<4 # not effective unless upx pays attention */ -get_refill_n2e: # In: 1==Carry - movl (%rsi),bits; leaq 4(%rsi),%rsi # next 32 bits + ALIGN(1<<3) +getbit: + addl bits,bits; jz refill # Carry= next bit + ret +refill: + movl (%rsi),bits; subq $-4,%rsi # next 32 bits; set Carry adcl bits,bits # LSB= 1 (CarryIn); CarryOut= next bit ret -getbit: - addl bits,bits; jz get_refill_n2e # Carry= next bit - ret + ALIGN(1<<3) lit_n2e: movsb # *%rdi++ = *%rsi++; top_n2e: - call *%rdx - jc lit_n2e - movl $1,off + jnextb1y lit_n2e + lea 1(lenq),off # [len= 0] off= 1 jmp getoff_n2e - /*.align 1<<4 # not effective unless upx pays attention */ off_n2e: dec off getnextb(off) @@ -99,20 +100,20 @@ getoff_n2e: getnextb(off) jnextb0n off_n2e - xorl len,len # len= 0 + /*xorl len,len # len= 0*/ subl $3,off; jc offprev_n2e shll $8,off lodsb # requires off===%eax xorl $~0,off; jz eof_n2e sarl off # Carry= original low bit - movslq off,disp + movslq off,disp # XXX: 2GB jc lenlast_n2e jmp lenmore_n2e offprev_n2e: jnextb1y lenlast_n2e lenmore_n2e: - movl $1,len + incl len # len= 1 jnextb1y lenlast_n2e len_n2e: getnextb(len) @@ -127,22 +128,58 @@ gotlen_n2e: adcl $2,len # len += 2+ (disp < -0x500); push %rsi leaq (%rdi,disp),%rsi - rep; movsb + rep; movsb # len pop %rsi -bot_n2e: - movb 2*SZ_DLINE(%rdi),%al # prefetch for store +bot_n2e: # In: 0==len + prefetch 0x7f(%rdi) # em64t has no prefetchw jmp top_n2e -setup_rdx_n2e: - pop %rdx; addq $ getbit - ra_setup_rdx,%rdx - cmpl $ M_NRV2E_LE32,meth; jne eof_n2e - jmp bot_n2e + ALIGN(1<<3) +lit_n2b: + movsb # *%rdi++ = %rsi++; +top_n2b: + jnextb1y lit_n2b + lea 1(lenq),off # [len= 0] off= 1 +offmore_n2b: + getnextb(off) + jnextb0n offmore_n2b + + subl $3,off; jc len_n2b # use previous offset + shll $8,off + lodsb # requires off===%eax + xorl $~0,off; jz eof_n2b + movslq off,disp # XXX: 2GB +len_n2b: + lea 1(lenq),off # [len= 0] off= 1 + getnextb(len); getnextb(len) # two bits; cc set on result + jnz gotlen_n2b # raw 1,2,3 ==> 2,3,4 + movl off,len # len= 1, the msb + addl $3-1,off # raw 2.. ==> 5.. +lenmore_n2b: + getnextb(len) + jnextb0n lenmore_n2b +gotlen_n2b: + cmpq $-0xd00,disp + adcl off,len # len += off + (disp < -0xd00) + push %rsi + leaq (%rdi,disp),%rsi + rep; movsb # len + pop %rsi +bot_n2b: # In: 0==len + prefetch 0x7f(%rdi) # em64t has no prefetchw + jmp top_n2b + +setup_rdx: + pop %rdx; addq $ getbit - ra_setup_rdx,%rdx # %rdx= &getbit + cmpl $ M_NRV2E_LE32,meth; je bot_n2e + cmpl $ M_NRV2B_LE32,meth; je bot_n2b eof_n2e: +eof_n2b: pop %rcx # &input_eof movq %rsi,%rax; subq %rcx,%rax # src -= eof; // return 0: good; else: bad - pop %rdx; subq %rdx,%rdi # dst -= original dst - pop %rcx; movl %edi,(%rcx) # actual length used at dst XXX: 4GB + pop %rdx; subq %rdx,%rdi # dst -= original dst + pop %rcx; movl %edi,(%rcx) # actual length used at dst XXX: 4GB pop %rbx; pop %rbp ret diff --git a/src/stub/fold_elf64amd.S b/src/stub/fold_elf64amd.S index 4adb0072..3c5682a9 100644 --- a/src/stub/fold_elf64amd.S +++ b/src/stub/fold_elf64amd.S @@ -41,46 +41,6 @@ sz_p_info= 12 OVERHEAD=2048 MAX_ELF_HDR=1024 -__NR_munmap= 91 - -/* In: - %rbp= &decompress; also 9+ (char *)&(offset to {l_info; p_info; b_info}) - %rsp= &{argc,argv...,0,env...,0,auxv...,strings} -*/ -fold_begin: - call L90 # ret_addr is &f_unfilter -#include "amd_bxx.S" - -zfind: - lodsq; testq %rax,%rax; jnz zfind - ret -L90: - pop %arg6 # &amdbxx: f_unfilter - movq %rsp,%rsi # stack pointer at execve - call zfind # %rsi= &env - call zfind # %rsi= &Elf64_auxv - subq $ OVERHEAD,%rsp - movq %rsp,%arg3 # &ELf64_Ehdr temporary space - push %rsi # arg7 - movq $ PAGE_MASK,%rcx # sign extend 32 bits to 64 bits - movl -9(%rbp),%arg2l # total size - offset to {l_info; p_info; b_info} - movq %arg6,%r15; andq %rcx,%r15 # %r15= &this_page - movq %rbp,%arg5 # &decompress: f_expand - movq %rbp,%arg1; subq %arg2,%arg1 # &l_info - movq %arg1,%r14; andq %rcx,%r14 # %r14= our_Elf64_Ehdr - movl sz_unc+sz_p_info+sz_l_info(%arg1),%arg4l # sz_elf_headers - call upx_main # Out: %rax= entry -/* entry= upx_main(l_info *arg1, total_size arg2, Elf64_Ehdr *arg3, sz_ehdr arg4, - f_decompr arg5, f_unfilter arg6, Elf32_Auxv_t *arg7 ) -*/ - addq $8+OVERHEAD,%rsp - push %rax # save &entry - movq %r14,%arg1 # &our_Elf64_Ehdr - movq %r15,%arg2 - subq %r14,%arg2 # size - call munmap # unmap compressed program; /proc/self/exe disappears - ret - /* 64-bit mode only! */ __NR_read= 0 __NR_open= 2 @@ -93,9 +53,51 @@ __NR_brk= 12 __NR_exit= 60 +/* In: + %rbp= &decompress + %rsp= &{LENX,ADRX,LENU,ADRU,JMPU,argc,argv...,0,env...,0,auxv...,0...,strings} +*/ +fold_begin: + call L90 # push &f_unfilter +#include "amd_bxx.S" +L90: + lea (1+5+1)*8(%rsp),%rsi # &argv[0] +0: + lodsq; testq %rax,%rax; jnz 0b # %rsi= &env; +0: + lodsq; testq %rax,%rax; jnz 0b # %rsi= &Elf64_auxv + + pop %arg6 # &amdbxx: f_unfilter + movq %rsi,%arg4 # &Elf64_auxv + pop %arg2 # LENX + pop %arg1 # ADRX + + subq $ OVERHEAD,%rsp + movq %rsp,%arg3 # &ELf64_Ehdr temporary space + movq %rbp,%arg5 # &decompress: f_expand + call upx_main # Out: %rax= entry +/* entry= upx_main(b_info *arg1, total_size arg2, Elf64_Ehdr *arg3, + Elf32_Auxv_t *arg4, f_decompr arg5, f_unfilter arg6 ) +*/ + addq $OVERHEAD,%rsp + pop %arg2 # LENU + pop %arg1 # ADRU + pop %rcx # JMPU + push %rax # &entry + push $ __NR_munmap; pop %rax + jmp *%rcx # goto: syscall; ret + +munmap: .globl munmap + movb $ __NR_munmap,%al; jmp sysgo +mprotect: .globl mprotect + movb $ __NR_mprotect,%al; jmp sysgo +brk: .globl brk + movb $ __NR_brk,%al; jmp sysgo + mmap: .globl mmap - movq %arg4,%sys4 movb $ __NR_mmap,%al +sysarg4: + movq %arg4,%sys4 sysgo: # NOTE: kernel demands 4th arg in %sys4, NOT %arg4 movzbl %al,%eax syscall @@ -111,13 +113,6 @@ open: .globl open close: .globl close movb $ __NR_close,%al; jmp sysgo -mprotect: .globl mprotect - movb $ __NR_mprotect,%al; jmp sysgo -munmap: .globl munmap - movb $ __NR_munmap,%al; jmp sysgo -brk: .globl brk - movb $ __NR_brk,%al; jmp sysgo - exit: .globl exit movb $ __NR_exit,%al; jmp sysgo diff --git a/src/stub/l_lx_elf64amd.S b/src/stub/l_lx_elf64amd.S index 3bae52c0..2ccd524a 100644 --- a/src/stub/l_lx_elf64amd.S +++ b/src/stub/l_lx_elf64amd.S @@ -29,13 +29,10 @@ * */ -/*__LEXEC000__*/ -_start: .globl _start -/* The following 'call' must be at _start; fold_begin knows this, - and so does PackLinuxElf64amd::pack3() . -*/ - call main # push address of decompressor -#include "amd_d_nrv2e.S" +sz_l_info= 12 + l_lsize= 8 + +sz_p_info= 12 sz_b_info= 12 sz_unc= 0 @@ -56,6 +53,14 @@ PAGE_SHIFT= 12 PAGE_MASK= (~0< ignore offset + push $ SYS_mmap; pop %rax + syscall # %rax= result; trashes %rcx,%r11 only + cmpl %eax,%edi; je 0f; hlt; 0: # XXX: 4GB + + .byte 0x68; .ascii "JMPU" # push $'JMPU' # for unmap in fold + .byte 0x68; .ascii "ADRU" # push $'ADRU' # for unmap in fold + .byte 6+0xB8; .ascii "ADRC" # movl $'ADRC',%esi + .byte 0x68; .ascii "LENU" # push $'LENU' # for unmap in fold + .byte 1+0xB8; .ascii "CNTC" # movl $'CNTC',%ecx + .byte 0x68; .ascii "ADRX" # push $'ADRX' # for upx_main + .byte 0x68; .ascii "LENX" # push $'LENX' # for upx_main + + movl %edi,%edx + subl %esi,%edx # relocation amount + addl %edx,%ebp # update &decompress + addl %edx,%ebx # update &b_info + + rep; movsq + xchgl %eax,%edi + + movl %ebx,%esi # %arg2l= &b_info (relocated) push %rax # ret_addr after decompression - .byte 0x92 # xchg %eax,%arg3l # %arg3= dst for unfolding XXX: 4GB - lodsl; movl %arg2l,%arg4l # &len_dst ==> &do_not_care XXX: 4GB - lodsl; .byte 0x97 # xchg %rax,%arg1l # sz_cpr XXX: 4GB + xchgl %eax,%arg3l # %arg3= dst for unfolding XXX: 4GB + lodsl; movl %esi,%arg4l # &len_dst ==> &do_not_care XXX: 4GB + lodsl; xchgl %eax,%arg1l # sz_cpr XXX: 4GB lodsl; movzbl %al,%arg5l # b_method - xchg %arg1l,%arg2l # XXX: 4GB + xchg %arg1l,%arg2l # XXX: 4GB jmp *%rbp # goto decompress; return to unfolded loader main: # int3 # uncomment for debugging - pop %rbp # &dcompress - call unfold + pop %rbp # &decompress + call unfold # push &b_info /* { b_info={sz_unc, sz_cpr, {4 char}}, folded_loader...} */ eof: