diff --git a/src/stub/amd_d_nrv2b.S b/src/stub/amd_d_nrv2b.S index 570dfc2e..e4a2f4ab 100644 --- a/src/stub/amd_d_nrv2b.S +++ b/src/stub/amd_d_nrv2b.S @@ -29,35 +29,40 @@ */ - ALIGN(1<<3) + ALIGN(1<<3) lit_n2b: - movb (%rsi),%al; addq $1,%rsi - movb %al,(%rdi); addq $1,%rdi + incq %rsi; movb %dl,(%rdi) + incq %rdi top_n2b: - jnextb1y lit_n2b - lea 1(lenq),off # [len= 0] off= 1 + movzbl (%rsi),%edx # speculate: literal, or bottom 8 bits of offset + jnextb1y lit_n2b + lea 1(lenq),off # [len= 0] off= 1 offmore_n2b: - getnextb(off) - jnextb0n offmore_n2b + getnextb(off) + jnextb0n offmore_n2b - movzbl (%rsi),%edx - subl $ 3,off; jc len_n2b # use previous offset - shll $ 8,off - orl %edx,off; incq %rsi - xorl $~0,off; jz eof - movslq off,disp # XXX: 2GB + subl $ 3,off; jc len_n2b # use previous offset + shll $ 8,off + orl %edx,off; incq %rsi + xorl $~0,off; jz eof + movslq off,disp # XXX: 2GB len_n2b: - lea 1(lenq),off # [len= 0] off= 1 - getnextb(len); getnextb(len) # two bits; cc set on result - jnz gotlen_n2b # raw 1,2,3 ==> 2,3,4 - movl off,len # len= 1, the msb - addl $3-1,off # raw 2.. ==> 5.. + lea 1(lenq),off # [len= 0] off= 1 + getnextb(len); getnextb(len) # two bits; cc set on result + jnz gotlen_n2b # raw 1,2,3 ==> 2,3,4 + movl off,len # len= 1, the msb + addl $3-1,off # raw 2.. ==> 5.. lenmore_n2b: - getnextb(len) - jnextb0n lenmore_n2b + getnextb(len) + jnextb0n lenmore_n2b gotlen_n2b: - cmpq $-0xd00,disp - adcl off,len # len += off + (disp < -0xd00) - call copy + cmpq $-0xd00,disp + adcl off,len # len += off + (disp < -0xd00) + call copy bot_n2b: # In: 0==len jmp top_n2b + +/* +vi:ts=8:et:nowrap +*/ + diff --git a/src/stub/amd_d_nrv2e.S b/src/stub/amd_d_nrv2e.S index 39f45b86..ef351e69 100644 --- a/src/stub/amd_d_nrv2e.S +++ b/src/stub/amd_d_nrv2e.S @@ -29,13 +29,14 @@ */ - ALIGN(1<<3) + ALIGN(1<<3) lit_n2e: - movb (%rsi),%al; addq $1,%rsi - movb %al,(%rdi); addq $1,%rdi + incq %rsi; movb %dl,(%rdi) + incq %rdi top_n2e: - jnextb1y lit_n2e - lea 1(lenq),off # [len= 0] off= 1 + movzbl (%rsi),%edx # speculate: literal, or bottom 8 bits of offset + jnextb1y lit_n2e + lea 1(lenq),off # [len= 0] off= 1 jmp getoff_n2e off_n2e: @@ -45,15 +46,14 @@ getoff_n2e: getnextb(off) jnextb0n off_n2e - movzbl (%rsi),%edx #; xorl len,len # len= 0 - subl $ 3,off; jc offprev_n2e - shll $ 8,off - orl %edx,off; incq %rsi - xorl $~0,off; jz eof - sarl off # Carry= original low bit - movslq off,disp # XXX: 2GB - jc lenlast_n2e - jmp lenmore_n2e + subl $ 3,off; jc offprev_n2e + shll $ 8,off + orl %edx,off; incq %rsi + xorl $~0,off; jz eof + sarl off # Carry= original low bit + movslq off,disp # XXX: 2GB + jc lenlast_n2e + jmp lenmore_n2e offprev_n2e: jnextb1y lenlast_n2e @@ -69,8 +69,13 @@ len_n2e: lenlast_n2e: getnextb(len) # 0,1,2,3 gotlen_n2e: - cmpq $-0x500,disp + cmpq $-0x500,disp adcl $2,len # len += 2+ (disp < -0x500); - call copy + call copy bot_n2e: # In: 0==len jmp top_n2e + +/* +vi:ts=8:et:nowrap +*/ + diff --git a/src/stub/l_lx_elf64amd.S b/src/stub/l_lx_elf64amd.S index 43c27729..54524db5 100644 --- a/src/stub/l_lx_elf64amd.S +++ b/src/stub/l_lx_elf64amd.S @@ -81,17 +81,17 @@ decompress: # (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint metho #define bits %ebx #define disp %rbp - push %rbp; push %rbx # C callable - push ldst - push dst + push %rbp; push %rbx # C callable + push ldst + push dst addq src,lsrc; push lsrc # &input_eof - movq src,%rsi # hardware src for movsb, lodsb - movq dst,%rdi # hardware dst for movsb - xorl bits,bits # empty; force refill - xorl len,len # create loop invariant - orq $~0,disp # -1: initial displacement - jmp setup + movq src,%rsi # hardware src for movsb, lodsb + movq dst,%rdi # hardware dst for movsb + xorl bits,bits # empty; force refill + xorl len,len # create loop invariant + orq $~0,disp # -1: initial displacement + jmp setup /* jump on next bit {0,1} with prediction {y==>likely, n==>unlikely} */ /* Prediction omitted for now. */ @@ -103,44 +103,46 @@ decompress: # (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint metho /* rotate next bit into bottom bit of reg */ #define getnextb(reg) addl bits,bits; jnz 0f; call refill; 0: adcl reg,reg - ALIGN(1<<3) + ALIGN(1<<3) getbit: - addl bits,bits; jz refill # Carry= next bit - rep; ret + addl bits,bits; jz refill # Carry= next bit + rep; ret refill: - movl (%rsi),bits; subq $-4,%rsi # next 32 bits; set Carry - adcl bits,bits # LSB= 1 (CarryIn); CarryOut= next bit - rep; ret + movl (%rsi),bits; subq $-4,%rsi # next 32 bits; set Carry + adcl bits,bits # LSB= 1 (CarryIn); CarryOut= next bit + movzbl (%rsi),%edx # speculate: literal, or bottom 8 bits of offset + rep; ret copy: # In: len, %rdi, disp; Out: 0==len, %rdi, disp; trashes %rax, %rdx - leaq (%rdi,disp),%rdx - cmpl $ 3,len; jbe copy1 # perhaps extend this to length 5 or less? - cmpq $-4,disp; ja copy1 # 4-byte chunks would overlap - subl $4,len # adjust for termination cases + leaq (%rdi,disp),%rax; movb (%rax),%dl + cmpl $ 3,len; jbe copy1 # perhaps extend this to length 5 or less? + cmpq $-4,disp; ja copy1 # 4-byte chunks would overlap + subl $4,len # adjust for termination cases copy4: - movl (%rdx),%eax; leaq 4(%rdx),%rdx; subl $4,len - movl %eax,(%rdi); leaq 4(%rdi),%rdi; jnc copy4 - addl $4,len; jz copy0 + movl (%rax),%edx; addq $4, %rax; subl $4,len + movl %edx,(%rdi); leaq 4(%rdi),%rdi; jnc copy4 + addl $4,len; movb (%rax),%dl; jz copy0 copy1: - movb (%rdx), %al; leaq 1(%rdx),%rdx; subl $1,len - movb %al,(%rdi); leaq 1(%rdi),%rdi; jnz copy1 + incq %rax; movb %dl,(%rdi); subl $1,len + movb (%rax),%dl + leaq 1(%rdi),%rdi; jnz copy1 copy0: - rep; ret + rep; ret #include "amd_d_nrv2e.S" #include "amd_d_nrv2b.S" setup: - cld - cmpl $ M_NRV2E_LE32,meth; je bot_n2e - cmpl $ M_NRV2B_LE32,meth; je top_n2b + cld + cmpl $ M_NRV2E_LE32,meth; je bot_n2e + cmpl $ M_NRV2B_LE32,meth; je top_n2b eof: - pop %rcx # &input_eof - movq %rsi,%rax; subq %rcx,%rax # src -= eof; // return 0: good; else: bad - pop %rdx; subq %rdx,%rdi # dst -= original dst - pop %rcx; movl %edi,(%rcx) # actual length used at dst XXX: 4GB - pop %rbx; pop %rbp - ret + pop %rcx # &input_eof + movq %rsi,%rax; subq %rcx,%rax # src -= eof; // return 0: good; else: bad + pop %rdx; subq %rdx,%rdi # dst -= original dst + pop %rcx; movl %edi,(%rcx) # actual length used at dst XXX: 4GB + pop %rbx; pop %rbp + ret /* Decompress the rest of this loader, and jump to it. Map a page to hold the decompressed bytes. Logically this could