Hacking convert v4a/lzma_d-arm.S from 32-bit to 64-bit

2025-09-28 19:06:07 +08:00 · 2016-02-14 21:26:37 -08:00 · 2016-02-14 21:26:37 -08:00 · fa522c0ffc
commit fa522c0ffc
parent 16241d010f
1 changed files with 148 additions and 469 deletions
--- a/src/stub/src/arch/arm/v8a/lzma_d.S
+++ b/src/stub/src/arch/arm/v8a/lzma_d.S
@ -1,471 +1,150 @@
-rcGetBit.2736:
-	mov	w3, 16777215
-	cmp	w28, w3
-	bhi	.L2
-	ldp	x4, x3, [x18]
-	cmp	x3, x4
-	beq	.L5
-	add	x4, x3, 1
-	str	x4, [x18, 8]
-	lsl	w28, w28, 8
-	ldrb	w2, [x3]
-	orr	w27, w2, w27, lsl 8
-.L2:
-	ldrh	w3, [x1]
-	lsr	w2, w28, 11
-	mul	w2, w2, w3
-	cmp	w2, w27
-	bhi	.L7
-	lsl	w0, w0, 1
-	sub	w3, w3, w3, lsr 5
-	sub	w27, w27, w2
-	sub	w28, w28, w2
-	strh	w3, [x1]
-	add	w0, w0, 1
-	ret
-.L7:
-	mov	w4, 2048
-	mov	w28, w2
-	sub	w2, w4, w3
-	lsl	w0, w0, 1
-	add	w3, w3, w2, asr 5
-	strh	w3, [x1]
-	ret
-.L5:
-	mov	w0, 1
-	ret
+/* lzma_d.S -- ARM decompressor for LZMA

-LzmaDecodeProperties:
-	cmp	w2, 4
-	ble	.L14
-	ldrb	w1, [x1]
-	mov	w2, 1
-	cmp	w1, 224
-	bhi	.L9
-	cmp	w1, 44
-	str	wzr, [x0, 8]
-	bls	.L10
-	mov	w2, 0
-.L11:
-	sub	w1, w1, #45
-	add	w2, w2, 1
-	uxtb	w1, w1
-	cmp	w1, 44
-	bhi	.L11
-	str	w2, [x0, 8]
-.L10:
-	cmp	w1, 8
-	str	wzr, [x0, 4]
-	bls	.L12
-	mov	w2, 1
-.L13:
-	sub	w1, w1, #9
-	mov	w3, w2
-	add	w2, w2, 1
-	uxtb	w1, w1
-	cmp	w1, 8
-	bhi	.L13
-	str	w3, [x0, 4]
-.L12:
-	mov	w2, 0
-	str	w1, [x0]
-.L9:
-	mov	w0, w2
-	ret
-.L14:
-	mov	w2, 1
-	mov	w0, w2
-	ret
+   This file is part of the UPX executable compressor.
+
+   Copyright (C) 1996-2015 Markus Franz Xaver Johannes Oberhumer
+   Copyright (C) 1996-2015 Laszlo Molnar
+   Copyright (C) 2000-2015 John F. Reiser
+   All Rights Reserved.
+
+   UPX and the UCL library are free software; you can redistribute them
+   and/or modify them under the terms of the GNU General Public License as
+   published by the Free Software Foundation; either version 2 of
+   the License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.
+   If not, write to the Free Software Foundation, Inc.,
+   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+   Markus F.X.J. Oberhumer              Laszlo Molnar
+   <markus@oberhumer.com>               <ml1050@users.sourceforge.net>
+
+   John F. Reiser
+   <jreiser@users.sourceforge.net>
+*/
+
+#define section .section
+
+  section LZMA_ELF00
+//decompress:  // (uchar const *src, size_t lsrc, uchar *dst, u32 &ldst, uint method)
+/* Arguments according to calling convention */
+#define src  x0
+#define lsrc w1
+#define dst  x2
+#define ldst x3  /* Out: actually a reference: &len_dst */
+#define meth w4
+
+  // bkpt  // debugging
+
+#define M_LZMA          14
+        cmp meth,#M_LZMA; bne not_lzma
+
+#if defined(LINUX_ARM_CACHEFLUSH)||defined(DARWIN_ARM_CACHEFLUSH)  /*{*/
+        PUSH4(dst,ldst, fp,lr)  // dst,ldst for cache flush
+#else  /*}{*/
+        PUSH2(          fp,lr)
+#endif  /*}*/
+
+#define a0 x0
+#define a1 x1
+#define a2 w2
+#define a3 x3
+#define a4 x4  /* outp */
+#define a5 w5  /* outSize */
+#define a6 x6  /* &outSizeProcessed */
+#define inSzP      3*4  /*   inSizeprocessed */
+#define State      4*4  /* CLzmaDecoderState */
+#define t0 w7
+#define t1 w8
+#define t1x x8
+
+//LzmaDecode(  // from lzmaSDK/C/7zip/Compress/LZMA_C/LzmaDecode.h
+//      a0= &CLzmaDecoderState,
+//      a1= inp,  a2= inSize,  a3= &inSizeProcessed,
+//      a4= outp, a5= outSize, a6= &outSizeProcessed
+//)
+#define LZMA_BASE_SIZE 1846
+#define LZMA_LIT_SIZE   768
+
+        ldrb t0,[src,#0]  // first byte, replaces LzmaDecodeProperties()
+        mov t1,#2*LZMA_LIT_SIZE
+        lsr t0,t0,#3  // lit_context_bits + lit_pos_bits
+        lsl t1,t1,t0  // 2*LZMA_LIT_SIZE << (lit_context_bits + lit_pos_bits)
+        mov t0,sp
+#define W 4  /* even #bits to round up so that 8 bits span all the 1's */
+        add t1,t1,#((~(~0<<W) + State + 2*LZMA_BASE_SIZE)>>W)<<W
+        sub sp,sp,t1  // alloca
+
+        ldr t1,[ldst]
+        str ldst,a6  // &outSizeProcessed
+        str t1,  a5  // outSize
+        str dst, a4   // outp
+
+        add r3,sp,#inSzP
+        mov t1x,#0
+1:  // clear inSizeProcessed and CLzmaDecoderState
+        str t1x,[r3],#8
+        cmp r3,t0
+        blo 1b
+
+        add a3,sp,#inSzP // &inSizeProcessed
+        sub a2,lsrc,#2  // inSize
+        mov a1, src  // inp
+
+        ldrb t1,[a1],#1  // first byte, replaces LzmaDecodeProperties()
+        and  t1,t1,#7  // posBits
+        strb t1,[sp,#2 + State]
+        ldrb t1,[a1],#1  // second byte, replaces LzmaDecodeProperties()
+        mov  a0,t1,LSR #4  // lit_pos_bits
+        strb a0,[sp,#1 + State]
+        and  t1,t1,#0xf  // lib_context_bits
+        strb t1,[sp,#0 + State]
+
+        add a0,sp,#State
+        bl 1f  // the call
+        mov sp,t0  // un-alloca
+
+#if defined(LINUX_ARM_CACHEFLUSH)  /*{*/
+        mov w3,w0  // save result value
+        POP2(x0,x1)  // dst, ldst
+        ldr x1,[x1]  // ldst by reference
+        add x1,x1,x0  // just beyond what was written
+        mov w2,#0
+        do_sys2 __ARM_NR_cacheflush  // decompressed region
+        mov x0,x3  // result value
+#endif  /*}*/
+#if defined(DARWIN_ARM_CACHEFLUSH)  /*{*/
+        mov w4,w0  // save result value
+        POP2(x0,x1)  // dst, ldst
+        ldr x1,[x1]  // ldst by reference
+        PUSH2(x0,x1); do_dcache_flush
+        POP2 (x0,x1); do_icache_invalidate
+        mov x0,x4  // result value
+#endif  /*}*/
+
+        POP {fp,pc}
+1:
+
+  section LZMA_DEC20
+#include "lzma_d_cf.S"
+
+  section LZMA_DEC10
+#if 0  /*{*/
+#include "lzma_d_cs.S"
+#else  /*}{*/
+#define PARAMETER_STYLE 3
+#include "lzma_d-arm.S"
+#endif  /*}*/
+
+  section LZMA_DEC30
+
+not_lzma:
+
+// vi:ts=8:et

-LzmaDecode:
-	stp	x29, x30, [sp, -112]!
-	mov	w8, 1024
-	add	x29, sp, 0
-	stp	x19, x20, [sp, 16]
-	stp	x21, x22, [sp, 32]
-	stp	x23, x24, [sp, 48]
-	stp	x25, x26, [sp, 64]
-	ldp	w19, w15, [x0]
-	ldr	w14, [x0, 8]
-	ldr	x11, [x0, 16]
-	str	wzr, [x3]
-	str	wzr, [x6]
-	ldr	w7, [x0, 4]
-	mov	w0, 768
-	add	w7, w19, w7
-	lsl	w7, w0, w7
-	mov	w0, 1
-	adds	w7, w7, 1846
-	lsl	w14, w0, w14
-	lsl	w0, w0, w15
-	sub	w14, w14, #1
-	sub	w15, w0, #1
-	mov	x0, 0
-	beq	.L22
-.L79:
-	strh	w8, [x11, x0, lsl 1]
-	add	x0, x0, 1
-	cmp	w7, w0
-	bhi	.L79
-.L22:
-	add	x2, x1, x2, uxtw
-	mov	w27, 0
-	str	x2, [x29, 96]
-	mov	w28, -1
-	add	x0, x1, 1
-	add	x10, x1, 6
-	add	x2, x2, 1
-.L20:
-	cmp	x0, x2
-	mov	x8, x0
-	beq	.L41
-	str	x0, [x29, 104]
-	add	x0, x0, 1
-	cmp	x0, x10
-	ldrb	w7, [x0, -2]
-	orr	w27, w7, w27, lsl 8
-	bne	.L20
-	stp	x1, x6, [x29, 80]
-	mov	x9, x4
-	mov	x25, x3
-	cbz	w5, .L63
-	mov	w23, 1
-	mov	w26, 8
-	mov	w22, w23
-	mov	w21, w23
-	mov	w7, w23
-	mov	w10, 0
-	mov	w13, 0
-	mov	w6, 0
-	sub	w20, w26, w19
-.L60:
-	and	w12, w14, w6
-	lsl	w8, w10, 4
-	add	x18, x29, 96
-	mov	w0, 0
-	sxtw	x1, w12
-	add	x8, x1, x8, sxtw
-	lsl	x8, x8, 1
-	add	x1, x11, x8
-	bl	rcGetBit.2736
-	cbnz	w0, .L26
-	and	w0, w15, w6
-	asr	w8, w13, w20
-	lsl	w0, w0, w19
-	cmp	w10, 6
-	add	w8, w0, w8
-	lsl	w0, w8, 2
-	sub	w8, w0, w8
-	ubfiz	x8, x8, 8, 24
-	add	x8, x8, 1846
-	add	x8, x11, x8, lsl 1
-	ble	.L64
-	sub	w0, w6, w7
-	mov	w2, 1
-	ldrb	w12, [x9, x0]
-	b	.L32
-.L96:
-	cbnz	w13, .L29
-.L30:
-	cmp	w2, 255
-	bgt	.L31
-.L32:
-	lsl	w12, w12, 1
-	mov	w0, w2
-	and	w13, w12, 256
-	add	x18, x29, 96
-	sxtw	x1, w13
-	add	x1, x1, 256
-	add	x1, x1, x2, sxtw
-	add	x1, x8, x1, lsl 1
-	bl	rcGetBit.2736
-	mov	w2, w0
-	tbz	x2, 0, .L96
-	cbnz	w13, .L30
-.L29:
-	cmp	w2, 255
-	bgt	.L31
-.L33:
-	add	x1, x8, x2, sxtw 1
-	mov	w0, w2
-	add	x18, x29, 96
-	bl	rcGetBit.2736
-	cmp	w0, 255
-	mov	w2, w0
-	ble	.L33
-	uxtb	w13, w0
-	cmp	w10, 3
-	strb	w13, [x9, w6, uxtw]
-	add	w6, w6, 1
-	bgt	.L62
-	mov	w10, 0
-.L34:
-	cmp	w5, w6
-	bhi	.L60
-.L58:
-	mov	w1, 16777215
-	cmp	w28, w1
-	bhi	.L97
-	ldp	x2, x1, [x29, 96]
-	cmp	x1, x2
-	beq	.L41
-	add	x8, x1, 1
-	str	x8, [x29, 104]
-	lsl	w28, w28, 8
-	ldrb	w0, [x1]
-	orr	w27, w0, w27, lsl 8
-.L25:
-	ldr	x0, [x29, 80]
-	ldp	x19, x20, [sp, 16]
-	sub	x8, x8, x0
-	ldr	x0, [x29, 88]
-	str	w8, [x25]
-	ldp	x21, x22, [sp, 32]
-	str	w6, [x0]
-	mov	w0, 0
-	ldp	x23, x24, [sp, 48]
-	ldp	x25, x26, [sp, 64]
-	ldp	x29, x30, [sp], 112
-	ret
-.L64:
-	mov	w2, 1
-	b	.L33
-.L31:
-	uxtb	w13, w2
-	strb	w13, [x9, w6, uxtw]
-	add	w6, w6, 1
-.L62:
-	cmp	w10, 9
-	bgt	.L35
-	sub	w10, w10, #3
-	b	.L34
-.L26:
-	sxtw	x13, w10
-	add	x18, x29, 96
-	add	x13, x13, 192
-	mov	w0, 0
-	lsl	x13, x13, 1
-	add	x1, x11, x13
-	bl	rcGetBit.2736
-	cbnz	w0, .L36
-	mov	w0, 3
-	cmp	w10, 6
-	mov	w23, w22
-	csel	w10, w0, wzr, gt
-	mov	w22, w21
-	add	x24, x11, 1636
-	mov	w21, w7
-.L38:
-	add	x18, x29, 96
-	mov	x1, x24
-	mov	w0, 0
-	bl	rcGetBit.2736
-	cbnz	w0, .L44
-	lsl	w12, w12, 3
-	mov	w8, -8
-	mov	w13, 3
-	sxtw	x12, w12
-	add	x12, x12, 2
-	add	x12, x24, x12, lsl 1
-.L45:
-	mov	w0, 1
-.L47:
-	add	x18, x29, 96
-	add	x1, x12, x0, uxtw 1
-	bl	rcGetBit.2736
-	subs	w13, w13, #1
-	bne	.L47
-	cmp	w10, 3
-	add	w8, w0, w8
-	bgt	.L48
-	mov	w7, 3
-	mov	w12, 6
-	cmp	w8, w7
-	mov	w0, 1
-	csel	w7, w8, w7, le
-	lsl	w7, w7, 6
-	sxtw	x7, w7
-	add	x7, x7, 432
-	lsl	x7, x7, 1
-.L49:
-	add	x1, x7, x0, uxtw 1
-	add	x18, x29, 96
-	add	x1, x11, x1
-	bl	rcGetBit.2736
-	subs	w12, w12, #1
-	bne	.L49
-	sub	w7, w0, #64
-	add	w10, w10, 7
-	cmp	w7, 3
-	mov	w12, w7
-	ble	.L50
-	cmp	w7, 13
-	and	w0, w7, 1
-	asr	w1, w7, 1
-	orr	w7, w0, 2
-	bgt	.L51
-	sub	w13, w1, #1
-	mov	x0, 688
-	lsl	w7, w7, w13
-	sub	x12, x0, x12, sxtw
-	add	x12, x12, x7, uxtw
-	add	x12, x11, x12, lsl 1
-	sub	x12, x12, #2
-.L52:
-	mov	w0, 1
-	mov	w24, w0
-.L57:
-	add	x1, x12, x0, sxtw 1
-	add	x18, x29, 96
-	bl	rcGetBit.2736
-	and	w3, w0, 1
-	orr	w1, w7, w24
-	cmp	w3, wzr
-	csel	w7, w1, w7, ne
-	subs	w13, w13, #1
-	lsl	w24, w24, 1
-	bne	.L57
-.L50:
-	adds	w7, w7, 1
-	beq	.L58
-.L48:
-	cmp	w7, w6
-	add	w0, w8, 2
-	bhi	.L41
-	mov	w1, w6
-.L59:
-	sub	w2, w6, w7
-	sub	w0, w0, #1
-	add	w6, w6, 1
-	cmp	w0, wzr
-	ccmp	w5, w6, 0, ne
-	ldrb	w13, [x9, x2]
-	strb	w13, [x9, w1, uxtw]
-	mov	w1, w6
-	bhi	.L59
-	b	.L34
-.L35:
-	sub	w10, w10, #6
-	b	.L34
-.L36:
-	add	x1, x13, 24
-	add	x18, x29, 96
-	add	x1, x11, x1
-	mov	w0, 0
-	bl	rcGetBit.2736
-	cbnz	w0, .L39
-	add	x1, x8, 480
-	add	x18, x29, 96
-	add	x1, x11, x1
-	bl	rcGetBit.2736
-	cbnz	w0, .L67
-	cbz	w6, .L41
-	sub	w0, w6, w7
-	cmp	w10, 6
-	mov	w10, 11
-	ldrb	w13, [x9, x0]
-	mov	w0, 9
-	strb	w13, [x9, w6, uxtw]
-	csel	w10, w10, w0, gt
-	add	w6, w6, 1
-	b	.L34
-.L44:
-	add	x18, x29, 96
-	add	x1, x24, 2
-	mov	w0, 0
-	bl	rcGetBit.2736
-	mov	w8, w0
-	cbnz	w0, .L46
-	lsl	w0, w12, 3
-	mov	w13, 3
-	sxtw	x0, w0
-	add	x0, x0, 130
-	add	x12, x24, x0, lsl 1
-	b	.L45
-.L39:
-	add	x1, x13, 48
-	add	x18, x29, 96
-	add	x1, x11, x1
-	mov	w0, 0
-	bl	rcGetBit.2736
-	cbnz	w0, .L98
-.L40:
-	cmp	w10, 6
-	mov	w0, w7
-	mov	w10, 11
-	mov	w7, w21
-	csel	w10, w10, w26, gt
-	mov	w21, w0
-	add	x24, x11, 2664
-	b	.L38
-.L46:
-	add	x12, x24, 516
-	mov	w8, -240
-	mov	w13, 8
-	b	.L45
-.L41:
-	ldp	x19, x20, [sp, 16]
-	ldp	x21, x22, [sp, 32]
-	ldp	x23, x24, [sp, 48]
-	ldp	x25, x26, [sp, 64]
-	mov	w0, 1
-	ldp	x29, x30, [sp], 112
-	ret
-.L51:
-	ldr	x12, [x29, 96]
-	sub	w1, w1, #5
-	mov	w3, 16777215
-.L55:
-	cmp	w28, w3
-	lsl	w7, w7, 1
-	bhi	.L53
-	ldr	x2, [x29, 104]
-	cmp	x2, x12
-	add	x4, x2, 1
-	beq	.L41
-	str	x4, [x29, 104]
-	lsl	w28, w28, 8
-	ldrb	w0, [x2]
-	orr	w27, w0, w27, lsl 8
-.L53:
-	lsr	w0, w28, 1
-	cmp	w0, w27
-	mov	w28, w0
-	bhi	.L54
-	sub	w27, w27, w0
-	orr	w7, w7, 1
-.L54:
-	subs	w1, w1, #1
-	bne	.L55
-	lsl	w7, w7, 4
-	add	x12, x11, 1604
-	mov	w13, 4
-	b	.L52
-.L67:
-	mov	w0, w21
-	mov	w21, w7
-	mov	w7, w0
-	b	.L40
-.L98:
-	add	x1, x13, 72
-	add	x18, x29, 96
-	add	x1, x11, x1
-	mov	w0, 0
-	bl	rcGetBit.2736
-	cbnz	w0, .L69
-	mov	w0, w21
-	mov	w21, w22
-	mov	w22, w0
-	b	.L40
-.L69:
-	mov	w0, w21
-	mov	w21, w23
-	mov	w23, w22
-	mov	w22, w0
-	b	.L40
-.L97:
-	ldr	x8, [x29, 104]
-	b	.L25
-.L63:
-	mov	w6, 0
-	b	.L25