From b7bbd81dda0459dfbc727ec326b8fd3dbd34e8b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20Moln=C3=A1r?=
 <ezerotven+github@gmail.com>
Date: Thu, 6 Jul 2006 18:30:34 +0200
Subject: [PATCH] conversion of atari/tos to ElfLinker started

---
 src/stub/Makefile                  |  15 +-
 src/stub/src/arch/m68k/bits.ash    | 149 +++++-----
 src/stub/src/arch/m68k/nrv2e_d.ash | 107 +++----
 src/stub/src/m68k-atari.tos.asm    | 452 ++++++++++++++---------------
 4 files changed, 362 insertions(+), 361 deletions(-)

diff --git a/src/stub/Makefile b/src/stub/Makefile
index d67f3a66..61fbdfa0 100644
--- a/src/stub/Makefile
+++ b/src/stub/Makefile
@@ -608,14 +608,21 @@ i386-win32.pe.h : $(srcdir)/src/$$T.asm
 m68k-atari.tos-%.h : tc_list = m68k-atari.tos default
 
 tc.m68k-atari.tos.app-a68k  = perl -w $(srcdir)/src/arch/m68k/app-a68k.pl
-tc.m68k-atari.tos.asm-a68k  = a68k
+#tc.m68k-atari.tos.asm-a68k  = a68k
+tc.m68k-atari.tos.pp-asm    = gcc -E -nostdinc -x assembler-with-cpp -Wall
+tc.m68k-atari.tos.asm-a68k  = m68k-unknown-linux-gnu-as --register-prefix-optional
 
 m68k-atari.tos-nrv%.h : $(srcdir)/src/m68k-atari.tos.asm
 	# call gpp_inc to generate .d file
 	$(call tc,gpp_inc) --mode=c --MMD=$@ --MF=tmp/$T.i.d $< -o /dev/null
-	$(call tc,pp-asm) -D__A68K__ $(PP_FLAGS) $< -o tmp/$T.i
-	$(call tc,asm-a68k) -q -ltmp/$T.o.lst tmp/$T.i -otmp/$T.o
-	$(call tc,o2bin) tmp/$T.o tmp/$T.bin 'UPX1' 'UPX9'
+	$(call tc,pp-asm) -D__GAS__ $(PP_FLAGS) $< -o tmp/$T.i
+##	$(call tc,asm-a68k) -q -ltmp/$T.o.lst tmp/$T.i -otmp/$T.o
+	$(call tc,asm-a68k) tmp/$T.i -o tmp/$T.bin
+##	$(call tc,o2bin) tmp/$T.o tmp/$T.bin 'UPX1' 'UPX9'
+	$(call tc,m-objcopy) --strip-unneeded tmp/$T.bin
+	$(call tc,m-objcopy) -R .text -R .data -R .bss tmp/$T.bin
+	$(call tc,m-objcopy) -R .note -R .comment tmp/$T.bin
+	$(call tc,m-objdump) -trwh tmp/$T.bin >> tmp/$T.bin
 	$(call tc,bin2h) --ident=$(IDENT_PREFIX)loader$(IDENT_SUFFIX) tmp/$T.bin $@
 
 m68k-atari.tos-nrv2b% :        PP_FLAGS     = -DNRV2B
diff --git a/src/stub/src/arch/m68k/bits.ash b/src/stub/src/arch/m68k/bits.ash
index 2ff63394..a6ef1d27 100644
--- a/src/stub/src/arch/m68k/bits.ash
+++ b/src/stub/src/arch/m68k/bits.ash
@@ -1,3 +1,4 @@
+/*
 ;  bits.ash -- bit access for decompression
 ;
 ;  This file is part of the UCL data compression library.
@@ -24,15 +25,15 @@
 ;  <markus@oberhumer.com>
 ;  http://www.oberhumer.com/opensource/ucl/
 ;
+*/
 
-
-; ------------- ADDBITS -------------
+// ------------- ADDBITS -------------
 
 macro(ADDBITS)
 #if (NRV_BB == 8)
-                add.b   d0,d0           ; sets Z, C and X       ;  4
+                add.b   d0,d0           // sets Z, C and X       //  4
 #elif (NRV_BB == 32)
-                add.l   d0,d0           ; sets Z, C and X       ;  6
+                add.l   d0,d0           // sets Z, C and X       //  6
 #endif
         endm
 
@@ -40,95 +41,95 @@ macro(ADDBITS)
 #if 0
 macro(ADDXBITS)
 #if (NRV_BB == 8)
-                addx.b   d0,d0          ; sets C and X          ;  4
+                addx.b   d0,d0          // sets C and X          //  4
 #elif (NRV_BB == 32)
-                addx.l   d0,d0          ; sets C and X          ;  8
+                addx.l   d0,d0          // sets C and X          //  8
 #endif
         endm
 #endif
 
 
-; ------------- FILLBYTES_xx -------------
+// ------------- FILLBYTES_xx -------------
 
-; get 1 byte; then get 1 bit into both C and X
+// get 1 byte, then get 1 bit into both C and X
 macro(FILLBYTES_8)
-        ; note: we shift the X flag through -> must init d0.b with $80
-                move.b  (a0)+,d0                                ;  8
-                addx.b  d0,d0           ; sets C and X          ;  4
+        // note: we shift the X flag through -> must init d0.b with $80
+                move.b  (a0)+,d0                                //  8
+                addx.b  d0,d0           // sets C and X          //  4
         endm
 
 
-; get 32 bits in little endian format; then get 1 bit into both C and X
+// get 32 bits in little endian format, then get 1 bit into both C and X
 macro(FILLBYTES_LE32)
 #if 0
-                move.b  (a0)+,d0                                ;  8
-                ror.l   #8,d0                                   ; 24
-                move.b  (a0)+,d0                                ;  8
-                ror.l   #8,d0                                   ; 24
-                move.b  (a0)+,d0                                ;  8
-                ror.l   #8,d0                                   ; 24
-                move.b  (a0)+,d0                                ;  8
-                ror.l   #8,d0                                   ; 24
-                add.l   d0,d0           ; sets C and X          ;  6
-                bset    #0,d0           ; only changes Z        ; 12
-                                                           ;    -----
-                                                           ;     146
+                move.b  (a0)+,d0                                //  8
+                ror.l   #8,d0                                   // 24
+                move.b  (a0)+,d0                                //  8
+                ror.l   #8,d0                                   // 24
+                move.b  (a0)+,d0                                //  8
+                ror.l   #8,d0                                   // 24
+                move.b  (a0)+,d0                                //  8
+                ror.l   #8,d0                                   // 24
+                add.l   d0,d0           // sets C and X          //  6
+                bset    #0,d0           // only changes Z        // 12
+                                                           //    -----
+                                                           //     146
 #elif 1
-                move.b  3(a0),d0                                ; 12
-                lsl.w   #8,d0                                   ; 22
-                move.b  2(a0),d0                                ; 12
-                swap    d0                                      ;  4
-                move.b  1(a0),d0                                ; 12
-                lsl.w   #8,d0                                   ; 22
-                move.b  (a0),d0                                 ;  8
-                addq.l  #4,a0           ; does not affect flags ;  8
-                add.l   d0,d0           ; sets C and X          ;  6
-                bset    #0,d0           ; only changes Z        ; 12
-                                                           ;    -----
-                                                           ;     118
+                move.b  3(a0),d0                                // 12
+                lsl.w   #8,d0                                   // 22
+                move.b  2(a0),d0                                // 12
+                swap    d0                                      //  4
+                move.b  1(a0),d0                                // 12
+                lsl.w   #8,d0                                   // 22
+                move.b  (a0),d0                                 //  8
+                addq.l  #4,a0           // does not affect flags //  8
+                add.l   d0,d0           // sets C and X          //  6
+                bset    #0,d0           // only changes Z        // 12
+                                                           //    -----
+                                                           //     118
 #elif 1
-        ; note: we shift the X flag through -> must init d0.l with $80000000
-        ; note: rol/ror do not change X flag (but asl/asr/lsl/lsr do)
-                move.b  3(a0),d0                                ; 12
-                ror.w   #8,d0                                   ; 22
-                move.b  2(a0),d0                                ; 12
-                swap    d0                                      ;  4
-                move.b  1(a0),d0                                ; 12
-                ror.w   #8,d0                                   ; 22
-                move.b  (a0),d0                                 ;  8
-                addq.l  #4,a0           ; does not affect flags ;  8
-                addx.l  d0,d0           ; sets C and X          ;  8
-                                                           ;    -----
-                                                           ;     108
+        // note: we shift the X flag through -> must init d0.l with $80000000
+        // note: rol/ror do not change X flag (but asl/asr/lsl/lsr do)
+                move.b  3(a0),d0                                // 12
+                ror.w   #8,d0                                   // 22
+                move.b  2(a0),d0                                // 12
+                swap    d0                                      //  4
+                move.b  1(a0),d0                                // 12
+                ror.w   #8,d0                                   // 22
+                move.b  (a0),d0                                 //  8
+                addq.l  #4,a0           // does not affect flags //  8
+                addx.l  d0,d0           // sets C and X          //  8
+                                                           //    -----
+                                                           //     108
 #else
-        ; IMPORTANT: movep is not implemented on the 68060
+        // IMPORTANT: movep is not implemented on the 68060
 #  error "do not use movep"
-        ; note: we shift the X flag through -> must init d0.l with $80000000
-        ; note: must use dc.l because of a bug in the pasm assembler
-        ; note: may access past the end of the input; this is ok for UPX
-                dc.l    $01080003       ; movep.w 3(a0),d0      ; 16
-                move.b  2(a0),d0                                ; 12
-                swap    d0                                      ;  4
-                dc.l    $01080001       ; movep.w 1(a0),d0      ; 16
-                move.b  (a0),d0                                 ;  8
-                addq.l  #4,a0           ; does not affect flags ;  8
-                addx.l  d0,d0           ; sets C and X          ;  8
-                                                           ;    -----
-                                                           ;      72
+        // note: we shift the X flag through -> must init d0.l with $80000000
+        // note: must use dc.l because of a bug in the pasm assembler
+        // note: may access past the end of the input - this is ok for UPX
+                dc.l    $01080003       // movep.w 3(a0),d0      // 16
+                move.b  2(a0),d0                                // 12
+                swap    d0                                      //  4
+                dc.l    $01080001       // movep.w 1(a0),d0      // 16
+                move.b  (a0),d0                                 //  8
+                addq.l  #4,a0           // does not affect flags //  8
+                addx.l  d0,d0           // sets C and X          //  8
+                                                           //    -----
+                                                           //      72
 #endif
         endm
 
 
-; ------------- FILLBITS -------------
+// ------------- FILLBITS -------------
 
 macro(FILLBITS)
 #if (NRV_BB == 8)
-                ; no need for a subroutine
+                // no need for a subroutine
                 FILLBYTES_8
 #elif (NRV_BB == 32)
 # ifdef SMALL
 #  define FILLBYTES_SR FILLBYTES_LE32
-                bsr     fillbytes_sr                            ; 18
+                bsr     fillbytes_sr                            // 18
 # else
                 FILLBYTES_LE32
 # endif
@@ -136,24 +137,24 @@ macro(FILLBITS)
         endm
 
 
-; ------------- GETBIT -------------
+// ------------- GETBIT -------------
 
-; get one bit into both the Carry and eXtended flag
+// get one bit into both the Carry and eXtended flag
 macro(GETBIT)
 #if defined(__A68K__)
-                ADDBITS                                         ;  4 / 6
-                bne     \@                                      ; 10 (if jump)
+                ADDBITS                                         //  4 / 6
+                bne     \@                                      // 10 (if jump)
                 FILLBITS
 \@:
 #elif defined(__ASL__)
-                ADDBITS                                         ;  4 / 6
-                bne     done                                    ; 10 (if jump)
+                ADDBITS                                         //  4 / 6
+                bne     done                                    // 10 (if jump)
                 FILLBITS
 done:
 #else
 LOCAL done
-                ADDBITS                                         ;  4 / 6
-                bne     done                                    ; 10 (if jump)
+                ADDBITS                                         //  4 / 6
+                bne     done                                    // 10 (if jump)
                 FILLBITS
 done:
 #endif
@@ -161,5 +162,5 @@ done:
 
 
 
-; vi:ts=8:et
+// vi:ts=8:et
 
diff --git a/src/stub/src/arch/m68k/nrv2e_d.ash b/src/stub/src/arch/m68k/nrv2e_d.ash
index e346f5dd..1e189f49 100644
--- a/src/stub/src/arch/m68k/nrv2e_d.ash
+++ b/src/stub/src/arch/m68k/nrv2e_d.ash
@@ -1,3 +1,4 @@
+/*
 ;  n2e_d.ash -- NRV2E decompression in 68000 assembly
 ;
 ;  This file is part of the UCL data compression library.
@@ -56,27 +57,27 @@
 ;   we have max_match = 65535, so we can use word arithmetics on d2
 ;   we have max_offset < 2**23, so we can use partial word arithmetics on d1
 ;
+*/
 
-
-; ------------- constants & macros -------------
+// ------------- constants & macros -------------
 
 #if !defined(NRV_NO_INIT)
 
-                ;;move.l  #-$500,d6             ; 0xfffffb00
-                moveq.l #-$50,d6                ;   0xffffffb0
-                lsl.w   #4,d6                   ;   << 4
+                ////move.l  #-0x500,d6             // 0xfffffb00
+                moveq.l #-0x50,d6                //   0xffffffb0
+                lsl.w   #4,d6                   //   << 4
 
                 moveq.l #0,d7
-                moveq.l #-1,d5                  ; last_off = -1
+                moveq.l #-1,d5                  // last_off = -1
 
-                ; init d0 with high bit set
+                // init d0 with high bit set
 #if (NRV_BB == 8)
-                ;;move.b  #$80,d0                 ; init d0.b for FILLBYTES
-                moveq.l #-128,d0                ; d0.b = $80
+                ////move.b  #0x80,d0                 // init d0.b for FILLBYTES
+                moveq.l #-128,d0                // d0.b = 0x80
 #elif (NRV_BB == 32)
-                ;;move.l  #$80000000,d0           ; init d0.l for FILLBYTES
+                ////move.l  #0x80000000,d0           // init d0.l for FILLBYTES
                 moveq.l #1,d0
-                ror.l   #1,d0                   ; d0.l = $80000000
+                ror.l   #1,d0                   // d0.l = 0x80000000
 #endif
                 bra     decompr_start
 
@@ -88,12 +89,12 @@
 
 #if defined(FILLBYTES_SR)
 fillbytes_sr:   FILLBYTES_SR
-                rts                                             ; 16
+                rts                                             // 16
 #endif
 
 
 
-; ------------- DECOMPRESSION -------------
+// ------------- DECOMPRESSION -------------
 
 
 decompr_literal:
@@ -102,16 +103,16 @@ decompr_literal:
 decompr_start:
 decompr_loop:
 #ifdef SMALL
-        ;   cost literal:   4 + 10 + 10
-        ;   cost match:     4 + 10 +  8
-        ;   cost fillbits:  4 +  8
+        //   cost literal:   4 + 10 + 10
+        //   cost match:     4 + 10 +  8
+        //   cost fillbits:  4 +  8
                 GETBIT
                 bcs     decompr_literal
 #else
-        ; optimization: carry is clear -> we know that bits are available
-        ;   cost literal:   4 +  8 + 10
-        ;   cost match:     4 + 10
-        ;   cost fillbits:  4 +  8 +  8
+        // optimization: carry is clear -> we know that bits are available
+        //   cost literal:   4 +  8 + 10
+        //   cost match:     4 + 10
+        //   cost fillbits:  4 +  8 +  8
                 ADDBITS
                 bcc     decompr_match
                 bne     decompr_literal
@@ -128,16 +129,16 @@ decompr_l1:
                 GETBIT
                 addx.w  d1,d1
 #ifdef SMALL
-        ;   cost loop continue:  4 + 10 +  8
-        ;   cost loop break:     4 + 10 + 10
-        ;   cost fillbits:       4 +  8
+        //   cost loop continue:  4 + 10 +  8
+        //   cost loop break:     4 + 10 + 10
+        //   cost fillbits:       4 +  8
                 GETBIT
                 bcs     decompr_break1
 #else
-        ; optimization: carry is clear -> we know that bits are available
-        ;   cost loop continue:  4 + 10
-        ;   cost loop break:     4 +  8 + 10
-        ;   cost fillbits:       4 +  8 +  8
+        // optimization: carry is clear -> we know that bits are available
+        //   cost loop continue:  4 + 10
+        //   cost loop break:     4 +  8 + 10
+        //   cost fillbits:       4 +  8 +  8
                 ADDBITS
                 bcc     L(continue)
                 bne     decompr_break1
@@ -152,7 +153,7 @@ L(continue):
                 bra     decompr_end
 decompr_break1:
                 subq.w  #3,d1
-                bcs     decompr_prev_dist       ; last m_off
+                bcs     decompr_prev_dist       // last m_off
                 lsl.l   #8,d1
                 move.b  (a0)+,d1
                 not.l   d1
@@ -175,16 +176,16 @@ decompr_get_mlen2:
 decompr_l2:     GETBIT
                 addx.w  d2,d2
 #ifdef SMALL
-        ;   cost loop continue:  4 + 10 + 10
-        ;   cost loop break:     4 + 10 +  8
-        ;   cost fillbits:       4 +  8
+        //   cost loop continue:  4 + 10 + 10
+        //   cost loop break:     4 + 10 +  8
+        //   cost fillbits:       4 +  8
                 GETBIT
                 bcc     decompr_l2
 #else
-        ; optimization: carry is clear -> we know that bits are available
-        ;   cost loop continue:  4 + 10
-        ;   cost loop break:     4 +  8 + 10
-        ;   cost fillbits:       4 +  8 +  8
+        // optimization: carry is clear -> we know that bits are available
+        //   cost loop continue:  4 + 10
+        //   cost loop break:     4 +  8 + 10
+        //   cost fillbits:       4 +  8 +  8
                 ADDBITS
                 bcc     decompr_l2
                 bne     L(break)
@@ -200,29 +201,29 @@ decompr_got_mlen:
                 move.l  d1,d5
                 lea     0(a1,d1.l),a3
 
-                ; must use sub as cmp doesn't affect the X flag
+                // must use sub as cmp doesn't affect the X flag
                 sub.l   d6,d1
                 addx.w  d7,d2
 
-; TODO: partly unroll this loop; could use some magic with d7 for address
-;       computations, then compute a nice `jmp yyy(pc,dx.w)'
+// TODO: partly unroll this loop - could use some magic with d7 for address
+//       computations, then compute a nice `jmp yyy(pc,dx.w)'
 
 #if 1
-        ;   cost for any m_len:   12 + 22 * (m_len - 1) + 4
-        ;     38, 60, 82, 104, 126, 148, 170, 192, 214, 236
-                move.b  (a3)+,(a1)+                             ; 12
-L(copy):        move.b  (a3)+,(a1)+                             ; 12
-                dbra    d2,L(copy)                              ; 10 / 14
+        //   cost for any m_len:   12 + 22 * (m_len - 1) + 4
+        //     38, 60, 82, 104, 126, 148, 170, 192, 214, 236
+                move.b  (a3)+,(a1)+                             // 12
+L(copy):        move.b  (a3)+,(a1)+                             // 12
+                dbra    d2,L(copy)                              // 10 / 14
 #else
-        ;   cost for even m_len:  18 + 34 * (m_len / 2) + 4
-        ;   cost for odd m_len:   28 + 34 * (m_len / 2) + 4
-        ;     56, 66, 90, 100, 124, 134, 158, 168, 192, 202
-                lsr.w   #1,d2                                   ;  8
-                bcc     L(copy)                                 ; 10 /  8
-                move.b  (a3)+,(a1)+                             ; 12
-L(copy):        move.b  (a3)+,(a1)+                             ; 12
-                move.b  (a3)+,(a1)+                             ; 12
-                dbra    d2,L(copy)                              ; 10 / 14
+        //   cost for even m_len:  18 + 34 * (m_len / 2) + 4
+        //   cost for odd m_len:   28 + 34 * (m_len / 2) + 4
+        //     56, 66, 90, 100, 124, 134, 158, 168, 192, 202
+                lsr.w   #1,d2                                   //  8
+                bcc     L(copy)                                 // 10 /  8
+                move.b  (a3)+,(a1)+                             // 12
+L(copy):        move.b  (a3)+,(a1)+                             // 12
+                move.b  (a3)+,(a1)+                             // 12
+                dbra    d2,L(copy)                              // 10 / 14
 #endif
 
                 bra     decompr_loop
@@ -232,5 +233,5 @@ L(copy):        move.b  (a3)+,(a1)+                             ; 12
 decompr_end:
 
 
-; vi:ts=8:et
+// vi:ts=8:et
 
diff --git a/src/stub/src/m68k-atari.tos.asm b/src/stub/src/m68k-atari.tos.asm
index 37819f8d..c3cd91a8 100644
--- a/src/stub/src/m68k-atari.tos.asm
+++ b/src/stub/src/m68k-atari.tos.asm
@@ -1,3 +1,4 @@
+/*
 ;  l_tos.s -- loader & decompressor for the atari/tos format
 ;
 ;  This file is part of the UPX executable compressor.
@@ -24,12 +25,12 @@
 ;  Markus F.X.J. Oberhumer              Laszlo Molnar
 ;  <mfx@users.sourceforge.net>          <ml1050@users.sourceforge.net>
 ;
-
+*/
 
 #define NRV_BB  8
-#include "../../version.h"
 
 
+/*
 ;
 ; see also:
 ;   freemint/sys/mint/basepage.h
@@ -44,30 +45,16 @@
 ; by a simple perl script. We also maintain compatiblity with the pasm
 ; assembler (which must be started in the emulator window).
 ;
+*/
 
+#define L(label)      .L##label
+#define macro(name)   .macro  name
+#define endm          .endm
+#define section       .section
 
-#if defined(__A68K__)
-#  define align4        align   0,4
-#  define L(label)      \/**/label
-#  define macro(name)   name    macro
-#  define text          section code
-#elif defined(__ASL__)
-#  define align4        align   4
-#  define L(label)      $$/**/label
-#  define macro(name)   name    macro
-#  define text          section code
-#else
-#  define align4        align   4
-#  define L(label)      ./**/label
-#  define macro(name)   macro   name
-#endif
-
-; defines needed for including ident_[ns].ash
-#define db      dc.b
-#define dw      dc.w
-#define dd      dc.l
-
+.altmacro
 
+/*
 ; basepage offsets
 p_lowtpa        equ     $0      ; .l    pointer to self (bottom of TPA)
 p_hitpa         equ     $4      ; .l    pointer to top of TPA + 1
@@ -81,7 +68,11 @@ p_dta           equ     $20     ; .l    pointer to current DTA
 p_parent        equ     $24     ; .l    pointer to parent's basepage
 p_flags         equ     $28     ; .l    memory usage flags
 p_env           equ     $2c     ; .l    pointer to environment string
+*/
 
+p_tbase = 8
+
+/*
 ;
 ; long living registers:
 ;   d4  p_tbase - start of text segment
@@ -91,12 +82,13 @@ p_env           equ     $2c     ; .l    pointer to environment string
 ;                     - start of dirty bss
 ;   ASTACK (a7) - final startup code copied below stack
 ;
+*/
 
+/*************************************************************************
+// flush cache macros
+**************************************************************************/
 
-; /*************************************************************************
-; // flush cache macros
-; **************************************************************************/
-
+/*
 ; note:
 ;   GEMDOS/XBIOS trashes d0, d1, d2, a0, a1, a2
 
@@ -108,71 +100,72 @@ p_env           equ     $2c     ; .l    pointer to environment string
 ;
 ; Note that on a 68060 FreeMiNT just uses `cpusha bc' in all cases,
 ; so we don't bother passing base and length. (info: base would be d4)
+*/
 
 macro(MINT_FLUSH_CACHE)
-                pea     -1              ; length
-                clr.l   -(sp)           ; base
+                pea     -1              // length
+                clr.l   -(sp)           // base
 #if 0
-                move.w  #$0016,-(sp)    ; S_FLUSHCACHE (22)
-                move.w  #$0154,-(sp)    ; Ssystem (340)
+                move.w  #0x016,-(sp)    // S_FLUSHCACHE (22)
+                move.w  #0x154,-(sp)    // Ssystem (340)
 #else
-                move.l  #$01540016,-(sp)
+                move.l  #0x01540016,-(sp)
 #endif
-                trap    #1              ; GEMDOS
+                trap    #1              // GEMDOS
                 lea     12(sp),sp
         endm
 
 
-; First try `cpusha bc' (68040/68060). If that fails try temporary changing
-; the cache control register (68030).
+// First try `cpusha bc' (68040/68060). If that fails try temporary changing
+// the cache control register (68030).
 
 macro(SUPEXEC_FLUSH_CACHE)
-                pea     \@super(pc)
-                move.w  #$0026,-(sp)    ; Supexec (38)
-                trap    #14             ; XBIOS
+                pea     super(pc)
+                move.w  #0x0026,-(sp)    // Supexec (38)
+                trap    #14             // XBIOS
                 addq.l  #6,sp
-                bra     \@done
+                bra     done
 
 
-; exception handler
-\@exception:    move.l  a1,sp           ; restore stack (SSP)
-                jmp     (a0)            ; and continue
+// exception handler
+exception:      move.l  a1,sp           // restore stack (SSP)
+                jmp     (a0)            // and continue
 
 
-\@super:        move.l  ($10),-(sp)
-                move.l  ($2c),-(sp)
-                move.l  ($f4),-(sp)
-                move.l  sp,a1           ; save stack pointer (SSP)
+super:          move.l  (0x10),-(sp)
+                move.l  (0x2c),-(sp)
+                move.l  (0xf4),-(sp)
+                move.l  sp,a1           // save stack pointer (SSP)
 
-        ; set exception vectors
-                lea     \@exception(pc),a0
-                move.l  a0,($10)
-                move.l  a0,($2c)
-                move.l  a0,($f4)
-                nop                     ; flush write pipeline
+        // set exception vectors
+                lea     exception(pc),a0
+                move.l  a0,(0x10)
+                move.l  a0,(0x2c)
+                move.l  a0,(0xf4)
+                nop                     // flush write pipeline
 
-        ; try 68040 / 68060
-                lea     \@1(pc),a0
-                dc.w    $f4f8           ; cpusha bc
-                bra     \@ret
-\@1:
-        ; try 68030
-                lea     \@2(pc),a0
-                dc.l    $4e7a0002       ; movec.l cacr,d0
+        // try 68040 / 68060
+                lea     1f(pc),a0       // a0 = where the exception handler resumes via jmp (a0)
+                dc.w    0xf4f8          // cpusha bc
+                bra     ret
+1:
+        // try 68030
+                lea     2f(pc),a0       // a0 = where the exception handler resumes via jmp (a0)
+                movec.l cacr,d0
                 move.l  d0,d1
-                or.w    #$0808,d1
-                dc.l    $4e7b1002       ; movec.l d1,cacr
-                dc.l    $4e7b0002       ; movec.l d0,cacr
-;;;                bra     \@ret
-\@2:
+                or.w    #0x0808,d1
+                movec.l d1,cacr
+                movec.l d0,cacr
+//                  bra     ret
+2:
 
-\@ret:          move.l  (sp)+,($f4)
-                move.l  (sp)+,($2c)
-                move.l  (sp)+,($10)
-                nop                     ; flush write pipeline
+ret:            move.l  (sp)+,(0xf4)
+                move.l  (sp)+,(0x2c)
+                move.l  (sp)+,(0x10)
+                nop                     // flush write pipeline
                 rts
 
-\@done:
+done:
         endm
 
 
@@ -180,9 +173,9 @@ macro(SUPEXEC_FLUSH_CACHE)
 macro(BOTH_FLUSH_CACHE)
                 MINT_FLUSH_CACHE
                 tst.l   d0
-                beq     \@done
+                beq     done2
                 SUPEXEC_FLUSH_CACHE
-\@done:
+done2:
         endm
 
 
@@ -198,70 +191,76 @@ macro(BOTH_FLUSH_CACHE)
 #endif
 
 
-; /*************************************************************************
-; // entry - the text segment of a compressed executable
-; //
-; // note: compressed programs never have the F_SHTEXT flag set,
-; //       so we can assume that the text, data & bss segments
-; //       are contiguous in memory
-; **************************************************************************/
+
+/*************************************************************************
+// entry - the text segment of a compressed executable
+//
+// note: compressed programs never have the F_SHTEXT flag set,
+//       so we can assume that the text, data & bss segments
+//       are contiguous in memory
+**************************************************************************/
 
 #if defined(__ASL__)
                 padding off
 #endif
-                text
-                dc.b    'UPX1'          ; marker for o2bin.pl
+
+section         tos0
+                //text
+                //dc.b    'UPX1'          // marker for o2bin.pl
 
 start:
-                move.l  a0,d0           ; a0 is basepage if accessory
+                move.l  a0,d0           // a0 is basepage if accessory
                 beq     L(l_app)
-                move.l  4(a0),sp        ; accessory - get stack
+                move.l  4(a0),sp        // accessory - get stack
                 bra     L(start)
 
-L(l_app):       move.l  4(sp),d0        ; application - get basepage
+L(l_app):       move.l  4(sp),d0        // application - get basepage
 L(start):       movem.l d1-d7/a0-a6,-(sp)
 
 
-; ------------- restore original basepage
+// ------------- restore original basepage
 
-        ; we also setup d4 and a6 here, and we prepare a4
+        // we also setup d4 and a6 here, and we prepare a4
 
-                move.l  d0,a2           ; a2 = basepage
+                move.l  d0,a2           // a2 = basepage
                 addq.l  #p_tbase,a2
                 move.l  (a2)+,a6
-                move.l  a6,d4                   ; d4 = p_tbase
-                move.l  #'up11',(a2)    ; p_tlen
+                move.l  a6,d4                   // d4 = p_tbase
+                move.l  #up11,(a2)      // p_tlen
                 add.l   (a2)+,a6
-                move.l  a6,(a2)+        ; p_dbase
-                move.l  #'up12',(a2)    ; p_dlen
-                add.l   (a2)+,a6                ; a6 = decompressed p_bbase
-                move.l  (a2),a4                 ; a4 = compressed p_bbase
-                move.l  a6,(a2)+        ; p_bbase
-                move.l  #'up13',(a2)    ; p_blen
+                move.l  a6,(a2)+        // p_dbase
+                move.l  #up12,(a2)      // p_dlen
+                add.l   (a2)+,a6                // a6 = decompressed p_bbase
+                move.l  (a2),a4                 // a4 = compressed p_bbase
+                move.l  a6,(a2)+        // p_bbase
+                move.l  #up13,(a2)      // p_blen
 
 
-; ------------- copy data segment (from a4 to a3, downwards)
+// ------------- copy data segment (from a4 to a3, downwards)
 
-                ; a4 (top of compressed data) already initialized above
+                // a4 (top of compressed data) already initialized above
 
                 move.l  d4,a3
-                add.l   #'up21',a3      ; top of data segment + offset
+                add.l   #up21,a3        // top of data segment + offset
 
 #if defined(SMALL)
 
-                move.l  #'up22',d0      ; (len / 4)
+                move.l  #up22,d0        // (len / 4)
 
-        ; copy 4 bytes per loop
+        // copy 4 bytes per loop
 L(loop):        move.l  -(a4),-(a3)
-                ;;subq.l  #1,d0
-                dc.b    'u1'            ; subq.l #1,d0 / subq.w #1,d0
+section         subql_1d0
+                subq.l #1,d0
+section         subqw_1d0
+                subq.w #1,d0
+section         s_bneloop0
                 bne     L(loop)
 
 #else
 
-                move.l  #'up22',d0      ; (len / 160)
+                move.l  #up22,d0        // (len / 160)
 
-        ; loop1 - use 10 registers to copy 4*10*4 = 160 bytes per loop
+        // loop1 - use 10 registers to copy 4*10*4 = 160 bytes per loop
 L(loop1):
                 lea.l   -160(a4),a4
                 movem.l 120(a4),d1-d3/d5-d7/a0-a2/a5
@@ -272,125 +271,139 @@ L(loop1):
                 movem.l d1-d3/d5-d7/a0-a2/a5,-(a3)
                 movem.l (a4),d1-d3/d5-d7/a0-a2/a5
                 movem.l d1-d3/d5-d7/a0-a2/a5,-(a3)
-                ;;subq.l  #1,d0
-                dc.b    'u1'            ; subq.l #1,d0 / subq.w #1,d0
+section         subql_1d0
+                subq.l #1,d0
+section         subqw_1d0
+                subq.w #1,d0
+section         s_bneloop0
                 bne     L(loop1)
 
-        ; loop2 - copy the remaining 4..160 bytes
-                ;;moveq.l #xx,d0          ; ((len % 160) / 4) - 1
-                dc.b    'u2'            ; moveq.l #xx,d0
+        // loop2 - copy the remaining 4..160 bytes
+                // moveq.l #xx,d0          // ((len % 160) / 4) - 1
+#if 0
+                dc.b    'u2'            // moveq.l #xx,d0
+#else
+                moveq.l  #copy_remain,d0
+#endif
 
 L(loop2):       move.l  -(a4),-(a3)
                 dbra    d0,L(loop2)
 
 #endif
 
-        ; a3 now points to the start of the compressed block
+        // a3 now points to the start of the compressed block
 
 
-; ------------- copy code to stack and setup ASTACK
+// ------------- copy code to stack and setup ASTACK
 
-; Copy the final startup code below the stack. This will get
-; called via "jmp (ASTACK)" after decompression and relocation.
+// Copy the final startup code below the stack. This will get
+// called via "jmp (ASTACK)" after decompression and relocation.
 
 copy_to_stack:
 
                 lea.l   clear_bss_end(pc),a2
-                move.l  d4,-(ASTACK)    ; entry point for final jmp
+                move.l  d4,-(ASTACK)    // entry point for final jmp
 
-                moveq.l #((clear_bss_end-clear_bss)/2-1),d5
-L(loop):        move.w  -(a2),-(ASTACK)
+//                moveq.l #((clear_bss_end-clear_bss)/2-1),d5
+                moveq.l #copy_to_stack_len,d5
+L(loop6):       move.w  -(a2),-(ASTACK)
                 subq.l  #1,d5
-                bcc     L(loop)
+                bcc     L(loop6)
 
 #ifdef FLUSH_CACHE
-                ; patch code: on the stack, the `rts' becomes a `nop'
-                move.w #$4e71,flush_cache_rts-clear_bss(ASTACK)
+                // patch code: on the stack, the `rts' becomes a `nop'
+                move.w #0x4e71,flush_cache_rts-clear_bss(ASTACK)
 #endif
 
-        ; note: d5.l is now -1 (needed for decompressor)
+        // note: d5.l is now -1 (needed for decompressor)
 
 
-; -------------
+// -------------
 
 #ifdef FLUSH_CACHE
                 bsr     flush_cache
 #endif
 
 
-; ------------- prepare decompressor
+// ------------- prepare decompressor
 
-        ; a3 still points to the start of the compressed block
-                move.l  d4,a4           ; dest. for decompressing
+        // a3 still points to the start of the compressed block
+                move.l  d4,a4           // dest. for decompressing
 
 #define NRV_NO_INIT
 
-                ;;moveq.l #-1,d5        ; last_off = -1
-                moveq.l #-128,d0        ; d0.b = $80
+                // moveq.l #-1,d5        // last_off = -1 (d5.l was left at -1 by the copy loop above)
+                moveq.l #-128,d0        // d0.b = 0x80
 #if defined(NRV2B)
                 moveq.l #-1,d7
-                moveq.l #-$68,d6        ; 0xffffff98
-                lsl.w   #5,d6           ; 0xfffff300 == -0xd00
+                moveq.l #-0x68,d6       // 0xffffff98
+                lsl.w   #5,d6           // 0xfffff300 == -0xd00
 #elif defined(NRV2D)
                 moveq.l #-1,d7
-                moveq.l #-$50,d6        ; 0xffffffb0
-                lsl.w   #4,d6           ; 0xfffffb00 == -0x500
+                moveq.l #-0x50,d6       // 0xffffffb0
+                lsl.w   #4,d6           // 0xfffffb00 == -0x500
 #elif defined(NRV2E)
                 moveq.l #0,d7
-                moveq.l #-$50,d6        ; 0xffffffb0
-                lsl.w   #4,d6           ; 0xfffffb00 == -0x500
+                moveq.l #-0x50,d6       // 0xffffffb0
+                lsl.w   #4,d6           // 0xfffffb00 == -0x500
 #else
 #  error
 #endif
 
 
-; ------------- jump to copied decompressor
+// ------------- jump to copied decompressor
 
                 move.l  d4,a2
-                add.l   #'up31',a2
-                jmp     (a2)            ; jmp decompr_start
+                add.l   #up31,a2
+                jmp     (a2)            // jmp decompr_start
 
 
-; /*************************************************************************
-; // this is the final part of the startup code which runs in the stack
-; **************************************************************************/
+/*************************************************************************
+// this is the final part of the startup code which runs in the stack
+**************************************************************************/
 
-; ------------- clear dirty bss
+// ------------- clear dirty bss
 
 clear_bss:
 
-        ; on entry:
-        ;   ASTACK      == pc == clear_bss (on stack)
-        ;   a6          start of dirty bss [long living register]
-        ;   d6.l        number of clr loops
-        ;   d3.l        0
+        // on entry:
+        //   ASTACK      == pc == clear_bss (on stack)
+        //   a6          start of dirty bss [long living register]
+        //   d6.l        number of clr loops
+        //   d3.l        0
 
 
 #if defined(SMALL)
-L(loop):        move.l  d3,(a6)+
-                ;;subq.l  #1,d6
-                dc.b    'u4'            ; subq.l #1,d6 / subq.w #1,d6
-                bne     L(loop)
+L(loop3):       move.l  d3,(a6)+
+section         subql_1d6
+                subq.l #1,d6
+section         subqw_1d6
+                subq.w #1,d6
+section         s_bneloop3
+                bne     L(loop3)
 #else
-        ; the dirty bss is usually not too large, so we don't
-        ; bother making movem optimizations here
-L(loop):        move.l  d3,(a6)+
+        // the dirty bss is usually not too large, so we don't
+        // bother making movem optimizations here
+L(loop3):       move.l  d3,(a6)+
                 move.l  d3,(a6)+
                 move.l  d3,(a6)+
                 move.l  d3,(a6)+
-                ;;subq.l  #1,d6
-                dc.b    'u4'            ; subq.l #1,d6 / subq.w #1,d6
-                bne     L(loop)
+section         subql_1d6
+                subq.l #1,d6
+section         subqw_1d6
+                subq.w #1,d6
+section         s_bneloop3
+                bne     L(loop3)
 #endif
 
 
-; ------------- flush the cache
+// ------------- flush the cache
 
 #ifdef FLUSH_CACHE
 
-; info:
-;  This is also called as a subroutine (before decompression, NOT running
-;  in the stack). When running in the stack the `rts' is replaced by a `nop'.
+// info:
+//  This is also called as a subroutine (before decompression, NOT running
+//  in the stack). When running in the stack the `rts' is replaced by a `nop'.
 
 flush_cache:
                 FLUSH_CACHE
@@ -400,23 +413,23 @@ flush_cache_rts:
 #endif
 
 
-; ------------- restore ASTACK
+// ------------- restore ASTACK
 
-                lea     clear_bss_end-clear_bss+4(ASTACK),sp
+                lea     clear_bss_size+4(ASTACK),sp
 
-        ;; assert sp == clear_bss_end(pc)+4
+        // assert sp == clear_bss_end(pc)+4
 
 
-; ------------- clear the dirty stack
+// ------------- clear the dirty stack
 
 #if 0
 
-; better don't do this - we are currently running in the stack
-; and don't want to make yet another instruction-cache-line dirty
+// Better not to do this - we are currently running in the stack
+// and don't want to make yet another instruction-cache-line dirty.
 
 clear_dirty_stack:
 
-                ; clear down to clear_bss(pc) + 32 extra longs
+                // clear down to clear_bss(pc) + 32 extra longs
                 moveq.l #((L(loop)-clear_bss+3)/4+32-1),d0
                 lea     L(loop)(pc),a0
 L(loop):        move.l  d3,-(a0)
@@ -425,48 +438,46 @@ L(loop):        move.l  d3,-(a0)
 #endif
 
 
-; ------------- start program
+// ------------- start program
 
                 movem.l (sp)+,d1-d7/a0-a6
                 move.l  a0,d0
-                beq     L(l_app)
-                sub.l   sp,sp           ; accessory: no stack
-L(l_app):       dc.w    $4ef9           ; jmp $xxxxxxxx - jmp to text segment
+                beq     L(l_app1)
+                sub.l   sp,sp           // accessory: no stack
+L(l_app1):      dc.w    0x4ef9          // jmp $xxxxxxxx - jmp to text segment
 
 clear_bss_end:
 
 
-; /*************************************************************************
-; // UPX ident & packheader
-; **************************************************************************/
+/*************************************************************************
+// UPX ident & packheader
+**************************************************************************/
 
+#if 0
 #if defined(SMALL)
 #  include "include/ident_s.ash"
 #else
 #  include "include/ident_n.ash"
 #endif
+#endif
 
-                align4
+//                align4
 
-                ; 32 bytes - #include "header.ash"
-                dc.b    85,80,88,33     ;     UPX_MAGIC_LE32
-                dc.b    161,216,208,213 ;     UPX_MAGIC2_LE32
-                dc.l    0,0,0,0,0
-                dc.b    0,0,0,45
+#include "include/header2.ash"
 
 
-        ; end of text segment - size is a multiple of 4
+        // end of text segment - size is a multiple of 4
 
 
-; /*************************************************************************
-; // This part is appended after the compressed data.
-; // It runs in the last part of the dirty bss (after the
-; // relocations and the original fileheader).
-; **************************************************************************/
+/*************************************************************************
+// This part is appended after the compressed data.
+// It runs in the last part of the dirty bss (after the
+// relocations and the original fileheader).
+**************************************************************************/
 
 cutpoint:
 
-; ------------- decompress (from a3 to a4)
+// ------------- decompress (from a3 to a4)
 
 #define a0 A3
 #define a1 A4
@@ -474,9 +485,9 @@ cutpoint:
 #define d2 D3
 
 #if defined(NRV2B)
-#  include "arch/m68k/nrv2b_d.ash"
+//#  include "arch/m68k/nrv2b_d.ash"
 #elif defined(NRV2D)
-#  include "arch/m68k/nrv2d_d.ash"
+//#  include "arch/m68k/nrv2d_d.ash"
 #elif defined(NRV2E)
 #  include "arch/m68k/nrv2e_d.ash"
 #else
@@ -488,71 +499,52 @@ cutpoint:
 #undef a3
 #undef d2
 
-        ; note: d3.l is 0 from decompressor above
+        // note: d3.l is 0 from decompressor above
 
 
-; ------------- prepare d6 for clearing the dirty bss
+// ------------- prepare d6 for clearing the dirty bss
 
 #if defined(SMALL)
-                move.l  #'up41',d6      ; dirty_bss / 4
+                move.l  #up41,d6        // dirty_bss / 4
 #else
-                move.l  #'up41',d6      ; dirty_bss / 16
+                move.l  #up41,d6        // dirty_bss / 16
 #endif
 
 
-; ------------- test if we need to reloc
+section         reloc
 
-                dc.b    'u3'            ; moveq.l #1,d5 / jmp (ASTACK)
+                moveq.l #1,d5
 
+// The decompressed relocations now are just after the decompressed
+// data segment, i.e. at the beginning of the (dirty) bss.
 
-; ------------- reloc
+        // note: d3.l is still 0
 
-reloc:
-
-; The decompressed relocations now are just after the decompressed
-; data segment, i.e. at the beginning of the (dirty) bss.
-
-        ; note: d3.l is still 0
-
-                move.l  a6,a0           ; a0 = start of relocations
+                move.l  a6,a0           // a0 = start of relocations
 
                 move.l  d4,a1
-                add.l   (a0)+,a1        ; get initial fixup
+                add.l   (a0)+,a1        // get initial fixup
 
-L(loop1):       add.l   d3,a1           ; increase fixup
-                add.l   d4,(a1)         ; reloc one address
-L(loop2):       move.b  (a0)+,d3
+L(loopx1):      add.l   d3,a1           // increase fixup
+                add.l   d4,(a1)         // reloc one address
+L(loopx2):      move.b  (a0)+,d3
                 beq     reloc_end
-                cmp.b   d5,d3           ; note: d5.b is #1 from above
-                bne     L(loop1)
-                lea     254(a1),a1      ; d3 == 1 -> add 254, don't reloc
-                bra     L(loop2)
+                cmp.b   d5,d3           // note: d5.b is #1 from above
+                bne     L(loopx1)
+                lea     254(a1),a1      // d3 == 1 -> add 254, don't reloc
+                bra     L(loopx2)
 
 reloc_end:
 
 
-; ------------- clear dirty bss & start program
+// ------------- clear dirty bss & start program
 
-; We are currently running in the dirty bss.
-; Jump to the code we copied below the stack.
+// We are currently running in the dirty bss.
+// Jump to the code we copied below the stack.
 
-        ; note: d3.l is still 0
+        // note: d3.l is still 0
 
-                jmp     (ASTACK)        ; jmp clear_bss (on stack)
+                jmp     (ASTACK)        // jmp clear_bss (on stack)
 
-
-
-eof:
-                dc.w    cutpoint-start  ; size of entry
-                dc.w    eof-cutpoint    ; size of decompressor
-                dc.w    decompr_start-cutpoint  ; offset of decompressor start
-                dc.b    'UPX9'          ; marker for o2bin.pl
-
-#if defined(__ASL__)
-                endsection code
-#endif
-                end
-
-
-; vi:ts=8:et:nowrap
+// vi:ts=8:et:nowrap