I'm looking into the menu slide performance issue and it looks like it is mainly compiler optimization.
This is from the original code. The assembler for sliding the main menu onto the screen.
**************************************************************
* FUNCTION *
**************************************************************
void __stdcall display_slide_down_main_menu(void)
void <VOID> <RETURN>
undefined4 Stack[-0x14]:4 local_14 XREF[1]: 8000e7dc(*)
undefined4 Stack[-0x28]:4 local_28 XREF[1]: 8000e7d8(*)
display_slide_down_main_menu XREF[1]: setup_main_menu:8002916c(c)
8000e6f8 00 5e 2d e9 stmdb sp!,{r9 r10 r11 r12 lr}
8000e6fc f0 01 2d e9 stmdb sp!,{r4 r5 r6 r7 r8}
8000e700 dc 00 9f e5 ldr r0,[DAT_8000e7e4] = 001B1B1Bh
8000e704 ea 52 00 eb bl convert_color ushort convert_color(uint color)
8000e708 00 40 a0 e1 mov r4,r0
8000e70c 00 00 a0 e3 mov r0,#0x0
8000e710 e7 52 00 eb bl convert_color ushort convert_color(uint color)
8000e714 cc 60 9f e5 ldr r6,[PTR_DAT_8000e7e8] = 80192ef8
8000e718 00 50 a0 e1 mov r5,r0
8000e71c 02 20 a0 e3 mov r2,#0x2
8000e720 2e 00 a0 e3 mov r0,#0x2e
8000e724 b0 20 c6 e1 strh r2,[r6,#0x0]=>DAT_80192ef8 = 0154h
8000e728 97 10 a0 e3 mov r1,#0x97
8000e72c b2 00 c6 e1 strh r0,[r6,#0x2]=>DAT_80192efa = 0914h
8000e730 b4 10 c6 e1 strh r1,[r6,#0x4]=>DAT_80192efc = 6828h
8000e734 b6 00 c6 e1 strh r0,[r6,#0x6]=>DAT_80192efe = 1409h
8000e738 0e 24 00 eb bl set_screen_to_global_pointer void set_screen_to_global_pointe
8000e73c a8 a0 9f e5 ldr r10,[->BITMAP_MAIN_MENU] = 800ac420
8000e740 a8 b0 9f e5 ldr r11,[Global_Frame_Buffer_Pointer] = 8019CF60h
8000e744 eb 00 a0 e3 mov r0,#0xeb
LAB_8000e748 XREF[1]: 8000e7d4(j)
8000e748 eb 00 50 e3 cmp r0,#0xeb
8000e74c 00 30 a0 e1 mov r3,r0
8000e750 19 00 00 2a bcs LAB_8000e7bc
LAB_8000e754 XREF[1]: 8000e7b8(j)
8000e754 b2 c0 d6 e1 ldrh r12,[r6,#0x2]=>DAT_80192efa = 0914h
8000e758 00 20 43 e0 sub r2,r3,r0
8000e75c b0 70 d6 e1 ldrh r7,[r6,#0x0]=>DAT_80192ef8 = 0154h
8000e760 02 c0 8c e0 add r12,r12,r2
8000e764 95 10 a0 e3 mov r1,#0x95
8000e768 93 01 01 e0 mul r1,r3,r1
8000e76c 8c 91 8c e0 add r9,r12,r12, lsl #0x3
8000e770 00 80 9b e5 ldr r8,[r11,#0x0]=>DAT_8019cf60
8000e774 0c c2 89 e0 add r12,r9,r12, lsl #0x4
8000e778 8c c2 87 e0 add r12,r7,r12, lsl #0x5
8000e77c 95 20 a0 e3 mov r2,#0x95
8000e780 81 10 8a e0 add r1,r10,r1, lsl #0x1
8000e784 8c 70 88 e0 add r7,r8,r12, lsl #0x1
LAB_8000e788 XREF[1]: 8000e7a8(j)
8000e788 b2 c0 d1 e0 ldrh r12,[r1],#0x2=>TEXT_BITMAP_CALIB_START
8000e78c 00 00 5c e3 cmp r12,#0x0
8000e790 04 c0 a0 01 moveq r12,r4
8000e794 01 00 00 0a beq LAB_8000e7a0
8000e798 3e 0b 5c e3 cmp r12,#0xf800
8000e79c 05 c0 a0 01 moveq r12,r5
LAB_8000e7a0 XREF[1]: 8000e794(j)
8000e7a0 01 20 52 e2 subs r2,r2,#0x1
8000e7a4 b2 c0 c7 e0 strh r12,[r7],#0x2
8000e7a8 f6 ff ff 1a bne LAB_8000e788
8000e7ac 01 10 83 e2 add r1,r3,#0x1
8000e7b0 01 38 c1 e3 bic r3,r1,#0x10000
8000e7b4 eb 00 53 e3 cmp r3,#0xeb
8000e7b8 e5 ff ff 3a bcc LAB_8000e754
LAB_8000e7bc XREF[1]: 8000e750(j)
8000e7bc 30 10 9f e5 ldr r1,[DAT_8000e7f4] = 0000E38Fh
8000e7c0 90 01 01 e0 mul r1,r0,r1
8000e7c4 21 0a 40 e0 sub r0,r0,r1, lsr #0x14
8000e7c8 01 00 40 e2 sub r0,r0,#0x1
8000e7cc 00 08 a0 e1 mov r0,r0, lsl #0x10
8000e7d0 20 08 b0 e1 movs r0,r0, lsr #0x10
8000e7d4 db ff ff 1a bne LAB_8000e748
8000e7d8 f0 01 bd e8 ldmia sp!,{r4 r5 r6 r7 r8}=>local_28
8000e7dc 00 5e bd e8 ldmia sp!,{r9 r10 r11 r12 lr}=>local_14
8000e7e0 de 23 00 ea b set_frame_to_global_pointer
This is what Ghidra shows for the code I compile
**************************************************************
* FUNCTION *
**************************************************************
void __stdcall display_slide_top_rect_onto_screen(ushort
void <VOID> <RETURN>
ushort r0:2 xpos
ushort r1:2 ypos
ushort r2:2 width
ushort r3:2 height
uint Stack[0x0]:4 speed XREF[2]: 80001a94(R),
80001be0(R)
undefined4 Stack[-0x4]:4 local_4 XREF[2]: 80001a54(W),
80001c28(*)
undefined4 Stack[-0xc]:4 local_c XREF[4]: 80001b2c(W),
80001b60(R),
80001b98(R),
80001ba0(W)
undefined4 Stack[-0x10]:4 local_10 XREF[4]: 80001b14(W),
80001b50(R),
80001bb0(R),
80001bb8(W)
undefined2 Stack[-0x12]:2 local_12 XREF[7]: 80001ac4(W),
80001af0(R),
80001b30(R),
80001bd8(R),
80001bdc(R),
80001c0c(W),
80001c10(R)
undefined2 Stack[-0x14]:2 local_14 XREF[4]: 80001b34(W),
80001bbc(R),
80001bc4(W),
80001bc8(R)
undefined2 Stack[-0x16]:2 local_16 XREF[6]: 80001b40(W),
80001b48(R),
80001b58(R),
80001b70(R),
80001b78(W),
80001b7c(R)
undefined4 Stack[-0x1c]:4 local_1c XREF[3]: 80001ae0(W),
80001b04(R),
80001b20(R)
undefined2 Stack[-0x22]:2 local_22 XREF[2]: 80001a74(W),
80001ac8(R)
undefined2 Stack[-0x24]:2 local_24 XREF[2]: 80001a7c(W),
80001acc(R)
undefined2 Stack[-0x26]:2 local_26 XREF[2]: 80001a84(W),
80001b80(R)
undefined2 Stack[-0x28]:2 local_28 XREF[4]: 80001a8c(W),
80001a90(R),
80001aa8(R),
80001bcc(R)
display_slide_top_rect_onto_screen XREF[4]: FUN_80005ae0:80005bac(c),
FUN_80005f40:8000608c(c),
FUN_8000667c:800067c8(c),
FUN_80006de8:80006f00(c)
80001a54 04 b0 2d e5 str r11,[sp,#local_4]!
80001a58 00 b0 8d e2 add r11,sp,#0x0
80001a5c 24 d0 4d e2 sub sp,sp,#0x24
80001a60 00 c0 a0 e1 mov r12,xpos
80001a64 01 00 a0 e1 mov xpos,ypos
80001a68 02 10 a0 e1 mov ypos,width
80001a6c 03 20 a0 e1 mov width,height
80001a70 0c 30 a0 e1 mov height,r12
80001a74 be 31 4b e1 strh height,[r11,#local_22]
80001a78 00 30 a0 e1 mov height,xpos
80001a7c b0 32 4b e1 strh height,[r11,#local_24]
80001a80 01 30 a0 e1 mov height,ypos
80001a84 b2 32 4b e1 strh height,[r11,#local_26]
80001a88 02 30 a0 e1 mov height,width
80001a8c b4 32 4b e1 strh height,[r11,#local_28]
80001a90 b4 32 5b e1 ldrh height,[r11,#local_28]
80001a94 04 20 9b e5 ldr width,[r11,#speed]
80001a98 92 03 03 e0 mul height,width,height
80001a9c 23 3a a0 e1 mov height,height, lsr #0x14
80001aa0 03 38 a0 e1 mov height,height, lsl #0x10
80001aa4 23 38 a0 e1 mov height,height, lsr #0x10
80001aa8 b4 22 5b e1 ldrh width,[r11,#local_28]
80001aac 03 30 42 e0 sub height,width,height
80001ab0 03 38 a0 e1 mov height,height, lsl #0x10
80001ab4 23 38 a0 e1 mov height,height, lsr #0x10
80001ab8 01 30 43 e2 sub height,height,#0x1
80001abc 03 38 a0 e1 mov height,height, lsl #0x10
80001ac0 23 38 a0 e1 mov height,height, lsr #0x10
80001ac4 be 30 4b e1 strh height,[r11,#local_12]
80001ac8 be 21 5b e1 ldrh width,[r11,#local_22]
80001acc b0 32 5b e1 ldrh height,[r11,#local_24]
80001ad0 58 11 9f e5 ldr ypos,[DAT_80001c30] = 80012208h
80001ad4 b0 12 d1 e1 ldrh ypos,[ypos,#0x20]=>DAT_80012228
80001ad8 91 03 03 e0 mul height,ypos,height
80001adc 03 30 82 e0 add height,width,height
80001ae0 18 30 0b e5 str height,[r11,#local_1c]
80001ae4 49 00 00 ea b LAB_80001c10
LAB_80001ae8 XREF[1]: 80001c18(j)
80001ae8 40 31 9f e5 ldr height,[DAT_80001c30] = 80012208h
80001aec 0c 20 93 e5 ldr width,[height,#0xc]=>DAT_80012214
80001af0 fe 30 5b e1 ldrsh height,[r11,#local_12]
80001af4 34 11 9f e5 ldr ypos,[DAT_80001c30] = 80012208h
80001af8 b0 12 d1 e1 ldrh ypos,[ypos,#0x20]=>DAT_80012228
80001afc 91 03 03 e0 mul height,ypos,height
80001b00 03 10 a0 e1 mov ypos,height
80001b04 18 30 1b e5 ldr height,[r11,#local_1c]
80001b08 03 30 81 e0 add height,ypos,height
80001b0c 83 30 a0 e1 mov height,height, lsl #0x1
80001b10 03 30 82 e0 add height,width,height
80001b14 0c 30 0b e5 str height,[r11,#local_10]
80001b18 10 31 9f e5 ldr height,[DAT_80001c30] = 80012208h
80001b1c 08 20 93 e5 ldr width,[height,#0x8]=>DAT_80012210
80001b20 18 30 1b e5 ldr height,[r11,#local_1c]
80001b24 83 30 a0 e1 mov height,height, lsl #0x1
80001b28 03 30 82 e0 add height,width,height
80001b2c 08 30 0b e5 str height,[r11,#local_c]
80001b30 be 30 5b e1 ldrh height,[r11,#local_12]
80001b34 b0 31 4b e1 strh height,[r11,#local_14]
80001b38 22 00 00 ea b LAB_80001bc8
LAB_80001b3c XREF[1]: 80001bd4(j)
80001b3c 00 30 a0 e3 mov height,#0x0
80001b40 b2 31 4b e1 strh height,[r11,#local_16]
80001b44 0c 00 00 ea b LAB_80001b7c
LAB_80001b48 XREF[1]: 80001b88(j)
80001b48 b2 31 5b e1 ldrh height,[r11,#local_16]
80001b4c 83 30 a0 e1 mov height,height, lsl #0x1
80001b50 0c 20 1b e5 ldr width,[r11,#local_10]
80001b54 03 20 82 e0 add width,width,height
80001b58 b2 31 5b e1 ldrh height,[r11,#local_16]
80001b5c 83 30 a0 e1 mov height,height, lsl #0x1
80001b60 08 10 1b e5 ldr ypos,[r11,#local_c]
80001b64 03 30 81 e0 add height,ypos,height
80001b68 b0 20 d2 e1 ldrh width,[width,#0x0]
80001b6c b0 20 c3 e1 strh width,[height,#0x0]
80001b70 b2 31 5b e1 ldrh height,[r11,#local_16]
80001b74 01 30 83 e2 add height,height,#0x1
80001b78 b2 31 4b e1 strh height,[r11,#local_16]
LAB_80001b7c XREF[1]: 80001b44(j)
80001b7c b2 21 5b e1 ldrh width,[r11,#local_16]
80001b80 b2 32 5b e1 ldrh height,[r11,#local_26]
80001b84 03 00 52 e1 cmp width,height
80001b88 ee ff ff 9a bls LAB_80001b48
80001b8c 9c 30 9f e5 ldr height,[DAT_80001c30] = 80012208h
80001b90 b0 32 d3 e1 ldrh height,[height,#0x20]=>DAT_80012228
80001b94 83 30 a0 e1 mov height,height, lsl #0x1
80001b98 08 20 1b e5 ldr width,[r11,#local_c]
80001b9c 03 30 82 e0 add height,width,height
80001ba0 08 30 0b e5 str height,[r11,#local_c]
80001ba4 84 30 9f e5 ldr height,[DAT_80001c30] = 80012208h
80001ba8 b0 32 d3 e1 ldrh height,[height,#0x20]=>DAT_80012228
80001bac 83 30 a0 e1 mov height,height, lsl #0x1
80001bb0 0c 20 1b e5 ldr width,[r11,#local_10]
80001bb4 03 30 82 e0 add height,width,height
80001bb8 0c 30 0b e5 str height,[r11,#local_10]
80001bbc b0 31 5b e1 ldrh height,[r11,#local_14]
80001bc0 01 30 83 e2 add height,height,#0x1
80001bc4 b0 31 4b e1 strh height,[r11,#local_14]
LAB_80001bc8 XREF[1]: 80001b38(j)
80001bc8 b0 21 5b e1 ldrh width,[r11,#local_14]
80001bcc b4 32 5b e1 ldrh height,[r11,#local_28]
80001bd0 03 00 52 e1 cmp width,height
80001bd4 d8 ff ff 9a bls LAB_80001b3c
80001bd8 be 20 5b e1 ldrh width,[r11,#local_12]
80001bdc fe 30 5b e1 ldrsh height,[r11,#local_12]
80001be0 04 10 9b e5 ldr ypos,[r11,#speed]
80001be4 91 03 03 e0 mul height,ypos,height
80001be8 23 3a a0 e1 mov height,height, lsr #0x14
80001bec 03 38 a0 e1 mov height,height, lsl #0x10
80001bf0 23 38 a0 e1 mov height,height, lsr #0x10
80001bf4 03 30 42 e0 sub height,width,height
80001bf8 03 38 a0 e1 mov height,height, lsl #0x10
80001bfc 23 38 a0 e1 mov height,height, lsr #0x10
80001c00 01 30 43 e2 sub height,height,#0x1
80001c04 03 38 a0 e1 mov height,height, lsl #0x10
80001c08 23 38 a0 e1 mov height,height, lsr #0x10
80001c0c be 30 4b e1 strh height,[r11,#local_12]
LAB_80001c10 XREF[1]: 80001ae4(j)
80001c10 fe 30 5b e1 ldrsh height,[r11,#local_12]
80001c14 00 00 53 e3 cmp height,#0x0
80001c18 b2 ff ff aa bge LAB_80001ae8
80001c1c 00 00 a0 e1 mov xpos,xpos
80001c20 00 00 a0 e1 mov xpos,xpos
80001c24 00 d0 8b e2 add sp,r11,#0x0
80001c28 04 b0 9d e4 ldr r11=>local_4,[sp],#0x4
80001c2c 1e ff 2f e1 bx lr
It is much longer for the loop portion. The psuedo C output of Ghidra is not much different for the both
This is my C code
void display_slide_top_rect_onto_screen(uint16 xpos, uint16 ypos, uint16 width, uint16 height, uint32 speed)
{
uint16 *ptr1, *ptr2;
int16 startline; //Needs to be an int because it has to become negative to stop
uint16 line;
uint32 startxy;
uint16 pixel;
//Starting line of the rectangle to display first
startline = height - ((height * speed) >> 20) - 1;
//Start x,y offset for source and destination calculation
startxy = xpos + (ypos * displaydata.pixelsperline);
//Draw lines as long as is needed to get the whole rectangle on screen
while(startline >= 0)
{
//Source pointer is based on the current line
ptr2 = displaydata.sourcebuffer + startxy + (startline * displaydata.pixelsperline);
//Destination pointer is always the first line
ptr1 = displaydata.screenbuffer + startxy;
//Handle the needed number of lines for this loop
for(line=startline;line<=height;line++)
{
//Copy a single line to the screen buffer
for(pixel=0;pixel<=width;pixel++)
{
//Copy one pixel at a time
ptr1[pixel] = ptr2[pixel];
}
//Point to the next line of pixels in both destination and source
ptr1 += displaydata.pixelsperline;
ptr2 += displaydata.pixelsperline;
}
//Calculate the new starting line
startline = startline - 1 - ((startline * speed) >> 20);
}
}
This is what Ghidra makes of it
void display_slide_top_rect_onto_screen(ushort xpos,ushort ypos,ushort width,ushort height,uint speed)
{
int iVar1;
ushort local_16;
ushort local_14;
ushort local_12;
int local_10;
int local_c;
local_12 = (ushort)((((uint)height - (speed * height >> 0x14) & 0xffff) - 1) * 0x10000 >> 0x10);
iVar1 = (uint)xpos + (uint)*(ushort *)(DAT_80001c30 + 0x20) * (uint)ypos;
while (-1 < (short)local_12)
{
local_10 = *(int *)(DAT_80001c30 + 0xc) + ((uint)*(ushort *)(DAT_80001c30 + 0x20) * (int)(short)local_12 + iVar1) * 2;
local_c = *(int *)(DAT_80001c30 + 8) + iVar1 * 2;
local_14 = local_12;
while (local_14 <= height)
{
local_16 = 0;
while (local_16 <= width)
{
*(undefined2 *)(local_c + (uint)local_16 * 2) = *(undefined2 *)(local_10 + (uint)local_16 * 2);
local_16 = local_16 + 1;
}
local_c = local_c + (uint)*(ushort *)(DAT_80001c30 + 0x20) * 2;
local_10 = local_10 + (uint)*(ushort *)(DAT_80001c30 + 0x20) * 2;
local_14 = local_14 + 1;
}
local_12 = (ushort)((((uint)local_12 - (speed * (int)(short)local_12 >> 0x14) & 0xffff) - 1) * 0x10000 >> 0x10);
}
}
I can probably speed things up a bit by changing the variables to full integers instead of shorts. Only the pixel pointers need to be unsigned shorts.
If that does not do the trick I will have to raise the compilers optimization level and see what comes of that.