What about ARM GCC for the 32F4? I seem to be on 9.3.1.
I would expect x86 compilers to be clever.
GCC produces:
// f (optimised build, AArch64 / GNU as syntax)
// Copies 512 bytes, one byte per iteration, from the fixed address
// 0x08000000 into a 512-byte buffer on the stack, then returns the first
// byte of that buffer in w0. The copy loop is inline; no call is made.
f:
// Reserve the 512-byte stack buffer at [sp, sp+512).
sub sp, sp, #512
// x2 = 0xFFFFFFFFFFFFFDFF (-513); upper half patched by the movk below.
mov x2, -513
// x0 = sp + 512, one past the end of the buffer.
add x0, sp, 512
// x3 = 0x200; becomes the source end address after the movk below.
mov x3, 512
// x2 = 0xFFFFFFFFF7FFFDFF = -0x08000201.
movk x2, 0xf7ff, lsl 16
// x3 = 0x08000200 = source start + 512, the loop's end sentinel.
movk x3, 0x800, lsl 16
// x2 = (sp + 512) - 0x08000201 = sp - 0x08000001.
// This is a bias so that (incremented source pointer) + x2 addresses the buffer.
add x2, x0, x2
// x0 = 0x08000000, the source start address.
mov x0, 134217728
.L2:
// x1 = current source address; x0 advances to the next one.
mov x1, x0
add x0, x0, 1
// Flags are set here and consumed by the bne below; the intervening
// ldrb/strb do not modify flags.
cmp x0, x3
// Load the byte from the pre-increment source address.
ldrb w1, [x1]
// Store to x0 + x2 = sp + (x1 - 0x08000000), i.e. the matching buffer offset.
strb w1, [x0, x2]
// Repeat until the source pointer reaches 0x08000200 (512 iterations).
bne .L2
// Return value: the first byte of the stack buffer.
ldrb w0, [sp]
// Release the buffer and return.
add sp, sp, 512
ret
Not as clever!
With -O0, it does it literally:
// copy (unoptimised -O0 build, AArch64 / GNU as syntax)
// In:  x0 = destination address, x1 = source address, x2 = byte count
//      (roles inferred from how the spilled values are used below).
// Copies bytes one at a time from source to destination until the source
// pointer equals source + count. Every variable lives in a stack slot and
// is reloaded on each use.
// Stack slots: [sp,24] = destination pointer, [sp,16] = source pointer,
//              [sp,8]  = count,               [sp,40] = end (source + count).
copy:
// Allocate a 48-byte frame for the spilled arguments and the end pointer.
sub sp, sp, #48
// Spill the three incoming arguments.
str x0, [sp, 24]
str x1, [sp, 16]
str x2, [sp, 8]
// end = source + count, saved at [sp,40].
ldr x1, [sp, 16]
ldr x0, [sp, 8]
add x0, x1, x0
str x0, [sp, 40]
// Jump to the loop test first, so a source that already equals end copies nothing.
b .L2
.L3:
// x1 = current source pointer; write back source + 1.
ldr x1, [sp, 16]
add x0, x1, 1
str x0, [sp, 16]
// x0 = current destination pointer; write back destination + 1.
ldr x0, [sp, 24]
add x2, x0, 1
str x2, [sp, 24]
// Copy one byte from the old source address to the old destination address.
ldrb w1, [x1]
strb w1, [x0]
.L2:
// Loop test: continue while source pointer != end.
ldr x1, [sp, 16]
ldr x0, [sp, 40]
cmp x1, x0
bne .L3
// Two no-ops emitted by the compiler; they have no effect.
nop
nop
// Release the frame and return.
add sp, sp, 48
ret
// f (unoptimised -O0 build, AArch64 / GNU as syntax)
// Calls copy to move 512 bytes from the fixed address 0x08000000 into a
// stack buffer, then returns the buffer's first byte in w0.
// Frame (544 bytes): [sp,0..15] = saved x29/x30, [sp+24 .. sp+535] = buffer,
//                    [sp,543]   = one-byte temporary for the result.
f:
// Allocate the frame and save the frame pointer and link register at its base.
sub sp, sp, #544
stp x29, x30, [sp]
mov x29, sp
// Arguments for copy: x0 = buffer address, x2 = 512, x1 = 0x08000000.
add x0, sp, 24
mov x2, 512
mov x1, 134217728
bl copy
// Read the first buffer byte, store it to the temporary, then reload it
// as the return value (a redundant round trip, kept at -O0).
ldrb w0, [sp, 24]
strb w0, [sp, 543]
ldrb w0, [sp, 543]
// Restore x29/x30, release the frame and return.
ldp x29, x30, [sp]
add sp, sp, 544
ret
In most applications the extra time will not be relevant, but the doubling of code size may well be.