oh oh.. i already see some of the tricks it adopted (that i can use with -O1 to force the compiler to optimize)
comparison #1
this routine produces the same code with -O1, -O2 and -O3
27: void shift_sentbuf(sentbuf_t* sentbuf,unsigned int times) {
28: int idx;
29: while (times > 0) {
004EFE E00001 CP0 W1
004F00 32000D BRA Z, 0x4F1C
004F1A 3AFFF3 BRA NZ, 0x4F02
30: for (idx=0;idx<(_SENT_BUF_SIZE-1);idx++) {
31: sentbuf->buffer[idx] = sentbuf->buffer[idx+1];
004F02 900120 MOV [W0+4], W2
004F04 9001B0 MOV [W0+6], W3
004F06 BE8802 MOV.D W2, [W0]
004F08 900140 MOV [W0+8], W2
004F0A 9001D0 MOV [W0+10], W3
004F0C 980022 MOV W2, [W0+4]
004F0E 980033 MOV W3, [W0+6]
004F10 900160 MOV [W0+12], W2
004F12 9001F0 MOV [W0+14], W3
004F14 980042 MOV W2, [W0+8]
004F16 980053 MOV W3, [W0+10]
32: }
33: times--;
004F18 E90081 DEC W1, W1
34: }
35: }
004F1C 060000 RETURN
with -Os, however
27: void shift_sentbuf(sentbuf_t* sentbuf,unsigned int times) {
28: int idx;
29: while (times > 0) {
004D14 370008 BRA 0x4D26
004D26 E00001 CP0 W1
30: for (idx=0;idx<(_SENT_BUF_SIZE-1);idx++) {
004D20 518FE3 SUB W3, #0x3, [W15]
004D22 3AFFF9 BRA NZ, 0x4D16
31: sentbuf->buffer[idx] = sentbuf->buffer[idx+1];
004D16 E80183 INC W3, W3
004D18 900222 MOV [W2+4], W4
004D1A 9002B2 MOV [W2+6], W5
004D1C BE8904 MOV.D W4, [W2]
004D1E 410164 ADD W2, #0x4, W2
32: }
33: times--;
004D24 E90081 DEC W1, W1
004D26 E00001 CP0 W1
004D28 320003 BRA Z, 0x4D30
004D2A 780100 MOV W0, W2
004D2C EB0180 CLR W3
004D2E 37FFF3 BRA 0x4D16
004D30 060000 RETURN
34: }
35: }
reason is simple: #define _SENT_BUF_SIZE 4
so the compiler decided to unroll small loops at every optimization level except -Os. I wouldn't have done that, because on the dsPIC33E branches have a high latency; on the 33F it would be fine instead. But see, it uses the MOV.D instructions!
this however can be applied with every optimization.. with -O1
10: sent_t pop_sentbuf(sentbuf_t* sentbuf) {
11: sent_t data;
12: unsigned int idx;
13: if (sentbuf->idx > 0) {
004ED0 900901 MOV [W1+16], W2
004ED2 E00002 CP0 W2
004ED4 320010 BRA Z, 0x4EF6
14: data = sentbuf->buffer[0];
004ED6 781831 MOV [W1++], [W0++]
004ED8 781021 MOV [W1--], [W0--]
15: for (idx=0;idx<_SENT_BUF_SIZE-1;idx++) {
16: sentbuf->buffer[idx] = sentbuf->buffer[idx+1];
004EDA 900221 MOV [W1+4], W4
004EDC 9002B1 MOV [W1+6], W5
004EDE BE8884 MOV.D W4, [W1]
004EE0 900241 MOV [W1+8], W4
004EE2 9002D1 MOV [W1+10], W5
004EE4 9800A4 MOV W4, [W1+4]
004EE6 9800B5 MOV W5, [W1+6]
004EE8 900261 MOV [W1+12], W4
004EEA 9002F1 MOV [W1+14], W5
004EEC 9800C4 MOV W4, [W1+8]
004EEE 9800D5 MOV W5, [W1+10]
17: }
18: sentbuf->idx--;
004EF0 E90102 DEC W2, W2
004EF2 980882 MOV W2, [W1+16]
004EF4 370003 BRA 0x4EFC
19: }
20: else {
21: data.Data_H = 0;
004EF6 EB0080 CLR W1
004EF8 780801 MOV W1, [W0]
22: data.Data_L = 0;
004EFA 980011 MOV W1, [W0+2]
23: }
24: return data;
25: }
004EFC 060000 RETURN
and with every other optimization level
10: sent_t pop_sentbuf(sentbuf_t* sentbuf) {
11: sent_t data;
12: unsigned int idx;
13: if (sentbuf->idx > 0) {
0057C4 900901 MOV [W1+16], W2
0057C6 E00002 CP0 W2
0057C8 320010 BRA Z, 0x57EA
14: data = sentbuf->buffer[0];
0057CA 781831 MOV [W1++], [W0++]
0057CC 781021 MOV [W1--], [W0--]
15: for (idx=0;idx<_SENT_BUF_SIZE-1;idx++) {
16: sentbuf->buffer[idx] = sentbuf->buffer[idx+1];
0057CE 900221 MOV [W1+4], W4
0057D0 9002B1 MOV [W1+6], W5
0057D2 BE8884 MOV.D W4, [W1]
0057D4 900241 MOV [W1+8], W4
0057D6 9002D1 MOV [W1+10], W5
0057D8 9800A4 MOV W4, [W1+4]
0057DA 9800B5 MOV W5, [W1+6]
0057DC 900261 MOV [W1+12], W4
0057DE 9002F1 MOV [W1+14], W5
0057E0 9800C4 MOV W4, [W1+8]
0057E2 9800D5 MOV W5, [W1+10]
17: }
18: sentbuf->idx--;
0057E4 E90102 DEC W2, W2
0057E6 980882 MOV W2, [W1+16]
0057E8 060000 RETURN
19: }
20: else {
21: data.Data_H = 0;
0057EA 780802 MOV W2, [W0]
22: data.Data_L = 0;
0057EC 980012 MOV W2, [W0+2]
23: }
24: return data;
25: }
0057EE 060000 RETURN
Not much difference, right? But look at the end: where the end of the if (or the end of a switch case) goes straight to the end of the routine, one could simply use a return, and that is what the other optimization levels emit (a RETURN right there); with -O1 there is instead a branch to the end of the routine, which would be the general case. Add "return" where needed and spare yourself 3/5 cycles! (This may be a "duh", but I would have expected the compiler to do it already with -O1.)