> Note 3: Can anybody tell me why MSPGCC "forgets" some of the lines of
> the following code (same solution like #2)? E.g. the last two
> asm-instructions disappear.
>
> static uint64_t mul32(uint32_t arg_b, uint32_t arg_a) {
> asm("mov %A[src], &__MPY" : : [src] "r" (arg_a));// MOV al,MPY
> asm("mov %A[src], &__OP2" : : [src] "r" (arg_b));// MOV bl,OP2
> asm("mov %A[src], &__MAC" : : [src] "r" (arg_a));// MOV al,MAC
> asm("mov &__RESLO, %A[dst]" : [dst] "=r" (arg_a));// MOV RESLO,al
> asm("mov &__RESHI, &__RESLO");// MOV RESHI,RESLO
> asm("mov #0, &__RESHI");// MOV #0,RESHI
> asm("mov %B[src], &__OP2" : : [src] "r" (arg_b));// MOV bh,OP2
> asm("mov %A[src], &__MAC" : : [src] "r" (arg_b));// MOV bl,MAC
> asm("mov %B[src], &__OP2" : : [src] "r" (arg_a));// MOV ah,OP2
> asm("mov %B[src], &__MAC" : : [src] "r" (arg_a));// MOV ah,MAC
> asm("mov &__RESLO, %B[dst]" : [dst] "=r" (arg_a));// MOV RESLO,ah
> asm("mov &__RESHI, &__RESLO");// MOV RESHI,RESLO
> asm("mov &__SUMEXT, &__RESHI");// MOV SUMEXT,RESHI
> asm("mov %B[src], &__OP2" : : [src] "r" (arg_b));// MOV bh,OP2
> asm("mov &__RESLO, %A[dst]" : [dst] "=r" (arg_b));// MOV RESLO,bl
> asm("mov &__RESHI, %B[dst]" : [dst] "=r" (arg_b));// MOV RESHI,bh
> /*RET*/
> }
Becuase you aren't using "asm volatile" and you aren't using the
results. So GCC throws them away.
This is a good example of the way NOT to use gcc's asm().
Remember, asm() does NOT disable GCC's optimizer. It WILL rearrange
the statements based on the register dependencies. If you have any
dependencies that you do NOT tell GCC about (like needing to write to
__MAC before reading __RESLO), you are breaking the rules and your code
is not guaranteed to work.
The only reason you get away with so much is that GCC allows you to say
"asm voltaile", meaning that "this statement has some interesting effect
(like disabling interrupts) that should not be deleted and should not be
moved relative to the surrounding code", and any asm() with no outputs
ias assumed to be "asm voltaile", because otherwise it would be pointless.
You ARE allowed to put multiple lines in a single asm(). Just stick
a semicolon or a \n\t (the latter makes for prettier assembly code) in
the string. If you need a block of code to be emitted together, use that.
I'm not sure how you expected to return anything from the above, but
the right way to write it is:
static uint64_t mul32(uint32_t x, uint32_t y)
{
uint64_t product;
asm("mov %A[x], &__MPY\n\t"
"mov %A[y], &__OP2\n\t" // Form xl*yl
"mov %A[x], &__MAC\n\t"
"mov &__RESLO, %A[p]\n\t" // Copy low word to product
"mov &__RESHI, &__RESLO\n\t" // Shift result down
"mov #0, &__RESHI\n\t"
"mov %B[y], &__OP2\n\t" // Add xl*yh
"mov %A[y], &__MAC\n\t"
"mov %B[x], &__OP2\n\t" // Add yl*xh
"mov %B[x], &__MAC\n\t"
"mov &__RESLO, %B[p]\n\t" // Copy second-lowest word to
product
"mov &__RESHI, &__RESLO\n\t" // Shift result down
"mov &__SUMEXT, &__RESHI\n\t"
"mov %B[y], &__OP2\n\t" // Add xh*yh
"mov &__RESLO, %C[p]\n\t"
"mov &__RESHI, %D[p]" : [p] "=&r" (product) : [x] "%r" (x), [y] "y"
(b));
return product;
}
Or, of you're old-fashioned and just use the "0", "1", "2" implicit operand
names
rather than [explicit] ones:
static uint64_t mul32(uint32_t x, uint32_t y)
{
uint64_t product;
asm("mov %A1, &__MPY\n\t"
"mov %A2, &__OP2\n\t" // Form al*bl
"mov %A1, &__MAC\n\t"
"mov &__RESLO, %A0\n\t" // Copy low word to product
"mov &__RESHI, &__RESLO\n\t" // Shift result down
"mov #0, &__RESHI\n\t"
"mov %B2, &__OP2\n\t" // Add al*bh
"mov %A2, &__MAC\n\t"
"mov %B1, &__OP2\n\t" // Add bl*ah
"mov %B1, &__MAC\n\t"
"mov &__RESLO, %B0\n\t" // Copy second-lowest word to
product
"mov &__RESHI, &__RESLO\n\t" // Shift result down
"mov &__SUMEXT, &__RESHI\n\t"
"mov %B2, &__OP2\n\t" // Add ah*bh
"mov &__RESLO, %C0\n\t"
"mov &__RESHI, %D0" : "=&r" (product) : "%r" (x), "r" (y));
return product;
}
Note the "&" associated with "product", saying this is an "earlyclobber"
output, meaning that it can't overlap with the inputs, and the % with
the arguments, telling GCC that this operation is associative, so the
order of the inputs doesn't matter. The latter doesn't matter here,
where the operand constraints are identical, but if they're different
(one can be in memory and the other can't), it can allow a saving.
I wonder if you could speed the above up using %D0 as a temporary
pointer and knowing that RESLO, RESHI and SUMEXT are consecutive in
memory:
/*
 * Variant of the multiply sequence that borrows %D0 (the top word of the
 * product, which receives its real value only in the final instruction)
 * as a scratch pointer, walking RESLO -> RESHI -> SUMEXT with
 * auto-increment instead of using absolute addresses each time.
 * Assumes __RESLO, __RESHI and __SUMEXT are consecutive words in memory.
 *
 * %0 = product ("=&r", earlyclobber), %1 = x ("%r"), %2 = y ("r").
 */
asm("mov %A1, &__MPY\n\t"
    "mov %A2, &__OP2\n\t"       // Form al*bl
    "mov %A1, &__MAC\n\t"
    "mov #__RESLO, %D0\n\t"     // %D0 -> RESLO
    "mov @%D0+, %A0\n\t"        // Copy low word to product; %D0 -> RESHI
    "mov @%D0,-2(%D0)\n\t"      // Shift result down (RESHI -> RESLO)
    "mov #0, @%D0\n\t"          // Clear RESHI
    "mov %B2, &__OP2\n\t"       // Add al*bh
    "mov %A2, &__MAC\n\t"
    "mov %B1, &__OP2\n\t"       // Add bl*ah
    "mov %B1, &__MAC\n\t"
    "mov #__RESLO, %D0\n\t"     // %D0 -> RESLO
    "mov @%D0+, %B0\n\t"        // Copy second-lowest word to product
    // NOTE(review): the next line relies on the source auto-increment
    // being applied before the -4(%D0) destination address is formed,
    // so that it targets RESLO -- confirm on the target CPU.
    "mov @%D0+,-4(%D0)\n\t"     // Shift result down (RESHI -> RESLO)
    "mov @%D0,-2(%D0)\n\t"      // SUMEXT -> RESHI
    "mov %B2, &__OP2\n\t"       // Add ah*bh
    "mov &__RESLO, %C0\n\t"
    "mov &__RESHI, %D0" : "=&r" (product) : "%r" (x), "r" (y));
In the first bunch, it adds a two-word instruction and saves two
instruction words, making a net no-op, but in the second, it saves
three and adds two.