Il 07/06/2012 12:21, Dinar Temirbulatov ha scritto: > oh, I found typo in comment in the end of patch. fixed.
Great improvement, thanks! Unfortunately we're not there yet, but much closer! I could understand the new code much better so I suggest some more improvements below, both to the comments and to the code generation. > diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c > index 8a86227..0f8120f 100644 > --- a/gcc/config/arm/arm.c > +++ b/gcc/config/arm/arm.c > @@ -7130,6 +7130,8 @@ arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* > total, bool speed) > *total = COSTS_N_INSNS (2); > else if (TARGET_HARD_FLOAT && mode == DFmode && !TARGET_VFP_SINGLE) > *total = COSTS_N_INSNS (4); > + else if (mode == DImode) > + *total = COSTS_N_INSNS (50); > else > *total = COSTS_N_INSNS (20); > return false; > diff --git a/gcc/config/mips/mips.c b/gcc/config/mips/mips.c > index 5bcb7a8..57bb4cc 100644 > --- a/gcc/config/mips/mips.c > +++ b/gcc/config/mips/mips.c > @@ -3879,8 +3879,13 @@ mips_rtx_costs (rtx x, int code, int outer_code, int > opno ATTRIBUTE_UNUSED, > } > *total = COSTS_N_INSNS (mips_idiv_insns ()); > } > - else if (mode == DImode) > + else if (mode == DImode) { > + if (!TARGET_64BIT) > + /* divide double integer libcall is expensive. */ > + *total = COSTS_N_INSNS (200); > + else > *total = mips_cost->int_div_di; > + } > else > *total = mips_cost->int_div_si; > return false; > diff --git a/gcc/expmed.c b/gcc/expmed.c > index 98f7c09..bb4d7cd 100644 > --- a/gcc/expmed.c > +++ b/gcc/expmed.c > @@ -3539,6 +3539,84 @@ expand_mult_highpart_optab (enum machine_mode mode, > rtx op0, rtx op1, > } > } > > + if (unsignedp > + && size - 1 > BITS_PER_WORD > + && (!optimize_size && (optimize>1)) Coding style: "(!optimize_size && optimize > 1)". > + && (4 * mul_cost[speed][mode] + 4 * add_cost[speed][mode] > + + shift_cost[speed][mode][31] < max_cost)) Thanks, this is now much cleaner and I could see other improvements. 
This should be 3 * mul_widen_cost[speed][mode] + mul_highpart_cost[speed][mode] + 4 * add_cost[speed][mode] + add_cost[speed][word_mode] That is because there is no shift really: a shift by 32 is simply moving the operand to the higher word, and an add of that value will ignore the lower word. Hence, summing carry_result is cheaper: that is add_cost[speed][word_mode]. On the other hand you also have to consider the comparison emitted by emit_store_flag_force, which will usually cost the same as an addition. That is the fourth add_cost[speed][mode]. > + { > + rtx x1, x0, y1, y0, z2, z0, tmp, u0, u0tmp, u1, carry, carry_result, > result; > + /* Extracting the higher part of the 64-bit multiplier. */ > + x1 = gen_highpart (word_mode, op0); > + x1 = force_reg (word_mode, x1); > + > + /* Extracting the lower part of the 64-bit multiplier. */ > + x0 = gen_lowpart (word_mode, op0); > + x0 = force_reg (word_mode, x0); > + > + /* Splitting the 64-bit constant for the higher and the lower parts. > */ > + y0 = gen_lowpart (word_mode, op1); > + y0 = force_reg (word_mode, y0); > + y1 = gen_highpart_mode (word_mode, mode, op1); > + > + z2 = gen_reg_rtx (mode); > + u0 = gen_reg_rtx (mode); You do not need the gen_reg_rtx; just pass a NULL_RTX target to expand_widening_mult. > + z2 = expand_widening_mult (mode, x1, y1, z2, 1, umul_widen_optab); > + Remove the empty line. Also, let's rename the values to make clear where is the multiplication of what: z2 -> u11 u0 -> u01 z0 -> u00 u1 -> u10 > + u0 = expand_widening_mult (mode, x0, y1, u0, 1, umul_widen_optab); > + > + z0 = gen_reg_rtx (mode); > + u1 = gen_reg_rtx (mode); gen_reg_rtx is not needed here too. > + z0 = expand_widening_mult (mode, x0, y0, z0, 1, umul_widen_optab); > + And neither is this blank line. > + u1 = expand_widening_mult (mode, x1, y0, u1, 1, umul_widen_optab); > + /* Compute the middle word of the three-word intermediate result. */ ^^^^^^ Oops, this is the low word, not the middle. 
But let's improve the comment to explain the algorithm. /* u00, u01, u10, u11 are four partial products forming a four-word value, with the result in the top two words, so we want to return this: ((u11 << 2*BITS_PER_WORD) + ((u01 + u10) << BITS_PER_WORD) + u00) >> 2*BITS_PER_WORD We then rewrite it this way: u11 + ((u01 + u10 + (u00 >> BITS_PER_WORD)) >> BITS_PER_WORD) where the shifts are realized with gen_highpart and a conversion back to the wider mode. */ > + u0tmp = gen_highpart (word_mode, z0); > + u0tmp = force_reg (word_mode, u0tmp); > + u0tmp = convert_to_mode (mode, u0tmp, 1); u0tmp -> u00h Put the expand_inc (u01, u00h); before the comment. The formula is now above so we can say more simply: /* Summing u01, u10 and u00h together could have up to 2 * BITS_PER_WORD + 1 bits of precision. We compute the extra bit by checking for carry, and add 1 << BITS_PER_WORD to u11 if there is carry. */ > + > + expand_inc (u0, u0tmp); > + tmp = gen_reg_rtx (mode); > + tmp = expand_binop (mode, add_optab, u0, u1, tmp, 1, OPTAB_LIB_WIDEN); Now that you have a single emit_store_flag_force, you can avoid "tmp = gen_reg_rtx (mode)" and just use expand_inc (u01, u10); > + if (!tmp) > + return 0; This cannot fail, you can remove the "if". > + > + /* Checking for carry here. */ > + carry = gen_reg_rtx (mode); > + emit_store_flag_force (carry, GT, u0, tmp, mode, 1, 1); Since above you will use u01 as the target, you have to use u10 instead here: carry = emit_store_flag_force (NULL_RTX, GT, u10, u01, mode, 1, 1); i.e. operand > result. That's a nice improvement, and should generate optimal code like: add r0, r4 ; r0:r1 += 0:r4 u0 += u0h adc r1, 0 add r0, r2 ; r0:r1 += r2:r3 adc r1, r3 sub r2, r0 ; flags = r2:r3 CMP r0:r1 sbc r3, r1 it hi ; if r2:r3 > r0:r1 add r6, #1 ; ... r6:r7 += 1:0 add r6, r0 ; r6:r7 += r0:r1 adc r7, r1 for everything after the multiplications. This matches nicely the cost estimation above.
> + carry_result = gen_reg_rtx (mode); No need for this gen_reg_rtx, either, by passing a NULL_RTX target below. > + carry_result = expand_shift (LSHIFT_EXPR, mode, carry, BITS_PER_WORD, > carry_result, 1); > + > + /* Adding 0x100000000 as carry here if required. */ Oops, a remnant of 32-bit specific code. /* Adding 1 << BITS_PER_WORD as carry here if required. */ > + expand_inc (z2, carry_result); > + > + /* Extracting the higher part of the sum. */ > + tmp = gen_highpart (word_mode, tmp); > + tmp = force_reg (word_mode, tmp); > + tmp = convert_to_mode (mode, tmp, 1); And these will use u01 instead of tmp. > + /* The final result. */ > + expand_inc (z2, tmp); > + return z2; > + > + } > + > /* Try widening multiplication of opposite signedness, and adjust. */ > moptab = unsignedp ? smul_widen_optab : umul_widen_optab; > if (widening_optab_handler (moptab, wider_mode, mode) != CODE_FOR_nothing > I hope you appreciate the improvements! Paolo