diff options
author | Roger Sayle <roger@nextmovesoftware.com> | 2022-05-19 17:54:38 +0100 |
---|---|---|
committer | Roger Sayle <roger@nextmovesoftware.com> | 2022-05-19 17:54:38 +0100 |
commit | d863ba23fb16122bb0547b0c678173be0d98f43c (patch) | |
tree | f8e5d507bac668e8cb40e94aae74975aef76a675 /gcc | |
parent | [PATCH, rs6000] Remove the (no longer used) BTC defines. (diff) | |
download | gcc-d863ba23fb16122bb0547b0c678173be0d98f43c.tar.gz gcc-d863ba23fb16122bb0547b0c678173be0d98f43c.tar.bz2 gcc-d863ba23fb16122bb0547b0c678173be0d98f43c.tar.xz |
PR middle-end/98865: Expand X*Y as X&-Y when Y is [0,1].
The patch is a revised solution for PR middle-end/98865 incorporating
the feedback/suggestions from Richard Biener's review here:
https://gcc.gnu.org/pipermail/gcc-patches/2022-May/593928.html
Most significantly, this patch now performs the transformation/optimization
during RTL expansion, where the target's rtx_costs can be used to determine
whether the original multiplication (that may potentially be implemented by
a shift or lea) is cheaper than a negation and a bit-wise and.
Previously the expression (x>>63)*y would be compiled with -O2 as
shrq $63, %rdi
movq %rdi, %rax
imulq %rsi, %rax
but with this patch now produces:
sarq $63, %rdi
movq %rdi, %rax
andq %rsi, %rax
Likewise the expression (x>>63)*135 [that appears in a hot-spot of the
Botan AES-128 benchmark] was previously:
shrq $63, %rdi
leaq (%rdi,%rdi,8), %rdx
movq %rdx, %rax
salq $4, %rax
subq %rdx, %rax
now becomes:
movq %rdi, %rax
sarq $63, %rax
andl $135, %eax
2022-05-19 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
PR middle-end/98865
* expr.cc (expand_expr_real_2) [MULT_EXPR]: Expand X*Y as X&Y
when both X and Y are [0, 1], X*Y as X&-Y when Y is [0,1] and
likewise X*Y as -X&Y when X is [0,1] using tree_nonzero_bits.
gcc/testsuite/ChangeLog
PR middle-end/98865
* gcc.target/i386/pr98865.c: New test case.
Diffstat (limited to 'gcc')
-rw-r--r-- | gcc/expr.cc | 32 | ||||
-rw-r--r-- | gcc/testsuite/gcc.target/i386/pr98865.c | 54 |
2 files changed, 86 insertions, 0 deletions
diff --git a/gcc/expr.cc b/gcc/expr.cc index 18060911793..7197996cec7 100644 --- a/gcc/expr.cc +++ b/gcc/expr.cc | |||
@@ -9541,6 +9541,38 @@ expand_expr_real_2 (sepops ops, rtx target, machine_mode tmode, | |||
9541 | } | 9541 | } |
9542 | 9542 | ||
9543 | expand_operands (treeop0, treeop1, subtarget, &op0, &op1, EXPAND_NORMAL); | 9543 | expand_operands (treeop0, treeop1, subtarget, &op0, &op1, EXPAND_NORMAL); |
9544 | |||
9545 | /* Expand X*Y as X&-Y when Y must be zero or one. */ | ||
9546 | if (SCALAR_INT_MODE_P (mode)) | ||
9547 | { | ||
9548 | bool bit0_p = tree_nonzero_bits (treeop0) == 1; | ||
9549 | bool bit1_p = tree_nonzero_bits (treeop1) == 1; | ||
9550 | |||
9551 | /* Expand X*Y as X&Y when both X and Y must be zero or one. */ | ||
9552 | if (bit0_p && bit1_p) | ||
9553 | return REDUCE_BIT_FIELD (expand_and (mode, op0, op1, target)); | ||
9554 | |||
9555 | if (bit0_p || bit1_p) | ||
9556 | { | ||
9557 | bool speed = optimize_insn_for_speed_p (); | ||
9558 | int cost = add_cost (speed, mode) + neg_cost (speed, mode); | ||
9559 | struct algorithm algorithm; | ||
9560 | enum mult_variant variant; | ||
9561 | if (CONST_INT_P (op1) | ||
9562 | ? !choose_mult_variant (mode, INTVAL (op1), | ||
9563 | &algorithm, &variant, cost) | ||
9564 | : cost < mul_cost (speed, mode)) | ||
9565 | { | ||
9566 | target = bit0_p ? expand_and (mode, negate_rtx (mode, op0), | ||
9567 | op1, target) | ||
9568 | : expand_and (mode, op0, | ||
9569 | negate_rtx (mode, op1), | ||
9570 | target); | ||
9571 | return REDUCE_BIT_FIELD (target); | ||
9572 | } | ||
9573 | } | ||
9574 | } | ||
9575 | |||
9544 | return REDUCE_BIT_FIELD (expand_mult (mode, op0, op1, target, unsignedp)); | 9576 | return REDUCE_BIT_FIELD (expand_mult (mode, op0, op1, target, unsignedp)); |
9545 | 9577 | ||
9546 | case TRUNC_MOD_EXPR: | 9578 | case TRUNC_MOD_EXPR: |
diff --git a/gcc/testsuite/gcc.target/i386/pr98865.c b/gcc/testsuite/gcc.target/i386/pr98865.c new file mode 100644 index 00000000000..d047c4bc2ac --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr98865.c | |||
@@ -0,0 +1,54 @@ | |||
1 | /* { dg-do compile } */ | ||
2 | /* { dg-options "-O2 -fdump-tree-optimized" } */ | ||
3 | |||
4 | #if __SIZEOF_INT__ == 4 | ||
5 | unsigned int foo(unsigned int a, unsigned int b) | ||
6 | { | ||
7 | return (a >> 31) * b; | ||
8 | } | ||
9 | |||
10 | int bar(int a, int b) | ||
11 | { | ||
12 | return -(a >> 31) * b; | ||
13 | } | ||
14 | |||
15 | int baz(int a, int b) | ||
16 | { | ||
17 | int c = a >> 31; | ||
18 | int d = -c; | ||
19 | return d * b; | ||
20 | } | ||
21 | |||
22 | unsigned int pin(int a, unsigned int b) | ||
23 | { | ||
24 | unsigned int t = a & 1; | ||
25 | return t * b; | ||
26 | } | ||
27 | #endif | ||
28 | |||
29 | #if __SIZEOF_LONG_LONG__ == 8 | ||
30 | unsigned long long fool(unsigned long long a, unsigned long long b) | ||
31 | { | ||
32 | return (a >> 63) * b; | ||
33 | } | ||
34 | |||
35 | long long barl (long long a, long long b) | ||
36 | { | ||
37 | return -(a >> 63) * b; | ||
38 | } | ||
39 | |||
40 | long long bazl (long long a, long long b) | ||
41 | { | ||
42 | long long c = a >> 63; | ||
43 | long long d = -c; | ||
44 | return d * b; | ||
45 | } | ||
46 | |||
47 | unsigned long long pinl(long long a, unsigned long long b) | ||
48 | { | ||
49 | unsigned long long t = a & 1; | ||
50 | return t * b; | ||
51 | } | ||
52 | #endif | ||
53 | |||
54 | /* { dg-final { scan-assembler-not "imul" } } */ | ||