summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Carl Love <carll@us.ibm.com> 2017-10-03 15:18:09 -0500
committer Carl Love <carll@us.ibm.com> 2017-10-03 15:18:09 -0500
commit b0aef250a74804423341b3ce804355037211e330 (patch)
tree e769851a56a02946abed2ccd1d2096d2ae6973be
parent PPC64, Use the vperm code to implement the xxperm inst. (diff)
downloadvalgrind-b0aef250a74804423341b3ce804355037211e330.tar.gz
valgrind-b0aef250a74804423341b3ce804355037211e330.tar.bz2
valgrind-b0aef250a74804423341b3ce804355037211e330.tar.xz
PPC64, Re-implement the vpermr instruction using the Iop_Perm8x16.
The current implementation generates a large number of Iops, which can lead to Valgrind running out of temporary space. See Bugzilla https://bugs.kde.org/show_bug.cgi?id=385208 for an example of the issue. Using Iop_Perm8x16 reduces the number of Iops significantly. Bugzilla 385210.
-rw-r--r-- NEWS 1
-rw-r--r-- VEX/priv/guest_ppc_toIR.c 135
2 files changed, 35 insertions, 101 deletions
diff --git a/NEWS b/NEWS
index 97a2a29..1ab22bb 100644
--- a/NEWS
+++ b/NEWS
@@ -61,6 +61,7 @@ n-i-bz Fix missing workq_ops operations (macOS)
61385182 PPC64 is missing support for the DSCR 61385182 PPC64 is missing support for the DSCR
62385207 PPC64, generate_store_FPRF() generates too many Iops 62385207 PPC64, generate_store_FPRF() generates too many Iops
63385208 PPC64, xxperm instruction exhausts temporary memory 63385208 PPC64, xxperm instruction exhausts temporary memory
64385210 PPC64, vpermr instruction could exhaust temporary memory
64 65
65Release 3.13.0 (15 June 2017) 66Release 3.13.0 (15 June 2017)
66~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 67~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c
index 1373d1c..1785959 100644
--- a/VEX/priv/guest_ppc_toIR.c
+++ b/VEX/priv/guest_ppc_toIR.c
@@ -24107,107 +24107,40 @@ static Bool dis_av_permute ( UInt theInstr )
24107 } 24107 }
24108 24108
24109 case 0x3B: { // vpermr (Vector Permute Right-indexed) 24109 case 0x3B: { // vpermr (Vector Permute Right-indexed)
24110 int i; 24110 /* limited to two args for IR, so have to play games... */
24111 IRTemp new_Vt[17]; 24111 IRTemp a_perm = newTemp( Ity_V128 );
24112 IRTemp tmp[16]; 24112 IRTemp b_perm = newTemp( Ity_V128 );
24113 IRTemp index[16]; 24113 IRTemp mask = newTemp( Ity_V128 );
24114 IRTemp index_gt16[16]; 24114 IRTemp vC_andF = newTemp( Ity_V128 );
24115 IRTemp mask[16]; 24115
24116 24116 DIP( "vpermr v%d,v%d,v%d,v%d\n",
24117 DIP("vpermr v%d,v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr, vC_addr); 24117 vD_addr, vA_addr, vB_addr, vC_addr);
24118 24118 /* Limit the Perm8x16 steering values to 0 .. 31 as that is what
24119 new_Vt[0] = newTemp( Ity_V128 ); 24119 IR specifies, and also to hide irrelevant bits from
24120 assign( new_Vt[0], binop( Iop_64HLtoV128, 24120 memcheck.
24121 mkU64( 0x0 ), 24121 */
24122 mkU64( 0x0 ) ) ); 24122
24123 24123 assign( vC_andF,
24124 for ( i = 0; i < 16; i++ ) { 24124 binop( Iop_Sub16x8,
24125 index_gt16[i] = newTemp( Ity_V128 ); 24125 binop( Iop_64HLtoV128,
24126 mask[i] = newTemp( Ity_V128 ); 24126 mkU64( 0x1F1F1F1F1F1F1F1F ),
24127 index[i] = newTemp( Ity_I32 ); 24127 mkU64( 0x1F1F1F1F1F1F1F1F ) ),
24128 tmp[i] = newTemp( Ity_V128 ); 24128 binop( Iop_AndV128, mkexpr( vC ),
24129 new_Vt[i+1] = newTemp( Ity_V128 ); 24129 unop( Iop_Dup8x16, mkU8( 0x1F ) ) ) ) );
24130 24130 assign( a_perm,
24131 assign( index[i], 24131 binop( Iop_Perm8x16, mkexpr( vA ), mkexpr( vC_andF ) ) );
24132 binop( Iop_Sub32, 24132 assign( b_perm,
24133 mkU32( 31 ), 24133 binop( Iop_Perm8x16, mkexpr( vB ), mkexpr( vC_andF ) ) );
24134 unop( Iop_64to32, 24134 // mask[i8] = (vC[i8]_4 == 1) ? 0xFF : 0x0
24135 unop( Iop_V128to64, 24135 assign( mask, binop(Iop_SarN8x16,
24136 binop( Iop_ShrV128, 24136 binop( Iop_ShlN8x16, mkexpr( vC_andF ),
24137 binop( Iop_AndV128, 24137 mkU8( 3 ) ), mkU8( 7 ) ) );
24138 binop( Iop_ShlV128, 24138 // dst = (a & ~mask) | (b & mask)
24139 binop( Iop_64HLtoV128, 24139 putVReg( vD_addr, binop( Iop_OrV128,
24140 mkU64( 0x0 ), 24140 binop( Iop_AndV128, mkexpr( a_perm ),
24141 mkU64( 0x3F ) ), 24141 unop( Iop_NotV128, mkexpr( mask ) ) ),
24142 mkU8( (15 - i) * 8 ) ), 24142 binop( Iop_AndV128, mkexpr( b_perm ),
24143 mkexpr( vC ) ), 24143 mkexpr( mask ) ) ) );
24144 mkU8( (15 - i) * 8 ) ) ) ) ) );
24145
24146 /* Determine if index < 16, src byte is vA[index], otherwise
24147 * vB[31-index]. Check if msb of index is 1 or not.
24148 */
24149 assign( index_gt16[i],
24150 binop( Iop_64HLtoV128,
24151 unop( Iop_1Sto64,
24152 unop( Iop_32to1,
24153 binop( Iop_Shr32,
24154 mkexpr( index[i] ),
24155 mkU8( 4 ) ) ) ),
24156 unop( Iop_1Sto64,
24157 unop( Iop_32to1,
24158 binop( Iop_Shr32,
24159 mkexpr( index[i] ),
24160 mkU8( 4 ) ) ) ) ) );
24161 assign( mask[i],
24162 binop( Iop_ShlV128,
24163 binop( Iop_64HLtoV128,
24164 mkU64( 0x0 ),
24165 mkU64( 0xFF ) ),
24166 unop( Iop_32to8,
24167 binop( Iop_Mul32,
24168 binop( Iop_Sub32,
24169 mkU32( 15 ),
24170 binop( Iop_And32,
24171 mkexpr( index[i] ),
24172 mkU32( 0xF ) ) ),
24173 mkU32( 8 ) ) ) ) );
24174
24175 /* Extract the indexed byte from vA and vB using the lower 4-bits
24176 * of the index. Then use the index_gt16 mask to select vA if the
24177 * index < 16 or vB if index > 15. Put the selected byte in the
24178 * least significant byte.
24179 */
24180 assign( tmp[i],
24181 binop( Iop_ShrV128,
24182 binop( Iop_OrV128,
24183 binop( Iop_AndV128,
24184 binop( Iop_AndV128,
24185 mkexpr( mask[i] ),
24186 mkexpr( vA ) ),
24187 unop( Iop_NotV128,
24188 mkexpr( index_gt16[i] ) ) ),
24189 binop( Iop_AndV128,
24190 binop( Iop_AndV128,
24191 mkexpr( mask[i] ),
24192 mkexpr( vB ) ),
24193 mkexpr( index_gt16[i] ) ) ),
24194 unop( Iop_32to8,
24195 binop( Iop_Mul32,
24196 binop( Iop_Sub32,
24197 mkU32( 15 ),
24198 binop( Iop_And32,
24199 mkexpr( index[i] ),
24200 mkU32( 0xF ) ) ),
24201 mkU32( 8 ) ) ) ) );
24202
24203 /* Move the selected byte to the position to store in the result */
24204 assign( new_Vt[i+1], binop( Iop_OrV128,
24205 binop( Iop_ShlV128,
24206 mkexpr( tmp[i] ),
24207 mkU8( (15 - i) * 8 ) ),
24208 mkexpr( new_Vt[i] ) ) );
24209 }
24210 putVReg( vD_addr, mkexpr( new_Vt[16] ) );
24211 return True; 24144 return True;
24212 } 24145 }
24213 24146