summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Carl Love <carll@us.ibm.com> 2017-10-03 15:09:22 -0500
committer Carl Love <carll@us.ibm.com> 2017-10-03 15:09:22 -0500
commit a1d03d0d11c0b31a6d9f57baa4d46317fdd5f6ef (patch)
tree 5e48e8a87120de31c8050b78c0bae4c916961974
parentPPC64, Replace body of generate_store_FPRF with C helper function. (diff)
downloadvalgrind-a1d03d0d11c0b31a6d9f57baa4d46317fdd5f6ef.tar.gz
valgrind-a1d03d0d11c0b31a6d9f57baa4d46317fdd5f6ef.tar.bz2
valgrind-a1d03d0d11c0b31a6d9f57baa4d46317fdd5f6ef.tar.xz
PPC64, Use the vperm code to implement the xxperm inst.
The current xxperm instruction implementation generates a huge number of Iops to do the permutation explicitly. The code was changed to use Iop_Perm8x16, which is much more efficient, so temporary memory no longer gets exhausted. Bugzilla 385208.
-rw-r--r-- NEWS 1
-rw-r--r-- VEX/priv/guest_ppc_toIR.c 161
2 files changed, 43 insertions, 119 deletions
diff --git a/NEWS b/NEWS
index 20007cf..97a2a29 100644
--- a/NEWS
+++ b/NEWS
@@ -60,6 +60,7 @@ where XXXXXX is the bug number as listed below.
60n-i-bz Fix missing workq_ops operations (macOS) 60n-i-bz Fix missing workq_ops operations (macOS)
61385182 PPC64 is missing support for the DSCR 61385182 PPC64 is missing support for the DSCR
62385207 PPC64, generate_store_FPRF() generates too many Iops 62385207 PPC64, generate_store_FPRF() generates too many Iops
63385208 PPC64, xxperm instruction exhausts temporary memory
63 64
64Release 3.13.0 (15 June 2017) 65Release 3.13.0 (15 June 2017)
65~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 66~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c
index 0dae368..1373d1c 100644
--- a/VEX/priv/guest_ppc_toIR.c
+++ b/VEX/priv/guest_ppc_toIR.c
@@ -22319,15 +22319,17 @@ dis_vx_permute_misc( UInt theInstr, UInt opc2 )
22319 case 0x68: // xxperm (VSX Permute ) 22319 case 0x68: // xxperm (VSX Permute )
22320 case 0xE8: // xxpermr (VSX Permute right-index ) 22320 case 0xE8: // xxpermr (VSX Permute right-index )
22321 { 22321 {
22322 int i; 22322
22323 IRTemp new_Vt[17]; 22323 /* The xxperm instruction performs the same operation as
22324 IRTemp perm_val[16]; 22324 the vperm except the xxperm operates on the VSR register
22325 IRTemp perm_val_gt16[16]; 22325 file, while vperm operates on the VR register file.
22326 IRTemp tmp_val[16]; 22326 Let's borrow some code here from vperm. The mapping of
22327 IRTemp perm_idx[16]; 22327 the source registers is also a little different.
22328 IRTemp perm_mask = newTemp( Ity_V128 ); 22328 */
22329 IRTemp val_mask = newTemp( Ity_V128 ); 22329 IRTemp a_perm = newTemp(Ity_V128);
22330 int dest_shift_amount = 0; 22330 IRTemp b_perm = newTemp(Ity_V128);
22331 IRTemp mask = newTemp(Ity_V128);
22332 IRTemp perm_val = newTemp(Ity_V128);
22331 22333
22332 if ( opc2 == 0x68 ) { 22334 if ( opc2 == 0x68 ) {
22333 DIP("xxperm v%d,v%d,v%d\n", (UInt)XT, (UInt)XA, (UInt)XB); 22335 DIP("xxperm v%d,v%d,v%d\n", (UInt)XT, (UInt)XA, (UInt)XB);
@@ -22337,119 +22339,40 @@ dis_vx_permute_misc( UInt theInstr, UInt opc2 )
22337 DIP("xxpermr v%d,v%d,v%d\n", (UInt)XT, (UInt)XA, (UInt)XB); 22339 DIP("xxpermr v%d,v%d,v%d\n", (UInt)XT, (UInt)XA, (UInt)XB);
22338 } 22340 }
22339 22341
22340 new_Vt[0] = newTemp( Ity_V128 );
22341
22342 assign( vT, getVSReg( XT ) ); 22342 assign( vT, getVSReg( XT ) );
22343 22343
22344 assign( new_Vt[0], binop( Iop_64HLtoV128, 22344 if ( opc2 == 0x68 ) // xxperm
22345 mkU64( 0x0 ), mkU64( 0x0 ) ) ); 22345 assign( perm_val,
22346 assign( perm_mask, binop( Iop_64HLtoV128, 22346 binop( Iop_AndV128, mkexpr( vB ),
22347 mkU64( 0x0 ), mkU64( 0x1F ) ) ); 22347 unop( Iop_Dup8x16, mkU8( 0x1F ) ) ) );
22348 assign( val_mask, binop( Iop_64HLtoV128,
22349 mkU64( 0x0 ), mkU64( 0xFF ) ) );
22350
22351 /* For each permute index in XB, the permute list, select the byte
22352 * from XA indexed by the permute index if the permute index is less
22353 * then 16. Copy the selected byte to the destination location in
22354 * the result.
22355 */
22356 for ( i = 0; i < 16; i++ ) {
22357 perm_val_gt16[i] = newTemp( Ity_V128 );
22358 perm_val[i] = newTemp( Ity_V128 );
22359 perm_idx[i] = newTemp( Ity_I8 );
22360 tmp_val[i] = newTemp( Ity_V128 );
22361 new_Vt[i+1] = newTemp( Ity_V128 );
22362
22363 /* create mask to extract the permute index value from vB,
22364 * store value in least significant bits of perm_val
22365 */
22366 if ( opc2 == 0x68 )
22367 /* xxperm, the perm value is the index value in XB */
22368 assign( perm_val[i], binop( Iop_ShrV128,
22369 binop( Iop_AndV128,
22370 mkexpr(vB),
22371 binop( Iop_ShlV128,
22372 mkexpr( perm_mask ),
22373 mkU8( (15 - i) * 8 ) ) ),
22374 mkU8( (15 - i) * 8 ) ) );
22375 22348
22376 else 22349 else // xxpermr
22377 /* xxpermr, the perm value is 31 - index value in XB */ 22350 assign( perm_val,
22378 assign( perm_val[i], 22351 binop( Iop_Sub16x8,
22379 binop( Iop_Sub8x16, 22352 binop( Iop_64HLtoV128,
22380 binop( Iop_64HLtoV128, 22353 mkU64( 0x1F1F1F1F1F1F1F1F ),
22381 mkU64( 0 ), mkU64( 31 ) ), 22354 mkU64( 0x1F1F1F1F1F1F1F1F ) ),
22382 binop( Iop_ShrV128, 22355 binop( Iop_AndV128, mkexpr( vB ),
22383 binop( Iop_AndV128, 22356 unop( Iop_Dup8x16, mkU8( 0x1F ) ) ) ) );
22384 mkexpr( vB ), 22357
22385 binop( Iop_ShlV128, 22358 /* Limit the Perm8x16 steering values to 0 .. 31 as that is what
22386 mkexpr( perm_mask ), 22359 IR specifies, and also to hide irrelevant bits from
22387 mkU8( ( 15 - i ) * 8 ) ) ), 22360 memcheck.
22388 mkU8( ( 15 - i ) * 8 ) ) ) ); 22361 */
22389 22362 assign( a_perm,
22390 /* Determine if the perm_val[] > 16. If it is, then the value 22363 binop( Iop_Perm8x16, mkexpr( vA ), mkexpr( perm_val ) ) );
22391 * will come from xT otherwise it comes from xA. Either way, 22364 assign( b_perm,
22392 * create the mask to get the value from the source using the 22365 binop( Iop_Perm8x16, mkexpr( vT ), mkexpr( perm_val ) ) );
22393 * lower 3 bits of perm_val[]. Create a 128 bit mask from the 22366 assign( mask, binop( Iop_SarN8x16,
22394 * upper bit of perm_val[] to be used to select from xT or xA. 22367 binop( Iop_ShlN8x16, mkexpr( perm_val ),
22395 */ 22368 mkU8( 3 ) ),
22396 assign( perm_val_gt16[i], 22369 mkU8( 7 ) ) );
22397 binop(Iop_64HLtoV128, 22370 // dst = (a & ~mask) | (b & mask)
22398 unop( Iop_1Sto64, 22371 putVSReg( XT, binop( Iop_OrV128,
22399 unop( Iop_64to1, 22372 binop( Iop_AndV128, mkexpr( a_perm ),
22400 unop( Iop_V128to64, 22373 unop( Iop_NotV128, mkexpr( mask ) ) ),
22401 binop( Iop_ShrV128, 22374 binop( Iop_AndV128, mkexpr( b_perm ),
22402 mkexpr( perm_val[i] ), 22375 mkexpr( mask ) ) ) );
22403 mkU8( 4 ) ) ) ) ),
22404 unop( Iop_1Sto64,
22405 unop( Iop_64to1,
22406 unop( Iop_V128to64,
22407 binop( Iop_ShrV128,
22408 mkexpr( perm_val[i] ),
22409 mkU8( 4 ) ) ) ) ) ) );
22410
22411 assign( perm_idx[i],
22412 unop(Iop_32to8,
22413 binop( Iop_Mul32,
22414 binop( Iop_Sub32,
22415 mkU32( 15 ),
22416 unop( Iop_64to32,
22417 binop( Iop_And64,
22418 unop( Iop_V128to64,
22419 mkexpr( perm_val[i] ) ),
22420 mkU64( 0xF ) ) ) ),
22421 mkU32( 8 ) ) ) );
22422
22423 dest_shift_amount = ( 15 - i )*8;
22424
22425 /* Use perm_val_gt16 to select value from vA or vT */
22426 assign( tmp_val[i],
22427 binop( Iop_ShlV128,
22428 binop( Iop_ShrV128,
22429 binop( Iop_OrV128,
22430 binop( Iop_AndV128,
22431 mkexpr( vA ),
22432 binop( Iop_AndV128,
22433 unop( Iop_NotV128,
22434 mkexpr( perm_val_gt16[i] ) ),
22435 binop( Iop_ShlV128,
22436 mkexpr( val_mask ),
22437 mkexpr( perm_idx[i] ) ) ) ),
22438 binop( Iop_AndV128,
22439 mkexpr( vT ),
22440 binop( Iop_AndV128,
22441 mkexpr( perm_val_gt16[i] ),
22442 binop( Iop_ShlV128,
22443 mkexpr( val_mask ),
22444 mkexpr( perm_idx[i] ) ) ) ) ),
22445 mkexpr( perm_idx[i] ) ),
22446 mkU8( dest_shift_amount ) ) );
22447
22448 assign( new_Vt[i+1], binop( Iop_OrV128,
22449 mkexpr( tmp_val[i] ),
22450 mkexpr( new_Vt[i] ) ) );
22451 }
22452 putVSReg( XT, mkexpr( new_Vt[16] ) );
22453 break; 22376 break;
22454 } 22377 }
22455 22378