diff lib/Target/Hexagon/HexagonPatterns.td @ 147:c2174574ed3a

LLVM 10
author Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date Wed, 14 Aug 2019 16:55:33 +0900
--- a/lib/Target/Hexagon/HexagonPatterns.td	Sat Feb 17 09:57:20 2018 +0900
+++ b/lib/Target/Hexagon/HexagonPatterns.td	Wed Aug 14 16:55:33 2019 +0900
@@ -1,9 +1,8 @@
 //==- HexagonPatterns.td - Target Description for Hexagon -*- tablegen -*-===//
 //
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 
@@ -100,6 +99,17 @@
 def HWI16:  PatLeaf<(VecPI16 HvxWR:$R)>;
 def HWI32:  PatLeaf<(VecPI32 HvxWR:$R)>;
 
+def SDTVecVecIntOp:
+  SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, SDTCisSameAs<1,2>,
+                       SDTCisVT<3,i32>]>;
+
+def HexagonVALIGN:     SDNode<"HexagonISD::VALIGN",     SDTVecVecIntOp>;
+def HexagonVALIGNADDR: SDNode<"HexagonISD::VALIGNADDR", SDTIntUnaryOp>;
+
+def valign: PatFrag<(ops node:$Vt, node:$Vs, node:$Ru),
+                    (HexagonVALIGN node:$Vt, node:$Vs, node:$Ru)>;
+def valignaddr: PatFrag<(ops node:$Addr), (HexagonVALIGNADDR node:$Addr)>;
+
 // Pattern fragments to extract the low and high subregisters from a
 // 64-bit value.
 def LoReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_lo)>;
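Note: HexagonISD::VALIGN is new in this patch. Below is a rough C++ model of
the byte-alignment operation it represents; the exact lane order is defined
by the ISA, so treat this as an illustrative assumption, not a specification:

    #include <cstdint>
    #include <cstring>

    // Hypothetical model: the result is a VLen-byte window into the
    // concatenation of the two input vectors, at a byte offset taken
    // from the low bits of the register operand.
    void valignModel(uint8_t *Out, const uint8_t *Vt, const uint8_t *Vs,
                     uint32_t Ru, uint32_t VLen) {
      uint8_t Pair[256];                   // assumes VLen <= 128
      std::memcpy(Pair, Vs, VLen);         // low half of the pair
      std::memcpy(Pair + VLen, Vt, VLen);  // high half of the pair
      std::memcpy(Out, Pair + (Ru % VLen), VLen);
    }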
@@ -166,6 +176,11 @@
   return CurDAG->getTargetConstant(V-32, SDLoc(N), MVT::i32);
 }]>;
 
+class Subi<int From>: SDNodeXForm<imm,
+  "int32_t V = " # From # " - N->getSExtValue();" #
+  "return CurDAG->getTargetConstant(V, SDLoc(N), MVT::i32);"
+>;
+
 def Log2_32: SDNodeXForm<imm, [{
   uint32_t V = N->getZExtValue();
   return CurDAG->getTargetConstant(Log2_32(V), SDLoc(N), MVT::i32);
@@ -207,6 +222,8 @@
 def I32toI1:  OutPatFrag<(ops node:$Rs), (i1 (C2_cmpgtui (i32 $Rs), (i32 0)))>;
 def ToZext64: OutPatFrag<(ops node:$Rs), (i64 (A4_combineir 0, (i32 $Rs)))>;
 def ToSext64: OutPatFrag<(ops node:$Rs), (i64 (A2_sxtw (i32 $Rs)))>;
+def ToAext64: OutPatFrag<(ops node:$Rs),
+  (REG_SEQUENCE DoubleRegs, (i32 (IMPLICIT_DEF)), isub_hi, (i32 $Rs), isub_lo)>;
 
 def Combinew: OutPatFrag<(ops node:$Rs, node:$Rt),
   (REG_SEQUENCE DoubleRegs, $Rs, isub_hi, $Rt, isub_lo)>;
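Note: ToAext64 is the any-extend counterpart of ToZext64: the high word is an
IMPLICIT_DEF, so no instruction is spent zeroing it. A rough C++ model of the
difference (the "Junk" parameter is illustrative only):

    #include <cstdint>

    uint64_t toZext64(uint32_t R) { return R; }            // hi word = 0
    uint64_t toAext64(uint32_t R, uint32_t Junk) {
      return ((uint64_t)Junk << 32) | R;   // hi word unspecified, free
    }

Many patterns below switch from ToZext64 to ToAext64 where only the low word
of the widened value is consumed.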
@@ -235,6 +252,9 @@
 def Zext64: PatFrag<(ops node:$Rs), (i64 (zext node:$Rs))>;
 def Sext64: PatLeaf<(i64 Usxtw:$Rs)>;
 
+def azext: PatFrags<(ops node:$Rs), [(zext node:$Rs), (anyext node:$Rs)]>;
+def asext: PatFrags<(ops node:$Rs), [(sext node:$Rs), (anyext node:$Rs)]>;
+
 def: Pat<(IsOrAdd (i32 AddrFI:$Rs), s32_0ImmPred:$off),
          (PS_fi (i32 AddrFI:$Rs), imm:$off)>;
 
@@ -246,8 +266,25 @@
 class Not2<PatFrag P>
   : PatFrag<(ops node:$A, node:$B), (P node:$A, (not node:$B))>;
 
+// If there is a constant operand that feeds the and/or instruction,
+// do not generate the compound instructions.
+// It is not always profitable, as sometimes we end up with a transfer.
+// Consider the example below:
+// ra = #65820; rb = lsr(rb, #8); rc ^= and (rb, ra)
+// Instead, this is preferable:
+// ra = and (#65820, lsr(ra, #8)); rb = xor(rb, ra)
+class Su_ni1<PatFrag Op>
+  : PatFrag<Op.Operands, !head(Op.Fragments), [{
+            if (hasOneUse(N)) {
+              // Check if Op1 is an immediate operand.
+              SDValue Op1 = N->getOperand(1);
+              return !isa<ConstantSDNode>(Op1);
+            }
+            return false;}],
+            Op.OperandTransform>;
+
 class Su<PatFrag Op>
-  : PatFrag<Op.Operands, Op.Fragment, [{ return hasOneUse(N); }],
+  : PatFrag<Op.Operands, !head(Op.Fragments), [{ return hasOneUse(N); }],
             Op.OperandTransform>;
 
 // Main selection macros.
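Note: the Su_ni1 predicate above is C++ that runs inside the instruction
selector. A self-contained restatement of its logic, assuming the usual LLVM
SelectionDAG headers:

    #include "llvm/CodeGen/SelectionDAGNodes.h"
    using namespace llvm;

    // Match only single-use nodes whose second operand is not an
    // immediate; constant operands are better folded directly.
    static bool suNi1Matches(const SDNode *N) {
      if (!N->hasOneUse())  // the selector's hasOneUse(N) helper, roughly
        return false;
      return !isa<ConstantSDNode>(N->getOperand(1));
    }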
@@ -271,9 +308,9 @@
         (MI RegPred:$Rx, RegPred:$Rs, imm:$I)>;
 
 class AccRRR_pat<InstHexagon MI, PatFrag AccOp, PatFrag Op,
-                 PatFrag RsPred, PatFrag RtPred>
-  : Pat<(AccOp RsPred:$Rx, (Op RsPred:$Rs, RtPred:$Rt)),
-        (MI RsPred:$Rx, RsPred:$Rs, RtPred:$Rt)>;
+                 PatFrag RxPred, PatFrag RsPred, PatFrag RtPred>
+  : Pat<(AccOp RxPred:$Rx, (Op RsPred:$Rs, RtPred:$Rt)),
+        (MI RxPred:$Rx, RsPred:$Rs, RtPred:$Rt)>;
 
 multiclass SelMinMax_pats<PatFrag CmpOp, PatFrag Val,
                           InstHexagon InstA, InstHexagon InstB> {
@@ -289,6 +326,7 @@
 def Sub: pf2<sub>;    def Or:  pf2<or>;     def Srl: pf2<srl>;
 def Mul: pf2<mul>;    def Xor: pf2<xor>;    def Shl: pf2<shl>;
 
+def Rol: pf2<rotl>;
 
 // --(1) Immediate -------------------------------------------------------
 //
@@ -336,38 +374,34 @@
 // --(2) Type cast -------------------------------------------------------
 //
 
-let Predicates = [HasV5T] in {
-  def: OpR_R_pat<F2_conv_sf2df,      pf1<fpextend>,   f64, F32>;
-  def: OpR_R_pat<F2_conv_df2sf,      pf1<fpround>,    f32, F64>;
-
-  def: OpR_R_pat<F2_conv_w2sf,       pf1<sint_to_fp>, f32, I32>;
-  def: OpR_R_pat<F2_conv_d2sf,       pf1<sint_to_fp>, f32, I64>;
-  def: OpR_R_pat<F2_conv_w2df,       pf1<sint_to_fp>, f64, I32>;
-  def: OpR_R_pat<F2_conv_d2df,       pf1<sint_to_fp>, f64, I64>;
-
-  def: OpR_R_pat<F2_conv_uw2sf,      pf1<uint_to_fp>, f32, I32>;
-  def: OpR_R_pat<F2_conv_ud2sf,      pf1<uint_to_fp>, f32, I64>;
-  def: OpR_R_pat<F2_conv_uw2df,      pf1<uint_to_fp>, f64, I32>;
-  def: OpR_R_pat<F2_conv_ud2df,      pf1<uint_to_fp>, f64, I64>;
-
-  def: OpR_R_pat<F2_conv_sf2w_chop,  pf1<fp_to_sint>, i32, F32>;
-  def: OpR_R_pat<F2_conv_df2w_chop,  pf1<fp_to_sint>, i32, F64>;
-  def: OpR_R_pat<F2_conv_sf2d_chop,  pf1<fp_to_sint>, i64, F32>;
-  def: OpR_R_pat<F2_conv_df2d_chop,  pf1<fp_to_sint>, i64, F64>;
-
-  def: OpR_R_pat<F2_conv_sf2uw_chop, pf1<fp_to_uint>, i32, F32>;
-  def: OpR_R_pat<F2_conv_df2uw_chop, pf1<fp_to_uint>, i32, F64>;
-  def: OpR_R_pat<F2_conv_sf2ud_chop, pf1<fp_to_uint>, i64, F32>;
-  def: OpR_R_pat<F2_conv_df2ud_chop, pf1<fp_to_uint>, i64, F64>;
-}
+def: OpR_R_pat<F2_conv_sf2df,      pf1<fpextend>,   f64, F32>;
+def: OpR_R_pat<F2_conv_df2sf,      pf1<fpround>,    f32, F64>;
+
+def: OpR_R_pat<F2_conv_w2sf,       pf1<sint_to_fp>, f32, I32>;
+def: OpR_R_pat<F2_conv_d2sf,       pf1<sint_to_fp>, f32, I64>;
+def: OpR_R_pat<F2_conv_w2df,       pf1<sint_to_fp>, f64, I32>;
+def: OpR_R_pat<F2_conv_d2df,       pf1<sint_to_fp>, f64, I64>;
+
+def: OpR_R_pat<F2_conv_uw2sf,      pf1<uint_to_fp>, f32, I32>;
+def: OpR_R_pat<F2_conv_ud2sf,      pf1<uint_to_fp>, f32, I64>;
+def: OpR_R_pat<F2_conv_uw2df,      pf1<uint_to_fp>, f64, I32>;
+def: OpR_R_pat<F2_conv_ud2df,      pf1<uint_to_fp>, f64, I64>;
+
+def: OpR_R_pat<F2_conv_sf2w_chop,  pf1<fp_to_sint>, i32, F32>;
+def: OpR_R_pat<F2_conv_df2w_chop,  pf1<fp_to_sint>, i32, F64>;
+def: OpR_R_pat<F2_conv_sf2d_chop,  pf1<fp_to_sint>, i64, F32>;
+def: OpR_R_pat<F2_conv_df2d_chop,  pf1<fp_to_sint>, i64, F64>;
+
+def: OpR_R_pat<F2_conv_sf2uw_chop, pf1<fp_to_uint>, i32, F32>;
+def: OpR_R_pat<F2_conv_df2uw_chop, pf1<fp_to_uint>, i32, F64>;
+def: OpR_R_pat<F2_conv_sf2ud_chop, pf1<fp_to_uint>, i64, F32>;
+def: OpR_R_pat<F2_conv_df2ud_chop, pf1<fp_to_uint>, i64, F64>;
 
 // Bitcast is different than [fp|sint|uint]_to_[sint|uint|fp].
-let Predicates = [HasV5T] in {
-  def: Pat<(i32 (bitconvert F32:$v)), (I32:$v)>;
-  def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>;
-  def: Pat<(i64 (bitconvert F64:$v)), (I64:$v)>;
-  def: Pat<(f64 (bitconvert I64:$v)), (F64:$v)>;
-}
+def: Pat<(i32 (bitconvert F32:$v)), (I32:$v)>;
+def: Pat<(f32 (bitconvert I32:$v)), (F32:$v)>;
+def: Pat<(i64 (bitconvert F64:$v)), (I64:$v)>;
+def: Pat<(f64 (bitconvert I64:$v)), (F64:$v)>;
 
 multiclass Cast_pat<ValueType Ta, ValueType Tb, RegisterClass RC> {
   def: Pat<(Tb (bitconvert (Ta RC:$Rs))), (Tb RC:$Rs)>;
@@ -391,44 +425,48 @@
 def: Pat<(sext_inreg I64:$Rs, i16), (A2_sxtw (A2_sxth (LoReg $Rs)))>;
 def: Pat<(sext_inreg I64:$Rs, i8),  (A2_sxtw (A2_sxtb (LoReg $Rs)))>;
 
-def: Pat<(i64 (sext I1:$Pu)),
-         (Combinew (C2_muxii PredRegs:$Pu, -1, 0),
-                   (C2_muxii PredRegs:$Pu, -1, 0))>;
-
-def: Pat<(i32   (sext I1:$Pu)),   (C2_muxii I1:$Pu, -1, 0)>;
-def: Pat<(i32   (zext I1:$Pu)),   (C2_muxii I1:$Pu, 1, 0)>;
-def: Pat<(i64   (zext I1:$Pu)),   (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
-def: Pat<(v2i16 (sext V2I1:$Pu)), (S2_vtrunehb (C2_mask V2I1:$Pu))>;
-def: Pat<(v2i32 (sext V2I1:$Pu)), (C2_mask V2I1:$Pu)>;
-def: Pat<(v4i8  (sext V4I1:$Pu)), (S2_vtrunehb (C2_mask V4I1:$Pu))>;
-def: Pat<(v4i16 (sext V4I1:$Pu)), (C2_mask V4I1:$Pu)>;
-def: Pat<(v8i8  (sext V8I1:$Pu)), (C2_mask V8I1:$Pu)>;
-
 def: Pat<(i64 (sext I32:$Rs)), (A2_sxtw I32:$Rs)>;
 def: Pat<(Zext64 I32:$Rs),     (ToZext64 $Rs)>;
 def: Pat<(Aext64 I32:$Rs),     (ToZext64 $Rs)>;
 
 def: Pat<(i32 (trunc I64:$Rs)), (LoReg $Rs)>;
-def: Pat<(i1 (trunc I64:$Rs)),  (C2_tfrrp (LoReg $Rs))>;
+def: Pat<(i1 (trunc I32:$Rs)),  (S2_tstbit_i I32:$Rs, 0)>;
+def: Pat<(i1 (trunc I64:$Rs)),  (S2_tstbit_i (LoReg $Rs), 0)>;
 
 let AddedComplexity = 20 in {
   def: Pat<(and I32:$Rs, 255),   (A2_zxtb I32:$Rs)>;
   def: Pat<(and I32:$Rs, 65535), (A2_zxth I32:$Rs)>;
 }
 
-def: Pat<(i32 (anyext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>;
-def: Pat<(i64 (anyext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
-
-def: Pat<(v8i8  (zext   V8I1:$Pu)),  (C2_mask V8I1:$Pu)>;
-def: Pat<(v4i16 (zext   V4I1:$Pu)),  (C2_mask V4I1:$Pu)>;
-def: Pat<(v2i32 (zext   V2I1:$Pu)),  (C2_mask V2I1:$Pu)>;
-def: Pat<(v4i8  (zext   V4I1:$Pu)),  (LoReg (C2_mask V4I1:$Pu))>;
-def: Pat<(v2i16 (zext   V2I1:$Pu)),  (LoReg (C2_mask V2I1:$Pu))>;
-
-def: Pat<(v4i16 (zext   V4I8:$Rs)),  (S2_vzxtbh V4I8:$Rs)>;
-def: Pat<(v2i32 (zext   V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
-def: Pat<(v4i16 (anyext V4I8:$Rs)),  (S2_vzxtbh V4I8:$Rs)>;
-def: Pat<(v2i32 (anyext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
+// Extensions from i1 or vectors of i1.
+def: Pat<(i32 (azext I1:$Pu)), (C2_muxii I1:$Pu, 1, 0)>;
+def: Pat<(i64 (azext I1:$Pu)), (ToZext64 (C2_muxii I1:$Pu, 1, 0))>;
+def: Pat<(i32  (sext I1:$Pu)), (C2_muxii I1:$Pu, -1, 0)>;
+def: Pat<(i64  (sext I1:$Pu)), (Combinew (C2_muxii PredRegs:$Pu, -1, 0),
+                                         (C2_muxii PredRegs:$Pu, -1, 0))>;
+
+def: Pat<(v2i16 (sext V2I1:$Pu)), (S2_vtrunehb (C2_mask V2I1:$Pu))>;
+def: Pat<(v2i32 (sext V2I1:$Pu)), (C2_mask V2I1:$Pu)>;
+def: Pat<(v4i8  (sext V4I1:$Pu)), (S2_vtrunehb (C2_mask V4I1:$Pu))>;
+def: Pat<(v4i16 (sext V4I1:$Pu)), (C2_mask V4I1:$Pu)>;
+def: Pat<(v8i8  (sext V8I1:$Pu)), (C2_mask V8I1:$Pu)>;
+
+def Vsplatpi: OutPatFrag<(ops node:$V),
+                         (Combinew (A2_tfrsi $V), (A2_tfrsi $V))>;
+
+def: Pat<(v2i16 (azext V2I1:$Pu)),
+         (A2_andir (LoReg (C2_mask V2I1:$Pu)), (i32 0x00010001))>;
+def: Pat<(v2i32 (azext V2I1:$Pu)),
+         (A2_andp (C2_mask V2I1:$Pu), (A2_combineii (i32 1), (i32 1)))>;
+def: Pat<(v4i8 (azext V4I1:$Pu)),
+         (A2_andir (LoReg (C2_mask V4I1:$Pu)), (i32 0x01010101))>;
+def: Pat<(v4i16 (azext V4I1:$Pu)),
+         (A2_andp (C2_mask V4I1:$Pu), (Vsplatpi (i32 0x00010001)))>;
+def: Pat<(v8i8 (azext V8I1:$Pu)),
+         (A2_andp (C2_mask V8I1:$Pu), (Vsplatpi (i32 0x01010101)))>;
+
+def: Pat<(v4i16 (azext  V4I8:$Rs)),  (S2_vzxtbh V4I8:$Rs)>;
+def: Pat<(v2i32 (azext  V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>;
 def: Pat<(v4i16 (sext   V4I8:$Rs)),  (S2_vsxtbh V4I8:$Rs)>;
 def: Pat<(v2i32 (sext   V2I16:$Rs)), (S2_vsxthw V2I16:$Rs)>;
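Note: the azext-from-predicate patterns rely on C2_mask expanding each
predicate bit into an all-ones lane, then masking down to 0/1 with a splatted
constant. A C++ model of the v4i8 case (a sketch, assuming C2_mask's
per-byte-lane expansion):

    #include <cstdint>

    uint32_t azextV4I1(uint8_t Pred) {      // low 4 bits = lane predicates
      uint32_t Mask = 0;
      for (int I = 0; I < 4; ++I)
        if (Pred & (1u << I))
          Mask |= 0xFFu << (8 * I);         // C2_mask byte lane
      return Mask & 0x01010101;             // A2_andir with the splat
    }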
 
@@ -470,10 +508,10 @@
 }
 
 multiclass BoolAccRRR_pat<InstHexagon MI, PatFrag AccOp, PatFrag Op> {
-  def: AccRRR_pat<MI, AccOp, Op,   I1,   I1>;
-  def: AccRRR_pat<MI, AccOp, Op, V2I1, V2I1>;
-  def: AccRRR_pat<MI, AccOp, Op, V4I1, V4I1>;
-  def: AccRRR_pat<MI, AccOp, Op, V8I1, V8I1>;
+  def: AccRRR_pat<MI, AccOp, Op,   I1,   I1,   I1>;
+  def: AccRRR_pat<MI, AccOp, Op, V2I1, V2I1, V2I1>;
+  def: AccRRR_pat<MI, AccOp, Op, V4I1, V4I1, V4I1>;
+  def: AccRRR_pat<MI, AccOp, Op, V8I1, V8I1, V8I1>;
 }
 
 defm: BoolOpR_RR_pat<C2_and,   And>;
@@ -518,7 +556,7 @@
 // Patfrag to convert the usual comparison patfrags (e.g. setlt) to ones
 // that reverse the order of the operands.
 class RevCmp<PatFrag F>
-  : PatFrag<(ops node:$rhs, node:$lhs), F.Fragment, F.PredicateCode,
+  : PatFrag<(ops node:$rhs, node:$lhs), !head(F.Fragments), F.PredicateCode,
             F.OperandTransform>;
 
 def: OpR_RR_pat<C2_cmpeq,     seteq,          i1,   I32>;
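Note: RevCmp only swaps the operand order of an existing comparison fragment.
The identity it exploits, in plain C++:

    #include <cstdint>

    // setlt(a, b) == setgt(b, a): a greater-than with commuted inputs,
    // so C2_cmpgt and friends can also implement lt/le/ult/ule.
    bool setlt(int32_t A, int32_t B) { return B > A; }   // cmpgt(B, A)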
@@ -562,31 +600,29 @@
 def: OpR_RR_pat<A2_vcmpwgtu,  setugt,         i1,   V2I32>;
 def: OpR_RR_pat<A2_vcmpwgtu,  setugt,         v2i1, V2I32>;
 
-let Predicates = [HasV5T] in {
-  def: OpR_RR_pat<F2_sfcmpeq,   seteq,          i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   setgt,          i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   setge,          i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpeq,   setoeq,         i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   setogt,         i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   setoge,         i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setolt>, i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setole>, i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setlt>,  i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setle>,  i1, F32>;
-  def: OpR_RR_pat<F2_sfcmpuo,   setuo,          i1, F32>;
-
-  def: OpR_RR_pat<F2_dfcmpeq,   seteq,          i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   setgt,          i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   setge,          i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpeq,   setoeq,         i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   setogt,         i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   setoge,         i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setolt>, i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setole>, i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setlt>,  i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setle>,  i1, F64>;
-  def: OpR_RR_pat<F2_dfcmpuo,   setuo,          i1, F64>;
-}
+def: OpR_RR_pat<F2_sfcmpeq,   seteq,          i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   setgt,          i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   setge,          i1, F32>;
+def: OpR_RR_pat<F2_sfcmpeq,   setoeq,         i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   setogt,         i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   setoge,         i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setolt>, i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setole>, i1, F32>;
+def: OpR_RR_pat<F2_sfcmpgt,   RevCmp<setlt>,  i1, F32>;
+def: OpR_RR_pat<F2_sfcmpge,   RevCmp<setle>,  i1, F32>;
+def: OpR_RR_pat<F2_sfcmpuo,   setuo,          i1, F32>;
+
+def: OpR_RR_pat<F2_dfcmpeq,   seteq,          i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   setgt,          i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   setge,          i1, F64>;
+def: OpR_RR_pat<F2_dfcmpeq,   setoeq,         i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   setogt,         i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   setoge,         i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setolt>, i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setole>, i1, F64>;
+def: OpR_RR_pat<F2_dfcmpgt,   RevCmp<setlt>,  i1, F64>;
+def: OpR_RR_pat<F2_dfcmpge,   RevCmp<setle>,  i1, F64>;
+def: OpR_RR_pat<F2_dfcmpuo,   setuo,          i1, F64>;
 
 // Avoid C4_cmpneqi, C4_cmpltei, C4_cmplteui, since they cannot form compounds.
 
@@ -597,27 +633,40 @@
 def: Pat<(i1 (setule I32:$Rs, anyimm:$u5)),
          (C2_not (C2_cmpgtui I32:$Rs, imm:$u5))>;
 
-def: Pat<(i1 (setne I32:$Rs, I32:$Rt)),
-         (C2_not (C2_cmpeq I32:$Rs, I32:$Rt))>;
-def: Pat<(i1 (setle I32:$Rs, I32:$Rt)),
-         (C2_not (C2_cmpgt I32:$Rs, I32:$Rt))>;
-def: Pat<(i1 (setule I32:$Rs, I32:$Rt)),
-         (C2_not (C2_cmpgtu I32:$Rs, I32:$Rt))>;
-def: Pat<(i1 (setge I32:$Rs, I32:$Rt)),
-         (C2_not (C2_cmpgt I32:$Rt, I32:$Rs))>;
-def: Pat<(i1 (setuge I32:$Rs, I32:$Rt)),
-         (C2_not (C2_cmpgtu I32:$Rt, I32:$Rs))>;
-
-def: Pat<(i1 (setle I64:$Rs, I64:$Rt)),
-         (C2_not (C2_cmpgtp I64:$Rs, I64:$Rt))>;
-def: Pat<(i1 (setne I64:$Rs, I64:$Rt)),
-         (C2_not (C2_cmpeqp I64:$Rs, I64:$Rt))>;
-def: Pat<(i1 (setge I64:$Rs, I64:$Rt)),
-         (C2_not (C2_cmpgtp I64:$Rt, I64:$Rs))>;
-def: Pat<(i1 (setuge I64:$Rs, I64:$Rt)),
-         (C2_not (C2_cmpgtup I64:$Rt, I64:$Rs))>;
-def: Pat<(i1 (setule I64:$Rs, I64:$Rt)),
-         (C2_not (C2_cmpgtup I64:$Rs, I64:$Rt))>;
+class OpmR_RR_pat<PatFrag Output, PatFrag Op, ValueType ResType,
+                  PatFrag RsPred, PatFrag RtPred = RsPred>
+  : Pat<(ResType (Op RsPred:$Rs, RtPred:$Rt)),
+        (Output RsPred:$Rs, RtPred:$Rt)>;
+
+class Outn<InstHexagon MI>
+  : OutPatFrag<(ops node:$Rs, node:$Rt),
+               (C2_not (MI $Rs, $Rt))>;
+
+def: OpmR_RR_pat<Outn<C2_cmpeq>,    setne,          i1,   I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgt>,    setle,          i1,   I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgtu>,   setule,         i1,   I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgt>,    RevCmp<setge>,  i1,   I32>;
+def: OpmR_RR_pat<Outn<C2_cmpgtu>,   RevCmp<setuge>, i1,   I32>;
+def: OpmR_RR_pat<Outn<C2_cmpeqp>,   setne,          i1,   I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtp>,   setle,          i1,   I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtup>,  setule,         i1,   I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtp>,   RevCmp<setge>,  i1,   I64>;
+def: OpmR_RR_pat<Outn<C2_cmpgtup>,  RevCmp<setuge>, i1,   I64>;
+def: OpmR_RR_pat<Outn<A2_vcmpbeq>,  setne,          v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A4_vcmpbgt>,  setle,          v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A2_vcmpbgtu>, setule,         v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A4_vcmpbgt>,  RevCmp<setge>,  v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A2_vcmpbgtu>, RevCmp<setuge>, v8i1, V8I8>;
+def: OpmR_RR_pat<Outn<A2_vcmpheq>,  setne,          v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgt>,  setle,          v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgtu>, setule,         v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgt>,  RevCmp<setge>,  v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmphgtu>, RevCmp<setuge>, v4i1, V4I16>;
+def: OpmR_RR_pat<Outn<A2_vcmpweq>,  setne,          v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgt>,  setle,          v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgtu>, setule,         v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgt>,  RevCmp<setge>,  v2i1, V2I32>;
+def: OpmR_RR_pat<Outn<A2_vcmpwgtu>, RevCmp<setuge>, v2i1, V2I32>;
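Note: Hexagon only has eq/gt/gtu comparisons, so ne/le/ule (and ge/uge after
RevCmp's operand swap) are produced as the negation of an existing compare
via the Outn wrapper. In plain C++:

    #include <cstdint>

    bool setule(uint32_t A, uint32_t B) { return !(A > B); } // not(cmpgtu)
    bool setge(int32_t A, int32_t B)    { return !(B > A); } // RevCmp + not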
 
 let AddedComplexity = 100 in {
   def: Pat<(i1 (seteq (and (xor I32:$Rs, I32:$Rt), 255), 0)),
@@ -679,25 +728,10 @@
 def: Pat<(i32 (zext (i1 (setne I32:$Rs, anyimm:$s8)))),
          (A4_rcmpneqi I32:$Rs, imm:$s8)>;
 
-def: Pat<(i1 (setne I1:$Ps, I1:$Pt)),
-         (C2_xor I1:$Ps, I1:$Pt)>;
-
-def: Pat<(i1 (seteq V4I8:$Rs, V4I8:$Rt)),
-         (A2_vcmpbeq (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setgt V4I8:$Rs, V4I8:$Rt)),
-         (A4_vcmpbgt (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setugt V4I8:$Rs, V4I8:$Rt)),
-         (A2_vcmpbgtu (ToZext64 $Rs), (ToZext64 $Rt))>;
-
-def: Pat<(i1 (seteq V2I16:$Rs, V2I16:$Rt)),
-         (A2_vcmpheq (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setgt V2I16:$Rs, V2I16:$Rt)),
-         (A2_vcmphgt (ToZext64 $Rs), (ToZext64 $Rt))>;
-def: Pat<(i1 (setugt V2I16:$Rs, V2I16:$Rt)),
-         (A2_vcmphgtu (ToZext64 $Rs), (ToZext64 $Rt))>;
-
-def: Pat<(v2i1 (setne V2I32:$Rs, V2I32:$Rt)),
-         (C2_not (v2i1 (A2_vcmpbeq V2I32:$Rs, V2I32:$Rt)))>;
+def: Pat<(i1 (seteq I1:$Ps, (i1 -1))), (I1:$Ps)>;
+def: Pat<(i1 (setne I1:$Ps, (i1 -1))), (C2_not I1:$Ps)>;
+def: Pat<(i1 (seteq I1:$Ps, I1:$Pt)),  (C2_xor I1:$Ps, (C2_not I1:$Pt))>;
+def: Pat<(i1 (setne I1:$Ps, I1:$Pt)),  (C2_xor I1:$Ps, I1:$Pt)>;
 
 // Floating-point comparisons with checks for ordered/unordered status.
 
@@ -705,47 +739,34 @@
   : OutPatFrag<(ops node:$Rs, node:$Rt),
                (MI1 (MI2 $Rs, $Rt), (MI3 $Rs, $Rt))>;
 
-class OpmR_RR_pat<PatFrag Output, PatFrag Op, ValueType ResType,
-                  PatFrag RsPred, PatFrag RtPred = RsPred>
-  : Pat<(ResType (Op RsPred:$Rs, RtPred:$Rt)),
-        (Output RsPred:$Rs, RtPred:$Rt)>;
-
 class Cmpuf<InstHexagon MI>:  T3<C2_or,  F2_sfcmpuo, MI>;
 class Cmpud<InstHexagon MI>:  T3<C2_or,  F2_dfcmpuo, MI>;
 
 class Cmpufn<InstHexagon MI>: T3<C2_orn, F2_sfcmpuo, MI>;
 class Cmpudn<InstHexagon MI>: T3<C2_orn, F2_dfcmpuo, MI>;
 
-let Predicates = [HasV5T] in {
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpeq>,  setueq,         i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  setuge,         i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  setugt,         i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  RevCmp<setule>, i1, F32>;
-  def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  RevCmp<setult>, i1, F32>;
-  def: OpmR_RR_pat<Cmpufn<F2_sfcmpeq>, setune,         i1, F32>;
-
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpeq>,  setueq,         i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  setuge,         i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  setugt,         i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  RevCmp<setule>, i1, F64>;
-  def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  RevCmp<setult>, i1, F64>;
-  def: OpmR_RR_pat<Cmpudn<F2_dfcmpeq>, setune,         i1, F64>;
-}
-
-class Outn<InstHexagon MI>
-  : OutPatFrag<(ops node:$Rs, node:$Rt),
-               (C2_not (MI $Rs, $Rt))>;
-
-let Predicates = [HasV5T] in {
-  def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setone, i1, F32>;
-  def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setne,  i1, F32>;
-
-  def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setone, i1, F64>;
-  def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setne,  i1, F64>;
-
-  def: OpmR_RR_pat<Outn<F2_sfcmpuo>, seto,   i1, F32>;
-  def: OpmR_RR_pat<Outn<F2_dfcmpuo>, seto,   i1, F64>;
-}
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpeq>,  setueq,         i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  setuge,         i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  setugt,         i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpge>,  RevCmp<setule>, i1, F32>;
+def: OpmR_RR_pat<Cmpuf<F2_sfcmpgt>,  RevCmp<setult>, i1, F32>;
+def: OpmR_RR_pat<Cmpufn<F2_sfcmpeq>, setune,         i1, F32>;
+
+def: OpmR_RR_pat<Cmpud<F2_dfcmpeq>,  setueq,         i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  setuge,         i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  setugt,         i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpge>,  RevCmp<setule>, i1, F64>;
+def: OpmR_RR_pat<Cmpud<F2_dfcmpgt>,  RevCmp<setult>, i1, F64>;
+def: OpmR_RR_pat<Cmpudn<F2_dfcmpeq>, setune,         i1, F64>;
+
+def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setone, i1, F32>;
+def: OpmR_RR_pat<Outn<F2_sfcmpeq>, setne,  i1, F32>;
+
+def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setone, i1, F64>;
+def: OpmR_RR_pat<Outn<F2_dfcmpeq>, setne,  i1, F64>;
+
+def: OpmR_RR_pat<Outn<F2_sfcmpuo>, seto,   i1, F32>;
+def: OpmR_RR_pat<Outn<F2_dfcmpuo>, seto,   i1, F64>;
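Note: the Cmpuf/Cmpud combiners implement the "unordered or ..." predicates
by OR-ing the ordered compare with an explicit unordered check (T3 expands to
MI1(MI2, MI3)). The equivalence in C++:

    #include <cmath>

    // setueq = unordered-or-equal: or(sfcmpuo(a, b), sfcmpeq(a, b)).
    bool setueq(float A, float B) {
      return std::isunordered(A, B) || A == B;
    }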
 
 
 // --(6) Select ----------------------------------------------------------
@@ -775,32 +796,30 @@
          (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
                    (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
 
-let Predicates = [HasV5T] in {
-  def: Pat<(select I1:$Pu, F32:$Rs, f32ImmPred:$I),
-           (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
-  def: Pat<(select I1:$Pu, f32ImmPred:$I, F32:$Rt),
-           (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
-  def: Pat<(select I1:$Pu, F32:$Rs, F32:$Rt),
-           (C2_mux I1:$Pu, F32:$Rs, F32:$Rt)>;
-  def: Pat<(select I1:$Pu, F64:$Rs, F64:$Rt),
-           (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
-                     (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
-
-  def: Pat<(select (i1 (setult F32:$Ra, F32:$Rb)), F32:$Rs, F32:$Rt),
-           (C2_mux (F2_sfcmpgt F32:$Rb, F32:$Ra), F32:$Rs, F32:$Rt)>;
-  def: Pat<(select (i1 (setult F64:$Ra, F64:$Rb)), F64:$Rs, F64:$Rt),
-           (C2_vmux (F2_dfcmpgt F64:$Rb, F64:$Ra), F64:$Rs, F64:$Rt)>;
-
-  def: Pat<(select (not I1:$Pu), f32ImmPred:$I, F32:$Rs),
-           (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
-  def: Pat<(select (not I1:$Pu), F32:$Rt, f32ImmPred:$I),
-           (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
-}
+def: Pat<(select I1:$Pu, F32:$Rs, f32ImmPred:$I),
+         (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
+def: Pat<(select I1:$Pu, f32ImmPred:$I, F32:$Rt),
+         (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
+def: Pat<(select I1:$Pu, F32:$Rs, F32:$Rt),
+         (C2_mux I1:$Pu, F32:$Rs, F32:$Rt)>;
+def: Pat<(select I1:$Pu, F64:$Rs, F64:$Rt),
+         (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
+                   (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
+
+def: Pat<(select (i1 (setult F32:$Ra, F32:$Rb)), F32:$Rs, F32:$Rt),
+         (C2_mux (F2_sfcmpgt F32:$Rb, F32:$Ra), F32:$Rs, F32:$Rt)>;
+def: Pat<(select (i1 (setult F64:$Ra, F64:$Rb)), F64:$Rs, F64:$Rt),
+         (C2_vmux (F2_dfcmpgt F64:$Rb, F64:$Ra), F64:$Rs, F64:$Rt)>;
+
+def: Pat<(select (not I1:$Pu), f32ImmPred:$I, F32:$Rs),
+         (C2_muxir I1:$Pu, F32:$Rs, (ftoi $I))>;
+def: Pat<(select (not I1:$Pu), F32:$Rt, f32ImmPred:$I),
+         (C2_muxri I1:$Pu, (ftoi $I), F32:$Rt)>;
 
 def: Pat<(select I1:$Pu, V4I8:$Rs, V4I8:$Rt),
-         (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>;
+         (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>;
 def: Pat<(select I1:$Pu, V2I16:$Rs, V2I16:$Rt),
-         (LoReg (C2_vmux I1:$Pu, (ToZext64 $Rs), (ToZext64 $Rt)))>;
+         (LoReg (C2_vmux I1:$Pu, (ToAext64 $Rs), (ToAext64 $Rt)))>;
 def: Pat<(select I1:$Pu, V2I32:$Rs, V2I32:$Rt),
          (Combinew (C2_mux I1:$Pu, (HiReg $Rs), (HiReg $Rt)),
                    (C2_mux I1:$Pu, (LoReg $Rs), (LoReg $Rt)))>;
@@ -863,7 +882,7 @@
   defm: SelMinMax_pats<setult, I64, A2_minup, A2_maxup>;
 }
 
-let AddedComplexity = 100, Predicates = [HasV5T] in {
+let AddedComplexity = 100 in {
   defm: SelMinMax_pats<setolt, F32, F2_sfmin, F2_sfmax>;
   defm: SelMinMax_pats<setole, F32, F2_sfmin, F2_sfmax>;
   defm: SelMinMax_pats<setogt, F32, F2_sfmax, F2_sfmin>;
@@ -919,7 +938,7 @@
 
 let AddedComplexity = 10 in
 def: Pat<(v8i8 (HexagonVSPLAT I32:$Rs)), (S6_vsplatrbp I32:$Rs)>,
-     Requires<[HasV62T]>;
+     Requires<[HasV62]>;
 def: Pat<(v8i8 (HexagonVSPLAT I32:$Rs)),
          (Combinew (S2_vsplatrb I32:$Rs), (S2_vsplatrb I32:$Rs))>;
 
@@ -980,11 +999,95 @@
 def: OpR_RR_pat<S2_lsr_r_p, Srl, i64, I64, I32>;
 def: OpR_RR_pat<S2_asl_r_p, Shl, i64, I64, I32>;
 
+// Funnel shifts.
+def IsMul8_U3: PatLeaf<(i32 imm), [{
+  uint64_t V = N->getZExtValue();
+  return V % 8 == 0 && isUInt<3>(V / 8);
+}]>;
+
+def Divu8: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i32);
+}]>;
+
+// Funnel shift-left.
+def FShl32i: OutPatFrag<(ops node:$Rs, node:$Rt, node:$S),
+  (HiReg (S2_asl_i_p (Combinew $Rs, $Rt), $S))>;
+def FShl32r: OutPatFrag<(ops node:$Rs, node:$Rt, node:$Ru),
+  (HiReg (S2_asl_r_p (Combinew $Rs, $Rt), $Ru))>;
+
+def FShl64i: OutPatFrag<(ops node:$Rs, node:$Rt, node:$S),
+  (S2_lsr_i_p_or (S2_asl_i_p $Rt, $S),  $Rs, (Subi<64> $S))>;
+def FShl64r: OutPatFrag<(ops node:$Rs, node:$Rt, node:$Ru),
+  (S2_lsr_r_p_or (S2_asl_r_p $Rt, $Ru), $Rs, (A2_subri 64, $Ru))>;
+
+// Combined SDNodeXForm: (Divu8 (Subi<64> $S))
+def Divu64_8: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((64 - N->getSExtValue()) / 8,
+                                   SDLoc(N), MVT::i32);
+}]>;
+
+// Special cases:
+let AddedComplexity = 100 in {
+  def: Pat<(fshl I32:$Rs, I32:$Rt, (i32 16)),
+           (A2_combine_hl I32:$Rs, I32:$Rt)>;
+  def: Pat<(fshl I64:$Rs, I64:$Rt, IsMul8_U3:$S),
+           (S2_valignib I64:$Rs, I64:$Rt, (Divu64_8 $S))>;
+}
+
+let Predicates = [HasV60], AddedComplexity = 50 in {
+  def: OpR_RI_pat<S6_rol_i_r, Rol, i32, I32, u5_0ImmPred>;
+  def: OpR_RI_pat<S6_rol_i_p, Rol, i64, I64, u6_0ImmPred>;
+}
+let AddedComplexity = 30 in {
+  def: Pat<(rotl I32:$Rs, u5_0ImmPred:$S),          (FShl32i $Rs, $Rs, imm:$S)>;
+  def: Pat<(rotl I64:$Rs, u6_0ImmPred:$S),          (FShl64i $Rs, $Rs, imm:$S)>;
+  def: Pat<(fshl I32:$Rs, I32:$Rt, u5_0ImmPred:$S), (FShl32i $Rs, $Rt, imm:$S)>;
+  def: Pat<(fshl I64:$Rs, I64:$Rt, u6_0ImmPred:$S), (FShl64i $Rs, $Rt, imm:$S)>;
+}
+def: Pat<(rotl I32:$Rs, I32:$Rt),           (FShl32r $Rs, $Rs, $Rt)>;
+def: Pat<(rotl I64:$Rs, I32:$Rt),           (FShl64r $Rs, $Rs, $Rt)>;
+def: Pat<(fshl I32:$Rs, I32:$Rt, I32:$Ru),  (FShl32r $Rs, $Rt, $Ru)>;
+def: Pat<(fshl I64:$Rs, I64:$Rt, I32:$Ru),  (FShl64r $Rs, $Rt, $Ru)>;
+
+// Funnel shift-right.
+def FShr32i: OutPatFrag<(ops node:$Rs, node:$Rt, node:$S),
+  (LoReg (S2_lsr_i_p (Combinew $Rs, $Rt), $S))>;
+def FShr32r: OutPatFrag<(ops node:$Rs, node:$Rt, node:$Ru),
+  (LoReg (S2_lsr_r_p (Combinew $Rs, $Rt), $Ru))>;
+
+def FShr64i: OutPatFrag<(ops node:$Rs, node:$Rt, node:$S),
+  (S2_asl_i_p_or (S2_lsr_i_p $Rt, $S),  $Rs, (Subi<64> $S))>;
+def FShr64r: OutPatFrag<(ops node:$Rs, node:$Rt, node:$Ru),
+  (S2_asl_r_p_or (S2_lsr_r_p $Rt, $Ru), $Rs, (A2_subri 64, $Ru))>;
+
+// Special cases:
+let AddedComplexity = 100 in {
+  def: Pat<(fshr I32:$Rs, I32:$Rt, (i32 16)),
+           (A2_combine_hl I32:$Rs, I32:$Rt)>;
+  def: Pat<(fshr I64:$Rs, I64:$Rt, IsMul8_U3:$S),
+           (S2_valignib I64:$Rs, I64:$Rt, (Divu8 $S))>;
+}
+
+let Predicates = [HasV60], AddedComplexity = 50 in {
+  def: Pat<(rotr I32:$Rs, u5_0ImmPred:$S), (S6_rol_i_r I32:$Rs, (Subi<32> $S))>;
+  def: Pat<(rotr I64:$Rs, u6_0ImmPred:$S), (S6_rol_i_p I64:$Rs, (Subi<64> $S))>;
+}
+let AddedComplexity = 30 in {
+  def: Pat<(rotr I32:$Rs, u5_0ImmPred:$S),          (FShr32i $Rs, $Rs, imm:$S)>;
+  def: Pat<(rotr I64:$Rs, u6_0ImmPred:$S),          (FShr64i $Rs, $Rs, imm:$S)>;
+  def: Pat<(fshr I32:$Rs, I32:$Rt, u5_0ImmPred:$S), (FShr32i $Rs, $Rt, imm:$S)>;
+  def: Pat<(fshr I64:$Rs, I64:$Rt, u6_0ImmPred:$S), (FShr64i $Rs, $Rt, imm:$S)>;
+}
+def: Pat<(rotr I32:$Rs, I32:$Rt),           (FShr32r $Rs, $Rs, $Rt)>;
+def: Pat<(rotr I64:$Rs, I32:$Rt),           (FShr64r $Rs, $Rs, $Rt)>;
+def: Pat<(fshr I32:$Rs, I32:$Rt, I32:$Ru),  (FShr32r $Rs, $Rt, $Ru)>;
+def: Pat<(fshr I64:$Rs, I64:$Rt, I32:$Ru),  (FShr64r $Rs, $Rt, $Ru)>;
+
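Note: LLVM's fshl/fshr concatenate the two operands and shift, with the shift
amount taken modulo the bit width; rotl(x, s) is fshl(x, x, s). A C++ model
of fshl and of the FShl32i lowering (a sketch):

    #include <cstdint>

    uint32_t fshl32(uint32_t A, uint32_t B, uint32_t S) {
      S %= 32;
      return S ? (A << S) | (B >> (32 - S)) : A;
    }

    // FShl32i: one 64-bit shift of Combinew($Rs, $Rt), keep the HiReg.
    uint32_t fshl32ViaPair(uint32_t A, uint32_t B, uint32_t S) {
      uint64_t Pair = ((uint64_t)A << 32) | B;       // Combinew
      return (uint32_t)((Pair << (S % 32)) >> 32);   // HiReg(S2_asl_i_p)
    }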
 
 def: Pat<(sra (add (sra I32:$Rs, u5_0ImmPred:$u5), 1), (i32 1)),
          (S2_asr_i_r_rnd I32:$Rs, imm:$u5)>;
 def: Pat<(sra (add (sra I64:$Rs, u6_0ImmPred:$u6), 1), (i32 1)),
-         (S2_asr_i_p_rnd I64:$Rs, imm:$u6)>, Requires<[HasV5T]>;
+         (S2_asr_i_p_rnd I64:$Rs, imm:$u6)>;
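Note: the pattern above recognizes the add-then-shift idiom for a rounding
arithmetic shift. In C++:

    #include <cstdint>

    // ((x >> n) + 1) >> 1 is selected as S2_asr_i_r_rnd(x, n).
    int32_t asrRnd(int32_t X, unsigned N) {
      return ((X >> N) + 1) >> 1;
    }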
 
 // Prefer S2_addasl_rrri over S2_asl_i_r_acc.
 let AddedComplexity = 120 in
@@ -1025,41 +1128,55 @@
   def: AccRRI_pat<S2_asl_i_p_and,   And, Su<Shl>, I64, u6_0ImmPred>;
   def: AccRRI_pat<S2_asl_i_p_or,    Or,  Su<Shl>, I64, u6_0ImmPred>;
   def: AccRRI_pat<S2_asl_i_p_xacc,  Xor, Su<Shl>, I64, u6_0ImmPred>;
+
+  let Predicates = [HasV60] in {
+    def: AccRRI_pat<S6_rol_i_r_acc,   Add, Su<Rol>, I32, u5_0ImmPred>;
+    def: AccRRI_pat<S6_rol_i_r_nac,   Sub, Su<Rol>, I32, u5_0ImmPred>;
+    def: AccRRI_pat<S6_rol_i_r_and,   And, Su<Rol>, I32, u5_0ImmPred>;
+    def: AccRRI_pat<S6_rol_i_r_or,    Or,  Su<Rol>, I32, u5_0ImmPred>;
+    def: AccRRI_pat<S6_rol_i_r_xacc,  Xor, Su<Rol>, I32, u5_0ImmPred>;
+
+    def: AccRRI_pat<S6_rol_i_p_acc,   Add, Su<Rol>, I64, u6_0ImmPred>;
+    def: AccRRI_pat<S6_rol_i_p_nac,   Sub, Su<Rol>, I64, u6_0ImmPred>;
+    def: AccRRI_pat<S6_rol_i_p_and,   And, Su<Rol>, I64, u6_0ImmPred>;
+    def: AccRRI_pat<S6_rol_i_p_or,    Or,  Su<Rol>, I64, u6_0ImmPred>;
+    def: AccRRI_pat<S6_rol_i_p_xacc,  Xor, Su<Rol>, I64, u6_0ImmPred>;
+  }
 }
 
 let AddedComplexity = 100 in {
-  def: AccRRR_pat<S2_asr_r_r_acc,   Add, Su<Sra>, I32, I32>;
-  def: AccRRR_pat<S2_asr_r_r_nac,   Sub, Su<Sra>, I32, I32>;
-  def: AccRRR_pat<S2_asr_r_r_and,   And, Su<Sra>, I32, I32>;
-  def: AccRRR_pat<S2_asr_r_r_or,    Or,  Su<Sra>, I32, I32>;
-
-  def: AccRRR_pat<S2_asr_r_p_acc,   Add, Su<Sra>, I64, I32>;
-  def: AccRRR_pat<S2_asr_r_p_nac,   Sub, Su<Sra>, I64, I32>;
-  def: AccRRR_pat<S2_asr_r_p_and,   And, Su<Sra>, I64, I32>;
-  def: AccRRR_pat<S2_asr_r_p_or,    Or,  Su<Sra>, I64, I32>;
-  def: AccRRR_pat<S2_asr_r_p_xor,   Xor, Su<Sra>, I64, I32>;
-
-  def: AccRRR_pat<S2_lsr_r_r_acc,   Add, Su<Srl>, I32, I32>;
-  def: AccRRR_pat<S2_lsr_r_r_nac,   Sub, Su<Srl>, I32, I32>;
-  def: AccRRR_pat<S2_lsr_r_r_and,   And, Su<Srl>, I32, I32>;
-  def: AccRRR_pat<S2_lsr_r_r_or,    Or,  Su<Srl>, I32, I32>;
-
-  def: AccRRR_pat<S2_lsr_r_p_acc,   Add, Su<Srl>, I64, I32>;
-  def: AccRRR_pat<S2_lsr_r_p_nac,   Sub, Su<Srl>, I64, I32>;
-  def: AccRRR_pat<S2_lsr_r_p_and,   And, Su<Srl>, I64, I32>;
-  def: AccRRR_pat<S2_lsr_r_p_or,    Or,  Su<Srl>, I64, I32>;
-  def: AccRRR_pat<S2_lsr_r_p_xor,   Xor, Su<Srl>, I64, I32>;
-
-  def: AccRRR_pat<S2_asl_r_r_acc,   Add, Su<Shl>, I32, I32>;
-  def: AccRRR_pat<S2_asl_r_r_nac,   Sub, Su<Shl>, I32, I32>;
-  def: AccRRR_pat<S2_asl_r_r_and,   And, Su<Shl>, I32, I32>;
-  def: AccRRR_pat<S2_asl_r_r_or,    Or,  Su<Shl>, I32, I32>;
-
-  def: AccRRR_pat<S2_asl_r_p_acc,   Add, Su<Shl>, I64, I32>;
-  def: AccRRR_pat<S2_asl_r_p_nac,   Sub, Su<Shl>, I64, I32>;
-  def: AccRRR_pat<S2_asl_r_p_and,   And, Su<Shl>, I64, I32>;
-  def: AccRRR_pat<S2_asl_r_p_or,    Or,  Su<Shl>, I64, I32>;
-  def: AccRRR_pat<S2_asl_r_p_xor,   Xor, Su<Shl>, I64, I32>;
+  def: AccRRR_pat<S2_asr_r_r_acc,   Add, Su<Sra>, I32, I32, I32>;
+  def: AccRRR_pat<S2_asr_r_r_nac,   Sub, Su<Sra>, I32, I32, I32>;
+  def: AccRRR_pat<S2_asr_r_r_and,   And, Su<Sra>, I32, I32, I32>;
+  def: AccRRR_pat<S2_asr_r_r_or,    Or,  Su<Sra>, I32, I32, I32>;
+
+  def: AccRRR_pat<S2_asr_r_p_acc,   Add, Su<Sra>, I64, I64, I32>;
+  def: AccRRR_pat<S2_asr_r_p_nac,   Sub, Su<Sra>, I64, I64, I32>;
+  def: AccRRR_pat<S2_asr_r_p_and,   And, Su<Sra>, I64, I64, I32>;
+  def: AccRRR_pat<S2_asr_r_p_or,    Or,  Su<Sra>, I64, I64, I32>;
+  def: AccRRR_pat<S2_asr_r_p_xor,   Xor, Su<Sra>, I64, I64, I32>;
+
+  def: AccRRR_pat<S2_lsr_r_r_acc,   Add, Su<Srl>, I32, I32, I32>;
+  def: AccRRR_pat<S2_lsr_r_r_nac,   Sub, Su<Srl>, I32, I32, I32>;
+  def: AccRRR_pat<S2_lsr_r_r_and,   And, Su<Srl>, I32, I32, I32>;
+  def: AccRRR_pat<S2_lsr_r_r_or,    Or,  Su<Srl>, I32, I32, I32>;
+
+  def: AccRRR_pat<S2_lsr_r_p_acc,   Add, Su<Srl>, I64, I64, I32>;
+  def: AccRRR_pat<S2_lsr_r_p_nac,   Sub, Su<Srl>, I64, I64, I32>;
+  def: AccRRR_pat<S2_lsr_r_p_and,   And, Su<Srl>, I64, I64, I32>;
+  def: AccRRR_pat<S2_lsr_r_p_or,    Or,  Su<Srl>, I64, I64, I32>;
+  def: AccRRR_pat<S2_lsr_r_p_xor,   Xor, Su<Srl>, I64, I64, I32>;
+
+  def: AccRRR_pat<S2_asl_r_r_acc,   Add, Su<Shl>, I32, I32, I32>;
+  def: AccRRR_pat<S2_asl_r_r_nac,   Sub, Su<Shl>, I32, I32, I32>;
+  def: AccRRR_pat<S2_asl_r_r_and,   And, Su<Shl>, I32, I32, I32>;
+  def: AccRRR_pat<S2_asl_r_r_or,    Or,  Su<Shl>, I32, I32, I32>;
+
+  def: AccRRR_pat<S2_asl_r_p_acc,   Add, Su<Shl>, I64, I64, I32>;
+  def: AccRRR_pat<S2_asl_r_p_nac,   Sub, Su<Shl>, I64, I64, I32>;
+  def: AccRRR_pat<S2_asl_r_p_and,   And, Su<Shl>, I64, I64, I32>;
+  def: AccRRR_pat<S2_asl_r_p_or,    Or,  Su<Shl>, I64, I64, I32>;
+  def: AccRRR_pat<S2_asl_r_p_xor,   Xor, Su<Shl>, I64, I64, I32>;
 }
 
 
@@ -1092,14 +1209,6 @@
          (Combinew (A2_combine_ll I32:$d, I32:$c),
                    (A2_combine_ll I32:$b, I32:$a))>;
 
-def: Pat<(or (or (shl (or (shl (i32 (extloadi8 (add I32:$b, 3))),
-                               (i32 8)),
-                          (i32 (zextloadi8 (add I32:$b, 2)))),
-                      (i32 16)),
-                 (shl (i32 (zextloadi8 (add I32:$b, 1))), (i32 8))),
-             (zextloadi8 I32:$b)),
-         (A2_swiz (L2_loadri_io IntRegs:$b, 0))>;
-
 let AddedComplexity = 200 in {
   def: Pat<(or (shl I32:$Rt, (i32 16)), (and I32:$Rs, (i32 65535))),
            (A2_combine_ll I32:$Rt, I32:$Rs)>;
@@ -1145,32 +1254,38 @@
 def: Pat<(shl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c))),
          (S2_asl_i_vh V4I16:$b, imm:$c)>;
 
+def: Pat<(HexagonVASR V2I16:$Rs, u4_0ImmPred:$S),
+         (LoReg (S2_asr_i_vh (ToAext64 $Rs), imm:$S))>;
+def: Pat<(HexagonVASL V2I16:$Rs, u4_0ImmPred:$S),
+         (LoReg (S2_asl_i_vh (ToAext64 $Rs), imm:$S))>;
+def: Pat<(HexagonVLSR V2I16:$Rs, u4_0ImmPred:$S),
+         (LoReg (S2_lsr_i_vh (ToAext64 $Rs), imm:$S))>;
+def: Pat<(HexagonVASR V2I16:$Rs, I32:$Rt),
+         (LoReg (S2_asr_r_vh (ToAext64 $Rs), I32:$Rt))>;
+def: Pat<(HexagonVASL V2I16:$Rs, I32:$Rt),
+         (LoReg (S2_asl_r_vh (ToAext64 $Rs), I32:$Rt))>;
+def: Pat<(HexagonVLSR V2I16:$Rs, I32:$Rt),
+         (LoReg (S2_lsr_r_vh (ToAext64 $Rs), I32:$Rt))>;
+
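Note: v2i16 shifts are implemented by widening to a 64-bit register pair
(high half undefined via ToAext64), shifting all four halfword lanes, and
keeping the low word; the register-amount patterns use the _r_vh instruction
forms. A C++ model of the immediate left-shift case:

    #include <cstdint>

    uint32_t vaslV2I16(uint32_t V, unsigned S) {
      uint16_t Lo = (uint16_t)(V & 0xFFFF) << S;   // lane 0
      uint16_t Hi = (uint16_t)(V >> 16) << S;      // lane 1
      return ((uint32_t)Hi << 16) | Lo;            // lanes independent
    }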
 
 // --(9) Arithmetic/bitwise ----------------------------------------------
 //
 
-def: Pat<(abs I32:$Rs), (A2_abs   I32:$Rs)>;
-def: Pat<(not I32:$Rs), (A2_subri -1, I32:$Rs)>;
-def: Pat<(not I64:$Rs), (A2_notp  I64:$Rs)>;
-
-let Predicates = [HasV5T] in {
-  def: Pat<(fabs F32:$Rs), (S2_clrbit_i    F32:$Rs, 31)>;
-  def: Pat<(fneg F32:$Rs), (S2_togglebit_i F32:$Rs, 31)>;
-
-  def: Pat<(fabs F64:$Rs),
-           (Combinew (S2_clrbit_i (HiReg $Rs), 31),
-                     (i32 (LoReg $Rs)))>;
-  def: Pat<(fneg F64:$Rs),
-           (Combinew (S2_togglebit_i (HiReg $Rs), 31),
-                     (i32 (LoReg $Rs)))>;
-}
-
-let AddedComplexity = 50 in
-def: Pat<(xor (add (sra I32:$Rs, (i32 31)),
-                   I32:$Rs),
-              (sra I32:$Rs, (i32 31))),
-         (A2_abs I32:$Rs)>;
-
+def: Pat<(abs  I32:$Rs), (A2_abs   I32:$Rs)>;
+def: Pat<(abs  I64:$Rs), (A2_absp  I64:$Rs)>;
+def: Pat<(not  I32:$Rs), (A2_subri -1, I32:$Rs)>;
+def: Pat<(not  I64:$Rs), (A2_notp  I64:$Rs)>;
+def: Pat<(ineg I64:$Rs), (A2_negp  I64:$Rs)>;
+
+def: Pat<(fabs F32:$Rs), (S2_clrbit_i    F32:$Rs, 31)>;
+def: Pat<(fneg F32:$Rs), (S2_togglebit_i F32:$Rs, 31)>;
+
+def: Pat<(fabs F64:$Rs),
+         (Combinew (S2_clrbit_i (HiReg $Rs), 31),
+                   (i32 (LoReg $Rs)))>;
+def: Pat<(fneg F64:$Rs),
+         (Combinew (S2_togglebit_i (HiReg $Rs), 31),
+                   (i32 (LoReg $Rs)))>;
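Note: on Hexagon, fabs/fneg are plain sign-bit operations, so no
floating-point unit is involved. In C++:

    #include <cstdint>
    #include <cstring>

    float fabsBits(float F) {
      uint32_t U;
      std::memcpy(&U, &F, sizeof(U));
      U &= 0x7FFFFFFFu;                  // S2_clrbit_i(..., 31)
      std::memcpy(&F, &U, sizeof(F));
      return F;                          // fneg would XOR bit 31 instead
    }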
 
 def: Pat<(add I32:$Rs, anyimm:$s16),   (A2_addi   I32:$Rs,  imm:$s16)>;
 def: Pat<(or  I32:$Rs, anyimm:$s10),   (A2_orir   I32:$Rs,  imm:$s10)>;
@@ -1200,18 +1315,20 @@
 def: OpR_RR_pat<A2_vsubh,     Sub,        v4i16, V4I16>;
 def: OpR_RR_pat<A2_vsubw,     Sub,        v2i32, V2I32>;
 
+def: OpR_RR_pat<A2_and,       And,        v4i8,  V4I8>;
+def: OpR_RR_pat<A2_xor,       Xor,        v4i8,  V4I8>;
+def: OpR_RR_pat<A2_or,        Or,         v4i8,  V4I8>;
 def: OpR_RR_pat<A2_and,       And,        v2i16, V2I16>;
 def: OpR_RR_pat<A2_xor,       Xor,        v2i16, V2I16>;
 def: OpR_RR_pat<A2_or,        Or,         v2i16, V2I16>;
-
 def: OpR_RR_pat<A2_andp,      And,        v8i8,  V8I8>;
-def: OpR_RR_pat<A2_andp,      And,        v4i16, V4I16>;
-def: OpR_RR_pat<A2_andp,      And,        v2i32, V2I32>;
 def: OpR_RR_pat<A2_orp,       Or,         v8i8,  V8I8>;
+def: OpR_RR_pat<A2_xorp,      Xor,        v8i8,  V8I8>;
+def: OpR_RR_pat<A2_andp,      And,        v4i16, V4I16>;
 def: OpR_RR_pat<A2_orp,       Or,         v4i16, V4I16>;
+def: OpR_RR_pat<A2_xorp,      Xor,        v4i16, V4I16>;
+def: OpR_RR_pat<A2_andp,      And,        v2i32, V2I32>;
 def: OpR_RR_pat<A2_orp,       Or,         v2i32, V2I32>;
-def: OpR_RR_pat<A2_xorp,      Xor,        v8i8,  V8I8>;
-def: OpR_RR_pat<A2_xorp,      Xor,        v4i16, V4I16>;
 def: OpR_RR_pat<A2_xorp,      Xor,        v2i32, V2I32>;
 
 def: OpR_RR_pat<M2_mpyi,      Mul,        i32,   I32>;
@@ -1234,12 +1351,15 @@
 def: OpR_RR_pat<C2_and,       Mul,        v4i1,  V4I1>;
 def: OpR_RR_pat<C2_and,       Mul,        v8i1,  V8I1>;
 
-let Predicates = [HasV5T] in {
-  def: OpR_RR_pat<F2_sfadd,     pf2<fadd>,    f32, F32>;
-  def: OpR_RR_pat<F2_sfsub,     pf2<fsub>,    f32, F32>;
-  def: OpR_RR_pat<F2_sfmpy,     pf2<fmul>,    f32, F32>;
-  def: OpR_RR_pat<F2_sfmin,     pf2<fminnum>, f32, F32>;
-  def: OpR_RR_pat<F2_sfmax,     pf2<fmaxnum>, f32, F32>;
+def: OpR_RR_pat<F2_sfadd,     pf2<fadd>,    f32, F32>;
+def: OpR_RR_pat<F2_sfsub,     pf2<fsub>,    f32, F32>;
+def: OpR_RR_pat<F2_sfmpy,     pf2<fmul>,    f32, F32>;
+def: OpR_RR_pat<F2_sfmin,     pf2<fminnum>, f32, F32>;
+def: OpR_RR_pat<F2_sfmax,     pf2<fmaxnum>, f32, F32>;
+
+let Predicates = [HasV66] in {
+  def: OpR_RR_pat<F2_dfadd,     pf2<fadd>,    f64, F64>;
+  def: OpR_RR_pat<F2_dfsub,     pf2<fsub>,    f64, F64>;
 }
 
 // In expressions like a0*b0 + a1*b1 + ..., prefer to generate multiply-add,
@@ -1247,12 +1367,14 @@
 let AddedComplexity = 10 in {
   def: AccRRI_pat<M2_macsip,    Add, Su<Mul>, I32, u32_0ImmPred>;
   def: AccRRI_pat<M2_macsin,    Sub, Su<Mul>, I32, u32_0ImmPred>;
-  def: AccRRR_pat<M2_maci,      Add, Su<Mul>, I32, I32>;
+  def: AccRRR_pat<M2_maci,      Add, Su<Mul>, I32, I32, I32>;
+  let Predicates = [HasV66] in
+  def: AccRRR_pat<M2_mnaci,     Sub, Su<Mul>, I32, I32, I32>;
 }
 
 def: AccRRI_pat<M2_naccii,    Sub, Su<Add>, I32, s32_0ImmPred>;
 def: AccRRI_pat<M2_accii,     Add, Su<Add>, I32, s32_0ImmPred>;
-def: AccRRR_pat<M2_acci,      Add, Su<Add>, I32, I32>;
+def: AccRRR_pat<M2_acci,      Add, Su<Add>, I32, I32, I32>;
 
 // Mulh for vectors
 //
@@ -1320,24 +1442,24 @@
 def: Pat<(add Sext64:$Rs, I64:$Rt),
          (A2_addsp (LoReg Sext64:$Rs), I64:$Rt)>;
 
-def: AccRRR_pat<M4_and_and,   And, Su<And>,       I32,  I32>;
-def: AccRRR_pat<M4_and_or,    And, Su<Or>,        I32,  I32>;
-def: AccRRR_pat<M4_and_xor,   And, Su<Xor>,       I32,  I32>;
-def: AccRRR_pat<M4_or_and,    Or,  Su<And>,       I32,  I32>;
-def: AccRRR_pat<M4_or_or,     Or,  Su<Or>,        I32,  I32>;
-def: AccRRR_pat<M4_or_xor,    Or,  Su<Xor>,       I32,  I32>;
-def: AccRRR_pat<M4_xor_and,   Xor, Su<And>,       I32,  I32>;
-def: AccRRR_pat<M4_xor_or,    Xor, Su<Or>,        I32,  I32>;
-def: AccRRR_pat<M2_xor_xacc,  Xor, Su<Xor>,       I32,  I32>;
-def: AccRRR_pat<M4_xor_xacc,  Xor, Su<Xor>,       I64,  I64>;
+def: AccRRR_pat<M4_and_and,   And, Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_and_or,    And, Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M4_and_xor,   And, Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_and,    Or,  Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_or,     Or,  Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M4_or_xor,    Or,  Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_and,   Xor, Su_ni1<And>,  I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_or,    Xor, Su_ni1<Or>,   I32,  I32,  I32>;
+def: AccRRR_pat<M2_xor_xacc,  Xor, Su<Xor>,      I32,  I32,  I32>;
+def: AccRRR_pat<M4_xor_xacc,  Xor, Su<Xor>,      I64,  I64,  I64>;
 
 // For dags like (or (and (not _), _), (shl _, _)) where the "or" with
 // one argument matches the patterns below, and with the other argument
 // matches S2_asl_r_r_or, etc, prefer the patterns below.
 let AddedComplexity = 110 in {  // greater than S2_asl_r_r_and/or/xor.
-  def: AccRRR_pat<M4_and_andn,  And, Su<Not2<And>>, I32,  I32>;
-  def: AccRRR_pat<M4_or_andn,   Or,  Su<Not2<And>>, I32,  I32>;
-  def: AccRRR_pat<M4_xor_andn,  Xor, Su<Not2<And>>, I32,  I32>;
+  def: AccRRR_pat<M4_and_andn,  And, Su<Not2<And>>, I32,  I32,  I32>;
+  def: AccRRR_pat<M4_or_andn,   Or,  Su<Not2<And>>, I32,  I32,  I32>;
+  def: AccRRR_pat<M4_xor_andn,  Xor, Su<Not2<And>>, I32,  I32,  I32>;
 }
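Note: the Not2-based compounds fold a bitwise NOT into the accumulating
operation. What M4_and_andn matches, in C++:

    #include <cstdint>

    // Rx &= and(Rs, ~Rt) -- one instruction instead of not+and+and.
    uint32_t m4AndAndn(uint32_t Rx, uint32_t Rs, uint32_t Rt) {
      return Rx & (Rs & ~Rt);
    }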
 
 // S4_addaddi and S4_subaddi don't have tied operands, so give them
@@ -1473,14 +1595,12 @@
          (M4_mpyrr_addr IntRegs:$Ru, IntRegs:$Ry, IntRegs:$Rs)>;
 
 
-let Predicates = [HasV5T] in {
-  def: Pat<(fma F32:$Rs, F32:$Rt, F32:$Rx),
-           (F2_sffma F32:$Rx, F32:$Rs, F32:$Rt)>;
-  def: Pat<(fma (fneg F32:$Rs), F32:$Rt, F32:$Rx),
-           (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
-  def: Pat<(fma F32:$Rs, (fneg F32:$Rt), F32:$Rx),
-           (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
-}
+def: Pat<(fma F32:$Rs, F32:$Rt, F32:$Rx),
+         (F2_sffma F32:$Rx, F32:$Rs, F32:$Rt)>;
+def: Pat<(fma (fneg F32:$Rs), F32:$Rt, F32:$Rx),
+         (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
+def: Pat<(fma F32:$Rs, (fneg F32:$Rt), F32:$Rx),
+         (F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
 
 
 def: Pat<(mul V2I32:$Rs, V2I32:$Rt),
@@ -1491,9 +1611,9 @@
 // Add/subtract two v4i8: Hexagon does not have an insn for this one, so
 // we use the double add v8i8, and use only the low part of the result.
 def: Pat<(add V4I8:$Rs, V4I8:$Rt),
-         (LoReg (A2_vaddub (ToZext64 $Rs), (ToZext64 $Rt)))>;
+         (LoReg (A2_vaddub (ToAext64 $Rs), (ToAext64 $Rt)))>;
 def: Pat<(sub V4I8:$Rs, V4I8:$Rt),
-         (LoReg (A2_vsubub (ToZext64 $Rs), (ToZext64 $Rt)))>;
+         (LoReg (A2_vsubub (ToAext64 $Rs), (ToAext64 $Rt)))>;
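Note: the comment above describes the widening trick; per-lane wraparound
survives because the unused high lanes never feed the result. A C++ model:

    #include <cstdint>

    uint32_t addV4I8(uint32_t A, uint32_t B) {
      uint32_t R = 0;
      for (int I = 0; I < 4; ++I) {
        uint8_t S = (uint8_t)(A >> 8 * I) + (uint8_t)(B >> 8 * I);
        R |= (uint32_t)S << (8 * I);     // A2_vaddub lane, LoReg half
      }
      return R;
    }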
 
 // Use M2_vmpy2s_s0 for half-word vector multiply. It multiplies two
 // half-words, and saturates the result to a 32-bit value, except the
@@ -1507,14 +1627,12 @@
 
 // Multiplies two v4i8 vectors.
 def: Pat<(v4i8 (mul V4I8:$Rs, V4I8:$Rt)),
-         (S2_vtrunehb (M5_vmpybuu V4I8:$Rs, V4I8:$Rt))>,
-     Requires<[HasV5T]>;
+         (S2_vtrunehb (M5_vmpybuu V4I8:$Rs, V4I8:$Rt))>;
 
 // Multiplies two v8i8 vectors.
 def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)),
          (Combinew (S2_vtrunehb (M5_vmpybuu (HiReg $Rs), (HiReg $Rt))),
-                   (S2_vtrunehb (M5_vmpybuu (LoReg $Rs), (LoReg $Rt))))>,
-     Requires<[HasV5T]>;
+                   (S2_vtrunehb (M5_vmpybuu (LoReg $Rs), (LoReg $Rt))))>;
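Note: M5_vmpybuu forms a 16-bit product per byte lane and S2_vtrunehb keeps
the low byte of each halfword, which is exactly the i8 multiply modulo 256.
A C++ model of the v4i8 case:

    #include <cstdint>

    uint32_t mulV4I8(uint32_t A, uint32_t B) {
      uint32_t R = 0;
      for (int I = 0; I < 4; ++I) {
        uint16_t P = (uint16_t)(uint8_t)(A >> 8 * I) *
                     (uint8_t)(B >> 8 * I);           // M5_vmpybuu lane
        R |= (uint32_t)(uint8_t)P << (8 * I);         // S2_vtrunehb
      }
      return R;
    }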
 
 
 // --(10) Bit ------------------------------------------------------------
@@ -1827,7 +1945,12 @@
   defm: Loadxi_pat<zextloadv2i8,    v2i16, anyimm1, L2_loadbzw2_io>;
   defm: Loadxi_pat<zextloadv4i8,    v4i16, anyimm2, L2_loadbzw4_io>;
   defm: Loadxi_pat<load,            i32,   anyimm2, L2_loadri_io>;
+  defm: Loadxi_pat<load,            v2i16, anyimm2, L2_loadri_io>;
+  defm: Loadxi_pat<load,            v4i8,  anyimm2, L2_loadri_io>;
   defm: Loadxi_pat<load,            i64,   anyimm3, L2_loadrd_io>;
+  defm: Loadxi_pat<load,            v2i32, anyimm3, L2_loadrd_io>;
+  defm: Loadxi_pat<load,            v4i16, anyimm3, L2_loadrd_io>;
+  defm: Loadxi_pat<load,            v8i8,  anyimm3, L2_loadrd_io>;
   defm: Loadxi_pat<load,            f32,   anyimm2, L2_loadri_io>;
   defm: Loadxi_pat<load,            f64,   anyimm3, L2_loadrd_io>;
   // No sextloadi1.
@@ -1839,10 +1962,10 @@
 }
 
 let AddedComplexity = 30 in {
-  defm: Loadxim_pat<extloadi1,    i64, ToZext64, anyimm0, L2_loadrub_io>;
-  defm: Loadxim_pat<extloadi8,    i64, ToZext64, anyimm0, L2_loadrub_io>;
-  defm: Loadxim_pat<extloadi16,   i64, ToZext64, anyimm1, L2_loadruh_io>;
-  defm: Loadxim_pat<extloadi32,   i64, ToZext64, anyimm2, L2_loadri_io>;
+  defm: Loadxim_pat<extloadi1,    i64, ToAext64, anyimm0, L2_loadrub_io>;
+  defm: Loadxim_pat<extloadi8,    i64, ToAext64, anyimm0, L2_loadrub_io>;
+  defm: Loadxim_pat<extloadi16,   i64, ToAext64, anyimm1, L2_loadruh_io>;
+  defm: Loadxim_pat<extloadi32,   i64, ToAext64, anyimm2, L2_loadri_io>;
   defm: Loadxim_pat<zextloadi1,   i64, ToZext64, anyimm0, L2_loadrub_io>;
   defm: Loadxim_pat<zextloadi8,   i64, ToZext64, anyimm0, L2_loadrub_io>;
   defm: Loadxim_pat<zextloadi16,  i64, ToZext64, anyimm1, L2_loadruh_io>;
@@ -1865,68 +1988,83 @@
   def: Loadxu_pat<zextloadi16,  i32,   anyimm1, L4_loadruh_ur>;
   def: Loadxu_pat<zextloadv2i8, v2i16, anyimm1, L4_loadbzw2_ur>;
   def: Loadxu_pat<zextloadv4i8, v4i16, anyimm2, L4_loadbzw4_ur>;
+  def: Loadxu_pat<load,         i32,   anyimm2, L4_loadri_ur>;
+  def: Loadxu_pat<load,         v2i16, anyimm2, L4_loadri_ur>;
+  def: Loadxu_pat<load,         v4i8,  anyimm2, L4_loadri_ur>;
+  def: Loadxu_pat<load,         i64,   anyimm3, L4_loadrd_ur>;
+  def: Loadxu_pat<load,         v2i32, anyimm3, L4_loadrd_ur>;
+  def: Loadxu_pat<load,         v4i16, anyimm3, L4_loadrd_ur>;
+  def: Loadxu_pat<load,         v8i8,  anyimm3, L4_loadrd_ur>;
   def: Loadxu_pat<load,         f32,   anyimm2, L4_loadri_ur>;
   def: Loadxu_pat<load,         f64,   anyimm3, L4_loadrd_ur>;
-  def: Loadxu_pat<load,         i32,   anyimm2, L4_loadri_ur>;
-  def: Loadxu_pat<load,         i64,   anyimm3, L4_loadrd_ur>;
 
   def: Loadxum_pat<sextloadi8,  i64, anyimm0, ToSext64, L4_loadrb_ur>;
   def: Loadxum_pat<zextloadi8,  i64, anyimm0, ToZext64, L4_loadrub_ur>;
-  def: Loadxum_pat<extloadi8,   i64, anyimm0, ToZext64, L4_loadrub_ur>;
+  def: Loadxum_pat<extloadi8,   i64, anyimm0, ToAext64, L4_loadrub_ur>;
   def: Loadxum_pat<sextloadi16, i64, anyimm1, ToSext64, L4_loadrh_ur>;
   def: Loadxum_pat<zextloadi16, i64, anyimm1, ToZext64, L4_loadruh_ur>;
-  def: Loadxum_pat<extloadi16,  i64, anyimm1, ToZext64, L4_loadruh_ur>;
+  def: Loadxum_pat<extloadi16,  i64, anyimm1, ToAext64, L4_loadruh_ur>;
   def: Loadxum_pat<sextloadi32, i64, anyimm2, ToSext64, L4_loadri_ur>;
   def: Loadxum_pat<zextloadi32, i64, anyimm2, ToZext64, L4_loadri_ur>;
-  def: Loadxum_pat<extloadi32,  i64, anyimm2, ToZext64, L4_loadri_ur>;
+  def: Loadxum_pat<extloadi32,  i64, anyimm2, ToAext64, L4_loadri_ur>;
 }
 
 let AddedComplexity = 40 in {
-  def: Loadxr_shl_pat<extloadi8,     i32, L4_loadrub_rr>;
-  def: Loadxr_shl_pat<zextloadi8,    i32, L4_loadrub_rr>;
-  def: Loadxr_shl_pat<sextloadi8,    i32, L4_loadrb_rr>;
-  def: Loadxr_shl_pat<extloadi16,    i32, L4_loadruh_rr>;
-  def: Loadxr_shl_pat<zextloadi16,   i32, L4_loadruh_rr>;
-  def: Loadxr_shl_pat<sextloadi16,   i32, L4_loadrh_rr>;
-  def: Loadxr_shl_pat<load,          i32, L4_loadri_rr>;
-  def: Loadxr_shl_pat<load,          i64, L4_loadrd_rr>;
-  def: Loadxr_shl_pat<load,          f32, L4_loadri_rr>;
-  def: Loadxr_shl_pat<load,          f64, L4_loadrd_rr>;
+  def: Loadxr_shl_pat<extloadi8,     i32,   L4_loadrub_rr>;
+  def: Loadxr_shl_pat<zextloadi8,    i32,   L4_loadrub_rr>;
+  def: Loadxr_shl_pat<sextloadi8,    i32,   L4_loadrb_rr>;
+  def: Loadxr_shl_pat<extloadi16,    i32,   L4_loadruh_rr>;
+  def: Loadxr_shl_pat<zextloadi16,   i32,   L4_loadruh_rr>;
+  def: Loadxr_shl_pat<sextloadi16,   i32,   L4_loadrh_rr>;
+  def: Loadxr_shl_pat<load,          i32,   L4_loadri_rr>;
+  def: Loadxr_shl_pat<load,          v2i16, L4_loadri_rr>;
+  def: Loadxr_shl_pat<load,          v4i8,  L4_loadri_rr>;
+  def: Loadxr_shl_pat<load,          i64,   L4_loadrd_rr>;
+  def: Loadxr_shl_pat<load,          v2i32, L4_loadrd_rr>;
+  def: Loadxr_shl_pat<load,          v4i16, L4_loadrd_rr>;
+  def: Loadxr_shl_pat<load,          v8i8,  L4_loadrd_rr>;
+  def: Loadxr_shl_pat<load,          f32,   L4_loadri_rr>;
+  def: Loadxr_shl_pat<load,          f64,   L4_loadrd_rr>;
 }
 
 let AddedComplexity = 20 in {
-  def: Loadxr_add_pat<extloadi8,     i32, L4_loadrub_rr>;
-  def: Loadxr_add_pat<zextloadi8,    i32, L4_loadrub_rr>;
-  def: Loadxr_add_pat<sextloadi8,    i32, L4_loadrb_rr>;
-  def: Loadxr_add_pat<extloadi16,    i32, L4_loadruh_rr>;
-  def: Loadxr_add_pat<zextloadi16,   i32, L4_loadruh_rr>;
-  def: Loadxr_add_pat<sextloadi16,   i32, L4_loadrh_rr>;
-  def: Loadxr_add_pat<load,          i32, L4_loadri_rr>;
-  def: Loadxr_add_pat<load,          i64, L4_loadrd_rr>;
-  def: Loadxr_add_pat<load,          f32, L4_loadri_rr>;
-  def: Loadxr_add_pat<load,          f64, L4_loadrd_rr>;
+  def: Loadxr_add_pat<extloadi8,     i32,   L4_loadrub_rr>;
+  def: Loadxr_add_pat<zextloadi8,    i32,   L4_loadrub_rr>;
+  def: Loadxr_add_pat<sextloadi8,    i32,   L4_loadrb_rr>;
+  def: Loadxr_add_pat<extloadi16,    i32,   L4_loadruh_rr>;
+  def: Loadxr_add_pat<zextloadi16,   i32,   L4_loadruh_rr>;
+  def: Loadxr_add_pat<sextloadi16,   i32,   L4_loadrh_rr>;
+  def: Loadxr_add_pat<load,          i32,   L4_loadri_rr>;
+  def: Loadxr_add_pat<load,          v2i16, L4_loadri_rr>;
+  def: Loadxr_add_pat<load,          v4i8,  L4_loadri_rr>;
+  def: Loadxr_add_pat<load,          i64,   L4_loadrd_rr>;
+  def: Loadxr_add_pat<load,          v2i32, L4_loadrd_rr>;
+  def: Loadxr_add_pat<load,          v4i16, L4_loadrd_rr>;
+  def: Loadxr_add_pat<load,          v8i8,  L4_loadrd_rr>;
+  def: Loadxr_add_pat<load,          f32,   L4_loadri_rr>;
+  def: Loadxr_add_pat<load,          f64,   L4_loadrd_rr>;
 }
 
 let AddedComplexity = 40 in {
-  def: Loadxrm_shl_pat<extloadi8,    i64, ToZext64, L4_loadrub_rr>;
+  def: Loadxrm_shl_pat<extloadi8,    i64, ToAext64, L4_loadrub_rr>;
   def: Loadxrm_shl_pat<zextloadi8,   i64, ToZext64, L4_loadrub_rr>;
   def: Loadxrm_shl_pat<sextloadi8,   i64, ToSext64, L4_loadrb_rr>;
-  def: Loadxrm_shl_pat<extloadi16,   i64, ToZext64, L4_loadruh_rr>;
+  def: Loadxrm_shl_pat<extloadi16,   i64, ToAext64, L4_loadruh_rr>;
   def: Loadxrm_shl_pat<zextloadi16,  i64, ToZext64, L4_loadruh_rr>;
   def: Loadxrm_shl_pat<sextloadi16,  i64, ToSext64, L4_loadrh_rr>;
-  def: Loadxrm_shl_pat<extloadi32,   i64, ToZext64, L4_loadri_rr>;
+  def: Loadxrm_shl_pat<extloadi32,   i64, ToAext64, L4_loadri_rr>;
   def: Loadxrm_shl_pat<zextloadi32,  i64, ToZext64, L4_loadri_rr>;
   def: Loadxrm_shl_pat<sextloadi32,  i64, ToSext64, L4_loadri_rr>;
 }
 
 let AddedComplexity = 20 in {
-  def: Loadxrm_add_pat<extloadi8,    i64, ToZext64, L4_loadrub_rr>;
+  def: Loadxrm_add_pat<extloadi8,    i64, ToAext64, L4_loadrub_rr>;
   def: Loadxrm_add_pat<zextloadi8,   i64, ToZext64, L4_loadrub_rr>;
   def: Loadxrm_add_pat<sextloadi8,   i64, ToSext64, L4_loadrb_rr>;
-  def: Loadxrm_add_pat<extloadi16,   i64, ToZext64, L4_loadruh_rr>;
+  def: Loadxrm_add_pat<extloadi16,   i64, ToAext64, L4_loadruh_rr>;
   def: Loadxrm_add_pat<zextloadi16,  i64, ToZext64, L4_loadruh_rr>;
   def: Loadxrm_add_pat<sextloadi16,  i64, ToSext64, L4_loadrh_rr>;
-  def: Loadxrm_add_pat<extloadi32,   i64, ToZext64, L4_loadri_rr>;
+  def: Loadxrm_add_pat<extloadi32,   i64, ToAext64, L4_loadri_rr>;
   def: Loadxrm_add_pat<zextloadi32,  i64, ToZext64, L4_loadri_rr>;
   def: Loadxrm_add_pat<sextloadi32,  i64, ToSext64, L4_loadri_rr>;
 }
@@ -1934,17 +2072,22 @@
 // Absolute address
 
 let AddedComplexity  = 60 in {
-  def: Loada_pat<zextloadi1,      i32, anyimm0, PS_loadrubabs>;
-  def: Loada_pat<sextloadi8,      i32, anyimm0, PS_loadrbabs>;
-  def: Loada_pat<extloadi8,       i32, anyimm0, PS_loadrubabs>;
-  def: Loada_pat<zextloadi8,      i32, anyimm0, PS_loadrubabs>;
-  def: Loada_pat<sextloadi16,     i32, anyimm1, PS_loadrhabs>;
-  def: Loada_pat<extloadi16,      i32, anyimm1, PS_loadruhabs>;
-  def: Loada_pat<zextloadi16,     i32, anyimm1, PS_loadruhabs>;
-  def: Loada_pat<load,            i32, anyimm2, PS_loadriabs>;
-  def: Loada_pat<load,            i64, anyimm3, PS_loadrdabs>;
-  def: Loada_pat<load,            f32, anyimm2, PS_loadriabs>;
-  def: Loada_pat<load,            f64, anyimm3, PS_loadrdabs>;
+  def: Loada_pat<zextloadi1,      i32,   anyimm0, PS_loadrubabs>;
+  def: Loada_pat<sextloadi8,      i32,   anyimm0, PS_loadrbabs>;
+  def: Loada_pat<extloadi8,       i32,   anyimm0, PS_loadrubabs>;
+  def: Loada_pat<zextloadi8,      i32,   anyimm0, PS_loadrubabs>;
+  def: Loada_pat<sextloadi16,     i32,   anyimm1, PS_loadrhabs>;
+  def: Loada_pat<extloadi16,      i32,   anyimm1, PS_loadruhabs>;
+  def: Loada_pat<zextloadi16,     i32,   anyimm1, PS_loadruhabs>;
+  def: Loada_pat<load,            i32,   anyimm2, PS_loadriabs>;
+  def: Loada_pat<load,            v2i16, anyimm2, PS_loadriabs>;
+  def: Loada_pat<load,            v4i8,  anyimm2, PS_loadriabs>;
+  def: Loada_pat<load,            i64,   anyimm3, PS_loadrdabs>;
+  def: Loada_pat<load,            v2i32, anyimm3, PS_loadrdabs>;
+  def: Loada_pat<load,            v4i16, anyimm3, PS_loadrdabs>;
+  def: Loada_pat<load,            v8i8,  anyimm3, PS_loadrdabs>;
+  def: Loada_pat<load,            f32,   anyimm2, PS_loadriabs>;
+  def: Loada_pat<load,            f64,   anyimm3, PS_loadrdabs>;
 
   def: Loada_pat<atomic_load_8,   i32, anyimm0, PS_loadrubabs>;
   def: Loada_pat<atomic_load_16,  i32, anyimm1, PS_loadruhabs>;
@@ -1953,13 +2096,13 @@
 }
 
 let AddedComplexity  = 30 in {
-  def: Loadam_pat<extloadi8,      i64, anyimm0, ToZext64, PS_loadrubabs>;
+  def: Loadam_pat<extloadi8,      i64, anyimm0, ToAext64, PS_loadrubabs>;
   def: Loadam_pat<sextloadi8,     i64, anyimm0, ToSext64, PS_loadrbabs>;
   def: Loadam_pat<zextloadi8,     i64, anyimm0, ToZext64, PS_loadrubabs>;
-  def: Loadam_pat<extloadi16,     i64, anyimm1, ToZext64, PS_loadruhabs>;
+  def: Loadam_pat<extloadi16,     i64, anyimm1, ToAext64, PS_loadruhabs>;
   def: Loadam_pat<sextloadi16,    i64, anyimm1, ToSext64, PS_loadrhabs>;
   def: Loadam_pat<zextloadi16,    i64, anyimm1, ToZext64, PS_loadruhabs>;
-  def: Loadam_pat<extloadi32,     i64, anyimm2, ToZext64, PS_loadriabs>;
+  def: Loadam_pat<extloadi32,     i64, anyimm2, ToAext64, PS_loadriabs>;
   def: Loadam_pat<sextloadi32,    i64, anyimm2, ToSext64, PS_loadriabs>;
   def: Loadam_pat<zextloadi32,    i64, anyimm2, ToZext64, PS_loadriabs>;
 
@@ -1970,18 +2113,23 @@
 // GP-relative address
 
 let AddedComplexity  = 100 in {
-  def: Loada_pat<extloadi1,       i32, addrgp,  L2_loadrubgp>;
-  def: Loada_pat<zextloadi1,      i32, addrgp,  L2_loadrubgp>;
-  def: Loada_pat<extloadi8,       i32, addrgp,  L2_loadrubgp>;
-  def: Loada_pat<sextloadi8,      i32, addrgp,  L2_loadrbgp>;
-  def: Loada_pat<zextloadi8,      i32, addrgp,  L2_loadrubgp>;
-  def: Loada_pat<extloadi16,      i32, addrgp,  L2_loadruhgp>;
-  def: Loada_pat<sextloadi16,     i32, addrgp,  L2_loadrhgp>;
-  def: Loada_pat<zextloadi16,     i32, addrgp,  L2_loadruhgp>;
-  def: Loada_pat<load,            i32, addrgp,  L2_loadrigp>;
-  def: Loada_pat<load,            i64, addrgp,  L2_loadrdgp>;
-  def: Loada_pat<load,            f32, addrgp,  L2_loadrigp>;
-  def: Loada_pat<load,            f64, addrgp,  L2_loadrdgp>;
+  def: Loada_pat<extloadi1,       i32,   addrgp,  L2_loadrubgp>;
+  def: Loada_pat<zextloadi1,      i32,   addrgp,  L2_loadrubgp>;
+  def: Loada_pat<extloadi8,       i32,   addrgp,  L2_loadrubgp>;
+  def: Loada_pat<sextloadi8,      i32,   addrgp,  L2_loadrbgp>;
+  def: Loada_pat<zextloadi8,      i32,   addrgp,  L2_loadrubgp>;
+  def: Loada_pat<extloadi16,      i32,   addrgp,  L2_loadruhgp>;
+  def: Loada_pat<sextloadi16,     i32,   addrgp,  L2_loadrhgp>;
+  def: Loada_pat<zextloadi16,     i32,   addrgp,  L2_loadruhgp>;
+  def: Loada_pat<load,            i32,   addrgp,  L2_loadrigp>;
+  def: Loada_pat<load,            v2i16, addrgp,  L2_loadrigp>;
+  def: Loada_pat<load,            v4i8,  addrgp,  L2_loadrigp>;
+  def: Loada_pat<load,            i64,   addrgp,  L2_loadrdgp>;
+  def: Loada_pat<load,            v2i32, addrgp,  L2_loadrdgp>;
+  def: Loada_pat<load,            v4i16, addrgp,  L2_loadrdgp>;
+  def: Loada_pat<load,            v8i8,  addrgp,  L2_loadrdgp>;
+  def: Loada_pat<load,            f32,   addrgp,  L2_loadrigp>;
+  def: Loada_pat<load,            f64,   addrgp,  L2_loadrdgp>;
 
   def: Loada_pat<atomic_load_8,   i32, addrgp,  L2_loadrubgp>;
   def: Loada_pat<atomic_load_16,  i32, addrgp,  L2_loadruhgp>;
@@ -1990,13 +2138,13 @@
 }
 
 let AddedComplexity  = 70 in {
-  def: Loadam_pat<extloadi8,      i64, addrgp,  ToZext64, L2_loadrubgp>;
+  def: Loadam_pat<extloadi8,      i64, addrgp,  ToAext64, L2_loadrubgp>;
   def: Loadam_pat<sextloadi8,     i64, addrgp,  ToSext64, L2_loadrbgp>;
   def: Loadam_pat<zextloadi8,     i64, addrgp,  ToZext64, L2_loadrubgp>;
-  def: Loadam_pat<extloadi16,     i64, addrgp,  ToZext64, L2_loadruhgp>;
+  def: Loadam_pat<extloadi16,     i64, addrgp,  ToAext64, L2_loadruhgp>;
   def: Loadam_pat<sextloadi16,    i64, addrgp,  ToSext64, L2_loadrhgp>;
   def: Loadam_pat<zextloadi16,    i64, addrgp,  ToZext64, L2_loadruhgp>;
-  def: Loadam_pat<extloadi32,     i64, addrgp,  ToZext64, L2_loadrigp>;
+  def: Loadam_pat<extloadi32,     i64, addrgp,  ToAext64, L2_loadrigp>;
   def: Loadam_pat<sextloadi32,    i64, addrgp,  ToSext64, L2_loadrigp>;
   def: Loadam_pat<zextloadi32,    i64, addrgp,  ToZext64, L2_loadrigp>;
 
@@ -2136,7 +2284,7 @@
 // swapped. This relies on the knowledge that the F.Fragment uses names
 // "ptr" and "val".
 class AtomSt<PatFrag F>
-  : PatFrag<(ops node:$val, node:$ptr), F.Fragment, F.PredicateCode,
+  : PatFrag<(ops node:$val, node:$ptr), !head(F.Fragments), F.PredicateCode,
             F.OperandTransform> {
   let IsAtomic = F.IsAtomic;
   let MemoryVT = F.MemoryVT;
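The F.Fragment -> !head(F.Fragments) change tracks TableGen's PatFrags generalization: a PatFrag now carries a list of fragments, and these single-fragment atomic stores simply take its head. Call sites are unchanged, as in the GP-relative block below:

    // As used in this patch: AtomSt only swaps val/ptr so the atomic store
    // fits the (value, address) operand order expected by Storea_pat.
    def: Storea_pat<AtomSt<atomic_store_32>, I32, addrgp, S2_storerigp>;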
@@ -2252,16 +2400,26 @@
 
 // GP-relative address
 let AddedComplexity = 120 in {
-  def: Storea_pat<truncstorei8,             I32, addrgp, S2_storerbgp>;
-  def: Storea_pat<truncstorei16,            I32, addrgp, S2_storerhgp>;
-  def: Storea_pat<store,                    I32, addrgp, S2_storerigp>;
-  def: Storea_pat<store,                    I64, addrgp, S2_storerdgp>;
-  def: Storea_pat<store,                    F32, addrgp, S2_storerigp>;
-  def: Storea_pat<store,                    F64, addrgp, S2_storerdgp>;
-  def: Storea_pat<AtomSt<atomic_store_8>,   I32, addrgp, S2_storerbgp>;
-  def: Storea_pat<AtomSt<atomic_store_16>,  I32, addrgp, S2_storerhgp>;
-  def: Storea_pat<AtomSt<atomic_store_32>,  I32, addrgp, S2_storerigp>;
-  def: Storea_pat<AtomSt<atomic_store_64>,  I64, addrgp, S2_storerdgp>;
+  def: Storea_pat<truncstorei8,               I32, addrgp, S2_storerbgp>;
+  def: Storea_pat<truncstorei16,              I32, addrgp, S2_storerhgp>;
+  def: Storea_pat<store,                      I32, addrgp, S2_storerigp>;
+  def: Storea_pat<store,                     V4I8, addrgp, S2_storerigp>;
+  def: Storea_pat<store,                    V2I16, addrgp, S2_storerigp>;
+  def: Storea_pat<store,                      I64, addrgp, S2_storerdgp>;
+  def: Storea_pat<store,                     V8I8, addrgp, S2_storerdgp>;
+  def: Storea_pat<store,                    V4I16, addrgp, S2_storerdgp>;
+  def: Storea_pat<store,                    V2I32, addrgp, S2_storerdgp>;
+  def: Storea_pat<store,                      F32, addrgp, S2_storerigp>;
+  def: Storea_pat<store,                      F64, addrgp, S2_storerdgp>;
+  def: Storea_pat<AtomSt<atomic_store_8>,     I32, addrgp, S2_storerbgp>;
+  def: Storea_pat<AtomSt<atomic_store_16>,    I32, addrgp, S2_storerhgp>;
+  def: Storea_pat<AtomSt<atomic_store_32>,    I32, addrgp, S2_storerigp>;
+  def: Storea_pat<AtomSt<atomic_store_32>,   V4I8, addrgp, S2_storerigp>;
+  def: Storea_pat<AtomSt<atomic_store_32>,  V2I16, addrgp, S2_storerigp>;
+  def: Storea_pat<AtomSt<atomic_store_64>,    I64, addrgp, S2_storerdgp>;
+  def: Storea_pat<AtomSt<atomic_store_64>,   V8I8, addrgp, S2_storerdgp>;
+  def: Storea_pat<AtomSt<atomic_store_64>,  V4I16, addrgp, S2_storerdgp>;
+  def: Storea_pat<AtomSt<atomic_store_64>,  V2I32, addrgp, S2_storerdgp>;
 
   def: Stoream_pat<truncstorei8,  I64, addrgp, LoReg,    S2_storerbgp>;
   def: Stoream_pat<truncstorei16, I64, addrgp, LoReg,    S2_storerhgp>;
@@ -2271,16 +2429,26 @@
 
 // Absolute address
 let AddedComplexity = 110 in {
-  def: Storea_pat<truncstorei8,             I32, anyimm0, PS_storerbabs>;
-  def: Storea_pat<truncstorei16,            I32, anyimm1, PS_storerhabs>;
-  def: Storea_pat<store,                    I32, anyimm2, PS_storeriabs>;
-  def: Storea_pat<store,                    I64, anyimm3, PS_storerdabs>;
-  def: Storea_pat<store,                    F32, anyimm2, PS_storeriabs>;
-  def: Storea_pat<store,                    F64, anyimm3, PS_storerdabs>;
-  def: Storea_pat<AtomSt<atomic_store_8>,   I32, anyimm0, PS_storerbabs>;
-  def: Storea_pat<AtomSt<atomic_store_16>,  I32, anyimm1, PS_storerhabs>;
-  def: Storea_pat<AtomSt<atomic_store_32>,  I32, anyimm2, PS_storeriabs>;
-  def: Storea_pat<AtomSt<atomic_store_64>,  I64, anyimm3, PS_storerdabs>;
+  def: Storea_pat<truncstorei8,               I32, anyimm0, PS_storerbabs>;
+  def: Storea_pat<truncstorei16,              I32, anyimm1, PS_storerhabs>;
+  def: Storea_pat<store,                      I32, anyimm2, PS_storeriabs>;
+  def: Storea_pat<store,                     V4I8, anyimm2, PS_storeriabs>;
+  def: Storea_pat<store,                    V2I16, anyimm2, PS_storeriabs>;
+  def: Storea_pat<store,                      I64, anyimm3, PS_storerdabs>;
+  def: Storea_pat<store,                     V8I8, anyimm3, PS_storerdabs>;
+  def: Storea_pat<store,                    V4I16, anyimm3, PS_storerdabs>;
+  def: Storea_pat<store,                    V2I32, anyimm3, PS_storerdabs>;
+  def: Storea_pat<store,                      F32, anyimm2, PS_storeriabs>;
+  def: Storea_pat<store,                      F64, anyimm3, PS_storerdabs>;
+  def: Storea_pat<AtomSt<atomic_store_8>,     I32, anyimm0, PS_storerbabs>;
+  def: Storea_pat<AtomSt<atomic_store_16>,    I32, anyimm1, PS_storerhabs>;
+  def: Storea_pat<AtomSt<atomic_store_32>,    I32, anyimm2, PS_storeriabs>;
+  def: Storea_pat<AtomSt<atomic_store_32>,   V4I8, anyimm2, PS_storeriabs>;
+  def: Storea_pat<AtomSt<atomic_store_32>,  V2I16, anyimm2, PS_storeriabs>;
+  def: Storea_pat<AtomSt<atomic_store_64>,    I64, anyimm3, PS_storerdabs>;
+  def: Storea_pat<AtomSt<atomic_store_64>,   V8I8, anyimm3, PS_storerdabs>;
+  def: Storea_pat<AtomSt<atomic_store_64>,  V4I16, anyimm3, PS_storerdabs>;
+  def: Storea_pat<AtomSt<atomic_store_64>,  V2I32, anyimm3, PS_storerdabs>;
 
   def: Stoream_pat<truncstorei8,  I64, anyimm0, LoReg,    PS_storerbabs>;
   def: Stoream_pat<truncstorei16, I64, anyimm1, LoReg,    PS_storerhabs>;
@@ -2290,12 +2458,17 @@
 
 // Reg<<S + Imm
 let AddedComplexity = 100 in {
-  def: Storexu_shl_pat<truncstorei8,  I32, anyimm0, S4_storerb_ur>;
-  def: Storexu_shl_pat<truncstorei16, I32, anyimm1, S4_storerh_ur>;
-  def: Storexu_shl_pat<store,         I32, anyimm2, S4_storeri_ur>;
-  def: Storexu_shl_pat<store,         I64, anyimm3, S4_storerd_ur>;
-  def: Storexu_shl_pat<store,         F32, anyimm2, S4_storeri_ur>;
-  def: Storexu_shl_pat<store,         F64, anyimm3, S4_storerd_ur>;
+  def: Storexu_shl_pat<truncstorei8,    I32, anyimm0, S4_storerb_ur>;
+  def: Storexu_shl_pat<truncstorei16,   I32, anyimm1, S4_storerh_ur>;
+  def: Storexu_shl_pat<store,           I32, anyimm2, S4_storeri_ur>;
+  def: Storexu_shl_pat<store,          V4I8, anyimm2, S4_storeri_ur>;
+  def: Storexu_shl_pat<store,         V2I16, anyimm2, S4_storeri_ur>;
+  def: Storexu_shl_pat<store,           I64, anyimm3, S4_storerd_ur>;
+  def: Storexu_shl_pat<store,          V8I8, anyimm3, S4_storerd_ur>;
+  def: Storexu_shl_pat<store,         V4I16, anyimm3, S4_storerd_ur>;
+  def: Storexu_shl_pat<store,         V2I32, anyimm3, S4_storerd_ur>;
+  def: Storexu_shl_pat<store,           F32, anyimm2, S4_storeri_ur>;
+  def: Storexu_shl_pat<store,           F64, anyimm3, S4_storerd_ur>;
 
   def: Pat<(store I1:$Pu, (add (shl I32:$Rs, u2_0ImmPred:$u2), anyimm:$A)),
            (S4_storerb_ur IntRegs:$Rs, imm:$u2, imm:$A, (I1toI32 I1:$Pu))>;
@@ -2303,12 +2476,17 @@
 
 // Reg<<S + Reg
 let AddedComplexity = 90 in {
-  def: Storexr_shl_pat<truncstorei8,  I32, S4_storerb_rr>;
-  def: Storexr_shl_pat<truncstorei16, I32, S4_storerh_rr>;
-  def: Storexr_shl_pat<store,         I32, S4_storeri_rr>;
-  def: Storexr_shl_pat<store,         I64, S4_storerd_rr>;
-  def: Storexr_shl_pat<store,         F32, S4_storeri_rr>;
-  def: Storexr_shl_pat<store,         F64, S4_storerd_rr>;
+  def: Storexr_shl_pat<truncstorei8,    I32, S4_storerb_rr>;
+  def: Storexr_shl_pat<truncstorei16,   I32, S4_storerh_rr>;
+  def: Storexr_shl_pat<store,           I32, S4_storeri_rr>;
+  def: Storexr_shl_pat<store,          V4I8, S4_storeri_rr>;
+  def: Storexr_shl_pat<store,         V2I16, S4_storeri_rr>;
+  def: Storexr_shl_pat<store,           I64, S4_storerd_rr>;
+  def: Storexr_shl_pat<store,          V8I8, S4_storerd_rr>;
+  def: Storexr_shl_pat<store,         V4I16, S4_storerd_rr>;
+  def: Storexr_shl_pat<store,         V2I32, S4_storerd_rr>;
+  def: Storexr_shl_pat<store,           F32, S4_storeri_rr>;
+  def: Storexr_shl_pat<store,           F64, S4_storerd_rr>;
 
   def: Pat<(store I1:$Pu, (add (shl I32:$Rs, u2_0ImmPred:$u2), I32:$Rt)),
            (S4_storerb_ur IntRegs:$Rt, IntRegs:$Rs, imm:$u2, (I1toI32 I1:$Pu))>;
@@ -2360,20 +2538,30 @@
 
 // Fi+Imm, Fi, store-register
 let AddedComplexity = 60 in {
-  defm: Storexi_fi_add_pat<truncstorei8,  I32, anyimm, S2_storerb_io>;
-  defm: Storexi_fi_add_pat<truncstorei16, I32, anyimm, S2_storerh_io>;
-  defm: Storexi_fi_add_pat<store,         I32, anyimm, S2_storeri_io>;
-  defm: Storexi_fi_add_pat<store,         I64, anyimm, S2_storerd_io>;
-  defm: Storexi_fi_add_pat<store,         F32, anyimm, S2_storeri_io>;
-  defm: Storexi_fi_add_pat<store,         F64, anyimm, S2_storerd_io>;
+  defm: Storexi_fi_add_pat<truncstorei8,    I32, anyimm, S2_storerb_io>;
+  defm: Storexi_fi_add_pat<truncstorei16,   I32, anyimm, S2_storerh_io>;
+  defm: Storexi_fi_add_pat<store,           I32, anyimm, S2_storeri_io>;
+  defm: Storexi_fi_add_pat<store,          V4I8, anyimm, S2_storeri_io>;
+  defm: Storexi_fi_add_pat<store,         V2I16, anyimm, S2_storeri_io>;
+  defm: Storexi_fi_add_pat<store,           I64, anyimm, S2_storerd_io>;
+  defm: Storexi_fi_add_pat<store,          V8I8, anyimm, S2_storerd_io>;
+  defm: Storexi_fi_add_pat<store,         V4I16, anyimm, S2_storerd_io>;
+  defm: Storexi_fi_add_pat<store,         V2I32, anyimm, S2_storerd_io>;
+  defm: Storexi_fi_add_pat<store,           F32, anyimm, S2_storeri_io>;
+  defm: Storexi_fi_add_pat<store,           F64, anyimm, S2_storerd_io>;
   defm: Storexim_fi_add_pat<store, I1, anyimm, I1toI32, S2_storerb_io>;
 
-  def: Storexi_fi_pat<truncstorei8,   I32, S2_storerb_io>;
-  def: Storexi_fi_pat<truncstorei16,  I32, S2_storerh_io>;
-  def: Storexi_fi_pat<store,          I32, S2_storeri_io>;
-  def: Storexi_fi_pat<store,          I64, S2_storerd_io>;
-  def: Storexi_fi_pat<store,          F32, S2_storeri_io>;
-  def: Storexi_fi_pat<store,          F64, S2_storerd_io>;
+  def: Storexi_fi_pat<truncstorei8,     I32, S2_storerb_io>;
+  def: Storexi_fi_pat<truncstorei16,    I32, S2_storerh_io>;
+  def: Storexi_fi_pat<store,            I32, S2_storeri_io>;
+  def: Storexi_fi_pat<store,           V4I8, S2_storeri_io>;
+  def: Storexi_fi_pat<store,          V2I16, S2_storeri_io>;
+  def: Storexi_fi_pat<store,            I64, S2_storerd_io>;
+  def: Storexi_fi_pat<store,           V8I8, S2_storerd_io>;
+  def: Storexi_fi_pat<store,          V4I16, S2_storerd_io>;
+  def: Storexi_fi_pat<store,          V2I32, S2_storerd_io>;
+  def: Storexi_fi_pat<store,            F32, S2_storeri_io>;
+  def: Storexi_fi_pat<store,            F64, S2_storerd_io>;
   def: Storexim_fi_pat<store, I1, I1toI32, S2_storerb_io>;
 }
 
@@ -2398,32 +2586,47 @@
 
 // Reg+Imm, store-register
 let AddedComplexity = 40 in {
-  defm: Storexi_pat<truncstorei8,   I32, anyimm0, S2_storerb_io>;
-  defm: Storexi_pat<truncstorei16,  I32, anyimm1, S2_storerh_io>;
-  defm: Storexi_pat<store,          I32, anyimm2, S2_storeri_io>;
-  defm: Storexi_pat<store,          I64, anyimm3, S2_storerd_io>;
-  defm: Storexi_pat<store,          F32, anyimm2, S2_storeri_io>;
-  defm: Storexi_pat<store,          F64, anyimm3, S2_storerd_io>;
+  defm: Storexi_pat<truncstorei8,     I32, anyimm0, S2_storerb_io>;
+  defm: Storexi_pat<truncstorei16,    I32, anyimm1, S2_storerh_io>;
+  defm: Storexi_pat<store,            I32, anyimm2, S2_storeri_io>;
+  defm: Storexi_pat<store,           V4I8, anyimm2, S2_storeri_io>;
+  defm: Storexi_pat<store,          V2I16, anyimm2, S2_storeri_io>;
+  defm: Storexi_pat<store,            I64, anyimm3, S2_storerd_io>;
+  defm: Storexi_pat<store,           V8I8, anyimm3, S2_storerd_io>;
+  defm: Storexi_pat<store,          V4I16, anyimm3, S2_storerd_io>;
+  defm: Storexi_pat<store,          V2I32, anyimm3, S2_storerd_io>;
+  defm: Storexi_pat<store,            F32, anyimm2, S2_storeri_io>;
+  defm: Storexi_pat<store,            F64, anyimm3, S2_storerd_io>;
 
   defm: Storexim_pat<truncstorei8,  I64, anyimm0, LoReg,   S2_storerb_io>;
   defm: Storexim_pat<truncstorei16, I64, anyimm1, LoReg,   S2_storerh_io>;
   defm: Storexim_pat<truncstorei32, I64, anyimm2, LoReg,   S2_storeri_io>;
   defm: Storexim_pat<store,         I1,  anyimm0, I1toI32, S2_storerb_io>;
 
-  defm: Storexi_pat<AtomSt<atomic_store_8>,  I32, anyimm0, S2_storerb_io>;
-  defm: Storexi_pat<AtomSt<atomic_store_16>, I32, anyimm1, S2_storerh_io>;
-  defm: Storexi_pat<AtomSt<atomic_store_32>, I32, anyimm2, S2_storeri_io>;
-  defm: Storexi_pat<AtomSt<atomic_store_64>, I64, anyimm3, S2_storerd_io>;
+  defm: Storexi_pat<AtomSt<atomic_store_8>,     I32, anyimm0, S2_storerb_io>;
+  defm: Storexi_pat<AtomSt<atomic_store_16>,    I32, anyimm1, S2_storerh_io>;
+  defm: Storexi_pat<AtomSt<atomic_store_32>,    I32, anyimm2, S2_storeri_io>;
+  defm: Storexi_pat<AtomSt<atomic_store_32>,   V4I8, anyimm2, S2_storeri_io>;
+  defm: Storexi_pat<AtomSt<atomic_store_32>,  V2I16, anyimm2, S2_storeri_io>;
+  defm: Storexi_pat<AtomSt<atomic_store_64>,    I64, anyimm3, S2_storerd_io>;
+  defm: Storexi_pat<AtomSt<atomic_store_64>,   V8I8, anyimm3, S2_storerd_io>;
+  defm: Storexi_pat<AtomSt<atomic_store_64>,  V4I16, anyimm3, S2_storerd_io>;
+  defm: Storexi_pat<AtomSt<atomic_store_64>,  V2I32, anyimm3, S2_storerd_io>;
 }
 
 // Reg+Reg
 let AddedComplexity = 30 in {
-  def: Storexr_add_pat<truncstorei8,  I32, S4_storerb_rr>;
-  def: Storexr_add_pat<truncstorei16, I32, S4_storerh_rr>;
-  def: Storexr_add_pat<store,         I32, S4_storeri_rr>;
-  def: Storexr_add_pat<store,         I64, S4_storerd_rr>;
-  def: Storexr_add_pat<store,         F32, S4_storeri_rr>;
-  def: Storexr_add_pat<store,         F64, S4_storerd_rr>;
+  def: Storexr_add_pat<truncstorei8,    I32, S4_storerb_rr>;
+  def: Storexr_add_pat<truncstorei16,   I32, S4_storerh_rr>;
+  def: Storexr_add_pat<store,           I32, S4_storeri_rr>;
+  def: Storexr_add_pat<store,          V4I8, S4_storeri_rr>;
+  def: Storexr_add_pat<store,         V2I16, S4_storeri_rr>;
+  def: Storexr_add_pat<store,           I64, S4_storerd_rr>;
+  def: Storexr_add_pat<store,          V8I8, S4_storerd_rr>;
+  def: Storexr_add_pat<store,         V4I16, S4_storerd_rr>;
+  def: Storexr_add_pat<store,         V2I32, S4_storerd_rr>;
+  def: Storexr_add_pat<store,           F32, S4_storeri_rr>;
+  def: Storexr_add_pat<store,           F64, S4_storerd_rr>;
 
   def: Pat<(store I1:$Pu, (add I32:$Rs, I32:$Rt)),
            (S4_storerb_rr IntRegs:$Rs, IntRegs:$Rt, 0, (I1toI32 I1:$Pu))>;
@@ -2442,22 +2645,32 @@
 
 // Reg, store-register
 let AddedComplexity = 10 in {
-  def: Storexi_base_pat<truncstorei8,   I32, S2_storerb_io>;
-  def: Storexi_base_pat<truncstorei16,  I32, S2_storerh_io>;
-  def: Storexi_base_pat<store,          I32, S2_storeri_io>;
-  def: Storexi_base_pat<store,          I64, S2_storerd_io>;
-  def: Storexi_base_pat<store,          F32, S2_storeri_io>;
-  def: Storexi_base_pat<store,          F64, S2_storerd_io>;
+  def: Storexi_base_pat<truncstorei8,     I32, S2_storerb_io>;
+  def: Storexi_base_pat<truncstorei16,    I32, S2_storerh_io>;
+  def: Storexi_base_pat<store,            I32, S2_storeri_io>;
+  def: Storexi_base_pat<store,           V4I8, S2_storeri_io>;
+  def: Storexi_base_pat<store,          V2I16, S2_storeri_io>;
+  def: Storexi_base_pat<store,            I64, S2_storerd_io>;
+  def: Storexi_base_pat<store,           V8I8, S2_storerd_io>;
+  def: Storexi_base_pat<store,          V4I16, S2_storerd_io>;
+  def: Storexi_base_pat<store,          V2I32, S2_storerd_io>;
+  def: Storexi_base_pat<store,            F32, S2_storeri_io>;
+  def: Storexi_base_pat<store,            F64, S2_storerd_io>;
 
   def: Storexim_base_pat<truncstorei8,  I64, LoReg,   S2_storerb_io>;
   def: Storexim_base_pat<truncstorei16, I64, LoReg,   S2_storerh_io>;
   def: Storexim_base_pat<truncstorei32, I64, LoReg,   S2_storeri_io>;
   def: Storexim_base_pat<store,         I1,  I1toI32, S2_storerb_io>;
 
-  def: Storexi_base_pat<AtomSt<atomic_store_8>,   I32, S2_storerb_io>;
-  def: Storexi_base_pat<AtomSt<atomic_store_16>,  I32, S2_storerh_io>;
-  def: Storexi_base_pat<AtomSt<atomic_store_32>,  I32, S2_storeri_io>;
-  def: Storexi_base_pat<AtomSt<atomic_store_64>,  I64, S2_storerd_io>;
+  def: Storexi_base_pat<AtomSt<atomic_store_8>,     I32, S2_storerb_io>;
+  def: Storexi_base_pat<AtomSt<atomic_store_16>,    I32, S2_storerh_io>;
+  def: Storexi_base_pat<AtomSt<atomic_store_32>,    I32, S2_storeri_io>;
+  def: Storexi_base_pat<AtomSt<atomic_store_32>,   V4I8, S2_storeri_io>;
+  def: Storexi_base_pat<AtomSt<atomic_store_32>,  V2I16, S2_storeri_io>;
+  def: Storexi_base_pat<AtomSt<atomic_store_64>,    I64, S2_storerd_io>;
+  def: Storexi_base_pat<AtomSt<atomic_store_64>,   V8I8, S2_storerd_io>;
+  def: Storexi_base_pat<AtomSt<atomic_store_64>,  V4I16, S2_storerd_io>;
+  def: Storexi_base_pat<AtomSt<atomic_store_64>,  V2I32, S2_storerd_io>;
 }
 
 
@@ -2541,8 +2754,10 @@
 
 multiclass Memopxr_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
                        SDNode Oper, InstHexagon MI> {
-  defm: Memopxr_base_pat <Load, Store,          Oper, MI>;
-  defm: Memopxr_add_pat  <Load, Store, ImmPred, Oper, MI>;
+  let Predicates = [UseMEMOPS] in {
+    defm: Memopxr_base_pat <Load, Store,          Oper, MI>;
+    defm: Memopxr_add_pat  <Load, Store, ImmPred, Oper, MI>;
+  }
 }
 
 let AddedComplexity = 200 in {
@@ -2640,8 +2855,10 @@
 multiclass Memopxi_pat<PatFrag Load, PatFrag Store, PatFrag ImmPred,
                        SDNode Oper, PatFrag Arg, SDNodeXForm ArgMod,
                        InstHexagon MI> {
-  defm: Memopxi_base_pat <Load, Store,          Oper, Arg, ArgMod, MI>;
-  defm: Memopxi_add_pat  <Load, Store, ImmPred, Oper, Arg, ArgMod, MI>;
+  let Predicates = [UseMEMOPS] in {
+    defm: Memopxi_base_pat <Load, Store,          Oper, Arg, ArgMod, MI>;
+    defm: Memopxi_add_pat  <Load, Store, ImmPred, Oper, Arg, ArgMod, MI>;
+  }
 }
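Wrapping the bodies of both Memopxr_pat and Memopxi_pat in let Predicates = [UseMEMOPS] gates every memop pattern on the target feature in one place instead of at each defm site. A representative instantiation, matching the parameter list declared above (the operand choice here is illustrative; the real instances follow in the AddedComplexity blocks):

    // Illustrative only: read-modify-write byte add, selected only when
    // the UseMEMOPS predicate holds.
    defm: Memopxr_pat<extloadi8, truncstorei8, u6_0ImmPred, add,
                      L4_add_memopb_io>;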
 
 let AddedComplexity = 220 in {
@@ -2800,6 +3017,8 @@
          (J2_jumpf I1:$Pu, bb:$dst)>;
 def: Pat<(brcond (i1 (setne I1:$Pu, -1)), bb:$dst),
          (J2_jumpf I1:$Pu, bb:$dst)>;
+def: Pat<(brcond (i1 (seteq I1:$Pu, 0)), bb:$dst),
+         (J2_jumpf I1:$Pu, bb:$dst)>;
 def: Pat<(brcond (i1 (setne I1:$Pu, 0)), bb:$dst),
          (J2_jumpt I1:$Pu, bb:$dst)>;
 
@@ -2862,9 +3081,26 @@
 def HexagonBARRIER: SDNode<"HexagonISD::BARRIER", SDTNone, [SDNPHasChain]>;
 def: Pat<(HexagonBARRIER), (Y2_barrier)>;
 
+def: Pat<(trap), (PS_crash)>;
+
 // Read cycle counter.
 def SDTInt64Leaf: SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
 def HexagonREADCYCLE: SDNode<"HexagonISD::READCYCLE", SDTInt64Leaf,
   [SDNPHasChain]>;
 
 def: Pat<(HexagonREADCYCLE), (A4_tfrcpp UPCYCLE)>;
+
+// The declared return value of the store-locked intrinsics is i32, but
+// the instructions actually define i1. To avoid register copies from
+// IntRegs to PredRegs and back, fold the comparison of the result
+// against true/false into the pattern itself.
+let AddedComplexity = 100 in {
+  def: Pat<(i1 (setne (int_hexagon_S2_storew_locked I32:$Rs, I32:$Rt), 0)),
+           (S2_storew_locked I32:$Rs, I32:$Rt)>;
+  def: Pat<(i1 (seteq (int_hexagon_S2_storew_locked I32:$Rs, I32:$Rt), 0)),
+           (C2_not (S2_storew_locked I32:$Rs, I32:$Rt))>;
+  def: Pat<(i1 (setne (int_hexagon_S4_stored_locked I32:$Rs, I64:$Rt), 0)),
+           (S4_stored_locked I32:$Rs, I64:$Rt)>;
+  def: Pat<(i1 (seteq (int_hexagon_S4_stored_locked I32:$Rs, I64:$Rt), 0)),
+           (C2_not (S4_stored_locked I32:$Rs, I64:$Rt))>;
+}
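Without these folds, selecting the intrinsic on its own would have to produce the declared i32 result from the i1-defining instruction, along the lines of the hypothetical fallback below, and a later comparison of that value would then copy it back into a predicate register:

    // Hypothetical, for contrast only: this shape costs the PredRegs <->
    // IntRegs round trip that the patterns above avoid.
    def: Pat<(int_hexagon_S2_storew_locked I32:$Rs, I32:$Rt),
             (I1toI32 (S2_storew_locked I32:$Rs, I32:$Rt))>;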