view lib/Transforms/Vectorize/VecUtils.cpp @ 3:9ad51c7bc036

1st commit. remove git dir and add all files.
author Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
date Wed, 15 May 2013 06:43:32 +0900
parents
children
line wrap: on
line source

//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "SLP"

#include "VecUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <map>

using namespace llvm;

static const unsigned MinVecRegSize = 128;

static const unsigned RecursionMaxDepth = 6;

namespace llvm {

BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
                 TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) :
  BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp)  {
  numberInstructions();
}

void BoUpSLP::numberInstructions() {
  int Loc = 0;
  InstrIdx.clear();
  InstrVec.clear();
  // Number the instructions in the block.
  for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) {
    InstrIdx[it] = Loc++;
    InstrVec.push_back(it);
    assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
  }
}

Value *BoUpSLP::getPointerOperand(Value *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
  return 0;
}

unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
  if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
  if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
  return -1;
}

bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getPointerOperand(A);
  Value *PtrB = getPointerOperand(B);
  unsigned ASA = getAddressSpaceOperand(A);
  unsigned ASB = getAddressSpaceOperand(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB)) return false;

  // Check that A and B are of the same type.
  if (PtrA->getType() != PtrB->getType()) return false;

  // Calculate the distance.
  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);

  // Non constant distance.
  if (!ConstOffSCEV) return false;

  int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
  // The Instructions are connsecutive if the size of the first load/store is
  // the same as the offset.
  int64_t Sz = DL->getTypeStoreSize(Ty);
  return ((-Offset) == Sz);
}

bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
  unsigned VF = MinVecRegSize / Sz;

  if (!isPowerOf2_32(Sz) || VF < 2) return false;

  bool Changed = false;
  // Look for profitable vectorizable trees at all offsets, starting at zero.
  for (unsigned i = 0, e = Chain.size(); i < e; ++i) {
    if (i + VF > e) return Changed;
    DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n");
    ArrayRef<Value *> Operands = Chain.slice(i, VF);

    int Cost = getTreeCost(Operands);
    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
    if (Cost < CostThreshold) {
      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
      vectorizeTree(Operands, VF);
      i += VF - 1;
      Changed = true;
    }
  }

  return Changed;
}

bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
  ValueSet Heads, Tails;
  SmallDenseMap<Value*, Value*> ConsecutiveChain;

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we vectorized so that we don't visit the same store twice.
  ValueSet VectorizedStores;
  bool Changed = false;

  // Do a quadratic search on all of the given stores and find
  // all of the pairs of loads that follow each other.
  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
    for (unsigned j = 0; j < e; ++j) {
      if (i == j) continue;
      if (isConsecutiveAccess(Stores[i], Stores[j])) {
        Tails.insert(Stores[j]);
        Heads.insert(Stores[i]);
        ConsecutiveChain[Stores[i]] = Stores[j];
      }
    }

  // For stores that start but don't end a link in the chain:
  for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) {
    if (Tails.count(*it)) continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to vectorize it.
    ValueList Operands;
    Value *I = *it;
    // Collect the chain into a list.
    while (Tails.count(I) || Heads.count(I)) {
      if (VectorizedStores.count(I)) break;
      Operands.push_back(I);
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    bool Vectorized = vectorizeStoreChain(Operands, costThreshold);

    // Mark the vectorized stores so that we don't vectorize them again.
    if (Vectorized)
      VectorizedStores.insert(Operands.begin(), Operands.end());
    Changed |= Vectorized;
  }

  return Changed;
}

int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) {
  // Find the type of the operands in VL.
  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  // Find the cost of inserting/extracting values from the vector.
  return getScalarizationCost(VecTy);
}

int BoUpSLP::getScalarizationCost(Type *Ty) {
  int Cost = 0;
  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  return Cost;
}

AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI);
  return AliasAnalysis::Location();
}

Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
  BasicBlock::iterator I = Src, E = Dst;
  /// Scan all of the instruction from SRC to DST and check if
  /// the source may alias.
  for (++I; I != E; ++I) {
    // Ignore store instructions that are marked as 'ignore'.
    if (MemBarrierIgnoreList.count(I)) continue;
    if (Src->mayWriteToMemory()) /* Write */ {
      if (!I->mayReadOrWriteMemory()) continue;
    } else /* Read */ {
      if (!I->mayWriteToMemory()) continue;
    }
    AliasAnalysis::Location A = getLocation(&*I);
    AliasAnalysis::Location B = getLocation(Src);

    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
      return I;
  }
  return 0;
}

void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) {
  Value *Vec = vectorizeTree(Operands, Operands.size());
  BasicBlock::iterator Loc = cast<Instruction>(Vec);
  IRBuilder<> Builder(++Loc);
  // After vectorizing the operands we need to generate extractelement
  // instructions and replace all of the uses of the scalar values with
  // the values that we extracted from the vectorized tree.
  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
    Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
    Operands[i]->replaceAllUsesWith(S);
  }
}

int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) {
  // Get rid of the list of stores that were removed, and from the
  // lists of instructions with multiple users.
  MemBarrierIgnoreList.clear();
  LaneMap.clear();
  MultiUserVals.clear();
  MustScalarize.clear();
  MustExtract.clear();

  // Find the location of the last root.
  unsigned LastRootIndex = InstrIdx[GetLastInstr(VL, VL.size())];

  // Scan the tree and find which value is used by which lane, and which values
  // must be scalarized.
  getTreeUses_rec(VL, 0);

  // Check that instructions with multiple users can be vectorized. Mark unsafe
  // instructions.
  for (ValueSet::iterator it = MultiUserVals.begin(),
       e = MultiUserVals.end(); it != e; ++it) {
    // Check that all of the users of this instr are within the tree
    // and that they are all from the same lane.
    int Lane = -1;
    for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
         I != E; ++I) {
      if (LaneMap.find(*I) == LaneMap.end()) {
        DEBUG(dbgs()<<"SLP: Instr " << **it << " has multiple users.\n");

        // We don't have an ordering problem if the user is not in this basic
        // block.
        Instruction *Inst = cast<Instruction>(*I);
        if (Inst->getParent() == BB) {
          // We don't have an ordering problem if the user is after the last
          // root.
          unsigned Idx = InstrIdx[Inst];
          if (Idx < LastRootIndex) {
            MustScalarize.insert(*it);
            DEBUG(dbgs()<<"SLP: Adding to MustScalarize "
                  "because of an unsafe out of tree usage.\n");
            break;
          }
        }

        DEBUG(dbgs()<<"SLP: Adding to MustExtract "
              "because of a safe out of tree usage.\n");
        MustExtract.insert(*it);
        continue;
      }
      if (Lane == -1) Lane = LaneMap[*I];
      if (Lane != LaneMap[*I]) {
        MustScalarize.insert(*it);
        DEBUG(dbgs()<<"SLP: Adding " << **it <<
              " to MustScalarize because multiple lane use it: "
              << Lane << " and " << LaneMap[*I] << ".\n");
        break;
      }
    }
  }

  // Now calculate the cost of vectorizing the tree.
  return getTreeCost_rec(VL, 0);
}

void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
  if (Depth == RecursionMaxDepth) return;

  // Don't handle vectors.
  if (VL[0]->getType()->isVectorTy()) return;
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    if (SI->getValueOperand()->getType()->isVectorTy()) return;

  // Check if all of the operands are constants.
  bool AllConst = true;
  bool AllSameScalar = true;
  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    AllConst &= isa<Constant>(VL[i]);
    AllSameScalar &= (VL[0] == VL[i]);
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    // If one of the instructions is out of this BB, we need to scalarize all.
    if (I && I->getParent() != BB) return;
  }

  // If all of the operands are identical or constant we have a simple solution.
  if (AllConst || AllSameScalar) return;

  // Scalarize unknown structures.
  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
  if (!VL0) return;

  unsigned Opcode = VL0->getOpcode();
  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    // If not all of the instructions are identical then we have to scalarize.
    if (!I || Opcode != I->getOpcode()) return;
  }

  // Mark instructions with multiple users.
  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    // Remember to check if all of the users of this instr are vectorized
    // within our tree.
    if (I && I->getNumUses() > 1) MultiUserVals.insert(I);
  }

  for (int i = 0, e = VL.size(); i < e; ++i) {
    // Check that the instruction is only used within
    // one lane.
    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return;
    // Make this instruction as 'seen' and remember the lane.
    LaneMap[VL[i]] = i;
  }

  switch (Opcode) {
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
        ValueList Operands;
        // Prepare the operand vector.
        for (unsigned j = 0; j < VL.size(); ++j)
          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

        getTreeUses_rec(Operands, Depth+1);
      }
      return;
    }
    case Instruction::Store: {
      ValueList Operands;
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
      getTreeUses_rec(Operands, Depth+1);
      return;
    }
    default:
    return;
  }
}

int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
  Type *ScalarTy = VL[0]->getType();

  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();

  /// Don't mess with vectors.
  if (ScalarTy->isVectorTy()) return max_cost;
  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());

  if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy);

  // Check if all of the operands are constants.
  bool AllConst = true;
  bool AllSameScalar = true;
  bool MustScalarizeFlag = false;
  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    AllConst &= isa<Constant>(VL[i]);
    AllSameScalar &= (VL[0] == VL[i]);
    // Must have a single use.
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    MustScalarizeFlag |= MustScalarize.count(VL[i]);
    // This instruction is outside the basic block.
    if (I && I->getParent() != BB)
      return getScalarizationCost(VecTy);
  }

  // Is this a simple vector constant.
  if (AllConst) return 0;

  // If all of the operands are identical we can broadcast them.
  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
  if (AllSameScalar) {
    // If we are in a loop, and this is not an instruction (e.g. constant or
    // argument) or the instruction is defined outside the loop then assume
    // that the cost is zero.
    if (L && (!VL0 || !L->contains(VL0)))
      return 0;

    // We need to broadcast the scalar.
    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
  }

  // If this is not a constant, or a scalar from outside the loop then we
  // need to scalarize it.
  if (MustScalarizeFlag)
    return getScalarizationCost(VecTy);

  if (!VL0) return getScalarizationCost(VecTy);
  assert(VL0->getParent() == BB && "Wrong BB");

  unsigned Opcode = VL0->getOpcode();
  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    // If not all of the instructions are identical then we have to scalarize.
    if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
  }

  // Check if it is safe to sink the loads or the stores.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    int MaxIdx = InstrIdx[VL0];
    for (unsigned i = 1, e = VL.size(); i < e; ++i )
      MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);

    Instruction *Last = InstrVec[MaxIdx];
    for (unsigned i = 0, e = VL.size(); i < e; ++i ) {
      if (VL[i] == Last) continue;
      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
      if (Barrier) {
        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " <<
              *Last << "\n because of " << *Barrier << "\n");
        return max_cost;
      }
    }
  }

  // Calculate the extract cost.
  unsigned ExternalUserExtractCost = 0;
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    if (MustExtract.count(VL[i]))
      ExternalUserExtractCost +=
        TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);

  switch (Opcode) {
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    int Cost = ExternalUserExtractCost;
    ValueList Operands;
    Type *SrcTy = VL0->getOperand(0)->getType();
    // Prepare the operand vector.
    for (unsigned j = 0; j < VL.size(); ++j) {
      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
      // Check that the casted type is the same for all users.
      if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
        return getScalarizationCost(VecTy);
    }

    Cost += getTreeCost_rec(Operands, Depth+1);
    if (Cost >= max_cost) return max_cost;

    // Calculate the cost of this instruction.
    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
                                                       VL0->getType(), SrcTy);

    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
    Cost += (VecCost - ScalarCost);
    return Cost;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    int Cost = ExternalUserExtractCost;
    // Calculate the cost of all of the operands.
    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
      ValueList Operands;
      // Prepare the operand vector.
      for (unsigned j = 0; j < VL.size(); ++j)
        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));

      Cost += getTreeCost_rec(Operands, Depth+1);
      if (Cost >= max_cost) return max_cost;
    }

    // Calculate the cost of this instruction.
    int ScalarCost = VecTy->getNumElements() *
      TTI->getArithmeticInstrCost(Opcode, ScalarTy);

    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
    Cost += (VecCost - ScalarCost);
    return Cost;
  }
  case Instruction::Load: {
    // If we are scalarize the loads, add the cost of forming the vector.
    for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
      if (!isConsecutiveAccess(VL[i], VL[i+1]))
        return getScalarizationCost(VecTy);

    // Cost of wide load - cost of scalar loads.
    int ScalarLdCost = VecTy->getNumElements() *
      TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
    return VecLdCost - ScalarLdCost + ExternalUserExtractCost;
  }
  case Instruction::Store: {
    // We know that we can merge the stores. Calculate the cost.
    int ScalarStCost = VecTy->getNumElements() *
      TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,0);
    int StoreCost = VecStCost - ScalarStCost;

    ValueList Operands;
    for (unsigned j = 0; j < VL.size(); ++j) {
      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
      MemBarrierIgnoreList.insert(VL[j]);
    }

    int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
    return TotalCost + ExternalUserExtractCost;
  }
  default:
    // Unable to vectorize unknown instructions.
    return getScalarizationCost(VecTy);
  }
}

Instruction *BoUpSLP::GetLastInstr(ArrayRef<Value *> VL, unsigned VF) {
  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
  for (unsigned i = 0; i < VF; ++i )
    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
  return InstrVec[MaxIdx + 1];
}

Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
  IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
  Value *Vec = UndefValue::get(Ty);
  for (unsigned i=0; i < Ty->getNumElements(); ++i) {
    // Generate the 'InsertElement' instruction.
    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
    // Remember that this instruction is used as part of a 'gather' sequence.
    // The caller of the bottom-up slp vectorizer can try to hoist the sequence
    // if the users are outside of the basic block.
    GatherInstructions.push_back(Vec);
  }

  for (unsigned i = 0; i < Ty->getNumElements(); ++i)
    VectorizedValues[VL[i]] = Vec;

  return Vec;
}

Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
  Value *V = vectorizeTree_rec(VL, VF);

  Instruction *LastInstr = GetLastInstr(VL, VL.size());
  int LastInstrIdx = InstrIdx[LastInstr];
  IRBuilder<> Builder(LastInstr);
  for (ValueSet::iterator it = MustExtract.begin(), e = MustExtract.end();
       it != e; ++it) {
    Instruction *I = cast<Instruction>(*it);
    Value *Vec = VectorizedValues[I];
    assert(LaneMap.count(I) && "Unable to find the lane for the external use");
    Value *Idx = Builder.getInt32(LaneMap[I]);
    Value *Extract = Builder.CreateExtractElement(Vec, Idx);
    bool Replaced = false;
    for (Value::use_iterator U = I->use_begin(), UE = U->use_end(); U != UE;
         ++U) {
      Instruction *UI = cast<Instruction>(*U);
      if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx)
        UI->replaceUsesOfWith(I ,Extract);
      Replaced = true;
    }
    assert(Replaced && "Must replace at least one outside user");
    (void)Replaced;
  }

  // We moved some instructions around. We have to number them again
  // before we can do any analysis.
  numberInstructions();
  MustScalarize.clear();
  MustExtract.clear();
  VectorizedValues.clear();
  return V;
}

Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
  Type *ScalarTy = VL[0]->getType();
  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
    ScalarTy = SI->getValueOperand()->getType();
  VectorType *VecTy = VectorType::get(ScalarTy, VF);

  // Check if all of the operands are constants or identical.
  bool AllConst = true;
  bool AllSameScalar = true;
  for (unsigned i = 0, e = VF; i < e; ++i) {
    AllConst &= isa<Constant>(VL[i]);
    AllSameScalar &= (VL[0] == VL[i]);
    // The instruction must be in the same BB, and it must be vectorizable.
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
      return Scalarize(VL, VecTy);
  }

  // Check that this is a simple vector constant.
  if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);

  // Scalarize unknown structures.
  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
  if (!VL0) return Scalarize(VL, VecTy);

  if (VectorizedValues.count(VL0)) return VectorizedValues[VL0];

  unsigned Opcode = VL0->getOpcode();
  for (unsigned i = 0, e = VF; i < e; ++i) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    // If not all of the instructions are identical then we have to scalarize.
    if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy);
  }

  switch (Opcode) {
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    ValueList INVL;
    for (int i = 0; i < VF; ++i)
      INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
    Value *InVec = vectorizeTree_rec(INVL, VF);
    IRBuilder<> Builder(GetLastInstr(VL, VF));
    CastInst *CI = dyn_cast<CastInst>(VL0);
    Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);

    for (int i = 0; i < VF; ++i)
      VectorizedValues[VL[i]] = V;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    ValueList LHSVL, RHSVL;
    for (int i = 0; i < VF; ++i) {
      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
    }

    Value *LHS = vectorizeTree_rec(LHSVL, VF);
    Value *RHS = vectorizeTree_rec(RHSVL, VF);
    IRBuilder<> Builder(GetLastInstr(VL, VF));
    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS,RHS);

    for (int i = 0; i < VF; ++i)
      VectorizedValues[VL[i]] = V;

    return V;
  }
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(VL0);
    unsigned Alignment = LI->getAlignment();

    // Check if all of the loads are consecutive.
    for (unsigned i = 1, e = VF; i < e; ++i)
      if (!isConsecutiveAccess(VL[i-1], VL[i]))
        return Scalarize(VL, VecTy);

    IRBuilder<> Builder(GetLastInstr(VL, VF));
    Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
                                          VecTy->getPointerTo());
    LI = Builder.CreateLoad(VecPtr);
    LI->setAlignment(Alignment);

    for (int i = 0; i < VF; ++i)
      VectorizedValues[VL[i]] = LI;

    return LI;
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(VL0);
    unsigned Alignment = SI->getAlignment();

    ValueList ValueOp;
    for (int i = 0; i < VF; ++i)
      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());

    Value *VecValue = vectorizeTree_rec(ValueOp, VF);

    IRBuilder<> Builder(GetLastInstr(VL, VF));
    Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
                                          VecTy->getPointerTo());
    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);

    for (int i = 0; i < VF; ++i)
      cast<Instruction>(VL[i])->eraseFromParent();
    return 0;
  }
  default:
    return Scalarize(VL, VecTy);
  }
}

} // end of namespace