Mercurial > hg > Members > tobaru > cbc > CbC_llvm
diff lib/CodeGen/AtomicExpandPass.cpp @ 83:60c9769439b8
LLVM 3.7
author | Tatsuki IHA <e125716@ie.u-ryukyu.ac.jp> |
---|---|
date | Wed, 18 Feb 2015 14:55:36 +0900 |
parents | 54457678186b |
children | afa8332a0e37 |
line wrap: on
line diff
--- a/lib/CodeGen/AtomicExpandPass.cpp Mon Sep 08 22:07:30 2014 +0900 +++ b/lib/CodeGen/AtomicExpandPass.cpp Wed Feb 18 14:55:36 2015 +0900 @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // This file contains a pass (at IR level) to replace atomic instructions with -// appropriate (intrinsic-based) ldrex/strex loops. +// either (intrinsic-based) load-linked/store-conditional loops or AtomicCmpXchg. // //===----------------------------------------------------------------------===// @@ -31,20 +31,29 @@ namespace { class AtomicExpand: public FunctionPass { const TargetMachine *TM; + const TargetLowering *TLI; public: static char ID; // Pass identification, replacement for typeid explicit AtomicExpand(const TargetMachine *TM = nullptr) - : FunctionPass(ID), TM(TM) { + : FunctionPass(ID), TM(TM), TLI(nullptr) { initializeAtomicExpandPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override; private: + bool bracketInstWithFences(Instruction *I, AtomicOrdering Order, + bool IsStore, bool IsLoad); bool expandAtomicLoad(LoadInst *LI); + bool expandAtomicLoadToLL(LoadInst *LI); + bool expandAtomicLoadToCmpXchg(LoadInst *LI); bool expandAtomicStore(StoreInst *SI); bool expandAtomicRMW(AtomicRMWInst *AI); + bool expandAtomicRMWToLLSC(AtomicRMWInst *AI); + bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI); bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI); + bool isIdempotentRMW(AtomicRMWInst *AI); + bool simplifyIdempotentRMW(AtomicRMWInst *AI); }; } @@ -59,9 +68,9 @@ } bool AtomicExpand::runOnFunction(Function &F) { - if (!TM || !TM->getSubtargetImpl()->enableAtomicExpand()) + if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand()) return false; - auto TargetLowering = TM->getSubtargetImpl()->getTargetLowering(); + TLI = TM->getSubtargetImpl(F)->getTargetLowering(); SmallVector<Instruction *, 1> AtomicInsts; @@ -78,47 +87,100 @@ auto SI = dyn_cast<StoreInst>(I); auto RMWI = dyn_cast<AtomicRMWInst>(I); auto CASI = dyn_cast<AtomicCmpXchgInst>(I); - assert((LI || SI || RMWI || CASI || isa<FenceInst>(I)) && "Unknown atomic instruction"); - if (LI && TargetLowering->shouldExpandAtomicLoadInIR(LI)) { + auto FenceOrdering = Monotonic; + bool IsStore, IsLoad; + if (TLI->getInsertFencesForAtomic()) { + if (LI && isAtLeastAcquire(LI->getOrdering())) { + FenceOrdering = LI->getOrdering(); + LI->setOrdering(Monotonic); + IsStore = false; + IsLoad = true; + } else if (SI && isAtLeastRelease(SI->getOrdering())) { + FenceOrdering = SI->getOrdering(); + SI->setOrdering(Monotonic); + IsStore = true; + IsLoad = false; + } else if (RMWI && (isAtLeastRelease(RMWI->getOrdering()) || + isAtLeastAcquire(RMWI->getOrdering()))) { + FenceOrdering = RMWI->getOrdering(); + RMWI->setOrdering(Monotonic); + IsStore = IsLoad = true; + } else if (CASI && !TLI->hasLoadLinkedStoreConditional() && + (isAtLeastRelease(CASI->getSuccessOrdering()) || + isAtLeastAcquire(CASI->getSuccessOrdering()))) { + // If a compare and swap is lowered to LL/SC, we can do smarter fence + // insertion, with a stronger one on the success path than on the + // failure path. As a result, fence insertion is directly done by + // expandAtomicCmpXchg in that case. + FenceOrdering = CASI->getSuccessOrdering(); + CASI->setSuccessOrdering(Monotonic); + CASI->setFailureOrdering(Monotonic); + IsStore = IsLoad = true; + } + + if (FenceOrdering != Monotonic) { + MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad); + } + } + + if (LI && TLI->shouldExpandAtomicLoadInIR(LI)) { MadeChange |= expandAtomicLoad(LI); - } else if (SI && TargetLowering->shouldExpandAtomicStoreInIR(SI)) { + } else if (SI && TLI->shouldExpandAtomicStoreInIR(SI)) { MadeChange |= expandAtomicStore(SI); - } else if (RMWI && TargetLowering->shouldExpandAtomicRMWInIR(RMWI)) { - MadeChange |= expandAtomicRMW(RMWI); - } else if (CASI) { + } else if (RMWI) { + // There are two different ways of expanding RMW instructions: + // - into a load if it is idempotent + // - into a Cmpxchg/LL-SC loop otherwise + // we try them in that order. + MadeChange |= + (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) || + (TLI->shouldExpandAtomicRMWInIR(RMWI) && expandAtomicRMW(RMWI)); + } else if (CASI && TLI->hasLoadLinkedStoreConditional()) { MadeChange |= expandAtomicCmpXchg(CASI); } } return MadeChange; } +bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order, + bool IsStore, bool IsLoad) { + IRBuilder<> Builder(I); + + auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad); + + auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad); + // The trailing fence is emitted before the instruction instead of after + // because there is no easy way of setting Builder insertion point after + // an instruction. So we must erase it from the BB, and insert it back + // in the right place. + // We have a guard here because not every atomic operation generates a + // trailing fence. + if (TrailingFence) { + TrailingFence->removeFromParent(); + TrailingFence->insertAfter(I); + } + + return (LeadingFence || TrailingFence); +} + bool AtomicExpand::expandAtomicLoad(LoadInst *LI) { - auto TLI = TM->getSubtargetImpl()->getTargetLowering(); - // If getInsertFencesForAtomic() returns true, then the target does not want - // to deal with memory orders, and emitLeading/TrailingFence should take care - // of everything. Otherwise, emitLeading/TrailingFence are no-op and we - // should preserve the ordering. - AtomicOrdering MemOpOrder = - TLI->getInsertFencesForAtomic() ? Monotonic : LI->getOrdering(); + if (TLI->hasLoadLinkedStoreConditional()) + return expandAtomicLoadToLL(LI); + else + return expandAtomicLoadToCmpXchg(LI); +} + +bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) { IRBuilder<> Builder(LI); - // Note that although no fence is required before atomic load on ARM, it is - // required before SequentiallyConsistent loads for the recommended Power - // mapping (see http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html). - // So we let the target choose what to emit. - TLI->emitLeadingFence(Builder, LI->getOrdering(), - /*IsStore=*/false, /*IsLoad=*/true); - - // The only 64-bit load guaranteed to be single-copy atomic by ARM is - // an ldrexd (A3.5.3). + // On some architectures, load-linked instructions are atomic for larger + // sizes than normal loads. For example, the only 64-bit load guaranteed + // to be single-copy atomic by ARM is an ldrexd (A3.5.3). Value *Val = - TLI->emitLoadLinked(Builder, LI->getPointerOperand(), MemOpOrder); - - TLI->emitTrailingFence(Builder, LI->getOrdering(), - /*IsStore=*/false, /*IsLoad=*/true); + TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering()); LI->replaceAllUsesWith(Val); LI->eraseFromParent(); @@ -126,10 +188,31 @@ return true; } +bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) { + IRBuilder<> Builder(LI); + AtomicOrdering Order = LI->getOrdering(); + Value *Addr = LI->getPointerOperand(); + Type *Ty = cast<PointerType>(Addr->getType())->getElementType(); + Constant *DummyVal = Constant::getNullValue(Ty); + + Value *Pair = Builder.CreateAtomicCmpXchg( + Addr, DummyVal, DummyVal, Order, + AtomicCmpXchgInst::getStrongestFailureOrdering(Order)); + Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded"); + + LI->replaceAllUsesWith(Loaded); + LI->eraseFromParent(); + + return true; +} + bool AtomicExpand::expandAtomicStore(StoreInst *SI) { - // The only atomic 64-bit store on ARM is an strexd that succeeds, which means - // we need a loop and the entire instruction is essentially an "atomicrmw - // xchg" that ignores the value loaded. + // This function is only called on atomic stores that are too large to be + // atomic if implemented as a native store. So we replace them by an + // atomic swap, that can be implemented for example as a ldrex/strex on ARM + // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes. + // It is the responsibility of the target to only return true in + // shouldExpandAtomicRMW in cases where this is required and possible. IRBuilder<> Builder(SI); AtomicRMWInst *AI = Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(), @@ -141,18 +224,55 @@ } bool AtomicExpand::expandAtomicRMW(AtomicRMWInst *AI) { - auto TLI = TM->getSubtargetImpl()->getTargetLowering(); - AtomicOrdering Order = AI->getOrdering(); + if (TLI->hasLoadLinkedStoreConditional()) + return expandAtomicRMWToLLSC(AI); + else + return expandAtomicRMWToCmpXchg(AI); +} + +/// Emit IR to implement the given atomicrmw operation on values in registers, +/// returning the new value. +static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder, + Value *Loaded, Value *Inc) { + Value *NewVal; + switch (Op) { + case AtomicRMWInst::Xchg: + return Inc; + case AtomicRMWInst::Add: + return Builder.CreateAdd(Loaded, Inc, "new"); + case AtomicRMWInst::Sub: + return Builder.CreateSub(Loaded, Inc, "new"); + case AtomicRMWInst::And: + return Builder.CreateAnd(Loaded, Inc, "new"); + case AtomicRMWInst::Nand: + return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new"); + case AtomicRMWInst::Or: + return Builder.CreateOr(Loaded, Inc, "new"); + case AtomicRMWInst::Xor: + return Builder.CreateXor(Loaded, Inc, "new"); + case AtomicRMWInst::Max: + NewVal = Builder.CreateICmpSGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::Min: + NewVal = Builder.CreateICmpSLE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMax: + NewVal = Builder.CreateICmpUGT(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + case AtomicRMWInst::UMin: + NewVal = Builder.CreateICmpULE(Loaded, Inc); + return Builder.CreateSelect(NewVal, Loaded, Inc, "new"); + default: + llvm_unreachable("Unknown atomic op"); + } +} + +bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) { + AtomicOrdering MemOpOrder = AI->getOrdering(); Value *Addr = AI->getPointerOperand(); BasicBlock *BB = AI->getParent(); Function *F = BB->getParent(); LLVMContext &Ctx = F->getContext(); - // If getInsertFencesForAtomic() returns true, then the target does not want - // to deal with memory orders, and emitLeading/TrailingFence should take care - // of everything. Otherwise, emitLeading/TrailingFence are no-op and we - // should preserve the ordering. - AtomicOrdering MemOpOrder = - TLI->getInsertFencesForAtomic() ? Monotonic : Order; // Given: atomicrmw some_op iN* %addr, iN %incr ordering // @@ -179,56 +299,14 @@ // the branch entirely. std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); - TLI->emitLeadingFence(Builder, Order, /*IsStore=*/true, /*IsLoad=*/true); Builder.CreateBr(LoopBB); // Start the main loop block now that we've taken care of the preliminaries. Builder.SetInsertPoint(LoopBB); Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder); - Value *NewVal; - switch (AI->getOperation()) { - case AtomicRMWInst::Xchg: - NewVal = AI->getValOperand(); - break; - case AtomicRMWInst::Add: - NewVal = Builder.CreateAdd(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Sub: - NewVal = Builder.CreateSub(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::And: - NewVal = Builder.CreateAnd(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Nand: - NewVal = Builder.CreateNot(Builder.CreateAnd(Loaded, AI->getValOperand()), - "new"); - break; - case AtomicRMWInst::Or: - NewVal = Builder.CreateOr(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Xor: - NewVal = Builder.CreateXor(Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Max: - NewVal = Builder.CreateICmpSGT(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::Min: - NewVal = Builder.CreateICmpSLE(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::UMax: - NewVal = Builder.CreateICmpUGT(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - case AtomicRMWInst::UMin: - NewVal = Builder.CreateICmpULE(Loaded, AI->getValOperand()); - NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new"); - break; - default: - llvm_unreachable("Unknown atomic op"); - } + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); Value *StoreSuccess = TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); @@ -237,7 +315,6 @@ Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); - TLI->emitTrailingFence(Builder, Order, /*IsStore=*/true, /*IsLoad=*/true); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); @@ -245,8 +322,71 @@ return true; } +bool AtomicExpand::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI) { + AtomicOrdering MemOpOrder = + AI->getOrdering() == Unordered ? Monotonic : AI->getOrdering(); + Value *Addr = AI->getPointerOperand(); + BasicBlock *BB = AI->getParent(); + Function *F = BB->getParent(); + LLVMContext &Ctx = F->getContext(); + + // Given: atomicrmw some_op iN* %addr, iN %incr ordering + // + // The standard expansion we produce is: + // [...] + // %init_loaded = load atomic iN* %addr + // br label %loop + // loop: + // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] + // %new = some_op iN %loaded, %incr + // %pair = cmpxchg iN* %addr, iN %loaded, iN %new + // %new_loaded = extractvalue { iN, i1 } %pair, 0 + // %success = extractvalue { iN, i1 } %pair, 1 + // br i1 %success, label %atomicrmw.end, label %loop + // atomicrmw.end: + // [...] + BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end"); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB); + + // This grabs the DebugLoc from AI. + IRBuilder<> Builder(AI); + + // The split call above "helpfully" added a branch at the end of BB (to the + // wrong place), but we want a load. It's easiest to just remove + // the branch entirely. + std::prev(BB->end())->eraseFromParent(); + Builder.SetInsertPoint(BB); + LoadInst *InitLoaded = Builder.CreateLoad(Addr); + // Atomics require at least natural alignment. + InitLoaded->setAlignment(AI->getType()->getPrimitiveSizeInBits()); + Builder.CreateBr(LoopBB); + + // Start the main loop block now that we've taken care of the preliminaries. + Builder.SetInsertPoint(LoopBB); + PHINode *Loaded = Builder.CreatePHI(AI->getType(), 2, "loaded"); + Loaded->addIncoming(InitLoaded, BB); + + Value *NewVal = + performAtomicOp(AI->getOperation(), Builder, Loaded, AI->getValOperand()); + + Value *Pair = Builder.CreateAtomicCmpXchg( + Addr, Loaded, NewVal, MemOpOrder, + AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder)); + Value *NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded"); + Loaded->addIncoming(NewLoaded, LoopBB); + + Value *Success = Builder.CreateExtractValue(Pair, 1, "success"); + Builder.CreateCondBr(Success, ExitBB, LoopBB); + + Builder.SetInsertPoint(ExitBB, ExitBB->begin()); + + AI->replaceAllUsesWith(NewLoaded); + AI->eraseFromParent(); + + return true; +} + bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { - auto TLI = TM->getSubtargetImpl()->getTargetLowering(); AtomicOrdering SuccessOrder = CI->getSuccessOrdering(); AtomicOrdering FailureOrder = CI->getFailureOrdering(); Value *Addr = CI->getPointerOperand(); @@ -378,3 +518,32 @@ CI->eraseFromParent(); return true; } + +bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) { + auto C = dyn_cast<ConstantInt>(RMWI->getValOperand()); + if(!C) + return false; + + AtomicRMWInst::BinOp Op = RMWI->getOperation(); + switch(Op) { + case AtomicRMWInst::Add: + case AtomicRMWInst::Sub: + case AtomicRMWInst::Or: + case AtomicRMWInst::Xor: + return C->isZero(); + case AtomicRMWInst::And: + return C->isMinusOne(); + // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/... + default: + return false; + } +} + +bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) { + if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) { + if (TLI->shouldExpandAtomicLoadInIR(ResultingLoad)) + expandAtomicLoad(ResultingLoad); + return true; + } + return false; +}