annotate mlir/lib/Transforms/PipelineDataTransfer.cpp @ 150:1d019706d866

LLVM10
author anatofuz
date Thu, 13 Feb 2020 15:10:13 +0900
parents
children 0572611fdcc8
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
150
anatofuz
parents:
diff changeset
1 //===- PipelineDataTransfer.cpp --- Pass for pipelining data movement ---*-===//
anatofuz
parents:
diff changeset
2 //
anatofuz
parents:
diff changeset
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
anatofuz
parents:
diff changeset
4 // See https://llvm.org/LICENSE.txt for license information.
anatofuz
parents:
diff changeset
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
anatofuz
parents:
diff changeset
6 //
anatofuz
parents:
diff changeset
7 //===----------------------------------------------------------------------===//
anatofuz
parents:
diff changeset
8 //
anatofuz
parents:
diff changeset
9 // This file implements a pass to pipeline data transfers.
anatofuz
parents:
diff changeset
10 //
anatofuz
parents:
diff changeset
11 //===----------------------------------------------------------------------===//
anatofuz
parents:
diff changeset
12
anatofuz
parents:
diff changeset
13 #include "mlir/Transforms/Passes.h"
anatofuz
parents:
diff changeset
14
anatofuz
parents:
diff changeset
15 #include "mlir/Analysis/AffineAnalysis.h"
anatofuz
parents:
diff changeset
16 #include "mlir/Analysis/LoopAnalysis.h"
anatofuz
parents:
diff changeset
17 #include "mlir/Analysis/Utils.h"
anatofuz
parents:
diff changeset
18 #include "mlir/Dialect/AffineOps/AffineOps.h"
anatofuz
parents:
diff changeset
19 #include "mlir/Dialect/StandardOps/Ops.h"
anatofuz
parents:
diff changeset
20 #include "mlir/IR/Builders.h"
anatofuz
parents:
diff changeset
21 #include "mlir/Pass/Pass.h"
anatofuz
parents:
diff changeset
22 #include "mlir/Transforms/LoopUtils.h"
anatofuz
parents:
diff changeset
23 #include "mlir/Transforms/Utils.h"
anatofuz
parents:
diff changeset
24 #include "llvm/ADT/DenseMap.h"
anatofuz
parents:
diff changeset
25 #include "llvm/Support/Debug.h"
anatofuz
parents:
diff changeset
26 #define DEBUG_TYPE "affine-pipeline-data-transfer"
anatofuz
parents:
diff changeset
27
anatofuz
parents:
diff changeset
28 using namespace mlir;
anatofuz
parents:
diff changeset
29
anatofuz
parents:
diff changeset
30 namespace {
anatofuz
parents:
diff changeset
31
anatofuz
parents:
diff changeset
32 struct PipelineDataTransfer : public FunctionPass<PipelineDataTransfer> {
anatofuz
parents:
diff changeset
33 void runOnFunction() override;
anatofuz
parents:
diff changeset
34 void runOnAffineForOp(AffineForOp forOp);
anatofuz
parents:
diff changeset
35
anatofuz
parents:
diff changeset
36 std::vector<AffineForOp> forOps;
anatofuz
parents:
diff changeset
37 };
anatofuz
parents:
diff changeset
38
anatofuz
parents:
diff changeset
39 } // end anonymous namespace
anatofuz
parents:
diff changeset
40
anatofuz
parents:
diff changeset
41 /// Creates a pass to pipeline explicit movement of data across levels of the
anatofuz
parents:
diff changeset
42 /// memory hierarchy.
anatofuz
parents:
diff changeset
43 std::unique_ptr<OpPassBase<FuncOp>> mlir::createPipelineDataTransferPass() {
anatofuz
parents:
diff changeset
44 return std::make_unique<PipelineDataTransfer>();
anatofuz
parents:
diff changeset
45 }
anatofuz
parents:
diff changeset
46
anatofuz
parents:
diff changeset
47 // Returns the position of the tag memref operand given a DMA operation.
anatofuz
parents:
diff changeset
48 // Temporary utility: will be replaced when DmaStart/DmaFinish abstract op's are
anatofuz
parents:
diff changeset
49 // added. TODO(b/117228571)
anatofuz
parents:
diff changeset
50 static unsigned getTagMemRefPos(Operation &dmaInst) {
anatofuz
parents:
diff changeset
51 assert(isa<AffineDmaStartOp>(dmaInst) || isa<AffineDmaWaitOp>(dmaInst));
anatofuz
parents:
diff changeset
52 if (auto dmaStartOp = dyn_cast<AffineDmaStartOp>(dmaInst)) {
anatofuz
parents:
diff changeset
53 return dmaStartOp.getTagMemRefOperandIndex();
anatofuz
parents:
diff changeset
54 }
anatofuz
parents:
diff changeset
55 // First operand for a dma finish operation.
anatofuz
parents:
diff changeset
56 return 0;
anatofuz
parents:
diff changeset
57 }
anatofuz
parents:
diff changeset
58
anatofuz
parents:
diff changeset
59 /// Doubles the buffer of the supplied memref on the specified 'affine.for'
anatofuz
parents:
diff changeset
60 /// operation by adding a leading dimension of size two to the memref.
anatofuz
parents:
diff changeset
61 /// Replaces all uses of the old memref by the new one while indexing the newly
anatofuz
parents:
diff changeset
62 /// added dimension by the loop IV of the specified 'affine.for' operation
anatofuz
parents:
diff changeset
63 /// modulo 2. Returns false if such a replacement cannot be performed.
anatofuz
parents:
diff changeset
64 static bool doubleBuffer(Value oldMemRef, AffineForOp forOp) {
anatofuz
parents:
diff changeset
65 auto *forBody = forOp.getBody();
anatofuz
parents:
diff changeset
66 OpBuilder bInner(forBody, forBody->begin());
anatofuz
parents:
diff changeset
67
anatofuz
parents:
diff changeset
68 // Doubles the shape with a leading dimension extent of 2.
anatofuz
parents:
diff changeset
69 auto doubleShape = [&](MemRefType oldMemRefType) -> MemRefType {
anatofuz
parents:
diff changeset
70 // Add the leading dimension in the shape for the double buffer.
anatofuz
parents:
diff changeset
71 ArrayRef<int64_t> oldShape = oldMemRefType.getShape();
anatofuz
parents:
diff changeset
72 SmallVector<int64_t, 4> newShape(1 + oldMemRefType.getRank());
anatofuz
parents:
diff changeset
73 newShape[0] = 2;
anatofuz
parents:
diff changeset
74 std::copy(oldShape.begin(), oldShape.end(), newShape.begin() + 1);
anatofuz
parents:
diff changeset
75 return MemRefType::Builder(oldMemRefType)
anatofuz
parents:
diff changeset
76 .setShape(newShape)
anatofuz
parents:
diff changeset
77 .setAffineMaps({});
anatofuz
parents:
diff changeset
78 };
anatofuz
parents:
diff changeset
79
anatofuz
parents:
diff changeset
80 auto oldMemRefType = oldMemRef.getType().cast<MemRefType>();
anatofuz
parents:
diff changeset
81 auto newMemRefType = doubleShape(oldMemRefType);
anatofuz
parents:
diff changeset
82
anatofuz
parents:
diff changeset
83 // The double buffer is allocated right before 'forInst'.
anatofuz
parents:
diff changeset
84 auto *forInst = forOp.getOperation();
anatofuz
parents:
diff changeset
85 OpBuilder bOuter(forInst);
anatofuz
parents:
diff changeset
86 // Put together alloc operands for any dynamic dimensions of the memref.
anatofuz
parents:
diff changeset
87 SmallVector<Value, 4> allocOperands;
anatofuz
parents:
diff changeset
88 unsigned dynamicDimCount = 0;
anatofuz
parents:
diff changeset
89 for (auto dimSize : oldMemRefType.getShape()) {
anatofuz
parents:
diff changeset
90 if (dimSize == -1)
anatofuz
parents:
diff changeset
91 allocOperands.push_back(bOuter.create<DimOp>(forInst->getLoc(), oldMemRef,
anatofuz
parents:
diff changeset
92 dynamicDimCount++));
anatofuz
parents:
diff changeset
93 }
anatofuz
parents:
diff changeset
94
anatofuz
parents:
diff changeset
95 // Create and place the alloc right before the 'affine.for' operation.
anatofuz
parents:
diff changeset
96 Value newMemRef =
anatofuz
parents:
diff changeset
97 bOuter.create<AllocOp>(forInst->getLoc(), newMemRefType, allocOperands);
anatofuz
parents:
diff changeset
98
anatofuz
parents:
diff changeset
99 // Create 'iv mod 2' value to index the leading dimension.
anatofuz
parents:
diff changeset
100 auto d0 = bInner.getAffineDimExpr(0);
anatofuz
parents:
diff changeset
101 int64_t step = forOp.getStep();
anatofuz
parents:
diff changeset
102 auto modTwoMap = AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0,
anatofuz
parents:
diff changeset
103 {d0.floorDiv(step) % 2});
anatofuz
parents:
diff changeset
104 auto ivModTwoOp = bInner.create<AffineApplyOp>(forOp.getLoc(), modTwoMap,
anatofuz
parents:
diff changeset
105 forOp.getInductionVar());
anatofuz
parents:
diff changeset
106
anatofuz
parents:
diff changeset
107 // replaceAllMemRefUsesWith will succeed unless the forOp body has
anatofuz
parents:
diff changeset
108 // non-dereferencing uses of the memref (dealloc's are fine though).
anatofuz
parents:
diff changeset
109 if (failed(replaceAllMemRefUsesWith(
anatofuz
parents:
diff changeset
110 oldMemRef, newMemRef,
anatofuz
parents:
diff changeset
111 /*extraIndices=*/{ivModTwoOp},
anatofuz
parents:
diff changeset
112 /*indexRemap=*/AffineMap(),
anatofuz
parents:
diff changeset
113 /*extraOperands=*/{},
anatofuz
parents:
diff changeset
114 /*symbolOperands=*/{},
anatofuz
parents:
diff changeset
115 /*domInstFilter=*/&*forOp.getBody()->begin()))) {
anatofuz
parents:
diff changeset
116 LLVM_DEBUG(
anatofuz
parents:
diff changeset
117 forOp.emitError("memref replacement for double buffering failed"));
anatofuz
parents:
diff changeset
118 ivModTwoOp.erase();
anatofuz
parents:
diff changeset
119 return false;
anatofuz
parents:
diff changeset
120 }
anatofuz
parents:
diff changeset
121 // Insert the dealloc op right after the for loop.
anatofuz
parents:
diff changeset
122 bOuter.setInsertionPointAfter(forInst);
anatofuz
parents:
diff changeset
123 bOuter.create<DeallocOp>(forInst->getLoc(), newMemRef);
anatofuz
parents:
diff changeset
124
anatofuz
parents:
diff changeset
125 return true;
anatofuz
parents:
diff changeset
126 }
anatofuz
parents:
diff changeset
127
anatofuz
parents:
diff changeset
128 /// Returns success if the IR is in a valid state.
anatofuz
parents:
diff changeset
129 void PipelineDataTransfer::runOnFunction() {
anatofuz
parents:
diff changeset
130 // Do a post order walk so that inner loop DMAs are processed first. This is
anatofuz
parents:
diff changeset
131 // necessary since 'affine.for' operations nested within would otherwise
anatofuz
parents:
diff changeset
132 // become invalid (erased) when the outer loop is pipelined (the pipelined one
anatofuz
parents:
diff changeset
133 // gets deleted and replaced by a prologue, a new steady-state loop and an
anatofuz
parents:
diff changeset
134 // epilogue).
anatofuz
parents:
diff changeset
135 forOps.clear();
anatofuz
parents:
diff changeset
136 getFunction().walk([&](AffineForOp forOp) { forOps.push_back(forOp); });
anatofuz
parents:
diff changeset
137 for (auto forOp : forOps)
anatofuz
parents:
diff changeset
138 runOnAffineForOp(forOp);
anatofuz
parents:
diff changeset
139 }
anatofuz
parents:
diff changeset
140
anatofuz
parents:
diff changeset
141 // Check if tags of the dma start op and dma wait op match.
anatofuz
parents:
diff changeset
142 static bool checkTagMatch(AffineDmaStartOp startOp, AffineDmaWaitOp waitOp) {
anatofuz
parents:
diff changeset
143 if (startOp.getTagMemRef() != waitOp.getTagMemRef())
anatofuz
parents:
diff changeset
144 return false;
anatofuz
parents:
diff changeset
145 auto startIndices = startOp.getTagIndices();
anatofuz
parents:
diff changeset
146 auto waitIndices = waitOp.getTagIndices();
anatofuz
parents:
diff changeset
147 // Both of these have the same number of indices since they correspond to the
anatofuz
parents:
diff changeset
148 // same tag memref.
anatofuz
parents:
diff changeset
149 for (auto it = startIndices.begin(), wIt = waitIndices.begin(),
anatofuz
parents:
diff changeset
150 e = startIndices.end();
anatofuz
parents:
diff changeset
151 it != e; ++it, ++wIt) {
anatofuz
parents:
diff changeset
152 // Keep it simple for now, just checking if indices match.
anatofuz
parents:
diff changeset
153 // TODO(mlir-team): this would in general need to check if there is no
anatofuz
parents:
diff changeset
154 // intervening write writing to the same tag location, i.e., memory last
anatofuz
parents:
diff changeset
155 // write/data flow analysis. This is however sufficient/powerful enough for
anatofuz
parents:
diff changeset
156 // now since the DMA generation pass or the input for it will always have
anatofuz
parents:
diff changeset
157 // start/wait with matching tags (same SSA operand indices).
anatofuz
parents:
diff changeset
158 if (*it != *wIt)
anatofuz
parents:
diff changeset
159 return false;
anatofuz
parents:
diff changeset
160 }
anatofuz
parents:
diff changeset
161 return true;
anatofuz
parents:
diff changeset
162 }
anatofuz
parents:
diff changeset
163
anatofuz
parents:
diff changeset
164 // Identify matching DMA start/finish operations to overlap computation with.
anatofuz
parents:
diff changeset
165 static void findMatchingStartFinishInsts(
anatofuz
parents:
diff changeset
166 AffineForOp forOp,
anatofuz
parents:
diff changeset
167 SmallVectorImpl<std::pair<Operation *, Operation *>> &startWaitPairs) {
anatofuz
parents:
diff changeset
168
anatofuz
parents:
diff changeset
169 // Collect outgoing DMA operations - needed to check for dependences below.
anatofuz
parents:
diff changeset
170 SmallVector<AffineDmaStartOp, 4> outgoingDmaOps;
anatofuz
parents:
diff changeset
171 for (auto &op : *forOp.getBody()) {
anatofuz
parents:
diff changeset
172 auto dmaStartOp = dyn_cast<AffineDmaStartOp>(op);
anatofuz
parents:
diff changeset
173 if (dmaStartOp && dmaStartOp.isSrcMemorySpaceFaster())
anatofuz
parents:
diff changeset
174 outgoingDmaOps.push_back(dmaStartOp);
anatofuz
parents:
diff changeset
175 }
anatofuz
parents:
diff changeset
176
anatofuz
parents:
diff changeset
177 SmallVector<Operation *, 4> dmaStartInsts, dmaFinishInsts;
anatofuz
parents:
diff changeset
178 for (auto &op : *forOp.getBody()) {
anatofuz
parents:
diff changeset
179 // Collect DMA finish operations.
anatofuz
parents:
diff changeset
180 if (isa<AffineDmaWaitOp>(op)) {
anatofuz
parents:
diff changeset
181 dmaFinishInsts.push_back(&op);
anatofuz
parents:
diff changeset
182 continue;
anatofuz
parents:
diff changeset
183 }
anatofuz
parents:
diff changeset
184 auto dmaStartOp = dyn_cast<AffineDmaStartOp>(op);
anatofuz
parents:
diff changeset
185 if (!dmaStartOp)
anatofuz
parents:
diff changeset
186 continue;
anatofuz
parents:
diff changeset
187
anatofuz
parents:
diff changeset
188 // Only DMAs incoming into higher memory spaces are pipelined for now.
anatofuz
parents:
diff changeset
189 // TODO(bondhugula): handle outgoing DMA pipelining.
anatofuz
parents:
diff changeset
190 if (!dmaStartOp.isDestMemorySpaceFaster())
anatofuz
parents:
diff changeset
191 continue;
anatofuz
parents:
diff changeset
192
anatofuz
parents:
diff changeset
193 // Check for dependence with outgoing DMAs. Doing this conservatively.
anatofuz
parents:
diff changeset
194 // TODO(andydavis,bondhugula): use the dependence analysis to check for
anatofuz
parents:
diff changeset
195 // dependences between an incoming and outgoing DMA in the same iteration.
anatofuz
parents:
diff changeset
196 auto it = outgoingDmaOps.begin();
anatofuz
parents:
diff changeset
197 for (; it != outgoingDmaOps.end(); ++it) {
anatofuz
parents:
diff changeset
198 if (it->getDstMemRef() == dmaStartOp.getSrcMemRef())
anatofuz
parents:
diff changeset
199 break;
anatofuz
parents:
diff changeset
200 }
anatofuz
parents:
diff changeset
201 if (it != outgoingDmaOps.end())
anatofuz
parents:
diff changeset
202 continue;
anatofuz
parents:
diff changeset
203
anatofuz
parents:
diff changeset
204 // We only double buffer if the buffer is not live out of loop.
anatofuz
parents:
diff changeset
205 auto memref = dmaStartOp.getOperand(dmaStartOp.getFasterMemPos());
anatofuz
parents:
diff changeset
206 bool escapingUses = false;
anatofuz
parents:
diff changeset
207 for (auto *user : memref.getUsers()) {
anatofuz
parents:
diff changeset
208 // We can double buffer regardless of dealloc's outside the loop.
anatofuz
parents:
diff changeset
209 if (isa<DeallocOp>(user))
anatofuz
parents:
diff changeset
210 continue;
anatofuz
parents:
diff changeset
211 if (!forOp.getBody()->findAncestorOpInBlock(*user)) {
anatofuz
parents:
diff changeset
212 LLVM_DEBUG(llvm::dbgs()
anatofuz
parents:
diff changeset
213 << "can't pipeline: buffer is live out of loop\n";);
anatofuz
parents:
diff changeset
214 escapingUses = true;
anatofuz
parents:
diff changeset
215 break;
anatofuz
parents:
diff changeset
216 }
anatofuz
parents:
diff changeset
217 }
anatofuz
parents:
diff changeset
218 if (!escapingUses)
anatofuz
parents:
diff changeset
219 dmaStartInsts.push_back(&op);
anatofuz
parents:
diff changeset
220 }
anatofuz
parents:
diff changeset
221
anatofuz
parents:
diff changeset
222 // For each start operation, we look for a matching finish operation.
anatofuz
parents:
diff changeset
223 for (auto *dmaStartInst : dmaStartInsts) {
anatofuz
parents:
diff changeset
224 for (auto *dmaFinishInst : dmaFinishInsts) {
anatofuz
parents:
diff changeset
225 if (checkTagMatch(cast<AffineDmaStartOp>(dmaStartInst),
anatofuz
parents:
diff changeset
226 cast<AffineDmaWaitOp>(dmaFinishInst))) {
anatofuz
parents:
diff changeset
227 startWaitPairs.push_back({dmaStartInst, dmaFinishInst});
anatofuz
parents:
diff changeset
228 break;
anatofuz
parents:
diff changeset
229 }
anatofuz
parents:
diff changeset
230 }
anatofuz
parents:
diff changeset
231 }
anatofuz
parents:
diff changeset
232 }
anatofuz
parents:
diff changeset
233
anatofuz
parents:
diff changeset
234 /// Overlap DMA transfers with computation in this loop. If successful,
anatofuz
parents:
diff changeset
235 /// 'forOp' is deleted, and a prologue, a new pipelined loop, and epilogue are
anatofuz
parents:
diff changeset
236 /// inserted right before where it was.
anatofuz
parents:
diff changeset
237 void PipelineDataTransfer::runOnAffineForOp(AffineForOp forOp) {
anatofuz
parents:
diff changeset
238 auto mayBeConstTripCount = getConstantTripCount(forOp);
anatofuz
parents:
diff changeset
239 if (!mayBeConstTripCount.hasValue()) {
anatofuz
parents:
diff changeset
240 LLVM_DEBUG(
anatofuz
parents:
diff changeset
241 forOp.emitRemark("won't pipeline due to unknown trip count loop"));
anatofuz
parents:
diff changeset
242 return;
anatofuz
parents:
diff changeset
243 }
anatofuz
parents:
diff changeset
244
anatofuz
parents:
diff changeset
245 SmallVector<std::pair<Operation *, Operation *>, 4> startWaitPairs;
anatofuz
parents:
diff changeset
246 findMatchingStartFinishInsts(forOp, startWaitPairs);
anatofuz
parents:
diff changeset
247
anatofuz
parents:
diff changeset
248 if (startWaitPairs.empty()) {
anatofuz
parents:
diff changeset
249 LLVM_DEBUG(forOp.emitRemark("No dma start/finish pairs\n"));
anatofuz
parents:
diff changeset
250 return;
anatofuz
parents:
diff changeset
251 }
anatofuz
parents:
diff changeset
252
anatofuz
parents:
diff changeset
253 // Double the buffers for the higher memory space memref's.
anatofuz
parents:
diff changeset
254 // Identify memref's to replace by scanning through all DMA start
anatofuz
parents:
diff changeset
255 // operations. A DMA start operation has two memref's - the one from the
anatofuz
parents:
diff changeset
256 // higher level of memory hierarchy is the one to double buffer.
anatofuz
parents:
diff changeset
257 // TODO(bondhugula): check whether double-buffering is even necessary.
anatofuz
parents:
diff changeset
258 // TODO(bondhugula): make this work with different layouts: assuming here that
anatofuz
parents:
diff changeset
259 // the dimension we are adding here for the double buffering is the outermost
anatofuz
parents:
diff changeset
260 // dimension.
anatofuz
parents:
diff changeset
261 for (auto &pair : startWaitPairs) {
anatofuz
parents:
diff changeset
262 auto *dmaStartInst = pair.first;
anatofuz
parents:
diff changeset
263 Value oldMemRef = dmaStartInst->getOperand(
anatofuz
parents:
diff changeset
264 cast<AffineDmaStartOp>(dmaStartInst).getFasterMemPos());
anatofuz
parents:
diff changeset
265 if (!doubleBuffer(oldMemRef, forOp)) {
anatofuz
parents:
diff changeset
266 // Normally, double buffering should not fail because we already checked
anatofuz
parents:
diff changeset
267 // that there are no uses outside.
anatofuz
parents:
diff changeset
268 LLVM_DEBUG(llvm::dbgs()
anatofuz
parents:
diff changeset
269 << "double buffering failed for" << dmaStartInst << "\n";);
anatofuz
parents:
diff changeset
270 // IR still valid and semantically correct.
anatofuz
parents:
diff changeset
271 return;
anatofuz
parents:
diff changeset
272 }
anatofuz
parents:
diff changeset
273 // If the old memref has no more uses, remove its 'dead' alloc if it was
anatofuz
parents:
diff changeset
274 // alloc'ed. (note: DMA buffers are rarely function live-in; but a 'dim'
anatofuz
parents:
diff changeset
275 // operation could have been used on it if it was dynamically shaped in
anatofuz
parents:
diff changeset
276 // order to create the double buffer above.)
anatofuz
parents:
diff changeset
277 // '-canonicalize' does this in a more general way, but we'll anyway do the
anatofuz
parents:
diff changeset
278 // simple/common case so that the output / test cases looks clear.
anatofuz
parents:
diff changeset
279 if (auto *allocInst = oldMemRef.getDefiningOp()) {
anatofuz
parents:
diff changeset
280 if (oldMemRef.use_empty()) {
anatofuz
parents:
diff changeset
281 allocInst->erase();
anatofuz
parents:
diff changeset
282 } else if (oldMemRef.hasOneUse()) {
anatofuz
parents:
diff changeset
283 if (auto dealloc = dyn_cast<DeallocOp>(*oldMemRef.user_begin())) {
anatofuz
parents:
diff changeset
284 dealloc.erase();
anatofuz
parents:
diff changeset
285 allocInst->erase();
anatofuz
parents:
diff changeset
286 }
anatofuz
parents:
diff changeset
287 }
anatofuz
parents:
diff changeset
288 }
anatofuz
parents:
diff changeset
289 }
anatofuz
parents:
diff changeset
290
anatofuz
parents:
diff changeset
291 // Double the buffers for tag memrefs.
anatofuz
parents:
diff changeset
292 for (auto &pair : startWaitPairs) {
anatofuz
parents:
diff changeset
293 auto *dmaFinishInst = pair.second;
anatofuz
parents:
diff changeset
294 Value oldTagMemRef =
anatofuz
parents:
diff changeset
295 dmaFinishInst->getOperand(getTagMemRefPos(*dmaFinishInst));
anatofuz
parents:
diff changeset
296 if (!doubleBuffer(oldTagMemRef, forOp)) {
anatofuz
parents:
diff changeset
297 LLVM_DEBUG(llvm::dbgs() << "tag double buffering failed\n";);
anatofuz
parents:
diff changeset
298 return;
anatofuz
parents:
diff changeset
299 }
anatofuz
parents:
diff changeset
300 // If the old tag has no uses or a single dealloc use, remove it.
anatofuz
parents:
diff changeset
301 // (canonicalization handles more complex cases).
anatofuz
parents:
diff changeset
302 if (auto *tagAllocInst = oldTagMemRef.getDefiningOp()) {
anatofuz
parents:
diff changeset
303 if (oldTagMemRef.use_empty()) {
anatofuz
parents:
diff changeset
304 tagAllocInst->erase();
anatofuz
parents:
diff changeset
305 } else if (oldTagMemRef.hasOneUse()) {
anatofuz
parents:
diff changeset
306 if (auto dealloc = dyn_cast<DeallocOp>(*oldTagMemRef.user_begin())) {
anatofuz
parents:
diff changeset
307 dealloc.erase();
anatofuz
parents:
diff changeset
308 tagAllocInst->erase();
anatofuz
parents:
diff changeset
309 }
anatofuz
parents:
diff changeset
310 }
anatofuz
parents:
diff changeset
311 }
anatofuz
parents:
diff changeset
312 }
anatofuz
parents:
diff changeset
313
anatofuz
parents:
diff changeset
314 // Double buffering would have invalidated all the old DMA start/wait insts.
anatofuz
parents:
diff changeset
315 startWaitPairs.clear();
anatofuz
parents:
diff changeset
316 findMatchingStartFinishInsts(forOp, startWaitPairs);
anatofuz
parents:
diff changeset
317
anatofuz
parents:
diff changeset
318 // Store shift for operation for later lookup for AffineApplyOp's.
anatofuz
parents:
diff changeset
319 DenseMap<Operation *, unsigned> instShiftMap;
anatofuz
parents:
diff changeset
320 for (auto &pair : startWaitPairs) {
anatofuz
parents:
diff changeset
321 auto *dmaStartInst = pair.first;
anatofuz
parents:
diff changeset
322 assert(isa<AffineDmaStartOp>(dmaStartInst));
anatofuz
parents:
diff changeset
323 instShiftMap[dmaStartInst] = 0;
anatofuz
parents:
diff changeset
324 // Set shifts for DMA start op's affine operand computation slices to 0.
anatofuz
parents:
diff changeset
325 SmallVector<AffineApplyOp, 4> sliceOps;
anatofuz
parents:
diff changeset
326 mlir::createAffineComputationSlice(dmaStartInst, &sliceOps);
anatofuz
parents:
diff changeset
327 if (!sliceOps.empty()) {
anatofuz
parents:
diff changeset
328 for (auto sliceOp : sliceOps) {
anatofuz
parents:
diff changeset
329 instShiftMap[sliceOp.getOperation()] = 0;
anatofuz
parents:
diff changeset
330 }
anatofuz
parents:
diff changeset
331 } else {
anatofuz
parents:
diff changeset
332 // If a slice wasn't created, the reachable affine.apply op's from its
anatofuz
parents:
diff changeset
333 // operands are the ones that go with it.
anatofuz
parents:
diff changeset
334 SmallVector<Operation *, 4> affineApplyInsts;
anatofuz
parents:
diff changeset
335 SmallVector<Value, 4> operands(dmaStartInst->getOperands());
anatofuz
parents:
diff changeset
336 getReachableAffineApplyOps(operands, affineApplyInsts);
anatofuz
parents:
diff changeset
337 for (auto *op : affineApplyInsts) {
anatofuz
parents:
diff changeset
338 instShiftMap[op] = 0;
anatofuz
parents:
diff changeset
339 }
anatofuz
parents:
diff changeset
340 }
anatofuz
parents:
diff changeset
341 }
anatofuz
parents:
diff changeset
342 // Everything else (including compute ops and dma finish) are shifted by one.
anatofuz
parents:
diff changeset
343 for (auto &op : *forOp.getBody()) {
anatofuz
parents:
diff changeset
344 if (instShiftMap.find(&op) == instShiftMap.end()) {
anatofuz
parents:
diff changeset
345 instShiftMap[&op] = 1;
anatofuz
parents:
diff changeset
346 }
anatofuz
parents:
diff changeset
347 }
anatofuz
parents:
diff changeset
348
anatofuz
parents:
diff changeset
349 // Get shifts stored in map.
anatofuz
parents:
diff changeset
350 std::vector<uint64_t> shifts(forOp.getBody()->getOperations().size());
anatofuz
parents:
diff changeset
351 unsigned s = 0;
anatofuz
parents:
diff changeset
352 for (auto &op : *forOp.getBody()) {
anatofuz
parents:
diff changeset
353 assert(instShiftMap.find(&op) != instShiftMap.end());
anatofuz
parents:
diff changeset
354 shifts[s++] = instShiftMap[&op];
anatofuz
parents:
diff changeset
355
anatofuz
parents:
diff changeset
356 // Tagging operations with shifts for debugging purposes.
anatofuz
parents:
diff changeset
357 LLVM_DEBUG({
anatofuz
parents:
diff changeset
358 OpBuilder b(&op);
anatofuz
parents:
diff changeset
359 op.setAttr("shift", b.getI64IntegerAttr(shifts[s - 1]));
anatofuz
parents:
diff changeset
360 });
anatofuz
parents:
diff changeset
361 }
anatofuz
parents:
diff changeset
362
anatofuz
parents:
diff changeset
363 if (!isInstwiseShiftValid(forOp, shifts)) {
anatofuz
parents:
diff changeset
364 // Violates dependences.
anatofuz
parents:
diff changeset
365 LLVM_DEBUG(llvm::dbgs() << "Shifts invalid - unexpected\n";);
anatofuz
parents:
diff changeset
366 return;
anatofuz
parents:
diff changeset
367 }
anatofuz
parents:
diff changeset
368
anatofuz
parents:
diff changeset
369 if (failed(instBodySkew(forOp, shifts))) {
anatofuz
parents:
diff changeset
370 LLVM_DEBUG(llvm::dbgs() << "op body skewing failed - unexpected\n";);
anatofuz
parents:
diff changeset
371 return;
anatofuz
parents:
diff changeset
372 }
anatofuz
parents:
diff changeset
373 }
anatofuz
parents:
diff changeset
374
anatofuz
parents:
diff changeset
375 static PassRegistration<PipelineDataTransfer> pass(
anatofuz
parents:
diff changeset
376 "affine-pipeline-data-transfer",
anatofuz
parents:
diff changeset
377 "Pipeline non-blocking data transfers between explicitly managed levels of "
anatofuz
parents:
diff changeset
378 "the memory hierarchy");