comparison lib/Target/R600/R600ControlFlowFinalizer.cpp @ 77:54457678186b LLVM3.6

LLVM 3.6
author Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
date Mon, 08 Sep 2014 22:06:00 +0900
parents e4204d083e25
children 60c9769439b8
comparison
equal deleted inserted replaced
34:e874dbf0ad9d 77:54457678186b
10 /// \file 10 /// \file
11 /// This pass turns all control flow pseudo instructions into native ones, 11 /// This pass turns all control flow pseudo instructions into native ones,
12 /// computing their addresses on the fly; it also sets the STACK_SIZE info. 12 /// computing their addresses on the fly; it also sets the STACK_SIZE info.
13 //===----------------------------------------------------------------------===// 13 //===----------------------------------------------------------------------===//
14 14
15 #define DEBUG_TYPE "r600cf"
16 #include "llvm/Support/Debug.h" 15 #include "llvm/Support/Debug.h"
17 #include "AMDGPU.h" 16 #include "AMDGPU.h"
17 #include "AMDGPUSubtarget.h"
18 #include "R600Defines.h" 18 #include "R600Defines.h"
19 #include "R600InstrInfo.h" 19 #include "R600InstrInfo.h"
20 #include "R600MachineFunctionInfo.h" 20 #include "R600MachineFunctionInfo.h"
21 #include "R600RegisterInfo.h" 21 #include "R600RegisterInfo.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h" 22 #include "llvm/CodeGen/MachineFunctionPass.h"
24 #include "llvm/CodeGen/MachineRegisterInfo.h" 24 #include "llvm/CodeGen/MachineRegisterInfo.h"
25 #include "llvm/Support/raw_ostream.h" 25 #include "llvm/Support/raw_ostream.h"
26 26
27 using namespace llvm; 27 using namespace llvm;
28 28
29 #define DEBUG_TYPE "r600cf"
30
29 namespace { 31 namespace {
32
/// Bookkeeping for the control-flow stack while instructions are walked.
/// It records the items pushed for branches and loops, keeps running counts
/// of full entries and sub-entries, and remembers the largest combined stack
/// size seen so far (MaxStackSize).
33 struct CFStack {
34
// Kind of item recorded on BranchStack / LoopStack. ENTRY items are counted
// in CurrentEntries; every other kind is counted in CurrentSubEntries using
// the size returned by getSubEntrySize().
35 enum StackItem {
36 ENTRY = 0,
37 SUB_ENTRY = 1,
38 FIRST_NON_WQM_PUSH = 2,
39 FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
40 };
41
// Subtarget queried for ISA, generation, wavefront size and the CF ALU bug.
42 const AMDGPUSubtarget &ST;
// Items pushed by pushBranch() and not yet removed by popBranch().
43 std::vector<StackItem> BranchStack;
// Items pushed by pushLoop() and not yet removed by popLoop().
44 std::vector<StackItem> LoopStack;
// High-water mark maintained by updateMaxStackSize().
45 unsigned MaxStackSize;
// Number of ENTRY items currently accounted for (branches and loops).
46 unsigned CurrentEntries;
// Sum of getSubEntrySize() over the non-ENTRY branch items currently pushed.
47 unsigned CurrentSubEntries;
48
// MaxStackSize starts at 1 for vertex shaders and 0 otherwise; both current
// counters start at 0.
49 CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
50 // We need to reserve a stack entry for CALL_FS in vertex shaders.
51 MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
52 CurrentEntries(0), CurrentSubEntries(0) { }
53
54 unsigned getLoopDepth();
55 bool branchStackContains(CFStack::StackItem);
56 bool requiresWorkAroundForInst(unsigned Opcode);
57 unsigned getSubEntrySize(CFStack::StackItem Item);
58 void updateMaxStackSize();
59 void pushBranch(unsigned Opcode, bool isWQM = false);
60 void pushLoop();
61 void popBranch();
62 void popLoop();
63 };
64
/// Returns the number of items currently held in LoopStack.
65 unsigned CFStack::getLoopDepth() {
66 return LoopStack.size();
67 }
68
/// Returns true if any item currently in BranchStack equals \p Item
/// (linear scan from the first to the last element), false otherwise.
69 bool CFStack::branchStackContains(CFStack::StackItem Item) {
70 for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
71 E = BranchStack.end(); I != E; ++I) {
72 if (*I == Item)
73 return true;
74 }
75 return false;
76 }
77
/// Decides whether the instruction with opcode \p Opcode needs a work-around,
/// based on the current stack state.
///
/// Returns true for CF_ALU_PUSH_BEFORE on a Cayman-ISA subtarget when the loop
/// depth is greater than 1. Otherwise, if the subtarget does not report the
/// CF ALU bug, returns false. With the bug present, only the four opcodes
/// listed in the switch can return true, and only when CurrentSubEntries
/// exceeds a wavefront-size-dependent threshold (3 for size 64, 7 for size 32).
78 bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
79 if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
80 getLoopDepth() > 1)
81 return true;
82
83 if (!ST.hasCFAluBug())
84 return false;
85
86 switch(Opcode) {
87 default: return false;
88 case AMDGPU::CF_ALU_PUSH_BEFORE:
89 case AMDGPU::CF_ALU_ELSE_AFTER:
90 case AMDGPU::CF_ALU_BREAK:
91 case AMDGPU::CF_ALU_CONTINUE:
// No sub-entries in use: nothing to work around.
92 if (CurrentSubEntries == 0)
93 return false;
94 if (ST.getWavefrontSize() == 64) {
95 // We are being conservative here. We only require this work-around if
96 // CurrentSubEntries > 3 &&
97 // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
98 //
99 // We have to be conservative, because we don't know for certain that
100 // our stack allocation algorithm for Evergreen/NI is correct. Applying this
101 // work-around when CurrentSubEntries > 3 allows us to over-allocate stack
102 // resources without any problems.
103 return CurrentSubEntries > 3;
104 } else {
// Any wavefront size other than 64 is asserted to be 32.
105 assert(ST.getWavefrontSize() == 32);
106 // We are being conservative here. We only require the work-around if
107 // CurrentSubEntries > 7 &&
108 // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
109 // See the comment on the wavefront size == 64 case for why we are
110 // being conservative.
111 return CurrentSubEntries > 7;
112 }
113 }
114 }
115
/// Returns how many sub-entries \p Item contributes to CurrentSubEntries:
/// 1 for SUB_ENTRY, 2 for FIRST_NON_WQM_PUSH_W_FULL_ENTRY, 3 or 2 for
/// FIRST_NON_WQM_PUSH depending on the subtarget generation, and 0 for any
/// other item (including ENTRY, which is counted as a full entry instead).
116 unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
117 switch(Item) {
118 default:
119 return 0;
120 case CFStack::FIRST_NON_WQM_PUSH:
// This item kind is never expected on a Cayman-ISA subtarget.
121 assert(!ST.hasCaymanISA());
122 if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
123 // +1 For the push operation.
124 // +2 Extra space required.
125 return 3;
126 } else {
127 // Some documentation says that this is not necessary on Evergreen,
128 // but experimentation has shown that we need to allocate 1 extra
129 // sub-entry for the first non-WQM push.
130 // +1 For the push operation.
131 // +1 Extra space required.
132 return 2;
133 }
134 case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
// This item kind is only expected on Evergreen or later generations.
135 assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
136 // +1 For the push operation.
137 // +1 Extra space required.
138 return 2;
139 case CFStack::SUB_ENTRY:
140 return 1;
141 }
142 }
143
/// Recomputes the current stack size and raises MaxStackSize to it if it is
/// larger. The current size is CurrentEntries plus the sub-entry count passed
/// through RoundUpToAlignment(..., 4) and divided by 4 (presumably: four
/// sub-entries per full entry, rounded up -- confirm against the helper).
144 void CFStack::updateMaxStackSize() {
145 unsigned CurrentStackSize = CurrentEntries +
146 (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
147 MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
148 }
149
/// Records a branch push for \p Opcode and updates the counters.
///
/// The pushed item defaults to ENTRY and stays ENTRY for any opcode other
/// than CF_PUSH_EG / CF_ALU_PUSH_BEFORE, and for those two when \p isWQM is
/// true. For a non-WQM push the item is chosen in this order:
///  1. FIRST_NON_WQM_PUSH, if the subtarget is not Cayman ISA and no such
///     item is on BranchStack yet;
///  2. FIRST_NON_WQM_PUSH_W_FULL_ENTRY, if at least one full entry is in use,
///     the generation is above EVERGREEN, the subtarget is not Cayman ISA and
///     no such item is on BranchStack yet;
///  3. SUB_ENTRY otherwise.
/// ENTRY increments CurrentEntries; other items add getSubEntrySize(Item) to
/// CurrentSubEntries. MaxStackSize is refreshed afterwards.
150 void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
151 CFStack::StackItem Item = CFStack::ENTRY;
152 switch(Opcode) {
153 case AMDGPU::CF_PUSH_EG:
154 case AMDGPU::CF_ALU_PUSH_BEFORE:
155 if (!isWQM) {
156 if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
157 Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI
158 // See comment in
159 // CFStack::getSubEntrySize()
160 else if (CurrentEntries > 0 &&
161 ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
162 !ST.hasCaymanISA() &&
163 !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
164 Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
165 else
166 Item = CFStack::SUB_ENTRY;
167 } else
168 Item = CFStack::ENTRY;
169 break;
170 }
171 BranchStack.push_back(Item);
172 if (Item == CFStack::ENTRY)
173 CurrentEntries++;
174 else
175 CurrentSubEntries += getSubEntrySize(Item);
176 updateMaxStackSize();
177 }
178
/// Records entering a loop: pushes an ENTRY item on LoopStack, counts one
/// more full entry and refreshes MaxStackSize.
179 void CFStack::pushLoop() {
180 LoopStack.push_back(CFStack::ENTRY);
181 CurrentEntries++;
182 updateMaxStackSize();
183 }
184
/// Undoes the most recent pushBranch(): reverses the counter change made for
/// the top item of BranchStack, then removes that item. MaxStackSize is not
/// lowered. No emptiness check is performed here, so BranchStack must be
/// non-empty when this is called.
185 void CFStack::popBranch() {
186 CFStack::StackItem Top = BranchStack.back();
187 if (Top == CFStack::ENTRY)
188 CurrentEntries--;
189 else
190 CurrentSubEntries-= getSubEntrySize(Top);
191 BranchStack.pop_back();
192 }
193
/// Undoes the most recent pushLoop(): counts one fewer full entry and removes
/// the top item of LoopStack. No emptiness check is performed here.
194 void CFStack::popLoop() {
195 CurrentEntries--;
196 LoopStack.pop_back();
197 }
30 198
31 class R600ControlFlowFinalizer : public MachineFunctionPass { 199 class R600ControlFlowFinalizer : public MachineFunctionPass {
32 200
33 private: 201 private:
34 typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile; 202 typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;
298 MachineInstr *MI = *It; 466 MachineInstr *MI = *It;
299 CounterPropagateAddr(MI, Addr); 467 CounterPropagateAddr(MI, Addr);
300 } 468 }
301 } 469 }
302 470
// NOTE(review): these rows carry a single line number while neighbouring rows
// carry two, so this function appears to exist on only one side of the
// comparison -- confirm before relying on it.
//
// Converts a sub-entry count into a stack size: adjusts \p StackSubEntry by a
// generation-specific amount, then returns the result divided by 4, rounded up.
// Adjustment: R600/R700 add 2 when \p hasPush is set; EVERGREEN adds 1 when
// \p hasPush is set and then also takes the NORTHERN_ISLANDS adjustment;
// NORTHERN_ISLANDS adds 2. Any other generation hits llvm_unreachable.
303 unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
304 switch (ST.getGeneration()) {
305 case AMDGPUSubtarget::R600:
306 case AMDGPUSubtarget::R700:
307 if (hasPush)
308 StackSubEntry += 2;
309 break;
310 case AMDGPUSubtarget::EVERGREEN:
311 if (hasPush)
312 StackSubEntry ++;
// No break above: the EVERGREEN case falls through into NORTHERN_ISLANDS.
313 case AMDGPUSubtarget::NORTHERN_ISLANDS:
314 StackSubEntry += 2;
315 break;
316 default: llvm_unreachable("Not a VLIW4/VLIW5 GPU");
317 }
318 return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4
319 }
320
321 public: 471 public:
322 R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), 472 R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
323 TII (0), TRI(0), 473 TII (nullptr), TRI(nullptr),
324 ST(tm.getSubtarget<AMDGPUSubtarget>()) { 474 ST(tm.getSubtarget<AMDGPUSubtarget>()) {
325 const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>(); 475 const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
326 MaxFetchInst = ST.getTexVTXClauseSize(); 476 MaxFetchInst = ST.getTexVTXClauseSize();
327 } 477 }
328 478
329 virtual bool runOnMachineFunction(MachineFunction &MF) { 479 bool runOnMachineFunction(MachineFunction &MF) override {
330 TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); 480 TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo());
331 TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo()); 481 TRI = static_cast<const R600RegisterInfo *>(
332 482 MF.getSubtarget().getRegisterInfo());
333 unsigned MaxStack = 0; 483 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
334 unsigned CurrentStack = 0; 484
335 unsigned CurrentLoopDepth = 0; 485 CFStack CFStack(ST, MFI->getShaderType());
336 bool HasPush = false;
337 for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; 486 for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
338 ++MB) { 487 ++MB) {
339 MachineBasicBlock &MBB = *MB; 488 MachineBasicBlock &MBB = *MB;
340 unsigned CfCount = 0; 489 unsigned CfCount = 0;
341 std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; 490 std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
342 std::vector<MachineInstr * > IfThenElseStack; 491 std::vector<MachineInstr * > IfThenElseStack;
343 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); 492 if (MFI->getShaderType() == ShaderType::VERTEX) {
344 if (MFI->ShaderType == 1) {
345 BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), 493 BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
346 getHWInstrDesc(CF_CALL_FS)); 494 getHWInstrDesc(CF_CALL_FS));
347 CfCount++; 495 CfCount++;
348 MaxStack = 1;
349 } 496 }
350 std::vector<ClauseFile> FetchClauses, AluClauses; 497 std::vector<ClauseFile> FetchClauses, AluClauses;
351 std::vector<MachineInstr *> LastAlu(1); 498 std::vector<MachineInstr *> LastAlu(1);
352 std::vector<MachineInstr *> ToPopAfter; 499 std::vector<MachineInstr *> ToPopAfter;
353 500
355 I != E;) { 502 I != E;) {
356 if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { 503 if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
357 DEBUG(dbgs() << CfCount << ":"; I->dump();); 504 DEBUG(dbgs() << CfCount << ":"; I->dump(););
358 FetchClauses.push_back(MakeFetchClause(MBB, I)); 505 FetchClauses.push_back(MakeFetchClause(MBB, I));
359 CfCount++; 506 CfCount++;
507 LastAlu.back() = nullptr;
360 continue; 508 continue;
361 } 509 }
362 510
363 MachineBasicBlock::iterator MI = I; 511 MachineBasicBlock::iterator MI = I;
364 if (MI->getOpcode() != AMDGPU::ENDIF) 512 if (MI->getOpcode() != AMDGPU::ENDIF)
365 LastAlu.back() = 0; 513 LastAlu.back() = nullptr;
366 if (MI->getOpcode() == AMDGPU::CF_ALU) 514 if (MI->getOpcode() == AMDGPU::CF_ALU)
367 LastAlu.back() = MI; 515 LastAlu.back() = MI;
368 I++; 516 I++;
517 bool RequiresWorkAround =
518 CFStack.requiresWorkAroundForInst(MI->getOpcode());
369 switch (MI->getOpcode()) { 519 switch (MI->getOpcode()) {
370 case AMDGPU::CF_ALU_PUSH_BEFORE: 520 case AMDGPU::CF_ALU_PUSH_BEFORE:
371 CurrentStack++; 521 if (RequiresWorkAround) {
372 MaxStack = std::max(MaxStack, CurrentStack); 522 DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
373 HasPush = true; 523 BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
374 if (ST.hasCaymanISA() && CurrentLoopDepth > 1) {
375 BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_CM))
376 .addImm(CfCount + 1) 524 .addImm(CfCount + 1)
377 .addImm(1); 525 .addImm(1);
378 MI->setDesc(TII->get(AMDGPU::CF_ALU)); 526 MI->setDesc(TII->get(AMDGPU::CF_ALU));
379 CfCount++; 527 CfCount++;
380 } 528 CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
529 } else
530 CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
531
381 case AMDGPU::CF_ALU: 532 case AMDGPU::CF_ALU:
382 I = MI; 533 I = MI;
383 AluClauses.push_back(MakeALUClause(MBB, I)); 534 AluClauses.push_back(MakeALUClause(MBB, I));
384 DEBUG(dbgs() << CfCount << ":"; MI->dump();); 535 DEBUG(dbgs() << CfCount << ":"; MI->dump(););
385 CfCount++; 536 CfCount++;
386 break; 537 break;
387 case AMDGPU::WHILELOOP: { 538 case AMDGPU::WHILELOOP: {
388 CurrentStack+=4; 539 CFStack.pushLoop();
389 CurrentLoopDepth++;
390 MaxStack = std::max(MaxStack, CurrentStack);
391 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), 540 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
392 getHWInstrDesc(CF_WHILE_LOOP)) 541 getHWInstrDesc(CF_WHILE_LOOP))
393 .addImm(1); 542 .addImm(1);
394 std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, 543 std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
395 std::set<MachineInstr *>()); 544 std::set<MachineInstr *>());
398 MI->eraseFromParent(); 547 MI->eraseFromParent();
399 CfCount++; 548 CfCount++;
400 break; 549 break;
401 } 550 }
402 case AMDGPU::ENDLOOP: { 551 case AMDGPU::ENDLOOP: {
403 CurrentStack-=4; 552 CFStack.popLoop();
404 CurrentLoopDepth--;
405 std::pair<unsigned, std::set<MachineInstr *> > Pair = 553 std::pair<unsigned, std::set<MachineInstr *> > Pair =
406 LoopStack.back(); 554 LoopStack.back();
407 LoopStack.pop_back(); 555 LoopStack.pop_back();
408 CounterPropagateAddr(Pair.second, CfCount); 556 CounterPropagateAddr(Pair.second, CfCount);
409 BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) 557 BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
411 MI->eraseFromParent(); 559 MI->eraseFromParent();
412 CfCount++; 560 CfCount++;
413 break; 561 break;
414 } 562 }
415 case AMDGPU::IF_PREDICATE_SET: { 563 case AMDGPU::IF_PREDICATE_SET: {
416 LastAlu.push_back(0); 564 LastAlu.push_back(nullptr);
417 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), 565 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
418 getHWInstrDesc(CF_JUMP)) 566 getHWInstrDesc(CF_JUMP))
419 .addImm(0) 567 .addImm(0)
420 .addImm(0); 568 .addImm(0);
421 IfThenElseStack.push_back(MIb); 569 IfThenElseStack.push_back(MIb);
437 MI->eraseFromParent(); 585 MI->eraseFromParent();
438 CfCount++; 586 CfCount++;
439 break; 587 break;
440 } 588 }
441 case AMDGPU::ENDIF: { 589 case AMDGPU::ENDIF: {
442 CurrentStack--; 590 CFStack.popBranch();
443 if (LastAlu.back()) { 591 if (LastAlu.back()) {
444 ToPopAfter.push_back(LastAlu.back()); 592 ToPopAfter.push_back(LastAlu.back());
445 } else { 593 } else {
446 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), 594 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
447 getHWInstrDesc(CF_POP)) 595 getHWInstrDesc(CF_POP))
512 .addImm(Alu->getOperand(6).getImm()) 660 .addImm(Alu->getOperand(6).getImm())
513 .addImm(Alu->getOperand(7).getImm()) 661 .addImm(Alu->getOperand(7).getImm())
514 .addImm(Alu->getOperand(8).getImm()); 662 .addImm(Alu->getOperand(8).getImm());
515 Alu->eraseFromParent(); 663 Alu->eraseFromParent();
516 } 664 }
517 MFI->StackSize = getHWStackSize(MaxStack, HasPush); 665 MFI->StackSize = CFStack.MaxStackSize;
518 } 666 }
519 667
520 return false; 668 return false;
521 } 669 }
522 670
523 const char *getPassName() const { 671 const char *getPassName() const override {
524 return "R600 Control Flow Finalizer Pass"; 672 return "R600 Control Flow Finalizer Pass";
525 } 673 }
526 }; 674 };
527 675
528 char R600ControlFlowFinalizer::ID = 0; 676 char R600ControlFlowFinalizer::ID = 0;