Mercurial > hg > CbC > CbC_llvm
comparison lib/Target/R600/R600ControlFlowFinalizer.cpp @ 77:54457678186b LLVM3.6
LLVM 3.6
author | Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 08 Sep 2014 22:06:00 +0900 |
parents | e4204d083e25 |
children | 60c9769439b8 |
comparison
equal
deleted
inserted
replaced
34:e874dbf0ad9d | 77:54457678186b |
---|---|
10 /// \file | 10 /// \file |
11 /// This pass turns all control flow pseudo instructions into native ones, | 11 /// This pass turns all control flow pseudo instructions into native ones, |
12 /// computing their address on the fly; it also sets STACK_SIZE info. | 12 /// computing their address on the fly; it also sets STACK_SIZE info. |
13 //===----------------------------------------------------------------------===// | 13 //===----------------------------------------------------------------------===// |
14 | 14 |
15 #define DEBUG_TYPE "r600cf" | |
16 #include "llvm/Support/Debug.h" | 15 #include "llvm/Support/Debug.h" |
17 #include "AMDGPU.h" | 16 #include "AMDGPU.h" |
17 #include "AMDGPUSubtarget.h" | |
18 #include "R600Defines.h" | 18 #include "R600Defines.h" |
19 #include "R600InstrInfo.h" | 19 #include "R600InstrInfo.h" |
20 #include "R600MachineFunctionInfo.h" | 20 #include "R600MachineFunctionInfo.h" |
21 #include "R600RegisterInfo.h" | 21 #include "R600RegisterInfo.h" |
22 #include "llvm/CodeGen/MachineFunctionPass.h" | 22 #include "llvm/CodeGen/MachineFunctionPass.h" |
24 #include "llvm/CodeGen/MachineRegisterInfo.h" | 24 #include "llvm/CodeGen/MachineRegisterInfo.h" |
25 #include "llvm/Support/raw_ostream.h" | 25 #include "llvm/Support/raw_ostream.h" |
26 | 26 |
27 using namespace llvm; | 27 using namespace llvm; |
28 | 28 |
29 #define DEBUG_TYPE "r600cf" | |
30 | |
29 namespace { | 31 namespace { |
32 | |
// Models the R600 hardware control-flow stack while CF pseudo instructions
// are finalized, so the pass can compute the STACK_SIZE the shader needs.
struct CFStack {

  // Kinds of items that can occupy space on the stack.  The numeric values
  // are only used to distinguish the cases; they are not hardware encodings.
  enum StackItem {
    ENTRY = 0,                          // A full stack entry.
    SUB_ENTRY = 1,                      // A fractional (sub) entry.
    FIRST_NON_WQM_PUSH = 2,             // First push leaving whole-quad mode.
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 // First non-WQM push when a full
                                        // entry is already on the stack.
  };

  const AMDGPUSubtarget &ST;          // Target info (generation, bugs, ...).
  std::vector<StackItem> BranchStack; // Currently open branch constructs.
  std::vector<StackItem> LoopStack;   // Currently open loops.
  unsigned MaxStackSize;              // High-water mark, in full entries.
  unsigned CurrentEntries;            // Full entries currently pushed.
  unsigned CurrentSubEntries;         // Sub-entries currently pushed.

  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) { }

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};
64 | |
// Returns the current loop nesting depth, i.e. the number of open loops.
unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}
68 | |
69 bool CFStack::branchStackContains(CFStack::StackItem Item) { | |
70 for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(), | |
71 E = BranchStack.end(); I != E; ++I) { | |
72 if (*I == Item) | |
73 return true; | |
74 } | |
75 return false; | |
76 } | |
77 | |
78 bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { | |
79 if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() && | |
80 getLoopDepth() > 1) | |
81 return true; | |
82 | |
83 if (!ST.hasCFAluBug()) | |
84 return false; | |
85 | |
86 switch(Opcode) { | |
87 default: return false; | |
88 case AMDGPU::CF_ALU_PUSH_BEFORE: | |
89 case AMDGPU::CF_ALU_ELSE_AFTER: | |
90 case AMDGPU::CF_ALU_BREAK: | |
91 case AMDGPU::CF_ALU_CONTINUE: | |
92 if (CurrentSubEntries == 0) | |
93 return false; | |
94 if (ST.getWavefrontSize() == 64) { | |
95 // We are being conservative here. We only require this work-around if | |
96 // CurrentSubEntries > 3 && | |
97 // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) | |
98 // | |
99 // We have to be conservative, because we don't know for certain that | |
100 // our stack allocation algorithm for Evergreen/NI is correct. Applying this | |
101 // work-around when CurrentSubEntries > 3 allows us to over-allocate stack | |
102 // resources without any problems. | |
103 return CurrentSubEntries > 3; | |
104 } else { | |
105 assert(ST.getWavefrontSize() == 32); | |
106 // We are being conservative here. We only require the work-around if | |
107 // CurrentSubEntries > 7 && | |
108 // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) | |
109 // See the comment on the wavefront size == 64 case for why we are | |
110 // being conservative. | |
111 return CurrentSubEntries > 7; | |
112 } | |
113 } | |
114 } | |
115 | |
116 unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { | |
117 switch(Item) { | |
118 default: | |
119 return 0; | |
120 case CFStack::FIRST_NON_WQM_PUSH: | |
121 assert(!ST.hasCaymanISA()); | |
122 if (ST.getGeneration() <= AMDGPUSubtarget::R700) { | |
123 // +1 For the push operation. | |
124 // +2 Extra space required. | |
125 return 3; | |
126 } else { | |
127 // Some documentation says that this is not necessary on Evergreen, | |
128 // but experimentation has show that we need to allocate 1 extra | |
129 // sub-entry for the first non-WQM push. | |
130 // +1 For the push operation. | |
131 // +1 Extra space required. | |
132 return 2; | |
133 } | |
134 case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: | |
135 assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN); | |
136 // +1 For the push operation. | |
137 // +1 Extra space required. | |
138 return 2; | |
139 case CFStack::SUB_ENTRY: | |
140 return 1; | |
141 } | |
142 } | |
143 | |
144 void CFStack::updateMaxStackSize() { | |
145 unsigned CurrentStackSize = CurrentEntries + | |
146 (RoundUpToAlignment(CurrentSubEntries, 4) / 4); | |
147 MaxStackSize = std::max(CurrentStackSize, MaxStackSize); | |
148 } | |
149 | |
150 void CFStack::pushBranch(unsigned Opcode, bool isWQM) { | |
151 CFStack::StackItem Item = CFStack::ENTRY; | |
152 switch(Opcode) { | |
153 case AMDGPU::CF_PUSH_EG: | |
154 case AMDGPU::CF_ALU_PUSH_BEFORE: | |
155 if (!isWQM) { | |
156 if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) | |
157 Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI | |
158 // See comment in | |
159 // CFStack::getSubEntrySize() | |
160 else if (CurrentEntries > 0 && | |
161 ST.getGeneration() > AMDGPUSubtarget::EVERGREEN && | |
162 !ST.hasCaymanISA() && | |
163 !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) | |
164 Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; | |
165 else | |
166 Item = CFStack::SUB_ENTRY; | |
167 } else | |
168 Item = CFStack::ENTRY; | |
169 break; | |
170 } | |
171 BranchStack.push_back(Item); | |
172 if (Item == CFStack::ENTRY) | |
173 CurrentEntries++; | |
174 else | |
175 CurrentSubEntries += getSubEntrySize(Item); | |
176 updateMaxStackSize(); | |
177 } | |
178 | |
// Records an opened loop; loops always occupy a full stack entry.
void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}
184 | |
185 void CFStack::popBranch() { | |
186 CFStack::StackItem Top = BranchStack.back(); | |
187 if (Top == CFStack::ENTRY) | |
188 CurrentEntries--; | |
189 else | |
190 CurrentSubEntries-= getSubEntrySize(Top); | |
191 BranchStack.pop_back(); | |
192 } | |
193 | |
// Records a closed loop, releasing its full stack entry.
void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}
30 | 198 |
31 class R600ControlFlowFinalizer : public MachineFunctionPass { | 199 class R600ControlFlowFinalizer : public MachineFunctionPass { |
32 | 200 |
33 private: | 201 private: |
34 typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile; | 202 typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile; |
298 MachineInstr *MI = *It; | 466 MachineInstr *MI = *It; |
299 CounterPropagateAddr(MI, Addr); | 467 CounterPropagateAddr(MI, Addr); |
300 } | 468 } |
301 } | 469 } |
302 | 470 |
303 unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const { | |
304 switch (ST.getGeneration()) { | |
305 case AMDGPUSubtarget::R600: | |
306 case AMDGPUSubtarget::R700: | |
307 if (hasPush) | |
308 StackSubEntry += 2; | |
309 break; | |
310 case AMDGPUSubtarget::EVERGREEN: | |
311 if (hasPush) | |
312 StackSubEntry ++; | |
313 case AMDGPUSubtarget::NORTHERN_ISLANDS: | |
314 StackSubEntry += 2; | |
315 break; | |
316 default: llvm_unreachable("Not a VLIW4/VLIW5 GPU"); | |
317 } | |
318 return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4 | |
319 } | |
320 | |
321 public: | 471 public: |
322 R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), | 472 R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), |
323 TII (0), TRI(0), | 473 TII (nullptr), TRI(nullptr), |
324 ST(tm.getSubtarget<AMDGPUSubtarget>()) { | 474 ST(tm.getSubtarget<AMDGPUSubtarget>()) { |
325 const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>(); | 475 const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>(); |
326 MaxFetchInst = ST.getTexVTXClauseSize(); | 476 MaxFetchInst = ST.getTexVTXClauseSize(); |
327 } | 477 } |
328 | 478 |
329 virtual bool runOnMachineFunction(MachineFunction &MF) { | 479 bool runOnMachineFunction(MachineFunction &MF) override { |
330 TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); | 480 TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); |
331 TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo()); | 481 TRI = static_cast<const R600RegisterInfo *>( |
332 | 482 MF.getSubtarget().getRegisterInfo()); |
333 unsigned MaxStack = 0; | 483 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); |
334 unsigned CurrentStack = 0; | 484 |
335 unsigned CurrentLoopDepth = 0; | 485 CFStack CFStack(ST, MFI->getShaderType()); |
336 bool HasPush = false; | |
337 for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; | 486 for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; |
338 ++MB) { | 487 ++MB) { |
339 MachineBasicBlock &MBB = *MB; | 488 MachineBasicBlock &MBB = *MB; |
340 unsigned CfCount = 0; | 489 unsigned CfCount = 0; |
341 std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; | 490 std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; |
342 std::vector<MachineInstr * > IfThenElseStack; | 491 std::vector<MachineInstr * > IfThenElseStack; |
343 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); | 492 if (MFI->getShaderType() == ShaderType::VERTEX) { |
344 if (MFI->ShaderType == 1) { | |
345 BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), | 493 BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), |
346 getHWInstrDesc(CF_CALL_FS)); | 494 getHWInstrDesc(CF_CALL_FS)); |
347 CfCount++; | 495 CfCount++; |
348 MaxStack = 1; | |
349 } | 496 } |
350 std::vector<ClauseFile> FetchClauses, AluClauses; | 497 std::vector<ClauseFile> FetchClauses, AluClauses; |
351 std::vector<MachineInstr *> LastAlu(1); | 498 std::vector<MachineInstr *> LastAlu(1); |
352 std::vector<MachineInstr *> ToPopAfter; | 499 std::vector<MachineInstr *> ToPopAfter; |
353 | 500 |
355 I != E;) { | 502 I != E;) { |
356 if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { | 503 if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { |
357 DEBUG(dbgs() << CfCount << ":"; I->dump();); | 504 DEBUG(dbgs() << CfCount << ":"; I->dump();); |
358 FetchClauses.push_back(MakeFetchClause(MBB, I)); | 505 FetchClauses.push_back(MakeFetchClause(MBB, I)); |
359 CfCount++; | 506 CfCount++; |
507 LastAlu.back() = nullptr; | |
360 continue; | 508 continue; |
361 } | 509 } |
362 | 510 |
363 MachineBasicBlock::iterator MI = I; | 511 MachineBasicBlock::iterator MI = I; |
364 if (MI->getOpcode() != AMDGPU::ENDIF) | 512 if (MI->getOpcode() != AMDGPU::ENDIF) |
365 LastAlu.back() = 0; | 513 LastAlu.back() = nullptr; |
366 if (MI->getOpcode() == AMDGPU::CF_ALU) | 514 if (MI->getOpcode() == AMDGPU::CF_ALU) |
367 LastAlu.back() = MI; | 515 LastAlu.back() = MI; |
368 I++; | 516 I++; |
517 bool RequiresWorkAround = | |
518 CFStack.requiresWorkAroundForInst(MI->getOpcode()); | |
369 switch (MI->getOpcode()) { | 519 switch (MI->getOpcode()) { |
370 case AMDGPU::CF_ALU_PUSH_BEFORE: | 520 case AMDGPU::CF_ALU_PUSH_BEFORE: |
371 CurrentStack++; | 521 if (RequiresWorkAround) { |
372 MaxStack = std::max(MaxStack, CurrentStack); | 522 DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); |
373 HasPush = true; | 523 BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) |
374 if (ST.hasCaymanISA() && CurrentLoopDepth > 1) { | |
375 BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_CM)) | |
376 .addImm(CfCount + 1) | 524 .addImm(CfCount + 1) |
377 .addImm(1); | 525 .addImm(1); |
378 MI->setDesc(TII->get(AMDGPU::CF_ALU)); | 526 MI->setDesc(TII->get(AMDGPU::CF_ALU)); |
379 CfCount++; | 527 CfCount++; |
380 } | 528 CFStack.pushBranch(AMDGPU::CF_PUSH_EG); |
529 } else | |
530 CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); | |
531 | |
381 case AMDGPU::CF_ALU: | 532 case AMDGPU::CF_ALU: |
382 I = MI; | 533 I = MI; |
383 AluClauses.push_back(MakeALUClause(MBB, I)); | 534 AluClauses.push_back(MakeALUClause(MBB, I)); |
384 DEBUG(dbgs() << CfCount << ":"; MI->dump();); | 535 DEBUG(dbgs() << CfCount << ":"; MI->dump();); |
385 CfCount++; | 536 CfCount++; |
386 break; | 537 break; |
387 case AMDGPU::WHILELOOP: { | 538 case AMDGPU::WHILELOOP: { |
388 CurrentStack+=4; | 539 CFStack.pushLoop(); |
389 CurrentLoopDepth++; | |
390 MaxStack = std::max(MaxStack, CurrentStack); | |
391 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), | 540 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), |
392 getHWInstrDesc(CF_WHILE_LOOP)) | 541 getHWInstrDesc(CF_WHILE_LOOP)) |
393 .addImm(1); | 542 .addImm(1); |
394 std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, | 543 std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, |
395 std::set<MachineInstr *>()); | 544 std::set<MachineInstr *>()); |
398 MI->eraseFromParent(); | 547 MI->eraseFromParent(); |
399 CfCount++; | 548 CfCount++; |
400 break; | 549 break; |
401 } | 550 } |
402 case AMDGPU::ENDLOOP: { | 551 case AMDGPU::ENDLOOP: { |
403 CurrentStack-=4; | 552 CFStack.popLoop(); |
404 CurrentLoopDepth--; | |
405 std::pair<unsigned, std::set<MachineInstr *> > Pair = | 553 std::pair<unsigned, std::set<MachineInstr *> > Pair = |
406 LoopStack.back(); | 554 LoopStack.back(); |
407 LoopStack.pop_back(); | 555 LoopStack.pop_back(); |
408 CounterPropagateAddr(Pair.second, CfCount); | 556 CounterPropagateAddr(Pair.second, CfCount); |
409 BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) | 557 BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) |
411 MI->eraseFromParent(); | 559 MI->eraseFromParent(); |
412 CfCount++; | 560 CfCount++; |
413 break; | 561 break; |
414 } | 562 } |
415 case AMDGPU::IF_PREDICATE_SET: { | 563 case AMDGPU::IF_PREDICATE_SET: { |
416 LastAlu.push_back(0); | 564 LastAlu.push_back(nullptr); |
417 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), | 565 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), |
418 getHWInstrDesc(CF_JUMP)) | 566 getHWInstrDesc(CF_JUMP)) |
419 .addImm(0) | 567 .addImm(0) |
420 .addImm(0); | 568 .addImm(0); |
421 IfThenElseStack.push_back(MIb); | 569 IfThenElseStack.push_back(MIb); |
437 MI->eraseFromParent(); | 585 MI->eraseFromParent(); |
438 CfCount++; | 586 CfCount++; |
439 break; | 587 break; |
440 } | 588 } |
441 case AMDGPU::ENDIF: { | 589 case AMDGPU::ENDIF: { |
442 CurrentStack--; | 590 CFStack.popBranch(); |
443 if (LastAlu.back()) { | 591 if (LastAlu.back()) { |
444 ToPopAfter.push_back(LastAlu.back()); | 592 ToPopAfter.push_back(LastAlu.back()); |
445 } else { | 593 } else { |
446 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), | 594 MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), |
447 getHWInstrDesc(CF_POP)) | 595 getHWInstrDesc(CF_POP)) |
512 .addImm(Alu->getOperand(6).getImm()) | 660 .addImm(Alu->getOperand(6).getImm()) |
513 .addImm(Alu->getOperand(7).getImm()) | 661 .addImm(Alu->getOperand(7).getImm()) |
514 .addImm(Alu->getOperand(8).getImm()); | 662 .addImm(Alu->getOperand(8).getImm()); |
515 Alu->eraseFromParent(); | 663 Alu->eraseFromParent(); |
516 } | 664 } |
517 MFI->StackSize = getHWStackSize(MaxStack, HasPush); | 665 MFI->StackSize = CFStack.MaxStackSize; |
518 } | 666 } |
519 | 667 |
520 return false; | 668 return false; |
521 } | 669 } |
522 | 670 |
523 const char *getPassName() const { | 671 const char *getPassName() const override { |
524 return "R600 Control Flow Finalizer Pass"; | 672 return "R600 Control Flow Finalizer Pass"; |
525 } | 673 } |
526 }; | 674 }; |
527 | 675 |
528 char R600ControlFlowFinalizer::ID = 0; | 676 char R600ControlFlowFinalizer::ID = 0; |