83
|
1 //===-- PPCTargetTransformInfo.cpp - PPC specific TTI ---------------------===//
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
2 //
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
3 // The LLVM Compiler Infrastructure
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
4 //
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
5 // This file is distributed under the University of Illinois Open Source
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
6 // License. See LICENSE.TXT for details.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
7 //
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
8 //===----------------------------------------------------------------------===//
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
9
|
83
|
10 #include "PPCTargetTransformInfo.h"
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
11 #include "llvm/Analysis/TargetTransformInfo.h"
|
83
|
12 #include "llvm/CodeGen/BasicTTIImpl.h"
|
77
|
13 #include "llvm/Support/CommandLine.h"
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
14 #include "llvm/Support/Debug.h"
|
77
|
15 #include "llvm/Target/CostTable.h"
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
16 #include "llvm/Target/TargetLowering.h"
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
17 using namespace llvm;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
18
|
77
|
19 #define DEBUG_TYPE "ppctti"
|
|
20
|
|
21 static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
|
|
22 cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
|
|
23
|
100
|
24 // This is currently only used for the data prefetch pass which is only enabled
|
|
25 // for BG/Q by default.
|
|
26 static cl::opt<unsigned>
|
|
27 CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
|
|
28 cl::desc("The loop prefetch cache line size"));
|
|
29
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
30 //===----------------------------------------------------------------------===//
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
31 //
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
32 // PPC cost model.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
33 //
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
34 //===----------------------------------------------------------------------===//
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
35
|
83
|
36 TargetTransformInfo::PopcntSupportKind
|
|
37 PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
38 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
39 if (ST->hasPOPCNTD() && TyWidth <= 64)
|
83
|
40 return TTI::PSK_FastHardware;
|
|
41 return TTI::PSK_Software;
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
42 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
43
|
95
|
44 int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
|
77
|
45 if (DisablePPCConstHoist)
|
83
|
46 return BaseT::getIntImmCost(Imm, Ty);
|
77
|
47
|
|
48 assert(Ty->isIntegerTy());
|
|
49
|
|
50 unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
|
51 if (BitSize == 0)
|
|
52 return ~0U;
|
|
53
|
|
54 if (Imm == 0)
|
83
|
55 return TTI::TCC_Free;
|
77
|
56
|
|
57 if (Imm.getBitWidth() <= 64) {
|
|
58 if (isInt<16>(Imm.getSExtValue()))
|
83
|
59 return TTI::TCC_Basic;
|
77
|
60
|
|
61 if (isInt<32>(Imm.getSExtValue())) {
|
|
62 // A constant that can be materialized using lis.
|
|
63 if ((Imm.getZExtValue() & 0xFFFF) == 0)
|
83
|
64 return TTI::TCC_Basic;
|
77
|
65
|
83
|
66 return 2 * TTI::TCC_Basic;
|
77
|
67 }
|
|
68 }
|
|
69
|
83
|
70 return 4 * TTI::TCC_Basic;
|
77
|
71 }
|
|
72
|
95
|
73 int PPCTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
|
|
74 Type *Ty) {
|
77
|
75 if (DisablePPCConstHoist)
|
83
|
76 return BaseT::getIntImmCost(IID, Idx, Imm, Ty);
|
77
|
77
|
|
78 assert(Ty->isIntegerTy());
|
|
79
|
|
80 unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
|
81 if (BitSize == 0)
|
|
82 return ~0U;
|
|
83
|
|
84 switch (IID) {
|
83
|
85 default:
|
|
86 return TTI::TCC_Free;
|
77
|
87 case Intrinsic::sadd_with_overflow:
|
|
88 case Intrinsic::uadd_with_overflow:
|
|
89 case Intrinsic::ssub_with_overflow:
|
|
90 case Intrinsic::usub_with_overflow:
|
|
91 if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
|
83
|
92 return TTI::TCC_Free;
|
|
93 break;
|
|
94 case Intrinsic::experimental_stackmap:
|
|
95 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
|
|
96 return TTI::TCC_Free;
|
|
97 break;
|
|
98 case Intrinsic::experimental_patchpoint_void:
|
|
99 case Intrinsic::experimental_patchpoint_i64:
|
|
100 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
|
|
101 return TTI::TCC_Free;
|
77
|
102 break;
|
|
103 }
|
83
|
104 return PPCTTIImpl::getIntImmCost(Imm, Ty);
|
77
|
105 }
|
|
106
|
95
|
107 int PPCTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
|
|
108 Type *Ty) {
|
77
|
109 if (DisablePPCConstHoist)
|
83
|
110 return BaseT::getIntImmCost(Opcode, Idx, Imm, Ty);
|
77
|
111
|
|
112 assert(Ty->isIntegerTy());
|
|
113
|
|
114 unsigned BitSize = Ty->getPrimitiveSizeInBits();
|
|
115 if (BitSize == 0)
|
|
116 return ~0U;
|
|
117
|
|
118 unsigned ImmIdx = ~0U;
|
|
119 bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
|
|
120 ZeroFree = false;
|
|
121 switch (Opcode) {
|
83
|
122 default:
|
|
123 return TTI::TCC_Free;
|
77
|
124 case Instruction::GetElementPtr:
|
|
125 // Always hoist the base address of a GetElementPtr. This prevents the
|
|
126 // creation of new constants for every base constant that gets constant
|
|
127 // folded with the offset.
|
|
128 if (Idx == 0)
|
83
|
129 return 2 * TTI::TCC_Basic;
|
|
130 return TTI::TCC_Free;
|
77
|
131 case Instruction::And:
|
|
132 RunFree = true; // (for the rotate-and-mask instructions)
|
|
133 // Fallthrough...
|
|
134 case Instruction::Add:
|
|
135 case Instruction::Or:
|
|
136 case Instruction::Xor:
|
|
137 ShiftedFree = true;
|
|
138 // Fallthrough...
|
|
139 case Instruction::Sub:
|
|
140 case Instruction::Mul:
|
|
141 case Instruction::Shl:
|
|
142 case Instruction::LShr:
|
|
143 case Instruction::AShr:
|
|
144 ImmIdx = 1;
|
|
145 break;
|
|
146 case Instruction::ICmp:
|
|
147 UnsignedFree = true;
|
|
148 ImmIdx = 1;
|
|
149 // Fallthrough... (zero comparisons can use record-form instructions)
|
|
150 case Instruction::Select:
|
|
151 ZeroFree = true;
|
|
152 break;
|
|
153 case Instruction::PHI:
|
|
154 case Instruction::Call:
|
|
155 case Instruction::Ret:
|
|
156 case Instruction::Load:
|
|
157 case Instruction::Store:
|
|
158 break;
|
|
159 }
|
|
160
|
|
161 if (ZeroFree && Imm == 0)
|
83
|
162 return TTI::TCC_Free;
|
77
|
163
|
|
164 if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
|
|
165 if (isInt<16>(Imm.getSExtValue()))
|
83
|
166 return TTI::TCC_Free;
|
77
|
167
|
|
168 if (RunFree) {
|
|
169 if (Imm.getBitWidth() <= 32 &&
|
|
170 (isShiftedMask_32(Imm.getZExtValue()) ||
|
|
171 isShiftedMask_32(~Imm.getZExtValue())))
|
83
|
172 return TTI::TCC_Free;
|
77
|
173
|
|
174 if (ST->isPPC64() &&
|
|
175 (isShiftedMask_64(Imm.getZExtValue()) ||
|
|
176 isShiftedMask_64(~Imm.getZExtValue())))
|
83
|
177 return TTI::TCC_Free;
|
77
|
178 }
|
|
179
|
|
180 if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
|
83
|
181 return TTI::TCC_Free;
|
77
|
182
|
|
183 if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
|
83
|
184 return TTI::TCC_Free;
|
77
|
185 }
|
|
186
|
83
|
187 return PPCTTIImpl::getIntImmCost(Imm, Ty);
|
77
|
188 }
|
|
189
|
83
|
190 void PPCTTIImpl::getUnrollingPreferences(Loop *L,
|
|
191 TTI::UnrollingPreferences &UP) {
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
192 if (ST->getDarwinDirective() == PPC::DIR_A2) {
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
193 // The A2 is in-order with a deep pipeline, and concatenation unrolling
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
194 // helps expose latency-hiding opportunities to the instruction scheduler.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
195 UP.Partial = UP.Runtime = true;
|
95
|
196
|
|
197 // We unroll a lot on the A2 (hundreds of instructions), and the benefits
|
|
198 // often outweigh the cost of a division to compute the trip count.
|
|
199 UP.AllowExpensiveTripCount = true;
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
200 }
|
83
|
201
|
|
202 BaseT::getUnrollingPreferences(L, UP);
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
203 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
204
|
95
|
205 bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
|
|
206 // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
|
|
207 // on combining the loads generated for consecutive accesses, and failure to
|
|
208 // do so is particularly expensive. This makes it much more likely (compared
|
|
209 // to only using concatenation unrolling).
|
|
210 if (ST->getDarwinDirective() == PPC::DIR_A2)
|
|
211 return true;
|
|
212
|
|
213 return LoopHasReductions;
|
|
214 }
|
|
215
|
|
216 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
|
|
217 return true;
|
|
218 }
|
|
219
|
83
|
220 unsigned PPCTTIImpl::getNumberOfRegisters(bool Vector) {
|
95
|
221 if (Vector && !ST->hasAltivec() && !ST->hasQPX())
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
222 return 0;
|
77
|
223 return ST->hasVSX() ? 64 : 32;
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
224 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
225
|
83
|
226 unsigned PPCTTIImpl::getRegisterBitWidth(bool Vector) {
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
227 if (Vector) {
|
95
|
228 if (ST->hasQPX()) return 256;
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
229 if (ST->hasAltivec()) return 128;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
230 return 0;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
231 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
232
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
233 if (ST->isPPC64())
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
234 return 64;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
235 return 32;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
236
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
237 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
238
|
100
|
239 unsigned PPCTTIImpl::getCacheLineSize() {
|
|
240 // This is currently only used for the data prefetch pass which is only
|
|
241 // enabled for BG/Q by default.
|
|
242 return CacheLineSize;
|
|
243 }
|
|
244
|
95
|
245 unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
246 unsigned Directive = ST->getDarwinDirective();
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
247 // The 440 has no SIMD support, but floating-point instructions
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
248 // have a 5-cycle latency, so unroll by 5x for latency hiding.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
249 if (Directive == PPC::DIR_440)
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
250 return 5;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
251
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
252 // The A2 has no SIMD support, but floating-point instructions
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
253 // have a 6-cycle latency, so unroll by 6x for latency hiding.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
254 if (Directive == PPC::DIR_A2)
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
255 return 6;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
256
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
257 // FIXME: For lack of any better information, do no harm...
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
258 if (Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500)
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
259 return 1;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
260
|
83
|
261 // For P7 and P8, floating-point instructions have a 6-cycle latency and
|
|
262 // there are two execution units, so unroll by 12x for latency hiding.
|
|
263 if (Directive == PPC::DIR_PWR7 ||
|
|
264 Directive == PPC::DIR_PWR8)
|
|
265 return 12;
|
|
266
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
267 // For most things, modern systems have two execution units (and
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
268 // out-of-order execution).
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
269 return 2;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
270 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
271
|
95
|
272 int PPCTTIImpl::getArithmeticInstrCost(
|
83
|
273 unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
|
|
274 TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
|
|
275 TTI::OperandValueProperties Opd2PropInfo) {
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
276 assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
277
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
278 // Fallback to the default implementation.
|
83
|
279 return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
|
|
280 Opd1PropInfo, Opd2PropInfo);
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
281 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
282
|
95
|
283 int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
|
284 Type *SubTp) {
|
|
285 // Legalize the type.
|
|
286 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
|
|
287
|
|
288 // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
|
|
289 // (at least in the sense that there need only be one non-loop-invariant
|
|
290 // instruction). We need one such shuffle instruction for each actual
|
|
291 // register (this is not true for arbitrary shuffles, but is true for the
|
|
292 // structured types of shuffles covered by TTI::ShuffleKind).
|
|
293 return LT.first;
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
294 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
295
|
95
|
296 int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
297 assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
298
|
83
|
299 return BaseT::getCastInstrCost(Opcode, Dst, Src);
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
300 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
301
|
95
|
302 int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
|
83
|
303 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
304 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
305
|
95
|
306 int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
307 assert(Val->isVectorTy() && "This must be a vector type");
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
308
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
309 int ISD = TLI->InstructionOpcodeToISD(Opcode);
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
310 assert(ISD && "Invalid opcode");
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
311
|
77
|
312 if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
|
|
313 // Double-precision scalars are already located in index #0.
|
|
314 if (Index == 0)
|
|
315 return 0;
|
|
316
|
83
|
317 return BaseT::getVectorInstrCost(Opcode, Val, Index);
|
95
|
318 } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
|
|
319 // Floating point scalars are already located in index #0.
|
|
320 if (Index == 0)
|
|
321 return 0;
|
|
322
|
|
323 return BaseT::getVectorInstrCost(Opcode, Val, Index);
|
77
|
324 }
|
|
325
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
326 // Estimated cost of a load-hit-store delay. This was obtained
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
327 // experimentally as a minimum needed to prevent unprofitable
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
328 // vectorization for the paq8p benchmark. It may need to be
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
329 // raised further if other unprofitable cases remain.
|
77
|
330 unsigned LHSPenalty = 2;
|
|
331 if (ISD == ISD::INSERT_VECTOR_ELT)
|
|
332 LHSPenalty += 7;
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
333
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
334 // Vector element insert/extract with Altivec is very expensive,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
335 // because they require store and reload with the attendant
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
336 // processor stall for load-hit-store. Until VSX is available,
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
337 // these need to be estimated as very costly.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
338 if (ISD == ISD::EXTRACT_VECTOR_ELT ||
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
339 ISD == ISD::INSERT_VECTOR_ELT)
|
83
|
340 return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
341
|
83
|
342 return BaseT::getVectorInstrCost(Opcode, Val, Index);
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
343 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
344
|
95
|
345 int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
|
346 unsigned AddressSpace) {
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
347 // Legalize the type.
|
95
|
348 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
349 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
350 "Invalid Opcode");
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
351
|
95
|
352 int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
|
|
353
|
|
354 // Aligned loads and stores are easy.
|
|
355 unsigned SrcBytes = LT.second.getStoreSize();
|
|
356 if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
|
|
357 return Cost;
|
|
358
|
|
359 bool IsAltivecType = ST->hasAltivec() &&
|
|
360 (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
|
|
361 LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
|
|
362 bool IsVSXType = ST->hasVSX() &&
|
|
363 (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
|
|
364 bool IsQPXType = ST->hasQPX() &&
|
|
365 (LT.second == MVT::v4f64 || LT.second == MVT::v4f32);
|
77
|
366
|
95
|
367 // If we can use the permutation-based load sequence, then this is also
|
|
368 // relatively cheap (not counting loop-invariant instructions): one load plus
|
|
369 // one permute (the last load in a series has extra cost, but we're
|
|
370 // neglecting that here). Note that on the P7, we should do unaligned loads
|
|
371 // for Altivec types using the VSX instructions, but that's more expensive
|
|
372 // than using the permutation-based load sequence. On the P8, that's no
|
|
373 // longer true.
|
|
374 if (Opcode == Instruction::Load &&
|
|
375 ((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
|
|
376 Alignment >= LT.second.getScalarType().getStoreSize())
|
|
377 return Cost + LT.first; // Add the cost of the permutations.
|
77
|
378
|
95
|
379 // For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
|
|
380 // P7, unaligned vector loads are more expensive than the permutation-based
|
|
381 // load sequence, so that might be used instead, but regardless, the net cost
|
|
382 // is about the same (not counting loop-invariant instructions).
|
|
383 if (IsVSXType || (ST->hasVSX() && IsAltivecType))
|
|
384 return Cost;
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
385
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
386 // PPC in general does not support unaligned loads and stores. They'll need
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
387 // to be decomposed based on the alignment factor.
|
95
|
388
|
|
389 // Add the cost of each scalar load or store.
|
|
390 Cost += LT.first*(SrcBytes/Alignment-1);
|
77
|
391
|
95
|
392 // For a vector type, there is also scalarization overhead (only for
|
|
393 // stores, loads are expanded using the vector-load + permutation sequence,
|
|
394 // which is much less expensive).
|
|
395 if (Src->isVectorTy() && Opcode == Instruction::Store)
|
|
396 for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
|
|
397 Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
|
0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
398
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
399 return Cost;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
400 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
401
|
95
|
402 int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
|
403 unsigned Factor,
|
|
404 ArrayRef<unsigned> Indices,
|
|
405 unsigned Alignment,
|
|
406 unsigned AddressSpace) {
|
|
407 assert(isa<VectorType>(VecTy) &&
|
|
408 "Expect a vector type for interleaved memory op");
|
|
409
|
|
410 // Legalize the type.
|
|
411 std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, VecTy);
|
|
412
|
|
413 // Firstly, the cost of load/store operation.
|
|
414 int Cost = getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace);
|
|
415
|
|
416 // PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
|
|
417 // (at least in the sense that there need only be one non-loop-invariant
|
|
418 // instruction). For each result vector, we need one shuffle per incoming
|
|
419 // vector (except that the first shuffle can take two incoming vectors
|
|
420 // because it does not need to take itself).
|
|
421 Cost += Factor*(LT.first-1);
|
|
422
|
|
423 return Cost;
|
|
424 }
|
|
425
|