0
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
1 ; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
2 ; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
3 ;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
4 ; <rdar://problem/14477220>
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
5
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
6 %class.Complex = type { float, float }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
7
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
8
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
9 ; Check that independant slices leads to independant loads then the slices leads to
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
10 ; different register file.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
11 ;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
12 ; The layout is:
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
13 ; LSB 0 1 2 3 | 4 5 6 7 MSB
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
14 ; Low High
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
15 ; The base address points to 0 and is 8-bytes aligned.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
16 ; Low slice starts at 0 (base) and is 8-bytes aligned.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
17 ; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
18 ;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
19 ; STRESS-LABEL: t1:
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
20 ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
21 ; STRESS: vmovss 64([[BASE:[^(]+]]), [[OUT_Real:%xmm[0-9]+]]
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
22 ; Add low slice: out[out_start].real, this is base + 0.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
23 ; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
24 ; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
25 ; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
26 ; Add high slice: out[out_start].imm, this is base + 4.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
27 ; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
28 ; Swap Imm and Real.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
29 ; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
30 ; Put the results back into out[out_start].
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
31 ; STRESS-NEXT: vmovq [[RES_Vec]], ([[BASE]])
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
32 ;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
33 ; Same for REGULAR, we eliminate register bank copy with each slices.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
34 ; REGULAR-LABEL: t1:
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
35 ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
36 ; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
37 ; Add low slice: out[out_start].real, this is base + 0.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
38 ; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
39 ; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
40 ; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
41 ; Add high slice: out[out_start].imm, this is base + 4.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
42 ; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
43 ; Swap Imm and Real.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
44 ; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
45 ; Put the results back into out[out_start].
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
46 ; REGULAR-NEXT: vmovq [[RES_Vec]], ([[BASE]])
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
47 define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
48 entry:
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
49 %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
50 %tmp = bitcast %class.Complex* %arrayidx to i64*
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
51 %tmp1 = load i64* %tmp, align 8
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
52 %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
53 %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
54 %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
55 %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
56 %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
57 %add = add i64 %out_start, 8
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
58 %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
59 %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
60 %tmp4 = load float* %i.i, align 4
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
61 %add.i = fadd float %tmp4, %tmp2
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
62 %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
63 %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
64 %tmp5 = load float* %r.i, align 4
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
65 %add5.i = fadd float %tmp5, %tmp3
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
66 %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
67 %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
68 store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
69 ret void
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
70 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
71
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
72 ; Function Attrs: nounwind
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
73 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
74
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
75 ; Function Attrs: nounwind
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
76 declare void @llvm.lifetime.start(i64, i8* nocapture)
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
77
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
78 ; Function Attrs: nounwind
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
79 declare void @llvm.lifetime.end(i64, i8* nocapture)
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
80
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
81 ; Check that we do not read outside of the chunk of bits of the original loads.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
82 ;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
83 ; The 64-bits should have been split in one 32-bits and one 16-bits slices.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
84 ; The 16-bits should be zero extended to match the final type.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
85 ;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
86 ; The memory layout is:
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
87 ; LSB 0 1 2 3 | 4 5 | 6 7 MSB
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
88 ; Low High
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
89 ; The base address points to 0 and is 8-bytes aligned.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
90 ; Low slice starts at 0 (base) and is 8-bytes aligned.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
91 ; High slice starts at 6 (base + 6-bytes) and is 2-bytes aligned.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
92 ;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
93 ; STRESS-LABEL: t2:
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
94 ; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
95 ; STRESS-NEXT: addl ([[BASE]]), %eax
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
96 ; STRESS-NEXT: ret
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
97 ;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
98 ; For the REGULAR heuristic, this is not profitable to slice things that are not
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
99 ; next to each other in memory. Here we have a hole with bytes #4-5.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
100 ; REGULAR-LABEL: t2:
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
101 ; REGULAR: shrq $48
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
102 define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
103 %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
104 %bitcast = bitcast %class.Complex* %arrayidx to i64*
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
105 %chunk64 = load i64* %bitcast, align 8
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
106 %slice32_low = trunc i64 %chunk64 to i32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
107 %shift48 = lshr i64 %chunk64, 48
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
108 %slice32_high = trunc i64 %shift48 to i32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
109 %res = add i32 %slice32_high, %slice32_low
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
110 ret i32 %res
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
111 }
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
112
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
113 ; Check that we do not optimize overlapping slices.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
114 ;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
115 ; The 64-bits should NOT have been split in as slices are overlapping.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
116 ; First slice uses bytes numbered 0 to 3.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
117 ; Second slice uses bytes numbered 6 and 7.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
118 ; Third slice uses bytes numbered 4 to 7.
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
119 ;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
120 ; STRESS-LABEL: t3:
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
121 ; STRESS: shrq $48
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
122 ; STRESS: shrq $32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
123 ;
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
124 ; REGULAR-LABEL: t3:
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
125 ; REGULAR: shrq $48
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
126 ; REGULAR: shrq $32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
127 define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
128 %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
129 %bitcast = bitcast %class.Complex* %arrayidx to i64*
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
130 %chunk64 = load i64* %bitcast, align 8
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
131 %slice32_low = trunc i64 %chunk64 to i32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
132 %shift48 = lshr i64 %chunk64, 48
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
133 %slice32_high = trunc i64 %shift48 to i32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
134 %shift32 = lshr i64 %chunk64, 32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
135 %slice32_lowhigh = trunc i64 %shift32 to i32
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
136 %tmpres = add i32 %slice32_high, %slice32_low
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
137 %res = add i32 %slice32_lowhigh, %tmpres
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
138 ret i32 %res
|
Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp>
parents:
diff
changeset
|
139 }
|