Mercurial > hg > CbC > CbC_llvm
comparison test/CodeGen/AArch64/fp16-vector-load-store.ll @ 77:54457678186b LLVM3.6
LLVM 3.6
author | Kaito Tokumori <e105711@ie.u-ryukyu.ac.jp> |
---|---|
date | Mon, 08 Sep 2014 22:06:00 +0900 |
parents | |
children | afa8332a0e37 |
comparison
equal
deleted
inserted
replaced
34:e874dbf0ad9d | 77:54457678186b |
---|---|
1 ; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s | |
2 | |
; NOTE(review): loads below use the pre-LLVM-3.7 typed-pointer form
; ('load <ty>* %p', no explicit result type), matching this file's LLVM 3.6 vintage.

; Simple load of v4f16
define <4 x half> @load_64(<4 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_64:
; CHECK: ldr d0, [x0]
entry:
  %0 = load <4 x half>* %a, align 8
  ret <4 x half> %0
}

; Simple load of v8f16
define <8 x half> @load_128(<8 x half>* nocapture readonly %a) #0 {
; CHECK-LABEL: load_128:
; CHECK: ldr q0, [x0]
entry:
  %0 = load <8 x half>* %a, align 16
  ret <8 x half> %0
}

; Duplicating load to v4f16
define <4 x half> @load_dup_64(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_64:
; CHECK: ld1r { v0.4h }, [x0]
entry:
  %0 = load half* %a, align 2
  %1 = insertelement <4 x half> undef, half %0, i32 0
  %2 = shufflevector <4 x half> %1, <4 x half> undef, <4 x i32> zeroinitializer
  ret <4 x half> %2
}

; Duplicating load to v8f16
define <8 x half> @load_dup_128(half* nocapture readonly %a) #0 {
; CHECK-LABEL: load_dup_128:
; CHECK: ld1r { v0.8h }, [x0]
entry:
  %0 = load half* %a, align 2
  %1 = insertelement <8 x half> undef, half %0, i32 0
  %2 = shufflevector <8 x half> %1, <8 x half> undef, <8 x i32> zeroinitializer
  ret <8 x half> %2
}

; Load to one lane of v4f16
define <4 x half> @load_lane_64(half* nocapture readonly %a, <4 x half> %b) #0 {
; CHECK-LABEL: load_lane_64:
; CHECK: ld1 { v0.h }[2], [x0]
entry:
  %0 = load half* %a, align 2
  %1 = insertelement <4 x half> %b, half %0, i32 2
  ret <4 x half> %1
}

; Load to one lane of v8f16
define <8 x half> @load_lane_128(half* nocapture readonly %a, <8 x half> %b) #0 {
; CHECK-LABEL: load_lane_128:
; CHECK: ld1 { v0.h }[5], [x0]
entry:
  %0 = load half* %a, align 2
  %1 = insertelement <8 x half> %b, half %0, i32 5
  ret <8 x half> %1
}

; Simple store of v4f16
define void @store_64(<4 x half>* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_64:
; CHECK: str d0, [x0]
entry:
  store <4 x half> %b, <4 x half>* %a, align 8
  ret void
}

; Simple store of v8f16
define void @store_128(<8 x half>* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_128:
; CHECK: str q0, [x0]
entry:
  store <8 x half> %b, <8 x half>* %a, align 16
  ret void
}

; Store from one lane of v4f16
define void @store_lane_64(half* nocapture %a, <4 x half> %b) #1 {
; CHECK-LABEL: store_lane_64:
; CHECK: st1 { v0.h }[2], [x0]
entry:
  %0 = extractelement <4 x half> %b, i32 2
  store half %0, half* %a, align 2
  ret void
}

; Store from one lane of v8f16
define void @store_lane_128(half* nocapture %a, <8 x half> %b) #1 {
; CHECK-LABEL: store_lane_128:
; CHECK: st1 { v0.h }[5], [x0]
entry:
  %0 = extractelement <8 x half> %b, i32 5
  store half %0, half* %a, align 2
  ret void
}
100 | |
; NEON intrinsics - (de-)interleaving loads and stores
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
114 | |
; Load 2 x v4f16 with de-interleaving
define { <4 x half>, <4 x half> } @load_interleave_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_2:
; CHECK: ld2 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_3:
; CHECK: ld3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_interleave_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_64_4:
; CHECK: ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 with interleaving
define void @store_interleave_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_interleave_64_2:
; CHECK: st2 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 with interleaving
define void @store_interleave_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_interleave_64_3:
; CHECK: st3 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 with interleaving
define void @store_interleave_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_interleave_64_4:
; CHECK: st4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}

; Load 2 x v8f16 with de-interleaving
define { <8 x half>, <8 x half> } @load_interleave_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_2:
; CHECK: ld2 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_3:
; CHECK: ld3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_interleave_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_interleave_128_4:
; CHECK: ld4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store 2 x v8f16 with interleaving
define void @store_interleave_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_interleave_128_2:
; CHECK: st2 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 with interleaving
define void @store_interleave_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_interleave_128_3:
; CHECK: st3 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 with interleaving
define void @store_interleave_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_interleave_128_4:
; CHECK: st4 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}
222 | |
; NEON intrinsics - duplicating loads
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half*)
230 | |
; Load 2 x v4f16 with duplication
define { <4 x half>, <4 x half> } @load_dup_64_2(half* %a) #0 {
; CHECK-LABEL: load_dup_64_2:
; CHECK: ld2r { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half> } @load_dup_64_3(half* %a) #0 {
; CHECK-LABEL: load_dup_64_3:
; CHECK: ld3r { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 with duplication
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_dup_64_4(half* %a) #0 {
; CHECK-LABEL: load_dup_64_4:
; CHECK: ld4r { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4r.v4f16.p0f16(half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 2 x v8f16 with duplication
define { <8 x half>, <8 x half> } @load_dup_128_2(half* %a) #0 {
; CHECK-LABEL: load_dup_128_2:
; CHECK: ld2r { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half> } @load_dup_128_3(half* %a) #0 {
; CHECK-LABEL: load_dup_128_3:
; CHECK: ld3r { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 with duplication
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_dup_128_4(half* %a) #0 {
; CHECK-LABEL: load_dup_128_4:
; CHECK: ld4r { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4r.v8f16.p0f16(half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}
284 | |
285 | |
; NEON intrinsics - loads and stores to/from one lane
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, i64, half*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, i64, half*)
declare void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, i64, half*)
299 | |
; Load one lane of 2 x v4f16
define { <4 x half>, <4 x half> } @load_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: load_lane_64_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load one lane of 3 x v4f16
define { <4 x half>, <4 x half>, <4 x half> } @load_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: load_lane_64_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load one lane of 4 x v4f16
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: load_lane_64_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store one lane of 2 x v4f16
define void @store_lane_64_2(half* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_lane_64_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v4f16
define void @store_lane_64_3(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_lane_64_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v4f16
define void @store_lane_64_4(half* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_lane_64_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v4f16.p0f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, i64 2, half* %a)
  ret void
}

; Load one lane of 2 x v8f16
define { <8 x half>, <8 x half> } @load_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: load_lane_128_2:
; CHECK: ld2 { v0.h, v1.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load one lane of 3 x v8f16
define { <8 x half>, <8 x half>, <8 x half> } @load_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: load_lane_128_3:
; CHECK: ld3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load one lane of 4 x v8f16
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: load_lane_128_4:
; CHECK: ld4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store one lane of 2 x v8f16
define void @store_lane_128_2(half* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_lane_128_2:
; CHECK: st2 { v0.h, v1.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st2lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, i64 2, half* %a)
  ret void
}

; Store one lane of 3 x v8f16
define void @store_lane_128_3(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_lane_128_3:
; CHECK: st3 { v0.h, v1.h, v2.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st3lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, i64 2, half* %a)
  ret void
}

; Store one lane of 4 x v8f16
define void @store_lane_128_4(half* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_lane_128_4:
; CHECK: st4 { v0.h, v1.h, v2.h, v3.h }[2], [x0]
entry:
  tail call void @llvm.aarch64.neon.st4lane.v8f16.p0f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, i64 2, half* %a)
  ret void
}
407 | |
; NEON intrinsics - load/store without interleaving
declare { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>*)
declare { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>*)
declare void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half>, <4 x half>, <4 x half>, <4 x half>, <4 x half>*)
declare { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>*)
declare { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>*)
declare void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>*)
declare void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half>, <8 x half>, <8 x half>, <8 x half>, <8 x half>*)
421 | |
; Load 2 x v4f16 without de-interleaving
define { <4 x half>, <4 x half> } @load_64_2(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_2:
; CHECK: ld1 { v0.4h, v1.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x2.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half> } %0
}

; Load 3 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half> } @load_64_3(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_3:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x3.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half> } %0
}

; Load 4 x v4f16 without de-interleaving
define { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @load_64_4(<4 x half>* %a) #0 {
; CHECK-LABEL: load_64_4:
; CHECK: ld1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  %0 = tail call { <4 x half>, <4 x half>, <4 x half>, <4 x half> } @llvm.aarch64.neon.ld1x4.v4f16.p0v4f16(<4 x half>* %a)
  ret { <4 x half>, <4 x half>, <4 x half>, <4 x half> } %0
}

; Store 2 x v4f16 without interleaving
define void @store_64_2(<4 x half>* %a, <4 x half> %b, <4 x half> %c) #0 {
; CHECK-LABEL: store_64_2:
; CHECK: st1 { v0.4h, v1.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half>* %a)
  ret void
}

; Store 3 x v4f16 without interleaving
define void @store_64_3(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d) #0 {
; CHECK-LABEL: store_64_3:
; CHECK: st1 { v0.4h, v1.4h, v2.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half>* %a)
  ret void
}

; Store 4 x v4f16 without interleaving
define void @store_64_4(<4 x half>* %a, <4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e) #0 {
; CHECK-LABEL: store_64_4:
; CHECK: st1 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v4f16.p0v4f16(<4 x half> %b, <4 x half> %c, <4 x half> %d, <4 x half> %e, <4 x half>* %a)
  ret void
}

; Load 2 x v8f16 without de-interleaving
define { <8 x half>, <8 x half> } @load_128_2(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_2:
; CHECK: ld1 { v0.8h, v1.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x2.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half> } %0
}

; Load 3 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half> } @load_128_3(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_3:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x3.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half> } %0
}

; Load 4 x v8f16 without de-interleaving
define { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @load_128_4(<8 x half>* %a) #0 {
; CHECK-LABEL: load_128_4:
; CHECK: ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  %0 = tail call { <8 x half>, <8 x half>, <8 x half>, <8 x half> } @llvm.aarch64.neon.ld1x4.v8f16.p0v8f16(<8 x half>* %a)
  ret { <8 x half>, <8 x half>, <8 x half>, <8 x half> } %0
}

; Store 2 x v8f16 without interleaving
define void @store_128_2(<8 x half>* %a, <8 x half> %b, <8 x half> %c) #0 {
; CHECK-LABEL: store_128_2:
; CHECK: st1 { v0.8h, v1.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x2.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half>* %a)
  ret void
}

; Store 3 x v8f16 without interleaving
define void @store_128_3(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d) #0 {
; CHECK-LABEL: store_128_3:
; CHECK: st1 { v0.8h, v1.8h, v2.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x3.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half>* %a)
  ret void
}

; Store 4 x v8f16 without interleaving
define void @store_128_4(<8 x half>* %a, <8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e) #0 {
; CHECK-LABEL: store_128_4:
; CHECK: st1 { v0.8h, v1.8h, v2.8h, v3.8h }, [x0]
entry:
  tail call void @llvm.aarch64.neon.st1x4.v8f16.p0v8f16(<8 x half> %b, <8 x half> %c, <8 x half> %d, <8 x half> %e, <8 x half>* %a)
  ret void
}