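// Test IR generation for the NEON bfloat16 load/store intrinsics
// (vld1..vld4 / vst1..vst4 and their _lane, _dup, and _xN variants)
// on AArch64 (CHECK64) and 32-bit Armv8.6-A (CHECK32).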
// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
// RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK64
// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
// RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK32

// REQUIRES: arm-registered-target,aarch64-registered-target

#include "arm_neon.h"

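// Plain vld1/vld1q lower to ordinary vector loads; the CHECK32 lines
// also verify the 2-byte natural alignment of bfloat16_t.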
bfloat16x4_t test_vld1_bf16(bfloat16_t const *ptr) {
  return vld1_bf16(ptr);
}
// CHECK-LABEL: test_vld1_bf16
// CHECK64: %1 = load <4 x bfloat>, <4 x bfloat>* %0
// CHECK64-NEXT: ret <4 x bfloat> %1
// CHECK32: %1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2
// CHECK32-NEXT: ret <4 x bfloat> %1

bfloat16x8_t test_vld1q_bf16(bfloat16_t const *ptr) {
  return vld1q_bf16(ptr);
}
// CHECK-LABEL: test_vld1q_bf16
// CHECK64: %1 = load <8 x bfloat>, <8 x bfloat>* %0
// CHECK64-NEXT: ret <8 x bfloat> %1
// CHECK32: %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2
// CHECK32-NEXT: ret <8 x bfloat> %1

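// The _lane forms load a single bfloat and insert it into the
// requested lane of the source vector.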
bfloat16x4_t test_vld1_lane_bf16(bfloat16_t const *ptr, bfloat16x4_t src) {
  return vld1_lane_bf16(ptr, src, 0);
}
// CHECK-LABEL: test_vld1_lane_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
// CHECK64-NEXT: ret <4 x bfloat> %vld1_lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
// CHECK32-NEXT: ret <4 x bfloat> %vld1_lane

bfloat16x8_t test_vld1q_lane_bf16(bfloat16_t const *ptr, bfloat16x8_t src) {
  return vld1q_lane_bf16(ptr, src, 7);
}
// CHECK-LABEL: test_vld1q_lane_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
// CHECK64-NEXT: ret <8 x bfloat> %vld1_lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
// CHECK32-NEXT: ret <8 x bfloat> %vld1_lane

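// The _dup forms load one element and splat it across the vector via
// insertelement plus a zero-index shufflevector.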
bfloat16x4_t test_vld1_dup_bf16(bfloat16_t const *ptr) {
  return vld1_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld1_dup_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
// CHECK64-NEXT: %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
// CHECK64-NEXT: ret <4 x bfloat> %lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
// CHECK32-NEXT: %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
// CHECK32-NEXT: ret <4 x bfloat> %lane

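// The _x2/_x3/_x4 multi-register loads map directly to the
// llvm.aarch64.neon.ld1xN and llvm.arm.neon.vld1xN intrinsics.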
bfloat16x4x2_t test_vld1_bf16_x2(bfloat16_t const *ptr) {
  return vld1_bf16_x2(ptr);
}
// CHECK-LABEL: test_vld1_bf16_x2
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat* %ptr)

bfloat16x8x2_t test_vld1q_bf16_x2(bfloat16_t const *ptr) {
  return vld1q_bf16_x2(ptr);
}
// CHECK-LABEL: test_vld1q_bf16_x2
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat* %ptr)

bfloat16x4x3_t test_vld1_bf16_x3(bfloat16_t const *ptr) {
  return vld1_bf16_x3(ptr);
}
// CHECK-LABEL: test_vld1_bf16_x3
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat* %ptr)

bfloat16x8x3_t test_vld1q_bf16_x3(bfloat16_t const *ptr) {
  return vld1q_bf16_x3(ptr);
}
// CHECK-LABEL: test_vld1q_bf16_x3
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat* %ptr)

bfloat16x4x4_t test_vld1_bf16_x4(bfloat16_t const *ptr) {
  return vld1_bf16_x4(ptr);
}
// CHECK-LABEL: test_vld1_bf16_x4
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat* %ptr)

bfloat16x8x4_t test_vld1q_bf16_x4(bfloat16_t const *ptr) {
  return vld1q_bf16_x4(ptr);
}
// CHECK-LABEL: test_vld1q_bf16_x4
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat* %ptr)

bfloat16x8_t test_vld1q_dup_bf16(bfloat16_t const *ptr) {
  return vld1q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld1q_dup_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
// CHECK64-NEXT: %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK64-NEXT: ret <8 x bfloat> %lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
// CHECK32-NEXT: %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK32-NEXT: ret <8 x bfloat> %lane

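// The de-interleaving vld2/vld3/vld4 loads and their _lane variants
// map to llvm.aarch64.neon.ldN[lane] and llvm.arm.neon.vldN[lane];
// the 32-bit intrinsics take an i8* plus an explicit alignment
// argument (i32 2 here).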
bfloat16x4x2_t test_vld2_bf16(bfloat16_t const *ptr) {
  return vld2_bf16(ptr);
}
// CHECK-LABEL: test_vld2_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <4 x bfloat>*
// CHECK64-NEXT: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x2_t test_vld2q_bf16(bfloat16_t const *ptr) {
  return vld2q_bf16(ptr);
}
// CHECK-LABEL: test_vld2q_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <8 x bfloat>*
// CHECK64-NEXT: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x2_t test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) {
  return vld2_lane_bf16(ptr, src, 1);
}
// CHECK-LABEL: test_vld2_lane_bf16
// CHECK64: %vld2_lane = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, i8* %0)
// CHECK32: %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)

bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) {
  return vld2q_lane_bf16(ptr, src, 7);
}
// CHECK-LABEL: test_vld2q_lane_bf16
// CHECK64: %vld2_lane = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, i8* %0)
// CHECK32: %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)

bfloat16x4x3_t test_vld3_bf16(bfloat16_t const *ptr) {
  return vld3_bf16(ptr);
}
// CHECK-LABEL: test_vld3_bf16
// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x3_t test_vld3q_bf16(bfloat16_t const *ptr) {
  return vld3q_bf16(ptr);
}
// CHECK-LABEL: test_vld3q_bf16
// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) {
  return vld3_lane_bf16(ptr, src, 1);
}
// CHECK-LABEL: test_vld3_lane_bf16
// CHECK64: %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0)
// CHECK32: %3 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)

bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) {
  return vld3q_lane_bf16(ptr, src, 7);
  // A lane index of 8 would be out of range for bfloat16x8_t and is
  // rejected at compile time:
  // return vld3q_lane_bf16(ptr, src, 8);
}
// CHECK-LABEL: test_vld3q_lane_bf16
// CHECK64: %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0)
// CHECK32: %3 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)

bfloat16x4x4_t test_vld4_bf16(bfloat16_t const *ptr) {
  return vld4_bf16(ptr);
}
// CHECK-LABEL: test_vld4_bf16
// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x4_t test_vld4q_bf16(bfloat16_t const *ptr) {
  return vld4q_bf16(ptr);
}
// CHECK-LABEL: test_vld4q_bf16
// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) {
  return vld4_lane_bf16(ptr, src, 1);
}
// CHECK-LABEL: test_vld4_lane_bf16
// CHECK64: %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0)
// CHECK32: %4 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)

bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) {
  return vld4q_lane_bf16(ptr, src, 7);
}
// CHECK-LABEL: test_vld4q_lane_bf16
// CHECK64: %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0)
// CHECK32: %4 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)

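// The structured _dup loads map to the replicating ldNr intrinsics on
// AArch64 and to the vldNdup intrinsics on the 32-bit target.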
bfloat16x4x2_t test_vld2_dup_bf16(bfloat16_t const *ptr) {
  return vld2_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld2_dup_bf16
// CHECK64: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x2_t test_vld2q_dup_bf16(bfloat16_t const *ptr) {
  return vld2q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld2q_dup_bf16
// CHECK64: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x3_t test_vld3_dup_bf16(bfloat16_t const *ptr) {
  return vld3_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld3_dup_bf16
// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x3_t test_vld3q_dup_bf16(bfloat16_t const *ptr) {
  return vld3q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld3q_dup_bf16
// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x4_t test_vld4_dup_bf16(bfloat16_t const *ptr) {
  return vld4_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld4_dup_bf16
// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x4_t test_vld4q_dup_bf16(bfloat16_t const *ptr) {
  return vld4q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld4q_dup_bf16
// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8* %0, i32 2)

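// Plain vst1/vst1q lower to an ordinary vector store on AArch64 and
// to llvm.arm.neon.vst1 on the 32-bit target.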
void test_vst1_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
  vst1_bf16(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <4 x bfloat>*
// CHECK64-NEXT: store <4 x bfloat> %val, <4 x bfloat>* %0, align 2
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4bf16(i8* %0, <4 x bfloat> %val, i32 2)

void test_vst1q_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
  vst1q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <8 x bfloat>*
// CHECK64-NEXT: store <8 x bfloat> %val, <8 x bfloat>* %0, align 2
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v8bf16(i8* %0, <8 x bfloat> %val, i32 2)

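// The _lane stores extract the requested element and store it as a
// scalar; both targets produce identical IR here.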
void test_vst1_lane_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
  vst1_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst1_lane_bf16
// CHECK64: %0 = extractelement <4 x bfloat> %val, i32 1
// CHECK64-NEXT: store bfloat %0, bfloat* %ptr, align 2
// CHECK32: %0 = extractelement <4 x bfloat> %val, i32 1
// CHECK32-NEXT: store bfloat %0, bfloat* %ptr, align 2

void test_vst1q_lane_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
  vst1q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst1q_lane_bf16
// CHECK64: %0 = extractelement <8 x bfloat> %val, i32 7
// CHECK64-NEXT: store bfloat %0, bfloat* %ptr, align 2
// CHECK32: %0 = extractelement <8 x bfloat> %val, i32 7
// CHECK32-NEXT: store bfloat %0, bfloat* %ptr, align 2

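// The _x2/_x3/_x4 multi-register stores map to the
// llvm.aarch64.neon.st1xN and llvm.arm.neon.vst1xN intrinsics.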
void test_vst1_bf16_x2(bfloat16_t *ptr, bfloat16x4x2_t val) {
  vst1_bf16_x2(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16_x2
// CHECK64: tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1)

void test_vst1q_bf16_x2(bfloat16_t *ptr, bfloat16x8x2_t val) {
  vst1q_bf16_x2(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16_x2
// CHECK64: tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1)

void test_vst1_bf16_x3(bfloat16_t *ptr, bfloat16x4x3_t val) {
  vst1_bf16_x3(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16_x3
// CHECK64: tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2)

void test_vst1q_bf16_x3(bfloat16_t *ptr, bfloat16x8x3_t val) {
  vst1q_bf16_x3(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16_x3
// CHECK64: tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2)

void test_vst1_bf16_x4(bfloat16_t *ptr, bfloat16x4x4_t val) {
  vst1_bf16_x4(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16_x4
// CHECK64: tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3)

void test_vst1q_bf16_x4(bfloat16_t *ptr, bfloat16x8x4_t val) {
  vst1q_bf16_x4(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16_x4
// CHECK64: tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3)

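// The interleaving vst2/vst3/vst4 stores and their _lane variants map
// to llvm.aarch64.neon.stN[lane] and llvm.arm.neon.vstN[lane], again
// with an explicit alignment argument on the 32-bit side.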
void test_vst2_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
  vst2_bf16(ptr, val);
}
// CHECK-LABEL: test_vst2_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 2)

void test_vst2q_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
  vst2q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst2q_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 2)

void test_vst2_lane_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
  vst2_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst2_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i64 1, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)

void test_vst2q_lane_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
  vst2q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst2q_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i64 7, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)

void test_vst3_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
  vst3_bf16(ptr, val);
}
// CHECK-LABEL: test_vst3_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 2)

void test_vst3q_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
  vst3q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst3q_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 2)

void test_vst3_lane_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
  vst3_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst3_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i64 1, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)

void test_vst3q_lane_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
  vst3q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst3q_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i64 7, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)

void test_vst4_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
  vst4_bf16(ptr, val);
}
// CHECK-LABEL: test_vst4_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 2)

void test_vst4q_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
  vst4q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst4q_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 2)

void test_vst4_lane_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
  vst4_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst4_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i64 1, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)

void test_vst4q_lane_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
  vst4q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst4q_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i64 7, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)