comparison: clang/test/CodeGen/aarch64-bf16-ldst-intrinsics.c @ 221:79ff65ed7e25
LLVM12 Original

author:   Shinji KONO <kono@ie.u-ryukyu.ac.jp>
date:     Tue, 15 Jun 2021 19:15:29 +0900
children: c4bab56944e8
comparing 220:42394fc6a535 with 221:79ff65ed7e25
// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
// RUN:   -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK64
// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
// RUN:   -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK32

// REQUIRES: arm-registered-target,aarch64-registered-target

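// This test checks that the ACLE bfloat16 NEON load/store intrinsics lower to
// the expected LLVM IR at -O2: the CHECK64 lines cover the AArch64 target and
// the CHECK32 lines cover the 32-bit Armv8.6-A hard-float target.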
#include "arm_neon.h"

bfloat16x4_t test_vld1_bf16(bfloat16_t const *ptr) {
  return vld1_bf16(ptr);
}
// CHECK-LABEL: test_vld1_bf16
// CHECK64: %1 = load <4 x bfloat>, <4 x bfloat>* %0
// CHECK64-NEXT: ret <4 x bfloat> %1
// CHECK32: %1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2
// CHECK32-NEXT: ret <4 x bfloat> %1

bfloat16x8_t test_vld1q_bf16(bfloat16_t const *ptr) {
  return vld1q_bf16(ptr);
}
// CHECK-LABEL: test_vld1q_bf16
// CHECK64: %1 = load <8 x bfloat>, <8 x bfloat>* %0
// CHECK64-NEXT: ret <8 x bfloat> %1
// CHECK32: %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2
// CHECK32-NEXT: ret <8 x bfloat> %1

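// vld1_lane/vld1q_lane load a single bfloat16 element from memory into one
// lane of an existing vector, so on both targets they should lower to a
// scalar load followed by an insertelement.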
bfloat16x4_t test_vld1_lane_bf16(bfloat16_t const *ptr, bfloat16x4_t src) {
  return vld1_lane_bf16(ptr, src, 0);
}
// CHECK-LABEL: test_vld1_lane_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
// CHECK64-NEXT: ret <4 x bfloat> %vld1_lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
// CHECK32-NEXT: ret <4 x bfloat> %vld1_lane

bfloat16x8_t test_vld1q_lane_bf16(bfloat16_t const *ptr, bfloat16x8_t src) {
  return vld1q_lane_bf16(ptr, src, 7);
}
// CHECK-LABEL: test_vld1q_lane_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
// CHECK64-NEXT: ret <8 x bfloat> %vld1_lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
// CHECK32-NEXT: ret <8 x bfloat> %vld1_lane

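// vld1_dup broadcasts one loaded element to every lane: a scalar load, an
// insertelement into lane 0, and a zero-index shufflevector.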
bfloat16x4_t test_vld1_dup_bf16(bfloat16_t const *ptr) {
  return vld1_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld1_dup_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
// CHECK64-NEXT: %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
// CHECK64-NEXT: ret <4 x bfloat> %lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
// CHECK32-NEXT: %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
// CHECK32-NEXT: ret <4 x bfloat> %lane

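// The vld1_bf16_x2/_x3/_x4 variants load 2, 3, or 4 consecutive
// (non-interleaved) vectors and map directly onto the target ld1xN/vld1xN
// intrinsics.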
bfloat16x4x2_t test_vld1_bf16_x2(bfloat16_t const *ptr) {
  return vld1_bf16_x2(ptr);
}
// CHECK-LABEL: test_vld1_bf16_x2
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat* %ptr)

bfloat16x8x2_t test_vld1q_bf16_x2(bfloat16_t const *ptr) {
  return vld1q_bf16_x2(ptr);
}
// CHECK-LABEL: test_vld1q_bf16_x2
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat* %ptr)

bfloat16x4x3_t test_vld1_bf16_x3(bfloat16_t const *ptr) {
  return vld1_bf16_x3(ptr);
}
// CHECK-LABEL: test_vld1_bf16_x3
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat* %ptr)

bfloat16x8x3_t test_vld1q_bf16_x3(bfloat16_t const *ptr) {
  return vld1q_bf16_x3(ptr);
}
// CHECK-LABEL: test_vld1q_bf16_x3
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat* %ptr)

bfloat16x4x4_t test_vld1_bf16_x4(bfloat16_t const *ptr) {
  return vld1_bf16_x4(ptr);
}
// CHECK-LABEL: test_vld1_bf16_x4
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat* %ptr)

bfloat16x8x4_t test_vld1q_bf16_x4(bfloat16_t const *ptr) {
  return vld1q_bf16_x4(ptr);
}
// CHECK-LABEL: test_vld1q_bf16_x4
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat* %ptr)

bfloat16x8_t test_vld1q_dup_bf16(bfloat16_t const *ptr) {
  return vld1q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld1q_dup_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
// CHECK64-NEXT: %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK64-NEXT: ret <8 x bfloat> %lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
// CHECK32-NEXT: %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK32-NEXT: ret <8 x bfloat> %lane

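// vld2/vld3/vld4 (and their lane variants) are interleaved structure loads:
// element i of each result vector comes from consecutive memory, so they
// lower to the ldN/vldN target intrinsics rather than plain vector loads.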
bfloat16x4x2_t test_vld2_bf16(bfloat16_t const *ptr) {
  return vld2_bf16(ptr);
}
// CHECK-LABEL: test_vld2_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <4 x bfloat>*
// CHECK64-NEXT: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x2_t test_vld2q_bf16(bfloat16_t const *ptr) {
  return vld2q_bf16(ptr);
}
// CHECK-LABEL: test_vld2q_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <8 x bfloat>*
// CHECK64-NEXT: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x2_t test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) {
  return vld2_lane_bf16(ptr, src, 1);
}
// CHECK-LABEL: test_vld2_lane_bf16
// CHECK64: %vld2_lane = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, i8* %0)
// CHECK32: %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)

bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) {
  return vld2q_lane_bf16(ptr, src, 7);
}
// CHECK-LABEL: test_vld2q_lane_bf16
// CHECK64: %vld2_lane = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, i8* %0)
// CHECK32: %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)

bfloat16x4x3_t test_vld3_bf16(bfloat16_t const *ptr) {
  return vld3_bf16(ptr);
}
// CHECK-LABEL: test_vld3_bf16
// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x3_t test_vld3q_bf16(bfloat16_t const *ptr) {
  return vld3q_bf16(ptr);
}
// CHECK-LABEL: test_vld3q_bf16
// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) {
  return vld3_lane_bf16(ptr, src, 1);
}
// CHECK-LABEL: test_vld3_lane_bf16
// CHECK64: %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0)
// CHECK32: %3 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)

bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) {
  return vld3q_lane_bf16(ptr, src, 7);
  // return vld3q_lane_bf16(ptr, src, 8);
}
// CHECK-LABEL: test_vld3q_lane_bf16
// CHECK64: %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0)
// CHECK32: %3 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)

bfloat16x4x4_t test_vld4_bf16(bfloat16_t const *ptr) {
  return vld4_bf16(ptr);
}
// CHECK-LABEL: test_vld4_bf16
// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x4_t test_vld4q_bf16(bfloat16_t const *ptr) {
  return vld4q_bf16(ptr);
}
// CHECK-LABEL: test_vld4q_bf16
// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) {
  return vld4_lane_bf16(ptr, src, 1);
}
// CHECK-LABEL: test_vld4_lane_bf16
// CHECK64: %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0)
// CHECK32: %4 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)

bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) {
  return vld4q_lane_bf16(ptr, src, 7);
}
// CHECK-LABEL: test_vld4q_lane_bf16
// CHECK64: %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0)
// CHECK32: %4 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)

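// vld2_dup/vld3_dup/vld4_dup load one structure and replicate each member to
// all lanes of the corresponding result vector (ldNr on AArch64, vldNdup on
// AArch32).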
bfloat16x4x2_t test_vld2_dup_bf16(bfloat16_t const *ptr) {
  return vld2_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld2_dup_bf16
// CHECK64: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x2_t test_vld2q_dup_bf16(bfloat16_t const *ptr) {
  return vld2q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld2q_dup_bf16
// CHECK64: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x3_t test_vld3_dup_bf16(bfloat16_t const *ptr) {
  return vld3_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld3_dup_bf16
// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x3_t test_vld3q_dup_bf16(bfloat16_t const *ptr) {
  return vld3q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld3q_dup_bf16
// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x4_t test_vld4_dup_bf16(bfloat16_t const *ptr) {
  return vld4_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld4_dup_bf16
// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x4_t test_vld4q_dup_bf16(bfloat16_t const *ptr) {
  return vld4q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld4q_dup_bf16
// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8* %0, i32 2)

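// The store intrinsics mirror the loads. A plain vst1/vst1q should become an
// ordinary vector store on AArch64 and the vst1 intrinsic on AArch32.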
void test_vst1_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
  vst1_bf16(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <4 x bfloat>*
// CHECK64-NEXT: store <4 x bfloat> %val, <4 x bfloat>* %0, align 2
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4bf16(i8* %0, <4 x bfloat> %val, i32 2)

void test_vst1q_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
  vst1q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <8 x bfloat>*
// CHECK64-NEXT: store <8 x bfloat> %val, <8 x bfloat>* %0, align 2
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v8bf16(i8* %0, <8 x bfloat> %val, i32 2)

void test_vst1_lane_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
  vst1_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst1_lane_bf16
// CHECK64: %0 = extractelement <4 x bfloat> %val, i32 1
// CHECK64-NEXT: store bfloat %0, bfloat* %ptr, align 2
// CHECK32: %0 = extractelement <4 x bfloat> %val, i32 1
// CHECK32-NEXT: store bfloat %0, bfloat* %ptr, align 2

void test_vst1q_lane_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
  vst1q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst1q_lane_bf16
// CHECK64: %0 = extractelement <8 x bfloat> %val, i32 7
// CHECK64-NEXT: store bfloat %0, bfloat* %ptr, align 2
// CHECK32: %0 = extractelement <8 x bfloat> %val, i32 7
// CHECK32-NEXT: store bfloat %0, bfloat* %ptr, align 2

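// vst1_bf16_x2/_x3/_x4 store multiple consecutive vectors without
// interleaving, lowering to the st1xN/vst1xN target intrinsics.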
void test_vst1_bf16_x2(bfloat16_t *ptr, bfloat16x4x2_t val) {
  vst1_bf16_x2(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16_x2
// CHECK64: tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1)

void test_vst1q_bf16_x2(bfloat16_t *ptr, bfloat16x8x2_t val) {
  vst1q_bf16_x2(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16_x2
// CHECK64: tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1)

void test_vst1_bf16_x3(bfloat16_t *ptr, bfloat16x4x3_t val) {
  vst1_bf16_x3(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16_x3
// CHECK64: tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2)

void test_vst1q_bf16_x3(bfloat16_t *ptr, bfloat16x8x3_t val) {
  vst1q_bf16_x3(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16_x3
// CHECK64: tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2)

void test_vst1_bf16_x4(bfloat16_t *ptr, bfloat16x4x4_t val) {
  vst1_bf16_x4(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16_x4
// CHECK64: tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3)

void test_vst1q_bf16_x4(bfloat16_t *ptr, bfloat16x8x4_t val) {
  vst1q_bf16_x4(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16_x4
// CHECK64: tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3)

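// vst2/vst3/vst4 (and their lane variants) perform interleaved structure
// stores (stN/vstN), the inverse of the interleaved loads above.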
void test_vst2_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
  vst2_bf16(ptr, val);
}
// CHECK-LABEL: test_vst2_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 2)

void test_vst2q_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
  vst2q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst2q_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 2)

void test_vst2_lane_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
  vst2_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst2_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i64 1, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)

void test_vst2q_lane_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
  vst2q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst2q_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i64 7, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)

void test_vst3_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
  vst3_bf16(ptr, val);
}
// CHECK-LABEL: test_vst3_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 2)

void test_vst3q_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
  vst3q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst3q_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 2)

void test_vst3_lane_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
  vst3_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst3_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i64 1, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)

void test_vst3q_lane_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
  vst3q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst3q_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i64 7, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)

void test_vst4_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
  vst4_bf16(ptr, val);
}
// CHECK-LABEL: test_vst4_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 2)

void test_vst4q_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
  vst4q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst4q_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 2)

void test_vst4_lane_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
  vst4_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst4_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i64 1, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)

void test_vst4q_lane_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
  vst4q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst4q_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i64 7, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)