236
|
1 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
|
252
|
2 // RUN: %clang_cc1 -triple aarch64 -target-feature +neon -target-feature +bf16 \
|
221
|
3 // RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK64
|
252
|
4 // RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
|
221
|
5 // RUN: -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK32
|
|
6
|
|
7 // REQUIRES: arm-registered-target,aarch64-registered-target
|
|
8
|
|
9 #include "arm_neon.h"
|
|
10
|
236
|
11 // CHECK-LABEL: @test_vld1_bf16(
|
|
12 // CHECK-NEXT: entry:
|
252
|
13 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x bfloat>, ptr [[PTR:%.*]], align 2
|
236
|
14 // CHECK-NEXT: ret <4 x bfloat> [[TMP1]]
|
|
15 //
|
221
|
16 bfloat16x4_t test_vld1_bf16(bfloat16_t const *ptr) {
|
|
17 return vld1_bf16(ptr);
|
|
18 }
|
|
19
|
236
|
20 // CHECK-LABEL: @test_vld1q_bf16(
|
|
21 // CHECK-NEXT: entry:
|
252
|
22 // CHECK-NEXT: [[TMP1:%.*]] = load <8 x bfloat>, ptr [[PTR:%.*]], align 2
|
236
|
23 // CHECK-NEXT: ret <8 x bfloat> [[TMP1]]
|
|
24 //
|
221
|
25 bfloat16x8_t test_vld1q_bf16(bfloat16_t const *ptr) {
|
|
26 return vld1q_bf16(ptr);
|
|
27 }
|
|
28
|
236
|
29 // CHECK-LABEL: @test_vld1_lane_bf16(
|
|
30 // CHECK-NEXT: entry:
|
252
|
31 // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2
|
236
|
32 // CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <4 x bfloat> [[SRC:%.*]], bfloat [[TMP0]], i64 0
|
|
33 // CHECK-NEXT: ret <4 x bfloat> [[VLD1_LANE]]
|
|
34 //
|
221
|
35 bfloat16x4_t test_vld1_lane_bf16(bfloat16_t const *ptr, bfloat16x4_t src) {
|
|
36 return vld1_lane_bf16(ptr, src, 0);
|
|
37 }
|
|
38
|
236
|
39 // CHECK-LABEL: @test_vld1q_lane_bf16(
|
|
40 // CHECK-NEXT: entry:
|
252
|
41 // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2
|
236
|
42 // CHECK-NEXT: [[VLD1_LANE:%.*]] = insertelement <8 x bfloat> [[SRC:%.*]], bfloat [[TMP0]], i64 7
|
|
43 // CHECK-NEXT: ret <8 x bfloat> [[VLD1_LANE]]
|
|
44 //
|
221
|
45 bfloat16x8_t test_vld1q_lane_bf16(bfloat16_t const *ptr, bfloat16x8_t src) {
|
|
46 return vld1q_lane_bf16(ptr, src, 7);
|
|
47 }
|
|
48
|
236
|
49 // CHECK-LABEL: @test_vld1_dup_bf16(
|
|
50 // CHECK-NEXT: entry:
|
252
|
51 // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2
|
236
|
52 // CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x bfloat> poison, bfloat [[TMP0]], i64 0
|
|
53 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> poison, <4 x i32> zeroinitializer
|
|
54 // CHECK-NEXT: ret <4 x bfloat> [[LANE]]
|
|
55 //
|
221
|
56 bfloat16x4_t test_vld1_dup_bf16(bfloat16_t const *ptr) {
|
|
57 return vld1_dup_bf16(ptr);
|
|
58 }
|
|
59
|
236
|
60 // CHECK64-LABEL: @test_vld1_bf16_x2(
|
|
61 // CHECK64-NEXT: entry:
|
252
|
62 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
63 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
|
|
64 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
|
|
65 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
|
|
66 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
|
|
67 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
|
|
68 //
|
|
69 // CHECK32-LABEL: @test_vld1_bf16_x2(
|
|
70 // CHECK32-NEXT: entry:
|
252
|
71 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
72 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
|
|
73 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
|
|
74 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32>
|
|
75 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32>
|
|
76 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0
|
|
77 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1
|
|
78 // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]]
|
|
79 //
|
221
|
80 bfloat16x4x2_t test_vld1_bf16_x2(bfloat16_t const *ptr) {
|
|
81 return vld1_bf16_x2(ptr);
|
|
82 }
|
|
83
|
236
|
84 // CHECK64-LABEL: @test_vld1q_bf16_x2(
|
|
85 // CHECK64-NEXT: entry:
|
252
|
86 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
87 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
|
|
88 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
|
|
89 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
|
|
90 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
|
|
91 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
|
|
92 //
|
|
93 // CHECK32-LABEL: @test_vld1q_bf16_x2(
|
|
94 // CHECK32-NEXT: entry:
|
252
|
95 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
96 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
|
|
97 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
|
|
98 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32>
|
|
99 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32>
|
|
100 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0
|
|
101 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1
|
|
102 // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]]
|
|
103 //
|
221
|
104 bfloat16x8x2_t test_vld1q_bf16_x2(bfloat16_t const *ptr) {
|
|
105 return vld1q_bf16_x2(ptr);
|
|
106 }
|
|
107
|
236
|
108 // CHECK64-LABEL: @test_vld1_bf16_x3(
|
|
109 // CHECK64-NEXT: entry:
|
252
|
110 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
111 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
|
|
112 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
|
|
113 // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2
|
|
114 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
|
|
115 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
|
|
116 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2
|
|
117 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]]
|
|
118 //
|
|
119 // CHECK32-LABEL: @test_vld1_bf16_x3(
|
|
120 // CHECK32-NEXT: entry:
|
252
|
121 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
122 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
|
|
123 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
|
|
124 // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2
|
|
125 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32>
|
|
126 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32>
|
|
127 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <2 x i32>
|
|
128 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0
|
|
129 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1
|
|
130 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP2]], 2
|
|
131 // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]]
|
|
132 //
|
221
|
133 bfloat16x4x3_t test_vld1_bf16_x3(bfloat16_t const *ptr) {
|
|
134 return vld1_bf16_x3(ptr);
|
|
135 }
|
|
136
|
236
|
137 // CHECK64-LABEL: @test_vld1q_bf16_x3(
|
|
138 // CHECK64-NEXT: entry:
|
252
|
139 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
140 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
|
|
141 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
|
|
142 // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2
|
|
143 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
|
|
144 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
|
|
145 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2
|
|
146 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
|
|
147 //
|
|
148 // CHECK32-LABEL: @test_vld1q_bf16_x3(
|
|
149 // CHECK32-NEXT: entry:
|
252
|
150 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
151 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
|
|
152 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
|
|
153 // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2
|
|
154 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32>
|
|
155 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32>
|
|
156 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <4 x i32>
|
|
157 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0
|
|
158 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1
|
|
159 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP2]], 2
|
|
160 // CHECK32-NEXT: ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]]
|
|
161 //
|
221
|
162 bfloat16x8x3_t test_vld1q_bf16_x3(bfloat16_t const *ptr) {
|
|
163 return vld1q_bf16_x3(ptr);
|
|
164 }
|
|
165
|
236
|
166 // CHECK64-LABEL: @test_vld1_bf16_x4(
|
|
167 // CHECK64-NEXT: entry:
|
252
|
168 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
169 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
|
|
170 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
|
|
171 // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2
|
|
172 // CHECK64-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 3
|
|
173 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
|
|
174 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
|
|
175 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2
|
|
176 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD1XN_FCA_3_EXTRACT]], 0, 3
|
|
177 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
|
|
178 //
|
|
179 // CHECK32-LABEL: @test_vld1_bf16_x4(
|
|
180 // CHECK32-NEXT: entry:
|
252
|
181 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
182 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 0
|
|
183 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 1
|
|
184 // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 2
|
|
185 // CHECK32-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD1XN]], 3
|
|
186 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <2 x i32>
|
|
187 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <2 x i32>
|
|
188 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <2 x i32>
|
|
189 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD1XN_FCA_3_EXTRACT]] to <2 x i32>
|
|
190 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP0]], 0
|
|
191 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP1]], 1
|
|
192 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP2]], 2
|
|
193 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP3]], 3
|
|
194 // CHECK32-NEXT: ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]]
|
|
195 //
|
221
|
196 bfloat16x4x4_t test_vld1_bf16_x4(bfloat16_t const *ptr) {
|
|
197 return vld1_bf16_x4(ptr);
|
|
198 }
|
|
199
|
236
|
200 // CHECK64-LABEL: @test_vld1q_bf16_x4(
|
|
201 // CHECK64-NEXT: entry:
|
252
|
202 // CHECK64-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
203 // CHECK64-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
|
|
204 // CHECK64-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
|
|
205 // CHECK64-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2
|
|
206 // CHECK64-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 3
|
|
207 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]], 0, 0
|
|
208 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]], 0, 1
|
|
209 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]], 0, 2
|
|
210 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD1XN_FCA_3_EXTRACT]], 0, 3
|
|
211 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]]
|
|
212 //
|
|
213 // CHECK32-LABEL: @test_vld1q_bf16_x4(
|
|
214 // CHECK32-NEXT: entry:
|
252
|
215 // CHECK32-NEXT: [[VLD1XN:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
216 // CHECK32-NEXT: [[VLD1XN_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 0
|
|
217 // CHECK32-NEXT: [[VLD1XN_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 1
|
|
218 // CHECK32-NEXT: [[VLD1XN_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 2
|
|
219 // CHECK32-NEXT: [[VLD1XN_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD1XN]], 3
|
|
220 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_0_EXTRACT]] to <4 x i32>
|
|
221 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_1_EXTRACT]] to <4 x i32>
|
|
222 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_2_EXTRACT]] to <4 x i32>
|
|
223 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD1XN_FCA_3_EXTRACT]] to <4 x i32>
|
|
224 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP0]], 0
|
|
225 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP1]], 1
|
|
226 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP2]], 2
|
|
227 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP3]], 3
|
|
228 // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]]
|
|
229 //
|
221
|
230 bfloat16x8x4_t test_vld1q_bf16_x4(bfloat16_t const *ptr) {
|
|
231 return vld1q_bf16_x4(ptr);
|
|
232 }
|
|
233
|
236
|
234 // CHECK-LABEL: @test_vld1q_dup_bf16(
|
|
235 // CHECK-NEXT: entry:
|
252
|
236 // CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[PTR:%.*]], align 2
|
236
|
237 // CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x bfloat> poison, bfloat [[TMP0]], i64 0
|
|
238 // CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x bfloat> [[TMP1]], <8 x bfloat> poison, <8 x i32> zeroinitializer
|
|
239 // CHECK-NEXT: ret <8 x bfloat> [[LANE]]
|
|
240 //
|
221
|
241 bfloat16x8_t test_vld1q_dup_bf16(bfloat16_t const *ptr) {
|
|
242 return vld1q_dup_bf16(ptr);
|
|
243 }
|
|
244
|
236
|
245 // CHECK64-LABEL: @test_vld2_bf16(
|
|
246 // CHECK64-NEXT: entry:
|
252
|
247 // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
248 // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 0
|
|
249 // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 1
|
|
250 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0
|
|
251 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1
|
|
252 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
|
|
253 //
|
|
254 // CHECK32-LABEL: @test_vld2_bf16(
|
|
255 // CHECK32-NEXT: entry:
|
252
|
256 // CHECK32-NEXT: [[VLD2_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
257 // CHECK32-NEXT: [[VLD2_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_V]], 0
|
|
258 // CHECK32-NEXT: [[VLD2_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_V]], 1
|
|
259 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD2_V_FCA_0_EXTRACT]] to <2 x i32>
|
|
260 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD2_V_FCA_1_EXTRACT]] to <2 x i32>
|
|
261 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
|
|
262 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
|
|
263 // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]]
|
|
264 //
|
221
|
265 bfloat16x4x2_t test_vld2_bf16(bfloat16_t const *ptr) {
|
|
266 return vld2_bf16(ptr);
|
|
267 }
|
|
268
|
236
|
269 // CHECK64-LABEL: @test_vld2q_bf16(
|
|
270 // CHECK64-NEXT: entry:
|
252
|
271 // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
272 // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 0
|
|
273 // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 1
|
|
274 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0
|
|
275 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1
|
|
276 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
|
|
277 //
|
|
278 // CHECK32-LABEL: @test_vld2q_bf16(
|
|
279 // CHECK32-NEXT: entry:
|
252
|
280 // CHECK32-NEXT: [[VLD2Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
281 // CHECK32-NEXT: [[VLD2Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_V]], 0
|
|
282 // CHECK32-NEXT: [[VLD2Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_V]], 1
|
|
283 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD2Q_V_FCA_0_EXTRACT]] to <4 x i32>
|
|
284 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD2Q_V_FCA_1_EXTRACT]] to <4 x i32>
|
|
285 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
|
|
286 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
|
|
287 // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]]
|
|
288 //
|
221
|
289 bfloat16x8x2_t test_vld2q_bf16(bfloat16_t const *ptr) {
|
|
290 return vld2q_bf16(ptr);
|
|
291 }
|
|
292
|
236
|
293 // CHECK64-LABEL: @test_vld2_lane_bf16(
|
|
294 // CHECK64-NEXT: entry:
|
|
295 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0
|
|
296 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[SRC_COERCE]], 1
|
252
|
297 // CHECK64-NEXT: [[VLD2_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], i64 1, ptr [[PTR:%.*]])
|
236
|
298 // CHECK64-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE]], 0
|
|
299 // CHECK64-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE]], 1
|
|
300 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
|
|
301 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
|
|
302 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
|
|
303 //
|
|
304 // CHECK32-LABEL: @test_vld2_lane_bf16(
|
|
305 // CHECK32-NEXT: entry:
|
|
306 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[SRC_COERCE:%.*]], 0
|
|
307 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[SRC_COERCE]], 1
|
|
308 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
309 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
252
|
310 // CHECK32-NEXT: [[VLD2_LANE_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 1, i32 2)
|
236
|
311 // CHECK32-NEXT: [[VLD2_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE_V]], 0
|
|
312 // CHECK32-NEXT: [[VLD2_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_LANE_V]], 1
|
|
313 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD2_LANE_V_FCA_0_EXTRACT]] to <2 x i32>
|
|
314 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD2_LANE_V_FCA_1_EXTRACT]] to <2 x i32>
|
|
315 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP3]], 0
|
|
316 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP4]], 1
|
|
317 // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]]
|
|
318 //
|
221
|
319 bfloat16x4x2_t test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) {
|
|
320 return vld2_lane_bf16(ptr, src, 1);
|
|
321 }
|
|
322
|
236
|
323 // CHECK64-LABEL: @test_vld2q_lane_bf16(
|
|
324 // CHECK64-NEXT: entry:
|
|
325 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0
|
|
326 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[SRC_COERCE]], 1
|
252
|
327 // CHECK64-NEXT: [[VLD2_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[PTR:%.*]])
|
236
|
328 // CHECK64-NEXT: [[VLD2_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2_LANE]], 0
|
|
329 // CHECK64-NEXT: [[VLD2_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2_LANE]], 1
|
|
330 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_LANE_FCA_0_EXTRACT]], 0, 0
|
|
331 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_LANE_FCA_1_EXTRACT]], 0, 1
|
|
332 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
|
|
333 //
|
|
334 // CHECK32-LABEL: @test_vld2q_lane_bf16(
|
|
335 // CHECK32-NEXT: entry:
|
|
336 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[SRC_COERCE:%.*]], 0
|
|
337 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[SRC_COERCE]], 1
|
|
338 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
339 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
252
|
340 // CHECK32-NEXT: [[VLD2Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 7, i32 2)
|
236
|
341 // CHECK32-NEXT: [[VLD2Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_LANE_V]], 0
|
|
342 // CHECK32-NEXT: [[VLD2Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_LANE_V]], 1
|
|
343 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD2Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32>
|
|
344 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD2Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32>
|
|
345 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP3]], 0
|
|
346 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP4]], 1
|
|
347 // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]]
|
|
348 //
|
221
|
349 bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) {
|
|
350 return vld2q_lane_bf16(ptr, src, 7);
|
|
351 }
|
|
352
|
236
|
353 // CHECK64-LABEL: @test_vld3_bf16(
|
|
354 // CHECK64-NEXT: entry:
|
252
|
355 // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
356 // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 0
|
|
357 // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 1
|
|
358 // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 2
|
|
359 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0
|
|
360 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1
|
|
361 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2
|
|
362 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]]
|
|
363 //
|
|
364 // CHECK32-LABEL: @test_vld3_bf16(
|
|
365 // CHECK32-NEXT: entry:
|
252
|
366 // CHECK32-NEXT: [[VLD3_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
367 // CHECK32-NEXT: [[VLD3_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 0
|
|
368 // CHECK32-NEXT: [[VLD3_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 1
|
|
369 // CHECK32-NEXT: [[VLD3_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_V]], 2
|
|
370 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_0_EXTRACT]] to <2 x i32>
|
|
371 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_1_EXTRACT]] to <2 x i32>
|
|
372 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD3_V_FCA_2_EXTRACT]] to <2 x i32>
|
|
373 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
|
|
374 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
|
|
375 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2
|
|
376 // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]]
|
|
377 //
|
221
|
378 bfloat16x4x3_t test_vld3_bf16(bfloat16_t const *ptr) {
|
|
379 return vld3_bf16(ptr);
|
|
380 }
|
|
381
|
236
|
382 // CHECK64-LABEL: @test_vld3q_bf16(
|
|
383 // CHECK64-NEXT: entry:
|
252
|
384 // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
385 // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 0
|
|
386 // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 1
|
|
387 // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 2
|
|
388 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0
|
|
389 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1
|
|
390 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2
|
|
391 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
|
|
392 //
|
|
393 // CHECK32-LABEL: @test_vld3q_bf16(
|
|
394 // CHECK32-NEXT: entry:
|
252
|
395 // CHECK32-NEXT: [[VLD3Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
396 // CHECK32-NEXT: [[VLD3Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 0
|
|
397 // CHECK32-NEXT: [[VLD3Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 1
|
|
398 // CHECK32-NEXT: [[VLD3Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_V]], 2
|
|
399 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_0_EXTRACT]] to <4 x i32>
|
|
400 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_1_EXTRACT]] to <4 x i32>
|
|
401 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD3Q_V_FCA_2_EXTRACT]] to <4 x i32>
|
|
402 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
|
|
403 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
|
|
404 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2
|
|
405 // CHECK32-NEXT: ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]]
|
|
406 //
|
221
|
407 bfloat16x8x3_t test_vld3q_bf16(bfloat16_t const *ptr) {
|
|
408 return vld3q_bf16(ptr);
|
|
409 }
|
|
410
|
236
|
411 // CHECK64-LABEL: @test_vld3_lane_bf16(
|
|
412 // CHECK64-NEXT: entry:
|
|
413 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0
|
|
414 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE]], 1
|
|
415 // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[SRC_COERCE]], 2
|
252
|
416 // CHECK64-NEXT: [[VLD3_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], i64 1, ptr [[PTR:%.*]])
|
236
|
417 // CHECK64-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 0
|
|
418 // CHECK64-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 1
|
|
419 // CHECK64-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE]], 2
|
|
420 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
|
|
421 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
|
|
422 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
|
|
423 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]]
|
|
424 //
|
|
425 // CHECK32-LABEL: @test_vld3_lane_bf16(
|
|
426 // CHECK32-NEXT: entry:
|
|
427 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE:%.*]], 0
|
|
428 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE]], 1
|
|
429 // CHECK32-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[SRC_COERCE]], 2
|
|
430 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
431 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
|
432 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
|
252
|
433 // CHECK32-NEXT: [[VLD3_LANE_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 1, i32 2)
|
236
|
434 // CHECK32-NEXT: [[VLD3_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 0
|
|
435 // CHECK32-NEXT: [[VLD3_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 1
|
|
436 // CHECK32-NEXT: [[VLD3_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_LANE_V]], 2
|
|
437 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_0_EXTRACT]] to <2 x i32>
|
|
438 // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_1_EXTRACT]] to <2 x i32>
|
|
439 // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[VLD3_LANE_V_FCA_2_EXTRACT]] to <2 x i32>
|
|
440 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP4]], 0
|
|
441 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP5]], 1
|
|
442 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP6]], 2
|
|
443 // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]]
|
|
444 //
|
221
|
445 bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) {
|
|
446 return vld3_lane_bf16(ptr, src, 1);
|
|
447 }
|
|
448
|
236
|
449 // CHECK64-LABEL: @test_vld3q_lane_bf16(
|
|
450 // CHECK64-NEXT: entry:
|
|
451 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0
|
|
452 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[SRC_COERCE]], 1
|
|
453 // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[SRC_COERCE]], 2
|
252
|
454 // CHECK64-NEXT: [[VLD3_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR:%.*]])
|
236
|
455 // CHECK64-NEXT: [[VLD3_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 0
|
|
456 // CHECK64-NEXT: [[VLD3_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 1
|
|
457 // CHECK64-NEXT: [[VLD3_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3_LANE]], 2
|
|
458 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_LANE_FCA_0_EXTRACT]], 0, 0
|
|
459 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_LANE_FCA_1_EXTRACT]], 0, 1
|
|
460 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_LANE_FCA_2_EXTRACT]], 0, 2
|
|
461 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
|
|
462 //
|
|
463 // CHECK32-LABEL: @test_vld3q_lane_bf16(
|
|
464 // CHECK32-NEXT: entry:
|
|
465 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE:%.*]], 0
|
|
466 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE]], 1
|
|
467 // CHECK32-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[SRC_COERCE]], 2
|
|
468 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
469 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
|
470 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
|
252
|
471 // CHECK32-NEXT: [[VLD3Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 7, i32 2)
|
236
|
472 // CHECK32-NEXT: [[VLD3Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 0
|
|
473 // CHECK32-NEXT: [[VLD3Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 1
|
|
474 // CHECK32-NEXT: [[VLD3Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_LANE_V]], 2
|
|
475 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32>
|
|
476 // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32>
|
|
477 // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[VLD3Q_LANE_V_FCA_2_EXTRACT]] to <4 x i32>
|
|
478 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP4]], 0
|
|
479 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP5]], 1
|
|
480 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP6]], 2
|
|
481 // CHECK32-NEXT: ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]]
|
|
482 //
|
221
|
483 bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) {
|
|
484 return vld3q_lane_bf16(ptr, src, 7);
|
|
485 // return vld3q_lane_bf16(ptr, src, 8);
|
|
486 }
|
|
487
|
236
|
488 // CHECK64-LABEL: @test_vld4_bf16(
|
|
489 // CHECK64-NEXT: entry:
|
252
|
490 // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
491 // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 0
|
|
492 // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 1
|
|
493 // CHECK64-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 2
|
|
494 // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 3
|
|
495 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0
|
|
496 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1
|
|
497 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2
|
|
498 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3
|
|
499 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
|
|
500 //
|
|
501 // CHECK32-LABEL: @test_vld4_bf16(
|
|
502 // CHECK32-NEXT: entry:
|
252
|
503 // CHECK32-NEXT: [[VLD4_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
504 // CHECK32-NEXT: [[VLD4_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 0
|
|
505 // CHECK32-NEXT: [[VLD4_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 1
|
|
506 // CHECK32-NEXT: [[VLD4_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 2
|
|
507 // CHECK32-NEXT: [[VLD4_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_V]], 3
|
|
508 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_0_EXTRACT]] to <2 x i32>
|
|
509 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_1_EXTRACT]] to <2 x i32>
|
|
510 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_2_EXTRACT]] to <2 x i32>
|
|
511 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD4_V_FCA_3_EXTRACT]] to <2 x i32>
|
|
512 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
|
|
513 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
|
|
514 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2
|
|
515 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP4]], 3
|
|
516 // CHECK32-NEXT: ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]]
|
|
517 //
|
221
|
518 bfloat16x4x4_t test_vld4_bf16(bfloat16_t const *ptr) {
|
|
519 return vld4_bf16(ptr);
|
|
520 }
|
|
521
|
236
|
522 // CHECK64-LABEL: @test_vld4q_bf16(
|
|
523 // CHECK64-NEXT: entry:
|
252
|
524 // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
525 // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 0
|
|
526 // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 1
|
|
527 // CHECK64-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 2
|
|
528 // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 3
|
|
529 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0
|
|
530 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1
|
|
531 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2
|
|
532 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3
|
|
533 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]]
|
|
534 //
|
|
535 // CHECK32-LABEL: @test_vld4q_bf16(
|
|
536 // CHECK32-NEXT: entry:
|
252
|
537 // CHECK32-NEXT: [[VLD4Q_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
538 // CHECK32-NEXT: [[VLD4Q_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 0
|
|
539 // CHECK32-NEXT: [[VLD4Q_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 1
|
|
540 // CHECK32-NEXT: [[VLD4Q_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 2
|
|
541 // CHECK32-NEXT: [[VLD4Q_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_V]], 3
|
|
542 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_0_EXTRACT]] to <4 x i32>
|
|
543 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_1_EXTRACT]] to <4 x i32>
|
|
544 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_2_EXTRACT]] to <4 x i32>
|
|
545 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD4Q_V_FCA_3_EXTRACT]] to <4 x i32>
|
|
546 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
|
|
547 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
|
|
548 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2
|
|
549 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP4]], 3
|
|
550 // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]]
|
|
551 //
|
221
|
552 bfloat16x8x4_t test_vld4q_bf16(bfloat16_t const *ptr) {
|
|
553 return vld4q_bf16(ptr);
|
|
554 }
|
|
555
|
236
|
556 // CHECK64-LABEL: @test_vld4_lane_bf16(
|
|
557 // CHECK64-NEXT: entry:
|
|
558 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE:%.*]], 0
|
|
559 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 1
|
|
560 // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 2
|
|
561 // CHECK64-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[SRC_COERCE]], 3
|
252
|
562 // CHECK64-NEXT: [[VLD4_LANE:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0(<4 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[SRC_COERCE_FCA_3_EXTRACT]], i64 1, ptr [[PTR:%.*]])
|
236
|
563 // CHECK64-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 0
|
|
564 // CHECK64-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 1
|
|
565 // CHECK64-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 2
|
|
566 // CHECK64-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE]], 3
|
|
567 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
|
|
568 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
|
|
569 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
|
|
570 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
|
|
571 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
|
|
572 //
|
|
573 // CHECK32-LABEL: @test_vld4_lane_bf16(
|
|
574 // CHECK32-NEXT: entry:
|
|
575 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE:%.*]], 0
|
|
576 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 1
|
|
577 // CHECK32-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 2
|
|
578 // CHECK32-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[SRC_COERCE]], 3
|
|
579 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
580 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
|
581 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
|
|
582 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SRC_COERCE_FCA_3_EXTRACT]] to <4 x bfloat>
|
252
|
583 // CHECK32-NEXT: [[VLD4_LANE_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 1, i32 2)
|
236
|
584 // CHECK32-NEXT: [[VLD4_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 0
|
|
585 // CHECK32-NEXT: [[VLD4_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 1
|
|
586 // CHECK32-NEXT: [[VLD4_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 2
|
|
587 // CHECK32-NEXT: [[VLD4_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_LANE_V]], 3
|
|
588 // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_0_EXTRACT]] to <2 x i32>
|
|
589 // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_1_EXTRACT]] to <2 x i32>
|
|
590 // CHECK32-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_2_EXTRACT]] to <2 x i32>
|
|
591 // CHECK32-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[VLD4_LANE_V_FCA_3_EXTRACT]] to <2 x i32>
|
|
592 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP5]], 0
|
|
593 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP6]], 1
|
|
594 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP7]], 2
|
|
595 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP8]], 3
|
|
596 // CHECK32-NEXT: ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]]
|
|
597 //
|
221
|
598 bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) {
|
|
599 return vld4_lane_bf16(ptr, src, 1);
|
|
600 }
|
|
601
|
236
|
602 // CHECK64-LABEL: @test_vld4q_lane_bf16(
|
|
603 // CHECK64-NEXT: entry:
|
|
604 // CHECK64-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE:%.*]], 0
|
|
605 // CHECK64-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 1
|
|
606 // CHECK64-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 2
|
|
607 // CHECK64-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[SRC_COERCE]], 3
|
252
|
608 // CHECK64-NEXT: [[VLD4_LANE:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0(<8 x bfloat> [[SRC_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[SRC_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[PTR:%.*]])
|
236
|
609 // CHECK64-NEXT: [[VLD4_LANE_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 0
|
|
610 // CHECK64-NEXT: [[VLD4_LANE_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 1
|
|
611 // CHECK64-NEXT: [[VLD4_LANE_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 2
|
|
612 // CHECK64-NEXT: [[VLD4_LANE_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4_LANE]], 3
|
|
613 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_LANE_FCA_0_EXTRACT]], 0, 0
|
|
614 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_1_EXTRACT]], 0, 1
|
|
615 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_2_EXTRACT]], 0, 2
|
|
616 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_LANE_FCA_3_EXTRACT]], 0, 3
|
|
617 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]]
|
|
618 //
|
|
619 // CHECK32-LABEL: @test_vld4q_lane_bf16(
|
|
620 // CHECK32-NEXT: entry:
|
|
621 // CHECK32-NEXT: [[SRC_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE:%.*]], 0
|
|
622 // CHECK32-NEXT: [[SRC_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 1
|
|
623 // CHECK32-NEXT: [[SRC_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 2
|
|
624 // CHECK32-NEXT: [[SRC_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[SRC_COERCE]], 3
|
|
625 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
626 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
|
627 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
|
|
628 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[SRC_COERCE_FCA_3_EXTRACT]] to <8 x bfloat>
|
252
|
629 // CHECK32-NEXT: [[VLD4Q_LANE_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 7, i32 2)
|
236
|
630 // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 0
|
|
631 // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 1
|
|
632 // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 2
|
|
633 // CHECK32-NEXT: [[VLD4Q_LANE_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_LANE_V]], 3
|
|
634 // CHECK32-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_0_EXTRACT]] to <4 x i32>
|
|
635 // CHECK32-NEXT: [[TMP6:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_1_EXTRACT]] to <4 x i32>
|
|
636 // CHECK32-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_2_EXTRACT]] to <4 x i32>
|
|
637 // CHECK32-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[VLD4Q_LANE_V_FCA_3_EXTRACT]] to <4 x i32>
|
|
638 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP5]], 0
|
|
639 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP6]], 1
|
|
640 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP7]], 2
|
|
641 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP8]], 3
|
|
642 // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]]
|
|
643 //
|
221
|
644 bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) {
|
|
645 return vld4q_lane_bf16(ptr, src, 7);
|
|
646 }
|
|
647
|
236
|
648 // CHECK64-LABEL: @test_vld2_dup_bf16(
|
|
649 // CHECK64-NEXT: entry:
|
252
|
650 // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
651 // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 0
|
|
652 // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2]], 1
|
|
653 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T:%.*]] poison, <4 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0
|
|
654 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1
|
|
655 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X2_T]] [[DOTFCA_0_1_INSERT]]
|
|
656 //
|
|
657 // CHECK32-LABEL: @test_vld2_dup_bf16(
|
|
658 // CHECK32-NEXT: entry:
|
252
|
659 // CHECK32-NEXT: [[VLD2_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
660 // CHECK32-NEXT: [[VLD2_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_DUP_V]], 0
|
|
661 // CHECK32-NEXT: [[VLD2_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat> } [[VLD2_DUP_V]], 1
|
|
662 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD2_DUP_V_FCA_0_EXTRACT]] to <2 x i32>
|
|
663 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD2_DUP_V_FCA_1_EXTRACT]] to <2 x i32>
|
|
664 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
|
|
665 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
|
|
666 // CHECK32-NEXT: ret [2 x <2 x i32>] [[DOTFCA_1_INSERT]]
|
|
667 //
|
221
|
668 bfloat16x4x2_t test_vld2_dup_bf16(bfloat16_t const *ptr) {
|
|
669 return vld2_dup_bf16(ptr);
|
|
670 }
|
|
671
|
236
|
672 // CHECK64-LABEL: @test_vld2q_dup_bf16(
|
|
673 // CHECK64-NEXT: entry:
|
252
|
674 // CHECK64-NEXT: [[VLD2:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
675 // CHECK64-NEXT: [[VLD2_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 0
|
|
676 // CHECK64-NEXT: [[VLD2_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2]], 1
|
|
677 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T:%.*]] poison, <8 x bfloat> [[VLD2_FCA_0_EXTRACT]], 0, 0
|
|
678 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD2_FCA_1_EXTRACT]], 0, 1
|
|
679 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X2_T]] [[DOTFCA_0_1_INSERT]]
|
|
680 //
|
|
681 // CHECK32-LABEL: @test_vld2q_dup_bf16(
|
|
682 // CHECK32-NEXT: entry:
|
252
|
683 // CHECK32-NEXT: [[VLD2Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
684 // CHECK32-NEXT: [[VLD2Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_DUP_V]], 0
|
|
685 // CHECK32-NEXT: [[VLD2Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat> } [[VLD2Q_DUP_V]], 1
|
|
686 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD2Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32>
|
|
687 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD2Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32>
|
|
688 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
|
|
689 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
|
|
690 // CHECK32-NEXT: ret [2 x <4 x i32>] [[DOTFCA_1_INSERT]]
|
|
691 //
|
221
|
692 bfloat16x8x2_t test_vld2q_dup_bf16(bfloat16_t const *ptr) {
|
|
693 return vld2q_dup_bf16(ptr);
|
|
694 }
|
|
695
|
236
|
696 // CHECK64-LABEL: @test_vld3_dup_bf16(
|
|
697 // CHECK64-NEXT: entry:
|
252
|
698 // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
699 // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 0
|
|
700 // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 1
|
|
701 // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3]], 2
|
|
702 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T:%.*]] poison, <4 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0
|
|
703 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1
|
|
704 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2
|
|
705 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X3_T]] [[DOTFCA_0_2_INSERT]]
|
|
706 //
|
|
707 // CHECK32-LABEL: @test_vld3_dup_bf16(
|
|
708 // CHECK32-NEXT: entry:
|
252
|
709 // CHECK32-NEXT: [[VLD3_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
710 // CHECK32-NEXT: [[VLD3_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 0
|
|
711 // CHECK32-NEXT: [[VLD3_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 1
|
|
712 // CHECK32-NEXT: [[VLD3_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD3_DUP_V]], 2
|
|
713 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_0_EXTRACT]] to <2 x i32>
|
|
714 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_1_EXTRACT]] to <2 x i32>
|
|
715 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD3_DUP_V_FCA_2_EXTRACT]] to <2 x i32>
|
|
716 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
|
|
717 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
|
|
718 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2
|
|
719 // CHECK32-NEXT: ret [3 x <2 x i32>] [[DOTFCA_2_INSERT]]
|
|
720 //
|
221
|
721 bfloat16x4x3_t test_vld3_dup_bf16(bfloat16_t const *ptr) {
|
|
722 return vld3_dup_bf16(ptr);
|
|
723 }
|
|
724
|
236
|
725 // CHECK64-LABEL: @test_vld3q_dup_bf16(
|
|
726 // CHECK64-NEXT: entry:
|
252
|
727 // CHECK64-NEXT: [[VLD3:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
728 // CHECK64-NEXT: [[VLD3_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 0
|
|
729 // CHECK64-NEXT: [[VLD3_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 1
|
|
730 // CHECK64-NEXT: [[VLD3_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3]], 2
|
|
731 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T:%.*]] poison, <8 x bfloat> [[VLD3_FCA_0_EXTRACT]], 0, 0
|
|
732 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD3_FCA_1_EXTRACT]], 0, 1
|
|
733 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD3_FCA_2_EXTRACT]], 0, 2
|
|
734 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X3_T]] [[DOTFCA_0_2_INSERT]]
|
|
735 //
|
|
736 // CHECK32-LABEL: @test_vld3q_dup_bf16(
|
|
737 // CHECK32-NEXT: entry:
|
252
|
738 // CHECK32-NEXT: [[VLD3Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
739 // CHECK32-NEXT: [[VLD3Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 0
|
|
740 // CHECK32-NEXT: [[VLD3Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 1
|
|
741 // CHECK32-NEXT: [[VLD3Q_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD3Q_DUP_V]], 2
|
|
742 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32>
|
|
743 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32>
|
|
744 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD3Q_DUP_V_FCA_2_EXTRACT]] to <4 x i32>
|
|
745 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
|
|
746 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
|
|
747 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2
|
|
748 // CHECK32-NEXT: ret [3 x <4 x i32>] [[DOTFCA_2_INSERT]]
|
|
749 //
|
221
|
750 bfloat16x8x3_t test_vld3q_dup_bf16(bfloat16_t const *ptr) {
|
|
751 return vld3q_dup_bf16(ptr);
|
|
752 }
|
|
753
|
236
|
754 // CHECK64-LABEL: @test_vld4_dup_bf16(
|
|
755 // CHECK64-NEXT: entry:
|
252
|
756 // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0(ptr [[PTR:%.*]])
|
236
|
757 // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 0
|
|
758 // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 1
|
|
759 // CHECK64-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 2
|
|
760 // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4]], 3
|
|
761 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T:%.*]] poison, <4 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0
|
|
762 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_0_INSERT]], <4 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1
|
|
763 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_1_INSERT]], <4 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2
|
|
764 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_2_INSERT]], <4 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3
|
|
765 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X4X4_T]] [[DOTFCA_0_3_INSERT]]
|
|
766 //
|
|
767 // CHECK32-LABEL: @test_vld4_dup_bf16(
|
|
768 // CHECK32-NEXT: entry:
|
252
|
769 // CHECK32-NEXT: [[VLD4_DUP_V:%.*]] = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
770 // CHECK32-NEXT: [[VLD4_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 0
|
|
771 // CHECK32-NEXT: [[VLD4_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 1
|
|
772 // CHECK32-NEXT: [[VLD4_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 2
|
|
773 // CHECK32-NEXT: [[VLD4_DUP_V_FCA_3_EXTRACT:%.*]] = extractvalue { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } [[VLD4_DUP_V]], 3
|
|
774 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_0_EXTRACT]] to <2 x i32>
|
|
775 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_1_EXTRACT]] to <2 x i32>
|
|
776 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_2_EXTRACT]] to <2 x i32>
|
|
777 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <4 x bfloat> [[VLD4_DUP_V_FCA_3_EXTRACT]] to <2 x i32>
|
|
778 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <2 x i32>] poison, <2 x i32> [[TMP1]], 0
|
|
779 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_0_INSERT]], <2 x i32> [[TMP2]], 1
|
|
780 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_1_INSERT]], <2 x i32> [[TMP3]], 2
|
|
781 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <2 x i32>] [[DOTFCA_2_INSERT]], <2 x i32> [[TMP4]], 3
|
|
782 // CHECK32-NEXT: ret [4 x <2 x i32>] [[DOTFCA_3_INSERT]]
|
|
783 //
|
221
|
784 bfloat16x4x4_t test_vld4_dup_bf16(bfloat16_t const *ptr) {
|
|
785 return vld4_dup_bf16(ptr);
|
|
786 }
|
|
787
|
236
|
788 // CHECK64-LABEL: @test_vld4q_dup_bf16(
|
|
789 // CHECK64-NEXT: entry:
|
252
|
790 // CHECK64-NEXT: [[VLD4:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0(ptr [[PTR:%.*]])
|
236
|
791 // CHECK64-NEXT: [[VLD4_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 0
|
|
792 // CHECK64-NEXT: [[VLD4_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 1
|
|
793 // CHECK64-NEXT: [[VLD4_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 2
|
|
794 // CHECK64-NEXT: [[VLD4_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4]], 3
|
|
795 // CHECK64-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T:%.*]] poison, <8 x bfloat> [[VLD4_FCA_0_EXTRACT]], 0, 0
|
|
796 // CHECK64-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_0_INSERT]], <8 x bfloat> [[VLD4_FCA_1_EXTRACT]], 0, 1
|
|
797 // CHECK64-NEXT: [[DOTFCA_0_2_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_1_INSERT]], <8 x bfloat> [[VLD4_FCA_2_EXTRACT]], 0, 2
|
|
798 // CHECK64-NEXT: [[DOTFCA_0_3_INSERT:%.*]] = insertvalue [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_2_INSERT]], <8 x bfloat> [[VLD4_FCA_3_EXTRACT]], 0, 3
|
|
799 // CHECK64-NEXT: ret [[STRUCT_BFLOAT16X8X4_T]] [[DOTFCA_0_3_INSERT]]
|
|
800 //
|
|
801 // CHECK32-LABEL: @test_vld4q_dup_bf16(
|
|
802 // CHECK32-NEXT: entry:
|
252
|
803 // CHECK32-NEXT: [[VLD4Q_DUP_V:%.*]] = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0(ptr [[PTR:%.*]], i32 2)
|
236
|
804 // CHECK32-NEXT: [[VLD4Q_DUP_V_FCA_0_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 0
|
|
805 // CHECK32-NEXT: [[VLD4Q_DUP_V_FCA_1_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 1
|
|
806 // CHECK32-NEXT: [[VLD4Q_DUP_V_FCA_2_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 2
|
|
807 // CHECK32-NEXT: [[VLD4Q_DUP_V_FCA_3_EXTRACT:%.*]] = extractvalue { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } [[VLD4Q_DUP_V]], 3
|
|
808 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_0_EXTRACT]] to <4 x i32>
|
|
809 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_1_EXTRACT]] to <4 x i32>
|
|
810 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_2_EXTRACT]] to <4 x i32>
|
|
811 // CHECK32-NEXT: [[TMP4:%.*]] = bitcast <8 x bfloat> [[VLD4Q_DUP_V_FCA_3_EXTRACT]] to <4 x i32>
|
|
812 // CHECK32-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <4 x i32>] poison, <4 x i32> [[TMP1]], 0
|
|
813 // CHECK32-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_0_INSERT]], <4 x i32> [[TMP2]], 1
|
|
814 // CHECK32-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_1_INSERT]], <4 x i32> [[TMP3]], 2
|
|
815 // CHECK32-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <4 x i32>] [[DOTFCA_2_INSERT]], <4 x i32> [[TMP4]], 3
|
|
816 // CHECK32-NEXT: ret [4 x <4 x i32>] [[DOTFCA_3_INSERT]]
|
|
817 //
|
221
|
818 bfloat16x8x4_t test_vld4q_dup_bf16(bfloat16_t const *ptr) {
|
|
819 return vld4q_dup_bf16(ptr);
|
|
820 }
|
|
821
|
236
|
822 // CHECK64-LABEL: @test_vst1_bf16(
|
|
823 // CHECK64-NEXT: entry:
|
252
|
824 // CHECK64-NEXT: store <4 x bfloat> [[VAL:%.*]], ptr [[PTR:%.*]], align 2
|
236
|
825 // CHECK64-NEXT: ret void
|
|
826 //
|
|
827 // CHECK32-LABEL: @test_vst1_bf16(
|
|
828 // CHECK32-NEXT: entry:
|
252
|
829 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[VAL:%.*]], i32 2)
|
236
|
830 // CHECK32-NEXT: ret void
|
|
831 //
|
221
|
832 void test_vst1_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
|
|
833 vst1_bf16(ptr, val);
|
|
834 }
|
|
835
|
236
|
836 // CHECK64-LABEL: @test_vst1q_bf16(
|
|
837 // CHECK64-NEXT: entry:
|
252
|
838 // CHECK64-NEXT: store <8 x bfloat> [[VAL:%.*]], ptr [[PTR:%.*]], align 2
|
236
|
839 // CHECK64-NEXT: ret void
|
|
840 //
|
|
841 // CHECK32-LABEL: @test_vst1q_bf16(
|
|
842 // CHECK32-NEXT: entry:
|
252
|
843 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[VAL:%.*]], i32 2)
|
236
|
844 // CHECK32-NEXT: ret void
|
|
845 //
|
221
|
846 void test_vst1q_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
|
|
847 vst1q_bf16(ptr, val);
|
|
848 }
|
|
849
|
236
|
850 // CHECK-LABEL: @test_vst1_lane_bf16(
|
|
851 // CHECK-NEXT: entry:
|
|
852 // CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x bfloat> [[VAL:%.*]], i64 1
|
252
|
853 // CHECK-NEXT: store bfloat [[TMP0]], ptr [[PTR:%.*]], align 2
|
236
|
854 // CHECK-NEXT: ret void
|
|
855 //
|
221
|
856 void test_vst1_lane_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
|
|
857 vst1_lane_bf16(ptr, val, 1);
|
|
858 }
|
|
859
|
236
|
860 // CHECK-LABEL: @test_vst1q_lane_bf16(
|
|
861 // CHECK-NEXT: entry:
|
|
862 // CHECK-NEXT: [[TMP0:%.*]] = extractelement <8 x bfloat> [[VAL:%.*]], i64 7
|
252
|
863 // CHECK-NEXT: store bfloat [[TMP0]], ptr [[PTR:%.*]], align 2
|
236
|
864 // CHECK-NEXT: ret void
|
|
865 //
|
221
|
866 void test_vst1q_lane_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
|
|
867 vst1q_lane_bf16(ptr, val, 7);
|
|
868 }
|
|
869
|
236
|
870 // CHECK64-LABEL: @test_vst1_bf16_x2(
|
|
871 // CHECK64-NEXT: entry:
|
|
872 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
873 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1
|
252
|
874 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
875 // CHECK64-NEXT: ret void
|
|
876 //
|
|
877 // CHECK32-LABEL: @test_vst1_bf16_x2(
|
|
878 // CHECK32-NEXT: entry:
|
|
879 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE:%.*]], 0
|
|
880 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1
|
|
881 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
882 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
252
|
883 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x2.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]])
|
236
|
884 // CHECK32-NEXT: ret void
|
|
885 //
|
221
|
886 void test_vst1_bf16_x2(bfloat16_t *ptr, bfloat16x4x2_t val) {
|
|
887 vst1_bf16_x2(ptr, val);
|
|
888 }
|
|
889
|
236
|
890 // CHECK64-LABEL: @test_vst1q_bf16_x2(
|
|
891 // CHECK64-NEXT: entry:
|
|
892 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
893 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1
|
252
|
894 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
895 // CHECK64-NEXT: ret void
|
|
896 //
|
|
897 // CHECK32-LABEL: @test_vst1q_bf16_x2(
|
|
898 // CHECK32-NEXT: entry:
|
|
899 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0
|
|
900 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1
|
|
901 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
902 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
252
|
903 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x2.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]])
|
236
|
904 // CHECK32-NEXT: ret void
|
|
905 //
|
221
|
906 void test_vst1q_bf16_x2(bfloat16_t *ptr, bfloat16x8x2_t val) {
|
|
907 vst1q_bf16_x2(ptr, val);
|
|
908 }
|
|
909
|
236
|
910 // CHECK64-LABEL: @test_vst1_bf16_x3(
|
|
911 // CHECK64-NEXT: entry:
|
|
912 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
913 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 1
|
|
914 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2
|
252
|
915 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
916 // CHECK64-NEXT: ret void
|
|
917 //
|
|
918 // CHECK32-LABEL: @test_vst1_bf16_x3(
|
|
919 // CHECK32-NEXT: entry:
|
|
920 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0
|
|
921 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1
|
|
922 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2
|
|
923 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
924 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
|
925 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
|
252
|
926 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x3.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]])
|
236
|
927 // CHECK32-NEXT: ret void
|
|
928 //
|
221
|
929 void test_vst1_bf16_x3(bfloat16_t *ptr, bfloat16x4x3_t val) {
|
|
930 vst1_bf16_x3(ptr, val);
|
|
931 }
|
|
932
|
236
|
933 // CHECK64-LABEL: @test_vst1q_bf16_x3(
|
|
934 // CHECK64-NEXT: entry:
|
|
935 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
936 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1
|
|
937 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2
|
252
|
938 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
939 // CHECK64-NEXT: ret void
|
|
940 //
|
|
941 // CHECK32-LABEL: @test_vst1q_bf16_x3(
|
|
942 // CHECK32-NEXT: entry:
|
|
943 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0
|
|
944 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1
|
|
945 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2
|
|
946 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
947 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
|
948 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
|
252
|
949 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x3.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]])
|
236
|
950 // CHECK32-NEXT: ret void
|
|
951 //
|
221
|
952 void test_vst1q_bf16_x3(bfloat16_t *ptr, bfloat16x8x3_t val) {
|
|
953 vst1q_bf16_x3(ptr, val);
|
|
954 }
|
|
955
|
236
|
956 // CHECK64-LABEL: @test_vst1_bf16_x4(
|
|
957 // CHECK64-NEXT: entry:
|
|
958 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
959 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1
|
|
960 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2
|
|
961 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3
|
252
|
962 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
963 // CHECK64-NEXT: ret void
|
|
964 //
|
|
965 // CHECK32-LABEL: @test_vst1_bf16_x4(
|
|
966 // CHECK32-NEXT: entry:
|
|
967 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0
|
|
968 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1
|
|
969 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2
|
|
970 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3
|
|
971 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
972 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
|
973 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
|
|
974 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat>
|
252
|
975 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x4.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]])
|
236
|
976 // CHECK32-NEXT: ret void
|
|
977 //
|
221
|
978 void test_vst1_bf16_x4(bfloat16_t *ptr, bfloat16x4x4_t val) {
|
|
979 vst1_bf16_x4(ptr, val);
|
|
980 }
|
|
981
|
236
|
982 // CHECK64-LABEL: @test_vst1q_bf16_x4(
|
|
983 // CHECK64-NEXT: entry:
|
|
984 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
985 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1
|
|
986 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2
|
|
987 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3
|
252
|
988 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
989 // CHECK64-NEXT: ret void
|
|
990 //
|
|
991 // CHECK32-LABEL: @test_vst1q_bf16_x4(
|
|
992 // CHECK32-NEXT: entry:
|
|
993 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0
|
|
994 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1
|
|
995 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 2
|
|
996 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3
|
|
997 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
998 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
|
999 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
|
|
1000 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat>
|
252
|
1001 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst1x4.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]])
|
236
|
1002 // CHECK32-NEXT: ret void
|
|
1003 //
|
221
|
1004 void test_vst1q_bf16_x4(bfloat16_t *ptr, bfloat16x8x4_t val) {
|
|
1005 vst1q_bf16_x4(ptr, val);
|
|
1006 }
|
|
1007
|
236
|
1008 // CHECK64-LABEL: @test_vst2_bf16(
|
|
1009 // CHECK64-NEXT: entry:
|
|
1010 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1011 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1
|
252
|
1012 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
1013 // CHECK64-NEXT: ret void
|
|
1014 //
|
|
1015 // CHECK32-LABEL: @test_vst2_bf16(
|
|
1016 // CHECK32-NEXT: entry:
|
|
1017 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1018 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1
|
|
1019 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
1020 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
252
|
1021 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 2)
|
236
|
1022 // CHECK32-NEXT: ret void
|
|
1023 //
|
221
|
1024 void test_vst2_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
|
|
1025 vst2_bf16(ptr, val);
|
|
1026 }
|
|
1027
|
236
|
1028 // CHECK64-LABEL: @test_vst2q_bf16(
|
|
1029 // CHECK64-NEXT: entry:
|
|
1030 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1031 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1
|
252
|
1032 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
1033 // CHECK64-NEXT: ret void
|
|
1034 //
|
|
1035 // CHECK32-LABEL: @test_vst2q_bf16(
|
|
1036 // CHECK32-NEXT: entry:
|
|
1037 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1038 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1
|
|
1039 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
1040 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
252
|
1041 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 2)
|
236
|
1042 // CHECK32-NEXT: ret void
|
|
1043 //
|
221
|
1044 void test_vst2q_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
|
|
1045 vst2q_bf16(ptr, val);
|
|
1046 }
|
|
1047
|
236
|
1048 // CHECK64-LABEL: @test_vst2_lane_bf16(
|
|
1049 // CHECK64-NEXT: entry:
|
|
1050 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1051 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x bfloat>] [[VAL_COERCE]], 1
|
252
|
1052 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], i64 1, ptr [[PTR:%.*]])
|
236
|
1053 // CHECK64-NEXT: ret void
|
|
1054 //
|
|
1055 // CHECK32-LABEL: @test_vst2_lane_bf16(
|
|
1056 // CHECK32-NEXT: entry:
|
|
1057 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1058 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <2 x i32>] [[VAL_COERCE]], 1
|
|
1059 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
1060 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
252
|
1061 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], i32 1, i32 2)
|
236
|
1062 // CHECK32-NEXT: ret void
|
|
1063 //
|
221
|
1064 void test_vst2_lane_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
|
|
1065 vst2_lane_bf16(ptr, val, 1);
|
|
1066 }
|
|
1067
|
236
|
1068 // CHECK64-LABEL: @test_vst2q_lane_bf16(
|
|
1069 // CHECK64-NEXT: entry:
|
|
1070 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1071 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x bfloat>] [[VAL_COERCE]], 1
|
252
|
1072 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], i64 7, ptr [[PTR:%.*]])
|
236
|
1073 // CHECK64-NEXT: ret void
|
|
1074 //
|
|
1075 // CHECK32-LABEL: @test_vst2q_lane_bf16(
|
|
1076 // CHECK32-NEXT: entry:
|
|
1077 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1078 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <4 x i32>] [[VAL_COERCE]], 1
|
|
1079 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
1080 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
252
|
1081 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst2lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], i32 7, i32 2)
|
236
|
1082 // CHECK32-NEXT: ret void
|
|
1083 //
|
221
|
1084 void test_vst2q_lane_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
|
|
1085 vst2q_lane_bf16(ptr, val, 7);
|
|
1086 }
|
|
1087
|
236
|
1088 // CHECK64-LABEL: @test_vst3_bf16(
|
|
1089 // CHECK64-NEXT: entry:
|
|
1090 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1091 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 1
|
|
1092 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2
|
252
|
1093 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
1094 // CHECK64-NEXT: ret void
|
|
1095 //
|
|
1096 // CHECK32-LABEL: @test_vst3_bf16(
|
|
1097 // CHECK32-NEXT: entry:
|
|
1098 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1099 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1
|
|
1100 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2
|
|
1101 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
1102 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
|
1103 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
|
252
|
1104 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 2)
|
236
|
1105 // CHECK32-NEXT: ret void
|
|
1106 //
|
221
|
1107 void test_vst3_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
|
|
1108 vst3_bf16(ptr, val);
|
|
1109 }
|
|
1110
|
236
|
1111 // CHECK64-LABEL: @test_vst3q_bf16(
|
|
1112 // CHECK64-NEXT: entry:
|
|
1113 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1114 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1
|
|
1115 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2
|
252
|
1116 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
1117 // CHECK64-NEXT: ret void
|
|
1118 //
|
|
1119 // CHECK32-LABEL: @test_vst3q_bf16(
|
|
1120 // CHECK32-NEXT: entry:
|
|
1121 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1122 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1
|
|
1123 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2
|
|
1124 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
1125 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
|
1126 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
|
252
|
1127 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 2)
|
236
|
1128 // CHECK32-NEXT: ret void
|
|
1129 //
|
221
|
1130 void test_vst3q_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
|
|
1131 vst3q_bf16(ptr, val);
|
|
1132 }
|
|
1133
|
236
|
1134 // CHECK64-LABEL: @test_vst3_lane_bf16(
|
|
1135 // CHECK64-NEXT: entry:
|
|
1136 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1137 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 1
|
|
1138 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x bfloat>] [[VAL_COERCE]], 2
|
252
|
1139 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], i64 1, ptr [[PTR:%.*]])
|
236
|
1140 // CHECK64-NEXT: ret void
|
|
1141 //
|
|
1142 // CHECK32-LABEL: @test_vst3_lane_bf16(
|
|
1143 // CHECK32-NEXT: entry:
|
|
1144 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1145 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 1
|
|
1146 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <2 x i32>] [[VAL_COERCE]], 2
|
|
1147 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
1148 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
|
1149 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
|
252
|
1150 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], i32 1, i32 2)
|
236
|
1151 // CHECK32-NEXT: ret void
|
|
1152 //
|
221
|
1153 void test_vst3_lane_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
|
|
1154 vst3_lane_bf16(ptr, val, 1);
|
|
1155 }
|
|
1156
|
236
|
1157 // CHECK64-LABEL: @test_vst3q_lane_bf16(
|
|
1158 // CHECK64-NEXT: entry:
|
|
1159 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1160 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 1
|
|
1161 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x bfloat>] [[VAL_COERCE]], 2
|
252
|
1162 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], i64 7, ptr [[PTR:%.*]])
|
236
|
1163 // CHECK64-NEXT: ret void
|
|
1164 //
|
|
1165 // CHECK32-LABEL: @test_vst3q_lane_bf16(
|
|
1166 // CHECK32-NEXT: entry:
|
|
1167 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1168 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 1
|
|
1169 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <4 x i32>] [[VAL_COERCE]], 2
|
|
1170 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
1171 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
|
1172 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
|
252
|
1173 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst3lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], i32 7, i32 2)
|
236
|
1174 // CHECK32-NEXT: ret void
|
|
1175 //
|
221
|
1176 void test_vst3q_lane_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
|
|
1177 vst3q_lane_bf16(ptr, val, 7);
|
|
1178 }
|
|
1179
|
236
|
1180 // CHECK64-LABEL: @test_vst4_bf16(
|
|
1181 // CHECK64-NEXT: entry:
|
|
1182 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1183 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1
|
|
1184 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2
|
|
1185 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3
|
252
|
1186 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st4.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
1187 // CHECK64-NEXT: ret void
|
|
1188 //
|
|
1189 // CHECK32-LABEL: @test_vst4_bf16(
|
|
1190 // CHECK32-NEXT: entry:
|
|
1191 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1192 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1
|
|
1193 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2
|
|
1194 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3
|
|
1195 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
1196 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
|
1197 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
|
|
1198 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat>
|
252
|
1199 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 2)
|
236
|
1200 // CHECK32-NEXT: ret void
|
|
1201 //
|
221
|
1202 void test_vst4_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
|
|
1203 vst4_bf16(ptr, val);
|
|
1204 }
|
|
1205
|
236
|
1206 // CHECK64-LABEL: @test_vst4q_bf16(
|
|
1207 // CHECK64-NEXT: entry:
|
|
1208 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1209 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1
|
|
1210 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2
|
|
1211 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3
|
252
|
1212 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st4.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], ptr [[PTR:%.*]])
|
236
|
1213 // CHECK64-NEXT: ret void
|
|
1214 //
|
|
1215 // CHECK32-LABEL: @test_vst4q_bf16(
|
|
1216 // CHECK32-NEXT: entry:
|
|
1217 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1218 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1
|
|
1219 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 2
|
|
1220 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3
|
|
1221 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
1222 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
|
1223 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
|
|
1224 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat>
|
252
|
1225 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 2)
|
236
|
1226 // CHECK32-NEXT: ret void
|
|
1227 //
|
221
|
1228 void test_vst4q_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
|
|
1229 vst4q_bf16(ptr, val);
|
|
1230 }
|
|
1231
|
236
|
1232 // CHECK64-LABEL: @test_vst4_lane_bf16(
|
|
1233 // CHECK64-NEXT: entry:
|
|
1234 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1235 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 1
|
|
1236 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 2
|
|
1237 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x bfloat>] [[VAL_COERCE]], 3
|
252
|
1238 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0(<4 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <4 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], i64 1, ptr [[PTR:%.*]])
|
236
|
1239 // CHECK64-NEXT: ret void
|
|
1240 //
|
|
1241 // CHECK32-LABEL: @test_vst4_lane_bf16(
|
|
1242 // CHECK32-NEXT: entry:
|
|
1243 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1244 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 1
|
|
1245 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 2
|
|
1246 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <2 x i32>] [[VAL_COERCE]], 3
|
|
1247 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <4 x bfloat>
|
|
1248 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <4 x bfloat>
|
|
1249 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <4 x bfloat>
|
|
1250 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <4 x bfloat>
|
252
|
1251 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4lane.p0.v4bf16(ptr [[PTR:%.*]], <4 x bfloat> [[TMP0]], <4 x bfloat> [[TMP1]], <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], i32 1, i32 2)
|
236
|
1252 // CHECK32-NEXT: ret void
|
|
1253 //
|
221
|
1254 void test_vst4_lane_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
|
|
1255 vst4_lane_bf16(ptr, val, 1);
|
|
1256 }
|
|
1257
|
236
|
1258 // CHECK64-LABEL: @test_vst4q_lane_bf16(
|
|
1259 // CHECK64-NEXT: entry:
|
|
1260 // CHECK64-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE:%.*]], 0
|
|
1261 // CHECK64-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 1
|
|
1262 // CHECK64-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 2
|
|
1263 // CHECK64-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x bfloat>] [[VAL_COERCE]], 3
|
252
|
1264 // CHECK64-NEXT: tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0(<8 x bfloat> [[VAL_COERCE_FCA_0_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_1_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_2_EXTRACT]], <8 x bfloat> [[VAL_COERCE_FCA_3_EXTRACT]], i64 7, ptr [[PTR:%.*]])
|
236
|
1265 // CHECK64-NEXT: ret void
|
|
1266 //
|
|
1267 // CHECK32-LABEL: @test_vst4q_lane_bf16(
|
|
1268 // CHECK32-NEXT: entry:
|
|
1269 // CHECK32-NEXT: [[VAL_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE:%.*]], 0
|
|
1270 // CHECK32-NEXT: [[VAL_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 1
|
|
1271 // CHECK32-NEXT: [[VAL_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 2
|
|
1272 // CHECK32-NEXT: [[VAL_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <4 x i32>] [[VAL_COERCE]], 3
|
|
1273 // CHECK32-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_0_EXTRACT]] to <8 x bfloat>
|
|
1274 // CHECK32-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_1_EXTRACT]] to <8 x bfloat>
|
|
1275 // CHECK32-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_2_EXTRACT]] to <8 x bfloat>
|
|
1276 // CHECK32-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[VAL_COERCE_FCA_3_EXTRACT]] to <8 x bfloat>
|
252
|
1277 // CHECK32-NEXT: tail call void @llvm.arm.neon.vst4lane.p0.v8bf16(ptr [[PTR:%.*]], <8 x bfloat> [[TMP0]], <8 x bfloat> [[TMP1]], <8 x bfloat> [[TMP2]], <8 x bfloat> [[TMP3]], i32 7, i32 2)
|
236
|
1278 // CHECK32-NEXT: ret void
|
|
1279 //
|
221
|
1280 void test_vst4q_lane_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
|
|
1281 vst4q_lane_bf16(ptr, val, 7);
|
|
1282 }
|