; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

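; The build_vector has two full-vector store uses as well as per-element
; extract uses. Both vector stores should still be emitted as
; buffer_store_dwordx4, and the volatile stores of the extracted elements as
; buffer_store_dword.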
; GCN-LABEL: {{^}}store_build_vector_multiple_uses_v4i32:
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword

; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4

; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
                                                                  <4 x i32> addrspace(1)* noalias %out1,
                                                                  i32 addrspace(1)* noalias %out2,
                                                                  i32 addrspace(1)* %in) {
  %elt0 = load volatile i32, i32 addrspace(1)* %in
  %elt1 = load volatile i32, i32 addrspace(1)* %in
  %elt2 = load volatile i32, i32 addrspace(1)* %in
  %elt3 = load volatile i32, i32 addrspace(1)* %in

  %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3

  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0
  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out1

  %extract0 = extractelement <4 x i32> %vec3, i32 0
  %extract1 = extractelement <4 x i32> %vec3, i32 1
  %extract2 = extractelement <4 x i32> %vec3, i32 2
  %extract3 = extractelement <4 x i32> %vec3, i32 3

  store volatile i32 %extract0, i32 addrspace(1)* %out2
  store volatile i32 %extract1, i32 addrspace(1)* %out2
  store volatile i32 %extract2, i32 addrspace(1)* %out2
  store volatile i32 %extract3, i32 addrspace(1)* %out2

  ret void
}

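; As above, but the extracted elements feed scalar ALU ops before being
; stored. The full vector store should still be a single buffer_store_dwordx4.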
; GCN-LABEL: {{^}}store_build_vector_multiple_extract_uses_v4i32:
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword

; GCN: buffer_store_dwordx4

; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
                                                                          <4 x i32> addrspace(1)* noalias %out1,
                                                                          i32 addrspace(1)* noalias %out2,
                                                                          i32 addrspace(1)* %in) {
  %elt0 = load volatile i32, i32 addrspace(1)* %in
  %elt1 = load volatile i32, i32 addrspace(1)* %in
  %elt2 = load volatile i32, i32 addrspace(1)* %in
  %elt3 = load volatile i32, i32 addrspace(1)* %in

  %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3

  %extract0 = extractelement <4 x i32> %vec3, i32 0
  %extract1 = extractelement <4 x i32> %vec3, i32 1
  %extract2 = extractelement <4 x i32> %vec3, i32 2
  %extract3 = extractelement <4 x i32> %vec3, i32 3

  %op0 = add i32 %extract0, 3
  %op1 = sub i32 %extract1, 9
  %op2 = xor i32 %extract2, 1231412
  %op3 = and i32 %extract3, 258233412312

  store <4 x i32> %vec3, <4 x i32> addrspace(1)* %out0

  store volatile i32 %op0, i32 addrspace(1)* %out2
  store volatile i32 %op1, i32 addrspace(1)* %out2
  store volatile i32 %op2, i32 addrspace(1)* %out2
  store volatile i32 %op3, i32 addrspace(1)* %out2

  ret void
}

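; The build_vector is bitcast to <2 x i64> before being stored and having its
; elements extracted. Expect a single buffer_store_dwordx4 for the vector store
; and a buffer_store_dwordx2 for each extracted i64.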
; GCN-LABEL: {{^}}store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64:
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword
; GCN: buffer_load_dword

; GCN: buffer_store_dwordx4

; GCN: buffer_store_dwordx2
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
                                                                                   <4 x i32> addrspace(1)* noalias %out1,
                                                                                   i64 addrspace(1)* noalias %out2,
                                                                                   i32 addrspace(1)* %in) {
  %elt0 = load volatile i32, i32 addrspace(1)* %in
  %elt1 = load volatile i32, i32 addrspace(1)* %in
  %elt2 = load volatile i32, i32 addrspace(1)* %in
  %elt3 = load volatile i32, i32 addrspace(1)* %in

  %vec0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3

  %bc.vec3 = bitcast <4 x i32> %vec3 to <2 x i64>
  store <2 x i64> %bc.vec3, <2 x i64> addrspace(1)* %out0

  %extract0 = extractelement <2 x i64> %bc.vec3, i32 0
  %extract1 = extractelement <2 x i64> %bc.vec3, i32 1

  store volatile i64 %extract0, i64 addrspace(1)* %out2
  store volatile i64 %extract1, i64 addrspace(1)* %out2

  ret void
}