; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_short [[ADD]]
define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
  %a = load volatile i16, i16 addrspace(1)* %gep.in0
  %b = load volatile i16, i16 addrspace(1)* %gep.in1
  %add = sub i16 %a, %b
  store i16 %add, i16 addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_constant:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffff85, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
define amdgpu_kernel void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
  %a = load volatile i16, i16 addrspace(1)* %gep.in0
  %add = sub i16 %a, 123
  store i16 %add, i16 addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_neg_constant:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x34d, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
define amdgpu_kernel void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
  %a = load volatile i16, i16 addrspace(1)* %gep.in0
  %add = sub i16 %a, -845
  store i16 %add, i16 addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_inline_63:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
  %a = load volatile i16, i16 addrspace(1)* %gep.in0
  %add = sub i16 %a, 63
  store i16 %add, i16 addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI-NEXT: buffer_store_dword [[ADD]]
define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
  %a = load volatile i16, i16 addrspace(1)* %gep.in0
  %b = load volatile i16, i16 addrspace(1)* %gep.in1
  %add = sub i16 %a, %b
  %ext = zext i16 %add to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
; VI: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI-DAG: v_sub_u16_e32 v[[ADD:[0-9]+]], [[A]], [[B]]
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
  %a = load volatile i16, i16 addrspace(1)* %gep.in0
  %b = load volatile i16, i16 addrspace(1)* %gep.in1
  %add = sub i16 %a, %b
  %ext = zext i16 %add to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
  %a = load i16, i16 addrspace(1)* %gep.in0
  %b = load i16, i16 addrspace(1)* %gep.in1
  %add = sub i16 %a, %b
  %ext = sext i16 %add to i32
  store i32 %ext, i32 addrspace(1)* %out
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
  %gep.in1 = getelementptr inbounds i16, i16 addrspace(1)* %in1, i32 %tid
  %a = load i16, i16 addrspace(1)* %gep.in0
  %b = load i16, i16 addrspace(1)* %gep.in1
  %add = sub i16 %a, %b
  %ext = sext i16 %add to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

@lds = addrspace(3) global [512 x i32] undef, align 4

; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute:
; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}}
define amdgpu_kernel void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
  %size = call i32 @llvm.amdgcn.groupstaticsize()
  %size.trunc = trunc i32 %size to i16
  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
  %a = load volatile i16, i16 addrspace(1)* %gep.in0
  %add = sub i16 %a, %size.trunc
  store i16 %add, i16 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.groupstaticsize() #0

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }