Mercurial > hg > CbC > CbC_llvm
comparison llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @ 221:79ff65ed7e25
LLVM12 Original
author | Shinji KONO <kono@ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 15 Jun 2021 19:15:29 +0900 |
parents | 0572611fdcc8 |
children | c4bab56944e8 |
comparison
equal
deleted
inserted
replaced
220:42394fc6a535 | 221:79ff65ed7e25 |
---|---|
1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | 1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py |
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s | 2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s |
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI %s | 3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s |
4 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s | 4 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s |
5 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s | |
5 | 6 |
6 define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { | 7 define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { |
7 ; GFX9-LABEL: s_shl_v2i16: | 8 ; GFX9-LABEL: s_shl_v2i16: |
8 ; GFX9: ; %bb.0: | 9 ; GFX9: ; %bb.0: |
9 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 | 10 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 |
10 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c | 11 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c |
11 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x30 | 12 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x30 |
12 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 | 13 ; GFX9-NEXT: s_mov_b32 s7, 0xf000 |
13 ; GFX9-NEXT: s_mov_b32 s6, -1 | 14 ; GFX9-NEXT: s_mov_b32 s6, -1 |
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 15 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
15 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 | 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 |
16 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 | 17 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s3, v0 |
17 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 | 18 ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
18 ; GFX9-NEXT: s_endpgm | 19 ; GFX9-NEXT: s_endpgm |
19 ; | 20 ; |
20 ; VI-LABEL: s_shl_v2i16: | 21 ; VI-LABEL: s_shl_v2i16: |
21 ; VI: ; %bb.0: | 22 ; VI: ; %bb.0: |
57 ; CI-NEXT: s_and_b32 s1, s1, s3 | 58 ; CI-NEXT: s_and_b32 s1, s1, s3 |
58 ; CI-NEXT: s_or_b32 s0, s1, s0 | 59 ; CI-NEXT: s_or_b32 s0, s1, s0 |
59 ; CI-NEXT: v_mov_b32_e32 v0, s0 | 60 ; CI-NEXT: v_mov_b32_e32 v0, s0 |
60 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 | 61 ; CI-NEXT: buffer_store_dword v0, off, s[4:7], 0 |
61 ; CI-NEXT: s_endpgm | 62 ; CI-NEXT: s_endpgm |
63 ; | |
64 ; GFX10-LABEL: s_shl_v2i16: | |
65 ; GFX10: ; %bb.0: | |
66 ; GFX10-NEXT: s_clause 0x2 | |
67 ; GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c | |
68 ; GFX10-NEXT: s_load_dword s3, s[0:1], 0x30 | |
69 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 | |
70 ; GFX10-NEXT: s_mov_b32 s7, 0x31016000 | |
71 ; GFX10-NEXT: s_mov_b32 s6, -1 | |
72 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | |
73 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, s3, s2 | |
74 ; GFX10-NEXT: buffer_store_dword v0, off, s[4:7], 0 | |
75 ; GFX10-NEXT: s_endpgm | |
62 %result = shl <2 x i16> %lhs, %rhs | 76 %result = shl <2 x i16> %lhs, %rhs |
63 store <2 x i16> %result, <2 x i16> addrspace(1)* %out | 77 store <2 x i16> %result, <2 x i16> addrspace(1)* %out |
64 ret void | 78 ret void |
65 } | 79 } |
66 | 80 |
67 define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { | 81 define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { |
68 ; GFX9-LABEL: v_shl_v2i16: | 82 ; GFX9-LABEL: v_shl_v2i16: |
69 ; GFX9: ; %bb.0: | 83 ; GFX9: ; %bb.0: |
70 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 84 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
71 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 85 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
72 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 86 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
73 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 | 87 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] |
74 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 | 88 ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 |
75 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | |
76 ; GFX9-NEXT: global_load_dword v4, v[0:1], off | |
77 ; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 | |
78 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 | |
79 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 | |
80 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | |
81 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 89 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
82 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4 | 90 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1 |
83 ; GFX9-NEXT: global_store_dword v[2:3], v0, off | 91 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] |
84 ; GFX9-NEXT: s_endpgm | 92 ; GFX9-NEXT: s_endpgm |
85 ; | 93 ; |
86 ; VI-LABEL: v_shl_v2i16: | 94 ; VI-LABEL: v_shl_v2i16: |
87 ; VI: ; %bb.0: | 95 ; VI: ; %bb.0: |
88 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 96 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
89 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 97 ; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0 |
90 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 98 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
91 ; VI-NEXT: v_mov_b32_e32 v1, s3 | 99 ; VI-NEXT: v_mov_b32_e32 v1, s3 |
92 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 | 100 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 |
93 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 101 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
94 ; VI-NEXT: v_mov_b32_e32 v3, s1 | 102 ; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0 |
95 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 | 103 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc |
96 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc | 104 ; VI-NEXT: flat_load_dword v5, v[0:1] |
97 ; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0 | 105 ; VI-NEXT: flat_load_dword v2, v[2:3] |
98 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc | 106 ; VI-NEXT: v_mov_b32_e32 v1, s1 |
99 ; VI-NEXT: flat_load_dword v0, v[0:1] | 107 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4 |
100 ; VI-NEXT: flat_load_dword v1, v[4:5] | 108 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
101 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 109 ; VI-NEXT: s_waitcnt vmcnt(0) |
102 ; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0 | 110 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5 |
103 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 | 111 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 |
104 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 | 112 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 |
105 ; VI-NEXT: flat_store_dword v[2:3], v0 | 113 ; VI-NEXT: flat_store_dword v[0:1], v2 |
106 ; VI-NEXT: s_endpgm | 114 ; VI-NEXT: s_endpgm |
107 ; | 115 ; |
108 ; CI-LABEL: v_shl_v2i16: | 116 ; CI-LABEL: v_shl_v2i16: |
109 ; CI: ; %bb.0: | 117 ; CI: ; %bb.0: |
110 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | 118 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
114 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 122 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
115 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 123 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
116 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] | 124 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
117 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 | 125 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 |
118 ; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 | 126 ; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4 |
119 ; CI-NEXT: s_mov_b32 s8, 0xffff | 127 ; CI-NEXT: s_mov_b32 s0, 0xffff |
120 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] | 128 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
121 ; CI-NEXT: s_waitcnt vmcnt(1) | 129 ; CI-NEXT: s_waitcnt vmcnt(1) |
122 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 | 130 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2 |
123 ; CI-NEXT: s_waitcnt vmcnt(0) | 131 ; CI-NEXT: s_waitcnt vmcnt(0) |
124 ; CI-NEXT: v_and_b32_e32 v5, s8, v3 | 132 ; CI-NEXT: v_and_b32_e32 v5, s0, v3 |
125 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 | 133 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 |
126 ; CI-NEXT: v_lshl_b32_e32 v3, v4, v3 | 134 ; CI-NEXT: v_lshl_b32_e32 v3, v4, v3 |
127 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v5 | 135 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v5 |
128 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 | 136 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
129 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 | 137 ; CI-NEXT: v_and_b32_e32 v2, s0, v2 |
130 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 | 138 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 |
131 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 | 139 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 |
132 ; CI-NEXT: s_endpgm | 140 ; CI-NEXT: s_endpgm |
141 ; | |
142 ; GFX10-LABEL: v_shl_v2i16: | |
143 ; GFX10: ; %bb.0: | |
144 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | |
145 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 | |
146 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | |
147 ; GFX10-NEXT: s_clause 0x1 | |
148 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] | |
149 ; GFX10-NEXT: global_load_dword v2, v0, s[2:3] offset:4 | |
150 ; GFX10-NEXT: s_waitcnt vmcnt(0) | |
151 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v2, v1 | |
152 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] | |
153 ; GFX10-NEXT: s_endpgm | |
133 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 154 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
134 %tid.ext = sext i32 %tid to i64 | 155 %tid.ext = sext i32 %tid to i64 |
135 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext | 156 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext |
136 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext | 157 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext |
137 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1 | 158 %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1 |
144 | 165 |
145 define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { | 166 define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { |
146 ; GFX9-LABEL: shl_v_s_v2i16: | 167 ; GFX9-LABEL: shl_v_s_v2i16: |
147 ; GFX9: ; %bb.0: | 168 ; GFX9: ; %bb.0: |
148 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | 169 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 |
149 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 | 170 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 |
150 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 171 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
151 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 172 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
152 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 | 173 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] |
153 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 | |
154 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | |
155 ; GFX9-NEXT: global_load_dword v0, v[0:1], off | |
156 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 | |
157 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 | |
158 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | |
159 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 174 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
160 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 | 175 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, s2, v1 |
161 ; GFX9-NEXT: global_store_dword v[2:3], v0, off | 176 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] |
162 ; GFX9-NEXT: s_endpgm | 177 ; GFX9-NEXT: s_endpgm |
163 ; | 178 ; |
164 ; VI-LABEL: shl_v_s_v2i16: | 179 ; VI-LABEL: shl_v_s_v2i16: |
165 ; VI: ; %bb.0: | 180 ; VI: ; %bb.0: |
166 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | 181 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 |
168 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 183 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
169 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 184 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
170 ; VI-NEXT: v_mov_b32_e32 v1, s7 | 185 ; VI-NEXT: v_mov_b32_e32 v1, s7 |
171 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 | 186 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 |
172 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 187 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
173 ; VI-NEXT: flat_load_dword v0, v[0:1] | 188 ; VI-NEXT: flat_load_dword v3, v[0:1] |
174 ; VI-NEXT: s_lshr_b32 s1, s0, 16 | 189 ; VI-NEXT: s_lshr_b32 s1, s0, 16 |
175 ; VI-NEXT: v_mov_b32_e32 v4, s1 | 190 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 |
176 ; VI-NEXT: v_mov_b32_e32 v3, s5 | 191 ; VI-NEXT: v_mov_b32_e32 v2, s1 |
177 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 | 192 ; VI-NEXT: v_mov_b32_e32 v1, s5 |
178 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc | 193 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
179 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 194 ; VI-NEXT: s_waitcnt vmcnt(0) |
180 ; VI-NEXT: v_lshlrev_b16_e32 v1, s0, v0 | 195 ; VI-NEXT: v_lshlrev_b16_e32 v4, s0, v3 |
181 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 | 196 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 |
182 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 | 197 ; VI-NEXT: v_or_b32_e32 v2, v4, v2 |
183 ; VI-NEXT: flat_store_dword v[2:3], v0 | 198 ; VI-NEXT: flat_store_dword v[0:1], v2 |
184 ; VI-NEXT: s_endpgm | 199 ; VI-NEXT: s_endpgm |
185 ; | 200 ; |
186 ; CI-LABEL: shl_v_s_v2i16: | 201 ; CI-LABEL: shl_v_s_v2i16: |
187 ; CI: ; %bb.0: | 202 ; CI: ; %bb.0: |
188 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | 203 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
189 ; CI-NEXT: s_load_dword s0, s[0:1], 0xd | 204 ; CI-NEXT: s_load_dword s8, s[0:1], 0xd |
190 ; CI-NEXT: s_mov_b32 s8, 0xffff | |
191 ; CI-NEXT: s_mov_b32 s3, 0xf000 | 205 ; CI-NEXT: s_mov_b32 s3, 0xf000 |
192 ; CI-NEXT: s_mov_b32 s2, 0 | 206 ; CI-NEXT: s_mov_b32 s2, 0 |
193 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 | 207 ; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
194 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 208 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
195 ; CI-NEXT: s_lshr_b32 s9, s0, 16 | |
196 ; CI-NEXT: s_and_b32 s10, s0, s8 | |
197 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] | 209 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
198 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 210 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
199 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 | 211 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 |
212 ; CI-NEXT: s_mov_b32 s0, 0xffff | |
213 ; CI-NEXT: s_lshr_b32 s1, s8, 16 | |
214 ; CI-NEXT: s_and_b32 s8, s8, s0 | |
200 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] | 215 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
201 ; CI-NEXT: s_waitcnt vmcnt(0) | 216 ; CI-NEXT: s_waitcnt vmcnt(0) |
202 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 | 217 ; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 |
203 ; CI-NEXT: v_lshlrev_b32_e32 v2, s10, v2 | 218 ; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2 |
204 ; CI-NEXT: v_lshlrev_b32_e32 v3, s9, v3 | 219 ; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3 |
205 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 | 220 ; CI-NEXT: v_and_b32_e32 v2, s0, v2 |
206 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 | 221 ; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 |
207 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 | 222 ; CI-NEXT: v_or_b32_e32 v2, v2, v3 |
208 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 | 223 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 |
209 ; CI-NEXT: s_endpgm | 224 ; CI-NEXT: s_endpgm |
225 ; | |
226 ; GFX10-LABEL: shl_v_s_v2i16: | |
227 ; GFX10: ; %bb.0: | |
228 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | |
229 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 | |
230 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 | |
231 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | |
232 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] | |
233 ; GFX10-NEXT: s_waitcnt vmcnt(0) | |
234 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, s0, v1 | |
235 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] | |
236 ; GFX10-NEXT: s_endpgm | |
210 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 237 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
211 %tid.ext = sext i32 %tid to i64 | 238 %tid.ext = sext i32 %tid to i64 |
212 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext | 239 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext |
213 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext | 240 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext |
214 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep | 241 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep |
219 | 246 |
220 define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { | 247 define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { |
221 ; GFX9-LABEL: shl_s_v_v2i16: | 248 ; GFX9-LABEL: shl_s_v_v2i16: |
222 ; GFX9: ; %bb.0: | 249 ; GFX9: ; %bb.0: |
223 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | 250 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 |
224 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 | 251 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 |
225 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 252 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
226 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 253 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
227 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 | 254 ; GFX9-NEXT: global_load_dword v1, v0, s[6:7] |
228 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 | |
229 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | |
230 ; GFX9-NEXT: global_load_dword v0, v[0:1], off | |
231 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 | |
232 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 | |
233 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | |
234 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 255 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
235 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0 | 256 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s2 |
236 ; GFX9-NEXT: global_store_dword v[2:3], v0, off | 257 ; GFX9-NEXT: global_store_dword v0, v1, s[4:5] |
237 ; GFX9-NEXT: s_endpgm | 258 ; GFX9-NEXT: s_endpgm |
238 ; | 259 ; |
239 ; VI-LABEL: shl_s_v_v2i16: | 260 ; VI-LABEL: shl_s_v_v2i16: |
240 ; VI: ; %bb.0: | 261 ; VI: ; %bb.0: |
241 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | 262 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 |
243 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 264 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
244 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 265 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
245 ; VI-NEXT: v_mov_b32_e32 v1, s7 | 266 ; VI-NEXT: v_mov_b32_e32 v1, s7 |
246 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 | 267 ; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2 |
247 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 268 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
248 ; VI-NEXT: flat_load_dword v0, v[0:1] | 269 ; VI-NEXT: flat_load_dword v3, v[0:1] |
249 ; VI-NEXT: s_lshr_b32 s1, s0, 16 | 270 ; VI-NEXT: s_lshr_b32 s1, s0, 16 |
250 ; VI-NEXT: v_mov_b32_e32 v4, s1 | 271 ; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2 |
251 ; VI-NEXT: v_mov_b32_e32 v3, s5 | 272 ; VI-NEXT: v_mov_b32_e32 v2, s1 |
252 ; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2 | 273 ; VI-NEXT: v_mov_b32_e32 v1, s5 |
253 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc | 274 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
254 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 275 ; VI-NEXT: s_waitcnt vmcnt(0) |
255 ; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s0 | 276 ; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s0 |
256 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 277 ; VI-NEXT: v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD |
257 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 | 278 ; VI-NEXT: v_or_b32_e32 v2, v4, v2 |
258 ; VI-NEXT: flat_store_dword v[2:3], v0 | 279 ; VI-NEXT: flat_store_dword v[0:1], v2 |
259 ; VI-NEXT: s_endpgm | 280 ; VI-NEXT: s_endpgm |
260 ; | 281 ; |
261 ; CI-LABEL: shl_s_v_v2i16: | 282 ; CI-LABEL: shl_s_v_v2i16: |
262 ; CI: ; %bb.0: | 283 ; CI: ; %bb.0: |
263 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | 284 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
268 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 289 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
269 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] | 290 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
270 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 291 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
271 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 | 292 ; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 |
272 ; CI-NEXT: s_mov_b32 s0, 0xffff | 293 ; CI-NEXT: s_mov_b32 s0, 0xffff |
273 ; CI-NEXT: s_lshr_b32 s9, s8, 16 | 294 ; CI-NEXT: s_lshr_b32 s1, s8, 16 |
274 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] | 295 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
275 ; CI-NEXT: s_waitcnt vmcnt(0) | 296 ; CI-NEXT: s_waitcnt vmcnt(0) |
276 ; CI-NEXT: v_and_b32_e32 v3, s0, v2 | 297 ; CI-NEXT: v_and_b32_e32 v3, s0, v2 |
277 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 | 298 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 |
278 ; CI-NEXT: v_lshl_b32_e32 v2, s9, v2 | 299 ; CI-NEXT: v_lshl_b32_e32 v2, s1, v2 |
279 ; CI-NEXT: v_lshl_b32_e32 v3, s8, v3 | 300 ; CI-NEXT: v_lshl_b32_e32 v3, s8, v3 |
280 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 | 301 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
281 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 | 302 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 |
282 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 | 303 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 |
283 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 | 304 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 |
284 ; CI-NEXT: s_endpgm | 305 ; CI-NEXT: s_endpgm |
306 ; | |
307 ; GFX10-LABEL: shl_s_v_v2i16: | |
308 ; GFX10: ; %bb.0: | |
309 ; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 | |
310 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 | |
311 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34 | |
312 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | |
313 ; GFX10-NEXT: global_load_dword v1, v0, s[6:7] | |
314 ; GFX10-NEXT: s_waitcnt vmcnt(0) | |
315 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, s0 | |
316 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] | |
317 ; GFX10-NEXT: s_endpgm | |
285 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 318 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
286 %tid.ext = sext i32 %tid to i64 | 319 %tid.ext = sext i32 %tid to i64 |
287 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext | 320 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext |
288 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext | 321 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext |
289 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep | 322 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep |
294 | 327 |
295 define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { | 328 define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { |
296 ; GFX9-LABEL: shl_imm_v_v2i16: | 329 ; GFX9-LABEL: shl_imm_v_v2i16: |
297 ; GFX9: ; %bb.0: | 330 ; GFX9: ; %bb.0: |
298 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 331 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
299 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 332 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
300 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 333 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
301 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 | 334 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] |
302 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 | |
303 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | |
304 ; GFX9-NEXT: global_load_dword v0, v[0:1], off | |
305 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 | |
306 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 | |
307 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | |
308 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 335 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
309 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, 8 op_sel_hi:[1,0] | 336 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] |
310 ; GFX9-NEXT: global_store_dword v[2:3], v0, off | 337 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] |
311 ; GFX9-NEXT: s_endpgm | 338 ; GFX9-NEXT: s_endpgm |
312 ; | 339 ; |
313 ; VI-LABEL: shl_imm_v_v2i16: | 340 ; VI-LABEL: shl_imm_v_v2i16: |
314 ; VI: ; %bb.0: | 341 ; VI: ; %bb.0: |
315 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 342 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
317 ; VI-NEXT: v_mov_b32_e32 v4, 8 | 344 ; VI-NEXT: v_mov_b32_e32 v4, 8 |
318 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 345 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
319 ; VI-NEXT: v_mov_b32_e32 v1, s3 | 346 ; VI-NEXT: v_mov_b32_e32 v1, s3 |
320 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 | 347 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 |
321 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 348 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
322 ; VI-NEXT: flat_load_dword v0, v[0:1] | 349 ; VI-NEXT: flat_load_dword v3, v[0:1] |
323 ; VI-NEXT: v_mov_b32_e32 v3, s1 | 350 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 |
324 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 | 351 ; VI-NEXT: v_mov_b32_e32 v1, s1 |
325 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc | 352 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
326 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 353 ; VI-NEXT: s_waitcnt vmcnt(0) |
327 ; VI-NEXT: v_lshlrev_b16_e64 v1, v0, 8 | 354 ; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8 |
328 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | 355 ; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD |
329 ; VI-NEXT: v_or_b32_e32 v0, v1, v0 | 356 ; VI-NEXT: v_or_b32_e32 v2, v2, v3 |
330 ; VI-NEXT: flat_store_dword v[2:3], v0 | 357 ; VI-NEXT: flat_store_dword v[0:1], v2 |
331 ; VI-NEXT: s_endpgm | 358 ; VI-NEXT: s_endpgm |
332 ; | 359 ; |
333 ; CI-LABEL: shl_imm_v_v2i16: | 360 ; CI-LABEL: shl_imm_v_v2i16: |
334 ; CI: ; %bb.0: | 361 ; CI: ; %bb.0: |
335 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | 362 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
349 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 | 376 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 |
350 ; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3 | 377 ; CI-NEXT: v_and_b32_e32 v3, 0xfff8, v3 |
351 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 | 378 ; CI-NEXT: v_or_b32_e32 v2, v3, v2 |
352 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 | 379 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 |
353 ; CI-NEXT: s_endpgm | 380 ; CI-NEXT: s_endpgm |
381 ; | |
382 ; GFX10-LABEL: shl_imm_v_v2i16: | |
383 ; GFX10: ; %bb.0: | |
384 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | |
385 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 | |
386 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | |
387 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] | |
388 ; GFX10-NEXT: s_waitcnt vmcnt(0) | |
389 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] | |
390 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] | |
391 ; GFX10-NEXT: s_endpgm | |
354 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 392 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
355 %tid.ext = sext i32 %tid to i64 | 393 %tid.ext = sext i32 %tid to i64 |
356 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext | 394 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext |
357 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext | 395 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext |
358 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep | 396 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep |
363 | 401 |
364 define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { | 402 define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { |
365 ; GFX9-LABEL: shl_v_imm_v2i16: | 403 ; GFX9-LABEL: shl_v_imm_v2i16: |
366 ; GFX9: ; %bb.0: | 404 ; GFX9: ; %bb.0: |
367 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 405 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
368 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 406 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 |
369 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 407 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
370 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 | 408 ; GFX9-NEXT: global_load_dword v1, v0, s[2:3] |
371 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 | |
372 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | |
373 ; GFX9-NEXT: global_load_dword v0, v[0:1], off | |
374 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 | |
375 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 | |
376 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | |
377 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 409 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
378 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] | 410 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] |
379 ; GFX9-NEXT: global_store_dword v[2:3], v0, off | 411 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] |
380 ; GFX9-NEXT: s_endpgm | 412 ; GFX9-NEXT: s_endpgm |
381 ; | 413 ; |
382 ; VI-LABEL: shl_v_imm_v2i16: | 414 ; VI-LABEL: shl_v_imm_v2i16: |
383 ; VI: ; %bb.0: | 415 ; VI: ; %bb.0: |
384 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 416 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
385 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 | 417 ; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 |
386 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 418 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
387 ; VI-NEXT: v_mov_b32_e32 v1, s3 | 419 ; VI-NEXT: v_mov_b32_e32 v1, s3 |
388 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 | 420 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 |
389 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 421 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
390 ; VI-NEXT: flat_load_dword v0, v[0:1] | 422 ; VI-NEXT: flat_load_dword v3, v[0:1] |
391 ; VI-NEXT: v_mov_b32_e32 v3, s1 | 423 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2 |
392 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 | 424 ; VI-NEXT: v_mov_b32_e32 v1, s1 |
393 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc | 425 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
394 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 426 ; VI-NEXT: s_waitcnt vmcnt(0) |
395 ; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0 | 427 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 |
396 ; VI-NEXT: v_and_b32_e32 v1, 0xff000000, v1 | 428 ; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2 |
397 ; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0 | 429 ; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3 |
398 ; VI-NEXT: v_or_b32_e32 v0, v0, v1 | 430 ; VI-NEXT: v_or_b32_e32 v2, v3, v2 |
399 ; VI-NEXT: flat_store_dword v[2:3], v0 | 431 ; VI-NEXT: flat_store_dword v[0:1], v2 |
400 ; VI-NEXT: s_endpgm | 432 ; VI-NEXT: s_endpgm |
401 ; | 433 ; |
402 ; CI-LABEL: shl_v_imm_v2i16: | 434 ; CI-LABEL: shl_v_imm_v2i16: |
403 ; CI: ; %bb.0: | 435 ; CI: ; %bb.0: |
404 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | 436 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
413 ; CI-NEXT: s_waitcnt vmcnt(0) | 445 ; CI-NEXT: s_waitcnt vmcnt(0) |
414 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 | 446 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 |
415 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 | 447 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 |
416 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 | 448 ; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 |
417 ; CI-NEXT: s_endpgm | 449 ; CI-NEXT: s_endpgm |
450 ; | |
451 ; GFX10-LABEL: shl_v_imm_v2i16: | |
452 ; GFX10: ; %bb.0: | |
453 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | |
454 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 | |
455 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | |
456 ; GFX10-NEXT: global_load_dword v1, v0, s[2:3] | |
457 ; GFX10-NEXT: s_waitcnt vmcnt(0) | |
458 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] | |
459 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] | |
460 ; GFX10-NEXT: s_endpgm | |
418 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 461 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
419 %tid.ext = sext i32 %tid to i64 | 462 %tid.ext = sext i32 %tid to i64 |
420 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext | 463 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext |
421 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext | 464 %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext |
422 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep | 465 %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep |
427 | 470 |
428 define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { | 471 define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { |
429 ; GFX9-LABEL: v_shl_v4i16: | 472 ; GFX9-LABEL: v_shl_v4i16: |
430 ; GFX9: ; %bb.0: | 473 ; GFX9: ; %bb.0: |
431 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 474 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
432 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 | 475 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
433 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 476 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
434 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 | 477 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] |
435 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 | 478 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 |
436 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | |
437 ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off | |
438 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 | |
439 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 | |
440 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 | |
441 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | |
442 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 479 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
443 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v5 | 480 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1 |
444 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4 | 481 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0 |
445 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off | 482 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] |
446 ; GFX9-NEXT: s_endpgm | 483 ; GFX9-NEXT: s_endpgm |
447 ; | 484 ; |
448 ; VI-LABEL: v_shl_v4i16: | 485 ; VI-LABEL: v_shl_v4i16: |
449 ; VI: ; %bb.0: | 486 ; VI: ; %bb.0: |
450 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 487 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
451 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 | 488 ; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0 |
452 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 489 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
453 ; VI-NEXT: v_mov_b32_e32 v1, s3 | 490 ; VI-NEXT: v_mov_b32_e32 v1, s3 |
454 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 | 491 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4 |
455 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 492 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
456 ; VI-NEXT: v_mov_b32_e32 v3, s1 | 493 ; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0 |
457 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 | 494 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc |
458 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc | |
459 ; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0 | |
460 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc | |
461 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] | 495 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] |
462 ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] | 496 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] |
463 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 497 ; VI-NEXT: v_mov_b32_e32 v5, s1 |
464 ; VI-NEXT: v_lshlrev_b16_e32 v6, v5, v1 | 498 ; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4 |
465 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 | 499 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc |
466 ; VI-NEXT: v_lshlrev_b16_e32 v5, v4, v0 | 500 ; VI-NEXT: s_waitcnt vmcnt(0) |
467 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 | 501 ; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1 |
502 ; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 | |
503 ; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0 | |
504 ; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 | |
468 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 | 505 ; VI-NEXT: v_or_b32_e32 v1, v6, v1 |
469 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 | 506 ; VI-NEXT: v_or_b32_e32 v0, v3, v0 |
470 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] | 507 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] |
471 ; VI-NEXT: s_endpgm | 508 ; VI-NEXT: s_endpgm |
472 ; | 509 ; |
473 ; CI-LABEL: v_shl_v4i16: | 510 ; CI-LABEL: v_shl_v4i16: |
474 ; CI: ; %bb.0: | 511 ; CI: ; %bb.0: |
475 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 | 512 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 |
479 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 516 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
480 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 517 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
481 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] | 518 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
482 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 | 519 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 |
483 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 | 520 ; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8 |
484 ; CI-NEXT: s_mov_b32 s8, 0xffff | 521 ; CI-NEXT: s_mov_b32 s0, 0xffff |
485 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] | 522 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
486 ; CI-NEXT: s_waitcnt vmcnt(1) | 523 ; CI-NEXT: s_waitcnt vmcnt(1) |
487 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 | 524 ; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2 |
488 ; CI-NEXT: s_waitcnt vmcnt(0) | 525 ; CI-NEXT: s_waitcnt vmcnt(0) |
489 ; CI-NEXT: v_and_b32_e32 v8, s8, v4 | 526 ; CI-NEXT: v_and_b32_e32 v8, s0, v4 |
490 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 | 527 ; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 |
491 ; CI-NEXT: v_and_b32_e32 v9, s8, v5 | 528 ; CI-NEXT: v_and_b32_e32 v9, s0, v5 |
492 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 | 529 ; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3 |
493 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 | 530 ; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5 |
494 ; CI-NEXT: v_lshl_b32_e32 v5, v7, v5 | 531 ; CI-NEXT: v_lshl_b32_e32 v5, v7, v5 |
495 ; CI-NEXT: v_lshl_b32_e32 v3, v3, v9 | 532 ; CI-NEXT: v_lshl_b32_e32 v3, v3, v9 |
496 ; CI-NEXT: v_lshl_b32_e32 v4, v6, v4 | 533 ; CI-NEXT: v_lshl_b32_e32 v4, v6, v4 |
497 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v8 | 534 ; CI-NEXT: v_lshl_b32_e32 v2, v2, v8 |
498 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 | 535 ; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5 |
499 ; CI-NEXT: v_and_b32_e32 v3, s8, v3 | 536 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 |
500 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 | 537 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
501 ; CI-NEXT: v_and_b32_e32 v2, s8, v2 | 538 ; CI-NEXT: v_and_b32_e32 v2, s0, v2 |
502 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 | 539 ; CI-NEXT: v_or_b32_e32 v3, v3, v5 |
503 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 | 540 ; CI-NEXT: v_or_b32_e32 v2, v2, v4 |
504 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 | 541 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 |
505 ; CI-NEXT: s_endpgm | 542 ; CI-NEXT: s_endpgm |
543 ; | |
544 ; GFX10-LABEL: v_shl_v4i16: | |
545 ; GFX10: ; %bb.0: | |
546 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | |
547 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0 | |
548 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | |
549 ; GFX10-NEXT: s_clause 0x1 | |
550 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] | |
551 ; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 | |
552 ; GFX10-NEXT: s_waitcnt vmcnt(0) | |
553 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, v3, v1 | |
554 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, v2, v0 | |
555 ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] | |
556 ; GFX10-NEXT: s_endpgm | |
506 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 557 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
507 %tid.ext = sext i32 %tid to i64 | 558 %tid.ext = sext i32 %tid to i64 |
508 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext | 559 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext |
509 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext | 560 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext |
510 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1 | 561 %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1 |
519 ; GFX9-LABEL: shl_v_imm_v4i16: | 570 ; GFX9-LABEL: shl_v_imm_v4i16: |
520 ; GFX9: ; %bb.0: | 571 ; GFX9: ; %bb.0: |
521 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 572 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
522 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 | 573 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 |
523 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) | 574 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) |
524 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 | 575 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] |
525 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 | |
526 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc | |
527 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off | |
528 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 | |
529 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 | |
530 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc | |
531 ; GFX9-NEXT: s_waitcnt vmcnt(0) | 576 ; GFX9-NEXT: s_waitcnt vmcnt(0) |
532 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] | 577 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] |
533 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] | 578 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] |
534 ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off | 579 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] |
535 ; GFX9-NEXT: s_endpgm | 580 ; GFX9-NEXT: s_endpgm |
536 ; | 581 ; |
537 ; VI-LABEL: shl_v_imm_v4i16: | 582 ; VI-LABEL: shl_v_imm_v4i16: |
538 ; VI: ; %bb.0: | 583 ; VI: ; %bb.0: |
539 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | 584 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 |
540 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 | 585 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 |
541 ; VI-NEXT: s_waitcnt lgkmcnt(0) | 586 ; VI-NEXT: s_waitcnt lgkmcnt(0) |
587 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 | |
542 ; VI-NEXT: v_mov_b32_e32 v1, s3 | 588 ; VI-NEXT: v_mov_b32_e32 v1, s3 |
543 ; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2 | |
544 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc | 589 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc |
545 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] | 590 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] |
591 ; VI-NEXT: s_mov_b32 s2, 0xff000000 | |
592 ; VI-NEXT: v_mov_b32_e32 v3, s1 | |
546 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 | 593 ; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 |
547 ; VI-NEXT: s_mov_b32 s0, 0xff000000 | |
548 ; VI-NEXT: v_mov_b32_e32 v3, s1 | |
549 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc | 594 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc |
550 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | 595 ; VI-NEXT: s_waitcnt vmcnt(0) |
551 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 | 596 ; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1 |
552 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 | 597 ; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0 |
553 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 | 598 ; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 |
554 ; VI-NEXT: v_and_b32_e32 v0, s0, v0 | 599 ; VI-NEXT: v_and_b32_e32 v0, s2, v0 |
555 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 | 600 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1 |
556 ; VI-NEXT: v_and_b32_e32 v4, s0, v4 | 601 ; VI-NEXT: v_and_b32_e32 v4, s2, v4 |
557 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 | 602 ; VI-NEXT: v_or_b32_e32 v1, v1, v4 |
558 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 | 603 ; VI-NEXT: v_or_b32_e32 v0, v5, v0 |
559 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] | 604 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] |
560 ; VI-NEXT: s_endpgm | 605 ; VI-NEXT: s_endpgm |
561 ; | 606 ; |
567 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 | 612 ; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 |
568 ; CI-NEXT: v_mov_b32_e32 v1, 0 | 613 ; CI-NEXT: v_mov_b32_e32 v1, 0 |
569 ; CI-NEXT: s_waitcnt lgkmcnt(0) | 614 ; CI-NEXT: s_waitcnt lgkmcnt(0) |
570 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] | 615 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7] |
571 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 | 616 ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 |
572 ; CI-NEXT: s_mov_b32 s8, 0xff00 | 617 ; CI-NEXT: s_mov_b32 s0, 0xff00 |
573 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] | 618 ; CI-NEXT: s_mov_b64 s[6:7], s[2:3] |
574 ; CI-NEXT: s_waitcnt vmcnt(0) | 619 ; CI-NEXT: s_waitcnt vmcnt(0) |
575 ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 | 620 ; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3 |
576 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 | 621 ; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 |
577 ; CI-NEXT: v_and_b32_e32 v4, s8, v4 | 622 ; CI-NEXT: v_and_b32_e32 v4, s0, v4 |
578 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 | 623 ; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 |
579 ; CI-NEXT: v_and_b32_e32 v3, s8, v3 | 624 ; CI-NEXT: v_and_b32_e32 v3, s0, v3 |
580 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 | 625 ; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4 |
581 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 | 626 ; CI-NEXT: v_or_b32_e32 v3, v3, v4 |
582 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 | 627 ; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2 |
583 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 | 628 ; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 |
584 ; CI-NEXT: s_endpgm | 629 ; CI-NEXT: s_endpgm |
630 ; | |
631 ; GFX10-LABEL: shl_v_imm_v4i16: | |
632 ; GFX10: ; %bb.0: | |
633 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 | |
634 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0 | |
635 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) | |
636 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] | |
637 ; GFX10-NEXT: s_waitcnt vmcnt(0) | |
638 ; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] | |
639 ; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] | |
640 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] | |
641 ; GFX10-NEXT: s_endpgm | |
585 %tid = call i32 @llvm.amdgcn.workitem.id.x() | 642 %tid = call i32 @llvm.amdgcn.workitem.id.x() |
586 %tid.ext = sext i32 %tid to i64 | 643 %tid.ext = sext i32 %tid to i64 |
587 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext | 644 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext |
588 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext | 645 %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext |
589 %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep | 646 %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep |