@@ -211,28 +211,15 @@ template<>
211
211
/* for other fc*/
212
212
for (int word_id = 0 ; word_id < max_len; word_id++) {
213
213
_attn_outs[0 ]->reshape (first_fc_out_0_shape);
214
- // if (word_id > 1) {
215
- // break;
216
- // }
217
214
218
215
if (word_id > 0 ) {
219
216
Shape h_shape = {seq_num, N_0, 1 , 1 };
220
217
_first_fc_out_1.reshape (h_shape);
221
218
222
- // auto kernel_1 = saber_find_fast_sass_gemm(false, !fc_vec[0].is_transpose_weights, seq_num, N_0, hidden_size);
223
219
auto kernel_1 = saber_find_fast_sass_gemm (false , false , seq_num, N_0, hidden_size);
224
220
kernel_1 (seq_num, N_0, hidden_size, 1 .0f ,
225
221
_cell_out.data (), 0 .f ,
226
- fc_vec[0 ].weights ->data () + K_0 * N_0, _first_fc_out_1.mutable_data (), stream);
227
- // cudaDeviceSynchronize();
228
- // print_tensor_device(_lstm_out);
229
- // print_tensor_device(*(fc_vec[0]->weights));
230
- // cudaDeviceSynchronize();
231
- // gemm(_handle, false, false, seq_num, N_0, hidden_size,
232
- // 1.0, _lstm_out.data() + (word_id - 1) * seq_num * hidden_size,
233
- // fc_vec[0]->weights->data() + K_0 * N_0,
234
- // 0.f, _first_fc_out_1.mutable_data());
235
- // cudaDeviceSynchronize();
222
+ fc_vec[0 ].weights ->data () + K_0 * N_0, _first_fc_out_1.mutable_data (), stream);
236
223
237
224
sequence_bias_relu<<<CUDA_GET_BLOCKS(_attn_outs[0 ]->valid_size ()), CUDA_NUM_THREADS, 0, stream>>>(_first_fc_out_0.data(), _first_fc_out_1.data(), fc_vec[0].bias->data(),
238
225
_dev_seq_id_map.data(), M_0, N_0, _attn_outs[0]->mutable_data());
@@ -252,7 +239,6 @@ template<>
252
239
auto fc_in_data = _attn_outs[i - 1 ]->data ();
253
240
auto fc_out_data = _attn_outs[i]->mutable_data ();
254
241
255
- // auto kernel = saber_find_fast_sass_gemm(false, !fc_vec[i].is_transpose_weights, M, N, K);
256
242
auto kernel = saber_find_fast_sass_gemm (false , false , M, N, K);
257
243
kernel (M, N, K, 1 .0f , fc_in_data, 0 .0f , fc_vec[i].weights ->data (), fc_out_data, stream);
258
244
bias_relu<<<CUDA_GET_BLOCKS(_attn_outs[i]->valid_size ()), CUDA_NUM_THREADS, 0, stream>>>(fc_out_data, fc_vec[i].bias->data (), _attn_outs[i]->valid_size(), N, fc_out_data);
@@ -268,14 +254,6 @@ template<>
268
254
sequence_softmax<<<CUDA_GET_BLOCKS(seq_num), CUDA_NUM_THREADS, 0 , stream>>> (_attn_outs[fc_num - 1 ]->data (), _dev_offset.data(), seq_num, _softmax_out.mutable_data());
269
255
270
256
sequence_pool<<<CUDA_GET_BLOCKS(seq_num * dim), CUDA_NUM_THREADS, 0 , stream>>> (input->data (), _softmax_out.data(), _dev_offset.data(), seq_num, inputs[0]->num(), dim, _pool_out.mutable_data());
271
- /* data after pool need be sorted or append*/
272
- // cudaDeviceSynchronize();
273
- // record_dev_tensorfile<NV>(_pool_out.mutable_data(), _pool_out.valid_size(), "./sequence_pool_out_cu.txt");
274
- // record_dev_tensorfile<NV>(_softmax_out.mutable_data(), _softmax_out.valid_size(), "./softmax_out_cu.txt");
275
- // record_dev_tensorfile<NV>(_attn_outs[0]->mutable_data(), _attn_outs[0]->valid_size(), "./attn_fc_0_cu.txt");
276
- // record_dev_tensorfile<NV>(_attn_outs[1]->mutable_data(), _attn_outs[1]->valid_size(), "./attn_fc_1_cu.txt");
277
- // record_dev_tensorfile<NV>(_first_fc_out_1.mutable_data(), _first_fc_out_1.valid_size(), "./first_fc_1_cu.txt");
278
- // record_dev_tensorfile<NV>(attn_param.fc_vec[0].weights->data() + 30, /*attn_param.fc_vec[0]->weights->valid_size()*/ 15, "./fc_0_weight.txt");
279
257
280
258
281
259
auto x_data = _pool_out.data();
@@ -298,11 +276,6 @@ template<>
298
276
_dev_offset.data(), seq_num, word_num, hidden_size, outputs[0 ]->mutable_data ());
299
277
300
278
outputs[0 ]->set_seq_offset (inputs[0 ]->get_seq_offset ());
301
- CUDA_CHECK (cudaDeviceSynchronize());
302
- CUDA_CHECK (cudaPeekAtLastError());
303
- // cudaDeviceSynchronize();
304
- // record_dev_tensorfile<NV>(outputs[0]->data(), outputs[0]->valid_size(), "./final_out.txt");
305
- // record_dev_tensorfile<NV>(_lstm_out.mutable_data(), _lstm_out.valid_size(), "./lstm_out.txt");
306
279
return SaberSuccess;
307
280
}
308
281
0 commit comments