@@ -33,12 +33,15 @@ using LoDTensor = framework::LoDTensor;
33
33
using SelectedRows = framework::SelectedRows;
34
34
using DDim = framework::DDim;
35
35
36
+ constexpr int64_t kNoPadding = -1 ;
37
+
36
38
#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
37
- !defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
39
+ !defined(__OSX__)
38
40
template <typename T>
39
41
void prepare_csr_data (const std::vector<uint64_t > &offset,
40
42
const int64_t *ids_data, const size_t idx_width,
41
- T *csr_vals, int *csr_colmuns, int *csr_row_idx) {
43
+ T *csr_vals, int *csr_colmuns, int *csr_row_idx,
44
+ int64_t padding_idx = kNoPadding ) {
42
45
int val_idx = 0 ;
43
46
int row_idx = 0 ;
44
47
csr_row_idx[0 ] = 0 ;
@@ -52,9 +55,11 @@ void prepare_csr_data(const std::vector<uint64_t> &offset,
52
55
53
56
// construct a map for creating csr
54
57
for (size_t j = offset[i]; j < offset[i + 1 ]; ++j) {
55
- unsigned int word_idx =
56
- static_cast <unsigned int >(ids_data[idx + j * idx_width]);
57
- ++ids_map[word_idx];
58
+ auto ids_value = ids_data[idx + j * idx_width];
59
+ if (ids_value != padding_idx) {
60
+ unsigned int word_idx = static_cast <unsigned int >(ids_value);
61
+ ++ids_map[word_idx];
62
+ }
58
63
}
59
64
60
65
VLOG (4 ) << " ====sequence %d====" << i;
@@ -124,16 +129,17 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
124
129
FusedEmbeddingSeqPoolLastDim (table_var->dims (), ids_t ->dims ());
125
130
const auto &ids_lod = ids_t ->lod ();
126
131
// in run time, the LoD of ids must be 1
127
- PADDLE_ENFORCE (ids_lod.size (), 1UL ,
128
- " The LoD level of Input(Ids) must be 1" );
132
+ PADDLE_ENFORCE_EQ (ids_lod.size (), 1UL ,
133
+ " The LoD level of Input(Ids) must be 1" );
129
134
int64_t batch_size = ids_lod[0 ].size () - 1 ;
130
135
// in run time, the shape from Ids -> output
131
136
// should be [seq_length, 1] -> [batch_size, last_dim]
132
137
output_t ->Resize ({batch_size, last_dim});
133
138
134
139
if (combiner_type == " sum" ) {
135
140
#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
136
- !defined (__OSX__) && !defined (PADDLE_WITH_CUDA)
141
+ !defined (__OSX__)
142
+ int64_t padding_idx = context.Attr <int64_t >(" padding_idx" );
137
143
auto output = output_t ->mutable_data <T>(context.GetPlace ());
138
144
int64_t table_height = table_var->dims ()[0 ];
139
145
int64_t table_width = table_var->dims ()[1 ];
@@ -151,7 +157,7 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
151
157
auto csr_colmuns = csr_colmuns_t .mutable_data <int >(context.GetPlace ());
152
158
auto csr_row_idx = csr_row_idx_t .mutable_data <int >(context.GetPlace ());
153
159
prepare_csr_data<T>(offset, ids_t ->data <int64_t >(), idx_width, csr_vals,
154
- csr_colmuns, csr_row_idx);
160
+ csr_colmuns, csr_row_idx, padding_idx );
155
161
156
162
const char transa = ' N' ;
157
163
const T alpha = 1.0 ;
@@ -226,18 +232,19 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
226
232
}
227
233
} else {
228
234
#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
229
- !defined (__OSX__) && ! defined (PADDLE_WITH_CUDA)
235
+ !defined (__OSX__)
230
236
auto *ids = context.Input <LoDTensor>(" Ids" );
231
237
auto *d_output = context.Input <LoDTensor>(framework::GradVarName (" Out" ));
232
238
auto *d_table = context.Output <LoDTensor>(framework::GradVarName (" W" ));
239
+ int64_t padding_idx = context.Attr <int64_t >(" padding_idx" );
233
240
234
241
d_table->Resize (table_dim);
235
242
auto *d_table_data = d_table->mutable_data <T>(context.GetPlace ());
236
243
memset (d_table_data, 0 , d_table->numel () * sizeof (T));
237
244
238
245
const auto &ids_lod = ids->lod ();
239
- PADDLE_ENFORCE (ids_lod.size (), 1UL ,
240
- " The LoD level of Input(Ids) must be 1" );
246
+ PADDLE_ENFORCE_EQ (ids_lod.size (), 1UL ,
247
+ " The LoD level of Input(Ids) must be 1" );
241
248
const std::vector<uint64_t > offset = ids_lod[0 ];
242
249
auto len = ids->numel ();
243
250
int idx_width = len / offset.back ();
@@ -251,23 +258,21 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
251
258
auto csr_colmuns = csr_colmuns_t .mutable_data <int >(context.GetPlace ());
252
259
auto csr_row_idx = csr_row_idx_t .mutable_data <int >(context.GetPlace ());
253
260
prepare_csr_data<T>(offset, ids->data <int64_t >(), idx_width, csr_vals,
254
- csr_colmuns, csr_row_idx);
261
+ csr_colmuns, csr_row_idx, padding_idx );
255
262
256
263
auto *d_output_data = d_output->data <T>();
257
- const char transa = ' T' ;
258
- const T alpha = 1.0 ;
259
- const T beta = 0.0 ;
260
- const char matdescra[] = {' G' , ' L' , ' N' , ' C' };
261
-
262
- const int m = batch_size * idx_width;
263
- const int n = table_dim[1 ];
264
- const int k = table_dim[1 ];
265
-
266
264
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
267
- blas.CSRMM (&transa, &m, &n, &k, &alpha, matdescra, (const T *)csr_vals,
268
- (const int *)csr_colmuns, (const int *)csr_row_idx,
269
- (const int *)csr_row_idx + 1 , d_output_data, &n, &beta,
270
- d_table_data, &n);
265
+ int width = static_cast <int >(table_dim[1 ]);
266
+ int num_seq = batch_size * idx_width;
267
+ LOG (INFO) << " num seq = " << num_seq << " width = " << width;
268
+ for (int i = 0 ; i < num_seq; ++i) {
269
+ for (int j = csr_row_idx[i]; j < csr_row_idx[i + 1 ]; ++j) {
270
+ unsigned int word_idx = csr_colmuns[j];
271
+ T val = csr_vals[j];
272
+ blas.AXPY (width, val, d_output_data + i * width,
273
+ d_table_data + word_idx * width);
274
+ }
275
+ }
271
276
#else
272
277
LOG (ERROR) << " Dense is not supported in fused_embedding_seq_pool_op now" ;
273
278
#endif
0 commit comments