Skip to content

Commit 2ae4c26

Browse files
DesmonDay authored and root committed
add graphsage slot feature (PaddlePaddle#126)
1 parent 520e680 commit 2ae4c26

File tree

1 file changed

+64
-49
lines changed

1 file changed

+64
-49
lines changed

paddle/fluid/framework/data_feed.cu

+64-49
Original file line numberDiff line numberDiff line change
@@ -737,6 +737,8 @@ int GraphDataGenerator::GenerateBatch() {
737737
int total_instance = 0;
738738
platform::CUDADeviceGuard guard(gpuid_);
739739
int res = 0;
740+
741+
std::shared_ptr<phi::Allocation> final_sage_nodes;
740742
if (!gpu_graph_training_) {
741743
while (cursor_ < h_device_keys_.size()) {
742744
size_t device_key_size = h_device_keys_[cursor_]->size();
@@ -776,7 +778,6 @@ int GraphDataGenerator::GenerateBatch() {
776778
0,
777779
stream_>>>(clk_tensor_ptr_, total_instance);
778780
} else {
779-
780781
auto node_buf = memory::AllocShared(
781782
place_, total_instance * sizeof(uint64_t));
782783
int64_t* node_buf_ptr = reinterpret_cast<int64_t* >(node_buf->ptr());
@@ -789,7 +790,7 @@ int GraphDataGenerator::GenerateBatch() {
789790
phi::DenseTensor inverse_;
790791
VLOG(1) << "generate sample graph";
791792
uint64_t* node_buf_ptr_ = reinterpret_cast<uint64_t* >(node_buf->ptr());
792-
std::shared_ptr<phi::Allocation> final_infer_nodes =
793+
final_sage_nodes =
793794
GenerateSampleGraph(node_buf_ptr_, total_instance, &uniq_instance_,
794795
&inverse_);
795796
id_tensor_ptr_ =
@@ -803,7 +804,7 @@ int GraphDataGenerator::GenerateBatch() {
803804
feed_vec_[index_offset]->mutable_data<int>({total_instance}, this->place_);
804805

805806
VLOG(1) << "copy id and index";
806-
cudaMemcpy(id_tensor_ptr_, final_infer_nodes->ptr(),
807+
cudaMemcpy(id_tensor_ptr_, final_sage_nodes->ptr(),
807808
sizeof(int64_t) * uniq_instance_,
808809
cudaMemcpyDeviceToDevice);
809810
cudaMemcpy(index_tensor_ptr_, inverse_.data<int>(), sizeof(int) * total_instance,
@@ -840,31 +841,7 @@ int GraphDataGenerator::GenerateBatch() {
840841
total_instance *= 2;
841842
}
842843

843-
int64_t *slot_tensor_ptr_[slot_num_];
844-
int64_t *slot_lod_tensor_ptr_[slot_num_];
845-
if (slot_num_ > 0) {
846-
for (int i = 0; i < slot_num_; ++i) {
847-
slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data<int64_t>(
848-
{total_instance * h_slot_feature_num_map_[i], 1}, this->place_);
849-
slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data<int64_t>(
850-
{total_instance + 1}, this->place_);
851-
}
852-
if (FLAGS_enable_opt_get_features || !gpu_graph_training_) {
853-
cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(),
854-
slot_tensor_ptr_,
855-
sizeof(uint64_t *) * slot_num_,
856-
cudaMemcpyHostToDevice,
857-
stream_);
858-
cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(),
859-
slot_lod_tensor_ptr_,
860-
sizeof(uint64_t *) * slot_num_,
861-
cudaMemcpyHostToDevice,
862-
stream_);
863-
}
864-
}
865-
866844
uint64_t *ins_cursor, *ins_buf;
867-
std::shared_ptr<phi::Allocation> final_nodes;
868845
phi::DenseTensor inverse;
869846
if (gpu_graph_training_) {
870847
VLOG(2) << "total_instance: " << total_instance
@@ -893,7 +870,7 @@ int GraphDataGenerator::GenerateBatch() {
893870
stream_>>>(clk_tensor_ptr_, total_instance);
894871
} else {
895872
VLOG(2) << gpuid_ << " " << "Ready to enter GenerateSampleGraph";
896-
final_nodes = GenerateSampleGraph(ins_cursor, total_instance, &uniq_instance_,
873+
final_sage_nodes = GenerateSampleGraph(ins_cursor, total_instance, &uniq_instance_,
897874
&inverse);
898875
VLOG(2) << "Copy Final Results";
899876
id_tensor_ptr_ =
@@ -907,7 +884,7 @@ int GraphDataGenerator::GenerateBatch() {
907884
feed_vec_[index_offset]->mutable_data<int>({total_instance}, this->place_);
908885

909886
cudaMemcpyAsync(id_tensor_ptr_,
910-
final_nodes->ptr(),
887+
final_sage_nodes->ptr(),
911888
sizeof(int64_t) * uniq_instance_,
912889
cudaMemcpyDeviceToDevice,
913890
stream_);
@@ -930,23 +907,60 @@ int GraphDataGenerator::GenerateBatch() {
930907
ins_cursor = (uint64_t *)id_tensor_ptr_; // NOLINT
931908
}
932909

910+
int64_t *slot_tensor_ptr_[slot_num_];
911+
int64_t *slot_lod_tensor_ptr_[slot_num_];
933912
if (slot_num_ > 0) {
913+
int slot_instance = sage_mode_ == true ? uniq_instance_ : total_instance;
914+
for (int i = 0; i < slot_num_; ++i) {
915+
slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data<int64_t>(
916+
{slot_instance * h_slot_feature_num_map_[i], 1}, this->place_);
917+
slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data<int64_t>(
918+
{slot_instance + 1}, this->place_);
919+
}
920+
if (FLAGS_enable_opt_get_features || !gpu_graph_training_) {
921+
cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(),
922+
slot_tensor_ptr_,
923+
sizeof(uint64_t *) * slot_num_,
924+
cudaMemcpyHostToDevice,
925+
stream_);
926+
cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(),
927+
slot_lod_tensor_ptr_,
928+
sizeof(uint64_t *) * slot_num_,
929+
cudaMemcpyHostToDevice,
930+
stream_);
931+
}
932+
if (sage_mode_) {
933+
d_feature_buf_ =
934+
memory::AllocShared(place_, slot_instance * slot_num_ * sizeof(uint64_t));
935+
}
934936
uint64_t *feature_buf = reinterpret_cast<uint64_t *>(d_feature_buf_->ptr());
935937
if (FLAGS_enable_opt_get_features || !gpu_graph_training_) {
936-
FillFeatureBuf(ins_cursor, feature_buf, total_instance);
937-
// FillFeatureBuf(id_tensor_ptr_, feature_buf, total_instance);
938+
if (!sage_mode_) {
939+
FillFeatureBuf(ins_cursor, feature_buf, slot_instance);
940+
} else {
941+
uint64_t* sage_nodes_ptr = reinterpret_cast<uint64_t *>(final_sage_nodes->ptr());
942+
FillFeatureBuf(sage_nodes_ptr, feature_buf, slot_instance);
943+
}
938944
if (debug_mode_) {
939-
uint64_t h_walk[total_instance]; // NOLINT
940-
cudaMemcpy(h_walk,
941-
ins_cursor,
942-
total_instance * sizeof(uint64_t),
943-
cudaMemcpyDeviceToHost);
944-
uint64_t h_feature[total_instance * fea_num_per_node_];
945+
uint64_t h_walk[slot_instance];
946+
if (!sage_mode_) {
947+
cudaMemcpy(h_walk,
948+
ins_cursor,
949+
slot_instance * sizeof(uint64_t),
950+
cudaMemcpyDeviceToHost);
951+
} else {
952+
uint64_t* sage_nodes_ptr = reinterpret_cast<uint64_t *>(final_sage_nodes->ptr());
953+
cudaMemcpy(h_walk,
954+
sage_nodes_ptr,
955+
slot_instance * sizeof(uint64_t),
956+
cudaMemcpyDeviceToHost);
957+
}
958+
uint64_t h_feature[slot_instance * fea_num_per_node_];
945959
cudaMemcpy(h_feature,
946960
feature_buf,
947-
total_instance * fea_num_per_node_ * sizeof(uint64_t),
961+
slot_instance * fea_num_per_node_ * sizeof(uint64_t),
948962
cudaMemcpyDeviceToHost);
949-
for (int i = 0; i < total_instance; ++i) {
963+
for (int i = 0; i < slot_instance; ++i) {
950964
std::stringstream ss;
951965
for (int j = 0; j < fea_num_per_node_; ++j) {
952966
ss << h_feature[i * fea_num_per_node_ + j] << " ";
@@ -957,26 +971,25 @@ int GraphDataGenerator::GenerateBatch() {
957971
<< "] = " << ss.str();
958972
}
959973
}
960-
961-
GraphFillSlotKernel<<<GET_BLOCKS(total_instance * fea_num_per_node_),
974+
GraphFillSlotKernel<<<GET_BLOCKS(slot_instance * fea_num_per_node_),
962975
CUDA_NUM_THREADS,
963976
0,
964977
stream_>>>((uint64_t *)d_slot_tensor_ptr_->ptr(),
965978
feature_buf,
966-
total_instance * fea_num_per_node_,
967-
total_instance,
979+
slot_instance * fea_num_per_node_,
980+
slot_instance,
968981
slot_num_,
969982
(int*)d_slot_feature_num_map_->ptr(),
970983
fea_num_per_node_,
971984
(int*)d_actual_slot_id_map_->ptr(),
972985
(int*)d_fea_offset_map_->ptr());
973-
GraphFillSlotLodKernelOpt<<<GET_BLOCKS((total_instance + 1) * slot_num_),
986+
GraphFillSlotLodKernelOpt<<<GET_BLOCKS((slot_instance + 1) * slot_num_),
974987
CUDA_NUM_THREADS,
975988
0,
976989
stream_>>>(
977-
(uint64_t *)d_slot_lod_tensor_ptr_->ptr(), // NOLINT
978-
(total_instance + 1) * slot_num_,
979-
total_instance + 1,
990+
(uint64_t *)d_slot_lod_tensor_ptr_->ptr(),
991+
(slot_instance + 1) * slot_num_,
992+
slot_instance + 1,
980993
(int*)d_slot_feature_num_map_->ptr());
981994
} else {
982995
for (int i = 0; i < slot_num_; ++i) {
@@ -1519,8 +1532,10 @@ void GraphDataGenerator::AllocResource(
15191532
d_ins_buf_ =
15201533
memory::AllocShared(place_, (batch_size_ * 2 * 2) * sizeof(uint64_t));
15211534
if (slot_num_ > 0) {
1522-
d_feature_buf_ = memory::AllocShared(
1523-
place_, (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t));
1535+
if (!sage_mode_) {
1536+
d_feature_buf_ = memory::AllocShared(
1537+
place_, (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t));
1538+
}
15241539
}
15251540
d_pair_num_ = memory::AllocShared(place_, sizeof(int));
15261541
if (FLAGS_enable_opt_get_features && slot_num_ > 0) {

0 commit comments

Comments (0)