@@ -737,6 +737,8 @@ int GraphDataGenerator::GenerateBatch() {
   int total_instance = 0;
   platform::CUDADeviceGuard guard(gpuid_);
   int res = 0;
+
+  std::shared_ptr<phi::Allocation> final_sage_nodes;
   if (!gpu_graph_training_) {
     while (cursor_ < h_device_keys_.size()) {
       size_t device_key_size = h_device_keys_[cursor_]->size();
@@ -776,7 +778,6 @@ int GraphDataGenerator::GenerateBatch() {
                             0,
                             stream_>>>(clk_tensor_ptr_, total_instance);
     } else {
-
       auto node_buf = memory::AllocShared(
           place_, total_instance * sizeof(uint64_t));
       int64_t* node_buf_ptr = reinterpret_cast<int64_t*>(node_buf->ptr());
@@ -789,7 +790,7 @@ int GraphDataGenerator::GenerateBatch() {
       phi::DenseTensor inverse_;
       VLOG(1) << "generate sample graph";
       uint64_t* node_buf_ptr_ = reinterpret_cast<uint64_t*>(node_buf->ptr());
-      std::shared_ptr<phi::Allocation> final_infer_nodes =
+      final_sage_nodes =
           GenerateSampleGraph(node_buf_ptr_, total_instance, &uniq_instance_,
                               &inverse_);
       id_tensor_ptr_ =
@@ -803,7 +804,7 @@ int GraphDataGenerator::GenerateBatch() {
           feed_vec_[index_offset]->mutable_data<int>({total_instance}, this->place_);

       VLOG(1) << "copy id and index";
-      cudaMemcpy(id_tensor_ptr_, final_infer_nodes->ptr(),
+      cudaMemcpy(id_tensor_ptr_, final_sage_nodes->ptr(),
                  sizeof(int64_t) * uniq_instance_,
                  cudaMemcpyDeviceToDevice);
       cudaMemcpy(index_tensor_ptr_, inverse_.data<int>(), sizeof(int) * total_instance,
@@ -840,31 +841,7 @@ int GraphDataGenerator::GenerateBatch() {
     total_instance *= 2;
   }

-  int64_t* slot_tensor_ptr_[slot_num_];
-  int64_t* slot_lod_tensor_ptr_[slot_num_];
-  if (slot_num_ > 0) {
-    for (int i = 0; i < slot_num_; ++i) {
-      slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data<int64_t>(
-          {total_instance * h_slot_feature_num_map_[i], 1}, this->place_);
-      slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data<int64_t>(
-          {total_instance + 1}, this->place_);
-    }
-    if (FLAGS_enable_opt_get_features || !gpu_graph_training_) {
-      cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(),
-                      slot_tensor_ptr_,
-                      sizeof(uint64_t*) * slot_num_,
-                      cudaMemcpyHostToDevice,
-                      stream_);
-      cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(),
-                      slot_lod_tensor_ptr_,
-                      sizeof(uint64_t*) * slot_num_,
-                      cudaMemcpyHostToDevice,
-                      stream_);
-    }
-  }
-
   uint64_t *ins_cursor, *ins_buf;
-  std::shared_ptr<phi::Allocation> final_nodes;
   phi::DenseTensor inverse;
   if (gpu_graph_training_) {
     VLOG(2) << "total_instance: " << total_instance
@@ -893,7 +870,7 @@ int GraphDataGenerator::GenerateBatch() {
                         stream_>>>(clk_tensor_ptr_, total_instance);
     } else {
       VLOG(2) << gpuid_ << " " << "Ready to enter GenerateSampleGraph";
-      final_nodes = GenerateSampleGraph(ins_cursor, total_instance, &uniq_instance_,
+      final_sage_nodes = GenerateSampleGraph(ins_cursor, total_instance, &uniq_instance_,
                                         &inverse);
       VLOG(2) << "Copy Final Results";
       id_tensor_ptr_ =
@@ -907,7 +884,7 @@ int GraphDataGenerator::GenerateBatch() {
           feed_vec_[index_offset]->mutable_data<int>({total_instance}, this->place_);

       cudaMemcpyAsync(id_tensor_ptr_,
-                      final_nodes->ptr(),
+                      final_sage_nodes->ptr(),
                       sizeof(int64_t) * uniq_instance_,
                       cudaMemcpyDeviceToDevice,
                       stream_);
@@ -930,23 +907,60 @@ int GraphDataGenerator::GenerateBatch() {
     ins_cursor = (uint64_t*)id_tensor_ptr_;  // NOLINT
   }

+  int64_t* slot_tensor_ptr_[slot_num_];
+  int64_t* slot_lod_tensor_ptr_[slot_num_];
   if (slot_num_ > 0) {
+    int slot_instance = sage_mode_ == true ? uniq_instance_ : total_instance;
+    for (int i = 0; i < slot_num_; ++i) {
+      slot_tensor_ptr_[i] = feed_vec_[3 + 2 * i]->mutable_data<int64_t>(
+          {slot_instance * h_slot_feature_num_map_[i], 1}, this->place_);
+      slot_lod_tensor_ptr_[i] = feed_vec_[3 + 2 * i + 1]->mutable_data<int64_t>(
+          {slot_instance + 1}, this->place_);
+    }
+    if (FLAGS_enable_opt_get_features || !gpu_graph_training_) {
+      cudaMemcpyAsync(d_slot_tensor_ptr_->ptr(),
+                      slot_tensor_ptr_,
+                      sizeof(uint64_t*) * slot_num_,
+                      cudaMemcpyHostToDevice,
+                      stream_);
+      cudaMemcpyAsync(d_slot_lod_tensor_ptr_->ptr(),
+                      slot_lod_tensor_ptr_,
+                      sizeof(uint64_t*) * slot_num_,
+                      cudaMemcpyHostToDevice,
+                      stream_);
+    }
+    if (sage_mode_) {
+      d_feature_buf_ =
+          memory::AllocShared(place_, slot_instance * slot_num_ * sizeof(uint64_t));
+    }
     uint64_t* feature_buf = reinterpret_cast<uint64_t*>(d_feature_buf_->ptr());
     if (FLAGS_enable_opt_get_features || !gpu_graph_training_) {
-      FillFeatureBuf(ins_cursor, feature_buf, total_instance);
-      // FillFeatureBuf(id_tensor_ptr_, feature_buf, total_instance);
+      if (!sage_mode_) {
+        FillFeatureBuf(ins_cursor, feature_buf, slot_instance);
+      } else {
+        uint64_t* sage_nodes_ptr = reinterpret_cast<uint64_t*>(final_sage_nodes->ptr());
+        FillFeatureBuf(sage_nodes_ptr, feature_buf, slot_instance);
+      }
       if (debug_mode_) {
-        uint64_t h_walk[total_instance];  // NOLINT
-        cudaMemcpy(h_walk,
-                   ins_cursor,
-                   total_instance * sizeof(uint64_t),
-                   cudaMemcpyDeviceToHost);
-        uint64_t h_feature[total_instance * fea_num_per_node_];
+        uint64_t h_walk[slot_instance];
+        if (!sage_mode_) {
+          cudaMemcpy(h_walk,
+                     ins_cursor,
+                     slot_instance * sizeof(uint64_t),
+                     cudaMemcpyDeviceToHost);
+        } else {
+          uint64_t* sage_nodes_ptr = reinterpret_cast<uint64_t*>(final_sage_nodes->ptr());
+          cudaMemcpy(h_walk,
+                     sage_nodes_ptr,
+                     slot_instance * sizeof(uint64_t),
+                     cudaMemcpyDeviceToHost);
+        }
+        uint64_t h_feature[slot_instance * fea_num_per_node_];
         cudaMemcpy(h_feature,
                    feature_buf,
-                   total_instance * fea_num_per_node_ * sizeof(uint64_t),
+                   slot_instance * fea_num_per_node_ * sizeof(uint64_t),
                    cudaMemcpyDeviceToHost);
-        for (int i = 0; i < total_instance; ++i) {
+        for (int i = 0; i < slot_instance; ++i) {
           std::stringstream ss;
           for (int j = 0; j < fea_num_per_node_; ++j) {
             ss << h_feature[i * fea_num_per_node_ + j] << " ";
@@ -957,26 +971,25 @@ int GraphDataGenerator::GenerateBatch() {
               << "] = " << ss.str();
         }
       }
-
-      GraphFillSlotKernel<<<GET_BLOCKS(total_instance * fea_num_per_node_),
+      GraphFillSlotKernel<<<GET_BLOCKS(slot_instance * fea_num_per_node_),
                             CUDA_NUM_THREADS,
                             0,
                             stream_>>>((uint64_t*)d_slot_tensor_ptr_->ptr(),
                                        feature_buf,
-                                       total_instance * fea_num_per_node_,
-                                       total_instance,
+                                       slot_instance * fea_num_per_node_,
+                                       slot_instance,
                                        slot_num_,
                                        (int*)d_slot_feature_num_map_->ptr(),
                                        fea_num_per_node_,
                                        (int*)d_actual_slot_id_map_->ptr(),
                                        (int*)d_fea_offset_map_->ptr());
-      GraphFillSlotLodKernelOpt<<<GET_BLOCKS((total_instance + 1) * slot_num_),
+      GraphFillSlotLodKernelOpt<<<GET_BLOCKS((slot_instance + 1) * slot_num_),
                                   CUDA_NUM_THREADS,
                                   0,
                                   stream_>>>(
-          (uint64_t*)d_slot_lod_tensor_ptr_->ptr(),  // NOLINT
-          (total_instance + 1) * slot_num_,
-          total_instance + 1,
+          (uint64_t*)d_slot_lod_tensor_ptr_->ptr(),
+          (slot_instance + 1) * slot_num_,
+          slot_instance + 1,
           (int*)d_slot_feature_num_map_->ptr());
     } else {
       for (int i = 0; i < slot_num_; ++i) {
@@ -1519,8 +1532,10 @@ void GraphDataGenerator::AllocResource(
   d_ins_buf_ =
       memory::AllocShared(place_, (batch_size_ * 2 * 2) * sizeof(uint64_t));
   if (slot_num_ > 0) {
-    d_feature_buf_ = memory::AllocShared(
-        place_, (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t));
+    if (!sage_mode_) {
+      d_feature_buf_ = memory::AllocShared(
+          place_, (batch_size_ * 2 * 2) * slot_num_ * sizeof(uint64_t));
+    }
   }
   d_pair_num_ = memory::AllocShared(place_, sizeof(int));
   if (FLAGS_enable_opt_get_features && slot_num_ > 0) {
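The sizing rule this diff introduces can be summarized as: when sage_mode_ is on, the slot tensors, the host debug buffers, and d_feature_buf_ are dimensioned by uniq_instance_ (the de-duplicated node count returned by GenerateSampleGraph) instead of total_instance, which is why the feature-buffer allocation moves out of AllocResource and into the per-batch path. Below is a minimal, standalone C++ sketch of that rule only; SlotBufferPlanner, its fields, and main are illustrative stand-ins, not Paddle code.

#include <cstdint>
#include <cstdio>

// Stand-in for the generator's per-batch state (illustration only).
struct SlotBufferPlanner {
  bool sage_mode = false;
  int total_instance = 0;  // batch pairs (already doubled in the real code)
  int uniq_instance = 0;   // unique nodes from the sampled sub-graph
  int slot_num = 0;

  // Mirrors the patched branch: size by uniq_instance in sage mode,
  // otherwise by total_instance as before.
  int SlotInstance() const {
    return sage_mode ? uniq_instance : total_instance;
  }

  // uniq_instance is only known per batch, so in sage mode the feature
  // buffer has to be sized here rather than once at resource allocation.
  size_t FeatureBufBytes() const {
    return static_cast<size_t>(SlotInstance()) * slot_num * sizeof(uint64_t);
  }
};

int main() {
  SlotBufferPlanner p;
  p.sage_mode = true;
  p.total_instance = 2048;
  p.uniq_instance = 731;
  p.slot_num = 4;
  std::printf("slot_instance=%d, feature_buf=%zu bytes\n",
              p.SlotInstance(), p.FeatureBufBytes());
  return 0;
}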