Make win_put/win_get/win_accumulate weights double precision

BichengYing · BichengYing · commit 03723d674187 · 2020-05-09T11:31:31.000-07:00
diff --git a/bluefog/common/common.h b/bluefog/common/common.h
@@ -233,8 +233,8 @@ struct TensorTableEntry {
   int device = CPU_DEVICE_ID;
   // Source and destination of ranks used in win ops.
   // It maps the src(dst) rank to the weight.
-  std::unordered_map<int, float> dst_weights = {};
-  std::unordered_map<int, float> src_weights = {};
+  std::unordered_map<int, double> dst_weights = {};
+  std::unordered_map<int, double> src_weights = {};
 
   // The ops requires the mutex.
   bool require_mutex = false;
diff --git a/bluefog/common/mpi_controller.cc b/bluefog/common/mpi_controller.cc
@@ -257,7 +257,7 @@ int MPIController::SetTopology(int indegree, const int* sources, int outdegree,
 }
 
 int MPIController::SetTopologyWeights(int indegree, const int* sources,
-                                      float self_weight, const float* neighbor_weights) {
+                                      double self_weight, const double* neighbor_weights) {
   // We assume when this function is called, the base topology has already
   // been set. Here the neighbor_weights specifies the weights from the sources.
   if (!mpi_ctx_.IsTopoSetup()) {
@@ -281,8 +281,8 @@ int MPIController::LoadTopology(int* indegree, int*& sources, int* outdegree,
 }
 
 int MPIController::LoadTopologyWeights(
-    float& self_weight,
-    const std::unordered_map<int, float>*& neighbor_weights) {
+    double& self_weight,
+    const std::unordered_map<int, double>*& neighbor_weights) {
   if (!mpi_ctx_.IsWeighted()) {
     return 0;
   }
@@ -538,7 +538,7 @@ void MPIController::WinPut(TensorTableEntry& entry) {
   Status timeline_status = GetBluefogTimeline(timeline_ptr);
   for (auto kv : entry.dst_weights) {
     int target_rank = kv.first;
-    float weight = kv.second;
+    double weight = kv.second;
 
     BFLOG(TRACE, rank_) << "Start MPI_Put for " << entry.tensor_name << " to " << target_rank;
 
@@ -603,7 +603,7 @@ void MPIController::WinAccumulate(TensorTableEntry& entry) {
 
   for (auto kv : entry.dst_weights) {
     int target_rank = kv.first;
-    float weight = kv.second;
+    double weight = kv.second;
     // avoid putting the tensor for itself (NOT valid).
     if (target_rank == rank_) continue;
 
diff --git a/bluefog/common/mpi_controller.h b/bluefog/common/mpi_controller.h
@@ -66,11 +66,11 @@ class MPIController {
   int SetTopology(int indegree, const int* sources, int outdegree,
                   const int* destinations);
   int SetTopologyWeights(int indegree, const int* sources,
-                         float self_weight, const float* neighbor_weights);
+                         double self_weight, const double* neighbor_weights);
   int LoadTopology(int* indegree, int*& sources, int* outdegree,
                    int*& destinations);
-  int LoadTopologyWeights(float& self_weight,
-                          const std::unordered_map<int, float>*& neighbor_weights);
+  int LoadTopologyWeights(double& self_weight,
+                          const std::unordered_map<int, double>*& neighbor_weights);
 
   Status WinCreate(std::shared_ptr<Tensor> tensor,
                    std::vector<std::shared_ptr<Tensor>> neighbor_tensors,
@@ -126,8 +126,8 @@ class MPIController {
   // COMM_WORLD ranks of processes running on this node.
   std::vector<int> local_comm_ranks_;
 
-  float self_weight_;
-  std::unordered_map<int, float> neighbor_weights_;
+  double self_weight_;
+  std::unordered_map<int, double> neighbor_weights_;
 };
 
 class WinMutexGuard {
diff --git a/bluefog/common/operations.cc b/bluefog/common/operations.cc
@@ -283,7 +283,7 @@ int bluefog_set_topology(int indegree, const int* sources, int outdegree,
 
 int bluefog_set_topology_with_weights(int indegree, const int* sources,
                                       int outdegree, const int* destinations,
-                                      float self_weight, const float* neighbor_weights) {
+                                      double self_weight, const double* neighbor_weights) {
   int ret = bluefog_set_topology(indegree, sources, outdegree, destinations);
   if (ret != 1) {
     return ret;
@@ -302,8 +302,8 @@ int bluefog_load_topology(int* indegree, int*& sources, int* outdegree,
 }
 
 int bluefog_load_topology_weights(
-    float& self_weight_,
-    const std::unordered_map<int, float>*& neighbor_weights_) {
+    double& self_weight_,
+    const std::unordered_map<int, double>*& neighbor_weights_) {
   if (!bluefog_global.initialization_done) {
     return -1;
   }
@@ -434,7 +434,7 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr<OpContext> context,
 
 Status EnqueueTensorWindowPut(std::shared_ptr<Tensor> tensor,
                               const std::string& name,
-                              const std::unordered_map<int, float>& dst_weights,
+                              const std::unordered_map<int, double>& dst_weights,
                               const int device, 
                               const bool require_mutex, 
                               StatusCallback callback) {
@@ -456,7 +456,7 @@ Status EnqueueTensorWindowPut(std::shared_ptr<Tensor> tensor,
 
 Status EnqueueTensorWindowAccumulate(
     std::shared_ptr<Tensor> tensor, const std::string& name,
-    const std::unordered_map<int, float>& dst_weights, const int device,
+    const std::unordered_map<int, double>& dst_weights, const int device,
     const bool require_mutex, StatusCallback callback) {
   TensorTableEntry e;
   e.tensor_name = name;
@@ -475,7 +475,7 @@ Status EnqueueTensorWindowAccumulate(
 }
 
 Status EnqueueTensorWindowGet(const std::string& name,
-                              const std::unordered_map<int, float>& src_weights,
+                              const std::unordered_map<int, double>& src_weights,
                               const bool require_mutex,
                               StatusCallback callback) {
   TensorTableEntry e;
diff --git a/bluefog/common/operations.h b/bluefog/common/operations.h
@@ -75,7 +75,7 @@ int bluefog_set_topology(int indegree, const int* sources,
 // Bluefog is not initialized or failed.
 int bluefog_set_topology_with_weights(int indegree, const int* sources,
                                       int outdegree, const int* destinations,
-                                      float self_weight, const float* neighbor_weights);
+                                      double self_weight, const double* neighbor_weights);
 
 // C interface to load the virtual topology for MPI graph communicator.
 // Self-rank is never included no matter self-loop is presented in setup or not.
@@ -85,8 +85,8 @@ int bluefog_load_topology(int* indegree, int*& sources,
 
 // Load the weights for neighbors. 
 // TODO(ybc) Make it as C compatible interface.
-int bluefog_load_topology_weights(float& self_weight, 
-                                  const std::unordered_map<int, float>*& neighbor_weights);
+int bluefog_load_topology_weights(double& self_weight, 
+                                  const std::unordered_map<int, double>*& neighbor_weights);
 
 
 // C interface to allow python to call timeline.
@@ -124,17 +124,17 @@ Status EnqueueTensorNeighborAllreduce(std::shared_ptr<OpContext> context,
 
 Status EnqueueTensorWindowPut(std::shared_ptr<Tensor> tensor,
                               const std::string& name,
-                              const std::unordered_map<int, float>& dst_ranks,
+                              const std::unordered_map<int, double>& dst_ranks,
                               const int device, const bool require_mutex,
                               StatusCallback callback);
 
 Status EnqueueTensorWindowAccumulate(
     std::shared_ptr<Tensor> tensor, const std::string& name,
-    const std::unordered_map<int, float>& dst_ranks, const int device,
+    const std::unordered_map<int, double>& dst_ranks, const int device,
     const bool require_mutex, StatusCallback callback);
 
 Status EnqueueTensorWindowGet(const std::string& name,
-                              const std::unordered_map<int, float>& src_ranks,
+                              const std::unordered_map<int, double>& src_ranks,
                               const bool require_mutex,
                               StatusCallback callback);
 
diff --git a/bluefog/common/topology_util.py b/bluefog/common/topology_util.py
@@ -53,7 +53,7 @@ def PowerTwoRingGraph(size: int) -> nx.DiGraph:
     """Generate graph topology such that each points only
     connected to a point such that the index difference is power of 2.
 
-    Example: A PowerTwoRingGraph with 16 nodes:
+    Example: A PowerTwoRingGraph with 12 nodes:
 
     .. plot::
         :context: close-figs
diff --git a/bluefog/torch/mpi_win_ops.cc b/bluefog/torch/mpi_win_ops.cc
@@ -350,7 +350,7 @@ int DoWinFree(const std::string& name) {
 }
 
 int DoWinPut(::torch::Tensor tensor, const std::string& name,
-             const std::unordered_map<int, float>& dst_weights,
+             const std::unordered_map<int, double>& dst_weights,
              const bool require_mutex) {
   ThrowIfError(common::CheckInitialized());
 
@@ -386,7 +386,7 @@ int DoWinPut(::torch::Tensor tensor, const std::string& name,
 }
 
 int DoWinAccumulate(::torch::Tensor tensor, const std::string& name,
-                    const std::unordered_map<int, float>& dst_weights,
+                    const std::unordered_map<int, double>& dst_weights,
                     const bool require_mutex) {
   ThrowIfError(common::CheckInitialized());
 
@@ -421,7 +421,7 @@ int DoWinAccumulate(::torch::Tensor tensor, const std::string& name,
 }
 
 int DoWinGet(const std::string& name,
-             const std::unordered_map<int, float>& src_weights,
+             const std::unordered_map<int, double>& src_weights,
              const bool require_mutex) {
   ThrowIfError(common::CheckInitialized());
 
diff --git a/bluefog/torch/mpi_win_ops.h b/bluefog/torch/mpi_win_ops.h
@@ -144,7 +144,7 @@ WIN_SYNC_H(torch_cuda_DoubleTensor, THCudaDoubleTensor)
 #define WIN_PUT_H(torch_Tensor, THTensor)                             \
   extern "C" int bluefog_torch_win_put_##torch_Tensor(                \
       THTensor* tensor, char* name,                                   \
-      const std::unordered_map<int, float>& dst_weights,              \
+      const std::unordered_map<int, double>& dst_weights,             \
       const bool require_mutex);
 
 WIN_PUT_H(torch_IntTensor, THIntTensor)
@@ -162,7 +162,7 @@ WIN_PUT_H(torch_cuda_DoubleTensor, THCudaDoubleTensor)
 #define WIN_ACCUMULATE_H(torch_Tensor, THTensor)                         \
   extern "C" int bluefog_torch_win_accumulate_##torch_Tensor(            \
       THTensor* tensor, char* name,                                      \
-      const std::unordered_map<int, float>& dst_weights,                 \
+      const std::unordered_map<int, double>& dst_weights,                \
       const bool require_mutex);
 
 WIN_ACCUMULATE_H(torch_IntTensor, THIntTensor)
@@ -178,7 +178,7 @@ WIN_ACCUMULATE_H(torch_cuda_DoubleTensor, THCudaDoubleTensor)
 #endif
 
 extern "C" int bluefog_torch_win_GET(
-    char* name, const std::unordered_map<int, float>& src_weights,
+    char* name, const std::unordered_map<int, double>& src_weights,
     const bool require_mutex);
 
 extern "C" int bluefog_torch_win_free(char* name);
diff --git a/examples/pytorch_logistic_regression.py b/examples/pytorch_logistic_regression.py
@@ -277,13 +277,10 @@ def logistic_loss_step(x_, tensor_name):
     alpha_pd = 1e-1  # step-size for Push-DIGing
     mse_pd = []
     for i in range(maxite):
-        if i % 10 == 0:
-            bf.barrier()
-
         w[:n] = w[:n] - alpha_pd*w[n:2*n]
         bf.win_accumulate(
             w, name="w_buff",
-            dst_weights={rank: 0.5 / (outdegree)
+            dst_weights={rank: 1.0 / (outdegree*2)
                         for rank in bf.out_neighbor_ranks()},
             require_mutex=True)
         w.div_(2)
@@ -296,6 +293,8 @@ def logistic_loss_step(x_, tensor_name):
 
         w[n:2*n] += grad - grad_prev
         grad_prev = grad
+        if i % 10 == 0:
+            bf.barrier()
         if bf.rank() == 0:
             mse_pd.append(torch.norm(x.data - w_opt, p=2))
 

Original file line number	Diff line number	Diff line change
`@@ -350,7 +350,7 @@ int DoWinFree(const std::string& name) {`
`350`	`350`	`}`
`351`	`351`
`352`	`352`	`int DoWinPut(::torch::Tensor tensor, const std::string& name,`
`353`		`- const std::unordered_map<int, float>& dst_weights,`
	`353`	`+ const std::unordered_map<int, double>& dst_weights,`
`354`	`354`	`const bool require_mutex) {`
`355`	`355`	`ThrowIfError(common::CheckInitialized());`
`356`	`356`
`@@ -386,7 +386,7 @@ int DoWinPut(::torch::Tensor tensor, const std::string& name,`
`386`	`386`	`}`
`387`	`387`
`388`	`388`	`int DoWinAccumulate(::torch::Tensor tensor, const std::string& name,`
`389`		`- const std::unordered_map<int, float>& dst_weights,`
	`389`	`+ const std::unordered_map<int, double>& dst_weights,`
`390`	`390`	`const bool require_mutex) {`
`391`	`391`	`ThrowIfError(common::CheckInitialized());`
`392`	`392`
`@@ -421,7 +421,7 @@ int DoWinAccumulate(::torch::Tensor tensor, const std::string& name,`
`421`	`421`	`}`
`422`	`422`
`423`	`423`	`int DoWinGet(const std::string& name,`
`424`		`- const std::unordered_map<int, float>& src_weights,`
	`424`	`+ const std::unordered_map<int, double>& src_weights,`
`425`	`425`	`const bool require_mutex) {`
`426`	`426`	`ThrowIfError(common::CheckInitialized());`
`427`	`427`