@@ -18,7 +18,6 @@ package job
18
18
19
19
import (
20
20
"fmt"
21
- "os"
22
21
"sync"
23
22
"time"
24
23
@@ -28,7 +27,6 @@ import (
28
27
"github.com/PaddlePaddle/PaddleFlow/pkg/common/config"
29
28
"github.com/PaddlePaddle/PaddleFlow/pkg/common/schema"
30
29
"github.com/PaddlePaddle/PaddleFlow/pkg/job/api"
31
- "github.com/PaddlePaddle/PaddleFlow/pkg/job/runtime"
32
30
"github.com/PaddlePaddle/PaddleFlow/pkg/job/runtime_v2"
33
31
"github.com/PaddlePaddle/PaddleFlow/pkg/metrics"
34
32
"github.com/PaddlePaddle/PaddleFlow/pkg/model"
@@ -40,8 +38,6 @@ const (
40
38
defaultCacheSize = 500
41
39
defaultExpireTime = 30
42
40
defaultJobLoop = 1
43
- // EnvRuntimeVersion contains the version of PaddleFlow runtime
44
- EnvRuntimeVersion = "PF_RUNTIME_VERSION"
45
41
)
46
42
47
43
type ActiveClustersFunc func () []model.ClusterInfo
@@ -65,8 +61,6 @@ type JobManagerImpl struct {
65
61
// clusterRuntimes contains cluster status and runtime services
66
62
clusterRuntimes ClusterRuntimes
67
63
clusterSyncPeriod time.Duration
68
-
69
- isRuntimeV2 bool
70
64
}
71
65
72
66
func NewJobManagerImpl () (* JobManagerImpl , error ) {
@@ -109,50 +103,7 @@ func (m *JobManagerImpl) Start(activeClusters ActiveClustersFunc, activeQueueJob
109
103
/// init config for job manager
110
104
m .init ()
111
105
// start job manager
112
- rVersion := os .Getenv (EnvRuntimeVersion )
113
- if rVersion == "v1" {
114
- m .startRuntime ()
115
- } else {
116
- m .isRuntimeV2 = true
117
- m .startRuntimeV2 ()
118
- }
119
- }
120
-
121
- func (m * JobManagerImpl ) startRuntime () {
122
- log .Infof ("Start job manager on runtime!" )
123
- // submit job to cluster
124
- go m .pJobProcessLoop ()
125
-
126
- for {
127
- // get active clusters
128
- clusters := m .activeClusters ()
129
-
130
- for _ , cluster := range clusters {
131
- clusterID := api .ClusterID (cluster .ID )
132
- // skip when cluster status is offline
133
- if cluster .Status == model .ClusterStatusOffLine {
134
- log .Warnf ("cluster[%s] status is %s, skip it" , cluster .ID , model .ClusterStatusOffLine )
135
- m .stopClusterRuntime (clusterID )
136
- continue
137
- }
138
-
139
- _ , find := m .clusterRuntimes .Get (clusterID )
140
- if ! find {
141
- runtimeSvc , err := runtime .GetOrCreateRuntime (cluster )
142
- if err != nil {
143
- log .Errorf ("new runtime for cluster[%s] failed, err: %v. skip it" , cluster .ID , err )
144
- continue
145
- }
146
- log .Infof ("Create new runtime with cluster <%s>" , cluster .ID )
147
-
148
- cr := NewClusterRuntimeInfo (cluster .Name , runtimeSvc )
149
- m .clusterRuntimes .Store (clusterID , cr )
150
- // start runtime for new cluster
151
- go m .Run (runtimeSvc , cr .StopCh , clusterID )
152
- }
153
- }
154
- time .Sleep (m .clusterSyncPeriod )
155
- }
106
+ m .startRuntime ()
156
107
}
157
108
158
109
func (m * JobManagerImpl ) stopClusterRuntime (clusterID api.ClusterID ) {
@@ -163,21 +114,10 @@ func (m *JobManagerImpl) stopClusterRuntime(clusterID api.ClusterID) {
163
114
close (cr .StopCh )
164
115
}
165
116
m .clusterRuntimes .Delete (clusterID )
166
- runtime .PFRuntimeMap .Delete (clusterID )
167
117
runtime_v2 .PFRuntimeMap .Delete (clusterID )
168
118
m .stopClusterQueueSubmit (clusterID )
169
119
}
170
120
171
- func (m * JobManagerImpl ) Run (runtimeService runtime.RuntimeService , stopCh <- chan struct {}, clusterID api.ClusterID ) {
172
- log .Infof ("Start %s!" , runtimeService .Name ())
173
- // start queue sync
174
- go runtimeService .SyncQueue (stopCh )
175
- // start job sync
176
- go runtimeService .SyncJob (stopCh )
177
- // start job gc
178
- go runtimeService .GCJob (stopCh )
179
- }
180
-
181
121
func (m * JobManagerImpl ) pJobProcessLoop () {
182
122
log .Infof ("start job process loop ..." )
183
123
for {
@@ -245,27 +185,19 @@ func (m *JobManagerImpl) pSubmitQueueJob(jobQueue *api.JobQueue, clusterRuntime
245
185
log .Infof ("Leaving submit %s job in queue %s, total elapsed time: %s" , job .ID , name , time .Since (startTime ))
246
186
} else {
247
187
// TODO: add to config
248
- // time.Sleep(m.jobLoopPeriod)
249
188
time .Sleep (200 * time .Millisecond )
250
189
}
251
190
}
252
191
}
253
192
}
254
193
255
- func (m * JobManagerImpl ) submitJob (clusterRuntime * ClusterRuntimeInfo , job * api.PFJob ) {
256
- if clusterRuntime == nil || job == nil {
194
+ // submitJob submit a job to cluster
195
+ func (m * JobManagerImpl ) submitJob (clusterRuntime * ClusterRuntimeInfo , jobInfo * api.PFJob ) {
196
+ if clusterRuntime == nil || jobInfo == nil {
257
197
log .Errorf ("submit job to cluster failed, err: clusterRuntime or job is nil" )
258
198
return
259
199
}
260
- if m .isRuntimeV2 {
261
- m .submitJobV1 (clusterRuntime .RuntimeV2Svc .SubmitJob , job )
262
- } else {
263
- m .submitJobV1 (clusterRuntime .RuntimeSvc .SubmitJob , job )
264
- }
265
- }
266
200
267
- // submitJob submit a job to cluster
268
- func (m * JobManagerImpl ) submitJobV1 (jobSubmit func (* api.PFJob ) error , jobInfo * api.PFJob ) {
269
201
log .Infof ("begin to submit job %s to cluster" , jobInfo .ID )
270
202
startTime := time .Now ()
271
203
job , err := storage .Job .GetJobByID (jobInfo .ID )
@@ -277,7 +209,7 @@ func (m *JobManagerImpl) submitJobV1(jobSubmit func(*api.PFJob) error, jobInfo *
277
209
if job .Status == schema .StatusJobInit {
278
210
var jobStatus schema.JobStatus
279
211
var msg string
280
- err = jobSubmit (jobInfo )
212
+ err = clusterRuntime . RuntimeSvc . SubmitJob (jobInfo )
281
213
if err != nil {
282
214
// new job failed, update db and skip this job
283
215
msg = fmt .Sprintf ("submit job to cluster failed, err: %s" , err )
@@ -320,8 +252,8 @@ func (m *JobManagerImpl) stopQueueSubmit(queueID api.QueueID) {
320
252
}
321
253
322
254
// PaddleFlow runtime v2
323
- // startRuntimeV2 start job manager on runtime v2
324
- func (m * JobManagerImpl ) startRuntimeV2 () {
255
+ // startRuntime start job manager on runtime v2
256
+ func (m * JobManagerImpl ) startRuntime () {
325
257
log .Infof ("Start job manager on runtime v2!" )
326
258
// submit job to cluster
327
259
go m .pJobProcessLoop ()
@@ -348,7 +280,7 @@ func (m *JobManagerImpl) startRuntimeV2() {
348
280
}
349
281
log .Infof ("Create new runtime with cluster <%s>" , cluster .ID )
350
282
351
- cr := NewClusterRuntimeV2Info (cluster .Name , runtimeSvc )
283
+ cr := NewClusterRuntimeInfo (cluster .Name , runtimeSvc )
352
284
m .clusterRuntimes .Store (clusterID , cr )
353
285
// start runtime for new cluster
354
286
go runtimeSvc .SyncController (cr .StopCh )
@@ -407,28 +339,19 @@ func (m *JobManagerImpl) GetQueue(queueID api.QueueID) (*clusterQueue, bool) {
407
339
408
340
// ClusterRuntimeInfo defines cluster runtime
409
341
type ClusterRuntimeInfo struct {
410
- Name string
411
- StopCh chan struct {}
412
- RuntimeSvc runtime.RuntimeService
413
- RuntimeV2Svc runtime_v2.RuntimeService
342
+ Name string
343
+ StopCh chan struct {}
344
+ RuntimeSvc runtime_v2.RuntimeService
414
345
}
415
346
416
- func NewClusterRuntimeInfo (name string , r runtime .RuntimeService ) * ClusterRuntimeInfo {
347
+ func NewClusterRuntimeInfo (name string , r runtime_v2 .RuntimeService ) * ClusterRuntimeInfo {
417
348
return & ClusterRuntimeInfo {
418
349
Name : name ,
419
350
StopCh : make (chan struct {}),
420
351
RuntimeSvc : r ,
421
352
}
422
353
}
423
354
424
- func NewClusterRuntimeV2Info (name string , r runtime_v2.RuntimeService ) * ClusterRuntimeInfo {
425
- return & ClusterRuntimeInfo {
426
- Name : name ,
427
- StopCh : make (chan struct {}),
428
- RuntimeV2Svc : r ,
429
- }
430
- }
431
-
432
355
// ClusterRuntimes contains cluster runtimes
433
356
type ClusterRuntimes struct {
434
357
sync.RWMutex
0 commit comments