Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit b6bc5af

Browse files
committed Jan 21, 2022
move selfmon (projecteru2#540)
1 parent a00c69b commit b6bc5af

File tree

12 files changed

+83
-84
lines changed

12 files changed

+83
-84
lines changed
 

‎cluster/calcium/calcium.go

+3-20
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ import (
1515
"github.com/projecteru2/core/source/github"
1616
"github.com/projecteru2/core/source/gitlab"
1717
"github.com/projecteru2/core/store"
18-
"github.com/projecteru2/core/store/etcdv3"
19-
"github.com/projecteru2/core/store/redis"
2018
"github.com/projecteru2/core/types"
2119

2220
"github.com/pkg/errors"
@@ -31,27 +29,16 @@ type Calcium struct {
3129
watcher discovery.Service
3230
wal *WAL
3331
identifier string
34-
selfmon *NodeStatusWatcher
3532
}
3633

3734
// New returns a new cluster config
3835
func New(config types.Config, t *testing.T) (*Calcium, error) {
3936
logger := log.WithField("Calcium", "New").WithField("config", config)
4037

4138
// set store
42-
var store store.Store
43-
var err error
44-
switch config.Store {
45-
case types.Redis:
46-
store, err = redis.New(config, t)
47-
if err != nil {
48-
return nil, logger.Err(context.TODO(), errors.WithStack(err))
49-
}
50-
default:
51-
store, err = etcdv3.New(config, t)
52-
if err != nil {
53-
return nil, logger.Err(context.TODO(), errors.WithStack(err))
54-
}
39+
store, err := store.NewStore(config, t)
40+
if err != nil {
41+
return nil, logger.Err(context.TODO(), errors.WithStack(err))
5542
}
5643

5744
// set scheduler
@@ -83,10 +70,6 @@ func New(config types.Config, t *testing.T) (*Calcium, error) {
8370
cal := &Calcium{store: store, config: config, scheduler: potassium, source: scm, watcher: watcher}
8471
cal.wal, err = newCalciumWAL(cal)
8572
cal.identifier = config.Identifier()
86-
cal.selfmon = NewNodeStatusWatcher(cal)
87-
88-
// start node status watcher
89-
go cal.selfmon.run()
9073

9174
return cal, logger.Err(nil, errors.WithStack(err)) //nolint
9275
}

‎cluster/calcium/dissociate.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"context"
55

66
"github.com/projecteru2/core/log"
7-
"github.com/projecteru2/core/store"
87
"github.com/projecteru2/core/types"
98
"github.com/projecteru2/core/utils"
109

@@ -34,7 +33,7 @@ func (c *Calcium) DissociateWorkload(ctx context.Context, ids []string) (chan *t
3433
ctx,
3534
// if
3635
func(ctx context.Context) (err error) {
37-
if err = c.store.UpdateNodeResource(ctx, node, &workload.ResourceMeta, store.ActionIncr); err == nil {
36+
if err = c.store.UpdateNodeResource(ctx, node, &workload.ResourceMeta, types.ActionIncr); err == nil {
3837
log.Infof(ctx, "[DissociateWorkload] Workload %s dissociated", workload.ID)
3938
}
4039
return errors.WithStack(err)
@@ -48,7 +47,7 @@ func (c *Calcium) DissociateWorkload(ctx context.Context, ids []string) (chan *t
4847
if failedByCond {
4948
return nil
5049
}
51-
return errors.WithStack(c.store.UpdateNodeResource(ctx, node, &workload.ResourceMeta, store.ActionDecr))
50+
return errors.WithStack(c.store.UpdateNodeResource(ctx, node, &workload.ResourceMeta, types.ActionDecr))
5251
},
5352
c.config.GlobalTimeout,
5453
)

‎cluster/calcium/remove.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"sync"
77

88
"github.com/projecteru2/core/log"
9-
"github.com/projecteru2/core/store"
109
"github.com/projecteru2/core/types"
1110
"github.com/projecteru2/core/utils"
1211

@@ -42,7 +41,7 @@ func (c *Calcium) RemoveWorkload(ctx context.Context, ids []string, force bool,
4241
ctx,
4342
// if
4443
func(ctx context.Context) error {
45-
return errors.WithStack(c.store.UpdateNodeResource(ctx, node, &workload.ResourceMeta, store.ActionIncr))
44+
return errors.WithStack(c.store.UpdateNodeResource(ctx, node, &workload.ResourceMeta, types.ActionIncr))
4645
},
4746
// then
4847
func(ctx context.Context) (err error) {
@@ -56,7 +55,7 @@ func (c *Calcium) RemoveWorkload(ctx context.Context, ids []string, force bool,
5655
if failedByCond {
5756
return nil
5857
}
59-
return errors.WithStack(c.store.UpdateNodeResource(ctx, node, &workload.ResourceMeta, store.ActionDecr))
58+
return errors.WithStack(c.store.UpdateNodeResource(ctx, node, &workload.ResourceMeta, types.ActionDecr))
6059
},
6160
c.config.GlobalTimeout,
6261
)

‎core.go

+7
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/projecteru2/core/metrics"
1919
"github.com/projecteru2/core/rpc"
2020
pb "github.com/projecteru2/core/rpc/gen"
21+
"github.com/projecteru2/core/selfmon"
2122
"github.com/projecteru2/core/utils"
2223
"github.com/projecteru2/core/version"
2324

@@ -135,6 +136,12 @@ func serve(c *cli.Context) error {
135136
ctx, cancel := signal.NotifyContext(c.Context, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
136137
defer cancel()
137138

139+
// start node status checker
140+
go selfmon.RunNodeStatusWatcher(ctx, config, cluster, t)
141+
142+
// start engine cache checker
143+
go factory.EngineCacheChecker(ctx, config.ConnectionTimeout)
144+
138145
<-ctx.Done()
139146

140147
log.Info("[main] Interrupt by signal")

‎engine/factory/factory.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ var (
3333
)
3434

3535
func getEngineCacheKey(endpoint, ca, cert, key string) string {
36-
return endpoint + "-" + utils.SHA256(fmt.Sprintf(":%v:%v:%v", ca, cert, key))[:8]
36+
return fmt.Sprintf("%v-%v", endpoint, utils.SHA256(fmt.Sprintf(":%v:%v:%v", ca, cert, key))[:8])
3737
}
3838

3939
type engineParams struct {

‎cluster/calcium/selfmon.go ‎selfmon/selfmon.go

+38-38
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
1-
package calcium
1+
package selfmon
22

33
import (
44
"context"
55
"math/rand"
6-
"os/signal"
7-
"syscall"
6+
"testing"
87
"time"
98

109
"github.com/pkg/errors"
1110

11+
"github.com/projecteru2/core/cluster"
1212
"github.com/projecteru2/core/log"
13-
coretypes "github.com/projecteru2/core/types"
13+
"github.com/projecteru2/core/store"
14+
"github.com/projecteru2/core/types"
1415
"github.com/projecteru2/core/utils"
1516
)
1617

@@ -19,22 +20,32 @@ const ActiveKey = "/selfmon/active"
1920

2021
// NodeStatusWatcher monitors the changes of node status
2122
type NodeStatusWatcher struct {
22-
id int64
23-
cal *Calcium
23+
id int64
24+
config types.Config
25+
cluster cluster.Cluster
26+
store store.Store
2427
}
2528

26-
// NewNodeStatusWatcher .
27-
func NewNodeStatusWatcher(cal *Calcium) *NodeStatusWatcher {
29+
// RunNodeStatusWatcher .
30+
func RunNodeStatusWatcher(ctx context.Context, config types.Config, cluster cluster.Cluster, t *testing.T) {
2831
rand.Seed(time.Now().UnixNano())
2932
id := rand.Int63n(10000) // nolint
30-
return &NodeStatusWatcher{
31-
id: id,
32-
cal: cal,
33+
store, err := store.NewStore(config, t)
34+
if err != nil {
35+
log.Errorf(context.TODO(), "[RunNodeStatusWatcher] %v failed to create store, err: %v", id, err)
36+
return
37+
}
38+
39+
watcher := &NodeStatusWatcher{
40+
id: id,
41+
config: config,
42+
store: store,
43+
cluster: cluster,
3344
}
45+
watcher.run(ctx)
3446
}
3547

36-
func (n *NodeStatusWatcher) run() {
37-
ctx := n.getSignalContext(context.TODO())
48+
func (n *NodeStatusWatcher) run(ctx context.Context) {
3849
for {
3950
select {
4051
case <-ctx.Done():
@@ -45,22 +56,11 @@ func (n *NodeStatusWatcher) run() {
4556
log.Errorf(ctx, "[NodeStatusWatcher] %v stops watching, err: %v", n.id, err)
4657
}
4758
})
48-
time.Sleep(n.cal.config.ConnectionTimeout)
59+
time.Sleep(n.config.ConnectionTimeout)
4960
}
5061
}
5162
}
5263

53-
func (n *NodeStatusWatcher) getSignalContext(ctx context.Context) context.Context {
54-
exitCtx, cancel := signal.NotifyContext(ctx, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
55-
go func() {
56-
defer cancel()
57-
<-exitCtx.Done()
58-
log.Warnf(ctx, "[NodeStatusWatcher] watcher %v receives a signal to exit", n.id)
59-
}()
60-
61-
return exitCtx
62-
}
63-
6464
// withActiveLock acquires the active lock synchronously
6565
func (n *NodeStatusWatcher) withActiveLock(parentCtx context.Context, f func(ctx context.Context)) {
6666
ctx, cancel := context.WithCancel(parentCtx)
@@ -90,7 +90,7 @@ func (n *NodeStatusWatcher) withActiveLock(parentCtx context.Context, f func(ctx
9090
if errors.Is(err, context.Canceled) {
9191
log.Info("[Register] context canceled")
9292
return
93-
} else if !errors.Is(err, coretypes.ErrKeyExists) {
93+
} else if !errors.Is(err, types.ErrKeyExists) {
9494
log.Errorf(ctx, "[Register] failed to re-register: %v", err)
9595
time.Sleep(time.Second)
9696
continue
@@ -126,20 +126,20 @@ func (n *NodeStatusWatcher) withActiveLock(parentCtx context.Context, f func(ctx
126126
}
127127

128128
func (n *NodeStatusWatcher) register(ctx context.Context) (<-chan struct{}, func(), error) {
129-
return n.cal.store.StartEphemeral(ctx, ActiveKey, n.cal.config.HAKeepaliveInterval)
129+
return n.store.StartEphemeral(ctx, ActiveKey, n.config.HAKeepaliveInterval)
130130
}
131131

132132
func (n *NodeStatusWatcher) initNodeStatus(ctx context.Context) {
133133
log.Debug(ctx, "[NodeStatusWatcher] init node status started")
134-
nodes := make(chan *coretypes.Node)
134+
nodes := make(chan *types.Node)
135135

136136
go func() {
137137
defer close(nodes)
138138
// Get all nodes which are active status, and regardless of pod.
139139
var err error
140-
var ch <-chan *coretypes.Node
141-
utils.WithTimeout(ctx, n.cal.config.GlobalTimeout, func(ctx context.Context) {
142-
ch, err = n.cal.ListPodNodes(ctx, &coretypes.ListNodesOptions{
140+
var ch <-chan *types.Node
141+
utils.WithTimeout(ctx, n.config.GlobalTimeout, func(ctx context.Context) {
142+
ch, err = n.cluster.ListPodNodes(ctx, &types.ListNodesOptions{
143143
Podname: "",
144144
Labels: nil,
145145
All: true,
@@ -161,9 +161,9 @@ func (n *NodeStatusWatcher) initNodeStatus(ctx context.Context) {
161161
}()
162162

163163
for node := range nodes {
164-
status, err := n.cal.GetNodeStatus(ctx, node.Name)
164+
status, err := n.cluster.GetNodeStatus(ctx, node.Name)
165165
if err != nil {
166-
status = &coretypes.NodeStatus{
166+
status = &types.NodeStatus{
167167
Nodename: node.Name,
168168
Podname: node.Podname,
169169
Alive: false,
@@ -178,15 +178,15 @@ func (n *NodeStatusWatcher) monitor(ctx context.Context) error {
178178
go n.initNodeStatus(ctx)
179179

180180
// monitor node status
181-
messageChan := n.cal.NodeStatusStream(ctx)
181+
messageChan := n.cluster.NodeStatusStream(ctx)
182182
log.Infof(ctx, "[NodeStatusWatcher] %v watch node status started", n.id)
183183
defer log.Infof(ctx, "[NodeStatusWatcher] %v stop watching node status", n.id)
184184

185185
for {
186186
select {
187187
case message, ok := <-messageChan:
188188
if !ok {
189-
return coretypes.ErrMessageChanClosed
189+
return types.ErrMessageChanClosed
190190
}
191191
go n.dealNodeStatusMessage(ctx, message)
192192
case <-ctx.Done():
@@ -195,18 +195,18 @@ func (n *NodeStatusWatcher) monitor(ctx context.Context) error {
195195
}
196196
}
197197

198-
func (n *NodeStatusWatcher) dealNodeStatusMessage(ctx context.Context, message *coretypes.NodeStatus) {
198+
func (n *NodeStatusWatcher) dealNodeStatusMessage(ctx context.Context, message *types.NodeStatus) {
199199
if message.Error != nil {
200200
log.Errorf(ctx, "[NodeStatusWatcher] deal with node status stream message failed %+v", message)
201201
return
202202
}
203203

204204
// TODO maybe we need a distributed lock to control concurrency
205-
opts := &coretypes.SetNodeOptions{
205+
opts := &types.SetNodeOptions{
206206
Nodename: message.Nodename,
207207
WorkloadsDown: !message.Alive,
208208
}
209-
if _, err := n.cal.SetNode(ctx, opts); err != nil {
209+
if _, err := n.cluster.SetNode(ctx, opts); err != nil {
210210
log.Errorf(ctx, "[NodeStatusWatcher] set node %s failed %v", message.Nodename, err)
211211
return
212212
}

‎store/etcdv3/node.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717
"github.com/projecteru2/core/engine/fake"
1818
"github.com/projecteru2/core/log"
1919
"github.com/projecteru2/core/metrics"
20-
"github.com/projecteru2/core/store"
2120
"github.com/projecteru2/core/types"
2221
"github.com/projecteru2/core/utils"
2322
)
@@ -178,9 +177,9 @@ func (m *Mercury) UpdateNodes(ctx context.Context, nodes ...*types.Node) error {
178177
// UpdateNodeResource update cpu and memory on a node, either add or subtract
179178
func (m *Mercury) UpdateNodeResource(ctx context.Context, node *types.Node, resource *types.ResourceMeta, action string) error {
180179
switch action {
181-
case store.ActionIncr:
180+
case types.ActionIncr:
182181
node.RecycleResources(resource)
183-
case store.ActionDecr:
182+
case types.ActionDecr:
184183
node.PreserveResources(resource)
185184
default:
186185
return types.ErrUnknownControlType

‎store/etcdv3/node_test.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import (
77
"testing"
88
"time"
99

10-
"github.com/projecteru2/core/store"
1110
"github.com/projecteru2/core/types"
1211

1312
"github.com/stretchr/testify/assert"
@@ -223,8 +222,8 @@ func TestUpdateNodeResource(t *testing.T) {
223222
assert.NoError(t, err)
224223
assert.Equal(t, node.Name, "test")
225224
assert.Error(t, m.UpdateNodeResource(ctx, node, nil, "wtf"))
226-
assert.NoError(t, m.UpdateNodeResource(ctx, node, &types.ResourceMeta{CPU: map[string]int64{"0": 100}}, store.ActionIncr))
227-
assert.NoError(t, m.UpdateNodeResource(ctx, node, &types.ResourceMeta{CPU: map[string]int64{"0": 100}}, store.ActionDecr))
225+
assert.NoError(t, m.UpdateNodeResource(ctx, node, &types.ResourceMeta{CPU: map[string]int64{"0": 100}}, types.ActionIncr))
226+
assert.NoError(t, m.UpdateNodeResource(ctx, node, &types.ResourceMeta{CPU: map[string]int64{"0": 100}}, types.ActionDecr))
228227
}
229228

230229
func TestExtractNodename(t *testing.T) {

‎store/redis/node.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ import (
1414
"github.com/projecteru2/core/engine/fake"
1515
"github.com/projecteru2/core/log"
1616
"github.com/projecteru2/core/metrics"
17-
"github.com/projecteru2/core/store"
1817
"github.com/projecteru2/core/types"
1918
"github.com/projecteru2/core/utils"
2019

@@ -169,9 +168,9 @@ func (r *Rediaron) UpdateNodes(ctx context.Context, nodes ...*types.Node) error
169168
// UpdateNodeResource update cpu and memory on a node, either add or subtract
170169
func (r *Rediaron) UpdateNodeResource(ctx context.Context, node *types.Node, resource *types.ResourceMeta, action string) error {
171170
switch action {
172-
case store.ActionIncr:
171+
case types.ActionIncr:
173172
node.RecycleResources(resource)
174-
case store.ActionDecr:
173+
case types.ActionDecr:
175174
node.PreserveResources(resource)
176175
default:
177176
return types.ErrUnknownControlType

‎store/redis/node_test.go

+2-3
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"path/filepath"
77
"time"
88

9-
"github.com/projecteru2/core/store"
109
"github.com/projecteru2/core/types"
1110
)
1211

@@ -213,8 +212,8 @@ func (s *RediaronTestSuite) TestUpdateNodeResource() {
213212
s.NoError(err)
214213
s.Equal(node.Name, "test")
215214
s.Error(s.rediaron.UpdateNodeResource(ctx, node, nil, "wtf"))
216-
s.NoError(s.rediaron.UpdateNodeResource(ctx, node, &types.ResourceMeta{CPU: map[string]int64{"0": 100}}, store.ActionIncr))
217-
s.NoError(s.rediaron.UpdateNodeResource(ctx, node, &types.ResourceMeta{CPU: map[string]int64{"0": 100}}, store.ActionDecr))
215+
s.NoError(s.rediaron.UpdateNodeResource(ctx, node, &types.ResourceMeta{CPU: map[string]int64{"0": 100}}, types.ActionIncr))
216+
s.NoError(s.rediaron.UpdateNodeResource(ctx, node, &types.ResourceMeta{CPU: map[string]int64{"0": 100}}, types.ActionDecr))
218217
}
219218

220219
func (s *RediaronTestSuite) TestExtractNodename() {

‎store/store.go

+14-7
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,16 @@ package store
22

33
import (
44
"context"
5+
"testing"
56
"time"
67

78
"github.com/projecteru2/core/lock"
9+
"github.com/projecteru2/core/store/etcdv3"
10+
"github.com/projecteru2/core/store/redis"
811
"github.com/projecteru2/core/strategy"
912
"github.com/projecteru2/core/types"
1013
)
1114

12-
const (
13-
// ActionIncr for incr resource
14-
ActionIncr = "+"
15-
// ActionDecr for decr resource
16-
ActionDecr = "-"
17-
)
18-
1915
// Store store eru data
2016
type Store interface {
2117
// service
@@ -65,3 +61,14 @@ type Store interface {
6561
// distributed lock
6662
CreateLock(key string, ttl time.Duration) (lock.DistributedLock, error)
6763
}
64+
65+
// NewStore creates a store
66+
func NewStore(config types.Config, t *testing.T) (store Store, err error) {
67+
switch config.Store {
68+
case types.Redis:
69+
store, err = redis.New(config, t)
70+
default:
71+
store, err = etcdv3.New(config, t)
72+
}
73+
return store, err
74+
}

‎types/action.go

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
package types
2+
3+
const (
4+
// ActionIncr for incr resource
5+
ActionIncr = "+"
6+
// ActionDecr for decr resource
7+
ActionDecr = "-"
8+
)

0 commit comments

Comments (0)
Please sign in to comment.