Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPVR-122] Node should be up instead of down after restarting eru-agent for node #497

Merged
merged 2 commits into from
Oct 27, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cluster/calcium/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ func (c *Calcium) SetNode(ctx context.Context, opts *types.SetNodeOptions) (*typ
n.Bypass = (opts.BypassOpt == types.TriTrue) || (opts.BypassOpt == types.TriKeep && n.Bypass)
if n.IsDown() {
logger.Errorf(ctx, "[SetNodeAvailable] node marked down: %s", opts.Nodename)
// remove node status
err := c.store.SetNodeStatus(ctx, node, -1)
if err != nil {
// don't return here
log.Errorf(ctx, "[SetNode] failed to set node status, err: %v", err)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggest logging this with "%+v" and errors.WithStack(err) so the logged error carries its stack trace.

}
}
if opts.WorkloadsDown {
workloads, err := c.store.ListNodeWorkloads(ctx, opts.Nodename, nil)
Expand Down
1 change: 1 addition & 0 deletions cluster/calcium/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ func TestSetNode(t *testing.T) {
store.On("CreateLock", mock.Anything, mock.Anything).Return(lock, nil)
lock.On("Lock", mock.Anything).Return(context.TODO(), nil)
lock.On("Unlock", mock.Anything).Return(nil)
store.On("SetNodeStatus", mock.Anything, mock.Anything, mock.Anything).Return(nil)

// fail by validating
_, err := c.SetNode(ctx, &types.SetNodeOptions{Nodename: ""})
Expand Down
16 changes: 11 additions & 5 deletions store/etcdv3/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,13 +290,22 @@ func (m *Mercury) doGetNodes(ctx context.Context, kvs []*mvccpb.KeyValue, labels
}

// SetNodeStatus sets status for a node, value will expire after ttl seconds
// ttl should be larger than 0
// ttl < 0 deletes the node status; ttl == 0 is rejected with types.ErrNodeStatusTTL
// this is heartbeat of node
func (m *Mercury) SetNodeStatus(ctx context.Context, node *types.Node, ttl int64) error {
if ttl <= 0 {
if ttl == 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

还是记录一下, 虽然不知道为啥 ttl=0 就报错, 隔壁 workload status ttl=0 是不过期, 不过先这样吧, 反正没这需求...

return types.ErrNodeStatusTTL
}

// nodenames are unique
statusKey := filepath.Join(nodeStatusPrefix, node.Name)
entityKey := fmt.Sprintf(nodeInfoKey, node.Name)

if ttl < 0 {
_, err := m.Delete(ctx, statusKey)
return err
}

data, err := json.Marshal(types.NodeStatus{
Nodename: node.Name,
Podname: node.Podname,
Expand All @@ -306,9 +315,6 @@ func (m *Mercury) SetNodeStatus(ctx context.Context, node *types.Node, ttl int64
return err
}

// nodenames are unique
statusKey := filepath.Join(nodeStatusPrefix, node.Name)
entityKey := fmt.Sprintf(nodeInfoKey, node.Name)
return m.BindStatus(ctx, entityKey, statusKey, string(data), ttl)
}

Expand Down
4 changes: 2 additions & 2 deletions store/etcdv3/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,13 +141,13 @@ RdCPRPt513WozkJZZAjUSP2U
m.config.CertPath = "/tmp"
node3, err := m.doAddNode(ctx, nodename3, endpoint3, podname, ca, cert, certkey, cpu, share, memory, storage, labels, nil, nil, nil)
assert.NoError(t, err)
engine3, err := m.makeClient(ctx, node3, true)
engine3, err := m.makeClient(ctx, node3)
assert.NoError(t, err)
_, err = engine3.Info(ctx)
assert.Error(t, err)
// failed by get key
node3.Name = "nokey"
_, err = m.makeClient(ctx, node3, true)
_, err = m.makeClient(ctx, node3)
assert.NoError(t, err)
}

Expand Down
14 changes: 10 additions & 4 deletions store/redis/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,13 +288,21 @@ func (r *Rediaron) doGetNodes(ctx context.Context, kvs map[string]string, labels
}

// SetNodeStatus sets status for a node, value will expire after ttl seconds
// ttl should be larger than 0
// ttl < 0 deletes the node status; ttl == 0 is rejected with types.ErrNodeStatusTTL
// this is heartbeat of node
func (r *Rediaron) SetNodeStatus(ctx context.Context, node *types.Node, ttl int64) error {
if ttl <= 0 {
if ttl == 0 {
return types.ErrNodeStatusTTL
}

// nodenames are unique
key := filepath.Join(nodeStatusPrefix, node.Name)

if ttl < 0 {
_, err := r.cli.Del(ctx, key).Result()
return err
}

data, err := json.Marshal(types.NodeStatus{
Nodename: node.Name,
Podname: node.Podname,
Expand All @@ -304,8 +312,6 @@ func (r *Rediaron) SetNodeStatus(ctx context.Context, node *types.Node, ttl int6
return err
}

// nodenames are unique
key := filepath.Join(nodeStatusPrefix, node.Name)
_, err = r.cli.Set(ctx, key, string(data), time.Duration(ttl)*time.Second).Result()
return err
}
Expand Down
4 changes: 2 additions & 2 deletions store/redis/node_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -138,13 +138,13 @@ RdCPRPt513WozkJZZAjUSP2U
s.rediaron.config.CertPath = "/tmp"
node3, err := s.rediaron.doAddNode(ctx, nodename3, endpoint3, podname, ca, cert, certkey, cpu, share, memory, storage, labels, nil, nil, nil)
s.NoError(err)
engine3, err := s.rediaron.makeClient(ctx, node3, true)
engine3, err := s.rediaron.makeClient(ctx, node3)
s.NoError(err)
_, err = engine3.Info(ctx)
s.Error(err)
// failed by get key
node3.Name = "nokey"
_, err = s.rediaron.makeClient(ctx, node3, true)
_, err = s.rediaron.makeClient(ctx, node3)
s.NoError(err)
}

Expand Down