Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dynamically configurable standby instance #824

Merged
merged 9 commits into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions api/cirrus_ci_service.proto
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ message PollRequest {
repeated int64 old_running_tasks = 2 [deprecated = true];
map<string, double> resources_in_use = 3;
repeated string running_tasks = 4;
StandbyInstanceInformation availableStandbyInstanceInformation = 5;
}

message QueryRunningTasksRequest {
Expand Down Expand Up @@ -196,6 +197,24 @@ message PollResponse {
bool shutdown = 4;

repeated string tasks_to_stop = 5;

StandbyInstanceParameters updated_standby_instance_parameters = 6;
}

message StandbyInstanceInformation {
StandbyInstanceParameters parameters = 1;
uint64 age_seconds = 2; // since warming up
}

message StandbyInstanceParameters {
message Warmup {
string script = 1;
uint64 timeout_seconds = 2;
}

Isolation isolation = 1;
map<string, double> resources = 2;
Warmup warmup = 3;
}

message WorkerInfo {
Expand Down
2 changes: 1 addition & 1 deletion internal/commands/worker/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ func buildWorker(output io.Writer) (*worker.Worker, error) {

// Configure standby
if standby := config.Standby; standby != nil {
opts = append(opts, worker.WithStandby(standby))
opts = append(opts, worker.WithStandby(standby.StandbyInstanceParameters))
}

// Configure resource modifiers
Expand Down
41 changes: 41 additions & 0 deletions internal/commands/worker/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"github.com/stretchr/testify/require"
"path/filepath"
"testing"
"time"
)

// TestUnknownFields ensures that we will error out on configuration files
Expand Down Expand Up @@ -71,3 +72,43 @@ func TestRestrictForceSoftnet(t *testing.T) {

require.True(t, config.Security.AllowedIsolations.Tart.ForceSoftnet)
}

func TestStandby(t *testing.T) {
config, err := parseConfig(filepath.Join("testdata", "standby.yml"))
require.NoError(t, err)

// Verify pre-pull configuration exists
require.NotNil(t, config.TartPrePull)
require.Equal(t, 3*time.Hour, config.TartPrePull.CheckInterval)

// Verify pre-pull images
expectedImages := []string{
"ghcr.io/cirruslabs/macos-runner:sonoma",
"ghcr.io/cirruslabs/macos-runner:sequoia",
}
require.Equal(t, expectedImages, config.TartPrePull.Images)

// Verify standby configuration exists
require.NotNil(t, config.Standby)

// Verify resources
require.Equal(t, float64(1), config.Standby.Resources["tart-vms"])

// Verify isolation configuration
require.NotNil(t, config.Standby.Isolation)
require.NotNil(t, config.Standby.Isolation.GetTart())

tart := config.Standby.Isolation.GetTart()
require.Equal(t, "ghcr.io/cirruslabs/macos-runner:sonoma", tart.Image)
require.Equal(t, "admin", tart.User)
require.Equal(t, "admin", tart.Password)
require.Equal(t, "1920x1080", tart.Display)
require.True(t, tart.Softnet)
require.Equal(t, uint32(4), tart.Cpu)
require.Equal(t, uint32(16384), tart.Memory)

// Verify warmup configuration
require.NotNil(t, config.Standby.Warmup)
require.Equal(t, "xcrun simctl list || true", config.Standby.Warmup.Script)
require.Equal(t, uint64(600), config.Standby.Warmup.TimeoutSeconds)
}
25 changes: 25 additions & 0 deletions internal/commands/worker/testdata/standby.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
token: e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855

name: "MacMini-Rack-1-Slot-2"

tart-pre-pull:
check-interval: 3h
images:
- ghcr.io/cirruslabs/macos-runner:sonoma
- ghcr.io/cirruslabs/macos-runner:sequoia

standby:
resources:
tart-vms: 1
isolation:
tart:
image: ghcr.io/cirruslabs/macos-runner:sonoma
user: admin
password: admin
display: 1920x1080
softnet: true
cpu: 4
memory: 16384
warmup:
script: xcrun simctl list || true
timeout: 10m
5 changes: 3 additions & 2 deletions internal/worker/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"github.com/cirruslabs/cirrus-cli/internal/worker/resourcemodifier"
"github.com/cirruslabs/cirrus-cli/internal/worker/security"
"github.com/cirruslabs/cirrus-cli/internal/worker/upstream"
"github.com/cirruslabs/cirrus-cli/pkg/api"
"github.com/sirupsen/logrus"
)

Expand Down Expand Up @@ -39,9 +40,9 @@ func WithSecurity(security *security.Security) Option {
}
}

func WithStandby(standby *StandbyConfig) Option {
func WithStandby(standby *api.StandbyInstanceParameters) Option {
return func(e *Worker) {
e.standbyConfig = standby
e.standbyParameters = standby
}
}

Expand Down
46 changes: 7 additions & 39 deletions internal/worker/standby.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,15 @@ import (
"fmt"
"github.com/cirruslabs/cirrus-cli/pkg/api"
"github.com/cirruslabs/cirrus-cli/pkg/parser/boolevator"
"github.com/cirruslabs/cirrus-cli/pkg/parser/instance/isolation"
"github.com/cirruslabs/cirrus-cli/pkg/parser/instance"
"github.com/cirruslabs/cirrus-cli/pkg/parser/issue"
"github.com/cirruslabs/cirrus-cli/pkg/parser/node"
"github.com/cirruslabs/cirrus-cli/pkg/parser/parserkit"
"gopkg.in/yaml.v3"
"strconv"
"time"
)

type StandbyConfig struct {
Isolation *api.Isolation `yaml:"isolation"`
Resources map[string]float64 `yaml:"resources"`
Warmup Warmup `yaml:"warmup"`
}

type Warmup struct {
Script string `yaml:"script"`
Timeout time.Duration `yaml:"timeout"`
*api.StandbyInstanceParameters
}

var ErrIsolationMissing = errors.New("isolation configuration is required for standby")
Expand All @@ -39,22 +30,21 @@ func (standby *StandbyConfig) UnmarshalYAML(value *yaml.Node) error {
return err
}

isolationNode := documentNode.FindChild("isolation")
if isolationNode == nil {
if isolationNode := documentNode.FindChild("isolation"); isolationNode == nil {
return ErrIsolationMissing
}
// Parse isolation
parserKit := &parserkit.ParserKit{
Boolevator: boolevator.New(),
IssueRegistry: issue.NewRegistry(),
}
isolationParser := isolation.NewIsolation(nil, parserKit)
if err := isolationParser.Parse(isolationNode, parserKit); err != nil {
instanceParser := instance.NewStandbyParameters(nil, parserKit)
if err := instanceParser.Parse(documentNode, parserKit); err != nil {
return err
}

// Only allow Tart and Vetu to be configured as standby
switch isolationType := isolationParser.Proto().Type.(type) {
switch isolationType := instanceParser.Proto().Isolation.Type.(type) {
case *api.Isolation_Tart_:
// OK
case *api.Isolation_Vetu_:
Expand All @@ -63,29 +53,7 @@ func (standby *StandbyConfig) UnmarshalYAML(value *yaml.Node) error {
return fmt.Errorf("%w, got %T", ErrUnsupportedIsolation, isolationType)
}

standby.Isolation = isolationParser.Proto()

// Parse resources
standby.Resources = make(map[string]float64)
if resourcesNode := documentNode.FindChild("resources"); resourcesNode != nil {
for _, resourceNode := range resourcesNode.Children {
resourceValueRaw, err := resourceNode.FlattenedValue()
if err != nil {
return err
}
resourceValue, err := strconv.ParseFloat(resourceValueRaw, 64)
if err != nil {
return err
}
standby.Resources[resourceNode.Name] = resourceValue
}
}

if warmupNode := documentNode.FindChild("warmup"); warmupNode != nil {
if err := warmupNode.YAMLNode.Decode(&standby.Warmup); err != nil {
return err
}
}
standby.StandbyInstanceParameters = instanceParser.Proto()

return nil
}
2 changes: 1 addition & 1 deletion internal/worker/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ func (worker *Worker) getInstance(
worker.standbyInstanceStartedAt = time.Time{}

// Return the standby instance if matches the isolation required by the task
if proto.Equal(worker.standbyConfig.Isolation, isolation) {
if proto.Equal(worker.standbyParameters.Isolation, isolation) {
worker.logger.Debugf("standby instance matches the task's isolation configuration, " +
"yielding it to the task")
worker.standbyHitCounter.Add(ctx, 1, metric.WithAttributes(standbyInstance.Attributes()...))
Expand Down
42 changes: 35 additions & 7 deletions internal/worker/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/attribute"
"go.opentelemetry.io/otel/metric"
"google.golang.org/protobuf/proto"
"math"
"os"
"runtime"
Expand Down Expand Up @@ -56,7 +57,7 @@ type Worker struct {
logger logrus.FieldLogger
echelonLogger *echelon.Logger

standbyConfig *StandbyConfig
standbyParameters *api.StandbyInstanceParameters
standbyInstance abstract.Instance
standbyInstanceStartedAt time.Time

Expand Down Expand Up @@ -252,7 +253,7 @@ func (worker *Worker) Run(ctx context.Context) error {

func (worker *Worker) tryCreateStandby(ctx context.Context) {
// Do nothing if no standby instance is configured
if worker.standbyConfig == nil {
if worker.standbyParameters == nil {
return
}

Expand All @@ -262,14 +263,14 @@ func (worker *Worker) tryCreateStandby(ctx context.Context) {
}

// Do nothing if there are tasks that are running to simplify the resource management
if !worker.canFitResources(worker.standbyConfig.Resources) {
if !worker.canFitResources(worker.standbyParameters.Resources) {
return
}

worker.logger.Debugf("creating a new standby instance with isolation %s", worker.standbyConfig.Isolation)
worker.logger.Debugf("creating a new standby instance with isolation %s", worker.standbyParameters.Isolation)

standbyInstance, err := persistentworker.New(worker.standbyConfig.Isolation, worker.security,
worker.resourceModifierManager.Acquire(worker.standbyConfig.Resources), worker.logger)
standbyInstance, err := persistentworker.New(worker.standbyParameters.Isolation, worker.security,
worker.resourceModifierManager.Acquire(worker.standbyParameters.Resources), worker.logger)
if err != nil {
worker.logger.Errorf("failed to create a standby instance: %v", err)

Expand Down Expand Up @@ -303,8 +304,9 @@ func (worker *Worker) tryCreateStandby(ctx context.Context) {

worker.logger.Debugf("warming-up the standby instance")

warmUpTimeout := time.Duration(worker.standbyParameters.Warmup.TimeoutSeconds) * time.Second
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another point is that since the worker.standbyParameters and its inner messages are now nil-able, and the standby parameters can be set from the backend, it would be nice if we can check for nil here in a more defensive manner.

if err := standbyInstance.(abstract.WarmableInstance).Warmup(ctx, "standby", nil, lazyPull,
worker.standbyConfig.Warmup.Script, worker.standbyConfig.Warmup.Timeout, worker.echelonLogger); err != nil {
worker.standbyParameters.Warmup.Script, warmUpTimeout, worker.echelonLogger); err != nil {
worker.logger.Errorf("failed to warm-up a standby instance: %v", err)

if err := standbyInstance.Close(ctx); err != nil {
Expand Down Expand Up @@ -339,6 +341,13 @@ func (worker *Worker) pollSingleUpstream(ctx context.Context, upstream *upstream
ResourcesInUse: worker.resourcesInUse(),
}

if worker.standbyInstance != nil {
request.AvailableStandbyInstanceInformation = &api.StandbyInstanceInformation{
Parameters: worker.standbyParameters,
AgeSeconds: uint64(time.Since(worker.standbyInstanceStartedAt).Seconds()),
}
}

response, err := upstream.Poll(ctx, request)
if err != nil {
return err
Expand All @@ -363,6 +372,10 @@ func (worker *Worker) pollSingleUpstream(ctx context.Context, upstream *upstream
return ErrShutdown
}

if response.UpdatedStandbyInstanceParameters != nil {
worker.UpdateStandby(ctx, response.UpdatedStandbyInstanceParameters)
}

return nil
}

Expand Down Expand Up @@ -426,3 +439,18 @@ func (worker *Worker) pollIntervalSeconds() uint32 {

return result
}

func (worker *Worker) UpdateStandby(ctx context.Context, parameters *api.StandbyInstanceParameters) {
if worker.standbyInstance != nil && !proto.Equal(worker.standbyParameters, parameters) {
worker.logger.Infof("terminating the standby instance since the parameters have changed")

if err := worker.standbyInstance.Close(ctx); err != nil {
worker.logger.Errorf("failed to terminate the standby instance: %v", err)
}

worker.standbyInstance = nil
worker.standbyInstanceStartedAt = time.Time{}
}

worker.standbyParameters = parameters
}
8 changes: 4 additions & 4 deletions internal/worker/worker_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ func TestWorkerStandByTart(t *testing.T) {
)
require.NoError(t, err)

standbyConfig := &worker.StandbyConfig{
standbyParameters := &api.StandbyInstanceParameters{
Isolation: isolation,
}

Expand All @@ -290,7 +290,7 @@ func TestWorkerStandByTart(t *testing.T) {
"[[ \"$CIRRUS_VM_ID\" = cirrus-cli-standby* ]]",
},
worker.WithUpstream(upstream),
worker.WithStandby(standbyConfig),
worker.WithStandby(standbyParameters),
)
}

Expand Down Expand Up @@ -331,7 +331,7 @@ func TestWorkerStandByVetu(t *testing.T) {
)
require.NoError(t, err)

standbyConfig := &worker.StandbyConfig{
standbyParameters := &api.StandbyInstanceParameters{
Isolation: isolation,
}

Expand All @@ -340,7 +340,7 @@ func TestWorkerStandByVetu(t *testing.T) {
"[[ \"$CIRRUS_VM_ID\" = cirrus-cli-standby* ]]",
},
worker.WithUpstream(upstream),
worker.WithStandby(standbyConfig),
worker.WithStandby(standbyParameters),
)
}

Expand Down
Loading