diff --git a/BUILD.bazel b/BUILD.bazel index 298e8b322..4a2da0b6f 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -22,6 +22,7 @@ go_library( "//config:go_default_library", "//server:go_default_library", "//utils/idle:go_default_library", + "//utils/metrics:go_default_library", "//utils/rlimit:go_default_library", "@com_github_abbot_go_http_auth//:go_default_library", "@com_github_grpc_ecosystem_go_grpc_prometheus//:go_default_library", diff --git a/README.md b/README.md index 70cd71e52..b89855136 100644 --- a/README.md +++ b/README.md @@ -223,6 +223,23 @@ host: localhost # If true, enable experimental remote asset API support: #experimental_remote_asset_api: true + +# Allows mapping HTTP and gRPC headers to prometheus +# labels. Headers can be set by bazel client as: +# --remote_header=os=ubuntu18-04. Not all counters are +# affected. +#metrics: +# categories: +# os: +# - rhel7 +# - rhel8 +# - ubuntu16-04 +# - ubuntu18-04 +# branch: +# - master +# user: +# - ci + ``` ## Docker diff --git a/cache/cache.go b/cache/cache.go index 49c7cc93c..da21930f4 100644 --- a/cache/cache.go +++ b/cache/cache.go @@ -21,6 +21,8 @@ const ( // used for HTTP when running with the --disable_http_ac_validation // commandline flag. RAW + + UNKNOWN ) func (e EntryKind) String() string { @@ -30,7 +32,10 @@ func (e EntryKind) String() string { if e == CAS { return "cas" } - return "raw" + if e == RAW { + return "raw" + } + return "unknown" } // Logger is designed to be satisfied by log.Logger. diff --git a/config/config.go b/config/config.go index c9efc2da3..7df906921 100644 --- a/config/config.go +++ b/config/config.go @@ -35,6 +35,11 @@ type HTTPBackendConfig struct { BaseURL string `yaml:"url"` } +// Metrics stores configuration for prometheus metrics. +type Metrics struct { + Categories map[string][]string `yaml:"categories"` +} + // Config holds the top-level configuration for bazel-remote. type Config struct { Host string `yaml:"host"` @@ -55,6 +60,7 @@ type Config struct { DisableGRPCACDepsCheck bool `yaml:"disable_grpc_ac_deps_check"` EnableACKeyInstanceMangling bool `yaml:"enable_ac_key_instance_mangling"` EnableEndpointMetrics bool `yaml:"enable_endpoint_metrics"` + Metrics *Metrics `yaml:"metrics"` ExperimentalRemoteAssetAPI bool `yaml:"experimental_remote_asset_api"` HTTPReadTimeout time.Duration `yaml:"http_read_timeout"` HTTPWriteTimeout time.Duration `yaml:"http_write_timeout"` @@ -73,6 +79,7 @@ func New(dir string, maxSize int, host string, port int, grpcPort int, disableGRPCACDepsCheck bool, enableACKeyInstanceMangling bool, enableEndpointMetrics bool, + metrics *Metrics, experimentalRemoteAssetAPI bool, httpReadTimeout time.Duration, httpWriteTimeout time.Duration) (*Config, error) { @@ -95,6 +102,7 @@ func New(dir string, maxSize int, host string, port int, grpcPort int, DisableGRPCACDepsCheck: disableGRPCACDepsCheck, EnableACKeyInstanceMangling: enableACKeyInstanceMangling, EnableEndpointMetrics: enableEndpointMetrics, + Metrics: metrics, ExperimentalRemoteAssetAPI: experimentalRemoteAssetAPI, HTTPReadTimeout: httpReadTimeout, HTTPWriteTimeout: httpWriteTimeout, diff --git a/main.go b/main.go index 98876863d..f945ebc10 100644 --- a/main.go +++ b/main.go @@ -22,6 +22,7 @@ import ( "github.com/buchgr/bazel-remote/config" "github.com/buchgr/bazel-remote/server" "github.com/buchgr/bazel-remote/utils/idle" + "github.com/buchgr/bazel-remote/utils/metrics" "github.com/buchgr/bazel-remote/utils/rlimit" grpc_prometheus "github.com/grpc-ecosystem/go-grpc-prometheus" @@ -283,6 +284,7 @@ func main() { ctx.Bool("disable_grpc_ac_deps_check"), ctx.Bool("enable_ac_key_instance_mangling"), ctx.Bool("enable_endpoint_metrics"), + nil, ctx.Bool("experimental_remote_asset_api"), ctx.Duration("http_read_timeout"), ctx.Duration("http_write_timeout"), @@ -311,6 +313,7 @@ func main() { accessLogger := log.New(os.Stdout, "", logFlags) errorLogger := log.New(os.Stderr, "", logFlags) + metrics := metrics.NewMetrics(c.Metrics) var proxyCache cache.Proxy if c.GoogleCloudStorage != nil { @@ -344,8 +347,7 @@ func main() { } validateAC := !c.DisableHTTPACValidation - h := server.NewHTTPCache(diskCache, accessLogger, errorLogger, validateAC, c.EnableACKeyInstanceMangling, gitCommit) - + h := server.NewHTTPCache(diskCache, accessLogger, errorLogger, metrics, validateAC, c.EnableACKeyInstanceMangling, gitCommit) var htpasswdSecrets auth.SecretProvider cacheHandler := h.CacheHandler if c.HtpasswdFile != "" { @@ -444,7 +446,7 @@ func main() { validateAC, c.EnableACKeyInstanceMangling, enableRemoteAssetAPI, - diskCache, accessLogger, errorLogger) + diskCache, accessLogger, errorLogger, metrics) if err3 != nil { log.Fatal(err3) } diff --git a/server/BUILD.bazel b/server/BUILD.bazel index 00db468d8..f67d07b59 100644 --- a/server/BUILD.bazel +++ b/server/BUILD.bazel @@ -18,6 +18,7 @@ go_library( "//cache:go_default_library", "//cache/disk:go_default_library", "//utils/idle:go_default_library", + "//utils/metrics:go_default_library", "@com_github_abbot_go_http_auth//:go_default_library", "@com_github_bazelbuild_remote_apis//build/bazel/remote/asset/v1:go_default_library", "@com_github_bazelbuild_remote_apis//build/bazel/remote/execution/v2:go_default_library", diff --git a/server/grpc.go b/server/grpc.go index a89ec78d6..e78a7b71e 100644 --- a/server/grpc.go +++ b/server/grpc.go @@ -18,7 +18,7 @@ import ( "github.com/buchgr/bazel-remote/cache" "github.com/buchgr/bazel-remote/cache/disk" - + "github.com/buchgr/bazel-remote/utils/metrics" _ "github.com/mostynb/go-grpc-compression/snappy" // Register snappy _ "github.com/mostynb/go-grpc-compression/zstd" // and zstd support. ) @@ -39,6 +39,7 @@ type grpcServer struct { errorLogger cache.Logger depsCheck bool mangleACKeys bool + metrics metrics.Metrics } // ListenAndServeGRPC creates a new gRPC server and listens on the given @@ -48,27 +49,28 @@ func ListenAndServeGRPC(addr string, opts []grpc.ServerOption, validateACDeps bool, mangleACKeys bool, enableRemoteAssetAPI bool, - c *disk.Cache, a cache.Logger, e cache.Logger) error { + c *disk.Cache, a cache.Logger, e cache.Logger, m metrics.Metrics) error { listener, err := net.Listen("tcp", addr) if err != nil { return err } - return serveGRPC(listener, opts, validateACDeps, mangleACKeys, enableRemoteAssetAPI, c, a, e) + return serveGRPC(listener, opts, validateACDeps, mangleACKeys, enableRemoteAssetAPI, c, a, e, m) } func serveGRPC(l net.Listener, opts []grpc.ServerOption, validateACDepsCheck bool, mangleACKeys bool, enableRemoteAssetAPI bool, - c *disk.Cache, a cache.Logger, e cache.Logger) error { + c *disk.Cache, a cache.Logger, e cache.Logger, m metrics.Metrics) error { srv := grpc.NewServer(opts...) s := &grpcServer{ cache: c, accessLogger: a, errorLogger: e, depsCheck: validateACDepsCheck, mangleACKeys: mangleACKeys, + metrics: m, } pb.RegisterActionCacheServer(srv, s) pb.RegisterCapabilitiesServer(srv, s) diff --git a/server/grpc_ac.go b/server/grpc_ac.go index a92d91bb0..d330aa616 100644 --- a/server/grpc_ac.go +++ b/server/grpc_ac.go @@ -13,10 +13,12 @@ import ( pb "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2" "github.com/golang/protobuf/proto" "google.golang.org/grpc/codes" + "google.golang.org/grpc/metadata" "google.golang.org/grpc/peer" "google.golang.org/grpc/status" "github.com/buchgr/bazel-remote/cache" + "github.com/buchgr/bazel-remote/utils/metrics" ) var ( @@ -63,6 +65,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context, } if rdr == nil || sizeBytes <= 0 { s.accessLogger.Printf("%s %s %s", logPrefix, req.ActionDigest.Hash, "NOT FOUND") + s.incAcRequestMetrics(metrics.METHOD_GET, metrics.NOT_FOUND, ctx) return nil, status.Error(codes.NotFound, fmt.Sprintf("%s not found in AC", req.ActionDigest.Hash)) } @@ -82,6 +85,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context, } s.accessLogger.Printf("%s %s OK", logPrefix, req.ActionDigest.Hash) + s.incAcRequestMetrics(metrics.METHOD_GET, metrics.OK, ctx) return result, nil } @@ -93,6 +97,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context, if result == nil { s.accessLogger.Printf("%s %s NOT FOUND", logPrefix, req.ActionDigest.Hash) + s.incAcRequestMetrics(metrics.METHOD_GET, metrics.NOT_FOUND, ctx) return nil, status.Error(codes.NotFound, fmt.Sprintf("%s not found in AC", req.ActionDigest.Hash)) } @@ -129,6 +134,7 @@ func (s *grpcServer) GetActionResult(ctx context.Context, } s.accessLogger.Printf("GRPC AC GET %s OK", req.ActionDigest.Hash) + s.incAcRequestMetrics(metrics.METHOD_GET, metrics.OK, ctx) return result, nil } @@ -290,6 +296,7 @@ func (s *grpcServer) UpdateActionResult(ctx context.Context, } s.accessLogger.Printf("GRPC AC PUT %s OK", req.ActionDigest.Hash) + s.incAcRequestMetrics(metrics.METHOD_PUT, metrics.OK, ctx) // Trivia: the RE API wants us to return the ActionResult from the // request, in order to follow this standard method style guide: @@ -331,3 +338,8 @@ func addWorkerMetadataGRPC(ctx context.Context, ar *pb.ActionResult) { ar.ExecutionMetadata.Worker = worker } + +func (s *grpcServer) incAcRequestMetrics(method metrics.Method, status metrics.Status, ctx context.Context) { + headers, _ := metadata.FromIncomingContext(ctx) + s.metrics.IncomingRequestCompleted(metrics.AC, method, status, headers, metrics.GRPC) +} diff --git a/server/grpc_test.go b/server/grpc_test.go index 89eff7b9c..1654e1e3b 100644 --- a/server/grpc_test.go +++ b/server/grpc_test.go @@ -73,6 +73,7 @@ func TestMain(m *testing.M) { accessLogger := testutils.NewSilentLogger() errorLogger := testutils.NewSilentLogger() + metrics := testutils.NewMetricsStub() listener = bufconn.Listen(bufSize) @@ -87,7 +88,7 @@ func TestMain(m *testing.M) { validateAC, mangleACKeys, enableRemoteAssetAPI, - diskCache, accessLogger, errorLogger) + diskCache, accessLogger, errorLogger, metrics) if err2 != nil { fmt.Println(err2) os.Exit(1) diff --git a/server/http.go b/server/http.go index 8e336a742..08a2eb4d0 100644 --- a/server/http.go +++ b/server/http.go @@ -17,6 +17,7 @@ import ( pb "github.com/bazelbuild/remote-apis/build/bazel/remote/execution/v2" "github.com/buchgr/bazel-remote/cache" "github.com/buchgr/bazel-remote/cache/disk" + "github.com/buchgr/bazel-remote/utils/metrics" "github.com/golang/protobuf/jsonpb" "github.com/golang/protobuf/proto" ) @@ -36,6 +37,7 @@ type httpCache struct { validateAC bool mangleACKeys bool gitCommit string + metrics metrics.Metrics } type statusPageData struct { @@ -51,8 +53,7 @@ type statusPageData struct { // accessLogger will print one line for each HTTP request to stdout. // errorLogger will print unexpected server errors. Inexistent files and malformed URLs will not // be reported. -func NewHTTPCache(cache *disk.Cache, accessLogger cache.Logger, errorLogger cache.Logger, validateAC bool, mangleACKeys bool, commit string) HTTPCache { - +func NewHTTPCache(cache *disk.Cache, accessLogger cache.Logger, errorLogger cache.Logger, metrics metrics.Metrics, validateAC bool, mangleACKeys bool, commit string) HTTPCache { _, _, numItems := cache.Stats() errorLogger.Printf("Loaded %d existing disk cache items.", numItems) @@ -63,6 +64,7 @@ func NewHTTPCache(cache *disk.Cache, accessLogger cache.Logger, errorLogger cach errorLogger: errorLogger, validateAC: validateAC, mangleACKeys: mangleACKeys, + metrics: metrics, } if commit != "{STABLE_GIT_COMMIT}" { @@ -106,32 +108,32 @@ func (h *httpCache) handleContainsValidAC(w http.ResponseWriter, r *http.Request _, data, err := h.cache.GetValidatedActionResult(hash) if err != nil { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, cache.AC) return } if data == nil { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, cache.AC) return } w.Header().Set("Content-Length", strconv.FormatInt(int64(len(data)), 10)) w.WriteHeader(http.StatusOK) - h.logResponse(http.StatusOK, r) + h.logResponse(http.StatusOK, r, cache.AC) } func (h *httpCache) handleGetValidAC(w http.ResponseWriter, r *http.Request, hash string) { _, data, err := h.cache.GetValidatedActionResult(hash) if err != nil { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, cache.AC) return } if data == nil { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, cache.AC) return } @@ -139,7 +141,7 @@ func (h *httpCache) handleGetValidAC(w http.ResponseWriter, r *http.Request, has ar := &pb.ActionResult{} err = proto.Unmarshal(data, ar) if err != nil { - h.logResponse(http.StatusInternalServerError, r) + h.logResponse(http.StatusInternalServerError, r, cache.AC) return } @@ -147,10 +149,11 @@ func (h *httpCache) handleGetValidAC(w http.ResponseWriter, r *http.Request, has marshaler := jsonpb.Marshaler{} err = marshaler.Marshal(w, ar) if err != nil { - h.logResponse(http.StatusInternalServerError, r) + h.logResponse(http.StatusInternalServerError, r, cache.AC) return } + h.logResponse(http.StatusOK, r, cache.AC) return } @@ -159,18 +162,20 @@ func (h *httpCache) handleGetValidAC(w http.ResponseWriter, r *http.Request, has bytesWritten, err := w.Write(data) if err != nil { - h.logResponse(http.StatusInternalServerError, r) + h.logResponse(http.StatusInternalServerError, r, cache.AC) return } if bytesWritten != len(data) { - h.logResponse(http.StatusInternalServerError, r) + h.logResponse(http.StatusInternalServerError, r, cache.AC) return } + + h.logResponse(http.StatusOK, r, cache.AC) } // Helper function for logging responses -func (h *httpCache) logResponse(code int, r *http.Request) { +func (h *httpCache) logResponse(code int, r *http.Request, kind cache.EntryKind) { // Parse the client ip:port var clientAddress string var err error @@ -179,6 +184,36 @@ func (h *httpCache) logResponse(code int, r *http.Request) { clientAddress = r.RemoteAddr } h.accessLogger.Printf("%4s %d %15s %s", r.Method, code, clientAddress, r.URL.Path) + h.increaseMetrics(code, r, kind) +} + +func (h *httpCache) increaseMetrics(code int, r *http.Request, kind cache.EntryKind) { + if kind == cache.AC { + // See comments in metrics.go about why only counting AC requests, + // and why mapping several statuses as "other". + + var status metrics.Status + var method metrics.Method + switch r.Method { + case http.MethodGet: + method = metrics.METHOD_GET + case http.MethodPut: + method = metrics.METHOD_PUT + case http.MethodHead: + method = metrics.METHOD_HEAD + default: + method = metrics.METHOD_OTHER + } + switch code { + case http.StatusOK: + status = metrics.OK + case http.StatusNotFound: + status = metrics.NOT_FOUND + default: + status = metrics.OTHER_STATUS + } + h.metrics.IncomingRequestCompleted(metrics.AC, method, status, r.Header, metrics.HTTP) + } } func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { @@ -187,7 +222,7 @@ func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { kind, hash, instance, err := parseRequestURL(r.URL.Path, h.validateAC) if err != nil { http.Error(w, err.Error(), http.StatusBadRequest) - h.logResponse(http.StatusBadRequest, r) + h.logResponse(http.StatusBadRequest, r, cache.UNKNOWN) return } @@ -216,7 +251,7 @@ func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { if rdr == nil { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, kind) return } defer rdr.Close() @@ -225,7 +260,7 @@ func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Length", strconv.FormatInt(sizeBytes, 10)) io.Copy(w, rdr) - h.logResponse(http.StatusOK, r) + h.logResponse(http.StatusOK, r, kind) case http.MethodPut: contentLength := r.ContentLength @@ -289,7 +324,7 @@ func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { } h.errorLogger.Printf("PUT %s: %s", path(kind, hash), err) } else { - h.logResponse(http.StatusOK, r) + h.logResponse(http.StatusOK, r, kind) } case http.MethodHead: @@ -304,18 +339,17 @@ func (h *httpCache) CacheHandler(w http.ResponseWriter, r *http.Request) { ok, size := h.cache.Contains(kind, hash, -1) if !ok { http.Error(w, "Not found", http.StatusNotFound) - h.logResponse(http.StatusNotFound, r) + h.logResponse(http.StatusNotFound, r, kind) return } - w.Header().Set("Content-Length", strconv.FormatInt(size, 10)) w.WriteHeader(http.StatusOK) - h.logResponse(http.StatusOK, r) + h.logResponse(http.StatusOK, r, kind) default: msg := fmt.Sprintf("Method '%s' not supported.", html.EscapeString(m)) http.Error(w, msg, http.StatusMethodNotAllowed) - h.logResponse(http.StatusMethodNotAllowed, r) + h.logResponse(http.StatusMethodNotAllowed, r, kind) } } diff --git a/server/http_test.go b/server/http_test.go index 104b79093..b33af3943 100644 --- a/server/http_test.go +++ b/server/http_test.go @@ -36,8 +36,7 @@ func TestDownloadFile(t *testing.T) { } c := disk.New(cacheDir, blobSize, nil) - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") - + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") req, err := http.NewRequest("GET", "/cas/"+hash, bytes.NewReader([]byte{})) if err != nil { t.Fatal(err) @@ -99,7 +98,7 @@ func TestUploadFilesConcurrently(t *testing.T) { } c := disk.New(cacheDir, 1000*1024, nil) - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") handler := http.HandlerFunc(h.CacheHandler) var wg sync.WaitGroup @@ -157,7 +156,7 @@ func TestUploadSameFileConcurrently(t *testing.T) { numWorkers := 100 c := disk.New(cacheDir, int64(len(data)*numWorkers), nil) - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") handler := http.HandlerFunc(h.CacheHandler) var wg sync.WaitGroup @@ -203,7 +202,7 @@ func TestUploadCorruptedFile(t *testing.T) { } c := disk.New(cacheDir, 2048, nil) - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") rr := httptest.NewRecorder() handler := http.HandlerFunc(h.CacheHandler) handler.ServeHTTP(rr, r) @@ -245,7 +244,7 @@ func TestUploadEmptyActionResult(t *testing.T) { c := disk.New(cacheDir, 2048, nil) validate := true mangle := false - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), validate, mangle, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), validate, mangle, "") rr := httptest.NewRecorder() handler := http.HandlerFunc(h.CacheHandler) handler.ServeHTTP(rr, r) @@ -302,7 +301,7 @@ func testEmptyBlobAvailable(t *testing.T, method string) { c := disk.New(cacheDir, 2048, nil) validate := true mangle := false - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), validate, mangle, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), validate, mangle, "") rr := httptest.NewRecorder() handler := http.HandlerFunc(h.CacheHandler) handler.ServeHTTP(rr, r) @@ -325,7 +324,7 @@ func TestStatusPage(t *testing.T) { } c := disk.New(cacheDir, 2048, nil) - h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") + h := NewHTTPCache(c, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") rr := httptest.NewRecorder() handler := http.HandlerFunc(h.StatusPageHandler) handler.ServeHTTP(rr, r) @@ -466,7 +465,7 @@ func TestRemoteReturnsNotFound(t *testing.T) { defer os.RemoveAll(cacheDir) emptyCache := disk.New(cacheDir, 1024, nil) - h := NewHTTPCache(emptyCache, testutils.NewSilentLogger(), testutils.NewSilentLogger(), true, false, "") + h := NewHTTPCache(emptyCache, testutils.NewSilentLogger(), testutils.NewSilentLogger(), testutils.NewMetricsStub(), true, false, "") // create a fake http.Request _, hash := testutils.RandomDataAndHash(1024) url, _ := url.Parse(fmt.Sprintf("http://localhost:8080/ac/%s", hash)) diff --git a/utils/BUILD.bazel b/utils/BUILD.bazel index cdeaa623e..a6632cb43 100644 --- a/utils/BUILD.bazel +++ b/utils/BUILD.bazel @@ -5,4 +5,7 @@ go_library( srcs = ["testutils.go"], importpath = "github.com/buchgr/bazel-remote/utils", visibility = ["//visibility:public"], + deps = [ + "//utils/metrics:go_default_library", + ] ) diff --git a/utils/metrics/BUILD.bazel b/utils/metrics/BUILD.bazel new file mode 100644 index 000000000..1901f9091 --- /dev/null +++ b/utils/metrics/BUILD.bazel @@ -0,0 +1,13 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "go_default_library", + srcs = ["metrics.go"], + importpath = "github.com/buchgr/bazel-remote/utils/metrics", + visibility = ["//visibility:public"], + deps = [ + "//config:go_default_library", + "@com_github_prometheus_client_golang//prometheus:go_default_library", + "@com_github_prometheus_client_golang//prometheus/promauto:go_default_library", + ], +) diff --git a/utils/metrics/metrics.go b/utils/metrics/metrics.go new file mode 100644 index 000000000..d912c7438 --- /dev/null +++ b/utils/metrics/metrics.go @@ -0,0 +1,187 @@ +package metrics + +import ( + "github.com/buchgr/bazel-remote/config" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + "strings" +) + +// TODO Add test cases for this file + +type Method int +type Status int +type Kind int +type Protocol int + +const ( + METHOD_GET Method = iota + METHOD_HEAD + METHOD_PUT + METHOD_OTHER +) + +const ( + OK Status = iota + NOT_FOUND + OTHER_STATUS +) + +const ( + AC Kind = iota + CAS +) + +const ( + HTTP Protocol = iota + GRPC +) + +func (e Method) String() string { + // Actually HTTP names, but can be conceptually mapped also to GRPC protocol. + if e == METHOD_GET { + return "GET" + } + if e == METHOD_HEAD { + return "HEAD" + } + if e == METHOD_PUT { + return "PUT" + } + return "other" +} + +func (e Status) String() string { + // Names that works for both HTTP and GRPC, instead of HTTP or GRPC specific codes. + if e == OK { + return "OK" + } + if e == NOT_FOUND { + return "NotFound" + } + return "other" +} + +func (e Kind) String() string { + if e == AC { + return "AC" + } + if e == CAS { + return "CAS" + } + return "other" +} + +type Metrics interface { + // TODO Document interface + IncomingRequestCompleted(kind Kind, method Method, status Status, headers map[string][]string, protocol Protocol) +} + +type metrics struct { + categoryValues map[string]map[string]struct{} + counterIncomingCompletedReqs *prometheus.CounterVec +} + +func NewMetrics(config *config.Metrics) Metrics { + + labels := []string{"method", "status", "kind"} + categoryValues := make(map[string]map[string]struct{}) + + if config != nil && config.Categories != nil { + for categoryName, whiteListedValues := range config.Categories { + // Normalize to lower case since canonical for gRPC headers + // and convention for prometheus. + categoryName := strings.ToLower(categoryName) + + // Store white listed category values as set for efficient access + whiteListedSet := make(map[string]struct{}) + for _, categoryValue := range whiteListedValues { + whiteListedSet[categoryValue] = struct{}{} + } + categoryValues[categoryName] = whiteListedSet + + // Construct a prometheus label for each category. + // Prometheus does not allow changing set of + // labels until next time bazel-remote is + // restarted. + labels = append(labels, categoryName) + } + } + + // For now we only count AC requests, and only the most common status codes, + // becuse: + // + // - No identified use case for others. + // - Limit number of prometheus time series (if many configured categories). + // - Reduce performance overhead of counters (if many configured categories). + // - Would otherwise require injecting invocations in more places. + // + // But the naming, and the labels, of the counter, are generic to allow + // counting additional requests types or status codes in the future. Without + // having to rename the counter and get issues with non continous history of + // metrics. + + counterIncomingCompletedReqs := promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "bazel_remote_incoming_requests_completed_total", + Help: "The number of incoming HTTP and gRPC request. Currently only AC requests", + }, + labels) + + m := &metrics{ + categoryValues: categoryValues, + counterIncomingCompletedReqs: counterIncomingCompletedReqs, + } + return m +} + +func getLabelValueFromHeaderValues(headerValues []string, whiteListedValues map[string]struct{}) string { + for _, headerValue := range headerValues { + // Prometheus only allows one value per label. + // Pick the first white listed header value we find. + if _, ok := whiteListedValues[headerValue]; ok { + return headerValue + } + } + + // The values found in the header has not been white listed in + // the configuration file. Represent them as "other". + // + // The white listening is an attempt to avoid polluting + // prometheus with too many different time series. + // + // https://prometheus.io/docs/practices/naming/ warns about: + // + // "CAUTION: Remember that every unique combination of key-value + // label pairs represents a new time series, which can dramatically + // increase the amount of data stored. Do not use labels to store + // dimensions with high cardinality (many different label values), + // such as user IDs, email addresses, or other unbounded sets of + // values." + // + // It would have been nice if bazel-remote could reload the set + // of white listed values from updated configuration file, by + // SIGHUP signal instead of having to restart bazel-remote. + return "other" +} + +func (m *metrics) IncomingRequestCompleted(kind Kind, method Method, status Status, headers map[string][]string, protocol Protocol) { + labels := make(prometheus.Labels) + labels["method"] = method.String() + labels["status"] = status.String() + labels["kind"] = kind.String() + for labelName := range m.categoryValues { + // The canonical form of gRPC and HTTP/2 headers is lowercase "category" + headerName := labelName + if protocol == HTTP { + // but the golang http library is normalizing HTTP/1.1 headers as "Category". + headerName = strings.Title(headerName) + } + if headerValues, ok := headers[headerName]; ok { + labels[labelName] = getLabelValueFromHeaderValues(headerValues, m.categoryValues[labelName]) + } else { + labels[labelName] = "" // no header for this label + } + } + m.counterIncomingCompletedReqs.With(labels).Inc() +} diff --git a/utils/testutils.go b/utils/testutils.go index f0ee56037..15322eb7d 100644 --- a/utils/testutils.go +++ b/utils/testutils.go @@ -8,6 +8,8 @@ import ( "log" "os" "testing" + + "github.com/buchgr/bazel-remote/utils/metrics" ) // TempDir creates a temporary directory and returns its name. If an error @@ -47,3 +49,13 @@ func RandomDataAndHash(size int64) ([]byte, string) { func NewSilentLogger() *log.Logger { return log.New(ioutil.Discard, "", 0) } + +type metricsStub struct{} + +func NewMetricsStub() *metricsStub { + return new(metricsStub) +} + +func (m metricsStub) IncomingRequestCompleted(kind metrics.Kind, method metrics.Method, status metrics.Status, headers map[string][]string, protocol metrics.Protocol) { + // Do nothing +}