Zipkin Receiver: default timestamp #1068

Merged · 4 commits · Jun 11, 2020

exporter/zipkinexporter/zipkin.go (1 addition, 1 deletion)
@@ -123,7 +123,7 @@ func (ze *zipkinExporter) PushTraceData(_ context.Context, td consumerdata.Trace

	body, err := ze.serializer.Serialize(tbatch)
	if err != nil {
-		return len(td.Spans), err
+		return len(td.Spans), consumererror.Permanent(err)
Contributor Author:

I changed the serialization failure to a permanent error because it otherwise caused an infinite loop in the retry processor. Is there any reason we'd want to retry on a serialization error?

Member:

I can't think of any reason to retry them

	}

	req, err := http.NewRequest("POST", ze.url, bytes.NewReader(body))
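For context on the exchange above, here is a minimal, hypothetical sketch of how a retrying caller can tell permanent errors apart from transient ones. The retry loop is illustrative only (it is not the actual queued-retry processor) and it assumes the consumererror.Permanent and consumererror.IsPermanent helpers from go.opentelemetry.io/collector/consumer/consumererror.

package main

import (
	"errors"
	"fmt"

	"go.opentelemetry.io/collector/consumer/consumererror"
)

// export simulates an exporter call that fails with a serialization error.
// Serialization will never succeed on retry, so the error is wrapped as permanent.
func export() error {
	serializeErr := errors.New("failed to serialize span batch")
	return consumererror.Permanent(serializeErr)
}

func main() {
	const maxAttempts = 3
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		err := export()
		if err == nil {
			return
		}
		if consumererror.IsPermanent(err) {
			// Retrying a permanent error would loop forever, so give up immediately.
			fmt.Println("dropping batch, permanent error:", err)
			return
		}
		fmt.Printf("attempt %d failed, will retry: %v\n", attempt, err)
	}
}

Wrapping the serialization error with consumererror.Permanent is what lets retry logic bail out instead of looping, which is the behavior the change above is after.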
receiver/zipkinreceiver/trace_receiver.go (1 addition, 0 deletions)
@@ -390,6 +390,7 @@ func zipkinSpanToTraceSpan(zs *zipkinmodel.SpanModel) (*tracepb.Span, *commonpb.
	}

	node := nodeFromZipkinEndpoints(zs, pbs)
+	zipkin.SetTimestampsIfUnset(pbs)

	return pbs, node
}
receiver/zipkinreceiver/trace_receiver_test.go (2 additions, 1 deletion)
@@ -427,7 +427,8 @@ func TestSpanKindTranslation(t *testing.T) {
				TraceID: zipkinmodel.TraceID{Low: 123},
				ID:      456,
			},
-			Kind: tt.zipkinKind,
+			Kind:      tt.zipkinKind,
+			Timestamp: time.Now(),
		}
		ocSpan, _ := zipkinSpanToTraceSpan(zs)
		assert.EqualValues(t, tt.ocKind, ocSpan.Kind)
translator/trace/zipkin/attributekeys.go (1 addition, 0 deletions)
@@ -25,4 +25,5 @@ const (
	RemoteEndpointIPv6        = "zipkin.remoteEndpoint.ipv6"
	RemoteEndpointPort        = "zipkin.remoteEndpoint.port"
	RemoteEndpointServiceName = "zipkin.remoteEndpoint.serviceName"
+	StartTimeAbsent           = "otel.zipkin.absentField.startTime"
)
translator/trace/zipkin/protospan_to_zipkinv1.go (7 additions, 0 deletions)
@@ -17,6 +17,7 @@ package zipkin
import (
	"net"
	"strconv"
+	"time"

	commonpb "github.com/census-instrumentation/opencensus-proto/gen-go/agent/common/v1"
	resourcepb "github.com/census-instrumentation/opencensus-proto/gen-go/resource/v1"
@@ -243,7 +244,13 @@ func OCSpanProtoToZipkin(
	if spanKindFromAttributes {
		redundantKeys[tracetranslator.TagSpanKind] = true
	}

+	startTime := internal.TimestampToTime(s.StartTime)
+	if _, ok := attrMap[StartTimeAbsent]; ok {
+		redundantKeys[StartTimeAbsent] = true
+		startTime = time.Time{}
+	}
+
	z := &zipkinmodel.SpanModel{
		SpanContext: zipkinmodel.SpanContext{
			TraceID: convertTraceID(s.TraceId),
translator/trace/zipkin/zipkinv1_to_protospan.go (25 additions, 0 deletions)
@@ -19,6 +19,7 @@ import (
"fmt"
"math"
"strconv"
"time"

commonpb "github.com/census-instrumentation/opencensus-proto/gen-go/agent/common/v1"
tracepb "github.com/census-instrumentation/opencensus-proto/gen-go/trace/v1"
@@ -27,6 +28,7 @@ import (
"github.com/pkg/errors"

"go.opentelemetry.io/collector/consumer/consumerdata"
"go.opentelemetry.io/collector/internal"
tracetranslator "go.opentelemetry.io/collector/translator/trace"
)

@@ -175,6 +177,7 @@ func zipkinV1ToOCSpan(zSpan *zipkinV1Span) (*tracepb.Span, *annotationParseResul
	}

	setSpanKind(ocSpan, parsedAnnotations.Kind, parsedAnnotations.ExtendedKind)
+	SetTimestampsIfUnset(ocSpan)

	return ocSpan, parsedAnnotations, nil
}
@@ -481,3 +484,25 @@ func (ep *endpoint) createAttributeMap() map[string]string {
	}
	return attributeMap
}
+
+func SetTimestampsIfUnset(span *tracepb.Span) {
+	// zipkin allows timestamp to be unset, but opentelemetry-collector expects it to have a value.
+	// If this is unset, the conversion from open census to the internal trace format breaks
+	// what should be an identity transformation oc -> internal -> oc
+	if span.StartTime == nil {
+		now := internal.TimeToTimestamp(time.Now())
+		span.StartTime = now
+		span.EndTime = now
+
+		if span.Attributes == nil {
+			span.Attributes = &tracepb.Span_Attributes{}
+		}
+		if span.Attributes.AttributeMap == nil {
+			span.Attributes.AttributeMap = make(map[string]*tracepb.AttributeValue, 1)
+		}
+		span.Attributes.AttributeMap[StartTimeAbsent] = &tracepb.AttributeValue{
+			Value: &tracepb.AttributeValue_BoolValue{
+				BoolValue: true,
+			}}
+	}
Contributor:

I'm not sure about this, but I don't know what the right thing to do is either. Adding/modifying data just to make serialization happy doesn't sound that great to me. It means backends will show misleading information to users.

Alternatively, we could drop such spans as a validation step in the zipkin receiver and log a message, but we can't do that since the zipkin spec allows for the possibility of not having timestamps. Is there a way we can serialize it differently without adding fake timestamps?

If there is no other viable solution, we should at least add an attribute to the span indicating that the timestamps are fake and shouldn't be trusted. Also, can we maybe use a more obvious fake value for the timestamp, like the unix epoch or something?

Contributor:

Zipkin spec on this:

> Epoch microseconds of the start of this span, possibly absent if incomplete.
>
> For example, 1502787600000000 corresponds to 2017-08-15 09:00 UTC
>
> This value should be set directly by instrumentation, using the most precise value possible. For example, gettimeofday or multiplying epoch millis by 1000.
>
> There are three known edge-cases where this could be reported absent:
>
> - A span was allocated but never started (ex not yet received a timestamp)
> - The span’s start event was lost
> - Data about a completed span (ex tags) were sent after the fact

Contributor (@owais, Jun 3, 2020):

> A span was allocated but never started (ex not yet received a timestamp)

If this were the only edge case, we could just drop the span, since it doesn't actually represent a real operation, but there are two more scenarios that can result in spans without timestamps, so we can't simply drop them.

It looks like setting a static timestamp that is obviously fake, along with a well-known attribute saying the timestamps were injected by the OTel collector as a workaround, is probably our best bet. Does the Jaeger proto allow empty timestamps? If so, an OTLP>Jaeger/Zipkin translator should probably set the timestamp back to nil when it encounters this tag on a span. One concern with setting a very old timestamp is that it might affect some backends in unexpected ways. For example, a backend might drop the span very soon if it retains only recent data keyed by span operation timestamps instead of ingestion timestamp.

Contributor Author:

I'm not familiar with Jaeger; someone else will need to weigh in on that.

Member:

> It looks like setting a static timestamp that is obviously fake, along with a well-known attribute saying the timestamps were injected by the OTel collector as a workaround, is probably our best bet. Does the Jaeger proto allow empty timestamps? If so, an OTLP>Jaeger/Zipkin translator should probably set the timestamp back to nil when it encounters this tag on a span. One concern with setting a very old timestamp is that it might affect some backends in unexpected ways. For example, a backend might drop the span very soon if it retains only recent data keyed by span operation timestamps instead of ingestion timestamp.

I agree with this approach. We can set a fake current timestamp along with an attribute identifying this workaround, and use it in the Jaeger/Zipkin exporters to reconstruct null timestamps.

Contributor (@owais, Jun 5, 2020):

That sounds like a zipkin convention. We should probably have an otel convention for such tags. Maybe otel.zipkin.missingTimestamp, or otel.zipkin.missingFields with the value being a comma-separated list of field names? Maybe we should have otel-collector-specific conventions for behavior like this. Perhaps prefix with otelcol or something?

> We can set a fake current timestamp

Why the current timestamp? Can we do zero time, like 0 seconds since the unix epoch? I'm afraid that if we use the current time and the span gets exported using something other than zipkin, it might be very misleading to users seeing the span in a tracing backend. "Zero time" will at least raise some eyebrows.

Member (@dmitryax, Jun 5, 2020):

@owais, 0 timestamps look cleaner, but if I understood your concern right, it doesn't look like the best way:

> One concern with setting a very old timestamp is that it might affect some backends in unexpected ways. For example, a backend might drop the span very soon if it retains only recent data keyed by span operation timestamps instead of ingestion timestamp.

IMO, dropping spans with a 0 timestamp still looks like the best way to handle this in otel-collector. @chris-smith-zocdoc, could you please let us know whether the spans with no timestamps are actually visible in the backend that you use, or whether they are needed for aggregation purposes?

Contributor Author:

> IMO, dropping spans with a 0 timestamp still looks like the best way to handle this in otel-collector.

I disagree. Throwing away data seems suboptimal.

By coincidence they actually end up with a zero timestamp right now, but after the translation from OC -> internal -> OC for the zipkin export, the timestamp becomes invalid and causes the serialization issue.

> could you please let us know whether the spans with no timestamps are actually visible in the backend that you use, or whether they are needed for aggregation purposes?

Yes, they're visible. We have two backends: one is zipkin, the other is honeycomb.

The data in these spans is just events and annotations, sent after the original span. This appears to be common behavior in the zipkin libraries we're using for javascript and scala.

[
    {
        "traceId": "404059d49fac24fc",
        "id": "404059d49fac24ff",
        "annotations": [
            {
                "timestamp": 1589292979495000,
                "value": "zipkin-js.flush"
            }
        ],
        "tags": {
            "response_size_bytes": "9622"
        }
    }
]

In zipkin the data is merged into the original span.

In honeycomb these end up as a second event in their system. I've considered building a buffer/merge processor (similar to tail sampling) to do the merge in OTel before sending it to them.

Contributor:

Agreed that throwing away the spans does not sound correct for people who might be using zipkin end to end (or even parts of it), since it is a valid zipkin use case. I still have concerns about zeroed timestamps causing backends to drop spans too soon. Maybe backends can use the tag we inject to detect this edge case, but I'm not sure we can expect all backends to do this, especially since it is not part of the otel spec. I guess adding the current timestamp for both start and end is the safest choice here, even though it's not optimal for backends other than zipkin.

Member:

Sounds good. Let's stick with the current timestamp.

Also, @owais had a good point about the attribute name. We probably should not use a name that looks like a zipkin convention. @owais, what about otel.zipkin.absentField.startTime?

+}
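As a companion to the discussion above, here is a small, hypothetical sketch (not part of this change) of how a downstream exporter or processor could check the marker attribute before trusting a span's timestamps. The helper name hasSyntheticTimestamps is invented for illustration; StartTimeAbsent and SetTimestampsIfUnset are the identifiers added in translator/trace/zipkin by this pull request.

package main

import (
	"fmt"

	tracepb "github.com/census-instrumentation/opencensus-proto/gen-go/trace/v1"

	"go.opentelemetry.io/collector/translator/trace/zipkin"
)

// hasSyntheticTimestamps reports whether a span's start/end times were
// injected by the zipkin receiver (marked via StartTimeAbsent) rather than
// reported by instrumentation.
func hasSyntheticTimestamps(span *tracepb.Span) bool {
	if span == nil || span.Attributes == nil || span.Attributes.AttributeMap == nil {
		return false
	}
	attr, ok := span.Attributes.AttributeMap[zipkin.StartTimeAbsent]
	if !ok || attr == nil {
		return false
	}
	boolVal, ok := attr.Value.(*tracepb.AttributeValue_BoolValue)
	return ok && boolVal.BoolValue
}

func main() {
	// A span that arrived without a timestamp: the receiver fills in the
	// current time and sets the marker attribute.
	span := &tracepb.Span{}
	zipkin.SetTimestampsIfUnset(span)

	fmt.Println(hasSyntheticTimestamps(span)) // true: timestamps are synthetic
}

An exporter targeting a format that tolerates missing timestamps could use a check like this to restore an empty start time, which is what the OCSpanProtoToZipkin change earlier in this diff does for the zipkin exporter.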
translator/trace/zipkin/zipkinv1_to_protospan_test.go (83 additions, 31 deletions)
@@ -21,6 +21,7 @@ import (
"sort"
"strconv"
"testing"
"time"

commonpb "github.com/census-instrumentation/opencensus-proto/gen-go/agent/common/v1"
tracepb "github.com/census-instrumentation/opencensus-proto/gen-go/trace/v1"
@@ -30,6 +31,7 @@ import (
"github.com/stretchr/testify/require"

"go.opentelemetry.io/collector/consumer/consumerdata"
"go.opentelemetry.io/collector/internal"
tracetranslator "go.opentelemetry.io/collector/translator/trace"
)

@@ -247,14 +249,15 @@ func sortTraceByNodeName(trace []consumerdata.TraceData) {

func TestZipkinAnnotationsToOCStatus(t *testing.T) {
	type test struct {
+		name           string
		haveTags       []*binaryAnnotation
		wantAttributes *tracepb.Span_Attributes
		wantStatus     *tracepb.Status
	}

	cases := []test{
-		// only status.code tag
		{
+			name: "only status.code tag",
			haveTags: []*binaryAnnotation{{
				Key:   "status.code",
				Value: "13",
@@ -264,17 +267,19 @@ func TestZipkinAnnotationsToOCStatus(t *testing.T) {
				Code: 13,
			},
		},
-		// only status.message tag
+
		{
+			name: "only status.message tag",
			haveTags: []*binaryAnnotation{{
				Key:   "status.message",
				Value: "Forbidden",
			}},
			wantAttributes: nil,
			wantStatus:     nil,
		},
-		// both status.code and status.message
+
		{
+			name: "both status.code and status.message",
			haveTags: []*binaryAnnotation{
				{
					Key: "status.code",
@@ -292,8 +297,8 @@ func TestZipkinAnnotationsToOCStatus(t *testing.T) {
			},
		},

-		// http status.code
		{
+			name: "http status.code",
			haveTags: []*binaryAnnotation{
				{
					Key: "http.status_code",
@@ -324,8 +329,8 @@ func TestZipkinAnnotationsToOCStatus(t *testing.T) {
			},
		},

-		// http and oc
		{
+			name: "http and oc",
			haveTags: []*binaryAnnotation{
				{
					Key: "http.status_code",
@@ -364,8 +369,8 @@ func TestZipkinAnnotationsToOCStatus(t *testing.T) {
			},
		},

-		// http and only oc code
		{
+			name: "http and only oc code",
			haveTags: []*binaryAnnotation{
				{
					Key: "http.status_code",
@@ -398,8 +403,9 @@ func TestZipkinAnnotationsToOCStatus(t *testing.T) {
				Code: 14,
			},
		},
-		// http and only oc message
+
		{
+			name: "http and only oc message",
			haveTags: []*binaryAnnotation{
				{
					Key: "http.status_code",
@@ -434,8 +440,8 @@ func TestZipkinAnnotationsToOCStatus(t *testing.T) {
			},
		},

-		// census tags
		{
+			name: "census tags",
			haveTags: []*binaryAnnotation{
				{
					Key: "census.status_code",
@@ -453,8 +459,8 @@ func TestZipkinAnnotationsToOCStatus(t *testing.T) {
			},
		},

-		// census tags priority over others
		{
+			name: "census tags priority over others",
			haveTags: []*binaryAnnotation{
				{
					Key: "census.status_code",
@@ -506,33 +512,79 @@ func TestZipkinAnnotationsToOCStatus(t *testing.T) {
	fakeSpanID := "0000000000000001"

	for i, c := range cases {
-		zSpans := []*zipkinV1Span{{
-			ID:                fakeSpanID,
-			TraceID:           fakeTraceID,
-			BinaryAnnotations: c.haveTags,
-		}}
-		zBytes, err := json.Marshal(zSpans)
-		if err != nil {
-			t.Errorf("#%d: Unexpected error: %v", i, err)
-			continue
-		}
-		gb, err := V1JSONBatchToOCProto(zBytes)
-		if err != nil {
-			t.Errorf("#%d: Unexpected error: %v", i, err)
-			continue
-		}
-		gs := gb[0].Spans[0]
+		t.Run(c.name, func(t *testing.T) {
+			zSpans := []*zipkinV1Span{{
+				ID:                fakeSpanID,
+				TraceID:           fakeTraceID,
+				BinaryAnnotations: c.haveTags,
+				Timestamp:         1,
+			}}
+			zBytes, err := json.Marshal(zSpans)
+			if err != nil {
+				t.Errorf("#%d: Unexpected error: %v", i, err)
+				return
+			}
+			gb, err := V1JSONBatchToOCProto(zBytes)
+			if err != nil {
+				t.Errorf("#%d: Unexpected error: %v", i, err)
+				return
+			}
+			gs := gb[0].Spans[0]

-		if !reflect.DeepEqual(gs.Attributes, c.wantAttributes) {
-			t.Fatalf("Unsuccessful conversion\nGot:\n\t%v\nWant:\n\t%v", gs.Attributes, c.wantAttributes)
-		}
+			if !reflect.DeepEqual(gs.Attributes, c.wantAttributes) {
+				t.Fatalf("Unsuccessful conversion\nGot:\n\t%v\nWant:\n\t%v", gs.Attributes, c.wantAttributes)
+			}

-		if !reflect.DeepEqual(gs.Status, c.wantStatus) {
-			t.Fatalf("Unsuccessful conversion: %d\nGot:\n\t%v\nWant:\n\t%v", i, gs.Status, c.wantStatus)
-		}
+			if !reflect.DeepEqual(gs.Status, c.wantStatus) {
+				t.Fatalf("Unsuccessful conversion: %d\nGot:\n\t%v\nWant:\n\t%v", i, gs.Status, c.wantStatus)
+			}
+		})
	}
}

+func TestSpanWithoutTimestampGetsTag(t *testing.T) {
+	fakeTraceID := "00000000000000010000000000000002"
+	fakeSpanID := "0000000000000001"
+	zSpans := []*zipkinV1Span{
+		{
+			ID:        fakeSpanID,
+			TraceID:   fakeTraceID,
+			Timestamp: 0, // no timestamp field
+		},
+	}
+	zBytes, err := json.Marshal(zSpans)
+	if err != nil {
+		t.Errorf("Unexpected error: %v", err)
+		return
+	}
+
+	testStart := time.Now()
+
+	gb, err := V1JSONBatchToOCProto(zBytes)
+	if err != nil {
+		t.Errorf("Unexpected error: %v", err)
+		return
+	}
+
+	gs := gb[0].Spans[0]
+	assert.NotNil(t, gs.StartTime)
+	assert.NotNil(t, gs.EndTime)
+
+	assert.True(t, internal.TimestampToTime(gs.StartTime).Sub(testStart) >= 0)
+
+	wantAttributes := &tracepb.Span_Attributes{
+		AttributeMap: map[string]*tracepb.AttributeValue{
+			StartTimeAbsent: {
+				Value: &tracepb.AttributeValue_BoolValue{
+					BoolValue: true,
+				},
+			},
+		},
+	}
+
+	assert.EqualValues(t, gs.Attributes, wantAttributes)
+}

func TestJSONHTTPToGRPCStatusCode(t *testing.T) {
	fakeTraceID := "00000000000000010000000000000002"
	fakeSpanID := "0000000000000001"