From dcd0d4e953b6395fb9287f372db4d3e2ad3c3126 Mon Sep 17 00:00:00 2001 From: Andrea Marziali Date: Thu, 11 Jun 2026 08:45:32 +0200 Subject: [PATCH 1/9] wip --- .../test/InstrumentationSpecification.groovy | 40 ++ .../test/AbstractInstrumentationTest.java | 7 +- .../test/scopediag/ContinuationRecord.java | 80 +++ .../test/scopediag/ScopeDiagnostics.java | 198 +++++++ .../scopediag/ScopeDiagnosticsExtension.java | 76 +++ .../scopediag/ScopeDiagnosticsReport.java | 510 ++++++++++++++++++ .../agent/test/scopediag/ScopeEvent.java | 32 ++ .../agent/test/scopediag/ScopeSources.java | 25 + .../agent/test/scopediag/StackFilter.java | 71 +++ .../scopediag/TrackScopeContinuations.java | 31 ++ .../ScopeDiagnosticsIntegrationTest.java | 71 +++ .../scopediag/ScopeDiagnosticsReportTest.java | 145 +++++ .../agent/test/scopediag/StackFilterTest.java | 53 ++ .../src/test/groovy/PubSubTest.groovy | 4 + .../java/datadog/trace/core/PendingTrace.java | 2 + .../scopemanager/ContinuationDiagnostics.java | 73 +++ .../core/scopemanager/ScopeContinuation.java | 24 + ...ope-continuation-leak-diagnostic-design.md | 197 +++++++ 18 files changed, 1638 insertions(+), 1 deletion(-) create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationRecord.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeEvent.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeSources.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/StackFilter.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/TrackScopeContinuations.java create mode 100644 dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java create mode 100644 dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java create mode 100644 dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/StackFilterTest.java create mode 100644 dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ContinuationDiagnostics.java create mode 100644 docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md diff --git a/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy b/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy index 8f1a6cd6df4..2f84bc1e5d1 100644 --- a/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy +++ b/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy @@ -34,6 +34,8 @@ import datadog.metrics.impl.DDSketchHistograms import datadog.metrics.impl.MonitoringImpl import datadog.trace.agent.test.asserts.ListWriterAssert import datadog.trace.agent.test.asserts.TagsAssert +import datadog.trace.agent.test.scopediag.ScopeDiagnostics +import datadog.trace.agent.test.scopediag.TrackScopeContinuations import datadog.trace.agent.test.datastreams.MockFeaturesDiscovery import datadog.trace.agent.test.datastreams.RecordingDatastreamsPayloadWriter import datadog.trace.agent.tooling.AgentInstaller @@ -466,6 +468,9 @@ abstract class InstrumentationSpecification extends DDSpecification implements A } TEST_WRITER.start() + if (scopeDiagConfig() != null) { + ScopeDiagnostics.startRecording() + } TEST_DATA_STREAMS_WRITER.clear() TEST_DATA_STREAMS_MONITORING.clear() @@ -499,6 +504,8 @@ abstract class InstrumentationSpecification extends DDSpecification implements A } TEST_TRACER.flush() + reportScopeDiagnostics() + def util = new MockUtil() util.detachMock(STATS_D_CLIENT) @@ -522,6 +529,39 @@ abstract class InstrumentationSpecification extends DDSpecification implements A assert InstrumentationErrors.noErrors(): InstrumentationErrors.describeErrors() } + /** Resolves the {@link TrackScopeContinuations} annotation from the feature method or spec class. */ + private TrackScopeContinuations scopeDiagConfig() { + def method = specificationContext?.currentFeature?.featureMethod?.reflection + def ann = method?.getAnnotation(TrackScopeContinuations) + if (ann == null) { + ann = this.class.getAnnotation(TrackScopeContinuations) + } + return ann + } + + private void reportScopeDiagnostics() { + def config = scopeDiagConfig() + if (config == null) { + return + } + ScopeDiagnostics.stop() + def report = ScopeDiagnostics.report() + println(config.gantt() ? report.renderGantt() : report.renderSummary()) + if (config.json()) { + ScopeDiagnostics.writeJson(specificationContext.currentIteration.name) + } + if (config.mermaid()) { + println("Mermaid timeline written to " + ScopeDiagnostics.writeMermaid(specificationContext.currentIteration.name)) + } + try { + if (config.failOnLeak()) { + ScopeDiagnostics.assertNoLeaks() + } + } finally { + ScopeDiagnostics.reset() + } + } + private void doCheckRepeatedFinish() { for (Map.Entry> entry: this.spanFinishLocations.entrySet()) { if (entry.value.size() == 1) { diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/AbstractInstrumentationTest.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/AbstractInstrumentationTest.java index 5f724853972..8a9e5f82b5b 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/AbstractInstrumentationTest.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/AbstractInstrumentationTest.java @@ -7,6 +7,7 @@ import datadog.instrument.classinject.ClassInjector; import datadog.trace.agent.test.assertions.TraceAssertions; import datadog.trace.agent.test.assertions.TraceMatcher; +import datadog.trace.agent.test.scopediag.ScopeDiagnosticsExtension; import datadog.trace.agent.tooling.AgentInstaller; import datadog.trace.agent.tooling.InstrumenterModule; import datadog.trace.agent.tooling.TracerInstaller; @@ -54,7 +55,11 @@ * */ @WithConfig(key = "detailed.instrumentation.errors", value = "true") -@ExtendWith({TestClassShadowingExtension.class, AllowContextTestingExtension.class}) +@ExtendWith({ + TestClassShadowingExtension.class, + AllowContextTestingExtension.class, + ScopeDiagnosticsExtension.class +}) public abstract class AbstractInstrumentationTest { static final Instrumentation INSTRUMENTATION = ByteBuddyAgent.getInstrumentation(); diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationRecord.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationRecord.java new file mode 100644 index 00000000000..ba4842afff8 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationRecord.java @@ -0,0 +1,80 @@ +package datadog.trace.agent.test.scopediag; + +import datadog.trace.api.DDTraceId; +import java.util.ArrayList; +import java.util.List; + +/** + * The correlated lifecycle of one scope continuation: its capture, every activation, and every + * resolution (finish/cancel). Built incrementally as events arrive on different threads, so all + * mutating access is synchronized on the instance. + */ +public final class ContinuationRecord { + public final long seq; + public final DDTraceId traceId; + public final long spanId; + public final byte source; + + /** + * {@code true} when activation/resolution was seen without a preceding capture in this window. + */ + public final boolean orphan; + + private ScopeEvent capture; + private final List activations = new ArrayList<>(1); + private final List resolutions = new ArrayList<>(1); + + ContinuationRecord( + long seq, DDTraceId traceId, long spanId, byte source, boolean orphan, ScopeEvent capture) { + this.seq = seq; + this.traceId = traceId; + this.spanId = spanId; + this.source = source; + this.orphan = orphan; + this.capture = capture; + } + + synchronized void addActivation(ScopeEvent event) { + activations.add(event); + } + + synchronized void addResolution(ScopeEvent event) { + resolutions.add(event); + } + + public synchronized ScopeEvent capture() { + return capture; + } + + public synchronized List activations() { + return new ArrayList<>(activations); + } + + public synchronized List resolutions() { + return new ArrayList<>(resolutions); + } + + public synchronized boolean isActivated() { + return !activations.isEmpty(); + } + + public synchronized boolean isResolved() { + return !resolutions.isEmpty(); + } + + /** Earliest known event time for ordering the timeline. */ + public synchronized long firstNanos() { + long min = capture != null ? capture.nanos : Long.MAX_VALUE; + for (ScopeEvent e : activations) { + min = Math.min(min, e.nanos); + } + for (ScopeEvent e : resolutions) { + min = Math.min(min, e.nanos); + } + return min; + } + + public String sourceName() { + return ScopeSources.name(source); + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java new file mode 100644 index 00000000000..a92d1c15fd6 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java @@ -0,0 +1,198 @@ +package datadog.trace.agent.test.scopediag; + +import datadog.trace.api.DDTraceId; +import datadog.trace.bootstrap.instrumentation.api.AgentScope; +import datadog.trace.core.scopemanager.ContinuationDiagnostics; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.IdentityHashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicLong; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Test-time engine that records scope-continuation lifecycle events and renders leak reports. + * + *

It installs itself as the {@link ContinuationDiagnostics.Listener} while recording and + * correlates events by continuation identity (an {@link IdentityHashMap}, never {@code + * equals}/{@code hashCode}). It assumes a single test runs at a time per JVM (true for + * instrumentation tests); {@link #reset()} isolates one test from the next. + * + *

Usage: + * + *

+ *   ScopeDiagnostics.startRecording();
+ *   ... exercise code under test ...
+ *   log.info(ScopeDiagnostics.report().renderGantt());
+ *   ScopeDiagnostics.assertNoLeaks();   // optional
+ *   ScopeDiagnostics.stop();
+ * 
+ */ +public final class ScopeDiagnostics { + private static final Logger log = LoggerFactory.getLogger(ScopeDiagnostics.class); + private static final int DEFAULT_MAX_FRAMES = 6; + + private static final ScopeDiagnostics INSTANCE = new ScopeDiagnostics(); + + private final Map records = + Collections.synchronizedMap( + new IdentityHashMap()); + private final Map rootWrittenNanos = new ConcurrentHashMap<>(); + private final AtomicLong seq = new AtomicLong(); + private volatile StackFilter stackFilter = new StackFilter(DEFAULT_MAX_FRAMES); + + private final Listener listener = new Listener(); + + private ScopeDiagnostics() {} + + // ---- public static facade ------------------------------------------------ + + /** Clears any prior data and starts recording with the default stack depth. */ + public static void startRecording() { + startRecording(DEFAULT_MAX_FRAMES); + } + + /** Clears any prior data and starts recording, keeping up to {@code maxFrames} per stack. */ + public static void startRecording(int maxFrames) { + INSTANCE.reset(); + INSTANCE.stackFilter = new StackFilter(maxFrames); + ContinuationDiagnostics.install(INSTANCE.listener); + } + + /** Stops recording (uninstalls the listener). Recorded data remains queryable until reset. */ + public static void stop() { + ContinuationDiagnostics.clear(); + } + + /** Discards all recorded data. */ + public static void reset() { + INSTANCE.records.clear(); + INSTANCE.rootWrittenNanos.clear(); + INSTANCE.seq.set(0); + } + + /** Builds an immutable snapshot report of everything recorded so far. */ + public static ScopeDiagnosticsReport report() { + List snapshot; + synchronized (INSTANCE.records) { + snapshot = new ArrayList<>(INSTANCE.records.values()); + } + return new ScopeDiagnosticsReport(snapshot, new ConcurrentHashMap<>(INSTANCE.rootWrittenNanos)); + } + + /** + * Fails with an {@link AssertionError} (carrying the leak summary) if any never-resolved leak or + * double/invalid resolution was recorded. Late-after-root is reported but does not fail. + */ + public static void assertNoLeaks() { + ScopeDiagnosticsReport report = report(); + if (report.hasProblems()) { + throw new AssertionError( + "Scope continuation leaks detected:\n" + + report.renderSummary() + + "\n" + + report.renderGantt()); + } + } + + /** Writes the JSON report under {@code build/scope-diagnostics/.json}. */ + public static Path writeJson(String name) { + return write(name, ".json", report().toJson()); + } + + /** Writes the Mermaid Gantt timeline under {@code build/scope-diagnostics/.md}. */ + public static Path writeMermaid(String name) { + return write(name, ".md", report().toMermaidGantt()); + } + + private static Path write(String name, String extension, String content) { + String safe = name.replaceAll("[^A-Za-z0-9._-]", "_"); + Path path = Paths.get("build", "scope-diagnostics", safe + extension); + try { + Files.createDirectories(path.getParent()); + Files.write(path, content.getBytes(StandardCharsets.UTF_8)); + } catch (IOException e) { + log.warn("Failed to write scope diagnostics to {}", path, e); + } + return path; + } + + // ---- listener implementation --------------------------------------------- + + private ScopeEvent event(ScopeEvent.Type type) { + return new ScopeEvent( + type, + Thread.currentThread().getName(), + System.nanoTime(), + stackFilter.filter(new Throwable().getStackTrace())); + } + + private final class Listener implements ContinuationDiagnostics.Listener { + @Override + public void onCapture(AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { + try { + ContinuationRecord record = + new ContinuationRecord( + seq.getAndIncrement(), + traceId, + spanId, + source, + false, + event(ScopeEvent.Type.CAPTURE)); + records.put(id, record); + } catch (Throwable ignored) { + // diagnostics must never disturb the tracer + } + } + + @Override + public void onActivate( + AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { + try { + recordFor(id, traceId, spanId, source).addActivation(event(ScopeEvent.Type.ACTIVATE)); + } catch (Throwable ignored) { + } + } + + @Override + public void onResolve(AgentScope.Continuation id, boolean cancelled) { + try { + ScopeEvent.Type type = + cancelled ? ScopeEvent.Type.RESOLVE_CANCEL : ScopeEvent.Type.RESOLVE_FINISH; + recordFor(id, DDTraceId.ZERO, 0, (byte) -1).addResolution(event(type)); + } catch (Throwable ignored) { + } + } + + @Override + public void onRootWritten(DDTraceId traceId) { + try { + rootWrittenNanos.putIfAbsent(traceId, System.nanoTime()); + } catch (Throwable ignored) { + } + } + + /** Returns the record for an id, creating an orphan record if capture was not observed. */ + private ContinuationRecord recordFor( + AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { + synchronized (records) { + ContinuationRecord existing = records.get(id); + if (existing != null) { + return existing; + } + ContinuationRecord orphan = + new ContinuationRecord(seq.getAndIncrement(), traceId, spanId, source, true, null); + records.put(id, orphan); + return orphan; + } + } + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java new file mode 100644 index 00000000000..65ebcfc325d --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java @@ -0,0 +1,76 @@ +package datadog.trace.agent.test.scopediag; + +import java.lang.reflect.AnnotatedElement; +import java.util.Optional; +import org.junit.jupiter.api.extension.AfterEachCallback; +import org.junit.jupiter.api.extension.BeforeEachCallback; +import org.junit.jupiter.api.extension.ExtensionContext; +import org.junit.platform.commons.support.AnnotationSupport; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * JUnit5 extension that enables {@link ScopeDiagnostics} for tests annotated with {@link + * TrackScopeContinuations}. Registered on {@code AbstractInstrumentationTest} but dormant unless + * the test class or method carries the annotation, so unannotated tests pay nothing. + * + *

Per test: resets and starts recording before; after, renders the Gantt to the log, writes the + * JSON report, and optionally asserts no leaks. + */ +public final class ScopeDiagnosticsExtension implements BeforeEachCallback, AfterEachCallback { + private static final Logger log = LoggerFactory.getLogger(ScopeDiagnosticsExtension.class); + + @Override + public void beforeEach(ExtensionContext context) { + if (resolve(context) != null) { + ScopeDiagnostics.startRecording(); + } + } + + @Override + public void afterEach(ExtensionContext context) { + TrackScopeContinuations config = resolve(context); + if (config == null) { + return; + } + ScopeDiagnostics.stop(); + ScopeDiagnosticsReport report = ScopeDiagnostics.report(); + if (config.gantt()) { + log.info("[{}] {}", context.getDisplayName(), report.renderGantt()); + } else { + log.info("[{}] {}", context.getDisplayName(), report.renderSummary()); + } + if (config.json()) { + ScopeDiagnostics.writeJson(context.getUniqueId()); + } + if (config.mermaid()) { + log.info( + "[{}] Mermaid timeline written to {}", + context.getDisplayName(), + ScopeDiagnostics.writeMermaid(context.getUniqueId())); + } + try { + if (config.failOnLeak()) { + ScopeDiagnostics.assertNoLeaks(); + } + } finally { + ScopeDiagnostics.reset(); + } + } + + /** Method-level annotation wins; otherwise the test class (incl. inherited). */ + private static TrackScopeContinuations resolve(ExtensionContext context) { + Optional element = context.getElement(); + if (element.isPresent()) { + Optional onElement = + AnnotationSupport.findAnnotation(element.get(), TrackScopeContinuations.class); + if (onElement.isPresent()) { + return onElement.get(); + } + } + return context + .getTestClass() + .flatMap(c -> AnnotationSupport.findAnnotation(c, TrackScopeContinuations.class)) + .orElse(null); + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java new file mode 100644 index 00000000000..4b5aead05f3 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java @@ -0,0 +1,510 @@ +package datadog.trace.agent.test.scopediag; + +import datadog.trace.api.DDTraceId; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** + * An immutable snapshot of the recorded continuation lifecycles plus the derived leak findings. + * Renders three views: a text Gantt timeline, a leak-only summary, and a JSON document. + */ +public final class ScopeDiagnosticsReport { + /** A derived problem classification for a single continuation. */ + public enum Flag { + /** Captured but never resolved (neither finished nor cancelled) within the window. */ + LEAK, + /** Activated or resolved after the root span of its trace had already been written. */ + LATE, + /** Resolved more than once, or activated after being resolved. */ + DOUBLE + } + + private final List records; + private final Map rootWrittenNanos; + private final long t0; + private final Map> findings; + + ScopeDiagnosticsReport(List records, Map rootWrittenNanos) { + this.records = new ArrayList<>(records); + this.records.sort((a, b) -> Long.compare(a.firstNanos(), b.firstNanos())); + this.rootWrittenNanos = rootWrittenNanos; + this.t0 = computeT0(this.records); + this.findings = classify(this.records, rootWrittenNanos); + } + + private static long computeT0(List records) { + long min = Long.MAX_VALUE; + for (ContinuationRecord r : records) { + min = Math.min(min, r.firstNanos()); + } + return min == Long.MAX_VALUE ? 0 : min; + } + + private static Map> classify( + List records, Map rootWrittenNanos) { + Map> result = new LinkedHashMap<>(); + for (ContinuationRecord r : records) { + EnumSet flags = EnumSet.noneOf(Flag.class); + + if (!r.isResolved()) { + flags.add(Flag.LEAK); + } + + List resolutions = r.resolutions(); + List activations = r.activations(); + if (resolutions.size() > 1) { + flags.add(Flag.DOUBLE); + } + if (!resolutions.isEmpty()) { + long firstResolve = resolutions.get(0).nanos; + for (ScopeEvent a : activations) { + if (a.nanos > firstResolve) { + flags.add(Flag.DOUBLE); + break; + } + } + } + + Long rootNanos = rootWrittenNanos.get(r.traceId); + if (rootNanos != null) { + if (laterThan(activations, rootNanos) || laterThan(resolutions, rootNanos)) { + flags.add(Flag.LATE); + } + } + + if (!flags.isEmpty()) { + result.put(r, flags); + } + } + return result; + } + + private static boolean laterThan(List events, long nanos) { + for (ScopeEvent e : events) { + if (e.nanos > nanos) { + return true; + } + } + return false; + } + + public List records() { + return new ArrayList<>(records); + } + + public Map> findings() { + return new LinkedHashMap<>(findings); + } + + public int leakCount() { + return countWith(Flag.LEAK); + } + + public int lateCount() { + return countWith(Flag.LATE); + } + + public int doubleCount() { + return countWith(Flag.DOUBLE); + } + + private int countWith(Flag flag) { + int n = 0; + for (EnumSet flags : findings.values()) { + if (flags.contains(flag)) { + n++; + } + } + return n; + } + + /** True when there is a genuine bug to fail on: a never-resolved leak or a double resolution. */ + public boolean hasProblems() { + return leakCount() > 0 || doubleCount() > 0; + } + + // ---- rendering ----------------------------------------------------------- + + /** Full cross-thread Gantt timeline, one block per continuation. */ + public String renderGantt() { + StringBuilder sb = new StringBuilder(); + sb.append("Scope continuation timeline (") + .append(records.size()) + .append(" continuations, ") + .append(leakCount()) + .append(" leaked, ") + .append(lateCount()) + .append(" late-after-root, ") + .append(doubleCount()) + .append(" double/invalid)\n"); + if (records.isEmpty()) { + sb.append(" (no continuations captured)\n"); + return sb.toString(); + } + for (ContinuationRecord r : records) { + EnumSet flags = findings.getOrDefault(r, EnumSet.noneOf(Flag.class)); + sb.append("\n#") + .append(r.seq) + .append(" trace=") + .append(r.traceId) + .append(" span=") + .append(r.spanId) + .append(" src=") + .append(r.sourceName()); + if (r.orphan) { + sb.append(" [ORPHAN]"); + } + if (!flags.isEmpty()) { + sb.append(" ").append(flags); + } + sb.append('\n'); + appendEvent(sb, "capture ", r.capture()); + for (ScopeEvent a : r.activations()) { + appendEvent(sb, "activate", a); + } + for (ScopeEvent res : r.resolutions()) { + appendEvent(sb, res.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel " : "finish ", res); + } + if (!r.isResolved()) { + sb.append(" ").append("LEAKED ").append(" (never finished or cancelled)\n"); + } + } + return sb.toString(); + } + + private static final int GANTT_FRAMES = 3; + + private void appendEvent(StringBuilder sb, String label, ScopeEvent event) { + if (event == null) { + sb.append(" ").append(label).append(" (not observed)\n"); + return; + } + sb.append(" ") + .append(label) + .append(" +") + .append(relMillis(event.nanos)) + .append("ms @ ") + .append(event.threadName) + .append(" at ") + .append(event.callsite() == null ? "" : event.callsite()) + .append('\n'); + // a couple of caller frames give context when the top frame is generic (Thread.run, a + // listener dispatch, etc.) — they reveal which library/app code drove the event + StackTraceElement[] stack = event.stack; + if (stack != null) { + for (int i = 1; i < stack.length && i < GANTT_FRAMES; i++) { + sb.append(" from ").append(stack[i]).append('\n'); + } + } + } + + /** Leak-only summary: just the flagged continuations with their callsites. */ + public String renderSummary() { + StringBuilder sb = new StringBuilder(); + sb.append("Scope continuation problems: ") + .append(leakCount()) + .append(" leaked, ") + .append(lateCount()) + .append(" late-after-root, ") + .append(doubleCount()) + .append(" double/invalid\n"); + if (findings.isEmpty()) { + sb.append(" (none)\n"); + return sb.toString(); + } + for (Map.Entry> e : findings.entrySet()) { + ContinuationRecord r = e.getKey(); + ScopeEvent capture = r.capture(); + sb.append(" ") + .append(e.getValue()) + .append(" #") + .append(r.seq) + .append(" trace=") + .append(r.traceId) + .append(" src=") + .append(r.sourceName()) + .append(" captured at ") + .append(capture == null || capture.callsite() == null ? "" : capture.callsite()) + .append('\n'); + } + return sb.toString(); + } + + /** + * Renders a Mermaid Gantt diagram (wrapped in a ```mermaid fence) that renders inline in GitHub, + * GitLab, and most IDEs. Continuations are grouped into swimlanes by the thread where they were + * activated/resolved; each is a bar from capture to resolution. Leaks extend to the end of the + * window and are marked {@code crit} (red); late-after-root is marked {@code active}. Times are + * milliseconds relative to the first event (rounded — this is an overview, not a profiler). + */ + public String toMermaidGantt() { + StringBuilder sb = new StringBuilder(); + sb.append("```mermaid\n"); + sb.append("gantt\n"); + sb.append(" title Scope continuations (") + .append(records.size()) + .append(" total, ") + .append(leakCount()) + .append(" leaked, ") + .append(lateCount()) + .append(" late, ") + .append(doubleCount()) + .append(" double)\n"); + sb.append(" dateFormat x\n"); + sb.append(" axisFormat %Lms\n"); + if (records.isEmpty()) { + sb.append(" section none\n no continuations captured :done, 0, 1\n```\n"); + return sb.toString(); + } + + long windowEnd = relMillisRounded(maxNanos()); + + // A continuation contributes a "cap" marker in its capture-thread lane and (when it ran) a + // run/finish bar in its activation/resolution-thread lane. The same #seq appearing in two lanes + // makes the cross-thread hop visible: captured here, continued/cancelled there. + Map> lanes = new LinkedHashMap<>(); + for (ContinuationRecord r : records) { + EnumSet flags = findings.getOrDefault(r, EnumSet.noneOf(Flag.class)); + ScopeEvent capture = r.capture(); + List activations = r.activations(); + List resolutions = r.resolutions(); + boolean leak = flags.contains(Flag.LEAK); + + if (capture != null) { + long start = relMillisRounded(capture.nanos); + if (leak && activations.isEmpty()) { + // captured but never even activated: leak lives, open-ended, in the capture lane + addTask( + lanes, + capture.threadName, + "#" + r.seq + " LEAK cap " + where(capture), + "crit", + start, + Math.max(start + 1, windowEnd)); + } else { + addTask( + lanes, + capture.threadName, + "cap #" + r.seq + " " + where(capture), + "milestone", + start, + start); + } + } + + if (!activations.isEmpty()) { + ScopeEvent firstAct = activations.get(0); + long runStart = relMillisRounded(firstAct.nanos); + if (resolutions.isEmpty()) { + // activated but never closed: open-ended leak in the activation lane + addTask( + lanes, + firstAct.threadName, + "#" + r.seq + " LEAK ran " + where(firstAct), + "crit", + runStart, + Math.max(runStart + 1, windowEnd)); + } else { + ScopeEvent lastRes = resolutions.get(resolutions.size() - 1); + long runEnd = relMillisRounded(lastRes.nanos); + if (runEnd <= runStart) { + runEnd = runStart + 1; + } + String verb = lastRes.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel" : "fin"; + String tag = + flags.contains(Flag.DOUBLE) ? "crit" : flags.contains(Flag.LATE) ? "active" : "done"; + addTask( + lanes, + lastRes.threadName, + "#" + r.seq + " " + verb + " " + where(lastRes), + tag, + runStart, + runEnd); + } + } else if (!resolutions.isEmpty()) { + // resolved without an observed activation (e.g. cancelled before use) + ScopeEvent lastRes = resolutions.get(resolutions.size() - 1); + long start = relMillisRounded(lastRes.nanos); + String verb = lastRes.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel" : "fin"; + addTask( + lanes, + lastRes.threadName, + "#" + r.seq + " " + verb + " " + where(lastRes), + "done", + start, + start + 1); + } + } + + for (Map.Entry> lane : lanes.entrySet()) { + sb.append(" section ").append(sanitize(lane.getKey())).append('\n'); + for (String task : lane.getValue()) { + sb.append(" ").append(task).append('\n'); + } + } + sb.append("```\n"); + return sb.toString(); + } + + private void addTask( + Map> lanes, + String thread, + String label, + String tag, + long start, + long end) { + lanes + .computeIfAbsent(thread, k -> new ArrayList<>()) + .add(sanitize(label) + " :" + tag + ", " + start + ", " + end); + } + + /** Short "SimpleClass.method" for a Mermaid label, or "?" when the callsite was filtered out. */ + private static String where(ScopeEvent event) { + StackTraceElement frame = event == null ? null : event.callsite(); + if (frame == null) { + return "?"; + } + String className = frame.getClassName(); + int dot = className.lastIndexOf('.'); + String simple = dot >= 0 ? className.substring(dot + 1) : className; + return simple + "." + frame.getMethodName(); + } + + private long maxNanos() { + long max = t0; + for (ContinuationRecord r : records) { + ScopeEvent c = r.capture(); + if (c != null) { + max = Math.max(max, c.nanos); + } + for (ScopeEvent e : r.activations()) { + max = Math.max(max, e.nanos); + } + for (ScopeEvent e : r.resolutions()) { + max = Math.max(max, e.nanos); + } + } + return max; + } + + private long relMillisRounded(long nanos) { + return Math.round((nanos - t0) / 1_000_000.0); + } + + /** Strips characters that would break Mermaid task/section syntax. */ + private static String sanitize(String s) { + return s.replace(':', ' ').replace(',', ' ').replace(';', ' ').replace('\n', ' '); + } + + /** Machine-readable view for CI artifacts and later tooling. */ + public String toJson() { + StringBuilder sb = new StringBuilder(); + sb.append("{\"continuations\":["); + for (int i = 0; i < records.size(); i++) { + if (i > 0) { + sb.append(','); + } + appendRecordJson(sb, records.get(i)); + } + sb.append("],\"summary\":{\"total\":") + .append(records.size()) + .append(",\"leaked\":") + .append(leakCount()) + .append(",\"lateAfterRoot\":") + .append(lateCount()) + .append(",\"doubleOrInvalid\":") + .append(doubleCount()) + .append("}}"); + return sb.toString(); + } + + private void appendRecordJson(StringBuilder sb, ContinuationRecord r) { + EnumSet flags = findings.getOrDefault(r, EnumSet.noneOf(Flag.class)); + sb.append("{\"seq\":") + .append(r.seq) + .append(",\"traceId\":\"") + .append(r.traceId) + .append("\",\"spanId\":") + .append(r.spanId) + .append(",\"source\":\"") + .append(r.sourceName()) + .append("\",\"orphan\":") + .append(r.orphan) + .append(",\"flags\":["); + boolean first = true; + for (Flag f : flags) { + if (!first) { + sb.append(','); + } + sb.append('"').append(f).append('"'); + first = false; + } + sb.append("],\"capture\":"); + appendEventJson(sb, r.capture()); + sb.append(",\"activations\":["); + appendEventsJson(sb, r.activations()); + sb.append("],\"resolutions\":["); + appendEventsJson(sb, r.resolutions()); + sb.append("]}"); + } + + private void appendEventsJson(StringBuilder sb, List events) { + for (int i = 0; i < events.size(); i++) { + if (i > 0) { + sb.append(','); + } + appendEventJson(sb, events.get(i)); + } + } + + private void appendEventJson(StringBuilder sb, ScopeEvent event) { + if (event == null) { + sb.append("null"); + return; + } + sb.append("{\"type\":\"") + .append(event.type) + .append("\",\"thread\":\"") + .append(jsonEscape(event.threadName)) + .append("\",\"relMillis\":") + .append(relMillis(event.nanos)) + .append(",\"callsite\":\"") + .append(jsonEscape(event.callsite() == null ? "" : event.callsite().toString())) + .append("\"}"); + } + + private String relMillis(long nanos) { + double ms = (nanos - t0) / 1_000_000.0; + return String.format("%.3f", ms); + } + + private static String jsonEscape(String s) { + StringBuilder out = new StringBuilder(s.length() + 8); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + switch (c) { + case '"': + out.append("\\\""); + break; + case '\\': + out.append("\\\\"); + break; + case '\n': + out.append("\\n"); + break; + case '\r': + out.append("\\r"); + break; + case '\t': + out.append("\\t"); + break; + default: + out.append(c); + } + } + return out.toString(); + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeEvent.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeEvent.java new file mode 100644 index 00000000000..5bc6bb203e3 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeEvent.java @@ -0,0 +1,32 @@ +package datadog.trace.agent.test.scopediag; + +/** + * A single observed point in a continuation's lifecycle. Time, thread, and stack are captured by + * the recorder on the event's own thread (notifications are synchronous), so they reflect the + * thread that actually captured/activated/resolved the continuation. + */ +public final class ScopeEvent { + public enum Type { + CAPTURE, + ACTIVATE, + RESOLVE_FINISH, + RESOLVE_CANCEL + } + + public final Type type; + public final String threadName; + public final long nanos; + public final StackTraceElement[] stack; + + ScopeEvent(Type type, String threadName, long nanos, StackTraceElement[] stack) { + this.type = type; + this.threadName = threadName; + this.nanos = nanos; + this.stack = stack; + } + + /** The most relevant (top, post-filter) frame, or {@code null} if none survived filtering. */ + public StackTraceElement callsite() { + return stack != null && stack.length > 0 ? stack[0] : null; + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeSources.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeSources.java new file mode 100644 index 00000000000..7bf7a81fa3b --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeSources.java @@ -0,0 +1,25 @@ +package datadog.trace.agent.test.scopediag; + +/** + * Maps the {@code byte} scope source used by the tracer to a readable name. Mirrors the constants + * in {@code datadog.trace.core.scopemanager.ContinuableScope} (which are package-private and not + * visible from here). + */ +final class ScopeSources { + private ScopeSources() {} + + static String name(byte source) { + switch (source) { + case 0: + return "INSTRUMENTATION"; + case 1: + return "MANUAL"; + case 2: + return "ITERATION"; + case 3: + return "CONTEXT"; + default: + return "UNKNOWN(" + source + ")"; + } + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/StackFilter.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/StackFilter.java new file mode 100644 index 00000000000..17b53169061 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/StackFilter.java @@ -0,0 +1,71 @@ +package datadog.trace.agent.test.scopediag; + +import java.util.ArrayList; +import java.util.List; + +/** + * Trims a raw stack trace down to the frames that point at where a continuation was + * captured/activated/resolved: it drops the diagnostic plumbing, the scope-manager internals, and + * the executor/reflection scaffolding, keeping the top {@code maxFrames} meaningful frames. + */ +final class StackFilter { + private static final String[] DROP_PREFIXES = { + // diagnostic harness itself + "datadog.trace.agent.test.scopediag.", + // tracer scope/continuation machinery and the capture/activate plumbing it sits behind + "datadog.trace.core.", + "datadog.trace.bootstrap.instrumentation.java.concurrent.", + "datadog.trace.bootstrap.instrumentation.api.", + "datadog.trace.bootstrap.InstrumentationContext", + // JDK executor/reflection scaffolding between the caller and the capture + "java.lang.Thread.getStackTrace", + "java.util.concurrent.ThreadPoolExecutor", + "java.util.concurrent.ScheduledThreadPoolExecutor", + "java.util.concurrent.ForkJoinPool", + "java.util.concurrent.ForkJoinWorkerThread", + "java.util.concurrent.Executors$", + "java.util.concurrent.FutureTask", + "java.util.concurrent.CompletableFuture", + "jdk.internal.reflect.", + "java.lang.reflect.", + "sun.reflect.", + // Spock/Groovy/ByteBuddy mock & dynamic-dispatch scaffolding (test harness, not a callsite) + "org.spockframework.mock.", + "org.codehaus.groovy.", + "groovy.lang.", + "net.bytebuddy.", + }; + + private final int maxFrames; + + StackFilter(int maxFrames) { + this.maxFrames = maxFrames; + } + + StackTraceElement[] filter(StackTraceElement[] raw) { + if (raw == null) { + return new StackTraceElement[0]; + } + List kept = new ArrayList<>(maxFrames); + for (StackTraceElement frame : raw) { + if (isDropped(frame)) { + continue; + } + kept.add(frame); + if (kept.size() >= maxFrames) { + break; + } + } + return kept.toArray(new StackTraceElement[0]); + } + + private static boolean isDropped(StackTraceElement frame) { + String fqn = frame.getClassName() + "." + frame.getMethodName(); + for (String prefix : DROP_PREFIXES) { + if (fqn.startsWith(prefix) || frame.getClassName().startsWith(prefix)) { + return true; + } + } + return false; + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/TrackScopeContinuations.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/TrackScopeContinuations.java new file mode 100644 index 00000000000..e7301e92ef4 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/TrackScopeContinuations.java @@ -0,0 +1,31 @@ +package datadog.trace.agent.test.scopediag; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Inherited; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +/** + * Opt-in marker that enables the scope-continuation leak diagnostic for an instrumentation test + * class or method. When absent the diagnostic stays dormant, so existing tests are unaffected. + * + *

Honored by both the JUnit5 {@link ScopeDiagnosticsExtension} and the Groovy/Spock {@code + * InstrumentationSpecification}. + */ +@Retention(RetentionPolicy.RUNTIME) +@Target({ElementType.TYPE, ElementType.METHOD}) +@Inherited +public @interface TrackScopeContinuations { + /** Fail the test if a never-resolved leak or double/invalid resolution is detected. */ + boolean failOnLeak() default false; + + /** Render the full text Gantt timeline to the log after the test. */ + boolean gantt() default true; + + /** Write the JSON report under {@code build/scope-diagnostics/}. */ + boolean json() default true; + + /** Write a Mermaid Gantt timeline ({@code .md}) under {@code build/scope-diagnostics/}. */ + boolean mermaid() default true; +} diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java new file mode 100644 index 00000000000..16f3779884e --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java @@ -0,0 +1,71 @@ +package datadog.trace.agent.test.scopediag; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import datadog.trace.bootstrap.instrumentation.api.AgentScope; +import datadog.trace.bootstrap.instrumentation.api.AgentSpan; +import datadog.trace.common.writer.ListWriter; +import datadog.trace.core.CoreTracer; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; + +/** + * Exercises the full path: a real {@link CoreTracer} capturing continuations via {@code + * captureSpan} drives {@code ScopeContinuation} -> {@code ContinuationDiagnostics} -> {@link + * ScopeDiagnostics}, and the derived report classifies the leak correctly. + */ +class ScopeDiagnosticsIntegrationTest { + + private CoreTracer tracer; + + @AfterEach + void tearDown() { + ScopeDiagnostics.stop(); + ScopeDiagnostics.reset(); + if (tracer != null) { + tracer.close(); + } + } + + @Test + void capturesRealLeakAndResolvedContinuation() { + tracer = CoreTracer.builder().writer(new ListWriter()).strictTraceWrites(false).build(); + + ScopeDiagnostics.startRecording(); + + AgentSpan span = tracer.startSpan("test", "op"); + AgentScope.Continuation leaked = tracer.captureSpan(span); // captured, never resolved + AgentScope.Continuation resolved = tracer.captureSpan(span); + resolved.cancel(); + + ScopeDiagnosticsReport report = ScopeDiagnostics.report(); + + assertEquals(2, report.records().size(), "both captures recorded"); + assertEquals(1, report.leakCount(), "exactly the un-resolved continuation leaks"); + assertTrue(report.hasProblems()); + // keep a reference so the leak isn't reclaimed before the assertion + assertFalse(leaked.toString().isEmpty()); + + span.finish(); + } + + @Test + void resolvedContinuationDoesNotLeak() { + tracer = CoreTracer.builder().writer(new ListWriter()).strictTraceWrites(false).build(); + + ScopeDiagnostics.startRecording(); + + AgentSpan span = tracer.startSpan("test", "op"); + AgentScope.Continuation continuation = tracer.captureSpan(span); + AgentScope scope = continuation.activate(); + scope.close(); + span.finish(); + + ScopeDiagnosticsReport report = ScopeDiagnostics.report(); + + assertEquals(1, report.records().size()); + assertEquals(0, report.leakCount(), "activated then closed continuation is resolved"); + } +} diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java new file mode 100644 index 00000000000..5303172be58 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java @@ -0,0 +1,145 @@ +package datadog.trace.agent.test.scopediag; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import datadog.trace.api.DDTraceId; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import org.junit.jupiter.api.Test; + +class ScopeDiagnosticsReportTest { + + private static final StackTraceElement[] STACK = { + new StackTraceElement("com.app.Worker", "submit", "Worker.java", 42) + }; + + private static ScopeEvent event(ScopeEvent.Type type, String thread, long nanos) { + return new ScopeEvent(type, thread, nanos, STACK); + } + + private static ContinuationRecord record(long seq, DDTraceId trace) { + return new ContinuationRecord( + seq, trace, 7L, (byte) 0, false, event(ScopeEvent.Type.CAPTURE, "main", 1000)); + } + + @Test + void resolvedContinuationHasNoFlags() { + ContinuationRecord r = record(0, DDTraceId.from(10)); + r.addActivation(event(ScopeEvent.Type.ACTIVATE, "pool-1", 2000)); + r.addResolution(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3000)); + + ScopeDiagnosticsReport report = report(list(r), map()); + + assertEquals(0, report.leakCount()); + assertEquals(0, report.lateCount()); + assertEquals(0, report.doubleCount()); + assertFalse(report.hasProblems()); + } + + @Test + void neverResolvedIsFlaggedAsLeak() { + ContinuationRecord r = record(0, DDTraceId.from(11)); + + ScopeDiagnosticsReport report = report(list(r), map()); + + assertEquals(1, report.leakCount()); + assertTrue(report.hasProblems()); + assertTrue(report.renderSummary().contains("LEAK")); + assertTrue(report.renderGantt().contains("Worker.java:42")); + } + + @Test + void resolutionAfterRootWriteIsFlaggedLate() { + DDTraceId trace = DDTraceId.from(12); + ContinuationRecord r = record(0, trace); + r.addActivation(event(ScopeEvent.Type.ACTIVATE, "pool-1", 5000)); + r.addResolution(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 6000)); + + Map rootWritten = map(); + rootWritten.put(trace, 4000L); // root written before the activation/resolution + + ScopeDiagnosticsReport report = report(list(r), rootWritten); + + assertEquals(1, report.lateCount()); + assertEquals(0, report.leakCount()); // it is resolved, just late + } + + @Test + void multipleResolutionsAreFlaggedDouble() { + ContinuationRecord r = record(0, DDTraceId.from(13)); + r.addActivation(event(ScopeEvent.Type.ACTIVATE, "pool-1", 2000)); + r.addResolution(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3000)); + r.addResolution(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3500)); + + ScopeDiagnosticsReport report = report(list(r), map()); + + assertEquals(1, report.doubleCount()); + assertTrue(report.hasProblems()); + } + + @Test + void activationAfterResolutionIsFlaggedDouble() { + ContinuationRecord r = record(0, DDTraceId.from(14)); + r.addResolution(event(ScopeEvent.Type.RESOLVE_CANCEL, "pool-1", 2000)); + r.addActivation(event(ScopeEvent.Type.ACTIVATE, "pool-2", 3000)); // activate after cancel + + ScopeDiagnosticsReport report = report(list(r), map()); + + assertEquals(1, report.doubleCount()); + } + + @Test + void jsonContainsSummaryAndRecords() { + ContinuationRecord r = record(0, DDTraceId.from(15)); + ScopeDiagnosticsReport report = report(list(r), map()); + + String json = report.toJson(); + assertTrue(json.contains("\"summary\"")); + assertTrue(json.contains("\"leaked\":1")); + assertTrue(json.contains("\"flags\":[\"LEAK\"]")); + } + + @Test + void mermaidGanttGroupsByThreadAndMarksLeakCrit() { + DDTraceId trace = DDTraceId.from(20); + ContinuationRecord resolved = record(0, trace); + resolved.addActivation(event(ScopeEvent.Type.ACTIVATE, "pool-1", 2000)); + resolved.addResolution(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3000)); + ContinuationRecord leak = record(1, trace); // capture only -> leak + + String mermaid = report(list(resolved, leak), map()).toMermaidGantt(); + + assertTrue(mermaid.startsWith("```mermaid")); + assertTrue(mermaid.contains("gantt")); + assertTrue(mermaid.contains("dateFormat x")); + assertTrue(mermaid.contains("section main")); // capture lane = capture thread + assertTrue(mermaid.contains("section pool-1")); // run lane = resolution thread + assertTrue(mermaid.contains("cap #0 Worker.submit :milestone")); // where captured + assertTrue(mermaid.contains("#0 fin Worker.submit :done")); // where finished + assertTrue(mermaid.contains("#1 LEAK cap Worker.submit :crit")); // leak in capture lane + assertTrue(mermaid.trim().endsWith("```")); + } + + // ---- helpers ------------------------------------------------------------- + + private static ScopeDiagnosticsReport report( + List records, Map rootWritten) { + return new ScopeDiagnosticsReport(records, rootWritten); + } + + private static List list(ContinuationRecord... rs) { + List l = new ArrayList<>(); + for (ContinuationRecord r : rs) { + l.add(r); + } + return l; + } + + private static Map map() { + return new HashMap<>(); + } +} diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/StackFilterTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/StackFilterTest.java new file mode 100644 index 00000000000..3853ab274fd --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/StackFilterTest.java @@ -0,0 +1,53 @@ +package datadog.trace.agent.test.scopediag; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +class StackFilterTest { + + private static StackTraceElement frame(String cls, String method) { + return new StackTraceElement(cls, method, "Src.java", 1); + } + + @Test + void dropsPlumbingAndKeepsAppFrames() { + StackTraceElement[] raw = { + frame("java.lang.Thread", "getStackTrace"), + frame("datadog.trace.agent.test.scopediag.ScopeDiagnostics", "event"), + frame("datadog.trace.core.scopemanager.ScopeContinuation", "register"), + frame("java.util.concurrent.ThreadPoolExecutor", "execute"), + frame("com.app.Service", "doWork"), + frame("com.app.Main", "main"), + }; + + StackTraceElement[] filtered = new StackFilter(6).filter(raw); + + assertEquals(2, filtered.length); + assertEquals("com.app.Service", filtered[0].getClassName()); + assertEquals("com.app.Main", filtered[1].getClassName()); + } + + @Test + void respectsMaxFrames() { + StackTraceElement[] raw = { + frame("com.app.A", "a"), frame("com.app.B", "b"), frame("com.app.C", "c"), + }; + + assertEquals(2, new StackFilter(2).filter(raw).length); + } + + @Test + void handlesNullStack() { + assertEquals(0, new StackFilter(6).filter(null).length); + } + + @Test + void keepsScopeManagerFreeStacks() { + StackTraceElement[] raw = {frame("com.app.Only", "here")}; + StackTraceElement[] filtered = new StackFilter(6).filter(raw); + assertEquals(1, filtered.length); + assertTrue(filtered[0].getClassName().startsWith("com.app")); + } +} diff --git a/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy b/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy index 09892853956..f6bed488b21 100644 --- a/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy +++ b/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy @@ -21,6 +21,7 @@ import com.google.pubsub.v1.SubscriptionName import com.google.pubsub.v1.TopicName import datadog.trace.agent.test.asserts.TraceAssert import datadog.trace.agent.test.naming.VersionedNamingTestBase +import datadog.trace.agent.test.scopediag.TrackScopeContinuations import datadog.trace.agent.test.utils.TraceUtils import datadog.trace.api.DDSpanTypes import datadog.trace.api.DDTags @@ -39,6 +40,9 @@ import org.testcontainers.containers.PubSubEmulatorContainer import org.testcontainers.utility.DockerImageName import spock.lang.Shared +import java.nio.charset.StandardCharsets + +@TrackScopeContinuations abstract class PubSubTest extends VersionedNamingTestBase { private static final String PROJECT_ID = "dd-trace-java" diff --git a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java index 3372404a42f..559c0b9252f 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java @@ -7,6 +7,7 @@ import datadog.trace.bootstrap.instrumentation.api.AgentScope; import datadog.trace.core.CoreTracer.ConfigSnapshot; import datadog.trace.core.monitor.HealthMetrics; +import datadog.trace.core.scopemanager.ContinuationDiagnostics; import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -376,6 +377,7 @@ private int write(boolean isPartial) { synchronized (this) { if (!isPartial) { rootSpanWritten = true; + ContinuationDiagnostics.notifyRootWritten(traceId); } int size = size(); boolean writeRunningSpans = diff --git a/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ContinuationDiagnostics.java b/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ContinuationDiagnostics.java new file mode 100644 index 00000000000..ca9065bfcd9 --- /dev/null +++ b/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ContinuationDiagnostics.java @@ -0,0 +1,73 @@ +package datadog.trace.core.scopemanager; + +import datadog.trace.api.DDTraceId; +import datadog.trace.bootstrap.instrumentation.api.AgentScope; + +/** + * Test-time diagnostic seam for tracking the lifecycle of scope {@link AgentScope.Continuation + * continuations} as they are captured on one thread and activated/cancelled on another. + * + *

This class is inert in production: the {@link Listener} is {@code null} unless test code + * installs one via {@link #install(Listener)}. Each notification site reads the listener once + * through a single {@code volatile} read and skips the call when none is installed, so the cost + * when disabled is one volatile read per scope lifecycle event with no allocation and no behavior + * change. + * + *

The seam intentionally carries only continuation identity plus the trace/span ids and + * {@code source}. Wall-clock time, the current thread, and the capturing/activating/resolving stack + * trace are gathered by the listener itself: notifications are synchronous and run on the event's + * own thread, so the listener observes the correct thread and call stack. + */ +public final class ContinuationDiagnostics { + /** + * Receives scope-continuation lifecycle events. Implementations live in test code and must never + * throw back into the tracer (callers do not guard against exceptions). + */ + public interface Listener { + /** A continuation was captured (registered) and may be transferred to another thread. */ + void onCapture(AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source); + + /** A captured continuation was activated, resuming the scope (possibly on another thread). */ + void onActivate(AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source); + + /** + * A continuation was resolved: either finished normally (all activations closed or a clean + * cancel) or cancelled while activations/holds were outstanding. + * + * @param cancelled {@code true} if resolved via the cancel-with-outstanding-work path. + */ + void onResolve(AgentScope.Continuation id, boolean cancelled); + + /** The root span of the given trace has been written (the trace's root close signal). */ + void onRootWritten(DDTraceId traceId); + } + + private static volatile Listener listener; + + private ContinuationDiagnostics() {} + + /** Installs the diagnostic listener. Intended for test code only. */ + public static void install(Listener listener) { + ContinuationDiagnostics.listener = listener; + } + + /** Removes any installed diagnostic listener, returning the seam to its inert state. */ + public static void clear() { + ContinuationDiagnostics.listener = null; + } + + static Listener listener() { + return listener; + } + + /** + * Notifies the installed listener (if any) that the root span of the given trace has been + * written. Safe to call from any package; inert when no listener is installed. + */ + public static void notifyRootWritten(DDTraceId traceId) { + final Listener l = listener; + if (l != null) { + l.onRootWritten(traceId); + } + } +} diff --git a/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java b/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java index 364f1ac402b..210b44ea30b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java @@ -64,6 +64,13 @@ final class ScopeContinuation implements AgentScope.Continuation { ScopeContinuation register() { traceCollector.registerContinuation(this); scopeManager.healthMetrics.onCaptureContinuation(); + final ContinuationDiagnostics.Listener listener = ContinuationDiagnostics.listener(); + if (listener != null) { + final AgentSpan span = span(); + if (span != null) { + listener.onCapture(this, span.getTraceId(), span.getSpanId(), source); + } + } return this; } @@ -78,6 +85,13 @@ public AgentScope.Continuation hold() { public AgentScope activate() { if (COUNT.incrementAndGet(this) > 0) { // speculative update succeeded, continuation can be activated + final ContinuationDiagnostics.Listener listener = ContinuationDiagnostics.listener(); + if (listener != null) { + final AgentSpan span = span(); + if (span != null) { + listener.onActivate(this, span.getTraceId(), span.getSpanId(), source); + } + } return scopeManager.continueSpan(this, context, source); } else { // continuation cancelled or too many activations; rollback count @@ -99,11 +113,20 @@ public void cancel() { if (COUNT.compareAndSet(this, current, CANCELLED)) { traceCollector.removeContinuation(this); scopeManager.healthMetrics.onFinishContinuation(); + notifyResolve(false); return; } current = count; } scopeManager.healthMetrics.onCancelContinuation(); + notifyResolve(true); + } + + private void notifyResolve(final boolean cancelled) { + final ContinuationDiagnostics.Listener listener = ContinuationDiagnostics.listener(); + if (listener != null) { + listener.onResolve(this, cancelled); + } } void cancelFromContinuedScopeClose() { @@ -111,6 +134,7 @@ void cancelFromContinuedScopeClose() { // fast path: only one activation of the continuation (no hold) traceCollector.removeContinuation(this); scopeManager.healthMetrics.onFinishContinuation(); + notifyResolve(false); } else if (COUNT.decrementAndGet(this) == 0) { // slow path: multiple activations, all have now closed (no hold) cancel(); diff --git a/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md b/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md new file mode 100644 index 00000000000..3149bc249ae --- /dev/null +++ b/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md @@ -0,0 +1,197 @@ +# Cross-thread scope-continuation leak diagnostic — design + +**Date:** 2026-06-10 +**Status:** Approved (design); implementation pending +**Scope:** Test-time only + +## Problem + +dd-trace-java transfers trace scopes between threads via *continuations*: a scope is +**captured** on one thread (`ScopeContinuation`, registered with the trace collector, +bumping `PendingTrace.pendingReferenceCount`) and later **activated** and/or **cancelled** +on another thread/executor. When a captured continuation is never resolved (neither +activated nor cancelled), or is resolved after the root span has already finished, the +`PendingTraceBuffer` either keeps the trace alive past the root close or discards a span +that arrives too late. In tests this surfaces as the need to disable `strictTraceWrites`, +masking the underlying leak instead of locating it. + +There is no way today to see **where** a continuation was captured, on **which thread** it +was activated/cancelled, and **whether** it leaked. The existing `HealthMetrics` +callbacks (`onCaptureContinuation` / `onFinishContinuation` / `onCancelContinuation`) are +**no-arg** — they carry no continuation identity and no trace id, so capture→activate→cancel +cannot be correlated and "late after root close" cannot be computed. + +## Goal + +A passive, test-time diagnostic that records the lifecycle of every scope continuation with +correlated identity, captures the capture/activate/resolve **callsites** (stack traces), +and renders a Gantt-style timeline plus a leak summary, so a developer can locate the +instrumentation that leaks continuations across threads. + +### Findings the diagnostic must derive +1. **Never-resolved captures** — captured but never activated *and* never cancelled within + the recording window. The classic leak that keeps `pendingReferenceCount > 0` and forces + `strictTraceWrites(false)`. +2. **Cross-thread timeline** — full Gantt of every capture→activate/resolve: capture + thread + callsite, activation thread + callsite, latency, and whether resolution happened + on a different thread/executor (even for non-leaking continuations). +3. **Late activation after root close** — captures that *are* resolved, but activated/closed + after the root span of their trace already finished (`rootSpanWritten`). +4. **Double / invalid resolution** — a continuation activated after cancel, resolved more + than once, or otherwise mishandled in the capture/activate handshake. + +## Non-goals +- Production runtime diagnostic (this is test-time only; production stays inert). +- HTML/visual timeline rendering (text + JSON only). +- Failing tests by default — passive/report-only unless explicitly opted in. +- Parallel in-JVM test execution support (instrumentation tests run one-at-a-time per JVM). + +## Architecture + +Four layers: a minimal production **seam** in `dd-trace-core`, a test-side **recorder +engine**, **renderers**, and **harness integration** (JUnit5 + Spock + static API). + +### 1. Production seam — `dd-trace-core` (`datadog.trace.core.scopemanager`) + +A new `ContinuationDiagnostics` holder, inert in production (listener is `null` unless test +code installs one): + +```java +public final class ContinuationDiagnostics { + public interface Listener { + void onCapture(AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source); + void onActivate(AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source); + void onResolve(AgentScope.Continuation id, boolean cancelled); // finish vs cancel + void onRootWritten(DDTraceId traceId); + } + private static volatile Listener LISTENER; // null in prod + public static void install(Listener l) { LISTENER = l; } + public static void clear() { LISTENER = null; } + static Listener listener() { return LISTENER; } // single volatile read +} +``` + +**Identity** is the `AgentScope.Continuation` instance itself (`ScopeContinuation implements +AgentScope.Continuation`), used only as an identity key downstream (never `equals`/`hashCode`). +`AgentScope.Continuation` lives in `internal-api`, so it is a safe public identity type. + +**Call sites** — each guarded by `Listener l = listener(); if (l != null) { ... }`: + +| Location | Method | Notify | +|---|---|---| +| `ScopeContinuation.register()` | after `traceCollector.registerContinuation(this)` | `onCapture(this, traceId, spanId, source)` | +| `ScopeContinuation.activate()` | successful branch (`COUNT.incrementAndGet(this) > 0`) | `onActivate(this, traceId, spanId, source)` | +| `ScopeContinuation.cancel()` | mirror `onFinishContinuation` (count==0) → `onResolve(this,false)`; mirror `onCancelContinuation` → `onResolve(this,true)` | +| `ScopeContinuation.cancelFromContinuedScopeClose()` | where it calls `onFinishContinuation` | `onResolve(this, false)` | +| `PendingTrace.write()` | non-partial path where `rootSpanWritten = true` | `onRootWritten(traceId)` | + +`traceId`/`spanId` at capture/activate are obtained from the available `context` via +`spanFromContext(context)` (already imported in that package). `source` is the existing +`byte` source field (`INSTRUMENTATION`/`MANUAL`/`ITERATION`/`CONTEXT`). + +**The seam carries only identity + ids + source.** Timestamp, thread, and stack trace are +captured by the recorder *inside* the callback — callbacks run synchronously on the event's +own thread, so `Thread.currentThread()` and `new Throwable().getStackTrace()` are accurate +for the capturing/activating/resolving thread. This keeps the production footprint to a +handful of null-guarded calls with zero behavior change and zero allocation when off. + +### 2. Recorder engine — `:dd-java-agent:instrumentation-testing` (`datadog.trace.agent.test.scopediag`) + +`ScopeDiagnostics` — static facade and the `Listener` implementation that installs itself: + +- API: `startRecording()`, `reset()`, `stop()`, `report() -> ScopeDiagnosticsReport`, + `assertNoLeaks()`. +- Correlation store: `Collections.synchronizedMap(new IdentityHashMap())` (identity-keyed; never calls `equals`/`hashCode` on the continuation). +- `ContinuationRecord`: monotonic seq id, `traceId`, `spanId`, `source`, the capture + `Event`, a list of activation `Event`s, and the resolution `Event`(s). +- `Event`: `{ threadName, nanos (System.nanoTime), filtered StackTraceElement[] }`. +- `Map rootWrittenNanos` for late-after-root detection. +- A global, time-ordered event list backing the timeline. +- **Stack filter**: drops frames in `datadog.trace.core.scopemanager`, the diagnostics + package, and `java.util.concurrent` executor internals; keeps the top N meaningful frames + (default 5, configurable). Goal: surface the integration advice + app callsite. + +**Derived findings** computed at `report()` time from the records + `rootWrittenNanos`: +never-resolved, late-after-root, double/invalid, and the full cross-thread timeline. + +### 3. Renderers + +- **Text Gantt → SLF4J logger** (default output while recording): one row per continuation — + seq, trace/span, source, `capture thread@callsite`, `activate +Δms thread@callsite`, + `resolve(type) +Δms`, and a flag column (`LEAK` / `LATE` / `DBL`). Times are relative to the + first event in the window. +- **Leak-only summary**: only the flagged problems, each with its callsite(s) — a quick scan + / failure message. +- **JSON → `build/scope-diagnostics/.json`**: events + derived findings, for CI + artifacts and later tooling. + +### 4. Harness integration (passive by default — no impact on existing tests) + +- **Static API** (works anywhere, incl. Groovy and non-JUnit code): + `ScopeDiagnostics.startRecording()` / `report()` / `assertNoLeaks()`. +- **JUnit5**: `ScopeDiagnosticsExtension` (`BeforeEachCallback`, `AfterEachCallback`) + registered on `AbstractInstrumentationTest` via `@ExtendWith`, but **dormant** unless the + test class or method carries `@TrackScopeContinuations`. When enabled: `reset()` + + `startRecording()` beforeEach; afterEach renders the text Gantt to the log, writes the JSON + file, and calls `assertNoLeaks()` only if `@TrackScopeContinuations(failOnLeak = true)`. +- **Spock**: `InstrumentationSpecification.setup()/cleanup()` honor the same + `@TrackScopeContinuations` annotation (and/or an overridable `trackScopeContinuations()` + returning `false` by default), reusing the same static `ScopeDiagnostics` engine. + +`@TrackScopeContinuations` attributes: `failOnLeak` (default `false`), +`json` (default `true`), `gantt` (default `true`). + +## Data flow + +``` +capture thread: captureSpan() -> new ScopeContinuation -> register() + -> Listener.onCapture(this, traceId, spanId, source) + -> recorder: create ContinuationRecord{seq, ids, source, captureEvent(thread, t, stack)} + +worker thread: continuation.activate() (COUNT>0) + -> Listener.onActivate(this, ...) -> recorder: append activation Event + +worker thread: continued scope close / cancel + -> Listener.onResolve(this, cancelled) -> recorder: append resolution Event + +any thread: PendingTrace.write() (rootSpanWritten=true) + -> Listener.onRootWritten(traceId) -> recorder: rootWrittenNanos[traceId] = now + +report(): walk records -> classify (never-resolved / late-after-root / double) -> render +``` + +## Error handling +- Recorder callbacks must never throw into tracer code: each `Listener` method body is + wrapped so any diagnostic failure is swallowed/logged, never propagated into + `ScopeContinuation`/`PendingTrace`. +- An `onActivate`/`onResolve` for a continuation with no recorded capture (e.g. captured + before `startRecording()` in the previous test) is recorded as an "orphan" entry rather + than dropped — itself a useful cross-test leak signal. + +## Testing strategy +- Unit tests (JUnit5) for the recorder/classifier: feed synthetic `Listener` event sequences + (capture-only; capture+activate+resolve; capture+late-resolve vs `onRootWritten`; + activate-after-cancel) and assert the derived findings and rendered summary. +- Unit tests for the stack filter (frames in/out). +- Integration test: a small instrumentation test annotated `@TrackScopeContinuations` that + deliberately captures a continuation and never resolves it (e.g. submit to an executor that + drops the task), asserting `report()` flags exactly one never-resolved leak with the + expected capture callsite. Mirror in one Spock spec to verify the Groovy hook. +- Verify production inertness: with no listener installed, the guarded call sites are + exercised by existing tracer-core tests with no behavior change (no new failures). + +## Cost & caveats +- **When off:** one volatile read per scope lifecycle event, no allocation, no behavior change. +- **When on:** a stack-trace capture per event (test-time, acceptable). +- Assumes **one test at a time per JVM** (holds for instrumentation tests). `reset()` isolates + tests; events from a previous test's leaked continuation arriving after `reset()` are + attributed to the new test as orphans — a useful, not harmful, signal. + +## Module placement summary +| Piece | Module | +|---|---| +| `ContinuationDiagnostics` + seam call sites | `dd-trace-core` (`datadog.trace.core.scopemanager`, + `PendingTrace`) | +| `ScopeDiagnostics` engine, records, classifier, renderers | `:dd-java-agent:instrumentation-testing` (`datadog.trace.agent.test.scopediag`) | +| `ScopeDiagnosticsExtension`, `@TrackScopeContinuations` | `:dd-java-agent:instrumentation-testing` | +| `AbstractInstrumentationTest` / `InstrumentationSpecification` wiring | `:dd-java-agent:instrumentation-testing` | From 2dd7708b676210c198b295e8958e7598e8b8cdd4 Mon Sep 17 00:00:00 2001 From: Andrea Marziali Date: Thu, 11 Jun 2026 10:38:18 +0200 Subject: [PATCH 2/9] wip2 --- .../trace/agent/test/InstrumentationSpecification.groovy | 5 +++-- .../google-pubsub-1.116/src/test/groovy/PubSubTest.groovy | 7 ++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy b/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy index 2f84bc1e5d1..fe2c4df5e5e 100644 --- a/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy +++ b/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy @@ -547,11 +547,12 @@ abstract class InstrumentationSpecification extends DDSpecification implements A ScopeDiagnostics.stop() def report = ScopeDiagnostics.report() println(config.gantt() ? report.renderGantt() : report.renderSummary()) + def reportName = specificationContext.currentSpec.name + "." + specificationContext.currentIteration.name if (config.json()) { - ScopeDiagnostics.writeJson(specificationContext.currentIteration.name) + ScopeDiagnostics.writeJson(reportName) } if (config.mermaid()) { - println("Mermaid timeline written to " + ScopeDiagnostics.writeMermaid(specificationContext.currentIteration.name)) + println("Mermaid timeline written to " + ScopeDiagnostics.writeMermaid(reportName)) } try { if (config.failOnLeak()) { diff --git a/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy b/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy index f6bed488b21..5443ac22c4c 100644 --- a/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy +++ b/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy @@ -42,7 +42,7 @@ import spock.lang.Shared import java.nio.charset.StandardCharsets -@TrackScopeContinuations +@TrackScopeContinuations(failOnLeak = true) abstract class PubSubTest extends VersionedNamingTestBase { private static final String PROJECT_ID = "dd-trace-java" @@ -71,6 +71,11 @@ abstract class PubSubTest extends VersionedNamingTestBase { null } + @Override + boolean useStrictTraceWrites() { + true + } + boolean shadowGrpcSpans() { true } From 83b5e4244be020b011add1f1b5fbeb99bc25c887 Mon Sep 17 00:00:00 2001 From: Andrea Marziali Date: Fri, 12 Jun 2026 10:04:10 +0200 Subject: [PATCH 3/9] wip --- .../datadog/trace/core/scopemanager/ScopeContinuation.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java b/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java index 210b44ea30b..6ce500b7aa1 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java @@ -118,8 +118,10 @@ public void cancel() { } current = count; } + // Reaching here means cancel() was called while activations are still outstanding: it does + // not de-reference the continuation (the active scopes will, on close), so it is not a + // resolution for leak-tracking purposes — only the existing health metric is recorded. scopeManager.healthMetrics.onCancelContinuation(); - notifyResolve(true); } private void notifyResolve(final boolean cancelled) { From 632f6260987c1a9b6640db67c5dd2e0f633807a9 Mon Sep 17 00:00:00 2001 From: Andrea Marziali Date: Fri, 12 Jun 2026 18:21:38 +0200 Subject: [PATCH 4/9] wipwip --- .../test/scopediag/ContinuationAdvice.java | 63 +++++++ .../test/scopediag/PendingTraceAdvice.java | 26 +++ .../scopediag/ScopeContinuationProbe.java | 159 ++++++++++++++++++ .../ScopeContinuationTransformer.java | 55 ++++++ .../test/scopediag/ScopeDiagnostics.java | 38 +++-- .../scopediag/ScopeContinuationProbeTest.java | 27 +++ .../ScopeDiagnosticsIntegrationTest.java | 2 +- .../java/datadog/trace/core/PendingTrace.java | 2 - .../scopemanager/ContinuationDiagnostics.java | 73 -------- .../core/scopemanager/ScopeContinuation.java | 26 --- 10 files changed, 358 insertions(+), 113 deletions(-) create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/PendingTraceAdvice.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationTransformer.java create mode 100644 dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java delete mode 100644 dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ContinuationDiagnostics.java diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java new file mode 100644 index 00000000000..c56a64736c7 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java @@ -0,0 +1,63 @@ +package datadog.trace.agent.test.scopediag; + +import net.bytebuddy.asm.Advice; + +/** + * Test-only ByteBuddy advice woven into {@code datadog.trace.core.scopemanager.ScopeContinuation}. + * + *

The target type is package-private and cannot be named here, so {@code this} is typed as + * {@link Object} and re-cast to the public {@code AgentScope.Continuation} supertype inside {@link + * ScopeContinuationProbe}. {@link Advice.FieldValue} reads the private {@code count} field — legal + * because the advice is inlined into the field's own class. + */ +public final class ContinuationAdvice { + private ContinuationAdvice() {} + + /** {@code register()} — the continuation was captured. */ + public static final class Register { + @Advice.OnMethodExit(suppress = Throwable.class) + public static void exit(@Advice.This Object self) { + ScopeContinuationProbe.onCapture(self); + } + } + + /** {@code activate()} — a (possibly noop) activation; the probe filters the rollback branch. */ + public static final class Activate { + @Advice.OnMethodExit(suppress = Throwable.class) + public static void exit(@Advice.This Object self, @Advice.Return Object scope) { + ScopeContinuationProbe.onActivate(self, scope); + } + } + + /** {@code cancel()} — resolution detected via the {@code count} transition. */ + public static final class Cancel { + @Advice.OnMethodEnter + public static int enter(@Advice.FieldValue("count") int count) { + return count; + } + + @Advice.OnMethodExit(suppress = Throwable.class) + public static void exit( + @Advice.This Object self, + @Advice.Enter int countBefore, + @Advice.FieldValue("count") int countAfter) { + ScopeContinuationProbe.onResolve(self, countBefore, countAfter); + } + } + + /** {@code cancelFromContinuedScopeClose()} — same resolution detection as {@link Cancel}. */ + public static final class CancelFromClose { + @Advice.OnMethodEnter + public static int enter(@Advice.FieldValue("count") int count) { + return count; + } + + @Advice.OnMethodExit(suppress = Throwable.class) + public static void exit( + @Advice.This Object self, + @Advice.Enter int countBefore, + @Advice.FieldValue("count") int countAfter) { + ScopeContinuationProbe.onResolve(self, countBefore, countAfter); + } + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/PendingTraceAdvice.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/PendingTraceAdvice.java new file mode 100644 index 00000000000..48264955738 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/PendingTraceAdvice.java @@ -0,0 +1,26 @@ +package datadog.trace.agent.test.scopediag; + +import net.bytebuddy.asm.Advice; + +/** + * Test-only ByteBuddy advice woven into {@code datadog.trace.core.PendingTrace}. Fires the + * root-written signal at the exact site where the old production seam did: inside {@code + * write(boolean)} on the non-partial path, just before {@code rootSpanWritten} is set, gated on it + * not already being set. {@link Advice.FieldValue} reads the private {@code traceId}/{@code + * rootSpanWritten} fields — legal because the advice is inlined into their own class. + */ +public final class PendingTraceAdvice { + private PendingTraceAdvice() {} + + public static final class Write { + @Advice.OnMethodEnter(suppress = Throwable.class) + public static void enter( + @Advice.Argument(0) boolean isPartial, + @Advice.FieldValue("rootSpanWritten") boolean alreadyWritten, + @Advice.FieldValue("traceId") Object traceId) { + if (!isPartial && !alreadyWritten) { + ScopeContinuationProbe.onRootWritten(traceId); + } + } + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java new file mode 100644 index 00000000000..855072fd557 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java @@ -0,0 +1,159 @@ +package datadog.trace.agent.test.scopediag; + +import datadog.trace.api.DDTraceId; +import datadog.trace.bootstrap.instrumentation.api.AgentScope; +import datadog.trace.bootstrap.instrumentation.api.AgentSpan; +import datadog.trace.bootstrap.instrumentation.api.AgentTracer; +import java.lang.reflect.Field; +import java.util.Collections; +import java.util.IdentityHashMap; +import java.util.Set; + +/** + * Recorder hook that the test-only ByteBuddy advice ({@link ContinuationAdvice}, {@link + * PendingTraceAdvice}) funnels scope-continuation lifecycle events into. It replaces the former + * production {@code ContinuationDiagnostics} seam: the advice is woven into {@code + * datadog.trace.core.scopemanager.ScopeContinuation} and {@code datadog.trace.core.PendingTrace} at + * test time only, so production tracer code carries no diagnostic footprint at all. + * + *

Inlined advice runs in the same app classloader as this class at test time, so it can call + * these statics directly. Every entry point first checks the {@link #recording} flag and is fully + * wrapped so a diagnostic failure can never propagate back into the tracer. + */ +public final class ScopeContinuationProbe { + /** + * Mirrors {@code ScopeContinuation.CANCELLED} (see {@code + * dd-trace-core/.../scopemanager/ScopeContinuation.java}). A continuation is resolved exactly + * when its {@code count} field transitions to this sentinel during a cancel call. Kept in sync by + * {@code ScopeContinuationProbeTest}. + */ + static final int CANCELLED = Integer.MIN_VALUE >> 1; + + private static volatile boolean recording = false; + private static volatile boolean installed = false; + + /** Cached reflective handle to the package-private {@code ScopeContinuation.source} field. */ + private static volatile Field sourceField; + + /** + * Continuations already recorded as resolved, by identity. The clean-resolution path that flips + * {@code count} to {@link #CANCELLED} can be observed by two inlined advice frames (the {@code + * cancelFromContinuedScopeClose} slow path delegates to {@code cancel()}); since {@code + * CANCELLED} is terminal, dedup by identity so each continuation resolves exactly once — matching + * the single {@code notifyResolve} the old production seam emitted. + */ + private static final Set resolved = + Collections.synchronizedSet(Collections.newSetFromMap(new IdentityHashMap<>())); + + private ScopeContinuationProbe() {} + + /** Installs the transformer (once per JVM) and starts recording. */ + static synchronized void enable() { + if (!installed) { + ScopeContinuationTransformer.install(); + installed = true; + } + recording = true; + } + + /** Stops recording. The transformer stays installed (inert while not recording). */ + static void disable() { + recording = false; + } + + /** Clears per-window state. Called from {@link ScopeDiagnostics#reset()}. */ + static void reset() { + resolved.clear(); + } + + // ---- advice entry points (public so inlined advice can reference them) ------------------- + + /** {@code ScopeContinuation.register()} exit: the continuation was captured. */ + public static void onCapture(Object self) { + if (!recording) { + return; + } + try { + AgentScope.Continuation continuation = (AgentScope.Continuation) self; + AgentSpan span = continuation.span(); + if (span != null) { + ScopeDiagnostics.recordCapture( + continuation, span.getTraceId(), span.getSpanId(), sourceOf(self)); + } + } catch (Throwable ignored) { + // diagnostics must never disturb the tracer + } + } + + /** + * {@code ScopeContinuation.activate()} exit: a real activation happened. The rollback branch + * returns the {@link AgentTracer#noopScope() noop scope} singleton, so a returned noop scope is + * skipped — this exactly reproduces the original "success branch only" semantics. + */ + public static void onActivate(Object self, Object returnedScope) { + if (!recording) { + return; + } + if (returnedScope == AgentTracer.noopScope()) { + return; + } + try { + AgentScope.Continuation continuation = (AgentScope.Continuation) self; + AgentSpan span = continuation.span(); + if (span != null) { + ScopeDiagnostics.recordActivate( + continuation, span.getTraceId(), span.getSpanId(), sourceOf(self)); + } + } catch (Throwable ignored) { + } + } + + /** + * {@code ScopeContinuation.cancel()} / {@code cancelFromContinuedScopeClose()} exit. The original + * production seam fired only from inside the clean resolution branch; here we detect that branch + * by observing the {@code count} field transition to {@link #CANCELLED} during this call. A + * cancel with outstanding activations leaves {@code count} unchanged (not a resolution). + */ + public static void onResolve(Object self, int countBefore, int countAfter) { + if (!recording) { + return; + } + if (countBefore == CANCELLED || countAfter != CANCELLED) { + return; + } + try { + if (resolved.add(self)) { + ScopeDiagnostics.recordResolve((AgentScope.Continuation) self, false); + } + } catch (Throwable ignored) { + } + } + + /** {@code PendingTrace.write()} root-written site. */ + public static void onRootWritten(Object traceId) { + if (!recording) { + return; + } + try { + ScopeDiagnostics.recordRootWritten((DDTraceId) traceId); + } catch (Throwable ignored) { + } + } + + /** + * Reads the package-private {@code source} byte field, falling back to the {@code -1} sentinel. + */ + private static byte sourceOf(Object self) { + try { + Field field = sourceField; + if (field == null) { + field = self.getClass().getDeclaredField("source"); + field.setAccessible(true); + sourceField = field; + } + return field.getByte(self); + } catch (Throwable ignored) { + return (byte) -1; + } + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationTransformer.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationTransformer.java new file mode 100644 index 00000000000..b0a3614af11 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationTransformer.java @@ -0,0 +1,55 @@ +package datadog.trace.agent.test.scopediag; + +import static net.bytebuddy.matcher.ElementMatchers.named; +import static net.bytebuddy.matcher.ElementMatchers.takesArguments; + +import java.lang.instrument.Instrumentation; +import net.bytebuddy.agent.ByteBuddyAgent; +import net.bytebuddy.agent.builder.AgentBuilder; +import net.bytebuddy.agent.builder.ResettableClassFileTransformer; +import net.bytebuddy.asm.Advice; + +/** + * Installs the test-only diagnostic advice into the tracer's own {@code + * datadog.trace.core.scopemanager.ScopeContinuation} and {@code datadog.trace.core.PendingTrace}. + * + *

These classes sit under {@code datadog.trace.core.*}, which the tracer's own {@code + * AgentBuilder} hard-ignores. We therefore install a separate {@link AgentBuilder} on the + * raw {@link Instrumentation} with no global-ignore filter, using retransformation so the + * already-loaded classes are rewoven on install. The advice is schema-preserving, so {@code + * disableClassFormatChanges()} + {@code REDEFINE} keep it retransform-safe. Installed once per JVM. + */ +final class ScopeContinuationTransformer { + private static volatile ResettableClassFileTransformer transformer; + + private ScopeContinuationTransformer() {} + + static synchronized void install() { + if (transformer != null) { + return; + } + Instrumentation instrumentation = ByteBuddyAgent.getInstrumentation(); + transformer = + new AgentBuilder.Default() + .disableClassFormatChanges() + .with(AgentBuilder.RedefinitionStrategy.RETRANSFORMATION) + .with(AgentBuilder.TypeStrategy.Default.REDEFINE) + .type(named("datadog.trace.core.scopemanager.ScopeContinuation")) + .transform( + (builder, type, classLoader, module, pd) -> + builder + .visit(Advice.to(ContinuationAdvice.Register.class).on(named("register"))) + .visit(Advice.to(ContinuationAdvice.Activate.class).on(named("activate"))) + .visit(Advice.to(ContinuationAdvice.Cancel.class).on(named("cancel"))) + .visit( + Advice.to(ContinuationAdvice.CancelFromClose.class) + .on(named("cancelFromContinuedScopeClose")))) + .type(named("datadog.trace.core.PendingTrace")) + .transform( + (builder, type, classLoader, module, pd) -> + builder.visit( + Advice.to(PendingTraceAdvice.Write.class) + .on(named("write").and(takesArguments(boolean.class))))) + .installOn(instrumentation); + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java index a92d1c15fd6..b1c241104ad 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java @@ -2,7 +2,6 @@ import datadog.trace.api.DDTraceId; import datadog.trace.bootstrap.instrumentation.api.AgentScope; -import datadog.trace.core.scopemanager.ContinuationDiagnostics; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -21,8 +20,8 @@ /** * Test-time engine that records scope-continuation lifecycle events and renders leak reports. * - *

It installs itself as the {@link ContinuationDiagnostics.Listener} while recording and - * correlates events by continuation identity (an {@link IdentityHashMap}, never {@code + *

While recording, {@link ScopeContinuationProbe} (test-only bytecode advice) feeds it lifecycle + * events, which it correlates by continuation identity (an {@link IdentityHashMap}, never {@code * equals}/{@code hashCode}). It assumes a single test runs at a time per JVM (true for * instrumentation tests); {@link #reset()} isolates one test from the next. * @@ -64,12 +63,12 @@ public static void startRecording() { public static void startRecording(int maxFrames) { INSTANCE.reset(); INSTANCE.stackFilter = new StackFilter(maxFrames); - ContinuationDiagnostics.install(INSTANCE.listener); + ScopeContinuationProbe.enable(); } - /** Stops recording (uninstalls the listener). Recorded data remains queryable until reset. */ + /** Stops recording (the probe goes inert). Recorded data remains queryable until reset. */ public static void stop() { - ContinuationDiagnostics.clear(); + ScopeContinuationProbe.disable(); } /** Discards all recorded data. */ @@ -77,6 +76,7 @@ public static void reset() { INSTANCE.records.clear(); INSTANCE.rootWrittenNanos.clear(); INSTANCE.seq.set(0); + ScopeContinuationProbe.reset(); } /** Builds an immutable snapshot report of everything recorded so far. */ @@ -135,8 +135,27 @@ private ScopeEvent event(ScopeEvent.Type type) { stackFilter.filter(new Throwable().getStackTrace())); } - private final class Listener implements ContinuationDiagnostics.Listener { - @Override + // ---- static forwarders called by ScopeContinuationProbe ------------------ + + static void recordCapture( + AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { + INSTANCE.listener.onCapture(id, traceId, spanId, source); + } + + static void recordActivate( + AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { + INSTANCE.listener.onActivate(id, traceId, spanId, source); + } + + static void recordResolve(AgentScope.Continuation id, boolean cancelled) { + INSTANCE.listener.onResolve(id, cancelled); + } + + static void recordRootWritten(DDTraceId traceId) { + INSTANCE.listener.onRootWritten(traceId); + } + + private final class Listener { public void onCapture(AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { try { ContinuationRecord record = @@ -153,7 +172,6 @@ public void onCapture(AgentScope.Continuation id, DDTraceId traceId, long spanId } } - @Override public void onActivate( AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { try { @@ -162,7 +180,6 @@ public void onActivate( } } - @Override public void onResolve(AgentScope.Continuation id, boolean cancelled) { try { ScopeEvent.Type type = @@ -172,7 +189,6 @@ public void onResolve(AgentScope.Continuation id, boolean cancelled) { } } - @Override public void onRootWritten(DDTraceId traceId) { try { rootWrittenNanos.putIfAbsent(traceId, System.nanoTime()); diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java new file mode 100644 index 00000000000..76cba5d71d8 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java @@ -0,0 +1,27 @@ +package datadog.trace.agent.test.scopediag; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.lang.reflect.Field; +import org.junit.jupiter.api.Test; + +/** + * Guards the {@code CANCELLED} sentinel that {@link ScopeContinuationProbe} duplicates from {@code + * datadog.trace.core.scopemanager.ScopeContinuation}. If the production constant ever changes, the + * probe's resolution detection would silently break — this test fails instead. + */ +class ScopeContinuationProbeTest { + + @Test + void cancelledSentinelMatchesProduction() throws Exception { + assertEquals(Integer.MIN_VALUE >> 1, ScopeContinuationProbe.CANCELLED); + + Class scopeContinuation = Class.forName("datadog.trace.core.scopemanager.ScopeContinuation"); + Field cancelled = scopeContinuation.getDeclaredField("CANCELLED"); + cancelled.setAccessible(true); + assertEquals( + cancelled.getInt(null), + ScopeContinuationProbe.CANCELLED, + "ScopeContinuationProbe.CANCELLED is out of sync with ScopeContinuation.CANCELLED"); + } +} diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java index 16f3779884e..f3dc112d8ea 100644 --- a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java @@ -13,7 +13,7 @@ /** * Exercises the full path: a real {@link CoreTracer} capturing continuations via {@code - * captureSpan} drives {@code ScopeContinuation} -> {@code ContinuationDiagnostics} -> {@link + * captureSpan} drives {@code ScopeContinuation} -> {@code ScopeContinuationProbe} -> {@link * ScopeDiagnostics}, and the derived report classifies the leak correctly. */ class ScopeDiagnosticsIntegrationTest { diff --git a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java index 559c0b9252f..3372404a42f 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/PendingTrace.java @@ -7,7 +7,6 @@ import datadog.trace.bootstrap.instrumentation.api.AgentScope; import datadog.trace.core.CoreTracer.ConfigSnapshot; import datadog.trace.core.monitor.HealthMetrics; -import datadog.trace.core.scopemanager.ContinuationDiagnostics; import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -377,7 +376,6 @@ private int write(boolean isPartial) { synchronized (this) { if (!isPartial) { rootSpanWritten = true; - ContinuationDiagnostics.notifyRootWritten(traceId); } int size = size(); boolean writeRunningSpans = diff --git a/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ContinuationDiagnostics.java b/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ContinuationDiagnostics.java deleted file mode 100644 index ca9065bfcd9..00000000000 --- a/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ContinuationDiagnostics.java +++ /dev/null @@ -1,73 +0,0 @@ -package datadog.trace.core.scopemanager; - -import datadog.trace.api.DDTraceId; -import datadog.trace.bootstrap.instrumentation.api.AgentScope; - -/** - * Test-time diagnostic seam for tracking the lifecycle of scope {@link AgentScope.Continuation - * continuations} as they are captured on one thread and activated/cancelled on another. - * - *

This class is inert in production: the {@link Listener} is {@code null} unless test code - * installs one via {@link #install(Listener)}. Each notification site reads the listener once - * through a single {@code volatile} read and skips the call when none is installed, so the cost - * when disabled is one volatile read per scope lifecycle event with no allocation and no behavior - * change. - * - *

The seam intentionally carries only continuation identity plus the trace/span ids and - * {@code source}. Wall-clock time, the current thread, and the capturing/activating/resolving stack - * trace are gathered by the listener itself: notifications are synchronous and run on the event's - * own thread, so the listener observes the correct thread and call stack. - */ -public final class ContinuationDiagnostics { - /** - * Receives scope-continuation lifecycle events. Implementations live in test code and must never - * throw back into the tracer (callers do not guard against exceptions). - */ - public interface Listener { - /** A continuation was captured (registered) and may be transferred to another thread. */ - void onCapture(AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source); - - /** A captured continuation was activated, resuming the scope (possibly on another thread). */ - void onActivate(AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source); - - /** - * A continuation was resolved: either finished normally (all activations closed or a clean - * cancel) or cancelled while activations/holds were outstanding. - * - * @param cancelled {@code true} if resolved via the cancel-with-outstanding-work path. - */ - void onResolve(AgentScope.Continuation id, boolean cancelled); - - /** The root span of the given trace has been written (the trace's root close signal). */ - void onRootWritten(DDTraceId traceId); - } - - private static volatile Listener listener; - - private ContinuationDiagnostics() {} - - /** Installs the diagnostic listener. Intended for test code only. */ - public static void install(Listener listener) { - ContinuationDiagnostics.listener = listener; - } - - /** Removes any installed diagnostic listener, returning the seam to its inert state. */ - public static void clear() { - ContinuationDiagnostics.listener = null; - } - - static Listener listener() { - return listener; - } - - /** - * Notifies the installed listener (if any) that the root span of the given trace has been - * written. Safe to call from any package; inert when no listener is installed. - */ - public static void notifyRootWritten(DDTraceId traceId) { - final Listener l = listener; - if (l != null) { - l.onRootWritten(traceId); - } - } -} diff --git a/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java b/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java index 6ce500b7aa1..364f1ac402b 100644 --- a/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java +++ b/dd-trace-core/src/main/java/datadog/trace/core/scopemanager/ScopeContinuation.java @@ -64,13 +64,6 @@ final class ScopeContinuation implements AgentScope.Continuation { ScopeContinuation register() { traceCollector.registerContinuation(this); scopeManager.healthMetrics.onCaptureContinuation(); - final ContinuationDiagnostics.Listener listener = ContinuationDiagnostics.listener(); - if (listener != null) { - final AgentSpan span = span(); - if (span != null) { - listener.onCapture(this, span.getTraceId(), span.getSpanId(), source); - } - } return this; } @@ -85,13 +78,6 @@ public AgentScope.Continuation hold() { public AgentScope activate() { if (COUNT.incrementAndGet(this) > 0) { // speculative update succeeded, continuation can be activated - final ContinuationDiagnostics.Listener listener = ContinuationDiagnostics.listener(); - if (listener != null) { - final AgentSpan span = span(); - if (span != null) { - listener.onActivate(this, span.getTraceId(), span.getSpanId(), source); - } - } return scopeManager.continueSpan(this, context, source); } else { // continuation cancelled or too many activations; rollback count @@ -113,30 +99,18 @@ public void cancel() { if (COUNT.compareAndSet(this, current, CANCELLED)) { traceCollector.removeContinuation(this); scopeManager.healthMetrics.onFinishContinuation(); - notifyResolve(false); return; } current = count; } - // Reaching here means cancel() was called while activations are still outstanding: it does - // not de-reference the continuation (the active scopes will, on close), so it is not a - // resolution for leak-tracking purposes — only the existing health metric is recorded. scopeManager.healthMetrics.onCancelContinuation(); } - private void notifyResolve(final boolean cancelled) { - final ContinuationDiagnostics.Listener listener = ContinuationDiagnostics.listener(); - if (listener != null) { - listener.onResolve(this, cancelled); - } - } - void cancelFromContinuedScopeClose() { if (COUNT.compareAndSet(this, 1, CANCELLED)) { // fast path: only one activation of the continuation (no hold) traceCollector.removeContinuation(this); scopeManager.healthMetrics.onFinishContinuation(); - notifyResolve(false); } else if (COUNT.decrementAndGet(this) == 0) { // slow path: multiple activations, all have now closed (no hold) cancel(); From 1fe9821a238d277c15dc8ba81ffa4f7be524c60c Mon Sep 17 00:00:00 2001 From: Andrea Marziali Date: Mon, 15 Jun 2026 11:01:00 +0200 Subject: [PATCH 5/9] wipwipwip --- .../scopediag/ContinuableScopeAdvice.java | 39 ++ .../test/scopediag/ContinuationAdvice.java | 35 +- .../test/scopediag/ContinuationRecord.java | 182 +++++- .../test/scopediag/ContinuationStatus.java | 18 + .../trace/agent/test/scopediag/Failure.java | 21 + .../scopediag/ScopeContinuationProbe.java | 197 +++++- .../ScopeContinuationTransformer.java | 16 +- .../test/scopediag/ScopeDiagnostics.java | 164 ++++- .../scopediag/ScopeDiagnosticsReport.java | 615 ++++++++++++------ .../agent/test/scopediag/ScopeEvent.java | 10 +- .../agent/test/scopediag/ScopeRecord.java | 117 ++++ .../scopediag/ScopeContinuationProbeTest.java | 53 +- .../ScopeDiagnosticsIntegrationTest.java | 57 ++ .../scopediag/ScopeDiagnosticsReportTest.java | 69 +- .../agent/test/scopediag/ScopeRecordTest.java | 97 +++ 15 files changed, 1396 insertions(+), 294 deletions(-) create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuableScopeAdvice.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationStatus.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/Failure.java create mode 100644 dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeRecord.java create mode 100644 dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeRecordTest.java diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuableScopeAdvice.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuableScopeAdvice.java new file mode 100644 index 00000000000..3dab0453128 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuableScopeAdvice.java @@ -0,0 +1,39 @@ +package datadog.trace.agent.test.scopediag; + +import net.bytebuddy.asm.Advice; + +/** + * Test-only ByteBuddy advice woven into {@code datadog.trace.core.scopemanager.ContinuableScope} + * (and, by inheritance, {@code ContinuingScope}) to track the scope activation lifecycle. + * + *

The target type is package-private, so {@code this} is typed as {@link Object} and re-cast + * inside {@link ScopeContinuationProbe}. {@code afterActivated} is the open point (first call per + * scope identity), {@code onProperClose} the pop, and {@code close} the wrong-thread check. + */ +public final class ContinuableScopeAdvice { + private ContinuableScopeAdvice() {} + + /** {@code afterActivated()} — the scope became active. */ + public static final class AfterActivated { + @Advice.OnMethodExit(suppress = Throwable.class) + public static void exit(@Advice.This Object scope) { + ScopeContinuationProbe.onScopeOpen(scope); + } + } + + /** {@code onProperClose()} — the scope was popped from its thread's stack. */ + public static final class OnProperClose { + @Advice.OnMethodExit(suppress = Throwable.class) + public static void exit(@Advice.This Object scope) { + ScopeContinuationProbe.onScopeClose(scope); + } + } + + /** {@code close()} — check for an out-of-order / wrong-thread close. */ + public static final class Close { + @Advice.OnMethodEnter(suppress = Throwable.class) + public static void enter(@Advice.This Object scope) { + ScopeContinuationProbe.onScopeClosing(scope); + } + } +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java index c56a64736c7..16e0e2e29ff 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java @@ -29,10 +29,22 @@ public static void exit(@Advice.This Object self, @Advice.Return Object scope) { } } - /** {@code cancel()} — resolution detected via the {@code count} transition. */ + /** + * Resolution detected via the {@code count} transition. Applied to both {@code cancel()} and + * {@code cancelFromContinuedScopeClose()} — they need identical before/after observation. + * + *

The resolve timestamp is captured at method entry (the {@code ddResolveNanos} + * local), not at exit: the body itself may call {@code removeContinuation() -> + * PendingTrace.write()}, which is exactly where the root-written timestamp is taken. Timestamping + * at exit would place the resolution after the root write it triggered, producing a spurious + * late-finish. + */ public static final class Cancel { @Advice.OnMethodEnter - public static int enter(@Advice.FieldValue("count") int count) { + public static int enter( + @Advice.FieldValue("count") int count, + @Advice.Local("ddResolveNanos") long ddResolveNanos) { + ddResolveNanos = System.nanoTime(); return count; } @@ -40,24 +52,9 @@ public static int enter(@Advice.FieldValue("count") int count) { public static void exit( @Advice.This Object self, @Advice.Enter int countBefore, + @Advice.Local("ddResolveNanos") long ddResolveNanos, @Advice.FieldValue("count") int countAfter) { - ScopeContinuationProbe.onResolve(self, countBefore, countAfter); - } - } - - /** {@code cancelFromContinuedScopeClose()} — same resolution detection as {@link Cancel}. */ - public static final class CancelFromClose { - @Advice.OnMethodEnter - public static int enter(@Advice.FieldValue("count") int count) { - return count; - } - - @Advice.OnMethodExit(suppress = Throwable.class) - public static void exit( - @Advice.This Object self, - @Advice.Enter int countBefore, - @Advice.FieldValue("count") int countAfter) { - ScopeContinuationProbe.onResolve(self, countBefore, countAfter); + ScopeContinuationProbe.onResolve(self, countBefore, countAfter, ddResolveNanos); } } } diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationRecord.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationRecord.java index ba4842afff8..c7e452889e8 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationRecord.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationRecord.java @@ -2,73 +2,196 @@ import datadog.trace.api.DDTraceId; import java.util.ArrayList; +import java.util.EnumSet; import java.util.List; /** - * The correlated lifecycle of one scope continuation: its capture, every activation, and every - * resolution (finish/cancel). Built incrementally as events arrive on different threads, so all - * mutating access is synchronized on the instance. + * The correlated continuation lifecycle: its capture, every resume (activation), any + * failed activation, and its terminal resolution (finish/cancel). Scope activation lifetimes are + * modelled separately by {@link ScopeRecord}; the scopes a continuation spawned are linked here by + * their seq ids ({@link #scopeRecordSeqs()}). + * + *

Built incrementally as events arrive on different threads, so all mutating access is + * synchronized on the instance. */ public final class ContinuationRecord { public final long seq; public final DDTraceId traceId; public final long spanId; + public final String spanName; public final byte source; - /** - * {@code true} when activation/resolution was seen without a preceding capture in this window. - */ + /** {@code true} when a resume/resolution was seen without a preceding capture in this window. */ public final boolean orphan; - private ScopeEvent capture; - private final List activations = new ArrayList<>(1); - private final List resolutions = new ArrayList<>(1); + private final ScopeEvent capture; + private final List resumes = new ArrayList<>(1); + private final List failedActivations = new ArrayList<>(0); + private ScopeEvent terminal; + private final List extraTerminals = new ArrayList<>(0); + private final List scopeRecordSeqs = new ArrayList<>(1); ContinuationRecord( - long seq, DDTraceId traceId, long spanId, byte source, boolean orphan, ScopeEvent capture) { + long seq, + DDTraceId traceId, + long spanId, + String spanName, + byte source, + boolean orphan, + ScopeEvent capture) { this.seq = seq; this.traceId = traceId; this.spanId = spanId; + this.spanName = spanName; this.source = source; this.orphan = orphan; this.capture = capture; } - synchronized void addActivation(ScopeEvent event) { - activations.add(event); + // ---- mutation ------------------------------------------------------------ + + synchronized void addResume(ScopeEvent event) { + resumes.add(event); } - synchronized void addResolution(ScopeEvent event) { - resolutions.add(event); + synchronized void addFailedActivation(ScopeEvent event) { + failedActivations.add(event); } + /** First terminal sets {@link #terminal}; any subsequent terminal is a double-finish signal. */ + synchronized void setTerminalOrExtra(ScopeEvent event) { + if (terminal == null) { + terminal = event; + } else { + extraTerminals.add(event); + } + } + + synchronized void linkScope(long scopeSeq) { + scopeRecordSeqs.add(scopeSeq); + } + + // ---- accessors ----------------------------------------------------------- + public synchronized ScopeEvent capture() { return capture; } - public synchronized List activations() { - return new ArrayList<>(activations); + public synchronized List resumes() { + return new ArrayList<>(resumes); + } + + public synchronized List failedActivations() { + return new ArrayList<>(failedActivations); + } + + public synchronized ScopeEvent terminal() { + return terminal; } - public synchronized List resolutions() { - return new ArrayList<>(resolutions); + public synchronized List extraTerminals() { + return new ArrayList<>(extraTerminals); } - public synchronized boolean isActivated() { - return !activations.isEmpty(); + public synchronized List scopeRecordSeqs() { + return new ArrayList<>(scopeRecordSeqs); + } + + public synchronized boolean isResumed() { + return !resumes.isEmpty(); } public synchronized boolean isResolved() { - return !resolutions.isEmpty(); + return terminal != null; + } + + // ---- derived ------------------------------------------------------------- + + public synchronized ContinuationStatus status() { + if (terminal != null) { + return terminal.type == ScopeEvent.Type.RESOLVE_CANCEL + ? ContinuationStatus.CANCELLED + : ContinuationStatus.FINISHED; + } + return ContinuationStatus.LEAKED; + } + + /** + * Derives the failure set for this continuation. {@code rootWrittenNanos} may be {@code null}. + */ + public synchronized EnumSet failures(Long rootWrittenNanos) { + EnumSet failures = EnumSet.noneOf(Failure.class); + if (terminal == null) { + failures.add(Failure.LEAKED); + } + if (!extraTerminals.isEmpty()) { + failures.add(Failure.DOUBLE_FINISH); + } + if (!failedActivations.isEmpty() || resumedAfterTerminal()) { + failures.add(Failure.ACTIVATE_AFTER_RESOLVE); + } + if (rootWrittenNanos != null + && (laterThan(terminal, rootWrittenNanos) || laterThan(resumes, rootWrittenNanos))) { + failures.add(Failure.LATE_FINISH); + } + return failures; + } + + private boolean resumedAfterTerminal() { + if (terminal == null) { + return false; + } + for (ScopeEvent r : resumes) { + if (r.nanos > terminal.nanos) { + return true; + } + } + return false; + } + + /** {@code true} when capture and any resume/terminal happened on different threads. */ + public synchronized boolean threadHandoff() { + if (capture == null) { + return false; + } + String captureThread = capture.threadName; + for (ScopeEvent r : resumes) { + if (!captureThread.equals(r.threadName)) { + return true; + } + } + return terminal != null && !captureThread.equals(terminal.threadName); + } + + /** Nanos between capture and the first resume, or {@code null} if not both observed. */ + public synchronized Long captureToFirstResumeNanos() { + if (capture == null || resumes.isEmpty()) { + return null; + } + return resumes.get(0).nanos - capture.nanos; + } + + /** Nanos between capture and the terminal resolution, or {@code null} if not both observed. */ + public synchronized Long ageAtTerminalNanos() { + if (capture == null || terminal == null) { + return null; + } + return terminal.nanos - capture.nanos; } /** Earliest known event time for ordering the timeline. */ public synchronized long firstNanos() { long min = capture != null ? capture.nanos : Long.MAX_VALUE; - for (ScopeEvent e : activations) { + for (ScopeEvent e : resumes) { + min = Math.min(min, e.nanos); + } + for (ScopeEvent e : failedActivations) { min = Math.min(min, e.nanos); } - for (ScopeEvent e : resolutions) { + if (terminal != null) { + min = Math.min(min, terminal.nanos); + } + for (ScopeEvent e : extraTerminals) { min = Math.min(min, e.nanos); } return min; @@ -77,4 +200,17 @@ public synchronized long firstNanos() { public String sourceName() { return ScopeSources.name(source); } + + private static boolean laterThan(ScopeEvent event, long nanos) { + return event != null && event.nanos > nanos; + } + + private static boolean laterThan(List events, long nanos) { + for (ScopeEvent e : events) { + if (e.nanos > nanos) { + return true; + } + } + return false; + } } diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationStatus.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationStatus.java new file mode 100644 index 00000000000..213a57a62a8 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationStatus.java @@ -0,0 +1,18 @@ +package datadog.trace.agent.test.scopediag; + +/** + * Derived lifecycle state of a continuation, for rendering. The authoritative bug signal is the + * {@link Failure} set, not this status. + */ +public enum ContinuationStatus { + /** Captured but not yet resumed or resolved. */ + CAPTURED, + /** Resumed at least once but not yet resolved. */ + RESUMED, + /** Resolved normally (all activations closed or a clean cancel with no outstanding work). */ + FINISHED, + /** Resolved via the cancel-with-outstanding-work path. */ + CANCELLED, + /** Captured (and possibly resumed) but never resolved within the recording window. */ + LEAKED +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/Failure.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/Failure.java new file mode 100644 index 00000000000..6632609584d --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/Failure.java @@ -0,0 +1,21 @@ +package datadog.trace.agent.test.scopediag; + +/** + * A derived failure classification. Shared by {@link ContinuationRecord} (continuation-lifetime + * failures) and {@link ScopeRecord} (scope-lifetime failures). See {@link + * ScopeDiagnosticsReport#hasProblems()} for which of these fail a test versus are report-only. + */ +public enum Failure { + /** Continuation captured but never resolved within the window. */ + LEAKED, + /** Continuation resolved/resumed after the root span of its trace was already written. */ + LATE_FINISH, + /** Continuation resolved more than once. */ + DOUBLE_FINISH, + /** Continuation activated after it had already been resolved. */ + ACTIVATE_AFTER_RESOLVE, + /** Scope closed while not on top of its thread's stack (closed on the wrong thread / order). */ + CLOSE_WRONG_THREAD, + /** Scope opened but never closed within the window. */ + NEVER_CLOSED +} diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java index 855072fd557..8e8d7d3eb06 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java @@ -5,6 +5,7 @@ import datadog.trace.bootstrap.instrumentation.api.AgentSpan; import datadog.trace.bootstrap.instrumentation.api.AgentTracer; import java.lang.reflect.Field; +import java.lang.reflect.Method; import java.util.Collections; import java.util.IdentityHashMap; import java.util.Set; @@ -35,6 +36,13 @@ public final class ScopeContinuationProbe { /** Cached reflective handle to the package-private {@code ScopeContinuation.source} field. */ private static volatile Field sourceField; + // cached reflective handles for scope-lifecycle reads (set-once, best-effort) + private static volatile Field scopeSourceField; // ContinuableScope.source + private static volatile Field continuationField; // ContinuingScope.continuation + private static volatile Field scopeManagerField; // ContinuableScope.scopeManager + private static volatile Method scopeStackMethod; // ContinuableScopeManager.scopeStack() + private static volatile Method checkTopMethod; // ScopeStack.checkTop(ContinuableScope) + /** * Continuations already recorded as resolved, by identity. The clean-resolution path that flips * {@code count} to {@link #CANCELLED} can be observed by two inlined advice frames (the {@code @@ -78,7 +86,7 @@ public static void onCapture(Object self) { AgentSpan span = continuation.span(); if (span != null) { ScopeDiagnostics.recordCapture( - continuation, span.getTraceId(), span.getSpanId(), sourceOf(self)); + continuation, span.getTraceId(), span.getSpanId(), spanName(span), sourceOf(self)); } } catch (Throwable ignored) { // diagnostics must never disturb the tracer @@ -94,15 +102,18 @@ public static void onActivate(Object self, Object returnedScope) { if (!recording) { return; } - if (returnedScope == AgentTracer.noopScope()) { - return; - } try { AgentScope.Continuation continuation = (AgentScope.Continuation) self; + if (returnedScope == AgentTracer.noopScope()) { + // activate() returned the noop scope: the continuation was already resolved. This is the + // activate-after-resolve signal — the engine records it only if a terminal was seen. + ScopeDiagnostics.recordActivateFailed(continuation); + return; + } AgentSpan span = continuation.span(); if (span != null) { ScopeDiagnostics.recordActivate( - continuation, span.getTraceId(), span.getSpanId(), sourceOf(self)); + continuation, span.getTraceId(), span.getSpanId(), spanName(span), sourceOf(self)); } } catch (Throwable ignored) { } @@ -114,16 +125,24 @@ public static void onActivate(Object self, Object returnedScope) { * by observing the {@code count} field transition to {@link #CANCELLED} during this call. A * cancel with outstanding activations leaves {@code count} unchanged (not a resolution). */ - public static void onResolve(Object self, int countBefore, int countAfter) { + public static void onResolve(Object self, int countBefore, int countAfter, long resolveNanos) { if (!recording) { return; } - if (countBefore == CANCELLED || countAfter != CANCELLED) { - return; + if (countAfter != CANCELLED) { + return; // not a resolution } try { - if (resolved.add(self)) { - ScopeDiagnostics.recordResolve((AgentScope.Continuation) self, false); + AgentScope.Continuation continuation = (AgentScope.Continuation) self; + if (countBefore == CANCELLED) { + // already cancelled before this call: a genuine second finish/cancel (the slow-path + // artifact always transitions 1->CANCELLED, never CANCELLED->CANCELLED). Surface it as a + // double finish, bypassing the first-resolution dedup. + ScopeDiagnostics.recordResolve(continuation, false, resolveNanos); + } else if (resolved.add(self)) { + // first clean transition to CANCELLED; later observations of the SAME transition (the + // cancelFromContinuedScopeClose slow path's nested cancel()) are suppressed + ScopeDiagnostics.recordResolve(continuation, false, resolveNanos); } } catch (Throwable ignored) { } @@ -140,6 +159,66 @@ public static void onRootWritten(Object traceId) { } } + /** + * {@code ContinuableScope.afterActivated()} exit: a scope became active. Re-activations (parent + * restored after a child closes) reach here too; the engine keeps only the first per scope + * identity. Links to the spawning continuation when the scope is a {@code ContinuingScope}. + */ + public static void onScopeOpen(Object scope) { + if (!recording) { + return; + } + try { + AgentSpan span = ((AgentScope) scope).span(); + DDTraceId traceId = span != null ? span.getTraceId() : DDTraceId.ZERO; + long spanId = span != null ? span.getSpanId() : 0L; + String name = span != null ? spanName(span) : null; + ScopeDiagnostics.recordScopeOpen( + scope, traceId, spanId, name, scopeSourceOf(scope), continuationOf(scope)); + } catch (Throwable ignored) { + } + } + + /** + * {@code ContinuableScope.onProperClose()} exit: the scope was popped from its thread's stack. + */ + public static void onScopeClose(Object scope) { + if (!recording) { + return; + } + try { + ScopeDiagnostics.recordScopeClose(scope); + } catch (Throwable ignored) { + } + } + + /** + * {@code ContinuableScope.close()} entry: if the scope is not on top of its thread's stack, this + * is an out-of-order / wrong-thread close. Best-effort — silently does nothing if the internal + * stack check cannot be reached reflectively. + */ + public static void onScopeClosing(Object scope) { + if (!recording) { + return; + } + try { + if (isNotOnTop(scope)) { + ScopeDiagnostics.recordScopeCloseWrongThread(scope); + } + } catch (Throwable ignored) { + } + } + + /** Snapshots the span name as a String (the CharSequence may mutate later), or {@code null}. */ + private static String spanName(AgentSpan span) { + try { + CharSequence name = span.getSpanName(); + return name == null ? null : name.toString(); + } catch (Throwable ignored) { + return null; + } + } + /** * Reads the package-private {@code source} byte field, falling back to the {@code -1} sentinel. */ @@ -156,4 +235,102 @@ private static byte sourceOf(Object self) { return (byte) -1; } } + + /** Reads the {@code source} byte of a scope (declared on {@code ContinuableScope}). */ + private static byte scopeSourceOf(Object scope) { + try { + Field field = scopeSourceField; + if (field == null) { + field = findField(scope.getClass(), "source"); + scopeSourceField = field; + } + return field != null ? field.getByte(scope) : (byte) -1; + } catch (Throwable ignored) { + return (byte) -1; + } + } + + /** + * The continuation that spawned a scope, read from {@code ContinuingScope.continuation}; {@code + * null} for a plain (non-continuation) scope. + */ + private static AgentScope.Continuation continuationOf(Object scope) { + try { + Field field = continuationField; + if (field == null) { + field = findField(scope.getClass(), "continuation"); + continuationField = field; + } + if (field == null || !field.getDeclaringClass().isInstance(scope)) { + return null; // not a ContinuingScope + } + Object value = field.get(scope); + return value instanceof AgentScope.Continuation ? (AgentScope.Continuation) value : null; + } catch (Throwable ignored) { + return null; + } + } + + /** Best-effort: {@code true} when the scope is not on top of its thread's scope stack. */ + private static boolean isNotOnTop(Object scope) { + try { + Field managerField = scopeManagerField; + if (managerField == null) { + managerField = findField(scope.getClass(), "scopeManager"); + scopeManagerField = managerField; + } + Object manager = managerField != null ? managerField.get(scope) : null; + if (manager == null) { + return false; + } + Method stackMethod = scopeStackMethod; + if (stackMethod == null) { + stackMethod = findMethod(manager.getClass(), "scopeStack", 0); + scopeStackMethod = stackMethod; + } + Object stack = stackMethod != null ? stackMethod.invoke(manager) : null; + if (stack == null) { + return false; + } + Method check = checkTopMethod; + if (check == null) { + check = findMethod(stack.getClass(), "checkTop", 1); + checkTopMethod = check; + } + if (check == null) { + return false; + } + Object onTop = check.invoke(stack, scope); + return onTop instanceof Boolean && !((Boolean) onTop); + } catch (Throwable ignored) { + return false; + } + } + + /** Finds a named field declared on a class or any superclass, made accessible. */ + private static Field findField(Class cls, String name) { + for (Class c = cls; c != null; c = c.getSuperclass()) { + try { + Field f = c.getDeclaredField(name); + f.setAccessible(true); + return f; + } catch (NoSuchFieldException ignored) { + // keep walking up + } + } + return null; + } + + /** Finds a named method with the given parameter count on a class or superclass, accessible. */ + private static Method findMethod(Class cls, String name, int paramCount) { + for (Class c = cls; c != null; c = c.getSuperclass()) { + for (Method m : c.getDeclaredMethods()) { + if (m.getName().equals(name) && m.getParameterCount() == paramCount) { + m.setAccessible(true); + return m; + } + } + } + return null; + } } diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationTransformer.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationTransformer.java index b0a3614af11..9e0babfd969 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationTransformer.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationTransformer.java @@ -40,16 +40,26 @@ static synchronized void install() { builder .visit(Advice.to(ContinuationAdvice.Register.class).on(named("register"))) .visit(Advice.to(ContinuationAdvice.Activate.class).on(named("activate"))) - .visit(Advice.to(ContinuationAdvice.Cancel.class).on(named("cancel"))) .visit( - Advice.to(ContinuationAdvice.CancelFromClose.class) - .on(named("cancelFromContinuedScopeClose")))) + Advice.to(ContinuationAdvice.Cancel.class) + .on(named("cancel").or(named("cancelFromContinuedScopeClose"))))) .type(named("datadog.trace.core.PendingTrace")) .transform( (builder, type, classLoader, module, pd) -> builder.visit( Advice.to(PendingTraceAdvice.Write.class) .on(named("write").and(takesArguments(boolean.class))))) + .type(named("datadog.trace.core.scopemanager.ContinuableScope")) + .transform( + (builder, type, classLoader, module, pd) -> + builder + .visit( + Advice.to(ContinuableScopeAdvice.AfterActivated.class) + .on(named("afterActivated"))) + .visit( + Advice.to(ContinuableScopeAdvice.OnProperClose.class) + .on(named("onProperClose"))) + .visit(Advice.to(ContinuableScopeAdvice.Close.class).on(named("close")))) .installOn(instrumentation); } } diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java index b1c241104ad..131573e7240 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java @@ -18,12 +18,15 @@ import org.slf4j.LoggerFactory; /** - * Test-time engine that records scope-continuation lifecycle events and renders leak reports. + * Test-time engine that records scope/continuation lifecycle events and renders leak reports. * - *

While recording, {@link ScopeContinuationProbe} (test-only bytecode advice) feeds it lifecycle - * events, which it correlates by continuation identity (an {@link IdentityHashMap}, never {@code - * equals}/{@code hashCode}). It assumes a single test runs at a time per JVM (true for - * instrumentation tests); {@link #reset()} isolates one test from the next. + *

It models two correlated lifecycles separately: continuations ({@link ContinuationRecord} — + * captured/resumed/finished) and scopes ({@link ScopeRecord} — opened/closed). While recording, + * {@link ScopeContinuationProbe} (test-only bytecode advice) feeds it events, which it correlates + * by identity (an {@link IdentityHashMap}, never {@code equals}/{@code hashCode}) — continuations + * by their {@code AgentScope.Continuation} instance, scopes by their scope instance. It assumes a + * single test runs at a time per JVM (true for instrumentation tests); {@link #reset()} isolates + * one test from the next. * *

Usage: * @@ -44,8 +47,11 @@ public final class ScopeDiagnostics { private final Map records = Collections.synchronizedMap( new IdentityHashMap()); + private final Map scopeRecords = + Collections.synchronizedMap(new IdentityHashMap()); private final Map rootWrittenNanos = new ConcurrentHashMap<>(); private final AtomicLong seq = new AtomicLong(); + private final AtomicLong scopeSeq = new AtomicLong(); private volatile StackFilter stackFilter = new StackFilter(DEFAULT_MAX_FRAMES); private final Listener listener = new Listener(); @@ -74,29 +80,37 @@ public static void stop() { /** Discards all recorded data. */ public static void reset() { INSTANCE.records.clear(); + INSTANCE.scopeRecords.clear(); INSTANCE.rootWrittenNanos.clear(); INSTANCE.seq.set(0); + INSTANCE.scopeSeq.set(0); ScopeContinuationProbe.reset(); } /** Builds an immutable snapshot report of everything recorded so far. */ public static ScopeDiagnosticsReport report() { - List snapshot; + List continuations; synchronized (INSTANCE.records) { - snapshot = new ArrayList<>(INSTANCE.records.values()); + continuations = new ArrayList<>(INSTANCE.records.values()); } - return new ScopeDiagnosticsReport(snapshot, new ConcurrentHashMap<>(INSTANCE.rootWrittenNanos)); + List scopes; + synchronized (INSTANCE.scopeRecords) { + scopes = new ArrayList<>(INSTANCE.scopeRecords.values()); + } + return new ScopeDiagnosticsReport( + continuations, scopes, new ConcurrentHashMap<>(INSTANCE.rootWrittenNanos)); } /** - * Fails with an {@link AssertionError} (carrying the leak summary) if any never-resolved leak or - * double/invalid resolution was recorded. Late-after-root is reported but does not fail. + * Fails with an {@link AssertionError} (carrying the problem summary) if the report flags a + * genuine bug (see {@link ScopeDiagnosticsReport#hasProblems()}). Report-only signals such as + * late-after-root and close-on-wrong-thread do not fail. */ public static void assertNoLeaks() { ScopeDiagnosticsReport report = report(); if (report.hasProblems()) { throw new AssertionError( - "Scope continuation leaks detected:\n" + "Scope continuation problems detected:\n" + report.renderSummary() + "\n" + report.renderGantt()); @@ -128,41 +142,70 @@ private static Path write(String name, String extension, String content) { // ---- listener implementation --------------------------------------------- private ScopeEvent event(ScopeEvent.Type type) { + return event(type, System.nanoTime()); + } + + /** Builds an event with an explicit timestamp (thread and stack are still captured now). */ + private ScopeEvent event(ScopeEvent.Type type, long nanos) { return new ScopeEvent( type, Thread.currentThread().getName(), - System.nanoTime(), + nanos, stackFilter.filter(new Throwable().getStackTrace())); } // ---- static forwarders called by ScopeContinuationProbe ------------------ static void recordCapture( - AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { - INSTANCE.listener.onCapture(id, traceId, spanId, source); + AgentScope.Continuation id, DDTraceId traceId, long spanId, String spanName, byte source) { + INSTANCE.listener.onCapture(id, traceId, spanId, spanName, source); } static void recordActivate( - AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { - INSTANCE.listener.onActivate(id, traceId, spanId, source); + AgentScope.Continuation id, DDTraceId traceId, long spanId, String spanName, byte source) { + INSTANCE.listener.onActivate(id, traceId, spanId, spanName, source); + } + + static void recordActivateFailed(AgentScope.Continuation id) { + INSTANCE.listener.onActivateFailed(id); } - static void recordResolve(AgentScope.Continuation id, boolean cancelled) { - INSTANCE.listener.onResolve(id, cancelled); + static void recordResolve(AgentScope.Continuation id, boolean cancelled, long resolveNanos) { + INSTANCE.listener.onResolve(id, cancelled, resolveNanos); } static void recordRootWritten(DDTraceId traceId) { INSTANCE.listener.onRootWritten(traceId); } + static void recordScopeOpen( + Object scope, + DDTraceId traceId, + long spanId, + String spanName, + byte source, + AgentScope.Continuation continuation) { + INSTANCE.listener.onScopeOpen(scope, traceId, spanId, spanName, source, continuation); + } + + static void recordScopeClose(Object scope) { + INSTANCE.listener.onScopeClose(scope); + } + + static void recordScopeCloseWrongThread(Object scope) { + INSTANCE.listener.onScopeCloseWrongThread(scope); + } + private final class Listener { - public void onCapture(AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { + void onCapture( + AgentScope.Continuation id, DDTraceId traceId, long spanId, String spanName, byte source) { try { ContinuationRecord record = new ContinuationRecord( seq.getAndIncrement(), traceId, spanId, + spanName, source, false, event(ScopeEvent.Type.CAPTURE)); @@ -172,40 +215,107 @@ public void onCapture(AgentScope.Continuation id, DDTraceId traceId, long spanId } } - public void onActivate( - AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { + void onActivate( + AgentScope.Continuation id, DDTraceId traceId, long spanId, String spanName, byte source) { try { - recordFor(id, traceId, spanId, source).addActivation(event(ScopeEvent.Type.ACTIVATE)); + recordFor(id, traceId, spanId, spanName, source).addResume(event(ScopeEvent.Type.ACTIVATE)); } catch (Throwable ignored) { } } - public void onResolve(AgentScope.Continuation id, boolean cancelled) { + void onActivateFailed(AgentScope.Continuation id) { + try { + ContinuationRecord record = records.get(id); + // only an activation of an already-resolved continuation is a real failure; a plain + // rollback (e.g. cancelled before any capture was recorded) is benign and ignored + if (record != null && record.isResolved()) { + record.addFailedActivation(event(ScopeEvent.Type.ACTIVATE_FAILED)); + } + } catch (Throwable ignored) { + } + } + + void onResolve(AgentScope.Continuation id, boolean cancelled, long resolveNanos) { try { ScopeEvent.Type type = cancelled ? ScopeEvent.Type.RESOLVE_CANCEL : ScopeEvent.Type.RESOLVE_FINISH; - recordFor(id, DDTraceId.ZERO, 0, (byte) -1).addResolution(event(type)); + recordFor(id, DDTraceId.ZERO, 0, null, (byte) -1) + .setTerminalOrExtra(event(type, resolveNanos)); } catch (Throwable ignored) { } } - public void onRootWritten(DDTraceId traceId) { + void onRootWritten(DDTraceId traceId) { try { rootWrittenNanos.putIfAbsent(traceId, System.nanoTime()); } catch (Throwable ignored) { } } + void onScopeOpen( + Object scope, + DDTraceId traceId, + long spanId, + String spanName, + byte source, + AgentScope.Continuation continuation) { + try { + synchronized (scopeRecords) { + if (scopeRecords.containsKey(scope)) { + return; // re-activation of an already-open scope, not a new open + } + ContinuationRecord owner = continuation != null ? records.get(continuation) : null; + Long continuationSeq = owner != null ? owner.seq : null; + long s = scopeSeq.getAndIncrement(); + scopeRecords.put( + scope, + new ScopeRecord( + s, + traceId, + spanId, + spanName, + source, + continuationSeq, + event(ScopeEvent.Type.SCOPE_OPEN))); + if (owner != null) { + owner.linkScope(s); + } + } + } catch (Throwable ignored) { + } + } + + void onScopeClose(Object scope) { + try { + ScopeRecord record = scopeRecords.get(scope); + if (record != null) { + record.setClose(event(ScopeEvent.Type.SCOPE_CLOSE)); + } + } catch (Throwable ignored) { + } + } + + void onScopeCloseWrongThread(Object scope) { + try { + ScopeRecord record = scopeRecords.get(scope); + if (record != null) { + record.addWrongThreadClose(event(ScopeEvent.Type.SCOPE_CLOSE_WRONG_THREAD)); + } + } catch (Throwable ignored) { + } + } + /** Returns the record for an id, creating an orphan record if capture was not observed. */ private ContinuationRecord recordFor( - AgentScope.Continuation id, DDTraceId traceId, long spanId, byte source) { + AgentScope.Continuation id, DDTraceId traceId, long spanId, String spanName, byte source) { synchronized (records) { ContinuationRecord existing = records.get(id); if (existing != null) { return existing; } ContinuationRecord orphan = - new ContinuationRecord(seq.getAndIncrement(), traceId, spanId, source, true, null); + new ContinuationRecord( + seq.getAndIncrement(), traceId, spanId, spanName, source, true, null); records.put(id, orphan); return orphan; } diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java index 4b5aead05f3..fadb69435d1 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java @@ -8,174 +8,286 @@ import java.util.Map; /** - * An immutable snapshot of the recorded continuation lifecycles plus the derived leak findings. - * Renders three views: a text Gantt timeline, a leak-only summary, and a JSON document. + * An immutable snapshot of the recorded continuation and scope lifecycles plus the derived failure + * findings. Renders a text Gantt timeline, a problem-only summary, a Mermaid Gantt, and JSON. + * + *

The two lifecycles are kept separate: {@link ContinuationRecord} (captured → resumed → + * finished) and {@link ScopeRecord} (opened → closed). A scope spawned by resuming a continuation + * is linked to it ({@link ContinuationRecord#scopeRecordSeqs()} / {@link + * ScopeRecord#continuationSeq}) and rendered nested under it; other scopes render in a separate + * section. */ public final class ScopeDiagnosticsReport { - /** A derived problem classification for a single continuation. */ - public enum Flag { - /** Captured but never resolved (neither finished nor cancelled) within the window. */ - LEAK, - /** Activated or resolved after the root span of its trace had already been written. */ - LATE, - /** Resolved more than once, or activated after being resolved. */ - DOUBLE - } - - private final List records; + private final List continuations; + private final List scopes; private final Map rootWrittenNanos; private final long t0; - private final Map> findings; - - ScopeDiagnosticsReport(List records, Map rootWrittenNanos) { - this.records = new ArrayList<>(records); - this.records.sort((a, b) -> Long.compare(a.firstNanos(), b.firstNanos())); + private final Map> continuationFailures; + private final Map> scopeFailures; + private final Map scopeBySeq; + + ScopeDiagnosticsReport( + List continuations, + List scopes, + Map rootWrittenNanos) { + this.continuations = new ArrayList<>(continuations); + this.continuations.sort((a, b) -> Long.compare(a.firstNanos(), b.firstNanos())); + this.scopes = new ArrayList<>(scopes); + this.scopes.sort((a, b) -> Long.compare(a.firstNanos(), b.firstNanos())); this.rootWrittenNanos = rootWrittenNanos; - this.t0 = computeT0(this.records); - this.findings = classify(this.records, rootWrittenNanos); + this.t0 = computeT0(this.continuations, this.scopes); + this.continuationFailures = classifyContinuations(this.continuations, rootWrittenNanos); + this.scopeFailures = classifyScopes(this.scopes); + this.scopeBySeq = new LinkedHashMap<>(); + for (ScopeRecord s : this.scopes) { + scopeBySeq.put(s.seq, s); + } } - private static long computeT0(List records) { + private static long computeT0(List continuations, List scopes) { long min = Long.MAX_VALUE; - for (ContinuationRecord r : records) { + for (ContinuationRecord r : continuations) { min = Math.min(min, r.firstNanos()); } + for (ScopeRecord s : scopes) { + min = Math.min(min, s.firstNanos()); + } return min == Long.MAX_VALUE ? 0 : min; } - private static Map> classify( + private static Map> classifyContinuations( List records, Map rootWrittenNanos) { - Map> result = new LinkedHashMap<>(); + Map> result = new LinkedHashMap<>(); for (ContinuationRecord r : records) { - EnumSet flags = EnumSet.noneOf(Flag.class); - - if (!r.isResolved()) { - flags.add(Flag.LEAK); - } - - List resolutions = r.resolutions(); - List activations = r.activations(); - if (resolutions.size() > 1) { - flags.add(Flag.DOUBLE); - } - if (!resolutions.isEmpty()) { - long firstResolve = resolutions.get(0).nanos; - for (ScopeEvent a : activations) { - if (a.nanos > firstResolve) { - flags.add(Flag.DOUBLE); - break; - } - } - } - - Long rootNanos = rootWrittenNanos.get(r.traceId); - if (rootNanos != null) { - if (laterThan(activations, rootNanos) || laterThan(resolutions, rootNanos)) { - flags.add(Flag.LATE); - } - } - - if (!flags.isEmpty()) { - result.put(r, flags); + EnumSet failures = r.failures(rootWrittenNanos.get(r.traceId)); + if (!failures.isEmpty()) { + result.put(r, failures); } } return result; } - private static boolean laterThan(List events, long nanos) { - for (ScopeEvent e : events) { - if (e.nanos > nanos) { - return true; + private static Map> classifyScopes(List scopes) { + Map> result = new LinkedHashMap<>(); + for (ScopeRecord s : scopes) { + EnumSet failures = s.failures(); + if (!failures.isEmpty()) { + result.put(s, failures); } } - return false; + return result; } + // ---- accessors ----------------------------------------------------------- + public List records() { - return new ArrayList<>(records); + return new ArrayList<>(continuations); + } + + public List scopeRecords() { + return new ArrayList<>(scopes); + } + + public Map> findings() { + return new LinkedHashMap<>(continuationFailures); } - public Map> findings() { - return new LinkedHashMap<>(findings); + public Map> scopeFindings() { + return new LinkedHashMap<>(scopeFailures); } public int leakCount() { - return countWith(Flag.LEAK); + return countWith(continuationFailures, Failure.LEAKED); } public int lateCount() { - return countWith(Flag.LATE); + return countWith(continuationFailures, Failure.LATE_FINISH); } public int doubleCount() { - return countWith(Flag.DOUBLE); + return countWith(continuationFailures, Failure.DOUBLE_FINISH); + } + + public int activateAfterResolveCount() { + return countWith(continuationFailures, Failure.ACTIVATE_AFTER_RESOLVE); + } + + public int neverClosedScopeCount() { + return countWith(scopeFailures, Failure.NEVER_CLOSED); } - private int countWith(Flag flag) { + public int closeWrongThreadCount() { + return countWith(scopeFailures, Failure.CLOSE_WRONG_THREAD); + } + + private static int countWith(Map> findings, Failure failure) { int n = 0; - for (EnumSet flags : findings.values()) { - if (flags.contains(flag)) { + for (EnumSet f : findings.values()) { + if (f.contains(failure)) { n++; } } return n; } - /** True when there is a genuine bug to fail on: a never-resolved leak or a double resolution. */ + /** + * True when there is a genuine bug to fail on: a never-resolved leak, a double finish, an + * activation after resolve, or a scope that was never closed. {@link Failure#LATE_FINISH} and + * {@link Failure#CLOSE_WRONG_THREAD} are reported but do not fail (frequently legitimate async or + * teardown ordering). + */ public boolean hasProblems() { - return leakCount() > 0 || doubleCount() > 0; + return leakCount() > 0 + || doubleCount() > 0 + || activateAfterResolveCount() > 0 + || neverClosedScopeCount() > 0; } - // ---- rendering ----------------------------------------------------------- + // ---- rendering: text Gantt ----------------------------------------------- - /** Full cross-thread Gantt timeline, one block per continuation. */ + private static final int GANTT_FRAMES = 3; + + /** Full cross-thread Gantt timeline: one block per continuation, with nested scope bars. */ public String renderGantt() { StringBuilder sb = new StringBuilder(); - sb.append("Scope continuation timeline (") - .append(records.size()) - .append(" continuations, ") - .append(leakCount()) - .append(" leaked, ") - .append(lateCount()) - .append(" late-after-root, ") - .append(doubleCount()) - .append(" double/invalid)\n"); - if (records.isEmpty()) { - sb.append(" (no continuations captured)\n"); + appendHeader(sb, "Scope/continuation timeline"); + if (continuations.isEmpty() && scopes.isEmpty()) { + sb.append(" (nothing captured)\n"); return sb.toString(); } - for (ContinuationRecord r : records) { - EnumSet flags = findings.getOrDefault(r, EnumSet.noneOf(Flag.class)); + + for (ContinuationRecord r : continuations) { + EnumSet failures = + continuationFailures.getOrDefault(r, EnumSet.noneOf(Failure.class)); sb.append("\n#") .append(r.seq) + .append(' ') + .append(r.status()) .append(" trace=") .append(r.traceId) .append(" span=") - .append(r.spanId) - .append(" src=") - .append(r.sourceName()); + .append(r.spanId); + if (r.spanName != null) { + sb.append(" \"").append(r.spanName).append('"'); + } + sb.append(" src=").append(r.sourceName()); if (r.orphan) { sb.append(" [ORPHAN]"); } - if (!flags.isEmpty()) { - sb.append(" ").append(flags); + if (r.threadHandoff()) { + sb.append(" [handoff]"); } - sb.append('\n'); + if (!failures.isEmpty()) { + sb.append(' ').append(failures); + } + sb.append(timing(r)).append('\n'); + appendEvent(sb, "capture ", r.capture()); - for (ScopeEvent a : r.activations()) { - appendEvent(sb, "activate", a); + for (ScopeEvent a : r.resumes()) { + appendEvent(sb, "resume ", a); } - for (ScopeEvent res : r.resolutions()) { - appendEvent(sb, res.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel " : "finish ", res); + for (ScopeEvent f : r.failedActivations()) { + appendEvent(sb, "act-fail", f); + } + ScopeEvent terminal = r.terminal(); + if (terminal != null) { + appendEvent( + sb, + terminal.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel " : "finish ", + terminal); + } + for (ScopeEvent extra : r.extraTerminals()) { + appendEvent(sb, "DOUBLE ", extra); + } + for (long scopeSeq : r.scopeRecordSeqs()) { + ScopeRecord scope = scopeBySeq.get(scopeSeq); + if (scope != null) { + appendScopeLine(sb, " ", scope); + } + } + if (terminal == null) { + sb.append(" LEAKED (never finished or cancelled)\n"); + } + } + + List orphanScopes = new ArrayList<>(); + for (ScopeRecord s : scopes) { + if (s.continuationSeq == null) { + orphanScopes.add(s); } - if (!r.isResolved()) { - sb.append(" ").append("LEAKED ").append(" (never finished or cancelled)\n"); + } + if (!orphanScopes.isEmpty()) { + sb.append("\nNon-continuation scopes:\n"); + for (ScopeRecord s : orphanScopes) { + appendScopeLine(sb, " ", s); } } return sb.toString(); } - private static final int GANTT_FRAMES = 3; + private void appendHeader(StringBuilder sb, String title) { + sb.append(title) + .append(" (") + .append(continuations.size()) + .append(" continuations, ") + .append(scopes.size()) + .append(" scopes; ") + .append(leakCount()) + .append(" leaked, ") + .append(lateCount()) + .append(" late, ") + .append(doubleCount()) + .append(" double, ") + .append(activateAfterResolveCount()) + .append(" activate-after-resolve | scopes: ") + .append(neverClosedScopeCount()) + .append(" never-closed, ") + .append(closeWrongThreadCount()) + .append(" wrong-thread)\n"); + } + + private String timing(ContinuationRecord r) { + StringBuilder sb = new StringBuilder(); + Long capToResume = r.captureToFirstResumeNanos(); + Long age = r.ageAtTerminalNanos(); + if (capToResume != null) { + sb.append(" cap->resume=").append(millis(capToResume)).append("ms"); + } + if (age != null) { + sb.append(" age=").append(millis(age)).append("ms"); + } + return sb.toString(); + } + + private void appendScopeLine(StringBuilder sb, String indent, ScopeRecord scope) { + EnumSet failures = scopeFailures.getOrDefault(scope, EnumSet.noneOf(Failure.class)); + sb.append(indent).append("scope#").append(scope.seq).append(' ').append(scope.sourceName()); + if (scope.spanName != null) { + sb.append(" \"").append(scope.spanName).append('"'); + } + ScopeEvent open = scope.open(); + ScopeEvent close = scope.close(); + if (open != null) { + sb.append(" open +").append(relMillis(open.nanos)).append("ms @ ").append(open.threadName); + } + if (close != null) { + sb.append(" close +") + .append(relMillis(close.nanos)) + .append("ms @ ") + .append(close.threadName); + Long active = scope.activeDurationNanos(); + if (active != null) { + sb.append(" (active ").append(millis(active)).append("ms)"); + } + } + if (scope.threadHandoff()) { + sb.append(" [handoff]"); + } + if (!failures.isEmpty()) { + sb.append(' ').append(failures); + } + sb.append('\n'); + } private void appendEvent(StringBuilder sb, String label, ScopeEvent event) { if (event == null) { @@ -191,8 +303,6 @@ private void appendEvent(StringBuilder sb, String label, ScopeEvent event) { .append(" at ") .append(event.callsite() == null ? "" : event.callsite()) .append('\n'); - // a couple of caller frames give context when the top frame is generic (Thread.run, a - // listener dispatch, etc.) — they reveal which library/app code drove the event StackTraceElement[] stack = event.stack; if (stack != null) { for (int i = 1; i < stack.length && i < GANTT_FRAMES; i++) { @@ -201,21 +311,15 @@ private void appendEvent(StringBuilder sb, String label, ScopeEvent event) { } } - /** Leak-only summary: just the flagged continuations with their callsites. */ + /** Problem-only summary: just the flagged continuations and scopes with their callsites. */ public String renderSummary() { StringBuilder sb = new StringBuilder(); - sb.append("Scope continuation problems: ") - .append(leakCount()) - .append(" leaked, ") - .append(lateCount()) - .append(" late-after-root, ") - .append(doubleCount()) - .append(" double/invalid\n"); - if (findings.isEmpty()) { + appendHeader(sb, "Scope/continuation problems"); + if (continuationFailures.isEmpty() && scopeFailures.isEmpty()) { sb.append(" (none)\n"); return sb.toString(); } - for (Map.Entry> e : findings.entrySet()) { + for (Map.Entry> e : continuationFailures.entrySet()) { ContinuationRecord r = e.getKey(); ScopeEvent capture = r.capture(); sb.append(" ") @@ -230,53 +334,67 @@ public String renderSummary() { .append(capture == null || capture.callsite() == null ? "" : capture.callsite()) .append('\n'); } + for (Map.Entry> e : scopeFailures.entrySet()) { + ScopeRecord s = e.getKey(); + ScopeEvent open = s.open(); + sb.append(" ") + .append(e.getValue()) + .append(" scope#") + .append(s.seq) + .append(" trace=") + .append(s.traceId) + .append(" src=") + .append(s.sourceName()) + .append(" opened at ") + .append(open == null || open.callsite() == null ? "" : open.callsite()) + .append('\n'); + } return sb.toString(); } + // ---- rendering: Mermaid Gantt -------------------------------------------- + /** - * Renders a Mermaid Gantt diagram (wrapped in a ```mermaid fence) that renders inline in GitHub, - * GitLab, and most IDEs. Continuations are grouped into swimlanes by the thread where they were - * activated/resolved; each is a bar from capture to resolution. Leaks extend to the end of the - * window and are marked {@code crit} (red); late-after-root is marked {@code active}. Times are - * milliseconds relative to the first event (rounded — this is an overview, not a profiler). + * Renders a Mermaid Gantt diagram (wrapped in a ```mermaid fence). Continuations and scopes are + * grouped into swimlanes by thread; a continuation contributes a capture marker in its capture + * lane and a run/finish bar in its resume/terminal lane, and each linked scope contributes an + * open→close bar in its close lane. Leaks/never-closed extend to the end of the window and are + * marked {@code crit}; late/wrong-thread are marked {@code active}. */ public String toMermaidGantt() { StringBuilder sb = new StringBuilder(); sb.append("```mermaid\n"); sb.append("gantt\n"); - sb.append(" title Scope continuations (") - .append(records.size()) - .append(" total, ") + sb.append(" title Scope/continuations (") + .append(continuations.size()) + .append(" cont, ") + .append(scopes.size()) + .append(" scopes, ") .append(leakCount()) .append(" leaked, ") - .append(lateCount()) - .append(" late, ") - .append(doubleCount()) - .append(" double)\n"); + .append(neverClosedScopeCount()) + .append(" never-closed)\n"); sb.append(" dateFormat x\n"); sb.append(" axisFormat %Lms\n"); - if (records.isEmpty()) { - sb.append(" section none\n no continuations captured :done, 0, 1\n```\n"); + if (continuations.isEmpty() && scopes.isEmpty()) { + sb.append(" section none\n nothing captured :done, 0, 1\n```\n"); return sb.toString(); } long windowEnd = relMillisRounded(maxNanos()); - - // A continuation contributes a "cap" marker in its capture-thread lane and (when it ran) a - // run/finish bar in its activation/resolution-thread lane. The same #seq appearing in two lanes - // makes the cross-thread hop visible: captured here, continued/cancelled there. Map> lanes = new LinkedHashMap<>(); - for (ContinuationRecord r : records) { - EnumSet flags = findings.getOrDefault(r, EnumSet.noneOf(Flag.class)); + + for (ContinuationRecord r : continuations) { + EnumSet failures = + continuationFailures.getOrDefault(r, EnumSet.noneOf(Failure.class)); ScopeEvent capture = r.capture(); - List activations = r.activations(); - List resolutions = r.resolutions(); - boolean leak = flags.contains(Flag.LEAK); + List resumes = r.resumes(); + ScopeEvent terminal = r.terminal(); + boolean leak = failures.contains(Failure.LEAKED); if (capture != null) { long start = relMillisRounded(capture.nanos); - if (leak && activations.isEmpty()) { - // captured but never even activated: leak lives, open-ended, in the capture lane + if (leak && resumes.isEmpty()) { addTask( lanes, capture.threadName, @@ -295,50 +413,66 @@ public String toMermaidGantt() { } } - if (!activations.isEmpty()) { - ScopeEvent firstAct = activations.get(0); - long runStart = relMillisRounded(firstAct.nanos); - if (resolutions.isEmpty()) { - // activated but never closed: open-ended leak in the activation lane + if (!resumes.isEmpty()) { + ScopeEvent firstResume = resumes.get(0); + long runStart = relMillisRounded(firstResume.nanos); + if (terminal == null) { addTask( lanes, - firstAct.threadName, - "#" + r.seq + " LEAK ran " + where(firstAct), + firstResume.threadName, + "#" + r.seq + " LEAK ran " + where(firstResume), "crit", runStart, Math.max(runStart + 1, windowEnd)); } else { - ScopeEvent lastRes = resolutions.get(resolutions.size() - 1); - long runEnd = relMillisRounded(lastRes.nanos); - if (runEnd <= runStart) { - runEnd = runStart + 1; - } - String verb = lastRes.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel" : "fin"; - String tag = - flags.contains(Flag.DOUBLE) ? "crit" : flags.contains(Flag.LATE) ? "active" : "done"; + long runEnd = Math.max(runStart + 1, relMillisRounded(terminal.nanos)); + String verb = terminal.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel" : "fin"; + String tag = continuationTag(failures); addTask( lanes, - lastRes.threadName, - "#" + r.seq + " " + verb + " " + where(lastRes), + terminal.threadName, + "#" + r.seq + " " + verb + " " + where(terminal), tag, runStart, runEnd); } - } else if (!resolutions.isEmpty()) { - // resolved without an observed activation (e.g. cancelled before use) - ScopeEvent lastRes = resolutions.get(resolutions.size() - 1); - long start = relMillisRounded(lastRes.nanos); - String verb = lastRes.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel" : "fin"; + } else if (terminal != null) { + long start = relMillisRounded(terminal.nanos); + String verb = terminal.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel" : "fin"; addTask( lanes, - lastRes.threadName, - "#" + r.seq + " " + verb + " " + where(lastRes), + terminal.threadName, + "#" + r.seq + " " + verb + " " + where(terminal), "done", start, start + 1); } } + for (ScopeRecord s : scopes) { + EnumSet failures = scopeFailures.getOrDefault(s, EnumSet.noneOf(Failure.class)); + ScopeEvent open = s.open(); + ScopeEvent close = s.close(); + if (open == null) { + continue; + } + long start = relMillisRounded(open.nanos); + String label = "scope#" + s.seq + " " + where(open); + if (close == null) { + addTask( + lanes, + open.threadName, + label + " NEVER-CLOSED", + "crit", + start, + Math.max(start + 1, windowEnd)); + } else { + long end = Math.max(start + 1, relMillisRounded(close.nanos)); + String tag = failures.contains(Failure.CLOSE_WRONG_THREAD) ? "active" : "done"; + addTask(lanes, close.threadName, label, tag, start, end); + } + } + for (Map.Entry> lane : lanes.entrySet()) { sb.append(" section ").append(sanitize(lane.getKey())).append('\n'); for (String task : lane.getValue()) { @@ -349,6 +483,17 @@ public String toMermaidGantt() { return sb.toString(); } + private static String continuationTag(EnumSet failures) { + if (failures.contains(Failure.DOUBLE_FINISH) + || failures.contains(Failure.ACTIVATE_AFTER_RESOLVE)) { + return "crit"; + } + if (failures.contains(Failure.LATE_FINISH)) { + return "active"; + } + return "done"; + } + private void addTask( Map> lanes, String thread, @@ -361,7 +506,6 @@ private void addTask( .add(sanitize(label) + " :" + tag + ", " + start + ", " + end); } - /** Short "SimpleClass.method" for a Mermaid label, or "?" when the callsite was filtered out. */ private static String where(ScopeEvent event) { StackTraceElement frame = event == null ? null : event.callsite(); if (frame == null) { @@ -375,80 +519,160 @@ private static String where(ScopeEvent event) { private long maxNanos() { long max = t0; - for (ContinuationRecord r : records) { - ScopeEvent c = r.capture(); - if (c != null) { - max = Math.max(max, c.nanos); - } - for (ScopeEvent e : r.activations()) { - max = Math.max(max, e.nanos); + for (ContinuationRecord r : continuations) { + max = Math.max(max, lastNanos(r)); + } + for (ScopeRecord s : scopes) { + ScopeEvent open = s.open(); + ScopeEvent close = s.close(); + if (open != null) { + max = Math.max(max, open.nanos); } - for (ScopeEvent e : r.resolutions()) { - max = Math.max(max, e.nanos); + if (close != null) { + max = Math.max(max, close.nanos); } } return max; } + private static long lastNanos(ContinuationRecord r) { + long max = r.capture() != null ? r.capture().nanos : Long.MIN_VALUE; + for (ScopeEvent e : r.resumes()) { + max = Math.max(max, e.nanos); + } + if (r.terminal() != null) { + max = Math.max(max, r.terminal().nanos); + } + for (ScopeEvent e : r.extraTerminals()) { + max = Math.max(max, e.nanos); + } + return max == Long.MIN_VALUE ? 0 : max; + } + private long relMillisRounded(long nanos) { return Math.round((nanos - t0) / 1_000_000.0); } - /** Strips characters that would break Mermaid task/section syntax. */ private static String sanitize(String s) { return s.replace(':', ' ').replace(',', ' ').replace(';', ' ').replace('\n', ' '); } + // ---- rendering: JSON ----------------------------------------------------- + /** Machine-readable view for CI artifacts and later tooling. */ public String toJson() { StringBuilder sb = new StringBuilder(); sb.append("{\"continuations\":["); - for (int i = 0; i < records.size(); i++) { + for (int i = 0; i < continuations.size(); i++) { if (i > 0) { sb.append(','); } - appendRecordJson(sb, records.get(i)); + appendContinuationJson(sb, continuations.get(i)); } - sb.append("],\"summary\":{\"total\":") - .append(records.size()) + sb.append("],\"scopes\":["); + for (int i = 0; i < scopes.size(); i++) { + if (i > 0) { + sb.append(','); + } + appendScopeJson(sb, scopes.get(i)); + } + sb.append("],\"summary\":{\"continuations\":") + .append(continuations.size()) + .append(",\"scopes\":") + .append(scopes.size()) .append(",\"leaked\":") .append(leakCount()) - .append(",\"lateAfterRoot\":") + .append(",\"lateFinish\":") .append(lateCount()) - .append(",\"doubleOrInvalid\":") + .append(",\"doubleFinish\":") .append(doubleCount()) + .append(",\"activateAfterResolve\":") + .append(activateAfterResolveCount()) + .append(",\"neverClosedScopes\":") + .append(neverClosedScopeCount()) + .append(",\"closeWrongThread\":") + .append(closeWrongThreadCount()) .append("}}"); return sb.toString(); } - private void appendRecordJson(StringBuilder sb, ContinuationRecord r) { - EnumSet flags = findings.getOrDefault(r, EnumSet.noneOf(Flag.class)); + private void appendContinuationJson(StringBuilder sb, ContinuationRecord r) { + EnumSet failures = continuationFailures.getOrDefault(r, EnumSet.noneOf(Failure.class)); sb.append("{\"seq\":") .append(r.seq) - .append(",\"traceId\":\"") + .append(",\"status\":\"") + .append(r.status()) + .append("\",\"traceId\":\"") .append(r.traceId) .append("\",\"spanId\":") .append(r.spanId) + .append(",\"spanName\":") + .append(jsonString(r.spanName)) .append(",\"source\":\"") .append(r.sourceName()) .append("\",\"orphan\":") .append(r.orphan) - .append(",\"flags\":["); + .append(",\"threadHandoff\":") + .append(r.threadHandoff()) + .append(",\"captureToFirstResumeMs\":") + .append(millisOrNull(r.captureToFirstResumeNanos())) + .append(",\"ageAtTerminalMs\":") + .append(millisOrNull(r.ageAtTerminalNanos())) + .append(",\"scopeSeqs\":") + .append(r.scopeRecordSeqs()) + .append(",\"failures\":["); + appendFailuresJson(sb, failures); + sb.append("],\"capture\":"); + appendEventJson(sb, r.capture()); + sb.append(",\"resumes\":["); + appendEventsJson(sb, r.resumes()); + sb.append("],\"failedActivations\":["); + appendEventsJson(sb, r.failedActivations()); + sb.append("],\"terminal\":"); + appendEventJson(sb, r.terminal()); + sb.append(",\"extraTerminals\":["); + appendEventsJson(sb, r.extraTerminals()); + sb.append("]}"); + } + + private void appendScopeJson(StringBuilder sb, ScopeRecord s) { + EnumSet failures = scopeFailures.getOrDefault(s, EnumSet.noneOf(Failure.class)); + sb.append("{\"seq\":") + .append(s.seq) + .append(",\"closed\":") + .append(s.closed()) + .append(",\"traceId\":\"") + .append(s.traceId) + .append("\",\"spanId\":") + .append(s.spanId) + .append(",\"spanName\":") + .append(jsonString(s.spanName)) + .append(",\"source\":\"") + .append(s.sourceName()) + .append("\",\"continuationSeq\":") + .append(s.continuationSeq) + .append(",\"threadHandoff\":") + .append(s.threadHandoff()) + .append(",\"activeDurationMs\":") + .append(millisOrNull(s.activeDurationNanos())) + .append(",\"failures\":["); + appendFailuresJson(sb, failures); + sb.append("],\"open\":"); + appendEventJson(sb, s.open()); + sb.append(",\"close\":"); + appendEventJson(sb, s.close()); + sb.append("}"); + } + + private static void appendFailuresJson(StringBuilder sb, EnumSet failures) { boolean first = true; - for (Flag f : flags) { + for (Failure f : failures) { if (!first) { sb.append(','); } sb.append('"').append(f).append('"'); first = false; } - sb.append("],\"capture\":"); - appendEventJson(sb, r.capture()); - sb.append(",\"activations\":["); - appendEventsJson(sb, r.activations()); - sb.append("],\"resolutions\":["); - appendEventsJson(sb, r.resolutions()); - sb.append("]}"); } private void appendEventsJson(StringBuilder sb, List events) { @@ -477,8 +701,19 @@ private void appendEventJson(StringBuilder sb, ScopeEvent event) { } private String relMillis(long nanos) { - double ms = (nanos - t0) / 1_000_000.0; - return String.format("%.3f", ms); + return String.format("%.3f", (nanos - t0) / 1_000_000.0); + } + + private static String millis(long nanos) { + return String.format("%.3f", nanos / 1_000_000.0); + } + + private static String millisOrNull(Long nanos) { + return nanos == null ? "null" : millis(nanos); + } + + private static String jsonString(String s) { + return s == null ? "null" : "\"" + jsonEscape(s) + "\""; } private static String jsonEscape(String s) { diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeEvent.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeEvent.java index 5bc6bb203e3..f622f19de9a 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeEvent.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeEvent.java @@ -9,8 +9,16 @@ public final class ScopeEvent { public enum Type { CAPTURE, ACTIVATE, + /** An {@code activate()} that returned the noop scope after the continuation was resolved. */ + ACTIVATE_FAILED, RESOLVE_FINISH, - RESOLVE_CANCEL + RESOLVE_CANCEL, + /** A scope became active (first activation). */ + SCOPE_OPEN, + /** A scope was popped from its thread's stack. */ + SCOPE_CLOSE, + /** A scope was closed while not on top of its thread's stack. */ + SCOPE_CLOSE_WRONG_THREAD } public final Type type; diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeRecord.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeRecord.java new file mode 100644 index 00000000000..1cfc4eb71f0 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeRecord.java @@ -0,0 +1,117 @@ +package datadog.trace.agent.test.scopediag; + +import datadog.trace.api.DDTraceId; +import java.util.ArrayList; +import java.util.EnumSet; +import java.util.List; + +/** + * The scope activation lifecycle: a scope opened (first activation) → closed (popped from + * its thread's stack). Distinct from the continuation lifecycle ({@link ContinuationRecord}); when + * a scope was spawned by resuming a continuation, {@link #continuationSeq} links back to that + * continuation's {@link ContinuationRecord#seq}. + */ +public final class ScopeRecord { + public final long seq; + public final DDTraceId traceId; + public final long spanId; + public final String spanName; + public final byte source; + + /** + * The seq of the continuation that spawned this scope, or {@code null} for a plain activation. + */ + public final Long continuationSeq; + + private final ScopeEvent open; + private ScopeEvent close; + private final List wrongThreadCloses = new ArrayList<>(0); + + ScopeRecord( + long seq, + DDTraceId traceId, + long spanId, + String spanName, + byte source, + Long continuationSeq, + ScopeEvent open) { + this.seq = seq; + this.traceId = traceId; + this.spanId = spanId; + this.spanName = spanName; + this.source = source; + this.continuationSeq = continuationSeq; + this.open = open; + } + + // ---- mutation ------------------------------------------------------------ + + synchronized void setClose(ScopeEvent event) { + if (close == null) { + close = event; + } + } + + synchronized void addWrongThreadClose(ScopeEvent event) { + wrongThreadCloses.add(event); + } + + // ---- accessors ----------------------------------------------------------- + + public synchronized ScopeEvent open() { + return open; + } + + public synchronized ScopeEvent close() { + return close; + } + + public synchronized List wrongThreadCloses() { + return new ArrayList<>(wrongThreadCloses); + } + + public synchronized boolean closed() { + return close != null; + } + + // ---- derived ------------------------------------------------------------- + + /** {@code true} when the scope was opened and closed on different threads. */ + public synchronized boolean threadHandoff() { + return open != null && close != null && !open.threadName.equals(close.threadName); + } + + /** Nanos the scope was active, or {@code null} if it was never closed. */ + public synchronized Long activeDurationNanos() { + if (open == null || close == null) { + return null; + } + return close.nanos - open.nanos; + } + + public synchronized EnumSet failures() { + EnumSet failures = EnumSet.noneOf(Failure.class); + if (open != null && close == null) { + failures.add(Failure.NEVER_CLOSED); + } + if (!wrongThreadCloses.isEmpty()) { + failures.add(Failure.CLOSE_WRONG_THREAD); + } + return failures; + } + + public synchronized long firstNanos() { + long min = open != null ? open.nanos : Long.MAX_VALUE; + if (close != null) { + min = Math.min(min, close.nanos); + } + for (ScopeEvent e : wrongThreadCloses) { + min = Math.min(min, e.nanos); + } + return min; + } + + public String sourceName() { + return ScopeSources.name(source); + } +} diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java index 76cba5d71d8..4250b851bbb 100644 --- a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java @@ -1,14 +1,17 @@ package datadog.trace.agent.test.scopediag; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.lang.reflect.Field; +import java.lang.reflect.Method; import org.junit.jupiter.api.Test; /** - * Guards the {@code CANCELLED} sentinel that {@link ScopeContinuationProbe} duplicates from {@code - * datadog.trace.core.scopemanager.ScopeContinuation}. If the production constant ever changes, the - * probe's resolution detection would silently break — this test fails instead. + * Guards the production names/constants that {@link ScopeContinuationProbe} relies on reflectively + * or mirrors. If any of these are renamed/changed in the tracer, these assertions fail loudly + * instead of the diagnostic silently going dark. */ class ScopeContinuationProbeTest { @@ -24,4 +27,48 @@ void cancelledSentinelMatchesProduction() throws Exception { ScopeContinuationProbe.CANCELLED, "ScopeContinuationProbe.CANCELLED is out of sync with ScopeContinuation.CANCELLED"); } + + @Test + void scopeLifecycleHooksExist() throws Exception { + Class scope = Class.forName("datadog.trace.core.scopemanager.ContinuableScope"); + assertNotNull(scope.getDeclaredMethod("afterActivated"), "afterActivated() (scope open)"); + assertNotNull(scope.getDeclaredMethod("onProperClose"), "onProperClose() (scope close)"); + assertNotNull(scope.getDeclaredMethod("close"), "close() (wrong-thread check)"); + // source byte read reflectively in the probe + assertNotNull(findField(scope, "source"), "ContinuableScope.source"); + + Class continuing = Class.forName("datadog.trace.core.scopemanager.ContinuingScope"); + assertNotNull( + continuing.getDeclaredField("continuation"), "ContinuingScope.continuation (scope link)"); + } + + @Test + void wrongThreadCheckChainExists() throws Exception { + Class scope = Class.forName("datadog.trace.core.scopemanager.ContinuableScope"); + assertNotNull(findField(scope, "scopeManager"), "ContinuableScope.scopeManager"); + Class manager = Class.forName("datadog.trace.core.scopemanager.ContinuableScopeManager"); + assertNotNull(manager.getDeclaredMethod("scopeStack"), "ContinuableScopeManager.scopeStack()"); + Class stack = Class.forName("datadog.trace.core.scopemanager.ScopeStack"); + assertTrue(hasMethod(stack, "checkTop", 1), "ScopeStack.checkTop(scope)"); + } + + private static Field findField(Class cls, String name) { + for (Class c = cls; c != null; c = c.getSuperclass()) { + try { + return c.getDeclaredField(name); + } catch (NoSuchFieldException ignored) { + // keep walking + } + } + return null; + } + + private static boolean hasMethod(Class cls, String name, int params) { + for (Method m : cls.getDeclaredMethods()) { + if (m.getName().equals(name) && m.getParameterCount() == params) { + return true; + } + } + return false; + } } diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java index f3dc112d8ea..748d817077f 100644 --- a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java @@ -2,12 +2,14 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; import datadog.trace.bootstrap.instrumentation.api.AgentScope; import datadog.trace.bootstrap.instrumentation.api.AgentSpan; import datadog.trace.common.writer.ListWriter; import datadog.trace.core.CoreTracer; +import java.util.List; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; @@ -68,4 +70,59 @@ void resolvedContinuationDoesNotLeak() { assertEquals(1, report.records().size()); assertEquals(0, report.leakCount(), "activated then closed continuation is resolved"); } + + @Test + void scopeLifetimeRecordedAndLinkedToContinuation() { + tracer = CoreTracer.builder().writer(new ListWriter()).strictTraceWrites(false).build(); + + ScopeDiagnostics.startRecording(); + + AgentSpan span = tracer.startSpan("test", "op"); + AgentScope.Continuation continuation = tracer.captureSpan(span); + AgentScope scope = continuation.activate(); + scope.close(); + span.finish(); + + ScopeDiagnosticsReport report = ScopeDiagnostics.report(); + + ScopeRecord linked = continuationScope(report); + assertNotNull(linked, "the resumed scope was recorded"); + assertNotNull(linked.open(), "scope open observed"); + assertTrue(linked.closed(), "scope close observed"); + assertEquals(0, report.neverClosedScopeCount()); + // the scope links back to its continuation record + assertEquals(1, report.records().size()); + assertEquals(Long.valueOf(report.records().get(0).seq), linked.continuationSeq); + } + + @Test + void neverClosedScopeIsFlagged() { + tracer = CoreTracer.builder().writer(new ListWriter()).strictTraceWrites(false).build(); + + ScopeDiagnostics.startRecording(); + + AgentSpan span = tracer.startSpan("test", "op"); + AgentScope.Continuation continuation = tracer.captureSpan(span); + AgentScope scope = continuation.activate(); // opened, never closed + + ScopeDiagnosticsReport report = ScopeDiagnostics.report(); + + assertEquals(1, report.neverClosedScopeCount(), "the open scope never closed"); + assertEquals(1, report.leakCount(), "and the continuation it backs also leaks"); + assertTrue(report.hasProblems()); + + // clean up so the open scope does not pollute this thread's scope stack for later tests + scope.close(); + span.finish(); + } + + private static ScopeRecord continuationScope(ScopeDiagnosticsReport report) { + List scopes = report.scopeRecords(); + for (ScopeRecord s : scopes) { + if (s.continuationSeq != null) { + return s; + } + } + return null; + } } diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java index 5303172be58..a2c4bdbb33b 100644 --- a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java @@ -23,20 +23,22 @@ private static ScopeEvent event(ScopeEvent.Type type, String thread, long nanos) private static ContinuationRecord record(long seq, DDTraceId trace) { return new ContinuationRecord( - seq, trace, 7L, (byte) 0, false, event(ScopeEvent.Type.CAPTURE, "main", 1000)); + seq, trace, 7L, "op", (byte) 0, false, event(ScopeEvent.Type.CAPTURE, "main", 1000)); } @Test - void resolvedContinuationHasNoFlags() { + void resolvedContinuationHasNoFailures() { ContinuationRecord r = record(0, DDTraceId.from(10)); - r.addActivation(event(ScopeEvent.Type.ACTIVATE, "pool-1", 2000)); - r.addResolution(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3000)); + r.addResume(event(ScopeEvent.Type.ACTIVATE, "pool-1", 2000)); + r.setTerminalOrExtra(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3000)); ScopeDiagnosticsReport report = report(list(r), map()); assertEquals(0, report.leakCount()); assertEquals(0, report.lateCount()); assertEquals(0, report.doubleCount()); + assertEquals(ContinuationStatus.FINISHED, r.status()); + assertTrue(r.threadHandoff()); // captured on main, resolved on pool-1 assertFalse(report.hasProblems()); } @@ -47,8 +49,9 @@ void neverResolvedIsFlaggedAsLeak() { ScopeDiagnosticsReport report = report(list(r), map()); assertEquals(1, report.leakCount()); + assertEquals(ContinuationStatus.LEAKED, r.status()); assertTrue(report.hasProblems()); - assertTrue(report.renderSummary().contains("LEAK")); + assertTrue(report.renderSummary().contains("LEAKED")); assertTrue(report.renderGantt().contains("Worker.java:42")); } @@ -56,8 +59,8 @@ void neverResolvedIsFlaggedAsLeak() { void resolutionAfterRootWriteIsFlaggedLate() { DDTraceId trace = DDTraceId.from(12); ContinuationRecord r = record(0, trace); - r.addActivation(event(ScopeEvent.Type.ACTIVATE, "pool-1", 5000)); - r.addResolution(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 6000)); + r.addResume(event(ScopeEvent.Type.ACTIVATE, "pool-1", 5000)); + r.setTerminalOrExtra(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 6000)); Map rootWritten = map(); rootWritten.put(trace, 4000L); // root written before the activation/resolution @@ -68,12 +71,27 @@ void resolutionAfterRootWriteIsFlaggedLate() { assertEquals(0, report.leakCount()); // it is resolved, just late } + @Test + void lateFinishDoesNotFail() { + DDTraceId trace = DDTraceId.from(120); + ContinuationRecord r = record(0, trace); + r.addResume(event(ScopeEvent.Type.ACTIVATE, "pool-1", 5000)); + r.setTerminalOrExtra(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 6000)); + Map rootWritten = map(); + rootWritten.put(trace, 4000L); + + ScopeDiagnosticsReport report = report(list(r), rootWritten); + + assertEquals(1, report.lateCount()); + assertFalse(report.hasProblems()); // late-finish is report-only + } + @Test void multipleResolutionsAreFlaggedDouble() { ContinuationRecord r = record(0, DDTraceId.from(13)); - r.addActivation(event(ScopeEvent.Type.ACTIVATE, "pool-1", 2000)); - r.addResolution(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3000)); - r.addResolution(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3500)); + r.addResume(event(ScopeEvent.Type.ACTIVATE, "pool-1", 2000)); + r.setTerminalOrExtra(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3000)); + r.setTerminalOrExtra(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3500)); ScopeDiagnosticsReport report = report(list(r), map()); @@ -82,14 +100,28 @@ void multipleResolutionsAreFlaggedDouble() { } @Test - void activationAfterResolutionIsFlaggedDouble() { + void activationAfterResolveIsFailure() { ContinuationRecord r = record(0, DDTraceId.from(14)); - r.addResolution(event(ScopeEvent.Type.RESOLVE_CANCEL, "pool-1", 2000)); - r.addActivation(event(ScopeEvent.Type.ACTIVATE, "pool-2", 3000)); // activate after cancel + r.setTerminalOrExtra(event(ScopeEvent.Type.RESOLVE_CANCEL, "pool-1", 2000)); + r.addResume(event(ScopeEvent.Type.ACTIVATE, "pool-2", 3000)); // resume after cancel ScopeDiagnosticsReport report = report(list(r), map()); - assertEquals(1, report.doubleCount()); + assertEquals(1, report.activateAfterResolveCount()); + assertEquals(0, report.doubleCount()); + assertTrue(report.hasProblems()); + } + + @Test + void failedActivationIsActivateAfterResolve() { + ContinuationRecord r = record(0, DDTraceId.from(141)); + r.setTerminalOrExtra(event(ScopeEvent.Type.RESOLVE_CANCEL, "pool-1", 2000)); + r.addFailedActivation(event(ScopeEvent.Type.ACTIVATE_FAILED, "pool-2", 3000)); + + ScopeDiagnosticsReport report = report(list(r), map()); + + assertEquals(1, report.activateAfterResolveCount()); + assertTrue(report.hasProblems()); } @Test @@ -100,15 +132,16 @@ void jsonContainsSummaryAndRecords() { String json = report.toJson(); assertTrue(json.contains("\"summary\"")); assertTrue(json.contains("\"leaked\":1")); - assertTrue(json.contains("\"flags\":[\"LEAK\"]")); + assertTrue(json.contains("\"failures\":[\"LEAKED\"]")); + assertTrue(json.contains("\"scopes\":[")); } @Test void mermaidGanttGroupsByThreadAndMarksLeakCrit() { DDTraceId trace = DDTraceId.from(20); ContinuationRecord resolved = record(0, trace); - resolved.addActivation(event(ScopeEvent.Type.ACTIVATE, "pool-1", 2000)); - resolved.addResolution(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3000)); + resolved.addResume(event(ScopeEvent.Type.ACTIVATE, "pool-1", 2000)); + resolved.setTerminalOrExtra(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3000)); ContinuationRecord leak = record(1, trace); // capture only -> leak String mermaid = report(list(resolved, leak), map()).toMermaidGantt(); @@ -128,7 +161,7 @@ void mermaidGanttGroupsByThreadAndMarksLeakCrit() { private static ScopeDiagnosticsReport report( List records, Map rootWritten) { - return new ScopeDiagnosticsReport(records, rootWritten); + return new ScopeDiagnosticsReport(records, new ArrayList<>(), rootWritten); } private static List list(ContinuationRecord... rs) { diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeRecordTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeRecordTest.java new file mode 100644 index 00000000000..eb7746f4854 --- /dev/null +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeRecordTest.java @@ -0,0 +1,97 @@ +package datadog.trace.agent.test.scopediag; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import datadog.trace.api.DDTraceId; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import org.junit.jupiter.api.Test; + +class ScopeRecordTest { + + private static final StackTraceElement[] STACK = { + new StackTraceElement("com.app.Worker", "run", "Worker.java", 7) + }; + + private static ScopeEvent event(ScopeEvent.Type type, String thread, long nanos) { + return new ScopeEvent(type, thread, nanos, STACK); + } + + private static ScopeRecord scope(long seq, Long continuationSeq, String openThread, long nanos) { + return new ScopeRecord( + seq, + DDTraceId.from(1), + 9L, + "op", + (byte) 0, + continuationSeq, + event(ScopeEvent.Type.SCOPE_OPEN, openThread, nanos)); + } + + private static ScopeDiagnosticsReport report(ScopeRecord... scopes) { + List list = new ArrayList<>(); + for (ScopeRecord s : scopes) { + list.add(s); + } + return new ScopeDiagnosticsReport(new ArrayList<>(), list, new HashMap<>()); + } + + @Test + void openAndClosedHasNoFailures() { + ScopeRecord s = scope(0, null, "main", 1000); + s.setClose(event(ScopeEvent.Type.SCOPE_CLOSE, "main", 3000)); + + assertTrue(s.closed()); + assertEquals(0, s.failures().size()); + assertFalse(s.threadHandoff()); + assertEquals(Long.valueOf(2000), s.activeDurationNanos()); + assertFalse(report(s).hasProblems()); + } + + @Test + void openWithoutCloseIsNeverClosed() { + ScopeRecord s = scope(0, null, "main", 1000); + + assertFalse(s.closed()); + assertTrue(s.failures().contains(Failure.NEVER_CLOSED)); + + ScopeDiagnosticsReport report = report(s); + assertEquals(1, report.neverClosedScopeCount()); + assertTrue(report.hasProblems()); // never-closed is a genuine bug + } + + @Test + void openAndCloseOnDifferentThreadsIsHandoff() { + ScopeRecord s = scope(0, null, "main", 1000); + s.setClose(event(ScopeEvent.Type.SCOPE_CLOSE, "pool-1", 2000)); + + assertTrue(s.threadHandoff()); + } + + @Test + void wrongThreadCloseIsReportedButDoesNotFail() { + ScopeRecord s = scope(0, null, "main", 1000); + s.setClose(event(ScopeEvent.Type.SCOPE_CLOSE, "main", 2000)); + s.addWrongThreadClose(event(ScopeEvent.Type.SCOPE_CLOSE_WRONG_THREAD, "pool-2", 1500)); + + assertTrue(s.failures().contains(Failure.CLOSE_WRONG_THREAD)); + + ScopeDiagnosticsReport report = report(s); + assertEquals(1, report.closeWrongThreadCount()); + assertFalse(report.hasProblems()); // wrong-thread is report-only + } + + @Test + void jsonContainsScopeFields() { + ScopeRecord s = scope(3, 1L, "main", 1000); + s.setClose(event(ScopeEvent.Type.SCOPE_CLOSE, "main", 2000)); + + String json = report(s).toJson(); + assertTrue(json.contains("\"scopes\":[")); + assertTrue(json.contains("\"continuationSeq\":1")); + assertTrue(json.contains("\"closed\":true")); + } +} From c71235859dfc52101bc43c3c88817597f7a6b1fb Mon Sep 17 00:00:00 2001 From: Andrea Marziali Date: Tue, 16 Jun 2026 17:14:25 +0200 Subject: [PATCH 6/9] fix wip --- .../netty41/NettyChannelPipelineInstrumentation.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dd-java-agent/instrumentation/netty/netty-4.1/src/main/java/datadog/trace/instrumentation/netty41/NettyChannelPipelineInstrumentation.java b/dd-java-agent/instrumentation/netty/netty-4.1/src/main/java/datadog/trace/instrumentation/netty41/NettyChannelPipelineInstrumentation.java index 7fce2fd12eb..afea2c3eb1f 100644 --- a/dd-java-agent/instrumentation/netty/netty-4.1/src/main/java/datadog/trace/instrumentation/netty41/NettyChannelPipelineInstrumentation.java +++ b/dd-java-agent/instrumentation/netty/netty-4.1/src/main/java/datadog/trace/instrumentation/netty41/NettyChannelPipelineInstrumentation.java @@ -275,7 +275,15 @@ public static void cleanupHttp2ConnectParentContinuation( @Advice.Enter final boolean cleanupHttp2Continuation, @Advice.This final ChannelPipeline pipeline, @Advice.Return final ChannelFuture future) { - if (!cleanupHttp2Continuation) { + // Resolve the connect-parent continuation at connect-complete for HTTP/2 dd connections + // (existing behaviour) and for channels that never get a dd HTTP client tracing handler to + // consume it (e.g. armeria, which runs its own codec/pipeline). Otherwise it is captured and + // never resolved, leaking a pending reference. Channels that DO get a dd client handler keep + // the continuation for the HTTP/1 request write to consume, so they are left untouched here. + final boolean noDdClientHandler = + pipeline.get(HttpClientTracingHandler.class) == null + && pipeline.get(HttpClientRequestTracingHandler.class) == null; + if (!cleanupHttp2Continuation && !noDdClientHandler) { return; } if (future == null) { From 5e514bcaf3c8e5d2f993ddd4c683893b99f7cf43 Mon Sep 17 00:00:00 2001 From: Andrea Marziali Date: Mon, 29 Jun 2026 10:40:30 +0200 Subject: [PATCH 7/9] final --- .../investigate-continuation-leakage/SKILL.md | 119 ++++ .../test/InstrumentationSpecification.groovy | 9 +- .../test/scopediag/ContinuationAdvice.java | 26 +- .../scopediag/ScopeContinuationProbe.java | 20 +- .../test/scopediag/ScopeDiagnostics.java | 71 +-- .../scopediag/ScopeDiagnosticsExtension.java | 19 +- .../scopediag/ScopeDiagnosticsReport.java | 565 +----------------- .../agent/test/scopediag/StackFilter.java | 4 + .../scopediag/TrackScopeContinuations.java | 9 - .../scopediag/ScopeContinuationProbeTest.java | 25 + .../ScopeDiagnosticsIntegrationTest.java | 29 + .../scopediag/ScopeDiagnosticsReportTest.java | 36 +- .../agent/test/scopediag/ScopeRecordTest.java | 11 - .../src/test/groovy/PubSubTest.groovy | 9 - .../NettyChannelPipelineInstrumentation.java | 10 +- ...ope-continuation-leak-diagnostic-design.md | 22 +- 16 files changed, 264 insertions(+), 720 deletions(-) create mode 100644 .claude/skills/investigate-continuation-leakage/SKILL.md diff --git a/.claude/skills/investigate-continuation-leakage/SKILL.md b/.claude/skills/investigate-continuation-leakage/SKILL.md new file mode 100644 index 00000000000..fcec43bd4db --- /dev/null +++ b/.claude/skills/investigate-continuation-leakage/SKILL.md @@ -0,0 +1,119 @@ +--- +name: investigate-continuation-leakage +description: > + Investigate scope-continuation leaks in an instrumentation. Use when asked to "investigate + continuation leakage", "find scope leaks", "why does this integration leak continuations", + "debug a leaked trace / pendingReferenceCount", or when a test needed strictTraceWrites(false) + to pass. Runs the chosen instrumentation test with the scope-continuation diagnostic enabled, + reads the logged problem summary, recaps the findings, and renders a DAG of the leaks. +user-invocable: true +context: fork +allowed-tools: + - Bash + - Read + - Edit + - Glob + - Grep + - AskUserQuestion +--- + +# Investigate scope-continuation leakage + +dd-trace-java moves trace scopes across threads via *continuations*: a scope is **captured** on +one thread (`ScopeContinuation`, bumping `PendingTrace.pendingReferenceCount`) and later +**activated** and/or **cancelled** on another. A continuation that is never resolved (the classic +leak), resolved twice, resolved after its root span was written, or activated after resolve, keeps +a trace alive or drops a late span — and in tests forces `strictTraceWrites(false)`, masking the +bug instead of locating it. + +The test-time diagnostic in `datadog.trace.agent.test.scopediag` records the lifecycle and logs a +problem summary. This skill drives that diagnostic, reads the logged summary, recaps it in plain +language, and renders a diagram. **The Java code no longer renders Gantt/Mermaid — you (the LLM) +produce the diagram from the recorded data.** + +Background: `docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md`. +Test-run conventions: `docs/how_to_test.md`. + +## Step 1 — Select the target + +Identify the suspect instrumentation. If the user named it, resolve the module directory under +`dd-java-agent/instrumentation//-` with Glob. If ambiguous, list +the candidate test classes (Glob `**/src/test/**/*Test.{java,groovy}` in the module) and ask the +user which test to run with `AskUserQuestion`. You want one concrete test class (and ideally one +method) plus its Gradle module path, e.g. `:dd-java-agent:instrumentation:google-pubsub-1.116`. + +## Step 2 — Enable tracking (temporary) + +With `Edit`, add the opt-in annotation to the chosen test class (or a single method): + +- Add import `datadog.trace.agent.test.scopediag.TrackScopeContinuations`. +- Annotate the class/method with `@TrackScopeContinuations`. Leave the default `failOnLeak=false` — + you want the report, not a failing test (a red test would still print the report, but the default + keeps the run green so the build doesn't stop early). + +This works for both JUnit 5 Java tests (extension is auto-registered on +`AbstractInstrumentationTest`) and Groovy `InstrumentationSpecification` subclasses. **Record the +exact file path** — you will revert it in Step 7. + +## Step 3 — Run the test, capturing the diagnostic output + +The diagnostic does **not** write a file; it logs a problem summary at the end of each test. Run +with output captured so you can read that summary: + +```bash +./gradlew :dd-java-agent:instrumentation:-:test --tests '' --info 2>&1 | tee /tmp/scopediag-run.txt +``` + +(For the diagnostic harness's own tests the module is `:dd-java-agent:instrumentation-testing`.) +If the SLF4J line is not visible in console output, read the per-test captured stdout under +`/build/test-results/**/*.xml` (the `` element) or the HTML report under +`/build/reports/tests/`. + +## Step 4 — Collect the diagnostic output + +Find the summary block emitted by `ScopeDiagnosticsReport.renderSummary()` (Grep the captured output +for `Scope/continuation problems`). Its shape: + +``` +Scope/continuation problems (N continuations, M scopes; X leaked, Y late, Z double, + W activate-after-resolve | scopes: P never-closed, Q wrong-thread) + [LEAKED] # trace= src= captured at + [NEVER_CLOSED] scope# trace= src=<...> opened at + ... +``` + +> **Note (current limitation):** the machine-readable JSON feed was removed, so only the +> **problem-only** summary is available — flagged continuations/scopes with their failure set and +> capture/open callsite. The full per-event timeline (every resume/resolve, threads, per-event +> timestamps) is **not** in this output, so a time-accurate Gantt cannot be reconstructed. If a +> Gantt or richer analysis is needed, a structured data feed must be re-enabled in +> `ScopeDiagnosticsReport` first. + +## Step 5 — Summarize ("resume") + +Give a plain-language recap: + +- The header counts (leaked / late / double / activate-after-resolve / never-closed / wrong-thread). +- For each flagged continuation/scope: its failure set, the **capture/open callsite** (cite it as + `file:line`), and the `source`. +- A one-line hypothesis: which instrumentation advice captured the continuation and where it should + have resolved it. + +## Step 6 — Visualize + +Render a **DAG** (the summary lacks the per-event timing a Gantt needs). Mermaid `flowchart LR`: +a node per flagged continuation (`#seq spanName` + failure) and per flagged scope, an edge from each +to its trace/root, and from a scope to its owning continuation when that link is evident. Color +`LEAKED` / `DOUBLE_FINISH` / `NEVER_CLOSED` nodes red, `LATE_FINISH` / `CLOSE_WRONG_THREAD` amber. +Annotate nodes with the callsite. If the user explicitly wants a Gantt, explain the timeline data is +not currently emitted (see the Step 4 note) and offer to re-enable the feed. + +## Step 7 — Revert + +Undo the temporary annotation so the working tree is clean: + +```bash +git checkout -- +``` + +Report: the findings summary, the diagram, and that the annotation was reverted. diff --git a/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy b/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy index fe2c4df5e5e..78b8183531a 100644 --- a/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy +++ b/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy @@ -546,14 +546,7 @@ abstract class InstrumentationSpecification extends DDSpecification implements A } ScopeDiagnostics.stop() def report = ScopeDiagnostics.report() - println(config.gantt() ? report.renderGantt() : report.renderSummary()) - def reportName = specificationContext.currentSpec.name + "." + specificationContext.currentIteration.name - if (config.json()) { - ScopeDiagnostics.writeJson(reportName) - } - if (config.mermaid()) { - println("Mermaid timeline written to " + ScopeDiagnostics.writeMermaid(reportName)) - } + println(report.renderSummary()) try { if (config.failOnLeak()) { ScopeDiagnostics.assertNoLeaks() diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java index 16e0e2e29ff..300650aa4ac 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ContinuationAdvice.java @@ -21,17 +21,32 @@ public static void exit(@Advice.This Object self) { } } - /** {@code activate()} — a (possibly noop) activation; the probe filters the rollback branch. */ + /** + * {@code activate()} — a (possibly noop) activation; the probe filters the rollback branch. + * + *

The activation timestamp is captured at method entry, not exit: the same-span reuse + * optimization ({@code ContinuableScopeManager.continueSpan}) cancels the continuation from + * inside {@code activate()} before it returns, so timestamping the resume at exit would + * order it after that internal resolution and spuriously flag {@code ACTIVATE_AFTER_RESOLVE}. + */ public static final class Activate { + @Advice.OnMethodEnter + public static long enter() { + return System.nanoTime(); + } + @Advice.OnMethodExit(suppress = Throwable.class) - public static void exit(@Advice.This Object self, @Advice.Return Object scope) { - ScopeContinuationProbe.onActivate(self, scope); + public static void exit( + @Advice.This Object self, @Advice.Enter long ddActivateNanos, @Advice.Return Object scope) { + ScopeContinuationProbe.onActivate(self, scope, ddActivateNanos); } } /** * Resolution detected via the {@code count} transition. Applied to both {@code cancel()} and - * {@code cancelFromContinuedScopeClose()} — they need identical before/after observation. + * {@code cancelFromContinuedScopeClose()} — they need identical before/after observation. The + * originating method name ({@code #m}) distinguishes an explicit cancel from a normal + * finish-on-scope-close. * *

The resolve timestamp is captured at method entry (the {@code ddResolveNanos} * local), not at exit: the body itself may call {@code removeContinuation() -> @@ -51,10 +66,11 @@ public static int enter( @Advice.OnMethodExit(suppress = Throwable.class) public static void exit( @Advice.This Object self, + @Advice.Origin("#m") String method, @Advice.Enter int countBefore, @Advice.Local("ddResolveNanos") long ddResolveNanos, @Advice.FieldValue("count") int countAfter) { - ScopeContinuationProbe.onResolve(self, countBefore, countAfter, ddResolveNanos); + ScopeContinuationProbe.onResolve(self, method, countBefore, countAfter, ddResolveNanos); } } } diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java index 8e8d7d3eb06..6bf1de9582b 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbe.java @@ -98,7 +98,7 @@ public static void onCapture(Object self) { * returns the {@link AgentTracer#noopScope() noop scope} singleton, so a returned noop scope is * skipped — this exactly reproduces the original "success branch only" semantics. */ - public static void onActivate(Object self, Object returnedScope) { + public static void onActivate(Object self, Object returnedScope, long activateNanos) { if (!recording) { return; } @@ -113,7 +113,12 @@ public static void onActivate(Object self, Object returnedScope) { AgentSpan span = continuation.span(); if (span != null) { ScopeDiagnostics.recordActivate( - continuation, span.getTraceId(), span.getSpanId(), spanName(span), sourceOf(self)); + continuation, + span.getTraceId(), + span.getSpanId(), + spanName(span), + sourceOf(self), + activateNanos); } } catch (Throwable ignored) { } @@ -125,24 +130,29 @@ public static void onActivate(Object self, Object returnedScope) { * by observing the {@code count} field transition to {@link #CANCELLED} during this call. A * cancel with outstanding activations leaves {@code count} unchanged (not a resolution). */ - public static void onResolve(Object self, int countBefore, int countAfter, long resolveNanos) { + public static void onResolve( + Object self, String method, int countBefore, int countAfter, long resolveNanos) { if (!recording) { return; } if (countAfter != CANCELLED) { return; // not a resolution } + // An explicit cancel() is a discard; cancelFromContinuedScopeClose() is a normal finish once + // the continued scope closes. (Caveat: the rare cancelFromContinuedScopeClose slow path + // delegates to cancel(), so a multi-activation finish is recorded as a cancel.) + boolean cancelled = "cancel".equals(method); try { AgentScope.Continuation continuation = (AgentScope.Continuation) self; if (countBefore == CANCELLED) { // already cancelled before this call: a genuine second finish/cancel (the slow-path // artifact always transitions 1->CANCELLED, never CANCELLED->CANCELLED). Surface it as a // double finish, bypassing the first-resolution dedup. - ScopeDiagnostics.recordResolve(continuation, false, resolveNanos); + ScopeDiagnostics.recordResolve(continuation, cancelled, resolveNanos); } else if (resolved.add(self)) { // first clean transition to CANCELLED; later observations of the SAME transition (the // cancelFromContinuedScopeClose slow path's nested cancel()) are suppressed - ScopeDiagnostics.recordResolve(continuation, false, resolveNanos); + ScopeDiagnostics.recordResolve(continuation, cancelled, resolveNanos); } } catch (Throwable ignored) { } diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java index 131573e7240..9b6f6e2fdb0 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnostics.java @@ -2,11 +2,6 @@ import datadog.trace.api.DDTraceId; import datadog.trace.bootstrap.instrumentation.api.AgentScope; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collections; import java.util.IdentityHashMap; @@ -14,8 +9,6 @@ import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicLong; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /** * Test-time engine that records scope/continuation lifecycle events and renders leak reports. @@ -33,13 +26,12 @@ *

  *   ScopeDiagnostics.startRecording();
  *   ... exercise code under test ...
- *   log.info(ScopeDiagnostics.report().renderGantt());
+ *   System.out.println(ScopeDiagnostics.report().renderSummary());
  *   ScopeDiagnostics.assertNoLeaks();   // optional
  *   ScopeDiagnostics.stop();
  * 
*/ public final class ScopeDiagnostics { - private static final Logger log = LoggerFactory.getLogger(ScopeDiagnostics.class); private static final int DEFAULT_MAX_FRAMES = 6; private static final ScopeDiagnostics INSTANCE = new ScopeDiagnostics(); @@ -109,49 +101,27 @@ public static ScopeDiagnosticsReport report() { public static void assertNoLeaks() { ScopeDiagnosticsReport report = report(); if (report.hasProblems()) { - throw new AssertionError( - "Scope continuation problems detected:\n" - + report.renderSummary() - + "\n" - + report.renderGantt()); + throw new AssertionError("Scope continuation problems detected:\n" + report.renderSummary()); } } - /** Writes the JSON report under {@code build/scope-diagnostics/.json}. */ - public static Path writeJson(String name) { - return write(name, ".json", report().toJson()); - } - - /** Writes the Mermaid Gantt timeline under {@code build/scope-diagnostics/.md}. */ - public static Path writeMermaid(String name) { - return write(name, ".md", report().toMermaidGantt()); - } - - private static Path write(String name, String extension, String content) { - String safe = name.replaceAll("[^A-Za-z0-9._-]", "_"); - Path path = Paths.get("build", "scope-diagnostics", safe + extension); - try { - Files.createDirectories(path.getParent()); - Files.write(path, content.getBytes(StandardCharsets.UTF_8)); - } catch (IOException e) { - log.warn("Failed to write scope diagnostics to {}", path, e); - } - return path; - } - // ---- listener implementation --------------------------------------------- + private static final StackTraceElement[] NO_STACK = new StackTraceElement[0]; + private ScopeEvent event(ScopeEvent.Type type) { return event(type, System.nanoTime()); } /** Builds an event with an explicit timestamp (thread and stack are still captured now). */ private ScopeEvent event(ScopeEvent.Type type, long nanos) { - return new ScopeEvent( - type, - Thread.currentThread().getName(), - nanos, - stackFilter.filter(new Throwable().getStackTrace())); + // Capturing a stack per event is the dominant cost and perturbs the very timings we record; + // skip it entirely when callsites are disabled (maxFrames <= 0) rather than walking then + // trimming. + StackFilter filter = stackFilter; + StackTraceElement[] stack = + filter.maxFrames() <= 0 ? NO_STACK : filter.filter(new Throwable().getStackTrace()); + return new ScopeEvent(type, Thread.currentThread().getName(), nanos, stack); } // ---- static forwarders called by ScopeContinuationProbe ------------------ @@ -162,8 +132,13 @@ static void recordCapture( } static void recordActivate( - AgentScope.Continuation id, DDTraceId traceId, long spanId, String spanName, byte source) { - INSTANCE.listener.onActivate(id, traceId, spanId, spanName, source); + AgentScope.Continuation id, + DDTraceId traceId, + long spanId, + String spanName, + byte source, + long nanos) { + INSTANCE.listener.onActivate(id, traceId, spanId, spanName, source, nanos); } static void recordActivateFailed(AgentScope.Continuation id) { @@ -216,9 +191,15 @@ void onCapture( } void onActivate( - AgentScope.Continuation id, DDTraceId traceId, long spanId, String spanName, byte source) { + AgentScope.Continuation id, + DDTraceId traceId, + long spanId, + String spanName, + byte source, + long nanos) { try { - recordFor(id, traceId, spanId, spanName, source).addResume(event(ScopeEvent.Type.ACTIVATE)); + recordFor(id, traceId, spanId, spanName, source) + .addResume(event(ScopeEvent.Type.ACTIVATE, nanos)); } catch (Throwable ignored) { } } diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java index 65ebcfc325d..b2132aa27aa 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java @@ -14,8 +14,8 @@ * TrackScopeContinuations}. Registered on {@code AbstractInstrumentationTest} but dormant unless * the test class or method carries the annotation, so unannotated tests pay nothing. * - *

Per test: resets and starts recording before; after, renders the Gantt to the log, writes the - * JSON report, and optionally asserts no leaks. + *

Per test: resets and starts recording before; after, logs the problem summary and optionally + * asserts no leaks. */ public final class ScopeDiagnosticsExtension implements BeforeEachCallback, AfterEachCallback { private static final Logger log = LoggerFactory.getLogger(ScopeDiagnosticsExtension.class); @@ -35,20 +35,7 @@ public void afterEach(ExtensionContext context) { } ScopeDiagnostics.stop(); ScopeDiagnosticsReport report = ScopeDiagnostics.report(); - if (config.gantt()) { - log.info("[{}] {}", context.getDisplayName(), report.renderGantt()); - } else { - log.info("[{}] {}", context.getDisplayName(), report.renderSummary()); - } - if (config.json()) { - ScopeDiagnostics.writeJson(context.getUniqueId()); - } - if (config.mermaid()) { - log.info( - "[{}] Mermaid timeline written to {}", - context.getDisplayName(), - ScopeDiagnostics.writeMermaid(context.getUniqueId())); - } + log.info("[{}] {}", context.getDisplayName(), report.renderSummary()); try { if (config.failOnLeak()) { ScopeDiagnostics.assertNoLeaks(); diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java index fadb69435d1..f836f6b9ac3 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java @@ -9,22 +9,20 @@ /** * An immutable snapshot of the recorded continuation and scope lifecycles plus the derived failure - * findings. Renders a text Gantt timeline, a problem-only summary, a Mermaid Gantt, and JSON. + * findings. Exposes a problem-only text summary ({@link #renderSummary()}); richer visualizations + * (Gantt, DAG) are produced by the {@code investigate-continuation-leakage} skill from the recorded + * data rather than rendered here. * *

The two lifecycles are kept separate: {@link ContinuationRecord} (captured → resumed → * finished) and {@link ScopeRecord} (opened → closed). A scope spawned by resuming a continuation * is linked to it ({@link ContinuationRecord#scopeRecordSeqs()} / {@link - * ScopeRecord#continuationSeq}) and rendered nested under it; other scopes render in a separate - * section. + * ScopeRecord#continuationSeq}). */ public final class ScopeDiagnosticsReport { private final List continuations; private final List scopes; - private final Map rootWrittenNanos; - private final long t0; private final Map> continuationFailures; private final Map> scopeFailures; - private final Map scopeBySeq; ScopeDiagnosticsReport( List continuations, @@ -34,25 +32,8 @@ public final class ScopeDiagnosticsReport { this.continuations.sort((a, b) -> Long.compare(a.firstNanos(), b.firstNanos())); this.scopes = new ArrayList<>(scopes); this.scopes.sort((a, b) -> Long.compare(a.firstNanos(), b.firstNanos())); - this.rootWrittenNanos = rootWrittenNanos; - this.t0 = computeT0(this.continuations, this.scopes); this.continuationFailures = classifyContinuations(this.continuations, rootWrittenNanos); this.scopeFailures = classifyScopes(this.scopes); - this.scopeBySeq = new LinkedHashMap<>(); - for (ScopeRecord s : this.scopes) { - scopeBySeq.put(s.seq, s); - } - } - - private static long computeT0(List continuations, List scopes) { - long min = Long.MAX_VALUE; - for (ContinuationRecord r : continuations) { - min = Math.min(min, r.firstNanos()); - } - for (ScopeRecord s : scopes) { - min = Math.min(min, s.firstNanos()); - } - return min == Long.MAX_VALUE ? 0 : min; } private static Map> classifyContinuations( @@ -143,87 +124,7 @@ public boolean hasProblems() { || neverClosedScopeCount() > 0; } - // ---- rendering: text Gantt ----------------------------------------------- - - private static final int GANTT_FRAMES = 3; - - /** Full cross-thread Gantt timeline: one block per continuation, with nested scope bars. */ - public String renderGantt() { - StringBuilder sb = new StringBuilder(); - appendHeader(sb, "Scope/continuation timeline"); - if (continuations.isEmpty() && scopes.isEmpty()) { - sb.append(" (nothing captured)\n"); - return sb.toString(); - } - - for (ContinuationRecord r : continuations) { - EnumSet failures = - continuationFailures.getOrDefault(r, EnumSet.noneOf(Failure.class)); - sb.append("\n#") - .append(r.seq) - .append(' ') - .append(r.status()) - .append(" trace=") - .append(r.traceId) - .append(" span=") - .append(r.spanId); - if (r.spanName != null) { - sb.append(" \"").append(r.spanName).append('"'); - } - sb.append(" src=").append(r.sourceName()); - if (r.orphan) { - sb.append(" [ORPHAN]"); - } - if (r.threadHandoff()) { - sb.append(" [handoff]"); - } - if (!failures.isEmpty()) { - sb.append(' ').append(failures); - } - sb.append(timing(r)).append('\n'); - - appendEvent(sb, "capture ", r.capture()); - for (ScopeEvent a : r.resumes()) { - appendEvent(sb, "resume ", a); - } - for (ScopeEvent f : r.failedActivations()) { - appendEvent(sb, "act-fail", f); - } - ScopeEvent terminal = r.terminal(); - if (terminal != null) { - appendEvent( - sb, - terminal.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel " : "finish ", - terminal); - } - for (ScopeEvent extra : r.extraTerminals()) { - appendEvent(sb, "DOUBLE ", extra); - } - for (long scopeSeq : r.scopeRecordSeqs()) { - ScopeRecord scope = scopeBySeq.get(scopeSeq); - if (scope != null) { - appendScopeLine(sb, " ", scope); - } - } - if (terminal == null) { - sb.append(" LEAKED (never finished or cancelled)\n"); - } - } - - List orphanScopes = new ArrayList<>(); - for (ScopeRecord s : scopes) { - if (s.continuationSeq == null) { - orphanScopes.add(s); - } - } - if (!orphanScopes.isEmpty()) { - sb.append("\nNon-continuation scopes:\n"); - for (ScopeRecord s : orphanScopes) { - appendScopeLine(sb, " ", s); - } - } - return sb.toString(); - } + // ---- rendering: text summary --------------------------------------------- private void appendHeader(StringBuilder sb, String title) { sb.append(title) @@ -246,71 +147,6 @@ private void appendHeader(StringBuilder sb, String title) { .append(" wrong-thread)\n"); } - private String timing(ContinuationRecord r) { - StringBuilder sb = new StringBuilder(); - Long capToResume = r.captureToFirstResumeNanos(); - Long age = r.ageAtTerminalNanos(); - if (capToResume != null) { - sb.append(" cap->resume=").append(millis(capToResume)).append("ms"); - } - if (age != null) { - sb.append(" age=").append(millis(age)).append("ms"); - } - return sb.toString(); - } - - private void appendScopeLine(StringBuilder sb, String indent, ScopeRecord scope) { - EnumSet failures = scopeFailures.getOrDefault(scope, EnumSet.noneOf(Failure.class)); - sb.append(indent).append("scope#").append(scope.seq).append(' ').append(scope.sourceName()); - if (scope.spanName != null) { - sb.append(" \"").append(scope.spanName).append('"'); - } - ScopeEvent open = scope.open(); - ScopeEvent close = scope.close(); - if (open != null) { - sb.append(" open +").append(relMillis(open.nanos)).append("ms @ ").append(open.threadName); - } - if (close != null) { - sb.append(" close +") - .append(relMillis(close.nanos)) - .append("ms @ ") - .append(close.threadName); - Long active = scope.activeDurationNanos(); - if (active != null) { - sb.append(" (active ").append(millis(active)).append("ms)"); - } - } - if (scope.threadHandoff()) { - sb.append(" [handoff]"); - } - if (!failures.isEmpty()) { - sb.append(' ').append(failures); - } - sb.append('\n'); - } - - private void appendEvent(StringBuilder sb, String label, ScopeEvent event) { - if (event == null) { - sb.append(" ").append(label).append(" (not observed)\n"); - return; - } - sb.append(" ") - .append(label) - .append(" +") - .append(relMillis(event.nanos)) - .append("ms @ ") - .append(event.threadName) - .append(" at ") - .append(event.callsite() == null ? "" : event.callsite()) - .append('\n'); - StackTraceElement[] stack = event.stack; - if (stack != null) { - for (int i = 1; i < stack.length && i < GANTT_FRAMES; i++) { - sb.append(" from ").append(stack[i]).append('\n'); - } - } - } - /** Problem-only summary: just the flagged continuations and scopes with their callsites. */ public String renderSummary() { StringBuilder sb = new StringBuilder(); @@ -351,395 +187,4 @@ public String renderSummary() { } return sb.toString(); } - - // ---- rendering: Mermaid Gantt -------------------------------------------- - - /** - * Renders a Mermaid Gantt diagram (wrapped in a ```mermaid fence). Continuations and scopes are - * grouped into swimlanes by thread; a continuation contributes a capture marker in its capture - * lane and a run/finish bar in its resume/terminal lane, and each linked scope contributes an - * open→close bar in its close lane. Leaks/never-closed extend to the end of the window and are - * marked {@code crit}; late/wrong-thread are marked {@code active}. - */ - public String toMermaidGantt() { - StringBuilder sb = new StringBuilder(); - sb.append("```mermaid\n"); - sb.append("gantt\n"); - sb.append(" title Scope/continuations (") - .append(continuations.size()) - .append(" cont, ") - .append(scopes.size()) - .append(" scopes, ") - .append(leakCount()) - .append(" leaked, ") - .append(neverClosedScopeCount()) - .append(" never-closed)\n"); - sb.append(" dateFormat x\n"); - sb.append(" axisFormat %Lms\n"); - if (continuations.isEmpty() && scopes.isEmpty()) { - sb.append(" section none\n nothing captured :done, 0, 1\n```\n"); - return sb.toString(); - } - - long windowEnd = relMillisRounded(maxNanos()); - Map> lanes = new LinkedHashMap<>(); - - for (ContinuationRecord r : continuations) { - EnumSet failures = - continuationFailures.getOrDefault(r, EnumSet.noneOf(Failure.class)); - ScopeEvent capture = r.capture(); - List resumes = r.resumes(); - ScopeEvent terminal = r.terminal(); - boolean leak = failures.contains(Failure.LEAKED); - - if (capture != null) { - long start = relMillisRounded(capture.nanos); - if (leak && resumes.isEmpty()) { - addTask( - lanes, - capture.threadName, - "#" + r.seq + " LEAK cap " + where(capture), - "crit", - start, - Math.max(start + 1, windowEnd)); - } else { - addTask( - lanes, - capture.threadName, - "cap #" + r.seq + " " + where(capture), - "milestone", - start, - start); - } - } - - if (!resumes.isEmpty()) { - ScopeEvent firstResume = resumes.get(0); - long runStart = relMillisRounded(firstResume.nanos); - if (terminal == null) { - addTask( - lanes, - firstResume.threadName, - "#" + r.seq + " LEAK ran " + where(firstResume), - "crit", - runStart, - Math.max(runStart + 1, windowEnd)); - } else { - long runEnd = Math.max(runStart + 1, relMillisRounded(terminal.nanos)); - String verb = terminal.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel" : "fin"; - String tag = continuationTag(failures); - addTask( - lanes, - terminal.threadName, - "#" + r.seq + " " + verb + " " + where(terminal), - tag, - runStart, - runEnd); - } - } else if (terminal != null) { - long start = relMillisRounded(terminal.nanos); - String verb = terminal.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel" : "fin"; - addTask( - lanes, - terminal.threadName, - "#" + r.seq + " " + verb + " " + where(terminal), - "done", - start, - start + 1); - } - } - - for (ScopeRecord s : scopes) { - EnumSet failures = scopeFailures.getOrDefault(s, EnumSet.noneOf(Failure.class)); - ScopeEvent open = s.open(); - ScopeEvent close = s.close(); - if (open == null) { - continue; - } - long start = relMillisRounded(open.nanos); - String label = "scope#" + s.seq + " " + where(open); - if (close == null) { - addTask( - lanes, - open.threadName, - label + " NEVER-CLOSED", - "crit", - start, - Math.max(start + 1, windowEnd)); - } else { - long end = Math.max(start + 1, relMillisRounded(close.nanos)); - String tag = failures.contains(Failure.CLOSE_WRONG_THREAD) ? "active" : "done"; - addTask(lanes, close.threadName, label, tag, start, end); - } - } - - for (Map.Entry> lane : lanes.entrySet()) { - sb.append(" section ").append(sanitize(lane.getKey())).append('\n'); - for (String task : lane.getValue()) { - sb.append(" ").append(task).append('\n'); - } - } - sb.append("```\n"); - return sb.toString(); - } - - private static String continuationTag(EnumSet failures) { - if (failures.contains(Failure.DOUBLE_FINISH) - || failures.contains(Failure.ACTIVATE_AFTER_RESOLVE)) { - return "crit"; - } - if (failures.contains(Failure.LATE_FINISH)) { - return "active"; - } - return "done"; - } - - private void addTask( - Map> lanes, - String thread, - String label, - String tag, - long start, - long end) { - lanes - .computeIfAbsent(thread, k -> new ArrayList<>()) - .add(sanitize(label) + " :" + tag + ", " + start + ", " + end); - } - - private static String where(ScopeEvent event) { - StackTraceElement frame = event == null ? null : event.callsite(); - if (frame == null) { - return "?"; - } - String className = frame.getClassName(); - int dot = className.lastIndexOf('.'); - String simple = dot >= 0 ? className.substring(dot + 1) : className; - return simple + "." + frame.getMethodName(); - } - - private long maxNanos() { - long max = t0; - for (ContinuationRecord r : continuations) { - max = Math.max(max, lastNanos(r)); - } - for (ScopeRecord s : scopes) { - ScopeEvent open = s.open(); - ScopeEvent close = s.close(); - if (open != null) { - max = Math.max(max, open.nanos); - } - if (close != null) { - max = Math.max(max, close.nanos); - } - } - return max; - } - - private static long lastNanos(ContinuationRecord r) { - long max = r.capture() != null ? r.capture().nanos : Long.MIN_VALUE; - for (ScopeEvent e : r.resumes()) { - max = Math.max(max, e.nanos); - } - if (r.terminal() != null) { - max = Math.max(max, r.terminal().nanos); - } - for (ScopeEvent e : r.extraTerminals()) { - max = Math.max(max, e.nanos); - } - return max == Long.MIN_VALUE ? 0 : max; - } - - private long relMillisRounded(long nanos) { - return Math.round((nanos - t0) / 1_000_000.0); - } - - private static String sanitize(String s) { - return s.replace(':', ' ').replace(',', ' ').replace(';', ' ').replace('\n', ' '); - } - - // ---- rendering: JSON ----------------------------------------------------- - - /** Machine-readable view for CI artifacts and later tooling. */ - public String toJson() { - StringBuilder sb = new StringBuilder(); - sb.append("{\"continuations\":["); - for (int i = 0; i < continuations.size(); i++) { - if (i > 0) { - sb.append(','); - } - appendContinuationJson(sb, continuations.get(i)); - } - sb.append("],\"scopes\":["); - for (int i = 0; i < scopes.size(); i++) { - if (i > 0) { - sb.append(','); - } - appendScopeJson(sb, scopes.get(i)); - } - sb.append("],\"summary\":{\"continuations\":") - .append(continuations.size()) - .append(",\"scopes\":") - .append(scopes.size()) - .append(",\"leaked\":") - .append(leakCount()) - .append(",\"lateFinish\":") - .append(lateCount()) - .append(",\"doubleFinish\":") - .append(doubleCount()) - .append(",\"activateAfterResolve\":") - .append(activateAfterResolveCount()) - .append(",\"neverClosedScopes\":") - .append(neverClosedScopeCount()) - .append(",\"closeWrongThread\":") - .append(closeWrongThreadCount()) - .append("}}"); - return sb.toString(); - } - - private void appendContinuationJson(StringBuilder sb, ContinuationRecord r) { - EnumSet failures = continuationFailures.getOrDefault(r, EnumSet.noneOf(Failure.class)); - sb.append("{\"seq\":") - .append(r.seq) - .append(",\"status\":\"") - .append(r.status()) - .append("\",\"traceId\":\"") - .append(r.traceId) - .append("\",\"spanId\":") - .append(r.spanId) - .append(",\"spanName\":") - .append(jsonString(r.spanName)) - .append(",\"source\":\"") - .append(r.sourceName()) - .append("\",\"orphan\":") - .append(r.orphan) - .append(",\"threadHandoff\":") - .append(r.threadHandoff()) - .append(",\"captureToFirstResumeMs\":") - .append(millisOrNull(r.captureToFirstResumeNanos())) - .append(",\"ageAtTerminalMs\":") - .append(millisOrNull(r.ageAtTerminalNanos())) - .append(",\"scopeSeqs\":") - .append(r.scopeRecordSeqs()) - .append(",\"failures\":["); - appendFailuresJson(sb, failures); - sb.append("],\"capture\":"); - appendEventJson(sb, r.capture()); - sb.append(",\"resumes\":["); - appendEventsJson(sb, r.resumes()); - sb.append("],\"failedActivations\":["); - appendEventsJson(sb, r.failedActivations()); - sb.append("],\"terminal\":"); - appendEventJson(sb, r.terminal()); - sb.append(",\"extraTerminals\":["); - appendEventsJson(sb, r.extraTerminals()); - sb.append("]}"); - } - - private void appendScopeJson(StringBuilder sb, ScopeRecord s) { - EnumSet failures = scopeFailures.getOrDefault(s, EnumSet.noneOf(Failure.class)); - sb.append("{\"seq\":") - .append(s.seq) - .append(",\"closed\":") - .append(s.closed()) - .append(",\"traceId\":\"") - .append(s.traceId) - .append("\",\"spanId\":") - .append(s.spanId) - .append(",\"spanName\":") - .append(jsonString(s.spanName)) - .append(",\"source\":\"") - .append(s.sourceName()) - .append("\",\"continuationSeq\":") - .append(s.continuationSeq) - .append(",\"threadHandoff\":") - .append(s.threadHandoff()) - .append(",\"activeDurationMs\":") - .append(millisOrNull(s.activeDurationNanos())) - .append(",\"failures\":["); - appendFailuresJson(sb, failures); - sb.append("],\"open\":"); - appendEventJson(sb, s.open()); - sb.append(",\"close\":"); - appendEventJson(sb, s.close()); - sb.append("}"); - } - - private static void appendFailuresJson(StringBuilder sb, EnumSet failures) { - boolean first = true; - for (Failure f : failures) { - if (!first) { - sb.append(','); - } - sb.append('"').append(f).append('"'); - first = false; - } - } - - private void appendEventsJson(StringBuilder sb, List events) { - for (int i = 0; i < events.size(); i++) { - if (i > 0) { - sb.append(','); - } - appendEventJson(sb, events.get(i)); - } - } - - private void appendEventJson(StringBuilder sb, ScopeEvent event) { - if (event == null) { - sb.append("null"); - return; - } - sb.append("{\"type\":\"") - .append(event.type) - .append("\",\"thread\":\"") - .append(jsonEscape(event.threadName)) - .append("\",\"relMillis\":") - .append(relMillis(event.nanos)) - .append(",\"callsite\":\"") - .append(jsonEscape(event.callsite() == null ? "" : event.callsite().toString())) - .append("\"}"); - } - - private String relMillis(long nanos) { - return String.format("%.3f", (nanos - t0) / 1_000_000.0); - } - - private static String millis(long nanos) { - return String.format("%.3f", nanos / 1_000_000.0); - } - - private static String millisOrNull(Long nanos) { - return nanos == null ? "null" : millis(nanos); - } - - private static String jsonString(String s) { - return s == null ? "null" : "\"" + jsonEscape(s) + "\""; - } - - private static String jsonEscape(String s) { - StringBuilder out = new StringBuilder(s.length() + 8); - for (int i = 0; i < s.length(); i++) { - char c = s.charAt(i); - switch (c) { - case '"': - out.append("\\\""); - break; - case '\\': - out.append("\\\\"); - break; - case '\n': - out.append("\\n"); - break; - case '\r': - out.append("\\r"); - break; - case '\t': - out.append("\\t"); - break; - default: - out.append(c); - } - } - return out.toString(); - } } diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/StackFilter.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/StackFilter.java index 17b53169061..8b360d38fa0 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/StackFilter.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/StackFilter.java @@ -42,6 +42,10 @@ final class StackFilter { this.maxFrames = maxFrames; } + int maxFrames() { + return maxFrames; + } + StackTraceElement[] filter(StackTraceElement[] raw) { if (raw == null) { return new StackTraceElement[0]; diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/TrackScopeContinuations.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/TrackScopeContinuations.java index e7301e92ef4..86450ca859f 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/TrackScopeContinuations.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/TrackScopeContinuations.java @@ -19,13 +19,4 @@ public @interface TrackScopeContinuations { /** Fail the test if a never-resolved leak or double/invalid resolution is detected. */ boolean failOnLeak() default false; - - /** Render the full text Gantt timeline to the log after the test. */ - boolean gantt() default true; - - /** Write the JSON report under {@code build/scope-diagnostics/}. */ - boolean json() default true; - - /** Write a Mermaid Gantt timeline ({@code .md}) under {@code build/scope-diagnostics/}. */ - boolean mermaid() default true; } diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java index 4250b851bbb..73e34d755d8 100644 --- a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeContinuationProbeTest.java @@ -28,6 +28,31 @@ void cancelledSentinelMatchesProduction() throws Exception { "ScopeContinuationProbe.CANCELLED is out of sync with ScopeContinuation.CANCELLED"); } + @Test + void continuationHooksExist() throws Exception { + Class scopeContinuation = Class.forName("datadog.trace.core.scopemanager.ScopeContinuation"); + // methods woven by ScopeContinuationTransformer (matched by name) + assertNotNull(scopeContinuation.getDeclaredMethod("register"), "register() (capture)"); + assertNotNull(scopeContinuation.getDeclaredMethod("activate"), "activate() (resume)"); + assertNotNull(scopeContinuation.getDeclaredMethod("cancel"), "cancel() (resolve)"); + assertNotNull( + scopeContinuation.getDeclaredMethod("cancelFromContinuedScopeClose"), + "cancelFromContinuedScopeClose() (resolve)"); + // fields read by the Cancel advice (@Advice.FieldValue) and the probe (reflection) + assertNotNull(findField(scopeContinuation, "count"), "ScopeContinuation.count"); + assertNotNull(findField(scopeContinuation, "source"), "ScopeContinuation.source"); + } + + @Test + void rootWrittenHookExists() throws Exception { + Class pendingTrace = Class.forName("datadog.trace.core.PendingTrace"); + // PendingTraceAdvice matches write(boolean) and reads these fields via @Advice.FieldValue + assertNotNull( + pendingTrace.getDeclaredMethod("write", boolean.class), "PendingTrace.write(boolean)"); + assertNotNull(findField(pendingTrace, "rootSpanWritten"), "PendingTrace.rootSpanWritten"); + assertNotNull(findField(pendingTrace, "traceId"), "PendingTrace.traceId"); + } + @Test void scopeLifecycleHooksExist() throws Exception { Class scope = Class.forName("datadog.trace.core.scopemanager.ContinuableScope"); diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java index 748d817077f..2c6a9df8ded 100644 --- a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsIntegrationTest.java @@ -53,6 +53,35 @@ void capturesRealLeakAndResolvedContinuation() { span.finish(); } + @Test + void sameSpanReactivationIsNotFlaggedActivateAfterResolve() { + tracer = CoreTracer.builder().writer(new ListWriter()).strictTraceWrites(false).build(); + + ScopeDiagnostics.startRecording(); + + AgentSpan span = tracer.startSpan("test", "op"); + AgentScope active = tracer.activateSpan(span); // span becomes the active top scope + // Capturing then immediately activating the already-active span hits the continueSpan reuse + // optimization: it cancels the continuation from inside activate() before activate() returns. + // The resume must be timestamped at activate() entry (not exit) so it does not appear to occur + // after that internal resolution and spuriously trip ACTIVATE_AFTER_RESOLVE. + AgentScope.Continuation continuation = tracer.captureSpan(span); + AgentScope reused = continuation.activate(); + reused.close(); + active.close(); + span.finish(); + + ScopeDiagnosticsReport report = ScopeDiagnostics.report(); + + assertEquals(1, report.records().size()); + assertEquals( + 0, + report.activateAfterResolveCount(), + "a same-span re-activation resolved during activate() is not activate-after-resolve"); + assertEquals(0, report.leakCount()); + assertFalse(report.hasProblems()); + } + @Test void resolvedContinuationDoesNotLeak() { tracer = CoreTracer.builder().writer(new ListWriter()).strictTraceWrites(false).build(); diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java index a2c4bdbb33b..094c0c3e471 100644 --- a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java @@ -52,7 +52,8 @@ void neverResolvedIsFlaggedAsLeak() { assertEquals(ContinuationStatus.LEAKED, r.status()); assertTrue(report.hasProblems()); assertTrue(report.renderSummary().contains("LEAKED")); - assertTrue(report.renderGantt().contains("Worker.java:42")); + // the capture callsite is surfaced in the problem summary + assertTrue(report.renderSummary().contains("Worker.java:42")); } @Test @@ -124,39 +125,6 @@ void failedActivationIsActivateAfterResolve() { assertTrue(report.hasProblems()); } - @Test - void jsonContainsSummaryAndRecords() { - ContinuationRecord r = record(0, DDTraceId.from(15)); - ScopeDiagnosticsReport report = report(list(r), map()); - - String json = report.toJson(); - assertTrue(json.contains("\"summary\"")); - assertTrue(json.contains("\"leaked\":1")); - assertTrue(json.contains("\"failures\":[\"LEAKED\"]")); - assertTrue(json.contains("\"scopes\":[")); - } - - @Test - void mermaidGanttGroupsByThreadAndMarksLeakCrit() { - DDTraceId trace = DDTraceId.from(20); - ContinuationRecord resolved = record(0, trace); - resolved.addResume(event(ScopeEvent.Type.ACTIVATE, "pool-1", 2000)); - resolved.setTerminalOrExtra(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3000)); - ContinuationRecord leak = record(1, trace); // capture only -> leak - - String mermaid = report(list(resolved, leak), map()).toMermaidGantt(); - - assertTrue(mermaid.startsWith("```mermaid")); - assertTrue(mermaid.contains("gantt")); - assertTrue(mermaid.contains("dateFormat x")); - assertTrue(mermaid.contains("section main")); // capture lane = capture thread - assertTrue(mermaid.contains("section pool-1")); // run lane = resolution thread - assertTrue(mermaid.contains("cap #0 Worker.submit :milestone")); // where captured - assertTrue(mermaid.contains("#0 fin Worker.submit :done")); // where finished - assertTrue(mermaid.contains("#1 LEAK cap Worker.submit :crit")); // leak in capture lane - assertTrue(mermaid.trim().endsWith("```")); - } - // ---- helpers ------------------------------------------------------------- private static ScopeDiagnosticsReport report( diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeRecordTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeRecordTest.java index eb7746f4854..7ad0112a432 100644 --- a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeRecordTest.java +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeRecordTest.java @@ -83,15 +83,4 @@ void wrongThreadCloseIsReportedButDoesNotFail() { assertEquals(1, report.closeWrongThreadCount()); assertFalse(report.hasProblems()); // wrong-thread is report-only } - - @Test - void jsonContainsScopeFields() { - ScopeRecord s = scope(3, 1L, "main", 1000); - s.setClose(event(ScopeEvent.Type.SCOPE_CLOSE, "main", 2000)); - - String json = report(s).toJson(); - assertTrue(json.contains("\"scopes\":[")); - assertTrue(json.contains("\"continuationSeq\":1")); - assertTrue(json.contains("\"closed\":true")); - } } diff --git a/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy b/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy index 5443ac22c4c..09892853956 100644 --- a/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy +++ b/dd-java-agent/instrumentation/google-pubsub-1.116/src/test/groovy/PubSubTest.groovy @@ -21,7 +21,6 @@ import com.google.pubsub.v1.SubscriptionName import com.google.pubsub.v1.TopicName import datadog.trace.agent.test.asserts.TraceAssert import datadog.trace.agent.test.naming.VersionedNamingTestBase -import datadog.trace.agent.test.scopediag.TrackScopeContinuations import datadog.trace.agent.test.utils.TraceUtils import datadog.trace.api.DDSpanTypes import datadog.trace.api.DDTags @@ -40,9 +39,6 @@ import org.testcontainers.containers.PubSubEmulatorContainer import org.testcontainers.utility.DockerImageName import spock.lang.Shared -import java.nio.charset.StandardCharsets - -@TrackScopeContinuations(failOnLeak = true) abstract class PubSubTest extends VersionedNamingTestBase { private static final String PROJECT_ID = "dd-trace-java" @@ -71,11 +67,6 @@ abstract class PubSubTest extends VersionedNamingTestBase { null } - @Override - boolean useStrictTraceWrites() { - true - } - boolean shadowGrpcSpans() { true } diff --git a/dd-java-agent/instrumentation/netty/netty-4.1/src/main/java/datadog/trace/instrumentation/netty41/NettyChannelPipelineInstrumentation.java b/dd-java-agent/instrumentation/netty/netty-4.1/src/main/java/datadog/trace/instrumentation/netty41/NettyChannelPipelineInstrumentation.java index afea2c3eb1f..7fce2fd12eb 100644 --- a/dd-java-agent/instrumentation/netty/netty-4.1/src/main/java/datadog/trace/instrumentation/netty41/NettyChannelPipelineInstrumentation.java +++ b/dd-java-agent/instrumentation/netty/netty-4.1/src/main/java/datadog/trace/instrumentation/netty41/NettyChannelPipelineInstrumentation.java @@ -275,15 +275,7 @@ public static void cleanupHttp2ConnectParentContinuation( @Advice.Enter final boolean cleanupHttp2Continuation, @Advice.This final ChannelPipeline pipeline, @Advice.Return final ChannelFuture future) { - // Resolve the connect-parent continuation at connect-complete for HTTP/2 dd connections - // (existing behaviour) and for channels that never get a dd HTTP client tracing handler to - // consume it (e.g. armeria, which runs its own codec/pipeline). Otherwise it is captured and - // never resolved, leaking a pending reference. Channels that DO get a dd client handler keep - // the continuation for the HTTP/1 request write to consume, so they are left untouched here. - final boolean noDdClientHandler = - pipeline.get(HttpClientTracingHandler.class) == null - && pipeline.get(HttpClientRequestTracingHandler.class) == null; - if (!cleanupHttp2Continuation && !noDdClientHandler) { + if (!cleanupHttp2Continuation) { return; } if (future == null) { diff --git a/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md b/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md index 3149bc249ae..560bdb3f3fe 100644 --- a/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md +++ b/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md @@ -117,14 +117,18 @@ never-resolved, late-after-root, double/invalid, and the full cross-thread timel ### 3. Renderers -- **Text Gantt → SLF4J logger** (default output while recording): one row per continuation — - seq, trace/span, source, `capture thread@callsite`, `activate +Δms thread@callsite`, - `resolve(type) +Δms`, and a flag column (`LEAK` / `LATE` / `DBL`). Times are relative to the - first event in the window. +> **Addendum (2026-06):** The in-Java Gantt renderers (text `renderGantt()` and Mermaid +> `toMermaidGantt()`) were removed, and subsequently the **JSON output was dropped too** (its only +> consumer was an LLM, which reads structured text fine, and the hand-rolled serializer was a +> liability). Java now emits **only the leak-only text summary** (`renderSummary()`), logged at the +> end of each tracked test. The `investigate-continuation-leakage` skill reads that summary and +> renders a DAG of the flagged leaks. Note: the summary is problem-only and carries no per-event +> timeline, so a time-accurate Gantt requires re-enabling a structured feed. The +> `@TrackScopeContinuations` attributes were reduced to just `failOnLeak` (default `false`); +> `gantt()`/`mermaid()`/`json()` were all removed. + - **Leak-only summary**: only the flagged problems, each with its callsite(s) — a quick scan - / failure message. -- **JSON → `build/scope-diagnostics/.json`**: events + derived findings, for CI - artifacts and later tooling. + / failure message, and the sole output consumed by the skill. ### 4. Harness integration (passive by default — no impact on existing tests) @@ -139,8 +143,8 @@ never-resolved, late-after-root, double/invalid, and the full cross-thread timel `@TrackScopeContinuations` annotation (and/or an overridable `trackScopeContinuations()` returning `false` by default), reusing the same static `ScopeDiagnostics` engine. -`@TrackScopeContinuations` attributes: `failOnLeak` (default `false`), -`json` (default `true`), `gantt` (default `true`). +`@TrackScopeContinuations` attributes: `failOnLeak` (default `false`). (The `gantt`/`mermaid`/`json` +attributes were removed — see the Renderers addendum.) ## Data flow From 5bfaa6ca7ca78c4033152d1feb3719f7d8e49674 Mon Sep 17 00:00:00 2001 From: Andrea Marziali Date: Mon, 29 Jun 2026 14:11:13 +0200 Subject: [PATCH 8/9] more report --- .../investigate-continuation-leakage/SKILL.md | 80 +++++--- .../test/InstrumentationSpecification.groovy | 3 +- .../scopediag/ScopeDiagnosticsExtension.java | 6 +- .../scopediag/ScopeDiagnosticsReport.java | 188 +++++++++++++++++- .../scopediag/ScopeDiagnosticsReportTest.java | 22 ++ ...ope-continuation-leak-diagnostic-design.md | 27 ++- 6 files changed, 278 insertions(+), 48 deletions(-) diff --git a/.claude/skills/investigate-continuation-leakage/SKILL.md b/.claude/skills/investigate-continuation-leakage/SKILL.md index fcec43bd4db..5123b48d441 100644 --- a/.claude/skills/investigate-continuation-leakage/SKILL.md +++ b/.claude/skills/investigate-continuation-leakage/SKILL.md @@ -5,7 +5,8 @@ description: > continuation leakage", "find scope leaks", "why does this integration leak continuations", "debug a leaked trace / pendingReferenceCount", or when a test needed strictTraceWrites(false) to pass. Runs the chosen instrumentation test with the scope-continuation diagnostic enabled, - reads the logged problem summary, recaps the findings, and renders a DAG of the leaks. + reads the logged full timeline, recaps the findings, and renders a Gantt or DAG (works whether + or not anything leaked). user-invocable: true context: fork allowed-tools: @@ -27,9 +28,10 @@ a trace alive or drops a late span — and in tests forces `strictTraceWrites(fa bug instead of locating it. The test-time diagnostic in `datadog.trace.agent.test.scopediag` records the lifecycle and logs a -problem summary. This skill drives that diagnostic, reads the logged summary, recaps it in plain -language, and renders a diagram. **The Java code no longer renders Gantt/Mermaid — you (the LLM) -produce the diagram from the recorded data.** +full timeline of every continuation and scope (regardless of whether anything leaked). This skill +drives that diagnostic, reads the logged timeline, recaps it in plain language, and renders a +diagram. **The Java code no longer renders Gantt/Mermaid — you (the LLM) produce the diagram from +the timeline.** Background: `docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md`. Test-run conventions: `docs/how_to_test.md`. @@ -57,8 +59,10 @@ exact file path** — you will revert it in Step 7. ## Step 3 — Run the test, capturing the diagnostic output -The diagnostic does **not** write a file; it logs a problem summary at the end of each test. Run -with output captured so you can read that summary: +The diagnostic does **not** write a file; at the end of every tracked test it logs the **full +timeline** (`ScopeDiagnosticsReport.renderTimeline()`) — every continuation and scope with its +events, threads, relative timing, and callsites, regardless of whether anything leaked. Run with +output captured: ```bash ./gradlew :dd-java-agent:instrumentation:-:test --tests '' --info 2>&1 | tee /tmp/scopediag-run.txt @@ -71,42 +75,56 @@ If the SLF4J line is not visible in console output, read the per-test captured s ## Step 4 — Collect the diagnostic output -Find the summary block emitted by `ScopeDiagnosticsReport.renderSummary()` (Grep the captured output -for `Scope/continuation problems`). Its shape: +Grep the captured output for `Scope/continuation timeline` — one block per test. Shape: ``` -Scope/continuation problems (N continuations, M scopes; X leaked, Y late, Z double, +Scope/continuation timeline (N continuations, M scopes; X leaked, Y late, Z double, W activate-after-resolve | scopes: P never-closed, Q wrong-thread) - [LEAKED] # trace= src= captured at - [NEVER_CLOSED] scope# trace= src=<...> opened at - ... + +# trace= span= "" src= [ORPHAN] [handoff] {failures} cap->resume= age= + capture +<Δms> @ at + resume +<Δms> @ at <...> + finish +<Δms> @ at <...> (or cancel / DOUBLE / act-fail) + scope# "" open +<Δms> @ close +<Δms> @ (active ) [handoff] {failures} + LEAKED (never finished or cancelled) (only when unresolved) +... +Non-continuation scopes: + scope# ... ``` -> **Note (current limitation):** the machine-readable JSON feed was removed, so only the -> **problem-only** summary is available — flagged continuations/scopes with their failure set and -> capture/open callsite. The full per-event timeline (every resume/resolve, threads, per-event -> timestamps) is **not** in this output, so a time-accurate Gantt cannot be reconstructed. If a -> Gantt or richer analysis is needed, a structured data feed must be re-enabled in -> `ScopeDiagnosticsReport` first. +Every record is listed (not just flagged ones), so you can reconstruct the full graph whether or not +anything leaked. `+Δms` is relative to the first recorded event. (`renderSummary()` — the +problem-only view — still backs `assertNoLeaks` failure messages, but the timeline is the feed.) ## Step 5 — Summarize ("resume") Give a plain-language recap: - The header counts (leaked / late / double / activate-after-resolve / never-closed / wrong-thread). -- For each flagged continuation/scope: its failure set, the **capture/open callsite** (cite it as - `file:line`), and the `source`. -- A one-line hypothesis: which instrumentation advice captured the continuation and where it should - have resolved it. - -## Step 6 — Visualize - -Render a **DAG** (the summary lacks the per-event timing a Gantt needs). Mermaid `flowchart LR`: -a node per flagged continuation (`#seq spanName` + failure) and per flagged scope, an edge from each -to its trace/root, and from a scope to its owning continuation when that link is evident. Color -`LEAKED` / `DOUBLE_FINISH` / `NEVER_CLOSED` nodes red, `LATE_FINISH` / `CLOSE_WRONG_THREAD` amber. -Annotate nodes with the callsite. If the user explicitly wants a Gantt, explain the timeline data is -not currently emitted (see the Step 4 note) and offer to re-enable the feed. +- The dominant flow: where continuations are captured (callsite/thread) and where they're resumed / + resolved (thread), plus any thread handoffs. +- For each flagged record (if any): its failure set and capture/open callsite (cite `file:line`). +- A one-line hypothesis when there's a problem: which advice captured the continuation and where it + should have resolved it. + +## Step 6 — Visualize (auto-pick, user may override) + +Build the diagram from the **timeline** (works whether or not there are leaks): + +- **Gantt** — when the signal is **temporal / cross-thread** (thread handoffs, late-after-root, + never-closed, or the user wants the time view). Mermaid `gantt`, one `section` per thread; a bar + per continuation from capture→resolve and per scope from open→close using the `+Δms` offsets. Mark + leaks / never-closed `crit` to the window end; late / wrong-thread `active`; resolved-on-time + `done`; capture-only points as `milestone`. +- **DAG** — when the signal is **structural / ownership** (orphans, double-finish, + activate-after-resolve, or continuation→scope lineage). Mermaid `flowchart LR`: a node per + continuation (`#seq spanName`), its spawned scopes (linked via the nested `scope#` lines), edges + capture→resume→resolve labelled with thread + `+Δms`. Color leaked / double red, late amber, + resolved green. + +If there are no problems, the diagram simply shows the healthy capture→continue→resolve flow (all +green) — that is the expected "regardless of leak" output. If unsure which shape, ask with +`AskUserQuestion`. ## Step 7 — Revert diff --git a/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy b/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy index 78b8183531a..546aae12f7b 100644 --- a/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy +++ b/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy @@ -546,7 +546,8 @@ abstract class InstrumentationSpecification extends DDSpecification implements A } ScopeDiagnostics.stop() def report = ScopeDiagnostics.report() - println(report.renderSummary()) + // Always dump the full timeline so a graph/report can be built regardless of leaks. + println(report.renderTimeline()) try { if (config.failOnLeak()) { ScopeDiagnostics.assertNoLeaks() diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java index b2132aa27aa..1351ca02fb9 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsExtension.java @@ -14,7 +14,7 @@ * TrackScopeContinuations}. Registered on {@code AbstractInstrumentationTest} but dormant unless * the test class or method carries the annotation, so unannotated tests pay nothing. * - *

Per test: resets and starts recording before; after, logs the problem summary and optionally + *

Per test: resets and starts recording before; after, logs the full timeline and optionally * asserts no leaks. */ public final class ScopeDiagnosticsExtension implements BeforeEachCallback, AfterEachCallback { @@ -35,7 +35,9 @@ public void afterEach(ExtensionContext context) { } ScopeDiagnostics.stop(); ScopeDiagnosticsReport report = ScopeDiagnostics.report(); - log.info("[{}] {}", context.getDisplayName(), report.renderSummary()); + // Always dump the full timeline so a graph/report can be built regardless of whether anything + // leaked; renderSummary() remains the concise problem-only view used by assertNoLeaks. + log.info("[{}] {}", context.getDisplayName(), report.renderTimeline()); try { if (config.failOnLeak()) { ScopeDiagnostics.assertNoLeaks(); diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java index f836f6b9ac3..ac6dab350a3 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReport.java @@ -9,20 +9,23 @@ /** * An immutable snapshot of the recorded continuation and scope lifecycles plus the derived failure - * findings. Exposes a problem-only text summary ({@link #renderSummary()}); richer visualizations - * (Gantt, DAG) are produced by the {@code investigate-continuation-leakage} skill from the recorded - * data rather than rendered here. + * findings. Exposes two text renderings: a problem-only summary ({@link #renderSummary()}) for + * quick triage / assertion messages, and a complete timeline ({@link #renderTimeline()}) that dumps + * every continuation and scope with its full event lineage — emitted on every tracked test + * regardless of whether anything leaked, so a graph or report can always be produced from it. * *

The two lifecycles are kept separate: {@link ContinuationRecord} (captured → resumed → * finished) and {@link ScopeRecord} (opened → closed). A scope spawned by resuming a continuation * is linked to it ({@link ContinuationRecord#scopeRecordSeqs()} / {@link - * ScopeRecord#continuationSeq}). + * ScopeRecord#continuationSeq}) and rendered nested under it in the timeline. */ public final class ScopeDiagnosticsReport { private final List continuations; private final List scopes; + private final long t0; private final Map> continuationFailures; private final Map> scopeFailures; + private final Map scopeBySeq; ScopeDiagnosticsReport( List continuations, @@ -32,8 +35,24 @@ public final class ScopeDiagnosticsReport { this.continuations.sort((a, b) -> Long.compare(a.firstNanos(), b.firstNanos())); this.scopes = new ArrayList<>(scopes); this.scopes.sort((a, b) -> Long.compare(a.firstNanos(), b.firstNanos())); + this.t0 = computeT0(this.continuations, this.scopes); this.continuationFailures = classifyContinuations(this.continuations, rootWrittenNanos); this.scopeFailures = classifyScopes(this.scopes); + this.scopeBySeq = new LinkedHashMap<>(); + for (ScopeRecord s : this.scopes) { + scopeBySeq.put(s.seq, s); + } + } + + private static long computeT0(List continuations, List scopes) { + long min = Long.MAX_VALUE; + for (ContinuationRecord r : continuations) { + min = Math.min(min, r.firstNanos()); + } + for (ScopeRecord s : scopes) { + min = Math.min(min, s.firstNanos()); + } + return min == Long.MAX_VALUE ? 0 : min; } private static Map> classifyContinuations( @@ -187,4 +206,165 @@ public String renderSummary() { } return sb.toString(); } + + // ---- rendering: complete timeline ---------------------------------------- + + private static final int TIMELINE_FRAMES = 3; + + /** + * Complete cross-thread timeline: one block per continuation (capture → resume(s) → terminal), + * with the scopes it spawned nested under it, followed by any non-continuation scopes. Unlike + * {@link #renderSummary()} this lists all records, not just the flagged ones, so a graph + * (Gantt/DAG) or report can be reconstructed from it whether or not anything leaked. Each event + * carries its relative time ({@code +Δms} from the first recorded event), thread, and callsite. + */ + public String renderTimeline() { + StringBuilder sb = new StringBuilder(); + appendHeader(sb, "Scope/continuation timeline"); + if (continuations.isEmpty() && scopes.isEmpty()) { + sb.append(" (nothing captured)\n"); + return sb.toString(); + } + + for (ContinuationRecord r : continuations) { + EnumSet failures = + continuationFailures.getOrDefault(r, EnumSet.noneOf(Failure.class)); + sb.append("\n#") + .append(r.seq) + .append(' ') + .append(r.status()) + .append(" trace=") + .append(r.traceId) + .append(" span=") + .append(r.spanId); + if (r.spanName != null) { + sb.append(" \"").append(r.spanName).append('"'); + } + sb.append(" src=").append(r.sourceName()); + if (r.orphan) { + sb.append(" [ORPHAN]"); + } + if (r.threadHandoff()) { + sb.append(" [handoff]"); + } + if (!failures.isEmpty()) { + sb.append(' ').append(failures); + } + sb.append(timing(r)).append('\n'); + + appendEvent(sb, "capture ", r.capture()); + for (ScopeEvent a : r.resumes()) { + appendEvent(sb, "resume ", a); + } + for (ScopeEvent f : r.failedActivations()) { + appendEvent(sb, "act-fail", f); + } + ScopeEvent terminal = r.terminal(); + if (terminal != null) { + appendEvent( + sb, + terminal.type == ScopeEvent.Type.RESOLVE_CANCEL ? "cancel " : "finish ", + terminal); + } + for (ScopeEvent extra : r.extraTerminals()) { + appendEvent(sb, "DOUBLE ", extra); + } + for (long scopeSeq : r.scopeRecordSeqs()) { + ScopeRecord scope = scopeBySeq.get(scopeSeq); + if (scope != null) { + appendScopeLine(sb, " ", scope); + } + } + if (terminal == null) { + sb.append(" LEAKED (never finished or cancelled)\n"); + } + } + + List orphanScopes = new ArrayList<>(); + for (ScopeRecord s : scopes) { + if (s.continuationSeq == null) { + orphanScopes.add(s); + } + } + if (!orphanScopes.isEmpty()) { + sb.append("\nNon-continuation scopes:\n"); + for (ScopeRecord s : orphanScopes) { + appendScopeLine(sb, " ", s); + } + } + return sb.toString(); + } + + private String timing(ContinuationRecord r) { + StringBuilder sb = new StringBuilder(); + Long capToResume = r.captureToFirstResumeNanos(); + Long age = r.ageAtTerminalNanos(); + if (capToResume != null) { + sb.append(" cap->resume=").append(millis(capToResume)).append("ms"); + } + if (age != null) { + sb.append(" age=").append(millis(age)).append("ms"); + } + return sb.toString(); + } + + private void appendScopeLine(StringBuilder sb, String indent, ScopeRecord scope) { + EnumSet failures = scopeFailures.getOrDefault(scope, EnumSet.noneOf(Failure.class)); + sb.append(indent).append("scope#").append(scope.seq).append(' ').append(scope.sourceName()); + if (scope.spanName != null) { + sb.append(" \"").append(scope.spanName).append('"'); + } + ScopeEvent open = scope.open(); + ScopeEvent close = scope.close(); + if (open != null) { + sb.append(" open +").append(relMillis(open.nanos)).append("ms @ ").append(open.threadName); + } + if (close != null) { + sb.append(" close +") + .append(relMillis(close.nanos)) + .append("ms @ ") + .append(close.threadName); + Long active = scope.activeDurationNanos(); + if (active != null) { + sb.append(" (active ").append(millis(active)).append("ms)"); + } + } + if (scope.threadHandoff()) { + sb.append(" [handoff]"); + } + if (!failures.isEmpty()) { + sb.append(' ').append(failures); + } + sb.append('\n'); + } + + private void appendEvent(StringBuilder sb, String label, ScopeEvent event) { + if (event == null) { + sb.append(" ").append(label).append(" (not observed)\n"); + return; + } + sb.append(" ") + .append(label) + .append(" +") + .append(relMillis(event.nanos)) + .append("ms @ ") + .append(event.threadName) + .append(" at ") + .append(event.callsite() == null ? "" : event.callsite()) + .append('\n'); + StackTraceElement[] stack = event.stack; + if (stack != null) { + for (int i = 1; i < stack.length && i < TIMELINE_FRAMES; i++) { + sb.append(" from ").append(stack[i]).append('\n'); + } + } + } + + private String relMillis(long nanos) { + return String.format("%.3f", (nanos - t0) / 1_000_000.0); + } + + private static String millis(long nanos) { + return String.format("%.3f", nanos / 1_000_000.0); + } } diff --git a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java index 094c0c3e471..964ba7f521a 100644 --- a/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java +++ b/dd-java-agent/instrumentation-testing/src/test/java/datadog/trace/agent/test/scopediag/ScopeDiagnosticsReportTest.java @@ -125,6 +125,28 @@ void failedActivationIsActivateAfterResolve() { assertTrue(report.hasProblems()); } + @Test + void timelineRendersResolvedContinuationEvenWithoutProblems() { + ContinuationRecord r = record(0, DDTraceId.from(30)); + r.addResume(event(ScopeEvent.Type.ACTIVATE, "pool-1", 2000)); + r.setTerminalOrExtra(event(ScopeEvent.Type.RESOLVE_FINISH, "pool-1", 3000)); + + ScopeDiagnosticsReport report = report(list(r), map()); + + // a clean run: the summary reports no problems ... + assertFalse(report.hasProblems()); + assertTrue(report.renderSummary().contains("(none)")); + + // ... but the timeline still dumps the full lineage so a graph/report can be built + String timeline = report.renderTimeline(); + assertTrue(timeline.contains("#0 FINISHED")); + assertTrue(timeline.contains("capture")); + assertTrue(timeline.contains("resume")); + assertTrue(timeline.contains("finish")); + assertTrue(timeline.contains("Worker.java:42")); // callsite preserved + assertTrue(timeline.contains("@ pool-1")); // resume/finish thread preserved + } + // ---- helpers ------------------------------------------------------------- private static ScopeDiagnosticsReport report( diff --git a/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md b/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md index 560bdb3f3fe..d52cc742b02 100644 --- a/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md +++ b/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md @@ -117,18 +117,25 @@ never-resolved, late-after-root, double/invalid, and the full cross-thread timel ### 3. Renderers -> **Addendum (2026-06):** The in-Java Gantt renderers (text `renderGantt()` and Mermaid -> `toMermaidGantt()`) were removed, and subsequently the **JSON output was dropped too** (its only -> consumer was an LLM, which reads structured text fine, and the hand-rolled serializer was a -> liability). Java now emits **only the leak-only text summary** (`renderSummary()`), logged at the -> end of each tracked test. The `investigate-continuation-leakage` skill reads that summary and -> renders a DAG of the flagged leaks. Note: the summary is problem-only and carries no per-event -> timeline, so a time-accurate Gantt requires re-enabling a structured feed. The -> `@TrackScopeContinuations` attributes were reduced to just `failOnLeak` (default `false`); +> **Addendum (2026-06):** The Mermaid renderer (`toMermaidGantt()`) and the **JSON output** were +> removed — visualization moved to the `investigate-continuation-leakage` skill (an LLM reads text +> fine, and the hand-rolled JSON serializer was a liability). Java now emits two text views, both +> logged at the end of each tracked test: +> +> - **`renderTimeline()`** — the complete dump: *every* continuation and scope with its full event +> lineage (capture → resume(s) → terminal, threads, `+Δms` relative timing, callsites, nested +> spawned scopes), emitted **regardless of whether anything leaked**. This is the feed the skill +> reads to render a Gantt or DAG, so a graph/report can always be produced. (This replaces the old +> `renderGantt()`, minus the chart framing.) +> - **`renderSummary()`** — the problem-only view (flagged records + callsites), still used for +> `assertNoLeaks` failure messages. +> +> The `@TrackScopeContinuations` attributes were reduced to just `failOnLeak` (default `false`); > `gantt()`/`mermaid()`/`json()` were all removed. -- **Leak-only summary**: only the flagged problems, each with its callsite(s) — a quick scan - / failure message, and the sole output consumed by the skill. +- **Full timeline**: all continuations/scopes with per-event threads, relative timing, and + callsites — the feed the skill consumes to build a graph or report, leak or no leak. +- **Leak-only summary**: just the flagged problems with callsites — the `assertNoLeaks` message. ### 4. Harness integration (passive by default — no impact on existing tests) From 9061b1b68bb0c9d8fc662523e3b387d522b0276b Mon Sep 17 00:00:00 2001 From: Andrea Marziali Date: Tue, 30 Jun 2026 09:12:41 +0200 Subject: [PATCH 9/9] removeme --- .claude/skills/investigate-continuation-leakage/SKILL.md | 8 +++++++- .../trace/agent/test/InstrumentationSpecification.groovy | 3 +++ .../trace/agent/test/AbstractInstrumentationTest.java | 4 ++++ ...026-06-10-scope-continuation-leak-diagnostic-design.md | 8 ++++++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/.claude/skills/investigate-continuation-leakage/SKILL.md b/.claude/skills/investigate-continuation-leakage/SKILL.md index 5123b48d441..b3985e3239a 100644 --- a/.claude/skills/investigate-continuation-leakage/SKILL.md +++ b/.claude/skills/investigate-continuation-leakage/SKILL.md @@ -44,7 +44,13 @@ the candidate test classes (Glob `**/src/test/**/*Test.{java,groovy}` in the mod user which test to run with `AskUserQuestion`. You want one concrete test class (and ideally one method) plus its Gradle module path, e.g. `:dd-java-agent:instrumentation:google-pubsub-1.116`. -## Step 2 — Enable tracking (temporary) +> **Note:** tracking is now **always-on** for every instrumentation test (`@TrackScopeContinuations` +> sits on the `AbstractInstrumentationTest` / `InstrumentationSpecification` base classes, report-only). +> If that base-class annotation is present, **skip Step 2** — just run the test (Step 3) and read the +> timeline. Only do Step 2 when tracking is *not* already inherited (e.g. the base annotation was +> removed) or you want method-level `failOnLeak=true` enforcement. + +## Step 2 — Enable tracking (only if not already inherited) With `Edit`, add the opt-in annotation to the chosen test class (or a single method): diff --git a/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy b/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy index 546aae12f7b..d9f12b8a215 100644 --- a/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy +++ b/dd-java-agent/instrumentation-testing/src/main/groovy/datadog/trace/agent/test/InstrumentationSpecification.groovy @@ -114,6 +114,9 @@ import spock.lang.Shared @SuppressWarnings('UnnecessaryDotClass') @ExtendWith(TestClassShadowingExtension.class) @ExtendWith(TooManyInvocationsErrorHandler.class) +// Track scope continuations for every instrumentation spec (report-only: failOnLeak defaults to +// false). @Inherited, so all specs get the full timeline dumped after each test. +@TrackScopeContinuations abstract class InstrumentationSpecification extends DDSpecification implements AgentBuilder.Listener { private static final long TIMEOUT_MILLIS = TimeUnit.SECONDS.toMillis(20) diff --git a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/AbstractInstrumentationTest.java b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/AbstractInstrumentationTest.java index 8a9e5f82b5b..81e6e906f54 100644 --- a/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/AbstractInstrumentationTest.java +++ b/dd-java-agent/instrumentation-testing/src/main/java/datadog/trace/agent/test/AbstractInstrumentationTest.java @@ -8,6 +8,7 @@ import datadog.trace.agent.test.assertions.TraceAssertions; import datadog.trace.agent.test.assertions.TraceMatcher; import datadog.trace.agent.test.scopediag.ScopeDiagnosticsExtension; +import datadog.trace.agent.test.scopediag.TrackScopeContinuations; import datadog.trace.agent.tooling.AgentInstaller; import datadog.trace.agent.tooling.InstrumenterModule; import datadog.trace.agent.tooling.TracerInstaller; @@ -60,6 +61,9 @@ AllowContextTestingExtension.class, ScopeDiagnosticsExtension.class }) +// Track scope continuations for every instrumentation test (report-only: failOnLeak defaults to +// false). @Inherited, so all subclasses get the full timeline dumped after each test. +@TrackScopeContinuations public abstract class AbstractInstrumentationTest { static final Instrumentation INSTRUMENTATION = ByteBuddyAgent.getInstrumentation(); diff --git a/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md b/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md index d52cc742b02..8de35897bfe 100644 --- a/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md +++ b/docs/superpowers/specs/2026-06-10-scope-continuation-leak-diagnostic-design.md @@ -132,6 +132,14 @@ never-resolved, late-after-root, double/invalid, and the full cross-thread timel > > The `@TrackScopeContinuations` attributes were reduced to just `failOnLeak` (default `false`); > `gantt()`/`mermaid()`/`json()` were all removed. +> +> **Always-on (2026-06):** `@TrackScopeContinuations` is now placed on the two base classes +> (`AbstractInstrumentationTest` and `InstrumentationSpecification`) and is `@Inherited`, so **every** +> instrumentation test tracks continuations and dumps the full timeline after each test — in +> report-only mode (`failOnLeak=false`; leaks are logged, never fail the test). This trades per-test +> cost (a ByteBuddy retransform of the tracer core classes per test JVM + a filtered stack capture +> per scope/continuation event) for fleet-wide visibility. Individual tests/methods can still carry +> their own `@TrackScopeContinuations(failOnLeak = true)` to opt into enforcement. - **Full timeline**: all continuations/scopes with per-event threads, relative timing, and callsites — the feed the skill consumes to build a graph or report, leak or no leak.