Skip to content

Commit 20be562

Browse files
committed
Add the randomized allocation sampling feature
This feature allows profilers to do allocation profiling based off randomized samples. It has better theoretical and empirically observed accuracy than our current allocation profiling approaches while also maintaining low performance overhead. It is designed for use in production profiling scenarios. For more information about usage and implementation, see the included doc docs/design/features/RandomizedAllocationSampling.md
1 parent 6a0c632 commit 20be562

19 files changed

+822
-31
lines changed

docs/design/features/RandomizedAllocationSampling.md

Lines changed: 317 additions & 0 deletions
Large diffs are not rendered by default.

src/coreclr/inc/eventtracebase.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1333,17 +1333,19 @@ namespace ETW
13331333
#define ETWLoaderStaticLoad 0 // Static reference load
13341334
#define ETWLoaderDynamicLoad 1 // Dynamic assembly load
13351335

1336+
#if defined (FEATURE_EVENT_TRACE)
1337+
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_PROVIDER_DOTNET_Context;
1338+
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_DOTNET_Context;
1339+
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_RUNDOWN_PROVIDER_DOTNET_Context;
1340+
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_STRESS_PROVIDER_DOTNET_Context;
1341+
#endif // FEATURE_EVENT_TRACE
1342+
13361343
#if defined(FEATURE_EVENT_TRACE) && !defined(HOST_UNIX)
13371344
//
13381345
// The ONE and only ONE global instantiation of this class
13391346
//
13401347
extern ETW::CEtwTracer * g_pEtwTracer;
13411348

1342-
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_PROVIDER_DOTNET_Context;
1343-
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_DOTNET_Context;
1344-
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_RUNDOWN_PROVIDER_DOTNET_Context;
1345-
EXTERN_C DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_STRESS_PROVIDER_DOTNET_Context;
1346-
13471349
//
13481350
// Special Handling of Startup events
13491351
//

src/coreclr/minipal/Windows/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ set(SOURCES
33
dn-u16.cpp
44
${CLR_SRC_NATIVE_DIR}/minipal/utf8.c
55
${CLR_SRC_NATIVE_DIR}/minipal/time.c
6+
${CLR_SRC_NATIVE_DIR}/minipal/xoshiro128pp.c
67
)
78

89
if(NOT CLR_CROSS_COMPONENTS_BUILD)

src/coreclr/nativeaot/Runtime/GCHelpers.cpp

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,12 @@
2929

3030
#include "gcdesc.h"
3131

32+
#ifdef FEATURE_EVENT_TRACE
33+
#include "clretwallmain.h"
34+
#else // FEATURE_EVENT_TRACE
35+
#include "etmdummy.h"
36+
#endif // FEATURE_EVENT_TRACE
37+
3238
#define RH_LARGE_OBJECT_SIZE 85000
3339

3440
MethodTable g_FreeObjectEEType;
@@ -471,6 +477,29 @@ EXTERN_C int64_t QCALLTYPE RhGetTotalAllocatedBytesPrecise()
471477
return allocated;
472478
}
473479

480+
// Emits the AllocationSampled event for an object that was picked by randomized allocation sampling.
//   flags                - GC allocation flags; used to classify the allocation kind reported in the event
//   size                 - forwarded to the event unchanged
//   samplingBudgetOffset - forwarded to the event unchanged
//   orObject             - the allocated object; its address is reported in the event
// This is a no-op when FEATURE_EVENT_TRACE is not defined or when GetLastAllocEEType() returns nullptr.
void FireAllocationSampled(GC_ALLOC_FLAGS flags, size_t size, size_t samplingBudgetOffset, Object* orObject)
{
#ifdef FEATURE_EVENT_TRACE
    void* pLastAllocType = GetLastAllocEEType();
    if (pLastAllocType == nullptr)
    {
        // Nothing to report without a type id.
        return;
    }

    // Note: Just as for AllocationTick, the type name cannot be retrieved
    WCHAR* typeName = nullptr;

    // Allocation kind reported in the event: 2 = pinned object heap, 1 = large object heap, 0 = SOH.
    unsigned int kind = 0;
    if (flags & GC_ALLOC_PINNED_OBJECT_HEAP)
    {
        kind = 2;
    }
    else if (flags & GC_ALLOC_LARGE_OBJECT_HEAP)
    {
        kind = 1;
    }

    // The heap index stays 0 unless BACKGROUND_GC is defined.
    unsigned int heapNumber = 0;
#ifdef BACKGROUND_GC
    gc_heap* pHeap = gc_heap::heap_of((BYTE*)orObject);
    heapNumber = pHeap->heap_number;
#endif

    FireEtwAllocationSampled(kind, GetClrInstanceId(), pLastAllocType, typeName, heapNumber, (BYTE*)orObject, size, samplingBudgetOffset);
#endif
}
502+
474503
static Object* GcAllocInternal(MethodTable* pEEType, uint32_t uFlags, uintptr_t numElements, Thread* pThread)
475504
{
476505
ASSERT(!pThread->IsDoNotTriggerGcSet());
@@ -539,8 +568,47 @@ static Object* GcAllocInternal(MethodTable* pEEType, uint32_t uFlags, uintptr_t
539568
// Save the MethodTable for instrumentation purposes.
540569
tls_pLastAllocationEEType = pEEType;
541570

542-
Object* pObject = GCHeapUtilities::GetGCHeap()->Alloc(pThread->GetAllocContext(), cbSize, uFlags);
543-
pThread->GetEEAllocContext()->UpdateCombinedLimit();
571+
// check for dynamic allocation sampling
572+
ee_alloc_context* pEEAllocContext = pThread->GetEEAllocContext();
573+
gc_alloc_context* pAllocContext = pEEAllocContext->GetGCAllocContext();
574+
bool isSampled = false;
575+
size_t availableSpace = 0;
576+
size_t samplingBudget = 0;
577+
578+
bool isRandomizedSamplingEnabled = ee_alloc_context::IsRandomizedSamplingEnabled();
579+
if (isRandomizedSamplingEnabled)
580+
{
581+
// The number of bytes we can allocate before we need to emit a sampling event.
582+
// This calculation is only valid if combined_limit < alloc_limit.
583+
samplingBudget = (size_t)(pEEAllocContext->combined_limit - pAllocContext->alloc_ptr);
584+
585+
// The number of bytes available in the current allocation context
586+
availableSpace = (size_t)(pAllocContext->alloc_limit - pAllocContext->alloc_ptr);
587+
588+
// Check to see if the allocated object overlaps a sampled byte
589+
// in this AC. This happens when both:
590+
// 1) The AC contains a sampled byte (combined_limit < alloc_limit)
591+
// 2) The object is large enough to overlap it (samplingBudget < aligned_size)
592+
//
593+
// Note that the AC could have no remaining space for allocations (alloc_ptr =
594+
// alloc_limit = combined_limit). When a thread hasn't done any SOH allocations
595+
// yet it also starts in an empty state where alloc_ptr = alloc_limit =
596+
// combined_limit = nullptr. The (1) check handles both of these situations
597+
// properly as an empty AC can not have a sampled byte inside of it.
598+
isSampled =
599+
(pEEAllocContext->combined_limit < pAllocContext->alloc_limit) &&
600+
(samplingBudget < cbSize);
601+
602+
// if the object overflows the AC, we need to sample the remaining bytes
603+
// because the sampling budget computed above covers at most the bytes inside the AC
604+
if (cbSize > availableSpace && !isSampled)
605+
{
606+
samplingBudget = ee_alloc_context::ComputeGeometricRandom() + availableSpace;
607+
isSampled = (samplingBudget < cbSize);
608+
}
609+
}
610+
611+
Object* pObject = GCHeapUtilities::GetGCHeap()->Alloc(pAllocContext, cbSize, uFlags);
544612
if (pObject == NULL)
545613
return NULL;
546614

@@ -551,6 +619,19 @@ static Object* GcAllocInternal(MethodTable* pEEType, uint32_t uFlags, uintptr_t
551619
((Array*)pObject)->InitArrayLength((uint32_t)numElements);
552620
}
553621

622+
if (isSampled)
623+
{
624+
FireAllocationSampled((GC_ALLOC_FLAGS)uFlags, cbSize, samplingBudget, pObject);
625+
}
626+
627+
// There are a variety of conditions that may have invalidated the previous combined_limit value
628+
// such as not allocating the object in the AC memory region (UOH allocations), moving the AC, adding
629+
// extra alignment padding, allocating a new AC, or allocating an object that consumed the sampling budget.
630+
// Rather than test for all the different invalidation conditions individually we conservatively always
631+
// recompute it. If sampling isn't enabled this inlined function is just trivially setting
632+
// combined_limit=alloc_limit.
633+
pEEAllocContext->UpdateCombinedLimit(isRandomizedSamplingEnabled);
634+
554635
if (uFlags & GC_ALLOC_USER_OLD_HEAP)
555636
GCHeapUtilities::GetGCHeap()->PublishObject((uint8_t*)pObject);
556637

src/coreclr/nativeaot/Runtime/disabledeventtrace.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212

1313
void EventTracing_Initialize() { }
1414

15+
// Stub used when event tracing is disabled: the runtime provider is reported as
// not enabled regardless of the requested level and keyword.
bool IsRuntimeProviderEnabled(uint8_t level, uint64_t keyword)
{
    // Both arguments are intentionally ignored.
    (void)level;
    (void)keyword;
    return false;
}
19+
1520
void ETW::GCLog::FireGcStart(ETW_GC_INFO * pGcInfo) { }
1621

1722
#ifdef FEATURE_ETW

src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Native runtime events supported by aot runtime.
22

3+
AllocationSampled
34
BGC1stConEnd
45
BGC1stNonConEnd
56
BGC1stSweepEnd

src/coreclr/nativeaot/Runtime/eventtrace.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ DOTNET_TRACE_CONTEXT MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_DOTNET_Con
3737
MICROSOFT_WINDOWS_DOTNETRUNTIME_PRIVATE_PROVIDER_EVENTPIPE_Context
3838
};
3939

40+
bool IsRuntimeProviderEnabled(uint8_t level, uint64_t keyword)
41+
{
42+
return RUNTIME_PROVIDER_CATEGORY_ENABLED(level, keyword);
43+
}
44+
4045
volatile LONGLONG ETW::GCLog::s_l64LastClientSequenceNumber = 0;
4146

4247
//---------------------------------------------------------------------------------------
@@ -245,4 +250,4 @@ void EventPipeEtwCallbackDotNETRuntimePrivate(
245250
_Inout_opt_ PVOID CallbackContext)
246251
{
247252
EtwCallbackCommon(DotNETRuntimePrivate, ControlCode, Level, MatchAnyKeyword, FilterData, true);
248-
}
253+
}

src/coreclr/nativeaot/Runtime/eventtracebase.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ void InitializeEventTracing();
3030

3131
#ifdef FEATURE_EVENT_TRACE
3232

33+
bool IsRuntimeProviderEnabled(uint8_t level, uint64_t keyword);
34+
3335
// !!!!!!! NOTE !!!!!!!!
3436
// The flags must match those in the ETW manifest exactly
3537
// !!!!!!! NOTE !!!!!!!!
@@ -102,6 +104,7 @@ struct ProfilingScanContext;
102104
#define CLR_GCHEAPSURVIVALANDMOVEMENT_KEYWORD 0x400000
103105
#define CLR_MANAGEDHEAPCOLLECT_KEYWORD 0x800000
104106
#define CLR_GCHEAPANDTYPENAMES_KEYWORD 0x1000000
107+
#define CLR_ALLOCATIONSAMPLING_KEYWORD 0x80000000000
105108

106109
//
107110
// Using KEYWORDZERO means when checking the events category ignore the keyword

src/coreclr/nativeaot/Runtime/gctoclreventsink.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "common.h"
55
#include "gctoclreventsink.h"
66
#include "thread.h"
7+
#include "eventtracebase.h"
78

89
GCToCLREventSink g_gcToClrEventSink;
910

@@ -174,6 +175,14 @@ void GCToCLREventSink::FireGCAllocationTick_V4(uint64_t allocationAmount,
174175
{
175176
LIMITED_METHOD_CONTRACT;
176177

178+
#ifdef FEATURE_EVENT_TRACE
179+
if (IsRuntimeProviderEnabled(TRACE_LEVEL_INFORMATION, CLR_ALLOCATIONSAMPLING_KEYWORD))
180+
{
181+
// skip AllocationTick if AllocationSampled is emitted
182+
return;
183+
}
184+
#endif // FEATURE_EVENT_TRACE
185+
177186
void * typeId = GetLastAllocEEType();
178187
WCHAR * name = nullptr;
179188

src/coreclr/nativeaot/Runtime/thread.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,13 @@ static Thread* g_RuntimeInitializingThread;
3535

3636
#endif //!DACCESS_COMPILE
3737

38+
// Seeds the per-thread xoshiro128pp state.
//
// The tick count alone is a weak seed: every generator constructed within the same tick would
// receive the same seed and therefore produce the same sequence of sampling thresholds. The
// address of the instance being constructed is mixed in so that generators created at the same
// time on different threads start from different seeds.
ee_alloc_context::PerThreadRandom::PerThreadRandom()
{
    uint32_t seed = (uint32_t)PalGetTickCount64();

    // Multiplicative hash of the instance address; the low bits are dropped first because
    // they carry little entropy (presumably the object is at least 16-byte aligned).
    uintptr_t self = (uintptr_t)this;
    seed ^= (uint32_t)(self >> 4) * 0x9E3779B9u;

    minipal_xoshiro128pp_init(&random_state, seed);
}

thread_local ee_alloc_context::PerThreadRandom ee_alloc_context::t_random = PerThreadRandom();
44+
3845
PInvokeTransitionFrame* Thread::GetTransitionFrame()
3946
{
4047
if (ThreadStore::GetSuspendingThread() == this)

src/coreclr/nativeaot/Runtime/thread.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
#include "StackFrameIterator.h"
88
#include "slist.h" // DefaultSListTraits
9+
#include <minipal/xoshiro128pp.h>
910

1011
struct gc_alloc_context;
1112
class RuntimeInstance;
@@ -113,7 +114,19 @@ struct ee_alloc_context
113114

114115
gc_alloc_context* GetGCAllocContext();
115116
uint8_t* GetCombinedLimit();
116-
void UpdateCombinedLimit();
117+
void UpdateCombinedLimit(bool samplingEnabled);
118+
static bool IsRandomizedSamplingEnabled();
119+
static int ComputeGeometricRandom();
120+
121+
struct PerThreadRandom
122+
{
123+
minipal_xoshiro128pp random_state;
124+
125+
PerThreadRandom();
126+
float NextFloat();
127+
};
128+
129+
static thread_local PerThreadRandom t_random;
117130
};
118131

119132

src/coreclr/nativeaot/Runtime/thread.inl

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33

44
#ifndef DACCESS_COMPILE
55

6+
#include "eventtracebase.h"
67

8+
const uint32_t SamplingDistributionMean = (100 * 1024);
79

810
inline gc_alloc_context* ee_alloc_context::GetGCAllocContext()
911
{
@@ -22,11 +24,43 @@ struct _thread_inl_gc_alloc_context
2224
uint8_t* alloc_limit;
2325
};
2426

25-
inline void ee_alloc_context::UpdateCombinedLimit()
27+
28+
// Returns true when the runtime provider is enabled at informational level with the
// allocation sampling keyword. Always returns false when FEATURE_EVENT_TRACE is not defined.
inline bool ee_alloc_context::IsRandomizedSamplingEnabled()
{
    bool enabled = false;
#ifdef FEATURE_EVENT_TRACE
    enabled = IsRuntimeProviderEnabled(TRACE_LEVEL_INFORMATION, CLR_ALLOCATIONSAMPLING_KEYWORD);
#endif // FEATURE_EVENT_TRACE
    return enabled;
}
36+
37+
inline void ee_alloc_context::UpdateCombinedLimit(bool samplingEnabled)
38+
{
39+
if (!samplingEnabled)
40+
{
41+
combined_limit = ((_thread_inl_gc_alloc_context*)GetGCAllocContext())->alloc_limit;
42+
}
43+
else
44+
{
45+
// compute the next sampling limit based on a geometric distribution
46+
uint8_t* sampling_limit = ((_thread_inl_gc_alloc_context*)GetGCAllocContext())->alloc_ptr + ComputeGeometricRandom();
47+
48+
// if the sampling limit is larger than the allocation context, no sampling will occur in this AC
49+
combined_limit = min(sampling_limit, ((_thread_inl_gc_alloc_context*)GetGCAllocContext())->alloc_limit);
50+
}
51+
}
52+
53+
inline int ee_alloc_context::ComputeGeometricRandom()
54+
{
55+
// compute a random sample from the Geometric distribution
56+
float probability = t_random.NextFloat();
57+
int threshold = (int)(-log(1 - probability) * SamplingDistributionMean);
58+
return threshold;
59+
}
60+
61+
inline float ee_alloc_context::PerThreadRandom::NextFloat()
2662
{
27-
// The randomized allocation sampling feature is being submitted in stages. For now sampling is never enabled so
28-
// combined_limit is always the same as alloc_limit.
29-
combined_limit = ((_thread_inl_gc_alloc_context*)GetGCAllocContext())->alloc_limit;
63+
return (float)minipal_xoshiro128pp_next(&random_state) * (1.0F/UINT32_MAX);
3064
}
3165

3266
// Set the m_pDeferredTransitionFrame field for GC allocation helpers that setup transition frame

0 commit comments

Comments
 (0)