Skip to content

Commit 9e6be01

Browse files
vilterpNHDaly
andcommitted
Allow threadsafe access to buffer ot type inference profiling trees
Co-authored-by: Nathan Daly <[email protected]>
1 parent 0eafda8 commit 9e6be01

File tree

6 files changed

+131
-66
lines changed

6 files changed

+131
-66
lines changed

base/compiler/typeinfer.jl

Lines changed: 66 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ being used for this purpose alone.
2222
"""
2323
module Timings
2424

25-
using Core.Compiler: -, +, :, Vector, length, first, empty!, push!, pop!, @inline,
25+
using Core.Compiler: -, +, :, >, Vector, length, first, empty!, push!, pop!, @inline,
2626
@inbounds, copy, backtrace
2727

2828
# What we record for any given frame we infer during type inference.
@@ -47,13 +47,16 @@ end
4747

4848
_typeinf_identifier(frame::InferenceFrameInfo) = frame
4949

50+
_typeinf_frame_linfo(frame::Core.Compiler.InferenceState) = frame.linfo
51+
_typeinf_frame_linfo(frame::InferenceFrameInfo) = frame.mi
52+
5053
"""
51-
Core.Compiler.Timing(mi_info, start_time, ...)
54+
Core.Compiler.Timings.Timing(mi_info, start_time, ...)
5255
5356
Internal type containing the timing result for running type inference on a single
5457
MethodInstance.
5558
"""
56-
struct Timing
59+
mutable struct Timing
5760
mi_info::InferenceFrameInfo
5861
start_time::UInt64
5962
cur_start_time::UInt64
@@ -66,6 +69,31 @@ Timing(mi_info, start_time) = Timing(mi_info, start_time, start_time, UInt64(0),
6669

6770
_time_ns() = ccall(:jl_hrtime, UInt64, ()) # Re-implemented here because Base not yet available.
6871

72+
"""
73+
Core.Compiler.Timings.clear_and_fetch_timings()
74+
75+
Return, then clear, the previously recorded type inference timings.
76+
77+
This fetches a vector of all of the type inference timings that have _finished_ as of this call. Note
78+
that there may be concurrent invocations of inference that are still running in another thread, but
79+
which haven't yet been added to this buffer. Those can be fetched in a future call.
80+
"""
81+
function clear_and_fetch_timings()
82+
# Pass in the type, since the C code doesn't know about our Timing struct.
83+
ccall(:jl_typeinf_profiling_clear_and_fetch, Any, (Any, Any,),
84+
_finished_timings, Vector{Timing})::Vector{Timing}
85+
end
86+
87+
function finish_timing_profile(timing::Timing)
88+
ccall(:jl_typeinf_profiling_push_timing, Cvoid, (Any, Any,), _finished_timings, timing)
89+
end
90+
91+
# DO NOT ACCESS DIRECTLY. This vector should only be accessed through the
92+
# functions above. It is a buffer that lives in the Julia module only to be *rooted*
93+
# for GC, but all accesses to the vector must go through C code, in order to be
94+
# thread safe.
95+
const _finished_timings = Timing[]
96+
6997
# We keep a stack of the Timings for each of the MethodInstances currently being timed.
7098
# Since type inference currently operates via a depth-first search (during abstract
7199
# evaluation), this vector operates like a call stack. The last node in _timings is the
@@ -74,16 +102,21 @@ _time_ns() = ccall(:jl_hrtime, UInt64, ()) # Re-implemented here because Base n
74102
# call structure through type inference is recorded. (It's recorded as a tree, not a graph,
75103
# because we create a new node for duplicates.)
76104
const _timings = Timing[]
105+
77106
# ROOT() is an empty function used as the top-level Timing node to measure all time spent
78107
# *not* in type inference during a given recording trace. It is used as a "dummy" node.
79108
function ROOT() end
80109
const ROOTmi = Core.Compiler.specialize_method(
81110
first(Core.Compiler.methods(ROOT)), Tuple{typeof(ROOT)}, Core.svec())
111+
82112
"""
83113
Core.Compiler.reset_timings()
84114
85115
Empty out the previously recorded type inference timings (`Core.Compiler._timings`), and
86116
start the ROOT() timer again. `ROOT()` measures all time spent _outside_ inference.
117+
118+
!!! info
119+
This function is deprecated as of Julia 1.9; use [`clear_and_fetch_timings`](@ref) instead.
87120
"""
88121
function reset_timings()
89122
empty!(_timings)
@@ -93,7 +126,6 @@ function reset_timings()
93126
_time_ns()))
94127
return nothing
95128
end
96-
reset_timings()
97129

98130
# (This is split into a function so that it can be called both in this module, at the top
99131
# of `enter_new_timer()`, and once at the Very End of the operation, by whoever started
@@ -105,44 +137,31 @@ reset_timings()
105137
parent_timer = _timings[end]
106138
accum_time = stop_time - parent_timer.cur_start_time
107139

108-
# Add in accum_time ("modify" the immutable struct)
140+
# Add in accum_time
109141
@inbounds begin
110-
_timings[end] = Timing(
111-
parent_timer.mi_info,
112-
parent_timer.start_time,
113-
parent_timer.cur_start_time,
114-
parent_timer.time + accum_time,
115-
parent_timer.children,
116-
parent_timer.bt,
117-
)
142+
_timings[end].time += accum_time
118143
end
119144
return nothing
120145
end
121146

122147
@inline function enter_new_timer(frame)
123148
# Very first thing, stop the active timer: get the current time and add in the
124149
# time since it was last started to its aggregate exclusive time.
125-
close_current_timer()
126-
127-
mi_info = _typeinf_identifier(frame)
150+
if length(_timings) > 0
151+
close_current_timer()
152+
end
128153

129154
# Start the new timer right before returning
155+
mi_info = _typeinf_identifier(frame)
130156
push!(_timings, Timing(mi_info, UInt64(0)))
131157
len = length(_timings)
132158
new_timer = @inbounds _timings[len]
159+
133160
# Set the current time _after_ appending the node, to try to exclude the
134161
# overhead from measurement.
135162
start = _time_ns()
136-
137-
@inbounds begin
138-
_timings[len] = Timing(
139-
new_timer.mi_info,
140-
start,
141-
start,
142-
new_timer.time,
143-
new_timer.children,
144-
)
145-
end
163+
new_timer.start_time = start
164+
new_timer.cur_start_time = start
146165

147166
return nothing
148167
end
@@ -154,43 +173,38 @@ end
154173
# Finish the new timer
155174
stop_time = _time_ns()
156175

157-
expected_mi_info = _typeinf_identifier(_expected_frame_)
176+
expected_linfo = _typeinf_frame_linfo(_expected_frame_)
158177

159178
# Grab the new timer again because it might have been modified in _timings
160179
# (since it's an immutable struct)
161180
# And remove it from the current timings stack
162181
new_timer = pop!(_timings)
163-
Core.Compiler.@assert new_timer.mi_info.mi === expected_mi_info.mi
182+
Core.Compiler.@assert new_timer.mi_info.mi === expected_linfo
164183

165-
# Prepare to unwind one level of the stack and record in the parent
166-
parent_timer = _timings[end]
184+
# check for two cases: normal case & backcompat case
185+
is_profile_root_normal = length(_timings) === 0
186+
is_profile_root_backcompat = length(_timings) === 1 && _timings[1] === ROOTmi
187+
is_profile_root = is_profile_root_normal || is_profile_root_backcompat
167188

168189
accum_time = stop_time - new_timer.cur_start_time
169190
# Add in accum_time ("modify" the immutable struct)
170-
new_timer = Timing(
171-
new_timer.mi_info,
172-
new_timer.start_time,
173-
new_timer.cur_start_time,
174-
new_timer.time + accum_time,
175-
new_timer.children,
176-
parent_timer.mi_info.mi === ROOTmi ? backtrace() : nothing,
177-
)
178-
# Record the final timing with the original parent timer
179-
push!(parent_timer.children, new_timer)
180-
181-
# And finally restart the parent timer:
182-
len = length(_timings)
183-
@inbounds begin
184-
_timings[len] = Timing(
185-
parent_timer.mi_info,
186-
parent_timer.start_time,
187-
_time_ns(),
188-
parent_timer.time,
189-
parent_timer.children,
190-
parent_timer.bt,
191-
)
191+
new_timer.time += accum_time
192+
if is_profile_root
193+
new_timer.bt = backtrace()
192194
end
193195

196+
# Prepare to unwind one level of the stack and record in the parent
197+
if is_profile_root
198+
finish_timing_profile(new_timer)
199+
else
200+
parent_timer = _timings[end]
201+
202+
# Record the final timing with the original parent timer
203+
push!(parent_timer.children, new_timer)
204+
205+
# And finally restart the parent timer:
206+
parent_timer.cur_start_time = _time_ns()
207+
end
194208
return nothing
195209
end
196210

doc/src/devdocs/locks.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ The following is a leaf lock (level 2), and only acquires level 1 locks (safepoi
4444
> * Module->lock
4545
> * JLDebuginfoPlugin::PluginMutex
4646
> * newly_inferred_mutex
47+
> * typeinf_profiling_lock
4748
4849
The following is a level 3 lock, which can only acquire level 1 or level 2 locks internally:
4950

src/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ endif
4343
SRCS := \
4444
jltypes gf typemap smallintset ast builtins module interpreter symbol \
4545
dlload sys init task array staticdata toplevel jl_uv datatype \
46-
simplevector runtime_intrinsics precompile jloptions \
46+
simplevector runtime_intrinsics precompile jloptions inference-profiling \
4747
threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler method \
4848
jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \
4949
crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall
@@ -299,6 +299,7 @@ $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h
299299
$(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
300300
$(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h
301301
$(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-alloc-profiler.h
302+
$(BUILDDIR)/inference-profiling.o $(BUILDDIR)/inference-profiling.dbg.obj: $(SRCDIR)/gc.h
302303
$(BUILDDIR)/init.o $(BUILDDIR)/init.dbg.obj: $(SRCDIR)/builtin_proto.h
303304
$(BUILDDIR)/interpreter.o $(BUILDDIR)/interpreter.dbg.obj: $(SRCDIR)/builtin_proto.h
304305
$(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/debug-registry.h

src/inference-profiling.c

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Implementation for type inference profiling
2+
3+
#include "julia.h"
4+
#include "julia_internal.h"
5+
6+
jl_mutex_t typeinf_profiling_lock;
7+
8+
// == exported interface ==
9+
10+
JL_DLLEXPORT jl_array_t* jl_typeinf_profiling_clear_and_fetch(
11+
jl_array_t *inference_profiling_results_array,
12+
jl_value_t *array_timing_type
13+
)
14+
{
15+
JL_LOCK(&typeinf_profiling_lock);
16+
17+
size_t len = jl_array_len(inference_profiling_results_array);
18+
19+
jl_array_t *out = jl_alloc_array_1d(array_timing_type, len);
20+
JL_GC_PUSH1(&out);
21+
22+
memcpy(out->data, inference_profiling_results_array->data, len * sizeof(void*));
23+
24+
jl_array_del_end(inference_profiling_results_array, len);
25+
26+
JL_UNLOCK(&typeinf_profiling_lock);
27+
28+
JL_GC_POP();
29+
return out;
30+
}
31+
32+
JL_DLLEXPORT void jl_typeinf_profiling_push_timing(
33+
jl_array_t *inference_profiling_results_array,
34+
jl_value_t *timing
35+
)
36+
{
37+
JL_LOCK(&typeinf_profiling_lock);
38+
39+
jl_array_grow_end(inference_profiling_results_array, 1);
40+
size_t len = jl_array_len(inference_profiling_results_array);
41+
jl_array_ptr_set(inference_profiling_results_array, len - 1, timing);
42+
43+
JL_UNLOCK(&typeinf_profiling_lock);
44+
}

src/jl_exported_funcs.inc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,8 @@
484484
XX(jl_typeinf_lock_end) \
485485
XX(jl_typeinf_timing_begin) \
486486
XX(jl_typeinf_timing_end) \
487+
XX(jl_typeinf_profiling_clear_and_fetch) \
488+
XX(jl_typeinf_profiling_push_timing) \
487489
XX(jl_typename_str) \
488490
XX(jl_typeof_str) \
489491
XX(jl_types_equal) \

test/compiler/inference.jl

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3804,15 +3804,19 @@ f37532(T, x) = (Core.bitcast(Ptr{T}, x); x)
38043804
# Helper functions for Core.Compiler.Timings. These are normally accessed via a package -
38053805
# usually (SnoopCompileCore).
38063806
function time_inference(f)
3807-
Core.Compiler.Timings.reset_timings()
38083807
Core.Compiler.__set_measure_typeinf(true)
38093808
f()
38103809
Core.Compiler.__set_measure_typeinf(false)
3811-
Core.Compiler.Timings.close_current_timer()
3812-
return Core.Compiler.Timings._timings[1]
3810+
return Core.Compiler.Timings.clear_and_fetch_timings()
38133811
end
3814-
function depth(t::Core.Compiler.Timings.Timing)
3815-
maximum(depth.(t.children), init=0) + 1
3812+
function max_depth(t::Vector{Core.Compiler.Timings.Timing})
3813+
maximum(max_depth.(t), init=0) + 1
3814+
end
3815+
function max_depth(t::Core.Compiler.Timings.Timing)
3816+
maximum(max_depth.(t.children), init=0) + 1
3817+
end
3818+
function flatten_times(t::Vector{Core.Compiler.Timings.Timing})
3819+
collect(Iterators.flatten(flatten_times(el) for el in t))
38163820
end
38173821
function flatten_times(t::Core.Compiler.Timings.Timing)
38183822
collect(Iterators.flatten([(t.time => t.mi_info,), flatten_times.(t.children)...]))
@@ -3829,14 +3833,14 @@ end
38293833
timing1 = time_inference() do
38303834
@eval M1.g(2, 3.0)
38313835
end
3832-
@test occursin(r"Core.Compiler.Timings.Timing\(InferenceFrameInfo for Core.Compiler.Timings.ROOT\(\)\) with \d+ children", sprint(show, timing1))
3836+
@test timing1 isa Vector{Core.Compiler.Timings.Timing}
38333837
# The last two functions to be inferred should be `i` and `i2`, inferred at runtime with
38343838
# their concrete types.
38353839
@test sort([mi_info.mi.def.name for (time,mi_info) in flatten_times(timing1)[end-1:end]]) == [:i, :i2]
3836-
@test all(child->isa(child.bt, Vector), timing1.children)
3837-
@test all(child->child.bt===nothing, timing1.children[1].children)
3840+
@test all(child->isa(child.bt, Vector), timing1)
3841+
@test all(child->child.bt===nothing, timing1[1].children)
38383842
# Test the stacktrace
3839-
@test isa(stacktrace(timing1.children[1].bt), Vector{Base.StackTraces.StackFrame})
3843+
@test isa(stacktrace(timing1[1].bt), Vector{Base.StackTraces.StackFrame})
38403844
# Test that inference has cached some of the Method Instances
38413845
timing2 = time_inference() do
38423846
@eval M1.g(2, 3.0)
@@ -3857,16 +3861,16 @@ end
38573861
end
38583862
end
38593863
end
3860-
@test occursin("thunk from $(@__MODULE__) starting at $(@__FILE__):$((@__LINE__) - 5)", string(timingmod.children))
3864+
@test occursin("thunk from $(@__MODULE__) starting at $(@__FILE__):$((@__LINE__) - 5)", string(timingmod))
38613865
# END LINE NUMBER SENSITIVITY
38623866

38633867
# Recursive function
38643868
@eval module _Recursive f(n::Integer) = n == 0 ? 0 : f(n-1) + 1 end
38653869
timing = time_inference() do
38663870
@eval _Recursive.f(Base.inferencebarrier(5))
38673871
end
3868-
@test 2 <= depth(timing) <= 3 # root -> f (-> +)
3869-
@test 2 <= length(flatten_times(timing)) <= 3 # root, f, +
3872+
@test 1 <= max_depth(timing) <= 2 # f (-> +)
3873+
@test 1 <= length(flatten_times(timing)) <= 2 # f, +
38703874

38713875
# Functions inferred with multiple constants
38723876
@eval module C
@@ -3894,7 +3898,6 @@ end
38943898
@test !isempty(ft)
38953899
str = sprint(show, ft)
38963900
@test occursin("InferenceFrameInfo for /(1::$Int, ::$Int)", str) # inference constants
3897-
@test occursin("InferenceFrameInfo for Core.Compiler.Timings.ROOT()", str) # qualified
38983901
# loopc has internal slots, check constant printing in this case
38993902
sel = filter(ti -> ti.second.mi.def.name === :loopc, ft)
39003903
ifi = sel[end].second

0 commit comments

Comments
 (0)