Skip to content

Commit 1451ac5

Browse files
vilterpNHDaly
andcommitted
Allow threadsafe access to buffer ot type inference profiling trees
Co-authored-by: Nathan Daly <[email protected]>
1 parent cdcb07f commit 1451ac5

File tree

6 files changed

+131
-66
lines changed

6 files changed

+131
-66
lines changed

base/compiler/typeinfer.jl

Lines changed: 62 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ being used for this purpose alone.
2222
"""
2323
module Timings
2424

25-
using Core.Compiler: -, +, :, Vector, length, first, empty!, push!, pop!, @inline,
25+
using Core.Compiler: -, +, :, >, Vector, length, first, empty!, push!, pop!, @inline,
2626
@inbounds, copy, backtrace
2727

2828
# What we record for any given frame we infer during type inference.
@@ -47,13 +47,16 @@ end
4747

4848
_typeinf_identifier(frame::InferenceFrameInfo) = frame
4949

50+
_typeinf_frame_linfo(frame::Core.Compiler.InferenceState) = frame.linfo
51+
_typeinf_frame_linfo(frame::InferenceFrameInfo) = frame.mi
52+
5053
"""
51-
Core.Compiler.Timing(mi_info, start_time, ...)
54+
Core.Compiler.Timings.Timing(mi_info, start_time, ...)
5255
5356
Internal type containing the timing result for running type inference on a single
5457
MethodInstance.
5558
"""
56-
struct Timing
59+
mutable struct Timing
5760
mi_info::InferenceFrameInfo
5861
start_time::UInt64
5962
cur_start_time::UInt64
@@ -66,6 +69,28 @@ Timing(mi_info, start_time) = Timing(mi_info, start_time, start_time, UInt64(0),
6669

6770
_time_ns() = ccall(:jl_hrtime, UInt64, ()) # Re-implemented here because Base not yet available.
6871

72+
"""
73+
Core.Compiler.Timings.clear_and_fetch_timings()
74+
75+
Return, then clear, the previously recorded type inference timings (`Core.Compiler._timings`).
76+
77+
This fetches a vector of all of the type inference timings that have _finished_ as of this call. Note
78+
that there may be concurrent invocations of inference that are still running in another thread, but
79+
which haven't yet been added to this buffer. Those can be fetched in a future call.
80+
"""
81+
function clear_and_fetch_timings()
82+
# Pass in the type, since the C code doesn't know about our Timing struct.
83+
ccall(:jl_typeinf_profiling_clear_and_fetch, Any, (Any, Any,),
84+
_finished_timings, Vector{Timing})::Vector{Timing}
85+
end
86+
87+
function finish_timing_profile(timing::Timing)
88+
ccall(:jl_typeinf_profiling_push_timing, Cvoid, (Any, Any,), _finished_timings, timing)
89+
end
90+
91+
# TODO(PR): Comment this vector, explain that it should only be accessed thru C.
92+
const _finished_timings = Timing[]
93+
6994
# We keep a stack of the Timings for each of the MethodInstances currently being timed.
7095
# Since type inference currently operates via a depth-first search (during abstract
7196
# evaluation), this vector operates like a call stack. The last node in _timings is the
@@ -74,16 +99,20 @@ _time_ns() = ccall(:jl_hrtime, UInt64, ()) # Re-implemented here because Base n
7499
# call structure through type inference is recorded. (It's recorded as a tree, not a graph,
75100
# because we create a new node for duplicates.)
76101
const _timings = Timing[]
102+
77103
# ROOT() is an empty function used as the top-level Timing node to measure all time spent
78104
# *not* in type inference during a given recording trace. It is used as a "dummy" node.
79105
function ROOT() end
80106
const ROOTmi = Core.Compiler.specialize_method(
81107
first(Core.Compiler.methods(ROOT)), Tuple{typeof(ROOT)}, Core.svec())
108+
82109
"""
83110
Core.Compiler.reset_timings()
84111
85112
Empty out the previously recorded type inference timings (`Core.Compiler._timings`), and
86113
start the ROOT() timer again. `ROOT()` measures all time spent _outside_ inference.
114+
115+
DEPRECATED: this will be removed; use `clear_and_fetch_timings` instead.
87116
"""
88117
function reset_timings()
89118
empty!(_timings)
@@ -93,7 +122,6 @@ function reset_timings()
93122
_time_ns()))
94123
return nothing
95124
end
96-
reset_timings()
97125

98126
# (This is split into a function so that it can be called both in this module, at the top
99127
# of `enter_new_timer()`, and once at the Very End of the operation, by whoever started
@@ -105,44 +133,31 @@ reset_timings()
105133
parent_timer = _timings[end]
106134
accum_time = stop_time - parent_timer.cur_start_time
107135

108-
# Add in accum_time ("modify" the immutable struct)
136+
# Add in accum_time
109137
@inbounds begin
110-
_timings[end] = Timing(
111-
parent_timer.mi_info,
112-
parent_timer.start_time,
113-
parent_timer.cur_start_time,
114-
parent_timer.time + accum_time,
115-
parent_timer.children,
116-
parent_timer.bt,
117-
)
138+
_timings[end].time += accum_time
118139
end
119140
return nothing
120141
end
121142

122143
@inline function enter_new_timer(frame)
123144
# Very first thing, stop the active timer: get the current time and add in the
124145
# time since it was last started to its aggregate exclusive time.
125-
close_current_timer()
126-
127-
mi_info = _typeinf_identifier(frame)
146+
if length(_timings) > 0
147+
close_current_timer()
148+
end
128149

129150
# Start the new timer right before returning
151+
mi_info = _typeinf_identifier(frame)
130152
push!(_timings, Timing(mi_info, UInt64(0)))
131153
len = length(_timings)
132154
new_timer = @inbounds _timings[len]
155+
133156
# Set the current time _after_ appending the node, to try to exclude the
134157
# overhead from measurement.
135158
start = _time_ns()
136-
137-
@inbounds begin
138-
_timings[len] = Timing(
139-
new_timer.mi_info,
140-
start,
141-
start,
142-
new_timer.time,
143-
new_timer.children,
144-
)
145-
end
159+
new_timer.start_time = start
160+
new_timer.cur_start_time = start
146161

147162
return nothing
148163
end
@@ -154,43 +169,38 @@ end
154169
# Finish the new timer
155170
stop_time = _time_ns()
156171

157-
expected_mi_info = _typeinf_identifier(_expected_frame_)
172+
expected_linfo = _typeinf_frame_linfo(_expected_frame_)
158173

159174
# Grab the new timer again because it might have been modified in _timings
160175
# (since it's an immutable struct)
161176
# And remove it from the current timings stack
162177
new_timer = pop!(_timings)
163-
Core.Compiler.@assert new_timer.mi_info.mi === expected_mi_info.mi
178+
Core.Compiler.@assert new_timer.mi_info.mi === expected_linfo
164179

165-
# Prepare to unwind one level of the stack and record in the parent
166-
parent_timer = _timings[end]
180+
# check for two cases: normal case & backcompat case
181+
is_profile_root_normal = length(_timings) === 0
182+
is_profile_root_backcompat = length(_timings) === 1 && _timings[1] === ROOTmi
183+
is_profile_root = is_profile_root_normal || is_profile_root_backcompat
167184

168185
accum_time = stop_time - new_timer.cur_start_time
169186
# Add in accum_time ("modify" the immutable struct)
170-
new_timer = Timing(
171-
new_timer.mi_info,
172-
new_timer.start_time,
173-
new_timer.cur_start_time,
174-
new_timer.time + accum_time,
175-
new_timer.children,
176-
parent_timer.mi_info.mi === ROOTmi ? backtrace() : nothing,
177-
)
178-
# Record the final timing with the original parent timer
179-
push!(parent_timer.children, new_timer)
180-
181-
# And finally restart the parent timer:
182-
len = length(_timings)
183-
@inbounds begin
184-
_timings[len] = Timing(
185-
parent_timer.mi_info,
186-
parent_timer.start_time,
187-
_time_ns(),
188-
parent_timer.time,
189-
parent_timer.children,
190-
parent_timer.bt,
191-
)
187+
new_timer.time += accum_time
188+
if is_profile_root
189+
new_timer.bt = backtrace()
192190
end
193191

192+
# Prepare to unwind one level of the stack and record in the parent
193+
if is_profile_root
194+
finish_timing_profile(new_timer)
195+
else
196+
parent_timer = _timings[end]
197+
198+
# Record the final timing with the original parent timer
199+
push!(parent_timer.children, new_timer)
200+
201+
# And finally restart the parent timer:
202+
parent_timer.cur_start_time = _time_ns()
203+
end
194204
return nothing
195205
end
196206

doc/src/devdocs/locks.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ The following is a leaf lock (level 2), and only acquires level 1 locks (safepoi
4444
> * Module->lock
4545
> * JLDebuginfoPlugin::PluginMutex
4646
> * newly_inferred_mutex
47+
> * typeinf_profiling_lock
4748
4849
The following is a level 3 lock, which can only acquire level 1 or level 2 locks internally:
4950

src/Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ endif
4343
SRCS := \
4444
jltypes gf typemap smallintset ast builtins module interpreter symbol \
4545
dlload sys init task array staticdata toplevel jl_uv datatype \
46-
simplevector runtime_intrinsics precompile jloptions \
46+
simplevector runtime_intrinsics precompile jloptions inference-profiling \
4747
threading partr stackwalk gc gc-debug gc-pages gc-stacks gc-alloc-profiler method \
4848
jlapi signal-handling safepoint timing subtype rtutils gc-heap-snapshot \
4949
crc32c APInt-C processor ircode opaque_closure codegen-stubs coverage runtime_ccall
@@ -299,6 +299,7 @@ $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc.h
299299
$(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h
300300
$(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-heap-snapshot.h
301301
$(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc.h $(SRCDIR)/gc-alloc-profiler.h
302+
$(BUILDDIR)/inference-profiling.o $(BUILDDIR)/inference-profiling.dbg.obj: $(SRCDIR)/gc.h
302303
$(BUILDDIR)/init.o $(BUILDDIR)/init.dbg.obj: $(SRCDIR)/builtin_proto.h
303304
$(BUILDDIR)/interpreter.o $(BUILDDIR)/interpreter.dbg.obj: $(SRCDIR)/builtin_proto.h
304305
$(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/codegen_shared.h $(SRCDIR)/debug-registry.h

src/inference-profiling.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
// Implementation for type inference profiling
2+
3+
#include "julia.h"
4+
#include "julia_internal.h"
5+
6+
jl_mutex_t typeinf_profiling_lock;
7+
8+
// == exported interface ==
9+
10+
extern "C" {
11+
12+
JL_DLLEXPORT jl_array_t* jl_typeinf_profiling_clear_and_fetch(
13+
jl_array_t *inference_profiling_results_array,
14+
jl_value_t *array_timing_type
15+
)
16+
{
17+
JL_LOCK(&typeinf_profiling_lock);
18+
19+
size_t len = jl_array_len(inference_profiling_results_array);
20+
21+
jl_array_t *out = jl_alloc_array_1d(array_timing_type, len);
22+
JL_GC_PUSH1(&out);
23+
24+
memcpy(out->data, inference_profiling_results_array->data, len * sizeof(void*));
25+
26+
jl_array_del_end(inference_profiling_results_array, len);
27+
28+
JL_UNLOCK(&typeinf_profiling_lock);
29+
30+
JL_GC_POP();
31+
return out;
32+
}
33+
34+
JL_DLLEXPORT void jl_typeinf_profiling_push_timing(
35+
jl_array_t *inference_profiling_results_array,
36+
jl_value_t *timing
37+
)
38+
{
39+
JL_LOCK(&typeinf_profiling_lock);
40+
41+
jl_array_grow_end(inference_profiling_results_array, 1);
42+
size_t len = jl_array_len(inference_profiling_results_array);
43+
jl_array_ptr_set(inference_profiling_results_array, len - 1, timing);
44+
45+
JL_UNLOCK(&typeinf_profiling_lock);
46+
}
47+
48+
} // extern "C"

src/jl_exported_funcs.inc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,8 @@
484484
XX(jl_typeinf_lock_end) \
485485
XX(jl_typeinf_timing_begin) \
486486
XX(jl_typeinf_timing_end) \
487+
XX(jl_typeinf_profiling_clear_and_fetch) \
488+
XX(jl_typeinf_profiling_push_timing) \
487489
XX(jl_typename_str) \
488490
XX(jl_typeof_str) \
489491
XX(jl_types_equal) \

test/compiler/inference.jl

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3804,15 +3804,19 @@ f37532(T, x) = (Core.bitcast(Ptr{T}, x); x)
38043804
# Helper functions for Core.Compiler.Timings. These are normally accessed via a package -
38053805
# usually (SnoopCompileCore).
38063806
function time_inference(f)
3807-
Core.Compiler.Timings.reset_timings()
38083807
Core.Compiler.__set_measure_typeinf(true)
38093808
f()
38103809
Core.Compiler.__set_measure_typeinf(false)
3811-
Core.Compiler.Timings.close_current_timer()
3812-
return Core.Compiler.Timings._timings[1]
3810+
return Core.Compiler.Timings.clear_and_fetch_timings()
38133811
end
3814-
function depth(t::Core.Compiler.Timings.Timing)
3815-
maximum(depth.(t.children), init=0) + 1
3812+
function max_depth(t::Vector{Core.Compiler.Timings.Timing})
3813+
maximum(max_depth.(t), init=0) + 1
3814+
end
3815+
function max_depth(t::Core.Compiler.Timings.Timing)
3816+
maximum(max_depth.(t.children), init=0) + 1
3817+
end
3818+
function flatten_times(t::Vector{Core.Compiler.Timings.Timing})
3819+
collect(Iterators.flatten(flatten_times(el) for el in t))
38163820
end
38173821
function flatten_times(t::Core.Compiler.Timings.Timing)
38183822
collect(Iterators.flatten([(t.time => t.mi_info,), flatten_times.(t.children)...]))
@@ -3829,14 +3833,14 @@ end
38293833
timing1 = time_inference() do
38303834
@eval M1.g(2, 3.0)
38313835
end
3832-
@test occursin(r"Core.Compiler.Timings.Timing\(InferenceFrameInfo for Core.Compiler.Timings.ROOT\(\)\) with \d+ children", sprint(show, timing1))
3836+
@test timing1 isa Vector{Core.Compiler.Timings.Timing}
38333837
# The last two functions to be inferred should be `i` and `i2`, inferred at runtime with
38343838
# their concrete types.
38353839
@test sort([mi_info.mi.def.name for (time,mi_info) in flatten_times(timing1)[end-1:end]]) == [:i, :i2]
3836-
@test all(child->isa(child.bt, Vector), timing1.children)
3837-
@test all(child->child.bt===nothing, timing1.children[1].children)
3840+
@test all(child->isa(child.bt, Vector), timing1)
3841+
@test all(child->child.bt===nothing, timing1[1].children)
38383842
# Test the stacktrace
3839-
@test isa(stacktrace(timing1.children[1].bt), Vector{Base.StackTraces.StackFrame})
3843+
@test isa(stacktrace(timing1[1].bt), Vector{Base.StackTraces.StackFrame})
38403844
# Test that inference has cached some of the Method Instances
38413845
timing2 = time_inference() do
38423846
@eval M1.g(2, 3.0)
@@ -3857,16 +3861,16 @@ end
38573861
end
38583862
end
38593863
end
3860-
@test occursin("thunk from $(@__MODULE__) starting at $(@__FILE__):$((@__LINE__) - 5)", string(timingmod.children))
3864+
@test occursin("thunk from $(@__MODULE__) starting at $(@__FILE__):$((@__LINE__) - 5)", string(timingmod))
38613865
# END LINE NUMBER SENSITIVITY
38623866

38633867
# Recursive function
38643868
@eval module _Recursive f(n::Integer) = n == 0 ? 0 : f(n-1) + 1 end
38653869
timing = time_inference() do
38663870
@eval _Recursive.f(Base.inferencebarrier(5))
38673871
end
3868-
@test 2 <= depth(timing) <= 3 # root -> f (-> +)
3869-
@test 2 <= length(flatten_times(timing)) <= 3 # root, f, +
3872+
@test 1 <= max_depth(timing) <= 2 # f (-> +)
3873+
@test 1 <= length(flatten_times(timing)) <= 2 # f, +
38703874

38713875
# Functions inferred with multiple constants
38723876
@eval module C
@@ -3894,7 +3898,6 @@ end
38943898
@test !isempty(ft)
38953899
str = sprint(show, ft)
38963900
@test occursin("InferenceFrameInfo for /(1::$Int, ::$Int)", str) # inference constants
3897-
@test occursin("InferenceFrameInfo for Core.Compiler.Timings.ROOT()", str) # qualified
38983901
# loopc has internal slots, check constant printing in this case
38993902
sel = filter(ti -> ti.second.mi.def.name === :loopc, ft)
39003903
ifi = sel[end].second

0 commit comments

Comments
 (0)