Commit cb330a0

Author: Adhityaa Chandrasekar
server.go: use worker goroutines for fewer stack allocations
Currently (go1.13.4), the default stack size for newly spawned goroutines is
2048 bytes. This is insufficient when processing gRPC requests, as we often
require more than 4 KiB of stack. This causes the Go runtime to call
runtime.morestack at least twice per RPC, which hurts performance needlessly:
stack reallocation requires all sorts of internal work, such as changing
pointers to point to the new addresses. See golang/go#18138 for more details.

Since this stack growth is guaranteed to happen at least twice per RPC,
reusing goroutines gives us two wins:

1. The stack is already grown to 8 KiB after the first RPC, so subsequent
   RPCs do not call runtime.morestack.
2. We eliminate the need to spawn a new goroutine for each request (even
   though they're relatively inexpensive).

Performance improves across the board. The improvement is especially visible
in small, unary requests, where the overhead of stack reallocation is higher
percentage-wise. QPS is up anywhere between 3% and 5% depending on the number
of concurrent RPC requests in flight. Latency is down ~3%. There is even a 1%
decrease in memory footprint in some cases, though that is an unintended but
happy coincidence.

unary-networkMode_none-bufConn_false-keepalive_false-benchTime_1m0s-trace_false-latency_0s-kbps_0-MTU_0-maxConcurrentCalls_8-reqSize_1B-respSize_1B-compressor_off-channelz_false-preloader_false

      Title       Before        After  Percentage
   TotalOps      2613512      2701705       3.37%
    SendOps            0            0        NaN%
    RecvOps            0            0        NaN%
   Bytes/op      8657.00      8654.17      -0.03%
  Allocs/op       173.37       173.28       0.00%
    ReqT/op    348468.27    360227.33       3.37%
   RespT/op    348468.27    360227.33       3.37%
   50th-Lat    174.601µs    167.378µs      -4.14%
   90th-Lat    233.132µs    229.087µs      -1.74%
   99th-Lat     438.98µs    441.857µs       0.66%
    Avg-Lat    183.263µs     177.26µs      -3.28%
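To see the whole pattern in one place, here is a minimal, self-contained
sketch of the worker scheme this commit implements. It is illustrative only:
handleRequest, floorPow2, worker, and the uint32 job type stand in for gRPC's
stream handling; the real code is in the server.go diff below.

package main

import (
	"fmt"
	"runtime"
	"sync"
)

// After this many jobs a worker replaces itself with a fresh goroutine, so a
// stack that grew large for one request does not live in memory forever.
const workerStackReset = 1 << 16

// floorPow2 rounds n down to the nearest power of two so that id&(n-1) can
// stand in for the slower id%n when picking a worker.
func floorPow2(n uint32) uint32 {
	for i := uint32(1 << 31); i >= 2; i >>= 1 {
		if n&i > 0 {
			return i
		}
	}
	return 1
}

// handleRequest stands in for gRPC's per-stream work; on a reused worker its
// stack growth is paid only once, not once per request.
func handleRequest(id uint32) { fmt.Println("handled", id) }

func worker(ch chan uint32, wg *sync.WaitGroup) {
	completed := 0
	for id := range ch {
		handleRequest(id)
		wg.Done()
		completed++
		if completed == workerStackReset {
			go worker(ch, wg) // replace ourselves to reset the stack
			return
		}
	}
}

func main() {
	numWorkers := floorPow2(uint32(runtime.NumCPU()))
	workerMask := numWorkers - 1

	var wg sync.WaitGroup
	chans := make([]chan uint32, numWorkers)
	for i := range chans {
		chans[i] = make(chan uint32)
		go worker(chans[i], &wg)
	}

	for id := uint32(1); id <= 16; id++ {
		wg.Add(1)
		select {
		case chans[id&workerMask] <- id: // hand off to a warm worker
		default: // every worker busy: fall back to a one-off goroutine
			go func(id uint32) {
				handleRequest(id)
				wg.Done()
			}(id)
		}
	}

	wg.Wait()
	for _, ch := range chans {
		close(ch)
	}
}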
1 parent dc49de8 commit cb330a0

2 files changed: 71 additions, 4 deletions


internal/transport/transport.go

Lines changed: 5 additions & 0 deletions
@@ -285,6 +285,11 @@ type Stream struct {
 	contentSubtype string
 }
 
+// ID returns the stream ID.
+func (s *Stream) ID() uint32 {
+	return s.id
+}
+
 // isHeaderSent is only valid on the server-side.
 func (s *Stream) isHeaderSent() bool {
 	return atomic.LoadUint32(&s.headerSent) == 1
server.go

Lines changed: 66 additions & 4 deletions
@@ -712,15 +712,73 @@ func (s *Server) newHTTP2Transport(c net.Conn, authInfo credentials.AuthInfo) tr
 	return st
 }
 
+// floorCPUCount returns the number of CPUs available to the runtime, rounded
+// down to the nearest power of two.
+func floorCPUCount() uint32 {
+	n := uint32(runtime.NumCPU())
+	for i := uint32(1 << 31); i >= 2; i >>= 1 {
+		if n&i > 0 {
+			return i
+		}
+	}
+
+	return 1
+}
+
+// workerStackReset defines how often the stack must be reset. Every N
+// requests, by spawning a new goroutine in its place, a worker can reset its
+// stack so that large stacks don't live in memory forever. 2^16 should allow
+// each goroutine stack to live for at least a few seconds in a typical
+// workload (assuming a QPS of a few thousand requests/sec).
+const workerStackReset = 1 << 16
+
+// streamWorker blocks on a *transport.Stream channel forever and waits for
+// data to be fed by serveStreams. This allows different requests to be
+// processed by the same goroutine, removing the need for expensive stack
+// re-allocations (see the runtime.morestack problem [1]).
+//
+// [1] https://github.com/golang/go/issues/18138
+func (s *Server) streamWorker(st transport.ServerTransport, wg *sync.WaitGroup, ch chan *transport.Stream) {
+	completed := 0
+	for stream := range ch {
+		s.handleStream(st, stream, s.traceInfo(st, stream))
+		wg.Done()
+		completed++
+		if completed == workerStackReset {
+			go s.streamWorker(st, wg, ch)
+			return
+		}
+	}
+}
+
+// numWorkers defines the number of stream-handling workers. After
+// experimenting with different CPU counts, the floor (power of two) of the
+// number of CPUs available was found to be optimal for performance across
+// the board (QPS, latency).
+var numWorkers = floorCPUCount()
+
+// workerMask is used to perform bitwise AND operations instead of expensive
+// modulo operations on integers.
+var workerMask = numWorkers - 1
+
 func (s *Server) serveStreams(st transport.ServerTransport) {
 	defer st.Close()
 	var wg sync.WaitGroup
+
+	streamChannels := make([]chan *transport.Stream, numWorkers)
+	for i := range streamChannels {
+		streamChannels[i] = make(chan *transport.Stream)
+		go s.streamWorker(st, &wg, streamChannels[i])
+	}
+
 	st.HandleStreams(func(stream *transport.Stream) {
 		wg.Add(1)
-		go func() {
-			defer wg.Done()
-			s.handleStream(st, stream, s.traceInfo(st, stream))
-		}()
+		select {
+		case streamChannels[stream.ID()&workerMask] <- stream:
+		default:
+			go func() {
+				s.handleStream(st, stream, s.traceInfo(st, stream))
+				wg.Done()
+			}()
+		}
 	}, func(ctx context.Context, method string) context.Context {
 		if !EnableTracing {
 			return ctx
@@ -729,6 +787,10 @@ func (s *Server) serveStreams(st transport.ServerTransport) {
 		return trace.NewContext(ctx, tr)
 	})
 	wg.Wait()
+
+	for _, ch := range streamChannels {
+		close(ch)
+	}
 }
 
 var _ http.Handler = (*Server)(nil)
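As a quick illustration of the workerMask arithmetic (a standalone sketch,
not part of the patch; numWorkers == 4 is a hypothetical value, e.g. what
floorCPUCount would return on a machine with 4 to 7 CPUs): for a power-of-two
worker count n, id&(n-1) equals id%n, so picking a worker avoids an integer
division on the per-stream hot path.

package main

import "fmt"

func main() {
	const numWorkers = 4 // hypothetical floorCPUCount() result
	const workerMask = numWorkers - 1

	// For a power-of-two worker count, id&workerMask and id%numWorkers
	// agree for every stream ID.
	for id := uint32(0); id < 8; id++ {
		fmt.Printf("stream %d -> worker %d (id %% %d = %d)\n",
			id, id&workerMask, numWorkers, id%numWorkers)
	}
}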
