
Commit 502a6c4

fix: Pin redis-py to 4.5.5 and enhance logging for Redis reconnection and failover (#1620)
This commit only includes the redis-py 4.5.5 pin, a fix for the connection-pool mis-creation in `AgentRegistry.handle_kernel_log()`, and a cap on `max_connections` for the default connection pool. An explicit warning is also added when the native sentinel client is used, because the rework of the sentinel connection pool (#1586) targets 23.09 only.
Backported-from: main
Backported-to: 23.03
1 parent: a69fccf
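For orientation, the construction pattern applied below in redis_helper.py looks roughly like this when written out standalone: the client is built on top of an explicitly created, size-capped connection pool instead of Redis.from_url(). This is a minimal sketch assuming redis-py 4.5.x; the URL and numeric values are placeholders, not the project's configuration.

# Minimal sketch of the pool construction pattern used in redis_helper.py below.
# The URL and numeric values are placeholders (assumptions), not project config.
import redis.exceptions
from redis.asyncio import ConnectionPool, Redis
from redis.asyncio.retry import Retry
from redis.backoff import ExponentialBackoff

pool = ConnectionPool.from_url(
    "redis://127.0.0.1:6379/0",  # placeholder address
    max_connections=16,          # cap mirroring _default_conn_pool_opts
)
client = Redis(
    connection_pool=pool,
    socket_keepalive=True,
    retry=Retry(ExponentialBackoff(), 10),  # exponential backoff while reconnecting
    retry_on_error=[
        redis.exceptions.ConnectionError,
        redis.exceptions.TimeoutError,
    ],
    auto_close_connection_pool=True,  # the pool is owned and closed by this client
)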

14 files changed (+218, -175 lines)

changes/1620.fix.md

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+Improve logging when retrying redis connections during failover and use explicit names for all redis connection pools

python.lock

Lines changed: 123 additions & 124 deletions
Large diffs are not rendered by default.

requirements.txt

Lines changed: 3 additions & 3 deletions
@@ -29,7 +29,6 @@ dataclasses-json~=0.5.7
 etcetra==0.1.17
 faker~=13.12.0
 graphene~=2.1.9
-hiredis~=2.2.3
 humanize>=3.1.0
 ifaddr~=0.2
 inquirer~=2.9.2
@@ -55,7 +54,8 @@ pyzmq~=24.0.1
 PyJWT~=2.0
 PyYAML~=6.0
 packaging>=21.3
-redis[hiredis]~=4.6.0
+hiredis>=2.2.3
+redis[hiredis]==4.5.5
 rich~=12.2
 SQLAlchemy[postgresql_asyncpg]~=1.4.40
 setproctitle~=1.3.2
@@ -69,7 +69,7 @@ tqdm>=4.61
 trafaret~=2.1
 typeguard~=2.10
 typing_extensions~=4.3
-uvloop>=0.17; sys_platform != "Windows"
+uvloop~=0.17.0; sys_platform != "Windows"  # 0.18 breaks the API and adds Python 3.12 support
 yarl~=1.8.2  # FIXME: revert to >=1.7 after aio-libs/yarl#862 is resolved
 zipstream-new~=1.1.8
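A quick way to confirm that an environment actually picked up the pinned versions is to print them at runtime; this is only an illustrative check (both packages expose __version__):

# Illustrative version check for the pins above.
import hiredis
import redis

print("redis-py:", redis.__version__)   # expected: 4.5.5 under this pin
print("hiredis:", hiredis.__version__)  # expected: >= 2.2.3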

src/ai/backend/agent/agent.py

Lines changed: 11 additions & 3 deletions
@@ -67,7 +67,7 @@
 
 from ai.backend.common import msgpack, redis_helper
 from ai.backend.common.config import model_definition_iv
-from ai.backend.common.defs import REDIS_STREAM_DB
+from ai.backend.common.defs import REDIS_STAT_DB, REDIS_STREAM_DB
 from ai.backend.common.docker import MAX_KERNELSPEC, MIN_KERNELSPEC, ImageRef
 from ai.backend.common.events import (
     AbstractEvent,
@@ -616,8 +616,16 @@ async def __ainit__(self) -> None:
             node_id=self.local_config["agent"]["id"],
             consumer_group=EVENT_DISPATCHER_CONSUMER_GROUP,
         )
-        self.redis_stream_pool = redis_helper.get_redis_object(self.local_config["redis"], db=4)
-        self.redis_stat_pool = redis_helper.get_redis_object(self.local_config["redis"], db=0)
+        self.redis_stream_pool = redis_helper.get_redis_object(
+            self.local_config["redis"],
+            name="stream",
+            db=REDIS_STREAM_DB,
+        )
+        self.redis_stat_pool = redis_helper.get_redis_object(
+            self.local_config["redis"],
+            name="stat",
+            db=REDIS_STAT_DB,
+        )
 
         alloc_map_mod.log_alloc_map = self.local_config["debug"]["log-alloc-map"]
         computers, self.slots = await self.detect_resources()

src/ai/backend/common/events.py

Lines changed: 18 additions & 18 deletions
@@ -9,10 +9,8 @@
 import socket
 import uuid
 from collections import defaultdict
-from types import TracebackType
 from typing import (
     Any,
-    Awaitable,
     Callable,
     ClassVar,
     Coroutine,
@@ -32,8 +30,8 @@
 from aiotools.context import aclosing
 from aiotools.server import process_index
 from aiotools.taskgroup import PersistentTaskGroup
+from aiotools.taskgroup.types import AsyncExceptionHandler
 from redis.asyncio import ConnectionPool
-from typing_extensions import TypeAlias
 
 from . import msgpack, redis_helper
 from .logging import BraceStyleAdapter
@@ -57,10 +55,6 @@
 
 log = BraceStyleAdapter(logging.getLogger(__spec__.name))  # type: ignore[name-defined]
 
-PTGExceptionHandler: TypeAlias = Callable[
-    [Type[Exception], Exception, TracebackType], Awaitable[None]
-]
-
 
 class AbstractEvent(metaclass=abc.ABCMeta):
     # derivatives should define the fields.
@@ -688,16 +682,18 @@ def __init__(
         log_events: bool = False,
         *,
         consumer_group: str,
-        service_name: str = None,
+        service_name: str | None = None,
         stream_key: str = "events",
-        node_id: str = None,
-        consumer_exception_handler: PTGExceptionHandler = None,
-        subscriber_exception_handler: PTGExceptionHandler = None,
+        node_id: str | None = None,
+        consumer_exception_handler: AsyncExceptionHandler | None = None,
+        subscriber_exception_handler: AsyncExceptionHandler | None = None,
     ) -> None:
         _redis_config = redis_config.copy()
         if service_name:
             _redis_config["service_name"] = service_name
-        self.redis_client = redis_helper.get_redis_object(_redis_config, db=db)
+        self.redis_client = redis_helper.get_redis_object(
+            _redis_config, name="event_dispatcher.stream", db=db
+        )
         self._log_events = log_events
         self._closed = False
         self.consumers = defaultdict(set)
@@ -743,7 +739,7 @@ def consume(
         callback: EventCallback[TContext, TEvent],
         coalescing_opts: CoalescingOptions = None,
         *,
-        name: str = None,
+        name: str | None = None,
     ) -> EventHandler[TContext, TEvent]:
         if name is None:
             name = f"evh-{secrets.token_urlsafe(16)}"
@@ -766,9 +762,9 @@ def subscribe(
         event_cls: Type[TEvent],
         context: TContext,
         callback: EventCallback[TContext, TEvent],
-        coalescing_opts: CoalescingOptions = None,
+        coalescing_opts: CoalescingOptions | None = None,
         *,
-        name: str = None,
+        name: str | None = None,
    ) -> EventHandler[TContext, TEvent]:
         if name is None:
             name = f"evh-{secrets.token_urlsafe(16)}"
@@ -892,15 +888,19 @@ def __init__(
         redis_config: EtcdRedisConfig,
         db: int = 0,
         *,
-        service_name: str = None,
+        service_name: str | None = None,
         stream_key: str = "events",
         log_events: bool = False,
     ) -> None:
         _redis_config = redis_config.copy()
         if service_name:
             _redis_config["service_name"] = service_name
         self._closed = False
-        self.redis_client = redis_helper.get_redis_object(_redis_config, db=db)
+        self.redis_client = redis_helper.get_redis_object(
+            _redis_config,
+            name="event_producer.stream",
+            db=db,
+        )
         self._log_events = log_events
         self._stream_key = stream_key
 
@@ -930,7 +930,7 @@ async def produce_event(
         )
 
 
-def _generate_consumer_id(node_id: str = None) -> str:
+def _generate_consumer_id(node_id: str | None = None) -> str:
     h = hashlib.sha1()
     h.update(str(node_id or socket.getfqdn()).encode("utf8"))
     hostname_hash = h.hexdigest()

src/ai/backend/common/redis_helper.py

Lines changed: 27 additions & 5 deletions
@@ -20,7 +20,7 @@
 
 import redis.exceptions
 import yarl
-from redis.asyncio import Redis
+from redis.asyncio import ConnectionPool, Redis
 from redis.asyncio.client import Pipeline, PubSub
 from redis.asyncio.sentinel import (
     MasterNotFoundError,
@@ -59,8 +59,6 @@
 
 
 _default_conn_opts: Mapping[str, Any] = {
-    "socket_timeout": 5.0,
-    "socket_connect_timeout": 2.0,
     "socket_keepalive": True,
     "socket_keepalive_options": _keepalive_options,
     "retry": Retry(ExponentialBackoff(), 10),
@@ -69,7 +67,10 @@
         redis.exceptions.TimeoutError,
     ],
 }
-
+_default_conn_pool_opts: Mapping[str, Any] = {
+    "max_connections": 16,
+    # "timeout": 20.0,  # for redis-py 5.0+
+}
 
 _scripts: Dict[str, str] = {}
 
@@ -471,10 +472,24 @@ async def read_stream_by_group(
 
 def get_redis_object(
     redis_config: EtcdRedisConfig,
+    name: str,  # placeholder for backported codes
     db: int = 0,
     **kwargs,
 ) -> RedisConnectionInfo:
+    conn_opts = {
+        **_default_conn_opts,
+        **kwargs,
+        # "lib_name": None,  # disable implicit "CLIENT SETINFO" (for redis-py 5.0+)
+        # "lib_version": None,  # disable implicit "CLIENT SETINFO" (for redis-py 5.0+)
+    }
+    conn_pool_opts = {
+        **_default_conn_pool_opts,
+    }
     if _sentinel_addresses := redis_config.get("sentinel"):
+        log.warning(
+            "Native sentinel client in 23.03 has imperfect implementation. "
+            "It is not recommended to use it."
+        )
         sentinel_addresses: Any = None
         if isinstance(_sentinel_addresses, str):
             sentinel_addresses = DelimiterSeperatedList(HostPortPair).check_and_return(
@@ -503,7 +518,14 @@ def get_redis_object(
             redis_url[1]
         ).with_password(redis_config.get("password")) / str(db)
         return RedisConnectionInfo(
-            client=Redis.from_url(str(url), **kwargs),
+            client=Redis(
+                connection_pool=ConnectionPool.from_url(
+                    str(url),
+                    **conn_pool_opts,
+                ),
+                **conn_opts,
+                auto_close_connection_pool=True,
+            ),
             service_name=None,
         )
 
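A hypothetical call site for the updated helper is shown below; the name value and the shared_config object are placeholders for illustration (real call sites appear in the manager diffs that follow):

# Hypothetical usage of the updated get_redis_object(); "my_component.stream"
# and shared_config are placeholders, not values from this commit.
from ai.backend.common import redis_helper
from ai.backend.common.defs import REDIS_STREAM_DB

redis_stream = redis_helper.get_redis_object(
    shared_config.data["redis"],   # EtcdRedisConfig loaded from etcd
    name="my_component.stream",    # explicit pool name (placeholder argument in this backport)
    db=REDIS_STREAM_DB,
)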
src/ai/backend/manager/api/logs.py

Lines changed: 3 additions & 9 deletions
@@ -11,14 +11,13 @@
 import sqlalchemy as sa
 import trafaret as t
 from aiohttp import web
+from dateutil.relativedelta import relativedelta
 
-from ai.backend.common import redis_helper
 from ai.backend.common import validators as tx
-from ai.backend.common.defs import REDIS_LIVE_DB
 from ai.backend.common.distributed import GlobalTimer
 from ai.backend.common.events import AbstractEvent, EmptyEventArgs, EventHandler
 from ai.backend.common.logging import BraceStyleAdapter
-from ai.backend.common.types import AgentId, LogSeverity, RedisConnectionInfo
+from ai.backend.common.types import AgentId, LogSeverity
 
 from ..defs import LockID
 from ..models import UserRole, error_logs, groups
@@ -219,6 +218,7 @@ async def log_cleanup_task(app: web.Application, src: AgentId, event: DoLogClean
     raw_lifetime = await etcd.get("config/logs/error/retention")
     if raw_lifetime is None:
         raw_lifetime = "90d"
+    lifetime: dt.timedelta | relativedelta
     try:
         lifetime = tx.TimeDuration().check(raw_lifetime)
     except ValueError:
@@ -239,7 +239,6 @@
 @attrs.define(slots=True, auto_attribs=True, init=False)
 class PrivateContext:
     log_cleanup_timer: GlobalTimer
-    log_cleanup_timer_redis: RedisConnectionInfo
     log_cleanup_timer_evh: EventHandler[web.Application, DoLogCleanupEvent]
 
 
@@ -251,10 +250,6 @@ async def init(app: web.Application) -> None:
         app,
         log_cleanup_task,
     )
-    app_ctx.log_cleanup_timer_redis = redis_helper.get_redis_object(
-        root_ctx.shared_config.data["redis"],
-        db=REDIS_LIVE_DB,
-    )
     app_ctx.log_cleanup_timer = GlobalTimer(
         root_ctx.distributed_lock_factory(LockID.LOCKID_LOG_CLEANUP_TIMER, 20.0),
         root_ctx.event_producer,
@@ -270,7 +265,6 @@ async def shutdown(app: web.Application) -> None:
     app_ctx: PrivateContext = app["logs.context"]
     await app_ctx.log_cleanup_timer.leave()
     root_ctx.event_dispatcher.unconsume(app_ctx.log_cleanup_timer_evh)
-    await app_ctx.log_cleanup_timer_redis.close()
 
 
 def create_app(

src/ai/backend/manager/api/ratelimit.py

Lines changed: 1 addition & 1 deletion
@@ -94,7 +94,7 @@ async def init(app: web.Application) -> None:
     root_ctx: RootContext = app["_root.context"]
     app_ctx: PrivateContext = app["ratelimit.context"]
     app_ctx.redis_rlim = redis_helper.get_redis_object(
-        root_ctx.shared_config.data["redis"], db=REDIS_RLIM_DB
+        root_ctx.shared_config.data["redis"], name="ratelimit", db=REDIS_RLIM_DB
     )
     app_ctx.redis_rlim_script = await redis_helper.execute(
         app_ctx.redis_rlim, lambda r: r.script_load(_rlim_script)

src/ai/backend/manager/cli/context.py

Lines changed: 12 additions & 2 deletions
@@ -66,14 +66,24 @@ async def redis_ctx(cli_ctx: CLIContext) -> AsyncIterator[RedisConnectionSet]:
     await shared_config.reload()
     raw_redis_config = await shared_config.etcd.get_prefix("config/redis")
     local_config["redis"] = redis_config_iv.check(raw_redis_config)
-    redis_live = redis_helper.get_redis_object(shared_config.data["redis"], db=REDIS_LIVE_DB)
-    redis_stat = redis_helper.get_redis_object(shared_config.data["redis"], db=REDIS_STAT_DB)
+    redis_live = redis_helper.get_redis_object(
+        shared_config.data["redis"],
+        name="mgr_cli.live",
+        db=REDIS_LIVE_DB,
+    )
+    redis_stat = redis_helper.get_redis_object(
+        shared_config.data["redis"],
+        name="mgr_cli.stat",
+        db=REDIS_STAT_DB,
+    )
     redis_image = redis_helper.get_redis_object(
         shared_config.data["redis"],
+        name="mgr_cli.image",
         db=REDIS_IMAGE_DB,
     )
     redis_stream = redis_helper.get_redis_object(
         shared_config.data["redis"],
+        name="mgr_cli.stream",
         db=REDIS_STREAM_DB,
     )
     yield RedisConnectionSet(

src/ai/backend/manager/idle.py

Lines changed: 2 additions & 0 deletions
@@ -183,10 +183,12 @@ def __init__(
         self._lock_factory = lock_factory
         self._redis_live = redis_helper.get_redis_object(
             self._shared_config.data["redis"],
+            name="idle.live",
             db=REDIS_LIVE_DB,
         )
         self._redis_stat = redis_helper.get_redis_object(
             self._shared_config.data["redis"],
+            name="idle.stat",
             db=REDIS_STAT_DB,
         )
         self._grace_period_checker: NewUserGracePeriodChecker = NewUserGracePeriodChecker(
