Skip to content

Commit acdfe8e

Browse files
htejunsmb49
authored andcommitted
cgroup: Use separate src/dst nodes when preloading css_sets for migration
BugLink: https://bugs.launchpad.net/bugs/1988219 commit 07fd5b6 upstream. Each cset (css_set) is pinned by its tasks. When we're moving tasks around across csets for a migration, we need to hold the source and destination csets to ensure that they don't go away while we're moving tasks about. This is done by linking cset->mg_preload_node on either the mgctx->preloaded_src_csets or mgctx->preloaded_dst_csets list. Using the same cset->mg_preload_node for both the src and dst lists was deemed okay as a cset can't be both the source and destination at the same time. Unfortunately, this overloading becomes problematic when multiple tasks are involved in a migration and some of them are identity noop migrations while others are actually moving across cgroups. For example, this can happen with the following sequence on cgroup1: #1> mkdir -p /sys/fs/cgroup/misc/a/b #2> echo $$ > /sys/fs/cgroup/misc/a/cgroup.procs #3> RUN_A_COMMAND_WHICH_CREATES_MULTIPLE_THREADS & #4> PID=$! #5> echo $PID > /sys/fs/cgroup/misc/a/b/tasks #6> echo $PID > /sys/fs/cgroup/misc/a/cgroup.procs the process including the group leader back into a. In this final migration, non-leader threads would be doing identity migration while the group leader is doing an actual one. After #3, let's say the whole process was in cset A, and that after #4, the leader moves to cset B. Then, during #6, the following happens: 1. cgroup_migrate_add_src() is called on B for the leader. 2. cgroup_migrate_add_src() is called on A for the other threads. 3. cgroup_migrate_prepare_dst() is called. It scans the src list. 4. It notices that B wants to migrate to A, so it tries to A to the dst list but realizes that its ->mg_preload_node is already busy. 5. and then it notices A wants to migrate to A as it's an identity migration, it culls it by list_del_init()'ing its ->mg_preload_node and putting references accordingly. 6. The rest of migration takes place with B on the src list but nothing on the dst list. This means that A isn't held while migration is in progress. If all tasks leave A before the migration finishes and the incoming task pins it, the cset will be destroyed leading to use-after-free. This is caused by overloading cset->mg_preload_node for both src and dst preload lists. We wanted to exclude the cset from the src list but ended up inadvertently excluding it from the dst list too. This patch fixes the issue by separating out cset->mg_preload_node into ->mg_src_preload_node and ->mg_dst_preload_node, so that the src and dst preloadings don't interfere with each other. Signed-off-by: Tejun Heo <[email protected]> Reported-by: Mukesh Ojha <[email protected]> Reported-by: shisiyuan <[email protected]> Link: http://lkml.kernel.org/r/[email protected] Link: https://www.spinics.net/lists/cgroups/msg33313.html Fixes: f817de9 ("cgroup: prepare migration path for unified hierarchy") Cc: [email protected] # v3.16+ Signed-off-by: Greg Kroah-Hartman <[email protected]> Signed-off-by: Kamal Mostafa <[email protected]> Signed-off-by: Stefan Bader <[email protected]>
1 parent 7349d13 commit acdfe8e

File tree

2 files changed

+25
-15
lines changed

2 files changed

+25
-15
lines changed

include/linux/cgroup-defs.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,8 @@ struct css_set {
255255
* List of csets participating in the on-going migration either as
256256
* source or destination. Protected by cgroup_mutex.
257257
*/
258-
struct list_head mg_preload_node;
258+
struct list_head mg_src_preload_node;
259+
struct list_head mg_dst_preload_node;
259260
struct list_head mg_node;
260261

261262
/*

kernel/cgroup/cgroup.c

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -743,7 +743,8 @@ struct css_set init_css_set = {
743743
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
744744
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
745745
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
746-
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
746+
.mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
747+
.mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
747748
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
748749

749750
/*
@@ -1219,7 +1220,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
12191220
INIT_LIST_HEAD(&cset->threaded_csets);
12201221
INIT_HLIST_NODE(&cset->hlist);
12211222
INIT_LIST_HEAD(&cset->cgrp_links);
1222-
INIT_LIST_HEAD(&cset->mg_preload_node);
1223+
INIT_LIST_HEAD(&cset->mg_src_preload_node);
1224+
INIT_LIST_HEAD(&cset->mg_dst_preload_node);
12231225
INIT_LIST_HEAD(&cset->mg_node);
12241226

12251227
/* Copy the set of subsystem state objects generated in
@@ -2629,21 +2631,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
26292631
*/
26302632
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
26312633
{
2632-
LIST_HEAD(preloaded);
26332634
struct css_set *cset, *tmp_cset;
26342635

26352636
lockdep_assert_held(&cgroup_mutex);
26362637

26372638
spin_lock_irq(&css_set_lock);
26382639

2639-
list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
2640-
list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
2640+
list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
2641+
mg_src_preload_node) {
2642+
cset->mg_src_cgrp = NULL;
2643+
cset->mg_dst_cgrp = NULL;
2644+
cset->mg_dst_cset = NULL;
2645+
list_del_init(&cset->mg_src_preload_node);
2646+
put_css_set_locked(cset);
2647+
}
26412648

2642-
list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
2649+
list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
2650+
mg_dst_preload_node) {
26432651
cset->mg_src_cgrp = NULL;
26442652
cset->mg_dst_cgrp = NULL;
26452653
cset->mg_dst_cset = NULL;
2646-
list_del_init(&cset->mg_preload_node);
2654+
list_del_init(&cset->mg_dst_preload_node);
26472655
put_css_set_locked(cset);
26482656
}
26492657

@@ -2685,7 +2693,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
26852693

26862694
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
26872695

2688-
if (!list_empty(&src_cset->mg_preload_node))
2696+
if (!list_empty(&src_cset->mg_src_preload_node))
26892697
return;
26902698

26912699
WARN_ON(src_cset->mg_src_cgrp);
@@ -2696,7 +2704,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
26962704
src_cset->mg_src_cgrp = src_cgrp;
26972705
src_cset->mg_dst_cgrp = dst_cgrp;
26982706
get_css_set(src_cset);
2699-
list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
2707+
list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
27002708
}
27012709

27022710
/**
@@ -2721,7 +2729,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
27212729

27222730
/* look up the dst cset for each src cset and link it to src */
27232731
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2724-
mg_preload_node) {
2732+
mg_src_preload_node) {
27252733
struct css_set *dst_cset;
27262734
struct cgroup_subsys *ss;
27272735
int ssid;
@@ -2740,16 +2748,16 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
27402748
if (src_cset == dst_cset) {
27412749
src_cset->mg_src_cgrp = NULL;
27422750
src_cset->mg_dst_cgrp = NULL;
2743-
list_del_init(&src_cset->mg_preload_node);
2751+
list_del_init(&src_cset->mg_src_preload_node);
27442752
put_css_set(src_cset);
27452753
put_css_set(dst_cset);
27462754
continue;
27472755
}
27482756

27492757
src_cset->mg_dst_cset = dst_cset;
27502758

2751-
if (list_empty(&dst_cset->mg_preload_node))
2752-
list_add_tail(&dst_cset->mg_preload_node,
2759+
if (list_empty(&dst_cset->mg_dst_preload_node))
2760+
list_add_tail(&dst_cset->mg_dst_preload_node,
27532761
&mgctx->preloaded_dst_csets);
27542762
else
27552763
put_css_set(dst_cset);
@@ -2980,7 +2988,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
29802988
goto out_finish;
29812989

29822990
spin_lock_irq(&css_set_lock);
2983-
list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
2991+
list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
2992+
mg_src_preload_node) {
29842993
struct task_struct *task, *ntask;
29852994

29862995
/* all tasks in src_csets need to be migrated */

0 commit comments

Comments
 (0)