From 169410eba271afc9f0fb476d996795aa26770c6d Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:19 +0800 Subject: [PATCH 1/7] bpf: Check rcu_read_lock_trace_held() before calling bpf map helpers These three bpf_map_{lookup,update,delete}_elem() helpers are also available for sleepable bpf program, so add the corresponding lock assertion for sleepable bpf program, otherwise the following warning will be reported when a sleepable bpf program manipulates bpf map under interpreter mode (aka bpf_jit_enable=0): WARNING: CPU: 3 PID: 4985 at kernel/bpf/helpers.c:40 ...... CPU: 3 PID: 4985 Comm: test_progs Not tainted 6.6.0+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...... RIP: 0010:bpf_map_lookup_elem+0x54/0x60 ...... Call Trace: ? __warn+0xa5/0x240 ? bpf_map_lookup_elem+0x54/0x60 ? report_bug+0x1ba/0x1f0 ? handle_bug+0x40/0x80 ? exc_invalid_op+0x18/0x50 ? asm_exc_invalid_op+0x1b/0x20 ? __pfx_bpf_map_lookup_elem+0x10/0x10 ? rcu_lockdep_current_cpu_online+0x65/0xb0 ? rcu_is_watching+0x23/0x50 ? bpf_map_lookup_elem+0x54/0x60 ? __pfx_bpf_map_lookup_elem+0x10/0x10 ___bpf_prog_run+0x513/0x3b70 __bpf_prog_run32+0x9d/0xd0 ? __bpf_prog_enter_sleepable_recur+0xad/0x120 ? __bpf_prog_enter_sleepable_recur+0x3e/0x120 bpf_trampoline_6442580665+0x4d/0x1000 __x64_sys_getpgid+0x5/0x30 ? do_syscall_64+0x36/0xb0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/helpers.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index b45a8381f9bd..ee9bdf29246a 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -32,12 +32,13 @@ * * Different map implementations will rely on rcu in map methods * lookup/update/delete, therefore eBPF programs must run under rcu lock - * if program is allowed to access maps, so check rcu_read_lock_held in - * all three functions. + * if program is allowed to access maps, so check rcu_read_lock_held() or + * rcu_read_lock_trace_held() in all three functions. */ BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return (unsigned long) map->ops->map_lookup_elem(map, key); } @@ -53,7 +54,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto = { BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, void *, value, u64, flags) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return map->ops->map_update_elem(map, key, value, flags); } @@ -70,7 +72,8 @@ const struct bpf_func_proto bpf_map_update_elem_proto = { BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) { - WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && + !rcu_read_lock_bh_held()); return map->ops->map_delete_elem(map, key); } From 20c20bd11a0702ce4dc9300c3da58acf551d9725 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:20 +0800 Subject: [PATCH 2/7] bpf: Add map and need_defer parameters to .map_fd_put_ptr() map is the pointer of outer map, and need_defer needs some explanation. need_defer tells the implementation to defer the reference release of the passed element and ensure that the element is still alive before the bpf program, which may manipulate it, exits. The following three cases will invoke map_fd_put_ptr() and different need_defer values will be passed to these callers: 1) release the reference of the old element in the map during map update or map deletion. The release must be deferred, otherwise the bpf program may incur use-after-free problem, so need_defer needs to be true. 2) release the reference of the to-be-added element in the error path of map update. The to-be-added element is not visible to any bpf program, so it is OK to pass false for need_defer parameter. 3) release the references of all elements in the map during map release. Any bpf program which has access to the map must have been exited and released, so need_defer=false will be OK. These two parameters will be used by the following patches to fix the potential use-after-free problem for map-in-map. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-3-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++++- kernel/bpf/arraymap.c | 12 +++++++----- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/map_in_map.h | 2 +- 5 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb447b0a9423..d273348cfb2f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -106,7 +106,11 @@ struct bpf_map_ops { /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); - void (*map_fd_put_ptr)(void *ptr); + /* If need_defer is true, the implementation should guarantee that + * the to-be-put element is still alive before the bpf program, which + * may manipulate it, exists. + */ + void (*map_fd_put_ptr)(struct bpf_map *map, void *ptr, bool need_defer); int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); void (*map_seq_show_elem)(struct bpf_map *map, void *key, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 2058e89b5ddd..f9aed5909d6e 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -867,7 +867,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, } if (old_ptr) - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } @@ -890,7 +890,7 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key) } if (old_ptr) { - map->ops->map_fd_put_ptr(old_ptr); + map->ops->map_fd_put_ptr(map, old_ptr, true); return 0; } else { return -ENOENT; @@ -913,8 +913,9 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map, return prog; } -static void prog_fd_array_put_ptr(void *ptr) +static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + /* bpf_prog is freed after one RCU or tasks trace grace period */ bpf_prog_put(ptr); } @@ -1239,8 +1240,9 @@ err_out: return ee; } -static void perf_event_fd_array_put_ptr(void *ptr) +static void perf_event_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { + /* bpf_perf_event is freed after one RCU grace period */ bpf_event_entry_free_rcu(ptr); } @@ -1294,7 +1296,7 @@ static void *cgroup_fd_array_get_ptr(struct bpf_map *map, return cgroup_get_from_fd(fd); } -static void cgroup_fd_array_put_ptr(void *ptr) +static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* cgroup_put free cgrp after a rcu grace period */ cgroup_put(ptr); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fd8d4b0addfc..5b9146fa825f 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -897,7 +897,7 @@ static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) if (map->ops->map_fd_put_ptr) { ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, true); } } @@ -2484,7 +2484,7 @@ static void fd_htab_map_free(struct bpf_map *map) hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { void *ptr = fd_htab_map_get_ptr(map, l); - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false); } } @@ -2525,7 +2525,7 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, ret = htab_map_update_elem(map, key, &ptr, map_flags); if (ret) - map->ops->map_fd_put_ptr(ptr); + map->ops->map_fd_put_ptr(map, ptr, false); return ret; } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index cd5eafaba97e..2dfeb5835e16 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -127,7 +127,7 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, return inner_map; } -void bpf_map_fd_put_ptr(void *ptr) +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { /* ptr->ops->map_free() has to go through one * rcu grace period by itself. diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index bcb7534afb3c..7d61602354de 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h @@ -13,7 +13,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); void bpf_map_meta_free(struct bpf_map *map_meta); void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, int ufd); -void bpf_map_fd_put_ptr(void *ptr); +void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer); u32 bpf_map_fd_sys_lookup_elem(void *ptr); #endif From 79d93b3c6ffd79abcd8e43345980aa1e904879c4 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:21 +0800 Subject: [PATCH 3/7] bpf: Set need_defer as false when clearing fd array during map free Both map deletion operation, map release and map free operation use fd_array_map_delete_elem() to remove the element from fd array and need_defer is always true in fd_array_map_delete_elem(). For the map deletion operation and map release operation, need_defer=true is necessary, because the bpf program, which accesses the element in fd array, may still alive. However for map free operation, it is certain that the bpf program which owns the fd array has already been exited, so setting need_defer as false is appropriate for map free operation. So fix it by adding need_defer parameter to bpf_fd_array_map_clear() and adding a new helper __fd_array_map_delete_elem() to handle the map deletion, map release and map free operations correspondingly. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-4-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/arraymap.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index f9aed5909d6e..4a4a67956e21 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -871,7 +871,7 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, return 0; } -static long fd_array_map_delete_elem(struct bpf_map *map, void *key) +static long __fd_array_map_delete_elem(struct bpf_map *map, void *key, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); void *old_ptr; @@ -890,13 +890,18 @@ static long fd_array_map_delete_elem(struct bpf_map *map, void *key) } if (old_ptr) { - map->ops->map_fd_put_ptr(map, old_ptr, true); + map->ops->map_fd_put_ptr(map, old_ptr, need_defer); return 0; } else { return -ENOENT; } } +static long fd_array_map_delete_elem(struct bpf_map *map, void *key) +{ + return __fd_array_map_delete_elem(map, key, true); +} + static void *prog_fd_array_get_ptr(struct bpf_map *map, struct file *map_file, int fd) { @@ -925,13 +930,13 @@ static u32 prog_fd_array_sys_lookup_elem(void *ptr) } /* decrement refcnt of all bpf_progs that are stored in this map */ -static void bpf_fd_array_map_clear(struct bpf_map *map) +static void bpf_fd_array_map_clear(struct bpf_map *map, bool need_defer) { struct bpf_array *array = container_of(map, struct bpf_array, map); int i; for (i = 0; i < array->map.max_entries; i++) - fd_array_map_delete_elem(map, &i); + __fd_array_map_delete_elem(map, &i, need_defer); } static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, @@ -1110,7 +1115,7 @@ static void prog_array_map_clear_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_array_aux, work)->map; - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, true); bpf_map_put(map); } @@ -1260,7 +1265,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, for (i = 0; i < array->map.max_entries; i++) { ee = READ_ONCE(array->ptrs[i]); if (ee && ee->map_file == map_file) - fd_array_map_delete_elem(map, &i); + __fd_array_map_delete_elem(map, &i, true); } rcu_read_unlock(); } @@ -1268,7 +1273,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, static void perf_event_fd_array_map_free(struct bpf_map *map) { if (map->map_flags & BPF_F_PRESERVE_ELEMS) - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } @@ -1304,7 +1309,7 @@ static void cgroup_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_de static void cgroup_fd_array_free(struct bpf_map *map) { - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } @@ -1349,7 +1354,7 @@ static void array_of_map_free(struct bpf_map *map) * is protected by fdget/fdput. */ bpf_map_meta_free(map->inner_map_meta); - bpf_fd_array_map_clear(map); + bpf_fd_array_map_clear(map, false); fd_array_map_free(map); } From 876673364161da50eed6b472d746ef88242b2368 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:22 +0800 Subject: [PATCH 4/7] bpf: Defer the free of inner map when necessary When updating or deleting an inner map in map array or map htab, the map may still be accessed by non-sleepable program or sleepable program. However bpf_map_fd_put_ptr() decreases the ref-counter of the inner map directly through bpf_map_put(), if the ref-counter is the last one (which is true for most cases), the inner map will be freed by ops->map_free() in a kworker. But for now, most .map_free() callbacks don't use synchronize_rcu() or its variants to wait for the elapse of a RCU grace period, so after the invocation of ops->map_free completes, the bpf program which is accessing the inner map may incur use-after-free problem. Fix the free of inner map by invoking bpf_map_free_deferred() after both one RCU grace period and one tasks trace RCU grace period if the inner map has been removed from the outer map before. The deferment is accomplished by using call_rcu() or call_rcu_tasks_trace() when releasing the last ref-counter of bpf map. The newly-added rcu_head field in bpf_map shares the same storage space with work field to reduce the size of bpf_map. Fixes: bba1dc0b55ac ("bpf: Remove redundant synchronize_rcu.") Fixes: 638e4b825d52 ("bpf: Allows per-cpu maps and map-in-map in sleepable programs") Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-5-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 ++++++- kernel/bpf/map_in_map.c | 11 ++++++++--- kernel/bpf/syscall.c | 32 +++++++++++++++++++++++++++----- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d273348cfb2f..de3bd03cbeea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -276,7 +276,11 @@ struct bpf_map { */ atomic64_t refcnt ____cacheline_aligned; atomic64_t usercnt; - struct work_struct work; + /* rcu is used before freeing and work is only used during freeing */ + union { + struct work_struct work; + struct rcu_head rcu; + }; struct mutex freeze_mutex; atomic64_t writecnt; /* 'Ownership' of program-containing map is claimed by the first program @@ -292,6 +296,7 @@ struct bpf_map { } owner; bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ + bool free_after_mult_rcu_gp; s64 __percpu *elem_count; }; diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 2dfeb5835e16..3248ff5d8161 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -129,10 +129,15 @@ void *bpf_map_fd_get_ptr(struct bpf_map *map, void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { - /* ptr->ops->map_free() has to go through one - * rcu grace period by itself. + struct bpf_map *inner_map = ptr; + + /* The inner map may still be used by both non-sleepable and sleepable + * bpf program, so free it after one RCU grace period and one tasks + * trace RCU grace period. */ - bpf_map_put(ptr); + if (need_defer) + WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + bpf_map_put(inner_map); } u32 bpf_map_fd_sys_lookup_elem(void *ptr) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5e43ddd1b83f..dd515f6b9741 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -719,6 +719,28 @@ static void bpf_map_put_uref(struct bpf_map *map) } } +static void bpf_map_free_in_work(struct bpf_map *map) +{ + INIT_WORK(&map->work, bpf_map_free_deferred); + /* Avoid spawning kworkers, since they all might contend + * for the same mutex like slab_mutex. + */ + queue_work(system_unbound_wq, &map->work); +} + +static void bpf_map_free_rcu_gp(struct rcu_head *rcu) +{ + bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu)); +} + +static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu) +{ + if (rcu_trace_implies_rcu_gp()) + bpf_map_free_rcu_gp(rcu); + else + call_rcu(rcu, bpf_map_free_rcu_gp); +} + /* decrement map refcnt and schedule it for freeing via workqueue * (underlying map implementation ops->map_free() might sleep) */ @@ -728,11 +750,11 @@ void bpf_map_put(struct bpf_map *map) /* bpf_map_free_id() must be called first */ bpf_map_free_id(map); btf_put(map->btf); - INIT_WORK(&map->work, bpf_map_free_deferred); - /* Avoid spawning kworkers, since they all might contend - * for the same mutex like slab_mutex. - */ - queue_work(system_unbound_wq, &map->work); + + if (READ_ONCE(map->free_after_mult_rcu_gp)) + call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); + else + bpf_map_free_in_work(map); } } EXPORT_SYMBOL_GPL(bpf_map_put); From af66bfd3c8538ed21cf72af18426fc4a408665cf Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:23 +0800 Subject: [PATCH 5/7] bpf: Optimize the free of inner map When removing the inner map from the outer map, the inner map will be freed after one RCU grace period and one RCU tasks trace grace period, so it is certain that the bpf program, which may access the inner map, has exited before the inner map is freed. However there is no need to wait for one RCU tasks trace grace period if the outer map is only accessed by non-sleepable program. So adding sleepable_refcnt in bpf_map and increasing sleepable_refcnt when adding the outer map into env->used_maps for sleepable program. Although the max number of bpf program is INT_MAX - 1, the number of bpf programs which are being loaded may be greater than INT_MAX, so using atomic64_t instead of atomic_t for sleepable_refcnt. When removing the inner map from the outer map, using sleepable_refcnt to decide whether or not a RCU tasks trace grace period is needed before freeing the inner map. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-6-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ kernel/bpf/core.c | 4 ++++ kernel/bpf/map_in_map.c | 14 +++++++++----- kernel/bpf/syscall.c | 8 ++++++++ kernel/bpf/verifier.c | 4 +++- 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index de3bd03cbeea..10e5e4d8a00f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -297,6 +297,8 @@ struct bpf_map { bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ bool free_after_mult_rcu_gp; + bool free_after_rcu_gp; + atomic64_t sleepable_refcnt; s64 __percpu *elem_count; }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index cd3afe57ece3..4b813da8d6c0 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2664,12 +2664,16 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; + bool sleepable; u32 i; + sleepable = aux->sleepable; for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); + if (sleepable) + atomic64_dec(&map->sleepable_refcnt); bpf_map_put(map); } } diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3248ff5d8161..8ef269e66ba5 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -131,12 +131,16 @@ void bpf_map_fd_put_ptr(struct bpf_map *map, void *ptr, bool need_defer) { struct bpf_map *inner_map = ptr; - /* The inner map may still be used by both non-sleepable and sleepable - * bpf program, so free it after one RCU grace period and one tasks - * trace RCU grace period. + /* Defer the freeing of inner map according to the sleepable attribute + * of bpf program which owns the outer map, so unnecessary waiting for + * RCU tasks trace grace period can be avoided. */ - if (need_defer) - WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + if (need_defer) { + if (atomic64_read(&map->sleepable_refcnt)) + WRITE_ONCE(inner_map->free_after_mult_rcu_gp, true); + else + WRITE_ONCE(inner_map->free_after_rcu_gp, true); + } bpf_map_put(inner_map); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd515f6b9741..ebaccf77d56e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -751,8 +751,11 @@ void bpf_map_put(struct bpf_map *map) bpf_map_free_id(map); btf_put(map->btf); + WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt)); if (READ_ONCE(map->free_after_mult_rcu_gp)) call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp); + else if (READ_ONCE(map->free_after_rcu_gp)) + call_rcu(&map->rcu, bpf_map_free_rcu_gp); else bpf_map_free_in_work(map); } @@ -5345,6 +5348,11 @@ static int bpf_prog_bind_map(union bpf_attr *attr) goto out_unlock; } + /* The bpf program will not access the bpf map, but for the sake of + * simplicity, increase sleepable_refcnt for sleepable program as well. + */ + if (prog->aux->sleepable) + atomic64_inc(&map->sleepable_refcnt); memcpy(used_maps_new, used_maps_old, sizeof(used_maps_old[0]) * prog->aux->used_map_cnt); used_maps_new[prog->aux->used_map_cnt] = map; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cdb4f5f0ba79..1ed39665f802 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17889,10 +17889,12 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -E2BIG; } + if (env->prog->aux->sleepable) + atomic64_inc(&map->sleepable_refcnt); /* hold the map. If the program is rejected by verifier, * the map will be released by release_maps() or it * will be used by the valid program until it's unloaded - * and all maps are released in free_used_maps() + * and all maps are released in bpf_free_used_maps() */ bpf_map_inc(map); From 1624918be84a8bcc4f592e55635bc4fe4a96460a Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:24 +0800 Subject: [PATCH 6/7] selftests/bpf: Add test cases for inner map Add test cases to test the race between the destroy of inner map due to map-in-map update and the access of inner map in bpf program. The following 4 combinations are added: (1) array map in map array + bpf program (2) array map in map array + sleepable bpf program (3) array map in map htab + bpf program (4) array map in map htab + sleepable bpf program Before applying the fixes, when running `./test_prog -a map_in_map`, the following error was reported: ================================================================== BUG: KASAN: slab-use-after-free in array_map_update_elem+0x48/0x3e0 Read of size 4 at addr ffff888114f33824 by task test_progs/1858 CPU: 1 PID: 1858 Comm: test_progs Tainted: G O 6.6.0+ #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...... Call Trace: dump_stack_lvl+0x4a/0x90 print_report+0xd2/0x620 kasan_report+0xd1/0x110 __asan_load4+0x81/0xa0 array_map_update_elem+0x48/0x3e0 bpf_prog_be94a9f26772f5b7_access_map_in_array+0xe6/0xf6 trace_call_bpf+0x1aa/0x580 kprobe_perf_func+0xdd/0x430 kprobe_dispatcher+0xa0/0xb0 kprobe_ftrace_handler+0x18b/0x2e0 0xffffffffc02280f7 RIP: 0010:__x64_sys_getpgid+0x1/0x30 ...... Allocated by task 1857: kasan_save_stack+0x26/0x50 kasan_set_track+0x25/0x40 kasan_save_alloc_info+0x1e/0x30 __kasan_kmalloc+0x98/0xa0 __kmalloc_node+0x6a/0x150 __bpf_map_area_alloc+0x141/0x170 bpf_map_area_alloc+0x10/0x20 array_map_alloc+0x11f/0x310 map_create+0x28a/0xb40 __sys_bpf+0x753/0x37c0 __x64_sys_bpf+0x44/0x60 do_syscall_64+0x36/0xb0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Freed by task 11: kasan_save_stack+0x26/0x50 kasan_set_track+0x25/0x40 kasan_save_free_info+0x2b/0x50 __kasan_slab_free+0x113/0x190 slab_free_freelist_hook+0xd7/0x1e0 __kmem_cache_free+0x170/0x260 kfree+0x9b/0x160 kvfree+0x2d/0x40 bpf_map_area_free+0xe/0x20 array_map_free+0x120/0x2c0 bpf_map_free_deferred+0xd7/0x1e0 process_one_work+0x462/0x990 worker_thread+0x370/0x670 kthread+0x1b0/0x200 ret_from_fork+0x3a/0x70 ret_from_fork_asm+0x1b/0x30 Last potentially related work creation: kasan_save_stack+0x26/0x50 __kasan_record_aux_stack+0x94/0xb0 kasan_record_aux_stack_noalloc+0xb/0x20 __queue_work+0x331/0x950 queue_work_on+0x75/0x80 bpf_map_put+0xfa/0x160 bpf_map_fd_put_ptr+0xe/0x20 bpf_fd_array_map_update_elem+0x174/0x1b0 bpf_map_update_value+0x2b7/0x4a0 __sys_bpf+0x2551/0x37c0 __x64_sys_bpf+0x44/0x60 do_syscall_64+0x36/0xb0 entry_SYSCALL_64_after_hwframe+0x6e/0x76 Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-7-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/map_in_map.c | 141 ++++++++++++++++++ .../selftests/bpf/progs/access_map_in_map.c | 93 ++++++++++++ 2 files changed, 234 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/map_in_map.c create mode 100644 tools/testing/selftests/bpf/progs/access_map_in_map.c diff --git a/tools/testing/selftests/bpf/prog_tests/map_in_map.c b/tools/testing/selftests/bpf/prog_tests/map_in_map.c new file mode 100644 index 000000000000..d2a10eb4e5b5 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_in_map.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2023. Huawei Technologies Co., Ltd */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include "access_map_in_map.skel.h" + +struct thread_ctx { + pthread_barrier_t barrier; + int outer_map_fd; + int start, abort; + int loop, err; +}; + +static int wait_for_start_or_abort(struct thread_ctx *ctx) +{ + while (!ctx->start && !ctx->abort) + usleep(1); + return ctx->abort ? -1 : 0; +} + +static void *update_map_fn(void *data) +{ + struct thread_ctx *ctx = data; + int loop = ctx->loop, err = 0; + + if (wait_for_start_or_abort(ctx) < 0) + return NULL; + pthread_barrier_wait(&ctx->barrier); + + while (loop-- > 0) { + int fd, zero = 0; + + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, 4, 4, 1, NULL); + if (fd < 0) { + err |= 1; + pthread_barrier_wait(&ctx->barrier); + continue; + } + + /* Remove the old inner map */ + if (bpf_map_update_elem(ctx->outer_map_fd, &zero, &fd, 0) < 0) + err |= 2; + close(fd); + pthread_barrier_wait(&ctx->barrier); + } + + ctx->err = err; + + return NULL; +} + +static void *access_map_fn(void *data) +{ + struct thread_ctx *ctx = data; + int loop = ctx->loop; + + if (wait_for_start_or_abort(ctx) < 0) + return NULL; + pthread_barrier_wait(&ctx->barrier); + + while (loop-- > 0) { + /* Access the old inner map */ + syscall(SYS_getpgid); + pthread_barrier_wait(&ctx->barrier); + } + + return NULL; +} + +static void test_map_in_map_access(const char *prog_name, const char *map_name) +{ + struct access_map_in_map *skel; + struct bpf_map *outer_map; + struct bpf_program *prog; + struct thread_ctx ctx; + pthread_t tid[2]; + int err; + + skel = access_map_in_map__open(); + if (!ASSERT_OK_PTR(skel, "access_map_in_map open")) + return; + + prog = bpf_object__find_program_by_name(skel->obj, prog_name); + if (!ASSERT_OK_PTR(prog, "find program")) + goto out; + bpf_program__set_autoload(prog, true); + + outer_map = bpf_object__find_map_by_name(skel->obj, map_name); + if (!ASSERT_OK_PTR(outer_map, "find map")) + goto out; + + err = access_map_in_map__load(skel); + if (!ASSERT_OK(err, "access_map_in_map load")) + goto out; + + err = access_map_in_map__attach(skel); + if (!ASSERT_OK(err, "access_map_in_map attach")) + goto out; + + skel->bss->tgid = getpid(); + + memset(&ctx, 0, sizeof(ctx)); + pthread_barrier_init(&ctx.barrier, NULL, 2); + ctx.outer_map_fd = bpf_map__fd(outer_map); + ctx.loop = 4; + + err = pthread_create(&tid[0], NULL, update_map_fn, &ctx); + if (!ASSERT_OK(err, "close_thread")) + goto out; + + err = pthread_create(&tid[1], NULL, access_map_fn, &ctx); + if (!ASSERT_OK(err, "read_thread")) { + ctx.abort = 1; + pthread_join(tid[0], NULL); + goto out; + } + + ctx.start = 1; + pthread_join(tid[0], NULL); + pthread_join(tid[1], NULL); + + ASSERT_OK(ctx.err, "err"); +out: + access_map_in_map__destroy(skel); +} + +void test_map_in_map(void) +{ + if (test__start_subtest("acc_map_in_array")) + test_map_in_map_access("access_map_in_array", "outer_array_map"); + if (test__start_subtest("sleepable_acc_map_in_array")) + test_map_in_map_access("sleepable_access_map_in_array", "outer_array_map"); + if (test__start_subtest("acc_map_in_htab")) + test_map_in_map_access("access_map_in_htab", "outer_htab_map"); + if (test__start_subtest("sleepable_acc_map_in_htab")) + test_map_in_map_access("sleepable_access_map_in_htab", "outer_htab_map"); +} + diff --git a/tools/testing/selftests/bpf/progs/access_map_in_map.c b/tools/testing/selftests/bpf/progs/access_map_in_map.c new file mode 100644 index 000000000000..1126871c2ebd --- /dev/null +++ b/tools/testing/selftests/bpf/progs/access_map_in_map.c @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (C) 2023. Huawei Technologies Co., Ltd */ +#include +#include +#include + +#include "bpf_misc.h" + +struct inner_map_type { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, 4); + __uint(value_size, 4); + __uint(max_entries, 1); +} inner_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); + __array(values, struct inner_map_type); +} outer_array_map SEC(".maps") = { + .values = { + [0] = &inner_map, + }, +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); + __array(values, struct inner_map_type); +} outer_htab_map SEC(".maps") = { + .values = { + [0] = &inner_map, + }, +}; + +char _license[] SEC("license") = "GPL"; + +int tgid = 0; + +static int acc_map_in_map(void *outer_map) +{ + int i, key, value = 0xdeadbeef; + void *inner_map; + + if ((bpf_get_current_pid_tgid() >> 32) != tgid) + return 0; + + /* Find nonexistent inner map */ + key = 1; + inner_map = bpf_map_lookup_elem(outer_map, &key); + if (inner_map) + return 0; + + /* Find the old inner map */ + key = 0; + inner_map = bpf_map_lookup_elem(outer_map, &key); + if (!inner_map) + return 0; + + /* Wait for the old inner map to be replaced */ + for (i = 0; i < 2048; i++) + bpf_map_update_elem(inner_map, &key, &value, 0); + + return 0; +} + +SEC("?kprobe/" SYS_PREFIX "sys_getpgid") +int access_map_in_array(void *ctx) +{ + return acc_map_in_map(&outer_array_map); +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int sleepable_access_map_in_array(void *ctx) +{ + return acc_map_in_map(&outer_array_map); +} + +SEC("?kprobe/" SYS_PREFIX "sys_getpgid") +int access_map_in_htab(void *ctx) +{ + return acc_map_in_map(&outer_htab_map); +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int sleepable_access_map_in_htab(void *ctx) +{ + return acc_map_in_map(&outer_htab_map); +} From e3dd40828534a67931e0dd00fcd35846271fd4e8 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 4 Dec 2023 22:04:25 +0800 Subject: [PATCH 7/7] selftests/bpf: Test outer map update operations in syscall program Syscall program is running with rcu_read_lock_trace being held, so if bpf_map_update_elem() or bpf_map_delete_elem() invokes synchronize_rcu_tasks_trace() when operating on an outer map, there will be dead-lock, so add a test to guarantee that it is dead-lock free. Signed-off-by: Hou Tao Link: https://lore.kernel.org/r/20231204140425.1480317-8-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/syscall.c | 30 +++++- tools/testing/selftests/bpf/progs/syscall.c | 96 ++++++++++++++++++- 2 files changed, 119 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/syscall.c b/tools/testing/selftests/bpf/prog_tests/syscall.c index f4d40001155a..0be8301c0ffd 100644 --- a/tools/testing/selftests/bpf/prog_tests/syscall.c +++ b/tools/testing/selftests/bpf/prog_tests/syscall.c @@ -12,7 +12,7 @@ struct args { int btf_fd; }; -void test_syscall(void) +static void test_syscall_load_prog(void) { static char verifier_log[8192]; struct args ctx = { @@ -32,7 +32,7 @@ void test_syscall(void) if (!ASSERT_OK_PTR(skel, "skel_load")) goto cleanup; - prog_fd = bpf_program__fd(skel->progs.bpf_prog); + prog_fd = bpf_program__fd(skel->progs.load_prog); err = bpf_prog_test_run_opts(prog_fd, &tattr); ASSERT_EQ(err, 0, "err"); ASSERT_EQ(tattr.retval, 1, "retval"); @@ -53,3 +53,29 @@ cleanup: if (ctx.btf_fd > 0) close(ctx.btf_fd); } + +static void test_syscall_update_outer_map(void) +{ + LIBBPF_OPTS(bpf_test_run_opts, opts); + struct syscall *skel; + int err, prog_fd; + + skel = syscall__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_load")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.update_outer_map); + err = bpf_prog_test_run_opts(prog_fd, &opts); + ASSERT_EQ(err, 0, "err"); + ASSERT_EQ(opts.retval, 1, "retval"); +cleanup: + syscall__destroy(skel); +} + +void test_syscall(void) +{ + if (test__start_subtest("load_prog")) + test_syscall_load_prog(); + if (test__start_subtest("update_outer_map")) + test_syscall_update_outer_map(); +} diff --git a/tools/testing/selftests/bpf/progs/syscall.c b/tools/testing/selftests/bpf/progs/syscall.c index e550f728962d..3d3cafdebe72 100644 --- a/tools/testing/selftests/bpf/progs/syscall.c +++ b/tools/testing/selftests/bpf/progs/syscall.c @@ -6,9 +6,15 @@ #include #include <../../../tools/include/linux/filter.h> #include +#include +#include char _license[] SEC("license") = "GPL"; +struct bpf_map { + int id; +} __attribute__((preserve_access_index)); + struct args { __u64 log_buf; __u32 log_size; @@ -27,6 +33,37 @@ struct args { BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz), \ BTF_INT_ENC(encoding, bits_offset, bits) +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __type(key, int); + __type(value, union bpf_attr); + __uint(max_entries, 1); +} bpf_attr_array SEC(".maps"); + +struct inner_map_type { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(key_size, 4); + __uint(value_size, 4); + __uint(max_entries, 1); +} inner_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); + __type(key, int); + __type(value, int); + __uint(max_entries, 1); + __array(values, struct inner_map_type); +} outer_array_map SEC(".maps") = { + .values = { + [0] = &inner_map, + }, +}; + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + static int btf_load(void) { struct btf_blob { @@ -58,7 +95,7 @@ static int btf_load(void) } SEC("syscall") -int bpf_prog(struct args *ctx) +int load_prog(struct args *ctx) { static char license[] = "GPL"; static struct bpf_insn insns[] = { @@ -94,8 +131,8 @@ int bpf_prog(struct args *ctx) map_create_attr.max_entries = ctx->max_entries; map_create_attr.btf_fd = ret; - prog_load_attr.license = (long) license; - prog_load_attr.insns = (long) insns; + prog_load_attr.license = ptr_to_u64(license); + prog_load_attr.insns = ptr_to_u64(insns); prog_load_attr.log_buf = ctx->log_buf; prog_load_attr.log_size = ctx->log_size; prog_load_attr.log_level = 1; @@ -107,8 +144,8 @@ int bpf_prog(struct args *ctx) insns[3].imm = ret; map_update_attr.map_fd = ret; - map_update_attr.key = (long) &key; - map_update_attr.value = (long) &value; + map_update_attr.key = ptr_to_u64(&key); + map_update_attr.value = ptr_to_u64(&value); ret = bpf_sys_bpf(BPF_MAP_UPDATE_ELEM, &map_update_attr, sizeof(map_update_attr)); if (ret < 0) return ret; @@ -119,3 +156,52 @@ int bpf_prog(struct args *ctx) ctx->prog_fd = ret; return 1; } + +SEC("syscall") +int update_outer_map(void *ctx) +{ + int zero = 0, ret = 0, outer_fd = -1, inner_fd = -1, err; + const int attr_sz = sizeof(union bpf_attr); + union bpf_attr *attr; + + attr = bpf_map_lookup_elem((struct bpf_map *)&bpf_attr_array, &zero); + if (!attr) + goto out; + + memset(attr, 0, attr_sz); + attr->map_id = ((struct bpf_map *)&outer_array_map)->id; + outer_fd = bpf_sys_bpf(BPF_MAP_GET_FD_BY_ID, attr, attr_sz); + if (outer_fd < 0) + goto out; + + memset(attr, 0, attr_sz); + attr->map_type = BPF_MAP_TYPE_ARRAY; + attr->key_size = 4; + attr->value_size = 4; + attr->max_entries = 1; + inner_fd = bpf_sys_bpf(BPF_MAP_CREATE, attr, attr_sz); + if (inner_fd < 0) + goto out; + + memset(attr, 0, attr_sz); + attr->map_fd = outer_fd; + attr->key = ptr_to_u64(&zero); + attr->value = ptr_to_u64(&inner_fd); + err = bpf_sys_bpf(BPF_MAP_UPDATE_ELEM, attr, attr_sz); + if (err) + goto out; + + memset(attr, 0, attr_sz); + attr->map_fd = outer_fd; + attr->key = ptr_to_u64(&zero); + err = bpf_sys_bpf(BPF_MAP_DELETE_ELEM, attr, attr_sz); + if (err) + goto out; + ret = 1; +out: + if (inner_fd >= 0) + bpf_sys_close(inner_fd); + if (outer_fd >= 0) + bpf_sys_close(outer_fd); + return ret; +}