Merge tag 'bpf-next-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Pull bpf updates from Alexei Starovoitov:

 - Add BPF uprobe session support (Jiri Olsa)

 - Optimize uprobe performance (Andrii Nakryiko)

 - Add bpf_fastcall support to helpers and kfuncs (Eduard Zingerman)

 - Avoid calling free_htab_elem() under hash map bucket lock (Hou Tao)

 - Prevent tailcall infinite loop caused by freplace (Leon Hwang)

 - Mark raw_tracepoint arguments as nullable (Kumar Kartikeya Dwivedi)

 - Introduce uptr support in the task local storage map (Martin KaFai
   Lau)

 - Stringify errno log messages in libbpf (Mykyta Yatsenko)

 - Add kmem_cache BPF iterator for perf's lock profiling (Namhyung Kim)

 - Support BPF objects of either endianness in libbpf (Tony Ambardar)

 - Add ksym to struct_ops trampoline to fix stack trace (Xu Kuohai)

 - Introduce private stack for eligible BPF programs (Yonghong Song)

 - Migrate samples/bpf tests to selftests/bpf test_progs (Daniel T. Lee)

 - Migrate test_sock to selftests/bpf test_progs (Jordan Rife)

* tag 'bpf-next-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (152 commits)
  libbpf: Change hash_combine parameters from long to unsigned long
  selftests/bpf: Fix build error with llvm 19
  libbpf: Fix memory leak in bpf_program__attach_uprobe_multi
  bpf: use common instruction history across all states
  bpf: Add necessary migrate_disable to range_tree.
  bpf: Do not alloc arena on unsupported arches
  selftests/bpf: Set test path for token/obj_priv_implicit_token_envvar
  selftests/bpf: Add a test for arena range tree algorithm
  bpf: Introduce range_tree data structure and use it in bpf arena
  samples/bpf: Remove unused variable in xdp2skb_meta_kern.c
  samples/bpf: Remove unused variables in tc_l2_redirect_kern.c
  bpftool: Cast variable `var` to long long
  bpf, x86: Propagate tailcall info only for subprogs
  bpf: Add kernel symbol for struct_ops trampoline
  bpf: Use function pointers count as struct_ops links count
  bpf: Remove unused member rcu from bpf_struct_ops_map
  selftests/bpf: Add struct_ops prog private stack tests
  bpf: Support private stack for struct_ops progs
  selftests/bpf: Add tracing prog private stack tests
  bpf, x86: Support private stack in jit
  ...
Merged by Linus Torvalds on 2024-11-21 08:11:04 -08:00
211 changed files with 6963 additions and 3475 deletions


@@ -16,7 +16,7 @@ obj-$(CONFIG_BPF_SYSCALL) += disasm.o mprog.o
obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o memalloc.o
ifeq ($(CONFIG_MMU)$(CONFIG_64BIT),yy)
obj-$(CONFIG_BPF_SYSCALL) += arena.o
obj-$(CONFIG_BPF_SYSCALL) += arena.o range_tree.o
endif
obj-$(CONFIG_BPF_JIT) += dispatcher.o
ifeq ($(CONFIG_NET),y)
@@ -52,3 +52,4 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o
obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o
obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o


@@ -3,9 +3,11 @@
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include "linux/filter.h"
#include <linux/btf_ids.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include "range_tree.h"
/*
* bpf_arena is a sparsely populated shared memory region between bpf program and
@@ -45,7 +47,7 @@ struct bpf_arena {
u64 user_vm_start;
u64 user_vm_end;
struct vm_struct *kern_vm;
struct maple_tree mt;
struct range_tree rt;
struct list_head vma_list;
struct mutex lock;
};
@@ -98,6 +100,9 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
u64 vm_range;
int err = -ENOMEM;
if (!bpf_jit_supports_arena())
return ERR_PTR(-EOPNOTSUPP);
if (attr->key_size || attr->value_size || attr->max_entries == 0 ||
/* BPF_F_MMAPABLE must be set */
!(attr->map_flags & BPF_F_MMAPABLE) ||
@@ -132,7 +137,8 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
INIT_LIST_HEAD(&arena->vma_list);
bpf_map_init_from_attr(&arena->map, attr);
mt_init_flags(&arena->mt, MT_FLAGS_ALLOC_RANGE);
range_tree_init(&arena->rt);
range_tree_set(&arena->rt, 0, attr->max_entries);
mutex_init(&arena->lock);
return &arena->map;
@@ -183,7 +189,7 @@ static void arena_map_free(struct bpf_map *map)
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
free_vm_area(arena->kern_vm);
mtree_destroy(&arena->mt);
range_tree_destroy(&arena->rt);
bpf_map_area_free(arena);
}
@@ -274,20 +280,20 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
/* User space requested to segfault when page is not allocated by bpf prog */
return VM_FAULT_SIGSEGV;
ret = mtree_insert(&arena->mt, vmf->pgoff, MT_ENTRY, GFP_KERNEL);
ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
return VM_FAULT_SIGSEGV;
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page);
if (ret) {
mtree_erase(&arena->mt, vmf->pgoff);
range_tree_set(&arena->rt, vmf->pgoff, 1);
return VM_FAULT_SIGSEGV;
}
ret = vm_area_map_pages(arena->kern_vm, kaddr, kaddr + PAGE_SIZE, &page);
if (ret) {
mtree_erase(&arena->mt, vmf->pgoff);
range_tree_set(&arena->rt, vmf->pgoff, 1);
__free_page(page);
return VM_FAULT_SIGSEGV;
}
@@ -444,12 +450,16 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
guard(mutex)(&arena->lock);
if (uaddr)
ret = mtree_insert_range(&arena->mt, pgoff, pgoff + page_cnt - 1,
MT_ENTRY, GFP_KERNEL);
else
ret = mtree_alloc_range(&arena->mt, &pgoff, MT_ENTRY,
page_cnt, 0, page_cnt_max - 1, GFP_KERNEL);
if (uaddr) {
ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
if (ret)
goto out_free_pages;
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
} else {
ret = pgoff = range_tree_find(&arena->rt, page_cnt);
if (pgoff >= 0)
ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
}
if (ret)
goto out_free_pages;
@@ -476,7 +486,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
kvfree(pages);
return clear_lo32(arena->user_vm_start) + uaddr32;
out:
mtree_erase(&arena->mt, pgoff);
range_tree_set(&arena->rt, pgoff, page_cnt);
out_free_pages:
kvfree(pages);
return 0;
@@ -516,7 +526,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt)
pgoff = compute_pgoff(arena, uaddr);
/* clear range */
mtree_store_range(&arena->mt, pgoff, pgoff + page_cnt - 1, NULL, GFP_KERNEL);
range_tree_set(&arena->rt, pgoff, page_cnt);
if (page_cnt > 1)
/* bulk zap if multiple pages being freed */


@@ -947,22 +947,44 @@ static void *prog_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
struct bpf_prog *prog = bpf_prog_get(fd);
bool is_extended;
if (IS_ERR(prog))
return prog;
if (!bpf_prog_map_compatible(map, prog)) {
if (prog->type == BPF_PROG_TYPE_EXT ||
!bpf_prog_map_compatible(map, prog)) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
mutex_lock(&prog->aux->ext_mutex);
is_extended = prog->aux->is_extended;
if (!is_extended)
prog->aux->prog_array_member_cnt++;
mutex_unlock(&prog->aux->ext_mutex);
if (is_extended) {
/* Extended prog can not be tail callee. It's to prevent a
* potential infinite loop like:
* tail callee prog entry -> tail callee prog subprog ->
* freplace prog entry --tailcall-> tail callee prog entry.
*/
bpf_prog_put(prog);
return ERR_PTR(-EBUSY);
}
return prog;
}
static void prog_fd_array_put_ptr(struct bpf_map *map, void *ptr, bool need_defer)
{
struct bpf_prog *prog = ptr;
mutex_lock(&prog->aux->ext_mutex);
prog->aux->prog_array_member_cnt--;
mutex_unlock(&prog->aux->ext_mutex);
/* bpf_prog is freed after one RCU or tasks trace grace period */
bpf_prog_put(ptr);
bpf_prog_put(prog);
}
static u32 prog_fd_array_sys_lookup_elem(void *ptr)


@@ -107,7 +107,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
bpf_cgrp_storage_lock();
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
value, map_flags, GFP_ATOMIC);
value, map_flags, false, GFP_ATOMIC);
bpf_cgrp_storage_unlock();
cgroup_put(cgroup);
return PTR_ERR_OR_ZERO(sdata);
@@ -181,7 +181,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
value, BPF_NOEXIST, gfp_flags);
value, BPF_NOEXIST, false, gfp_flags);
unlock:
bpf_cgrp_storage_unlock();


@@ -99,7 +99,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(file_inode(fd_file(f)),
(struct bpf_local_storage_map *)map,
value, map_flags, GFP_ATOMIC);
value, map_flags, false, GFP_ATOMIC);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -153,7 +153,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
BPF_NOEXIST, gfp_flags);
BPF_NOEXIST, false, gfp_flags);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}


@@ -73,7 +73,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
void *value, bool charge_mem, gfp_t gfp_flags)
void *value, bool charge_mem, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_elem *selem;
@@ -99,9 +99,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
}
if (selem) {
if (value)
if (value) {
/* No need to call check_and_init_map_value as memory is zero init */
copy_map_value(&smap->map, SDATA(selem)->data, value);
/* No need to call check_and_init_map_value as memory is zero init */
if (swap_uptrs)
bpf_obj_swap_uptrs(smap->map.record, SDATA(selem)->data, value);
}
return selem;
}
@@ -209,8 +212,12 @@ static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage_map *smap;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
/* The bpf_local_storage_map_free will wait for rcu_barrier */
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
bpf_mem_cache_raw_free(selem);
}
@@ -226,16 +233,25 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
struct bpf_local_storage_map *smap,
bool reuse_now)
{
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
if (!smap->bpf_ma) {
/* Only task storage has uptrs and task storage
* has moved to bpf_mem_alloc. Meaning smap->bpf_ma == true
* for task storage, so this bpf_obj_free_fields() won't unpin
* any uptr.
*/
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
__bpf_selem_free(selem, reuse_now);
return;
}
if (!reuse_now) {
call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
} else {
if (reuse_now) {
/* reuse_now == true only happens when the storage owner
* (e.g. task_struct) is being destructed or the map itself
* is being destructed (ie map_free). In both cases,
* no bpf prog can have a hold on the selem. It is
* safe to unpin the uptrs and free the selem now.
*/
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
/* Instead of using the vanilla call_rcu(),
* bpf_mem_cache_free will be able to reuse selem
* immediately.
@@ -243,6 +259,26 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
migrate_disable();
bpf_mem_cache_free(&smap->selem_ma, selem);
migrate_enable();
return;
}
call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
}
static void bpf_selem_free_list(struct hlist_head *list, bool reuse_now)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage_map *smap;
struct hlist_node *n;
/* The "_safe" iteration is needed.
* The loop is not removing the selem from the list
* but bpf_selem_free will use the selem->rcu_head
* which is union-ized with the selem->free_node.
*/
hlist_for_each_entry_safe(selem, n, list, free_node) {
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
bpf_selem_free(selem, smap, reuse_now);
}
}
@@ -252,7 +288,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
*/
static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem,
bool uncharge_mem, bool reuse_now)
bool uncharge_mem, struct hlist_head *free_selem_list)
{
struct bpf_local_storage_map *smap;
bool free_local_storage;
@@ -296,7 +332,7 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
bpf_selem_free(selem, smap, reuse_now);
hlist_add_head(&selem->free_node, free_selem_list);
if (rcu_access_pointer(local_storage->smap) == smap)
RCU_INIT_POINTER(local_storage->smap, NULL);
@@ -345,6 +381,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage;
bool bpf_ma, free_local_storage = false;
HLIST_HEAD(selem_free_list);
unsigned long flags;
if (unlikely(!selem_linked_to_storage_lockless(selem)))
@@ -360,9 +397,11 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, true, reuse_now);
local_storage, selem, true, &selem_free_list);
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_selem_free_list(&selem_free_list, reuse_now);
if (free_local_storage)
bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
}
@@ -524,11 +563,12 @@ uncharge:
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
void *value, u64 map_flags, gfp_t gfp_flags)
void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
{
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
struct bpf_local_storage *local_storage;
HLIST_HEAD(old_selem_free_list);
unsigned long flags;
int err;
@@ -550,7 +590,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
if (!selem)
return ERR_PTR(-ENOMEM);
@@ -584,7 +624,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, gfp_flags);
alloc_selem = selem = bpf_selem_alloc(smap, owner, value, true, swap_uptrs, gfp_flags);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
@@ -624,11 +664,12 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (old_sdata) {
bpf_selem_unlink_map(SELEM(old_sdata));
bpf_selem_unlink_storage_nolock(local_storage, SELEM(old_sdata),
true, false);
true, &old_selem_free_list);
}
unlock:
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_selem_free_list(&old_selem_free_list, false);
if (alloc_selem) {
mem_uncharge(smap, owner, smap->elem_size);
bpf_selem_free(alloc_selem, smap, true);
@@ -706,6 +747,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem;
bool bpf_ma, free_storage = false;
HLIST_HEAD(free_selem_list);
struct hlist_node *n;
unsigned long flags;
@@ -734,10 +776,12 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
* of the loop will set the free_cgroup_storage to true.
*/
free_storage = bpf_selem_unlink_storage_nolock(
local_storage, selem, true, true);
local_storage, selem, true, &free_selem_list);
}
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
bpf_selem_free_list(&free_selem_list, true);
if (free_storage)
bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
}
@@ -883,6 +927,9 @@ void bpf_local_storage_map_free(struct bpf_map *map,
synchronize_rcu();
if (smap->bpf_ma) {
rcu_barrier_tasks_trace();
if (!rcu_trace_implies_rcu_gp())
rcu_barrier();
bpf_mem_alloc_destroy(&smap->selem_ma);
bpf_mem_alloc_destroy(&smap->storage_ma);
}


@@ -23,7 +23,6 @@ struct bpf_struct_ops_value {
struct bpf_struct_ops_map {
struct bpf_map map;
struct rcu_head rcu;
const struct bpf_struct_ops_desc *st_ops_desc;
/* protect map_update */
struct mutex lock;
@@ -32,7 +31,9 @@ struct bpf_struct_ops_map {
* (in kvalue.data).
*/
struct bpf_link **links;
u32 links_cnt;
/* ksyms for bpf trampolines */
struct bpf_ksym **ksyms;
u32 funcs_cnt;
u32 image_pages_cnt;
/* image_pages is an array of pages that has all the trampolines
* that stores the func args before calling the bpf_prog.
@@ -481,11 +482,11 @@ static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
{
u32 i;
for (i = 0; i < st_map->links_cnt; i++) {
if (st_map->links[i]) {
bpf_link_put(st_map->links[i]);
st_map->links[i] = NULL;
}
for (i = 0; i < st_map->funcs_cnt; i++) {
if (!st_map->links[i])
break;
bpf_link_put(st_map->links[i]);
st_map->links[i] = NULL;
}
}
@@ -586,6 +587,49 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
return 0;
}
static void bpf_struct_ops_ksym_init(const char *tname, const char *mname,
void *image, unsigned int size,
struct bpf_ksym *ksym)
{
snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname);
INIT_LIST_HEAD_RCU(&ksym->lnode);
bpf_image_ksym_init(image, size, ksym);
}
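/* Example (illustrative, names not from this hunk): for a struct_ops map
 * implementing tcp_congestion_ops, the trampoline generated for the
 * .cong_avoid member gets the name "bpf__tcp_congestion_ops_cong_avoid".
 * Once bpf_struct_ops_map_add_ksyms() below registers it, the symbol is
 * visible in /proc/kallsyms, so stack traces through the trampoline
 * resolve to a readable name instead of a raw address.
 */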
static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map)
{
u32 i;
for (i = 0; i < st_map->funcs_cnt; i++) {
if (!st_map->ksyms[i])
break;
bpf_image_ksym_add(st_map->ksyms[i]);
}
}
static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map)
{
u32 i;
for (i = 0; i < st_map->funcs_cnt; i++) {
if (!st_map->ksyms[i])
break;
bpf_image_ksym_del(st_map->ksyms[i]);
}
}
static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map)
{
u32 i;
for (i = 0; i < st_map->funcs_cnt; i++) {
if (!st_map->ksyms[i])
break;
kfree(st_map->ksyms[i]);
st_map->ksyms[i] = NULL;
}
}
static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
@@ -601,6 +645,9 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
int prog_fd, err;
u32 i, trampoline_start, image_off = 0;
void *cur_image = NULL, *image = NULL;
struct bpf_link **plink;
struct bpf_ksym **pksym;
const char *tname, *mname;
if (flags)
return -EINVAL;
@@ -639,14 +686,19 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
plink = st_map->links;
pksym = st_map->ksyms;
tname = btf_name_by_offset(st_map->btf, t->name_off);
module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]);
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
struct bpf_tramp_link *link;
struct bpf_ksym *ksym;
u32 moff;
moff = __btf_member_bit_offset(t, member) / 8;
mname = btf_name_by_offset(st_map->btf, member->name_off);
ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
@@ -714,7 +766,14 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
}
bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS,
&bpf_struct_ops_link_lops, prog);
st_map->links[i] = &link->link;
*plink++ = &link->link;
ksym = kzalloc(sizeof(*ksym), GFP_USER);
if (!ksym) {
err = -ENOMEM;
goto reset_unlock;
}
*pksym++ = ksym;
trampoline_start = image_off;
err = bpf_struct_ops_prepare_trampoline(tlinks, link,
@@ -735,6 +794,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
/* put prog_id to udata */
*(unsigned long *)(udata + moff) = prog->aux->id;
/* init ksym for this trampoline */
bpf_struct_ops_ksym_init(tname, mname,
image + trampoline_start,
image_off - trampoline_start,
ksym);
}
if (st_ops->validate) {
@@ -783,6 +848,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
*/
reset_unlock:
bpf_struct_ops_map_free_ksyms(st_map);
bpf_struct_ops_map_free_image(st_map);
bpf_struct_ops_map_put_progs(st_map);
memset(uvalue, 0, map->value_size);
@@ -790,6 +856,8 @@ reset_unlock:
unlock:
kfree(tlinks);
mutex_unlock(&st_map->lock);
if (!err)
bpf_struct_ops_map_add_ksyms(st_map);
return err;
}
@@ -849,7 +917,10 @@ static void __bpf_struct_ops_map_free(struct bpf_map *map)
if (st_map->links)
bpf_struct_ops_map_put_progs(st_map);
if (st_map->ksyms)
bpf_struct_ops_map_free_ksyms(st_map);
bpf_map_area_free(st_map->links);
bpf_map_area_free(st_map->ksyms);
bpf_struct_ops_map_free_image(st_map);
bpf_map_area_free(st_map->uvalue);
bpf_map_area_free(st_map);
@@ -866,6 +937,8 @@ static void bpf_struct_ops_map_free(struct bpf_map *map)
if (btf_is_module(st_map->btf))
module_put(st_map->st_ops_desc->st_ops->owner);
bpf_struct_ops_map_del_ksyms(st_map);
/* The struct_ops's function may switch to another struct_ops.
*
* For example, bpf_tcp_cc_x->init() may switch to
@@ -895,6 +968,19 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
return 0;
}
static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t)
{
int i;
u32 count;
const struct btf_member *member;
count = 0;
for_each_member(i, t, member)
if (btf_type_resolve_func_ptr(btf, member->type, NULL))
count++;
return count;
}
static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
{
const struct bpf_struct_ops_desc *st_ops_desc;
@@ -961,11 +1047,15 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
map = &st_map->map;
st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE);
st_map->links_cnt = btf_type_vlen(t);
st_map->funcs_cnt = count_func_ptrs(btf, t);
st_map->links =
bpf_map_area_alloc(st_map->links_cnt * sizeof(struct bpf_links *),
bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *),
NUMA_NO_NODE);
if (!st_map->uvalue || !st_map->links) {
st_map->ksyms =
bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *),
NUMA_NO_NODE);
if (!st_map->uvalue || !st_map->links || !st_map->ksyms) {
ret = -ENOMEM;
goto errout_free;
}
@@ -994,7 +1084,8 @@ static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
usage = sizeof(*st_map) +
vt->size - sizeof(struct bpf_struct_ops_value);
usage += vt->size;
usage += btf_type_vlen(vt) * sizeof(struct bpf_links *);
usage += st_map->funcs_cnt * sizeof(struct bpf_link *);
usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *);
usage += PAGE_SIZE;
return usage;
}


@@ -128,6 +128,9 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
struct pid *pid;
int fd, err;
if ((map_flags & BPF_F_LOCK) && btf_record_has_field(map->record, BPF_UPTR))
return -EOPNOTSUPP;
fd = *(int *)key;
pid = pidfd_get_pid(fd, &f_flags);
if (IS_ERR(pid))
@@ -146,7 +149,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
bpf_task_storage_lock();
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags,
GFP_ATOMIC);
true, GFP_ATOMIC);
bpf_task_storage_unlock();
err = PTR_ERR_OR_ZERO(sdata);
@@ -218,7 +221,7 @@ static void *__bpf_task_storage_get(struct bpf_map *map,
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE) && nobusy) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
BPF_NOEXIST, gfp_flags);
BPF_NOEXIST, false, gfp_flags);
return IS_ERR(sdata) ? NULL : sdata->data;
}


@@ -2808,7 +2808,7 @@ static void btf_ref_type_log(struct btf_verifier_env *env,
btf_verifier_log(env, "type_id=%u", t->type);
}
static struct btf_kind_operations modifier_ops = {
static const struct btf_kind_operations modifier_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_modifier_resolve,
.check_member = btf_modifier_check_member,
@@ -2817,7 +2817,7 @@ static struct btf_kind_operations modifier_ops = {
.show = btf_modifier_show,
};
static struct btf_kind_operations ptr_ops = {
static const struct btf_kind_operations ptr_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_ptr_resolve,
.check_member = btf_ptr_check_member,
@@ -2858,7 +2858,7 @@ static void btf_fwd_type_log(struct btf_verifier_env *env,
btf_verifier_log(env, "%s", btf_type_kflag(t) ? "union" : "struct");
}
static struct btf_kind_operations fwd_ops = {
static const struct btf_kind_operations fwd_ops = {
.check_meta = btf_fwd_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_df_check_member,
@@ -3109,7 +3109,7 @@ static void btf_array_show(const struct btf *btf, const struct btf_type *t,
__btf_array_show(btf, t, type_id, data, bits_offset, show);
}
static struct btf_kind_operations array_ops = {
static const struct btf_kind_operations array_ops = {
.check_meta = btf_array_check_meta,
.resolve = btf_array_resolve,
.check_member = btf_array_check_member,
@@ -3334,7 +3334,7 @@ static int btf_find_struct(const struct btf *btf, const struct btf_type *t,
}
static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
u32 off, int sz, struct btf_field_info *info)
u32 off, int sz, struct btf_field_info *info, u32 field_mask)
{
enum btf_field_type type;
u32 res_id;
@@ -3358,9 +3358,14 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t,
type = BPF_KPTR_REF;
else if (!strcmp("percpu_kptr", __btf_name_by_offset(btf, t->name_off)))
type = BPF_KPTR_PERCPU;
else if (!strcmp("uptr", __btf_name_by_offset(btf, t->name_off)))
type = BPF_UPTR;
else
return -EINVAL;
if (!(type & field_mask))
return BTF_FIELD_IGNORE;
/* Get the base type */
t = btf_type_skip_modifiers(btf, t->type, &res_id);
/* Only pointer to struct is allowed */
@@ -3502,7 +3507,7 @@ static int btf_get_field_type(const struct btf *btf, const struct btf_type *var_
field_mask_test_name(BPF_REFCOUNT, "bpf_refcount");
/* Only return BPF_KPTR when all other types with matchable names fail */
if (field_mask & BPF_KPTR && !__btf_type_is_struct(var_type)) {
if (field_mask & (BPF_KPTR | BPF_UPTR) && !__btf_type_is_struct(var_type)) {
type = BPF_KPTR_REF;
goto end;
}
@@ -3535,6 +3540,7 @@ static int btf_repeat_fields(struct btf_field_info *info, int info_cnt,
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
case BPF_UPTR:
case BPF_LIST_HEAD:
case BPF_RB_ROOT:
break;
@@ -3667,8 +3673,9 @@ static int btf_find_field_one(const struct btf *btf,
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
case BPF_UPTR:
ret = btf_find_kptr(btf, var_type, off, sz,
info_cnt ? &info[0] : &tmp);
info_cnt ? &info[0] : &tmp, field_mask);
if (ret < 0)
return ret;
break;
@@ -3991,6 +3998,7 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
case BPF_UPTR:
ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]);
if (ret < 0)
goto end;
@@ -4050,12 +4058,28 @@ int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec)
* Hence we only need to ensure that bpf_{list_head,rb_root} ownership
* does not form cycles.
*/
if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & BPF_GRAPH_ROOT))
if (IS_ERR_OR_NULL(rec) || !(rec->field_mask & (BPF_GRAPH_ROOT | BPF_UPTR)))
return 0;
for (i = 0; i < rec->cnt; i++) {
struct btf_struct_meta *meta;
const struct btf_type *t;
u32 btf_id;
if (rec->fields[i].type == BPF_UPTR) {
/* The uptr only supports pinning one page and cannot
* point to a kernel struct
*/
if (btf_is_kernel(rec->fields[i].kptr.btf))
return -EINVAL;
t = btf_type_by_id(rec->fields[i].kptr.btf,
rec->fields[i].kptr.btf_id);
if (!t->size)
return -EINVAL;
if (t->size > PAGE_SIZE)
return -E2BIG;
continue;
}
if (!(rec->fields[i].type & BPF_GRAPH_ROOT))
continue;
btf_id = rec->fields[i].graph_root.value_btf_id;
@@ -4191,7 +4215,7 @@ static void btf_struct_show(const struct btf *btf, const struct btf_type *t,
__btf_struct_show(btf, t, type_id, data, bits_offset, show);
}
static struct btf_kind_operations struct_ops = {
static const struct btf_kind_operations struct_ops = {
.check_meta = btf_struct_check_meta,
.resolve = btf_struct_resolve,
.check_member = btf_struct_check_member,
@@ -4359,7 +4383,7 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
btf_show_end_type(show);
}
static struct btf_kind_operations enum_ops = {
static const struct btf_kind_operations enum_ops = {
.check_meta = btf_enum_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
@@ -4462,7 +4486,7 @@ static void btf_enum64_show(const struct btf *btf, const struct btf_type *t,
btf_show_end_type(show);
}
static struct btf_kind_operations enum64_ops = {
static const struct btf_kind_operations enum64_ops = {
.check_meta = btf_enum64_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
@@ -4540,7 +4564,7 @@ done:
btf_verifier_log(env, ")");
}
static struct btf_kind_operations func_proto_ops = {
static const struct btf_kind_operations func_proto_ops = {
.check_meta = btf_func_proto_check_meta,
.resolve = btf_df_resolve,
/*
@@ -4598,7 +4622,7 @@ static int btf_func_resolve(struct btf_verifier_env *env,
return 0;
}
static struct btf_kind_operations func_ops = {
static const struct btf_kind_operations func_ops = {
.check_meta = btf_func_check_meta,
.resolve = btf_func_resolve,
.check_member = btf_df_check_member,
@@ -5566,7 +5590,7 @@ btf_parse_struct_metas(struct bpf_verifier_log *log, struct btf *btf)
goto free_aof;
}
ret = btf_find_kptr(btf, t, 0, 0, &tmp);
ret = btf_find_kptr(btf, t, 0, 0, &tmp, BPF_KPTR);
if (ret != BTF_FIELD_FOUND)
continue;
@@ -6564,7 +6588,10 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (prog_args_trusted(prog))
info->reg_type |= PTR_TRUSTED;
if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
/* Raw tracepoint arguments always get marked as maybe NULL */
if (bpf_prog_is_raw_tp(prog))
info->reg_type |= PTR_MAYBE_NULL;
else if (btf_param_match_suffix(btf, &args[arg], "__nullable"))
info->reg_type |= PTR_MAYBE_NULL;
if (tgt_prog) {


@@ -131,6 +131,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode);
#endif
mutex_init(&fp->aux->used_maps_mutex);
mutex_init(&fp->aux->ext_mutex);
mutex_init(&fp->aux->dst_mutex);
return fp;
@@ -3044,6 +3045,11 @@ bool __weak bpf_jit_supports_exceptions(void)
return false;
}
bool __weak bpf_jit_supports_private_stack(void)
{
return false;
}
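/* A JIT opts in to private stacks by overriding this weak helper. A
 * minimal sketch, modeled on the x86-64 JIT change in this pull (the
 * actual arch code may carry extra checks):
 *
 *	bool bpf_jit_supports_private_stack(void)
 *	{
 *		return true;
 *	}
 *
 * With this in place the verifier may let eligible programs run on a
 * separately allocated per-CPU stack instead of consuming kernel stack.
 */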
void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie)
{
}


@@ -154,7 +154,8 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
d->image = NULL;
goto out;
}
bpf_image_ksym_add(d->image, PAGE_SIZE, &d->ksym);
bpf_image_ksym_init(d->image, PAGE_SIZE, &d->ksym);
bpf_image_ksym_add(&d->ksym);
}
prev_num_progs = d->num_progs;


@@ -896,9 +896,12 @@ find_first_elem:
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
check_and_free_fields(htab, l);
migrate_disable();
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
bpf_mem_cache_free(&htab->ma, l);
migrate_enable();
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -948,7 +951,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
if (htab_is_prealloc(htab)) {
bpf_map_dec_elem_count(&htab->map);
check_and_free_fields(htab, l);
__pcpu_freelist_push(&htab->freelist, &l->fnode);
pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
dec_elem_count(htab);
htab_elem_free(htab, l);
@@ -1018,7 +1021,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
*/
pl_new = this_cpu_ptr(htab->extra_elems);
l_new = *pl_new;
htab_put_fd_value(htab, old_elem);
*pl_new = old_elem;
} else {
struct pcpu_freelist_node *l;
@@ -1105,6 +1107,7 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
struct htab_elem *l_new = NULL, *l_old;
struct hlist_nulls_head *head;
unsigned long flags;
void *old_map_ptr;
struct bucket *b;
u32 key_size, hash;
int ret;
@@ -1183,12 +1186,27 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hlist_nulls_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
hlist_nulls_del_rcu(&l_old->hash_node);
/* l_old has already been stashed in htab->extra_elems, free
* its special fields before it is available for reuse. Also
* save the old map pointer in htab of maps before unlock
* and release it after unlock.
*/
old_map_ptr = NULL;
if (htab_is_prealloc(htab)) {
if (map->ops->map_fd_put_ptr)
old_map_ptr = fd_htab_map_get_ptr(map, l_old);
check_and_free_fields(htab, l_old);
}
}
htab_unlock_bucket(htab, b, hash, flags);
if (l_old) {
if (old_map_ptr)
map->ops->map_fd_put_ptr(map, old_map_ptr, true);
if (!htab_is_prealloc(htab))
free_htab_elem(htab, l_old);
else
check_and_free_fields(htab, l_old);
}
ret = 0;
return 0;
err:
htab_unlock_bucket(htab, b, hash, flags);
return ret;
@@ -1432,15 +1450,15 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
return ret;
l = lookup_elem_raw(head, hash, key, key_size);
if (l) {
if (l)
hlist_nulls_del_rcu(&l->hash_node);
free_htab_elem(htab, l);
} else {
else
ret = -ENOENT;
}
htab_unlock_bucket(htab, b, hash, flags);
if (l)
free_htab_elem(htab, l);
return ret;
}
@@ -1853,13 +1871,14 @@ again_nocopy:
* may cause deadlock. See comments in function
* prealloc_lru_pop(). Let us do bpf_lru_push_free()
* after releasing the bucket lock.
*
* For htab of maps, htab_put_fd_value() in
* free_htab_elem() may acquire a spinlock with bucket
* lock being held and it violates the lock rule, so
* invoke free_htab_elem() after unlock as well.
*/
if (is_lru_map) {
l->batch_flink = node_to_free;
node_to_free = l;
} else {
free_htab_elem(htab, l);
}
l->batch_flink = node_to_free;
node_to_free = l;
}
dst_key += key_size;
dst_val += value_size;
@@ -1871,7 +1890,10 @@ again_nocopy:
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
htab_lru_push_free(htab, l);
if (is_lru_map)
htab_lru_push_free(htab, l);
else
free_htab_elem(htab, l);
}
next_batch:


@@ -2521,6 +2521,25 @@ __bpf_kfunc struct task_struct *bpf_task_from_pid(s32 pid)
return p;
}
/**
* bpf_task_from_vpid - Find a struct task_struct from its vpid by looking it up
* in the pid namespace of the current task. If a task is returned, it must
* either be stored in a map, or released with bpf_task_release().
* @vpid: The vpid of the task being looked up.
*/
__bpf_kfunc struct task_struct *bpf_task_from_vpid(s32 vpid)
{
struct task_struct *p;
rcu_read_lock();
p = find_task_by_vpid(vpid);
if (p)
p = bpf_task_acquire(p);
rcu_read_unlock();
return p;
}
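/* Illustrative BPF-side usage (a sketch, not taken from this patch set):
 * the kfunc is registered below with KF_ACQUIRE | KF_RET_NULL, so callers
 * must NULL-check the result and drop the acquired reference:
 *
 *	struct task_struct *task;
 *
 *	task = bpf_task_from_vpid(vpid);
 *	if (task) {
 *		bpf_printk("init-ns tgid: %d", task->tgid);
 *		bpf_task_release(task);
 *	}
 */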
/**
* bpf_dynptr_slice() - Obtain a read-only pointer to the dynptr data.
* @p: The dynptr whose data slice to retrieve
@@ -3068,7 +3087,9 @@ BTF_ID_FLAGS(func, bpf_task_under_cgroup, KF_RCU)
BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
#endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_throw)
BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -3086,8 +3107,8 @@ BTF_ID(func, bpf_cgroup_release_dtor)
#endif
BTF_KFUNCS_START(common_btf_ids)
BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx)
BTF_ID_FLAGS(func, bpf_rdonly_cast)
BTF_ID_FLAGS(func, bpf_cast_to_kern_ctx, KF_FASTCALL)
BTF_ID_FLAGS(func, bpf_rdonly_cast, KF_FASTCALL)
BTF_ID_FLAGS(func, bpf_rcu_read_lock)
BTF_ID_FLAGS(func, bpf_rcu_read_unlock)
BTF_ID_FLAGS(func, bpf_dynptr_slice, KF_RET_NULL)
@@ -3124,6 +3145,10 @@ BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW)
BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY)
BTF_ID_FLAGS(func, bpf_copy_from_user_str, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_get_kmem_cache)
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {


@@ -0,0 +1,238 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2024 Google */
#include <linux/bpf.h>
#include <linux/btf_ids.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/seq_file.h>
#include "../../mm/slab.h" /* kmem_cache, slab_caches and slab_mutex */
/* open-coded version */
struct bpf_iter_kmem_cache {
__u64 __opaque[1];
} __attribute__((aligned(8)));
struct bpf_iter_kmem_cache_kern {
struct kmem_cache *pos;
} __attribute__((aligned(8)));
#define KMEM_CACHE_POS_START ((void *)1L)
__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_iter_kmem_cache_new(struct bpf_iter_kmem_cache *it)
{
struct bpf_iter_kmem_cache_kern *kit = (void *)it;
BUILD_BUG_ON(sizeof(*kit) > sizeof(*it));
BUILD_BUG_ON(__alignof__(*kit) != __alignof__(*it));
kit->pos = KMEM_CACHE_POS_START;
return 0;
}
__bpf_kfunc struct kmem_cache *bpf_iter_kmem_cache_next(struct bpf_iter_kmem_cache *it)
{
struct bpf_iter_kmem_cache_kern *kit = (void *)it;
struct kmem_cache *prev = kit->pos;
struct kmem_cache *next;
bool destroy = false;
if (!prev)
return NULL;
mutex_lock(&slab_mutex);
if (list_empty(&slab_caches)) {
mutex_unlock(&slab_mutex);
return NULL;
}
if (prev == KMEM_CACHE_POS_START)
next = list_first_entry(&slab_caches, struct kmem_cache, list);
else if (list_last_entry(&slab_caches, struct kmem_cache, list) == prev)
next = NULL;
else
next = list_next_entry(prev, list);
/* boot_caches have negative refcount, don't touch them */
if (next && next->refcount > 0)
next->refcount++;
/* Skip kmem_cache_destroy() for active entries */
if (prev && prev != KMEM_CACHE_POS_START) {
if (prev->refcount > 1)
prev->refcount--;
else if (prev->refcount == 1)
destroy = true;
}
mutex_unlock(&slab_mutex);
if (destroy)
kmem_cache_destroy(prev);
kit->pos = next;
return next;
}
__bpf_kfunc void bpf_iter_kmem_cache_destroy(struct bpf_iter_kmem_cache *it)
{
struct bpf_iter_kmem_cache_kern *kit = (void *)it;
struct kmem_cache *s = kit->pos;
bool destroy = false;
if (s == NULL || s == KMEM_CACHE_POS_START)
return;
mutex_lock(&slab_mutex);
/* Skip kmem_cache_destroy() for active entries */
if (s->refcount > 1)
s->refcount--;
else if (s->refcount == 1)
destroy = true;
mutex_unlock(&slab_mutex);
if (destroy)
kmem_cache_destroy(s);
}
__bpf_kfunc_end_defs();
struct bpf_iter__kmem_cache {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct kmem_cache *, s);
};
union kmem_cache_iter_priv {
struct bpf_iter_kmem_cache it;
struct bpf_iter_kmem_cache_kern kit;
};
static void *kmem_cache_iter_seq_start(struct seq_file *seq, loff_t *pos)
{
loff_t cnt = 0;
bool found = false;
struct kmem_cache *s;
union kmem_cache_iter_priv *p = seq->private;
mutex_lock(&slab_mutex);
/* Find an entry at the given position in the slab_caches list instead
* of keeping a reference (of the last visited entry, if any) out of
* slab_mutex. It might miss something if one is deleted in the middle
* while it releases the lock. But it should be rare and there's not
* much we can do about it.
*/
list_for_each_entry(s, &slab_caches, list) {
if (cnt == *pos) {
/* Make sure this entry remains in the list by getting
* a new reference count. Note that boot_cache entries
* have a negative refcount, so don't touch them.
*/
if (s->refcount > 0)
s->refcount++;
found = true;
break;
}
cnt++;
}
mutex_unlock(&slab_mutex);
if (!found)
s = NULL;
p->kit.pos = s;
return s;
}
static void kmem_cache_iter_seq_stop(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_iter__kmem_cache ctx = {
.meta = &meta,
.s = v,
};
union kmem_cache_iter_priv *p = seq->private;
struct bpf_prog *prog;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, true);
if (prog && !ctx.s)
bpf_iter_run_prog(prog, &ctx);
bpf_iter_kmem_cache_destroy(&p->it);
}
static void *kmem_cache_iter_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
union kmem_cache_iter_priv *p = seq->private;
++*pos;
return bpf_iter_kmem_cache_next(&p->it);
}
static int kmem_cache_iter_seq_show(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_iter__kmem_cache ctx = {
.meta = &meta,
.s = v,
};
struct bpf_prog *prog;
int ret = 0;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
if (prog)
ret = bpf_iter_run_prog(prog, &ctx);
return ret;
}
static const struct seq_operations kmem_cache_iter_seq_ops = {
.start = kmem_cache_iter_seq_start,
.next = kmem_cache_iter_seq_next,
.stop = kmem_cache_iter_seq_stop,
.show = kmem_cache_iter_seq_show,
};
BTF_ID_LIST_GLOBAL_SINGLE(bpf_kmem_cache_btf_id, struct, kmem_cache)
static const struct bpf_iter_seq_info kmem_cache_iter_seq_info = {
.seq_ops = &kmem_cache_iter_seq_ops,
.seq_priv_size = sizeof(union kmem_cache_iter_priv),
};
static void bpf_iter_kmem_cache_show_fdinfo(const struct bpf_iter_aux_info *aux,
struct seq_file *seq)
{
seq_puts(seq, "kmem_cache iter\n");
}
DEFINE_BPF_ITER_FUNC(kmem_cache, struct bpf_iter_meta *meta,
struct kmem_cache *s)
static struct bpf_iter_reg bpf_kmem_cache_reg_info = {
.target = "kmem_cache",
.feature = BPF_ITER_RESCHED,
.show_fdinfo = bpf_iter_kmem_cache_show_fdinfo,
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__kmem_cache, s),
PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
},
.seq_info = &kmem_cache_iter_seq_info,
};
static int __init bpf_kmem_cache_iter_init(void)
{
bpf_kmem_cache_reg_info.ctx_arg_info[0].btf_id = bpf_kmem_cache_btf_id[0];
return bpf_iter_reg_target(&bpf_kmem_cache_reg_info);
}
late_initcall(bpf_kmem_cache_iter_init);
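
The file above registers both an open-coded iterator (the three kfuncs) and a
seq_file-based "kmem_cache" iterator target. A minimal sketch of a BPF program
driving the seq_file flavour, modeled on how iterator programs are typically
written (the kmem_cache field accesses assume CO-RE and are not part of this
diff):

	SEC("iter/kmem_cache")
	int dump_kmem_cache(struct bpf_iter__kmem_cache *ctx)
	{
		struct seq_file *seq = ctx->meta->seq;
		struct kmem_cache *s = ctx->s;

		/* s is NULL on the final invocation from seq_stop() */
		if (s)
			BPF_SEQ_PRINTF(seq, "%s objsize=%u\n", s->name, s->size);
		return 0;
	}

Per the pull message, perf's lock profiling is the intended consumer of this
iterator for enumerating slab caches.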


@@ -254,11 +254,8 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
static void free_one(void *obj, bool percpu)
{
if (percpu) {
if (percpu)
free_percpu(((void __percpu **)obj)[1]);
kfree(obj);
return;
}
kfree(obj);
}

kernel/bpf/range_tree.c (new file, 272 lines)

@@ -0,0 +1,272 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#include <linux/interval_tree_generic.h>
#include <linux/slab.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/bpf.h>
#include "range_tree.h"
/*
* struct range_tree is a data structure used to allocate contiguous memory
* ranges in bpf arena. It's a large bitmap. The contiguous sequence of bits is
* represented by struct range_node or 'rn' for short.
* rn->rn_rbnode links it into an interval tree while
* rn->rb_range_size links it into a second rbtree sorted by size of the range.
* __find_range() performs binary search and best fit algorithm to find the
* range less or equal requested size.
* range_tree_clear/set() clears or sets a range of bits in this bitmap. The
* adjacent ranges are merged or split at the same time.
*
* The split/merge logic is based/borrowed from XFS's xbitmap32 added
* in commit 6772fcc8890a ("xfs: convert xbitmap to interval tree").
*
* The implementation relies on external lock to protect rbtree-s.
* The alloc/free of range_node-s is done via bpf_mem_alloc.
*
* bpf arena is using range_tree to represent unallocated slots.
* At init time:
* range_tree_set(rt, 0, max);
* Then:
* start = range_tree_find(rt, len);
* if (start >= 0)
* range_tree_clear(rt, start, len);
* to find free range and mark slots as allocated and later:
* range_tree_set(rt, start, len);
* to mark as unallocated after use.
*/
struct range_node {
struct rb_node rn_rbnode;
struct rb_node rb_range_size;
u32 rn_start;
u32 rn_last; /* inclusive */
u32 __rn_subtree_last;
};
static struct range_node *rb_to_range_node(struct rb_node *rb)
{
return rb_entry(rb, struct range_node, rb_range_size);
}
static u32 rn_size(struct range_node *rn)
{
return rn->rn_last - rn->rn_start + 1;
}
/* Find range that fits best to requested size */
static inline struct range_node *__find_range(struct range_tree *rt, u32 len)
{
struct rb_node *rb = rt->range_size_root.rb_root.rb_node;
struct range_node *best = NULL;
while (rb) {
struct range_node *rn = rb_to_range_node(rb);
if (len <= rn_size(rn)) {
best = rn;
rb = rb->rb_right;
} else {
rb = rb->rb_left;
}
}
return best;
}
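/* Worked example (illustrative): __range_size_insert() below keeps larger
 * ranges to the left, so this walk returns the smallest range whose size
 * is still >= len. With free ranges of sizes {8, 4, 2} and len == 3, any
 * node that is big enough is remembered in 'best' and the search keeps
 * moving right toward tighter fits, while too-small nodes push it back
 * left; the size-4 range is returned.
 */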
s64 range_tree_find(struct range_tree *rt, u32 len)
{
struct range_node *rn;
rn = __find_range(rt, len);
if (!rn)
return -ENOENT;
return rn->rn_start;
}
/* Insert the range into rbtree sorted by the range size */
static inline void __range_size_insert(struct range_node *rn,
struct rb_root_cached *root)
{
struct rb_node **link = &root->rb_root.rb_node, *rb = NULL;
u64 size = rn_size(rn);
bool leftmost = true;
while (*link) {
rb = *link;
if (size > rn_size(rb_to_range_node(rb))) {
link = &rb->rb_left;
} else {
link = &rb->rb_right;
leftmost = false;
}
}
rb_link_node(&rn->rb_range_size, rb, link);
rb_insert_color_cached(&rn->rb_range_size, root, leftmost);
}
#define START(node) ((node)->rn_start)
#define LAST(node) ((node)->rn_last)
INTERVAL_TREE_DEFINE(struct range_node, rn_rbnode, u32,
__rn_subtree_last, START, LAST,
static inline __maybe_unused,
__range_it)
static inline __maybe_unused void
range_it_insert(struct range_node *rn, struct range_tree *rt)
{
__range_size_insert(rn, &rt->range_size_root);
__range_it_insert(rn, &rt->it_root);
}
static inline __maybe_unused void
range_it_remove(struct range_node *rn, struct range_tree *rt)
{
rb_erase_cached(&rn->rb_range_size, &rt->range_size_root);
RB_CLEAR_NODE(&rn->rb_range_size);
__range_it_remove(rn, &rt->it_root);
}
static inline __maybe_unused struct range_node *
range_it_iter_first(struct range_tree *rt, u32 start, u32 last)
{
return __range_it_iter_first(&rt->it_root, start, last);
}
/* Clear the range in this range tree */
int range_tree_clear(struct range_tree *rt, u32 start, u32 len)
{
u32 last = start + len - 1;
struct range_node *new_rn;
struct range_node *rn;
while ((rn = range_it_iter_first(rt, start, last))) {
if (rn->rn_start < start && rn->rn_last > last) {
u32 old_last = rn->rn_last;
/* Overlaps with the entire clearing range */
range_it_remove(rn, rt);
rn->rn_last = start - 1;
range_it_insert(rn, rt);
/* Add a range */
migrate_disable();
new_rn = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
migrate_enable();
if (!new_rn)
return -ENOMEM;
new_rn->rn_start = last + 1;
new_rn->rn_last = old_last;
range_it_insert(new_rn, rt);
} else if (rn->rn_start < start) {
/* Overlaps with the left side of the clearing range */
range_it_remove(rn, rt);
rn->rn_last = start - 1;
range_it_insert(rn, rt);
} else if (rn->rn_last > last) {
/* Overlaps with the right side of the clearing range */
range_it_remove(rn, rt);
rn->rn_start = last + 1;
range_it_insert(rn, rt);
break;
} else {
/* in the middle of the clearing range */
range_it_remove(rn, rt);
migrate_disable();
bpf_mem_free(&bpf_global_ma, rn);
migrate_enable();
}
}
return 0;
}
/* Is the whole range set ? */
int is_range_tree_set(struct range_tree *rt, u32 start, u32 len)
{
u32 last = start + len - 1;
struct range_node *left;
/* Is this whole range set ? */
left = range_it_iter_first(rt, start, last);
if (left && left->rn_start <= start && left->rn_last >= last)
return 0;
return -ESRCH;
}
/* Set the range in this range tree */
int range_tree_set(struct range_tree *rt, u32 start, u32 len)
{
u32 last = start + len - 1;
struct range_node *right;
struct range_node *left;
int err;
/* Is this whole range already set ? */
left = range_it_iter_first(rt, start, last);
if (left && left->rn_start <= start && left->rn_last >= last)
return 0;
/* Clear out everything in the range we want to set. */
err = range_tree_clear(rt, start, len);
if (err)
return err;
/* Do we have a left-adjacent range ? */
left = range_it_iter_first(rt, start - 1, start - 1);
if (left && left->rn_last + 1 != start)
return -EFAULT;
/* Do we have a right-adjacent range ? */
right = range_it_iter_first(rt, last + 1, last + 1);
if (right && right->rn_start != last + 1)
return -EFAULT;
if (left && right) {
/* Combine left and right adjacent ranges */
range_it_remove(left, rt);
range_it_remove(right, rt);
left->rn_last = right->rn_last;
range_it_insert(left, rt);
migrate_disable();
bpf_mem_free(&bpf_global_ma, right);
migrate_enable();
} else if (left) {
/* Combine with the left range */
range_it_remove(left, rt);
left->rn_last = last;
range_it_insert(left, rt);
} else if (right) {
/* Combine with the right range */
range_it_remove(right, rt);
right->rn_start = start;
range_it_insert(right, rt);
} else {
migrate_disable();
left = bpf_mem_alloc(&bpf_global_ma, sizeof(struct range_node));
migrate_enable();
if (!left)
return -ENOMEM;
left->rn_start = start;
left->rn_last = last;
range_it_insert(left, rt);
}
return 0;
}
void range_tree_destroy(struct range_tree *rt)
{
struct range_node *rn;
while ((rn = range_it_iter_first(rt, 0, -1U))) {
range_it_remove(rn, rt);
migrate_disable();
bpf_mem_free(&bpf_global_ma, rn);
migrate_enable();
}
}
void range_tree_init(struct range_tree *rt)
{
rt->it_root = RB_ROOT_CACHED;
rt->range_size_root = RB_ROOT_CACHED;
}

kernel/bpf/range_tree.h (new file, 21 lines)

@@ -0,0 +1,21 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. */
#ifndef _RANGE_TREE_H
#define _RANGE_TREE_H 1
struct range_tree {
/* root of interval tree */
struct rb_root_cached it_root;
/* root of rbtree of interval sizes */
struct rb_root_cached range_size_root;
};
void range_tree_init(struct range_tree *rt);
void range_tree_destroy(struct range_tree *rt);
int range_tree_clear(struct range_tree *rt, u32 start, u32 len);
int range_tree_set(struct range_tree *rt, u32 start, u32 len);
int is_range_tree_set(struct range_tree *rt, u32 start, u32 len);
s64 range_tree_find(struct range_tree *rt, u32 len);
#endif
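
Putting the API together, the calling pattern the arena code follows (a sketch;
error handling is elided and, as noted in range_tree.c, the caller provides its
own locking -- bpf arena holds arena->lock around these calls):

	struct range_tree rt;
	s64 start;

	range_tree_init(&rt);
	range_tree_set(&rt, 0, max_pages);		/* all pages start free */

	start = range_tree_find(&rt, page_cnt);		/* best-fit lookup */
	if (start >= 0)
		range_tree_clear(&rt, start, page_cnt);	/* mark as allocated */

	/* ... later, when the range is freed: */
	range_tree_set(&rt, start, page_cnt);

	range_tree_destroy(&rt);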


@@ -155,6 +155,89 @@ static void maybe_wait_bpf_programs(struct bpf_map *map)
synchronize_rcu();
}
static void unpin_uptr_kaddr(void *kaddr)
{
if (kaddr)
unpin_user_page(virt_to_page(kaddr));
}
static void __bpf_obj_unpin_uptrs(struct btf_record *rec, u32 cnt, void *obj)
{
const struct btf_field *field;
void **uptr_addr;
int i;
for (i = 0, field = rec->fields; i < cnt; i++, field++) {
if (field->type != BPF_UPTR)
continue;
uptr_addr = obj + field->offset;
unpin_uptr_kaddr(*uptr_addr);
}
}
static void bpf_obj_unpin_uptrs(struct btf_record *rec, void *obj)
{
if (!btf_record_has_field(rec, BPF_UPTR))
return;
__bpf_obj_unpin_uptrs(rec, rec->cnt, obj);
}
static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
{
const struct btf_field *field;
const struct btf_type *t;
unsigned long start, end;
struct page *page;
void **uptr_addr;
int i, err;
if (!btf_record_has_field(rec, BPF_UPTR))
return 0;
for (i = 0, field = rec->fields; i < rec->cnt; i++, field++) {
if (field->type != BPF_UPTR)
continue;
uptr_addr = obj + field->offset;
start = *(unsigned long *)uptr_addr;
if (!start)
continue;
t = btf_type_by_id(field->kptr.btf, field->kptr.btf_id);
/* t->size was checked for zero before */
if (check_add_overflow(start, t->size - 1, &end)) {
err = -EFAULT;
goto unpin_all;
}
/* The uptr's struct cannot span across two pages */
if ((start & PAGE_MASK) != (end & PAGE_MASK)) {
err = -EOPNOTSUPP;
goto unpin_all;
}
err = pin_user_pages_fast(start, 1, FOLL_LONGTERM | FOLL_WRITE, &page);
if (err != 1)
goto unpin_all;
if (PageHighMem(page)) {
err = -EOPNOTSUPP;
unpin_user_page(page);
goto unpin_all;
}
*uptr_addr = page_address(page) + offset_in_page(start);
}
return 0;
unpin_all:
__bpf_obj_unpin_uptrs(rec, i, obj);
return err;
}
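/* For context, a sketch of how a uptr field could look on the BPF side
 * (illustrative; the names are not from this diff). It assumes a __uptr
 * macro expanding to the "uptr" BTF type tag matched by btf_find_kptr(),
 * analogous to __kptr; the pointed-to struct must fit within one page:
 *
 *	struct user_data {
 *		int a;
 *		int b;
 *	};
 *
 *	struct value_type {
 *		struct user_data __uptr *udata;
 *	};
 *
 *	struct {
 *		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
 *		__uint(map_flags, BPF_F_NO_PREALLOC);
 *		__type(key, int);
 *		__type(value, struct value_type);
 *	} datamap SEC(".maps");
 *
 * On a map update from user space, bpf_obj_pin_uptrs() above pins the user
 * page behind udata and rewrites the pointer to its kernel mapping;
 * bpf_obj_unpin_uptrs() undoes that if the update fails, and
 * bpf_obj_free_fields() unpins when the element is freed.
 */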
static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
void *key, void *value, __u64 flags)
{
@@ -199,9 +282,14 @@ static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_push_elem(map, value, flags);
} else {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, flags);
rcu_read_unlock();
err = bpf_obj_pin_uptrs(map->record, value);
if (!err) {
rcu_read_lock();
err = map->ops->map_update_elem(map, key, value, flags);
rcu_read_unlock();
if (err)
bpf_obj_unpin_uptrs(map->record, value);
}
}
bpf_enable_instrumentation();
@@ -548,6 +636,7 @@ void btf_record_free(struct btf_record *rec)
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
case BPF_UPTR:
if (rec->fields[i].kptr.module)
module_put(rec->fields[i].kptr.module);
if (btf_is_kernel(rec->fields[i].kptr.btf))
@@ -597,6 +686,7 @@ struct btf_record *btf_record_dup(const struct btf_record *rec)
case BPF_KPTR_UNREF:
case BPF_KPTR_REF:
case BPF_KPTR_PERCPU:
case BPF_UPTR:
if (btf_is_kernel(fields[i].kptr.btf))
btf_get(fields[i].kptr.btf);
if (fields[i].kptr.module && !try_module_get(fields[i].kptr.module)) {
@@ -714,6 +804,10 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
field->kptr.dtor(xchgd_field);
}
break;
case BPF_UPTR:
/* The caller ensured that no one is using the uptr */
unpin_uptr_kaddr(*(void **)field_ptr);
break;
case BPF_LIST_HEAD:
if (WARN_ON_ONCE(rec->spin_lock_off < 0))
continue;
@@ -1105,7 +1199,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
map->record = btf_parse_fields(btf, value_type,
BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR | BPF_LIST_HEAD |
BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE,
BPF_RB_ROOT | BPF_REFCOUNT | BPF_WORKQUEUE | BPF_UPTR,
map->value_size);
if (!IS_ERR_OR_NULL(map->record)) {
int i;
@@ -1161,6 +1255,12 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
goto free_map_tab;
}
break;
case BPF_UPTR:
if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) {
ret = -EOPNOTSUPP;
goto free_map_tab;
}
break;
case BPF_LIST_HEAD:
case BPF_RB_ROOT:
if (map->map_type != BPF_MAP_TYPE_HASH &&
@@ -3218,7 +3318,8 @@ static void bpf_tracing_link_release(struct bpf_link *link)
container_of(link, struct bpf_tracing_link, link.link);
WARN_ON_ONCE(bpf_trampoline_unlink_prog(&tr_link->link,
tr_link->trampoline));
tr_link->trampoline,
tr_link->tgt_prog));
bpf_trampoline_put(tr_link->trampoline);
@@ -3358,7 +3459,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
* in prog->aux
*
* - if prog->aux->dst_trampoline is NULL, the program has already been
* attached to a target and its initial target was cleared (below)
*
* - if tgt_prog != NULL, the caller specified tgt_prog_fd +
* target_btf_id using the link_create API.
@@ -3433,7 +3534,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
if (err)
goto out_unlock;
err = bpf_trampoline_link_prog(&link->link, tr);
err = bpf_trampoline_link_prog(&link->link, tr, tgt_prog);
if (err) {
bpf_link_cleanup(&link_primer);
link = NULL;
@@ -4002,10 +4103,14 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI &&
attach_type != BPF_TRACE_UPROBE_MULTI)
return -EINVAL;
if (prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION &&
attach_type != BPF_TRACE_UPROBE_SESSION)
return -EINVAL;
if (attach_type != BPF_PERF_EVENT &&
attach_type != BPF_TRACE_KPROBE_MULTI &&
attach_type != BPF_TRACE_KPROBE_SESSION &&
attach_type != BPF_TRACE_UPROBE_MULTI)
attach_type != BPF_TRACE_UPROBE_MULTI &&
attach_type != BPF_TRACE_UPROBE_SESSION)
return -EINVAL;
return 0;
case BPF_PROG_TYPE_SCHED_CLS:
@@ -5258,7 +5363,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
else if (attr->link_create.attach_type == BPF_TRACE_KPROBE_MULTI ||
attr->link_create.attach_type == BPF_TRACE_KPROBE_SESSION)
ret = bpf_kprobe_multi_link_attach(attr, prog);
else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI)
else if (attr->link_create.attach_type == BPF_TRACE_UPROBE_MULTI ||
attr->link_create.attach_type == BPF_TRACE_UPROBE_SESSION)
ret = bpf_uprobe_multi_link_attach(attr, prog);
break;
default:


@@ -115,10 +115,14 @@ bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
(ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC);
}
void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym)
void bpf_image_ksym_init(void *data, unsigned int size, struct bpf_ksym *ksym)
{
ksym->start = (unsigned long) data;
ksym->end = ksym->start + size;
}
void bpf_image_ksym_add(struct bpf_ksym *ksym)
{
bpf_ksym_add(ksym);
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
PAGE_SIZE, false, ksym->name);
@@ -377,7 +381,8 @@ static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size)
ksym = &im->ksym;
INIT_LIST_HEAD_RCU(&ksym->lnode);
snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
bpf_image_ksym_add(image, size, ksym);
bpf_image_ksym_init(image, size, ksym);
bpf_image_ksym_add(ksym);
return im;
out_free_image:
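The init/add split lets a caller fill in the symbol name between setting up the address range and publishing it. A hedged fragment of how the struct_ops trampoline path (not shown in this diff) might use it; the name format and st_ops_name variable are illustrative:

	bpf_image_ksym_init(image, PAGE_SIZE, ksym);
	snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_tramp", st_ops_name);	/* illustrative */
	bpf_image_ksym_add(ksym);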
@@ -523,7 +528,27 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
}
}
static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
static int bpf_freplace_check_tgt_prog(struct bpf_prog *tgt_prog)
{
struct bpf_prog_aux *aux = tgt_prog->aux;
guard(mutex)(&aux->ext_mutex);
if (aux->prog_array_member_cnt)
/* Program extensions cannot extend the target prog once the target
* prog has been added to a prog_array map as a tail-call target.
* This prevents a potential infinite loop like:
* tgt prog entry -> tgt prog subprog -> freplace prog entry
* --tailcall-> tgt prog entry.
*/
return -EBUSY;
aux->is_extended = true;
return 0;
}
static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link,
struct bpf_trampoline *tr,
struct bpf_prog *tgt_prog)
{
enum bpf_tramp_prog_type kind;
struct bpf_tramp_link *link_exiting;
@@ -544,6 +569,9 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
/* Cannot attach extension if fentry/fexit are in use. */
if (cnt)
return -EBUSY;
err = bpf_freplace_check_tgt_prog(tgt_prog);
if (err)
return err;
tr->extension_prog = link->link.prog;
return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
link->link.prog->bpf_func);
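A hedged BPF-side sketch of the pattern this check rejects; all names are illustrative and the freplace target is resolved at load time via bpf_program__set_attach_target(). If tgt_entry is both stored in a prog_array as a tail-call target and extended through freplace, the chain tgt_entry -> tgt_subprog -> freplace_prog -> bpf_tail_call() back to tgt_entry would never terminate, so whichever attachment happens second now fails with -EBUSY:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__uint(max_entries, 1);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");

__noinline int tgt_subprog(struct __sk_buff *skb)
{
	return skb->len;
}

SEC("tc")
int tgt_entry(struct __sk_buff *skb)
{
	return tgt_subprog(skb);
}

/* extension program replacing tgt_subprog; user space would also point
 * jmp_table slot 0 at tgt_entry, closing the loop
 */
SEC("freplace/tgt_subprog")
int freplace_prog(struct __sk_buff *skb)
{
	bpf_tail_call(skb, &jmp_table, 0);
	return 0;
}

char _license[] SEC("license") = "GPL";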
@@ -570,17 +598,21 @@ static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_tr
return err;
}
int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
int bpf_trampoline_link_prog(struct bpf_tramp_link *link,
struct bpf_trampoline *tr,
struct bpf_prog *tgt_prog)
{
int err;
mutex_lock(&tr->mutex);
err = __bpf_trampoline_link_prog(link, tr);
err = __bpf_trampoline_link_prog(link, tr, tgt_prog);
mutex_unlock(&tr->mutex);
return err;
}
static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
struct bpf_trampoline *tr,
struct bpf_prog *tgt_prog)
{
enum bpf_tramp_prog_type kind;
int err;
@@ -591,6 +623,8 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
guard(mutex)(&tgt_prog->aux->ext_mutex);
tgt_prog->aux->is_extended = false;
return err;
}
hlist_del_init(&link->tramp_hlist);
@@ -599,12 +633,14 @@ static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_
}
/* bpf_trampoline_unlink_prog() should never fail. */
int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link,
struct bpf_trampoline *tr,
struct bpf_prog *tgt_prog)
{
int err;
mutex_lock(&tr->mutex);
err = __bpf_trampoline_unlink_prog(link, tr);
err = __bpf_trampoline_unlink_prog(link, tr, tgt_prog);
mutex_unlock(&tr->mutex);
return err;
}
@@ -619,7 +655,7 @@ static void bpf_shim_tramp_link_release(struct bpf_link *link)
if (!shim_link->trampoline)
return;
WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline));
WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline, NULL));
bpf_trampoline_put(shim_link->trampoline);
}
@@ -733,7 +769,7 @@ int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
goto err;
}
err = __bpf_trampoline_link_prog(&shim_link->link, tr);
err = __bpf_trampoline_link_prog(&shim_link->link, tr, NULL);
if (err)
goto err;
@@ -868,6 +904,8 @@ static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tram
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
if (prog->aux->recursion_detected)
prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();
@@ -944,6 +982,8 @@ u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog,
if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) {
bpf_prog_inc_misses_counter(prog);
if (prog->aux->recursion_detected)
prog->aux->recursion_detected(prog);
return 0;
}
return bpf_prog_start_time();

File diff suppressed because it is too large

@@ -802,6 +802,8 @@ struct send_signal_irq_work {
struct task_struct *task;
u32 sig;
enum pid_type type;
bool has_siginfo;
struct kernel_siginfo info;
};
static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
@@ -809,27 +811,46 @@ static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);
static void do_bpf_send_signal(struct irq_work *entry)
{
struct send_signal_irq_work *work;
struct kernel_siginfo *siginfo;
work = container_of(entry, struct send_signal_irq_work, irq_work);
group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, work->type);
siginfo = work->has_siginfo ? &work->info : SEND_SIG_PRIV;
group_send_sig_info(work->sig, siginfo, work->task, work->type);
put_task_struct(work->task);
}
static int bpf_send_signal_common(u32 sig, enum pid_type type)
static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struct *task, u64 value)
{
struct send_signal_irq_work *work = NULL;
struct kernel_siginfo info;
struct kernel_siginfo *siginfo;
if (!task) {
task = current;
siginfo = SEND_SIG_PRIV;
} else {
clear_siginfo(&info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_KERNEL;
info.si_pid = 0;
info.si_uid = 0;
info.si_value.sival_ptr = (void *)(unsigned long)value;
siginfo = &info;
}
/* Similar to bpf_probe_write_user, the task needs to be
* in a sound condition and kernel memory access must be
* permitted in order to send a signal to it.
*/
if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
if (unlikely(task->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
/* Task should not be pid=1 to avoid kernel panic. */
if (unlikely(is_global_init(current)))
if (unlikely(is_global_init(task)))
return -EPERM;
if (irqs_disabled()) {
@@ -847,19 +868,22 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
* to the irq_work. The current task may change when queued
* irq works get executed.
*/
work->task = get_task_struct(current);
work->task = get_task_struct(task);
work->has_siginfo = siginfo == &info;
if (work->has_siginfo)
copy_siginfo(&work->info, &info);
work->sig = sig;
work->type = type;
irq_work_queue(&work->irq_work);
return 0;
}
return group_send_sig_info(sig, SEND_SIG_PRIV, current, type);
return group_send_sig_info(sig, siginfo, task, type);
}
BPF_CALL_1(bpf_send_signal, u32, sig)
{
return bpf_send_signal_common(sig, PIDTYPE_TGID);
return bpf_send_signal_common(sig, PIDTYPE_TGID, NULL, 0);
}
static const struct bpf_func_proto bpf_send_signal_proto = {
@@ -871,7 +895,7 @@ static const struct bpf_func_proto bpf_send_signal_proto = {
BPF_CALL_1(bpf_send_signal_thread, u32, sig)
{
return bpf_send_signal_common(sig, PIDTYPE_PID);
return bpf_send_signal_common(sig, PIDTYPE_PID, NULL, 0);
}
static const struct bpf_func_proto bpf_send_signal_thread_proto = {
@@ -1557,6 +1581,17 @@ static inline bool is_kprobe_session(const struct bpf_prog *prog)
return prog->expected_attach_type == BPF_TRACE_KPROBE_SESSION;
}
static inline bool is_uprobe_multi(const struct bpf_prog *prog)
{
return prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI ||
prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
}
static inline bool is_uprobe_session(const struct bpf_prog *prog)
{
return prog->expected_attach_type == BPF_TRACE_UPROBE_SESSION;
}
static const struct bpf_func_proto *
kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -1574,13 +1609,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_func_ip:
if (is_kprobe_multi(prog))
return &bpf_get_func_ip_proto_kprobe_multi;
if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
if (is_uprobe_multi(prog))
return &bpf_get_func_ip_proto_uprobe_multi;
return &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
if (is_kprobe_multi(prog))
return &bpf_get_attach_cookie_proto_kmulti;
if (prog->expected_attach_type == BPF_TRACE_UPROBE_MULTI)
if (is_uprobe_multi(prog))
return &bpf_get_attach_cookie_proto_umulti;
return &bpf_get_attach_cookie_proto_trace;
default:
@@ -3072,6 +3107,7 @@ struct bpf_uprobe {
u64 cookie;
struct uprobe *uprobe;
struct uprobe_consumer consumer;
bool session;
};
struct bpf_uprobe_multi_link {
@@ -3084,7 +3120,7 @@ struct bpf_uprobe_multi_link {
};
struct bpf_uprobe_multi_run_ctx {
struct bpf_run_ctx run_ctx;
struct bpf_session_run_ctx session_ctx;
unsigned long entry_ip;
struct bpf_uprobe *uprobe;
};
@@ -3195,17 +3231,22 @@ static const struct bpf_link_ops bpf_uprobe_multi_link_lops = {
static int uprobe_prog_run(struct bpf_uprobe *uprobe,
unsigned long entry_ip,
struct pt_regs *regs)
struct pt_regs *regs,
bool is_return, void *data)
{
struct bpf_uprobe_multi_link *link = uprobe->link;
struct bpf_uprobe_multi_run_ctx run_ctx = {
.session_ctx = {
.is_return = is_return,
.data = data,
},
.entry_ip = entry_ip,
.uprobe = uprobe,
};
struct bpf_prog *prog = link->link.prog;
bool sleepable = prog->sleepable;
struct bpf_run_ctx *old_run_ctx;
int err = 0;
int err;
if (link->task && !same_thread_group(current, link->task))
return 0;
@@ -3217,7 +3258,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
migrate_disable();
old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
@@ -3244,9 +3285,13 @@ uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs,
__u64 *data)
{
struct bpf_uprobe *uprobe;
int ret;
uprobe = container_of(con, struct bpf_uprobe, consumer);
return uprobe_prog_run(uprobe, instruction_pointer(regs), regs);
ret = uprobe_prog_run(uprobe, instruction_pointer(regs), regs, false, data);
if (uprobe->session)
return ret ? UPROBE_HANDLER_IGNORE : 0;
return 0;
}
static int
@@ -3256,14 +3301,16 @@ uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, s
struct bpf_uprobe *uprobe;
uprobe = container_of(con, struct bpf_uprobe, consumer);
return uprobe_prog_run(uprobe, func, regs);
uprobe_prog_run(uprobe, func, regs, true, data);
return 0;
}
static u64 bpf_uprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
{
struct bpf_uprobe_multi_run_ctx *run_ctx;
run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx,
session_ctx.run_ctx);
return run_ctx->entry_ip;
}
@@ -3271,7 +3318,8 @@ static u64 bpf_uprobe_multi_cookie(struct bpf_run_ctx *ctx)
{
struct bpf_uprobe_multi_run_ctx *run_ctx;
run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx, run_ctx);
run_ctx = container_of(current->bpf_ctx, struct bpf_uprobe_multi_run_ctx,
session_ctx.run_ctx);
return run_ctx->uprobe->cookie;
}
@@ -3295,7 +3343,7 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
if (sizeof(u64) != sizeof(void *))
return -EOPNOTSUPP;
if (prog->expected_attach_type != BPF_TRACE_UPROBE_MULTI)
if (!is_uprobe_multi(prog))
return -EINVAL;
flags = attr->link_create.uprobe_multi.flags;
@@ -3371,11 +3419,12 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
uprobes[i].link = link;
if (flags & BPF_F_UPROBE_MULTI_RETURN)
uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
else
if (!(flags & BPF_F_UPROBE_MULTI_RETURN))
uprobes[i].consumer.handler = uprobe_multi_link_handler;
if (flags & BPF_F_UPROBE_MULTI_RETURN || is_uprobe_session(prog))
uprobes[i].consumer.ret_handler = uprobe_multi_link_ret_handler;
if (is_uprobe_session(prog))
uprobes[i].session = true;
if (pid)
uprobes[i].consumer.filter = uprobe_multi_link_filter;
}
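Putting the consumer wiring above together, a hedged BPF-side sketch of a session program. The target binary and function are illustrative, and the session kfuncs are the ones shared with kprobe sessions; per the handler above, a non-zero return from the entry invocation yields UPROBE_HANDLER_IGNORE, i.e. the return probe for this hit is skipped:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

extern bool bpf_session_is_return(void) __ksym;
extern __u64 *bpf_session_cookie(void) __ksym;

SEC("uprobe.session//usr/bin/example:do_work")
int handle_do_work(struct pt_regs *ctx)
{
	__u64 *cookie = bpf_session_cookie();

	if (!bpf_session_is_return()) {
		*cookie = bpf_ktime_get_ns();	/* remember entry time */
		return 0;			/* 0: also run the return part */
	}

	bpf_printk("do_work took %llu ns", bpf_ktime_get_ns() - *cookie);
	return 0;
}

char _license[] SEC("license") = "GPL";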
@@ -3464,7 +3513,7 @@ static int bpf_kprobe_multi_filter(const struct bpf_prog *prog, u32 kfunc_id)
if (!btf_id_set8_contains(&kprobe_multi_kfunc_set_ids, kfunc_id))
return 0;
if (!is_kprobe_session(prog))
if (!is_kprobe_session(prog) && !is_uprobe_session(prog))
return -EACCES;
return 0;
@@ -3482,3 +3531,16 @@ static int __init bpf_kprobe_multi_kfuncs_init(void)
}
late_initcall(bpf_kprobe_multi_kfuncs_init);
__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_send_signal_task(struct task_struct *task, int sig, enum pid_type type,
u64 value)
{
if (type != PIDTYPE_PID && type != PIDTYPE_TGID)
return -EINVAL;
return bpf_send_signal_common(sig, type, task, value);
}
__bpf_kfunc_end_defs();
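Finally, a hedged BPF-side sketch of calling the new kfunc. The tracepoint hook, the pid filter, and the assumption that the kfunc is registered for this program type are all illustrative; the signal number is passed numerically because uapi signal macros are not part of vmlinux.h:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

extern int bpf_send_signal_task(struct task_struct *task, int sig,
				enum pid_type type, u64 value) __ksym;

const volatile int target_pid;

SEC("tp_btf/sched_switch")
int BPF_PROG(notify, bool preempt, struct task_struct *prev, struct task_struct *next)
{
	if (next->pid != target_pid)
		return 0;

	/* SIGUSR1 to the whole thread group, with a cookie delivered in si_value */
	bpf_send_signal_task(next, 10 /* SIGUSR1 */, PIDTYPE_TGID, 0xdeadbeef);
	return 0;
}

char _license[] SEC("license") = "GPL";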