[ Applies on top of bennee/mttcg/enable-mttcg-for-armv7-v1 after reverting "translate-all: introduces tb_flush_safe". A trivial conflict must be solved after applying. ]
This is a first attempt at making tb_flush not have to stop all CPUs. There are issues as pointed out below, but this could be a good start. Context: https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04658.html https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06942.html Changes from v1: - When a static buffer is used, split it in two instead of using a second buffer. Known issues: - Fails Alex' unit test with low enough -tb-size, see https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03465.html Seems to work in MTTCG, although I've only tested with tb_lock always being taken in tb_find_fast. - Windows; not even compile-tested! Signed-off-by: Emilio G. Cota <[email protected]> --- translate-all.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 133 insertions(+), 13 deletions(-) diff --git a/translate-all.c b/translate-all.c index 8e70583..6830371 100644 --- a/translate-all.c +++ b/translate-all.c @@ -535,6 +535,9 @@ static inline void *split_cross_256mb(void *buf1, size_t size1) #ifdef USE_STATIC_CODE_GEN_BUFFER static uint8_t static_code_gen_buffer[DEFAULT_CODE_GEN_BUFFER_SIZE] __attribute__((aligned(CODE_GEN_ALIGN))); +static int static_buf_mask = 1; +static void *static_buf1; +static void *static_buf2; # ifdef _WIN32 static inline void do_protect(void *addr, long size, int prot) @@ -577,6 +580,13 @@ static inline void map_none(void *addr, long size) } # endif /* WIN32 */ +static void map_static_code_gen_buffer(void *buf, size_t size) +{ + map_exec(buf, size); + map_none(buf + size, qemu_real_host_page_size); + qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE); +} + static inline void *alloc_code_gen_buffer(void) { void *buf = static_code_gen_buffer; @@ -586,28 +596,41 @@ static inline void *alloc_code_gen_buffer(void) full_size = (((uintptr_t)buf + sizeof(static_code_gen_buffer)) & qemu_real_host_page_mask) - (uintptr_t)buf; - /* Reserve a guard page. */ - size = full_size - qemu_real_host_page_size; + /* + * Reserve two guard pages, one after each of the two buffers: + * | buf1 |g1| buf2 |g2| + */ + size = full_size - 2 * qemu_real_host_page_size; /* Honor a command-line option limiting the size of the buffer. */ if (size > tcg_ctx.code_gen_buffer_size) { size = (((uintptr_t)buf + tcg_ctx.code_gen_buffer_size) & qemu_real_host_page_mask) - (uintptr_t)buf; } - tcg_ctx.code_gen_buffer_size = size; #ifdef __mips__ - if (cross_256mb(buf, size)) { - buf = split_cross_256mb(buf, size); - size = tcg_ctx.code_gen_buffer_size; + /* + * Pass 'size + page_size', since we want 'buf1 | guard1 | buf2' to be + * within the boundary. + */ + if (cross_256mb(buf, size + qemu_real_host_page_size)) { + buf = split_cross_256mb(buf, size + qemu_real_host_page_size); + size = tcg_ctx.code_gen_buffer_size - qemu_real_host_page_size; } #endif - map_exec(buf, size); - map_none(buf + size, qemu_real_host_page_size); - qemu_madvise(buf, size, QEMU_MADV_HUGEPAGE); + /* split the buffer in two */ + size /= 2; + tcg_ctx.code_gen_buffer_size = size; - return buf; + static_buf1 = buf; + static_buf2 = buf + size + qemu_real_host_page_mask; + + map_static_code_gen_buffer(static_buf1, size); + map_static_code_gen_buffer(static_buf2, size); + + assert(static_buf_mask == 1); + return static_buf1; } #elif defined(_WIN32) static inline void *alloc_code_gen_buffer(void) @@ -825,10 +848,100 @@ static void page_flush_tb(void) } } +#ifdef USE_STATIC_CODE_GEN_BUFFER + +struct code_gen_desc { + struct rcu_head rcu; + int clear_bit; +}; + +static void code_gen_buffer_clear(struct rcu_head *rcu) +{ + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); + + tb_lock(); + static_buf_mask &= ~desc->clear_bit; + tb_unlock(); + g_free(desc); +} + +static void *code_gen_buffer_replace(void) +{ + struct code_gen_desc *desc = g_malloc0(sizeof(*desc)); + + /* + * If both bits are set, we're having two concurrent flushes. This + * can easily happen if the buffers are heavily undersized. + */ + assert(static_buf_mask == 1 || static_buf_mask == 2); + + desc->clear_bit = static_buf_mask; + call_rcu1(&desc->rcu, code_gen_buffer_clear); + + if (static_buf_mask == 1) { + static_buf_mask |= 2; + return static_buf2; + } + static_buf_mask |= 1; + return static_buf1; +} + +#elif defined(_WIN32) + +struct code_gen_desc { + struct rcu_head rcu; + void *buf; +}; + +static void code_gen_buffer_vfree(struct rcu_head *rcu) +{ + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); + + VirtualFree(desc->buf, 0, MEM_RELEASE); + g_free(desc); +} + +static void *code_gen_buffer_replace(void) +{ + struct code_gen_desc *desc; + + desc = g_malloc0(sizeof(*desc)); + desc->buf = tcg_ctx.code_gen_buffer; + call_rcu1(&desc->rcu, code_gen_buffer_vfree); + + return alloc_code_gen_buffer(); +} + +#else /* UNIX, dynamically-allocated code buffer */ + +struct code_gen_desc { + struct rcu_head rcu; + void *buf; + size_t size; +}; + +static void code_gen_buffer_unmap(struct rcu_head *rcu) +{ + struct code_gen_desc *desc = container_of(rcu, struct code_gen_desc, rcu); + + munmap(desc->buf, desc->size + qemu_real_host_page_size); + g_free(desc); +} + +static void *code_gen_buffer_replace(void) +{ + struct code_gen_desc *desc; + + desc = g_malloc0(sizeof(*desc)); + desc->buf = tcg_ctx.code_gen_buffer; + desc->size = tcg_ctx.code_gen_buffer_size; + call_rcu1(&desc->rcu, code_gen_buffer_unmap); + + return alloc_code_gen_buffer(); +} +#endif /* USE_STATIC_CODE_GEN_BUFFER */ + /* flush all the translation blocks */ -/* XXX: tb_flush is currently not thread safe. System emulation calls it only - * with tb_lock taken or from safe_work, so no need to take tb_lock here. - */ void tb_flush(CPUState *cpu) { #if defined(DEBUG_FLUSH) @@ -852,10 +965,17 @@ void tb_flush(CPUState *cpu) memset(tcg_ctx.tb_ctx.tb_phys_hash, 0, sizeof(tcg_ctx.tb_ctx.tb_phys_hash)); page_flush_tb(); + tcg_ctx.code_gen_buffer = code_gen_buffer_replace(); tcg_ctx.code_gen_ptr = tcg_ctx.code_gen_buffer; + tcg_prologue_init(&tcg_ctx); /* XXX: flush processor icache at this point if cache flush is expensive */ tcg_ctx.tb_ctx.tb_flush_count++; + + /* exit all CPUs so that the old buffer is quickly cleared. */ + CPU_FOREACH(cpu) { + cpu_exit(cpu); + } } #ifdef DEBUG_TB_CHECK -- 2.5.0
