i#2502 lockless ARM: Fix missing AArch synchronization (#4269)
For update_lookuptable_tls(): adds a store-release on the hash_mask store that
publishes the new lookup table, paired with a load-acquire of the mask in the
IBL-generated code, for both ARM and AArch64, as there is no lighter-weight way
to ensure the table and mask stores are observed in the proper order.

For flushtime_global: made all reads use atomic aligned loads and the increment
an atomic add, so no torn values are observed.

For safely_nullify_tables: left as weak (but untorn) stores, since timing and
ordering do not matter there.

Issue: #2502
derekbruening authored Apr 21, 2020
1 parent c54db8b · commit 906b044
Showing 5 changed files with 79 additions and 19 deletions.
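
At its core this is the standard release/acquire publish pattern: the writer installs the new table, then release-stores the mask; the IBL reader acquire-loads the mask, after which its table load is guaranteed to see the new table. A minimal, self-contained C11 sketch of that pattern (illustrative types and names only, not DynamoRIO's actual structures or macros):

    #include <stdatomic.h>
    #include <stdint.h>

    /* Illustrative stand-in for a per-branch-type IBL TLS slot. */
    typedef struct {
        void *_Atomic lookuptable;   /* table base read by the IBL routine */
        _Atomic uintptr_t hash_mask; /* mask; its store/load carries the ordering */
    } ibl_slot_t;

    /* Writer (table update): install the table, then release-store the mask. */
    static void publish_table(ibl_slot_t *slot, void *new_table, uintptr_t new_mask)
    {
        atomic_store_explicit(&slot->lookuptable, new_table, memory_order_relaxed);
        /* Store-release: a reader that observes new_mask also observes new_table. */
        atomic_store_explicit(&slot->hash_mask, new_mask, memory_order_release);
    }

    /* Reader (IBL lookup): acquire-load the mask; the table load is ordered after it. */
    static void *read_table(ibl_slot_t *slot, uintptr_t *mask_out)
    {
        *mask_out = atomic_load_explicit(&slot->hash_mask, memory_order_acquire);
        return atomic_load_explicit(&slot->lookuptable, memory_order_relaxed);
    }
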
core/arch/aarch64/emit_utils.c (21 changes: 15 additions & 6 deletions)
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2014-2018 Google, Inc. All rights reserved.
+ * Copyright (c) 2014-2020 Google, Inc. All rights reserved.
  * Copyright (c) 2016 ARM Limited. All rights reserved.
  * **********************************************************/

@@ -720,13 +720,22 @@ emit_indirect_branch_lookup(dcontext_t *dc, generated_code_t *code, byte *pc,

     /* Spill x0. */
     APP(&ilist, instr_create_save_to_tls(dc, DR_REG_R0, TLS_REG3_SLOT));
-    /* Load hash mask and base. */
-    /* ldp x1, x0, [x28, hash_mask] */
+    /* Load-acquire hash mask. We need a load-acquire to ensure we see updates
+     * properly; the corresponding store-release is in update_lookuptable_tls().
+     */
+    /* add x1, x28 + hash_mask_offs; ldar x1, [x1] (ldar doesn't take an offs.) */
+    APP(&ilist,
+        INSTR_CREATE_add(dc, opnd_create_reg(DR_REG_X1), opnd_create_reg(dr_reg_stolen),
+                         OPND_CREATE_INT32(TLS_MASK_SLOT(ibl_code->branch_type))));
     APP(&ilist,
-        INSTR_CREATE_ldp(dc, opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_X0),
+        INSTR_CREATE_ldar(dc, opnd_create_reg(DR_REG_X1),
+                          OPND_CREATE_MEMPTR(DR_REG_X1, 0)));
+    /* ldr x0, [x28, hash_table] */
+    APP(&ilist,
+        INSTR_CREATE_ldr(dc, opnd_create_reg(DR_REG_X0),
                          opnd_create_base_disp(dr_reg_stolen, DR_REG_NULL, 0,
-                                               TLS_MASK_SLOT(ibl_code->branch_type),
-                                               OPSZ_16)));
+                                               TLS_TABLE_SLOT(ibl_code->branch_type),
+                                               OPSZ_8)));
     /* and x1, x1, x2 */
     APP(&ilist,
         INSTR_CREATE_and(dc, opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_X1),
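
For reference, the net effect on the emitted AArch64 IBL entry sequence is roughly the following (sketched from the inline comments above; hash_mask_offs and hash_table_offs stand in for the TLS slot offsets, and x28 is the stolen TLS register):

    /* Before: one paired load, with no ordering between the two values:
     *     ldp  x1, x0, [x28, #hash_mask_offs]    // x1 = mask, x0 = table
     * After: acquire on the mask, then a separate table load ordered after it:
     *     add  x1, x28, #hash_mask_offs
     *     ldar x1, [x1]                          // load-acquire of the mask
     *     ldr  x0, [x28, #hash_table_offs]       // table base
     *     and  x1, x1, x2
     */
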
core/arch/arch_exports.h (12 changes: 12 additions & 0 deletions)
@@ -1157,6 +1157,18 @@ atomic_aligned_read_int64(volatile int64 *var)
     } while (0)
 #endif

+/* Our ATOMIC_* macros target release-acquire semantics. ATOMIC_PTRSZ_ALIGNED_WRITE
+ * is a Store-Release and ensures prior stores in program order in this thread
+ * are not observed by another thread after this store.
+ */
+#ifdef X64
+#    define ATOMIC_PTRSZ_ALIGNED_WRITE(target, value, hot_patch) \
+        ATOMIC_8BYTE_ALIGNED_WRITE(target, value, hot_patch)
+#else
+#    define ATOMIC_PTRSZ_ALIGNED_WRITE(target, value, hot_patch) \
+        ATOMIC_4BYTE_ALIGNED_WRITE(target, value, hot_patch)
+#endif
+
 #define ATOMIC_MAX(type, maxvar, curvar) ATOMIC_MAX_##type(type, maxvar, curvar)

 #define DEBUGGER_INTERRUPT_BYTE 0xcc
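
The hunk adds only the pointer-sized dispatch; the underlying ATOMIC_4BYTE/8BYTE_ALIGNED_WRITE macros are defined elsewhere in this header and are not shown. As a rough C11 model of the semantics the new comment describes (an assumption about intent, not the macros' actual bodies):

    #include <stdatomic.h>
    #include <stdint.h>

    /* Pointer-sized store-release: prior stores become visible to any thread that
     * observes this store (on AArch64 this typically compiles to an stlr).
     */
    static inline void ptrsz_write_release(_Atomic uintptr_t *target, uintptr_t value)
    {
        atomic_store_explicit(target, value, memory_order_release);
    }
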
core/arch/arm/emit_utils.c (5 changes: 5 additions & 0 deletions)
@@ -948,6 +948,11 @@ emit_indirect_branch_lookup(dcontext_t *dc, generated_code_t *code, byte *pc,
     APP(&ilist,
         INSTR_CREATE_ldr(dc, OPREG(DR_REG_R1),
                          OPND_TLS_FIELD(TLS_MASK_SLOT(ibl_code->branch_type))));
+    /* We need the mask load to have Acquire semantics to pair with the Release in
+     * update_lookuptable_tls() and avoid the reader here seeing a new mask with
+     * an old table.
+     */
+    APP(&ilist, INSTR_CREATE_dmb(dc, OPND_CREATE_INT(DR_DMB_ISHLD)));
     APP(&ilist,
         INSTR_CREATE_and(dc, OPREG(DR_REG_R1), OPREG(DR_REG_R1), OPREG(DR_REG_R2)));
     APP(&ilist,
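
On 32-bit ARM the build cannot assume an ARMv8 load-acquire instruction, so the change presumably uses a plain mask load followed by a DMB ISHLD barrier, which orders that load before the subsequent table load. A C11 sketch of the equivalent fence-based acquire (illustrative names; the acquire fence is a conservative stand-in for the lighter dmb ishld):

    #include <stdatomic.h>
    #include <stdint.h>

    static void *read_table_fence(_Atomic uintptr_t *hash_mask, void *_Atomic *lookuptable,
                                  uintptr_t *mask_out)
    {
        *mask_out = atomic_load_explicit(hash_mask, memory_order_relaxed);
        /* Corresponds to the emitted dmb ishld: the mask load completes before
         * any later load, including the table load below.
         */
        atomic_thread_fence(memory_order_acquire);
        return atomic_load_explicit(lookuptable, memory_order_relaxed);
    }
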
core/arch/arm/instr_create.h (18 changes: 17 additions & 1 deletion)
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2011-2017 Google, Inc. All rights reserved.
+ * Copyright (c) 2011-2020 Google, Inc. All rights reserved.
  * Copyright (c) 2002-2010 VMware, Inc. All rights reserved.
  * **********************************************************/

@@ -76,6 +76,22 @@
 #define OPND_CREATE_MEMLIST(base) \
     opnd_create_base_disp(base, DR_REG_NULL, 0, 0, OPSZ_VAR_REGLIST)

+/** Immediate values for INSTR_CREATE_dmb(). */
+enum {
+    DR_DMB_OSHLD = 1,  /**< DMB Outer Shareable - Loads. */
+    DR_DMB_OSHST = 2,  /**< DMB Outer Shareable - Stores. */
+    DR_DMB_OSH = 3,    /**< DMB Outer Shareable - Loads and Stores. */
+    DR_DMB_NSHLD = 5,  /**< DMB Non Shareable - Loads. */
+    DR_DMB_NSHST = 6,  /**< DMB Non Shareable - Stores. */
+    DR_DMB_NSH = 7,    /**< DMB Non Shareable - Loads and Stores. */
+    DR_DMB_ISHLD = 9,  /**< DMB Inner Shareable - Loads. */
+    DR_DMB_ISHST = 10, /**< DMB Inner Shareable - Stores. */
+    DR_DMB_ISH = 11,   /**< DMB Inner Shareable - Loads and Stores. */
+    DR_DMB_LD = 13,    /**< DMB Full System - Loads. */
+    DR_DMB_ST = 14,    /**< DMB Full System - Stores. */
+    DR_DMB_SY = 15,    /**< DMB Full System - Loads and Stores. */
+};
+
 /* Macros for building instructions, one for each opcode.
  * Each INSTR_CREATE_xxx macro creates an instr_t with opcode OP_xxx and
  * the given explicit operands, automatically supplying any implicit operands.
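
These constants map onto the 4-bit option field of the DMB instruction (SY = 15 is the full-system default). A hypothetical client-side use, inserting a full inner-shareable barrier before an application instruction inside an instrumentation callback (bb, where, and drcontext come from that callback and are assumptions here):

    /* Insert "dmb ish" before the application instruction "where". */
    instrlist_meta_preinsert(bb, where,
                             INSTR_CREATE_dmb(drcontext, OPND_CREATE_INT(DR_DMB_ISH)));
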
core/fragment.c (42 changes: 30 additions & 12 deletions)
@@ -576,7 +576,12 @@ update_lookuptable_tls(dcontext_t *dcontext, ibl_table_t *table)
      * crashing or worse.
      */
     state->table_space.table[table->branch_type].lookuptable = table->table;
-    state->table_space.table[table->branch_type].hash_mask = table->hash_mask;
+    /* Perform a Store-Release, which when combined with a Load-Acquire of the mask
+     * in the IBL itself, ensures the prior store to lookuptable is always
+     * observed before this store to hash_mask on weakly ordered arches.
+     */
+    ATOMIC_PTRSZ_ALIGNED_WRITE(&state->table_space.table[table->branch_type].hash_mask,
+                               table->hash_mask, false);
 }

 #ifdef DEBUG
@@ -935,6 +940,9 @@ safely_nullify_tables(dcontext_t *dcontext, ibl_table_t *new_table,
          * table untouched also so that the fragment_table_t is in a consistent
          * state.)
          */
+        /* For weakly ordered arches: we leave this as a weak (atomic-untorn b/c it's
+         * aligned) store, which should eventually be seen by the target thread.
+         */
         table[i].start_pc_fragment = target_delete;
     }
     STATS_INC(num_shared_ibt_table_flushes);
@@ -1849,7 +1857,10 @@ fragment_thread_reset_init(dcontext_t *dcontext)
      * when resetting, though, thread free & re-init is done before global free,
      * so we have to explicitly set to 0 for that case.
      */
-    pt->flushtime_last_update = (dynamo_resetting) ? 0 : flushtime_global;
+    if (dynamo_resetting)
+        pt->flushtime_last_update = 0;
+    else
+        ATOMIC_4BYTE_ALIGNED_READ(&flushtime_global, &pt->flushtime_last_update);

     /* set initial hashtable sizes */
     hashtable_fragment_init(
@@ -5449,13 +5460,15 @@ check_flush_queue(dcontext_t *dcontext, fragment_t *was_I_flushed)
         SELF_PROTECT_LOCAL(dcontext, READONLY);
     }
     /* now check shared queue to dec ref counts */
+    uint local_flushtime_global;
+    /* No lock needed: any racy incs to global are in safe direction, and our inc
+     * is atomic so we shouldn't see any partial-word-updated values here. This
+     * check is our shared deletion algorithm's only perf hit when there's no
+     * actual shared flushing.
+     */
+    ATOMIC_4BYTE_ALIGNED_READ(&flushtime_global, &local_flushtime_global);
     if (DYNAMO_OPTION(shared_deletion) &&
-        /* No lock needed: any racy incs to global are in safe direction, and our inc
-         * is atomic so we shouldn't see any partial-word-updated values here. This
-         * check is our shared deletion algorithm's only perf hit when there's no
-         * actual shared flushing.
-         */
-        pt->flushtime_last_update < flushtime_global) {
+        pt->flushtime_last_update < local_flushtime_global) {
 #ifdef LINUX
         rseq_shared_fragment_flushtime_update(dcontext);
 #endif
@@ -5848,17 +5861,19 @@ increment_global_flushtime()
     /* reset will turn flushtime_global back to 0, so we schedule one
      * when we're approaching overflow
      */
-    if (flushtime_global == UINT_MAX / 2) {
+    uint local_flushtime_global;
+    ATOMIC_4BYTE_ALIGNED_READ(&flushtime_global, &local_flushtime_global);
+    if (local_flushtime_global == UINT_MAX / 2) {
         ASSERT_NOT_TESTED(); /* FIXME: add -stress_flushtime_global_max */
         SYSLOG_INTERNAL_WARNING("flushtime_global approaching UINT_MAX, resetting");
         schedule_reset(RESET_ALL);
     }
-    ASSERT(flushtime_global < UINT_MAX);
+    ASSERT(local_flushtime_global < UINT_MAX);

     /* compiler should 4-byte-align so no cache line crossing
      * (asserted in fragment_init()
      */
-    flushtime_global++;
+    atomic_add_exchange_int((volatile int *)&flushtime_global, 1);
     LOG(GLOBAL, LOG_VMAREAS, 2, "new flush timestamp: %u\n", flushtime_global);
 }

@@ -6730,7 +6745,10 @@ flush_fragments_end_synch(dcontext_t *dcontext, bool keep_initexit_lock)
              * FIXME: Does not work w/ -ignore_syscalls, but those are private
              * for now.
              */
-            DEBUG_DECLARE(uint pre_flushtime = flushtime_global;)
+#ifdef DEBUG
+            uint pre_flushtime;
+            ATOMIC_4BYTE_ALIGNED_READ(&flushtime_global, &pre_flushtime);
+#endif
             vm_area_check_shared_pending(tgt_dcontext, NULL);
             /* lazy deletion may inc flushtime_global, so may have a higher
              * value than our cached one, but should never be lower
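
Taken together, the flushtime_global changes only guarantee untorn values: readers snapshot the counter with an aligned atomic read and the writer uses an atomic add, with no acquire/release pairing needed because a stale read merely delays the shared-deletion work. A compact C11 model of that counter discipline (illustrative only; whether DR's macros add ordering beyond atomicity is not shown in this diff):

    #include <stdatomic.h>

    static _Atomic unsigned int flushtime_model; /* stands in for flushtime_global */

    static unsigned int flushtime_snapshot(void)
    {
        /* Untorn read; relaxed ordering is enough for a monotonic hint. */
        return atomic_load_explicit(&flushtime_model, memory_order_relaxed);
    }

    static void flushtime_bump(void)
    {
        /* Atomic increment so concurrent bumps are never lost. */
        atomic_fetch_add_explicit(&flushtime_model, 1, memory_order_relaxed);
    }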
