diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml index 2f3c06c496f..1be585beebe 100644 --- a/.github/workflows/ci-docs.yml +++ b/.github/workflows/ci-docs.yml @@ -90,7 +90,7 @@ jobs: # We only use a non-zero build # when making multiple manual builds in one day. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=9.93.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=9.94.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi diff --git a/.github/workflows/ci-package.yml b/.github/workflows/ci-package.yml index 55d5f35625c..08c0c9711e1 100644 --- a/.github/workflows/ci-package.yml +++ b/.github/workflows/ci-package.yml @@ -102,7 +102,7 @@ jobs: # We only use a non-zero build # when making multiple manual builds in one day. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=9.93.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=9.94.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -194,7 +194,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=9.93.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=9.94.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -282,7 +282,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=9.93.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=9.94.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -370,7 +370,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=9.93.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=9.94.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -450,7 +450,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER=9.93.$((`git log -n 1 --format=%ct` / (60*60*24))) + export VERSION_NUMBER=9.94.$((`git log -n 1 --format=%ct` / (60*60*24))) else export VERSION_NUMBER=${{ github.event.inputs.version }} fi @@ -535,7 +535,7 @@ jobs: # XXX: See x86 job comments on sharing the default ver# with CMakeLists.txt. run: | if test -z "${{ github.event.inputs.version }}"; then - export VERSION_NUMBER="9.93.$((`git log -n 1 --format=%ct` / (60*60*24)))" + export VERSION_NUMBER="9.94.$((`git log -n 1 --format=%ct` / (60*60*24)))" export PREFIX="cronbuild-" else export VERSION_NUMBER=${{ github.event.inputs.version }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f3b5dce1b2..f640fce640f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -568,7 +568,7 @@ endif (EXISTS "${PROJECT_SOURCE_DIR}/.svn") # N.B.: When updating this, update all the default versions in ci-package.yml # and ci-docs.yml. We should find a way to share (xref i#1565). 
-set(VERSION_NUMBER_DEFAULT "9.93.${VERSION_NUMBER_PATCHLEVEL}") +set(VERSION_NUMBER_DEFAULT "9.94.${VERSION_NUMBER_PATCHLEVEL}") # do not store the default VERSION_NUMBER in the cache to prevent a stale one # from preventing future version updates in a pre-existing build dir set(VERSION_NUMBER "" CACHE STRING "Version number: leave empty for default") @@ -1381,7 +1381,7 @@ math(EXPR VERSION_NUMBER_INTEGER # 5.0 broke backcompat in drsyms and xmm opnd sizes # 4.1 broke backcompat in drsyms + 64-bit core (opcodes + reachability) # 4.0 broke backcompat in drmgr, drsyms, drinjectlib, and dr_get_milliseconds() -set(OLDEST_COMPATIBLE_VERSION_DEFAULT "990") +set(OLDEST_COMPATIBLE_VERSION_DEFAULT "994") set(OLDEST_COMPATIBLE_VERSION "" CACHE STRING "Oldest compatible version: leave empty for default") if ("${OLDEST_COMPATIBLE_VERSION}" STREQUAL "") diff --git a/api/docs/release.dox b/api/docs/release.dox index 1f756800102..670a372793b 100644 --- a/api/docs/release.dox +++ b/api/docs/release.dox @@ -183,6 +183,11 @@ changes: their precise counterparts int64_t and uint64_t. - The #dynamorio::drmemtrace::memref_t structure has a new field appended for holding the actual target of each indirect branch. + - Increased the size of dr_simd_t to accommodate AArch64's Scalable Vector + Extension (SVE) as well as adding two new dr_simd_t instances to + #dr_mcontext_t: SVE predicate registers svep[] and the SVE first-fault + register, ffr. This is a significant binary compatibility change and will + require re-building clients built before SVE was added. Further non-compatibility-affecting changes include: - Added new drmemtrace option -L0_filter_until_instrs which enables filtering @@ -279,6 +284,9 @@ Further non-compatibility-affecting changes include: - Added a new drmemtrace analysis tool: syscall_mix, to count frequency of system calls in a trace. This tool works in both the online and offline modes of drmemtrace. + - Added proc_get_vector_length_bytes() for AArch64. This returns the current + vector length on all ARMv8 hardware including hardware which supports the + Scalable Vector Extension (SVE). **************************************************
diff --git a/api/samples/memtrace_simple.c b/api/samples/memtrace_simple.c index ba05d67884a..3227c46c2f3 100644 --- a/api/samples/memtrace_simple.c +++ b/api/samples/memtrace_simple.c @@ -121,6 +121,10 @@ static int tls_idx; #define MINSERT instrlist_meta_preinsert +#ifdef AARCH64 +static bool reported_sg_warning = false; +#endif + static void memtrace(void *drcontext) { @@ -314,13 +318,47 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, instr_t *wher DR_ASSERT(instr_is_app(instr_operands)); for (i = 0; i < instr_num_srcs(instr_operands); i++) { - if (opnd_is_memory_reference(instr_get_src(instr_operands, i))) - instrument_mem(drcontext, bb, where, instr_get_src(instr_operands, i), false); + const opnd_t src = instr_get_src(instr_operands, i); + if (opnd_is_memory_reference(src)) { +#ifdef AARCH64 + /* TODO i#5844: Memory references involving SVE registers are not + * supported yet. To be implemented as part of scatter/gather work. + */ + if (opnd_is_base_disp(src) && + (reg_is_z(opnd_get_base(src)) || reg_is_z(opnd_get_index(src)))) { + if (!reported_sg_warning) { + dr_fprintf(STDERR, + "WARNING: Scatter/gather is not supported, results will " + "be inaccurate\n"); + reported_sg_warning = true; + } + continue; + } +#endif + instrument_mem(drcontext, bb, where, src, false); + } } for (i = 0; i < instr_num_dsts(instr_operands); i++) { - if (opnd_is_memory_reference(instr_get_dst(instr_operands, i))) - instrument_mem(drcontext, bb, where, instr_get_dst(instr_operands, i), true); + const opnd_t dst = instr_get_dst(instr_operands, i); + if (opnd_is_memory_reference(dst)) { +#ifdef AARCH64 + /* TODO i#5844: Memory references involving SVE registers are not + * supported yet. To be implemented as part of scatter/gather work. + */ + if (opnd_is_base_disp(dst) && + (reg_is_z(opnd_get_base(dst)) || reg_is_z(opnd_get_index(dst)))) { + if (!reported_sg_warning) { + dr_fprintf(STDERR, + "WARNING: Scatter/gather is not supported, results will " + "be inaccurate\n"); + reported_sg_warning = true; + } + continue; + } +#endif + instrument_mem(drcontext, bb, where, dst, true); + } } /* insert code to call clean_call for processing the buffer */ diff --git a/api/samples/memval_simple.c b/api/samples/memval_simple.c index d22b869456b..15a7539c308 100644 --- a/api/samples/memval_simple.c +++ b/api/samples/memval_simple.c @@ -104,6 +104,10 @@ static int tls_idx; static drx_buf_t *write_buffer; static drx_buf_t *trace_buffer; +#ifdef AARCH64 +static bool reported_sg_warning = false; +#endif + /* Requires that hex_buf be at least as long as 2*memref->size + 1. */ static char * write_hexdump(char *hex_buf, byte *write_base, mem_ref_t *mem_ref) @@ -322,14 +326,31 @@ handle_post_write(void *drcontext, instrlist_t *ilist, instr_t *where, reg_id_t * this. */ for (i = 0; i < instr_num_dsts(prev_instr); ++i) { - if (opnd_is_memory_reference(instr_get_dst(prev_instr, i))) { + const opnd_t dst = instr_get_dst(prev_instr, i); + if (opnd_is_memory_reference(dst)) { if (seen_memref) { DR_ASSERT_MSG(false, "Found inst with multiple memory destinations"); break; } + +#ifdef AARCH64 + /* TODO i#5844: Memory references involving SVE registers are not + * supported yet. To be implemented as part of scatter/gather work. 
+ */ + if (opnd_is_base_disp(dst) && + (reg_is_z(opnd_get_base(dst)) || reg_is_z(opnd_get_index(dst)))) { + if (!reported_sg_warning) { + dr_fprintf(STDERR, + "WARNING: Scatter/gather is not supported, results " + "will be inaccurate\n"); + reported_sg_warning = true; + } + continue; + } +#endif + seen_memref = true; - instrument_post_write(drcontext, ilist, where, instr_get_dst(prev_instr, i), - prev_instr, reg_addr); + instrument_post_write(drcontext, ilist, where, dst, prev_instr, reg_addr); } } } @@ -377,14 +398,29 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, instr_t *wher * we assume no instruction has multiple distinct memory destination operands. */ for (i = 0; i < instr_num_dsts(instr_operands); ++i) { - if (opnd_is_memory_reference(instr_get_dst(instr_operands, i))) { + const opnd_t dst = instr_get_dst(instr_operands, i); + if (opnd_is_memory_reference(dst)) { if (seen_memref) { DR_ASSERT_MSG(false, "Found inst with multiple memory destinations"); break; } - data->reg_addr = instrument_pre_write(drcontext, bb, where, - data->last_opcode, instr_operands, - instr_get_dst(instr_operands, i)); +#ifdef AARCH64 + /* TODO i#5844: Memory references involving SVE registers are not + * supported yet. To be implemented as part of scatter/gather work. + */ + if (opnd_is_base_disp(dst) && + (reg_is_z(opnd_get_base(dst)) || reg_is_z(opnd_get_index(dst)))) { + if (!reported_sg_warning) { + dr_fprintf(STDERR, + "WARNING: Scatter/gather is not supported, results " + "will be inaccurate\n"); + reported_sg_warning = true; + } + continue; + } +#endif + data->reg_addr = instrument_pre_write( + drcontext, bb, where, data->last_opcode, instr_operands, dst); seen_memref = true; } } diff --git a/clients/drcachesim/tests/burst_gencode.cpp b/clients/drcachesim/tests/burst_gencode.cpp index 0a7e4e392eb..ef3828dab16 100644 --- a/clients/drcachesim/tests/burst_gencode.cpp +++ b/clients/drcachesim/tests/burst_gencode.cpp @@ -198,6 +198,8 @@ class code_generator_t { #ifdef X86 replace = INSTR_CREATE_lahf(dc); #elif defined(AARCH64) + // OP_psb requires SPE feature. + proc_set_feature(FEATURE_SPE, true); replace = INSTR_CREATE_psb_csync(dc); #elif defined(ARM) replace = INSTR_CREATE_yield(dc); diff --git a/clients/drcachesim/tracer/tracer.cpp b/clients/drcachesim/tracer/tracer.cpp index 117fc317db3..1b3365cf5f8 100644 --- a/clients/drcachesim/tracer/tracer.cpp +++ b/clients/drcachesim/tracer/tracer.cpp @@ -175,6 +175,10 @@ static void *trace_thread_cb_user_data; static bool thread_filtering_enabled; bool attached_midway; +#ifdef AARCH64 +static bool reported_sg_warning = false; +#endif + static bool bbdup_instr_counting_enabled() { @@ -1304,18 +1308,50 @@ event_app_instruction(void *drcontext, void *tag, instrlist_t *bb, instr_t *inst /* insert code to add an entry for each memory reference opnd */ for (i = 0; i < instr_num_srcs(instr_operands); i++) { - if (opnd_is_memory_reference(instr_get_src(instr_operands, i))) { - adjust = instrument_memref( - drcontext, ud, bb, where, reg_ptr, adjust, instr_operands, - instr_get_src(instr_operands, i), i, false, pred, mode); + const opnd_t src = instr_get_src(instr_operands, i); + if (opnd_is_memory_reference(src)) { +#ifdef AARCH64 + /* TODO i#5844: Memory references involving SVE registers are not + * supported yet. To be implemented as part of scatter/gather work. 
+ */ + if (opnd_is_base_disp(src) && + (reg_is_z(opnd_get_base(src)) || reg_is_z(opnd_get_index(src)))) { + if (!reported_sg_warning) { + NOTIFY( + 0, + "WARNING: Scatter/gather is not supported, results will be " + "inaccurate\n"); + reported_sg_warning = true; + } + continue; + } +#endif + adjust = instrument_memref(drcontext, ud, bb, where, reg_ptr, adjust, + instr_operands, src, i, false, pred, mode); } } for (i = 0; i < instr_num_dsts(instr_operands); i++) { - if (opnd_is_memory_reference(instr_get_dst(instr_operands, i))) { - adjust = instrument_memref( - drcontext, ud, bb, where, reg_ptr, adjust, instr_operands, - instr_get_dst(instr_operands, i), i, true, pred, mode); + const opnd_t dst = instr_get_dst(instr_operands, i); + if (opnd_is_memory_reference(dst)) { +#ifdef AARCH64 + /* TODO i#5844: Memory references involving SVE registers are not + * supported yet. To be implemented as part of scatter/gather work. + */ + if (opnd_is_base_disp(dst) && + (reg_is_z(opnd_get_base(dst)) || reg_is_z(opnd_get_index(dst)))) { + if (!reported_sg_warning) { + NOTIFY( + 0, + "WARNING: Scatter/gather is not supported, results will be " + "inaccurate\n"); + reported_sg_warning = true; + } + continue; + } +#endif + adjust = instrument_memref(drcontext, ud, bb, where, reg_ptr, adjust, + instr_operands, dst, i, true, pred, mode); } } if (adjust != 0) diff --git a/clients/drdisas/drdisas.cpp b/clients/drdisas/drdisas.cpp index 89204513852..77c96e7fcfb 100644 --- a/clients/drdisas/drdisas.cpp +++ b/clients/drdisas/drdisas.cpp @@ -148,7 +148,7 @@ main(int argc, const char *argv[]) #endif #ifdef AARCH64 - dr_set_sve_vl(op_sve_vl.get_value()); + dr_set_sve_vector_length(op_sve_vl.get_value()); #endif // XXX i#4021: arm not yet supported. diff --git a/core/arch/aarch64/aarch64.asm b/core/arch/aarch64/aarch64.asm index 232247ec8ac..1fbd09c0406 100644 --- a/core/arch/aarch64/aarch64.asm +++ b/core/arch/aarch64/aarch64.asm @@ -47,14 +47,7 @@ START_FILE #endif /* sizeof(priv_mcontext_t) rounded up to a multiple of 16 */ -#define PRIV_MCONTEXT_SIZE 800 - -/* offset of priv_mcontext_t in dr_mcontext_t */ -#define PRIV_MCONTEXT_OFFSET 16 - -#if PRIV_MCONTEXT_OFFSET < 16 || PRIV_MCONTEXT_OFFSET % 16 != 0 -# error PRIV_MCONTEXT_OFFSET -#endif +#define PRIV_MCONTEXT_SIZE 3424 /* offsetof(spill_state_t, r0) */ #define spill_state_r0_OFFSET 0 @@ -76,7 +69,7 @@ START_FILE /* offsetof(priv_mcontext_t, simd) */ #define simd_OFFSET (16 * ARG_SZ*2 + 32) /* offsetof(dcontext_t, dstack) */ -#define dstack_OFFSET 0x368 +#define dstack_OFFSET 0xda8 /* offsetof(dcontext_t, is_exiting) */ #define is_exiting_OFFSET (dstack_OFFSET+1*ARG_SZ) /* offsetof(struct tlsdesc_t, arg) */ @@ -252,6 +245,9 @@ save_priv_mcontext_helper: st1 {v20.2d-v23.2d}, [x4], #64 st1 {v24.2d-v27.2d}, [x4], #64 st1 {v28.2d-v31.2d}, [x4], #64 + /* TODO i#5365: Save Z/P regs as well? Will require runtime check of + * ID_AA64PFR0_EL1 for FEAT_SVE. + */ ret DECLARE_EXPORTED_FUNC(dr_app_start) diff --git a/core/arch/aarch64/clean_call_opt.c b/core/arch/aarch64/clean_call_opt.c index 44b83b186ed..c95a4bd3047 100644 --- a/core/arch/aarch64/clean_call_opt.c +++ b/core/arch/aarch64/clean_call_opt.c @@ -183,8 +183,7 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci) memset(ci->reg_used, 0, sizeof(bool) * DR_NUM_GPR_REGS); ci->num_simd_used = 0; /* num_opmask_used is not applicable to ARM/AArch64. 
*/ - ASSERT(proc_num_simd_registers() == MCXT_NUM_SIMD_SLOTS); - memset(ci->simd_used, 0, sizeof(bool) * proc_num_simd_registers()); + memset(ci->simd_used, 0, sizeof(bool) * MCXT_NUM_SIMD_SLOTS); ci->write_flags = false; num_regparm = MIN(ci->num_args, NUM_REGPARM); @@ -200,7 +199,6 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci) } for (instr = instrlist_first(ilist); instr != NULL; instr = instr_get_next(instr)) { - /* General purpose registers */ for (i = 0; i < DR_NUM_GPR_REGS; i++) { reg_id_t reg = DR_REG_START_GPR + (reg_id_t)i; @@ -213,9 +211,12 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci) } } - /* SIMD register usage */ - for (i = 0; i < proc_num_simd_registers(); i++) { - if (!ci->simd_used[i] && instr_uses_reg(instr, (DR_REG_Q0 + (reg_id_t)i))) { + /* SIMD/SVE register usage. */ + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + if (!ci->simd_used[i] && + instr_uses_reg(instr, + (proc_has_feature(FEATURE_SVE) ? DR_REG_Z0 : DR_REG_Q0) + + (reg_id_t)i)) { LOG(THREAD, LOG_CLEANCALL, 2, "CLEANCALL: callee " PFX " uses VREG%d at " PFX "\n", ci->start, i, instr_get_app_pc(instr)); @@ -224,6 +225,32 @@ analyze_callee_regs_usage(dcontext_t *dcontext, callee_info_t *ci) } } + if (proc_has_feature(FEATURE_SVE)) { + /* SVE predicate register usage */ + for (i = MCXT_NUM_SIMD_SVE_SLOTS; + i < (MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS); i++) { + const uint reg_idx = i - MCXT_NUM_SIMD_SVE_SLOTS; + if (!ci->simd_used[i] && + instr_uses_reg(instr, DR_REG_P0 + (reg_id_t)reg_idx)) { + LOG(THREAD, LOG_CLEANCALL, 2, + "CLEANCALL: callee " PFX " uses P%d at " PFX "\n", ci->start, + reg_idx, instr_get_app_pc(instr)); + ci->simd_used[i] = true; + ci->num_simd_used++; + } + } + + /* SVE FFR register usage */ + const uint ffr_index = MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS; + if (!ci->simd_used[ffr_index] && instr_uses_reg(instr, DR_REG_FFR)) { + LOG(THREAD, LOG_CLEANCALL, 2, + "CLEANCALL: callee " PFX " uses FFR at " PFX "\n", ci->start, + instr_get_app_pc(instr)); + ci->simd_used[ffr_index] = true; + ci->num_simd_used++; + } + } + /* NZCV register usage */ if (!ci->write_flags && TESTANY(EFLAGS_WRITE_ARITH, @@ -476,7 +503,7 @@ insert_inline_reg_save(dcontext_t *dcontext, clean_call_info_t *cci, instrlist_t insert_get_mcontext_base(dcontext, ilist, where, ci->spill_reg); insert_save_inline_registers(dcontext, ilist, where, cci->reg_skip, DR_REG_START_GPR, - true, (void *)ci); + GPR_REG_TYPE, (void *)ci); /* Save nzcv */ if (!cci->skip_save_flags && ci->write_flags) { @@ -512,7 +539,7 @@ insert_inline_reg_restore(dcontext_t *dcontext, clean_call_info_t *cci, } insert_restore_inline_registers(dcontext, ilist, where, cci->reg_skip, DR_REG_X0, - true, (void *)ci); + GPR_REG_TYPE, (void *)ci); /* Restore reg used for unprotected_context_t pointer. 
*/ PRE(ilist, where, diff --git a/core/arch/aarch64/emit_utils.c b/core/arch/aarch64/emit_utils.c index de66c0f4a4b..db64f5875ec 100644 --- a/core/arch/aarch64/emit_utils.c +++ b/core/arch/aarch64/emit_utils.c @@ -574,7 +574,7 @@ void append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) { int i; - /* add x1, x(dcxt), #(off) */ + /* add x1, x(dcxt), #(offset simd) */ APP(ilist, XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X1), opnd_create_reg(REG_DCXT), @@ -587,6 +587,67 @@ append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) opnd_create_reg(DR_REG_Q0 + i + 1), opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, i * 16, OPSZ_32))); } + if (proc_has_feature(FEATURE_SVE)) { + for (i = 0; i < 32; i++) { + /* ldr z(i), [x1, #(i mul vl)] + * From the SVE manual: + * "Load a vector register from a memory address generated by a + * 64-bit scalar base, plus an immediate offset in the range -256 + * to 255 which is multiplied by the current vector register size + * in bytes." + */ + APP(ilist, + INSTR_CREATE_ldr( + dcontext, opnd_create_reg(DR_REG_Z0 + i), + opnd_create_base_disp( + DR_REG_X1, DR_REG_NULL, 0, i * proc_get_vector_length_bytes(), + opnd_size_from_bytes(proc_get_vector_length_bytes())))); + } + /* add x1, x(dcxt), #(offset svep) */ + APP(ilist, + XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X1), + opnd_create_reg(REG_DCXT), + OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, svep)))); + /* No need to load DR_REG_P15 because it will be used as a temporary + * register for FFR load below, then restored from svep afterwards. + */ + for (i = 0; i < 15; i++) { + /* ldr p(i), [x1, #(i mul vl)] */ + APP(ilist, + INSTR_CREATE_ldr( + dcontext, opnd_create_reg(DR_REG_P0 + i), + opnd_create_base_disp( + DR_REG_X1, DR_REG_NULL, 0, + i * (proc_get_vector_length_bytes() / 8), + opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + } + /* There is no load instruction for the first-fault register (FFR). Use + * a temporary predicate register to load: + * add x2, x(dcxt), #(offset ffr) + * ldr p15, [x2, #(ffr)] + * wrffr p15.b + * ldr p15, [x1, #(15 mul vl)] + */ + APP(ilist, + XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X2), + opnd_create_reg(REG_DCXT), + OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr)))); + APP(ilist, + INSTR_CREATE_ldr( + dcontext, opnd_create_reg(DR_REG_P15), + opnd_create_base_disp( + DR_REG_X2, DR_REG_NULL, 0, 0, + opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + APP(ilist, + INSTR_CREATE_wrffr_sve(dcontext, + opnd_create_reg_element_vector(DR_REG_P15, OPSZ_1))); + APP(ilist, + INSTR_CREATE_ldr( + dcontext, opnd_create_reg(DR_REG_P15), + opnd_create_base_disp( + DR_REG_X1, DR_REG_NULL, 0, 15 * (proc_get_vector_length_bytes() / 8), + opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + } } /* Append instructions to restore gpr on fcache enter, to be executed @@ -730,13 +791,78 @@ append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute) opnd_create_reg(REG_DCXT), OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, simd)))); for (i = 0; i < 32; i += 2) { - /* stp q(i), q(i + 1), [x1, #(i * 16)] */ + /* stp q(i), q(i + 1), [x1, #(i * 16)] + * From the AArch64 manual: + * "The signed immediate byte offset is a multiple of 16 in the range + * -1024 to 1008, defaulting to 0 and encoded in the imm7 field as + * /16." 
+ */ APP(ilist, INSTR_CREATE_stp( dcontext, opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, i * 16, OPSZ_32), opnd_create_reg(DR_REG_Q0 + i), opnd_create_reg(DR_REG_Q0 + i + 1))); } + if (proc_has_feature(FEATURE_SVE)) { + for (i = 0; i < 32; i++) { + /* str z(i), [x1, #(i mul vl)] + * "Store a vector register to a memory address generated by a + * 64-bit scalar base, plus an immediate offset in the range -256 + * to 255 which is multiplied by the current vector register size + * in bytes." + */ + APP(ilist, + INSTR_CREATE_str( + dcontext, + opnd_create_base_disp( + DR_REG_X1, DR_REG_NULL, 0, i * proc_get_vector_length_bytes(), + opnd_size_from_bytes(proc_get_vector_length_bytes())), + opnd_create_reg(DR_REG_Z0 + i))); + } + /* add x1, x(dcxt), #(off) */ + APP(ilist, + XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X1), + opnd_create_reg(REG_DCXT), + OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, svep)))); + for (i = 0; i < 16; i++) { + /* str p(i), [x1, #(i mul vl)] */ + APP(ilist, + INSTR_CREATE_str( + dcontext, + opnd_create_base_disp( + DR_REG_X1, DR_REG_NULL, 0, + i * (proc_get_vector_length_bytes() / 8), + opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)), + opnd_create_reg(DR_REG_P0 + i))); + } + /* There is no store instruction for the first-fault register (FFR). Use + * a temporary predicate register to store: + * rdffr p15.b + * add x2, x(dcxt), #(offset ffr) + * str p15, [x2, #(ffr)] + * ldr p15, [x1, #(15 mul vl)] + */ + APP(ilist, + INSTR_CREATE_rdffr_sve(dcontext, + opnd_create_reg_element_vector(DR_REG_P15, OPSZ_1))); + APP(ilist, + XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X2), + opnd_create_reg(REG_DCXT), + OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr)))); + APP(ilist, + INSTR_CREATE_str( + dcontext, + opnd_create_base_disp( + DR_REG_X2, DR_REG_NULL, 0, 0, + opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)), + opnd_create_reg(DR_REG_P15))); + APP(ilist, + INSTR_CREATE_ldr( + dcontext, opnd_create_reg(DR_REG_P15), + opnd_create_base_disp( + DR_REG_X1, DR_REG_NULL, 0, 15 * (proc_get_vector_length_bytes() / 8), + opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)))); + } } /* Scratch reg0 is holding exit stub. */ diff --git a/core/arch/aarch64/proc.c b/core/arch/aarch64/proc.c index 7538c70e05d..813e28257b4 100644 --- a/core/arch/aarch64/proc.c +++ b/core/arch/aarch64/proc.c @@ -38,6 +38,8 @@ static int num_simd_saved; static int num_simd_registers; +static int num_svep_registers; +static int num_ffr_registers; static int num_opmask_registers; #ifndef DR_HOST_NOT_TARGET @@ -101,12 +103,43 @@ get_processor_specific_info(void) cpu_info.features.flags_aa64zfr0 = isa_features[AA64ZFR0]; cpu_info.features.flags_aa64pfr1 = isa_features[AA64PFR1]; -# if !defined(DR_HOST_NOT_TARGET) && defined(SVE) - /* TODO i#3044: Vector length will be set by reading value from h/w. */ - CLIENT_ASSERT(false, "TODO i#3044: SVE requires initialisation of vector length!"); -# elif !defined(STANDALONE_DECODER) || defined(DR_HOST_NOT_TARGET) - /* Set SVE vector length for unit tests. */ - dr_set_sve_vl(256); + /* The SVE vector length is set to: + * - A value read from the host hardware. + * or: + * - 32 bytes, 256 bits. + * Which of the above depends on: + * - SVE or non-SVE AArch64 or x86 host h/w. + * and: + * - Release or development test build. 
+     */
+# if !defined(DR_HOST_NOT_TARGET)
+    if (proc_has_feature(FEATURE_SVE)) {
+#  if !defined(BUILD_TESTS)
+        uint64 vl;
+        /* This RDVL instruction is inserted as raw hex because we don't build
+         * with SVE enabled: i.e. not -march=armv8-a+sve, so that we can run a
+         * single DynamoRIO release on both SVE and non-SVE h/w.
+         * TODO i#5365: Ideally this should be generated by INSTR_CREATE_rdvl()
+         * and executed at startup time with other initialisation code.
+         */
+        asm(".inst 0x04bf5020\n" /* rdvl x0, #1 */
+            "mov %0, x0"
+            : "=r"(vl)
+            :
+            : "x0");
+        cpu_info.sve_vector_length_bytes = vl;
+        dr_set_sve_vector_length(vl * 8);
+#  else
+        cpu_info.sve_vector_length_bytes = 32;
+        dr_set_sve_vector_length(256);
+#  endif
+    } else {
+        cpu_info.sve_vector_length_bytes = 32;
+        dr_set_sve_vector_length(256);
+    }
+# else
+    /* Set SVE vector length for unit testing the off-line decoder. */
+    dr_set_sve_vector_length(256);
 # endif
     }
 # endif
@@ -120,8 +153,10 @@
 void
 proc_init_arch(void)
 {
-    num_simd_saved = MCXT_NUM_SIMD_SLOTS;
-    num_simd_registers = MCXT_NUM_SIMD_SLOTS;
+    num_simd_saved = MCXT_NUM_SIMD_SVE_SLOTS;
+    num_simd_registers = MCXT_NUM_SIMD_SVE_SLOTS;
+    num_svep_registers = MCXT_NUM_SVEP_SLOTS;
+    num_ffr_registers = MCXT_NUM_FFR_SLOTS;
     num_opmask_registers = MCXT_NUM_OPMASK_SLOTS;
 
     /* When DR_HOST_NOT_TARGET, get_cache_line_size returns false and does
@@ -198,54 +233,81 @@
 #define GET_FEAT_VAL(FEATURE) (((ushort)FEATURE) & 0x000F)
 #define GET_FEAT_NSFLAG(FEATURE) ((((ushort)FEATURE) & 0x8000) >> 15)
 
+void
+proc_set_feature(feature_bit_t f, bool enable)
+{
+    uint64 *freg_val = 0;
+    ushort feat_nibble = GET_FEAT_NIBPOS(f);
+    uint64 feat_nsflag = GET_FEAT_NSFLAG(f);
+    uint64 feat_val = GET_FEAT_VAL(f);
+
+    feature_reg_idx_t feat_reg = GET_FEAT_REG(f);
+    switch (feat_reg) {
+    case AA64ISAR0: {
+        freg_val = &cpu_info.features.flags_aa64isar0;
+        break;
+    }
+    case AA64ISAR1: {
+        freg_val = &cpu_info.features.flags_aa64isar1;
+        break;
+    }
+    case AA64PFR0: {
+        freg_val = &cpu_info.features.flags_aa64pfr0;
+        break;
+    }
+    case AA64MMFR1: {
+        freg_val = &cpu_info.features.flags_aa64mmfr1;
+        break;
+    }
+    case AA64DFR0: {
+        freg_val = &cpu_info.features.flags_aa64dfr0;
+        break;
+    }
+    case AA64ZFR0: {
+        freg_val = &cpu_info.features.flags_aa64zfr0;
+        break;
+    }
+    case AA64PFR1: {
+        freg_val = &cpu_info.features.flags_aa64pfr1;
+        break;
+    }
+    default: CLIENT_ASSERT(false, "proc_set_feature: invalid feature register");
+    }
+
+    /* Clear the current feature state. */
+    *freg_val &= ~(0xFULL << (feat_nibble * 4));
+    if (enable) {
+        /* Write the feature value into the feature nibble. */
+        *freg_val |= feat_val << (feat_nibble * 4);
+    } else if (feat_nsflag == 0xF) {
+        /* If the not-set flag is 0xF, then that needs manually setting.
+         */
+        *freg_val |= feat_nsflag << (feat_nibble * 4);
+    }
+}
+
+void
+enable_all_test_cpu_features()
+{
+    const feature_bit_t features[] = {
+        FEATURE_LSE,    FEATURE_RDM,        FEATURE_FP16,    FEATURE_DotProd,
+        FEATURE_SVE,    FEATURE_LOR,        FEATURE_FHM,     FEATURE_SM3,
+        FEATURE_SM4,    FEATURE_SHA512,     FEATURE_SHA3,    FEATURE_RAS,
+        FEATURE_SPE,    FEATURE_PAUTH,      FEATURE_LRCPC,   FEATURE_LRCPC2,
+        FEATURE_BF16,   FEATURE_I8MM,       FEATURE_F64MM,   FEATURE_FlagM,
+        FEATURE_JSCVT,  FEATURE_DPB,        FEATURE_DPB2,    FEATURE_SVE2,
+        FEATURE_SVEAES, FEATURE_SVEBitPerm, FEATURE_SVESHA3, FEATURE_SVESM4,
+        FEATURE_MTE
+    };
+    for (int i = 0; i < BUFFER_SIZE_ELEMENTS(features); ++i) {
+        proc_set_feature(features[i], true);
+    }
+    dr_set_sve_vector_length(256);
+}
+
 bool
 proc_has_feature(feature_bit_t f)
 {
 #ifndef DR_HOST_NOT_TARGET
-    /* Pretend features are supported for codec tests run on h/w which does not
-     * support all features.
-     */
-# if defined(BUILD_TESTS)
-    switch (f) {
-    case FEATURE_LSE:
-    case FEATURE_RDM:
-    case FEATURE_FP16:
-    case FEATURE_DotProd:
-    case FEATURE_SVE:
-    case FEATURE_LOR:
-    case FEATURE_FHM:
-    case FEATURE_SM3:
-    case FEATURE_SM4:
-    case FEATURE_SHA512:
-    case FEATURE_SHA3:
-    case FEATURE_RAS:
-    case FEATURE_SPE:
-    case FEATURE_PAUTH:
-    case FEATURE_LRCPC:
-    case FEATURE_LRCPC2:
-    case FEATURE_BF16:
-    case FEATURE_I8MM:
-    case FEATURE_F64MM:
-    case FEATURE_FlagM:
-    case FEATURE_JSCVT:
-    case FEATURE_DPB:
-    case FEATURE_DPB2:
-    case FEATURE_SVE2:
-    case FEATURE_SVEAES:
-    case FEATURE_SVEBitPerm:
-    case FEATURE_SVESHA3:
-    case FEATURE_SVESM4:
-    case FEATURE_MTE: return true;
-
-    case FEATURE_AESX:
-    case FEATURE_PMULL:
-    case FEATURE_SHA1:
-    case FEATURE_SHA256:
-    case FEATURE_CRC32:
-    case FEATURE_FlagM2:
-    case FEATURE_RNG: break;
-    }
-# endif
     ushort feat_nibble, feat_val, freg_nibble, feat_nsflag;
     uint64 freg_val = 0;
 
@@ -335,7 +397,8 @@ DR_API
 int
 proc_num_simd_registers(void)
 {
-    return num_simd_registers;
+    return num_simd_registers +
+        (proc_has_feature(FEATURE_SVE) ? (num_svep_registers + num_ffr_registers) : 0);
 }
 
 DR_API
diff --git a/core/arch/aarchxx/mangle.c b/core/arch/aarchxx/mangle.c
index 171636724b1..9806b65a2ab 100644
--- a/core/arch/aarchxx/mangle.c
+++ b/core/arch/aarchxx/mangle.c
@@ -103,29 +103,38 @@ insert_clear_eflags(dcontext_t *dcontext, clean_call_info_t *cci, instrlist_t *i
 #ifdef AARCH64
 /* Maximum positive immediate offset for STP/LDP with 64 bit registers. */
 # define MAX_STP_OFFSET 504
+/* Maximum positive immediate offset for SVE STR/LDR with Z/P registers. */
+# define MAX_SVE_STR_OFFSET 255
 
 /* Creates a memory reference for registers saved/restored to memory. */
 static opnd_t
-create_base_disp_for_save_restore(uint base_reg, bool is_single_reg, bool is_gpr,
+create_base_disp_for_save_restore(uint base_reg, bool is_single_reg, reg_type_t rtype,
                                   uint num_saved, callee_info_t *ci)
 {
    /* opsz depends on the kind of register and whether a single register or
     * a pair of registers is saved/restored using stp/ldp.
     */
-    uint opsz;
-    if (is_gpr) {
-        if (is_single_reg)
-            opsz = OPSZ_8;
-        else
-            opsz = OPSZ_16;
-    } else {
-        if (is_single_reg)
-            opsz = OPSZ_16;
-        else
-            opsz = OPSZ_32;
+    uint opsz = OPSZ_NA;
+    uint offset = 0;
+    switch (rtype) {
+    case GPR_REG_TYPE:
+        opsz = is_single_reg ? OPSZ_8 : OPSZ_16;
+        offset = num_saved * sizeof(reg_t);
+        break;
+    case SIMD_REG_TYPE:
+        opsz = is_single_reg ?
OPSZ_16 : OPSZ_32; + offset = num_saved * 16; + break; + case SVE_ZREG_TYPE: + opsz = opnd_size_from_bytes(proc_get_vector_length_bytes()); + offset = num_saved * proc_get_vector_length_bytes(); + break; + case SVE_PREG_TYPE: + opsz = opnd_size_from_bytes(proc_get_vector_length_bytes() / 8); + offset = num_saved * (proc_get_vector_length_bytes() / 8); + break; + default: ASSERT_NOT_REACHED(); } - - uint offset = num_saved * (is_gpr ? sizeof(reg_t) : sizeof(dr_simd_t)); return opnd_create_base_disp(base_reg, DR_REG_NULL, 0, offset, opsz); } @@ -144,15 +153,17 @@ create_load_or_store_instr(dcontext_t *dcontext, reg_id_t reg, opnd_t mem, bool * is odd. Optionally takes reg_skip into account. */ static void -insert_save_or_restore_registers(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, - bool *reg_skip, reg_id_t base_reg, reg_id_t first_reg, - bool save, bool is_gpr, - opnd_t (*get_mem_opnd)(uint base_reg, bool is_single_reg, - bool is_gpr, uint num_saved, - callee_info_t *ci), - callee_info_t *ci) +insert_save_or_restore_gpr_simd_registers( + dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, bool *reg_skip, + reg_id_t base_reg, reg_id_t first_reg, bool save, reg_type_t rtype, + opnd_t (*get_mem_opnd)(uint base_reg, bool is_single_reg, reg_type_t rtype, + uint num_saved, callee_info_t *ci), + callee_info_t *ci) { - uint i, reg1 = UINT_MAX, num_regs = is_gpr ? 30 : 32; + ASSERT(rtype == GPR_REG_TYPE || rtype == SIMD_REG_TYPE); + + uint i, reg1 = UINT_MAX, + num_regs = (rtype == GPR_REG_TYPE) ? 30 : MCXT_NUM_SIMD_SVE_SLOTS; uint saved_regs = 0; instr_t *new_instr; /* Use stp/ldp to save/restore as many register pairs to memory, skipping @@ -166,7 +177,7 @@ insert_save_or_restore_registers(dcontext_t *dcontext, instrlist_t *ilist, instr reg1 = i; else { opnd_t mem1 = - get_mem_opnd(base_reg, false /* is_single_reg */, is_gpr, + get_mem_opnd(base_reg, /*is_single_reg=*/false, rtype, /* When creating save/restore instructions * for inlining, we need the register id * to compute the address. @@ -180,7 +191,7 @@ insert_save_or_restore_registers(dcontext_t *dcontext, instrlist_t *ilist, instr create_load_or_store_instr(dcontext, first_reg + reg1, mem1, save)); opnd_t mem2 = - get_mem_opnd(base_reg, false /* is_single_reg */, is_gpr, + get_mem_opnd(base_reg, /*is_single_reg=*/false, rtype, /* When creating save/restore instructions * for inlining, we need the register id * to compute the address. @@ -211,34 +222,173 @@ insert_save_or_restore_registers(dcontext_t *dcontext, instrlist_t *ilist, instr */ if (reg1 != UINT_MAX) { opnd_t mem = - get_mem_opnd(base_reg, true /* is_single_reg */, is_gpr, + get_mem_opnd(base_reg, /*is_single_reg=*/true, rtype, ci != NULL ? 
first_reg + (reg_id_t)reg1 : saved_regs, ci);
         PRE(ilist, instr,
             create_load_or_store_instr(dcontext, first_reg + reg1, mem, save));
     }
 }
 
+static void
+insert_save_or_restore_svep_registers(
+    dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, bool *reg_skip,
+    reg_id_t base_reg, bool save,
+    opnd_t (*get_mem_opnd)(uint base_reg, bool is_single_reg, reg_type_t rtype,
+                           uint num_saved, callee_info_t *ci),
+    callee_info_t *ci)
+{
+    uint i, saved_regs = 0;
+    for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) {
+        if (reg_skip != NULL && reg_skip[MCXT_NUM_SIMD_SVE_SLOTS + i])
+            continue;
+
+        opnd_t mem =
+            get_mem_opnd(base_reg, /*is_single_reg=*/true, SVE_PREG_TYPE, saved_regs, ci);
+        /* disp should never be greater than MAX_SVE_STR_OFFSET because it
+         * is the immediate multiplied by the current vector register size
+         * in bytes: STR <Pt>, [<Xn|SP>{, #<imm>, MUL VL}] and we only go up
+         * num_regs registers.
+         */
+        ASSERT(opnd_get_disp(mem) / proc_get_vector_length_bytes() <= MAX_SVE_STR_OFFSET);
+        PRE(ilist, instr, create_load_or_store_instr(dcontext, DR_REG_P0 + i, mem, save));
+        saved_regs++;
+    }
+}
+
+static void
+insert_save_or_restore_sve_registers(
+    dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, bool *reg_skip,
+    reg_id_t base_reg, reg_id_t first_reg, bool save, reg_type_t rtype,
+    opnd_t (*get_mem_opnd)(uint base_reg, bool is_single_reg, reg_type_t rtype,
+                           uint num_saved, callee_info_t *ci),
+    callee_info_t *ci)
+{
+    ASSERT(rtype == SVE_ZREG_TYPE);
+    ASSERT(first_reg == DR_REG_Z0);
+    ASSERT(MCXT_NUM_FFR_SLOTS == 1);
+
+    // SVE Z registers.
+    uint i, saved_regs = 0;
+    for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) {
+        if (reg_skip != NULL && reg_skip[i])
+            continue;
+
+        opnd_t mem =
+            get_mem_opnd(base_reg, /*is_single_reg=*/true, SVE_ZREG_TYPE, saved_regs, ci);
+        /* disp should never be greater than MAX_SVE_STR_OFFSET because it
+         * is the immediate multiplied by the current vector register size
+         * in bytes: STR <Zt>, [<Xn|SP>{, #<imm>, MUL VL}] and we only go up
+         * MCXT_NUM_SIMD_SVE_SLOTS registers.
+         */
+        ASSERT(opnd_get_disp(mem) / proc_get_vector_length_bytes() <= MAX_SVE_STR_OFFSET);
+        PRE(ilist, instr, create_load_or_store_instr(dcontext, DR_REG_Z0 + i, mem, save));
+        saved_regs++;
+    }
+
+    /* add base_reg, base_reg, #(SVE register offset) */
+    PRE(ilist, instr,
+        XINST_CREATE_add(dcontext, opnd_create_reg(base_reg),
+                         OPND_CREATE_INT16(MCXT_NUM_SIMD_SVE_SLOTS * sizeof(dr_simd_t))));
+
+    /* The FFR register cannot be loaded or stored directly, as the ldr/str
+     * register operand has to be a predicate, which means that the FFR save
+     * has to come after the predicate saves, and vice versa when loading.
+     *
+     * Save Seq:
+     * - Save preds
+     * - Save FFR to P15
+     * - Store P15 to x0 (offset 16 to skip past preds)
+     *
+     * Load Seq:
+     * - Read x0 to P15 (offset 16 to skip past preds)
+     * - Write P15 to FFR
+     * - Restore preds
+     */
+    const bool handle_ffr =
+        reg_skip == NULL || !reg_skip[MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS];
+    // SVE P and FFR registers.
+    if (save) {
+        insert_save_or_restore_svep_registers(dcontext, ilist, instr, reg_skip, base_reg,
+                                              save, get_mem_opnd, ci);
+
+        if (handle_ffr) {
+            PRE(ilist, instr,
+                INSTR_CREATE_rdffr_sve(
+                    dcontext, opnd_create_reg_element_vector(DR_REG_P15, OPSZ_1)));
+            opnd_t mem =
+                get_mem_opnd(base_reg, /*is_single_reg=*/true, SVE_PREG_TYPE, 16, ci);
+            PRE(ilist, instr,
+                create_load_or_store_instr(dcontext, DR_REG_P15, mem, save));
+        }
+    } else {
+        if (handle_ffr) {
+            opnd_t mem =
+                get_mem_opnd(base_reg, /*is_single_reg=*/true, SVE_PREG_TYPE, 16, ci);
+            PRE(ilist, instr,
+                create_load_or_store_instr(dcontext, DR_REG_P15, mem, save));
+            PRE(ilist, instr,
+                INSTR_CREATE_wrffr_sve(
+                    dcontext, opnd_create_reg_element_vector(DR_REG_P15, OPSZ_1)));
+        }
+
+        insert_save_or_restore_svep_registers(dcontext, ilist, instr, reg_skip, base_reg,
+                                              save, get_mem_opnd, ci);
+    }
+}
+
+static void
+insert_save_or_restore_registers(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr,
+                                 bool *reg_skip, reg_id_t base_reg, reg_id_t first_reg,
+                                 bool save, reg_type_t rtype,
+                                 opnd_t (*get_mem_opnd)(uint base_reg, bool is_single_reg,
+                                                        reg_type_t rtype, uint num_saved,
+                                                        callee_info_t *ci),
+                                 callee_info_t *ci)
+{
+    switch (rtype) {
+    case GPR_REG_TYPE:
+    case SIMD_REG_TYPE:
+        insert_save_or_restore_gpr_simd_registers(dcontext, ilist, instr, reg_skip,
+                                                  base_reg, first_reg, save, rtype,
+                                                  get_mem_opnd, ci);
+        break;
+    case SVE_ZREG_TYPE:
+        insert_save_or_restore_sve_registers(dcontext, ilist, instr, reg_skip, base_reg,
+                                             first_reg, save, rtype, get_mem_opnd, ci);
+        break;
+    case SVE_PREG_TYPE:
+        /* SVE Z, P and FFR registers are saved/restored sequentially in
+         * insert_save_or_restore_sve_registers(). At this top-level call layer
+         * we use SVE_ZREG_TYPE to indicate all of the SVE register bank.
+ */ + CLIENT_ASSERT(false, + "internal error, use SVE_ZREG_TYPE for top level save/restore of " + "SVE registers."); + default: ASSERT_NOT_REACHED(); + } +} + static void insert_save_registers(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, - bool *reg_skip, reg_id_t base_reg, reg_id_t first_reg, bool is_gpr) + bool *reg_skip, reg_id_t base_reg, reg_id_t first_reg, + reg_type_t rtype) { insert_save_or_restore_registers(dcontext, ilist, instr, reg_skip, base_reg, - first_reg, true /* save */, is_gpr, + first_reg, true /* save */, rtype, create_base_disp_for_save_restore, NULL); } static void insert_restore_registers(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, bool *reg_skip, reg_id_t base_reg, reg_id_t first_reg, - bool is_gpr) + reg_type_t rtype) { insert_save_or_restore_registers(dcontext, ilist, instr, reg_skip, base_reg, - first_reg, false /* restore */, is_gpr, + first_reg, false /* restore */, rtype, create_base_disp_for_save_restore, NULL); } static opnd_t -inline_get_mem_opnd(uint base_reg, bool is_single_reg, bool is_gpr, uint reg_id, +inline_get_mem_opnd(uint base_reg, bool is_single_reg, reg_type_t rtype, uint reg_id, callee_info_t *ci) { return callee_info_slot_opnd(ci, SLOT_REG, reg_id); @@ -246,19 +396,21 @@ inline_get_mem_opnd(uint base_reg, bool is_single_reg, bool is_gpr, uint reg_id, void insert_save_inline_registers(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, - bool *reg_skip, reg_id_t first_reg, bool is_gpr, void *ci) + bool *reg_skip, reg_id_t first_reg, reg_type_t rtype, + void *ci) { insert_save_or_restore_registers(dcontext, ilist, instr, reg_skip, 0, first_reg, - true /* save */, is_gpr, inline_get_mem_opnd, + true /* save */, rtype, inline_get_mem_opnd, (callee_info_t *)ci); } void insert_restore_inline_registers(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, - bool *reg_skip, reg_id_t first_reg, bool is_gpr, void *ci) + bool *reg_skip, reg_id_t first_reg, reg_type_t rtype, + void *ci) { insert_save_or_restore_registers(dcontext, ilist, instr, reg_skip, 0, first_reg, - false /* restore */, is_gpr, inline_get_mem_opnd, + false /* restore */, rtype, inline_get_mem_opnd, (callee_info_t *)ci); } @@ -283,12 +435,9 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, _IF_AARCH64(bool out_of_line)) { uint dstack_offs = 0; -#ifdef AARCH64 - uint max_offs; -#endif + if (cci == NULL) cci = &default_clean_call_info; - ASSERT(proc_num_simd_registers() == MCXT_NUM_SIMD_SLOTS); if (cci->preserve_mcontext || cci->num_simd_skip != proc_num_simd_registers()) { /* FIXME i#1551: once we add skipping of regs, need to keep shape here. * Also, num_opmask_skip is not applicable to ARM/AArch64. @@ -296,6 +445,11 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, } /* FIXME i#1551: once we have cci->num_simd_skip, skip this if possible */ #ifdef AARCH64 + ASSERT(proc_num_simd_registers() == + (MCXT_NUM_SIMD_SVE_SLOTS + + (proc_has_feature(FEATURE_SVE) ? (MCXT_NUM_SVEP_SLOTS + MCXT_NUM_FFR_SLOTS) + : 0))); + /* X0 is used to hold the stack pointer. */ cci->reg_skip[DR_REG_X0 - DR_REG_START_GPR] = false; /* X1 and X2 are used to save and restore the status and control registers. */ @@ -304,8 +458,6 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, /* X11 is used to calculate the target address of the clean call. 
*/ cci->reg_skip[DR_REG_X11 - DR_REG_START_GPR] = false; - max_offs = get_clean_call_switch_stack_size(); - /* For out-of-line clean calls, the stack pointer is adjusted before jumping * to this code. */ @@ -313,16 +465,16 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, /* sub sp, sp, #clean_call_switch_stack_size */ PRE(ilist, instr, XINST_CREATE_sub(dcontext, opnd_create_reg(DR_REG_SP), - OPND_CREATE_INT16(max_offs))); + OPND_CREATE_INT16(get_clean_call_switch_stack_size()))); } /* Push GPRs. */ insert_save_registers(dcontext, ilist, instr, cci->reg_skip, DR_REG_SP, DR_REG_X0, - true /* is_gpr */); + GPR_REG_TYPE); dstack_offs += 32 * XSP_SZ; - /* mov x0, sp */ + /* mov x0, sp (add %sp $0x0000 lsl $0x00 -> %x0) */ PRE(ilist, instr, XINST_CREATE_move(dcontext, opnd_create_reg(DR_REG_X0), opnd_create_reg(DR_REG_SP))); @@ -339,7 +491,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, opnd_create_reg(DR_REG_X30), opnd_create_reg(DR_REG_X0))); } - /* add x0, x0, #dstack_offs */ + /* add x0, x0, #dstack_offs (add %x0 $0x0100 lsl $0x00 -> %x0) */ PRE(ilist, instr, XINST_CREATE_add(dcontext, opnd_create_reg(DR_REG_X0), OPND_CREATE_INT16(dstack_offs))); @@ -347,6 +499,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, /* save the push_pc operand to the priv_mcontext_t.pc field */ if (!(cci->skip_save_flags)) { if (opnd_is_immed_int(push_pc)) { + /* movz $0x0000 lsl $0x00 -> %x1 */ PRE(ilist, instr, XINST_CREATE_load_int(dcontext, opnd_create_reg(DR_REG_X1), push_pc)); } else { @@ -359,7 +512,7 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, OPND_CREATE_MEM64(DR_REG_SP, REG_OFFSET(push_pc_reg)))); } - /* str x1, [sp, #dstack_offset] */ + /* str x1, [sp, #dstack_offset] (str %x1 -> +0x0100(%sp)[8byte]) */ PRE(ilist, instr, INSTR_CREATE_str(dcontext, OPND_CREATE_MEM64(DR_REG_SP, dstack_offs), opnd_create_reg(DR_REG_X1))); @@ -368,24 +521,25 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, dstack_offs += XSP_SZ; /* Save flag values using x1, x2. */ - /* mrs x1, nzcv */ + /* mrs x1, nzcv (mrs %nzcv -> %x1) + */ PRE(ilist, instr, INSTR_CREATE_mrs(dcontext, opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_NZCV))); - /* mrs x2, fpcr */ + /* mrs x2, fpcr (mrs %fpcr -> %x2) */ PRE(ilist, instr, INSTR_CREATE_mrs(dcontext, opnd_create_reg(DR_REG_X2), opnd_create_reg(DR_REG_FPCR))); - /* stp w1, w2, [x0, #8] */ + /* stp w1, w2, [x0, #8] (stp %w1 %w2 -> +0x08(%x0)[8byte]) */ PRE(ilist, instr, INSTR_CREATE_stp(dcontext, OPND_CREATE_MEM64(DR_REG_X0, 8), opnd_create_reg(DR_REG_W1), opnd_create_reg(DR_REG_W2))); - /* mrs x1, fpsr */ + /* mrs x1, fpsr (mrs %fpsr -> %x1) */ PRE(ilist, instr, INSTR_CREATE_mrs(dcontext, opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_FPSR))); - /* str w1, [x0, #16] */ + /* str w1, [x0, #16] (str %w1 -> +0x10(%x0)[4byte]) */ PRE(ilist, instr, INSTR_CREATE_str(dcontext, OPND_CREATE_MEM32(DR_REG_X0, 16), opnd_create_reg(DR_REG_W1))); @@ -401,12 +555,17 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, XINST_CREATE_add(dcontext, opnd_create_reg(DR_REG_X0), OPND_CREATE_INT16(dstack_offs - 32 * XSP_SZ))); - /* Push SIMD registers. 
*/ - insert_save_registers(dcontext, ilist, instr, cci->simd_skip, DR_REG_X0, DR_REG_Q0, - false /* is_gpr */); + if (proc_has_feature(FEATURE_SVE)) { + /* Save the SVE regs */ + insert_save_registers(dcontext, ilist, instr, cci->simd_skip, DR_REG_X0, + DR_REG_Z0, SVE_ZREG_TYPE); + } else { + /* Save the SIMD registers. */ + insert_save_registers(dcontext, ilist, instr, cci->simd_skip, DR_REG_X0, + DR_REG_Q0, SIMD_REG_TYPE); + } - dstack_offs += (proc_num_simd_registers() * sizeof(dr_simd_t)); - ASSERT(proc_num_simd_registers() == MCXT_NUM_SIMD_SLOTS); + dstack_offs += MCXT_NUM_SIMD_SLOTS * sizeof(dr_simd_t); /* Restore the registers we used. */ /* ldp x0, x1, [sp] */ @@ -419,7 +578,6 @@ insert_push_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, opnd_create_base_disp(DR_REG_SP, DR_REG_NULL, 0, REG_OFFSET(DR_REG_X2), OPSZ_8))); #else - /* vstmdb always does writeback */ PRE(ilist, instr, INSTR_CREATE_vstmdb(dcontext, OPND_CREATE_MEMLIST(DR_REG_SP), SIMD_REG_LIST_LEN, @@ -520,18 +678,23 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist XINST_CREATE_move(dcontext, opnd_create_reg(DR_REG_X0), opnd_create_reg(DR_REG_SP))); - current_offs = get_clean_call_switch_stack_size() - - proc_num_simd_registers() * sizeof(dr_simd_t); - ASSERT(proc_num_simd_registers() == MCXT_NUM_SIMD_SLOTS); + current_offs = + get_clean_call_switch_stack_size() - (MCXT_NUM_SIMD_SLOTS * sizeof(dr_simd_t)); /* add x0, x0, current_offs */ PRE(ilist, instr, XINST_CREATE_add(dcontext, opnd_create_reg(DR_REG_X0), OPND_CREATE_INT32(current_offs))); - /* Pop SIMD registers. */ - insert_restore_registers(dcontext, ilist, instr, cci->simd_skip, DR_REG_X0, DR_REG_Q0, - false /* is_gpr */); + if (proc_has_feature(FEATURE_SVE)) { + /* Restore the SVE regs */ + insert_restore_registers(dcontext, ilist, instr, cci->simd_skip, DR_REG_X0, + DR_REG_Z0, SVE_ZREG_TYPE); + } else { + /* Restore the SIMD registers. */ + insert_restore_registers(dcontext, ilist, instr, cci->simd_skip, DR_REG_X0, + DR_REG_Q0, SIMD_REG_TYPE); + } /* mov x0, sp */ PRE(ilist, instr, @@ -553,11 +716,11 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist INSTR_CREATE_ldp(dcontext, opnd_create_reg(DR_REG_W1), opnd_create_reg(DR_REG_W2), OPND_CREATE_MEM64(DR_REG_X0, 8))); - /* msr nzcv, w1 */ + /* msr nzcv, x1 */ PRE(ilist, instr, INSTR_CREATE_msr(dcontext, opnd_create_reg(DR_REG_NZCV), opnd_create_reg(DR_REG_X1))); - /* msr fpcr, w2 */ + /* msr fpcr, x2 */ PRE(ilist, instr, INSTR_CREATE_msr(dcontext, opnd_create_reg(DR_REG_FPCR), opnd_create_reg(DR_REG_X2))); @@ -567,7 +730,7 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_W1), OPND_CREATE_MEM32(DR_REG_X0, 16))); - /* msr fpsr, w1 */ + /* msr fpsr, x1 */ PRE(ilist, instr, INSTR_CREATE_msr(dcontext, opnd_create_reg(DR_REG_FPSR), opnd_create_reg(DR_REG_X1))); @@ -575,14 +738,14 @@ insert_pop_all_registers(dcontext_t *dcontext, clean_call_info_t *cci, instrlist /* Pop GPRs */ insert_restore_registers(dcontext, ilist, instr, cci->reg_skip, DR_REG_SP, DR_REG_X0, - true /* is_gpr */); + GPR_REG_TYPE); /* For out-of-line clean calls, X30 is restored after jumping back from this * code, because it is used for the return address. 
*/
     if (!out_of_line) {
         /* Recover x30 */
-        /* ldr w3, [x0, #16] */
+        /* ldr x30, [sp, #x30_offset] */
         PRE(ilist, instr,
             INSTR_CREATE_ldr(dcontext, opnd_create_reg(DR_REG_X30),
                              OPND_CREATE_MEM64(DR_REG_SP, REG_OFFSET(DR_REG_X30))));
diff --git a/core/arch/arch.c b/core/arch/arch.c
index 730caf19892..a7237327f16 100644
--- a/core/arch/arch.c
+++ b/core/arch/arch.c
@@ -914,7 +914,8 @@ arch_profile_exit()
 #endif /* WINDOWS_PC_SAMPLE */
 
 /* arch-specific atexit cleanup */
-void d_r_arch_exit(IF_WINDOWS_ELSE_NP(bool detach_stacked_callbacks, void))
+void
+d_r_arch_exit(IF_WINDOWS_ELSE_NP(bool detach_stacked_callbacks, void))
 {
     /* we only need to unprotect shared_code for profile extraction
      * so we do it there to also cover the fast exit path
@@ -1984,7 +1985,8 @@ fcache_return_routine_ex(dcontext_t *dcontext _IF_X86_64(gencode_mode_t mode))
     return (cache_pc)code->fcache_return;
 }
 
-cache_pc fcache_return_coarse_routine(IF_X86_64_ELSE(gencode_mode_t mode, void))
+cache_pc
+fcache_return_coarse_routine(IF_X86_64_ELSE(gencode_mode_t mode, void))
 {
     generated_code_t *code = get_shared_gencode(GLOBAL_DCONTEXT _IF_X86_64(mode));
     ASSERT(DYNAMO_OPTION(coarse_units));
@@ -1994,7 +1996,8 @@ cache_pc fcache_return_coarse_routine(IF_X86_64_ELSE(gencode_mode_t mode, void))
     return (cache_pc)code->fcache_return_coarse;
 }
 
-cache_pc trace_head_return_coarse_routine(IF_X86_64_ELSE(gencode_mode_t mode, void))
+cache_pc
+trace_head_return_coarse_routine(IF_X86_64_ELSE(gencode_mode_t mode, void))
 {
     generated_code_t *code = get_shared_gencode(GLOBAL_DCONTEXT _IF_X86_64(mode));
     ASSERT(DYNAMO_OPTION(coarse_units));
@@ -2769,7 +2772,8 @@ fcache_enter_shared_routine(dcontext_t *dcontext)
                 SHARED_GENCODE_MATCH_THREAD(dcontext)->fcache_enter);
 }
 
-cache_pc fcache_return_shared_routine(IF_X86_64_ELSE(gencode_mode_t mode, void))
+cache_pc
+fcache_return_shared_routine(IF_X86_64_ELSE(gencode_mode_t mode, void))
 {
     generated_code_t *code = get_shared_gencode(GLOBAL_DCONTEXT _IF_X86_64(mode));
     ASSERT(USE_SHARED_GENCODE());
@@ -2780,7 +2784,8 @@ cache_pc fcache_return_shared_routine(IF_X86_64_ELSE(gencode_mode_t mode, void))
 }
 
 #ifdef TRACE_HEAD_CACHE_INCR
-cache_pc trace_head_incr_shared_routine(IF_X86_64_ELSE(gencode_mode_t mode, void))
+cache_pc
+trace_head_incr_shared_routine(IF_X86_64_ELSE(gencode_mode_t mode, void))
 {
     generated_code_t *code = get_shared_gencode(GLOBAL_DCONTEXT _IF_X86_64(mode));
     ASSERT(USE_SHARED_GENCODE());
@@ -3545,6 +3550,18 @@ priv_mcontext_to_dr_mcontext(dr_mcontext_t *dst, priv_mcontext_t *src)
      */
     if (dst->size > sizeof(dr_mcontext_t))
         return false;
+#if defined(AARCH64)
+    /* We could support binary compatibility for clients built before the
+     * addition of AArch64's SVE support, by evaluating the machine context's
+     * user set-size field. But we currently do not, preferring to detect
+     * incompatibility and asserting or returning false.
+ */ + if (TEST(DR_MC_MULTIMEDIA, dst->flags) && dst->size != sizeof(dr_mcontext_t)) { + CLIENT_ASSERT( + false, "A pre-SVE client is running on an Arm AArch64 SVE DynamoRIO build!"); + return false; + } +#endif if (TESTALL(DR_MC_ALL, dst->flags) && dst->size == sizeof(dr_mcontext_t)) { *(priv_mcontext_t *)(&MCXT_FIRST_REG_FIELD(dst)) = *src; } else { @@ -3628,7 +3645,7 @@ priv_mcontext_to_dr_mcontext(dr_mcontext_t *dst, priv_mcontext_t *src) return false; memcpy(&dst->opmask, &src->opmask, sizeof(dst->opmask)); } -#else +#elif defined(AARCHXX) /* FIXME i#1551: NYI on ARM */ ASSERT_NOT_IMPLEMENTED(false); #endif @@ -3811,14 +3828,20 @@ dump_mcontext(priv_mcontext_t *context, file_t f, bool dump_xml) #elif defined(AARCHXX) { int i, j; +# ifdef AARCH64 + int words = proc_has_feature(FEATURE_SVE) ? 16 : 4; +# else + int words = 4; +# endif /* XXX: should be proc_num_simd_saved(). */ for (i = 0; i < proc_num_simd_registers(); i++) { print_file(f, dump_xml ? "\t\tqd= \"0x" : "\tq%-3d= 0x", i); - for (j = 0; j < 4; j++) { + for (j = 0; j < words; j++) { print_file(f, "%08x ", context->simd[i].u32[j]); } print_file(f, dump_xml ? "\"\n" : "\n"); } + /* TODO i#5365: SVE predicate registers and FFR dump. */ } #endif diff --git a/core/arch/arch.h b/core/arch/arch.h index 79efddd9614..d159b2fe16c 100644 --- a/core/arch/arch.h +++ b/core/arch/arch.h @@ -669,14 +669,18 @@ void convert_to_near_rel(dcontext_t *dcontext, instr_t *instr); instr_t * convert_to_near_rel_meta(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr); + #ifdef AARCH64 +typedef enum { GPR_REG_TYPE, SIMD_REG_TYPE, SVE_ZREG_TYPE, SVE_PREG_TYPE } reg_type_t; + void insert_save_inline_registers(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, - bool *reg_skip, reg_id_t first_reg, bool is_gpr, void *ci); + bool *reg_skip, reg_id_t first_reg, reg_type_t rtype, + void *ci); void insert_restore_inline_registers(dcontext_t *dcontext, instrlist_t *ilist, instr_t *instr, - bool *reg_skip, reg_id_t first_reg, bool is_gpr, + bool *reg_skip, reg_id_t first_reg, reg_type_t rtype, void *ci); #endif diff --git a/core/arch/proc.h b/core/arch/proc.h index 316abccf140..e4b204edd05 100644 --- a/core/arch/proc.h +++ b/core/arch/proc.h @@ -71,6 +71,7 @@ typedef struct _cpu_info_t { uint vendor; #ifdef AARCHXX uint architecture; + uint sve_vector_length_bytes; #endif uint family; uint type; diff --git a/core/arch/proc_api.h b/core/arch/proc_api.h index ed1b6e5b5c8..ebbb87a2b15 100644 --- a/core/arch/proc_api.h +++ b/core/arch/proc_api.h @@ -466,6 +466,25 @@ DR_API bool proc_has_feature(feature_bit_t feature); +#if defined(AARCH64) && defined(BUILD_TESTS) +DR_API +/** + * Allows overriding the available state of CPU features. + * This is only for unit testing and offline decode, and must be called after + * proc_init_arch() (e.g. after dr_standalone_init() or dr_app_setup()). + */ +void +proc_set_feature(feature_bit_t f, bool enable); + +DR_API +/** + * Uses proc_set_feature() to forcibly enable CPU features for unit testing and offline + * decode. + */ +void +enable_all_test_cpu_features(); +#endif + DR_API /** * Returns all 4 32-bit feature values on X86 and architectural feature @@ -500,6 +519,19 @@ DR_API const char * proc_get_cache_size_str(cache_size_t size); +#ifdef AARCHXX +DR_API +/** + * Returns the size in bytes of the SVE registers' vector length set by the + * AArch64 hardware implementor. 
Length can be from 128 to 2048 bits in + * multiples of 128 bits: + * 128 256 384 512 640 768 896 1024 1152 1280 1408 1536 1664 1792 1920 2048 + * Currently DynamoRIO supports implementations of up to 512 bits. + */ +uint +proc_get_vector_length_bytes(void); +#endif + DR_API /** * Returns the size in bytes needed for a buffer for saving the x87 floating point state. diff --git a/core/arch/proc_shared.c b/core/arch/proc_shared.c index 8394d379947..e74e83fbb48 100644 --- a/core/arch/proc_shared.c +++ b/core/arch/proc_shared.c @@ -66,6 +66,7 @@ static ptr_uint_t mask; /* bits that should be 0 to be cache-line-aligned */ cpu_info_t cpu_info = { VENDOR_UNKNOWN, #ifdef AARCHXX 0, + 0, #endif 0, 0, @@ -195,6 +196,12 @@ proc_get_architecture(void) { return cpu_info.architecture; } + +uint +proc_get_vector_length_bytes(void) +{ + return cpu_info.sve_vector_length_bytes; +} #endif features_t * diff --git a/core/globals.h b/core/globals.h index 4cb7b6a9ef3..8fc73e2f1d5 100644 --- a/core/globals.h +++ b/core/globals.h @@ -702,7 +702,7 @@ extern thread_id_t global_try_tid; typedef struct { /* WARNING: if you change the offsets of any of these fields, - * you must also change the offsets in / + * you must also change the offsets in / */ priv_mcontext_t mcontext; /* real machine context (in globals_shared.h + mcxtx.h) */ #ifdef UNIX diff --git a/core/ir/aarch64/codec.c b/core/ir/aarch64/codec.c index d7e8f626d44..c8169b84f16 100644 --- a/core/ir/aarch64/codec.c +++ b/core/ir/aarch64/codec.c @@ -1021,7 +1021,7 @@ get_elements_in_sve_vector(aarch64_reg_offset element_size) { const uint element_length = opnd_size_in_bits(get_opnd_size_from_offset(element_size)); - return opnd_size_in_bits(OPSZ_SVE_VL) / element_length; + return opnd_size_in_bits(OPSZ_SVE_VL_BYTES) / element_length; } /******************************************************************************* @@ -5180,7 +5180,7 @@ decode_opnd_svemem_gpr_simm6_vl(uint enc, int opcode, byte *pc, OUT opnd_t *opnd const int offset = extract_int(enc, 16, 6); IF_RETURN_FALSE(offset < -32 || offset > 31) const reg_id_t rn = decode_reg(extract_uint(enc, 5, 5), true, true); - const opnd_size_t mem_transfer = op_is_prefetch(opcode) ? OPSZ_0 : OPSZ_SVE_VL; + const opnd_size_t mem_transfer = op_is_prefetch(opcode) ? OPSZ_0 : OPSZ_SVE_VL_BYTES; /* As specified in the AArch64 SVE reference manual for contiguous prefetch * instructions, the immediate index value is a vector index into memory, NOT @@ -5189,7 +5189,7 @@ decode_opnd_svemem_gpr_simm6_vl(uint enc, int opcode, byte *pc, OUT opnd_t *opnd * memory displacement. So when creating the address operand here, it should be * multiplied by the current vector register length in bytes. */ - int vl_bytes = dr_get_sve_vl() / 8; + int vl_bytes = dr_get_sve_vector_length() / 8; *opnd = opnd_create_base_disp(rn, DR_REG_NULL, 0, offset * vl_bytes, mem_transfer); return true; @@ -5199,7 +5199,7 @@ static inline bool encode_opnd_svemem_gpr_simm6_vl(uint enc, int opcode, byte *pc, opnd_t opnd, OUT uint *enc_out) { - const opnd_size_t mem_transfer = op_is_prefetch(opcode) ? OPSZ_0 : OPSZ_SVE_VL; + const opnd_size_t mem_transfer = op_is_prefetch(opcode) ? OPSZ_0 : OPSZ_SVE_VL_BYTES; if (!opnd_is_base_disp(opnd) || opnd_get_index(opnd) != DR_REG_NULL || opnd_get_size(opnd) != mem_transfer) return false; @@ -5210,7 +5210,7 @@ encode_opnd_svemem_gpr_simm6_vl(uint enc, int opcode, byte *pc, opnd_t opnd, * vector length at the IR level, transformed to a vector index in the * encoding. 
*/ - int vl_bytes = dr_get_sve_vl() / 8; + int vl_bytes = dr_get_sve_vector_length() / 8; if ((opnd_get_disp(opnd) % vl_bytes) != 0) return false; int disp = opnd_get_disp(opnd) / vl_bytes; @@ -5329,7 +5329,7 @@ decode_opnd_svemem_gpr_simm9_vl(uint enc, int opcode, byte *pc, OUT opnd_t *opnd bool is_vector = TEST(1u << 14, enc); /* Transfer size depends on whether we are transferring a Z or a P register. */ - opnd_size_t memory_transfer_size = is_vector ? OPSZ_SVE_VL : OPSZ_SVE_PL; + opnd_size_t memory_transfer_size = is_vector ? OPSZ_SVE_VL_BYTES : OPSZ_SVE_PL_BYTES; /* As specified in the AArch64 SVE reference manual for unpredicated vector * register load LDR and store STR instructions, the immediate index value is a @@ -5339,7 +5339,7 @@ decode_opnd_svemem_gpr_simm9_vl(uint enc, int opcode, byte *pc, OUT opnd_t *opnd * address operand here, it should be multiplied by the current vector or * predicate register length in bytes. */ - int vl_bytes = dr_get_sve_vl() / 8; + int vl_bytes = dr_get_sve_vector_length() / 8; int pl_bytes = vl_bytes / 8; int mul_len = is_vector ? vl_bytes : pl_bytes; *opnd = @@ -5359,7 +5359,7 @@ encode_opnd_svemem_gpr_simm9_vl(uint enc, int opcode, byte *pc, opnd_t opnd, bool is_vector = TEST(1u << 14, enc); /* Transfer size depends on whether we are transferring a Z or a P register. */ - opnd_size_t memory_transfer_size = is_vector ? OPSZ_SVE_VL : OPSZ_SVE_PL; + opnd_size_t memory_transfer_size = is_vector ? OPSZ_SVE_VL_BYTES : OPSZ_SVE_PL_BYTES; if (!opnd_is_base_disp(opnd) || opnd_get_size(opnd) != memory_transfer_size) return false; @@ -5367,7 +5367,7 @@ encode_opnd_svemem_gpr_simm9_vl(uint enc, int opcode, byte *pc, opnd_t opnd, * vector or predicate length at the IR level, transformed to a vector or * predicate index in the encoding. */ - int vl_bytes = dr_get_sve_vl() / 8; + int vl_bytes = dr_get_sve_vector_length() / 8; int pl_bytes = vl_bytes / 8; if (is_vector) { if ((opnd_get_disp(opnd) % vl_bytes) != 0) @@ -5512,7 +5512,7 @@ decode_opnd_svemem_gpr_simm4_vl_xreg(uint enc, int opcode, byte *pc, OUT opnd_t { const uint register_count = BITS(enc, 22, 21) + 1; const opnd_size_t transfer_size = - opnd_size_from_bytes((register_count * dr_get_sve_vl()) / 8); + opnd_size_from_bytes((register_count * dr_get_sve_vector_length()) / 8); return decode_svemem_gpr_simm4(enc, transfer_size, register_count, opnd); } @@ -5523,7 +5523,7 @@ encode_opnd_svemem_gpr_simm4_vl_xreg(uint enc, int opcode, byte *pc, opnd_t opnd { const uint register_count = BITS(enc, 22, 21) + 1; const opnd_size_t transfer_size = - opnd_size_from_bytes((register_count * dr_get_sve_vl()) / 8); + opnd_size_from_bytes((register_count * dr_get_sve_vector_length()) / 8); return encode_svemem_gpr_simm4(enc, transfer_size, register_count, opnd, enc_out); } @@ -9695,10 +9695,5 @@ uint encode_common(byte *pc, instr_t *i, decode_info_t *di) { ASSERT(((ptr_int_t)pc & 3) == 0); - -#if defined(DR_HOST_NOT_TARGET) || defined(STANDALONE_DECODER) - dr_set_sve_vl(256); -#endif - return encoder_v80(pc, i, di); } diff --git a/core/ir/aarch64/codec.h b/core/ir/aarch64/codec.h index e28b6bc656b..4fe2eaa54c8 100644 --- a/core/ir/aarch64/codec.h +++ b/core/ir/aarch64/codec.h @@ -58,12 +58,8 @@ encode_common(byte *pc, instr_t *i, decode_info_t *di); ((((uint32)(_enc)) >> (bitmin)) & (uint32)MASK((bitmax) - (bitmin) + 1)) #if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) -/* TODO i#3044: Vector length will be read from cpuinfo, e.g. 
- * opnd_size_from_bytes(proc_get_vector_length())); - * Setting to fixed size for now in order to pass unit tests. - */ -# define OPSZ_SVE_VL opnd_size_from_bytes(dr_get_sve_vl() / 8) -# define OPSZ_SVE_PL opnd_size_from_bytes((dr_get_sve_vl() / 8) / 8) +# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes()) +# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes(proc_get_vector_length_bytes() / 8) #else /* SVE vector length for off-line decoder set using -vl option with drdisas, * e.g. @@ -72,8 +68,8 @@ encode_common(byte *pc, instr_t *i, decode_info_t *di); * 85865e6b ldr +0x37(%x19)[32byte] -> %z11 * $ */ -# define OPSZ_SVE_VL opnd_size_from_bytes(dr_get_sve_vl() / 8) -# define OPSZ_SVE_PL opnd_size_from_bytes((dr_get_sve_vl() / 8) / 8) +# define OPSZ_SVE_VL_BYTES opnd_size_from_bytes(dr_get_sve_vector_length() / 8) +# define OPSZ_SVE_PL_BYTES opnd_size_from_bytes((dr_get_sve_vector_length() / 8) / 8) #endif #define RETURN_FALSE \ diff --git a/core/ir/aarch64/decode.c b/core/ir/aarch64/decode.c index ef006a8d807..e056a7ef9d3 100644 --- a/core/ir/aarch64/decode.c +++ b/core/ir/aarch64/decode.c @@ -185,7 +185,7 @@ decode_first_opcode_byte(int opcode) const instr_info_t * opcode_to_encoding_info(uint opc, dr_isa_mode_t isa_mode) { - /* We do not use instr_info_t encoding info on AArch64. */ + /* We do not use instr_info_t encoding info on AArch64. FIXME i#1569 */ ASSERT_NOT_REACHED(); return NULL; } diff --git a/core/ir/aarch64/instr_create_api.h b/core/ir/aarch64/instr_create_api.h index 75a38c8db16..7a699c55f28 100644 --- a/core/ir/aarch64/instr_create_api.h +++ b/core/ir/aarch64/instr_create_api.h @@ -10974,10 +10974,10 @@ * DR_EXTEND_UXTX, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 64), 0) * For the [\, \.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) */ #define INSTR_CREATE_ldff1b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldff1b, Zt, Rn, Pg) @@ -11007,16 +11007,16 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) * For the [\, \.D, LSL #3] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 3) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 3) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the [\, \.D, \ #3] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 3) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 3) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) */ #define INSTR_CREATE_ldff1d_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldff1d, Zt, Rn, Pg) @@ -11054,22 +11054,22 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) * For the [\, \.D, LSL #1] variant: * 
opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 1) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) * For the [\, \.D, \ #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 1) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) * For the [\, \.S, \ #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 1) * For the [\, \.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) */ #define INSTR_CREATE_ldff1h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldff1h, Zt, Rn, Pg) @@ -11103,13 +11103,13 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 64), 0) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 64), 0) * For the [\, \.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) */ #define INSTR_CREATE_ldff1sb_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldff1sb, Zt, Rn, Pg) @@ -11146,22 +11146,22 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) * For the [\, \.D, LSL #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 1) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) * For the [\, \.D, \ #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 1) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) * For the [\, \.S, \ #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 1) * For the [\, 
\.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) */ #define INSTR_CREATE_ldff1sh_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldff1sh, Zt, Rn, Pg) @@ -11193,22 +11193,22 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) * For the [\, \.D, LSL #2] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 2) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 2) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the [\, \.D, \ #2] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 2) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 2) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the [\, \.S, \ #2] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 2) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 8), 2) * For the [\, \.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 8), 0) */ #define INSTR_CREATE_ldff1sw_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldff1sw, Zt, Rn, Pg) @@ -11322,16 +11322,16 @@ * DR_EXTEND_UXTX, 0, 0, 0, OPSZ_1) * For the B element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) * For the H element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) * For the S element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 32)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 32)) * For the D element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 64)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 64)) * For the [\.S{, #\}] variant: * opnd_create_vector_base_disp_aarch64(Zn, DR_REG_NULL, OPSZ_4, * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) @@ -11340,13 +11340,13 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 64), 0) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 64), 0) * For the [\, \.S, \] variant: * 
opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) */ #define INSTR_CREATE_ld1b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ld1b, Zt, Rn, Pg) @@ -11488,13 +11488,13 @@ * DR_EXTEND_UXTX, 0, 0, 0, OPSZ_1) * For the H element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) * For the S element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 32)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 32)) * For the D element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 64)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 64)) * For the [\.S{, #\}] variant: * opnd_create_vector_base_disp_aarch64(Zn, DR_REG_NULL, OPSZ_4, * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) @@ -11503,13 +11503,13 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 64), 0) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 64), 0) * For the [\, \.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) */ #define INSTR_CREATE_ld1sb_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ld1sb, Zt, Rn, Pg) @@ -11529,10 +11529,9 @@ * constructed with the function: * For the [\, \] variant: * opnd_create_base_disp_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, 0, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8)) - * For the [\{, #\, MUL VL}] variant: - * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * DR_EXTEND_UXTX, 0, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / + * 8)) For the [\{, #\, MUL VL}] variant: opnd_create_base_disp(Rn, + * DR_REG_NULL, 0, imm, opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) */ #define INSTR_CREATE_ldnt1b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnt1b, Zt, Rn, Pg) @@ -11560,8 +11559,8 @@ * DR_EXTEND_UXTX, 0, 0, 0, OPSZ_1) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / (8 * opnd_size_to_bytes(Ts)))) - * For the [\.S{, #\}] variant: + * opnd_size_from_bytes(dr_get_sve_vector_length() / (8 * + * opnd_size_to_bytes(Ts)))) For the [\.S{, #\}] variant: * opnd_create_vector_base_disp_aarch64(Zn, DR_REG_NULL, OPSZ_4, * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) * For the [\.D{, #\}] variant: @@ -11569,13 +11568,13 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 64), 0) 
* For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 64), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 64), 0) * For the [\, \.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) */ #define INSTR_CREATE_st1b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_st1b, Rn, Zt, Pg) @@ -11595,10 +11594,9 @@ * constructed with the function: * For the [\, \] variant: * opnd_create_base_disp_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, 0, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8)) - * For the [\{, #\, MUL VL}] variant: - * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * DR_EXTEND_UXTX, 0, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / + * 8)) For the [\{, #\, MUL VL}] variant: opnd_create_base_disp(Rn, + * DR_REG_NULL, 0, imm, opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) */ #define INSTR_CREATE_stnt1b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_stnt1b, Rn, Zt, Pg) @@ -12023,10 +12021,10 @@ * constructed with the function: * For the [\, \] variant: * opnd_create_base_disp_aarch64(Rn, Rm, DR_EXTEND_UXTX, 0, 0, 0, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8))) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld2b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_2dst_2src(dc, OP_ld2b, Zt, opnd_create_increment_reg(Zt, 1), Rn, Pg) @@ -12046,10 +12044,10 @@ * constructed with the function: * For the [\, \] variant: * opnd_create_base_disp_aarch64(Rn, Rm, DR_EXTEND_UXTX, 0, 0, 0, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8))) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld3b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_3dst_2src(dc, OP_ld3b, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12070,10 +12068,10 @@ * constructed with the function: * For the [\, \] variant: * opnd_create_base_disp_aarch64(Rn, Rm, DR_EXTEND_UXTX, 0, 0, 0, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8))) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld4b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_4dst_2src(dc, OP_ld4b, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12095,10 +12093,10 @@ * constructed with the function: * For the [\, \] variant: * opnd_create_base_disp_aarch64(Rn, Rm, DR_EXTEND_UXTX, 0, 0, 0, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8))) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st2b_sve_pred(dc, Zt, Pg, Rn) \ 
instr_create_1dst_3src(dc, OP_st2b, Rn, Zt, opnd_create_increment_reg(Zt, 1), Pg) @@ -12118,10 +12116,10 @@ * constructed with the function: * For the [\, \] variant: * opnd_create_base_disp_aarch64(Rn, Rm, DR_EXTEND_UXTX, 0, 0, 0, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8))) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st3b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_4src(dc, OP_st3b, Rn, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12142,10 +12140,10 @@ * constructed with the function: * For the [\, \] variant: * opnd_create_base_disp_aarch64(Rn, Rm, DR_EXTEND_UXTX, 0, 0, 0, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8))) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st4b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_5src(dc, OP_st4b, Rn, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12185,35 +12183,35 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) * For the [\, \.D, LSL #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 1) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) * For the [\, \.D, \ #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 1) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) * For the [\, \.S, \ #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 1) * For the [\, \.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the [\, \, LSL #1] variants: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * /8/16/32), 1) * For the H element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) * For the S element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) * For the D element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * 
opnd_size_from_bytes(dr_get_sve_vl() / 32)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 32)) */ #define INSTR_CREATE_ld1h_sve_pred(dc, Zt, Pg, Zn) \ instr_create_1dst_2src(dc, OP_ld1h, Zt, Zn, Pg) @@ -12249,32 +12247,32 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) * For the [\, \.D, LSL #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 1) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) * For the [\, \.D, \ #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 1) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) * For the [\, \.S, \ #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 1) * For the [\, \.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the [\, \, LSL #1] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * / 16/32), 1) depending on Zt's element size. 
* For the S element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) * For the D element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 32)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 32)) */ #define INSTR_CREATE_ld1sh_sve_pred(dc, Zt, Pg, Zn) \ instr_create_1dst_2src(dc, OP_ld1sh, Zt, Zn, Pg) @@ -12310,32 +12308,32 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) * For the [\, \.D, LSL #2] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 2) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 2) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the [\, \.D, \ #2] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 2) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 2) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the [\, \.S, \ #2] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 2) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 8), 2) * For the [\, \.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 8), 0) * For the [\, \, LSL #2] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * / 8/16), 2) depending on Zt's element size. 
* For the S element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) * For the D element size [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) */ #define INSTR_CREATE_ld1w_sve_pred(dc, Zt, Pg, Zn) \ instr_create_1dst_2src(dc, OP_ld1w, Zt, Zn, Pg) @@ -12363,22 +12361,22 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 0) * For the [\, \.D, LSL #3] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 3) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 3) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the [\, \.D, \ #3] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 3) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 3) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the variant \, \, LSL #3]: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, - * true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 3) + * true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 8), 3) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) */ #define INSTR_CREATE_ld1d_sve_pred(dc, Zt, Pg, Zn) \ instr_create_1dst_2src(dc, OP_ld1d, Zt, Zn, Pg) @@ -12402,11 +12400,11 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) * For the [\, \, LSL #2] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * / 16), 2) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) */ #define INSTR_CREATE_ld1sw_sve_pred(dc, Zt, Pg, Zn) \ instr_create_1dst_2src(dc, OP_ld1sw, Zt, Zn, Pg) @@ -12440,29 +12438,29 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) * For the [\, \.D, LSL #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 1) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) * For the [\, \.D, \ #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 1) * For the [\, \.D, \] variant: * 
opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 32), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 32), 0) * For the [\, \.S, \ #1] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 1) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 1) * For the [\, \.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the [\, \, LSL #1] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * /8/16/32), 1) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / opnd_size_to_bytes(Ts))) + * opnd_size_from_bytes(dr_get_sve_vector_length() / opnd_size_to_bytes(Ts))) */ #define INSTR_CREATE_st1h_sve_pred(dc, Zt, Pg, Zn) \ instr_create_1dst_2src(dc, OP_st1h, Zn, Zt, Pg) @@ -12496,29 +12494,30 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) * For the [\, \.D, LSL #2] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 2) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 2) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the [\, \.D, \ #2] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 2) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 2) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 16), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 16), 0) * For the [\, \.S, \ #2] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 2) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 8), 2) * For the [\, \.S, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_4, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 8), 0) * For the [\, \, LSL #2] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * / 8/16), 2) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / (8 * opnd_size_to_bytes(Ts)))) + * opnd_size_from_bytes(dr_get_sve_vector_length() / (8 * + * opnd_size_to_bytes(Ts)))) */ #define INSTR_CREATE_st1w_sve_pred(dc, Zt, Pg, Zn) \ instr_create_1dst_2src(dc, OP_st1w, Zn, Zt, Pg) @@ -12546,22 +12545,22 @@ * DR_EXTEND_UXTX, 0, imm5, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 0) * For the [\, \.D, LSL #3] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 3) + * true, 0, 
opnd_size_from_bytes(dr_get_sve_vector_length() / 8), 3) * For the [\, \.D] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, DR_EXTEND_UXTX, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 8), 0) * For the [\, \.D, \ #3] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * true, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 3) + * true, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 8), 3) * For the [\, \.D, \] variant: * opnd_create_vector_base_disp_aarch64(Xn, Zm, OPSZ_8, extend, - * 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 0) + * 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() / 8), 0) * For the [\, \, LSL #3] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() / 8), 3) - * For the [\{, #\, MUL VL}] variant: - * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / (8 * opnd_size_to_bytes(Ts)))) + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() + * / 8), 3) For the [\{, #\, MUL VL}] variant: opnd_create_base_disp(Rn, + * DR_REG_NULL, 0, imm, opnd_size_from_bytes(dr_get_sve_vector_length() / (8 * + * opnd_size_to_bytes(Ts)))) */ #define INSTR_CREATE_st1d_sve_pred(dc, Zt, Pg, Zn) \ instr_create_1dst_2src(dc, OP_st1d, Zn, Zt, Pg) @@ -12581,10 +12580,10 @@ * constructed with the function: * For the [\, \, LSL #3] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8)), 3) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8)), 3) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld2d_sve_pred(dc, Zt, Pg, Rn) \ instr_create_2dst_2src(dc, OP_ld2d, Zt, opnd_create_increment_reg(Zt, 1), Rn, Pg) @@ -12604,10 +12603,10 @@ * constructed with the function: * For the [\, \, LSL #1] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8)), 1) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8)), 1) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld2h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_2dst_2src(dc, OP_ld2h, Zt, opnd_create_increment_reg(Zt, 1), Rn, Pg) @@ -12627,11 +12626,11 @@ * constructed with the function: * For the [\, \, LSL #2] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8)), 2) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8)), 2) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(dr_get_sve_vl() / 4)) - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 4)) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld2w_sve_pred(dc, Zt, Pg, Rn) \ instr_create_2dst_2src(dc, OP_ld2w, Zt, opnd_create_increment_reg(Zt, 1), Rn, Pg) @@ -12651,10 +12650,10 @@ * constructed with the function: * For the [\, \, LSL #3] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, 
DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8)), 3) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8)), 3) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld3d_sve_pred(dc, Zt, Pg, Rn) \ instr_create_3dst_2src(dc, OP_ld3d, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12675,10 +12674,10 @@ * constructed with the function: * For the [\, \, LSL #1] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8)), 1) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8)), 1) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld3h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_3dst_2src(dc, OP_ld3h, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12699,10 +12698,10 @@ * constructed with the function: * For the [\, \, LSL #2] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8)), 2) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8)), 2) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld3w_sve_pred(dc, Zt, Pg, Rn) \ instr_create_3dst_2src(dc, OP_ld3w, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12723,10 +12722,10 @@ * constructed with the function: * For the [\, \, LSL #3] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8)), 3) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8)), 3) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld4d_sve_pred(dc, Zt, Pg, Rn) \ instr_create_4dst_2src(dc, OP_ld4d, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12748,10 +12747,10 @@ * constructed with the function: * For the [\, \, LSL #1] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8)), 1) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8)), 1) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld4h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_4dst_2src(dc, OP_ld4h, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12773,10 +12772,10 @@ * constructed with the function: * For the [\, \, LSL #2] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8)), 2) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8)), 2) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_ld4w_sve_pred(dc, Zt, Pg, Rn) \ 
instr_create_4dst_2src(dc, OP_ld4w, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12798,11 +12797,11 @@ * constructed with the function: * For the [\, \, LSL #3] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * / 8), 3) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) */ #define INSTR_CREATE_ldnt1d_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnt1d, Zt, Rn, Pg) @@ -12822,11 +12821,11 @@ * constructed with the function: * For the [\, \, LSL #1] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * / 8), 1) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) */ #define INSTR_CREATE_ldnt1h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnt1h, Zt, Rn, Pg) @@ -12846,11 +12845,11 @@ * constructed with the function: * For the [\, \, LSL #2] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * / 8), 2) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) */ #define INSTR_CREATE_ldnt1w_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnt1w, Zt, Rn, Pg) @@ -12870,10 +12869,10 @@ * constructed with the function: * For the [\, \, LSL #3] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8)), 3) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8)), 3) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st2d_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_3src(dc, OP_st2d, Rn, Zt, opnd_create_increment_reg(Zt, 1), Pg) @@ -12893,10 +12892,10 @@ * constructed with the function: * For the [\, \, LSL #1] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8)), 1) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8)), 1) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st2h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_3src(dc, OP_st2h, Rn, Zt, opnd_create_increment_reg(Zt, 1), Pg) @@ -12916,10 +12915,10 @@ * constructed with the function: * For the [\, \, LSL #2] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(2 * (dr_get_sve_vl() / 8)), 2) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8)), 2) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(2 * 
(dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(2 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st2w_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_3src(dc, OP_st2w, Rn, Zt, opnd_create_increment_reg(Zt, 1), Pg) @@ -12939,10 +12938,10 @@ * constructed with the function: * For the [\, \, LSL #3] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8)), 3) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8)), 3) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st3d_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_4src(dc, OP_st3d, Rn, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12963,10 +12962,10 @@ * constructed with the function: * For the [\, \, LSL #1] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8)), 1) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8)), 1) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st3h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_4src(dc, OP_st3h, Rn, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -12987,10 +12986,10 @@ * constructed with the function: * For the [\, \, LSL #2] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8)), 2) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8)), 2) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(3 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(3 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st3w_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_4src(dc, OP_st3w, Rn, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -13011,10 +13010,10 @@ * constructed with the function: * For the [\, \, LSL #3] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8)), 3) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8)), 3) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st4d_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_5src(dc, OP_st4d, Rn, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -13036,10 +13035,10 @@ * constructed with the function: * For the [\, \, LSL #1] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8)), 1) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8)), 1) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st4h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_5src(dc, OP_st4h, Rn, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -13061,10 +13060,10 @@ * constructed with the function: * For the [\, \, LSL #2] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, DR_EXTEND_UXTX, true, 0, 0, - * 
opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8)), 2) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8)), 2) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm4, - * opnd_size_from_bytes(4 * (dr_get_sve_vl() / 8))) + * opnd_size_from_bytes(4 * (dr_get_sve_vector_length() / 8))) */ #define INSTR_CREATE_st4w_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_5src(dc, OP_st4w, Rn, Zt, opnd_create_increment_reg(Zt, 1), \ @@ -13086,12 +13085,12 @@ * constructed with the function: * For the [\, \, LSL #3] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * / 8), 3) * For the [\, \, LSL #3] variant: * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) */ #define INSTR_CREATE_stnt1d_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_stnt1d, Rn, Zt, Pg) @@ -13111,11 +13110,11 @@ * constructed with the function: * For the [\, \, LSL #1] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * / 8), 1) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) */ #define INSTR_CREATE_stnt1h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_stnt1h, Rn, Zt, Pg) @@ -13135,11 +13134,11 @@ * constructed with the function: * For the [\, \, LSL #2] variant: * opnd_create_base_disp_shift_aarch64(Rn, Rm, - * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vl() + * DR_EXTEND_UXTX, true, 0, 0, opnd_size_from_bytes(dr_get_sve_vector_length() * / 8), 2) * For the [\{, #\, MUL VL}] variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) */ #define INSTR_CREATE_stnt1w_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_stnt1w, Rn, Zt, Pg) @@ -13161,16 +13160,16 @@ * constructed with the function: * For the B element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) * For the H element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) * For the S element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 32)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 32)) * For the D element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 64)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 64)) */ #define INSTR_CREATE_ldnf1b_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnf1b, Zt, Rn, Pg) @@ -13188,7 +13187,7 @@ * \param Rn The first source base register with an immediate offset, * constructed with the function: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) */ #define INSTR_CREATE_ldnf1d_sve_pred(dc, Zt, Pg, Rn) \ 
instr_create_1dst_2src(dc, OP_ldnf1d, Zt, Rn, Pg) @@ -13209,13 +13208,13 @@ * constructed with the function: * For the H element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) * For the S element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) * For the D element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 32)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 32)) */ #define INSTR_CREATE_ldnf1h_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnf1h, Zt, Rn, Pg) @@ -13236,13 +13235,13 @@ * constructed with the function: * For the H element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) * For the S element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 32)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 32)) * For the D element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 64)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 64)) */ #define INSTR_CREATE_ldnf1sb_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnf1sb, Zt, Rn, Pg) @@ -13262,10 +13261,10 @@ * constructed with the function: * For the S element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) * For the D element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 32)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 32)) */ #define INSTR_CREATE_ldnf1sh_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnf1sh, Zt, Rn, Pg) @@ -13283,7 +13282,7 @@ * \param Rn The first source base register with an immediate offset, * constructed with the function: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) */ #define INSTR_CREATE_ldnf1sw_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnf1sw, Zt, Rn, Pg) @@ -13303,10 +13302,10 @@ * constructed with the function: * For the S element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 8)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 8)) * For the D element size variant: * opnd_create_base_disp(Rn, DR_REG_NULL, 0, imm, - * opnd_size_from_bytes(dr_get_sve_vl() / 16)) + * opnd_size_from_bytes(dr_get_sve_vector_length() / 16)) */ #define INSTR_CREATE_ldnf1w_sve_pred(dc, Zt, Pg, Rn) \ instr_create_1dst_2src(dc, OP_ldnf1w, Zt, Rn, Pg) diff --git a/core/ir/decode_shared.c b/core/ir/decode_shared.c index ab47cd928d0..5c5475179dd 100644 --- a/core/ir/decode_shared.c +++ b/core/ir/decode_shared.c @@ -180,7 +180,7 @@ int sve_veclens[] = { 128, 256, 384, 512, 640, 768, 896, 1024, 1152, 1280, 1408, 1536, 1664, 1792, 1920, 2048 }; void -dr_set_sve_vl(int vl) +dr_set_sve_vector_length(int vl) { /* TODO i#3044: Vector length will be read from h/w when running on SVE. 
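     * Until then the value passed in (e.g. from drdisas's -vl option) is
     * matched against the sve_veclens list above by the loop below.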
*/ for (int i = 0; i < sizeof(sve_veclens); i++) { @@ -193,7 +193,7 @@ dr_set_sve_vl(int vl) } int -dr_get_sve_vl(void) +dr_get_sve_vector_length(void) { return sve_veclen; } diff --git a/core/ir/disassemble_shared.c b/core/ir/disassemble_shared.c index 936bad963b6..19241476ca2 100644 --- a/core/ir/disassemble_shared.c +++ b/core/ir/disassemble_shared.c @@ -1188,7 +1188,7 @@ internal_instr_disassemble(char *buf, size_t bufsz, size_t *sofar INOUT, return; } else if (instr_opcode_valid(instr)) { #ifdef AARCH64 - /* We do not use instr_info_t encoding info on AArch64. */ + /* We do not use instr_info_t encoding info on AArch64. FIXME i#1569 */ name = get_opcode_name(instr_get_opcode(instr)); #else const instr_info_t *info = instr_get_instr_info(instr); diff --git a/core/ir/encode_api.h b/core/ir/encode_api.h index 912d9c22a63..48e669be79a 100644 --- a/core/ir/encode_api.h +++ b/core/ir/encode_api.h @@ -80,13 +80,13 @@ dr_get_isa_mode(void *drcontext); * running on SVE. */ void -dr_set_sve_vl(int vl); +dr_set_sve_vector_length(int vl); /** * Read AArch64 Scalable Vector Extension's vector length, in bits. */ int -dr_get_sve_vl(void); +dr_get_sve_vector_length(void); enum { #ifdef X86 diff --git a/core/ir/instr_shared.c b/core/ir/instr_shared.c index 4e0ca8ae09e..66c00c74957 100644 --- a/core/ir/instr_shared.c +++ b/core/ir/instr_shared.c @@ -377,11 +377,17 @@ private_instr_encode(dcontext_t *dcontext, instr_t *instr, bool always_cache) if (nxt == NULL) { nxt = instr_encode_ignore_reachability(dcontext, instr, buf); if (nxt == NULL) { +#ifdef AARCH64 + /* We do not use instr_info_t encoding info on AArch64. FIXME i#1569 */ + SYSLOG_INTERNAL_WARNING("cannot encode %s", + get_opcode_name(instr_get_opcode(instr))); +#else SYSLOG_INTERNAL_WARNING("cannot encode %s", opcode_to_encoding_info(instr->opcode, instr_get_isa_mode(instr) _IF_ARM(false)) ->name); +#endif if (!TEST(INSTR_IS_NOALLOC_STRUCT, instr->flags)) heap_reachable_free(dcontext, buf, MAX_INSTR_LENGTH HEAPACCT(ACCT_IR)); return 0; @@ -910,8 +916,7 @@ instr_get_eflags(instr_t *instr, dr_opnd_query_flags_t flags) encoded = true; len = private_instr_encode(dcontext, instr, true /*cache*/); if (len == 0) { - if (!instr_is_label(instr)) - CLIENT_ASSERT(false, "instr_get_eflags: invalid instr"); + CLIENT_ASSERT(instr_is_label(instr), "instr_get_eflags: invalid instr"); return 0; } } @@ -1805,6 +1810,35 @@ instr_uses_reg(instr_t *instr, reg_id_t reg) bool instr_reg_in_dst(instr_t *instr, reg_id_t reg) { +#ifdef AARCH64 + /* FFR does not appear in any operand; it is implied by the instruction type or + * accessed via SVE predicate registers. + */ + if (reg == DR_REG_FFR) { + switch (instr_get_opcode(instr)) { + case OP_setffr: + case OP_wrffr: + + case OP_ldff1b: + case OP_ldff1d: + case OP_ldff1h: + case OP_ldff1sb: + case OP_ldff1sh: + case OP_ldff1sw: + case OP_ldff1w: + + case OP_ldnf1b: + case OP_ldnf1d: + case OP_ldnf1h: + case OP_ldnf1sb: + case OP_ldnf1sh: + case OP_ldnf1sw: + case OP_ldnf1w: return true; + default: break; + } + } +#endif + int i; for (i = 0; i < instr_num_dsts(instr); i++) { if (opnd_uses_reg(instr_get_dst(instr, i), reg)) @@ -1822,6 +1856,19 @@ instr_reg_in_src(instr_t *instr, reg_id_t reg) if (instr_get_opcode(instr) == OP_nop_modrm) return false; #endif + +#ifdef AARCH64 + /* FFR does not appear in any operand; it is implied by the instruction type or + * accessed via SVE predicate registers.
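+ * For example, rdffr Pd.B names only Pd as an operand yet reads FFR as an implicit source.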
+ */ + if (reg == DR_REG_FFR) { + switch (instr_get_opcode(instr)) { + case OP_rdffr: + case OP_rdffrs: return true; + default: break; + } + } +#endif for (i = 0; i < instr_num_srcs(instr); i++) { if (opnd_uses_reg(instr_get_src(instr, i), reg)) return true; diff --git a/core/ir/opnd_shared.c b/core/ir/opnd_shared.c index 8b9a0997ce6..982362e9f66 100644 --- a/core/ir/opnd_shared.c +++ b/core/ir/opnd_shared.c @@ -2759,8 +2759,13 @@ reg_get_size(reg_id_t reg) return OPSZ_4; if (reg >= DR_REG_MDCCSR_EL0 && reg <= DR_REG_SPSR_FIQ) return OPSZ_8; - if (reg >= DR_REG_Z0 && reg <= DR_REG_Z31) + if (reg >= DR_REG_Z0 && reg <= DR_REG_Z31) { +# if !defined(DR_HOST_NOT_TARGET) && !defined(STANDALONE_DECODER) + return opnd_size_from_bytes(proc_get_vector_length_bytes()); +# else return OPSZ_SCALABLE; +# endif + } if ((reg >= DR_REG_P0 && reg <= DR_REG_P15) || reg == DR_REG_FFR) return OPSZ_SCALABLE_PRED; if (reg == DR_REG_CNTVCT_EL0) diff --git a/core/lib/globals_api.h b/core/lib/globals_api.h index ae6e08390fc..5891206228a 100644 --- a/core/lib/globals_api.h +++ b/core/lib/globals_api.h @@ -664,19 +664,25 @@ typedef uint64 dr_opmask_t; #if defined(AARCHXX) /** - * 128-bit ARM SIMD Vn register. - * In AArch64, align to 16 bytes for better performance. - * In AArch32, we're not using any uint64 fields here to avoid alignment - * padding in sensitive structs. We could alternatively use pragma pack. + * 512-bit ARM Scalable Vector Extension (SVE) vector registers Zn and + * predicate registers Pn. The low 128 bits of Zn overlap with the existing + * ARM Advanced SIMD (NEON) Vn registers. The SVE specification defines the + * following valid vector lengths: + * 128 256 384 512 640 768 896 1024 1152 1280 1408 1536 1664 1792 1920 2048 + * We currently support a 512-bit maximum due to DR's stack size limitation + * (the machine context is stored on the stack). In AArch64, align to 16 bytes + * for better performance. In AArch32, we're not using any uint64 fields here + * to avoid alignment padding in sensitive structs. We could alternatively use + * pragma pack. */ # ifdef X64 typedef union ALIGN_VAR(16) _dr_simd_t { - byte b; /**< Bottom 8 bits of Vn == Bn. */ - ushort h; /**< Bottom 16 bits of Vn == Hn. */ - uint s; /**< Bottom 32 bits of Vn == Sn. */ - uint d[2]; /**< Bottom 64 bits of Vn == Dn as d[1]:d[0]. */ - uint q[4]; /**< 128-bit Qn as q[3]:q[2]:q[1]:q[0]. */ - uint u32[4]; /**< The full 128-bit register. */ + byte b; /**< Byte (8-bit, Bn) scalar element of Vn, Zn, or Pn. */ + ushort h; /**< Halfword (16-bit, Hn) scalar element of Vn, Zn, or Pn. */ + uint s; /**< Singleword (32-bit, Sn) scalar element of Vn, Zn, or Pn. */ + uint64 d; /**< Doubleword (64-bit, Dn) scalar element of Vn, Zn, or Pn. */ + uint q[4]; /**< The full 128-bit Vn register, Qn as q[3]:q[2]:q[1]:q[0]. */ + uint u32[16]; /**< The full 512-bit Zn, Pn, and FFR registers. */ } dr_simd_t; # else typedef union _dr_simd_t { @@ -686,16 +692,26 @@ typedef union _dr_simd_t { } dr_simd_t; # endif # ifdef X64 -# define MCXT_NUM_SIMD_SLOTS \ - 32 /**< Number of 128-bit SIMD Vn slots in dr_mcontext_t \ */ +# define MCXT_NUM_SIMD_SVE_SLOTS \ + 32 /**< Number of SIMD Vn/Zn slots in dr_mcontext_t. \ */ +# define MCXT_NUM_SVEP_SLOTS 16 /**< Number of SVE predicate Pn slots in dr_mcontext_t. */ +# define MCXT_NUM_FFR_SLOTS \ + 1 /**< Number of first-fault register slots in dr_mcontext_t. */ + /** Total number of SIMD register slots in dr_mcontext_t.
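+ * On AArch64 this totals 32 + 16 + 1 = 49 slots.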
*/ +# define MCXT_NUM_SIMD_SLOTS \ + (MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS + MCXT_NUM_FFR_SLOTS) # else -# define MCXT_NUM_SIMD_SLOTS \ - 16 /**< Number of 128-bit SIMD Vn slots in dr_mcontext_t \ */ +# define MCXT_NUM_SIMD_SLOTS \ + 16 /**< Number of 128-bit SIMD Vn slots in dr_mcontext_t. \ */ +/* 32-bit ARM does not have these slots, but they are defined for compatibility. + */ +# define MCXT_NUM_SVEP_SLOTS 0 +# define MCXT_NUM_FFR_SLOTS 0 # endif -# define PRE_SIMD_PADDING \ - 0 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots \ */ +# define PRE_SIMD_PADDING \ + 0 /**< Bytes of padding before xmm/ymm dr_mcontext_t slots. \ */ # define MCXT_NUM_OPMASK_SLOTS \ 0 /**< Number of 16-64-bit OpMask Kn slots in dr_mcontext_t, \ diff --git a/core/lib/mcxtx_api.h b/core/lib/mcxtx_api.h index d7f36cd2a6f..e02543783a0 100644 --- a/core/lib/mcxtx_api.h +++ b/core/lib/mcxtx_api.h @@ -129,13 +129,36 @@ uint cpsr; /**< The current program status registers in AArch32. */ }; /**< The anonymous union of alternative names for apsr/cpsr register. */ # endif /* 64/32-bit */ + +# ifdef X64 /* 64-bit */ + /** + * The Arm AArch64 SIMD (DR_REG_Q0->DR_REG_Q31) and Scalable Vector + * Extension (SVE) vector registers (DR_REG_Z0->DR_REG_Z31). + */ + dr_simd_t simd[MCXT_NUM_SIMD_SVE_SLOTS]; + /** + * The Arm AArch64 Scalable Vector Extension (SVE) predicate registers + * DR_REG_P0 to DR_REG_P15. + */ + dr_simd_t svep[MCXT_NUM_SVEP_SLOTS]; + /** + * The Arm AArch64 Scalable Vector Extension (SVE) first-fault register + * DR_REG_FFR, used by SVE vector load instructions. + */ + dr_simd_t ffr; +# else + /* + * For the Arm AArch32 SIMD registers, we would probably be ok if we did + * not preserve the callee-saved registers (q4-q7 == d8-d15) but to be safe + * we preserve them all. We do not need anything more than word alignment + * for OP_vldm/OP_vstm, and dr_simd_t has no fields larger than 32 bits, so + * we have no padding. + */ /** - * The SIMD registers. We would probably be ok if we did not preserve the - * callee-saved registers (q4-q7 == d8-d15) but to be safe we preserve them - * all. We do not need anything more than word alignment for OP_vldm/OP_vstm, - * and dr_simd_t has no fields larger than 32 bits, so we have no padding. + * The Arm AArch32 SIMD registers. */ dr_simd_t simd[MCXT_NUM_SIMD_SLOTS]; +# endif #elif defined(X86) /* Our inlined ibl uses eax-edx, so we place them together to fit * on the same 32-byte cache line; yet we also want to simplify diff --git a/core/unix/include/sigcontext.h b/core/unix/include/sigcontext.h index ba0837af6a8..b4acbbdcfb0 100644 --- a/core/unix/include/sigcontext.h +++ b/core/unix/include/sigcontext.h @@ -357,6 +357,19 @@ struct fpsimd_context { __u32 fpcr; __uint128_t vregs[32]; }; + +/* TODO i#5365: Storage of sve_context in kernel_sigcontext_t.__reserved, see + * above. See also sigcontext_to_mcontext_simd() and + * mcontext_to_sigcontext_simd(). + */ + +# define SVE_MAGIC 0x53564501 + +struct sve_context { + struct _aarch64_ctx head; + __u16 vl; + __u16 __reserved[3]; +}; # endif #endif /* AARCH64 */ diff --git a/core/unix/os.c b/core/unix/os.c index ed7981d64b5..d5133bf1646 100644 --- a/core/unix/os.c +++ b/core/unix/os.c @@ -4039,6 +4039,12 @@ client_thread_run(void) dcontext_t *dcontext; byte *xsp; GET_STACK_PTR(xsp); +# ifdef AARCH64 + /* AArch64's Scalable Vector Extension (SVE) requires more space on the + * stack. Align to a page boundary, matching the alignment assumed in get_clone_record().
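+ * E.g., with 4KiB pages an xsp of 0x7f1234567e58 aligns back to 0x7f1234567000.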
+ */ + xsp = (app_pc)ALIGN_BACKWARD(xsp, PAGE_SIZE); +# endif void *crec = get_clone_record((reg_t)xsp); /* i#2335: we support setup separate from start, and we want to allow a client * to create a client thread during init, but we do not support that thread diff --git a/core/unix/signal.c b/core/unix/signal.c index 6ff6a40269d..0cbc9433721 100644 --- a/core/unix/signal.c +++ b/core/unix/signal.c @@ -909,8 +909,8 @@ set_clone_record_fields(void *record, reg_t app_thread_xsp, app_pc continuation_ * * CAUTION: don't use a lot of stack in this routine as it gets invoked on the * dstack from new_thread_setup - this is because this routine assumes - * no more than a page of dstack has been used so far since the clone - * system call was done. + * no more than a page of dstack for X86 and 2 pages of dstack for + * AArch64 have been used so far since the clone system call was done. */ void * get_clone_record(reg_t xsp) @@ -924,14 +924,20 @@ get_clone_record(reg_t xsp) /* The (size of the clone record + * stack used by new_thread_start (only for setting up priv_mcontext_t) + * stack used by new_thread_setup before calling get_clone_record()) - * is less than a page. This is verified by the assert below. If it does - * exceed a page, it won't happen at random during runtime, but in a - * predictable way during development, which will be caught by the assert. - * The current usage is about 800 bytes for clone_record + - * sizeof(priv_mcontext_t) + few words in new_thread_setup before - * get_clone_record() is called. + * is less than a page for X86 and 2 pages for AArch64. This is verified by + * the assert below. Exceeding that limit won't happen at random during + * runtime, but in a predictable way during development, where the assert + * will catch it. + * + * The current usage is about 800 bytes (X86) or 1920 bytes (AArch64) for + * the clone_record + sizeof(priv_mcontext_t) + a few words in + * new_thread_setup before get_clone_record() is called. */ +#ifdef AARCH64 + dstack_base = (byte *)ALIGN_FORWARD(xsp, PAGE_SIZE) + PAGE_SIZE; +#else dstack_base = (byte *)ALIGN_FORWARD(xsp, PAGE_SIZE); +#endif record = (clone_record_t *)(dstack_base - sizeof(clone_record_t)); /* dstack_base and the dstack in the clone record should be the same. */ diff --git a/core/unix/signal_linux_aarch64.c b/core/unix/signal_linux_aarch64.c index e44558247c0..585365930a2 100644 --- a/core/unix/signal_linux_aarch64.c +++ b/core/unix/signal_linux_aarch64.c @@ -76,8 +76,11 @@ sigcontext_to_mcontext_simd(priv_mcontext_t *mc, sig_full_cxt_t *sc_full) ASSERT(fpc->head.size == sizeof(struct fpsimd_context)); mc->fpsr = fpc->fpsr; mc->fpcr = fpc->fpcr; - ASSERT(sizeof(mc->simd) == sizeof(fpc->vregs)); + ASSERT((sizeof(mc->simd->q) * MCXT_NUM_SIMD_SVE_SLOTS) == sizeof(fpc->vregs)); memcpy(&mc->simd, &fpc->vregs, sizeof(mc->simd)); + /* TODO i#5365: memcpy(&mc->simd->u32,...) + * See also sve_context in core/unix/include/sigcontext.h. + */ } void @@ -91,8 +94,11 @@ mcontext_to_sigcontext_simd(sig_full_cxt_t *sc_full, priv_mcontext_t *mc) fpc->head.size = sizeof(struct fpsimd_context); fpc->fpsr = mc->fpsr; fpc->fpcr = mc->fpcr; - ASSERT(sizeof(fpc->vregs) == sizeof(mc->simd)); + ASSERT(sizeof(fpc->vregs) == (sizeof(mc->simd->q) * MCXT_NUM_SIMD_SVE_SLOTS)); memcpy(&fpc->vregs, &mc->simd, sizeof(fpc->vregs)); + /* TODO i#5365: memcpy(..., &mc->simd->u32) + * See also sve_context in core/unix/include/sigcontext.h.
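+ * A full SVE-aware copy would move proc_get_vector_length_bytes() bytes per Zn via the u32 field rather than the 16-byte vregs entries.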
+ */ next->magic = 0; next->size = 0; } diff --git a/ext/drstatecmp/drstatecmp.c b/ext/drstatecmp/drstatecmp.c index 8f3a01fca1a..a1fea81071f 100644 --- a/ext/drstatecmp/drstatecmp.c +++ b/ext/drstatecmp/drstatecmp.c @@ -483,7 +483,8 @@ drstatecmp_check_simd_value #elif defined(AARCHXX) (void *tag, dr_simd_t *value, dr_simd_t *expected) { - if (memcmp(value, expected, sizeof(dr_simd_t))) + size_t vl = proc_get_vector_length_bytes(); + if (memcmp(value, expected, vl)) drstatecmp_report_error("SIMD mismatch", tag); } #elif defined(RISCV64) @@ -616,7 +617,11 @@ drstatecmp_check_machine_state(dr_mcontext_t *mc_instrumented, dr_mcontext_t *mc #endif drstatecmp_check_gpr_value("xsp", tag, mc_instrumented->xsp, mc_expected->xsp); +#ifdef AARCH64 + for (int i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { +#else for (int i = 0; i < MCXT_NUM_SIMD_SLOTS; i++) { +#endif drstatecmp_check_simd_value(tag, &mc_instrumented->simd[i], &mc_expected->simd[i]); } diff --git a/suite/runsuite_wrapper.pl b/suite/runsuite_wrapper.pl index 7d8d8df958e..eb9485d530d 100755 --- a/suite/runsuite_wrapper.pl +++ b/suite/runsuite_wrapper.pl @@ -49,6 +49,7 @@ my $mydir = dirname(abs_path($0)); my $is_CI = 0; my $is_aarchxx = $Config{archname} =~ /(aarch64)|(arm)/; +my $is_x86_64 = $Config{archname} =~ /x86_64/; my $is_long = $ENV{'CI_TRIGGER'} eq 'push' && $ENV{'CI_BRANCH'} eq 'refs/heads/master'; # Forward args to runsuite.cmake: @@ -348,6 +349,13 @@ } else { $issue_no = "#2417"; } + } elsif ($is_x86_64 && ($ENV{'DYNAMORIO_CROSS_AARCHXX_LINUX_ONLY'} eq 'yes') && $args =~ /64_only/) { + # These AArch64 cross-compiled tests fail on x86-64 QEMU but pass + # on native AArch64 hardware. + $ignore_failures_64{'code_api|client.drx_buf-test'} = 1; + $ignore_failures_64{'code_api|sample.memval_simple'} = 1; + $ignore_failures_64{'code_api|client.drreg-test'} = 1; + $issue_no = "#6260"; } elsif ($^O eq 'darwin') { %ignore_failures_32 = ('code_api|common.decode-bad' => 1, # i#3127 'code_api|linux.signal0000' => 1, # i#3127 diff --git a/suite/tests/api/dis-a64.c b/suite/tests/api/dis-a64.c index f673a259524..762feaf70ff 100644 --- a/suite/tests/api/dis-a64.c +++ b/suite/tests/api/dis-a64.c @@ -35,6 +35,7 @@ #include "configure.h" #include "dr_api.h" + #include #include #include @@ -260,6 +261,8 @@ main(int argc, char *argv[]) return 0; } + enable_all_test_cpu_features(); + if (strcmp(argv[1], "-d") == 0) { run_decode(dc, argv[2]); dr_standalone_exit(); diff --git a/suite/tests/api/ir_aarch64_sve.c b/suite/tests/api/ir_aarch64_sve.c index 688965f985f..81d42f0e009 100644 --- a/suite/tests/api/ir_aarch64_sve.c +++ b/suite/tests/api/ir_aarch64_sve.c @@ -20540,6 +20540,8 @@ main(int argc, char *argv[]) bool test_result; instr_t *instr; + enable_all_test_cpu_features(); + RUN_INSTR_TEST(add_sve_pred); RUN_INSTR_TEST(add_sve_shift); RUN_INSTR_TEST(add_sve); diff --git a/suite/tests/api/ir_aarch64_sve2.c b/suite/tests/api/ir_aarch64_sve2.c index 8fb7d185369..56c9810de66 100644 --- a/suite/tests/api/ir_aarch64_sve2.c +++ b/suite/tests/api/ir_aarch64_sve2.c @@ -8371,6 +8371,8 @@ main(int argc, char *argv[]) bool test_result; instr_t *instr; + enable_all_test_cpu_features(); + RUN_INSTR_TEST(aesd_sve); RUN_INSTR_TEST(aese_sve); RUN_INSTR_TEST(bcax_sve); diff --git a/suite/tests/api/ir_aarch64_v81.c b/suite/tests/api/ir_aarch64_v81.c index bd58f89ad1f..4d9485c2da6 100644 --- a/suite/tests/api/ir_aarch64_v81.c +++ b/suite/tests/api/ir_aarch64_v81.c @@ -362,6 +362,8 @@ main(int argc, char *argv[]) bool test_result; instr_t *instr; + 
enable_all_test_cpu_features(); + RUN_INSTR_TEST(sqrdmlsh_scalar); RUN_INSTR_TEST(sqrdmlsh_scalar_idx); RUN_INSTR_TEST(sqrdmlsh_vector); diff --git a/suite/tests/api/ir_aarch64_v82.c b/suite/tests/api/ir_aarch64_v82.c index 50d33d88116..77876dc4864 100644 --- a/suite/tests/api/ir_aarch64_v82.c +++ b/suite/tests/api/ir_aarch64_v82.c @@ -5583,6 +5583,8 @@ main(int argc, char *argv[]) bool test_result; instr_t *instr; + enable_all_test_cpu_features(); + RUN_INSTR_TEST(fcvtas_vector); RUN_INSTR_TEST(fcvtas_scalar); RUN_INSTR_TEST(fcvtau_vector); diff --git a/suite/tests/api/ir_aarch64_v83.c b/suite/tests/api/ir_aarch64_v83.c index e53b8fd1d74..64b029ce71d 100644 --- a/suite/tests/api/ir_aarch64_v83.c +++ b/suite/tests/api/ir_aarch64_v83.c @@ -616,6 +616,8 @@ main(int argc, char *argv[]) bool test_result; instr_t *instr; + enable_all_test_cpu_features(); + RUN_INSTR_TEST(fcadd_vector); RUN_INSTR_TEST(fcmla_vector); RUN_INSTR_TEST(fcmla_vector_idx); diff --git a/suite/tests/api/ir_aarch64_v84.c b/suite/tests/api/ir_aarch64_v84.c index 516237371bf..b4b310106b5 100644 --- a/suite/tests/api/ir_aarch64_v84.c +++ b/suite/tests/api/ir_aarch64_v84.c @@ -283,6 +283,8 @@ main(int argc, char *argv[]) bool test_result; instr_t *instr; + enable_all_test_cpu_features(); + /* ARMv8.4-RCPC */ RUN_INSTR_TEST(ldapur); RUN_INSTR_TEST(ldapurb); diff --git a/suite/tests/api/ir_aarch64_v86.c b/suite/tests/api/ir_aarch64_v86.c index 46e6905807e..d8616343ff7 100644 --- a/suite/tests/api/ir_aarch64_v86.c +++ b/suite/tests/api/ir_aarch64_v86.c @@ -698,6 +698,8 @@ main(int argc, char *argv[]) bool test_result; instr_t *instr; + enable_all_test_cpu_features(); + RUN_INSTR_TEST(bfcvt); RUN_INSTR_TEST(bfcvtn2_vector); RUN_INSTR_TEST(bfcvtn_vector); diff --git a/suite/tests/api/opnd-a64.c b/suite/tests/api/opnd-a64.c index f07a2b73bac..5278196e87f 100644 --- a/suite/tests/api/opnd-a64.c +++ b/suite/tests/api/opnd-a64.c @@ -59,18 +59,37 @@ test_get_size() // Check sizes of FP/SIMD regs. for (int i = 0; i < proc_num_simd_registers(); i++) { - ASSERT(reg_get_size((reg_id_t)DR_REG_H0 + i) == OPSZ_2); - ASSERT(reg_get_size((reg_id_t)DR_REG_S0 + i) == OPSZ_4); - ASSERT(reg_get_size((reg_id_t)DR_REG_D0 + i) == OPSZ_8); - ASSERT(reg_get_size((reg_id_t)DR_REG_Q0 + i) == OPSZ_16); + if (i < MCXT_NUM_SIMD_SVE_SLOTS) { + ASSERT(reg_get_size((reg_id_t)DR_REG_H0 + i) == OPSZ_2); + ASSERT(reg_get_size((reg_id_t)DR_REG_S0 + i) == OPSZ_4); + ASSERT(reg_get_size((reg_id_t)DR_REG_D0 + i) == OPSZ_8); + ASSERT(reg_get_size((reg_id_t)DR_REG_Q0 + i) == OPSZ_16); + } } - // Check sizes of SVE vector regs. + opnd_size_t opsz_vl = OPSZ_NA; + if (proc_has_feature(FEATURE_SVE)) { + /* Check sizes of the SVE vector and predicate registers. Read the vector + * length directly from the hardware and compare it with the OPSZ_ value + * reg_get_size() returns. + */ + uint64 vl; + /* Read the vector length from the SVE hardware. */ + asm(".inst 0x04bf5020\n" /* rdvl x0, #1 */ + "mov %0, x0" + : "=r"(vl) + : + : "x0"); + opsz_vl = opnd_size_from_bytes(vl); + } else { + /* Expect reg_get_size() to report the default 256-bit vector length on + * non-SVE hardware. */ + opsz_vl = OPSZ_32; + } for (uint i = 0; i < 32; i++) { - ASSERT(reg_get_size((reg_id_t)DR_REG_Z0 + i) == OPSZ_SCALABLE); + ASSERT(reg_get_size((reg_id_t)DR_REG_Z0 + i) == opsz_vl); } - // Check sizes of SVE predicate regs. + /* TODO i#5365: Check sizes of SVE predicate regs.
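+ * Each Pn holds one bit per vector byte, i.e. the vector length in bytes divided by 8 (8 bytes at a 512-bit VL).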
*/ for (uint i = 0; i < 16; i++) { ASSERT(reg_get_size((reg_id_t)DR_REG_P0 + i) == OPSZ_SCALABLE_PRED); } @@ -287,6 +306,12 @@ test_opnd_invert_immed_int() int main(int argc, char *argv[]) { + /* Required for proc_init() -> proc_init_arch() to establish the vector + * length on SVE h/w. This is validated against the direct read of the + * vector length using the SVE RDVL instruction in test_get_size() above. + */ + dr_standalone_init(); + test_get_size(); test_opnd_compute_address(); diff --git a/suite/tests/client-interface/cleancall-opt-shared.h b/suite/tests/client-interface/cleancall-opt-shared.h index de2595ceb6e..6d2fc746a7b 100644 --- a/suite/tests/client-interface/cleancall-opt-shared.h +++ b/suite/tests/client-interface/cleancall-opt-shared.h @@ -286,8 +286,17 @@ mcontexts_equal(dr_mcontext_t *mc_a, dr_mcontext_t *mc_b, int func_index) return false; } #elif defined(AARCH64) - for (i = 0; i < proc_num_simd_registers(); i++) { - if (memcmp(&mc_a->simd[i], &mc_b->simd[i], sizeof(dr_simd_t)) != 0) + size_t vl = proc_get_vector_length_bytes(); + for (i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++) { + if (memcmp(&mc_a->simd[i], &mc_b->simd[i], vl) != 0) + return false; + } + if (proc_has_feature(FEATURE_SVE)) { + for (i = 0; i < MCXT_NUM_SVEP_SLOTS; i++) { + if (memcmp(&mc_a->svep[i], &mc_b->svep[i], vl / 8) != 0) + return false; + } + if (memcmp(&mc_a->ffr, &mc_b->ffr, vl / 8) != 0) return false; } #endif @@ -312,7 +321,11 @@ dump_diff_mcontexts(void) after_reg, diff_str); } +#ifdef X86 dr_fprintf(STDERR, "Printing XMM regs:\n"); +#elif defined(AARCH64) + dr_fprintf(STDERR, "Printing SIMD/SVE regs:\n"); +#endif /* XXX i#1312: check if test can get extended to AVX-512. */ for (i = 0; i < proc_num_simd_registers(); i++) { #ifdef X86 @@ -340,12 +353,27 @@ dump_diff_mcontexts(void) after_reg.u32[6], after_reg.u32[7]); } #elif defined(AARCH64) - dr_simd_t before_reg = before_mcontext.simd[i]; - dr_simd_t after_reg = after_mcontext.simd[i]; - size_t mmsz = sizeof(dr_simd_t); + const size_t mmsz = proc_get_vector_length_bytes(); + dr_simd_t before_reg, after_reg; + char reg_name[4]; + if (i >= (MCXT_NUM_SIMD_SVE_SLOTS + MCXT_NUM_SVEP_SLOTS)) { + strcpy(reg_name, "FFR"); + before_reg = before_mcontext.ffr; + after_reg = after_mcontext.ffr; + } else if (i >= MCXT_NUM_SIMD_SVE_SLOTS) { + dr_snprintf(reg_name, 4, "P%2d", i - MCXT_NUM_SIMD_SVE_SLOTS); + before_reg = before_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS]; + after_reg = after_mcontext.svep[i - MCXT_NUM_SIMD_SVE_SLOTS]; + } else { + dr_snprintf(reg_name, 4, "Z%2d", i); + before_reg = before_mcontext.simd[i]; + after_reg = after_mcontext.simd[i]; + } + const char *diff_str = (memcmp(&before_reg, &after_reg, mmsz) == 0 ? "" : " <- DIFFERS"); + - dr_fprintf(STDERR, "xmm%2d before: %08x%08x%08x%08x", i, before_reg.u32[0], + dr_fprintf(STDERR, "%s before: %08x%08x%08x%08x", reg_name, before_reg.u32[0], before_reg.u32[1], before_reg.u32[2], before_reg.u32[3]); dr_fprintf(STDERR, " after: %08x%08x%08x%08x", after_reg.u32[0], after_reg.u32[1], after_reg.u32[2], after_reg.u32[3]);
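Taken together, the renamed vector-length query and the widened dr_simd_t define how a client walks the new SVE state in dr_mcontext_t. A minimal sketch against the AArch64 (X64) build configured above; show_sve_mcontext is a hypothetical helper for illustration, not part of this patch:

#include "dr_api.h"

/* Hypothetical helper: print the current vector length and the low word of
 * each SVE register slot. Assumes an AArch64 build where dr_mcontext_t has
 * the new simd[], svep[] and ffr fields and the context was filled in with
 * DR_MC_MULTIMEDIA set in its flags.
 */
static void
show_sve_mcontext(dr_mcontext_t *mc)
{
    int vl_bits = dr_get_sve_vector_length(); /* Renamed from dr_get_sve_vl(). */
    size_t vl_bytes = proc_get_vector_length_bytes();
    dr_printf("VL = %d bits (%d bytes)\n", vl_bits, (int)vl_bytes);
    /* Z registers occupy the first MCXT_NUM_SIMD_SVE_SLOTS slots. */
    for (int i = 0; i < MCXT_NUM_SIMD_SVE_SLOTS; i++)
        dr_printf("z%d[31:0] = 0x%08x\n", i, mc->simd[i].u32[0]);
    /* Predicate registers hold one bit per vector byte: vl_bytes / 8 each. */
    for (int i = 0; i < MCXT_NUM_SVEP_SLOTS; i++)
        dr_printf("p%d[31:0] = 0x%08x\n", i, mc->svep[i].u32[0]);
    dr_printf("ffr[31:0] = 0x%08x\n", mc->ffr.u32[0]);
}

This is the same per-slot walk that mcontexts_equal() performs above, comparing vl bytes per Zn slot and vl / 8 bytes per predicate and FFR slot.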