Skip to content

Commit

Permalink
GH-90997: Improve inline cache performance for MSVC (GH-96781)
Browse files Browse the repository at this point in the history
  • Loading branch information
brandtbucher committed Sep 15, 2022
1 parent 9f18147 commit a83fdf2
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 74 deletions.
93 changes: 19 additions & 74 deletions Include/internal/pycore_code.h
Original file line number Diff line number Diff line change
Expand Up @@ -285,110 +285,55 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void);
#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0)
#endif // !Py_STATS

// Cache values are only valid in memory, so use native endianness.
#ifdef WORDS_BIGENDIAN
// Utility functions for reading/writing 32/64-bit values in the inline caches.
// Great care should be taken to ensure that these functions remain correct and
// performant! They should compile to just "move" instructions on all supported
// compilers and platforms.

// We use memcpy to let the C compiler handle unaligned accesses and endianness
// issues for us. It also seems to produce better code than manual copying for
// most compilers (see https://blog.regehr.org/archives/959 for more info).

static inline void
write_u32(uint16_t *p, uint32_t val)
{
p[0] = (uint16_t)(val >> 16);
p[1] = (uint16_t)(val >> 0);
memcpy(p, &val, sizeof(val));
}

static inline void
write_u64(uint16_t *p, uint64_t val)
{
p[0] = (uint16_t)(val >> 48);
p[1] = (uint16_t)(val >> 32);
p[2] = (uint16_t)(val >> 16);
p[3] = (uint16_t)(val >> 0);
}

static inline uint32_t
read_u32(uint16_t *p)
{
uint32_t val = 0;
val |= (uint32_t)p[0] << 16;
val |= (uint32_t)p[1] << 0;
return val;
}

static inline uint64_t
read_u64(uint16_t *p)
{
uint64_t val = 0;
val |= (uint64_t)p[0] << 48;
val |= (uint64_t)p[1] << 32;
val |= (uint64_t)p[2] << 16;
val |= (uint64_t)p[3] << 0;
return val;
}

#else

static inline void
write_u32(uint16_t *p, uint32_t val)
{
p[0] = (uint16_t)(val >> 0);
p[1] = (uint16_t)(val >> 16);
memcpy(p, &val, sizeof(val));
}

static inline void
write_u64(uint16_t *p, uint64_t val)
write_obj(uint16_t *p, PyObject *val)
{
p[0] = (uint16_t)(val >> 0);
p[1] = (uint16_t)(val >> 16);
p[2] = (uint16_t)(val >> 32);
p[3] = (uint16_t)(val >> 48);
memcpy(p, &val, sizeof(val));
}

static inline uint32_t
read_u32(uint16_t *p)
{
uint32_t val = 0;
val |= (uint32_t)p[0] << 0;
val |= (uint32_t)p[1] << 16;
uint32_t val;
memcpy(&val, p, sizeof(val));
return val;
}

static inline uint64_t
read_u64(uint16_t *p)
{
uint64_t val = 0;
val |= (uint64_t)p[0] << 0;
val |= (uint64_t)p[1] << 16;
val |= (uint64_t)p[2] << 32;
val |= (uint64_t)p[3] << 48;
uint64_t val;
memcpy(&val, p, sizeof(val));
return val;
}

#endif

static inline void
write_obj(uint16_t *p, PyObject *obj)
{
uintptr_t val = (uintptr_t)obj;
#if SIZEOF_VOID_P == 8
write_u64(p, val);
#elif SIZEOF_VOID_P == 4
write_u32(p, val);
#else
#error "SIZEOF_VOID_P must be 4 or 8"
#endif
}

static inline PyObject *
read_obj(uint16_t *p)
{
uintptr_t val;
#if SIZEOF_VOID_P == 8
val = read_u64(p);
#elif SIZEOF_VOID_P == 4
val = read_u32(p);
#else
#error "SIZEOF_VOID_P must be 4 or 8"
#endif
return (PyObject *)val;
PyObject *val;
memcpy(&val, p, sizeof(val));
return val;
}

/* See Objects/exception_handling_notes.txt for details.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Improve the performance of reading and writing inline bytecode caches on
some platforms.

0 comments on commit a83fdf2

Please sign in to comment.