Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the heuristics for L3 cache size for Arm64 #71029

Merged
merged 13 commits into from
Jun 22, 2022
97 changes: 73 additions & 24 deletions src/coreclr/gc/unix/gcenv.unix.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -876,21 +876,29 @@ bool ReadMemoryValueFromFile(const char* filename, uint64_t* val)
return result;
}

// Keeps track of the largest cache seen so far: when `size` is greater than `cacheSize`,
// stores `size` into `cacheSize` and records CACHE_LEVEL in `cacheLevel`.
// NOTE: `size`, `cacheSize` and `cacheLevel` are not macro parameters; they must be variables
// in scope at the point of expansion. The expansion is a bare `if` statement, so an `else`
// placed directly after an expansion would bind to this `if`.
#define UPDATE_CACHE_SIZE_AND_LEVEL(CACHE_LEVEL) if (size > cacheSize) { cacheSize = size; cacheLevel = CACHE_LEVEL; }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One last nit - please feel free to do that later if you want to get this in before today's snap for preview 6.
Looking at the usages of this macro, I have realized it would be great to make the size a parameter of the macro too. From the usage sites, it is not obvious where it gets the size from (I have to read the macro definition to figure it out).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I will do it in a follow-up PR.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


static size_t GetLogicalProcessorCacheSizeFromOS()
{
size_t cacheLevel = 0;
size_t cacheSize = 0;
size_t size;

#ifdef _SC_LEVEL1_DCACHE_SIZE
cacheSize = std::max(cacheSize, ( size_t) sysconf(_SC_LEVEL1_DCACHE_SIZE));
size = ( size_t) sysconf(_SC_LEVEL1_DCACHE_SIZE);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q (also applies to the old src) - what does sysconf return if there's no cache of that level? I presume it returns 0, not -1 (the doc says it returns -1 if there's an error).

UPDATE_CACHE_SIZE_AND_LEVEL(1)
#endif
#ifdef _SC_LEVEL2_CACHE_SIZE
cacheSize = std::max(cacheSize, ( size_t) sysconf(_SC_LEVEL2_CACHE_SIZE));
size = ( size_t) sysconf(_SC_LEVEL2_CACHE_SIZE);
UPDATE_CACHE_SIZE_AND_LEVEL(2)
#endif
#ifdef _SC_LEVEL3_CACHE_SIZE
cacheSize = std::max(cacheSize, ( size_t) sysconf(_SC_LEVEL3_CACHE_SIZE));
size = ( size_t) sysconf(_SC_LEVEL3_CACHE_SIZE);
UPDATE_CACHE_SIZE_AND_LEVEL(3)
#endif
#ifdef _SC_LEVEL4_CACHE_SIZE
cacheSize = std::max(cacheSize, ( size_t) sysconf(_SC_LEVEL4_CACHE_SIZE));
size = ( size_t) sysconf(_SC_LEVEL4_CACHE_SIZE);
UPDATE_CACHE_SIZE_AND_LEVEL(4)
#endif

#if defined(TARGET_LINUX) && !defined(HOST_ARM) && !defined(HOST_X86)
Expand All @@ -901,25 +909,39 @@ static size_t GetLogicalProcessorCacheSizeFromOS()
// for the platform. Currently musl and arm64 should be only cases to use
// this method to determine cache size.
//
size_t size;

if (ReadMemoryValueFromFile("/sys/devices/system/cpu/cpu0/cache/index0/size", &size))
cacheSize = std::max(cacheSize, size);
if (ReadMemoryValueFromFile("/sys/devices/system/cpu/cpu0/cache/index1/size", &size))
cacheSize = std::max(cacheSize, size);
if (ReadMemoryValueFromFile("/sys/devices/system/cpu/cpu0/cache/index2/size", &size))
cacheSize = std::max(cacheSize, size);
if (ReadMemoryValueFromFile("/sys/devices/system/cpu/cpu0/cache/index3/size", &size))
cacheSize = std::max(cacheSize, size);
if (ReadMemoryValueFromFile("/sys/devices/system/cpu/cpu0/cache/index4/size", &size))
cacheSize = std::max(cacheSize, size);
size_t level;
char path_to_size_file[] = "/sys/devices/system/cpu/cpu0/cache/index-/size";
char path_to_level_file[] = "/sys/devices/system/cpu/cpu0/cache/index-/level";
int index = 40;
assert(path_to_size_file[index] == '-');
assert(path_to_level_file[index] == '-');

for (int i = 0; i < 5; i++)
{
path_to_size_file[index] = (char)(48 + i);

if (ReadMemoryValueFromFile(path_to_size_file, &size))
{
path_to_level_file[index] = (char)(48 + i);

if (ReadMemoryValueFromFile(path_to_level_file, &level))
{
UPDATE_CACHE_SIZE_AND_LEVEL(level)
}
else
{
cacheSize = std::max(cacheSize, size);
}
}
}
}
#endif

#if defined(HOST_ARM64) && !defined(TARGET_OSX)
#if (defined(HOST_ARM64) || defined(HOST_LOONGARCH64)) && !defined(TARGET_OSX)
if (cacheSize == 0)
{
// It is currently expected to be missing cache size info
// We expect to get the L3 cache size for Arm64, but that info is currently expected to be missing
// from most of the machines.
//
// _SC_LEVEL*_*CACHE_SIZE is not yet present. Work is in progress to enable this for arm64
//
Expand Down Expand Up @@ -964,6 +986,38 @@ static size_t GetLogicalProcessorCacheSizeFromOS()
}
#endif

#if (defined(HOST_ARM64) || defined(HOST_LOONGARCH64)) && !defined(TARGET_OSX)
if (cacheLevel != 3)
{
// We expect to get the L3 cache size for Arm64, but that info is currently expected to be missing
// from most of the machines.
// Hence, just use the following heuristics at best depending on the CPU count
// 1 ~ 4 : 4 MB
// 5 ~ 16 : 8 MB
// 17 ~ 64 : 16 MB
// 65+ : 32 MB
DWORD logicalCPUs = g_totalCpuCount;
if (logicalCPUs < 5)
{
cacheSize = 4;
}
else if (logicalCPUs < 17)
{
cacheSize = 8;
}
else if (logicalCPUs < 65)
{
cacheSize = 16;
}
else
{
cacheSize = 32;
}

cacheSize *= (1024 * 1024);
}
#endif

return cacheSize;
}

Expand Down Expand Up @@ -1037,15 +1091,10 @@ size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)
size_t maxSize, maxTrueSize;
maxSize = maxTrueSize = GetLogicalProcessorCacheSizeFromOS(); // Returns the size of the highest level processor cache

#if defined(HOST_ARM64)
// Bigger gen0 size helps arm64 targets
maxSize = maxTrueSize * 3;
#endif

s_maxSize = maxSize;
s_maxTrueSize = maxTrueSize;

// printf("GetCacheSizePerLogicalCpu returns %d, adjusted size %d\n", maxSize, maxTrueSize);
// printf("GetCacheSizePerLogicalCpu returns %zu, adjusted size %zu\n", maxSize, maxTrueSize);
return trueSize ? maxTrueSize : maxSize;
}

Expand Down
48 changes: 41 additions & 7 deletions src/coreclr/gc/windows/gcenv.windows.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,8 @@ SYSTEM_LOGICAL_PROCESSOR_INFORMATION *GetLPI(PDWORD nEntries)
size_t GetLogicalProcessorCacheSizeFromOS()
{
size_t cache_size = 0;
size_t cache_level = 0;

DWORD nEntries = 0;

// Try to use GetLogicalProcessorInformation API and get a valid pointer to the SLPI array if successful. Returns NULL
Expand All @@ -424,7 +426,11 @@ size_t GetLogicalProcessorCacheSizeFromOS()
{
if (pslpi[i].Relationship == RelationCache)
{
last_cache_size = max(last_cache_size, pslpi[i].Cache.Size);
if (last_cache_size < pslpi[i].Cache.Size)
{
last_cache_size = pslpi[i].Cache.Size;
cache_level = pslpi[i].Cache.Level;
}
}
}
cache_size = last_cache_size;
Expand All @@ -434,6 +440,39 @@ size_t GetLogicalProcessorCacheSizeFromOS()
if(pslpi)
delete[] pslpi; // release the memory allocated for the SLPI array.

#if defined(TARGET_ARM64)
if (cache_level != 3)
{
uint32_t totalCPUCount = GCToOSInterface::GetTotalProcessorCount();

// We expect to get the L3 cache size for Arm64, but that info is currently expected to be missing
// from most of the machines.
// Hence, just use the following heuristics at best depending on the CPU count
// 1 ~ 4 : 4 MB
// 5 ~ 16 : 8 MB
// 17 ~ 64 : 16 MB
// 65+ : 32 MB
if (totalCPUCount < 5)
kunalspathak marked this conversation as resolved.
Show resolved Hide resolved
{
cache_size = 4;
}
else if (totalCPUCount < 17)
{
cache_size = 8;
}
else if (totalCPUCount < 65)
{
cache_size = 16;
}
else
{
cache_size = 32;
}

cache_size *= (1024 * 1024);
}
#endif // TARGET_ARM64

return cache_size;
}

Expand Down Expand Up @@ -836,15 +875,10 @@ size_t GCToOSInterface::GetCacheSizePerLogicalCpu(bool trueSize)

maxSize = maxTrueSize = GetLogicalProcessorCacheSizeFromOS() ; // Returns the size of the highest level processor cache

#if defined(TARGET_ARM64)
// Bigger gen0 size helps arm64 targets
maxSize = maxTrueSize * 3;
#endif

s_maxSize = maxSize;
s_maxTrueSize = maxTrueSize;

// printf("GetCacheSizePerLogicalCpu returns %d, adjusted size %d\n", maxSize, maxTrueSize);
// printf("GetCacheSizePerLogicalCpu returns %zu, adjusted size %zu\n", maxSize, maxTrueSize);
return trueSize ? maxTrueSize : maxSize;
}

Expand Down
91 changes: 72 additions & 19 deletions src/coreclr/pal/src/misc/sysinfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,23 +539,31 @@ ReadMemoryValueFromFile(const char* filename, uint64_t* val)
return result;
}

// Keeps track of the largest cache seen so far: when `size` is greater than `cacheSize`,
// stores `size` into `cacheSize` and records CACHE_LEVEL in `cacheLevel`.
// NOTE: `size`, `cacheSize` and `cacheLevel` are not macro parameters; they must be variables
// in scope at the point of expansion. The expansion is a bare `if` statement, so an `else`
// placed directly after an expansion would bind to this `if`.
#define UPDATE_CACHE_SIZE_AND_LEVEL(CACHE_LEVEL) if (size > cacheSize) { cacheSize = size; cacheLevel = CACHE_LEVEL; }

size_t
PALAPI
PAL_GetLogicalProcessorCacheSizeFromOS()
{
size_t cacheLevel = 0;
size_t cacheSize = 0;
size_t size;

#ifdef _SC_LEVEL1_DCACHE_SIZE
cacheSize = std::max(cacheSize, (size_t)sysconf(_SC_LEVEL1_DCACHE_SIZE));
size = ( size_t) sysconf(_SC_LEVEL1_DCACHE_SIZE);
UPDATE_CACHE_SIZE_AND_LEVEL(1)
#endif
#ifdef _SC_LEVEL2_CACHE_SIZE
cacheSize = std::max(cacheSize, (size_t)sysconf(_SC_LEVEL2_CACHE_SIZE));
size = ( size_t) sysconf(_SC_LEVEL2_CACHE_SIZE);
UPDATE_CACHE_SIZE_AND_LEVEL(2)
#endif
#ifdef _SC_LEVEL3_CACHE_SIZE
cacheSize = std::max(cacheSize, (size_t)sysconf(_SC_LEVEL3_CACHE_SIZE));
size = ( size_t) sysconf(_SC_LEVEL3_CACHE_SIZE);
UPDATE_CACHE_SIZE_AND_LEVEL(3)
#endif
#ifdef _SC_LEVEL4_CACHE_SIZE
cacheSize = std::max(cacheSize, (size_t)sysconf(_SC_LEVEL4_CACHE_SIZE));
size = ( size_t) sysconf(_SC_LEVEL4_CACHE_SIZE);
UPDATE_CACHE_SIZE_AND_LEVEL(4)
#endif

#if defined(TARGET_LINUX) && !defined(HOST_ARM) && !defined(HOST_X86)
Expand All @@ -566,25 +574,39 @@ PAL_GetLogicalProcessorCacheSizeFromOS()
// for the platform. Currently musl and arm64 should be only cases to use
// this method to determine cache size.
//
size_t size;

if(ReadMemoryValueFromFile("/sys/devices/system/cpu/cpu0/cache/index0/size", &size))
cacheSize = std::max(cacheSize, size);
if(ReadMemoryValueFromFile("/sys/devices/system/cpu/cpu0/cache/index1/size", &size))
cacheSize = std::max(cacheSize, size);
if(ReadMemoryValueFromFile("/sys/devices/system/cpu/cpu0/cache/index2/size", &size))
cacheSize = std::max(cacheSize, size);
if(ReadMemoryValueFromFile("/sys/devices/system/cpu/cpu0/cache/index3/size", &size))
cacheSize = std::max(cacheSize, size);
if(ReadMemoryValueFromFile("/sys/devices/system/cpu/cpu0/cache/index4/size", &size))
cacheSize = std::max(cacheSize, size);
size_t level;
char path_to_size_file[] = "/sys/devices/system/cpu/cpu0/cache/index-/size";
char path_to_level_file[] = "/sys/devices/system/cpu/cpu0/cache/index-/level";
int index = 40;
kunalspathak marked this conversation as resolved.
Show resolved Hide resolved
_ASSERTE(path_to_size_file[index] == '-');
_ASSERTE(path_to_level_file[index] == '-');

for (int i = 0; i < 5; i++)
{
path_to_size_file[index] = (char)(48 + i);

if (ReadMemoryValueFromFile(path_to_size_file, &size))
{
path_to_level_file[index] = (char)(48 + i);

if (ReadMemoryValueFromFile(path_to_level_file, &level))
{
UPDATE_CACHE_SIZE_AND_LEVEL(level)
}
else
{
cacheSize = std::max(cacheSize, size);
}
}
}
}
#endif

#if (defined(HOST_ARM64) || defined(HOST_LOONGARCH64)) && !defined(TARGET_OSX)
if (cacheSize == 0)
{
// It is currently expected to be missing cache size info
// We expect to get the L3 cache size for Arm64, but that info is currently expected to be missing
// from most of the machines, with exceptions on some machines.
//
// _SC_LEVEL*_*CACHE_SIZE is not yet present. Work is in progress to enable this for arm64
//
Expand Down Expand Up @@ -621,12 +643,43 @@ PAL_GetLogicalProcessorCacheSizeFromOS()
|| sysctlbyname("hw.l3cachesize", &cacheSizeFromSysctl, &sz, nullptr, 0) == 0
|| sysctlbyname("hw.l2cachesize", &cacheSizeFromSysctl, &sz, nullptr, 0) == 0
|| sysctlbyname("hw.l1dcachesize", &cacheSizeFromSysctl, &sz, nullptr, 0) == 0;

if (success)
{
_ASSERTE(cacheSizeFromSysctl > 0);
cacheSize = (size_t) cacheSizeFromSysctl;
cacheSize = ( size_t) cacheSizeFromSysctl;
}
}
#endif

#if (defined(HOST_ARM64) || defined(HOST_LOONGARCH64)) && !defined(TARGET_OSX)
if (cacheLevel != 3)
{
// We expect to get the L3 cache size for Arm64, but that info is currently expected to be missing
// from most of the machines.
// Hence, just use the following heuristics at best depending on the CPU count
// 1 ~ 4 : 4 MB
// 5 ~ 16 : 8 MB
// 17 ~ 64 : 16 MB
// 65+ : 32 MB
DWORD logicalCPUs = PAL_GetLogicalCpuCountFromOS();
if (logicalCPUs < 5)
{
cacheSize = 4;
}
else if (logicalCPUs < 17)
{
cacheSize = 8;
}
else if (logicalCPUs < 65)
{
cacheSize = 16;
}
else
{
cacheSize = 32;
}

cacheSize *= (1024 * 1024);
}
#endif

Expand Down
Loading