Skip to content

Commit

Permalink
Add large page support for NNUE weights and simplify TT mem management
Browse files Browse the repository at this point in the history
Use TT memory functions to allocate memory for the NNUE weights. This
should provide a small speed-up on systems where large pages are not
automatically used, including Windows and some Linux distributions.

Further, since we now have a wrapper for std::aligned_alloc(), we can
simplify the TT memory management a bit:

- We no longer need to store separate pointers to the hash table and
  its underlying memory allocation.
- We also get to merge the Linux-specific and default implementations
  of aligned_ttmem_alloc().

Finally, we'll enable the VirtualAlloc code path with large page
support also for Win32.

STC: https://tests.stockfishchess.org/tests/view/5f66595823a84a47b9036fba
LLR: 2.94 (-2.94,2.94) {-0.25,1.25}
Total: 14896 W: 1854 L: 1686 D: 11356
Ptnml(0-2): 65, 1224, 4742, 1312, 105

closes #3081

No functional change.
  • Loading branch information
skiminki authored and vondele committed Sep 21, 2020
1 parent 16b4578 commit 485d517
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 45 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ to find the best move. The classical evaluation computes this value as a functio
of various chess concepts, handcrafted by experts, tested and tuned using fishtest.
The NNUE evaluation computes this value with a neural network based on basic
inputs (e.g. piece positions only). The network is optimized and trained
on the evalutions of millions of positions at moderate search depth.
on the evaluations of millions of positions at moderate search depth.

The NNUE evaluation was first introduced in shogi, and ported to Stockfish afterward.
It can be evaluated efficiently on CPUs, and exploits the fact that only parts
Expand Down
57 changes: 25 additions & 32 deletions src/misc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,27 +357,11 @@ void std_aligned_free(void* ptr) {
#endif
}

/// aligned_ttmem_alloc() will return suitably aligned memory, if possible using large pages.
/// The returned pointer is the aligned one, while the mem argument is the one that needs
/// to be passed to free. With c++17 some of this functionality could be simplified.
/// aligned_large_pages_alloc() will return suitably aligned memory, if possible using large pages.

#if defined(__linux__) && !defined(__ANDROID__)

/// Allocates storage aligned to an assumed 2MB page boundary via posix_memalign()
/// and, when MADV_HUGEPAGE is defined, passes the block to madvise().
/// `mem` receives the same pointer that is returned; both are nullptr when
/// posix_memalign() reports an error.
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {

  constexpr size_t hugePageBytes = size_t(2) << 20; // assumed 2MB page sizes

  // Round the request up to a whole number of pages (hugePageBytes is a power of two).
  const size_t roundedBytes = (allocSize + hugePageBytes - 1) & ~(hugePageBytes - 1);

  const int rc = posix_memalign(&mem, hugePageBytes, roundedBytes);
  if (rc != 0)
      mem = nullptr;

#if defined(MADV_HUGEPAGE)
  // The advice is issued for the requested length, not the rounded one.
  madvise(mem, allocSize, MADV_HUGEPAGE);
#endif

  return mem;
}
#if defined(_WIN32)

#elif defined(_WIN64)

static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
static void* aligned_large_pages_alloc_win(size_t allocSize) {

HANDLE hProcessToken { };
LUID luid { };
Expand Down Expand Up @@ -422,12 +406,13 @@ static void* aligned_ttmem_alloc_large_pages(size_t allocSize) {
return mem;
}

void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
void* aligned_large_pages_alloc(size_t allocSize) {

static bool firstCall = true;
void* mem;

// Try to allocate large pages
mem = aligned_ttmem_alloc_large_pages(allocSize);
mem = aligned_large_pages_alloc_win(allocSize);

// Suppress info strings on the first call. The first call occurs before 'uci'
// is received and in that case this output confuses some GUIs.
Expand All @@ -449,23 +434,31 @@ void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {

#else

// NOTE(review): this span appears to interleave the before and after lines of
// one function (diff view without +/- markers): two signatures and two bodies
// follow each other. Confirm against the actual file before editing.
//
// Signature taking `mem` by reference: the body assigns the raw malloc()
// pointer to it and returns a separately computed aligned pointer.
void* aligned_ttmem_alloc(size_t allocSize, void*& mem) {
// Signature without `mem`: the body returns the allocation itself.
void* aligned_large_pages_alloc(size_t allocSize) {

// Alignment used by the std_aligned_alloc() body: 2MB when __linux__ is
// defined, 4096 bytes otherwise.
#if defined(__linux__)
constexpr size_t alignment = 2 * 1024 * 1024; // assumed 2MB page size
#else
constexpr size_t alignment = 4096; // assumed small page size
#endif

// malloc()-based body: over-allocate by alignment - 1 bytes, then round the
// address up to the next multiple of 64.
constexpr size_t alignment = 64; // assumed cache line size
size_t size = allocSize + alignment - 1; // allocate some extra space
mem = malloc(size);
void* ret = reinterpret_cast<void*>((uintptr_t(mem) + alignment - 1) & ~uintptr_t(alignment - 1));
return ret;
// std_aligned_alloc()-based body: the size (not the address) is rounded, and
// the allocator is asked for the alignment directly.
// round up to multiples of alignment
size_t size = ((allocSize + alignment - 1) / alignment) * alignment;
void *mem = std_aligned_alloc(alignment, size);
// madvise() is issued over the rounded size; `mem` is not checked for nullptr first.
#if defined(MADV_HUGEPAGE)
madvise(mem, size, MADV_HUGEPAGE);
#endif
return mem;
}

#endif


/// aligned_ttmem_free() will free the previously allocated ttmem
/// aligned_large_pages_free() will free memory previously allocated by aligned_large_pages_alloc()

#if defined(_WIN64)
#if defined(_WIN32)

void aligned_ttmem_free(void* mem) {
void aligned_large_pages_free(void* mem) {

if (mem && !VirtualFree(mem, 0, MEM_RELEASE))
{
Expand All @@ -478,8 +471,8 @@ void aligned_ttmem_free(void* mem) {

#else

// NOTE(review): two versions of the same function appear interleaved here
// (diff view without +/- markers); confirm against the actual file.
//
// Version releasing the block with free().
void aligned_ttmem_free(void *mem) {
free(mem);
// Version releasing the block through std_aligned_free().
void aligned_large_pages_free(void *mem) {
std_aligned_free(mem);
}

#endif
Expand Down
4 changes: 2 additions & 2 deletions src/misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ void prefetch(void* addr);
void start_logger(const std::string& fname);
void* std_aligned_alloc(size_t alignment, size_t size);
void std_aligned_free(void* ptr);
void* aligned_ttmem_alloc(size_t size, void*& mem);
void aligned_ttmem_free(void* mem); // nop if mem == nullptr
void* aligned_large_pages_alloc(size_t size); // memory aligned by page size, min alignment: 4096 bytes
void aligned_large_pages_free(void* mem); // nop if mem == nullptr

void dbg_hit_on(bool b);
void dbg_hit_on(bool c, bool b);
Expand Down
18 changes: 13 additions & 5 deletions src/nnue/evaluate_nnue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ namespace Eval::NNUE {
};

// Input feature converter
AlignedPtr<FeatureTransformer> feature_transformer;
LargePagePtr<FeatureTransformer> feature_transformer;

// Evaluation function
AlignedPtr<Network> network;
Expand All @@ -70,14 +70,22 @@ namespace Eval::NNUE {
std::memset(pointer.get(), 0, sizeof(T));
}

// Obtain storage for one T from aligned_large_pages_alloc(), hand ownership to
// `pointer`, and zero-fill the sizeof(T) bytes.
// NOTE(review): the allocation result is not checked before the memset;
// confirm that a nullptr return cannot reach this point.
template <typename T>
void Initialize(LargePagePtr<T>& pointer) {

  static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");

  void* const raw = aligned_large_pages_alloc(sizeof(T));
  pointer.reset(static_cast<T*>(raw));
  std::memset(pointer.get(), 0, sizeof(T));
}

// Read evaluation function parameters
// Reads a 32-bit header with read_little_endian(), returns false when the
// stream is in a failed state or the header differs from T::GetHashValue(),
// and otherwise forwards to the object's own ReadParameters(stream).
// NOTE(review): two signatures and two return statements appear interleaved
// below (diff view without +/- markers); confirm against the actual file.
template <typename T>
// Variant receiving the smart pointer.
bool ReadParameters(std::istream& stream, const AlignedPtr<T>& pointer) {
// Variant receiving the pointee by reference.
bool ReadParameters(std::istream& stream, T& reference) {

std::uint32_t header;
header = read_little_endian<std::uint32_t>(stream);
// Reject on read failure or on a hash mismatch for this parameter type.
if (!stream || header != T::GetHashValue()) return false;
return pointer->ReadParameters(stream);
return reference.ReadParameters(stream);
}

} // namespace Detail
Expand Down Expand Up @@ -110,8 +118,8 @@ namespace Eval::NNUE {
std::string architecture;
if (!ReadHeader(stream, &hash_value, &architecture)) return false;
if (hash_value != kHashValue) return false;
if (!Detail::ReadParameters(stream, feature_transformer)) return false;
if (!Detail::ReadParameters(stream, network)) return false;
if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
if (!Detail::ReadParameters(stream, *network)) return false;
return stream && stream.peek() == std::ios::traits_type::eof();
}

Expand Down
11 changes: 11 additions & 0 deletions src/nnue/evaluate_nnue.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,20 @@ namespace Eval::NNUE {
}
};

// Deleter functor: runs T's destructor on the object explicitly, then hands
// the storage back through aligned_large_pages_free().
template <typename T>
struct TtmemDeleter {
  void operator()(T* ptr) const {
    std::destroy_at(ptr);
    aligned_large_pages_free(ptr);
  }
};

template <typename T>
using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;

template <typename T>
using LargePagePtr = std::unique_ptr<T, TtmemDeleter<T>>;

} // namespace Eval::NNUE

#endif // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
7 changes: 4 additions & 3 deletions src/tt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,12 @@ void TranspositionTable::resize(size_t mbSize) {

Threads.main()->wait_for_search_finished();

aligned_ttmem_free(mem);
aligned_large_pages_free(table);

clusterCount = mbSize * 1024 * 1024 / sizeof(Cluster);
table = static_cast<Cluster*>(aligned_ttmem_alloc(clusterCount * sizeof(Cluster), mem));
if (!mem)

table = static_cast<Cluster*>(aligned_large_pages_alloc(clusterCount * sizeof(Cluster)));
if (!table)
{
std::cerr << "Failed to allocate " << mbSize
<< "MB for transposition table." << std::endl;
Expand Down
3 changes: 1 addition & 2 deletions src/tt.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class TranspositionTable {
static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");

public:
~TranspositionTable() { aligned_ttmem_free(mem); }
~TranspositionTable() { aligned_large_pages_free(table); }
void new_search() { generation8 += 8; } // Lower 3 bits are used by PV flag and Bound
TTEntry* probe(const Key key, bool& found) const;
int hashfull() const;
Expand All @@ -89,7 +89,6 @@ class TranspositionTable {

size_t clusterCount;
Cluster* table;
void* mem;
uint8_t generation8; // Size must be not bigger than TTEntry::genBound8
};

Expand Down

0 comments on commit 485d517

Please sign in to comment.