Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chapter 8 edits #75

Open
wants to merge 3 commits into
base: main
Choose a base branch
from

Conversation

dankamongmen
Copy link
Contributor

@dankamongmen dankamongmen commented Sep 13, 2024

Not too many changes, but I have some notes:

  • 8-0 algorithmic: maybe also note that by using linear instead of binary search, you needn't keep the data sorted?
  • 8-1: AFAIK, "von Neumann" just means the code and data paths are integrated, while "Harvard" keeps them distinct
  • didn't know that was called "Eytzinger layout", interesting. added to the Eponyms list!
  • 8-2: stack allocation isn't necessarily immediate, if for instance it grows into a new page
  • 8.4.3: another option is to manually request the pages at runtime from within the process, though this requires root privs. i've done this in code for high-speed networking. here's some example code:
#include <fcntl.h>
#include <iomanip>
#include <iostream>
#include <stdexcept>
#include "numset.h"
#include "Sysfs.h"
#include "fdesc.h"
#include "zones.h"

// Pluralization suffix: empty string for a count of one, "s" otherwise.
static inline const char* plurstr(unsigned c){
  if(c == 1){
    return "";
  }
  return "s";
}

// get the current number of free hugepages of some size on some node. hfd
// ought be an open pathfd into a node/hugepages-fookB directory.
// Returns false if the sysfs read fails; freehp is set on success.
static bool
readFreeHugePages(int hfd, unsigned long& freehp){
  // forward the sysfs result directly, matching readCountHugePages();
  // the previous if/return-false/return-true dance added nothing.
  return Sysfs::readSysfsUInt(hfd, "free_hugepages", freehp);
}

// get the configured total (nr_hugepages) of this size on this node. hfd
// ought be an open pathfd into a node/hugepages-fookB directory.
// Returns false if the sysfs read fails; counthp is set on success.
static bool
readCountHugePages(int hfd, unsigned long& counthp){
  const bool ok = Sysfs::readSysfsUInt(hfd, "nr_hugepages", counthp);
  return ok;
}

// Grow this node's pool of psize-byte hugepages by count pages, by writing
// the new total into nr_hugepages via hfd. If the kernel only partially
// satisfies the request, the partial allocation is released and false is
// returned. Returns true only when the full count was obtained.
static bool
allocateHugePages(uint32_t psize, int hfd, unsigned long count){
  psize /= (1024ul * 1024); // used only for MiB display below
  unsigned long cur;
  if(!readCountHugePages(hfd, cur)){
    return false;
  }
  // nr_hugepages wants the new total, not the increment. detect unsigned
  // wraparound rather than silently requesting a too-small pool (this was
  // previously a FIXME).
  const unsigned long want = cur + count;
  if(want < cur){
    std::cerr << " overflow requesting " << count << " pages atop " << cur << std::endl;
    return false;
  }
  count = want;
  std::cout << " requesting " << count << " " << psize << "MiB pages (have " << cur << ")" << std::endl;
  if(!Sysfs::writeSysfsUInt(hfd, "nr_hugepages", count)){
    return false;
  }
  // read back the actual total; the write can succeed yet fall short.
  unsigned long fhp;
  if(!readCountHugePages(hfd, fhp)){
    return false;
  }
  if(fhp < count){
    std::cerr << " requested " << count << " " << psize << "MiB pages, only got "
              << fhp << ", have " << cur << std::endl;
    if(fhp > cur){ // undo a partial allocation
      std::cout << " releasing " << fhp - cur << " huge pages" << std::endl;
      Sysfs::writeSysfsUInt(hfd, "nr_hugepages", cur);
    }
    return false;
  }
  return true;
}

// determine the huge page size we'll use to fulfill this request. one day we
// might do something smarter where we look at current usage and even
// fragmentation. for now, we use static size thresholds [sad horn noise].
//
// this assumes x86 huge page sizes, and that both 2M and 1GB are supported.
// we have the info available to do otherwise, detected from sysfs FIXME.
static inline unsigned long
pageSize(size_t req){
  // there are 512 2MB regions in a 1GB region. L2 TLBs run a few kiloentries.
  // given the difficulty of assembling a contiguous 1GB region after the
  // address space gets fragmented to hell, plus the lack of widespread 1GB
  // page support, we require 2GB to use 1GB pages.
  return req < 2 * X86_GIGAPAGE_SIZE ? X86_HUGEPAGE_SIZE : X86_GIGAPAGE_SIZE;
}

// Reserve enough hugepages to back a mapping of `bytes` bytes. The chosen
// page size and page count are returned through pgsize/pcount. If a 1GiB
// reservation fails, we fall back to 2MiB pages before giving up; on total
// failure, pcount is zeroed and false is returned.
bool NUMAZone::reserveFastMemory(uint64_t bytes, uint32_t& pgsize, unsigned& pcount){
  // pick the preferred page size and round the request up to whole pages.
  pgsize = pageSize(bytes);
  pcount = bytes / pgsize + !!(bytes % pgsize);
  if(reserveHugePages(pgsize, pcount)){
    return true;
  }
  // gigapage reservation failed; retry with the smaller huge page size.
  if(pgsize == X86_GIGAPAGE_SIZE){
    pgsize = X86_HUGEPAGE_SIZE;
    pcount = bytes / pgsize + !!(bytes % pgsize);
    if(reserveHugePages(pgsize, pcount)){
      return true;
    }
  }
  pcount = 0;
  return false;
}

// some number are free, but they might be from one of our outstanding
// reservations (remember, they're not atomic). so we track our outstanding
// allocations, and rely on the caller to report back post-mapping.
//
// size is the page size in bytes; count the number of pages wanted. Returns
// false on sysfs failure or if size matches no known hugepage class.
bool NUMAZone::reserveHugePages(uint32_t size, unsigned count){
  for(auto& hpinfo : HPages){
    if(hpinfo.PageSize == size){
      std::lock_guard lg{mtx};
      unsigned long freehp;
      if(!readFreeHugePages(hpinfo.fd.FD(), freehp)){
        return false;
      }
      // freehp might be less than OutstandingPages due to a worker having
      // just successfully mapped them, but not yet called reportMapped(),
      // so don't consider this an error, but don't count them towards our
      // total, either. of course, they might have actually been freed by
      // someone else in that same interim, in which case we'll be demanding
      // more than we strictly need, but that's not any kind of big deal.
      if(freehp < hpinfo.OutstandingPages){
        freehp = 0;
      }else{
        freehp -= hpinfo.OutstandingPages;
      }
      if(freehp >= count){
        std::cout << " wanted " << count << " " << (size / 1024 / 1024)
                  << "MiB page" << plurstr(count) << ", had " << freehp << std::endl;
        hpinfo.OutstandingPages += count;
        return true;
      }
      // not enough free: claim what's free, grow the pool by the remainder.
      count -= freehp;
      if(!allocateHugePages(hpinfo.PageSize, hpinfo.fd.FD(), count)){
        return false;
      }
      // BUGFIX: accumulate with += as in the branch above. plain assignment
      // here discarded any previously-outstanding reservations of this size.
      // (count + freehp equals the caller's original request.)
      hpinfo.OutstandingPages += count + freehp;
      return true;
    }
  }
  std::cerr << "requested unsupported page size " << size << std::endl;
  return false; // no such page size, sorry
}

// Compute a byte count equal to roughly Size/denom/2, rounded down to a
// multiple of the largest page size not exceeding it. Returns 0 when denom
// is 0.
uint64_t NUMAZone::percentAlloc(uint32_t denom) const {
  // guard against division by zero.
  if(denom == 0){
    return 0;
  }
  const uint64_t target = Size / denom / 2;
  // pick the largest page size that fits within the target. HPages is
  // ascending by PageSize (insertion order in getZoneHugePageInfo), so the
  // last entry not exceeding target wins; fall back to the base page size.
  uint32_t granule = Basepagesize;
  for(const auto& hp : HPages){
    if(hp.PageSize > target){
      break;
    }
    granule = hp.PageSize;
  }
  // round the target down to a whole multiple of the chosen page size.
  return target - target % granule;
}

// A worker calls this after successfully mapping pcount pages of pgsize
// bytes, retiring them from our outstanding-reservation bookkeeping.
// Throws std::invalid_argument on an unknown page size or a count larger
// than what was outstanding.
void NUMAZone::reportMapped(uint32_t pgsize, unsigned pcount){
  for(auto& hp : HPages){
    if(hp.PageSize != pgsize){
      continue;
    }
    std::lock_guard lg{mtx};
    if(hp.OutstandingPages < pcount){
      std::cerr << "reported " << pcount << " page" << plurstr(pcount)
                << " mapped, had only " << hp.OutstandingPages << " outstanding" << std::endl;
      throw std::invalid_argument("invalid outstanding page count");
    }
    hp.OutstandingPages -= pcount;
    return;
  }
  std::cerr << "your huge page size sucks " << pgsize << std::endl;
  throw std::invalid_argument("invalid hugepage size");
}

// extract the current number of hugepages of this size on this node, and also
// the number of free hugepages. this is of course only a snapshot.
// hfd ought be an open pathfd into a node/hugepages-fookB directory.
bool NUMAZone::getZoneHugePageClassInfo(int hfd, unsigned long& freep, unsigned long& mappedp){
  // same short-circuit order as before: total first, then free.
  return readCountHugePages(hfd, mappedp) && readFreeHugePages(hfd, freep);
}

// Discover which hugepage sizes this node supports by probing the
// node's sysfs "hugepages" directory (reached via zfd), recording each
// supported size (with its open pathfd) in HPages. Returns false on any
// unexpected sysfs failure; a merely-absent size directory is skipped.
bool NUMAZone::getZoneHugePageInfo(int zfd){
  Fdesc hpfd{openat(zfd, "hugepages", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
  if(hpfd.FD() < 0){
    return false;
  }
  // FIXME we ought just browse the directory, and find all supported
  // arbitrary sizes, but this works for now. subdirectories are of the
  // form hugepages-{foo}kB, so define in terms of 1024B.
  const std::vector<uint32_t> candidateSizes = {
    X86_HUGEPAGE_SIZE / 1024,
    X86_GIGAPAGE_SIZE / 1024,
  };
#define HPPREFIX "hugepages-"
#define HPSUFFIX "kB"
  char hpname[20 + __builtin_strlen(HPPREFIX) + __builtin_strlen(HPSUFFIX) + 1]; // 20: largest 64-bit uint
  strcpy(hpname, HPPREFIX);
  const size_t maxwrite = sizeof(hpname) - __builtin_strlen(HPPREFIX);
  for(auto sz : candidateSizes){
    // build e.g. "hugepages-2048kB" in place after the fixed prefix.
    auto s = snprintf(hpname + __builtin_strlen(HPPREFIX), maxwrite, "%u" HPSUFFIX, sz);
    if(s < 0 || static_cast<size_t>(s) >= maxwrite){
      std::cerr << "couldn't look for huge pages of size " << sz << "kB" << std::endl;
      return false;
    }
    Fdesc hpsfd{openat(hpfd.FD(), hpname, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
    if(hpsfd.FD() < 0){
      // ENOENT just means this size isn't supported here; try the next one.
      if(errno != ENOENT){
        std::cerr << "error opening " << hpname << ": " << strerror(errno) << std::endl;
        return false;
      }
      continue;
    }
    // sanity: a huge page must be at least as large as a base page.
    if(sz * 1024 < Basepagesize){
      // BUGFIX: the message previously printed ">=", inverting the failed
      // check; the condition fires when the huge page is SMALLER.
      std::cerr << "huge page size " << sz * 1024 << " < base page size " << Basepagesize << std::endl;
      return false;
    }
    unsigned long freep, mappedp;
    if(!getZoneHugePageClassInfo(hpsfd.FD(), freep, mappedp)){
      return false;
    }
    std::cout << " " << sz / 1024 << "MiB hugepage" << plurstr(mappedp) << ": " <<
              mappedp << " allocated, " << freep << " free" << std::endl;
    // record (outstanding=0, size in bytes, fd) for this page class.
    HPages.emplace_back(0, sz * 1024, std::move(hpsfd));
  }
  return true;
}

// Parse this zone's sysfs meminfo (read via zfd) and extract the MemTotal
// value, converted from kB to bytes, into size. Only the first line is
// scanned. Returns false (and zeroes size on a value-parse error) if the
// read fails or MemTotal can't be found.
bool NUMAZone::getZoneInfo(int zfd, uint64_t& size){
  auto buf = Sysfs::readSysfs(zfd, "meminfo");
  if(!buf){
    return false;
  }
  const char tag[] = "MemTotal:";
  const char* s = buf.get();
  const char* ts = tag;        // cursor into tag while matching it
  bool foundtag = false;       // matched the full "MemTotal:" tag
  bool foundvalue = false;     // saw the first digit of the value
  size = 0;
  // MemTotal ought always be the first line. it has no repeated internal
  // characters, so we can always safely match to the beginning of the tag.
  // once foundtag goes high, start accumulating size.
  while(*s && *s != '\n'){
    if(foundtag && !foundvalue){
      // between the tag and the value: only whitespace is acceptable.
      if(isdigit(*s)){
        foundvalue = true;
      }else if(!isspace(*s)){
        std::cerr << "found mystery character " << *s << " for " << tag << " value" << std::endl;
        return false;
      }
    }
    if(foundvalue){
      if(isdigit(*s)){
        // accumulate the decimal value digit by digit.
        size *= 10;
        size += *s - '0';
      }else{
        if(isspace(*s)){ // " kB" follows value; we're done!
          size *= 1024;  // sysfs reports kB; convert to bytes.
          break;
        }else{
          std::cerr << "found mystery character " << *s << " for " << tag << " value" << std::endl;
          size = 0;
          return false;
        }
      }
    }else{
      // still matching the tag: advance ts on a match, reset on mismatch.
      if(*s == *ts){
        if(!*++ts){
          foundtag = true;
        }
      }else{
        ts = tag;
      }
    }
    ++s;
  }
  if(size){
    return true;
  }
  std::cerr << "didn't find expected " << tag << " in meminfo" << std::endl;
  return false;
}

// Construct a zone from sysfs: open the nodeN directory under zfd, then read
// its cpulist, its total memory size, and its hugepage pool info. Throws
// std::invalid_argument on an unformattable zone id, std::runtime_error on
// any sysfs read/parse failure.
NUMAZone::NUMAZone(int zfd, int zid, uint32_t basepagesize) :
 ZoneId(zid),
 Basepagesize(basepagesize) {
  // 64 bit value max is 20 char
  #define PREFIX "node"
  char nbuf[__builtin_strlen(PREFIX) + 20 + 1];
  auto s = snprintf(nbuf, sizeof(nbuf), PREFIX "%d", zid);
  if(s < 0 || static_cast<size_t>(s) >= sizeof(nbuf)){
    std::cerr << "crazy zone id " << zid << std::endl;
    throw std::invalid_argument("invalid id for numa zone");
  }
  // open the nodeN directory as a pathfd for subsequent relative reads.
  Fdesc nfd{openat(zfd, nbuf, O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
  if(nfd.FD() < 0){
    std::cerr << "error opening zone " << zid << " directory (" << strerror(errno) << ")" << std::endl;
    throw std::runtime_error("couldn't open numa zone in sysfs");
  }
  auto cpulist = Sysfs::readSysfs(nfd.FD(), "cpulist");
  if(!cpulist){
    std::cerr << "error reading zone " << zid << " cpu list (" << strerror(errno) << ")" << std::endl;
    throw std::runtime_error("couldn't read numa zone's cpulist");
  }
  // cpulist is a range expression (e.g. "0-3,8-11"); expand into Cores.
  if(!lexNumberCollection(cpulist.get(), Cores)){
    throw std::runtime_error("invalid numa zone's cpulist");
  }
  if(!getZoneInfo(nfd.FD(), Size)){
    throw std::runtime_error("invalid numa zone info");
  }
  std::cout << "numa memory zone " << zid << " ("
            << static_cast<float>(Size) / (1024lu * 1024lu * 1024lu) << " GiB): "
            << cpulist.get() << std::endl;
  if(!getZoneHugePageInfo(nfd.FD())){
    throw std::runtime_error("invalid numa zone huge page info");
  }
}

void NUMATopology::discoverZones(uint32_t basepagesize){
  Fdesc sysfd{open("/sys/devices/system/node", O_RDONLY | O_CLOEXEC | O_DIRECTORY | O_PATH)};
  if(sysfd.FD() < 0){
    std::cerr << "couldn't open sysfs system/node (" << strerror(errno) << ")" << std::endl;
    throw std::runtime_error("couldn't open sysfs system/node");
  }
  auto memzones = Sysfs::readSysfs(sysfd.FD(), "has_memory");
  if(!memzones){
    throw std::runtime_error("couldn't read system/node/has_memory");
  }
  std::unordered_set<int> zset;
  if(!lexNumberCollection(memzones.get(), zset)){
    throw std::runtime_error("couldn't parse system/node/has_memory");
  }
  for(int z : zset){
    Zones.try_emplace(z, sysfd.FD(), z, basepagesize);
  }
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant