snabbco · lukego · Jan 17, 2016 · Jan 17, 2016 · Jan 18, 2016 · Jan 19, 2016
diff --git a/src/lib/blit.dasl b/src/lib/blit.dasl
@@ -0,0 +1,138 @@
+-- blit.dasl - offload engine for memory operations
+
+module(..., package.seeall)
+
+local dasm = require("dasm")
+local ffi = require("ffi")
+
+|.arch x64
+|.actionlist actions
+
+-- The blit module provides a "blitter" operation to offload
+-- performance-critical memory operations. The API allows scheduling a
+-- series of operations, that can be performed at any time and in any
+-- order, and then executing a "barrier" to wait for completion.
+
+-- This module implements a blitter that defers all copy operations
+-- until the barrier() is invoked and then executes them with a single
+-- assembler code routine.
+
+ffi.cdef[[
+  struct blit_queue_entry {
+    void *src;
+    void *dst;
+    uint64_t len;
+  };
+]]
+
+-- Queue of memory operations
+local maxqueue = 10000
+local queue = ffi.new("struct blit_queue_entry[?]", maxqueue+2)
+local nqueued = 0
+
+function copy (dst, src, len)
+   -- Queue a copy of len bytes from src to dst, to be run by barrier().
+   -- XXX This routine is hard-coded for multiples of 32 bytes.
+   assert(len%32 == 0)
+   if len == 0 then return end  -- the asm loop would still move 32 bytes
+   if nqueued == maxqueue then barrier() end
+   local entry = queue[nqueued]
+   entry.src, entry.dst, entry.len = src, dst, len
+   nqueued = nqueued + 1
+   queue[nqueued].src = nil     -- sentinel: NULL src ends the queue
+end
+
+-- Emit the barrier routine: run each queued copy up to the NULL-src sentinel.
+function gen_barrier (Dst)
+   | mov64 r8, queue
+   -- r8 holds the address of the current entry, starting at queue[0].
+   |->queue:
+   | cmp qword [r8], 0          -- sentinel? (src is the first field)
+   | je >9
+
+   -- Load parameters for the next copy
+   | mov rsi, [r8]     -- source
+   | mov rdi, [r8+8]   -- destination
+   | mov rcx, [r8+16]  -- length
+   -- Copy 32 bytes at a time through ymm0; rax is the byte offset. The
+   -- length test comes after the move, so each entry moves >= 32 bytes.
+   | xor rax, rax
+   |->copy:
+   | vmovdqu ymm0, [rsi+rax]
+   | vmovdqu [rdi+rax], ymm0
+   | add rax, 32
+   | cmp rax, rcx
+   | jl ->copy         -- NOTE(review): signed compare on a uint64_t length
+
+   -- Advance to the next copy in the queue
+   | add r8, ffi.sizeof('struct blit_queue_entry')
+   | jmp ->queue
+   |9:
+   | ret
+end
+
+-- XXX the code below is copy-paste and should be reused somehow.
+local debug = false
+anchor = {}
+-- Utility: assemble generator's code (optional dump), cast to prototype.
+local function assemble (name, prototype, generator)
+   local state = dasm.new(actions)
+   generator(state)
+   local code, nbytes = state:build()
+   -- Hold a reference in anchor (presumably to keep the code alive).
+   anchor[#anchor + 1] = code
+   if debug then
+      print("mcode dump: "..name); dasm.dump(code, nbytes)
+   end
+   return ffi.cast(prototype, code)
+end
+
+-- Machine code for the barrier function
+local asm_barrier = assemble("barrier", "void(*)()", gen_barrier)
+
+-- Run every queued copy now, then leave the queue empty.
+function barrier ()
+   asm_barrier()
+   -- Empty queue: a NULL src sentinel in slot 0 and a zero count.
+   queue[0].src, nqueued = nil, 0
+end
+
+-- Test by doing the same random copies with blit and ffi.copy() and
+-- comparing the results. Raises an error on the first differing byte.
+function selftest ()
+   print("selftest: blit")
+   local membytes = 10240
+   local memx = ffi.new("char[?]", membytes)
+   local memy = ffi.new("char[?]", membytes)
+   for round = 0, 10 do
+      print("loop "..round)
+      -- Initialize memx and memy with identical randomly chosen values
+      for i = 0, membytes-1 do
+         local n = math.random(0, 255)
+         memx[i], memy[i] = n, n
+      end
+      -- Perform some random copies
+      for _ = 0, math.random(1000) do
+         local srcoffset = math.random(1000)
+         local dstoffset = math.random(1000)
+         local length = math.random(8) * 32 + 32
+         copy    (memx+dstoffset, memx+srcoffset+5120, length)
+         ffi.copy(memy+dstoffset, memy+srcoffset+5120, length)
+      end
+      -- Execute deferred copies
+      barrier()
+      -- Check for same contents
+      for i = 0, membytes-1 do
+         if memx[i] ~= memy[i] then
+            local n = math.min(32, membytes-i)  -- stay inside the buffers
+            print(require("core.lib").hexdump(ffi.string(memx+i, n)))
+            print(require("core.lib").hexdump(ffi.string(memy+i, n)))
+            error("mismatch at byte " .. i)
+         end
+      end
+   end
+   print("selftest: ok")
+end
+-- NOTE(review): these later definitions replace copy() and barrier() above.
+function copy (dst, src, len) ffi.copy(dst, src, len) end   -- immediate copy
+function barrier () end                                     -- no-op
diff --git a/src/lib/blit.lua b/src/lib/blit.lua
@@ -0,0 +1,33 @@
+-- blit.lua - offload engine for memory operations
+
+module(..., package.seeall)
+
+local ffi = require("ffi")
+
+-- The blit module provides a "blitter" operation to offload
+-- performance-critical memory operations. The API allows scheduling a
+-- series of operations, that can be performed at any time and in any
+-- order, and then executing a "barrier" to wait for completion.
+
+-- The implementation in this file is very basic but could be extended
+-- in the future to take advantage of the flexibility afforded by the
+-- API to perform special optimizations (for example parallel memory
+-- copies to amortize cache latency, etc).
+
+function copy (dst, src, len)
+   -- Eager implementation: move the bytes right away with ffi.copy.
+   return ffi.copy(dst, src, len)
+end
+
+-- Wait until all copies have completed.
+function barrier ()
+   -- No-op: copy() in this file has already performed each copy.
+end
+
+function selftest ()
+   print("selftest: blit")
+   -- Placeholder: nothing is exercised between the two banner lines yet.
+   -- An extensive selftest here would make it easy to develop and test
+   -- new optimized blitter implementations.
+   print("selftest: ok")
+end
diff --git a/src/lib/virtio/net_device.lua b/src/lib/virtio/net_device.lua
@@ -11,6 +11,7 @@ local packet    = require("core.packet")
 local timer     = require("core.timer")
 local vq        = require("lib.virtio.virtq")
 local checksum  = require("lib.checksum")
+local blit      = require("lib.blit")
 local ffi       = require("ffi")
 local C         = ffi.C
 local band      = bit.band
@@ -106,6 +107,7 @@ end
 function VirtioNetDevice:poll_vring_receive ()
    -- RX
    self:receive_packets_from_vm()
+   blit.barrier()   -- finish any queued blit copies before rx_signal_used()
    self:rx_signal_used()
 end
 
@@ -139,7 +141,8 @@ function VirtioNetDevice:rx_buffer_add(rx_p, addr, len)
    local addr = self:map_from_guest(addr)
    local pointer = ffi.cast(char_ptr_t, addr)
 
-   packet.append(rx_p, pointer, len)
+   blit.copy(rx_p.data + rx_p.length, pointer, len)  -- append after existing data
+   rx_p.length = rx_p.length + len
    return len
 end
 
@@ -174,6 +177,7 @@ end
 function VirtioNetDevice:poll_vring_transmit ()
    -- RX
    self:transmit_packets_to_vm()
+   blit.barrier()   -- finish any queued blit copies before tx_signal_used()
    self:tx_signal_used()
 end
 
@@ -239,7 +243,7 @@ function VirtioNetDevice:tx_buffer_add(tx_p, addr, len)
    local pointer = ffi.cast(char_ptr_t, addr)
 
    assert(tx_p.length <= len)
-   ffi.copy(pointer, tx_p.data, tx_p.length)
+   blit.copy(pointer, tx_p.data, tx_p.length)
 
    return tx_p.length
 end
@@ -288,7 +292,7 @@ function VirtioNetDevice:tx_buffer_add_mrg_rxbuf(tx_p, addr, len)
    local to_copy = math.min(tx_p.length - self.tx.data_sent, len + adjust)
 
    -- copy the data to the adjusted pointer
-   ffi.copy(pointer - adjust, tx_p.data + self.tx.data_sent, to_copy)
+   ffi.copy(pointer - adjust, tx_p.data + self.tx.data_sent, to_copy)  -- (dst, src, len)
 
    -- update the num_buffers in the first virtio header
    self.tx.tx_mrg_hdr[0].num_buffers = self.tx.tx_mrg_hdr[0].num_buffers + 1