Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimized "blitter" routine written in assembler [wip] #719

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 138 additions & 0 deletions src/lib/blit.dasl
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
-- blit.dasl - offload engine for memory operations

module(..., package.seeall)

local dasm = require("dasm")
local ffi = require("ffi")

|.arch x64
|.actionlist actions

-- The blit module provides a "blitter" operation to offload
-- performance-critical memory operations. The API allows scheduling a
-- series of operations, that can be performed at any time and in any
-- order, and then executing a "barrier" to wait for completion.

-- This module implements a blitter that defers all copy operations
-- until the barrier() is invoked and then executes them with a single
-- assembler code routine.

-- C layout of one deferred copy. The assembler routine generated by
-- gen_barrier reads these fields at fixed offsets from the entry
-- pointer (src at +0, dst at +8, len at +16), so the field order and
-- sizes must stay in sync with that code.
ffi.cdef[[
struct blit_queue_entry {
void *src;
void *dst;
uint64_t len;
};
]]

-- Queue of memory operations
-- maxqueue: number of copies that may be queued before copy() forces a
-- barrier to drain the queue.
local maxqueue = 10000
-- The array is allocated with maxqueue+2 entries. copy() writes real
-- entries at indices 0..maxqueue-1 and the sentinel at index maxqueue
-- at most.
local queue = ffi.new("struct blit_queue_entry[?]", maxqueue+2)
-- Number of copies currently queued. It is also the index at which
-- copy() writes the sentinel entry (src == nil).
local nqueued = 0

-- Schedule a copy of len bytes from src to dst.
--
-- The copy is deferred: it is only recorded in the queue and is
-- executed by the next barrier() call, or earlier if the queue is full
-- and has to be drained first.
--
-- dst: pointer to the destination memory.
-- src: pointer to the source memory.
-- len: number of bytes to copy; must be a multiple of 32.
--
-- Raises an error (assert) when len is not a multiple of 32.
function copy (dst, src, len)
   -- XXX This routine is hard-coded for multiples of 32 bytes.
   assert(len%32 == 0)
   -- The assembler loop moves 32 bytes before it tests the length, so
   -- a queued zero-length entry would still copy 32 bytes. A copy of
   -- nothing is a no-op: never queue it.
   if len == 0 then return end
   -- Queue full: execute everything queued so far to make room.
   if nqueued == maxqueue then barrier() end
   queue[nqueued].src = src
   queue[nqueued].dst = dst
   queue[nqueued].len = len
   nqueued = nqueued + 1
   -- Sentinel: a nil source pointer marks the end of the queue.
   queue[nqueued].src = nil
end

-- Assembler code for the barrier operation
--
-- Emits, into the DynASM state Dst, a routine that walks the queue
-- array from its first entry and performs each queued copy, stopping
-- at the first entry whose src field is zero (the sentinel).
--
-- Register usage in the generated code:
--   r8   pointer to the queue entry being processed
--   rsi  source pointer of the current copy
--   rdi  destination pointer of the current copy
--   rcx  length of the current copy in bytes
--   rax  byte offset within the current copy
--   ymm0 32-byte transfer register
function gen_barrier (Dst)
-- Start at the first queue entry.
| mov64 r8, queue

-- Top of the per-entry loop: stop when the src field is zero.
|->queue:
| cmp qword [r8], 0 -- sentinel?
| je >9

-- Load parameters for the next copy
| mov rsi, [r8] -- source
| mov rdi, [r8+8] -- destination
| mov rcx, [r8+16] -- length

-- Copy 32 bytes at a time
| xor rax, rax
|->copy:
| vmovdqu ymm0, [rsi+rax]
| vmovdqu [rdi+rax], ymm0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might want to experiment unrolling this manually. I got some significant speedups by having more loads in flight.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is quite delicate :-). I started out with an unrolled version of the inner loop and then found that the looping version delivered the same performance. There have been other very innocent code variations that were much slower though. I want to use the PMU to explore these differences.

I would like to try unrolling the outer loop though to see if coping several packets in parallel could help.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, delicate indeed :) One thing to try is instead of doing load, store, load, store, to do load, load, store, store. That was what worked best for me. Good luck :)

-- Advance the offset and loop while it is below the length.
-- NOTE(review): the length is tested only after a 32-byte move, so an
-- entry always copies at least 32 bytes (even with len == 0), and jl
-- compares rax and rcx as signed values.
| add rax, 32
| cmp rax, rcx
| jl ->copy

-- Advance to the next copy in the queue
| add r8, ffi.sizeof('struct blit_queue_entry')
| jmp ->queue
-- Sentinel reached: all queued copies are done.
|9:
| ret
end

-- XXX the code below is copy-paste and should be reused somehow.
-- When true, assemble() prints a disassembly of every routine it
-- builds. (Not named "debug": that would shadow the standard debug
-- library for the rest of this file.)
local dump_mcode = false
-- Table holding every piece of generated machine code; presumably this
-- keeps the code from being garbage collected while function pointers
-- into it are still in use.
anchor = {}
-- Utility: assemble code and optionally dump disassembly.
--
-- name:      label printed in front of the disassembly dump.
-- prototype: C function pointer type that the machine code is cast to.
-- generator: function(Dst) that emits code into the DynASM state Dst.
--
-- Returns the generated machine code cast to prototype.
local function assemble (name, prototype, generator)
   local Dst = dasm.new(actions)
   generator(Dst)
   local mcode, size = Dst:build()
   -- Keep a reference to the machine code in the anchor table.
   table.insert(anchor, mcode)
   if dump_mcode then
      print("mcode dump: "..name)
      dasm.dump(mcode, size)
   end
   return ffi.cast(prototype, mcode)
end

-- The barrier routine, assembled once when the module is loaded and
-- callable as a C function taking no arguments.
local asm_barrier = assemble("barrier", "void(*)()", gen_barrier)

-- Execute every queued copy, then leave the queue empty.
function barrier ()
   asm_barrier()
   -- Reset the queue: sentinel in the first slot, nothing queued.
   queue[0].src = nil
   nqueued = 0
end

-- Self-test: apply the same random copies to two identical buffers,
-- once through copy()/barrier() and once through ffi.copy(), and fail
-- if the buffers differ afterwards.
function selftest ()
   print("selftest: blit")
   local membytes = 10240
   local memx = ffi.new("char[?]", membytes)
   local memy = ffi.new("char[?]", membytes)
   for round = 0, 10 do
      print("loop "..round)
      -- Give both buffers the same random contents
      for pos = 0, membytes-1 do
         local n = math.random(256)
         memx[pos] = n
         memy[pos] = n
      end
      -- Issue a random number of random copies: through copy() on
      -- memx and immediately through ffi.copy() on memy
      for _ = 0, math.random(1000) do
         local srcoffset = math.random(1000)
         local dstoffset = math.random(1000)
         local length = math.random(8) * 32 + 32
         copy (memx+dstoffset, memx+srcoffset+5120, length)
         ffi.copy(memy+dstoffset, memy+srcoffset+5120, length)
      end
      -- Run the copies that were scheduled with copy()
      barrier()
      -- Both buffers must now be byte-for-byte equal
      for pos = 0, membytes-1 do
         if memx[pos] ~= memy[pos] then
            print(require("core.lib").hexdump(ffi.string(memx+pos, 32)))
            print(require("core.lib").hexdump(ffi.string(memy+pos, 32)))
            error("mismatch at byte " .. pos)
         end
      end
   end
   print("selftest: ok")
end

-- NOTE(review): these two definitions come after the queue-based
-- copy() and barrier() defined earlier in this file and reassign the
-- same names, so the module ends up copying eagerly with ffi.copy()
-- and its barrier does nothing. Presumably a work-in-progress switch;
-- confirm before merging.

-- Copy len bytes from src to dst immediately.
function copy (dst, src, len)
   ffi.copy(dst, src, len)
end

-- Nothing to wait for: copies are performed eagerly.
function barrier ()
end
33 changes: 33 additions & 0 deletions src/lib/blit.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- blit.lua - offload engine for memory operations

module(..., package.seeall)

local ffi = require("ffi")

-- The blit module provides a "blitter" operation to offload
-- performance-critical memory operations. The API allows scheduling a
-- series of operations, that can be performed at any time and in any
-- order, and then executing a "barrier" to wait for completion.

-- The implementation in this file is very basic but could be extended
-- in the future to take advantage of the flexibility afforded by the
-- API to perform special optimizations (for example parallel memory
-- copies to amortize cache latency, etc).

-- Copy len bytes from src to dst.
-- This basic implementation defers nothing: the memory is copied
-- right away with ffi.copy().
function copy (dst, src, len)
   ffi.copy(dst, src, len)
end

-- Wait until all copies have completed.
-- Since copy() performs each copy eagerly, there is never anything
-- outstanding and this function does nothing.
function barrier ()
end

-- Placeholder self-test: only prints the start and success banners.
-- An extensive selftest would be valuable here to make it easy to
-- develop and test new optimized blitter implementations.
function selftest ()
   print("selftest: blit")
   print("selftest: ok")
end
10 changes: 7 additions & 3 deletions src/lib/virtio/net_device.lua
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ local packet = require("core.packet")
local timer = require("core.timer")
local vq = require("lib.virtio.virtq")
local checksum = require("lib.checksum")
local blit = require("lib.blit")
local ffi = require("ffi")
local C = ffi.C
local band = bit.band
Expand Down Expand Up @@ -106,6 +107,7 @@ end
-- Poll the receive side: process packets coming from the VM, complete
-- any copies scheduled through the blit module, then signal the used
-- buffers.
function VirtioNetDevice:poll_vring_receive ()
-- RX
self:receive_packets_from_vm()
-- Copies scheduled with blit.copy() may still be pending; wait for
-- them before signalling the buffers as used.
blit.barrier()
self:rx_signal_used()
end

Expand Down Expand Up @@ -139,7 +141,8 @@ function VirtioNetDevice:rx_buffer_add(rx_p, addr, len)
local addr = self:map_from_guest(addr)
local pointer = ffi.cast(char_ptr_t, addr)

packet.append(rx_p, pointer, len)
rx_p.length = rx_p.length + len
blit.copy(rx_p, pointer, len)
return len
end

Expand Down Expand Up @@ -174,6 +177,7 @@ end
-- Poll the transmit side: send packets to the VM, complete any copies
-- scheduled through the blit module, then signal the used buffers.
function VirtioNetDevice:poll_vring_transmit ()
-- TX
self:transmit_packets_to_vm()
-- Copies scheduled with blit.copy() may still be pending; wait for
-- them before signalling the buffers as used.
blit.barrier()
self:tx_signal_used()
end

Expand Down Expand Up @@ -239,7 +243,7 @@ function VirtioNetDevice:tx_buffer_add(tx_p, addr, len)
local pointer = ffi.cast(char_ptr_t, addr)

assert(tx_p.length <= len)
ffi.copy(pointer, tx_p.data, tx_p.length)
blit.copy(pointer, tx_p.data, tx_p.length)

return tx_p.length
end
Expand Down Expand Up @@ -288,7 +292,7 @@ function VirtioNetDevice:tx_buffer_add_mrg_rxbuf(tx_p, addr, len)
local to_copy = math.min(tx_p.length - self.tx.data_sent, len + adjust)

-- copy the data to the adjusted pointer
ffi.copy(pointer - adjust, tx_p.data + self.tx.data_sent, to_copy)
ffi.copy(tx_p.data + self.tx.data_sent, pointer - adjust, to_copy)

-- update the num_buffers in the first virtio header
self.tx.tx_mrg_hdr[0].num_buffers = self.tx.tx_mrg_hdr[0].num_buffers + 1
Expand Down