cleanup ffi import/export, finish readme
clickingbuttons committed Jul 19, 2023
1 parent 1ed1aaa commit 9c238d1
Showing 21 changed files with 313 additions and 117 deletions.
41 changes: 20 additions & 21 deletions README.md
@@ -58,11 +58,7 @@ Notes:

### Build arrays

The default `Builder` can map Zig types with reasonable defaults except for Dictionary types. You can import it like this:
```zig
const Builder = @import("arrow").array.Builder;
```

The default `Builder` can map Zig types with reasonable defaults except for Dictionary types. You can use it like this:
```zig
var b = try Builder(?i16).init(allocator);
try b.append(null);
@@ -83,10 +79,6 @@ error: expected type 'i16', found '@TypeOf(null)'
```
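
A complete version of this example, including `finish()` and cleanup, appears in [examples/build_arrays.zig](./examples/build_arrays.zig) from this commit:

```zig
const std = @import("std");
const arrow = @import("arrow");

const Builder = arrow.array.Builder;
const allocator = std.testing.allocator;

test "build arrays" {
    var b = try Builder(?i16).init(allocator);
    errdefer b.deinit();

    try b.append(null);
    try b.append(32);
    try b.append(33);
    try b.append(34);

    const array = try b.finish();
    defer array.deinit();
}
```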

Dictionary types must use an explicit builder.
```zig
const DictBuilder = @import("arrow").array.dict.Builder;
```

```zig
var b = try DictBuilder(?[]const u8).init(allocator);
try b.appendNull();
@@ -95,7 +87,7 @@ try b.append("there");
try b.append("friend");
```

You can customize exactly how the type maps to Arrow with each type's `BuilderAdvanced`. For example, to build a sparse union of structs:
You can customize exactly how to build Arrow types with each type's `BuilderAdvanced`. For example, to build a sparse union of nullable structs:
```zig
var b = try UnionBuilder(
struct {
@@ -111,19 +103,24 @@ try b.append(.{ .f = 3 });
try b.append(.{ .i = 5 });
```

You can view [sample.zig](./src/sample.zig) which has an example for all supported types.
You can view [sample.zig](./src/sample.zig) which has examples for all supported types.

### FFI

Arrow has a [C ABI](https://arrow.apache.org/docs/format/CDataInterface.html) that allows in-memory array importing and exporting that only copies metadata.
Arrow has a [C ABI](https://arrow.apache.org/docs/format/CDataInterface.html) that allows importing and exporting arrays over an FFI boundary by only copying metadata.

#### Export

If you have a normal `Array`, you can export it to an `abi.Schema` and `abi.Array` to share the memory with other code (e.g. scripting languages). When you do so, that code is responsible for calling `abi.Schema.release(&schema)` and `abi.Array.release(&array)` to free memory.

```zig
var abi_schema = try abi.Schema.init(array);
const array = try arrow.sample.all(allocator);
errdefer array.deinit();
// Note: these are stack allocated.
var abi_arr = try abi.Array.init(array);
var abi_schema = try abi.Schema.init(array);
externFn(&abi_schema, &abi_arr);
```
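
If there is no consumer on the other side of the boundary (for example, in a test), the exported structs can be released directly, as in [examples/ffi.zig](./examples/ffi.zig):

```zig
// Normally `externFn` would call these. The order doesn't matter.
abi_schema.release.?(&abi_schema);
abi_arr.release.?(&abi_arr);
```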

@@ -132,10 +129,11 @@ externFn(&abi_schema, &abi_arr);
If you have an `abi.Schema` and `abi.Array`, you can transform them into an `ImportedArray` that contains a normal `Array`. Be a good steward and free the memory with `imported.deinit()`.

```zig
const array = sample.all();
const array = try arrow.sample.all(allocator);
var abi_schema = try abi.Schema.init(array);
var abi_arr = try abi.Array.init(array);
var imported = try ImportedArray.init(allocator, abi_arr, abi_schema);
var imported = try arrow.ffi.ImportedArray.init(allocator, abi_arr, abi_schema);
defer imported.deinit();
```

@@ -146,16 +144,17 @@ Arrow has an [IPC format](https://arrow.apache.org/docs/format/Columnar.html#ser
I cannot in good faith recommend using this format for the following reasons:

1. [Array types](#Usage) are complicated and difficult to generically map to other type systems.
2. Despite claiming to be zero-copy, if an array's buffer uses compression it must be copied. This implementation will also copy if its alignment is not 64 (the C++ implementation uses 8).
2. Despite claiming to be zero-copy, if an array's buffer uses compression it must be copied. This implementation will also copy if its alignment is not 64 (the C++ implementation and most files use 8).
3. Post-compression size savings compared to CSV are marginal.
4. Poor backwards compatibility. There have been 5 versions of the format, most undocumented, with multiple breaking changes.

I also have the following gripes from implementing it:

1. Poor existing tooling. Tools cannot inspect individual messages and have poor error messages. Despite the message format being designed for streaming, existing tools work on the entire file at once.
2. Poor documentation. The upstream [`File.fbs`](https://github.com/apache/arrow/blob/main/format/File.fbs) has numerous **incorrect** comments.
3. The message custom metadata that would make the format more useful than just shared `ffi` memory is inaccessible in most implementations (including this one) since they are justifiably focused on record batches.
4. Existing implementations do not support reading/writing record batches with different schemas.
1. Poor existing tooling. Tools cannot inspect individual messages and have poor error messages.
2. Despite the message format being designed for streaming, existing tools work on the entire file at once.
3. Poor documentation. The upstream [`File.fbs`](https://github.com/apache/arrow/blob/main/format/File.fbs) has numerous **incorrect** comments.
4. The custom message metadata that would make the format more useful than plain shared `ffi` memory is inaccessible in most implementations (including this one), since they are justifiably focused on record batches.
5. Existing implementations do not support reading/writing record batches with different schemas.

This implementation is only provided as a way to dump normal `Array`s to disk for later inspection.

@@ -181,7 +180,7 @@ If feeling daring, you can use the streaming API of `ipc.reader.Reader(ReaderTyp
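
The usual file-reading path, as exercised in [examples/ipc.zig](./examples/ipc.zig) from this commit, looks like this (the lower-level `ipc.reader.Reader(ReaderType)` streaming API is not shown here):

```zig
const std = @import("std");
const arrow = @import("arrow");

const ipc = arrow.ipc;
const allocator = std.testing.allocator;

test "read file" {
    var ipc_reader = try ipc.reader.fileReader(allocator, "./testdata/tickers.arrow");
    defer ipc_reader.deinit();

    while (try ipc_reader.nextBatch()) |rb| {
        // Do something with rb
        defer rb.deinit();
    }
}
```
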
You can write record batches of a normal `Arrow` array with `ipc.writer.fileWriter`:

```zig
const batch = try sample.all(std.testing.allocator);
const batch = try arrow.sample.all(std.testing.allocator);
try batch.toRecordBatch("record batch");
defer batch.deinit();
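
// The rest of this example mirrors the "write file" test in examples/ipc.zig:
const fname = "./sample.arrow";
var ipc_writer = try ipc.writer.fileWriter(std.testing.allocator, fname);
defer ipc_writer.deinit();
try ipc_writer.write(batch);
try ipc_writer.finish();
```
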
41 changes: 27 additions & 14 deletions build.zig
@@ -4,42 +4,45 @@ pub const name = "arrow";
const path = "src/lib.zig";

pub fn build(b: *std.Build) !void {
// Expose to zig dependents
_ = b.addModule(name, .{ .source_file = .{ .path = path } });

const target = b.standardTargetOptions(.{});
const optimize = b.standardOptimizeOption(.{});

const lib = b.addSharedLibrary(.{
.name = "arrow-zig", // Avoid naming conflict with libarrow
.root_source_file = .{ .path = path },
.target = target,
.optimize = optimize,
});
b.installArtifact(lib);

const flatbuffers_dep = b.dependency("flatbuffers-zig", .{
.target = target,
.optimize = optimize,
});
const flatbuffers_mod = flatbuffers_dep.module("flatbuffers");
lib.addModule("flatbuffers", flatbuffers_mod); // For generated files to use lib

const lz4 = b.dependency("lz4", .{
.target = target,
.optimize = optimize,
});
const lz4_mod = lz4.module("lz4");
lib.addModule("lz4", lz4_mod);
// Expose to zig dependents
const module = b.addModule(name, .{
.source_file = .{ .path = path },
.dependencies = &.{
.{ .name = "flatbuffers", .module = flatbuffers_mod },
.{ .name = "lz4", .module = lz4_mod },
},
});

const lib = b.addSharedLibrary(.{
.name = "arrow-zig", // Avoid naming conflict with libarrow
.root_source_file = .{ .path = path },
.target = target,
.optimize = optimize,
});
b.installArtifact(lib);

const test_step = b.step("test", "Run library tests");
const main_tests = b.addTest(.{
.root_source_file = .{ .path = path },
.target = target,
.optimize = optimize,
});
main_tests.addModule("flatbuffers", flatbuffers_mod); // For generated files to use lib
main_tests.addModule("lz4", lz4_mod);
main_tests.addModule("flatbuffers", flatbuffers_mod);
const run_main_tests = b.addRunArtifact(main_tests);
test_step.dependOn(&run_main_tests.step);

@@ -54,4 +57,14 @@ pub fn build(b: *std.Build) !void {
ipc_test.step.dependOn(&run_main_tests.step);
integration_test_step.dependOn(&ipc_test.step);
integration_test_step.dependOn(&ffi_test.step);

const example_test_step = b.step("test-examples", "Run example tests");
const example_tests = b.addTest(.{
.root_source_file = .{ .path = "./examples/all.zig" },
.target = target,
.optimize = optimize,
});
example_tests.addModule("arrow", module);
const run_example_tests = b.addRunArtifact(example_tests);
example_test_step.dependOn(&run_example_tests.step);
}
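
For context, a dependent project can consume the exposed `arrow` module with the same 0.11-style build APIs used above. This is a sketch; the `"arrow-zig"` dependency name and executable details are illustrative, not taken from this repository:

```zig
// Dependent project's build.zig (sketch).
const std = @import("std");

pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});

    const exe = b.addExecutable(.{
        .name = "my-app", // illustrative
        .root_source_file = .{ .path = "src/main.zig" },
        .target = target,
        .optimize = optimize,
    });

    // "arrow-zig" is an assumed name for the entry in the dependent's build.zig.zon.
    const arrow_dep = b.dependency("arrow-zig", .{ .target = target, .optimize = optimize });
    exe.addModule("arrow", arrow_dep.module("arrow"));

    b.installArtifact(exe);
}
```
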
5 changes: 5 additions & 0 deletions examples/all.zig
@@ -0,0 +1,5 @@
test {
_ = @import("./build_arrays.zig");
_ = @import("./ffi.zig");
_ = @import("./ipc.zig");
}
33 changes: 33 additions & 0 deletions examples/build_arrays.zig
@@ -0,0 +1,33 @@
const std = @import("std");
const arrow = @import("arrow");

const abi = arrow.abi;
const Builder = arrow.array.Builder;
const DictBuilder = arrow.array.dict.Builder;
const allocator = std.testing.allocator;

test "build arrays" {
var b = try Builder(?i16).init(allocator);
errdefer b.deinit();

try b.append(null);
try b.append(32);
try b.append(33);
try b.append(34);

const array = try b.finish();
defer array.deinit();
}

test "build dictionary array" {
var b = try DictBuilder(?[]const u8).init(allocator);
errdefer b.deinit();

try b.appendNull();
try b.append("hello");
try b.append("there");
try b.append("friend");

const array = try b.finish();
defer array.deinit();
}
29 changes: 29 additions & 0 deletions examples/ffi.zig
@@ -0,0 +1,29 @@
const std = @import("std");
const arrow = @import("arrow");

const abi = arrow.ffi.abi;
const allocator = std.testing.allocator;

test "ffi export" {
const array = try arrow.sample.all(allocator);
errdefer array.deinit();

// Note: these are stack allocated.
var abi_arr = try abi.Array.init(array);
var abi_schema = try abi.Schema.init(array);

// externFn(&abi_schema, &abi_arr);

// Normally `externFn` would call these. The order doesn't matter.
abi_schema.release.?(&abi_schema);
abi_arr.release.?(&abi_arr);
}

test "ffi import" {
const array = try arrow.sample.all(allocator);

var abi_schema = try abi.Schema.init(array);
var abi_arr = try abi.Array.init(array);
var imported = try arrow.ffi.ImportedArray.init(allocator, abi_arr, abi_schema);
defer imported.deinit();
}
29 changes: 29 additions & 0 deletions examples/ipc.zig
@@ -0,0 +1,29 @@
const std = @import("std");
const arrow = @import("arrow");

const ipc = arrow.ipc;
const allocator = std.testing.allocator;

test "read file" {
var ipc_reader = try ipc.reader.fileReader(allocator, "./testdata/tickers.arrow");
defer ipc_reader.deinit();

while (try ipc_reader.nextBatch()) |rb| {
// Do something with rb
defer rb.deinit();
}
}

test "write file" {
const batch = try arrow.sample.all(std.testing.allocator);
try batch.toRecordBatch("record batch");
defer batch.deinit();

const fname = "./sample.arrow";
var ipc_writer = try ipc.writer.fileWriter(std.testing.allocator, fname);
defer ipc_writer.deinit();
try ipc_writer.write(batch);
try ipc_writer.finish();

try std.fs.cwd().deleteFile(fname);
}
9 changes: 6 additions & 3 deletions src/array/flat.zig
@@ -63,16 +63,19 @@ pub fn BuilderAdvanced(comptime T: type, comptime opts: tags.BinaryOptions) type
.Bool, .Int, .Float, .ComptimeInt, .ComptimeFloat => try self.values.append(value),
.Pointer => |p| switch (p.size) {
.Slice => {
std.debug.assert(layout == .VariableBinary);
try self.values.appendSlice(value);
try self.offsets.append(@intCast(self.values.items.len));
},
else => |t| @compileError("unsupported pointer type " ++ @tagName(t)),
},
.Array => |a| {
std.debug.assert(is_fixed);
if (a.len != fixed_len)
@compileError(std.fmt.comptimePrint("expected array of len {d} but got array of len {d}", .{ fixed_len, a.len }));
@compileError(
std.fmt.comptimePrint(
"expected array of len {d} but got array of len {d}",
.{ fixed_len, a.len },
),
);
try self.values.appendSlice(&value);
},
.Null => {
8 changes: 7 additions & 1 deletion src/array/list.zig
@@ -93,7 +93,13 @@ pub fn BuilderAdvanced(
else => |t| @compileError("unsupported pointer type " ++ @tagName(t)),
},
.Array => |a| {
std.debug.assert(a.len == fixed_len);
if (a.len != fixed_len)
@compileError(
std.fmt.comptimePrint(
"expected array of len {d} but got array of len {d}",
.{ fixed_len, a.len },
),
);
for (value) |v| try self.child.append(v);
},
else => |t| @compileError("unsupported append type " ++ @tagName(t)),
46 changes: 10 additions & 36 deletions src/ffi/abi.zig
@@ -7,6 +7,12 @@ const export_ = @import("export.zig");
pub const Schema = extern struct {
const Self = @This();

pub const PrivateData = struct {
allocator: Allocator,
name_len: usize,
abi_format_on_heap: bool,
};

format: [*:0]const u8, // Managed
name: ?[*:0]const u8 = null, // Managed
metadata: ?[*:0]const u8 = null, // Managed
@@ -26,25 +32,9 @@ pub const Schema = extern struct {
std.debug.assert(@sizeOf(@This()) == 72);
}

// Creates a new abi.Schema from a abi.Array. Caller owns abi.Schema and must call `.release`.
/// Creates a new abi.Schema from a abi.Array. Caller owns abi.Schema and must call `.release`.
pub fn init(array: *array_mod.Array) !Self {
const layout = array.tag.abiLayout();
const n_children = if (layout == .Dictionary) 0 else array.children.len;
const Exporter = export_.schema;

return .{
.format = try array.tag.abiFormat(array.allocator, n_children),
.name = if (array.name.len == 0) null else try array.allocator.dupeZ(u8, array.name),
.metadata = null,
.flags = .{
.nullable = array.tag.nullable(),
},
.n_children = @bitCast(n_children),
.children = try Exporter.children(array, n_children),
.dictionary = try Exporter.dictionary(array, layout),
.release = Exporter.release,
.private_data = @ptrCast(array),
};
return export_.schema.init(array);
}

pub fn deinit(self: *Self) void {
@@ -136,25 +126,9 @@ pub const Array = extern struct {
std.debug.assert(@sizeOf(@This()) == 80);
}

// Moves array.Array into a new abi.Array. Caller owns abi.Array and must call `.release`.
/// Moves array.Array into a new abi.Array. Caller owns abi.Array and must call `.release`.
pub fn init(array: *array_mod.Array) !Self {
const layout = array.tag.abiLayout();
const n_buffers = layout.nBuffers();
const n_children = if (layout == .Dictionary) 0 else array.children.len;
const Exporter = export_.array;

return .{
.length = @bitCast(array.length),
.null_count = @bitCast(array.null_count),
.offset = 0,
.n_buffers = @bitCast(n_buffers),
.n_children = @bitCast(n_children),
.buffers = try Exporter.buffers(array, n_buffers),
.children = try Exporter.children(array, n_children),
.dictionary = try Exporter.dictionary(array, layout),
.release = Exporter.release,
.private_data = @ptrCast(array),
};
return export_.array.init(array);
}

pub fn deinit(self: *Self) void {
