diff --git a/docs/components/mosaic.js b/docs/components/mosaic.js index f625480..923ae18 100644 --- a/docs/components/mosaic.js +++ b/docs/components/mosaic.js @@ -1,9 +1,4 @@ -import * as vg from "npm:@uwdata/vgplot@0.6.0"; - -export function url(file) { - const url = new URL(file, window.location); - return `${url}`; -} +import * as vg from "npm:@uwdata/vgplot"; export async function vgplot(queries) { const mc = vg.coordinator(); @@ -14,3 +9,7 @@ export async function vgplot(queries) { } return api; } + +export function url(file) { + return `${new URL(file, window.location)}`; +} diff --git a/docs/data-loading.md b/docs/data-loading.md new file mode 100644 index 0000000..7ae036a --- /dev/null +++ b/docs/data-loading.md @@ -0,0 +1,72 @@ +--- +title: Data Loading with DuckDB +--- + +# Data Loading with DuckDB + +This page provides guidance for using DuckDB in Observable Framework data loaders, and then deploying them using GitHub Actions. + +## Using DuckDB in Data Loaders + +The [NYC Taxi Rides](nyc-taxi-rides) example uses a [data loader](https://observablehq.com/framework/loaders) to perform data preparation, generating pre-projected data and writing it to a Parquet file. + +The loader below is a shell script that calls the command line interface to DuckDB. +The `duckdb` executable must be on your environment path... but more on that below! 
+ +```sh +duckdb :memory: << EOF +-- Load spatial extension +INSTALL spatial; LOAD spatial; + +-- Project, following the example at https://github.com/duckdb/duckdb_spatial +CREATE TEMP TABLE rides AS SELECT + pickup_datetime::TIMESTAMP AS datetime, + ST_Transform(ST_Point(pickup_latitude, pickup_longitude), 'EPSG:4326', 'ESRI:102718') AS pick, + ST_Transform(ST_Point(dropoff_latitude, dropoff_longitude), 'EPSG:4326', 'ESRI:102718') AS drop +FROM 'https://uwdata.github.io/mosaic-datasets/data/nyc-rides-2010.parquet'; + +-- Output parquet file to stdout +COPY (SELECT + (HOUR(datetime) + MINUTE(datetime)/60) AS time, + ST_X(pick)::INTEGER AS px, ST_Y(pick)::INTEGER AS py, + ST_X(drop)::INTEGER AS dx, ST_Y(drop)::INTEGER AS dy +FROM rides) TO 'trips.parquet' WITH (FORMAT PARQUET); +EOF + +cat trips.parquet >&1 # Write output to stdout +rm trips.parquet # Clean up +``` + +We invoke DuckDB with the `:memory:` argument to indicate an in-memory database. +We also use the `<< EOF` shell script syntax to provide multi-line input, consisting of the desired SQL queries to run. + +The last query (`COPY ...`) writes a Parquet file to disk. +However, Observable Framework requires that we instead write data to [`stdout`](https://en.wikipedia.org/wiki/Standard_streams#Standard_output_(stdout)). +On some platforms we can do this by writing to the file descriptor `/dev/stdout`. +However, this file does not exist on all platforms – including in GitHub Actions, where this query will fail. + +So we complete the script with two additional commands: + +- Write (`cat`) the bytes of the Parquet file to `stdout`. +- Remove (`rm`) the generated file, as we no longer need it. + +## Using DuckDB in GitHub Actions + +To deploy our Observable Framework site on GitHub, we use a [GitHub Actions workflow](https://github.com/uwdata/mosaic-framework-example/blob/main/.github/workflows/deploy.yml). 
+As noted earlier, one issue when running in GitHub Actions is the lack of file-based access to `stdout`. +But another, even more basic, issue is that we need to have DuckDB installed! + +This snippet installs DuckDB within a workflow. +We download a zip file of the official release, unpack it, copy the `duckdb` executable to `/opt/duckdb`, and then link to `duckdb` in the directory `/usr/bin`, ensuring it is accessible to subsequent scripts: + +```yaml +steps: + - name: Install DuckDB CLI + run: | + wget https://github.com/duckdb/duckdb/releases/download/v0.10.0/duckdb_cli-linux-amd64.zip + unzip duckdb_cli-linux-amd64.zip + mkdir /opt/duckdb && mv duckdb /opt/duckdb && chmod +x /opt/duckdb/duckdb && sudo ln -s /opt/duckdb/duckdb /usr/bin/duckdb + rm duckdb_cli-linux-amd64.zip +``` + +We perform this step before site build steps, ensuring `duckdb` is installed and ready. \ No newline at end of file diff --git a/docs/data/seattle-weather.parquet b/docs/data/seattle-weather.parquet new file mode 100644 index 0000000..996e335 Binary files /dev/null and b/docs/data/seattle-weather.parquet differ diff --git a/docs/flight-delays.md b/docs/flight-delays.md index 5b812e4..8d8533a 100644 --- a/docs/flight-delays.md +++ b/docs/flight-delays.md @@ -9,6 +9,7 @@ const vg = vgplot(vg => [ vg.loadParquet("flights", url(flights)) ]); ``` # Flight Delays +## Interactive exploration of large-scale transportation data What contributes to delayed airline flights? Let's examine a sample of over 200,000 flight records provided by the [U.S. DOT Bureau of Transportation Statistics](https://www.transtats.bts.gov/ontime/). 
diff --git a/docs/index.md b/docs/index.md index e5b629a..46776cb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,18 +1,86 @@ +--- +title: Mosaic + Framework Examples +--- + # Mosaic + Framework Examples +## Using Mosaic and DuckDB in Observable Framework + +```js +import { vgplot, url } from "./components/mosaic.js"; +const weather = await FileAttachment("data/seattle-weather.parquet").url(); +const vg = vgplot(vg => [ vg.loadParquet("weather", url(weather)) ]); +``` + +This site shares examples of integrating Mosaic and DuckDB data loaders into Observable Framework. All source markup and code is available at <https://github.com/uwdata/mosaic-framework-example>. + +[Mosaic](https://uwdata.github.io/mosaic) is a system for linking data visualizations, tables, and input widgets, all leveraging a database ([DuckDB](https://duckdb.org/)) for scalable processing. With Mosaic, you can interactively visualize and explore millions and even billions of data points. + +Here is a simple example, an interactive dashboard of weather in Seattle: -[Mosaic](https://uwdata.github.io/mosaic) is a system for linking data visualizations, tables, and input widgets, all leveraging a database for scalable processing. With Mosaic, you can interactively visualize and explore millions and even billions of data points. +```js +const $click = vg.Selection.single(); +const $domain = vg.Param.array(["sun", "fog", "drizzle", "rain", "snow"]); +const $colors = vg.Param.array(["#e7ba52", "#a7a7a7", "#aec7e8", "#1f77b4", "#9467bd"]); +const $range = vg.Selection.intersect(); +``` -A key idea is that interface elements (Mosaic _clients_) publish their data needs as queries that are managed by a central _coordinator_. The coordinator may further optimize queries before issuing them to a backing _data source_ such as [DuckDB](https://duckdb.org/). 
+```js +vg.vconcat( + vg.hconcat( + vg.plot( + vg.dot( + vg.from("weather", {filterBy: $click}), + { + x: vg.dateMonthDay("date"), + y: "temp_max", + fill: "weather", + r: "precipitation", + fillOpacity: 0.7 + } + ), + vg.intervalX({as: $range, brush: {fill: "none", stroke: "#888"}}), + vg.highlight({by: $range, fill: "#ccc", fillOpacity: 0.2}), + vg.colorLegend({as: $click, columns: 1}), + vg.xyDomain(vg.Fixed), + vg.xTickFormat("%b"), + vg.colorDomain($domain), + vg.colorRange($colors), + vg.rDomain(vg.Fixed), + vg.rRange([2, 10]), + vg.width(680), + vg.height(300) + ) + ), + vg.plot( + vg.barX( + vg.from("weather"), + {x: vg.count(), y: "weather", fill: "#ccc", fillOpacity: 0.2} + ), + vg.barX( + vg.from("weather", {filterBy: $range}), + {x: vg.count(), y: "weather", fill: "weather"} + ), + vg.toggleY({as: $click}), + vg.highlight({by: $click}), + vg.xDomain(vg.Fixed), + vg.yDomain($domain), + vg.yLabel(null), + vg.colorDomain($domain), + vg.colorRange($colors), + vg.width(680) + ) +) +``` -This site shares examples of integrating Mosaic and DuckDB data loaders into Observable Framework. Source code is available at . +A key idea is that interface elements (Mosaic _clients_) publish their data needs as queries that are managed by a central _coordinator_. The coordinator may further optimize queries before issuing them to a backing _data source_ like DuckDB. 
-## Example Data Apps +## Example Articles -- [Flight Delays](/flight-delays) - explore over 200,000 flight records -- [NYC Taxi Rides](/nyc-taxi-rides) - load and visualize 1M NYC taxi cab rides -- [Observable Latency](/observable-latency) - a dense view of over 7M web requests +- [Flight Delays](flight-delays) - explore over 200,000 flight records +- [NYC Taxi Rides](nyc-taxi-rides) - load and visualize 1M NYC taxi cab rides +- [Observable Web Latency](observable-latency) - re-visiting a view of over 7M web requests ## Implementation Notes -- _Using DuckDB in data loaders and GitHub Actions_ -- _Using Mosaic + DuckDB-WASM in Observable Framework_ +- [Using DuckDB in Data Loaders and GitHub Actions](data-loading) +- [Using Mosaic + DuckDB-WASM in Observable Framework](mosaic-duckdb-wasm) diff --git a/docs/mosaic-duckdb-wasm.md b/docs/mosaic-duckdb-wasm.md new file mode 100644 index 0000000..ca63308 --- /dev/null +++ b/docs/mosaic-duckdb-wasm.md @@ -0,0 +1,79 @@ +--- +title: Using Mosaic & DuckDB-WASM +--- + +# Using Mosaic & DuckDB-WASM + +This page describes how to set up Mosaic and DuckDB-WASM to "play nice" with Observable's reactive runtime. +Unlike standard JavaScript, Observable will happily run JavaScript "out-of-order". +Observable uses dependencies among code blocks, rather than the order within the file, to determine what to run and when to run it. +This reactivity can cause problems for code that depends on "side effects" that are not tracked by Observable's runtime. 
+ +## Importing Mosaic and Loading Data + +Here is how we initialize [Mosaic's vgplot API](https://uwdata.github.io/mosaic/what-is-mosaic/) in the [Flight Delays](flight-delays) example: + +```js run=false +import { vgplot, url } from "./components/mosaic.js"; +const flights = FileAttachment("data/flights-200k.parquet").url(); +const vg = vgplot(vg => [ vg.loadParquet("flights", url(flights)) ]); +``` + +We first import a custom `vgplot` initialization method that configures Mosaic, loads data into DuckDB, and returns the vgplot API. We also import a custom `url` method which we will later use to prepare URLs that will be loaded by DuckDB. + +Next, we reference the data files we plan to load. +As Observable Framework needs to track which files are used, we must use its `FileAttachment` mechanism. +However, we don't actually want to load the file yet, so we instead request a URL. + +Finally, we invoke `vgplot(...)` to initialize Mosaic, which returns a (Promise to an) instance of the vgplot API. +This method takes a single function as input, which should return an array of SQL queries to execute upon load. + +We use the `url()` helper method to prepare a file URL so that DuckDB can successfully load it — the url string returned by `FileAttachment(...).url()` is a _relative_ path like `./_file/data/doodads.csv`. +DuckDB will mistakenly interpret this as a file system path rather than a web URL. +The `url()` helper produces a full URL (with `https://`, hostname, etc.), based on the location of the current page: + +```js run=false +export function url(file) { + return `${new URL(file, window.location)}`; +} +``` + +The `vg` argument to the data loader callback is exactly the same API instance that is ultimately returned by `vgplot`. +Perhaps this feels a bit circular, with `vg` provided to a callback, with the ultimate result being a reference to `vg`... why the gymnastics? 
+We want to have access to the API to support data loading, using Mosaic's helper functions to install extensions and load data files. +At the same time, we don't want to assign the _outer_ `vg` variable until data loading is complete. +That way, downstream code that uses the API to build visualizations will not get evaluated by the Observable runtime until _after_ data loading is complete. + +Once `vg` is assigned, the data will be loaded, and we can use the API to create [visualizations](https://uwdata.github.io/mosaic/vgplot/), +[inputs](https://uwdata.github.io/mosaic/inputs/), +[params](https://uwdata.github.io/mosaic/core/#params), and +[selections](https://uwdata.github.io/mosaic/core/#selections). + +## Mosaic Initialization + +For reference, here's the `vgplot()` method implementation: + +```js run=false +import * as vg from "npm:@uwdata/vgplot"; + +export async function vgplot(queries) { + const mc = vg.coordinator(); + const api = vg.createAPIContext({ coordinator: mc }); + mc.databaseConnector(vg.wasmConnector()); + if (queries) { + await mc.exec(queries(api)); + } + return api; +} +``` + +We first get a reference to the central coordinator, which manages all queries. +We create a new API context, which we eventually will return. + +Next, we configure Mosaic to use DuckDB-WASM. +The `wasmConnector()` method creates a new database instance in a worker thread. + +We then invoke the `queries` callback to get a list of data loading queries. +We issue the queries to DuckDB using the coordinator's `exec()` method and `await` the result. + +Once that completes, we're ready to go! 
diff --git a/docs/observable-latency.md b/docs/observable-latency.md index 7a4ce50..9d8bc5c 100644 --- a/docs/observable-latency.md +++ b/docs/observable-latency.md @@ -1,5 +1,5 @@ --- -title: Observable Latency +title: Observable Web Latency --- ```js @@ -9,11 +9,21 @@ const vg = vgplot(vg => [ vg.loadParquet("latency", url(latency)) ]); ``` # Observable Web Latency +## Recreating a custom graphic using Mosaic vgplot -Web request latency on Observable.com. -Each pixel in the heatmap shows the most common route (URL pattern) at a given response latency within a time interval. +The Observable Framework documentation includes a wonderful example about [Analyzing web logs](https://observablehq.com/framework/examples/api/), which looks at the latency (response time) of various routes on the Observable.com site. -Based on an [Observable Framework example](https://observablehq.com/framework/examples/api/). +The marquee graphic is a pixel-level heatmap of over 7 million requests to Observable servers over the course of a week. +The chart plots time vs. latency, where each pixel is colored according to the most common route (URL pattern) in that time and latency bin. + +That said, a lot is going on in the original [custom heatmap component](https://github.com/observablehq/framework/blob/main/examples/api/docs/components/apiHeatmap.js): + +- The data is pre-binned and aggregated for fast loading +- Observable Plot and HTML Canvas code are intermixed in non-trivial ways +- Frame-based animation is used to progressively render the graphic + +Below we re-implement this graphic using [Mosaic vgplot](https://uwdata.github.io/mosaic/what-is-mosaic/), using a simple, standalone specification. +We also leverage Mosaic's facilities for scalable filtering and cross-chart linking. ```js const $filter = vg.Selection.intersect(); @@ -75,4 +85,51 @@ vg.plot( ) ``` -Use the bar chart of most-requested routes to filter the heatmap and isolate specific patterns. 
+_Select elements in the bar chart of most-requested routes above to filter the heatmap and isolate specific patterns._ + +## Implementation Notes + +While the original uses a pre-binned dataset, we might want to create graphics like this in a more exploratory context. So first we "reverse-engineered" the data into original units, with columns for `time` and `latency` values, in addition to `route` and request `count`. We can leverage DuckDB to re-bin and filter data on the fly! + +We then implement the latency heatmap using a vgplot `raster` mark. Here is what that looks like when using a declarative Mosaic specification in YAML: + +```yaml +plot: +- mark: frame + fill: black +- mark: raster + data: { from: latency, filterBy: $filter } + x: time + y: latency + fill: { argmax: [route, count] } + fillOpacity: { sum: count } + width: 2016 + height: 500 + imageRendering: pixelated +colorDomain: Fixed +colorScheme: observable10 +opacityDomain: [0, 25] +opacityClamp: true +yScale: log +yLabel: ↑ Duration (ms) +yDomain: [0.5, 10000] +yTickFormat: s +xScale: utc +xLabel: null +xDomain: [1706227200000, 1706832000000] +width: 1063 +height: 550 +margins: { left: 35, top: 20, bottom: 30, right: 20 } +``` + +Key bits of the specification include: + +- Binning to a pixel grid based on `time` (_x_) and `latency` (_y_). +- Mapping the pixel fill color to the `route` with largest request `count` per bin. +- Mapping the pixel fill opacity to the sum of `count`s within a bin. +- Interactive filtering using a selection (`$filter`), populated by clicking bars in the bar chart of routes. The `colorDomain: Fixed` setting ensures consistent colors; it prevents re-coloring when the data is filtered. + +However, this re-creation does diverge from the original in a few ways: + +- The coloring is not identical. Ideally, vgplot should provide greater control over sorting scale domains (here, the list of unique `route` values). 
+- The re-creation above does not include nice tooltips like the original. diff --git a/observablehq.config.ts b/observablehq.config.ts index 2c58582..9e7a334 100644 --- a/observablehq.config.ts +++ b/observablehq.config.ts @@ -1,25 +1,33 @@ // See https://observablehq.com/framework/config for documentation. export default { // The project’s title; used in the sidebar and webpage titles. - title: "Mosaic + Framework Examples", + title: "Mosaic + Framework", // The pages and sections in the sidebar. If you don’t specify this option, // all pages will be listed in alphabetical order. Listing pages explicitly // lets you organize them into sections and have unlisted pages. - // pages: [ - // { - // name: "Examples", - // pages: [ - // {name: "Dashboard", path: "/example-dashboard"}, - // {name: "Report", path: "/example-report"} - // ] - // } - // ], + pages: [ + { + name: "Example Articles", + pages: [ + {name: "Flight Delays", path: "/flight-delays"}, + {name: "NYC Taxi Rides", path: "/nyc-taxi-rides"}, + {name: "Observable Latency", path: "/observable-latency"}, + ] + }, + { + name: "Implementation Notes", + pages: [ + {name: "Data Loading with DuckDB", path: "/data-loading"}, + {name: "Mosaic & DuckDB-WASM", path: "/mosaic-duckdb-wasm"} + ] + } + ], // Some additional configuration options and their defaults: // theme: "default", // try "light", "dark", "slate", etc. // header: "", // what to show in the header (HTML) - footer: "", // what to show in the footer (HTML) + footer: `Interactive Data Lab, University of Washington`, toc: true, // whether to show the table of contents pager: false, // whether to show previous & next links in the footer // root: "docs", // path to the source root for preview