From b4767de3f46de86cc23d52b4a1347a845c784dc2 Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Thu, 15 Jun 2023 14:25:43 -0400 Subject: [PATCH 01/28] Add a Rust implementation of the Perl preprocessing logic. This implementation uses a full HTML parser (Servo's, html5ever) and operates on a DOM. It ends up being somewhat more verbose than Perl, but hopefully also more extensible/maintainable. --- .gitignore | 6 + Cargo.lock | 682 +++++++++++++++++++++++++++++++++++++ Cargo.toml | 13 + build.sh | 14 +- src/annotate_attributes.rs | 471 +++++++++++++++++++++++++ src/boilerplate.rs | 236 +++++++++++++ src/dom_utils.rs | 383 +++++++++++++++++++++ src/interface_index.rs | 391 +++++++++++++++++++++ src/main.rs | 76 +++++ src/parser.rs | 217 ++++++++++++ src/represents.rs | 152 +++++++++ src/serializer.rs | 50 +++ src/tag_omission.rs | 329 ++++++++++++++++++ 13 files changed, 3015 insertions(+), 5 deletions(-) create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/annotate_attributes.rs create mode 100644 src/boilerplate.rs create mode 100644 src/dom_utils.rs create mode 100644 src/interface_index.rs create mode 100644 src/main.rs create mode 100644 src/parser.rs create mode 100644 src/represents.rs create mode 100644 src/serializer.rs create mode 100644 src/tag_omission.rs diff --git a/.gitignore b/.gitignore index 923fda16..5bbd5ad1 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,9 @@ html/ output/ mdn/.id-list mdn/developer.mozilla.org/ +highlighter/ + + +# Added by cargo + +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..c7917763 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,682 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "aho-corasick" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" +dependencies = [ + "memchr", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bytes" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" + +[[package]] +name = "cc" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "errno" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + +[[package]] +name = "futf" 
+version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc", +] + +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + +[[package]] +name = "html-build" +version = "0.1.0" +dependencies = [ + "html5ever", + "markup5ever_rcdom", + "regex", + "tempfile", + "tokio", +] + +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "io-lifetimes" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" +dependencies = [ + "hermit-abi 0.3.1", + "libc", + "windows-sys", +] + +[[package]] +name = "libc" +version = "0.2.146" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" + +[[package]] +name = "linux-raw-sys" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" + +[[package]] +name = "lock_api" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "mio" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +dependencies = [ + "libc", + "wasi", + "windows-sys", +] + +[[package]] +name = 
"new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + +[[package]] +name = "num_cpus" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +dependencies = [ + "hermit-abi 0.2.6", + "libc", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] +name = "proc-macro2" +version = "1.0.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + 
"getrandom", +] + +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" + +[[package]] +name = "rustix" +version = "0.37.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b96e891d04aa506a6d1f318d2771bcb1c7dfda84e126660ace067c9b474bb2c0" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "serde" +version = "1.0.164" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + +[[package]] +name = "siphasher" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" + +[[package]] +name = "smallvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" + +[[package]] +name = "socket2" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" +dependencies = [ + "autocfg", + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys", +] + +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + 
"utf-8", +] + +[[package]] +name = "tokio" +version = "1.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" +dependencies = [ + "autocfg", + "bytes", + "libc", + "mio", + "num_cpus", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys", +] + +[[package]] +name = "tokio-macros" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + +[[package]] +name = "unicode-ident" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 
+ +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + +[[package]] +name = "xml5ever" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" +dependencies = [ + "log", + "mac", + "markup5ever", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..6f9f7202 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "html-build" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +tokio = { version = "1", features = ["full"] } +html5ever = "*" +markup5ever_rcdom = "*" +tempfile = "3" +regex = "1" \ No newline at end of file diff --git a/build.sh b/build.sh index a0a7b4a4..cfc36edd 100755 --- a/build.sh +++ b/build.sh @@ -529,13 +529,17 @@ function processSource { BUILD_TYPE="$2" cp -p entities/out/entities.inc "$HTML_CACHE" cp -p entities/out/entities-dtd.url "$HTML_CACHE" - if $VERBOSE; then - perl .pre-process-main.pl --verbose < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + if [ "${PROCESS_WITH_RUST:-0}" = "1" ]; then + cargo run -r <"$HTML_SOURCE/$SOURCE_LOCATION" >"$HTML_TEMP/source-whatwg-complete" else - perl .pre-process-main.pl < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + if $VERBOSE; then + perl .pre-process-main.pl --verbose < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + else + perl .pre-process-main.pl < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + fi + perl .pre-process-annotate-attributes.pl < "$HTML_TEMP/source-expanded-1" > "$HTML_TEMP/source-expanded-2" # this one could be merged + perl .pre-process-tag-omission.pl < "$HTML_TEMP/source-expanded-2" | perl .pre-process-index-generator.pl > "$HTML_TEMP/source-whatwg-complete" # this one could be 
merged fi - perl .pre-process-annotate-attributes.pl < "$HTML_TEMP/source-expanded-1" > "$HTML_TEMP/source-expanded-2" # this one could be merged - perl .pre-process-tag-omission.pl < "$HTML_TEMP/source-expanded-2" | perl .pre-process-index-generator.pl > "$HTML_TEMP/source-whatwg-complete" # this one could be merged runWattsi "$HTML_TEMP/source-whatwg-complete" "$HTML_TEMP/wattsi-output" "$HIGHLIGHT_SERVER_URL" if [[ $WATTSI_RESULT == "0" ]]; then diff --git a/src/annotate_attributes.rs b/src/annotate_attributes.rs new file mode 100644 index 00000000..8e5e5608 --- /dev/null +++ b/src/annotate_attributes.rs @@ -0,0 +1,471 @@ +//! Augments the content attribute list for each element with a description found in the Attributes table. + +use std::collections::HashMap; +use std::io; +use std::rc::Rc; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, LocalName, QualName}; +use markup5ever_rcdom::{Handle, NodeData}; + +use crate::dom_utils::{self, NodeHandleExt}; +use crate::parser; + +#[derive(Debug, Default)] +struct Descriptions { + /// The default description, as a list of nodes. + default: Vec, + + /// The variant description, if any, as an unparsed string. + variant: Option, +} + +#[derive(Debug)] +struct Edit { + /// Handle on the
element which is to be filled in. + dd: Handle, + + /// The data-x attribute which must be described. + key: StrTendril, + + /// Whether this location has requested the variant/alternate description. + wants_variant_description: bool, + + /// Whether this is described as having "special semantics" and so must be + /// formatted differently. + has_special_semantics: bool, +} + +pub struct Processor { + /// Map from attribute key (e.g., attr-elem-someattribute) to the + /// descriptions found in the Attributes table. + attributes: HashMap, + + /// List of
nodes in Content attributes sections that need to be filled in. + edits: Vec, +} + +impl Processor { + pub fn new() -> Self { + Processor { + attributes: HashMap::new(), + edits: Vec::new(), + } + } + + pub fn visit(&mut self, node: &Handle) { + // We're looking for a (which is under the Attributes heading). + if node.is_html_element(&local_name!("table")) && node.has_id("attributes-1") { + self.index_attribute_table(node); + } + + // We're looking for the following: + //
+ // ... + //
Content attributes:
+ //
Global attributes
+ //
href
+ //
someattribute
+ // ... + fn is_content_attribute_dt(dt: &Handle) -> bool { + if !dt.is_html_element(&local_name!("dt")) { + return false; + } + match dt.parent_node() { + Some(p) if p.is_html_element(&local_name!("dl")) && p.has_class("element") => (), + _ => return false, + } + let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); + dt.any_child(|c| c.attribute_is(&data_x, "concept-element-attributes")) + } + if is_content_attribute_dt(node) { + self.index_attribute_list(node); + } + } + + fn index_attribute_table(&mut self, table: &Handle) { + let tbody = match table + .children + .borrow() + .iter() + .find(|n| n.is_html_element(&local_name!("tbody"))) + { + Some(tbody) => tbody.clone(), + None => return, + }; + for row in tbody + .children + .borrow() + .iter() + .filter(|c| c.is_html_element(&local_name!("tr"))) + { + // Each row is expected to have this structure: + //
+ //
someattribute + // a; b; ... + // Description of how someattribute applies to a, b, etc. + // Description if the valid values + // And we want to extract the descriptions so that we can later insert them + // alongside the definitions of attr-a-someattribute, etc. + let row_children = row.children.borrow(); + let mut tds = row_children + .iter() + .filter(|c| c.is_html_element(&local_name!("td"))); + let (keys_td, description_td) = match (tds.next(), tds.next()) { + (Some(a), Some(b)) => (a, b), + _ => continue, + }; + + // These will be strings like "attr-input-maxlength", which identify particular element-attribute pairs. + let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); + for attr_key in keys_td + .children + .borrow() + .iter() + .filter_map(|c| c.get_attribute(&data_x).filter(|v| !v.is_empty())) + { + // Find the comment, if one exists, and extract its contents. + let description = description_td.children.borrow(); + let mut variant_comment = None; + let mut variant_str = None; + for node in description.iter() { + if let NodeData::Comment { ref contents } = node.data { + if contents.trim().starts_with("or:") { + variant_comment = Some(node); + variant_str = Some(StrTendril::from(contents.trim()[3..].trim_start())); + } + } + } + + // Store the (already parsed) ordinary description. If a variant + // comment exists, omit it and instead store its unparsed + // string. 
+ let descriptions = Descriptions { + default: description_td + .children + .borrow() + .iter() + .filter(|c| variant_comment.map_or(true, |vc| !Rc::ptr_eq(c, vc))) + .map(|c| c.deep_clone()) + .collect(), + variant: variant_str, + }; + let mut existing = self.attributes.entry(attr_key).or_default(); + if existing.default.is_empty() { + existing.default = descriptions.default; + } else if !descriptions.default.is_empty() { + if let NodeData::Text { ref contents } = existing.default.last().unwrap().data { + let mut borrow = contents.borrow_mut(); + if let Some(last_non_ws) = borrow.rfind(|c: char| !c.is_ascii_whitespace()) + { + let to_remove = borrow.len32() - (last_non_ws as u32) - 1; + borrow.pop_back(to_remove); + } + } + existing.default.push(Handle::create_text_node("; ")); + existing.default.extend(descriptions.default.into_iter()); + } + if existing.variant.is_none() { + existing.variant = descriptions.variant; + } else if descriptions.variant.is_some() { + let existing_variant = existing.variant.as_mut().unwrap(); + existing_variant.push_slice("; "); + existing_variant.push_tendril(&descriptions.variant.unwrap()); + } + } + } + } + + fn index_attribute_list(&mut self, dt: &Handle) { + // If a
contains , it is not annotated. + // If it contains , the description found in a comment is used instead. + // If it mentions "special semantics", it is joined with a colon rather than an em dash. + let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); + let parent = dt.parent_node().unwrap(); + let children = parent.children.borrow(); + self.edits.extend( + children + .iter() + .skip_while(|n| !Rc::ptr_eq(n, dt)) + .skip(1) + .filter(|n| n.is_element()) + .take_while(|e| e.is_html_element(&local_name!("dd"))) + .filter_map(|dd| { + let mut can_annotate = true; + let mut wants_variant_description = false; + let mut has_special_semantics = false; + let mut key = None; + dom_utils::scan_dom(dd, &mut |n| match &n.data { + NodeData::Comment { ref contents } if contents.trim() == "no-annotate" => { + can_annotate = false; + } + NodeData::Comment { ref contents } if contents.trim() == "variant" => { + wants_variant_description = true; + } + NodeData::Text { ref contents } + if contents.borrow().contains("has special semantics") => + { + has_special_semantics = true; + } + NodeData::Element { .. } => { + if key.is_none() { + key = n.get_attribute(&data_x); + } + } + _ => (), + }); + match (can_annotate, key) { + (true, Some(key)) => Some(Edit { + dd: dd.clone(), + key, + wants_variant_description, + has_special_semantics, + }), + _ => None, + } + }), + ); + } + + pub async fn apply(self) -> io::Result<()> { + let em_dash = StrTendril::from(" \u{2014} "); + + for Edit { + dd, + key, + wants_variant_description, + has_special_semantics, + } in self.edits + { + // Find the requested description to insert at this point. + let descriptions = match self.attributes.get(&key) { + Some(descriptions) => descriptions, + None => continue, + }; + let mut description: Vec = match descriptions { + Descriptions { + variant: Some(ref variant), + .. + } if wants_variant_description => { + parser::parse_fragment_async(variant[..].as_bytes(), &dd).await? 
+ } + _ if wants_variant_description => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "Attribute {key} wants variant description, but no was found" + ), + )) + } + Descriptions { ref default, .. } => { + default.iter().map(|n| n.deep_clone()).collect() + } + }; + + let mut dd_children = dd.children.borrow_mut(); + if has_special_semantics { + // Replace the trailing period with a separating colon. + if let Some(NodeData::Text { contents }) = dd_children.last_mut().map(|n| &n.data) { + let mut text = contents.borrow_mut(); + *text = StrTendril::from( + text.trim_end_matches(|c: char| c.is_ascii_whitespace() || c == '.'), + ); + text.push_slice(": "); + } + } else { + // Insert an em dash. + description.insert(0, Handle::create_text_node(em_dash.clone())); + } + + // Insert the description. + for child in description.iter_mut() { + child.parent.set(Some(Rc::downgrade(&dd))); + } + dd_children.extend(description); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parser::{parse_document_async, tests::serialize_for_test}; + + #[tokio::test] + async fn test_simple() -> io::Result<()> { + // This is a simple document with enough stuff in it. Elements are shown + // before and after the attributes table, to demonstrate that this is + // not sensitive to which order they occur in (i.e., these could be + // reordered in the HTML spec). + let document = parse_document_async( + r#" +

The a element

+
+
Categories +
Flow content +
Content attributes +
href +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Categories +
Flow content +
Content attributes +
href +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Categories +
Flow content +
Content attributes +
href + — Destination of the hyperlink +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Categories +
Flow content +
Content attributes +
href + — Destination of the hyperlink +
+ "#.trim() + ); + Ok(()) + } + + #[tokio::test] + async fn test_variant() -> io::Result<()> { + // This checks that and work correctly. + // i.e., the variant description is used where requested + let document = parse_document_async( + r#" +

The a element

+
+
Content attributes +
href +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Content attributes +
href +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Content attributes +
href + — Destination of the hyperlink +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Content attributes +
href + — click on shapes!
+ "#.trim() + ); + Ok(()) + } + + #[tokio::test] + async fn test_special_semantics() -> io::Result<()> { + // Checks that the special rules for using : instead of an em dash work. + let document = parse_document_async( + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element. +
+

Attributes

+ +
nameaAnchor name +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element: Anchor name +
+

Attributes

+ +
nameaAnchor name +
+ "#.trim() + ); + Ok(()) + } + + #[tokio::test] + async fn test_special_semantics_multiple() -> io::Result<()> { + // Checks that the special rules for joining any special semantics with a ; work. + let document = parse_document_async( + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element. +
+

Attributes

+ +
nameaAnchor name +
nameaName of the anchor +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element: Anchor name; Name of the anchor +
+

Attributes

+ +
nameaAnchor name +
nameaName of the anchor +
+ "#.trim() + ); + Ok(()) + } +} diff --git a/src/boilerplate.rs b/src/boilerplate.rs new file mode 100644 index 00000000..47aa587c --- /dev/null +++ b/src/boilerplate.rs @@ -0,0 +1,236 @@ +//! Replaces comments. +//! These can either be comment nodes (in which case the resulting fragment will +//! be inserted), or the complete value of an element's attribute (in which case +//! the text will become the attribute value). + +use std::io; +use std::path::{Path, PathBuf}; + +use html5ever::tendril::{self, SendTendril, StrTendril}; +use html5ever::{local_name, Attribute, LocalName, QualName}; +use markup5ever_rcdom::{Handle, NodeData}; +use tokio::fs::File; +use tokio::task::JoinHandle; + +use crate::dom_utils::NodeHandleExt; +use crate::parser; + +type SendStrTendril = SendTendril; + +enum Edit { + ReplaceHTML(Handle, JoinHandle>), + ReplaceAttr(Handle, QualName, JoinHandle>), + ReplaceText(Handle, JoinHandle>), +} + +pub struct Processor { + /// Path to look for boilerplate files. + path: PathBuf, + + /// Path to look for example files. + example_path: PathBuf, + + /// Changes to be made in the apply step. + edits: Vec, +} + +impl Processor { + pub fn new(path: impl Into, example_path: impl Into) -> Self { + Self { + path: path.into(), + example_path: example_path.into(), + edits: vec![], + } + } + + /// Should be called for each node in the document. + /// Identifies replacements which will be needed, and starts the necessary + /// I/O. + pub fn visit(&mut self, node: &Handle) { + match node.data { + // BOILERPLATE comments will need to be replaced with their + // corresponding HTML, parsed. Open the file so that we can do so on + // demand. 
+ NodeData::Comment { ref contents } if contents.starts_with("BOILERPLATE ") => { + let path = Path::new(contents[12..].trim()); + if is_safe_path(path) { + let file = tokio::spawn(File::open(self.path.join(path))); + self.edits.push(Edit::ReplaceHTML(node.clone(), file)); + } + } + // Pseudo-comments can also appear in element attributes. These are + // not parsed as HTML, so we simply want to read them into memory so + // they can be replaced. + NodeData::Element { ref attrs, .. } => { + for Attribute { + ref name, + ref value, + } in attrs.borrow().iter() + { + if value.starts_with("") { + let path = Path::new(value[16..value.len() - 3].trim()); + if is_safe_path(path) { + let file_contents = read_to_str_tendril(self.path.join(path)); + self.edits.push(Edit::ReplaceAttr( + node.clone(), + name.clone(), + file_contents, + )); + } + } + } + } + //
 and 
 which contain EXAMPLE also need to be
+            // replaced, but as plain text. These are loaded from the "examples"
+            // directory instead.
+            NodeData::Text { ref contents } => {
+                let borrowed_contents = contents.borrow();
+                let text = borrowed_contents.trim();
+                if !text.starts_with("EXAMPLE ") {
+                    return;
+                }
+                const PRE: LocalName = local_name!("pre");
+                const CODE: LocalName = local_name!("code");
+                let has_suitable_parent = node.parent_node().map_or(false, |p| {
+                    p.is_html_element(&PRE)
+                        || (p.is_html_element(&CODE)
+                            && p.parent_node().map_or(false, |p2| p2.is_html_element(&PRE)))
+                });
+                if has_suitable_parent {
+                    let path = Path::new(text[8..].trim());
+                    if is_safe_path(path) {
+                        let file_contents = read_to_str_tendril(self.example_path.join(path));
+                        self.edits
+                            .push(Edit::ReplaceText(node.clone(), file_contents))
+                    }
+                }
+            }
+            _ => (),
+        }
+    }
+
+    /// Applies the required replacements, in order.
+    ///
+    /// Each queued `Edit` carries a `JoinHandle` for I/O that was started
+    /// during the visit pass; the double `??` first propagates a task join
+    /// error, then the inner `io::Result` from the file operation.
+    pub async fn apply(self) -> io::Result<()> {
+        for edit in self.edits {
+            match edit {
+                // When parsing HTML, we need the context it's in so that the
+                // context-sensitive parsing behavior works correctly.
+                Edit::ReplaceHTML(node, replacement) => {
+                    // No parent means no context to parse the fragment in;
+                    // skip this edit rather than fail.
+                    let context = match node.parent_node() {
+                        Some(n) => n,
+                        _ => continue,
+                    };
+                    let file: File = replacement.await??;
+                    let new_children = parser::parse_fragment_async(file, &context).await?;
+                    node.replace_with(new_children);
+                }
+                // Attribute pseudo-comments are replaced with the raw file
+                // contents; they are not parsed as HTML.
+                Edit::ReplaceAttr(element, ref attr, replacement) => {
+                    element.set_attribute(attr, replacement.await??.into());
+                }
+                // EXAMPLE text nodes get their text swapped in place.
+                Edit::ReplaceText(element, replacement) => match element.data {
+                    NodeData::Text { ref contents } => {
+                        contents.replace(replacement.await??.into());
+                    }
+                    // The visit pass only queues ReplaceText for text nodes,
+                    // so anything else is a bug.
+                    _ => panic!("not text"),
+                },
+            }
+        }
+        Ok(())
+    }
+}
+
+/// Check that a path is safe to open, even if the source is potentially untrusted.
+///
+/// Only paths built entirely from normal components (and `.`) are accepted,
+/// so a path mentioned in the spec source cannot escape the configured
+/// boilerplate/example directories.
+fn is_safe_path(path: &Path) -> bool {
+    use std::path::Component;
+    // ParentDir (`..`), RootDir and Prefix components all fail this check.
+    path.components()
+        .all(|c| matches!(c, Component::Normal(_) | Component::CurDir))
+}
+
+/// In a spawned task, read the file at `path` to a string, then move it to a
+/// tendril.
+///
+/// Returns the task's `JoinHandle`; the caller awaits it (and the inner
+/// `io::Result`) when the contents are needed. The tendril is converted with
+/// `into_send()` so the value can cross the spawn boundary.
+fn read_to_str_tendril(path: impl AsRef<Path>) -> JoinHandle<io::Result<SendStrTendril>> {
+    // Take an owned copy so the borrowed path need not outlive the task.
+    let path = path.as_ref().to_owned();
+    tokio::spawn(async move {
+        let string = tokio::fs::read_to_string(path).await?;
+        Ok(StrTendril::from(string).into_send())
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::dom_utils;
+    use crate::parser::{parse_document_async, tests::serialize_for_test};
+    use tempfile::TempDir;
+
+    #[tokio::test]
+    async fn test_replace_boilerplate_comment() -> io::Result<()> {
+        let boilerplate_dir = TempDir::new()?;
+        tokio::fs::write(
+            boilerplate_dir.path().join("languages"),
+            "
enEnglish", + ) + .await?; + let document = + parse_document_async("".as_bytes()).await?; + let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + "
enEnglish
"); + Ok(()) + } + + #[tokio::test] + async fn test_replace_boilerplate_attribute() -> io::Result<()> { + let boilerplate_dir = TempDir::new()?; + tokio::fs::write( + boilerplate_dir.path().join("data.url"), + "data:text/html,Hello, world!", + ) + .await?; + let document = + parse_document_async("\">hello".as_bytes()) + .await?; + let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + "hello"); + Ok(()) + } + + #[tokio::test] + async fn test_replace_example() -> io::Result<()> { + let example_dir = TempDir::new()?; + tokio::fs::write(example_dir.path().join("ex1"), "first").await?; + tokio::fs::write(example_dir.path().join("ex2"), "second").await?; + tokio::fs::write(example_dir.path().join("ignored"), "bad").await?; + let document = + parse_document_async("
EXAMPLE ex1
\nEXAMPLE ex2  

EXAMPLE ignored

".as_bytes()) + .await?; + let mut proc = Processor::new(Path::new("."), example_dir.path()); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + "
first
second

EXAMPLE ignored

" ); + Ok(()) + } + + #[tokio::test] + async fn test_ignores_unsafe_paths() -> io::Result<()> { + let document = + parse_document_async("
\">EXAMPLE ../foo
".as_bytes()) + .await?; + let mut proc = Processor::new(Path::new("."), Path::new(".")); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + assert_eq!(proc.edits.len(), 0); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + "
\">EXAMPLE ../foo
"); + Ok(()) + } +} diff --git a/src/dom_utils.rs b/src/dom_utils.rs new file mode 100644 index 00000000..95894515 --- /dev/null +++ b/src/dom_utils.rs @@ -0,0 +1,383 @@ +use std::cell::RefCell; +use std::rc::Rc; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, Attribute, LocalName, QualName}; +use markup5ever_rcdom::{Handle, Node, NodeData}; + +/// Extensions to the DOM interface to make manipulation more ergonimc. +pub trait NodeHandleExt { + /// Returns a handle to the parent node, if there is one. + fn parent_node(&self) -> Option + where + Self: Sized; + + /// Gets an attribute on the element, or None if absent or not an element. + fn get_attribute(&self, name: &QualName) -> Option; + + /// Returns whether the node has the named attribute. + fn has_attribute(&self, name: &QualName) -> bool { + self.get_attribute(name).is_some() + } + + /// Returns true if the attribute exists and the predicate matches it. + fn attribute_matches(&self, name: &QualName, f: impl Fn(&str) -> bool) -> bool { + self.get_attribute(name).map_or(false, |v| f(&v)) + } + + /// Returns true if the attribute exists and has the value mentioned. + fn attribute_is(&self, name: &QualName, expected: &str) -> bool { + self.get_attribute(name).as_deref() == Some(expected) + } + + /// Sets an attribute on the element. Must be an element. + fn set_attribute(&self, name: &QualName, value: StrTendril); + + /// Returns true if the node is an element. + fn is_element(&self) -> bool; + + /// Returns true if the node is an HTML element with the given tag name. + fn is_html_element(&self, tag_name: &LocalName) -> bool; + + /// Returns true if the node is an element with the given class. + fn has_class(&self, class: &str) -> bool; + + /// Returns true if the node is an element with the given ID. 
+ fn has_id(&self, id: &str) -> bool { + const ID: QualName = QualName { + prefix: None, + ns: ns!(), + local: local_name!("id"), + }; + self.attribute_is(&ID, id) + } + + /// If this is a text node, returns its text. + fn node_text(&self) -> Option; + + /// Concatenate the text of the node and its descendants. + fn text_content(&self) -> StrTendril; + + /// True if any child matches the predicate. + fn any_child(&self, f: impl Fn(&Self) -> bool) -> bool; + + /// Appends children (without checking node type). + fn append_children(&self, children: impl Iterator); + + /// Same, but just one. + fn append_child(&self, child: Self) + where + Self: Sized, + { + self.append_children(std::iter::once(child)) + } + + /// Inserts children before the specified child. + fn insert_children_before(&self, existing: &Self, new: impl Iterator); + + /// Same, but just one. + fn insert_child(&self, existing: &Self, new: Self) + where + Self: Sized, + { + self.insert_children_before(existing, std::iter::once(new)) + } + + /// Removes the node from its parent and replaces it with the nodes provided. + /// Does nothing if the node has no parent. + fn replace_with(&self, replacements: Vec) + where + Self: Sized; + + /// Clones the node and its entire subtree (including template contents). + fn deep_clone(&self) -> Self; + + /// Create a new element, with the given children. + fn create_element(name: LocalName) -> ElementBuilder + where + Self: Sized; + + /// Create a new text node. + fn create_text_node(text: impl Into) -> Self + where + Self: Sized; +} + +/// Convenience helper for constructing nodes. 
Use like: +/// Handle::create_element(local_name!("a")) +/// .attribute(&local_name!("href"), "/") +/// .text("Home") +/// .build() +pub struct ElementBuilder { + element: T, +} + +impl ElementBuilder { + pub fn attribute(self, name: &LocalName, value: impl Into) -> Self { + self.element + .set_attribute(&QualName::new(None, ns!(), name.clone()), value.into()); + self + } + + pub fn children(self, children: impl Iterator) -> Self { + self.element.append_children(children); + self + } + + pub fn child(self, child: T) -> Self { + self.children(std::iter::once(child)) + } + + pub fn text(self, text: impl Into) -> Self { + self.child(::create_text_node(text)) + } + + pub fn build(self) -> T { + self.element + } +} + +/// Recursively visits every DOM node (preorder). Template contents are visited +/// after children, but there are seldom both. +pub fn scan_dom(handle: &Handle, f: &mut F) { + f(handle); + + for child in handle.children.borrow().iter() { + scan_dom(child, f); + } + + if let NodeData::Element { + template_contents: ref tc, + .. + } = handle.data + { + if let Some(ref tc_handle) = *tc.borrow() { + scan_dom(tc_handle, f); + } + } +} + +/// Given a
element, find the corresponding
elements. +/// +/// This is more subtle than you might immediately think, because there can be +/// multiple
listing various terms with one or more common
+/// definitions. We need to find the
in the child list, and then skip it +/// and any other
, before providing the
that follow. +pub fn dt_descriptions(dt: &Handle) -> Vec { + assert!(dt.is_html_element(&local_name!("dt"))); + if let Some(ref dl) = dt + .parent_node() + .filter(|n| n.is_html_element(&local_name!("dl"))) + { + dl.children + .borrow() + .iter() + .filter(|n| n.is_element()) + .skip_while(|n| !Rc::ptr_eq(n, dt)) + .skip_while(|n| n.is_html_element(&local_name!("dt"))) + .take_while(|n| n.is_html_element(&local_name!("dd"))) + .cloned() + .collect() + } else { + Vec::new() + } +} + +impl NodeHandleExt for Handle { + fn parent_node(&self) -> Option { + let weak_parent = self.parent.take()?; + let parent = weak_parent.upgrade().expect("dangling parent"); + self.parent.set(Some(weak_parent)); + Some(parent) + } + + fn get_attribute(&self, name: &QualName) -> Option { + let attrs = match self.data { + NodeData::Element { ref attrs, .. } => attrs.borrow(), + _ => return None, + }; + attrs + .iter() + .find(|a| &a.name == name) + .map(|a| a.value.clone()) + } + + fn set_attribute(&self, name: &QualName, value: StrTendril) { + let mut attrs = match self.data { + NodeData::Element { ref attrs, .. } => attrs.borrow_mut(), + _ => panic!("not an element"), + }; + if let Some(attr) = attrs.iter_mut().find(|a| &a.name == name) { + attr.value = value; + } else { + attrs.push(Attribute { + name: name.clone(), + value, + }); + } + } + + fn is_element(&self) -> bool { + matches!(&self.data, NodeData::Element { .. }) + } + + fn is_html_element(&self, tag_name: &LocalName) -> bool { + match &self.data { + NodeData::Element { + name: + QualName { + ns: ns!(html), + ref local, + .. + }, + .. 
+ } => local == tag_name, + _ => false, + } + } + + fn has_class(&self, class: &str) -> bool { + const CLASS: QualName = QualName { + prefix: None, + ns: ns!(), + local: local_name!("class"), + }; + self.get_attribute(&CLASS) + .map_or(false, |v| v.split_ascii_whitespace().any(|c| c == class)) + } + + fn node_text(&self) -> Option { + match &self.data { + NodeData::Text { ref contents } => Some(contents.borrow().clone()), + _ => None, + } + } + + fn text_content(&self) -> StrTendril { + let mut text = StrTendril::new(); + scan_dom(self, &mut |n| { + if let NodeData::Text { ref contents } = &n.data { + text.push_tendril(&contents.borrow()); + } + }); + text + } + + fn any_child(&self, f: impl Fn(&Handle) -> bool) -> bool { + self.children.borrow().iter().any(f) + } + + fn append_children(&self, children: impl Iterator) { + self.children.borrow_mut().extend(children.inspect(|c| { + let old_parent = c.parent.replace(Some(Rc::downgrade(self))); + assert!(old_parent.is_none()); + })); + } + + fn insert_children_before(&self, existing: &Handle, new: impl Iterator) { + let mut children = self.children.borrow_mut(); + let i = children + .iter() + .position(|c| Rc::ptr_eq(c, existing)) + .expect("corrupt child list"); + children.splice( + i..i, + new.inspect(|c| { + let old_parent = c.parent.replace(Some(Rc::downgrade(self))); + assert!(old_parent.is_none()); + }), + ); + } + + fn replace_with(&self, replacements: Vec) { + let parent = match self.parent.take() { + Some(n) => n.upgrade().expect("dangling parent"), + _ => return, + }; + for new_child in replacements.iter() { + new_child.parent.replace(Some(Rc::downgrade(&parent))); + } + let mut children = parent.children.borrow_mut(); + let i = children + .iter() + .position(|c| Rc::ptr_eq(c, self)) + .expect("corrupt child list"); + children.splice(i..=i, replacements); + self.parent.take(); + } + + fn deep_clone(&self) -> Handle { + use NodeData::*; + let new_node_data = match &self.data { + Document => Document, + Doctype 
{ + name, + public_id, + system_id, + } => Doctype { + name: name.clone(), + public_id: public_id.clone(), + system_id: system_id.clone(), + }, + Text { contents } => Text { + contents: contents.clone(), + }, + Comment { contents } => Comment { + contents: contents.clone(), + }, + Element { + name, + attrs, + template_contents, + mathml_annotation_xml_integration_point, + } => Element { + name: name.clone(), + attrs: attrs.clone(), + template_contents: RefCell::new( + template_contents + .borrow() + .as_ref() + .map(|tc| tc.deep_clone()), + ), + mathml_annotation_xml_integration_point: *mathml_annotation_xml_integration_point, + }, + ProcessingInstruction { target, contents } => ProcessingInstruction { + target: target.clone(), + contents: contents.clone(), + }, + }; + let node = Node::new(new_node_data); + let mut children = node.children.borrow_mut(); + *children = self + .children + .borrow() + .iter() + .map(|c| c.deep_clone()) + .collect(); + for child in children.iter_mut() { + let old_parent = child.parent.replace(Some(Rc::downgrade(&node))); + assert!(old_parent.is_none()); + } + drop(children); + node + } + + fn create_element(name: LocalName) -> ElementBuilder { + let new_node_data = NodeData::Element { + name: QualName::new(None, ns!(html), name), + attrs: RefCell::new(Vec::new()), + template_contents: RefCell::new(None), + mathml_annotation_xml_integration_point: false, + }; + ElementBuilder { + element: Node::new(new_node_data), + } + } + + fn create_text_node(text: impl Into) -> Handle { + let new_node_data = NodeData::Text { + contents: RefCell::new(text.into()), + }; + Node::new(new_node_data) + } +} diff --git a/src/interface_index.rs b/src/interface_index.rs new file mode 100644 index 00000000..49bdadf4 --- /dev/null +++ b/src/interface_index.rs @@ -0,0 +1,391 @@ +//! Generates an index of WebIDL interfaces. +//! This index is inserted where "INSERT INTERFACES HERE" appears. 
+ +use std::collections::BTreeMap; +use std::io; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, QualName}; +use markup5ever_rcdom::Handle; + +use crate::dom_utils::NodeHandleExt; + +#[derive(Default, Debug)] +struct InterfaceInfo { + /// Number of times the interface definition was seen. Should be one. + seen: u32, + + /// The IDs of the partial interfaces, in the order they appear in the document. + partials: Vec, + + /// Set to true if a partial is missing its ID. + has_partial_with_no_id: bool, +} + +pub struct Processor { + /// The interfaces encountered, keyed and sorted by name. + interfaces: BTreeMap, + + /// The text nodes which contains the text "INSERT INTERFACES HERE". + marker_nodes: Vec, +} + +/// The string which marks where the index belongs. Ideally this would be a node +/// and not plain text. +const MARKER: &str = "INSERT INTERFACES HERE"; + +impl Processor { + pub fn new() -> Self { + Processor { + interfaces: BTreeMap::new(), + marker_nodes: Vec::new(), + } + } + + pub fn visit(&mut self, node: &Handle) { + const ID: QualName = QualName { + prefix: None, + ns: ns!(), + local: local_name!("id"), + }; + // We're looking for inside a
, to find
+        // potential interfaces defined there.
+        //
+        // One surprise here -- there is an "interface Example" that is not defined
+        // according to Wattsi. It yells about this not being defined, and the
+        // prior Perl preprocessing actually requires the 
 have no
+        // attributes.
+        if node.is_html_element(&local_name!("code"))
+            && node.has_class("idl")
+            && node.parent_node().map_or(false, |p| {
+                p.is_html_element(&local_name!("pre")) && !p.has_class("extract")
+            })
+        {
+            let borrowed_children = node.children.borrow();
+            for window in borrowed_children.windows(2) {
+                let is_partial = match window[0].node_text() {
+                    Some(a) if a.ends_with("partial interface ") => true,
+                    Some(a) if a.ends_with("interface ") => false,
+                    _ => continue,
+                };
+                // These definitions must appear as a ,  or  element.
+                if !window[1].is_html_element(&local_name!("span"))
+                    && !window[1].is_html_element(&local_name!("dfn"))
+                    && !window[1].is_html_element(&local_name!("a"))
+                {
+                    continue;
+                }
+                let name = window[1].text_content();
+                let mut info = self.interfaces.entry(name).or_default();
+                if is_partial {
+                    if let Some(id) = window[1].get_attribute(&ID) {
+                        info.partials.push(id);
+                    } else {
+                        info.has_partial_with_no_id = true;
+                    }
+                } else {
+                    info.seen += 1;
+                }
+            }
+        }
+
+        if node.node_text().map_or(false, |t| t.contains(MARKER)) {
+            self.marker_nodes.push(node.clone());
+        }
+    }
+
+    /// Splices an index of all known interfaces into the document at each
+    /// marker ("INSERT INTERFACES HERE") found during the visit pass.
+    ///
+    /// # Errors
+    /// Returns `InvalidData` if no marker exists, if a marker's text node no
+    /// longer contains the marker string, or if any interface was defined
+    /// more than once.
+    pub fn apply(self) -> io::Result<()> {
+        // It is likely an author error to not include anywhere to insert an
+        // interface index. More than one is supported, mainly because it's no
+        // more work than enforcing that just one exists.
+        if self.marker_nodes.is_empty() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("Marker {MARKER:?} not found."),
+            ));
+        }
+        for marker in self.marker_nodes {
+            // We need to find where the marker appears in the text so that we
+            // can split it into two text nodes.
+            let text = marker.node_text().expect("should still be a text node");
+            let position: u32 = match text.find(MARKER) {
+                None => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::InvalidData,
+                        format!("Marker {MARKER:?} not found (but was during first pass)."),
+                    ));
+                }
+                Some(p) => p.try_into().unwrap(),
+            };
+            // Tendril offsets are u32, so convert the marker length explicitly.
+            let end_position: u32 = position + u32::try_from(MARKER.len()).unwrap();
+            let before = text.subtendril(0, position);
+            let after = text.subtendril(end_position, text.len32() - end_position);
+
+            // Then, we need to construct a list of interfaces and their partial
+            // interfaces. BTreeMap iteration yields them sorted by name.
+            let mut ul =
+                Handle::create_element(local_name!("ul")).attribute(&local_name!("class"), "brief");
+            for (name, info) in &self.interfaces {
+                if info.seen > 1 {
+                    return Err(io::Error::new(
+                        io::ErrorKind::InvalidData,
+                        format!("Interface {name} defined {} times.", info.seen),
+                    ));
+                }
+                // NOTE(review): `info.has_partial_with_no_id` is recorded by
+                // the visit pass but never reported here — confirm whether a
+                // partial interface without an ID should be an error.
+                fn make_link(id: &str, text: &str) -> Handle {
+                    Handle::create_element(local_name!("a"))
+                        .attribute(&local_name!("href"), format!("#{id}"))
+                        .text(text)
+                        .build()
+                }
+                let mut li = Handle::create_element(local_name!("li")).child(
+                    Handle::create_element(local_name!("code"))
+                        .text(name.clone())
+                        .build(),
+                );
+                // Append ", partial" (one partial) or ", partial 1 2 3 ..."
+                // (several), each number linking to that partial's ID.
+                match &info.partials[..] {
+                    [] => (),
+                    [sole_partial] => {
+                        li = li.text(", ").child(make_link(sole_partial, "partial"));
+                    }
+                    [first, rest @ ..] => {
+                        li = li.text(", ").child(make_link(first, "partial 1"));
+                        for (i, p) in rest.iter().enumerate() {
+                            li = li.text(" ").child(make_link(p, &(i + 2).to_string()));
+                        }
+                    }
+                }
+                ul = ul.child(li.build());
+            }
+
+            // Finally, we replace the marker's text node with the combination of the two.
+            marker.replace_with(vec![
+                Handle::create_text_node(before),
+                ul.build(),
+                Handle::create_text_node(after),
+            ]);
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::dom_utils;
+    use crate::parser::{parse_document_async, tests::serialize_for_test};
+
+    #[tokio::test]
+    async fn test_two_interfaces_in_one_block() -> io::Result<()> {
+        let document = parse_document_async(
+            r#"
+

+interface HTMLMarqueeElement { ... }
+interface HTMLBlinkElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

+interface HTMLMarqueeElement { ... }
+interface HTMLBlinkElement { ... }
+
+
  • HTMLBlinkElement
  • HTMLMarqueeElement
+ "#.trim()); + Ok(()) + } + + #[tokio::test] + async fn test_two_interfaces_in_separate_blocks() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+
+

+interface HTMLBlinkElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

+interface HTMLMarqueeElement { ... }
+
+

+interface HTMLBlinkElement { ... }
+
+
  • HTMLBlinkElement
  • HTMLMarqueeElement
+ "#.trim()); + Ok(()) + } + + #[tokio::test] + async fn interface_with_partial() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+
+

+partial interface HTMLMarqueeElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +

+interface HTMLMarqueeElement { ... }
+
+

+partial interface HTMLMarqueeElement { ... }
+
+
+ "##.trim()); + Ok(()) + } + + #[tokio::test] + async fn interface_with_two_partials() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +

+interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+ + "##.trim()); + Ok(()) + } + + #[tokio::test] + async fn only_partials() -> io::Result<()> { + let document = parse_document_async( + r#" +

+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +

+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+ + "##.trim()); + Ok(()) + } + + #[tokio::test] + async fn markers_before_and_after() -> io::Result<()> { + let document = parse_document_async( + r#" +INSERT INTERFACES HERE +

+interface HTMLMarqueeElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +
  • HTMLMarqueeElement
+

+interface HTMLMarqueeElement { ... }
+
+
  • HTMLMarqueeElement
+ "## + .trim() + ); + Ok(()) + } + + #[tokio::test] + async fn no_marker() -> io::Result<()> { + let document = parse_document_async("".as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } + + #[tokio::test] + async fn duplicate_dfn() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+interface HTMLMarqueeElement { ... }
+
+ "# + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 00000000..2506d027 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,76 @@ +use std::borrow::Cow; +use std::default::Default; +use std::env; +use std::ffi::OsStr; +use std::io::{self, BufWriter}; +use std::path::{Path, PathBuf}; + +use markup5ever_rcdom::SerializableHandle; + +mod annotate_attributes; +mod boilerplate; +mod dom_utils; +mod interface_index; +mod parser; +mod represents; +mod serializer; +mod tag_omission; + +#[tokio::main] +async fn main() -> io::Result<()> { + // Since we're using Rc in the DOM implementation, we must ensure that tasks + // which act on it are confined to this thread. + + // Find the paths we need. + let cache_dir = path_from_env("HTML_CACHE", ".cache"); + let source_dir = path_from_env("HTML_SOURCE", "../html"); + + // Because parsing can jump around the tree a little, it's most reasonable + // to just parse the whole document before doing any processing. Even for + // the HTML5 specification, this doesn't take too long. + let document = parser::parse_document_async(tokio::io::stdin()).await?; + + let mut boilerplate = boilerplate::Processor::new(cache_dir.clone(), source_dir.join("demos")); + let mut represents = represents::Processor::new(); + let mut annotate_attributes = annotate_attributes::Processor::new(); + let mut tag_omission = tag_omission::Processor::new(); + let mut interface_index = interface_index::Processor::new(); + + // We do exactly one pass to identify the changes that need to be made. 
+ dom_utils::scan_dom(&document, &mut |h| { + boilerplate.visit(h); + represents.visit(h); + annotate_attributes.visit(h); + tag_omission.visit(h); + interface_index.visit(h); + }); + + // And then we apply all of the changes. These different processors mostly + // apply quite local changes, so hopefully we never have to deal with + // conflicts between them. + boilerplate.apply().await?; + represents.apply()?; + annotate_attributes.apply().await?; + tag_omission.apply()?; + interface_index.apply()?; + + // Finally, we write the result to standard out. + let serializable: SerializableHandle = document.into(); + serializer::serialize( + &mut BufWriter::with_capacity(128 * 1024, io::stdout()), + &serializable, + Default::default(), + )?; + Ok(()) +} + +fn path_from_env<'a, V, D>(var: &V, default: &'a D) -> Cow<'a, Path> +where + V: AsRef + ?Sized, + D: AsRef + ?Sized, +{ + match env::var_os(var) { + Some(p) => PathBuf::from(p).into(), + None => default.as_ref().into(), + } +} diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 00000000..ba016836 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,217 @@ +//! This module specializes the HTML5 parser to respect the "special" void +//! element, which isn't part of standard HTML. It does so by injecting a +//! synthetic token immediately afterward. +//! It also provides some mild integration with async I/O. 
+ +use std::borrow::Cow; +use std::io; + +use html5ever::buffer_queue::BufferQueue; +use html5ever::tendril::{self, stream::Utf8LossyDecoder, ByteTendril, StrTendril, TendrilSink}; +use html5ever::tokenizer::{ + Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, TokenizerResult, +}; +use html5ever::tree_builder::{TreeBuilder, TreeSink}; +use markup5ever_rcdom::{Handle, RcDom}; +use tokio::io::{AsyncRead, AsyncReadExt}; + +struct TokenFilter { + sink: Sink, +} + +impl TokenSink for TokenFilter { + type Handle = Sink::Handle; + + fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult { + let close_tag = match token { + Token::TagToken(Tag { + kind: TagKind::StartTag, + name: ref tag_name, + .. + }) if tag_name.eq_str_ignore_ascii_case("ref") => Some(Tag { + kind: TagKind::EndTag, + name: tag_name.clone(), + self_closing: false, + attrs: vec![], + }), + _ => None, + }; + match (self.sink.process_token(token, line_number), close_tag) { + (TokenSinkResult::Continue, Some(close_tag)) => self + .sink + .process_token(Token::TagToken(close_tag), line_number), + (result, _) => result, + } + } + + fn end(&mut self) { + self.sink.end() + } + + fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool { + self.sink + .adjusted_current_node_present_but_not_in_html_namespace() + } +} + +struct FilteredParser { + tokenizer: Tokenizer>>, + input_buffer: BufferQueue, +} + +impl TendrilSink for FilteredParser { + fn process(&mut self, t: StrTendril) { + self.input_buffer.push_back(t); + while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} + } + + fn error(&mut self, desc: Cow<'static, str>) { + self.tokenizer.sink.sink.sink.parse_error(desc) + } + + type Output = Sink::Output; + + fn finish(mut self) -> Self::Output { + while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} + assert!(self.input_buffer.is_empty()); + self.tokenizer.end(); + 
self.tokenizer.sink.sink.sink.finish() + } +} + +impl FilteredParser { + #[allow(clippy::wrong_self_convention)] + fn from_utf8(self) -> Utf8LossyDecoder { + Utf8LossyDecoder::new(self) + } +} + +async fn parse_internal_async( + tb: TreeBuilder, + tokenizer_opts: TokenizerOpts, + mut r: R, +) -> io::Result { + let tok = Tokenizer::new(TokenFilter { sink: tb }, tokenizer_opts); + let mut tendril_sink = FilteredParser { + tokenizer: tok, + input_buffer: BufferQueue::new(), + } + .from_utf8(); + + // This draws on the structure of the sync tendril read_from. + const BUFFER_SIZE: u32 = 128 * 1024; + 'read: loop { + let mut tendril = ByteTendril::new(); + unsafe { + tendril.push_uninitialized(BUFFER_SIZE); + } + loop { + match r.read(&mut tendril).await { + Ok(0) => break 'read, + Ok(n) => { + tendril.pop_back(BUFFER_SIZE - n as u32); + tendril_sink.process(tendril); + break; + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(e) => Err(e)?, + } + } + } + let dom = tendril_sink.finish(); + Ok(dom.document) +} + +pub async fn parse_fragment_async( + r: R, + context: &Handle, +) -> io::Result> { + let tb = + TreeBuilder::new_for_fragment(RcDom::default(), context.clone(), None, Default::default()); + let tokenizer_opts = TokenizerOpts { + initial_state: Some(tb.tokenizer_state_for_context_elem()), + ..TokenizerOpts::default() + }; + let document = parse_internal_async(tb, tokenizer_opts, r).await?; + let mut new_children = document.children.take()[0].children.take(); + for new_child in new_children.iter_mut() { + new_child.parent.take(); + } + Ok(new_children) +} + +pub async fn parse_document_async(r: R) -> io::Result { + let tb = TreeBuilder::new(RcDom::default(), Default::default()); + parse_internal_async(tb, TokenizerOpts::default(), r).await +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + use crate::dom_utils::NodeHandleExt; + use html5ever::serialize::{SerializeOpts, TraversalScope}; + use html5ever::{local_name, serialize}; + use 
markup5ever_rcdom::{NodeData, SerializableHandle}; + + pub(crate) fn serialize_for_test(nodes: &[Handle]) -> String { + let mut output = vec![]; + for node in nodes { + let traversal_scope = match node.data { + NodeData::Document => TraversalScope::ChildrenOnly(None), + _ => TraversalScope::IncludeNode, + }; + serialize( + &mut output, + &SerializableHandle::from(node.clone()), + SerializeOpts { + traversal_scope, + ..Default::default() + }, + ) + .unwrap(); + } + String::from_utf8(output).unwrap() + } + + #[tokio::test] + async fn test_treats_ref_as_void() -> io::Result<()> { + // Without the token filtering, the first ends up as the second's parent. + let document = + parse_document_async("".as_bytes()).await?; + assert_eq!( + serialize_for_test(&[document]), + ""); + Ok(()) + } + + #[tokio::test] + async fn test_treats_ref_as_void_in_fragments() -> io::Result<()> { + // Similar to the above, but in a fragment. + let document = parse_document_async("".as_bytes()).await?; + let body = document.children.borrow()[1].children.borrow()[1].clone(); + assert!(body.is_html_element(&local_name!("body"))); + let children = + parse_fragment_async(".".as_bytes(), &body).await?; + assert_eq!( + serialize_for_test(&children), + "." + ); + Ok(()) + } + + #[tokio::test] + async fn test_fragment_respects_context() -> io::Result<()> { + // Checks that we have the appropriate insertion mode for the element + // we're in. This is important because of the special rules + // surrounding, e.g., tables. If you change this to use the body as context, + // no element at all is emitted. 
+ let document = parse_document_async("".as_bytes()).await?; + let body = document.children.borrow()[1].children.borrow()[1].clone(); + assert!(body.is_html_element(&local_name!("body"))); + let table = body.children.borrow()[0].clone(); + assert!(table.is_html_element(&local_name!("table"))); + let children = parse_fragment_async("".as_bytes(), &table).await?; + assert_eq!(serialize_for_test(&children), ""); + Ok(()) + } +} diff --git a/src/represents.rs b/src/represents.rs new file mode 100644 index 00000000..ebb0474d --- /dev/null +++ b/src/represents.rs @@ -0,0 +1,152 @@ +//! Replaces comments with the HTML which appears in a +//! paragraph of the form: +//!

The tagname element represents ...

+ +use std::collections::HashMap; +use std::io; +use std::rc::Rc; + +use crate::dom_utils::NodeHandleExt; +use html5ever::local_name; +use html5ever::tendril::StrTendril; +use markup5ever_rcdom::{Handle, NodeData}; + +pub struct Processor { + /// Map from tag name (as found in the paragraph) to the which + /// contains the text "represents". + represents: HashMap, + + /// List of comments to be replaced, and what tag name + /// they correspond to. + placeholders: Vec<(Handle, StrTendril)>, +} + +/// Walks from the text node "represents" and finds the tag name and the +/// span that marks where the description begins, or returns None if that +/// cannot be found. +fn find_tag_name(represents_text: &Handle) -> Option<(StrTendril, Handle)> { + let span = represents_text + .parent_node() + .filter(|p| p.is_html_element(&local_name!("span")))?; + let p = span + .parent_node() + .filter(|p| p.is_html_element(&local_name!("p")))?; + let children = p.children.borrow(); + match &children[..] { + [a, b, c, d, ..] + if a.node_text().as_deref().map(|x| x.trim()) == Some("The") + && b.is_html_element(&local_name!("code")) + && c.node_text().as_deref().map(|x| x.trim()) == Some("element") + && Rc::ptr_eq(d, &span) => + { + Some((b.text_content(), span)) + } + _ => None, + } +} + +impl Processor { + pub fn new() -> Self { + Self { + represents: HashMap::new(), + placeholders: Vec::new(), + } + } + + /// Should be called for each node the document. 
Records when it sees a + /// represents and which element it is defining + pub fn visit(&mut self, node: &Handle) { + match node.data { + NodeData::Text { ref contents } if contents.borrow().as_ref() == "represents" => { + if let Some((tag, span)) = find_tag_name(node) { + self.represents.insert(tag, span); + } + } + NodeData::Comment { ref contents } if contents.starts_with("REPRESENTS ") => { + self.placeholders + .push((node.clone(), contents.subtendril(11, contents.len32() - 11))); + } + _ => (), + } + } + + pub fn apply(self) -> io::Result<()> { + for (placeholder, ref tag) in self.placeholders { + let span = match self.represents.get(tag) { + Some(span) => span, + None => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!(" refers to unknown tag", tag), + )); + } + }; + let parent = match span.parent_node() { + Some(p) => p, + None => continue, + }; + let replacements = parent + .children + .borrow() + .iter() + .skip_while(|s| !Rc::ptr_eq(s, span)) + .skip(1) + .enumerate() + .map(|(index, sibling)| { + let clone = sibling.deep_clone(); + // Capitalize the first letter of the first node (which is expected to be text). + if let (0, NodeData::Text { ref contents }) = (index, &clone.data) { + contents.replace_with(|text| capitalize(text.trim_start())); + } + clone + }) + .collect(); + placeholder.replace_with(replacements); + } + Ok(()) + } +} + +fn capitalize(text: &str) -> StrTendril { + let mut chars = text.chars(); + match chars.next() { + Some(c) => { + let mut capitalized = StrTendril::from_char(c.to_ascii_uppercase()); + capitalized.push_slice(chars.as_str()); + capitalized + } + None => StrTendril::new(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dom_utils; + use crate::parser::{parse_document_async, tests::serialize_for_test}; + + #[tokio::test] + async fn test_represents() -> io::Result<()> { + // Uses can occur either before or after. + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + "

A seat\nat a table.

The chair element represents a seat\nat a table.

A seat\nat a table.

" + ); + Ok(()) + } + + #[tokio::test] + async fn test_represents_undefined() -> io::Result<()> { + // Uses can occur either before or after. + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } +} diff --git a/src/serializer.rs b/src/serializer.rs new file mode 100644 index 00000000..b4d062a3 --- /dev/null +++ b/src/serializer.rs @@ -0,0 +1,50 @@ +//! This module specializes the HTML5 serializer to omit , which is +//! treated as void by Wattsi. + +use std::io::{self, Write}; + +use html5ever::serialize::*; +use html5ever::{namespace_url, ns, QualName}; + +struct WattsiSerializer(HtmlSerializer); + +impl Serializer for WattsiSerializer { + fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()> + where + AttrIter: Iterator>, + { + self.0.start_elem(name, attrs) + } + + fn end_elem(&mut self, name: QualName) -> io::Result<()> { + if name.ns == ns!(html) && &name.local == "ref" { + return Ok(()); + } + self.0.end_elem(name) + } + + fn write_text(&mut self, text: &str) -> io::Result<()> { + self.0.write_text(text) + } + + fn write_comment(&mut self, text: &str) -> io::Result<()> { + self.0.write_comment(text) + } + + fn write_doctype(&mut self, name: &str) -> io::Result<()> { + self.0.write_doctype(name) + } + + fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> { + self.0.write_processing_instruction(target, data) + } +} + +pub fn serialize(writer: Wr, node: &T, opts: SerializeOpts) -> io::Result<()> +where + Wr: Write, + T: Serialize, +{ + let mut ser = WattsiSerializer(HtmlSerializer::new(writer, opts.clone())); + node.serialize(&mut ser, opts.traversal_scope) +} diff --git a/src/tag_omission.rs b/src/tag_omission.rs new file mode 100644 index 00000000..5f5a97be --- /dev/null +++ b/src/tag_omission.rs @@ -0,0 +1,329 @@ +//! Looks at the "Optional tags" and "Void elements" sections from the HTML +//! 
syntax spec and replicates that information into the descriptions of the +//! individual elements. + +use std::borrow::Borrow; +use std::collections::HashMap; +use std::io; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, LocalName, QualName}; +use markup5ever_rcdom::{Handle, NodeData}; +use regex::Regex; + +use crate::dom_utils::{self, NodeHandleExt}; + +#[derive(Default)] +struct ElementInfo { + /// Handles on any paragraphs in the "Optional tags" section which refer to the element. + optional_tags_info: Vec, + + /// Whether the element appears in the "Void elements" list. + is_void_element: bool, + + ///

into which this info must be added. + dl: Option, +} + +#[derive(Default)] +pub struct Processor { + /// The heading level of the "Optional tags" heading, if inside one. + in_optional_tags_heading: Option, + + /// Most recently seen . + most_recent_element_dfn: Option, + + /// Info about elements which have been referred to in these sections. + elements: HashMap, +} + +impl Processor { + pub fn new() -> Self { + Default::default() + } + + pub fn visit(&mut self, node: &Handle) { + // If the heading ends the "Optional tags" section, clear that state. + if let Some(optional_tag_heading_level) = self.in_optional_tags_heading { + match heading_level(node) { + Some(level) if level <= optional_tag_heading_level => { + self.in_optional_tags_heading = None; + } + _ => (), + } + } + + // If we encounter an "Optional tags" section, start observing relevant paragraphs. + // When one is encountered, possibly add it. + if let Some(level) = heading_level(node) { + if node.text_content().trim() == "Optional tags" { + self.in_optional_tags_heading = Some(level); + } + } else if self.in_optional_tags_heading.is_some() && node.is_html_element(&local_name!("p")) + { + self.maybe_record_optional_tags_paragraph(node); + } + + // If we encounter the Void elements section, look for the next dt. + if node.is_html_element(&local_name!("dfn")) + && node.text_content().trim() == "Void elements" + { + if let Some(dt) = node + .parent_node() + .filter(|n| n.is_html_element(&local_name!("dt"))) + { + for dd in dom_utils::dt_descriptions(&dt) { + dom_utils::scan_dom(&dd, &mut |n| { + if n.is_html_element(&local_name!("code")) { + let mut info = self.elements.entry(n.text_content()).or_default(); + info.is_void_element = true; + } + }); + } + } + } + + // If we see an element dfn, watch out for the upcoming
. + if node.is_html_element(&local_name!("dfn")) + && node.has_attribute(&QualName::new(None, ns!(), LocalName::from("element"))) + { + self.most_recent_element_dfn = Some(node.text_content()); + } + + // If we see a
, record that. + if node.is_html_element(&local_name!("dl")) && node.has_class("element") { + if let Some(elem) = std::mem::take(&mut self.most_recent_element_dfn) { + let info = self.elements.entry(elem).or_default(); + if info.dl.is_none() { + info.dl = Some(node.clone()); + } + } + } + } + + fn maybe_record_optional_tags_paragraph(&mut self, paragraph: &Handle) { + // The paragraph must have the structure "A(n) img element..." + let children = paragraph.children.borrow(); + let mut iter = children.iter().fuse(); + match (iter.next(), iter.next(), iter.next()) { + (Some(a), Some(b), Some(c)) + if a.node_text() + .map_or(false, |t| t.trim() == "A" || t.trim() == "An") + && b.is_html_element(&local_name!("code")) + && c.node_text() + .map_or(false, |t| t.trim().starts_with("element")) => + { + let info = self.elements.entry(b.text_content()).or_default(); + info.optional_tags_info.push(paragraph.clone()); + } + _ => (), + } + } + + pub fn apply(self) -> io::Result<()> { + let data_x = LocalName::from("data-x"); + let qual_data_x = QualName::new(None, ns!(), data_x.clone()); + let dt = Handle::create_element(local_name!("dt")) + .child( + Handle::create_element(local_name!("span")) + .attribute(&data_x, "concept-element-tag-omission") + .text("Tag omission in text/html") + .build(), + ) + .text(":") + .build(); + let void_dd = Handle::create_element(local_name!("dd")) + .text("No ") + .child( + Handle::create_element(local_name!("span")) + .attribute(&data_x, "syntax-end-tag") + .text("end tag") + .build(), + ) + .text(".") + .build(); + let default_dd = Handle::create_element(local_name!("dd")) + .text("Neither tag is omissible.") + .build(); + let may_re = Regex::new(r"\bmay\b").unwrap(); + + for info in self.elements.into_values() { + let dl = match info.dl { + Some(dl) => dl, + None => continue, + }; + + let mut to_insert = vec![dt.deep_clone()]; + if !info.optional_tags_info.is_empty() { + // Convert

to

, replacing "may" with "can". + for p in info.optional_tags_info { + let borrowed_children = p.children.borrow(); + let new_children = borrowed_children.iter().map(|n| { + let new_node = n.deep_clone(); + dom_utils::scan_dom(&new_node, &mut |c| { + if let NodeData::Text { ref contents } = c.data { + let mut text = contents.borrow_mut(); + *text = StrTendril::from(may_re.replace(&text, "can").borrow()); + } + }); + new_node + }); + let dd = Handle::create_element(local_name!("dd")) + .children(new_children) + .build(); + to_insert.push(dd); + } + } else if info.is_void_element { + to_insert.push(void_dd.deep_clone()); + } else { + to_insert.push(default_dd.deep_clone()); + } + to_insert.push(Handle::create_text_node("\n")); + + let dl_children = dl.children.borrow(); + let attributes_dt = if let Some(attributes_dt) = dl_children.iter().find(|child| { + child.is_html_element(&local_name!("dt")) + && child + .any_child(|c| c.attribute_is(&qual_data_x, "concept-element-attributes")) + }) { + attributes_dt.clone() + } else { + continue; + }; + drop(dl_children); + dl.insert_children_before(&attributes_dt, to_insert.into_iter()); + } + Ok(()) + } +} + +/// Returns the heading level (from 1 to 6) that the

through

declares, or None for all other nodes. +fn heading_level(node: &Handle) -> Option { + let local = match node.data { + NodeData::Element { ref name, .. } if name.ns == ns!(html) => &name.local, + _ => return None, + }; + match *local { + local_name!("h1") => Some(1), + local_name!("h2") => Some(2), + local_name!("h3") => Some(3), + local_name!("h4") => Some(4), + local_name!("h5") => Some(5), + local_name!("h6") => Some(6), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parser::{parse_document_async, tests::serialize_for_test}; + + #[tokio::test] + async fn test_simple() -> io::Result<()> { + let document = parse_document_async( + r#" +

Optional tags

+

A td element does very tdish things and may be very cellular.

+

An audio element is quite audible.

+

Another section

+

A body element is ignored because it's in another section. +

+
Void elements +
img and meta are void. +
input is too. +
Non-void elements +
html is interesting but not void. +
+

Elements

+

audio +

+
+
+

body +

+
+
+

html +

+
+
+

img +

+
+
+

input +

+
+
+

meta +

+
+
+

td +

+
+
+ "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

Optional tags

+

A td element does very tdish things and may be very cellular.

+

An audio element is quite audible.

+

Another section

+

A body element is ignored because it's in another section. +

+
Void elements +
img and meta are void. +
input is too. +
Non-void elements +
html is interesting but not void. +
+

Elements

+

audio +

+
Tag omission in text/html:
An audio element is quite audible.
+
+
+

body +

+
Tag omission in text/html:
Neither tag is omissible.
+
+
+

html +

+
Tag omission in text/html:
Neither tag is omissible.
+
+
+

img +

+
Tag omission in text/html:
No end tag.
+
+
+

input +

+
Tag omission in text/html:
No end tag.
+
+
+

meta +

+
Tag omission in text/html:
No end tag.
+
+
+

td +

+
Tag omission in text/html:
A td element does very tdish things and can be very cellular.
+
+
+ "#.trim()); + Ok(()) + } +} From 3db1b5fe9bb65a6e5d5c6305fa468aefc55ea116 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Sun, 6 Aug 2023 20:17:37 +0900 Subject: [PATCH 02/28] Remove custom stuff --- src/main.rs | 6 ++-- src/parser.rs | 82 ++++------------------------------------------- src/serializer.rs | 50 ----------------------------- 3 files changed, 9 insertions(+), 129 deletions(-) delete mode 100644 src/serializer.rs diff --git a/src/main.rs b/src/main.rs index 2506d027..8d08c6d7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +use html5ever::serialize::{serialize, SerializeOpts}; use std::borrow::Cow; use std::default::Default; use std::env; @@ -13,7 +14,6 @@ mod dom_utils; mod interface_index; mod parser; mod represents; -mod serializer; mod tag_omission; #[tokio::main] @@ -56,10 +56,10 @@ async fn main() -> io::Result<()> { // Finally, we write the result to standard out. let serializable: SerializableHandle = document.into(); - serializer::serialize( + serialize( &mut BufWriter::with_capacity(128 * 1024, io::stdout()), &serializable, - Default::default(), + SerializeOpts::default(), )?; Ok(()) } diff --git a/src/parser.rs b/src/parser.rs index ba016836..2ec15d88 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,61 +1,17 @@ -//! This module specializes the HTML5 parser to respect the "special" void -//! element, which isn't part of standard HTML. It does so by injecting a -//! synthetic token immediately afterward. -//! It also provides some mild integration with async I/O. +//! This module provides some mild integration between the html5ever parser and async I/O. 
use std::borrow::Cow; use std::io; use html5ever::buffer_queue::BufferQueue; use html5ever::tendril::{self, stream::Utf8LossyDecoder, ByteTendril, StrTendril, TendrilSink}; -use html5ever::tokenizer::{ - Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, TokenizerResult, -}; +use html5ever::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult}; use html5ever::tree_builder::{TreeBuilder, TreeSink}; use markup5ever_rcdom::{Handle, RcDom}; use tokio::io::{AsyncRead, AsyncReadExt}; -struct TokenFilter { - sink: Sink, -} - -impl TokenSink for TokenFilter { - type Handle = Sink::Handle; - - fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult { - let close_tag = match token { - Token::TagToken(Tag { - kind: TagKind::StartTag, - name: ref tag_name, - .. - }) if tag_name.eq_str_ignore_ascii_case("ref") => Some(Tag { - kind: TagKind::EndTag, - name: tag_name.clone(), - self_closing: false, - attrs: vec![], - }), - _ => None, - }; - match (self.sink.process_token(token, line_number), close_tag) { - (TokenSinkResult::Continue, Some(close_tag)) => self - .sink - .process_token(Token::TagToken(close_tag), line_number), - (result, _) => result, - } - } - - fn end(&mut self) { - self.sink.end() - } - - fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool { - self.sink - .adjusted_current_node_present_but_not_in_html_namespace() - } -} - struct FilteredParser { - tokenizer: Tokenizer>>, + tokenizer: Tokenizer>, input_buffer: BufferQueue, } @@ -66,7 +22,7 @@ impl TendrilSink for FilteredParser { } fn error(&mut self, desc: Cow<'static, str>) { - self.tokenizer.sink.sink.sink.parse_error(desc) + self.tokenizer.sink.sink.parse_error(desc) } type Output = Sink::Output; @@ -75,7 +31,7 @@ impl TendrilSink for FilteredParser { while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} assert!(self.input_buffer.is_empty()); self.tokenizer.end(); - self.tokenizer.sink.sink.sink.finish() + 
self.tokenizer.sink.sink.finish() } } @@ -91,7 +47,7 @@ async fn parse_internal_async( tokenizer_opts: TokenizerOpts, mut r: R, ) -> io::Result { - let tok = Tokenizer::new(TokenFilter { sink: tb }, tokenizer_opts); + let tok = Tokenizer::new(tb, tokenizer_opts); let mut tendril_sink = FilteredParser { tokenizer: tok, input_buffer: BufferQueue::new(), @@ -173,32 +129,6 @@ pub(crate) mod tests { String::from_utf8(output).unwrap() } - #[tokio::test] - async fn test_treats_ref_as_void() -> io::Result<()> { - // Without the token filtering, the first ends up as the second's parent. - let document = - parse_document_async("".as_bytes()).await?; - assert_eq!( - serialize_for_test(&[document]), - ""); - Ok(()) - } - - #[tokio::test] - async fn test_treats_ref_as_void_in_fragments() -> io::Result<()> { - // Similar to the above, but in a fragment. - let document = parse_document_async("".as_bytes()).await?; - let body = document.children.borrow()[1].children.borrow()[1].clone(); - assert!(body.is_html_element(&local_name!("body"))); - let children = - parse_fragment_async(".".as_bytes(), &body).await?; - assert_eq!( - serialize_for_test(&children), - "." - ); - Ok(()) - } - #[tokio::test] async fn test_fragment_respects_context() -> io::Result<()> { // Checks that we have the appropriate insertion mode for the element diff --git a/src/serializer.rs b/src/serializer.rs deleted file mode 100644 index b4d062a3..00000000 --- a/src/serializer.rs +++ /dev/null @@ -1,50 +0,0 @@ -//! This module specializes the HTML5 serializer to omit , which is -//! treated as void by Wattsi. 
- -use std::io::{self, Write}; - -use html5ever::serialize::*; -use html5ever::{namespace_url, ns, QualName}; - -struct WattsiSerializer(HtmlSerializer); - -impl Serializer for WattsiSerializer { - fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()> - where - AttrIter: Iterator>, - { - self.0.start_elem(name, attrs) - } - - fn end_elem(&mut self, name: QualName) -> io::Result<()> { - if name.ns == ns!(html) && &name.local == "ref" { - return Ok(()); - } - self.0.end_elem(name) - } - - fn write_text(&mut self, text: &str) -> io::Result<()> { - self.0.write_text(text) - } - - fn write_comment(&mut self, text: &str) -> io::Result<()> { - self.0.write_comment(text) - } - - fn write_doctype(&mut self, name: &str) -> io::Result<()> { - self.0.write_doctype(name) - } - - fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> { - self.0.write_processing_instruction(target, data) - } -} - -pub fn serialize(writer: Wr, node: &T, opts: SerializeOpts) -> io::Result<()> -where - Wr: Write, - T: Serialize, -{ - let mut ser = WattsiSerializer(HtmlSerializer::new(writer, opts.clone())); - node.serialize(&mut ser, opts.traversal_scope) -} From 8a18ae9e431af925a25d23ef732719c466c3a22d Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Sun, 6 Aug 2023 20:17:58 +0900 Subject: [PATCH 03/28] Apply cargo fix --- src/annotate_attributes.rs | 2 +- src/interface_index.rs | 2 +- src/tag_omission.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/annotate_attributes.rs b/src/annotate_attributes.rs index 8e5e5608..00720847 100644 --- a/src/annotate_attributes.rs +++ b/src/annotate_attributes.rs @@ -150,7 +150,7 @@ impl Processor { .collect(), variant: variant_str, }; - let mut existing = self.attributes.entry(attr_key).or_default(); + let existing = self.attributes.entry(attr_key).or_default(); if existing.default.is_empty() { existing.default = descriptions.default; } else if 
!descriptions.default.is_empty() { diff --git a/src/interface_index.rs b/src/interface_index.rs index 49bdadf4..ffbdd48a 100644 --- a/src/interface_index.rs +++ b/src/interface_index.rs @@ -76,7 +76,7 @@ impl Processor { continue; } let name = window[1].text_content(); - let mut info = self.interfaces.entry(name).or_default(); + let info = self.interfaces.entry(name).or_default(); if is_partial { if let Some(id) = window[1].get_attribute(&ID) { info.partials.push(id); diff --git a/src/tag_omission.rs b/src/tag_omission.rs index 5f5a97be..197ed123 100644 --- a/src/tag_omission.rs +++ b/src/tag_omission.rs @@ -75,7 +75,7 @@ impl Processor { for dd in dom_utils::dt_descriptions(&dt) { dom_utils::scan_dom(&dd, &mut |n| { if n.is_html_element(&local_name!("code")) { - let mut info = self.elements.entry(n.text_content()).or_default(); + let info = self.elements.entry(n.text_content()).or_default(); info.is_void_element = true; } }); From ac0b2ea0ccbf5281b99af62e5660b8b576c39a46 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Mon, 7 Aug 2023 20:42:31 +0900 Subject: [PATCH 04/28] Change method name to avoid clippy suppression --- src/parser.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 2ec15d88..6097df0c 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -36,8 +36,7 @@ impl TendrilSink for FilteredParser { } impl FilteredParser { - #[allow(clippy::wrong_self_convention)] - fn from_utf8(self) -> Utf8LossyDecoder { + fn into_utf8(self) -> Utf8LossyDecoder { Utf8LossyDecoder::new(self) } } @@ -52,7 +51,7 @@ async fn parse_internal_async( tokenizer: tok, input_buffer: BufferQueue::new(), } - .from_utf8(); + .into_utf8(); // This draws on the structure of the sync tendril read_from. const BUFFER_SIZE: u32 = 128 * 1024; From 41d864f03fb695d470e8bee8302acb847579cdfa Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Tue, 8 Aug 2023 15:52:37 -0400 Subject: [PATCH 05/28] Remove FilteredParser. 
It was only different from html5ever::driver::Parser in that it used the filtered tokenizer. With that gone, the ordinary Parser struct works. --- src/parser.rs | 66 ++++++++++----------------------------------------- 1 file changed, 13 insertions(+), 53 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 6097df0c..2ab2f9a3 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,57 +1,17 @@ //! This module provides some mild integration between the html5ever parser and async I/O. -use std::borrow::Cow; use std::io; -use html5ever::buffer_queue::BufferQueue; -use html5ever::tendril::{self, stream::Utf8LossyDecoder, ByteTendril, StrTendril, TendrilSink}; -use html5ever::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult}; -use html5ever::tree_builder::{TreeBuilder, TreeSink}; +use html5ever::driver::{self, Parser}; +use html5ever::tendril::{ByteTendril, TendrilSink}; use markup5ever_rcdom::{Handle, RcDom}; use tokio::io::{AsyncRead, AsyncReadExt}; -struct FilteredParser { - tokenizer: Tokenizer>, - input_buffer: BufferQueue, -} - -impl TendrilSink for FilteredParser { - fn process(&mut self, t: StrTendril) { - self.input_buffer.push_back(t); - while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} - } - - fn error(&mut self, desc: Cow<'static, str>) { - self.tokenizer.sink.sink.parse_error(desc) - } - - type Output = Sink::Output; - - fn finish(mut self) -> Self::Output { - while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} - assert!(self.input_buffer.is_empty()); - self.tokenizer.end(); - self.tokenizer.sink.sink.finish() - } -} - -impl FilteredParser { - fn into_utf8(self) -> Utf8LossyDecoder { - Utf8LossyDecoder::new(self) - } -} - async fn parse_internal_async( - tb: TreeBuilder, - tokenizer_opts: TokenizerOpts, + parser: Parser, mut r: R, ) -> io::Result { - let tok = Tokenizer::new(tb, tokenizer_opts); - let mut tendril_sink = FilteredParser { - tokenizer: tok, - input_buffer: 
BufferQueue::new(), - } - .into_utf8(); + let mut tendril_sink = parser.from_utf8(); // This draws on the structure of the sync tendril read_from. const BUFFER_SIZE: u32 = 128 * 1024; @@ -81,13 +41,13 @@ pub async fn parse_fragment_async( r: R, context: &Handle, ) -> io::Result> { - let tb = - TreeBuilder::new_for_fragment(RcDom::default(), context.clone(), None, Default::default()); - let tokenizer_opts = TokenizerOpts { - initial_state: Some(tb.tokenizer_state_for_context_elem()), - ..TokenizerOpts::default() - }; - let document = parse_internal_async(tb, tokenizer_opts, r).await?; + let parser = driver::parse_fragment_for_element( + RcDom::default(), + Default::default(), + context.clone(), + None, + ); + let document = parse_internal_async(parser, r).await?; let mut new_children = document.children.take()[0].children.take(); for new_child in new_children.iter_mut() { new_child.parent.take(); @@ -96,8 +56,8 @@ pub async fn parse_fragment_async( } pub async fn parse_document_async(r: R) -> io::Result { - let tb = TreeBuilder::new(RcDom::default(), Default::default()); - parse_internal_async(tb, TokenizerOpts::default(), r).await + let parser = driver::parse_document(RcDom::default(), Default::default()); + parse_internal_async(parser, r).await } #[cfg(test)] From f89ca54bb41aff4822069f1747fc466a85588558 Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Wed, 23 Aug 2023 17:31:56 -0400 Subject: [PATCH 06/28] two comments from @domfarolino --- src/main.rs | 2 +- src/parser.rs | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 8d08c6d7..bd034472 100644 --- a/src/main.rs +++ b/src/main.rs @@ -27,7 +27,7 @@ async fn main() -> io::Result<()> { // Because parsing can jump around the tree a little, it's most reasonable // to just parse the whole document before doing any processing. Even for - // the HTML5 specification, this doesn't take too long. + // the HTML standard, this doesn't take too long. 
let document = parser::parse_document_async(tokio::io::stdin()).await?; let mut boilerplate = boilerplate::Processor::new(cache_dir.clone(), source_dir.join("demos")); diff --git a/src/parser.rs b/src/parser.rs index 2ab2f9a3..a10de56d 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -14,6 +14,7 @@ async fn parse_internal_async( let mut tendril_sink = parser.from_utf8(); // This draws on the structure of the sync tendril read_from. + // https://docs.rs/tendril/latest/tendril/stream/trait.TendrilSink.html#method.read_from const BUFFER_SIZE: u32 = 128 * 1024; 'read: loop { let mut tendril = ByteTendril::new(); From 3023f66c6653da8211b4ef05419b2b9d130b0337 Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Thu, 15 Jun 2023 14:25:43 -0400 Subject: [PATCH 07/28] Add a Rust implementation of the Perl preprocessing logic. This implementation uses a full HTML parser (Servo's, html5ever) and operates on a DOM. It ends up being somewhat more verbose than Perl, but hopefully also more extensible/maintainable. 
--- .gitignore | 6 + Cargo.lock | 682 +++++++++++++++++++++++++++++++++++++ Cargo.toml | 13 + build.sh | 14 +- src/annotate_attributes.rs | 471 +++++++++++++++++++++++++ src/boilerplate.rs | 236 +++++++++++++ src/dom_utils.rs | 383 +++++++++++++++++++++ src/interface_index.rs | 391 +++++++++++++++++++++ src/main.rs | 76 +++++ src/parser.rs | 217 ++++++++++++ src/represents.rs | 152 +++++++++ src/serializer.rs | 50 +++ src/tag_omission.rs | 329 ++++++++++++++++++ 13 files changed, 3015 insertions(+), 5 deletions(-) create mode 100644 Cargo.lock create mode 100644 Cargo.toml create mode 100644 src/annotate_attributes.rs create mode 100644 src/boilerplate.rs create mode 100644 src/dom_utils.rs create mode 100644 src/interface_index.rs create mode 100644 src/main.rs create mode 100644 src/parser.rs create mode 100644 src/represents.rs create mode 100644 src/serializer.rs create mode 100644 src/tag_omission.rs diff --git a/.gitignore b/.gitignore index 923fda16..5bbd5ad1 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,9 @@ html/ output/ mdn/.id-list mdn/developer.mozilla.org/ +highlighter/ + + +# Added by cargo + +/target diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 00000000..c7917763 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,682 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "aho-corasick" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" +dependencies = [ + "memchr", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bytes" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" + +[[package]] +name = "cc" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "errno" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" +dependencies = [ + "errno-dragonfly", + "libc", + "windows-sys", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + +[[package]] +name = "futf" 
+version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "hermit-abi" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" +dependencies = [ + "libc", +] + +[[package]] +name = "hermit-abi" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" + +[[package]] +name = "html-build" +version = "0.1.0" +dependencies = [ + "html5ever", + "markup5ever_rcdom", + "regex", + "tempfile", + "tokio", +] + +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "io-lifetimes" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" +dependencies = [ + "hermit-abi 0.3.1", + "libc", + "windows-sys", +] + +[[package]] +name = "libc" +version = "0.2.146" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" + +[[package]] +name = "linux-raw-sys" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" + +[[package]] +name = "lock_api" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "markup5ever_rcdom" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + +[[package]] +name = "mio" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +dependencies = [ + "libc", + "wasi", + "windows-sys", +] + +[[package]] +name = 
"new_debug_unreachable" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" + +[[package]] +name = "num_cpus" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +dependencies = [ + "hermit-abi 0.2.6", + "libc", +] + +[[package]] +name = "once_cell" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] +name = "proc-macro2" +version = "1.0.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + 
"getrandom", +] + +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags", +] + +[[package]] +name = "regex" +version = "1.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" + +[[package]] +name = "rustix" +version = "0.37.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b96e891d04aa506a6d1f318d2771bcb1c7dfda84e126660ace067c9b474bb2c0" +dependencies = [ + "bitflags", + "errno", + "io-lifetimes", + "libc", + "linux-raw-sys", + "windows-sys", +] + +[[package]] +name = "scopeguard" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" + +[[package]] +name = "serde" +version = "1.0.164" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + +[[package]] +name = "siphasher" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bd3e3206899af3f8b12af284fafc038cc1dc2b41d1b89dd17297221c5d225de" + +[[package]] +name = "smallvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" + +[[package]] +name = "socket2" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a4a911eed85daf18834cfaa86a79b7d266ff93ff5ba14005426219480ed662" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tempfile" +version = "3.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" +dependencies = [ + "autocfg", + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys", +] + +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + 
"utf-8", +] + +[[package]] +name = "tokio" +version = "1.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" +dependencies = [ + "autocfg", + "bytes", + "libc", + "mio", + "num_cpus", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys", +] + +[[package]] +name = "tokio-macros" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + +[[package]] +name = "unicode-ident" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 
+ +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" + +[[package]] +name = "xml5ever" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" +dependencies = [ + "log", + "mac", + "markup5ever", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 00000000..6f9f7202 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "html-build" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] +tokio = { version = "1", features = ["full"] } +html5ever = "*" +markup5ever_rcdom = "*" +tempfile = "3" +regex = "1" \ No newline at end of file diff --git a/build.sh b/build.sh index 860aa8a3..8af67ccb 100755 --- a/build.sh +++ b/build.sh @@ -529,13 +529,17 @@ function processSource { BUILD_TYPE="$2" cp -p entities/out/entities.inc "$HTML_CACHE" cp -p entities/out/entities-dtd.url "$HTML_CACHE" - if $VERBOSE; then - perl .pre-process-main.pl --verbose < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + if [ "${PROCESS_WITH_RUST:-0}" = "1" ]; then + cargo run -r <"$HTML_SOURCE/$SOURCE_LOCATION" >"$HTML_TEMP/source-whatwg-complete" else - perl .pre-process-main.pl < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + if $VERBOSE; then + perl .pre-process-main.pl --verbose < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + else + perl .pre-process-main.pl < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" + fi + perl .pre-process-annotate-attributes.pl < "$HTML_TEMP/source-expanded-1" > "$HTML_TEMP/source-expanded-2" # this one could be merged + perl .pre-process-tag-omission.pl < "$HTML_TEMP/source-expanded-2" | perl .pre-process-index-generator.pl > "$HTML_TEMP/source-whatwg-complete" # this one could be 
merged fi - perl .pre-process-annotate-attributes.pl < "$HTML_TEMP/source-expanded-1" > "$HTML_TEMP/source-expanded-2" # this one could be merged - perl .pre-process-tag-omission.pl < "$HTML_TEMP/source-expanded-2" | perl .pre-process-index-generator.pl > "$HTML_TEMP/source-whatwg-complete" # this one could be merged runWattsi "$HTML_TEMP/source-whatwg-complete" "$HTML_TEMP/wattsi-output" "$HIGHLIGHT_SERVER_URL" if [[ $WATTSI_RESULT == "0" ]]; then diff --git a/src/annotate_attributes.rs b/src/annotate_attributes.rs new file mode 100644 index 00000000..8e5e5608 --- /dev/null +++ b/src/annotate_attributes.rs @@ -0,0 +1,471 @@ +//! Augments the content attribute list for each element with a description found in the Attributes table. + +use std::collections::HashMap; +use std::io; +use std::rc::Rc; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, LocalName, QualName}; +use markup5ever_rcdom::{Handle, NodeData}; + +use crate::dom_utils::{self, NodeHandleExt}; +use crate::parser; + +#[derive(Debug, Default)] +struct Descriptions { + /// The default description, as a list of nodes. + default: Vec, + + /// The variant description, if any, as an unparsed string. + variant: Option, +} + +#[derive(Debug)] +struct Edit { + /// Handle on the
element which is to be filled in. + dd: Handle, + + /// The data-x attribute which must be described. + key: StrTendril, + + /// Whether this location has requested the variant/alternate description. + wants_variant_description: bool, + + /// Whether this is described as having "special semantics" and so must be + /// formatted differently. + has_special_semantics: bool, +} + +pub struct Processor { + /// Map from attribute key (e.g., attr-elem-someattribute) to the + /// descriptions found in the Attributes table. + attributes: HashMap, + + /// List of
nodes in Content attributes sections that need to be filled in. + edits: Vec, +} + +impl Processor { + pub fn new() -> Self { + Processor { + attributes: HashMap::new(), + edits: Vec::new(), + } + } + + pub fn visit(&mut self, node: &Handle) { + // We're looking for a
(which is under the Attributes heading). + if node.is_html_element(&local_name!("table")) && node.has_id("attributes-1") { + self.index_attribute_table(node); + } + + // We're looking for the following: + //
+ // ... + //
Content attributes:
+ //
Global attributes
+ //
href
+ //
someattribute
+ // ... + fn is_content_attribute_dt(dt: &Handle) -> bool { + if !dt.is_html_element(&local_name!("dt")) { + return false; + } + match dt.parent_node() { + Some(p) if p.is_html_element(&local_name!("dl")) && p.has_class("element") => (), + _ => return false, + } + let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); + dt.any_child(|c| c.attribute_is(&data_x, "concept-element-attributes")) + } + if is_content_attribute_dt(node) { + self.index_attribute_list(node); + } + } + + fn index_attribute_table(&mut self, table: &Handle) { + let tbody = match table + .children + .borrow() + .iter() + .find(|n| n.is_html_element(&local_name!("tbody"))) + { + Some(tbody) => tbody.clone(), + None => return, + }; + for row in tbody + .children + .borrow() + .iter() + .filter(|c| c.is_html_element(&local_name!("tr"))) + { + // Each row is expected to have this structure: + //
+ //
someattribute + // a; b; ... + // Description of how someattribute applies to a, b, etc. + // Description if the valid values + // And we want to extract the descriptions so that we can later insert them + // alongside the definitions of attr-a-someattribute, etc. + let row_children = row.children.borrow(); + let mut tds = row_children + .iter() + .filter(|c| c.is_html_element(&local_name!("td"))); + let (keys_td, description_td) = match (tds.next(), tds.next()) { + (Some(a), Some(b)) => (a, b), + _ => continue, + }; + + // These will be strings like "attr-input-maxlength", which identify particular element-attribute pairs. + let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); + for attr_key in keys_td + .children + .borrow() + .iter() + .filter_map(|c| c.get_attribute(&data_x).filter(|v| !v.is_empty())) + { + // Find the comment, if one exists, and extract its contents. + let description = description_td.children.borrow(); + let mut variant_comment = None; + let mut variant_str = None; + for node in description.iter() { + if let NodeData::Comment { ref contents } = node.data { + if contents.trim().starts_with("or:") { + variant_comment = Some(node); + variant_str = Some(StrTendril::from(contents.trim()[3..].trim_start())); + } + } + } + + // Store the (already parsed) ordinary description. If a variant + // comment exists, omit it and instead store its unparsed + // string. 
+ let descriptions = Descriptions { + default: description_td + .children + .borrow() + .iter() + .filter(|c| variant_comment.map_or(true, |vc| !Rc::ptr_eq(c, vc))) + .map(|c| c.deep_clone()) + .collect(), + variant: variant_str, + }; + let mut existing = self.attributes.entry(attr_key).or_default(); + if existing.default.is_empty() { + existing.default = descriptions.default; + } else if !descriptions.default.is_empty() { + if let NodeData::Text { ref contents } = existing.default.last().unwrap().data { + let mut borrow = contents.borrow_mut(); + if let Some(last_non_ws) = borrow.rfind(|c: char| !c.is_ascii_whitespace()) + { + let to_remove = borrow.len32() - (last_non_ws as u32) - 1; + borrow.pop_back(to_remove); + } + } + existing.default.push(Handle::create_text_node("; ")); + existing.default.extend(descriptions.default.into_iter()); + } + if existing.variant.is_none() { + existing.variant = descriptions.variant; + } else if descriptions.variant.is_some() { + let existing_variant = existing.variant.as_mut().unwrap(); + existing_variant.push_slice("; "); + existing_variant.push_tendril(&descriptions.variant.unwrap()); + } + } + } + } + + fn index_attribute_list(&mut self, dt: &Handle) { + // If a
contains , it is not annotated. + // If it contains , the description found in a comment is used instead. + // If it mentions "special semantics", it is joined with a colon rather than an em dash. + let data_x = QualName::new(None, ns!(), LocalName::from("data-x")); + let parent = dt.parent_node().unwrap(); + let children = parent.children.borrow(); + self.edits.extend( + children + .iter() + .skip_while(|n| !Rc::ptr_eq(n, dt)) + .skip(1) + .filter(|n| n.is_element()) + .take_while(|e| e.is_html_element(&local_name!("dd"))) + .filter_map(|dd| { + let mut can_annotate = true; + let mut wants_variant_description = false; + let mut has_special_semantics = false; + let mut key = None; + dom_utils::scan_dom(dd, &mut |n| match &n.data { + NodeData::Comment { ref contents } if contents.trim() == "no-annotate" => { + can_annotate = false; + } + NodeData::Comment { ref contents } if contents.trim() == "variant" => { + wants_variant_description = true; + } + NodeData::Text { ref contents } + if contents.borrow().contains("has special semantics") => + { + has_special_semantics = true; + } + NodeData::Element { .. } => { + if key.is_none() { + key = n.get_attribute(&data_x); + } + } + _ => (), + }); + match (can_annotate, key) { + (true, Some(key)) => Some(Edit { + dd: dd.clone(), + key, + wants_variant_description, + has_special_semantics, + }), + _ => None, + } + }), + ); + } + + pub async fn apply(self) -> io::Result<()> { + let em_dash = StrTendril::from(" \u{2014} "); + + for Edit { + dd, + key, + wants_variant_description, + has_special_semantics, + } in self.edits + { + // Find the requested description to insert at this point. + let descriptions = match self.attributes.get(&key) { + Some(descriptions) => descriptions, + None => continue, + }; + let mut description: Vec = match descriptions { + Descriptions { + variant: Some(ref variant), + .. + } if wants_variant_description => { + parser::parse_fragment_async(variant[..].as_bytes(), &dd).await? 
+ } + _ if wants_variant_description => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "Attribute {key} wants variant description, but no was found" + ), + )) + } + Descriptions { ref default, .. } => { + default.iter().map(|n| n.deep_clone()).collect() + } + }; + + let mut dd_children = dd.children.borrow_mut(); + if has_special_semantics { + // Replace the trailing period with a separating colon. + if let Some(NodeData::Text { contents }) = dd_children.last_mut().map(|n| &n.data) { + let mut text = contents.borrow_mut(); + *text = StrTendril::from( + text.trim_end_matches(|c: char| c.is_ascii_whitespace() || c == '.'), + ); + text.push_slice(": "); + } + } else { + // Insert an em dash. + description.insert(0, Handle::create_text_node(em_dash.clone())); + } + + // Insert the description. + for child in description.iter_mut() { + child.parent.set(Some(Rc::downgrade(&dd))); + } + dd_children.extend(description); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parser::{parse_document_async, tests::serialize_for_test}; + + #[tokio::test] + async fn test_simple() -> io::Result<()> { + // This is a simple document with enough stuff in it. Elements are shown + // before and after the attributes table, to demonstrate that this is + // not sensitive to which order they occur in (i.e., these could be + // reordered in the HTML spec). + let document = parse_document_async( + r#" +

The a element

+
+
Categories +
Flow content +
Content attributes +
href +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Categories +
Flow content +
Content attributes +
href +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Categories +
Flow content +
Content attributes +
href + — Destination of the hyperlink +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Categories +
Flow content +
Content attributes +
href + — Destination of the hyperlink +
+ "#.trim() + ); + Ok(()) + } + + #[tokio::test] + async fn test_variant() -> io::Result<()> { + // This checks that and work correctly. + // i.e., the variant description is used where requested + let document = parse_document_async( + r#" +

The a element

+
+
Content attributes +
href +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Content attributes +
href +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Content attributes +
href + — Destination of the hyperlink +
+

Attributes

+ +
hrefa; areaDestination of the hyperlink +
+

The area element

+
+
Content attributes +
href + — click on shapes!
+ "#.trim() + ); + Ok(()) + } + + #[tokio::test] + async fn test_special_semantics() -> io::Result<()> { + // Checks that the special rules for using : instead of an em dash work. + let document = parse_document_async( + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element. +
+

Attributes

+ +
nameaAnchor name +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element: Anchor name +
+

Attributes

+ +
nameaAnchor name +
+ "#.trim() + ); + Ok(()) + } + + #[tokio::test] + async fn test_special_semantics_multiple() -> io::Result<()> { + // Checks that the special rules for joining any special semantics with a ; work. + let document = parse_document_async( + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element. +
+

Attributes

+ +
nameaAnchor name +
nameaName of the anchor +
+ "#.trim().as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

The a element

+
+
Content attributes +
Also, the name attribute has special semantics on this element: Anchor name; Name of the anchor +
+

Attributes

+ +
nameaAnchor name +
nameaName of the anchor +
+ "#.trim() + ); + Ok(()) + } +} diff --git a/src/boilerplate.rs b/src/boilerplate.rs new file mode 100644 index 00000000..47aa587c --- /dev/null +++ b/src/boilerplate.rs @@ -0,0 +1,236 @@ +//! Replaces comments. +//! These can either be comment nodes (in which case the resulting fragment will +//! be inserted), or the complete value of an element's attribute (in which case +//! the text will become the attribute value). + +use std::io; +use std::path::{Path, PathBuf}; + +use html5ever::tendril::{self, SendTendril, StrTendril}; +use html5ever::{local_name, Attribute, LocalName, QualName}; +use markup5ever_rcdom::{Handle, NodeData}; +use tokio::fs::File; +use tokio::task::JoinHandle; + +use crate::dom_utils::NodeHandleExt; +use crate::parser; + +type SendStrTendril = SendTendril; + +enum Edit { + ReplaceHTML(Handle, JoinHandle>), + ReplaceAttr(Handle, QualName, JoinHandle>), + ReplaceText(Handle, JoinHandle>), +} + +pub struct Processor { + /// Path to look for boilerplate files. + path: PathBuf, + + /// Path to look for example files. + example_path: PathBuf, + + /// Changes to be made in the apply step. + edits: Vec, +} + +impl Processor { + pub fn new(path: impl Into, example_path: impl Into) -> Self { + Self { + path: path.into(), + example_path: example_path.into(), + edits: vec![], + } + } + + /// Should be called for each node in the document. + /// Identifies replacements which will be needed, and starts the necessary + /// I/O. + pub fn visit(&mut self, node: &Handle) { + match node.data { + // BOILERPLATE comments will need to be replaced with their + // corresponding HTML, parsed. Open the file so that we can do so on + // demand. 
+ NodeData::Comment { ref contents } if contents.starts_with("BOILERPLATE ") => { + let path = Path::new(contents[12..].trim()); + if is_safe_path(path) { + let file = tokio::spawn(File::open(self.path.join(path))); + self.edits.push(Edit::ReplaceHTML(node.clone(), file)); + } + } + // Pseudo-comments can also appear in element attributes. These are + // not parsed as HTML, so we simply want to read them into memory so + // they can be replaced. + NodeData::Element { ref attrs, .. } => { + for Attribute { + ref name, + ref value, + } in attrs.borrow().iter() + { + if value.starts_with("") { + let path = Path::new(value[16..value.len() - 3].trim()); + if is_safe_path(path) { + let file_contents = read_to_str_tendril(self.path.join(path)); + self.edits.push(Edit::ReplaceAttr( + node.clone(), + name.clone(), + file_contents, + )); + } + } + } + } + //
 and 
 which contain EXAMPLE also need to be
+            // replaced, but as plain text. These are loaded from the "examples"
+            // directory instead.
+            NodeData::Text { ref contents } => {
+                let borrowed_contents = contents.borrow();
+                let text = borrowed_contents.trim();
+                if !text.starts_with("EXAMPLE ") {
+                    return;
+                }
+                const PRE: LocalName = local_name!("pre");
+                const CODE: LocalName = local_name!("code");
+                let has_suitable_parent = node.parent_node().map_or(false, |p| {
+                    p.is_html_element(&PRE)
+                        || (p.is_html_element(&CODE)
+                            && p.parent_node().map_or(false, |p2| p2.is_html_element(&PRE)))
+                });
+                if has_suitable_parent {
+                    let path = Path::new(text[8..].trim());
+                    if is_safe_path(path) {
+                        let file_contents = read_to_str_tendril(self.example_path.join(path));
+                        self.edits
+                            .push(Edit::ReplaceText(node.clone(), file_contents))
+                    }
+                }
+            }
+            _ => (),
+        }
+    }
+
+    /// Applies the required replacements, in order.
+    pub async fn apply(self) -> io::Result<()> {
+        for edit in self.edits {
+            match edit {
+                // When parsing HTML, we need the context it's in so that the
+                // context-sensitive parsing behavior works correctly.
+                Edit::ReplaceHTML(node, replacement) => {
+                    let context = match node.parent_node() {
+                        Some(n) => n,
+                        _ => continue,
+                    };
+                    let file: File = replacement.await??;
+                    let new_children = parser::parse_fragment_async(file, &context).await?;
+                    node.replace_with(new_children);
+                }
+                Edit::ReplaceAttr(element, ref attr, replacement) => {
+                    element.set_attribute(attr, replacement.await??.into());
+                }
+                Edit::ReplaceText(element, replacement) => match element.data {
+                    NodeData::Text { ref contents } => {
+                        contents.replace(replacement.await??.into());
+                    }
+                    _ => panic!("not text"),
+                },
+            }
+        }
+        Ok(())
+    }
+}
+
+/// Check that a path is safe to open, even if the source is potentially untrusted.
+fn is_safe_path(path: &Path) -> bool {
+    use std::path::Component;
+    path.components()
+        .all(|c| matches!(c, Component::Normal(_) | Component::CurDir))
+}
+
+/// In a spawned task, read to a string, then move it to a tendril.
+fn read_to_str_tendril(path: impl AsRef) -> JoinHandle> {
+    let path = path.as_ref().to_owned();
+    tokio::spawn(async move {
+        let string = tokio::fs::read_to_string(path).await?;
+        Ok(StrTendril::from(string).into_send())
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::dom_utils;
+    use crate::parser::{parse_document_async, tests::serialize_for_test};
+    use tempfile::TempDir;
+
+    #[tokio::test]
+    async fn test_replace_boilerplate_comment() -> io::Result<()> {
+        let boilerplate_dir = TempDir::new()?;
+        tokio::fs::write(
+            boilerplate_dir.path().join("languages"),
+            "
enEnglish", + ) + .await?; + let document = + parse_document_async("".as_bytes()).await?; + let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + "
enEnglish
"); + Ok(()) + } + + #[tokio::test] + async fn test_replace_boilerplate_attribute() -> io::Result<()> { + let boilerplate_dir = TempDir::new()?; + tokio::fs::write( + boilerplate_dir.path().join("data.url"), + "data:text/html,Hello, world!", + ) + .await?; + let document = + parse_document_async("\">hello".as_bytes()) + .await?; + let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + "hello"); + Ok(()) + } + + #[tokio::test] + async fn test_replace_example() -> io::Result<()> { + let example_dir = TempDir::new()?; + tokio::fs::write(example_dir.path().join("ex1"), "first").await?; + tokio::fs::write(example_dir.path().join("ex2"), "second").await?; + tokio::fs::write(example_dir.path().join("ignored"), "bad").await?; + let document = + parse_document_async("
EXAMPLE ex1
\nEXAMPLE ex2  

EXAMPLE ignored

".as_bytes()) + .await?; + let mut proc = Processor::new(Path::new("."), example_dir.path()); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + "
first
second

EXAMPLE ignored

" ); + Ok(()) + } + + #[tokio::test] + async fn test_ignores_unsafe_paths() -> io::Result<()> { + let document = + parse_document_async("
\">EXAMPLE ../foo
".as_bytes()) + .await?; + let mut proc = Processor::new(Path::new("."), Path::new(".")); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + assert_eq!(proc.edits.len(), 0); + proc.apply().await?; + assert_eq!( + serialize_for_test(&[document]), + "
\">EXAMPLE ../foo
"); + Ok(()) + } +} diff --git a/src/dom_utils.rs b/src/dom_utils.rs new file mode 100644 index 00000000..95894515 --- /dev/null +++ b/src/dom_utils.rs @@ -0,0 +1,383 @@ +use std::cell::RefCell; +use std::rc::Rc; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, Attribute, LocalName, QualName}; +use markup5ever_rcdom::{Handle, Node, NodeData}; + +/// Extensions to the DOM interface to make manipulation more ergonimc. +pub trait NodeHandleExt { + /// Returns a handle to the parent node, if there is one. + fn parent_node(&self) -> Option + where + Self: Sized; + + /// Gets an attribute on the element, or None if absent or not an element. + fn get_attribute(&self, name: &QualName) -> Option; + + /// Returns whether the node has the named attribute. + fn has_attribute(&self, name: &QualName) -> bool { + self.get_attribute(name).is_some() + } + + /// Returns true if the attribute exists and the predicate matches it. + fn attribute_matches(&self, name: &QualName, f: impl Fn(&str) -> bool) -> bool { + self.get_attribute(name).map_or(false, |v| f(&v)) + } + + /// Returns true if the attribute exists and has the value mentioned. + fn attribute_is(&self, name: &QualName, expected: &str) -> bool { + self.get_attribute(name).as_deref() == Some(expected) + } + + /// Sets an attribute on the element. Must be an element. + fn set_attribute(&self, name: &QualName, value: StrTendril); + + /// Returns true if the node is an element. + fn is_element(&self) -> bool; + + /// Returns true if the node is an HTML element with the given tag name. + fn is_html_element(&self, tag_name: &LocalName) -> bool; + + /// Returns true if the node is an element with the given class. + fn has_class(&self, class: &str) -> bool; + + /// Returns true if the node is an element with the given ID. 
+ fn has_id(&self, id: &str) -> bool { + const ID: QualName = QualName { + prefix: None, + ns: ns!(), + local: local_name!("id"), + }; + self.attribute_is(&ID, id) + } + + /// If this is a text node, returns its text. + fn node_text(&self) -> Option; + + /// Concatenate the text of the node and its descendants. + fn text_content(&self) -> StrTendril; + + /// True if any child matches the predicate. + fn any_child(&self, f: impl Fn(&Self) -> bool) -> bool; + + /// Appends children (without checking node type). + fn append_children(&self, children: impl Iterator); + + /// Same, but just one. + fn append_child(&self, child: Self) + where + Self: Sized, + { + self.append_children(std::iter::once(child)) + } + + /// Inserts children before the specified child. + fn insert_children_before(&self, existing: &Self, new: impl Iterator); + + /// Same, but just one. + fn insert_child(&self, existing: &Self, new: Self) + where + Self: Sized, + { + self.insert_children_before(existing, std::iter::once(new)) + } + + /// Removes the node from its parent and replaces it with the nodes provided. + /// Does nothing if the node has no parent. + fn replace_with(&self, replacements: Vec) + where + Self: Sized; + + /// Clones the node and its entire subtree (including template contents). + fn deep_clone(&self) -> Self; + + /// Create a new element, with the given children. + fn create_element(name: LocalName) -> ElementBuilder + where + Self: Sized; + + /// Create a new text node. + fn create_text_node(text: impl Into) -> Self + where + Self: Sized; +} + +/// Convenience helper for constructing nodes. 
Use like: +/// Handle::create_element(local_name!("a")) +/// .attribute(&local_name!("href"), "/") +/// .text("Home") +/// .build() +pub struct ElementBuilder { + element: T, +} + +impl ElementBuilder { + pub fn attribute(self, name: &LocalName, value: impl Into) -> Self { + self.element + .set_attribute(&QualName::new(None, ns!(), name.clone()), value.into()); + self + } + + pub fn children(self, children: impl Iterator) -> Self { + self.element.append_children(children); + self + } + + pub fn child(self, child: T) -> Self { + self.children(std::iter::once(child)) + } + + pub fn text(self, text: impl Into) -> Self { + self.child(::create_text_node(text)) + } + + pub fn build(self) -> T { + self.element + } +} + +/// Recursively visits every DOM node (preorder). Template contents are visited +/// after children, but there are seldom both. +pub fn scan_dom(handle: &Handle, f: &mut F) { + f(handle); + + for child in handle.children.borrow().iter() { + scan_dom(child, f); + } + + if let NodeData::Element { + template_contents: ref tc, + .. + } = handle.data + { + if let Some(ref tc_handle) = *tc.borrow() { + scan_dom(tc_handle, f); + } + } +} + +/// Given a
element, find the corresponding
elements. +/// +/// This is more subtle than you might immediately think, because there can be +/// multiple
listing various terms with one or more common
+/// definitions. We need to find the
in the child list, and then skip it +/// and any other
, before providing the
that follow. +pub fn dt_descriptions(dt: &Handle) -> Vec { + assert!(dt.is_html_element(&local_name!("dt"))); + if let Some(ref dl) = dt + .parent_node() + .filter(|n| n.is_html_element(&local_name!("dl"))) + { + dl.children + .borrow() + .iter() + .filter(|n| n.is_element()) + .skip_while(|n| !Rc::ptr_eq(n, dt)) + .skip_while(|n| n.is_html_element(&local_name!("dt"))) + .take_while(|n| n.is_html_element(&local_name!("dd"))) + .cloned() + .collect() + } else { + Vec::new() + } +} + +impl NodeHandleExt for Handle { + fn parent_node(&self) -> Option { + let weak_parent = self.parent.take()?; + let parent = weak_parent.upgrade().expect("dangling parent"); + self.parent.set(Some(weak_parent)); + Some(parent) + } + + fn get_attribute(&self, name: &QualName) -> Option { + let attrs = match self.data { + NodeData::Element { ref attrs, .. } => attrs.borrow(), + _ => return None, + }; + attrs + .iter() + .find(|a| &a.name == name) + .map(|a| a.value.clone()) + } + + fn set_attribute(&self, name: &QualName, value: StrTendril) { + let mut attrs = match self.data { + NodeData::Element { ref attrs, .. } => attrs.borrow_mut(), + _ => panic!("not an element"), + }; + if let Some(attr) = attrs.iter_mut().find(|a| &a.name == name) { + attr.value = value; + } else { + attrs.push(Attribute { + name: name.clone(), + value, + }); + } + } + + fn is_element(&self) -> bool { + matches!(&self.data, NodeData::Element { .. }) + } + + fn is_html_element(&self, tag_name: &LocalName) -> bool { + match &self.data { + NodeData::Element { + name: + QualName { + ns: ns!(html), + ref local, + .. + }, + .. 
+ } => local == tag_name, + _ => false, + } + } + + fn has_class(&self, class: &str) -> bool { + const CLASS: QualName = QualName { + prefix: None, + ns: ns!(), + local: local_name!("class"), + }; + self.get_attribute(&CLASS) + .map_or(false, |v| v.split_ascii_whitespace().any(|c| c == class)) + } + + fn node_text(&self) -> Option { + match &self.data { + NodeData::Text { ref contents } => Some(contents.borrow().clone()), + _ => None, + } + } + + fn text_content(&self) -> StrTendril { + let mut text = StrTendril::new(); + scan_dom(self, &mut |n| { + if let NodeData::Text { ref contents } = &n.data { + text.push_tendril(&contents.borrow()); + } + }); + text + } + + fn any_child(&self, f: impl Fn(&Handle) -> bool) -> bool { + self.children.borrow().iter().any(f) + } + + fn append_children(&self, children: impl Iterator) { + self.children.borrow_mut().extend(children.inspect(|c| { + let old_parent = c.parent.replace(Some(Rc::downgrade(self))); + assert!(old_parent.is_none()); + })); + } + + fn insert_children_before(&self, existing: &Handle, new: impl Iterator) { + let mut children = self.children.borrow_mut(); + let i = children + .iter() + .position(|c| Rc::ptr_eq(c, existing)) + .expect("corrupt child list"); + children.splice( + i..i, + new.inspect(|c| { + let old_parent = c.parent.replace(Some(Rc::downgrade(self))); + assert!(old_parent.is_none()); + }), + ); + } + + fn replace_with(&self, replacements: Vec) { + let parent = match self.parent.take() { + Some(n) => n.upgrade().expect("dangling parent"), + _ => return, + }; + for new_child in replacements.iter() { + new_child.parent.replace(Some(Rc::downgrade(&parent))); + } + let mut children = parent.children.borrow_mut(); + let i = children + .iter() + .position(|c| Rc::ptr_eq(c, self)) + .expect("corrupt child list"); + children.splice(i..=i, replacements); + self.parent.take(); + } + + fn deep_clone(&self) -> Handle { + use NodeData::*; + let new_node_data = match &self.data { + Document => Document, + Doctype 
{ + name, + public_id, + system_id, + } => Doctype { + name: name.clone(), + public_id: public_id.clone(), + system_id: system_id.clone(), + }, + Text { contents } => Text { + contents: contents.clone(), + }, + Comment { contents } => Comment { + contents: contents.clone(), + }, + Element { + name, + attrs, + template_contents, + mathml_annotation_xml_integration_point, + } => Element { + name: name.clone(), + attrs: attrs.clone(), + template_contents: RefCell::new( + template_contents + .borrow() + .as_ref() + .map(|tc| tc.deep_clone()), + ), + mathml_annotation_xml_integration_point: *mathml_annotation_xml_integration_point, + }, + ProcessingInstruction { target, contents } => ProcessingInstruction { + target: target.clone(), + contents: contents.clone(), + }, + }; + let node = Node::new(new_node_data); + let mut children = node.children.borrow_mut(); + *children = self + .children + .borrow() + .iter() + .map(|c| c.deep_clone()) + .collect(); + for child in children.iter_mut() { + let old_parent = child.parent.replace(Some(Rc::downgrade(&node))); + assert!(old_parent.is_none()); + } + drop(children); + node + } + + fn create_element(name: LocalName) -> ElementBuilder { + let new_node_data = NodeData::Element { + name: QualName::new(None, ns!(html), name), + attrs: RefCell::new(Vec::new()), + template_contents: RefCell::new(None), + mathml_annotation_xml_integration_point: false, + }; + ElementBuilder { + element: Node::new(new_node_data), + } + } + + fn create_text_node(text: impl Into) -> Handle { + let new_node_data = NodeData::Text { + contents: RefCell::new(text.into()), + }; + Node::new(new_node_data) + } +} diff --git a/src/interface_index.rs b/src/interface_index.rs new file mode 100644 index 00000000..49bdadf4 --- /dev/null +++ b/src/interface_index.rs @@ -0,0 +1,391 @@ +//! Generates an index of WebIDL interfaces. +//! This index is inserted where "INSERT INTERFACES HERE" appears. 
+ +use std::collections::BTreeMap; +use std::io; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, QualName}; +use markup5ever_rcdom::Handle; + +use crate::dom_utils::NodeHandleExt; + +#[derive(Default, Debug)] +struct InterfaceInfo { + /// Number of times the interface definition was seen. Should be one. + seen: u32, + + /// The IDs of the partial interfaces, in the order they appear in the document. + partials: Vec, + + /// Set to true if a partial is missing its ID. + has_partial_with_no_id: bool, +} + +pub struct Processor { + /// The interfaces encountered, keyed and sorted by name. + interfaces: BTreeMap, + + /// The text nodes which contains the text "INSERT INTERFACES HERE". + marker_nodes: Vec, +} + +/// The string which marks where the index belongs. Ideally this would be a node +/// and not plain text. +const MARKER: &str = "INSERT INTERFACES HERE"; + +impl Processor { + pub fn new() -> Self { + Processor { + interfaces: BTreeMap::new(), + marker_nodes: Vec::new(), + } + } + + pub fn visit(&mut self, node: &Handle) { + const ID: QualName = QualName { + prefix: None, + ns: ns!(), + local: local_name!("id"), + }; + // We're looking for inside a
, to find
+        // potential interfaces defined there.
+        //
+        // One surprise here -- there is an "interface Example" that is not defined
+        // according to Wattsi. It yells about this not being defined, and the
+        // prior Perl preprocessing actually requires the 
 have no
+        // attributes.
+        if node.is_html_element(&local_name!("code"))
+            && node.has_class("idl")
+            && node.parent_node().map_or(false, |p| {
+                p.is_html_element(&local_name!("pre")) && !p.has_class("extract")
+            })
+        {
+            let borrowed_children = node.children.borrow();
+            for window in borrowed_children.windows(2) {
+                let is_partial = match window[0].node_text() {
+                    Some(a) if a.ends_with("partial interface ") => true,
+                    Some(a) if a.ends_with("interface ") => false,
+                    _ => continue,
+                };
+                // These definitions must appear as a ,  or  element.
+                if !window[1].is_html_element(&local_name!("span"))
+                    && !window[1].is_html_element(&local_name!("dfn"))
+                    && !window[1].is_html_element(&local_name!("a"))
+                {
+                    continue;
+                }
+                let name = window[1].text_content();
+                let mut info = self.interfaces.entry(name).or_default();
+                if is_partial {
+                    if let Some(id) = window[1].get_attribute(&ID) {
+                        info.partials.push(id);
+                    } else {
+                        info.has_partial_with_no_id = true;
+                    }
+                } else {
+                    info.seen += 1;
+                }
+            }
+        }
+
+        if node.node_text().map_or(false, |t| t.contains(MARKER)) {
+            self.marker_nodes.push(node.clone());
+        }
+    }
+
+    pub fn apply(self) -> io::Result<()> {
+        // It is likely an author error to not include anywhere to insert an
+        // interface index. More than one is supported, mainly because it's no
+        // more work than enforcing that just one exists.
+        if self.marker_nodes.is_empty() {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!("Marker {MARKER:?} not found."),
+            ));
+        }
+        for marker in self.marker_nodes {
+            // We need to find where the marker appears in the text so that we
+            // can split it into two text nodes.
+            let text = marker.node_text().expect("should still be a text node");
+            let position: u32 = match text.find(MARKER) {
+                None => {
+                    return Err(io::Error::new(
+                        io::ErrorKind::InvalidData,
+                        format!("Marker {MARKER:?} not found (but was during first pass)."),
+                    ));
+                }
+                Some(p) => p.try_into().unwrap(),
+            };
+            let end_position: u32 = position + TryInto::::try_into(MARKER.len()).unwrap();
+            let before = text.subtendril(0, position);
+            let after = text.subtendril(end_position, text.len32() - end_position);
+
+            // Then, we need to construct a list of interfaces and their partial interfaces.
+            let mut ul =
+                Handle::create_element(local_name!("ul")).attribute(&local_name!("class"), "brief");
+            for (name, info) in &self.interfaces {
+                if info.seen > 1 {
+                    return Err(io::Error::new(
+                        io::ErrorKind::InvalidData,
+                        format!("Interface {name} defined {} times.", info.seen),
+                    ));
+                }
+                fn make_link(id: &str, text: &str) -> Handle {
+                    Handle::create_element(local_name!("a"))
+                        .attribute(&local_name!("href"), format!("#{id}"))
+                        .text(text)
+                        .build()
+                }
+                let mut li = Handle::create_element(local_name!("li")).child(
+                    Handle::create_element(local_name!("code"))
+                        .text(name.clone())
+                        .build(),
+                );
+                match &info.partials[..] {
+                    [] => (),
+                    [sole_partial] => {
+                        li = li.text(", ").child(make_link(sole_partial, "partial"));
+                    }
+                    [first, rest @ ..] => {
+                        li = li.text(", ").child(make_link(first, "partial 1"));
+                        for (i, p) in rest.iter().enumerate() {
+                            li = li.text(" ").child(make_link(p, &(i + 2).to_string()));
+                        }
+                    }
+                }
+                ul = ul.child(li.build());
+            }
+
+            // Finally, we replace the marker's text node with the combination of the two.
+            marker.replace_with(vec![
+                Handle::create_text_node(before),
+                ul.build(),
+                Handle::create_text_node(after),
+            ]);
+        }
+        Ok(())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::dom_utils;
+    use crate::parser::{parse_document_async, tests::serialize_for_test};
+
+    #[tokio::test]
+    async fn test_two_interfaces_in_one_block() -> io::Result<()> {
+        let document = parse_document_async(
+            r#"
+

+interface HTMLMarqueeElement { ... }
+interface HTMLBlinkElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

+interface HTMLMarqueeElement { ... }
+interface HTMLBlinkElement { ... }
+
+
  • HTMLBlinkElement
  • HTMLMarqueeElement
+ "#.trim()); + Ok(()) + } + + #[tokio::test] + async fn test_two_interfaces_in_separate_blocks() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+
+

+interface HTMLBlinkElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

+interface HTMLMarqueeElement { ... }
+
+

+interface HTMLBlinkElement { ... }
+
+
  • HTMLBlinkElement
  • HTMLMarqueeElement
+ "#.trim()); + Ok(()) + } + + #[tokio::test] + async fn interface_with_partial() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+
+

+partial interface HTMLMarqueeElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +

+interface HTMLMarqueeElement { ... }
+
+

+partial interface HTMLMarqueeElement { ... }
+
+
+ "##.trim()); + Ok(()) + } + + #[tokio::test] + async fn interface_with_two_partials() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +

+interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+ + "##.trim()); + Ok(()) + } + + #[tokio::test] + async fn only_partials() -> io::Result<()> { + let document = parse_document_async( + r#" +

+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +

+partial interface HTMLMarqueeElement { ... }
+partial interface HTMLMarqueeElement { ... }
+
+ + "##.trim()); + Ok(()) + } + + #[tokio::test] + async fn markers_before_and_after() -> io::Result<()> { + let document = parse_document_async( + r#" +INSERT INTERFACES HERE +

+interface HTMLMarqueeElement { ... }
+
+INSERT INTERFACES HERE + "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r##" +
  • HTMLMarqueeElement
+

+interface HTMLMarqueeElement { ... }
+
+
  • HTMLMarqueeElement
+ "## + .trim() + ); + Ok(()) + } + + #[tokio::test] + async fn no_marker() -> io::Result<()> { + let document = parse_document_async("".as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } + + #[tokio::test] + async fn duplicate_dfn() -> io::Result<()> { + let document = parse_document_async( + r#" +

+interface HTMLMarqueeElement { ... }
+interface HTMLMarqueeElement { ... }
+
+ "# + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 00000000..2506d027 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,76 @@ +use std::borrow::Cow; +use std::default::Default; +use std::env; +use std::ffi::OsStr; +use std::io::{self, BufWriter}; +use std::path::{Path, PathBuf}; + +use markup5ever_rcdom::SerializableHandle; + +mod annotate_attributes; +mod boilerplate; +mod dom_utils; +mod interface_index; +mod parser; +mod represents; +mod serializer; +mod tag_omission; + +#[tokio::main] +async fn main() -> io::Result<()> { + // Since we're using Rc in the DOM implementation, we must ensure that tasks + // which act on it are confined to this thread. + + // Find the paths we need. + let cache_dir = path_from_env("HTML_CACHE", ".cache"); + let source_dir = path_from_env("HTML_SOURCE", "../html"); + + // Because parsing can jump around the tree a little, it's most reasonable + // to just parse the whole document before doing any processing. Even for + // the HTML5 specification, this doesn't take too long. + let document = parser::parse_document_async(tokio::io::stdin()).await?; + + let mut boilerplate = boilerplate::Processor::new(cache_dir.clone(), source_dir.join("demos")); + let mut represents = represents::Processor::new(); + let mut annotate_attributes = annotate_attributes::Processor::new(); + let mut tag_omission = tag_omission::Processor::new(); + let mut interface_index = interface_index::Processor::new(); + + // We do exactly one pass to identify the changes that need to be made. 
+ dom_utils::scan_dom(&document, &mut |h| { + boilerplate.visit(h); + represents.visit(h); + annotate_attributes.visit(h); + tag_omission.visit(h); + interface_index.visit(h); + }); + + // And then we apply all of the changes. These different processors mostly + // apply quite local changes, so hopefully we never have to deal with + // conflicts between them. + boilerplate.apply().await?; + represents.apply()?; + annotate_attributes.apply().await?; + tag_omission.apply()?; + interface_index.apply()?; + + // Finally, we write the result to standard out. + let serializable: SerializableHandle = document.into(); + serializer::serialize( + &mut BufWriter::with_capacity(128 * 1024, io::stdout()), + &serializable, + Default::default(), + )?; + Ok(()) +} + +fn path_from_env<'a, V, D>(var: &V, default: &'a D) -> Cow<'a, Path> +where + V: AsRef + ?Sized, + D: AsRef + ?Sized, +{ + match env::var_os(var) { + Some(p) => PathBuf::from(p).into(), + None => default.as_ref().into(), + } +} diff --git a/src/parser.rs b/src/parser.rs new file mode 100644 index 00000000..ba016836 --- /dev/null +++ b/src/parser.rs @@ -0,0 +1,217 @@ +//! This module specializes the HTML5 parser to respect the "special" void +//! element, which isn't part of standard HTML. It does so by injecting a +//! synthetic token immediately afterward. +//! It also provides some mild integration with async I/O. 
+ +use std::borrow::Cow; +use std::io; + +use html5ever::buffer_queue::BufferQueue; +use html5ever::tendril::{self, stream::Utf8LossyDecoder, ByteTendril, StrTendril, TendrilSink}; +use html5ever::tokenizer::{ + Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, TokenizerResult, +}; +use html5ever::tree_builder::{TreeBuilder, TreeSink}; +use markup5ever_rcdom::{Handle, RcDom}; +use tokio::io::{AsyncRead, AsyncReadExt}; + +struct TokenFilter { + sink: Sink, +} + +impl TokenSink for TokenFilter { + type Handle = Sink::Handle; + + fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult { + let close_tag = match token { + Token::TagToken(Tag { + kind: TagKind::StartTag, + name: ref tag_name, + .. + }) if tag_name.eq_str_ignore_ascii_case("ref") => Some(Tag { + kind: TagKind::EndTag, + name: tag_name.clone(), + self_closing: false, + attrs: vec![], + }), + _ => None, + }; + match (self.sink.process_token(token, line_number), close_tag) { + (TokenSinkResult::Continue, Some(close_tag)) => self + .sink + .process_token(Token::TagToken(close_tag), line_number), + (result, _) => result, + } + } + + fn end(&mut self) { + self.sink.end() + } + + fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool { + self.sink + .adjusted_current_node_present_but_not_in_html_namespace() + } +} + +struct FilteredParser { + tokenizer: Tokenizer>>, + input_buffer: BufferQueue, +} + +impl TendrilSink for FilteredParser { + fn process(&mut self, t: StrTendril) { + self.input_buffer.push_back(t); + while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} + } + + fn error(&mut self, desc: Cow<'static, str>) { + self.tokenizer.sink.sink.sink.parse_error(desc) + } + + type Output = Sink::Output; + + fn finish(mut self) -> Self::Output { + while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} + assert!(self.input_buffer.is_empty()); + self.tokenizer.end(); + 
self.tokenizer.sink.sink.sink.finish() + } +} + +impl FilteredParser { + #[allow(clippy::wrong_self_convention)] + fn from_utf8(self) -> Utf8LossyDecoder { + Utf8LossyDecoder::new(self) + } +} + +async fn parse_internal_async( + tb: TreeBuilder, + tokenizer_opts: TokenizerOpts, + mut r: R, +) -> io::Result { + let tok = Tokenizer::new(TokenFilter { sink: tb }, tokenizer_opts); + let mut tendril_sink = FilteredParser { + tokenizer: tok, + input_buffer: BufferQueue::new(), + } + .from_utf8(); + + // This draws on the structure of the sync tendril read_from. + const BUFFER_SIZE: u32 = 128 * 1024; + 'read: loop { + let mut tendril = ByteTendril::new(); + unsafe { + tendril.push_uninitialized(BUFFER_SIZE); + } + loop { + match r.read(&mut tendril).await { + Ok(0) => break 'read, + Ok(n) => { + tendril.pop_back(BUFFER_SIZE - n as u32); + tendril_sink.process(tendril); + break; + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} + Err(e) => Err(e)?, + } + } + } + let dom = tendril_sink.finish(); + Ok(dom.document) +} + +pub async fn parse_fragment_async( + r: R, + context: &Handle, +) -> io::Result> { + let tb = + TreeBuilder::new_for_fragment(RcDom::default(), context.clone(), None, Default::default()); + let tokenizer_opts = TokenizerOpts { + initial_state: Some(tb.tokenizer_state_for_context_elem()), + ..TokenizerOpts::default() + }; + let document = parse_internal_async(tb, tokenizer_opts, r).await?; + let mut new_children = document.children.take()[0].children.take(); + for new_child in new_children.iter_mut() { + new_child.parent.take(); + } + Ok(new_children) +} + +pub async fn parse_document_async(r: R) -> io::Result { + let tb = TreeBuilder::new(RcDom::default(), Default::default()); + parse_internal_async(tb, TokenizerOpts::default(), r).await +} + +#[cfg(test)] +pub(crate) mod tests { + use super::*; + use crate::dom_utils::NodeHandleExt; + use html5ever::serialize::{SerializeOpts, TraversalScope}; + use html5ever::{local_name, serialize}; + use 
markup5ever_rcdom::{NodeData, SerializableHandle}; + + pub(crate) fn serialize_for_test(nodes: &[Handle]) -> String { + let mut output = vec![]; + for node in nodes { + let traversal_scope = match node.data { + NodeData::Document => TraversalScope::ChildrenOnly(None), + _ => TraversalScope::IncludeNode, + }; + serialize( + &mut output, + &SerializableHandle::from(node.clone()), + SerializeOpts { + traversal_scope, + ..Default::default() + }, + ) + .unwrap(); + } + String::from_utf8(output).unwrap() + } + + #[tokio::test] + async fn test_treats_ref_as_void() -> io::Result<()> { + // Without the token filtering, the first ends up as the second's parent. + let document = + parse_document_async("".as_bytes()).await?; + assert_eq!( + serialize_for_test(&[document]), + ""); + Ok(()) + } + + #[tokio::test] + async fn test_treats_ref_as_void_in_fragments() -> io::Result<()> { + // Similar to the above, but in a fragment. + let document = parse_document_async("".as_bytes()).await?; + let body = document.children.borrow()[1].children.borrow()[1].clone(); + assert!(body.is_html_element(&local_name!("body"))); + let children = + parse_fragment_async(".".as_bytes(), &body).await?; + assert_eq!( + serialize_for_test(&children), + "." + ); + Ok(()) + } + + #[tokio::test] + async fn test_fragment_respects_context() -> io::Result<()> { + // Checks that we have the appropriate insertion mode for the element + // we're in. This is important because of the special rules + // surrounding, e.g., tables. If you change this to use the body as context, + // no element at all is emitted. 
+ let document = parse_document_async("".as_bytes()).await?; + let body = document.children.borrow()[1].children.borrow()[1].clone(); + assert!(body.is_html_element(&local_name!("body"))); + let table = body.children.borrow()[0].clone(); + assert!(table.is_html_element(&local_name!("table"))); + let children = parse_fragment_async("".as_bytes(), &table).await?; + assert_eq!(serialize_for_test(&children), ""); + Ok(()) + } +} diff --git a/src/represents.rs b/src/represents.rs new file mode 100644 index 00000000..ebb0474d --- /dev/null +++ b/src/represents.rs @@ -0,0 +1,152 @@ +//! Replaces comments with the HTML which appears in a +//! paragraph of the form: +//!

The tagname element represents ...

+ +use std::collections::HashMap; +use std::io; +use std::rc::Rc; + +use crate::dom_utils::NodeHandleExt; +use html5ever::local_name; +use html5ever::tendril::StrTendril; +use markup5ever_rcdom::{Handle, NodeData}; + +pub struct Processor { + /// Map from tag name (as found in the paragraph) to the which + /// contains the text "represents". + represents: HashMap, + + /// List of comments to be replaced, and what tag name + /// they correspond to. + placeholders: Vec<(Handle, StrTendril)>, +} + +/// Walks from the text node "represents" and finds the tag name and the +/// span that marks where the description begins, or returns None if that +/// cannot be found. +fn find_tag_name(represents_text: &Handle) -> Option<(StrTendril, Handle)> { + let span = represents_text + .parent_node() + .filter(|p| p.is_html_element(&local_name!("span")))?; + let p = span + .parent_node() + .filter(|p| p.is_html_element(&local_name!("p")))?; + let children = p.children.borrow(); + match &children[..] { + [a, b, c, d, ..] + if a.node_text().as_deref().map(|x| x.trim()) == Some("The") + && b.is_html_element(&local_name!("code")) + && c.node_text().as_deref().map(|x| x.trim()) == Some("element") + && Rc::ptr_eq(d, &span) => + { + Some((b.text_content(), span)) + } + _ => None, + } +} + +impl Processor { + pub fn new() -> Self { + Self { + represents: HashMap::new(), + placeholders: Vec::new(), + } + } + + /// Should be called for each node the document. 
Records when it sees a + /// represents and which element it is defining + pub fn visit(&mut self, node: &Handle) { + match node.data { + NodeData::Text { ref contents } if contents.borrow().as_ref() == "represents" => { + if let Some((tag, span)) = find_tag_name(node) { + self.represents.insert(tag, span); + } + } + NodeData::Comment { ref contents } if contents.starts_with("REPRESENTS ") => { + self.placeholders + .push((node.clone(), contents.subtendril(11, contents.len32() - 11))); + } + _ => (), + } + } + + pub fn apply(self) -> io::Result<()> { + for (placeholder, ref tag) in self.placeholders { + let span = match self.represents.get(tag) { + Some(span) => span, + None => { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!(" refers to unknown tag", tag), + )); + } + }; + let parent = match span.parent_node() { + Some(p) => p, + None => continue, + }; + let replacements = parent + .children + .borrow() + .iter() + .skip_while(|s| !Rc::ptr_eq(s, span)) + .skip(1) + .enumerate() + .map(|(index, sibling)| { + let clone = sibling.deep_clone(); + // Capitalize the first letter of the first node (which is expected to be text). + if let (0, NodeData::Text { ref contents }) = (index, &clone.data) { + contents.replace_with(|text| capitalize(text.trim_start())); + } + clone + }) + .collect(); + placeholder.replace_with(replacements); + } + Ok(()) + } +} + +fn capitalize(text: &str) -> StrTendril { + let mut chars = text.chars(); + match chars.next() { + Some(c) => { + let mut capitalized = StrTendril::from_char(c.to_ascii_uppercase()); + capitalized.push_slice(chars.as_str()); + capitalized + } + None => StrTendril::new(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dom_utils; + use crate::parser::{parse_document_async, tests::serialize_for_test}; + + #[tokio::test] + async fn test_represents() -> io::Result<()> { + // Uses can occur either before or after. + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + "

A seat\nat a table.

The chair element represents a seat\nat a table.

A seat\nat a table.

" + ); + Ok(()) + } + + #[tokio::test] + async fn test_represents_undefined() -> io::Result<()> { + // Uses can occur either before or after. + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } +} diff --git a/src/serializer.rs b/src/serializer.rs new file mode 100644 index 00000000..b4d062a3 --- /dev/null +++ b/src/serializer.rs @@ -0,0 +1,50 @@ +//! This module specializes the HTML5 serializer to omit , which is +//! treated as void by Wattsi. + +use std::io::{self, Write}; + +use html5ever::serialize::*; +use html5ever::{namespace_url, ns, QualName}; + +struct WattsiSerializer(HtmlSerializer); + +impl Serializer for WattsiSerializer { + fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()> + where + AttrIter: Iterator>, + { + self.0.start_elem(name, attrs) + } + + fn end_elem(&mut self, name: QualName) -> io::Result<()> { + if name.ns == ns!(html) && &name.local == "ref" { + return Ok(()); + } + self.0.end_elem(name) + } + + fn write_text(&mut self, text: &str) -> io::Result<()> { + self.0.write_text(text) + } + + fn write_comment(&mut self, text: &str) -> io::Result<()> { + self.0.write_comment(text) + } + + fn write_doctype(&mut self, name: &str) -> io::Result<()> { + self.0.write_doctype(name) + } + + fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> { + self.0.write_processing_instruction(target, data) + } +} + +pub fn serialize(writer: Wr, node: &T, opts: SerializeOpts) -> io::Result<()> +where + Wr: Write, + T: Serialize, +{ + let mut ser = WattsiSerializer(HtmlSerializer::new(writer, opts.clone())); + node.serialize(&mut ser, opts.traversal_scope) +} diff --git a/src/tag_omission.rs b/src/tag_omission.rs new file mode 100644 index 00000000..5f5a97be --- /dev/null +++ b/src/tag_omission.rs @@ -0,0 +1,329 @@ +//! Looks at the "Optional tags" and "Void elements" sections from the HTML +//! 
syntax spec and replicates that information into the descriptions of the +//! individual elements. + +use std::borrow::Borrow; +use std::collections::HashMap; +use std::io; + +use html5ever::tendril::StrTendril; +use html5ever::{local_name, namespace_url, ns, LocalName, QualName}; +use markup5ever_rcdom::{Handle, NodeData}; +use regex::Regex; + +use crate::dom_utils::{self, NodeHandleExt}; + +#[derive(Default)] +struct ElementInfo { + /// Handles on any paragraphs in the "Optional tags" section which refer to the element. + optional_tags_info: Vec, + + /// Whether the element appears in the "Void elements" list. + is_void_element: bool, + + ///

into which this info must be added. + dl: Option, +} + +#[derive(Default)] +pub struct Processor { + /// The heading level of the "Optional tags" heading, if inside one. + in_optional_tags_heading: Option, + + /// Most recently seen . + most_recent_element_dfn: Option, + + /// Info about elements which have been referred to in these sections. + elements: HashMap, +} + +impl Processor { + pub fn new() -> Self { + Default::default() + } + + pub fn visit(&mut self, node: &Handle) { + // If the heading ends the "Optional tags" section, clear that state. + if let Some(optional_tag_heading_level) = self.in_optional_tags_heading { + match heading_level(node) { + Some(level) if level <= optional_tag_heading_level => { + self.in_optional_tags_heading = None; + } + _ => (), + } + } + + // If we encounter an "Optional tags" section, start observing relevant paragraphs. + // When one is encountered, possibly add it. + if let Some(level) = heading_level(node) { + if node.text_content().trim() == "Optional tags" { + self.in_optional_tags_heading = Some(level); + } + } else if self.in_optional_tags_heading.is_some() && node.is_html_element(&local_name!("p")) + { + self.maybe_record_optional_tags_paragraph(node); + } + + // If we encounter the Void elements section, look for the next dt. + if node.is_html_element(&local_name!("dfn")) + && node.text_content().trim() == "Void elements" + { + if let Some(dt) = node + .parent_node() + .filter(|n| n.is_html_element(&local_name!("dt"))) + { + for dd in dom_utils::dt_descriptions(&dt) { + dom_utils::scan_dom(&dd, &mut |n| { + if n.is_html_element(&local_name!("code")) { + let mut info = self.elements.entry(n.text_content()).or_default(); + info.is_void_element = true; + } + }); + } + } + } + + // If we see an element dfn, watch out for the upcoming
. + if node.is_html_element(&local_name!("dfn")) + && node.has_attribute(&QualName::new(None, ns!(), LocalName::from("element"))) + { + self.most_recent_element_dfn = Some(node.text_content()); + } + + // If we see a
, record that. + if node.is_html_element(&local_name!("dl")) && node.has_class("element") { + if let Some(elem) = std::mem::take(&mut self.most_recent_element_dfn) { + let info = self.elements.entry(elem).or_default(); + if info.dl.is_none() { + info.dl = Some(node.clone()); + } + } + } + } + + fn maybe_record_optional_tags_paragraph(&mut self, paragraph: &Handle) { + // The paragraph must have the structure "A(n) img element..." + let children = paragraph.children.borrow(); + let mut iter = children.iter().fuse(); + match (iter.next(), iter.next(), iter.next()) { + (Some(a), Some(b), Some(c)) + if a.node_text() + .map_or(false, |t| t.trim() == "A" || t.trim() == "An") + && b.is_html_element(&local_name!("code")) + && c.node_text() + .map_or(false, |t| t.trim().starts_with("element")) => + { + let info = self.elements.entry(b.text_content()).or_default(); + info.optional_tags_info.push(paragraph.clone()); + } + _ => (), + } + } + + pub fn apply(self) -> io::Result<()> { + let data_x = LocalName::from("data-x"); + let qual_data_x = QualName::new(None, ns!(), data_x.clone()); + let dt = Handle::create_element(local_name!("dt")) + .child( + Handle::create_element(local_name!("span")) + .attribute(&data_x, "concept-element-tag-omission") + .text("Tag omission in text/html") + .build(), + ) + .text(":") + .build(); + let void_dd = Handle::create_element(local_name!("dd")) + .text("No ") + .child( + Handle::create_element(local_name!("span")) + .attribute(&data_x, "syntax-end-tag") + .text("end tag") + .build(), + ) + .text(".") + .build(); + let default_dd = Handle::create_element(local_name!("dd")) + .text("Neither tag is omissible.") + .build(); + let may_re = Regex::new(r"\bmay\b").unwrap(); + + for info in self.elements.into_values() { + let dl = match info.dl { + Some(dl) => dl, + None => continue, + }; + + let mut to_insert = vec![dt.deep_clone()]; + if !info.optional_tags_info.is_empty() { + // Convert

to

, replacing "may" with "can". + for p in info.optional_tags_info { + let borrowed_children = p.children.borrow(); + let new_children = borrowed_children.iter().map(|n| { + let new_node = n.deep_clone(); + dom_utils::scan_dom(&new_node, &mut |c| { + if let NodeData::Text { ref contents } = c.data { + let mut text = contents.borrow_mut(); + *text = StrTendril::from(may_re.replace(&text, "can").borrow()); + } + }); + new_node + }); + let dd = Handle::create_element(local_name!("dd")) + .children(new_children) + .build(); + to_insert.push(dd); + } + } else if info.is_void_element { + to_insert.push(void_dd.deep_clone()); + } else { + to_insert.push(default_dd.deep_clone()); + } + to_insert.push(Handle::create_text_node("\n")); + + let dl_children = dl.children.borrow(); + let attributes_dt = if let Some(attributes_dt) = dl_children.iter().find(|child| { + child.is_html_element(&local_name!("dt")) + && child + .any_child(|c| c.attribute_is(&qual_data_x, "concept-element-attributes")) + }) { + attributes_dt.clone() + } else { + continue; + }; + drop(dl_children); + dl.insert_children_before(&attributes_dt, to_insert.into_iter()); + } + Ok(()) + } +} + +/// Returns the heading level (from 1 to 6) that the

through

declares, or None for all other nodes. +fn heading_level(node: &Handle) -> Option { + let local = match node.data { + NodeData::Element { ref name, .. } if name.ns == ns!(html) => &name.local, + _ => return None, + }; + match *local { + local_name!("h1") => Some(1), + local_name!("h2") => Some(2), + local_name!("h3") => Some(3), + local_name!("h4") => Some(4), + local_name!("h5") => Some(5), + local_name!("h6") => Some(6), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::parser::{parse_document_async, tests::serialize_for_test}; + + #[tokio::test] + async fn test_simple() -> io::Result<()> { + let document = parse_document_async( + r#" +

Optional tags

+

A td element does very tdish things and may be very cellular.

+

An audio element is quite audible.

+

Another section

+

A body element is ignored because it's in another section. +

+
Void elements +
img and meta are void. +
input is too. +
Non-void elements +
html is interesting but not void. +
+

Elements

+

audio +

+
+
+

body +

+
+
+

html +

+
+
+

img +

+
+
+

input +

+
+
+

meta +

+
+
+

td +

+
+
+ "# + .trim() + .as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + proc.apply()?; + assert_eq!( + serialize_for_test(&[document]), + r#" +

Optional tags

+

A td element does very tdish things and may be very cellular.

+

An audio element is quite audible.

+

Another section

+

A body element is ignored because it's in another section. +

+
Void elements +
img and meta are void. +
input is too. +
Non-void elements +
html is interesting but not void. +
+

Elements

+

audio +

+
Tag omission in text/html:
An audio element is quite audible.
+
+
+

body +

+
Tag omission in text/html:
Neither tag is omissible.
+
+
+

html +

+
Tag omission in text/html:
Neither tag is omissible.
+
+
+

img +

+
Tag omission in text/html:
No end tag.
+
+
+

input +

+
Tag omission in text/html:
No end tag.
+
+
+

meta +

+
Tag omission in text/html:
No end tag.
+
+
+

td +

+
Tag omission in text/html:
A td element does very tdish things and can be very cellular.
+
+
+ "#.trim()); + Ok(()) + } +} From 8c0a8d4b723baed74c9b8fa6f5d282e2eb238d00 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Sun, 6 Aug 2023 20:17:37 +0900 Subject: [PATCH 08/28] Remove custom stuff --- src/main.rs | 6 ++-- src/parser.rs | 82 ++++------------------------------------------- src/serializer.rs | 50 ----------------------------- 3 files changed, 9 insertions(+), 129 deletions(-) delete mode 100644 src/serializer.rs diff --git a/src/main.rs b/src/main.rs index 2506d027..8d08c6d7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +use html5ever::serialize::{serialize, SerializeOpts}; use std::borrow::Cow; use std::default::Default; use std::env; @@ -13,7 +14,6 @@ mod dom_utils; mod interface_index; mod parser; mod represents; -mod serializer; mod tag_omission; #[tokio::main] @@ -56,10 +56,10 @@ async fn main() -> io::Result<()> { // Finally, we write the result to standard out. let serializable: SerializableHandle = document.into(); - serializer::serialize( + serialize( &mut BufWriter::with_capacity(128 * 1024, io::stdout()), &serializable, - Default::default(), + SerializeOpts::default(), )?; Ok(()) } diff --git a/src/parser.rs b/src/parser.rs index ba016836..2ec15d88 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,61 +1,17 @@ -//! This module specializes the HTML5 parser to respect the "special" void -//! element, which isn't part of standard HTML. It does so by injecting a -//! synthetic token immediately afterward. -//! It also provides some mild integration with async I/O. +//! This module provides some mild integration between the html5ever parser and async I/O. 
use std::borrow::Cow; use std::io; use html5ever::buffer_queue::BufferQueue; use html5ever::tendril::{self, stream::Utf8LossyDecoder, ByteTendril, StrTendril, TendrilSink}; -use html5ever::tokenizer::{ - Tag, TagKind, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, TokenizerResult, -}; +use html5ever::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult}; use html5ever::tree_builder::{TreeBuilder, TreeSink}; use markup5ever_rcdom::{Handle, RcDom}; use tokio::io::{AsyncRead, AsyncReadExt}; -struct TokenFilter { - sink: Sink, -} - -impl TokenSink for TokenFilter { - type Handle = Sink::Handle; - - fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult { - let close_tag = match token { - Token::TagToken(Tag { - kind: TagKind::StartTag, - name: ref tag_name, - .. - }) if tag_name.eq_str_ignore_ascii_case("ref") => Some(Tag { - kind: TagKind::EndTag, - name: tag_name.clone(), - self_closing: false, - attrs: vec![], - }), - _ => None, - }; - match (self.sink.process_token(token, line_number), close_tag) { - (TokenSinkResult::Continue, Some(close_tag)) => self - .sink - .process_token(Token::TagToken(close_tag), line_number), - (result, _) => result, - } - } - - fn end(&mut self) { - self.sink.end() - } - - fn adjusted_current_node_present_but_not_in_html_namespace(&self) -> bool { - self.sink - .adjusted_current_node_present_but_not_in_html_namespace() - } -} - struct FilteredParser { - tokenizer: Tokenizer>>, + tokenizer: Tokenizer>, input_buffer: BufferQueue, } @@ -66,7 +22,7 @@ impl TendrilSink for FilteredParser { } fn error(&mut self, desc: Cow<'static, str>) { - self.tokenizer.sink.sink.sink.parse_error(desc) + self.tokenizer.sink.sink.parse_error(desc) } type Output = Sink::Output; @@ -75,7 +31,7 @@ impl TendrilSink for FilteredParser { while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} assert!(self.input_buffer.is_empty()); self.tokenizer.end(); - self.tokenizer.sink.sink.sink.finish() + 
self.tokenizer.sink.sink.finish() } } @@ -91,7 +47,7 @@ async fn parse_internal_async( tokenizer_opts: TokenizerOpts, mut r: R, ) -> io::Result { - let tok = Tokenizer::new(TokenFilter { sink: tb }, tokenizer_opts); + let tok = Tokenizer::new(tb, tokenizer_opts); let mut tendril_sink = FilteredParser { tokenizer: tok, input_buffer: BufferQueue::new(), @@ -173,32 +129,6 @@ pub(crate) mod tests { String::from_utf8(output).unwrap() } - #[tokio::test] - async fn test_treats_ref_as_void() -> io::Result<()> { - // Without the token filtering, the first ends up as the second's parent. - let document = - parse_document_async("".as_bytes()).await?; - assert_eq!( - serialize_for_test(&[document]), - ""); - Ok(()) - } - - #[tokio::test] - async fn test_treats_ref_as_void_in_fragments() -> io::Result<()> { - // Similar to the above, but in a fragment. - let document = parse_document_async("".as_bytes()).await?; - let body = document.children.borrow()[1].children.borrow()[1].clone(); - assert!(body.is_html_element(&local_name!("body"))); - let children = - parse_fragment_async(".".as_bytes(), &body).await?; - assert_eq!( - serialize_for_test(&children), - "." - ); - Ok(()) - } - #[tokio::test] async fn test_fragment_respects_context() -> io::Result<()> { // Checks that we have the appropriate insertion mode for the element diff --git a/src/serializer.rs b/src/serializer.rs deleted file mode 100644 index b4d062a3..00000000 --- a/src/serializer.rs +++ /dev/null @@ -1,50 +0,0 @@ -//! This module specializes the HTML5 serializer to omit , which is -//! treated as void by Wattsi. 
- -use std::io::{self, Write}; - -use html5ever::serialize::*; -use html5ever::{namespace_url, ns, QualName}; - -struct WattsiSerializer(HtmlSerializer); - -impl Serializer for WattsiSerializer { - fn start_elem<'a, AttrIter>(&mut self, name: QualName, attrs: AttrIter) -> io::Result<()> - where - AttrIter: Iterator>, - { - self.0.start_elem(name, attrs) - } - - fn end_elem(&mut self, name: QualName) -> io::Result<()> { - if name.ns == ns!(html) && &name.local == "ref" { - return Ok(()); - } - self.0.end_elem(name) - } - - fn write_text(&mut self, text: &str) -> io::Result<()> { - self.0.write_text(text) - } - - fn write_comment(&mut self, text: &str) -> io::Result<()> { - self.0.write_comment(text) - } - - fn write_doctype(&mut self, name: &str) -> io::Result<()> { - self.0.write_doctype(name) - } - - fn write_processing_instruction(&mut self, target: &str, data: &str) -> io::Result<()> { - self.0.write_processing_instruction(target, data) - } -} - -pub fn serialize(writer: Wr, node: &T, opts: SerializeOpts) -> io::Result<()> -where - Wr: Write, - T: Serialize, -{ - let mut ser = WattsiSerializer(HtmlSerializer::new(writer, opts.clone())); - node.serialize(&mut ser, opts.traversal_scope) -} From 19b6e8e33601c1eba92bdb49c2cd3b351f131b4d Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Sun, 6 Aug 2023 20:17:58 +0900 Subject: [PATCH 09/28] Apply cargo fix --- src/annotate_attributes.rs | 2 +- src/interface_index.rs | 2 +- src/tag_omission.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/annotate_attributes.rs b/src/annotate_attributes.rs index 8e5e5608..00720847 100644 --- a/src/annotate_attributes.rs +++ b/src/annotate_attributes.rs @@ -150,7 +150,7 @@ impl Processor { .collect(), variant: variant_str, }; - let mut existing = self.attributes.entry(attr_key).or_default(); + let existing = self.attributes.entry(attr_key).or_default(); if existing.default.is_empty() { existing.default = descriptions.default; } else if 
!descriptions.default.is_empty() { diff --git a/src/interface_index.rs b/src/interface_index.rs index 49bdadf4..ffbdd48a 100644 --- a/src/interface_index.rs +++ b/src/interface_index.rs @@ -76,7 +76,7 @@ impl Processor { continue; } let name = window[1].text_content(); - let mut info = self.interfaces.entry(name).or_default(); + let info = self.interfaces.entry(name).or_default(); if is_partial { if let Some(id) = window[1].get_attribute(&ID) { info.partials.push(id); diff --git a/src/tag_omission.rs b/src/tag_omission.rs index 5f5a97be..197ed123 100644 --- a/src/tag_omission.rs +++ b/src/tag_omission.rs @@ -75,7 +75,7 @@ impl Processor { for dd in dom_utils::dt_descriptions(&dt) { dom_utils::scan_dom(&dd, &mut |n| { if n.is_html_element(&local_name!("code")) { - let mut info = self.elements.entry(n.text_content()).or_default(); + let info = self.elements.entry(n.text_content()).or_default(); info.is_void_element = true; } }); From cff9c938c1395768c438a1fef5548f69f826a9e0 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Mon, 7 Aug 2023 20:42:31 +0900 Subject: [PATCH 10/28] Change method name to avoid clippy suppression --- src/parser.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 2ec15d88..6097df0c 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -36,8 +36,7 @@ impl TendrilSink for FilteredParser { } impl FilteredParser { - #[allow(clippy::wrong_self_convention)] - fn from_utf8(self) -> Utf8LossyDecoder { + fn into_utf8(self) -> Utf8LossyDecoder { Utf8LossyDecoder::new(self) } } @@ -52,7 +51,7 @@ async fn parse_internal_async( tokenizer: tok, input_buffer: BufferQueue::new(), } - .from_utf8(); + .into_utf8(); // This draws on the structure of the sync tendril read_from. const BUFFER_SIZE: u32 = 128 * 1024; From f4f795926f8951db0069014a6831495b8c3eaa6b Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Tue, 8 Aug 2023 15:52:37 -0400 Subject: [PATCH 11/28] Remove FilteredParser. 
It was only different from html5ever::driver::Parser in that it used the filtered tokenizer. With that gone, the ordinary Parser struct works. --- src/parser.rs | 66 ++++++++++----------------------------------------- 1 file changed, 13 insertions(+), 53 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 6097df0c..2ab2f9a3 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,57 +1,17 @@ //! This module provides some mild integration between the html5ever parser and async I/O. -use std::borrow::Cow; use std::io; -use html5ever::buffer_queue::BufferQueue; -use html5ever::tendril::{self, stream::Utf8LossyDecoder, ByteTendril, StrTendril, TendrilSink}; -use html5ever::tokenizer::{Tokenizer, TokenizerOpts, TokenizerResult}; -use html5ever::tree_builder::{TreeBuilder, TreeSink}; +use html5ever::driver::{self, Parser}; +use html5ever::tendril::{ByteTendril, TendrilSink}; use markup5ever_rcdom::{Handle, RcDom}; use tokio::io::{AsyncRead, AsyncReadExt}; -struct FilteredParser { - tokenizer: Tokenizer>, - input_buffer: BufferQueue, -} - -impl TendrilSink for FilteredParser { - fn process(&mut self, t: StrTendril) { - self.input_buffer.push_back(t); - while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} - } - - fn error(&mut self, desc: Cow<'static, str>) { - self.tokenizer.sink.sink.parse_error(desc) - } - - type Output = Sink::Output; - - fn finish(mut self) -> Self::Output { - while let TokenizerResult::Script(_) = self.tokenizer.feed(&mut self.input_buffer) {} - assert!(self.input_buffer.is_empty()); - self.tokenizer.end(); - self.tokenizer.sink.sink.finish() - } -} - -impl FilteredParser { - fn into_utf8(self) -> Utf8LossyDecoder { - Utf8LossyDecoder::new(self) - } -} - async fn parse_internal_async( - tb: TreeBuilder, - tokenizer_opts: TokenizerOpts, + parser: Parser, mut r: R, ) -> io::Result { - let tok = Tokenizer::new(tb, tokenizer_opts); - let mut tendril_sink = FilteredParser { - tokenizer: tok, - input_buffer: 
BufferQueue::new(), - } - .into_utf8(); + let mut tendril_sink = parser.from_utf8(); // This draws on the structure of the sync tendril read_from. const BUFFER_SIZE: u32 = 128 * 1024; @@ -81,13 +41,13 @@ pub async fn parse_fragment_async( r: R, context: &Handle, ) -> io::Result> { - let tb = - TreeBuilder::new_for_fragment(RcDom::default(), context.clone(), None, Default::default()); - let tokenizer_opts = TokenizerOpts { - initial_state: Some(tb.tokenizer_state_for_context_elem()), - ..TokenizerOpts::default() - }; - let document = parse_internal_async(tb, tokenizer_opts, r).await?; + let parser = driver::parse_fragment_for_element( + RcDom::default(), + Default::default(), + context.clone(), + None, + ); + let document = parse_internal_async(parser, r).await?; let mut new_children = document.children.take()[0].children.take(); for new_child in new_children.iter_mut() { new_child.parent.take(); @@ -96,8 +56,8 @@ pub async fn parse_fragment_async( } pub async fn parse_document_async(r: R) -> io::Result { - let tb = TreeBuilder::new(RcDom::default(), Default::default()); - parse_internal_async(tb, TokenizerOpts::default(), r).await + let parser = driver::parse_document(RcDom::default(), Default::default()); + parse_internal_async(parser, r).await } #[cfg(test)] From 9d3ad67472c7dab0f7ecc90fd56376cdd346a288 Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Wed, 23 Aug 2023 17:31:56 -0400 Subject: [PATCH 12/28] two comments from @domfarolino --- src/main.rs | 2 +- src/parser.rs | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 8d08c6d7..bd034472 100644 --- a/src/main.rs +++ b/src/main.rs @@ -27,7 +27,7 @@ async fn main() -> io::Result<()> { // Because parsing can jump around the tree a little, it's most reasonable // to just parse the whole document before doing any processing. Even for - // the HTML5 specification, this doesn't take too long. + // the HTML standard, this doesn't take too long. 
let document = parser::parse_document_async(tokio::io::stdin()).await?; let mut boilerplate = boilerplate::Processor::new(cache_dir.clone(), source_dir.join("demos")); diff --git a/src/parser.rs b/src/parser.rs index 2ab2f9a3..a10de56d 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -14,6 +14,7 @@ async fn parse_internal_async( let mut tendril_sink = parser.from_utf8(); // This draws on the structure of the sync tendril read_from. + // https://docs.rs/tendril/latest/tendril/stream/trait.TendrilSink.html#method.read_from const BUFFER_SIZE: u32 = 128 * 1024; 'read: loop { let mut tendril = ByteTendril::new(); From d45608dea0c45d652be1a618487d489be84f608a Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Thu, 31 Aug 2023 14:02:20 +0900 Subject: [PATCH 13/28] Update Docker stuff --- .dockerignore | 3 +++ Dockerfile | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/.dockerignore b/.dockerignore index 7f80c3e4..216553b6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -4,3 +4,6 @@ !*.pl !build.sh !lint.sh +!Cargo.lock +!Cargo.toml +!src diff --git a/Dockerfile b/Dockerfile index 54b555bb..274f7b74 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,15 @@ +FROM rust:1.72 as builder +WORKDIR /whatwg/html-build +COPY . . +RUN cargo install --path . 
+ FROM debian:stable-slim RUN apt-get update && \ apt-get install --yes --no-install-recommends ca-certificates curl git python3 python3-pip pipx && \ rm -rf /var/lib/apt/lists/* +COPY --from=builder /usr/local/cargo/bin/html-build /bin/html-build + COPY --from=ghcr.io/whatwg/wattsi:latest /whatwg/wattsi/bin/wattsi /bin/wattsi ENV PIPX_HOME /opt/pipx From e9c53895c62519ff9870b2a0c2e5dae9e15efb65 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Thu, 31 Aug 2023 14:11:31 +0900 Subject: [PATCH 14/28] Better Dockerification --- Dockerfile | 4 +++- build.sh | 9 +++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 274f7b74..5133a031 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ FROM rust:1.72 as builder WORKDIR /whatwg/html-build -COPY . . +COPY Cargo.lock Cargo.toml ./ +COPY src ./src/ RUN cargo install --path . FROM debian:stable-slim @@ -19,4 +20,5 @@ RUN pipx install bs-highlighter COPY . /whatwg/html-build/ ENV SKIP_BUILD_UPDATE_CHECK true +ENV PROCESS_WITH_RUST true ENTRYPOINT ["bash", "/whatwg/html-build/build.sh"] diff --git a/build.sh b/build.sh index 8af67ccb..457a4c60 100755 --- a/build.sh +++ b/build.sh @@ -32,6 +32,7 @@ HTML_CACHE=${HTML_CACHE:-$DIR/.cache} HTML_TEMP=${HTML_TEMP:-$DIR/.temp} HTML_OUTPUT=${HTML_OUTPUT:-$DIR/output} HTML_GIT_CLONE_OPTIONS=${HTML_GIT_CLONE_OPTIONS:-"--depth=2"} +PROCESS_WITH_RUST=${PROCESS_WITH_RUST:-false} # These are used by child scripts, and so we export them export HTML_CACHE @@ -529,8 +530,12 @@ function processSource { BUILD_TYPE="$2" cp -p entities/out/entities.inc "$HTML_CACHE" cp -p entities/out/entities-dtd.url "$HTML_CACHE" - if [ "${PROCESS_WITH_RUST:-0}" = "1" ]; then - cargo run -r <"$HTML_SOURCE/$SOURCE_LOCATION" >"$HTML_TEMP/source-whatwg-complete" + if [ $PROCESS_WITH_RUST == "true" ]; then + if hash html-build 2>/dev/null; then + html-build <"$HTML_SOURCE/$SOURCE_LOCATION" >"$HTML_TEMP/source-whatwg-complete" + else + cargo run -r 
<"$HTML_SOURCE/$SOURCE_LOCATION" >"$HTML_TEMP/source-whatwg-complete" + fi else if $VERBOSE; then perl .pre-process-main.pl --verbose < "$HTML_SOURCE/$SOURCE_LOCATION" > "$HTML_TEMP/source-expanded-1" From b077027300046dd5434fb556cecfe83ebbef7f37 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Thu, 31 Aug 2023 14:14:23 +0900 Subject: [PATCH 15/28] Thanks shellcheck --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index 457a4c60..1c70b3d6 100755 --- a/build.sh +++ b/build.sh @@ -530,7 +530,7 @@ function processSource { BUILD_TYPE="$2" cp -p entities/out/entities.inc "$HTML_CACHE" cp -p entities/out/entities-dtd.url "$HTML_CACHE" - if [ $PROCESS_WITH_RUST == "true" ]; then + if [[ $PROCESS_WITH_RUST == "true" ]]; then if hash html-build 2>/dev/null; then html-build <"$HTML_SOURCE/$SOURCE_LOCATION" >"$HTML_TEMP/source-whatwg-complete" else From 732cdc1bb3717bec609849e8ecaf1677d1d627f1 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Thu, 31 Aug 2023 15:10:23 +0900 Subject: [PATCH 16/28] Build for release --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 5133a031..570f8352 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM rust:1.72 as builder WORKDIR /whatwg/html-build COPY Cargo.lock Cargo.toml ./ COPY src ./src/ -RUN cargo install --path . +RUN cargo install --path . 
--release FROM debian:stable-slim RUN apt-get update && \ From 91fd569329144acd9355a1730e9cbe26d1b16b0a Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Mon, 16 Oct 2023 16:56:52 -0400 Subject: [PATCH 17/28] warn on more than one copy of MARKER --- src/interface_index.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/interface_index.rs b/src/interface_index.rs index ffbdd48a..aa665aa7 100644 --- a/src/interface_index.rs +++ b/src/interface_index.rs @@ -104,6 +104,15 @@ impl Processor { format!("Marker {MARKER:?} not found."), )); } + if self.marker_nodes.len() > 1 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "{MARKER:?} found {} times, expected just one.", + self.marker_nodes.len() + ), + )); + } for marker in self.marker_nodes { // We need to find where the marker appears in the text so that we // can split it into two text nodes. From 6da2b82345ceb9d557b5cffaf64f7a8f72e2a32f Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Mon, 16 Oct 2023 16:59:07 -0400 Subject: [PATCH 18/28] only accept one marker for interface index --- src/interface_index.rs | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/interface_index.rs b/src/interface_index.rs index aa665aa7..a6bbf37f 100644 --- a/src/interface_index.rs +++ b/src/interface_index.rs @@ -339,14 +339,13 @@ partial interface HTMLMarqueeElement io::Result<()> { + async fn marker_before() -> io::Result<()> { let document = parse_document_async( r#" INSERT INTERFACES HERE

 interface HTMLMarqueeElement { ... }
 
-INSERT INTERFACES HERE "# .trim() .as_bytes(), @@ -361,8 +360,7 @@ INSERT INTERFACES HERE
  • HTMLMarqueeElement

 interface HTMLMarqueeElement { ... }
-
-
  • HTMLMarqueeElement
+ "## .trim() ); @@ -379,6 +377,19 @@ interface HTMLMarqueeElement { ... } Ok(()) } + #[tokio::test] + async fn duplicate_marker() -> io::Result<()> { + let document = parse_document_async( + "
INSERT INTERFACES HERE
INSERT INTERFACES HERE
".as_bytes(), + ) + .await?; + let mut proc = Processor::new(); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply(); + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } + #[tokio::test] async fn duplicate_dfn() -> io::Result<()> { let document = parse_document_async( From 369429935882b216a4509e07188d2da3df7ac4a9 Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Mon, 16 Oct 2023 17:12:22 -0400 Subject: [PATCH 19/28] add a comment requested by domfarolino --- src/interface_index.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/interface_index.rs b/src/interface_index.rs index a6bbf37f..7039fd9a 100644 --- a/src/interface_index.rs +++ b/src/interface_index.rs @@ -13,6 +13,7 @@ use crate::dom_utils::NodeHandleExt; #[derive(Default, Debug)] struct InterfaceInfo { /// Number of times the interface definition was seen. Should be one. + /// We store other numbers for convenience in error handling and reporting. seen: u32, /// The IDs of the partial interfaces, in the order they appear in the document. From 3c5b2eb5eaa0a2ad23bc04fa434c82dd5e97d1e9 Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Mon, 16 Oct 2023 17:30:40 -0400 Subject: [PATCH 20/28] error rather than ignore unsafe paths --- src/boilerplate.rs | 63 +++++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/src/boilerplate.rs b/src/boilerplate.rs index 47aa587c..87801a3a 100644 --- a/src/boilerplate.rs +++ b/src/boilerplate.rs @@ -53,10 +53,15 @@ impl Processor { // demand. 
NodeData::Comment { ref contents } if contents.starts_with("BOILERPLATE ") => { let path = Path::new(contents[12..].trim()); - if is_safe_path(path) { - let file = tokio::spawn(File::open(self.path.join(path))); - self.edits.push(Edit::ReplaceHTML(node.clone(), file)); - } + let file = if is_safe_path(path) { + tokio::spawn(File::open(self.path.join(path))) + } else { + async_error(io::Error::new( + io::ErrorKind::PermissionDenied, + "cannot traverse to a parent directory in {path}", + )) + }; + self.edits.push(Edit::ReplaceHTML(node.clone(), file)); } // Pseudo-comments can also appear in element attributes. These are // not parsed as HTML, so we simply want to read them into memory so @@ -69,14 +74,19 @@ impl Processor { { if value.starts_with("") { let path = Path::new(value[16..value.len() - 3].trim()); - if is_safe_path(path) { - let file_contents = read_to_str_tendril(self.path.join(path)); - self.edits.push(Edit::ReplaceAttr( - node.clone(), - name.clone(), - file_contents, - )); - } + let file_contents = if is_safe_path(path) { + read_to_str_tendril(self.path.join(path)) + } else { + async_error(io::Error::new( + io::ErrorKind::PermissionDenied, + "cannot traverse to a parent directory in {path}", + )) + }; + self.edits.push(Edit::ReplaceAttr( + node.clone(), + name.clone(), + file_contents, + )); } } } @@ -98,11 +108,16 @@ impl Processor { }); if has_suitable_parent { let path = Path::new(text[8..].trim()); - if is_safe_path(path) { - let file_contents = read_to_str_tendril(self.example_path.join(path)); - self.edits - .push(Edit::ReplaceText(node.clone(), file_contents)) - } + let file_contents = if is_safe_path(path) { + read_to_str_tendril(self.example_path.join(path)) + } else { + async_error(io::Error::new( + io::ErrorKind::PermissionDenied, + "cannot traverse to a parent directory in {path}", + )) + }; + self.edits + .push(Edit::ReplaceText(node.clone(), file_contents)) } } _ => (), @@ -155,6 +170,11 @@ fn read_to_str_tendril(path: impl AsRef) -> 
JoinHandle(err: io::Error) -> JoinHandle> { + tokio::spawn(async move { Err(err) }) +} + #[cfg(test)] mod tests { use super::*; @@ -220,17 +240,14 @@ mod tests { } #[tokio::test] - async fn test_ignores_unsafe_paths() -> io::Result<()> { + async fn test_errors_unsafe_paths() -> io::Result<()> { let document = parse_document_async("
\">EXAMPLE ../foo
".as_bytes()) .await?; let mut proc = Processor::new(Path::new("."), Path::new(".")); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); - assert_eq!(proc.edits.len(), 0); - proc.apply().await?; - assert_eq!( - serialize_for_test(&[document]), - "
\">EXAMPLE ../foo
"); + let result = proc.apply().await; + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::PermissionDenied)); Ok(()) } } From cf304a07317b693b89f3321f30ef8c0fa93b9a61 Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Mon, 16 Oct 2023 17:33:32 -0400 Subject: [PATCH 21/28] make absolute and .. paths an error rather than ignored --- src/boilerplate.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/boilerplate.rs b/src/boilerplate.rs index 87801a3a..aae1be99 100644 --- a/src/boilerplate.rs +++ b/src/boilerplate.rs @@ -241,13 +241,18 @@ mod tests { #[tokio::test] async fn test_errors_unsafe_paths() -> io::Result<()> { - let document = - parse_document_async("
\">EXAMPLE ../foo
".as_bytes()) - .await?; - let mut proc = Processor::new(Path::new("."), Path::new(".")); - dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); - let result = proc.apply().await; - assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::PermissionDenied)); + let bad_path_examples = [ + "", + "
\">
", + "
EXAMPLE ../foo
", + ]; + for example in bad_path_examples { + let document = parse_document_async(example.as_bytes()).await?; + let mut proc = Processor::new(Path::new("."), Path::new(".")); + dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); + let result = proc.apply().await; + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::PermissionDenied)); + } Ok(()) } } From 401b9978e83b1715211fe3f0a7894acdc8ab515b Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Mon, 16 Oct 2023 22:05:45 -0400 Subject: [PATCH 22/28] io utils --- src/boilerplate.rs | 24 ++----------------- src/io_utils.rs | 57 ++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 1 + 3 files changed, 60 insertions(+), 22 deletions(-) create mode 100644 src/io_utils.rs diff --git a/src/boilerplate.rs b/src/boilerplate.rs index aae1be99..04677b4d 100644 --- a/src/boilerplate.rs +++ b/src/boilerplate.rs @@ -6,13 +6,14 @@ use std::io; use std::path::{Path, PathBuf}; -use html5ever::tendril::{self, SendTendril, StrTendril}; +use html5ever::tendril::{self, SendTendril}; use html5ever::{local_name, Attribute, LocalName, QualName}; use markup5ever_rcdom::{Handle, NodeData}; use tokio::fs::File; use tokio::task::JoinHandle; use crate::dom_utils::NodeHandleExt; +use crate::io_utils::{is_safe_path, read_to_str_tendril, async_error}; use crate::parser; type SendStrTendril = SendTendril; @@ -154,27 +155,6 @@ impl Processor { } } -/// Check that a path is safe to open, even if the source is potentially untrusted. -fn is_safe_path(path: &Path) -> bool { - use std::path::Component; - path.components() - .all(|c| matches!(c, Component::Normal(_) | Component::CurDir)) -} - -/// In a spawned task, read to a string, then move it to a tendril. 
-fn read_to_str_tendril(path: impl AsRef) -> JoinHandle> { - let path = path.as_ref().to_owned(); - tokio::spawn(async move { - let string = tokio::fs::read_to_string(path).await?; - Ok(StrTendril::from(string).into_send()) - }) -} - -/// Creates a join Handle for an error -fn async_error(err: io::Error) -> JoinHandle> { - tokio::spawn(async move { Err(err) }) -} - #[cfg(test)] mod tests { use super::*; diff --git a/src/io_utils.rs b/src/io_utils.rs new file mode 100644 index 00000000..f96a06a4 --- /dev/null +++ b/src/io_utils.rs @@ -0,0 +1,57 @@ +//! Miscellaneous utilities for I/O. + +use std::io; +use std::path::Path; + +use html5ever::tendril::{self, SendTendril, StrTendril}; +use tokio::task::JoinHandle; + +type SendStrTendril = SendTendril; + +/// Check that a path is safe to open, even if the source is potentially untrusted. +pub fn is_safe_path(path: impl AsRef) -> bool { + use std::path::Component; + path.as_ref() + .components() + .all(|c| matches!(c, Component::Normal(_) | Component::CurDir)) +} + +/// In a spawned task, read to a string, then move it to a tendril. +pub fn read_to_str_tendril(path: impl AsRef) -> JoinHandle> { + let path = path.as_ref().to_owned(); + tokio::spawn(async move { + let string = tokio::fs::read_to_string(path).await?; + Ok(StrTendril::from(string).into_send()) + }) +} + +/// Creates a join Handle for an error. Useful when an operation will fail, but +/// it's more convenient to handle later on. &#13;
+pub fn async_error(err: io::Error) -> JoinHandle> { + tokio::spawn(async move { Err(err) }) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_is_safe_path() { + assert!(is_safe_path("a.txt")); + assert!(is_safe_path("a/b.txt")); + assert!(is_safe_path("a/b/c/./d.txt")); + assert!(!is_safe_path("../parent.txt")); + assert!(!is_safe_path("/etc/passwd")); + } + + #[tokio::test] + async fn test_read_to_str_tendril() -> io::Result<()> { + let temp_dir = TempDir::new()?; + let file_path = temp_dir.path().join("a.txt"); + tokio::fs::write(&file_path, "Hello, world!").await?; + let send_tendril = read_to_str_tendril(&file_path).await??; + assert_eq!(StrTendril::from(send_tendril).as_ref(), "Hello, world!"); + Ok(()) + } +} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index bd034472..975d24ef 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,6 +12,7 @@ mod annotate_attributes; mod boilerplate; mod dom_utils; mod interface_index; +mod io_utils; mod parser; mod represents; mod tag_omission; From e9f39c3db7bcb5f5bc475d9aa1a448195743318d Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Mon, 16 Oct 2023 22:13:36 -0400 Subject: [PATCH 23/28] move heading_level into dom_utils --- src/dom_utils.rs | 17 +++++++++++++++++ src/tag_omission.rs | 19 +------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/dom_utils.rs b/src/dom_utils.rs index 95894515..34b07aeb 100644 --- a/src/dom_utils.rs +++ b/src/dom_utils.rs @@ -183,6 +183,23 @@ pub fn dt_descriptions(dt: &Handle) -> Vec { } } +/// Returns the heading level (from 1 to 6) that the

through

declares, or None for all other nodes. +pub fn heading_level(node: &Handle) -> Option { + let local = match node.data { + NodeData::Element { ref name, .. } if name.ns == ns!(html) => &name.local, + _ => return None, + }; + match *local { + local_name!("h1") => Some(1), + local_name!("h2") => Some(2), + local_name!("h3") => Some(3), + local_name!("h4") => Some(4), + local_name!("h5") => Some(5), + local_name!("h6") => Some(6), + _ => None, + } +} + impl NodeHandleExt for Handle { fn parent_node(&self) -> Option { let weak_parent = self.parent.take()?; diff --git a/src/tag_omission.rs b/src/tag_omission.rs index 197ed123..edbb80a8 100644 --- a/src/tag_omission.rs +++ b/src/tag_omission.rs @@ -11,7 +11,7 @@ use html5ever::{local_name, namespace_url, ns, LocalName, QualName}; use markup5ever_rcdom::{Handle, NodeData}; use regex::Regex; -use crate::dom_utils::{self, NodeHandleExt}; +use crate::dom_utils::{self, heading_level, NodeHandleExt}; #[derive(Default)] struct ElementInfo { @@ -197,23 +197,6 @@ impl Processor { } } -/// Returns the heading level (from 1 to 6) that the

through

declares, or None for all other nodes. -fn heading_level(node: &Handle) -> Option { - let local = match node.data { - NodeData::Element { ref name, .. } if name.ns == ns!(html) => &name.local, - _ => return None, - }; - match *local { - local_name!("h1") => Some(1), - local_name!("h2") => Some(2), - local_name!("h3") => Some(3), - local_name!("h4") => Some(4), - local_name!("h5") => Some(5), - local_name!("h6") => Some(6), - _ => None, - } -} - #[cfg(test)] mod tests { use super::*; From 7b438db8ab0dadf89122aec99488d48850d5be72 Mon Sep 17 00:00:00 2001 From: Jeremy Roman Date: Mon, 16 Oct 2023 22:13:49 -0400 Subject: [PATCH 24/28] format --- src/boilerplate.rs | 2 +- src/io_utils.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/boilerplate.rs b/src/boilerplate.rs index 04677b4d..faab19f2 100644 --- a/src/boilerplate.rs +++ b/src/boilerplate.rs @@ -13,7 +13,7 @@ use tokio::fs::File; use tokio::task::JoinHandle; use crate::dom_utils::NodeHandleExt; -use crate::io_utils::{is_safe_path, read_to_str_tendril, async_error}; +use crate::io_utils::{async_error, is_safe_path, read_to_str_tendril}; use crate::parser; type SendStrTendril = SendTendril; diff --git a/src/io_utils.rs b/src/io_utils.rs index f96a06a4..d6a329d2 100644 --- a/src/io_utils.rs +++ b/src/io_utils.rs @@ -54,4 +54,4 @@ mod tests { assert_eq!(StrTendril::from(send_tendril).as_ref(), "Hello, world!"); Ok(()) } -} \ No newline at end of file +} From 4398c55dbc0929b9424cf8574ab96fa92d466431 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Mon, 23 Oct 2023 14:32:03 +0900 Subject: [PATCH 25/28] Cargo.toml fixups --- Cargo.toml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6f9f7202..ba472022 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,13 +1,16 @@ [package] name = "html-build" -version = "0.1.0" +version = "0.0.0" +publish = false edition = "2021" # See more keys and their definitions at 
https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] tokio = { version = "1", features = ["full"] } -html5ever = "*" -markup5ever_rcdom = "*" +html5ever = "0.26.0" +markup5ever_rcdom = "0.2.0" +regex = "1" + +[dev-dependencies] tempfile = "3" -regex = "1" \ No newline at end of file From 438f9cee7aa3cb64bd6b52dcb361570de50d8ba9 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Mon, 23 Oct 2023 14:36:19 +0900 Subject: [PATCH 26/28] Invocation changes --- Cargo.lock | 2 +- build.sh | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c7917763..3e3aca45 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,7 +109,7 @@ checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" [[package]] name = "html-build" -version = "0.1.0" +version = "0.0.0" dependencies = [ "html5ever", "markup5ever_rcdom", diff --git a/build.sh b/build.sh index 1c70b3d6..14044d20 100755 --- a/build.sh +++ b/build.sh @@ -534,7 +534,10 @@ function processSource { if hash html-build 2>/dev/null; then html-build <"$HTML_SOURCE/$SOURCE_LOCATION" >"$HTML_TEMP/source-whatwg-complete" else - cargo run -r <"$HTML_SOURCE/$SOURCE_LOCATION" >"$HTML_TEMP/source-whatwg-complete" + CARGO_ARGS=() + $VERBOSE && CARGO_ARGS+=( --verbose ) + $QUIET && CARGO_ARGS+=( --quiet ) + cargo run "${CARGO_ARGS[@]}" --release <"$HTML_SOURCE/$SOURCE_LOCATION" >"$HTML_TEMP/source-whatwg-complete" fi else if $VERBOSE; then From 831cb062f7a75dad6f2508a6eeb422080aaf28d2 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Mon, 23 Oct 2023 14:38:31 +0900 Subject: [PATCH 27/28] Release is cargo install default apparently --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 570f8352..5133a031 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM rust:1.72 as builder WORKDIR /whatwg/html-build COPY Cargo.lock Cargo.toml ./ COPY src ./src/ -RUN cargo install --path . &#13;
--release +RUN cargo install --path . FROM debian:stable-slim RUN apt-get update && \ From bd57b95822d847a8bf1e0d3a61bddaaa1f94ca17 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Mon, 23 Oct 2023 14:41:27 +0900 Subject: [PATCH 28/28] More Dockerfile tweaks --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 5133a031..5dd646b6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.72 as builder +FROM rust:1.73-slim as builder WORKDIR /whatwg/html-build COPY Cargo.lock Cargo.toml ./ COPY src ./src/