diff --git a/Cargo.lock b/Cargo.lock index 028082fc..d2534777 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,21 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + [[package]] name = "ahash" version = "0.8.11" @@ -23,6 +38,17 @@ dependencies = [ "memchr", ] +[[package]] +name = "aligned_media" +version = "0.1.0" +dependencies = [ + "isolang", + "peg", + "serde", + "serde_json", + "thiserror", +] + [[package]] name = "allocator-api2" version = "0.2.16" @@ -79,9 +105,54 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.80" +version = "1.0.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" + +[[package]] +name = "async-convert" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d416feee97712e43152cd42874de162b8f9b77295b1c85e5d92725cc8310bae" +dependencies = [ + "async-trait", +] + +[[package]] +name = "async-openai" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6791601019a2f14a6e5d3f7da849b5ff1594ef5c8da5492d63214c220449270c" +dependencies = [ + "async-convert", + "backoff", + "base64", + "bytes", + "derive_builder", + "futures", + "rand", + "reqwest", + "reqwest-eventsource", + "secrecy", + "serde", + "serde_json", + "thiserror", + "tokio", + "tokio-stream", + "tokio-util", + "tracing", +] + +[[package]] +name = "async-trait" +version = "0.1.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" +checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] [[package]] name = "autocfg" @@ -89,6 +160,47 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "backoff" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b62ddb9cb1ec0a098ad4bbf9344d0713fa193ae1a80af55febcff2627b6a00c1" +dependencies = [ + "futures-core", + "getrandom", + "instant", + "pin-project-lite", + "rand", + "tokio", +] + +[[package]] +name = "backtrace" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.4.2" @@ -104,12 +216,30 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bumpalo" +version = "3.15.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ff69b9dd49fd426c69a0db9fc04dd934cdb6645ff000864d98f7e2af8830eaa" + +[[package]] +name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" + [[package]] name = "cast" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "cc" +version = "1.0.90" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" + [[package]] name = "cfg-if" version = "1.0.0" @@ -124,9 +254,9 @@ checksum = "1a48563284b67c003ba0fb7243c87fab68885e1532c605704228a80238512e31" [[package]] name = "clap" -version = "4.5.1" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c918d541ef2913577a0f9566e9ce27cb35b6df072075769e0b26cb5a554520da" +checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" dependencies = [ "clap_builder", "clap_derive", @@ -134,14 +264,14 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.1" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f3e7391dad68afb0c2ede1bf619f579a3dc9c2ec67f089baa397123a2f3d1eb" +checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" dependencies = [ "anstream", "anstyle", "clap_lex", - "strsim", + "strsim 0.11.0", "terminal_size", ] @@ -174,28 +304,42 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" [[package]] -name = "cpufeatures" -version = "0.2.12" +name = "console" +version = "0.15.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" dependencies = [ + "encode_unicode", + "lazy_static", "libc", + "unicode-width", + "windows-sys 0.52.0", ] [[package]] -name = "crossbeam-channel" -version = "0.5.12" +name = "core-foundation" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab3db02a9c5b5121e1e42fbdb1aeb65f5e02624cc58c43f2884c6ccac0b82f95" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" dependencies = [ - "crossbeam-utils", + "core-foundation-sys", + "libc", ] [[package]] -name = "crossbeam-utils" -version = "0.8.19" +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + +[[package]] +name = "cpufeatures" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +dependencies = [ + "libc", +] [[package]] name = "crypto-common" @@ -228,6 +372,72 @@ dependencies = [ "memchr", ] +[[package]] +name = "darling" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn 1.0.109", +] + +[[package]] +name = "darling_macro" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +dependencies = [ + "darling_core", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_builder" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_builder_macro" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" +dependencies = [ + "derive_builder_core", + "syn 1.0.109", +] + [[package]] name = "diff" version = "0.1.13" @@ -250,6 +460,18 @@ dependencies = [ "crypto-common", ] +[[package]] +name = "dotenv" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f" + +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + [[package]] name = "encoding" version = "0.2.33" @@ -314,6 +536,15 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] + [[package]] name = "env_filter" version = "0.1.0" @@ -326,9 +557,9 @@ dependencies = [ [[package]] name = "env_logger" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" +checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" dependencies = [ "anstream", "anstyle", @@ -337,6 +568,12 @@ dependencies = [ "log", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.8" @@ -347,6 +584,17 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "eventsource-stream" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74fef4569247a5f429d9156b9d0a2599914385dd189c539334c625d8099d90ab" +dependencies = [ + "futures-core", + "nom", + "pin-project-lite", +] + [[package]] name = "ext-trait" version = "1.0.1" @@ -377,155 +625,504 @@ dependencies = [ ] [[package]] -name = "generic-array" -version = "0.14.7" +name = "fnv" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] -name = "handlebars" -version = "5.1.0" +name = "form_urlencoded" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab283476b99e66691dee3f1640fea91487a8d81f50fb5ecc75538f8f8879a1e4" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" dependencies = [ - "log", - "pest", - "pest_derive", - "serde", - "serde_json", - "thiserror", + "percent-encoding", ] [[package]] -name = "hashbrown" -version = "0.14.3" +name = "futures" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ - "ahash", - "allocator-api2", + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", ] [[package]] -name = "heck" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - -[[package]] -name = "humantime" -version = "2.1.0" +name = "futures-channel" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] [[package]] -name = "itoa" -version = "1.0.10" +name = "futures-core" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" [[package]] -name = "lazy_static" -version = "1.4.0" +name = "futures-executor" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] [[package]] -name = "lending-iterator" -version = "0.1.7" +name = "futures-io" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc07588c853b50689205fb5c00498aa681d89828e0ce8cbd965ebc7a5d8ae260" -dependencies = [ - "extension-traits", - "lending-iterator-proc_macros", - "macro_rules_attribute", - "never-say-never", - "nougat", - "polonius-the-crab", -] +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" [[package]] -name = "lending-iterator-proc_macros" -version = "0.1.7" +name = "futures-macro" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5445dd1c0deb1e97b8a16561d17fc686ca83e8411128fb036e9668a72d51b1d" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] -name = "libc" -version = "0.2.153" +name = "futures-sink" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] -name = "linux-raw-sys" -version = "0.4.13" +name = "futures-task" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" [[package]] -name = "log" -version = "0.4.21" +name = "futures-timer" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] -name = "macro_rules_attribute" -version = "0.1.3" +name = "futures-util" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf0c9b980bf4f3a37fd7b1c066941dd1b1d0152ce6ee6e8fe8c49b9f6810d862" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ - "macro_rules_attribute-proc_macro", - "paste", + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", ] [[package]] -name = "macro_rules_attribute-proc_macro" -version = "0.1.3" +name = "generic-array" +version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] [[package]] -name = "memchr" -version = "2.7.1" +name = "getrandom" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] [[package]] -name = "never-say-never" -version = "6.6.666" +name = "gimli" +version = "0.28.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" [[package]] -name = "nougat" -version = "0.2.4" +name = "h2" +version = "0.3.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b57b9ced431322f054fc673f1d3c7fa52d80efd9df74ad2fc759f044742510" +checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9" dependencies = [ - "macro_rules_attribute", - "nougat-proc_macros", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", ] [[package]] -name = "nougat-proc_macros" -version = "0.2.4" +name = "handlebars" +version = "5.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c84f77a45e99a2f9b492695d99e1c23844619caa5f3e57647cffacad773ca257" +checksum = "ab283476b99e66691dee3f1640fea91487a8d81f50fb5ecc75538f8f8879a1e4" dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - + "log", + "pest", + "pest_derive", + "serde", + "serde_json", + "thiserror", +] + +[[package]] +name = "hashbrown" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +dependencies = [ + "ahash", + "allocator-api2", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + +[[package]] +name = "hermit-abi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "hyper" +version = "0.14.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf96e135eb83a2a8ddf766e426a841d8ddd7449d5f00d34ea02b41d2f19eef80" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http", + "hyper", + "rustls", + "tokio", + "tokio-rustls", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "2.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "indicatif" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" +dependencies = [ + "console", + "instant", + "number_prefix", + "portable-atomic", + "unicode-width", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "ipnet" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" + +[[package]] +name = "isolang" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe50d48c77760c55188549098b9a7f6e37ae980c586a24693d6b01c3b2010c3c" +dependencies = [ + "phf", + "serde", +] + +[[package]] +name = "itoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" + +[[package]] +name = "js-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "lending-iterator" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc07588c853b50689205fb5c00498aa681d89828e0ce8cbd965ebc7a5d8ae260" +dependencies = [ + "extension-traits", + "lending-iterator-proc_macros", + "macro_rules_attribute", + "never-say-never", + "nougat", + "polonius-the-crab", +] + +[[package]] +name = "lending-iterator-proc_macros" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5445dd1c0deb1e97b8a16561d17fc686ca83e8411128fb036e9668a72d51b1d" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "libc" +version = "0.2.153" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" + +[[package]] +name = "linux-raw-sys" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" + +[[package]] +name = "log" +version = "0.4.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" + +[[package]] +name = "macro_rules_attribute" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf0c9b980bf4f3a37fd7b1c066941dd1b1d0152ce6ee6e8fe8c49b9f6810d862" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d" + +[[package]] +name = "memchr" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "mime_guess" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.48.0", +] + +[[package]] +name = "never-say-never" +version = "6.6.666" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "nougat" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97b57b9ced431322f054fc673f1d3c7fa52d80efd9df74ad2fc759f044742510" +dependencies = [ + "macro_rules_attribute", + "nougat-proc_macros", +] + +[[package]] +name = "nougat-proc_macros" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c84f77a45e99a2f9b492695d99e1c23844619caa5f3e57647cffacad773ca257" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "num" version = "0.4.1" @@ -601,12 +1198,43 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + [[package]] name = "ordered-float" version = "4.2.0" @@ -628,17 +1256,6 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" -[[package]] -name = "pbr" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed5827dfa0d69b6c92493d6c38e633bbaa5937c153d0d7c28bf12313f8c6d514" -dependencies = [ - "crossbeam-channel", - "libc", - "winapi", -] - [[package]] name = "peg" version = "0.8.2" @@ -666,6 +1283,12 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36bae92c60fa2398ce4678b98b2c4b5a7c61099961ca1fa305aec04a9ad28922" +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + [[package]] name = "pest" version = "2.7.8" @@ -701,87 +1324,335 @@ dependencies = [ ] [[package]] -name = "pest_meta" -version = "2.7.8" +name = "pest_meta" +version = "2.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "934cd7631c050f4674352a6e835d5f6711ffbfb9345c2fc0107155ac495ae293" +dependencies = [ + "once_cell", + "pest", + "sha2", +] + +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "polonius-the-crab" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a69ee997a6282f8462abf1e0d8c38c965e968799e912b3bed8c9e8a28c2f9f" + +[[package]] +name = "portable-atomic" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "proc-macro2" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "regex" +version = "1.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "reqwest" +version = "0.11.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78bf93c4af7a8bb7d879d51cebe797356ff10ae8516ace542b5182d9dcac10b2" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-rustls", + "ipnet", + "js-sys", + "log", + "mime", + "mime_guess", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "system-configuration", + "tokio", + "tokio-rustls", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "winreg", +] + +[[package]] +name = "reqwest-eventsource" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f03f570355882dd8d15acc3a313841e6e90eddbc76a93c748fd82cc13ba9f51" +dependencies = [ + "eventsource-stream", + "futures-core", + "futures-timer", + "mime", + "nom", + "pin-project-lite", + "reqwest", + "thiserror", +] + +[[package]] +name = "ring" +version = "0.17.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" +dependencies = [ + "cc", + "cfg-if", + "getrandom", + "libc", + "spin", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustix" +version = "0.38.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +dependencies = [ + "bitflags 2.4.2", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.21.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "934cd7631c050f4674352a6e835d5f6711ffbfb9345c2fc0107155ac495ae293" +checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" dependencies = [ - "once_cell", - "pest", - "sha2", + "log", + "ring", + "rustls-webpki", + "sct", ] [[package]] -name = "polonius-the-crab" -version = "0.2.1" +name = "rustls-native-certs" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2a69ee997a6282f8462abf1e0d8c38c965e968799e912b3bed8c9e8a28c2f9f" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe", + "rustls-pemfile", + "schannel", + "security-framework", +] [[package]] -name = "proc-macro2" -version = "1.0.78" +name = "rustls-pemfile" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" dependencies = [ - "unicode-ident", + "base64", ] [[package]] -name = "quote" -version = "1.0.35" +name = "rustls-webpki" +version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "proc-macro2", + "ring", + "untrusted", ] [[package]] -name = "regex" -version = "1.10.3" +name = "ryu" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" + +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", + "windows-sys 0.52.0", ] [[package]] -name = "regex-automata" -version = "0.4.6" +name = "sct" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", + "ring", + "untrusted", ] [[package]] -name = "regex-syntax" -version = "0.8.2" +name = "secrecy" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "9bd1c54ea06cfd2f6b63219704de0b9b4f72dcc2b8fdef820be6cd799780e91e" +dependencies = [ + "serde", + "zeroize", +] [[package]] -name = "rustix" -version = "0.38.31" +name = "security-framework" +version = "2.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" dependencies = [ - "bitflags", - "errno", + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", "libc", - "linux-raw-sys", - "windows-sys 0.52.0", + "security-framework-sys", ] [[package]] -name = "ryu" -version = "1.0.17" +name = "security-framework-sys" +version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +dependencies = [ + "core-foundation-sys", + "libc", +] [[package]] name = "serde" @@ -814,6 +1685,18 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + [[package]] name = "sha2" version = "0.10.8" @@ -825,6 +1708,43 @@ dependencies = [ "digest", ] +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = "socket2" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + [[package]] name = "strsim" version = "0.11.0" @@ -836,6 +1756,7 @@ name = "substudy" version = "0.5.1" dependencies = [ "anyhow", + "async-openai", "cast", "chardet", "clap", @@ -843,20 +1764,22 @@ dependencies = [ "csv", "diff", "difference", + "dotenv", "encoding", "env_logger", "handlebars", + "indicatif", "lazy_static", "lending-iterator", "log", "num", "ordered-float", "paragraph-breaker", - "pbr", "peg", "regex", "serde", "serde_json", + "tokio", "unicode-width", "whatlang", ] @@ -883,6 +1806,33 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "terminal_size" version = "0.3.0" @@ -895,24 +1845,145 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.57" +version = "1.0.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" +checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.57" +version = "1.0.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "pin-project-lite", + "socket2", + "tokio-macros", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-macros" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", "syn 2.0.52", ] +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", + "tracing", +] + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + [[package]] name = "typenum" version = "1.17.0" @@ -925,18 +1996,59 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9" +[[package]] +name = "unicase" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" +dependencies = [ + "version_check", +] + +[[package]] +name = "unicode-bidi" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" + [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-normalization" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" +dependencies = [ + "tinyvec", +] + [[package]] name = "unicode-width" version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + [[package]] name = "utf8parse" version = "0.2.1" @@ -950,36 +2062,118 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" [[package]] -name = "whatlang" -version = "0.16.4" +name = "want" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" dependencies = [ - "hashbrown", + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +dependencies = [ + "bumpalo", + "log", "once_cell", + "proc-macro2", + "quote", + "syn 2.0.52", + "wasm-bindgen-shared", ] [[package]] -name = "winapi" -version = "0.3.9" +name = "wasm-bindgen-futures" +version = "0.4.42" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", ] [[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" +name = "wasm-bindgen-macro" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.92" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" [[package]] -name = "winapi-x86_64-pc-windows-gnu" +name = "wasm-streams" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "whatlang" +version = "0.16.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0" +dependencies = [ + "hashbrown", + "once_cell", +] [[package]] name = "windows-sys" @@ -1113,6 +2307,16 @@ version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "zerocopy" version = "0.7.32" @@ -1132,3 +2336,9 @@ dependencies = [ "quote", "syn 2.0.52", ] + +[[package]] +name = "zeroize" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" diff --git a/Cargo.toml b/Cargo.toml index bd80099e..df59fa05 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ #opt-level = 3 [workspace] -members = ["cli_test_dir", "substudy"] +members = ["cli_test_dir", "aligned_media", "substudy"] # Removed until I can update `nom` over 5 major releases. # diff --git a/archived/aligned_media/Cargo.toml b/aligned_media/Cargo.toml similarity index 84% rename from archived/aligned_media/Cargo.toml rename to aligned_media/Cargo.toml index 8235918b..1ca14d32 100644 --- a/archived/aligned_media/Cargo.toml +++ b/aligned_media/Cargo.toml @@ -2,6 +2,7 @@ name = "aligned_media" version = "0.1.0" authors = ["Eric Kidd "] +edition = "2021" description = "Rust implementation of the \"aligned media\" format for language-learning software." license = "CC0-1.0" @@ -18,12 +19,9 @@ documentation = "https://docs.rs/aligned_media/" # for actual end-user software. no_forwards_compatibility = [] -[build-dependencies] -peg = "0.5" - [dependencies] -failure = "0.1.1" -isolang = { version = "0.2", features = ["serde_serialize"] } -serde = "1.0" -serde_derive = "1.0" +isolang = { version = "2.4.0", features = ["serde"] } +peg = "0.8.2" +serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +thiserror = "1.0.58" diff --git a/archived/aligned_media/README.md b/aligned_media/README.md similarity index 100% rename from archived/aligned_media/README.md rename to aligned_media/README.md diff --git a/archived/aligned_media/fixtures/examples/book_example.aligned/metadata.json b/aligned_media/fixtures/examples/book_example.aligned/metadata.json similarity index 100% rename from archived/aligned_media/fixtures/examples/book_example.aligned/metadata.json rename to aligned_media/fixtures/examples/book_example.aligned/metadata.json diff --git a/archived/aligned_media/fixtures/examples/subtitle_example.aligned/metadata.json b/aligned_media/fixtures/examples/subtitle_example.aligned/metadata.json similarity index 100% rename from archived/aligned_media/fixtures/examples/subtitle_example.aligned/metadata.json rename to aligned_media/fixtures/examples/subtitle_example.aligned/metadata.json diff --git a/archived/aligned_media/fixtures/examples/subtitle_extracted_example.aligned/metadata.json b/aligned_media/fixtures/examples/subtitle_extracted_example.aligned/metadata.json similarity index 100% rename from archived/aligned_media/fixtures/examples/subtitle_extracted_example.aligned/metadata.json rename to aligned_media/fixtures/examples/subtitle_extracted_example.aligned/metadata.json diff --git a/aligned_media/src/html.rs b/aligned_media/src/html.rs new file mode 100644 index 00000000..56763c81 --- /dev/null +++ b/aligned_media/src/html.rs @@ -0,0 +1,350 @@ +//! A very simple version of an HTML data model. This is designed to correspond +//! to the limited HTML features supported in some subtitle formats, and to be +//! easy to sanitize using a whitelist of supported tags and attributes. + +use serde::de::Error as DeError; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use std::collections::HashMap; +use std::error::Error as StdError; +use std::fmt; +use std::result; +use std::str::FromStr; + +use super::{Error, Result}; + +/// An HTML fragment. +#[derive(Clone, Debug, Default, Eq, PartialEq)] +#[non_exhaustive] +pub struct Fragment { + /// The HTML nodes in this fragment. + pub nodes: Vec, +} + +impl Fragment { + /// Create an HTML fragment from a plain text node, escaping any special + /// characters. + pub fn from_text>(text: S) -> Fragment { + let node = Node::Text { text: text.into() }; + Fragment { nodes: vec![node] } + } +} + +impl FromStr for Fragment { + type Err = Error; + + fn from_str(html: &str) -> Result { + Ok( + grammar::fragment(html).map_err(|err| Error::CouldNotParseHtml { + html: html.to_owned(), + source: Box::new(err), + })?, + ) + } +} + +impl fmt::Display for Fragment { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + for node in &self.nodes { + node.fmt(f)?; + } + Ok(()) + } +} + +impl<'de> Deserialize<'de> for Fragment { + fn deserialize>(d: D) -> result::Result { + let raw_html = String::deserialize(d)?; + raw_html.parse().map_err(|err: Error| { + // Make sure we get the entire cause chain for this error. + // + // TODO: Use a writer to format this, or just use `common_failures`. + let mut msg = String::new(); + let mut next: Option<&dyn StdError> = Some(&err); + while let Some(err) = next { + if !msg.is_empty() { + msg.push_str("\n caused by: "); + } + msg.push_str(&format!("{}", err)); + next = err.source() + } + D::Error::custom(msg) + }) + } +} + +impl Serialize for Fragment { + fn serialize(&self, serializer: S) -> result::Result + where + S: Serializer, + { + format!("{}", self).serialize(serializer) + } +} + +/// A DOM node in an HTML fragment. Note that we convert all character entities +/// to text nodes at parse time. +#[derive(Clone, Debug, Eq, PartialEq)] +#[non_exhaustive] +pub enum Node { + /// A regular text node. + #[non_exhaustive] + Text { + /// The text of this node. + text: String, + }, + + /// An HTML element, with possible attributes. + #[non_exhaustive] + Element { + /// The name of this element. + name: String, + + /// HTML element attributes. + attributes: Attributes, + + /// Child nodes. + children: Vec, + }, +} + +impl fmt::Display for Node { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // TODO - Verify all strings are in legal ranges before printing. + match *self { + Node::Text { ref text } => { + text.replace("&", "&").replace("<", "<").fmt(f)?; + } + Node::Element { + ref name, + ref attributes, + ref children, + } => { + write!(f, "<{}", name)?; + for (name, value) in attributes { + write!( + f, + " {}=\"{}\"", + name, + value.replace("&", "&").replace("\"", """), + )?; + } + write!(f, ">")?; + if name != "br" && name != "img" { + for child in children { + child.fmt(f)?; + } + write!(f, "", name)?; + } + } + } + Ok(()) + } +} + +/// HTML element attributes. +pub type Attributes = HashMap; + +fn parse_numeric_entity( + digits: &str, + radix: u32, +) -> result::Result { + let code = u32::from_str_radix(digits, radix) + .expect("parser should have required a valid number"); + if code == 0 { + return Err("no \"\\0\" characters allowed in HTML"); + } + ::std::char::from_u32(code).ok_or_else(|| "a valid UTF-8 character") +} + +peg::parser! { + grammar grammar() for str { + use std::collections::HashMap; + use std::iter::FromIterator; + + use super::{Fragment, Node, Attributes, parse_numeric_entity}; + + pub(crate) rule fragment() -> Fragment + = nodes:(node()*) { + Fragment { nodes } + } + + rule node() -> Node = text_node() / entity() / element() + + rule text_node() -> Node + = quiet!{ text:$([^ '<' | '&']+) { + Node::Text { text: text.to_owned() } + } } / expected!("text") + + rule entity() -> Node + = "&" text:( entity_named() / entity_hex() / entity_dec()) ";" { + Node::Text { text } + } + + rule entity_named() -> String + = name:$("lt" / "gt" / "amp" / "apos" / "quot") { + match name { + "lt" => "<".to_string(), + "gt" => ">".to_string(), + "amp" => "&".to_string(), + "apos" => "'".to_string(), + "quot" => "\"".to_string(), + _ => unreachable!("unknown named entity"), + } + } + + rule entity_hex() -> String + = "#x" digits:$(['0'..='9' | 'a'..='f' | 'A'..='F']+) {? + parse_numeric_entity(digits, 16).map(|c| c.to_string()) + } + + rule entity_dec() -> String + = "#" digits:$(['0'..='9']+) {? + parse_numeric_entity(digits, 10).map(|c| c.to_string()) + } + + rule element() -> Node + = empty_element() / start_end_element() + + rule empty_element() -> Node + = "<" name:$(i("br") / i("img")) attributes:attributes() _* "/"? ">" { + Node::Element { + name: name.to_owned(), + attributes, + children: vec![], + } + } + + rule start_end_element() -> Node + = start_tag:start_tag() children:(node()*) end_tag:end_tag() {? + let (name, attributes) = start_tag; + if name == end_tag { + Ok(Node::Element { + name, + attributes, + children, + }) + } else { + Err("start and end tags must match") + } + } + + rule start_tag() -> (String, Attributes) + = "<" name:$(name()) attributes:attributes() _* ">" { + (name.to_owned(), attributes) + } + + rule end_tag() -> String + = quiet!{ "" { + name.to_owned() + } } / expected!("end tag") + + rule attributes() -> Attributes + = attributes:(attribute()*) { + HashMap::from_iter(attributes) + } + + rule attribute() -> (String, String) + = quiet!{ _* name:$(name()) _* "=" _* att_value:att_value() { + (name.to_owned(), att_value) + } } / expected!("attribute") + + rule att_value() -> String + = att_value_double_quotes() / att_value_single_quotes() + + // TODO: Handle escaped "<" and "&" values, if we ever need anything more + // than "" support. + rule att_value_double_quotes() -> String + = "\"" text:$([^ '<' | '&' | '"']*) "\"" { + text.to_owned() + } + + // TODO: Handle escaped "<" and "&" values, if we ever need anything more + // than "" support. + rule att_value_single_quotes() -> String + = "'" text:$([^ '<' | '&' |'\'']*) "'" { + text.to_owned() + } + + rule name() = quiet!{['A'..='Z' | 'a'..='z' | ':' | '_']['-' | 'A'..='Z' | 'a'..='z' | '0'..='9' | ':' | '_' | '.' ]*} / expected!("identifier") + + /// Case insensitive match. + rule i(s: &'static str) + = found:$([_]*<{s.len()}>) {? + if found == s { + Ok(()) + } else { + Err(s) + } + } + + rule _ = quiet!{[' ' | '\r' | '\n' | '\t']+} + } +} + +#[test] +fn parse_valid_html_fragments() { + let good_examples = &[ + "Hello!", + "&<>"'@JJ", + "foo2

", + "34", + ]; + for example in good_examples { + let _parsed: Fragment = example.parse().expect("failed to parse plain_text"); + } +} + +#[test] +fn error_on_invalid_html_fragments() { + let bad_examples = &[ + "<", + "", + "", + "", + "", + "", + ]; + for example in bad_examples { + let result = example.parse::(); + match result { + Ok(fragment) => { + panic!( + "parsed {:?} as {:?}, but should have failed", + example, fragment, + ); + } + Err(err) => { + println!( + "correctly got error: {}\n caused by: {}", + err, + err.source().expect("error should have cause"), + ); + } + } + } +} + +#[test] +fn display_produces_canonical_string() { + let examples = &[ + ("Hello!", "Hello!"), + ( + "&<>"'@JJ", + "&<>\"'@JJ", + ), + ("12

", "12

"), + ( + "34", + "34", + ), + ]; + for &(input, expected) in examples { + let fragment = input.parse::().expect("could not parse HTML"); + assert_eq!(format!("{}", fragment), expected); + } + + let text_fragment = Fragment::from_text("A & B <"); + assert_eq!(format!("{}", text_fragment), "A & B <"); +} diff --git a/archived/aligned_media/src/lib.rs b/aligned_media/src/lib.rs similarity index 73% rename from archived/aligned_media/src/lib.rs rename to aligned_media/src/lib.rs index 3b0c9d5e..55e1ce8e 100644 --- a/archived/aligned_media/src/lib.rs +++ b/aligned_media/src/lib.rs @@ -5,63 +5,77 @@ //! //! [spec]: https://github.com/language-learners/aligned-media-spec -#[warn(missing_docs)] +#![warn(missing_docs)] -#[macro_use] -extern crate failure; -extern crate isolang; -extern crate serde; -#[macro_use] -extern crate serde_derive; -extern crate serde_json; - -use failure::ResultExt; -use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde::de::Error as DeError; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; use std::collections::HashMap; use std::result; +use thiserror::Error; pub mod html; +/// Our standard result type. +pub type Result = result::Result; + /// Errors which can be returned by this crate. -#[derive(Debug, Fail)] +#[derive(Debug, Error)] +#[non_exhaustive] pub enum Error { /// We could not parse the specified HTML. - #[fail(display = "could not parse HTML {:?}", html)] + #[error("could not parse HTML {html:?}")] + #[non_exhaustive] CouldNotParseHtml { + /// The HTML that we could not parse. html: String, + + /// The underlying error. + source: Box, }, /// We could not parse the input data. - #[fail(display = "could not parse metadata")] - CouldNotParseMetadata, + #[error("could not parse metadata")] + #[non_exhaustive] + CouldNotParseMetadata { + /// The underlying error. + source: Box, + }, /// We encountered an unsupported HTML attribute. - #[fail(display = "the HTML attribute {:?} is not allowed", name)] + #[error("the HTML attribute {name:?} is not allowed")] + #[non_exhaustive] HtmlAttributeForbidden { + /// The name of the forbidden attribute. name: String, }, /// We encountered an unsupported HTML element. - #[fail(display = "the HTML element {:?} is not allowed", name)] + #[error("the HTML element {name:?} is not allowed")] + #[non_exhaustive] HtmlElementForbidden { + /// The name of the forbidden element. name: String, }, /// We encountered an unsupported HTML entity. - #[fail(display = "the HTML entity {:?} is not allowed", name)] + #[error("the HTML entity {name:?} is not allowed")] + #[non_exhaustive] HtmlEntityForbidden { + /// The name of the forbidden entity. name: String, }, /// We encountered an invalid path. - #[fail(display = "path {:?} is not allowed", path)] + #[error("path {path:?} is not allowed")] + #[non_exhaustive] InvalidPath { + /// The invalid path. path: String, }, /// We encountered an invalid span. - #[fail(display = "beginning of time span {},{} is greater than end", begin, end)] + #[error("beginning of time span {begin},{end} is greater than end")] + #[non_exhaustive] InvalidSpan { /// The beginning of the invalid span. begin: f32, @@ -70,22 +84,23 @@ pub enum Error { }, /// We encountered an unknown track type that didn't begin with "x-". - #[fail(display = "unsupported track type {:?} (did you want to prefix it with \"x-\"?)", value)] + #[error( + "unsupported track type {value:?} (did you want to prefix it with \"x-\"?)" + )] + #[non_exhaustive] UnsupportedTrackType { /// The unknown track type. value: String, }, } -/// Type alias for results returned by our crate. -pub type Result = result::Result; - /// A single media file, typically an episode of a TV series, a film, an chapter /// of an audiobook. It might also be something more exotic, like a PDF of a /// graphic novel. #[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] -#[cfg_attr(feature="no_forwards_compatibility", serde(deny_unknown_fields))] +#[cfg_attr(feature = "no_forwards_compatibility", serde(deny_unknown_fields))] +#[non_exhaustive] pub struct Metadata { /// The title of a book, TV series, album, etc. This may be the same for /// multiple files if `section_number` and/or `section_title` are used. @@ -128,21 +143,21 @@ pub struct Metadata { /// Application-specific extension data. #[serde(default, skip_serializing_if = "ExtensionData::is_empty")] pub ext: ExtensionData, - - /// Placeholder to allow for future extensibility without breaking the API. - #[serde(default, skip_serializing)] - _placeholder: (), } impl Metadata { /// Parse `metadata.json` represented as raw bytes. This will be interpreted /// as UTF-8, because the format is strict. - pub fn from_bytes(data: &[u8]) -> result::Result { - Ok(serde_json::from_slice(data).context(Error::CouldNotParseMetadata)?) + pub fn from_bytes(data: &[u8]) -> Result { + Ok(serde_json::from_slice(data).map_err(|err| { + Error::CouldNotParseMetadata { + source: Box::new(err), + } + })?) } /// Parse `metadata.json` represented as a UTF-8 Rust string. - pub fn from_str(data: &str) -> result::Result { + pub fn from_str(data: &str) -> Result { Self::from_bytes(data.as_bytes()) } } @@ -152,11 +167,12 @@ fn parse_metadata() { let examples = &[ include_str!("../fixtures/examples/book_example.aligned/metadata.json"), include_str!("../fixtures/examples/subtitle_example.aligned/metadata.json"), - include_str!("../fixtures/examples/subtitle_extracted_example.aligned/metadata.json"), + include_str!( + "../fixtures/examples/subtitle_extracted_example.aligned/metadata.json" + ), ]; for example in examples { - Metadata::from_str(example) - .expect("failed to parse example metadata"); + Metadata::from_str(example).expect("failed to parse example metadata"); } } @@ -164,9 +180,10 @@ fn parse_metadata() { /// single language, or a still image taken from a video #[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] -#[cfg_attr(feature="no_forwards_compatibility", serde(deny_unknown_fields))] +#[cfg_attr(feature = "no_forwards_compatibility", serde(deny_unknown_fields))] +#[non_exhaustive] pub struct Track { - // The kind of data stored in this track. + /// The kind of data stored in this track. #[serde(rename = "type")] pub track_type: TrackType, @@ -175,16 +192,15 @@ pub struct Track { /// included in ISO 639-1. If this is omitted, then programs may assume that /// this track might be something like a still image from a video or an /// illustration, that provides context but contains no linguistic data. - #[serde(default, skip_serializing_if = "Option::is_none", with = "iso_short_code_serialization::opt")] + #[serde(default, skip_serializing_if = "Option::is_none")] pub lang: Option, - // The actual underlying file on disk, if any. Either this or `html` should - // be present, but not both. - file: Option, + /// The actual underlying file on disk, if any. Either this or `html` should + /// be present, but not both. + pub file: Option, // TODO: Do we want a `fileSpan: Span` element, to select only a portion of // a media file? - /// Textual context, which should be valid HTML 5, optionally with embedded /// tags like ``, `` and `
`. #[serde(default, skip_serializing_if = "Option::is_none")] @@ -193,10 +209,6 @@ pub struct Track { /// Application-specific extension data. #[serde(default, skip_serializing_if = "ExtensionData::is_empty")] pub ext: ExtensionData, - - /// Placeholder to allow for future extensibility without breaking the API. - #[serde(default, skip_serializing)] - _placeholder: (), } impl Track { @@ -209,7 +221,6 @@ impl Track { file: None, html: None, ext: ExtensionData::default(), - _placeholder: (), } } @@ -224,7 +235,6 @@ impl Track { file: None, html: Some(html.into()), ext: ExtensionData::default(), - _placeholder: (), } } @@ -239,13 +249,13 @@ impl Track { file: None, html: Some(html::Fragment::from_text(text)), ext: ExtensionData::default(), - _placeholder: (), } } } /// Different possible track types. #[derive(Clone, Debug, Eq, Hash, PartialEq, PartialOrd, Ord)] +#[non_exhaustive] pub enum TrackType { /// This track contains HTML data. Html, @@ -268,11 +278,9 @@ impl<'de> Deserialize<'de> for TrackType { other if other.starts_with("x-") => { Ok(TrackType::Ext(other[2..].to_owned())) } - other => { - Err(D::Error::custom(Error::UnsupportedTrackType { - value: other.to_owned(), - })) - } + other => Err(D::Error::custom(Error::UnsupportedTrackType { + value: other.to_owned(), + })), } } } @@ -286,9 +294,7 @@ impl Serialize for TrackType { TrackType::Html => "html".serialize(serializer), TrackType::Media => "media".serialize(serializer), TrackType::Image => "image".serialize(serializer), - TrackType::Ext(ref name) => { - format!("x-{}", name).serialize(serializer) - } + TrackType::Ext(ref name) => format!("x-{}", name).serialize(serializer), } } } @@ -298,7 +304,8 @@ impl Serialize for TrackType { /// application can do. #[derive(Clone, Debug, Default, Deserialize, PartialEq, Serialize)] #[serde(rename_all = "camelCase")] -#[cfg_attr(feature="no_forwards_compatibility", serde(deny_unknown_fields))] +#[cfg_attr(feature = "no_forwards_compatibility", serde(deny_unknown_fields))] +#[non_exhaustive] pub struct Alignment { /// The time span associated with this alignment, relative to /// `MediaFile.baseTrack`. If `MediaFile.baseTrack` was not specified, this @@ -390,7 +397,9 @@ impl FilePath { pub fn new>(path: S) -> Result { let path = path.into(); for component in path.split("/") { - if component == "" || component == "." || component == ".." + if component == "" + || component == "." + || component == ".." || component.contains("\\") { return Err(Error::InvalidPath { path: path.clone() }); @@ -436,55 +445,3 @@ impl Serialize for FilePath { /// `myapp-attrname`, where `myapp` is the application that uses them. This is a /// map with string keys and arbitrary JSON values. pub type ExtensionData = HashMap; - -/// Routines for serialzing and deserialzing optional ISO 639-1 and 639-3 codes, -/// represented as `isolang::Language`. These are intended for use with -/// `serde_derive`'s `with =` argument. -pub mod iso_short_code_serialization { - /// Serialize and deserialize `Option`. - pub mod opt { - use isolang::Language; - use serde::{Deserialize, Deserializer, Serialize, Serializer}; - use serde::de::Error; - use std::result::Result; - - // Deserialize an ISO 639-1 or 639-3 code. - pub fn deserialize<'de, D>(d: D) -> Result, D::Error> - where - D: Deserializer<'de>, - { - // We need to do this manually because of - // https://github.com/humenda/isolang-rs/issues/7 - let code: Option<&str> = Deserialize::deserialize(d)?; - match code { - None => Ok(None), - Some(c) => { - let lang = Language::from_639_3(c) - .or_else(|| Language::from_639_1(c)) - .ok_or_else(|| { - D::Error::unknown_variant( - c, - &["an ISO 639-1 or 639-3 language code"], - ) - })?; - Ok(Some(lang)) - } - } - } - - /// Serialize an `isolang::Language` as a 2-letter code if possible, or - /// a 3-letter code otherwise. - pub fn serialize( - lang: &Option, - serializer: S, - ) -> Result - where - S: Serializer, - { - let code = lang.map(|l| l.to_639_1().unwrap_or_else(|| l.to_639_3())); - code.serialize(serializer) - } - } -} - - diff --git a/archived/aligned_media/build.rs b/archived/aligned_media/build.rs deleted file mode 100644 index 5097ac99..00000000 --- a/archived/aligned_media/build.rs +++ /dev/null @@ -1,7 +0,0 @@ -// Build script which runs the `peg` parser generator for our HTML parser. - -extern crate peg; - -fn main() { - peg::cargo_build("src/html/grammar.rustpeg"); -} diff --git a/archived/aligned_media/src/html/grammar.rustpeg b/archived/aligned_media/src/html/grammar.rustpeg deleted file mode 100644 index 9652d168..00000000 --- a/archived/aligned_media/src/html/grammar.rustpeg +++ /dev/null @@ -1,113 +0,0 @@ -// Compiled automatically using `build.rs` script and `peg`. - -use std::collections::HashMap; -use std::iter::FromIterator; - -use super::{Fragment, Node, Attributes}; - -#[pub] -fragment -> Fragment - = nodes:(node*) { - Fragment { nodes } - } - -node -> Node = text_node / entity / element - -text_node -> Node - = #quiet< text:$([^<&]+) { - Node::Text { text: text.to_owned() } - } > / #expected("text") - -entity -> Node - = "&" text:( entity_named / entity_hex / entity_dec) ";" { - Node::Text { text } - } - -entity_named -> String - = name:$("lt" / "gt" / "amp" / "apos" / "quot") { - match name { - "lt" => "<".to_string(), - "gt" => ">".to_string(), - "amp" => "&".to_string(), - "apos" => "'".to_string(), - "quot" => "\"".to_string(), - _ => unreachable!("unknown named entity"), - } - } - -entity_hex -> String - = "#x" digits:$([0-9a-fA-F]+) {? - parse_numeric_entity(digits, 16).map(|c| c.to_string()) - } - -entity_dec -> String - = "#" digits:$([0-9]+) {? - parse_numeric_entity(digits, 10).map(|c| c.to_string()) - } - -element -> Node - = empty_element / start_end_element - -empty_element -> Node - = "<" name:$("br"i / "img"i) attributes:attributes ws* "/"? ">" { - Node::Element { - name: name.to_owned(), - attributes, - children: vec![], - } - } - -start_end_element -> Node - = start_tag:start_tag children:(node*) end_tag:end_tag {? - let (name, attributes) = start_tag; - if name == end_tag { - Ok(Node::Element { - name, - attributes, - children, - }) - } else { - Err("start and end tags must match") - } - } - -start_tag -> (String, Attributes) - = "<" name:$(name) attributes:attributes ws* ">" { - (name.to_owned(), attributes) - } - -end_tag -> String - = #quiet< "" { - name.to_owned() - } > / #expected("end tag") - -attributes -> Attributes - = attributes:(attribute*) { - HashMap::from_iter(attributes) - } - -attribute -> (String, String) - = #quiet< ws* name:$(name) ws* "=" ws* att_value:att_value { - (name.to_owned(), att_value) - } > / #expected("attribute") - -att_value -> String - = att_value_double_quotes / att_value_single_quotes - -// TODO: Handle escaped "<" and "&" values, if we ever need anything more -// than "" support. -att_value_double_quotes -> String - = "\"" text:$([^<&"]*) "\"" { - text.to_owned() - } - -// TODO: Handle escaped "<" and "&" values, if we ever need anything more -// than "" support. -att_value_single_quotes -> String - = "'" text:$([^<&']*) "'" { - text.to_owned() - } - -name = #quiet<[A-Za-z:_][-A-Za-z0-9:_.]*> / #expected("identifier") - -ws = #quiet<[ \r\n\t]+> diff --git a/archived/aligned_media/src/html/mod.rs b/archived/aligned_media/src/html/mod.rs deleted file mode 100644 index 78c256c7..00000000 --- a/archived/aligned_media/src/html/mod.rs +++ /dev/null @@ -1,216 +0,0 @@ -//! A very simple version of an HTML data model. This is designed to correspond -//! to the limited HTML features supported in some subtitle formats, and to be -//! easy to sanitize using a whitelist of supported tags and attributes. - -use failure::{self, ResultExt}; -use serde::{Deserialize, Deserializer, Serialize, Serializer}; -use serde::de::Error as DeError; -use std::collections::HashMap; -use std::fmt; -use std::result; -use std::str::FromStr; - -use super::Error; - -/// Our custom HTML-lite grammar. -mod grammar { - use std::result; - - /// Helper function to parse numeric character entities. - fn parse_numeric_entity( - digits: &str, - radix: u32, - ) -> result::Result { - let code = u32::from_str_radix(digits, radix) - .expect("parser should have required a valid number"); - if code == 0 { - return Err("no \"\\0\" characters allowed in HTML"); - } - ::std::char::from_u32(code).ok_or_else(|| { - "a valid UTF-8 character" - }) - } - - include!(concat!(env!("OUT_DIR"), "/grammar.rs")); -} - -/// An HTML fragment. -#[derive(Clone, Debug, Default, Eq, PartialEq)] -pub struct Fragment { - pub nodes: Vec -} - -impl Fragment { - /// Create an HTML fragment from a plain text node, escaping any special - /// characters. - pub fn from_text>(text: S) -> Fragment { - let node = Node::Text { text: text.into() }; - Fragment { nodes: vec![node] } - } -} - -impl FromStr for Fragment { - type Err = failure::Error; - - fn from_str(html: &str) -> result::Result { - Ok(grammar::fragment(html).with_context(|_| { - Error::CouldNotParseHtml { html: html.to_owned() } - })?) - } -} - -impl fmt::Display for Fragment { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - for node in &self.nodes { - node.fmt(f)?; - } - Ok(()) - } -} - -impl<'de> Deserialize<'de> for Fragment { - fn deserialize>(d: D) -> result::Result { - let raw_html = String::deserialize(d)?; - raw_html.parse().map_err(|err: failure::Error| { - // Make sure we get the entire cause chain for this error. - // - // TODO: Use a writer to format this, or just use `common_failures`. - let mut msg = String::new(); - for cause in err.iter_chain() { - if !msg.is_empty() { - msg.push_str("\n caused by: "); - } - msg.push_str(&format!("{}", cause)); - } - D::Error::custom(msg) - }) - } -} - -impl Serialize for Fragment { - fn serialize(&self, serializer: S) -> result::Result - where - S: Serializer, - { - format!("{}", self).serialize(serializer) - } -} - -/// A DOM node in an HTML fragment. Note that we convert all character entities -/// to text nodes at parse time. -#[derive(Clone, Debug, Eq, PartialEq)] -pub enum Node { - /// A regular text node. - Text { text: String }, - - /// An HTML element, with possible attributes. - Element { - /// The name of this element. - name: String, - - /// HTML element attributes. - attributes: Attributes, - - /// Child nodes. - children: Vec, - }, -} - -impl fmt::Display for Node { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - // TODO - Verify all strings are in legal ranges before printing. - match *self { - Node::Text { ref text } => { - text.replace("&", "&").replace("<", "<").fmt(f)?; - } - Node::Element { ref name, ref attributes, ref children } => { - write!(f, "<{}", name)?; - for (name, value) in attributes { - write!( - f, - " {}=\"{}\"", - name, - value.replace("&", "&").replace("\"", """), - )?; - } - write!(f, ">")?; - if name != "br" && name != "img" { - for child in children { - child.fmt(f)?; - } - write!(f, "", name)?; - } - } - } - Ok(()) - } -} - -/// HTML element attributes. -pub type Attributes = HashMap; - -#[test] -fn parse_valid_html_fragments() { - let good_examples = &[ - "Hello!", - "&<>"'@JJ", - "foo2

", - "34", - ]; - for example in good_examples { - let _parsed: Fragment = example.parse() - .expect("failed to parse plain_text"); - } -} - -#[test] -fn error_on_invalid_html_fragments() { - let bad_examples = &[ - "<", - "", - "
", - "", - "", - "", - ]; - for example in bad_examples { - let result = example.parse::(); - match result { - Ok(fragment) => { - panic!( - "parsed {:?} as {:?}, but should have failed", - example, - fragment, - ); - } - Err(err) => { - println!( - "correctly got error: {}\n caused by: {}", - err, - err.cause().cause().expect("error should have cause"), - ); - } - } - } -} - -#[test] -fn display_produces_canonical_string() { - let examples = &[ - ("Hello!", "Hello!"), - ("&<>"'@JJ", "&<>\"'@JJ"), - ("12

", "12

"), - ("34", - "34"), - ]; - for &(input, expected) in examples { - let fragment = input.parse::() - .expect("could not parse HTML"); - assert_eq!(format!("{}", fragment), expected); - } - - let text_fragment = Fragment::from_text("A & B <"); - assert_eq!(format!("{}", text_fragment), "A & B <"); -} diff --git a/substudy/Cargo.toml b/substudy/Cargo.toml index ffa91e00..5dcd50d7 100644 --- a/substudy/Cargo.toml +++ b/substudy/Cargo.toml @@ -19,25 +19,32 @@ exclude = ["fixtures/empty.mp4"] [dependencies] anyhow = "1.0.80" +async-openai = "0.19.0" cast = "0.3.0" chardet = "0.2" clap = { version = "4.5.1", features = ["derive", "wrap_help"] } csv = "1.0.0" diff = "0.1.13" +dotenv = "0.15.0" encoding = "0.2" env_logger = "0.11.2" handlebars = "5.1.0" +indicatif = "0.17.8" lazy_static = "1.0" lending-iterator = "0.1.7" log = "0.4.21" num = "0.4.1" ordered-float = { version = "4.0", default-features = false } paragraph-breaker = "0.4.4" -pbr = "1.0" peg = "0.8.2" regex = "1.10.3" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +tokio = { version = "1.0", default-features = false, features = [ + "rt", + "rt-multi-thread", + "macros", +] } unicode-width = "0.1.5" whatlang = "0.16.4" diff --git a/substudy/src/bin/substudy.rs b/substudy/src/bin/substudy.rs index 058fb72a..5420ecda 100644 --- a/substudy/src/bin/substudy.rs +++ b/substudy/src/bin/substudy.rs @@ -3,9 +3,12 @@ use std::path::{Path, PathBuf}; use clap::{Parser, Subcommand}; +use dotenv::dotenv; use substudy::{ - align::combine_files, export, import, srt::SubtitleFile, video, Result, + align::combine_files, export, import, lang::Lang, + services::oai::translate_subtitle_file, srt::SubtitleFile, video, Result, }; +use tokio::task::spawn_blocking; #[derive(Debug, Parser)] /// Subtitle processing tools for students of foreign languages. (For now, all @@ -51,6 +54,17 @@ enum Args { #[command(subcommand)] to_list: ToList, }, + + /// Translate subtitles. + #[command(name = "translate")] + Translate { + /// Path to the subtitle file to translate. + foreign_subs: PathBuf, + + /// Target language code (e.g. "en" for English). + #[arg(long)] + native_lang: String, + }, } #[derive(Debug, Subcommand)] @@ -162,28 +176,39 @@ enum ToList { } // Choose and run the appropriate command. -fn main() -> Result<()> { +#[tokio::main] +async fn main() -> Result<()> { + dotenv().ok(); env_logger::init(); // Parse our command-line arguments using docopt (very shiny). let args: Args = Args::parse(); match args { - Args::Clean { ref subs } => cmd_clean(subs), + Args::Clean { subs } => spawn_blocking(move || cmd_clean(&subs)).await?, Args::Combine { - ref foreign_subs, - ref native_subs, - } => cmd_combine(foreign_subs, native_subs), - Args::Export { ref format } => cmd_export( - format.name(), - format.video(), - format.foreign_subs(), - format.native_subs(), - ), - Args::Import { format } => cmd_import(format), + foreign_subs, + native_subs, + } => spawn_blocking(move || cmd_combine(&foreign_subs, &native_subs)).await?, + Args::Export { format } => { + spawn_blocking(move || { + cmd_export( + format.name(), + format.video(), + format.foreign_subs(), + format.native_subs(), + ) + }) + .await? + } + Args::Import { format } => spawn_blocking(move || cmd_import(format)).await?, Args::List { - to_list: ToList::Tracks { ref video }, - } => cmd_tracks(video), + to_list: ToList::Tracks { video }, + } => spawn_blocking(move || cmd_tracks(&video)).await?, + Args::Translate { + foreign_subs, + native_lang, + } => cmd_translate(&foreign_subs, &native_lang).await, } } @@ -246,3 +271,11 @@ fn cmd_import(format: ImportFormat) -> std::prelude::v1::Result<(), anyhow::Erro } } } + +async fn cmd_translate(foreign_subs: &Path, native_lang: &str) -> Result<()> { + let file = SubtitleFile::cleaned_from_path(foreign_subs)?; + let native_lang = Lang::iso639(native_lang)?; + let translated = translate_subtitle_file(&file, native_lang).await?; + print!("{}", translated.to_string()); + Ok(()) +} diff --git a/substudy/src/data/README.md b/substudy/src/data/README.md new file mode 100644 index 00000000..c5243ab3 --- /dev/null +++ b/substudy/src/data/README.md @@ -0,0 +1,3 @@ +# Data sources + +- `language-codes-full.csv` is from [DataHub](https://datahub.io/core/language-codes), and distributed with the Open Data Commons Public Domain Dedication and License v1.0. But it's just a repackaging of the Library of Congress's [ISO 639-2 code list](https://www.loc.gov/standards/iso639-2/php/code_list.php) in CSV format. diff --git a/substudy/src/data/language-codes-full.csv b/substudy/src/data/language-codes-full.csv new file mode 100644 index 00000000..add7673c --- /dev/null +++ b/substudy/src/data/language-codes-full.csv @@ -0,0 +1,487 @@ +alpha3-b,alpha3-t,alpha2,English,French +aar,null,aa,Afar,afar +abk,null,ab,Abkhazian,abkhaze +ace,null,null,Achinese,aceh +ach,null,null,Acoli,acoli +ada,null,null,Adangme,adangme +ady,null,null,Adyghe; Adygei,adyghé +afa,null,null,Afro-Asiatic languages,"afro-asiatiques, langues" +afh,null,null,Afrihili,afrihili +afr,null,af,Afrikaans,afrikaans +ain,null,null,Ainu,aïnou +aka,null,ak,Akan,akan +akk,null,null,Akkadian,akkadien +alb,sqi,sq,Albanian,albanais +ale,null,null,Aleut,aléoute +alg,null,null,Algonquian languages,"algonquines, langues" +alt,null,null,Southern Altai,altai du Sud +amh,null,am,Amharic,amharique +ang,null,null,"English, Old (ca.450-1100)",anglo-saxon (ca.450-1100) +anp,null,null,Angika,angika +apa,null,null,Apache languages,"apaches, langues" +ara,null,ar,Arabic,arabe +arc,null,null,Official Aramaic (700-300 BCE); Imperial Aramaic (700-300 BCE),araméen d'empire (700-300 BCE) +arg,null,an,Aragonese,aragonais +arm,hye,hy,Armenian,arménien +arn,null,null,Mapudungun; Mapuche,mapudungun; mapuche; mapuce +arp,null,null,Arapaho,arapaho +art,null,null,Artificial languages,"artificielles, langues" +arw,null,null,Arawak,arawak +asm,null,as,Assamese,assamais +ast,null,null,Asturian; Bable; Leonese; Asturleonese,asturien; bable; léonais; asturoléonais +ath,null,null,Athapascan languages,"athapascanes, langues" +aus,null,null,Australian languages,"australiennes, langues" +ava,null,av,Avaric,avar +ave,null,ae,Avestan,avestique +awa,null,null,Awadhi,awadhi +aym,null,ay,Aymara,aymara +aze,null,az,Azerbaijani,azéri +bad,null,null,Banda languages,"banda, langues" +bai,null,null,Bamileke languages,"bamiléké, langues" +bak,null,ba,Bashkir,bachkir +bal,null,null,Baluchi,baloutchi +bam,null,bm,Bambara,bambara +ban,null,null,Balinese,balinais +baq,eus,eu,Basque,basque +bas,null,null,Basa,basa +bat,null,null,Baltic languages,"baltes, langues" +bej,null,null,Beja; Bedawiyet,bedja +bel,null,be,Belarusian,biélorusse +bem,null,null,Bemba,bemba +ben,null,bn,Bengali,bengali +ber,null,null,Berber languages,"berbères, langues" +bho,null,null,Bhojpuri,bhojpuri +bih,null,bh,Bihari languages,langues biharis +bik,null,null,Bikol,bikol +bin,null,null,Bini; Edo,bini; edo +bis,null,bi,Bislama,bichlamar +bla,null,null,Siksika,blackfoot +bnt,null,null,Bantu (Other),"bantoues, autres langues" +bos,null,bs,Bosnian,bosniaque +bra,null,null,Braj,braj +bre,null,br,Breton,breton +btk,null,null,Batak languages,"batak, langues" +bua,null,null,Buriat,bouriate +bug,null,null,Buginese,bugi +bul,null,bg,Bulgarian,bulgare +bur,mya,my,Burmese,birman +byn,null,null,Blin; Bilin,blin; bilen +cad,null,null,Caddo,caddo +cai,null,null,Central American Indian languages,"amérindiennes de L'Amérique centrale, langues" +car,null,null,Galibi Carib,karib; galibi; carib +cat,null,ca,Catalan; Valencian,catalan; valencien +cau,null,null,Caucasian languages,"caucasiennes, langues" +ceb,null,null,Cebuano,cebuano +cel,null,null,Celtic languages,"celtiques, langues; celtes, langues" +cha,null,ch,Chamorro,chamorro +chb,null,null,Chibcha,chibcha +che,null,ce,Chechen,tchétchène +chg,null,null,Chagatai,djaghataï +chi,zho,zh,Chinese,chinois +chk,null,null,Chuukese,chuuk +chm,null,null,Mari,mari +chn,null,null,Chinook jargon,"chinook, jargon" +cho,null,null,Choctaw,choctaw +chp,null,null,Chipewyan; Dene Suline,chipewyan +chr,null,null,Cherokee,cherokee +chu,null,cu,Church Slavic; Old Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic,slavon d'église; vieux slave; slavon liturgique; vieux bulgare +chv,null,cv,Chuvash,tchouvache +chy,null,null,Cheyenne,cheyenne +cmc,null,null,Chamic languages,"chames, langues" +cop,null,null,Coptic,copte +cor,null,kw,Cornish,cornique +cos,null,co,Corsican,corse +cpe,null,null,"Creoles and pidgins, English based",créoles et pidgins basés sur l'anglais +cpf,null,null,"Creoles and pidgins, French-based",créoles et pidgins basés sur le français +cpp,null,null,"Creoles and pidgins, Portuguese-based",créoles et pidgins basés sur le portugais +cre,null,cr,Cree,cree +crh,null,null,Crimean Tatar; Crimean Turkish,tatar de Crimé +crp,null,null,Creoles and pidgins,créoles et pidgins +csb,null,null,Kashubian,kachoube +cus,null,null,Cushitic languages,"couchitiques, langues" +cze,ces,cs,Czech,tchèque +dak,null,null,Dakota,dakota +dan,null,da,Danish,danois +dar,null,null,Dargwa,dargwa +day,null,null,Land Dayak languages,"dayak, langues" +del,null,null,Delaware,delaware +den,null,null,Slave (Athapascan),esclave (athapascan) +dgr,null,null,Dogrib,dogrib +din,null,null,Dinka,dinka +div,null,dv,Divehi; Dhivehi; Maldivian,maldivien +doi,null,null,Dogri,dogri +dra,null,null,Dravidian languages,"dravidiennes, langues" +dsb,null,null,Lower Sorbian,bas-sorabe +dua,null,null,Duala,douala +dum,null,null,"Dutch, Middle (ca.1050-1350)",néerlandais moyen (ca. 1050-1350) +dut,nld,nl,Dutch; Flemish,néerlandais; flamand +dyu,null,null,Dyula,dioula +dzo,null,dz,Dzongkha,dzongkha +efi,null,null,Efik,efik +egy,null,null,Egyptian (Ancient),égyptien +eka,null,null,Ekajuk,ekajuk +elx,null,null,Elamite,élamite +eng,null,en,English,anglais +enm,null,null,"English, Middle (1100-1500)",anglais moyen (1100-1500) +epo,null,eo,Esperanto,espéranto +est,null,et,Estonian,estonien +ewe,null,ee,Ewe,éwé +ewo,null,null,Ewondo,éwondo +fan,null,null,Fang,fang +fao,null,fo,Faroese,féroïen +fat,null,null,Fanti,fanti +fij,null,fj,Fijian,fidjien +fil,null,null,Filipino; Pilipino,filipino; pilipino +fin,null,fi,Finnish,finnois +fiu,null,null,Finno-Ugrian languages,"finno-ougriennes, langues" +fon,null,null,Fon,fon +fre,fra,fr,French,français +frm,null,null,"French, Middle (ca.1400-1600)",français moyen (1400-1600) +fro,null,null,"French, Old (842-ca.1400)",français ancien (842-ca.1400) +frr,null,null,Northern Frisian,frison septentrional +frs,null,null,Eastern Frisian,frison oriental +fry,null,fy,Western Frisian,frison occidental +ful,null,ff,Fulah,peul +fur,null,null,Friulian,frioulan +gaa,null,null,Ga,ga +gay,null,null,Gayo,gayo +gba,null,null,Gbaya,gbaya +gem,null,null,Germanic languages,"germaniques, langues" +geo,kat,ka,Georgian,géorgien +ger,deu,de,German,allemand +gez,null,null,Geez,guèze +gil,null,null,Gilbertese,kiribati +gla,null,gd,Gaelic; Scottish Gaelic,gaélique; gaélique écossais +gle,null,ga,Irish,irlandais +glg,null,gl,Galician,galicien +glv,null,gv,Manx,manx; mannois +gmh,null,null,"German, Middle High (ca.1050-1500)","allemand, moyen haut (ca. 1050-1500)" +goh,null,null,"German, Old High (ca.750-1050)","allemand, vieux haut (ca. 750-1050)" +gon,null,null,Gondi,gond +gor,null,null,Gorontalo,gorontalo +got,null,null,Gothic,gothique +grb,null,null,Grebo,grebo +grc,null,null,"Greek, Ancient (to 1453)",grec ancien (jusqu'à 1453) +gre,ell,el,"Greek, Modern (1453-)",grec moderne (après 1453) +grn,null,gn,Guarani,guarani +gsw,null,null,Swiss German; Alemannic; Alsatian,suisse alémanique; alémanique; alsacien +guj,null,gu,Gujarati,goudjrati +gwi,null,null,Gwich'in,gwich'in +hai,null,null,Haida,haida +hat,null,ht,Haitian; Haitian Creole,haïtien; créole haïtien +hau,null,ha,Hausa,haoussa +haw,null,null,Hawaiian,hawaïen +heb,null,he,Hebrew,hébreu +her,null,hz,Herero,herero +hil,null,null,Hiligaynon,hiligaynon +him,null,null,Himachali languages; Western Pahari languages,langues himachalis; langues paharis occidentales +hin,null,hi,Hindi,hindi +hit,null,null,Hittite,hittite +hmn,null,null,Hmong; Mong,hmong +hmo,null,ho,Hiri Motu,hiri motu +hrv,null,hr,Croatian,croate +hsb,null,null,Upper Sorbian,haut-sorabe +hun,null,hu,Hungarian,hongrois +hup,null,null,Hupa,hupa +iba,null,null,Iban,iban +ibo,null,ig,Igbo,igbo +ice,isl,is,Icelandic,islandais +ido,null,io,Ido,ido +iii,null,ii,Sichuan Yi; Nuosu,yi de Sichuan +ijo,null,null,Ijo languages,"ijo, langues" +iku,null,iu,Inuktitut,inuktitut +ile,null,ie,Interlingue; Occidental,interlingue +ilo,null,null,Iloko,ilocano +ina,null,ia,Interlingua (International Auxiliary Language Association),interlingua (langue auxiliaire internationale) +inc,null,null,Indic languages,"indo-aryennes, langues" +ind,null,id,Indonesian,indonésien +ine,null,null,Indo-European languages,"indo-européennes, langues" +inh,null,null,Ingush,ingouche +ipk,null,ik,Inupiaq,inupiaq +ira,null,null,Iranian languages,"iraniennes, langues" +iro,null,null,Iroquoian languages,"iroquoises, langues" +ita,null,it,Italian,italien +jav,null,jv,Javanese,javanais +jbo,null,null,Lojban,lojban +jpn,null,ja,Japanese,japonais +jpr,null,null,Judeo-Persian,judéo-persan +jrb,null,null,Judeo-Arabic,judéo-arabe +kaa,null,null,Kara-Kalpak,karakalpak +kab,null,null,Kabyle,kabyle +kac,null,null,Kachin; Jingpho,kachin; jingpho +kal,null,kl,Kalaallisut; Greenlandic,groenlandais +kam,null,null,Kamba,kamba +kan,null,kn,Kannada,kannada +kar,null,null,Karen languages,"karen, langues" +kas,null,ks,Kashmiri,kashmiri +kau,null,kr,Kanuri,kanouri +kaw,null,null,Kawi,kawi +kaz,null,kk,Kazakh,kazakh +kbd,null,null,Kabardian,kabardien +kha,null,null,Khasi,khasi +khi,null,null,Khoisan languages,"khoïsan, langues" +khm,null,km,Central Khmer,khmer central +kho,null,null,Khotanese; Sakan,khotanais; sakan +kik,null,ki,Kikuyu; Gikuyu,kikuyu +kin,null,rw,Kinyarwanda,rwanda +kir,null,ky,Kirghiz; Kyrgyz,kirghiz +kmb,null,null,Kimbundu,kimbundu +kok,null,null,Konkani,konkani +kom,null,kv,Komi,kom +kon,null,kg,Kongo,kongo +kor,null,ko,Korean,coréen +kos,null,null,Kosraean,kosrae +kpe,null,null,Kpelle,kpellé +krc,null,null,Karachay-Balkar,karatchai balkar +krl,null,null,Karelian,carélien +kro,null,null,Kru languages,"krou, langues" +kru,null,null,Kurukh,kurukh +kua,null,kj,Kuanyama; Kwanyama,kuanyama; kwanyama +kum,null,null,Kumyk,koumyk +kur,null,ku,Kurdish,kurde +kut,null,null,Kutenai,kutenai +lad,null,null,Ladino,judéo-espagnol +lah,null,null,Lahnda,lahnda +lam,null,null,Lamba,lamba +lao,null,lo,Lao,lao +lat,null,la,Latin,latin +lav,null,lv,Latvian,letton +lez,null,null,Lezghian,lezghien +lim,null,li,Limburgan; Limburger; Limburgish,limbourgeois +lin,null,ln,Lingala,lingala +lit,null,lt,Lithuanian,lituanien +lol,null,null,Mongo,mongo +loz,null,null,Lozi,lozi +ltz,null,lb,Luxembourgish; Letzeburgesch,luxembourgeois +lua,null,null,Luba-Lulua,luba-lulua +lub,null,lu,Luba-Katanga,luba-katanga +lug,null,lg,Ganda,ganda +lui,null,null,Luiseno,luiseno +lun,null,null,Lunda,lunda +luo,null,null,Luo (Kenya and Tanzania),luo (Kenya et Tanzanie) +lus,null,null,Lushai,lushai +mac,mkd,mk,Macedonian,macédonien +mad,null,null,Madurese,madourais +mag,null,null,Magahi,magahi +mah,null,mh,Marshallese,marshall +mai,null,null,Maithili,maithili +mak,null,null,Makasar,makassar +mal,null,ml,Malayalam,malayalam +man,null,null,Mandingo,mandingue +mao,mri,mi,Maori,maori +map,null,null,Austronesian languages,"austronésiennes, langues" +mar,null,mr,Marathi,marathe +mas,null,null,Masai,massaï +may,msa,ms,Malay,malais +mdf,null,null,Moksha,moksa +mdr,null,null,Mandar,mandar +men,null,null,Mende,mendé +mga,null,null,"Irish, Middle (900-1200)",irlandais moyen (900-1200) +mic,null,null,Mi'kmaq; Micmac,mi'kmaq; micmac +min,null,null,Minangkabau,minangkabau +mis,null,null,Uncoded languages,langues non codées +mkh,null,null,Mon-Khmer languages,"môn-khmer, langues" +mlg,null,mg,Malagasy,malgache +mlt,null,mt,Maltese,maltais +mnc,null,null,Manchu,mandchou +mni,null,null,Manipuri,manipuri +mno,null,null,Manobo languages,"manobo, langues" +moh,null,null,Mohawk,mohawk +mon,null,mn,Mongolian,mongol +mos,null,null,Mossi,moré +mul,null,null,Multiple languages,multilingue +mun,null,null,Munda languages,"mounda, langues" +mus,null,null,Creek,muskogee +mwl,null,null,Mirandese,mirandais +mwr,null,null,Marwari,marvari +myn,null,null,Mayan languages,"maya, langues" +myv,null,null,Erzya,erza +nah,null,null,Nahuatl languages,"nahuatl, langues" +nai,null,null,North American Indian languages,"nord-amérindiennes, langues" +nap,null,null,Neapolitan,napolitain +nau,null,na,Nauru,nauruan +nav,null,nv,Navajo; Navaho,navaho +nbl,null,nr,"Ndebele, South; South Ndebele",ndébélé du Sud +nde,null,nd,"Ndebele, North; North Ndebele",ndébélé du Nord +ndo,null,ng,Ndonga,ndonga +nds,null,null,"Low German; Low Saxon; German, Low; Saxon, Low","bas allemand; bas saxon; allemand, bas; saxon, bas" +nep,null,ne,Nepali,népalais +new,null,null,Nepal Bhasa; Newari,nepal bhasa; newari +nia,null,null,Nias,nias +nic,null,null,Niger-Kordofanian languages,"nigéro-kordofaniennes, langues" +niu,null,null,Niuean,niué +nno,null,nn,"Norwegian Nynorsk; Nynorsk, Norwegian","norvégien nynorsk; nynorsk, norvégien" +nob,null,nb,"Bokmål, Norwegian; Norwegian Bokmål",norvégien bokmål +nog,null,null,Nogai,nogaï; nogay +non,null,null,"Norse, Old","norrois, vieux" +nor,null,no,Norwegian,norvégien +nqo,null,null,N'Ko,n'ko +nso,null,null,Pedi; Sepedi; Northern Sotho,pedi; sepedi; sotho du Nord +nub,null,null,Nubian languages,"nubiennes, langues" +nwc,null,null,Classical Newari; Old Newari; Classical Nepal Bhasa,newari classique +nya,null,ny,Chichewa; Chewa; Nyanja,chichewa; chewa; nyanja +nym,null,null,Nyamwezi,nyamwezi +nyn,null,null,Nyankole,nyankolé +nyo,null,null,Nyoro,nyoro +nzi,null,null,Nzima,nzema +oci,null,oc,Occitan (post 1500); Provençal,occitan (après 1500); provençal +oji,null,oj,Ojibwa,ojibwa +ori,null,or,Oriya,oriya +orm,null,om,Oromo,galla +osa,null,null,Osage,osage +oss,null,os,Ossetian; Ossetic,ossète +ota,null,null,"Turkish, Ottoman (1500-1928)",turc ottoman (1500-1928) +oto,null,null,Otomian languages,"otomi, langues" +paa,null,null,Papuan languages,"papoues, langues" +pag,null,null,Pangasinan,pangasinan +pal,null,null,Pahlavi,pahlavi +pam,null,null,Pampanga; Kapampangan,pampangan +pan,null,pa,Panjabi; Punjabi,pendjabi +pap,null,null,Papiamento,papiamento +pau,null,null,Palauan,palau +peo,null,null,"Persian, Old (ca.600-400 B.C.)","perse, vieux (ca. 600-400 av. J.-C.)" +per,fas,fa,Persian,persan +phi,null,null,Philippine languages,"philippines, langues" +phn,null,null,Phoenician,phénicien +pli,null,pi,Pali,pali +pol,null,pl,Polish,polonais +pon,null,null,Pohnpeian,pohnpei +por,null,pt,Portuguese,portugais +pra,null,null,Prakrit languages,"prâkrit, langues" +pro,null,null,"Provençal, Old (to 1500)",provençal ancien (jusqu'à 1500) +pus,null,ps,Pushto; Pashto,pachto +qaa-qtz,null,null,Reserved for local use,réservée à l'usage local +que,null,qu,Quechua,quechua +raj,null,null,Rajasthani,rajasthani +rap,null,null,Rapanui,rapanui +rar,null,null,Rarotongan; Cook Islands Maori,rarotonga; maori des îles Cook +roa,null,null,Romance languages,"romanes, langues" +roh,null,rm,Romansh,romanche +rom,null,null,Romany,tsigane +rum,ron,ro,Romanian; Moldavian; Moldovan,roumain; moldave +run,null,rn,Rundi,rundi +rup,null,null,Aromanian; Arumanian; Macedo-Romanian,aroumain; macédo-roumain +rus,null,ru,Russian,russe +sad,null,null,Sandawe,sandawe +sag,null,sg,Sango,sango +sah,null,null,Yakut,iakoute +sai,null,null,South American Indian (Other),"indiennes d'Amérique du Sud, autres langues" +sal,null,null,Salishan languages,"salishennes, langues" +sam,null,null,Samaritan Aramaic,samaritain +san,null,sa,Sanskrit,sanskrit +sas,null,null,Sasak,sasak +sat,null,null,Santali,santal +scn,null,null,Sicilian,sicilien +sco,null,null,Scots,écossais +sel,null,null,Selkup,selkoupe +sem,null,null,Semitic languages,"sémitiques, langues" +sga,null,null,"Irish, Old (to 900)",irlandais ancien (jusqu'à 900) +sgn,null,null,Sign Languages,langues des signes +shn,null,null,Shan,chan +sid,null,null,Sidamo,sidamo +sin,null,si,Sinhala; Sinhalese,singhalais +sio,null,null,Siouan languages,"sioux, langues" +sit,null,null,Sino-Tibetan languages,"sino-tibétaines, langues" +sla,null,null,Slavic languages,"slaves, langues" +slo,slk,sk,Slovak,slovaque +slv,null,sl,Slovenian,slovène +sma,null,null,Southern Sami,sami du Sud +sme,null,se,Northern Sami,sami du Nord +smi,null,null,Sami languages,"sames, langues" +smj,null,null,Lule Sami,sami de Lule +smn,null,null,Inari Sami,sami d'Inari +smo,null,sm,Samoan,samoan +sms,null,null,Skolt Sami,sami skolt +sna,null,sn,Shona,shona +snd,null,sd,Sindhi,sindhi +snk,null,null,Soninke,soninké +sog,null,null,Sogdian,sogdien +som,null,so,Somali,somali +son,null,null,Songhai languages,"songhai, langues" +sot,null,st,"Sotho, Southern",sotho du Sud +spa,null,es,Spanish; Castilian,espagnol; castillan +srd,null,sc,Sardinian,sarde +srn,null,null,Sranan Tongo,sranan tongo +srp,null,sr,Serbian,serbe +srr,null,null,Serer,sérère +ssa,null,null,Nilo-Saharan languages,"nilo-sahariennes, langues" +ssw,null,ss,Swati,swati +suk,null,null,Sukuma,sukuma +sun,null,su,Sundanese,soundanais +sus,null,null,Susu,soussou +sux,null,null,Sumerian,sumérien +swa,null,sw,Swahili,swahili +swe,null,sv,Swedish,suédois +syc,null,null,Classical Syriac,syriaque classique +syr,null,null,Syriac,syriaque +tah,null,ty,Tahitian,tahitien +tai,null,null,Tai languages,"tai, langues" +tam,null,ta,Tamil,tamoul +tat,null,tt,Tatar,tatar +tel,null,te,Telugu,télougou +tem,null,null,Timne,temne +ter,null,null,Tereno,tereno +tet,null,null,Tetum,tetum +tgk,null,tg,Tajik,tadjik +tgl,null,tl,Tagalog,tagalog +tha,null,th,Thai,thaï +tib,bod,bo,Tibetan,tibétain +tig,null,null,Tigre,tigré +tir,null,ti,Tigrinya,tigrigna +tiv,null,null,Tiv,tiv +tkl,null,null,Tokelau,tokelau +tlh,null,null,Klingon; tlhIngan-Hol,klingon +tli,null,null,Tlingit,tlingit +tmh,null,null,Tamashek,tamacheq +tog,null,null,Tonga (Nyasa),tonga (Nyasa) +ton,null,to,Tonga (Tonga Islands),tongan (Îles Tonga) +tpi,null,null,Tok Pisin,tok pisin +tsi,null,null,Tsimshian,tsimshian +tsn,null,tn,Tswana,tswana +tso,null,ts,Tsonga,tsonga +tuk,null,tk,Turkmen,turkmène +tum,null,null,Tumbuka,tumbuka +tup,null,null,Tupi languages,"tupi, langues" +tur,null,tr,Turkish,turc +tut,null,null,Altaic languages,"altaïques, langues" +tvl,null,null,Tuvalu,tuvalu +twi,null,tw,Twi,twi +tyv,null,null,Tuvinian,touva +udm,null,null,Udmurt,oudmourte +uga,null,null,Ugaritic,ougaritique +uig,null,ug,Uighur; Uyghur,ouïgour +ukr,null,uk,Ukrainian,ukrainien +umb,null,null,Umbundu,umbundu +und,null,null,Undetermined,indéterminée +urd,null,ur,Urdu,ourdou +uzb,null,uz,Uzbek,ouszbek +vai,null,null,Vai,vaï +ven,null,ve,Venda,venda +vie,null,vi,Vietnamese,vietnamien +vol,null,vo,Volapük,volapük +vot,null,null,Votic,vote +wak,null,null,Wakashan languages,"wakashanes, langues" +wal,null,null,Walamo,walamo +war,null,null,Waray,waray +was,null,null,Washo,washo +wel,cym,cy,Welsh,gallois +wen,null,null,Sorbian languages,"sorabes, langues" +wln,null,wa,Walloon,wallon +wol,null,wo,Wolof,wolof +xal,null,null,Kalmyk; Oirat,kalmouk; oïrat +xho,null,xh,Xhosa,xhosa +yao,null,null,Yao,yao +yap,null,null,Yapese,yapois +yid,null,yi,Yiddish,yiddish +yor,null,yo,Yoruba,yoruba +ypk,null,null,Yupik languages,"yupik, langues" +zap,null,null,Zapotec,zapotèque +zbl,null,null,Blissymbols; Blissymbolics; Bliss,symboles Bliss; Bliss +zen,null,null,Zenaga,zenaga +zgh,null,null,Standard Moroccan Tamazight,amazighe standard marocain +zha,null,za,Zhuang; Chuang,zhuang; chuang +znd,null,null,Zande languages,"zandé, langues" +zul,null,zu,Zulu,zoulou +zun,null,null,Zuni,zuni +zxx,null,null,No linguistic content; Not applicable,pas de contenu linguistique; non applicable +zza,null,null,Zaza; Dimili; Dimli; Kirdki; Kirmanjki; Zazaki,zaza; dimili; dimli; kirdki; kirmanjki; zazaki \ No newline at end of file diff --git a/substudy/src/import/whisper.rs b/substudy/src/import/whisper.rs index 6a98b525..f35f78b5 100644 --- a/substudy/src/import/whisper.rs +++ b/substudy/src/import/whisper.rs @@ -1,9 +1,9 @@ use std::{fmt, fs::File, iter::repeat, path::Path}; -use anyhow::{anyhow, Context as _}; +use anyhow::Context as _; use lazy_static::lazy_static; use lending_iterator::{lending_iterator::constructors::windows_mut, LendingIterator}; -use log::{debug, info, trace, warn}; +use log::{debug, trace, warn}; use ordered_float::NotNan; use regex::Regex; use serde::Deserialize; @@ -84,7 +84,7 @@ enum Overlap { impl Overlap { /// Compute overlap type from start and end times. - fn from_times(start1: f32, end1: f32, start2: f32, end2: f32) -> Overlap { + fn from_times(_start1: f32, end1: f32, start2: f32, end2: f32) -> Overlap { if end1 > start2 + 1.0 || end1 > end2 { Overlap::Severe } else if end1 + 0.01 > start2 { @@ -356,7 +356,7 @@ impl AnalyzedSegments { chars_per_sec.sort_unstable(); let median_chars_per_sec = if chars_per_sec.is_empty() { warn!("No valid segment times found"); - 10.0 + DEFAULT_CHARS_PER_SECOND } else { chars_per_sec[chars_per_sec.len() / 2].into_inner() }; diff --git a/substudy/src/lang.rs b/substudy/src/lang.rs index 0ed952e1..0bde2d03 100644 --- a/substudy/src/lang.rs +++ b/substudy/src/lang.rs @@ -1,6 +1,6 @@ //! Naming and identifying languages. We use -use std::{collections::HashMap, fmt, iter::FromIterator, result, str::from_utf8}; +use std::{collections::HashMap, fmt, result, str::from_utf8}; use anyhow::anyhow; use lazy_static::lazy_static; @@ -10,66 +10,59 @@ use whatlang; use crate::Result; +/// External CSV data from the LoC. +/// +/// This is a CSV file which looks like: +/// +/// ```csv +/// alpha3-b,alpha3-t,alpha2,English,French +/// aar,null,aa,Afar,afar +/// ``` +static ISO_639_CODES: &str = include_str!("data/language-codes-full.csv"); + +/// Maps related to ISO 639 language codes. +struct LangMaps { + canonical_codes: HashMap, + names: HashMap, +} + +/// Helper function called to build language maps. +fn iso_689_canonical_codes_and_names() -> LangMaps { + let mut canonical_codes = HashMap::new(); + let mut names = HashMap::new(); + + // Parse using `csv` crate. + let mut rdr = csv::Reader::from_reader(ISO_639_CODES.as_bytes()); + let mut r = csv::StringRecord::new(); + while rdr.read_record(&mut r).expect("error reading embedded CSV") { + let (a3b, a3t, a2, en, _fr) = (&r[0], &r[1], &r[2], &r[3], &r[4]); + if a2 != "null" { + if a3b != "null" { + canonical_codes.insert(a3b.to_owned(), a2.to_owned()); + } + if a3t != "null" { + canonical_codes.insert(a3t.to_owned(), a2.to_owned()); + } + names.insert(a2.to_owned(), en.to_owned()); + } else { + if a3b != "null" { + names.insert(a3b.to_owned(), en.to_owned()); + } + if a3t != "null" { + names.insert(a3t.to_owned(), en.to_owned()); + } + } + } + LangMaps { + canonical_codes, + names, + } +} + // Use the third-party `lazy_static!` macro to declare variables that will // initialized the first time we use them. lazy_static! { - /// Maps ISO 639 codes to their preferred internal forms. Based on - /// http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt - static ref CANONICAL_CODE: HashMap<&'static str, &'static str> = { - HashMap::from_iter([ - ("aar", "aa"), ("abk", "ab"), ("afr", "af"), ("aka", "ak"), - ("alb", "sq"), ("sqi", "sq"), ("amh", "am"), ("ara", "ar"), - ("arg", "an"), ("arm", "hy"), ("hye", "hy"), ("asm", "as"), - ("ava", "av"), ("ave", "ae"), ("aym", "ay"), ("aze", "az"), - ("bak", "ba"), ("bam", "bm"), ("baq", "eu"), ("eus", "eu"), - ("bel", "be"), ("ben", "bn"), ("bih", "bh"), ("bis", "bi"), - ("bos", "bs"), ("bre", "br"), ("bul", "bg"), ("bur", "my"), - ("mya", "my"), ("cat", "ca"), ("cha", "ch"), ("che", "ce"), - ("chi", "zh"), ("zho", "zh"), ("chu", "cu"), ("chv", "cv"), - ("cor", "kw"), ("cos", "co"), ("cre", "cr"), ("ces", "cs"), - ("cze", "cs"), ("dan", "da"), ("div", "dv"), ("dut", "nl"), - ("nld", "nl"), ("dzo", "dz"), ("eng", "en"), ("epo", "eo"), - ("est", "et"), ("ewe", "ee"), ("fao", "fo"), ("fij", "fj"), - ("fin", "fi"), ("fra", "fr"), ("fre", "fr"), ("fry", "fy"), - ("ful", "ff"), ("geo", "ka"), ("kat", "ka"), ("deu", "de"), - ("ger", "de"), ("gla", "gd"), ("gle", "ga"), ("glg", "gl"), - ("glv", "gv"), ("ell", "el"), ("gre", "el"), ("grn", "gn"), - ("guj", "gu"), ("hat", "ht"), ("hau", "ha"), ("heb", "he"), - ("her", "hz"), ("hin", "hi"), ("hmo", "ho"), ("hrv", "hr"), - ("hun", "hu"), ("ibo", "ig"), ("ice", "is"), ("isl", "is"), - ("ido", "io"), ("iii", "ii"), ("iku", "iu"), ("ile", "ie"), - ("ina", "ia"), ("ind", "id"), ("ipk", "ik"), ("ita", "it"), - ("jav", "jv"), ("jpn", "ja"), ("kal", "kl"), ("kan", "kn"), - ("kas", "ks"), ("kau", "kr"), ("kaz", "kk"), ("khm", "km"), - ("kik", "ki"), ("kin", "rw"), ("kir", "ky"), ("kom", "kv"), - ("kon", "kg"), ("kor", "ko"), ("kua", "kj"), ("kur", "ku"), - ("lao", "lo"), ("lat", "la"), ("lav", "lv"), ("lim", "li"), - ("lin", "ln"), ("lit", "lt"), ("ltz", "lb"), ("lub", "lu"), - ("lug", "lg"), ("mac", "mk"), ("mkd", "mk"), ("mah", "mh"), - ("mal", "ml"), ("mao", "mi"), ("mri", "mi"), ("mar", "mr"), - ("may", "ms"), ("msa", "ms"), ("mlg", "mg"), ("mlt", "mt"), - ("mon", "mn"), ("nau", "na"), ("nav", "nv"), ("nbl", "nr"), - ("nde", "nd"), ("ndo", "ng"), ("nep", "ne"), ("nno", "nn"), - ("nob", "nb"), ("nor", "no"), ("nya", "ny"), ("oci", "oc"), - ("oji", "oj"), ("ori", "or"), ("orm", "om"), ("oss", "os"), - ("pan", "pa"), ("fas", "fa"), ("per", "fa"), ("pli", "pi"), - ("pol", "pl"), ("por", "pt"), ("pus", "ps"), ("que", "qu"), - ("roh", "rm"), ("ron", "ro"), ("rum", "ro"), ("run", "rn"), - ("rus", "ru"), ("sag", "sg"), ("san", "sa"), ("sin", "si"), - ("slk", "sk"), ("slo", "sk"), ("slv", "sl"), ("sme", "se"), - ("smo", "sm"), ("sna", "sn"), ("snd", "sd"), ("som", "so"), - ("sot", "st"), ("spa", "es"), ("srd", "sc"), ("srp", "sr"), - ("ssw", "ss"), ("sun", "su"), ("swa", "sw"), ("swe", "sv"), - ("tah", "ty"), ("tam", "ta"), ("tat", "tt"), ("tel", "te"), - ("tgk", "tg"), ("tgl", "tl"), ("tha", "th"), ("bod", "bo"), - ("tib", "bo"), ("tir", "ti"), ("ton", "to"), ("tsn", "tn"), - ("tso", "ts"), ("tuk", "tk"), ("tur", "tr"), ("twi", "tw"), - ("uig", "ug"), ("ukr", "uk"), ("urd", "ur"), ("uzb", "uz"), - ("ven", "ve"), ("vie", "vi"), ("vol", "vo"), ("cym", "cy"), - ("wel", "cy"), ("wln", "wa"), ("wol", "wo"), ("xho", "xh"), - ("yid", "yi"), ("yor", "yo"), ("zha", "za"), ("zul", "zu"), - ].iter().cloned()) - }; + static ref LANG_MAPS: LangMaps = iso_689_canonical_codes_and_names(); } /// A language identifier. @@ -91,7 +84,11 @@ impl Lang { /// assert!(Lang::iso639("abcd").is_err()); /// ``` pub fn iso639(code: &str) -> Result { - let canon = CANONICAL_CODE.get(code).cloned().unwrap_or(code); + let canon = LANG_MAPS + .canonical_codes + .get(code) + .cloned() + .unwrap_or_else(|| code.to_owned()); let c = canon.as_bytes(); match (canon.is_ascii(), c.len()) { (true, 2) => Ok(Lang { @@ -138,6 +135,27 @@ impl Lang { } None } + + /// Names of the language (or related languages) in English. These + /// may be separated by semi-colons. + /// + /// ``` + /// use substudy::lang::Lang; + /// assert_eq!( + /// vec!["English".to_owned()], + /// Lang::iso639("en").unwrap().english_names().unwrap(), + /// ); + /// ``` + pub fn english_names(&self) -> Result> { + let name_str = LANG_MAPS + .names + .get(self.as_str()) + .map(|s| s.as_str()) + .ok_or_else(|| { + anyhow!("No English name for language code: {:?}", self.as_str()) + })?; + Ok(name_str.split("; ").collect()) + } } impl fmt::Debug for Lang { diff --git a/substudy/src/lib.rs b/substudy/src/lib.rs index 11a437a4..e2dcb914 100644 --- a/substudy/src/lib.rs +++ b/substudy/src/lib.rs @@ -18,7 +18,9 @@ pub mod export; pub mod import; pub mod lang; pub mod merge; +pub(crate) mod progress; pub mod segment; +pub mod services; pub mod srt; pub mod time; pub mod video; diff --git a/substudy/src/progress.rs b/substudy/src/progress.rs new file mode 100644 index 00000000..2298ec55 --- /dev/null +++ b/substudy/src/progress.rs @@ -0,0 +1,10 @@ +//! Progress bar support. + +use indicatif::ProgressStyle; + +/// Our default progress style. +pub(crate) fn default_progress_style() -> ProgressStyle { + ProgressStyle::default_bar() + .template("{prefix} {pos:>4}/{len:4} {wide_bar:.cyan/blue} {eta_precise}") + .expect("bad progress bar template") +} diff --git a/substudy/src/services/mod.rs b/substudy/src/services/mod.rs new file mode 100644 index 00000000..e4a8005c --- /dev/null +++ b/substudy/src/services/mod.rs @@ -0,0 +1,3 @@ +//! External services we use. + +pub mod oai; diff --git a/substudy/src/services/oai/mod.rs b/substudy/src/services/oai/mod.rs new file mode 100644 index 00000000..835c98f7 --- /dev/null +++ b/substudy/src/services/oai/mod.rs @@ -0,0 +1,131 @@ +//! OpenAI client. + +use std::{future::Future, time::Duration}; + +use anyhow::{anyhow, Context}; +use async_openai::types::{ + ChatCompletionNamedToolChoice, ChatCompletionRequestMessage, + ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage, + ChatCompletionRequestUserMessageContent, ChatCompletionTool, + ChatCompletionToolChoiceOption, ChatCompletionToolType, + CreateChatCompletionResponse, FunctionName, FunctionObject, Role, +}; +use log::debug; +use tokio::time::sleep; + +use crate::Result; + +pub use self::translate::translate_subtitle_file; + +mod translate; + +/// Retry an OpenAI request a few times. +async fn retry_openai_request(f: Func) -> Result +where + Func: Fn() -> Fut, + Fut: Future>, + T: std::fmt::Debug + Send, +{ + let mut max_tries = 3; + loop { + let result = f().await; + max_tries -= 1; + match result { + Ok(t) => return Ok(t), + Err(e) if max_tries == 0 => return Err(e), + Err(e) => { + log::warn!("OpenAI request failed, retrying: {:?}", e); + sleep(Duration::from_secs(2)).await; + } + } + } +} + +// let mut max_tries = 3; +// let translated_lines = loop { +// let result = translate_chunk(&client, chunk, from_lang, to_lang).await; +// max_tries -= 1; +// match result { +// Ok(lines) => break lines, +// Err(e) if max_tries == 0 => { +// return Err(e); +// } +// Err(e) => { +// warn!("Failed to translate chunk, retrying: {}", e); +// sleep(Duration::from_secs(2)).await; +// } +// } +// }; + +/// Generate a system message. +fn system_message(content: &str) -> ChatCompletionRequestMessage { + ChatCompletionRequestMessage::System(ChatCompletionRequestSystemMessage { + role: Role::System, + content: content.to_owned(), + name: None, + }) +} + +/// Generate a user message. +fn user_message>(content: S) -> ChatCompletionRequestMessage { + ChatCompletionRequestMessage::User(ChatCompletionRequestUserMessage { + role: Role::User, + content: ChatCompletionRequestUserMessageContent::Text(content.into()), + name: None, + }) +} + +/// Describe a "function" tool GPT can call. +fn function_tool( + name: &str, + description: &str, + parameters: &serde_json::Value, +) -> ChatCompletionTool { + ChatCompletionTool { + r#type: ChatCompletionToolType::Function, + function: FunctionObject { + name: name.to_owned(), + description: Some(description.to_owned()), + parameters: Some(parameters.clone()), + }, + } +} + +/// Specify a "function" tool GPT should call. +fn function_tool_choice(name: &str) -> ChatCompletionToolChoiceOption { + ChatCompletionToolChoiceOption::Named(ChatCompletionNamedToolChoice { + r#type: ChatCompletionToolType::Function, + function: FunctionName { + name: name.to_owned(), + }, + }) +} + +/// Extract a "tool call" from a chat response. +fn tool_call_response( + resp: &CreateChatCompletionResponse, + expected_function: &str, +) -> Result +where + T: serde::de::DeserializeOwned, +{ + let choice = resp.choices.get(0).ok_or_else(|| { + anyhow!("OpenAI did not return a response to our translation request") + })?; + let tool_calls = + choice.message.tool_calls.as_ref().ok_or_else(|| { + anyhow!("OpenAI did not return tool calls in its response") + })?; + let tool_call = tool_calls + .get(0) + .ok_or_else(|| anyhow!("OpenAI did not return a tool call in its response"))?; + let f = &tool_call.function; + if f.name != expected_function { + return Err(anyhow!( + "OpenAI returned a response, but it called the wrong function: {}", + f.name + )); + } + debug!("OpenAI called: {}({:?})", expected_function, f.arguments); + serde_json::from_str::(&f.arguments).context("Failed to parse OpenAPI response") +} diff --git a/substudy/src/services/oai/translate.rs b/substudy/src/services/oai/translate.rs new file mode 100644 index 00000000..21f144b9 --- /dev/null +++ b/substudy/src/services/oai/translate.rs @@ -0,0 +1,218 @@ +//! OpenAI translation. + +use anyhow::anyhow; +use async_openai::{config::OpenAIConfig, types::CreateChatCompletionRequest, Client}; +use indicatif::ProgressBar; +use lazy_static::lazy_static; +use log::{debug, trace}; +use regex::Regex; +use serde::{Deserialize, Serialize}; +use serde_json::json; + +use super::{ + function_tool, function_tool_choice, retry_openai_request, system_message, + tool_call_response, user_message, +}; +use crate::{ + lang::Lang, + progress::default_progress_style, + srt::{Subtitle, SubtitleFile}, + Result, +}; + +/// Always send this many lines in a prompt, and the try to end +/// on a sentence boundary. +const MIN_CHUNK_SIZE: usize = 10; + +/// If we can't find a sentence boundary, end no later than this. +const MAX_CHUNK_SIZE: usize = 15; + +lazy_static! { + /// A JSON Schema for the report_translation"function" we tell OpenAI to + /// call. This is really just the output we want from the LLM. + static ref REPORT_TRANSLATION_PARAMETERS_SCHEMA: serde_json::Value = json!({ + "type": "object", + "properties": { + "lines": { + "type": "array", + "items": { + "type": "object", + "properties": { + "original": { + "type": "string" + }, + "translation": { + "type": "string" + } + }, + "required": [ + "original", + "translation" + ] + } + } + }, + "required": [ + "lines" + ] + }); + + /// Unicode-aware regex for identifying the likely end of a sentence. This + /// includes ".", "!", "?", plus other punctuation used in other languages. + static ref SENTENCE_END: Regex = + Regex::new(r"[\p{Sentence_Terminal}]\s*$").unwrap(); +} + +/// Translate subtitle lines using OpenAI's GPT API. +pub async fn translate_subtitle_file( + file: &SubtitleFile, + to_lang: Lang, +) -> Result { + // Infer the language of the subtitle file. + let from_lang = file.detect_language().ok_or_else(|| { + anyhow!("Could not detect the language of the input subtitle file") + })?; + + // Split into chunks of at least `MIN_CHUNK_SIZE`, but then try to end on a + // sentence boundary. Even if we can't find a sentence boundary, end + // no later than `MAX_CHUNK_SIZE`. + let mut sub_chunks = vec![]; + let mut current_chunk = vec![]; + for sub in &file.subtitles { + current_chunk.push(sub.clone()); + let last_line = sub.lines.last().cloned().unwrap_or_else(|| "".to_owned()); + if current_chunk.len() >= MIN_CHUNK_SIZE + && (current_chunk.len() >= MAX_CHUNK_SIZE + || SENTENCE_END.is_match(&last_line)) + { + sub_chunks.push(current_chunk.clone()); + current_chunk.clear(); + } + } + if current_chunk.len() > 0 { + sub_chunks.push(current_chunk); + } + + let progress = ProgressBar::new(file.subtitles.len() as u64); + progress.set_style(default_progress_style()); + progress.set_prefix("📖 Translating"); + progress.tick(); + + let client = Client::new(); + let mut translated_subs = vec![]; + for chunk in &sub_chunks { + let translated_lines = retry_openai_request(|| { + translate_chunk(&client, chunk, from_lang, to_lang) + }) + .await?; + for (sub, translated) in chunk.iter().zip(translated_lines) { + let mut translated_sub = sub.clone(); + translated_sub.lines = + vec![translated.translation.clone().ok_or_else(|| { + anyhow!( + "OpenAI did not return a translation for a line: {:?}", + translated.original + ) + })?]; + translated_subs.push(translated_sub); + } + progress.inc(chunk.len() as u64); + } + progress.finish(); + + // Reassemble the translated chunks. + Ok(SubtitleFile { + subtitles: translated_subs, + }) +} + +async fn translate_chunk( + client: &Client, + chunk: &[Subtitle], + from_lang: Lang, + to_lang: Lang, +) -> Result> { + let prompt = prompt_from_chunk(chunk, from_lang, to_lang)?; + debug!("OpenAI request (prompt): {}", prompt); + let req = CreateChatCompletionRequest { + model: "gpt-3.5-turbo".to_owned(), + messages: vec![ + system_message("You are a subtitle translator helping language learners."), + user_message(prompt), + ], + tools: Some(vec![function_tool( + "report_translations", + "Report the translations of the lines of dialog.", + &REPORT_TRANSLATION_PARAMETERS_SCHEMA, + )]), + tool_choice: Some(function_tool_choice("report_translations")), + ..Default::default() + }; + trace!("OpenAI request (full): {:?}", req); + let resp = client.chat().create(req).await?; + trace!("OpenAI response (full): {:?}", resp); + let args = tool_call_response::( + &resp, + "report_translations", + )?; + let translated_lines = args.lines; + if translated_lines.len() != chunk.len() { + return Err(anyhow!( + "OpenAI returned the wrong number of translations: {}", + translated_lines.len() + )); + } + Ok(translated_lines) +} + +/// Generate a prompt from a chunk of subtitles. +fn prompt_from_chunk( + chunk: &[Subtitle], + from_lang: Lang, + to_lang: Lang, +) -> Result { + let template = ReportTranslationParameters { + lines: chunk + .iter() + .map(LineTranslation::template_from_subtitle) + .collect(), + }; + let json_template = + serde_json::to_string_pretty(&template).expect("failed to format JSON"); + Ok(format!( + "Translate the following consecutive lines of dialog from {from} to {to}: + +```json\n{template}``` + +Please call the function `report_translation` with your output.", + from = from_lang.english_names()?[0], + to = to_lang.english_names()?[0], + template = json_template, + )) +} + +/// "Parameters" for the `report_translation` function. +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct ReportTranslationParameters { + /// The translated lines. + pub lines: Vec, +} + +/// Translation of a line. +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct LineTranslation { + /// The original line. + pub original: String, + /// The translated line. + pub translation: Option, +} + +impl LineTranslation { + /// Construct a template from a [`Subtitle`]. + pub fn template_from_subtitle(sub: &Subtitle) -> LineTranslation { + LineTranslation { + original: sub.lines.join(" "), + translation: None, + } + } +} diff --git a/substudy/src/video.rs b/substudy/src/video.rs index 4e5292c0..f7f28c53 100644 --- a/substudy/src/video.rs +++ b/substudy/src/video.rs @@ -11,14 +11,17 @@ use std::{ use anyhow::{anyhow, Context as _}; use cast; +use indicatif::ProgressBar; use log::debug; use num::rational::Ratio; -use pbr::ProgressBar; use regex::Regex; use serde::{de, Deserialize, Deserializer}; use serde_json; -use crate::{errors::RunCommandError, lang::Lang, time::Period, Result}; +use crate::{ + errors::RunCommandError, lang::Lang, progress::default_progress_style, + time::Period, Result, +}; /// Information about an MP3 track (optional). #[derive(Debug, Default)] @@ -346,21 +349,24 @@ impl Video { /// batch interface to avoid making too many passes through the file. /// We assume that the extractions are sorted in temporal order. pub fn extract(&self, extractions: &[Extraction]) -> Result<()> { - let mut pb = ProgressBar::new(cast::u64(extractions.len())); - pb.format("[== ]"); + let pb = ProgressBar::new(cast::u64(extractions.len())); + pb.set_style(default_progress_style()); + pb.set_prefix("✂️ Extracting media"); + pb.tick(); + let mut batch: Vec<&Extraction> = vec![]; for e in extractions { if e.spec.can_be_batched() { batch.push(e); } else { self.extract_one(e)?; - pb.inc(); + pb.inc(1); } } for chunk in batch.chunks(20) { self.extract_batch(chunk)?; - pb.add(cast::u64(chunk.len())); + pb.inc(cast::u64(chunk.len())); } Ok(()) }