From 1371eafd0f6a734d5c8264ea0d7a2e630e55a3ef Mon Sep 17 00:00:00 2001 From: CAD97 Date: Tue, 18 Jul 2017 20:44:50 -0400 Subject: [PATCH 1/6] char_property! macro --- components/ucd/utils/src/lib.rs | 1 + components/ucd/utils/src/macros.rs | 209 +++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 components/ucd/utils/src/macros.rs diff --git a/components/ucd/utils/src/lib.rs b/components/ucd/utils/src/lib.rs index f829aabb..f5b57ef3 100644 --- a/components/ucd/utils/src/lib.rs +++ b/components/ucd/utils/src/lib.rs @@ -22,6 +22,7 @@ //! * [**Unicode Code Point**](http://unicode.org/glossary/#code_point) //! * [**Unicode Scalar Value**](http://unicode.org/glossary/#unicode_scalar_value) +mod macros; use std::char::from_u32; use std::ops::Range; diff --git a/components/ucd/utils/src/macros.rs b/components/ucd/utils/src/macros.rs new file mode 100644 index 00000000..72679a8d --- /dev/null +++ b/components/ucd/utils/src/macros.rs @@ -0,0 +1,209 @@ +/// Convenience macro for declaring an enumerated character property. +/// +/// Syntax: +/// +/// ```rust +/// # #[macro_use] extern crate unic_ucd_utils; +/// char_property! { +/// /// Any amount of doc comments describing the property +/// pub enum PropertyName { +/// /// Exactly one line describing the variant +/// RustName: Long_Name / Abbr "Optional display string literal", +/// }; +/// +/// /// Any amount of doc comments describing the module +/// pub mod module_name; +/// } +/// # fn main() {} +/// ``` +/// +/// Of course, any number (one or more) of variants may be included, each terminated by a comma. +/// Once ***[rust-lang/rust#42913]*** reaches stable (1.20), one or more doc comment lines +/// can be used on each variant. Additionally, at that time we can remove the restriction that at +/// least one line must be present. +/// +/// If not specified, the display literal defaults to the Long_Name (stringified).
+/// Note that at this time, either all or none of the display literals must be present. +/// +// TODO: Formalize this with a trait? +/* +trait UnicodeCharacterProperty : Clone + Copy + Debug + Display + PartialEq + Eq { + /// Abbreviated name of this property + fn abbr_name(&self) -> &'static str; + /// Long name of this property + fn long_name(&self) -> &'static str; + + // And optionally + /// Get the property value for this character. + fn of(ch: char) -> Self; +} +*/ +// In that case this macro and that trait should probably be in core. +// +/// `PropertyName::abbr_name(&self) -> &'static str` and +/// `PropertyName::long_name(&self) -> &'static str` are provided by this macro, as well as +/// `#[derive(Clone, Copy, Debug, PartialEq, Eq)]`. The order of variants is unchanged, meaning +/// `#[derive(PartialOrd, Ord)]` can be used on the enum declaration if it makes sense. +/// +/// The module `module_name` is populated with `pub use` aliases for the variants in the abbr form. +/// This module will likely be removed in favor of [`Associated Consts`][rust-lang/rust#42809] once +/// it reaches stable (1.20). +/// +/// [rust-lang/rust#42913]: https://github.com/rust-lang/rust/pull/42913 +/// [rust-lang/rust#42809]: https://github.com/rust-lang/rust/pull/42809 +#[macro_export] +macro_rules! char_property { + // Default Display impl + { + $(#[$_name:meta])* + pub enum $name:ident { + $( + $(#[$_variant:meta])+ + $variant:ident: $long:ident / $abbr:ident, + )+ + }; + + $(#[$_alias:meta])* + pub mod $alias:ident; + } + => + { + char_property! 
{ + $(#[$_name])* + pub enum $name { + $( + $(#[$_variant])+ + $variant: $long/$abbr stringify!($long), + )+ + }; + + $(#[$_alias])* + pub mod $alias; + } + }; + + // Specified Display impl + { + $(#[$_name:meta])* + pub enum $name:ident { + $( + $(#[$_variant:meta])+ + $variant:ident: $long:ident / $abbr:ident $display:expr, + )+ + }; + + $(#[$_alias:meta])* + pub mod $alias:ident; + } + => + { + $(#[$_name])* + #[derive(Clone, Copy, Debug, PartialEq, Eq)] + pub enum $name { + $( + $(#[$_variant])* + $variant + ),+ + } + + impl $name { + /// Abbreviated name of this property + pub fn abbr_name(&self) -> &'static str { + match *self { + $($name::$variant => stringify!($abbr)),+ + } + } + /// Long name of this property + pub fn long_name(&self) -> &'static str { + match *self { + $($name::$variant => stringify!($long)),+ + } + } + } + + impl ::std::fmt::Display for $name { + fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { + match *self { + $( + $name::$variant => + write!(f, $display) + ),+ + } + + } + } + + $(#[$_alias])* + pub mod $alias { + $(pub use super::$name::$variant as $abbr;)+ + } + }; +} + +#[cfg(test)] +mod tests { + char_property! { + /// A very + /// well documented + /// character property + pub enum CustomDisplayProperty { + /// + Variant1: Variant_1 / V1 "Variant_1", + /// + Variant2: Variant_2 / V2 "Property=Variant_2", + /// + Variant3: Variant_3 / V3 "The third variant", + }; + + /// A very + /// well documented + /// abbreviated alias + pub mod cd_abbr; + } + + char_property! 
{ + pub enum ImplicitDisplayProperty { + /// + TheDisplay: The_Display / Td, + /// + IsImplicit: Is_Implicit / Ii, + }; + pub mod id_abbr; + } + + #[test] + fn abbr_name() { + assert_eq!(CustomDisplayProperty::Variant1.abbr_name(), "V1"); + assert_eq!(CustomDisplayProperty::Variant2.abbr_name(), "V2"); + assert_eq!(CustomDisplayProperty::Variant3.abbr_name(), "V3"); + assert_eq!(ImplicitDisplayProperty::TheDisplay.abbr_name(), "Td"); + assert_eq!(ImplicitDisplayProperty::IsImplicit.abbr_name(), "Ii"); + } + + #[test] + fn long_name() { + assert_eq!(CustomDisplayProperty::Variant1.long_name(), "Variant_1"); + assert_eq!(CustomDisplayProperty::Variant2.long_name(), "Variant_2"); + assert_eq!(CustomDisplayProperty::Variant3.long_name(), "Variant_3"); + assert_eq!(ImplicitDisplayProperty::TheDisplay.long_name(), "The_Display"); + assert_eq!(ImplicitDisplayProperty::IsImplicit.long_name(), "Is_Implicit"); + } + + #[test] + fn abbr_mod() { + assert_eq!(CustomDisplayProperty::Variant1, cd_abbr::V1); + assert_eq!(CustomDisplayProperty::Variant2, cd_abbr::V2); + assert_eq!(CustomDisplayProperty::Variant3, cd_abbr::V3); + assert_eq!(ImplicitDisplayProperty::TheDisplay, id_abbr::Td); + assert_eq!(ImplicitDisplayProperty::IsImplicit, id_abbr::Ii); + } + + #[test] + fn display() { + assert_eq!(format!("{}", CustomDisplayProperty::Variant1), "Variant_1"); + assert_eq!(format!("{}", CustomDisplayProperty::Variant2), "Property=Variant_2"); + assert_eq!(format!("{}", CustomDisplayProperty::Variant3), "The third variant"); + assert_eq!(format!("{}", ImplicitDisplayProperty::TheDisplay), "The_Display"); + assert_eq!(format!("{}", ImplicitDisplayProperty::IsImplicit), "Is_Implicit"); + } +} From 20598b4b8b09e74669c229b2b32399c1e0c2c7fa Mon Sep 17 00:00:00 2001 From: CAD97 Date: Tue, 18 Jul 2017 21:01:59 -0400 Subject: [PATCH 2/6] char_property! 
should also imply Hash --- components/ucd/utils/src/macros.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/components/ucd/utils/src/macros.rs b/components/ucd/utils/src/macros.rs index 72679a8d..035aab94 100644 --- a/components/ucd/utils/src/macros.rs +++ b/components/ucd/utils/src/macros.rs @@ -27,7 +27,7 @@ /// // TODO: Formalize this with a trait? /* -trait UnicodeCharacterProperty : Clone + Copy + Debug + Display + PartialEq + Eq { +trait UnicodeCharacterProperty : Clone + Copy + Debug + Display + PartialEq + Eq + Hash { /// Abbreviated name of this property fn abbr_name(&self) -> &'static str; /// Long name of this property @@ -42,8 +42,8 @@ trait UnicodeCharacterProperty : Clone + Copy + Debug + Display + PartialEq + Eq // /// `PropertyName::abbr_name(&self) -> &'static str` and /// `PropertyName::long_name(&self) -> &'static str` are provided by this macro, as well as -/// `#[derive(Clone, Copy, Debug, PartialEq, Eq)]`. The order of variants is unchanged, meaning -/// `#[derive(PartialOrd, Ord)]` can be used on the enum declaration if it makes sense. +/// `#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]`. The order of variants is unchanged, +/// meaning `#[derive(PartialOrd, Ord)]` can be used on the enum declaration if it makes sense. /// /// The module `module_name` is populated with `pub use` aliases for the variants in the abbr form. /// This module will likely be removed in favor of [`Associated Consts`][rust-lang/rust#42809] once @@ -98,7 +98,7 @@ macro_rules! char_property { => { $(#[$_name])* - #[derive(Clone, Copy, Debug, PartialEq, Eq)] + #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub enum $name { $( $(#[$_variant])* From 5d01a3541de5a0afd27fbf6495f2bdfcc9fcf4e6 Mon Sep 17 00:00:00 2001 From: CAD97 Date: Tue, 18 Jul 2017 21:21:12 -0400 Subject: [PATCH 3/6] Reconsider macro Long_Name in favor of "Display Name". Bidi_Class uses names with spaces and hyphens, meaning that the naive $ident match will not work. 
This also cuts down on chaff, because there are no longer two matches in the macro or repeated information (other than that in the docs). The doc repetition I think is valid, as docs should be considered separately and just populating them with the display name would be redundant. --- components/ucd/utils/src/macros.rs | 99 ++++++------------------------ 1 file changed, 20 insertions(+), 79 deletions(-) diff --git a/components/ucd/utils/src/macros.rs b/components/ucd/utils/src/macros.rs index 035aab94..6b3c9d02 100644 --- a/components/ucd/utils/src/macros.rs +++ b/components/ucd/utils/src/macros.rs @@ -8,7 +8,7 @@ /// /// Any amount of doc comments describing the property /// pub enum PropertyName { /// /// Exactly one line describing the variant -/// RustName: Long_Name / Abbr "Optional display string literal", +/// RustName: Abbr "Display Name", /// }; /// /// /// Any amount of doc comments describing the module @@ -20,10 +20,7 @@ /// Of course, any number (one or more) of variants may be included, each terminated by a comma. /// Once ***[rust-lang/rust#42913]*** reaches stable (1.20), one or more doc comment lines /// can be used on each variant. Additionally, at that time we can remove the restriction that at -/// least one line must be present. -/// -/// If not specified, the display literal defaults to the Long_Name (stringified). -/// Note that at this time, either all or none of the display literals must be present. +/// least one doc comment line must be present. /// // TODO: Formalize this with a trait? /* @@ -31,7 +28,7 @@ trait UnicodeCharacterProperty : Clone + Copy + Debug + Display + PartialEq + Eq /// Abbreviated name of this property fn abbr_name(&self) -> &'static str; /// Long name of this property - fn long_name(&self) -> &'static str; + fn name(&self) -> &'static str; // And optionally /// Get the property value for this character.
@@ -41,9 +38,10 @@ trait UnicodeCharacterProperty : Clone + Copy + Debug + Display + PartialEq + Eq // In that case this macro and that trait should probably be in core. // /// `PropertyName::abbr_name(&self) -> &'static str` and -/// `PropertyName::long_name(&self) -> &'static str` are provided by this macro, as well as +/// `PropertyName::name(&self) -> &'static str` are provided by this macro, as well as /// `#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]`. The order of variants is unchanged, /// meaning `#[derive(PartialOrd, Ord)]` can be used on the enum declaration if it makes sense. +/// `std::fmt::Display` is also implemented to write the display name of the property. /// /// The module `module_name` is populated with `pub use` aliases for the variants in the abbr form. /// This module will likely be removed in favor of [`Associated Consts`][rust-lang/rust#42809] once @@ -53,42 +51,12 @@ trait UnicodeCharacterProperty : Clone + Copy + Debug + Display + PartialEq + Eq /// [rust-lang/rust#42809]: https://github.com/rust-lang/rust/pull/42809 #[macro_export] macro_rules! char_property { - // Default Display impl { $(#[$_name:meta])* pub enum $name:ident { $( $(#[$_variant:meta])+ - $variant:ident: $long:ident / $abbr:ident, - )+ - }; - - $(#[$_alias:meta])* - pub mod $alias:ident; - } - => - { - char_property! { - $(#[$_name])* - pub enum $name { - $( - $(#[$_variant])+ - $variant: $long/$abbr stringify!($long), - )+ - }; - - $(#[$_alias])* - pub mod $alias; - } - }; - - // Specified Display impl - { - $(#[$_name:meta])* - pub enum $name:ident { - $( - $(#[$_variant:meta])+ - $variant:ident: $long:ident / $abbr:ident $display:expr, + $variant:ident: $abbr:ident $long:expr, )+ }; @@ -113,23 +81,17 @@ macro_rules!
char_property { $($name::$variant => stringify!($abbr)),+ } } - /// Long name of this property - pub fn long_name(&self) -> &'static str { + /// Name of this property + pub fn name(&self) -> &'static str { match *self { - $($name::$variant => stringify!($long)),+ + $($name::$variant => $long),+ } } } impl ::std::fmt::Display for $name { fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { - match *self { - $( - $name::$variant => - write!(f, $display) - ),+ - } - + write!(f, "{}", self.name()) } } @@ -146,47 +108,30 @@ mod tests { /// A very /// well documented /// character property - pub enum CustomDisplayProperty { + pub enum Property { /// - Variant1: Variant_1 / V1 "Variant_1", + Variant1: V1 "Variant_1", /// - Variant2: Variant_2 / V2 "Property=Variant_2", + Variant2: V2 "Variant 2", /// - Variant3: Variant_3 / V3 "The third variant", + Variant3: V3 "Variant-3", }; - /// A very - /// well documented - /// abbreviated alias pub mod cd_abbr; } - char_property! { - pub enum ImplicitDisplayProperty { - /// - TheDisplay: The_Display / Td, - /// - IsImplicit: Is_Implicit / Ii, - }; - pub mod id_abbr; - } - #[test] fn abbr_name() { assert_eq!(CustomDisplayProperty::Variant1.abbr_name(), "V1"); assert_eq!(CustomDisplayProperty::Variant2.abbr_name(), "V2"); assert_eq!(CustomDisplayProperty::Variant3.abbr_name(), "V3"); - assert_eq!(ImplicitDisplayProperty::TheDisplay.abbr_name(), "Td"); - assert_eq!(ImplicitDisplayProperty::IsImplicit.abbr_name(), "Ii"); } #[test] fn long_name() { - assert_eq!(CustomDisplayProperty::Variant1.long_name(), "Variant_1"); - assert_eq!(CustomDisplayProperty::Variant2.long_name(), "Variant_2"); - assert_eq!(CustomDisplayProperty::Variant3.long_name(), "Variant_3"); - assert_eq!(ImplicitDisplayProperty::TheDisplay.long_name(), "The_Display"); - assert_eq!(ImplicitDisplayProperty::IsImplicit.long_name(), "Is_Implicit"); + assert_eq!(CustomDisplayProperty::Variant1.name(), "Variant_1"); + 
assert_eq!(CustomDisplayProperty::Variant2.name(), "Variant 2"); + assert_eq!(CustomDisplayProperty::Variant3.name(), "Variant-3"); } #[test] @@ -194,16 +139,12 @@ mod tests { assert_eq!(CustomDisplayProperty::Variant1, cd_abbr::V1); assert_eq!(CustomDisplayProperty::Variant2, cd_abbr::V2); assert_eq!(CustomDisplayProperty::Variant3, cd_abbr::V3); - assert_eq!(ImplicitDisplayProperty::TheDisplay, id_abbr::Td); - assert_eq!(ImplicitDisplayProperty::IsImplicit, id_abbr::Ii); } #[test] fn display() { - assert_eq!(format!("{}", CustomDisplayProperty::Variant1), "Variant_1"); - assert_eq!(format!("{}", CustomDisplayProperty::Variant2), "Property=Variant_2"); - assert_eq!(format!("{}", CustomDisplayProperty::Variant3), "The third variant"); - assert_eq!(format!("{}", ImplicitDisplayProperty::TheDisplay), "The_Display"); - assert_eq!(format!("{}", ImplicitDisplayProperty::IsImplicit), "Is_Implicit"); + assert_eq!(format!("{}", CustomDisplayProperty::Variant1), CustomDisplayProperty::Variant1.name()); + assert_eq!(format!("{}", CustomDisplayProperty::Variant2), CustomDisplayProperty::Variant2.name()); + assert_eq!(format!("{}", CustomDisplayProperty::Variant3), CustomDisplayProperty::Variant3.name()); } } From ac5f1302708c9bd8e2cee424f4a164b5f29e0687 Mon Sep 17 00:00:00 2001 From: CAD97 Date: Tue, 18 Jul 2017 22:20:55 -0400 Subject: [PATCH 4/6] Let Bidi_Class use the char_property! 
macro --- components/ucd/bidi/Cargo.toml | 1 + components/ucd/bidi/src/bidi_class.rs | 296 ++++++++++++++------------ components/ucd/bidi/src/lib.rs | 3 + 3 files changed, 158 insertions(+), 142 deletions(-) diff --git a/components/ucd/bidi/Cargo.toml b/components/ucd/bidi/Cargo.toml index b81fa34d..c0f38aaf 100644 --- a/components/ucd/bidi/Cargo.toml +++ b/components/ucd/bidi/Cargo.toml @@ -13,3 +13,4 @@ travis-ci = { repository = "behnam/rust-unic", branch = "master" } [dependencies] unic-ucd-core = { path = "../core/", version = "0.4.0" } +unic-ucd-utils = { path = "../utils/", version = "0.4.0" } diff --git a/components/ucd/bidi/src/bidi_class.rs b/components/ucd/bidi/src/bidi_class.rs index ebbb4b12..4499822e 100644 --- a/components/ucd/bidi/src/bidi_class.rs +++ b/components/ucd/bidi/src/bidi_class.rs @@ -10,74 +10,162 @@ // except according to those terms. use std::cmp::Ordering; -use std::fmt; -/// Represents the Unicode character -/// [*Bidi_Class*](http://www.unicode.org/reports/tr44/#Bidi_Class) property, also known as the -/// *bidirectional character type*. -/// -/// * -/// * -#[derive(Clone, Copy, Eq, PartialEq, Debug, Hash)] -#[allow(missing_docs)] -pub enum BidiClass { - ArabicLetter, - ArabicNumber, - ParagraphSeparator, - BoundaryNeutral, - CommonSeparator, - EuropeanNumber, - EuropeanSeparator, - EuropeanTerminator, - FirstStrongIsolate, - LeftToRight, - LeftToRightEmbedding, - LeftToRightIsolate, - LeftToRightOverride, - NonspacingMark, - OtherNeutral, - PopDirectionalFormat, - PopDirectionalIsolate, - RightToLeft, - RightToLeftEmbedding, - RightToLeftIsolate, - RightToLeftOverride, - SegmentSeparator, - WhiteSpace, - // [UNIC_UPDATE_ON_UNICODE_UPDATE] Source: `tables/bidi_class_type.rsv` -} - - -/// Abbreviated name aliases for -/// [*Bidi_Class*](http://www.unicode.org/reports/tr44/#Bidi_Class) property. 
-/// -/// -pub mod abbr_names { - pub use BidiClass::ArabicLetter as AL; - pub use BidiClass::ArabicNumber as AN; - pub use BidiClass::ParagraphSeparator as B; - pub use BidiClass::BoundaryNeutral as BN; - pub use BidiClass::CommonSeparator as CS; - pub use BidiClass::EuropeanNumber as EN; - pub use BidiClass::EuropeanSeparator as ES; - pub use BidiClass::EuropeanTerminator as ET; - pub use BidiClass::FirstStrongIsolate as FSI; - pub use BidiClass::LeftToRight as L; - pub use BidiClass::LeftToRightEmbedding as LRE; - pub use BidiClass::LeftToRightIsolate as LRI; - pub use BidiClass::LeftToRightOverride as LRO; - pub use BidiClass::NonspacingMark as NSM; - pub use BidiClass::OtherNeutral as ON; - pub use BidiClass::PopDirectionalFormat as PDF; - pub use BidiClass::PopDirectionalIsolate as PDI; - pub use BidiClass::RightToLeft as R; - pub use BidiClass::RightToLeftEmbedding as RLE; - pub use BidiClass::RightToLeftIsolate as RLI; - pub use BidiClass::RightToLeftOverride as RLO; - pub use BidiClass::SegmentSeparator as S; - pub use BidiClass::WhiteSpace as WS; - // [UNIC_UPDATE_ON_UNICODE_UPDATE] Source: `tables/bidi_class_type.rsv` +char_property! { + // TODO: Once 1.20 comes, add the rest of the enum variants' docs + /// Represents the Unicode character + /// [*Bidi_Class*](http://www.unicode.org/reports/tr44/#Bidi_Class) property, also known as the + /// *bidirectional character type*. + /// + /// * + /// * + pub enum BidiClass { + // == Strong == // + + /// Any strong left-to-right character + // /// + // /// ***General Scope*** + // /// + // /// LRM, most alphabetic, syllabic, Han ideographs, + // /// non-European or non-Arabic digits, ... 
+ LeftToRight: L "Left-to-Right", + + /// Any strong right-to-left (non-Arabic-type) character + // /// + // /// ***General Scope*** + // /// + // /// RLM, Hebrew alphabet, and related punctuation + RightToLeft: R "Right-to-Left", + + /// Any strong right-to-left (Arabic-type) character + // /// + // /// ***General Scope*** + // /// + // /// ALM, Arabic, Thaana, and Syriac alphabets, + // /// most punctuation specific to those scripts, ... + ArabicLetter: AL "Right-to-Left Arabic", + + // == Weak == // + + /// Any ASCII digit or Eastern Arabic-Indic digit + // /// + // /// ***General Scope*** + // /// + // /// European digits, Eastern Arabic-Indic digits, ... + EuropeanNumber: EN "European Number", + + /// Plus and minus signs + // /// + // /// ***General Scope*** + // /// + // /// PLUS SIGN, MINUS SIGN + EuropeanSeparator: ES "European Number Separator", + + /// A terminator in a numeric format context, includes currency signs + // /// + // /// ***General Scope*** + // /// + // /// DEGREE SIGN, currency symbols, ... + EuropeanTerminator: ET "European Number Terminator", + + /// Any Arabic-Indic digit + // /// + // /// ***General Scope*** + // /// + // /// Arabic-Indic digits, Arabic decimal and thousands separators, ... + ArabicNumber: AN "Arabic Number", + + /// Commas, colons, and slashes + // /// + // /// ***General Scope*** + // /// + // /// COLON, COMMA, FULL STOP, NO-BREAK SPACE, ... + CommonSeparator: CS "Common Number Separator", + + /// Any nonspacing mark + // /// + // /// ***General Scope*** + // /// + // /// Characters with the General_Category values: + // /// Mn (Nonspacing_Mark) and Me (Enclosing_Mark) + NonspacingMark: NSM "Nonspacing Mark", + + /// Most format characters, control codes, or noncharacters + // /// + // /// ***General Scope*** + // /// + // /// Default ignorables, non-characters, and control characters, + // /// other than those explicitly given other types.
+ BoundaryNeutral: BN "Boundary Neutral", + + // == Neutral == // + + /// Various newline characters + // /// + // /// ***General Scope*** + // /// + // /// PARAGRAPH SEPARATOR, appropriate Newline Functions, + // /// higher-level protocol paragraph determination + ParagraphSeparator: B "Paragraph Separator", + + /// Various segment-related control codes + // /// + // /// ***General Scope*** + // /// + // /// *Tab* + SegmentSeparator: S "Segment Separator", + + /// Spaces + // /// + // /// ***General Scope*** + // /// + // /// SPACE, FIGURE SPACE, LINE SEPARATOR, FORM FEED, + // /// General Punctuation spaces, ... + WhiteSpace: WS "Whitespace", + + /// Most other symbols and punctuation marks + // /// + // /// ***General Scope*** + // /// + // /// All other characters, including OBJECT REPLACEMENT CHARACTER + OtherNeutral: ON "Other Neutrals", + + // == Explicit Formatting == // + + /// U+202A: The LR embedding control + LeftToRightEmbedding: LRE "Left-to-Right Embedding", + + /// U+202D: The LR override control + LeftToRightOverride: LRO "Left-to-Right Override", + + /// U+202B: The RL embedding control + RightToLeftEmbedding: RLE "Right-to-Left Embedding", + + /// U+202E: The RL override control + RightToLeftOverride: RLO "Right-to-Left Override", + + /// U+202C: Terminates an embedding or override control + PopDirectionalFormat: PDF "Pop Directional Format", + + /// U+2066: The LR isolate control + LeftToRightIsolate: LRI "Left-to-Right Isolate", + + /// U+2067: The RL isolate control + RightToLeftIsolate: RLI "Right-to-Left Isolate", + + /// U+2068: The first strong isolate control + FirstStrongIsolate: FSI "First Strong Isolate", + + /// U+2069: Terminates an isolate control + PopDirectionalIsolate: PDI "Pop Directional Isolate", + }; + + /// Abbreviated name aliases for + /// [*Bidi_Class*](http://www.unicode.org/reports/tr44/#Bidi_Class) property.
+ /// + /// + pub mod abbr_names; } @@ -112,76 +200,6 @@ impl BidiClass { bsearch_range_value_table(ch, BIDI_CLASS_TABLE) } - /// Abbreviated name of the Bidi Class property value. - /// - /// - pub fn abbr_name(&self) -> &str { - match *self { - BidiClass::ArabicLetter => "AL", - BidiClass::ArabicNumber => "AN", - BidiClass::ParagraphSeparator => "B", - BidiClass::BoundaryNeutral => "BN", - BidiClass::CommonSeparator => "CS", - BidiClass::EuropeanNumber => "EN", - BidiClass::EuropeanSeparator => "ES", - BidiClass::EuropeanTerminator => "ET", - BidiClass::FirstStrongIsolate => "FSI", - BidiClass::LeftToRight => "L", - BidiClass::LeftToRightEmbedding => "LRE", - BidiClass::LeftToRightIsolate => "LRI", - BidiClass::LeftToRightOverride => "LRO", - BidiClass::NonspacingMark => "NSM", - BidiClass::OtherNeutral => "ON", - BidiClass::PopDirectionalFormat => "PDF", - BidiClass::PopDirectionalIsolate => "PDI", - BidiClass::RightToLeft => "R", - BidiClass::RightToLeftEmbedding => "RLE", - BidiClass::RightToLeftIsolate => "RLI", - BidiClass::RightToLeftOverride => "RLO", - BidiClass::SegmentSeparator => "S", - BidiClass::WhiteSpace => "WS", - } - } - - /// Human-readable description of the Bidi Class property value. 
- /// - /// - #[inline] - pub fn display(&self) -> &str { - match *self { - // Strong - L => "Left-to-Right", - R => "Right-to-Left", - AL => "Right-to-Left Arabic", - - // Weak - EN => "European Number", - ES => "European Number Separator", - ET => "European Number Terminator", - AN => "Arabic Number", - CS => "Common Number Separator", - NSM => "Nonspacing Mark", - BN => "Boundary Neutral", - - // Neutral - B => "Paragraph Separator", - S => "Segment Separator", - WS => "Whitespace", - ON => "Other Neutrals", - - // Explicit Formatting - LRE => "Left-to-Right Embedding", - LRO => "Left-to-Right Override", - RLE => "Right-to-Left Embedding", - RLO => "Right-to-Left Override", - PDF => "Pop Directional Format", - LRI => "Left-to-Right Isolate", - RLI => "Right-to-Left Isolate", - FSI => "First Strong Isolate", - PDI => "Pop Directional Isolate", - } - } - /// If the `BidiClass` has strong or explicit Left-to-Right direction. #[inline] pub fn category(&self) -> BidiClassCategory { @@ -232,12 +250,6 @@ fn bsearch_range_value_table(c: char, r: &'static [(char, char, BidiClass)]) -> } } -impl fmt::Display for BidiClass { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.display()) - } -} - #[cfg(test)] mod tests { diff --git a/components/ucd/bidi/src/lib.rs b/components/ucd/bidi/src/lib.rs index 79fa7d6a..fe127463 100644 --- a/components/ucd/bidi/src/lib.rs +++ b/components/ucd/bidi/src/lib.rs @@ -19,6 +19,9 @@ //! //! Accessor for `Bidi_Class` property from Unicode Character Database (UCD) +#[macro_use] +#[no_link] +extern crate unic_ucd_utils; extern crate unic_ucd_core; From 2b9403689baa2c2a4db6d6c8d168945858f7e636 Mon Sep 17 00:00:00 2001 From: CAD97 Date: Tue, 18 Jul 2017 22:45:29 -0400 Subject: [PATCH 5/6] Let General_Category use the char_property! macro The display names come from Unicode Chapter 4 Table 4-4. 
General Category --- components/ucd/category/Cargo.toml | 3 +- components/ucd/category/src/category.rs | 202 +++++++++++++----------- components/ucd/category/src/lib.rs | 3 + 3 files changed, 118 insertions(+), 90 deletions(-) diff --git a/components/ucd/category/Cargo.toml b/components/ucd/category/Cargo.toml index 2a456a9c..76ef8ede 100644 --- a/components/ucd/category/Cargo.toml +++ b/components/ucd/category/Cargo.toml @@ -12,5 +12,6 @@ description = "UNIC - Unicode Character Database - General Category" travis-ci = { repository = "behnam/rust-unic", branch = "master" } [dependencies] -unic-ucd-core = { path = "../core/", version = "0.4.0" } matches = "0.1.6" +unic-ucd-core = { path = "../core/", version = "0.4.0" } +unic-ucd-utils = { path = "../utils/", version = "0.4.0" } diff --git a/components/ucd/category/src/category.rs b/components/ucd/category/src/category.rs index 2a268a5e..c28cde84 100644 --- a/components/ucd/category/src/category.rs +++ b/components/ucd/category/src/category.rs @@ -10,77 +10,114 @@ use std::cmp::Ordering; -/// Represents the Unicode Character -/// [*General Category*](http://unicode.org/reports/tr44/#General_Category) property. -/// -/// This is a useful breakdown into various character types which can be used as a default -/// categorization in implementations. For the property values, see -/// [*General Category Values*](http://unicode.org/reports/tr44/#General_Category_Values). 
-#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub enum GeneralCategory { - /// An uppercase letter (Short form: `Lu`) - UppercaseLetter, - /// A lowercase letter (Short form: `Ll`) - LowercaseLetter, - /// A digraphic character, with first part uppercase (Short form: `Lt`) - TitlecaseLetter, - /// A modifier letter (Short form: `Lm`) - ModifierLetter, - /// Other letters, including syllables and ideographs (Short form: `Lo`) - OtherLetter, - /// A nonspacing combining mark (zero advance width) (Short form: `Mn`) - NonspacingMark, - /// A spacing combining mark (positive advance width) (Short form: `Mc`) - SpacingMark, - /// An enclosing combining mark (Short form: `Me`) - EnclosingMark, - /// A decimal digit (Short form: `Nd`) - DecimalNumber, - /// A letterlike numeric character (Short form: `Nl`) - LetterNumber, - /// A numeric character of other type (Short form: `No`) - OtherNumber, - /// A connecting punctuation mark, like a tie (Short form: `Pc`) - ConnectorPunctuation, - /// A dash or hyphen punctuation mark (Short form: `Pd`) - DashPunctuation, - /// An opening punctuation mark (of a pair) (Short form: `Ps`) - OpenPunctuation, - /// A closing punctuation mark (of a pair) (Short form: `Pe`) - ClosePunctuation, - /// An initial quotation mark (Short form: `Pi`) - InitialPunctuation, - /// A final quotation mark (Short form: `Pf`) - FinalPunctuation, - /// A punctuation mark of other type (Short form: `Po`) - OtherPunctuation, - /// A symbol of mathematical use (Short form: `Sm`) - MathSymbol, - /// A currency sign (Short form: `Sc`) - CurrencySymbol, - /// A non-letterlike modifier symbol (Short form: `Sk`) - ModifierSymbol, - /// A symbol of other type (Short form: `So`) - OtherSymbol, - /// A space character (of various non-zero widths) (Short form: `Zs`) - SpaceSeparator, - /// U+2028 LINE SEPARATOR only (Short form: `Zl`) - LineSeparator, - /// U+2029 PARAGRAPH SEPARATOR only (Short form: `Zp`) - ParagraphSeparator, - /// A C0 or C1 control code (Short 
form: `Cc`) - Control, - /// A format control character (Short form: `Cf`) - Format, - /// A surrogate code point (Short form: `Cs`) - Surrogate, - /// A private-use character (Short form: `Co`) - PrivateUse, - /// Unassigned (Short form: `Cn`) - Unassigned, +char_property! { + /// Represents the Unicode Character + /// [*General Category*](http://unicode.org/reports/tr44/#General_Category) property. + /// + /// This is a useful breakdown into various character types which can be used as a default + /// categorization in implementations. For the property values, see + /// [*General Category Values*](http://unicode.org/reports/tr44/#General_Category_Values). + pub enum GeneralCategory { + /// An uppercase letter + UppercaseLetter: Lu "Letter, uppercase", + + /// A lowercase letter + LowercaseLetter: Ll "Letter, lowercase", + + /// A digraphic character, with first part uppercase + TitlecaseLetter: Lt "Letter, titlecase", + + /// A modifier letter + ModifierLetter: Lm "Letter, modifier", + + /// Other letters, including syllables and ideographs + OtherLetter: Lo "Letter, other", + + /// A nonspacing combining mark (zero advance width) + NonspacingMark: Mn "Mark, nonspacing", + + /// A spacing combining mark (positive advance width) + SpacingMark: Mc "Mark, spacing combining", + + /// An enclosing combining mark + EnclosingMark: Me "Mark, enclosing", + + /// A decimal digit + DecimalNumber: Nd "Number, decimal digit", + + /// A letterlike numeric character + LetterNumber: Nl "Number, letter", + + /// A numeric character of other type + OtherNumber: No "Number, other", + + /// A connecting punctuation mark, like a tie + ConnectorPunctuation: Pc "Punctuation, connector", + + /// A dash or hyphen punctuation mark + DashPunctuation: Pd "Punctuation, dash", + + /// An opening punctuation mark (of a pair) + OpenPunctuation: Ps "Punctuation, open", + + /// A closing punctuation mark (of a pair) + ClosePunctuation: Pe "Punctuation, close", + + /// An initial quotation mark +
InitialPunctuation: Pi "Punctuation, initial quote", + + /// A final quotation mark + FinalPunctuation: Pf "Punctuation, final quote", + + /// A punctuation mark of other type + OtherPunctuation: Po "Punctuation, other", + + /// A symbol of mathematical use + MathSymbol: Sm "Symbol, math", + + /// A currency sign + CurrencySymbol: Sc "Symbol, currency", + + /// A non-letterlike modifier symbol + ModifierSymbol: Sk "Symbol, modifier", + + /// A symbol of other type + OtherSymbol: So "Symbol, other", + + /// A space character (of various non-zero widths) + SpaceSeparator: Zs "Separator, space", + + /// U+2028 LINE SEPARATOR only + LineSeparator: Zl "Separator, line", + + /// U+2029 PARAGRAPH SEPARATOR only + ParagraphSeparator: Zp "Separator, paragraph", + + /// A C0 or C1 control code + Control: Cc "Other, control", + + /// A format control character + Format: Cf "Other, format", + + /// A surrogate code point + Surrogate: Cs "Other, surrogate", + + /// A private-use character + PrivateUse: Co "Other, private use", + + /// Unassigned + Unassigned: Cn "Other, not assigned", + }; + + /// Abbreviated name aliases for the + /// [*General Category*](http://unicode.org/reports/tr44/#General_Category) property.
+ /// + /// + pub mod abbr_names; } use self::GeneralCategory::*; +use self::abbr_names::*; const GENERAL_CATEGORY_TABLE: &'static [(char, char, GeneralCategory)] = include!("tables/general_category.rsv"); @@ -95,55 +132,42 @@ impl GeneralCategory { impl GeneralCategory { /// `Lu` | `Ll` | `Lt` (Short form: `LC`) pub fn is_cased_letter(&self) -> bool { - matches!(*self, UppercaseLetter | LowercaseLetter | TitlecaseLetter) + matches!(*self, Lu | Ll | Lt) } /// `Lu` | `Ll` | `Lt` | `Lm` | `Lo` (Short form: `L`) pub fn is_letter(&self) -> bool { - matches!( - *self, - UppercaseLetter | LowercaseLetter | TitlecaseLetter | ModifierLetter | OtherLetter - ) + matches!(*self, Lu | Ll | Lt | Lm | Lo) } /// `Mn` | `Mc` | `Me` (Short form: `M`) pub fn is_mark(&self) -> bool { - matches!(*self, NonspacingMark | SpacingMark | EnclosingMark) + matches!(*self, Mn | Mc | Me) } /// `Nd` | `Nl` | `No` (Short form: `N`) pub fn is_number(&self) -> bool { - matches!(*self, DecimalNumber | LetterNumber | OtherNumber) + matches!(*self, Nd | Nl | No) } /// `Pc` | `Pd` | `Ps` | `Pe` | `Pi` | `Pf` | `Po` (Short form: `P`) pub fn is_punctuation(&self) -> bool { - matches!( - *self, - ConnectorPunctuation | DashPunctuation | OpenPunctuation | ClosePunctuation | - InitialPunctuation | FinalPunctuation | OtherPunctuation - ) + matches!(*self, Pc | Pd | Ps | Pe | Pi | Pf | Po) } /// `Sm` | `Sc` | `Sk` | `So` (Short form: `S`) pub fn is_symbol(&self) -> bool { - matches!( - *self, - MathSymbol | CurrencySymbol | ModifierLetter | OtherSymbol - ) + matches!(*self, Sm | Sc | Sk | So) } /// `Zs` | `Zl` | `Zp` (Short form: `Z`) pub fn is_separator(&self) -> bool { - matches!(*self, SpaceSeparator | LineSeparator | ParagraphSeparator) + matches!(*self, Zs | Zl | Zp) } /// `Cc` | `Cf` | `Cs` | `Co` | `Cn` (Short form: `C`) pub fn is_other(&self) -> bool { - matches!( - *self, - Control | Format | Surrogate | PrivateUse | Unassigned - ) + matches!(*self, Cc | Cf | Cs | Co | Cn) } } diff --git 
a/components/ucd/category/src/lib.rs b/components/ucd/category/src/lib.rs index b35312d9..5aa77ce9 100644 --- a/components/ucd/category/src/lib.rs +++ b/components/ucd/category/src/lib.rs @@ -40,6 +40,9 @@ #[macro_use] extern crate matches; extern crate unic_ucd_core; +#[no_link] +#[macro_use] +extern crate unic_ucd_utils; mod category; From c4f163a19f68f95a767b4f8345e3c04f9ac32c65 Mon Sep 17 00:00:00 2001 From: CAD97 Date: Tue, 18 Jul 2017 23:57:13 -0400 Subject: [PATCH 6/6] _._ of all the things to miss >.> --- components/ucd/utils/src/macros.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/components/ucd/utils/src/macros.rs b/components/ucd/utils/src/macros.rs index 6b3c9d02..469712ad 100644 --- a/components/ucd/utils/src/macros.rs +++ b/components/ucd/utils/src/macros.rs @@ -122,29 +122,29 @@ mod tests { #[test] fn abbr_name() { - assert_eq!(CustomDisplayProperty::Variant1.abbr_name(), "V1"); - assert_eq!(CustomDisplayProperty::Variant2.abbr_name(), "V2"); - assert_eq!(CustomDisplayProperty::Variant3.abbr_name(), "V3"); + assert_eq!(Property::Variant1.abbr_name(), "V1"); + assert_eq!(Property::Variant2.abbr_name(), "V2"); + assert_eq!(Property::Variant3.abbr_name(), "V3"); } #[test] fn long_name() { - assert_eq!(CustomDisplayProperty::Variant1.name(), "Variant_1"); - assert_eq!(CustomDisplayProperty::Variant2.name(), "Variant 2"); - assert_eq!(CustomDisplayProperty::Variant3.name(), "Variant-3"); + assert_eq!(Property::Variant1.name(), "Variant_1"); + assert_eq!(Property::Variant2.name(), "Variant 2"); + assert_eq!(Property::Variant3.name(), "Variant-3"); } #[test] fn abbr_mod() { - assert_eq!(CustomDisplayProperty::Variant1, cd_abbr::V1); - assert_eq!(CustomDisplayProperty::Variant2, cd_abbr::V2); - assert_eq!(CustomDisplayProperty::Variant3, cd_abbr::V3); + assert_eq!(Property::Variant1, cd_abbr::V1); + assert_eq!(Property::Variant2, cd_abbr::V2); + assert_eq!(Property::Variant3, cd_abbr::V3); } #[test] fn 
display() { - assert_eq!(format!("{}", CustomDisplayProperty::Variant1), CustomDisplayProperty::Variant1.name()); - assert_eq!(format!("{}", CustomDisplayProperty::Variant2), CustomDisplayProperty::Variant2.name()); - assert_eq!(format!("{}", CustomDisplayProperty::Variant3), CustomDisplayProperty::Variant3.name()); + assert_eq!(format!("{}", Property::Variant1), Property::Variant1.name()); + assert_eq!(format!("{}", Property::Variant2), Property::Variant2.name()); + assert_eq!(format!("{}", Property::Variant3), Property::Variant3.name()); } }