Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial implementation for unic-ucd-unihan #225

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions gen/src/source/ucd/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ pub mod prop_list;
pub mod readme;
pub mod sentence_break_property;
pub mod unicode_data;
pub mod unihan;
pub mod word_break_property;

use regex::Regex;
Expand Down
65 changes: 65 additions & 0 deletions gen/src/source/ucd/unihan/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

pub mod numeric_values;
pub mod readings;
pub mod variants;

use std::char;
use std::collections::BTreeMap;

use regex::Regex;

lazy_static! {
/// Regex matching one Unihan data line of the form
/// `U+<hex code point>\t<field key>\t<field value>`.
///
/// Capture groups:
///
/// 1. the 4 to 6 hexadecimal digits following `U+`,
/// 2. the field key, which always starts with `k`,
/// 3. the field value, i.e. the rest of the line after the second tab.
///
/// The `m` flag lets `^` match at the start of every line, and the `x` flag
/// makes the whitespace and `#` comments inside the pattern insignificant.
/// Lines that do not start with `U+` never match.
pub static ref UNIHAN_DATA_ENTRY_REGEX: Regex = Regex::new(
r"(?xm)^ # every line
U\+([[:xdigit:]]{4,6}) # [1]codepoint
\t # separator
(k[a-zA-Z0-9_]+) # [2]field key
\t # separator
(.*) # [3]field value
",
)
.unwrap();
}

/// Accumulates the fields that a Unihan data file lists for one character.
///
/// `parse_entries_from_str` creates one value per character with `new` and
/// then calls `update` once for every data line belonging to that character.
pub trait DataEntry {
/// Creates the entry for `character`.
fn new(character: char) -> Self;
/// Records the raw text `value` of the field named `key` in this entry.
fn update<'a>(&mut self, key: &'a str, value: &'a str);
}

/// Parses Unihan-formatted text into one entry per character.
///
/// Every line matched by `UNIHAN_DATA_ENTRY_REGEX` contributes its field key
/// and field value to the entry of the character named on that line; the
/// entry is created with `T::new` the first time a character is seen. Lines
/// that do not match the regex are skipped.
///
/// The entries are returned in ascending code point order, because they are
/// accumulated in a `BTreeMap` keyed by character.
///
/// # Panics
///
/// Panics if a matched code point is not a valid Unicode scalar value.
pub fn parse_entries_from_str<T>(str: &str) -> Vec<T>
where
    T: DataEntry + Clone,
{
    let mut entry_map: BTreeMap<char, T> = BTreeMap::new();

    for capture in UNIHAN_DATA_ENTRY_REGEX.captures_iter(str) {
        // The regex only admits 4 to 6 hex digits, which always fit in a `u32`.
        let code_point = u32::from_str_radix(&capture[1], 16).unwrap();
        let chr = char::from_u32(code_point).unwrap();

        // A single map lookup: fetch the existing entry or create it on demand.
        entry_map
            .entry(chr)
            .or_insert_with(|| T::new(chr))
            .update(&capture[2], &capture[3]);
    }

    // Move the finished entries out of the map instead of cloning each one.
    entry_map.into_iter().map(|(_, entry)| entry).collect()
}
96 changes: 96 additions & 0 deletions gen/src/source/ucd/unihan/numeric_values.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::str::FromStr;

use crate::source::utils::read;

use super::{parse_entries_from_str, DataEntry};

lazy_static! {
/// Contents of `Unihan_NumericValues.txt`, parsed into `NumericValuesData`
/// the first time this static is dereferenced.
///
/// Initialization unwraps the parse result, so it panics if parsing the text
/// returned by `read` fails.
///
/// See the [Numeric values] section of UAX #38.
///
/// [Numeric values]: http://www.unicode.org/reports/tr38/#N1024D
pub static ref UNIHAN_NUMERIC_VALUES_DATA: NumericValuesData = {
read("external/unicode/ucd/data/Unihan/Unihan_NumericValues.txt").parse().unwrap()
};
}

/// Numeric value fields recorded for a single character.
///
/// A field is `None` until a data line for it has been seen, and also when
/// the value on that line does not parse as a `u64`.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct NumericValuesDataEntry {
/// The character these values belong to.
pub character: char,
/// Value of the `kAccountingNumeric` field.
pub accounting_numeric: Option<u64>,
/// Value of the `kOtherNumeric` field.
pub other_numeric: Option<u64>,
/// Value of the `kPrimaryNumeric` field.
pub primary_numeric: Option<u64>,
}

impl DataEntry for NumericValuesDataEntry {
    /// Creates an entry for `character` with no numeric values recorded yet.
    fn new(character: char) -> NumericValuesDataEntry {
        NumericValuesDataEntry {
            character,
            accounting_numeric: None,
            other_numeric: None,
            primary_numeric: None,
        }
    }

    /// Stores `value`, parsed as a `u64`, in the field selected by `key`.
    ///
    /// Keys other than the three numeric field names are ignored. A value
    /// that does not parse as a `u64` sets the selected field to `None`.
    fn update<'a>(&mut self, key: &'a str, value: &'a str) {
        // Pick the destination first so the parsing logic is written once.
        let slot = match key {
            "kAccountingNumeric" => &mut self.accounting_numeric,
            "kOtherNumeric" => &mut self.other_numeric,
            "kPrimaryNumeric" => &mut self.primary_numeric,
            _ => return,
        };
        *slot = value.parse::<u64>().ok();
    }
}

/// All numeric value entries parsed from one Unihan numeric values text.
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct NumericValuesData {
/// One entry per character, as returned by `parse_entries_from_str`.
pub entries: Box<[NumericValuesDataEntry]>,
}

impl FromStr for NumericValuesData {
    type Err = ();

    /// Builds the data set from the text of a Unihan numeric values file.
    ///
    /// This implementation always returns `Ok`.
    fn from_str(str: &str) -> Result<Self, Self::Err> {
        let entries = parse_entries_from_str(str).into_boxed_slice();
        Ok(NumericValuesData { entries })
    }
}

#[cfg(test)]
mod test {
    use super::super::DataEntry;
    use super::{NumericValuesData, NumericValuesDataEntry};

    /// Parses three sample data lines, one per numeric field, and checks that
    /// each value ends up in the matching field of its character's entry.
    #[test]
    fn data_entry_parse() {
        let mut entry1 = NumericValuesDataEntry::new('\u{3405}');
        entry1.other_numeric = Some(5);

        let mut entry2 = NumericValuesDataEntry::new('\u{4EDF}');
        entry2.accounting_numeric = Some(1000);

        let mut entry3 = NumericValuesDataEntry::new('\u{5146}');
        entry3.primary_numeric = Some(1000000000000);

        let entries = vec![entry1, entry2, entry3];

        // Fields are separated with explicit `\t` escapes: the entry regex
        // only accepts tab separators, and literal tab characters in source
        // are easily turned into spaces, which would make every line unmatched.
        assert_eq!(
            "U+3405\tkOtherNumeric\t5\n\
             U+4EDF\tkAccountingNumeric\t1000\n\
             U+5146\tkPrimaryNumeric\t1000000000000\n"
                .parse(),
            Ok(NumericValuesData {
                entries: entries.into_boxed_slice(),
            }),
        );
    }
}
92 changes: 92 additions & 0 deletions gen/src/source/ucd/unihan/readings.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::str::FromStr;

use crate::source::utils::read;

use super::{parse_entries_from_str, DataEntry};

lazy_static! {
/// Contents of `Unihan_Readings.txt`, parsed into `ReadingsData` the first
/// time this static is dereferenced.
///
/// Initialization unwraps the parse result, so it panics if parsing the text
/// returned by `read` fails.
///
/// See the [Readings] section of UAX #38.
///
/// [Readings]: http://www.unicode.org/reports/tr38/#N1019C
pub static ref UNIHAN_READINGS_DATA: ReadingsData = {
read("external/unicode/ucd/data/Unihan/Unihan_Readings.txt").parse().unwrap()
};
}

/// Reading and definition fields recorded for a single character.
///
/// Each optional field holds the unmodified value text of the like-named
/// Unihan field and is `None` until a data line for that field has been seen.
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct ReadingsDataEntry {
/// The character these readings belong to.
pub character: char,
/// Raw value of the `kCantonese` field.
pub cantonese: Option<String>,
/// Raw value of the `kDefinition` field.
pub definition: Option<String>,
/// Raw value of the `kHangul` field.
pub hangul: Option<String>,
/// Raw value of the `kHanyuPinlu` field.
pub hanyu_pinlu: Option<String>,
/// Raw value of the `kHanyuPinyin` field.
pub hanyu_pinyin: Option<String>,
/// Raw value of the `kJapaneseKun` field.
pub japanese_kun: Option<String>,
/// Raw value of the `kJapaneseOn` field.
pub japanese_on: Option<String>,
/// Raw value of the `kKorean` field.
pub korean: Option<String>,
/// Raw value of the `kMandarin` field.
pub mandarin: Option<String>,
/// Raw value of the `kTang` field.
pub tang: Option<String>,
/// Raw value of the `kVietnamese` field.
pub vietnamese: Option<String>,
/// Raw value of the `kXHC1983` field.
pub xhc_1983: Option<String>,
}

impl DataEntry for ReadingsDataEntry {
    /// Creates an entry for `character` with no readings recorded yet.
    fn new(character: char) -> ReadingsDataEntry {
        ReadingsDataEntry {
            character,
            cantonese: None,
            definition: None,
            hangul: None,
            hanyu_pinlu: None,
            hanyu_pinyin: None,
            japanese_kun: None,
            japanese_on: None,
            korean: None,
            mandarin: None,
            tang: None,
            vietnamese: None,
            xhc_1983: None,
        }
    }

    /// Stores an owned copy of `value` in the field selected by `key`.
    ///
    /// Keys that do not name one of the reading fields are ignored.
    fn update<'a>(&mut self, key: &'a str, value: &'a str) {
        // Pick the destination first so the copy is written once.
        let slot = match key {
            "kCantonese" => &mut self.cantonese,
            "kDefinition" => &mut self.definition,
            "kHangul" => &mut self.hangul,
            "kHanyuPinlu" => &mut self.hanyu_pinlu,
            "kHanyuPinyin" => &mut self.hanyu_pinyin,
            "kJapaneseKun" => &mut self.japanese_kun,
            "kJapaneseOn" => &mut self.japanese_on,
            "kKorean" => &mut self.korean,
            "kMandarin" => &mut self.mandarin,
            "kTang" => &mut self.tang,
            "kVietnamese" => &mut self.vietnamese,
            "kXHC1983" => &mut self.xhc_1983,
            _ => return,
        };
        *slot = Some(value.to_owned());
    }
}

/// All reading entries parsed from one Unihan readings text.
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct ReadingsData {
/// One entry per character, as returned by `parse_entries_from_str`.
pub entries: Box<[ReadingsDataEntry]>,
}

impl FromStr for ReadingsData {
    type Err = ();

    /// Builds the data set from the text of a Unihan readings file.
    ///
    /// This implementation always returns `Ok`.
    fn from_str(str: &str) -> Result<Self, Self::Err> {
        let entries = parse_entries_from_str(str).into_boxed_slice();
        Ok(ReadingsData { entries })
    }
}
132 changes: 132 additions & 0 deletions gen/src/source/ucd/unihan/variants.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::char;
use std::str::FromStr;

use regex::Regex;

use crate::source::utils::read;

use super::{parse_entries_from_str, DataEntry};

lazy_static! {
    /// Contents of `Unihan_Variants.txt`, parsed into `VariantsData` the first
    /// time this static is dereferenced.
    ///
    /// Initialization unwraps the parse result, so it panics if parsing the
    /// text returned by `read` fails.
    ///
    /// See the [Variants] section of UAX #38.
    ///
    /// [Variants]: http://www.unicode.org/reports/tr38/#N10211
    pub static ref UNIHAN_VARIANTS_DATA: VariantsData = {
        read("external/unicode/ucd/data/Unihan/Unihan_Variants.txt").parse().unwrap()
    };

    /// Regex matching one variant value: a `U+` code point, optionally followed
    /// by `<` and a comma-separated list of source keys such as `kMatthews:T`.
    ///
    /// Capture group 1 holds the hexadecimal digits of the code point. It
    /// accepts 4 to 6 digits, like `UNIHAN_DATA_ENTRY_REGEX`, so that code
    /// points beyond Plane 2 (for example `U+30EDE`) are captured whole
    /// instead of being cut off after four digits.
    pub static ref VALUE_REGEX: Regex = Regex::new(
        r"(?x) # extended regex syntax
        U\+([[:xdigit:]]{4,6}) # [1]codepoint
        <?( # [2]additional data
        k[[:alnum:]]+(:[TBZFJ]+)?(,k[[:alnum:]]+(:[TBZFJ]+)?)*
        )?
        ",
    ).unwrap();
}

/// Variant fields recorded for a single character.
///
/// A field is `None` until a data line for it has been seen. Only the variant
/// characters are kept; the additional source data attached to some values is
/// not stored (see the FIXME notes).
#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct VariantsDataEntry {
/// The character these variants belong to.
pub character: char,
/// Characters listed in the `kSemanticVariant` field.
pub semantic_variants: Option<Vec<char>>, // FIXME: handle additional data
/// First character listed in the `kSimplifiedVariant` field.
pub simplified_variant: Option<char>,
/// Characters listed in the `kSpecializedSemanticVariant` field.
pub specialized_semantic_variants: Option<Vec<char>>, // FIXME: handle additional data
/// First character listed in the `kTraditionalVariant` field.
pub traditional_variant: Option<char>,
/// Characters listed in the `kZVariant` field.
pub z_variants: Option<Vec<char>>, // FIXME: handle additional data
}

impl VariantsDataEntry {
    /// Converts the hexadecimal digits of a code point into its `char`.
    ///
    /// Panics if `hex` is not valid hexadecimal or does not denote a Unicode
    /// scalar value.
    fn char_from_hex(hex: &str) -> char {
        let code_point = u32::from_str_radix(hex, 16).unwrap();
        char::from_u32(code_point).unwrap()
    }

    /// Returns the character named by the first `U+XXXX` value found in `str`.
    ///
    /// # Panics
    ///
    /// Panics if `str` contains no match for `VALUE_REGEX`, or if the matched
    /// code point is not a Unicode scalar value.
    pub fn parse_value<'a>(str: &'a str) -> char {
        let first = VALUE_REGEX.captures(str).unwrap();
        Self::char_from_hex(&first[1])
    }

    /// Returns the characters named by every `U+XXXX` value found in `str`,
    /// in order of appearance; any additional data after a value is dropped.
    ///
    /// # Panics
    ///
    /// Panics if a matched code point is not a Unicode scalar value.
    pub fn parse_values_with_additional_data<'a>(str: &'a str) -> Vec<char> {
        VALUE_REGEX
            .captures_iter(str)
            .map(|capture| Self::char_from_hex(&capture[1]))
            .collect()
    }
}

impl DataEntry for VariantsDataEntry {
    /// Creates an entry for `character` with no variants recorded yet.
    fn new(character: char) -> VariantsDataEntry {
        VariantsDataEntry {
            character,
            semantic_variants: None,
            simplified_variant: None,
            specialized_semantic_variants: None,
            traditional_variant: None,
            z_variants: None,
        }
    }

    /// Parses `value` and stores the result in the field selected by `key`.
    ///
    /// Keys that do not name one of the variant fields are ignored, and their
    /// values are not parsed at all.
    fn update<'a>(&mut self, key: &'a str, value: &'a str) {
        let list_slot = match key {
            // Fields that keep a single variant character.
            "kSimplifiedVariant" => {
                self.simplified_variant = Some(Self::parse_value(value));
                return;
            }
            "kTraditionalVariant" => {
                self.traditional_variant = Some(Self::parse_value(value));
                return;
            }
            // Fields that keep a list of variant characters.
            "kSemanticVariant" => &mut self.semantic_variants,
            "kSpecializedSemanticVariant" => &mut self.specialized_semantic_variants,
            "kZVariant" => &mut self.z_variants,
            _ => return,
        };
        *list_slot = Some(Self::parse_values_with_additional_data(value));
    }
}

/// All variant entries parsed from one Unihan variants text.
#[derive(Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct VariantsData {
/// One entry per character, as returned by `parse_entries_from_str`.
pub entries: Box<[VariantsDataEntry]>,
}

impl FromStr for VariantsData {
    type Err = ();

    /// Builds the data set from the text of a Unihan variants file.
    ///
    /// This implementation always returns `Ok`.
    fn from_str(str: &str) -> Result<Self, Self::Err> {
        let entries = parse_entries_from_str(str).into_boxed_slice();
        Ok(VariantsData { entries })
    }
}

#[cfg(test)]
mod test {
    use super::VariantsDataEntry;

    /// A bare `U+XXXX` value parses to the character it names.
    #[test]
    fn value_parse() {
        assert_eq!(VariantsDataEntry::parse_value("U+54A8"), '\u{54A8}');
    }

    /// Each value in a list parses to its character; the source data after
    /// `<` is ignored.
    #[test]
    fn value_parse_with_additional_data() {
        let parsed = VariantsDataEntry::parse_values_with_additional_data(
            "U+54A8<kMatthews:T,kMeyerWempe U+8AEE<kMatthews,kMeyerWempe",
        );
        assert_eq!(parsed, vec!['\u{54A8}', '\u{8AEE}']);
    }
}
Loading