Skip to content

Commit

Permalink
Add support for generating regexes with native properties
Browse files Browse the repository at this point in the history
  • Loading branch information
radarek committed Oct 1, 2021
1 parent 505c78b commit 5532e0f
Show file tree
Hide file tree
Showing 14 changed files with 249 additions and 162 deletions.
333 changes: 175 additions & 158 deletions data/generate_constants.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,125 +4,15 @@

include Unicode::Emoji

pack = ->(ord){ Regexp.escape(Array(ord).pack("U*")) }
join = -> (*strings){ "(?:" + strings.join("|") + ")" }
pack_and_join = ->(ords){ join[*ords.map{ |ord| pack[ord] }] }

emoji_character = pack_and_join[EMOJI_CHAR]
emoji_modifier = pack_and_join[EMOJI_MODIFIERS]
emoji_modifier_base = pack_and_join[EMOJI_MODIFIER_BASES]
emoji_component = pack_and_join[EMOJI_COMPONENT]
emoji_presentation = pack_and_join[EMOJI_PRESENTATION]
picto = pack_and_join[EXTENDED_PICTOGRAPHIC]
picto_no_emoji = pack_and_join[EXTENDED_PICTOGRAPHIC_NO_EMOJI]

emoji_presentation_sequence = \
join[
pack_and_join[TEXT_PRESENTATION] + pack[EMOJI_VARIATION_SELECTOR],
emoji_presentation + "(?!" + pack[TEXT_VARIATION_SELECTOR] + ")" + pack[EMOJI_VARIATION_SELECTOR] + "?",
]

non_component_emoji_presentation_sequence = \
"(?!" + emoji_component + ")" + emoji_presentation_sequence

text_presentation_sequence = \
join[
pack_and_join[TEXT_PRESENTATION]+ "(?!" + join[emoji_modifier, pack[EMOJI_VARIATION_SELECTOR]] + ")" + pack[TEXT_VARIATION_SELECTOR] + "?",
emoji_presentation + pack[TEXT_VARIATION_SELECTOR]
]

emoji_modifier_sequence = \
emoji_modifier_base + emoji_modifier

emoji_keycap_sequence = \
pack_and_join[EMOJI_KEYCAPS] + pack[[EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]]

emoji_valid_flag_sequence = \
pack_and_join[VALID_REGION_FLAGS]

emoji_well_formed_flag_sequence = \
"(?:" +
pack_and_join[REGIONAL_INDICATORS] +
pack_and_join[REGIONAL_INDICATORS] +
")"

emoji_valid_core_sequence = \
join[
# emoji_character,
emoji_keycap_sequence,
emoji_modifier_sequence,
non_component_emoji_presentation_sequence,
emoji_valid_flag_sequence,
]

emoji_well_formed_core_sequence = \
join[
# emoji_character,
emoji_keycap_sequence,
emoji_modifier_sequence,
non_component_emoji_presentation_sequence,
emoji_well_formed_flag_sequence,
]

emoji_rgi_tag_sequence = \
pack_and_join[RECOMMENDED_SUBDIVISION_FLAGS]

emoji_valid_tag_sequence = \
"(?:" +
pack[EMOJI_TAG_BASE_FLAG] +
"(?:" + VALID_SUBDIVISIONS.map{ |sd| Regexp.escape(sd.tr("\u{20}-\u{7E}", "\u{E0020}-\u{E007E}"))}.join("|") + ")" +
pack[CANCEL_TAG] +
")"

emoji_well_formed_tag_sequence = \
"(?:" +
join[
non_component_emoji_presentation_sequence,
emoji_modifier_sequence,
] +
pack_and_join[TAGS] + "+" +
pack[CANCEL_TAG] +
")"

emoji_rgi_zwj_sequence = \
pack_and_join[RECOMMENDED_ZWJ_SEQUENCES]

emoji_valid_zwj_element = \
join[
emoji_modifier_sequence,
emoji_presentation_sequence,
emoji_character,
]

emoji_valid_zwj_sequence = \
"(?:" +
"(?:" + emoji_valid_zwj_element + pack[ZWJ] + ")+" + emoji_valid_zwj_element +
")"

emoji_rgi_sequence = \
join[
emoji_rgi_zwj_sequence,
emoji_rgi_tag_sequence,
emoji_valid_core_sequence,
]

emoji_valid_sequence = \
join[
emoji_valid_zwj_sequence,
emoji_valid_tag_sequence,
emoji_valid_core_sequence,
]

emoji_well_formed_sequence = \
join[
emoji_valid_zwj_sequence,
emoji_well_formed_tag_sequence,
emoji_well_formed_core_sequence,
]

def write_regex(const_name, regex)
# Writes every regex of +regexes+ to its own generated file inside +dirpath+
# by delegating each entry to write_regex.
#
# @param regexes [Hash] constant name => regex pairs
# @param dirpath [String] target directory passed through to write_regex
# @return the +regexes+ collection that was passed in
def write_regexes(regexes, dirpath)
  regexes.each { |name, pattern| write_regex(name, pattern, dirpath) }
end

def write_regex(const_name, regex, dirpath)
filename = const_name.to_s.downcase
filepath = File.expand_path("../lib/unicode/emoji/generated/#{filename}.rb", __dir__)
filepath = File.join(dirpath, "#{filename}.rb")

File.write(filepath, <<~CONTENT)
# This file was generated. Please, do not edit this file by hand.
Expand All @@ -135,54 +25,181 @@ module Emoji
puts "#{const_name} written to #{filepath}"
end

# Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi)
REGEX = Regexp.compile(emoji_rgi_sequence)
write_regex(:REGEX, REGEX)
# Encodes one codepoint (or an array of codepoints) as a UTF-8 string and
# escapes it so it can be embedded literally in a regex source.
#
# @param ord [Integer, Array<Integer>] codepoint(s) to encode
# @return [String] the regex-escaped string
def pack(ord)
  codepoints = Array(ord)
  Regexp.escape(codepoints.pack("U*"))
end

# Matches basic singleton emoji and all kind of valid sequences
REGEX_VALID = Regexp.compile(emoji_valid_sequence)
write_regex(:REGEX_VALID, REGEX_VALID)
# Wraps the given regex source fragments in a single non-capturing group,
# separated by the alternation operator.
#
# @param strings [Array<String>] regex source fragments
# @return [String] "(?:a|b|...)"
def join(*strings)
  "(?:#{strings.join('|')})"
end

# Matches basic singleton emoji and all kind of sequences
REGEX_WELL_FORMED = Regexp.compile(emoji_well_formed_sequence)
write_regex(:REGEX_WELL_FORMED, REGEX_WELL_FORMED)
# Builds one non-capturing alternation out of a list of codepoints (or
# codepoint sequences): each entry goes through pack, the results through join.
#
# @param ords [Array<Integer, Array<Integer>>] entries to encode
# @return [String] regex source of the form "(?:...|...)"
def pack_and_join(ords)
  alternatives = ords.map { |ord| pack(ord) }
  join(*alternatives)
end

# Matches only basic single, non-textual emoji
# Ignores "components" like modifiers or simple digits
REGEX_BASIC = Regexp.compile(
"(?!" + emoji_component + ")" + emoji_presentation_sequence
)
write_regex(:REGEX_BASIC, REGEX_BASIC)
def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:)
emoji_presentation_sequence = \
join(
pack_and_join(TEXT_PRESENTATION) + pack(EMOJI_VARIATION_SELECTOR),
emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?",
)

non_component_emoji_presentation_sequence = \
"(?!" + emoji_component + ")" + emoji_presentation_sequence

text_presentation_sequence = \
join(
pack_and_join(TEXT_PRESENTATION)+ "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
emoji_presentation + pack(TEXT_VARIATION_SELECTOR)
)

emoji_modifier_sequence = \
emoji_modifier_base + emoji_modifier

emoji_keycap_sequence = \
pack_and_join(EMOJI_KEYCAPS) + pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX])

emoji_valid_flag_sequence = \
pack_and_join(VALID_REGION_FLAGS)

emoji_well_formed_flag_sequence = \
"(?:" +
pack_and_join(REGIONAL_INDICATORS) +
pack_and_join(REGIONAL_INDICATORS) +
")"

emoji_valid_core_sequence = \
join(
# emoji_character,
emoji_keycap_sequence,
emoji_modifier_sequence,
non_component_emoji_presentation_sequence,
emoji_valid_flag_sequence,
)

# Matches only basic single, textual emoji
# Ignores "components" like modifiers or simple digits
REGEX_TEXT = Regexp.compile(
"(?!" + emoji_component + ")" + text_presentation_sequence
)
write_regex(:REGEX_TEXT, REGEX_TEXT)
emoji_well_formed_core_sequence = \
join(
# emoji_character,
emoji_keycap_sequence,
emoji_modifier_sequence,
non_component_emoji_presentation_sequence,
emoji_well_formed_flag_sequence,
)

emoji_rgi_tag_sequence = \
pack_and_join(RECOMMENDED_SUBDIVISION_FLAGS)

emoji_valid_tag_sequence = \
"(?:" +
pack(EMOJI_TAG_BASE_FLAG) +
"(?:" + VALID_SUBDIVISIONS.map{ |sd| Regexp.escape(sd.tr("\u{20}-\u{7E}", "\u{E0020}-\u{E007E}"))}.join("|") + ")" +
pack(CANCEL_TAG) +
")"

emoji_well_formed_tag_sequence = \
"(?:" +
join(
non_component_emoji_presentation_sequence,
emoji_modifier_sequence,
) +
pack_and_join(TAGS) + "+" +
pack(CANCEL_TAG) +
")"

emoji_rgi_zwj_sequence = \
pack_and_join(RECOMMENDED_ZWJ_SEQUENCES)

emoji_valid_zwj_element = \
join(
emoji_modifier_sequence,
emoji_presentation_sequence,
emoji_character,
)

# Matches any emoji-related codepoint - Use with caution (returns partial matches)
REGEX_ANY = Regexp.compile(
emoji_character
)
write_regex(:REGEX_ANY, REGEX_ANY)
emoji_valid_zwj_sequence = \
"(?:" +
"(?:" + emoji_valid_zwj_element + pack(ZWJ) + ")+" + emoji_valid_zwj_element +
")"

# Combined REGEXes which also match for TEXTUAL emoji
REGEX_INCLUDE_TEXT = Regexp.union(REGEX, REGEX_TEXT)
write_regex(:REGEX_INCLUDE_TEXT, REGEX_INCLUDE_TEXT)
emoji_rgi_sequence = \
join(
emoji_rgi_zwj_sequence,
emoji_rgi_tag_sequence,
emoji_valid_core_sequence,
)

REGEX_VALID_INCLUDE_TEXT = Regexp.union(REGEX_VALID, REGEX_TEXT)
write_regex(:REGEX_VALID_INCLUDE_TEXT, REGEX_VALID_INCLUDE_TEXT)
emoji_valid_sequence = \
join(
emoji_valid_zwj_sequence,
emoji_valid_tag_sequence,
emoji_valid_core_sequence,
)

REGEX_WELL_FORMED_INCLUDE_TEXT = Regexp.union(REGEX_WELL_FORMED, REGEX_TEXT)
write_regex(:REGEX_WELL_FORMED_INCLUDE_TEXT, REGEX_WELL_FORMED_INCLUDE_TEXT)
emoji_well_formed_sequence = \
join(
emoji_valid_zwj_sequence,
emoji_well_formed_tag_sequence,
emoji_well_formed_core_sequence,
)

REGEX_PICTO = Regexp.compile(
picto
)
write_regex(:REGEX_PICTO, REGEX_PICTO)
regexes = {}

# Matches basic singleton emoji and all kinds of sequences, but restricts zwj and tag sequences to known sequences (rgi)
regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence)

# Matches basic singleton emoji and all kinds of valid sequences
regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence)

# Matches basic singleton emoji and all kinds of sequences
regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence)

# Matches only basic single, non-textual emoji
# Ignores "components" like modifiers or simple digits
regexes[:REGEX_BASIC] = Regexp.compile(
"(?!" + emoji_component + ")" + emoji_presentation_sequence
)

# Matches only basic single, textual emoji
# Ignores "components" like modifiers or simple digits
regexes[:REGEX_TEXT] = Regexp.compile(
"(?!" + emoji_component + ")" + text_presentation_sequence
)

# Matches any emoji-related codepoint - Use with caution (returns partial matches)
regexes[:REGEX_ANY] = Regexp.compile(emoji_character)

# Combined REGEXes which also match for TEXTUAL emoji
regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT])

REGEX_PICTO_NO_EMOJI = Regexp.compile(
picto_no_emoji
regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_VALID], regexes[:REGEX_TEXT])

regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT])

regexes[:REGEX_PICTO] = Regexp.compile(picto)

regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji)

regexes
end

# Variant 1: every building block is an explicit alternation of the codepoints
# listed in the emoji data constants. The results are written to
# lib/unicode/emoji/generated.
generated_dirpath = File.expand_path("../lib/unicode/emoji/generated", __dir__)
codepoint_sources = {
  emoji_character: pack_and_join(EMOJI_CHAR),
  emoji_modifier: pack_and_join(EMOJI_MODIFIERS),
  emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES),
  emoji_component: pack_and_join(EMOJI_COMPONENT),
  emoji_presentation: pack_and_join(EMOJI_PRESENTATION),
  picto: pack_and_join(EXTENDED_PICTOGRAPHIC),
  picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
}
regexes = compile(**codepoint_sources)
write_regexes(regexes, generated_dirpath)

# Variant 2: every building block is a Unicode property class (\p{...})
# instead of an enumerated codepoint list.
native_sources = {
  emoji_character: "\\p{Emoji}",
  emoji_modifier: "\\p{Emoji Modifier}",
  emoji_modifier_base: "\\p{Emoji Modifier Base}",
  emoji_component: "\\p{Emoji Component}",
  emoji_presentation: "\\p{Emoji Presentation}",
  picto: "\\p{Extended Pictographic}",
  # Consume an Extended Pictographic character, then use a negative lookbehind
  # to reject it when that same character also has the Emoji property.
  picto_no_emoji: "\\p{Extended Pictographic}(?<!\\p{Emoji})"
}
native_regexes = compile(**native_sources)
write_regex(:REGEX_PICTO_NO_EMOJI, REGEX_PICTO_NO_EMOJI)
write_regexes(native_regexes, File.expand_path("../lib/unicode/emoji/generated_native", __dir__))
11 changes: 8 additions & 3 deletions lib/unicode/emoji.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,24 @@ module Unicode
module Emoji
autoload :INDEX, File.expand_path('emoji/index', __dir__)

%i[
# Register the lazily loaded data constants: the first reference to any name
# listed here loads emoji/lazy_constants.
#
# The list must spell each constant exactly as it is referenced elsewhere;
# a misspelled entry registers a name nobody uses and leaves the real
# constant without an autoload hook.
%w[
  EMOJI_CHAR EMOJI_PRESENTATION TEXT_PRESENTATION EMOJI_COMPONENT EMOJI_MODIFIER_BASES
  EMOJI_MODIFIERS EXTENDED_PICTOGRAPHIC EXTENDED_PICTOGRAPHIC_NO_EMOJI EMOJI_KEYCAPS VALID_REGION_FLAGS
  VALID_SUBDIVISIONS RECOMMENDED_SUBDIVISION_FLAGS RECOMMENDED_ZWJ_SEQUENCES LIST LIST_REMOVED_KEYS
].each do |const_name|
  autoload const_name, File.expand_path('emoji/lazy_constants', __dir__)
end

%i[
# Choose the directory the REGEX* constants are autoloaded from:
# generated_native when EMOJI_VERSION equals Unicode::Version.emoji_version,
# generated otherwise.
# NOTE(review): presumably the native variant is only safe when the running
# Ruby's emoji property data matches this library's data version - confirm.
generated_constants_subdir =
  if EMOJI_VERSION == Unicode::Version.emoji_version
    "emoji/generated_native/"
  else
    "emoji/generated/"
  end
generated_constants_dirpath = File.expand_path(generated_constants_subdir, __dir__)

%w[
REGEX REGEX_VALID REGEX_WELL_FORMED REGEX_BASIC REGEX_TEXT REGEX_ANY REGEX_INCLUDE_TEXT
REGEX_VALID_INCLUDE_TEXT REGEX_WELL_FORMED_INCLUDE_TEXT REGEX_PICTO REGEX_PICTO_NO_EMOJI
].each do |const_name|
autoload const_name, File.expand_path("emoji/generated/#{const_name.downcase}", __dir__)
autoload const_name, File.join(generated_constants_dirpath, const_name.downcase)
end

def self.properties(char)
Expand Down
Loading

0 comments on commit 5532e0f

Please sign in to comment.