-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Make all regexes constants lazy loaded from pregenerated files
- Loading branch information
Showing
15 changed files
with
313 additions
and
203 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
require_relative '../lib/unicode/emoji/constants' | ||
require_relative '../lib/unicode/emoji/index' | ||
require_relative '../lib/unicode/emoji/lazy_constants' | ||
|
||
include Unicode::Emoji | ||
|
||
# Converts one codepoint (or an array of codepoints) into its UTF-8 string and
# escapes it so it can be embedded verbatim in a regex source.
pack = lambda do |ord|
  Regexp.escape(Array(ord).pack("U*"))
end

# Wraps the given regex source fragments in one non-capturing alternation group.
join = lambda do |*strings|
  "(?:#{strings.join("|")})"
end

# Packs every entry of +ords+ and joins the results into one alternation group.
pack_and_join = lambda do |ords|
  packed = ords.map { |ord| pack.call(ord) }
  join.call(*packed)
end
|
||
# One alternation group per codepoint list; each list is a constant made
# available by `include Unicode::Emoji`.
emoji_character     = pack_and_join.call(EMOJI_CHAR)
emoji_modifier      = pack_and_join.call(EMOJI_MODIFIERS)
emoji_modifier_base = pack_and_join.call(EMOJI_MODIFIER_BASES)
emoji_component     = pack_and_join.call(EMOJI_COMPONENT)
emoji_presentation  = pack_and_join.call(EMOJI_PRESENTATION)
picto               = pack_and_join.call(EXTENDED_PICTOGRAPHIC)
picto_no_emoji      = pack_and_join.call(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
|
||
# Either a TEXT_PRESENTATION character directly followed by
# EMOJI_VARIATION_SELECTOR, or an EMOJI_PRESENTATION character that is not
# followed by TEXT_VARIATION_SELECTOR (EMOJI_VARIATION_SELECTOR is then optional).
emoji_presentation_sequence = join.call(
  "#{pack_and_join.call(TEXT_PRESENTATION)}#{pack.call(EMOJI_VARIATION_SELECTOR)}",
  "#{emoji_presentation}(?!#{pack.call(TEXT_VARIATION_SELECTOR)})#{pack.call(EMOJI_VARIATION_SELECTOR)}?"
)

# Same as above, but the match must not start with an EMOJI_COMPONENT character.
non_component_emoji_presentation_sequence =
  "(?!#{emoji_component})#{emoji_presentation_sequence}"

# Either a TEXT_PRESENTATION character that is not followed by an emoji modifier
# or EMOJI_VARIATION_SELECTOR (TEXT_VARIATION_SELECTOR is then optional), or an
# EMOJI_PRESENTATION character directly followed by TEXT_VARIATION_SELECTOR.
text_presentation_sequence = join.call(
  [
    pack_and_join.call(TEXT_PRESENTATION),
    "(?!", join.call(emoji_modifier, pack.call(EMOJI_VARIATION_SELECTOR)), ")",
    pack.call(TEXT_VARIATION_SELECTOR), "?",
  ].join,
  "#{emoji_presentation}#{pack.call(TEXT_VARIATION_SELECTOR)}"
)
|
||
# A modifier base immediately followed by a modifier.
emoji_modifier_sequence = "#{emoji_modifier_base}#{emoji_modifier}"

# A keycap character followed by EMOJI_VARIATION_SELECTOR and EMOJI_KEYCAP_SUFFIX.
emoji_keycap_sequence =
  "#{pack_and_join.call(EMOJI_KEYCAPS)}#{pack.call([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX])}"

# Only the flags listed in VALID_REGION_FLAGS.
emoji_valid_flag_sequence = pack_and_join.call(VALID_REGION_FLAGS)

# Any two consecutive REGIONAL_INDICATORS characters.
emoji_well_formed_flag_sequence = "(?:#{pack_and_join.call(REGIONAL_INDICATORS) * 2})"
|
||
# Alternatives shared by both core sequences; only the flag alternative differs.
core_sequence_alternatives = [
  # emoji_character,
  emoji_keycap_sequence,
  emoji_modifier_sequence,
  non_component_emoji_presentation_sequence,
]

# Core sequence that accepts only the flags from emoji_valid_flag_sequence.
emoji_valid_core_sequence =
  join.call(*core_sequence_alternatives, emoji_valid_flag_sequence)

# Core sequence that accepts any flag matched by emoji_well_formed_flag_sequence.
emoji_well_formed_core_sequence =
  join.call(*core_sequence_alternatives, emoji_well_formed_flag_sequence)
|
||
# Only the sequences listed in RECOMMENDED_SUBDIVISION_FLAGS.
emoji_rgi_tag_sequence = pack_and_join.call(RECOMMENDED_SUBDIVISION_FLAGS)

# EMOJI_TAG_BASE_FLAG, then one of VALID_SUBDIVISIONS with its characters in
# U+0020..U+007E shifted to U+E0020..U+E007E, then CANCEL_TAG.
emoji_valid_tag_sequence = [
  "(?:",
  pack.call(EMOJI_TAG_BASE_FLAG),
  join.call(
    *VALID_SUBDIVISIONS.map { |sd| Regexp.escape(sd.tr("\u{20}-\u{7E}", "\u{E0020}-\u{E007E}")) }
  ),
  pack.call(CANCEL_TAG),
  ")",
].join

# A presentation or modifier sequence, then one or more TAGS characters,
# then CANCEL_TAG.
emoji_well_formed_tag_sequence = [
  "(?:",
  join.call(non_component_emoji_presentation_sequence, emoji_modifier_sequence),
  pack_and_join.call(TAGS), "+",
  pack.call(CANCEL_TAG),
  ")",
].join
|
||
# Only the sequences listed in RECOMMENDED_ZWJ_SEQUENCES.
emoji_rgi_zwj_sequence = pack_and_join.call(RECOMMENDED_ZWJ_SEQUENCES)

# What may appear on either side of a ZWJ: tried in this order.
emoji_valid_zwj_element = join.call(
  emoji_modifier_sequence,
  emoji_presentation_sequence,
  emoji_character
)

# One or more "element + ZWJ" pairs, closed by a final element.
emoji_valid_zwj_sequence =
  "(?:(?:#{emoji_valid_zwj_element}#{pack.call(ZWJ)})+#{emoji_valid_zwj_element})"
|
||
# In each group the ZWJ alternative comes first, then tag sequences, then core
# sequences; the order of alternatives is kept exactly as listed.

# ZWJ and tag sequences restricted to the RGI lists.
emoji_rgi_sequence = join.call(
  emoji_rgi_zwj_sequence, emoji_rgi_tag_sequence, emoji_valid_core_sequence
)

# Valid ZWJ, tag and core sequences.
emoji_valid_sequence = join.call(
  emoji_valid_zwj_sequence, emoji_valid_tag_sequence, emoji_valid_core_sequence
)

# Valid ZWJ sequences plus well-formed tag and core sequences.
emoji_well_formed_sequence = join.call(
  emoji_valid_zwj_sequence, emoji_well_formed_tag_sequence, emoji_well_formed_core_sequence
)
|
||
# Writes a generated Ruby source file that defines a single regex constant
# inside the Unicode::Emoji namespace.
#
# The file is placed at ../lib/unicode/emoji/generated/<downcased name>.rb,
# resolved relative to this script's directory. The target directory is created
# first when it is missing, so the generator also works when no generated files
# exist yet.
#
# @param const_name [Symbol, String] name of the constant to define
# @param regex [Regexp] value; its #inspect output is written as the constant's literal
# @return [nil] the result of the final `puts`
def write_regex(const_name, regex)
  require "fileutils" # local require: only this generator step needs it

  filename = const_name.to_s.downcase
  filepath = File.expand_path("../lib/unicode/emoji/generated/#{filename}.rb", __dir__)

  # File.write does not create missing directories and would raise Errno::ENOENT.
  FileUtils.mkdir_p(File.dirname(filepath))

  File.write(filepath, <<~CONTENT)
    # This file was generated. Please, do not edit this file by hand.
    module Unicode
      module Emoji
        #{const_name} = #{regex.inspect}
      end
    end
  CONTENT
  puts "#{const_name} written to #{filepath}"
end
|
||
# Each constant below is compiled from the fragments built above and then
# handed to write_regex, which writes it to its own generated file.

# Matches basic singleton emoji and all kinds of sequences, but restricts
# ZWJ and tag sequences to known (RGI) sequences
REGEX = Regexp.new(emoji_rgi_sequence)
write_regex(:REGEX, REGEX)

# Matches basic singleton emoji and all kinds of valid sequences
REGEX_VALID = Regexp.new(emoji_valid_sequence)
write_regex(:REGEX_VALID, REGEX_VALID)

# Matches basic singleton emoji and all kinds of sequences
REGEX_WELL_FORMED = Regexp.new(emoji_well_formed_sequence)
write_regex(:REGEX_WELL_FORMED, REGEX_WELL_FORMED)

# Matches only basic single, non-textual emoji
# Ignores "components" like modifiers or simple digits
REGEX_BASIC = Regexp.new("(?!#{emoji_component})#{emoji_presentation_sequence}")
write_regex(:REGEX_BASIC, REGEX_BASIC)

# Matches only basic single, textual emoji
# Ignores "components" like modifiers or simple digits
REGEX_TEXT = Regexp.new("(?!#{emoji_component})#{text_presentation_sequence}")
write_regex(:REGEX_TEXT, REGEX_TEXT)

# Matches any emoji-related codepoint - use with caution (returns partial matches)
REGEX_ANY = Regexp.new(emoji_character)
write_regex(:REGEX_ANY, REGEX_ANY)

# Combined regexes which also match TEXTUAL emoji
REGEX_INCLUDE_TEXT = Regexp.union(REGEX, REGEX_TEXT)
write_regex(:REGEX_INCLUDE_TEXT, REGEX_INCLUDE_TEXT)

REGEX_VALID_INCLUDE_TEXT = Regexp.union(REGEX_VALID, REGEX_TEXT)
write_regex(:REGEX_VALID_INCLUDE_TEXT, REGEX_VALID_INCLUDE_TEXT)

REGEX_WELL_FORMED_INCLUDE_TEXT = Regexp.union(REGEX_WELL_FORMED, REGEX_TEXT)
write_regex(:REGEX_WELL_FORMED_INCLUDE_TEXT, REGEX_WELL_FORMED_INCLUDE_TEXT)

# Matches any codepoint from the EXTENDED_PICTOGRAPHIC list
REGEX_PICTO = Regexp.new(picto)
write_regex(:REGEX_PICTO, REGEX_PICTO)

# Matches any codepoint from the EXTENDED_PICTOGRAPHIC_NO_EMOJI list
REGEX_PICTO_NO_EMOJI = Regexp.new(picto_no_emoji)
write_regex(:REGEX_PICTO_NO_EMOJI, REGEX_PICTO_NO_EMOJI)
Oops, something went wrong.