Skip to content

Commit

Permalink
Make all regexes constants lazy loaded from pregenerated files
Browse files Browse the repository at this point in the history
  • Loading branch information
radarek committed Sep 30, 2021
1 parent 91a6172 commit 505c78b
Show file tree
Hide file tree
Showing 15 changed files with 313 additions and 203 deletions.
188 changes: 188 additions & 0 deletions data/generate_constants.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
require 'fileutils'

require_relative '../lib/unicode/emoji/constants'
require_relative '../lib/unicode/emoji/index'
require_relative '../lib/unicode/emoji/lazy_constants'

# Mix Unicode::Emoji into the top-level object so the constants used below can be
# written without the namespace prefix.
# NOTE(review): presumably EMOJI_CHAR, TEXT_PRESENTATION, etc. come from the
# files required above - confirm.
include Unicode::Emoji

# Turns a single codepoint, or an array of codepoints forming one sequence,
# into a string and regex-escapes it.
pack = lambda do |ord|
  codepoints = Array(ord)
  Regexp.escape(codepoints.pack("U*"))
end

# Wraps the given pattern fragments into one non-capturing alternation group.
join = lambda do |*strings|
  "(?:#{strings.join('|')})"
end

# Escapes every entry of +ords+ and offers them as alternatives of one group.
pack_and_join = lambda do |ords|
  escaped = ords.map { |ord| pack.call(ord) }
  join.call(*escaped)
end

# Plain alternations: one non-capturing group per constant, listing each of the
# constant's entries as an escaped alternative.
emoji_character = pack_and_join.call(EMOJI_CHAR)
emoji_modifier = pack_and_join.call(EMOJI_MODIFIERS)
emoji_modifier_base = pack_and_join.call(EMOJI_MODIFIER_BASES)
emoji_component = pack_and_join.call(EMOJI_COMPONENT)
emoji_presentation = pack_and_join.call(EMOJI_PRESENTATION)
picto = pack_and_join.call(EXTENDED_PICTOGRAPHIC)
picto_no_emoji = pack_and_join.call(EXTENDED_PICTOGRAPHIC_NO_EMOJI)

# Escaped fragments that are reused several times in the patterns below.
emoji_vs = pack.call(EMOJI_VARIATION_SELECTOR)
text_vs = pack.call(TEXT_VARIATION_SELECTOR)
text_default = pack_and_join.call(TEXT_PRESENTATION)

# Either a TEXT_PRESENTATION entry followed by the emoji variation selector, or an
# EMOJI_PRESENTATION entry that is not followed by the text variation selector
# (the emoji variation selector is optional in that case).
emoji_presentation_sequence = join.call(
  "#{text_default}#{emoji_vs}",
  "#{emoji_presentation}(?!#{text_vs})#{emoji_vs}?"
)

# Same pattern, guarded by a negative lookahead so that a match cannot begin
# with an EMOJI_COMPONENT entry.
non_component_emoji_presentation_sequence =
  "(?!#{emoji_component})#{emoji_presentation_sequence}"

# Either a TEXT_PRESENTATION entry not followed by a modifier or the emoji
# variation selector (text variation selector optional), or an
# EMOJI_PRESENTATION entry explicitly followed by the text variation selector.
text_presentation_sequence = join.call(
  "#{text_default}(?!#{join.call(emoji_modifier, emoji_vs)})#{text_vs}?",
  "#{emoji_presentation}#{text_vs}"
)

# A modifier base immediately followed by a modifier.
emoji_modifier_sequence = "#{emoji_modifier_base}#{emoji_modifier}"

# A keycap base followed by the emoji variation selector and the keycap suffix.
keycap_tail = pack.call([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX])
emoji_keycap_sequence = "#{pack_and_join.call(EMOJI_KEYCAPS)}#{keycap_tail}"

# Only the flags listed in VALID_REGION_FLAGS.
emoji_valid_flag_sequence = pack_and_join.call(VALID_REGION_FLAGS)

# Any two consecutive entries of REGIONAL_INDICATORS.
regional_indicator = pack_and_join.call(REGIONAL_INDICATORS)
emoji_well_formed_flag_sequence = "(?:#{regional_indicator}#{regional_indicator})"

# Alternatives shared by both core sequences, in matching priority order.
# NOTE: a bare emoji_character alternative stays disabled here, as before.
core_alternatives = [
  emoji_keycap_sequence,
  emoji_modifier_sequence,
  non_component_emoji_presentation_sequence
]

# Core sequence whose flag part only accepts the valid region flags.
emoji_valid_core_sequence = join.call(*core_alternatives, emoji_valid_flag_sequence)

# Core sequence whose flag part accepts any pair of regional indicators.
emoji_well_formed_core_sequence = join.call(*core_alternatives, emoji_well_formed_flag_sequence)

# Only the entries of RECOMMENDED_SUBDIVISION_FLAGS.
emoji_rgi_tag_sequence = pack_and_join.call(RECOMMENDED_SUBDIVISION_FLAGS)

# Terminator shared by both tag sequence patterns below.
cancel_tag = pack.call(CANCEL_TAG)

# Each subdivision string has its characters in U+0020..U+007E shifted onto
# U+E0020..U+E007E before being escaped.
subdivision_tags = VALID_SUBDIVISIONS.map do |subdivision|
  Regexp.escape(subdivision.tr("\u{20}-\u{7E}", "\u{E0020}-\u{E007E}"))
end

# Tag base flag + one of the known subdivisions + cancel tag.
emoji_valid_tag_sequence =
  "(?:#{pack.call(EMOJI_TAG_BASE_FLAG)}(?:#{subdivision_tags.join('|')})#{cancel_tag})"

# A presentation or modifier sequence as base, then one or more TAGS entries,
# then the cancel tag.
tag_base = join.call(non_component_emoji_presentation_sequence, emoji_modifier_sequence)
emoji_well_formed_tag_sequence =
  "(?:#{tag_base}#{pack_and_join.call(TAGS)}+#{cancel_tag})"

# Only the entries of RECOMMENDED_ZWJ_SEQUENCES.
emoji_rgi_zwj_sequence = pack_and_join.call(RECOMMENDED_ZWJ_SEQUENCES)

# What may appear on either side of a ZWJ.
emoji_valid_zwj_element = join.call(
  emoji_modifier_sequence,
  emoji_presentation_sequence,
  emoji_character
)

# One or more "element + ZWJ" pairs, closed by a final element.
emoji_valid_zwj_sequence =
  "(?:(?:#{emoji_valid_zwj_element}#{pack.call(ZWJ)})+#{emoji_valid_zwj_element})"

# Top-level alternations. ZWJ sequences are tried first, then tag sequences,
# then the core sequences.

# ZWJ and tag sequences restricted to the recommended lists.
emoji_rgi_sequence = join.call(
  emoji_rgi_zwj_sequence, emoji_rgi_tag_sequence, emoji_valid_core_sequence
)

# Any valid ZWJ sequence, valid tag sequence or valid core sequence.
emoji_valid_sequence = join.call(
  emoji_valid_zwj_sequence, emoji_valid_tag_sequence, emoji_valid_core_sequence
)

# Valid ZWJ sequences combined with the looser well-formed tag and core sequences.
emoji_well_formed_sequence = join.call(
  emoji_valid_zwj_sequence, emoji_well_formed_tag_sequence, emoji_well_formed_core_sequence
)

# Writes a generated Ruby source file that defines one regex constant inside
# Unicode::Emoji, then reports the written path on stdout.
#
# The file is named after the lowercased constant name and is stored under
# ../lib/unicode/emoji/generated/ relative to this script. The target directory
# is created first when it is missing, so the generator also works on a tree
# that does not contain any generated file yet.
#
# @param const_name [Symbol, String] name of the constant to define
# @param regex [Regexp] pattern whose #inspect form becomes the constant's value
# @return [nil]
def write_regex(const_name, regex)
  filename = const_name.to_s.downcase
  filepath = File.expand_path("../lib/unicode/emoji/generated/#{filename}.rb", __dir__)

  # File.write does not create parent directories on its own.
  FileUtils.mkdir_p(File.dirname(filepath))

  # The squiggly heredoc strips the common indentation, so the generated file's
  # lines all start at column 0.
  File.write(filepath, <<~CONTENT)
    # This file was generated. Please, do not edit this file by hand.
    module Unicode
    module Emoji
    #{const_name} = #{regex.inspect}
    end
    end
  CONTENT
  puts "#{const_name} written to #{filepath}"
end

# Each constant is written to its generated file right after it is built.

# Basic singleton emoji plus all kinds of sequences; ZWJ and tag sequences are
# limited to the known (RGI) ones.
REGEX = Regexp.new(emoji_rgi_sequence)
write_regex(:REGEX, REGEX)

# Basic singleton emoji plus all kinds of valid sequences.
REGEX_VALID = Regexp.new(emoji_valid_sequence)
write_regex(:REGEX_VALID, REGEX_VALID)

# Basic singleton emoji plus all kinds of sequences.
REGEX_WELL_FORMED = Regexp.new(emoji_well_formed_sequence)
write_regex(:REGEX_WELL_FORMED, REGEX_WELL_FORMED)

# Only basic single, non-textual emoji.
# "Components" such as modifiers or simple digits are ignored.
REGEX_BASIC = Regexp.new(non_component_emoji_presentation_sequence)
write_regex(:REGEX_BASIC, REGEX_BASIC)

# Only basic single, textual emoji.
# "Components" such as modifiers or simple digits are ignored.
REGEX_TEXT = Regexp.new("(?!#{emoji_component})#{text_presentation_sequence}")
write_regex(:REGEX_TEXT, REGEX_TEXT)

# Any emoji-related codepoint. Use with caution: it returns partial matches.
REGEX_ANY = Regexp.new(emoji_character)
write_regex(:REGEX_ANY, REGEX_ANY)

# Variants of the three sequence regexes that also match textual emoji: each is
# the union of the emoji regex with REGEX_TEXT.
REGEX_INCLUDE_TEXT = Regexp.union([REGEX, REGEX_TEXT])
write_regex(:REGEX_INCLUDE_TEXT, REGEX_INCLUDE_TEXT)

REGEX_VALID_INCLUDE_TEXT = Regexp.union([REGEX_VALID, REGEX_TEXT])
write_regex(:REGEX_VALID_INCLUDE_TEXT, REGEX_VALID_INCLUDE_TEXT)

REGEX_WELL_FORMED_INCLUDE_TEXT = Regexp.union([REGEX_WELL_FORMED, REGEX_TEXT])
write_regex(:REGEX_WELL_FORMED_INCLUDE_TEXT, REGEX_WELL_FORMED_INCLUDE_TEXT)

# Any entry of EXTENDED_PICTOGRAPHIC.
REGEX_PICTO = Regexp.new(picto)
write_regex(:REGEX_PICTO, REGEX_PICTO)

# Any entry of EXTENDED_PICTOGRAPHIC_NO_EMOJI.
REGEX_PICTO_NO_EMOJI = Regexp.new(picto_no_emoji)
write_regex(:REGEX_PICTO_NO_EMOJI, REGEX_PICTO_NO_EMOJI)
Loading

0 comments on commit 505c78b

Please sign in to comment.