Skip to content

Commit

Permalink
Refactor config, add time tracking facility, optimize cleaner
Browse files Browse the repository at this point in the history
In particular, I was curious what caused a performance problem on
a large document I'm working on. It turned out to be the
remove_inner_whitespaces procedure in Cleaner. With a simple fix I
managed to make it finish in 1 second instead of 170s.

All the rest of the processing combined takes 10s, so we will be
able to progress much faster on the next issues.
  • Loading branch information
hmdne committed May 24, 2024
1 parent cd84879 commit bdd1dab
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 41 deletions.
4 changes: 4 additions & 0 deletions exe/reverse_adoc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ OptionParser.new do |opts|
Coradoc::ReverseAdoc.config.unknown_tags = v
end

# --track-time takes no argument; passing it switches on the config's
# track_time flag.
opts.on("--track-time", "Track time spent on each step") do
Coradoc::ReverseAdoc.config.track_time = true
end

opts.on("-v", "--version", "Version information") do |_v|
puts "reverse_adoc: v#{Coradoc::ReverseAdoc::VERSION}"
exit
Expand Down
26 changes: 19 additions & 7 deletions lib/coradoc/reverse_adoc/cleaner.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
module Coradoc::ReverseAdoc
class Cleaner
# Runs the cleanup passes over `string` in a fixed order: inner whitespace,
# newlines, leading newlines, tag borders, punctuation characters. The
# method's value is the result of the last track_time call, i.e. whatever
# the punctuation pass produced.
#
# NOTE(review): the five direct calls and the five track_time-wrapped calls
# below perform the same passes; this looks like a diff rendering in which
# the removed and the added lines are interleaved -- confirm against the
# real file.
def tidy(string)
result = remove_inner_whitespaces(String.new(string))
result = remove_newlines(result)
result = remove_leading_newlines(result)
result = clean_tag_borders(result)
clean_punctuation_characters(result)
# Each pass is wrapped in HtmlConverter.track_time with a human-readable
# label; `result` is reassigned to whatever track_time hands back.
result = HtmlConverter.track_time "Removing inner whitespace" do
# String.new(string) gives this pass its own copy of the input;
# remove_inner_whitespaces edits its argument in place with gsub!.
remove_inner_whitespaces(String.new(string))
end
result = HtmlConverter.track_time "Removing newlines" do
remove_newlines(result)
end
result = HtmlConverter.track_time "Removing leading newlines" do
remove_leading_newlines(result)
end
result = HtmlConverter.track_time "Cleaning tag borders" do
clean_tag_borders(result)
end
result = HtmlConverter.track_time "Cleaning punctuation characters" do
clean_punctuation_characters(result)
end
end

def remove_newlines(string)
Expand All @@ -22,11 +32,13 @@ def remove_inner_whitespaces(string)
string.gsub!(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
string.gsub!(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")
end
string.each_line.inject("") do |memo, line|
memo + preserve_border_whitespaces(line) do
result = +""
string.each_line do |line|
result << preserve_border_whitespaces(line) do
line.strip.gsub(/[ \t]{2,}/, " ")
end
end
result
end

# Find non-asterisk content that is enclosed by two or
Expand Down
36 changes: 20 additions & 16 deletions lib/coradoc/reverse_adoc/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@

module Coradoc::ReverseAdoc
class Config
attr_accessor :unknown_tags, :tag_border, :mathml2asciimath, :external_images,
:destination, :sourcedir, :image_counter, :image_counter_pattern, :input_format

def initialize
@unknown_tags = :pass_through
@input_format = :html
Expand All @@ -26,29 +23,36 @@ def initialize
@strong_delimiter = "*".freeze
@inline_options = {}
@tag_border = " ".freeze

# Debugging options
@track_time = false
end

# Temporarily replaces the inline option overrides with `options` while the
# given block runs, then puts the previous overrides back.
#
# The restore happens in an `ensure` clause so that it also runs when the
# block raises or leaves early through a non-local `return`/`break`;
# otherwise the temporary overrides would stay active for every later call.
#
# @param options [Hash] overrides consulted by the option readers
# @return the value of the block
def with(options = {})
  old_options = @inline_options
  @inline_options = options
  yield
ensure
  @inline_options = old_options
end

# NOTE(review): this section looks like a diff rendering in which the
# hand-written readers and the declare_option macro are interleaved line by
# line -- confirm against the real file before relying on the exact layout.

# Reader: the inline override for :unknown_tags, or @unknown_tags when the
# override is nil or false.
def unknown_tags
@inline_options[:unknown_tags] || @unknown_tags
end

# Same lookup order for :mathml2asciimath.
def mathml2asciimath
@inline_options[:mathml2asciimath] || @mathml2asciimath
end
# Class-level macro. For `option` it defines a reader with the same lookup
# order as the readers above, and (via attr_writer) a plain writer for the
# instance variable of the same name.
def self.declare_option(option)
define_method(option) do
# `||` falls through to the instance variable whenever the inline value is
# nil or false, so an inline `false` cannot mask a truthy stored value.
@inline_options[option] || instance_variable_get(:"@#{option}")
end

def external_images
@inline_options[:external_images] || @external_images
attr_writer option
end

def tag_border
@inline_options[:tag_border] || @tag_border
end
# Options whose reader and writer are generated by declare_option.
declare_option :unknown_tags
declare_option :tag_border
declare_option :mathml2asciimath
declare_option :external_images
declare_option :destination
declare_option :sourcedir
declare_option :image_counter
declare_option :image_counter_pattern
declare_option :input_format
declare_option :track_time
end
end
67 changes: 49 additions & 18 deletions lib/coradoc/reverse_adoc/html_converter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,28 +36,59 @@
require_relative "converters/video"
require_relative "converters/math"

module Coradoc::ReverseAdoc
class HtmlConverter
def self.to_coradoc(input, options = {})
root = case input
when String
Nokogiri::HTML(input).root
when Nokogiri::XML::Document
input.root
when Nokogiri::XML::Node
input
end
module Coradoc
module ReverseAdoc
class HtmlConverter
# Turns `input` into a Coradoc tree.
#
# `input` may be a String (parsed with Nokogiri::HTML), a
# Nokogiri::XML::Document (its root is used) or a Nokogiri::XML::Node (used
# as is). Any other input leaves `root` nil, because the `case` has no
# `else` branch, and the method then returns "".
#
# `options` are applied through ReverseAdoc.config.with for the duration of
# the call.
#
# NOTE(review): the duplicated `return` and the nested `config.with` below
# look like removed and added diff lines rendered together -- confirm
# against the real file.
def self.to_coradoc(input, options = {})
ReverseAdoc.config.with(options) do
root = track_time "Loading input HTML document" do
case input
when String
Nokogiri::HTML(input).root
when Nokogiri::XML::Document
input.root
when Nokogiri::XML::Node
input
end
end

# NOTE(review): this `return` leaves the method from inside the `with`
# block; whether the inline options are put back on this path depends on
# Config#with restoring them in an `ensure` -- verify.
return "" unless root
return "" unless root

Coradoc::ReverseAdoc.config.with(options) do
Coradoc::ReverseAdoc::Converters.lookup(root.name).to_coradoc(root)
# The converter is looked up by the root node's name and asked to convert
# that node.
track_time "Converting input document tree to Coradoc tree" do
Converters.lookup(root.name).to_coradoc(root)
end
end
end

# Converts `input` to AsciiDoc text: builds the Coradoc tree, generates
# AsciiDoc from it with Coradoc::Generator.gen_adoc, and passes the result
# through the ReverseAdoc cleaner. Each step is wrapped in track_time.
#
# @param input accepted as-is by to_coradoc
# @param options [Hash] config overrides active for the whole conversion
# @return the value of the cleaner's tidy call
def self.convert(input, options = {})
  ReverseAdoc.config.with(options) do
    # Forward `options` explicitly. to_coradoc opens its own config.with
    # scope using the options it receives (default: {}), and `with` replaces
    # the active overrides rather than merging them, so calling it without
    # `options` would drop the caller's overrides during tree conversion.
    coradoc = to_coradoc(input, options)
    result = track_time "Converting Coradoc tree into Asciidoc" do
      Coradoc::Generator.gen_adoc(coradoc)
    end
    track_time "Cleaning up the result" do
      ReverseAdoc.cleaner.tidy(result)
    end
  end
end
end

def self.convert(input, options = {})
result = Coradoc::Generator.gen_adoc(to_coradoc(input, options))
Coradoc::ReverseAdoc.cleaner.tidy(result)
@track_time_indentation = 0
# Runs the given block and returns its value. When the config's track_time
# flag is set, it also prints a "... is starting..." line before the block
# and a "... took N seconds" line after it, using `warn`. Nested calls are
# indented by one space per level through @track_time_indentation.
#
# @param task [String] label interpolated into both messages
# @return the value of the block
#
# NOTE: Codecov reported the timing branch of this method (everything
# except the plain pass-through) as not covered by tests.
def self.track_time(task)
  return yield unless ReverseAdoc.config.track_time

  warn(" " * @track_time_indentation + "* #{task} is starting...")
  @track_time_indentation += 1
  # Monotonic clock: durations are not affected by wall-clock adjustments.
  t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  begin
    ret = yield
  ensure
    # Undo the indentation even when the block raises or exits early, so
    # later reports are not shifted to the right.
    @track_time_indentation -= 1
  end
  time_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - t0
  warn(" " * @track_time_indentation + "* #{task} took #{time_elapsed.round(3)} seconds")
  ret
end
end
end
end

0 comments on commit bdd1dab

Please sign in to comment.