From 3f5b7d783c2a9bf98846954f719ac61a59984ab0 Mon Sep 17 00:00:00 2001 From: hmdne <54514036+hmdne@users.noreply.github.com> Date: Sun, 29 Sep 2024 10:26:06 +0200 Subject: [PATCH 1/4] element: Add #inspect methods for a couple of elements --- lib/coradoc/element/attribute_list.rb | 14 +++++++++++++- lib/coradoc/element/text_element.rb | 9 +++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/lib/coradoc/element/attribute_list.rb b/lib/coradoc/element/attribute_list.rb index e3646cb..397bee5 100644 --- a/lib/coradoc/element/attribute_list.rb +++ b/lib/coradoc/element/attribute_list.rb @@ -12,6 +12,16 @@ def initialize(*positional, **named) @rejected_named = [] end + def inspect + "AttributeList: " + + [ + @positional.map(&:inspect).join(", "), + @named.map { |k, v| "#{k}: #{v.inspect}" }.join(", "), + (@rejected_positional.empty? or "rejected: #{@rejected_positional.inspect}"), + (@rejected_named.empty? or "rejected: #{@rejected_named.inspect}"), + ].reject { |i| i == true || i.empty? }.join(", ") + end + def add_positional(*attr) @positional += attr end @@ -65,7 +75,9 @@ def to_adoc(show_empty = true) adoc = +"" if !@positional.empty? - adoc << @positional.map { |p| [nil, ""].include?(p) ? '""' : p }.join(",") + adoc << @positional.map do |p| + [nil, ""].include?(p) ? '""' : p + end.join(",") end adoc << "," if @positional.any? && @named.any? adoc << @named.map do |k, v| diff --git a/lib/coradoc/element/text_element.rb b/lib/coradoc/element/text_element.rb index a27b361..1ef003a 100644 --- a/lib/coradoc/element/text_element.rb +++ b/lib/coradoc/element/text_element.rb @@ -15,6 +15,15 @@ def initialize(content, options = {}) end end + def inspect + str = "TextElement" + str += "(#{@id})" if @id + str += ": " + str += @content.inspect + str += " + #{@line_break.inspect}" unless line_break.empty? 
+ str + end + def to_adoc Coradoc::Generator.gen_adoc(@content) + @line_break end From d605beaede13971450cdfc43db69e9df34872160 Mon Sep 17 00:00:00 2001 From: hmdne <54514036+hmdne@users.noreply.github.com> Date: Sun, 29 Sep 2024 10:29:19 +0200 Subject: [PATCH 2/4] input/html/converters/div: Also accept a <center> tag --- lib/coradoc/input/html/converters/div.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/coradoc/input/html/converters/div.rb b/lib/coradoc/input/html/converters/div.rb index 9bac014..19a3db4 100644 --- a/lib/coradoc/input/html/converters/div.rb +++ b/lib/coradoc/input/html/converters/div.rb @@ -10,5 +10,6 @@ def to_coradoc(node, state = {}) register :div, Div.new register :article, Div.new + register :center, Div.new end end From 2409a4137f7e3d815dc12fe81a54da1bf2a5bc29 Mon Sep 17 00:00:00 2001 From: hmdne <54514036+hmdne@users.noreply.github.com> Date: Sun, 29 Sep 2024 10:30:36 +0200 Subject: [PATCH 3/4] element: change semantics of safe_to_collapse? Now, caller is responsible for anchors --- lib/coradoc/element/base.rb | 2 ++ lib/coradoc/element/section.rb | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/coradoc/element/base.rb b/lib/coradoc/element/base.rb index 58145e2..71a7ec3 100644 --- a/lib/coradoc/element/base.rb +++ b/lib/coradoc/element/base.rb @@ -14,6 +14,8 @@ def simplify_block_content(content) when Coradoc::Element::Section return content unless i.safe_to_collapse? + collected_content << i.anchor if i.anchor + simplified = simplify_block_content(i.contents) if simplified && !simplified.empty? diff --git a/lib/coradoc/element/section.rb b/lib/coradoc/element/section.rb index 7813264..cee44ae 100644 --- a/lib/coradoc/element/section.rb +++ b/lib/coradoc/element/section.rb @@ -1,7 +1,7 @@ module Coradoc module Element class Section < Base - attr_accessor :id, :title, :attrs, :contents, :sections + attr_accessor :id, :title, :attrs, :contents, :sections, :anchor declare_children :id, :title, :contents, :sections @@ -49,7 +49,7 @@ def to_adoc # HTML element and if it happens inside some other block element, can be # safely collapsed. def safe_to_collapse? - @title.nil? && @id.nil? && @sections.empty? + @title.nil? && @sections.empty? 
end private From 72962ca0a90a7eb962a8318991cf3841c825c849 Mon Sep 17 00:00:00 2001 From: hmdne <54514036+hmdne@users.noreply.github.com> Date: Sun, 29 Sep 2024 10:31:39 +0200 Subject: [PATCH 4/4] input/html/postprocessor: Extract titles from numbered lists Rationale for that can be found in the comment: https://github.com/metanorma/coradoc/issues/133#issuecomment-2381254385 --- lib/coradoc/input/html/postprocessor.rb | 92 +++++++++++++++++++++---- 1 file changed, 77 insertions(+), 15 deletions(-) diff --git a/lib/coradoc/input/html/postprocessor.rb b/lib/coradoc/input/html/postprocessor.rb index 7c9e0c5..0976ee3 100644 --- a/lib/coradoc/input/html/postprocessor.rb +++ b/lib/coradoc/input/html/postprocessor.rb @@ -4,6 +4,8 @@ module Coradoc::Input::HTML # is compatible with what we would get out of Coradoc, if # it parsed it directly. class Postprocessor + Element = Coradoc::Element + def self.process(coradoc) new(coradoc).process end @@ -12,17 +14,74 @@ def initialize(coradoc) @tree = coradoc end + # Extracts titles from lists. This happens in HTML files + # generated from DOCX documents by LibreOffice. 
+ # + # We are interested in a particular tree: + # Element::List::Ordered items: + # Element::List::Ordered items: (any depth) + # Element::ListItem content: + # Element::Title + # (any number of other titles of the same scheme) + # + # This tree is flattened into: + # Element::Title + # Element::Title (any number of titles) + def extract_titles_from_lists + @tree = Element::Base.visit(@tree) do |elem, dir| + next elem unless dir == :pre + next elem unless elem.is_a?(Element::List::Ordered) + next elem if elem.items.length != 1 + + anchors = [] + # elem's own anchor is collected by the first iteration of the loop below. + + # Extract ListItem from any depth of List::Ordered + processed = elem + while processed.is_a?(Element::List::Ordered) + if processed.items.length != 1 + backtrack = true + break + end + anchors << processed.anchor if processed.anchor + processed = processed.items.first + end + + # Something went wrong? Anything not matching on the way? + next elem if backtrack + next elem unless processed.is_a?(Element::ListItem) + + anchors << processed.anchor if processed.anchor + + # Now we must have a title (or titles). + titles = processed.content.flatten + + # Don't bother if there's no title in there. + next elem unless titles.any? { |i| i.is_a? Element::Title } + + # Ordered is another iteration for our cleanup. + next elem unless titles.all? do |i| + i.is_a?(Element::Title) || i.is_a?(Element::List::Ordered) + end + + # We are done now. + titles + anchors + end + end + # Collapse DIVs that only have a title, or nest another DIV. def collapse_meaningless_sections - @tree = Coradoc::Element::Base.visit(@tree) do |elem, _dir| - if elem.is_a?(Coradoc::Element::Section) && elem.safe_to_collapse? + @tree = Element::Base.visit(@tree) do |elem, _dir| + if elem.is_a?(Element::Section) && elem.safe_to_collapse? 
children_classes = Array(elem.contents).map(&:class) count = children_classes.length - safe_classes = [Coradoc::Element::Section, Coradoc::Element::Title] + safe_classes = [Element::Section, Element::Title] # Count > 0 because some documents use <div> as a <br>. if count > 0 && children_classes.all? { |i| safe_classes.include?(i) } - next elem.contents + contents = elem.contents.dup + contents.prepend(elem.anchor) if elem.anchor + next contents end end elem @@ -32,12 +91,14 @@ def collapse_meaningless_sections # tree should now be more cleaned up, so we can progress with # creating meaningful sections def generate_meaningful_sections - @tree = Coradoc::Element::Base.visit(@tree) do |elem, dir| + @tree = Element::Base.visit(@tree) do |elem, dir| # We are searching for an array, that has a title. This # will be a candidate for our section array. if dir == :post && elem.is_a?(Array) && - !elem.grep(Coradoc::Element::Title).empty? + !elem.flatten.grep(Element::Title).empty? + + elem = elem.flatten new_array = [] content_array = new_array @@ -47,12 +108,12 @@ def generate_meaningful_sections # all descendant sections into those sections. Otherwise, we push # an element as content of current section. elem.each do |e| - if e.is_a? Coradoc::Element::Title + if e.is_a? Element::Title title = e content_array = [] section_array = [] level = title.level_int - section = Coradoc::Element::Section.new( + section = Element::Section.new( title, contents: content_array, sections: section_array ) # Some documents may not be consistent and eg. 
follow H4 after @@ -82,11 +143,11 @@ def split_sections previous_sections = {} determine_section_id = ->(elem) do - if elem.title.style == "appendix" - level = "A" - else - level = 1 - end + level = if elem.title.style == "appendix" + "A" + else + 1 + end section = previous_sections[elem] while section @@ -102,8 +163,8 @@ def split_sections style end - @tree = Coradoc::Element::Base.visit(@tree) do |elem, dir| - title = elem.title if elem.is_a?(Coradoc::Element::Section) + @tree = Element::Base.visit(@tree) do |elem, dir| + title = elem.title if elem.is_a?(Element::Section) if title && title.level_int <= max_level if dir == :pre @@ -137,6 +198,7 @@ def split_sections end def process + extract_titles_from_lists collapse_meaningless_sections generate_meaningful_sections # Do it again to simplify the document further.