From 3f5b7d783c2a9bf98846954f719ac61a59984ab0 Mon Sep 17 00:00:00 2001
From: hmdne <54514036+hmdne@users.noreply.github.com>
Date: Sun, 29 Sep 2024 10:26:06 +0200
Subject: [PATCH 1/4] element: Add #inspect methods for a couple of elements
---
lib/coradoc/element/attribute_list.rb | 14 +++++++++++++-
lib/coradoc/element/text_element.rb | 9 +++++++++
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/lib/coradoc/element/attribute_list.rb b/lib/coradoc/element/attribute_list.rb
index e3646cb..397bee5 100644
--- a/lib/coradoc/element/attribute_list.rb
+++ b/lib/coradoc/element/attribute_list.rb
@@ -12,6 +12,16 @@ def initialize(*positional, **named)
@rejected_named = []
end
+ def inspect
+ "AttributeList: " +
+ [
+ @positional.map(&:inspect).join(", "),
+ @named.map { |k, v| "#{k}: #{v.inspect}" }.join(", "),
+ (@rejected_positional.empty? or "rejected: #{@rejected_positional.inspect}"),
+ (@rejected_positional.empty? or "rejected: #{@rejected_named.inspect}"),
+ ].reject { |i| i == true || i.empty? }.join(", ")
+ end
+
def add_positional(*attr)
@positional += attr
end
@@ -65,7 +75,9 @@ def to_adoc(show_empty = true)
adoc = +""
if !@positional.empty?
- adoc << @positional.map { |p| [nil, ""].include?(p) ? '""' : p }.join(",")
+ adoc << @positional.map do |p|
+ [nil, ""].include?(p) ? '""' : p
+ end.join(",")
end
adoc << "," if @positional.any? && @named.any?
adoc << @named.map do |k, v|
diff --git a/lib/coradoc/element/text_element.rb b/lib/coradoc/element/text_element.rb
index a27b361..1ef003a 100644
--- a/lib/coradoc/element/text_element.rb
+++ b/lib/coradoc/element/text_element.rb
@@ -15,6 +15,15 @@ def initialize(content, options = {})
end
end
+ def inspect
+ str = "TextElement"
+ str += "(#{@id})" if @id
+ str += ": "
+ str += @content.inspect
+ str += " + #{@line_break.inspect}" unless line_break.empty?
+ str
+ end
+
def to_adoc
Coradoc::Generator.gen_adoc(@content) + @line_break
end
From d605beaede13971450cdfc43db69e9df34872160 Mon Sep 17 00:00:00 2001
From: hmdne <54514036+hmdne@users.noreply.github.com>
Date: Sun, 29 Sep 2024 10:29:19 +0200
Subject: [PATCH 2/4] input/html/converters/div: Also accept a
 <center> tag
---
lib/coradoc/input/html/converters/div.rb | 1 +
1 file changed, 1 insertion(+)
diff --git a/lib/coradoc/input/html/converters/div.rb b/lib/coradoc/input/html/converters/div.rb
index 9bac014..19a3db4 100644
--- a/lib/coradoc/input/html/converters/div.rb
+++ b/lib/coradoc/input/html/converters/div.rb
@@ -10,5 +10,6 @@ def to_coradoc(node, state = {})
register :div, Div.new
register :article, Div.new
+ register :center, Div.new
end
end
From 2409a4137f7e3d815dc12fe81a54da1bf2a5bc29 Mon Sep 17 00:00:00 2001
From: hmdne <54514036+hmdne@users.noreply.github.com>
Date: Sun, 29 Sep 2024 10:30:36 +0200
Subject: [PATCH 3/4] element: change semantics of safe_to_collapse?
Now, caller is responsible for anchors
---
lib/coradoc/element/base.rb | 2 ++
lib/coradoc/element/section.rb | 4 ++--
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/lib/coradoc/element/base.rb b/lib/coradoc/element/base.rb
index 58145e2..71a7ec3 100644
--- a/lib/coradoc/element/base.rb
+++ b/lib/coradoc/element/base.rb
@@ -14,6 +14,8 @@ def simplify_block_content(content)
when Coradoc::Element::Section
return content unless i.safe_to_collapse?
+ collected_content << i.anchor if i.anchor
+
simplified = simplify_block_content(i.contents)
if simplified && !simplified.empty?
diff --git a/lib/coradoc/element/section.rb b/lib/coradoc/element/section.rb
index 7813264..cee44ae 100644
--- a/lib/coradoc/element/section.rb
+++ b/lib/coradoc/element/section.rb
@@ -1,7 +1,7 @@
module Coradoc
module Element
class Section < Base
- attr_accessor :id, :title, :attrs, :contents, :sections
+ attr_accessor :id, :title, :attrs, :contents, :sections, :anchor
declare_children :id, :title, :contents, :sections
@@ -49,7 +49,7 @@ def to_adoc
# HTML element and if it happens inside some other block element, can be
# safely collapsed.
def safe_to_collapse?
- @title.nil? && @id.nil? && @sections.empty?
+ @title.nil? && @sections.empty?
end
private
From 72962ca0a90a7eb962a8318991cf3841c825c849 Mon Sep 17 00:00:00 2001
From: hmdne <54514036+hmdne@users.noreply.github.com>
Date: Sun, 29 Sep 2024 10:31:39 +0200
Subject: [PATCH 4/4] input/html/postprocessor: Extract titles from numbered
 lists
Rationale for that can be found in the comment:
https://github.com/metanorma/coradoc/issues/133#issuecomment-2381254385
---
lib/coradoc/input/html/postprocessor.rb | 92 +++++++++++++++++++++----
1 file changed, 77 insertions(+), 15 deletions(-)
diff --git a/lib/coradoc/input/html/postprocessor.rb b/lib/coradoc/input/html/postprocessor.rb
index 7c9e0c5..0976ee3 100644
--- a/lib/coradoc/input/html/postprocessor.rb
+++ b/lib/coradoc/input/html/postprocessor.rb
@@ -4,6 +4,8 @@ module Coradoc::Input::HTML
# is compatible with what we would get out of Coradoc, if
# it parsed it directly.
class Postprocessor
+ Element = Coradoc::Element
+
def self.process(coradoc)
new(coradoc).process
end
@@ -12,17 +14,74 @@ def initialize(coradoc)
@tree = coradoc
end
+ # Extracts titles from lists. This happens in HTML files
+ # generated from DOCX documents by LibreOffice.
+ #
+ # We are interested in a particular tree:
+ # Element::List::Ordered items:
+ # Element::List::Ordered items: (any depth)
+ # Element::ListItem content:
+ # Element::Title
+ # (any number of other titles of the same scheme)
+ #
+ # This tree is flattened into:
+ # Element::Title
+ # Element::Title (any number of titles)
+ def extract_titles_from_lists
+ @tree = Element::Base.visit(@tree) do |elem, dir|
+ next elem unless dir == :pre
+ next elem unless elem.is_a?(Element::List::Ordered)
+ next elem if elem.items.length != 1
+
+ anchors = []
+ anchors << elem.anchor if elem.anchor
+
+ # Extract ListItem from any depth of List::Ordered
+ processed = elem
+ while processed.is_a?(Element::List::Ordered)
+ if processed.items.length != 1
+ backtrack = true
+ break
+ end
+ anchors << processed.anchor if processed.anchor
+ processed = processed.items.first
+ end
+
+ # Something went wrong? Anything not matching on the way?
+ next elem if backtrack
+ next elem unless processed.is_a?(Element::ListItem)
+
+ anchors << processed.anchor if processed.anchor
+
+ # Now we must have a title (or titles).
+ titles = processed.content.flatten
+
+ # Don't bother if there's no title in there.
+ next elem unless titles.any? { |i| i.is_a? Element::Title }
+
+ # Ordered is another iteration for our cleanup.
+ next elem unless titles.all? do |i|
+ i.is_a?(Element::Title) || i.is_a?(Element::List::Ordered)
+ end
+
+ # We are done now.
+ titles + anchors
+ end
+ end
+
# Collapse DIVs that only have a title, or nest another DIV.
def collapse_meaningless_sections
- @tree = Coradoc::Element::Base.visit(@tree) do |elem, _dir|
- if elem.is_a?(Coradoc::Element::Section) && elem.safe_to_collapse?
+ @tree = Element::Base.visit(@tree) do |elem, _dir|
+ if elem.is_a?(Element::Section) && elem.safe_to_collapse?
children_classes = Array(elem.contents).map(&:class)
count = children_classes.length
- safe_classes = [Coradoc::Element::Section, Coradoc::Element::Title]
+ safe_classes = [Element::Section, Element::Title]
# Count > 0 because some documents use <div> as a <br>.
if count > 0 && children_classes.all? { |i| safe_classes.include?(i) }
- next elem.contents
+ contents = elem.contents.dup
+ contents.prepend(elem.anchor) if elem.anchor
+ next contents
end
end
elem
@@ -32,12 +91,14 @@ def collapse_meaningless_sections
# tree should now be more cleaned up, so we can progress with
# creating meaningful sections
def generate_meaningful_sections
- @tree = Coradoc::Element::Base.visit(@tree) do |elem, dir|
+ @tree = Element::Base.visit(@tree) do |elem, dir|
# We are searching for an array, that has a title. This
# will be a candidate for our section array.
if dir == :post &&
elem.is_a?(Array) &&
- !elem.grep(Coradoc::Element::Title).empty?
+ !elem.flatten.grep(Element::Title).empty?
+
+ elem = elem.flatten
new_array = []
content_array = new_array
@@ -47,12 +108,12 @@ def generate_meaningful_sections
# all descendant sections into those sections. Otherwise, we push
# an element as content of current section.
elem.each do |e|
- if e.is_a? Coradoc::Element::Title
+ if e.is_a? Element::Title
title = e
content_array = []
section_array = []
level = title.level_int
- section = Coradoc::Element::Section.new(
+ section = Element::Section.new(
title, contents: content_array, sections: section_array
)
# Some documents may not be consistent and eg. follow H4 after
@@ -82,11 +143,11 @@ def split_sections
previous_sections = {}
determine_section_id = ->(elem) do
- if elem.title.style == "appendix"
- level = "A"
- else
- level = 1
- end
+ level = if elem.title.style == "appendix"
+ "A"
+ else
+ 1
+ end
section = previous_sections[elem]
while section
@@ -102,8 +163,8 @@ def split_sections
style
end
- @tree = Coradoc::Element::Base.visit(@tree) do |elem, dir|
- title = elem.title if elem.is_a?(Coradoc::Element::Section)
+ @tree = Element::Base.visit(@tree) do |elem, dir|
+ title = elem.title if elem.is_a?(Element::Section)
if title && title.level_int <= max_level
if dir == :pre
@@ -137,6 +198,7 @@ def split_sections
end
def process
+ extract_titles_from_lists
collapse_meaningless_sections
generate_meaningful_sections
# Do it again to simplify the document further.