Skip to content

Commit

Permalink
epub-parser: find siblings from parent's sibling node as well (#151)
Browse files Browse the repository at this point in the history
Currently, the EPUB parser only finds siblings located inside its own parent, which may result in an empty or incomplete chapter body if some of the sibling nodes are located within the siblings of its parent node instead of being direct siblings.

---------------------------
Signed-off-by: starry-shivam <[email protected]>
  • Loading branch information
starry-shivam authored Apr 16, 2024
1 parent 4b7d0f1 commit 51a4475
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 8 deletions.
1 change: 0 additions & 1 deletion .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 11 additions & 0 deletions app/src/main/java/com/starry/myne/epub/EpubUtils.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package com.starry.myne.epub
import org.w3c.dom.Document
import org.w3c.dom.Element
import org.w3c.dom.Node
import org.jsoup.nodes.Node as JsoupNode
import org.w3c.dom.NodeList
import org.xml.sax.InputSource
import java.io.File
Expand Down Expand Up @@ -58,3 +59,13 @@ fun Node.getAttributeValue(attribute: String): String? =

/**
 * Lazily yields the entries of this [NodeList] that are [Element]s, in list order.
 * Entries of any other node type are skipped.
 */
// Half-open range: valid indices are 0 until length, so index `length` is never requested.
val NodeList.elements get() = (0 until length).asSequence().mapNotNull { item(it) as? Element }
/** The direct children of this node that are [Element]s, obtained by filtering [Node.getChildNodes]. */
val Node.childElements get() = childNodes.elements

/**
 * Collects every node that follows this one under the same parent, in document order.
 *
 * @return the following siblings, nearest first; empty when this node has no next sibling.
 */
fun JsoupNode.nextSiblingNodes(): List<org.jsoup.nodes.Node> =
    // Start at the immediate next sibling and keep stepping until nextSibling() yields null.
    generateSequence(nextSibling()) { following -> following.nextSibling() }.toList()
48 changes: 41 additions & 7 deletions app/src/main/java/com/starry/myne/epub/EpubXMLFileParser.kt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import android.util.Log
import org.jsoup.Jsoup
import org.jsoup.nodes.TextNode
import java.io.File
import org.jsoup.nodes.Node
import kotlin.io.path.invariantSeparatorsPathString

/**
Expand Down Expand Up @@ -86,7 +87,7 @@ class EpubXMLFileParser(
val fragmentElement = document.selectFirst("#$fragmentId")
title = fragmentElement?.selectFirst("h1, h2, h3, h4, h5, h6")?.text() ?: ""
val bodyBuilder = StringBuilder()
var currentNode: org.jsoup.nodes.Node? = fragmentElement?.nextSibling()
var currentNode: Node? = fragmentElement?.nextSibling()
val nextFragmentIdElement = if (nextFragmentId != null) {
document.selectFirst("#$nextFragmentId")
} else {
Expand All @@ -96,7 +97,7 @@ class EpubXMLFileParser(

while (currentNode != null && currentNode != nextFragmentIdElement) {
bodyBuilder.append(getNodeStructuredText(currentNode) + "\n\n")
currentNode = currentNode.nextSibling()
currentNode = getNextSibling(currentNode)
}
bodyContent = bodyBuilder.toString()
}
Expand All @@ -115,6 +116,39 @@ class EpubXMLFileParser(
)
}

/**
 * Finds the node to visit after [currentNode] while collecting a chapter body.
 *
 * The direct next sibling is preferred. When there is none, the ancestors are
 * walked upwards; for the first ancestor that has a following sibling, the search
 * descends into that sibling (or the siblings after it) via [traverseDescendants].
 *
 * @param currentNode the node that was just processed; may be null.
 * @return the next node to process, or null when nothing follows in the document.
 */
private fun getNextSibling(currentNode: Node?): Node? {
    // Fast path: the node has a direct sibling.
    currentNode?.nextSibling()?.let { return it }

    var parentNode = currentNode?.parent()
    while (parentNode != null) {
        val parentSibling = parentNode.nextSibling()
        if (parentSibling != null) {
            // traverseDescendants yields null when the remaining siblings at this level
            // are all childless (e.g. whitespace text nodes). In that case keep climbing
            // rather than ending the walk, since higher ancestors may still have content.
            traverseDescendants(parentSibling)?.let { return it }
        }
        parentNode = parentNode.parent()
    }

    return null
}

/**
 * Starting at [node] and moving through its following siblings, returns the first
 * child of the first node that has any children.
 *
 * @param node the node at which the scan begins.
 * @return that first child, or null when neither [node] nor any following sibling has children.
 */
private fun traverseDescendants(node: Node): Node? {
    var candidate: Node? = node
    while (candidate != null) {
        val kids = candidate.childNodes()
        if (kids.isNotEmpty()) {
            return kids.first()
        }
        // Childless node: move on to the sibling immediately after it.
        candidate = candidate.nextSibling()
    }
    return null
}

fun parseAsImage(absolutePathImage: String): String {
// Use run catching so it can be run locally without crash
val bitmap = zipFile[absolutePathImage]?.data?.runCatching {
Expand All @@ -130,7 +164,7 @@ class EpubXMLFileParser(
}

// Rewrites the image node to xml for the next stage.
private fun declareImgEntry(node: org.jsoup.nodes.Node): String {
private fun declareImgEntry(node: Node): String {
val attrs = node.attributes().associate { it.key to it.value }
val relPathEncoded = attrs["src"] ?: attrs["xlink:href"] ?: ""

Expand All @@ -143,8 +177,8 @@ class EpubXMLFileParser(
return parseAsImage(absolutePathImage)
}

private fun getPTraverse(node: org.jsoup.nodes.Node): String {
fun innerTraverse(node: org.jsoup.nodes.Node): String =
private fun getPTraverse(node: Node): String {
fun innerTraverse(node: Node): String =
node.childNodes().joinToString("") { child ->
when {
child.nodeName() == "br" -> "\n"
Expand All @@ -159,7 +193,7 @@ class EpubXMLFileParser(
return if (paragraph.isNotEmpty()) "$paragraph\n\n" else ""
}

private fun getNodeTextTraverse(node: org.jsoup.nodes.Node): String {
private fun getNodeTextTraverse(node: Node): String {
val children = node.childNodes()
if (children.isEmpty())
return ""
Expand All @@ -181,7 +215,7 @@ class EpubXMLFileParser(
}
}

private fun getNodeStructuredText(node: org.jsoup.nodes.Node): String {
private fun getNodeStructuredText(node: Node): String {
val children = node.childNodes()
if (children.isEmpty())
return ""
Expand Down

0 comments on commit 51a4475

Please sign in to comment.