Skip to content

Commit

Permalink
fix(JATS): Workaround for xml-js dropping spaces between elements
Browse files Browse the repository at this point in the history
  • Loading branch information
nokome committed Jun 21, 2023
1 parent 4234ead commit a34678c
Show file tree
Hide file tree
Showing 9 changed files with 102 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2845,7 +2845,7 @@
"value": "10.3120/0024-9637-56.3.209"
}
],
"title": "PattersonB.2009. Systematics, Evolution, and Biogeography of Compositae, Madroño. doi:10.3120/0024-9637-56.3.209"
"title": "Patterson B. 2009. Systematics, Evolution, and Biogeography of Compositae, Madroño. doi:10.3120/0024-9637-56.3.209"
},
{
"type": "Article",
Expand Down Expand Up @@ -6089,6 +6089,7 @@
"10A;"
]
},
" ",
{
"type": "Link",
"relation": "table",
Expand Down
4 changes: 4 additions & 0 deletions src/codecs/jats/__file_snapshots__/87253.json
Original file line number Diff line number Diff line change
Expand Up @@ -7882,6 +7882,7 @@
"(A)"
]
},
" ",
{
"type": "Emphasis",
"content": [
Expand Down Expand Up @@ -8823,6 +8824,7 @@
"(D)"
]
},
" ",
{
"type": "Emphasis",
"content": [
Expand Down Expand Up @@ -9720,6 +9722,7 @@
"(F)"
]
},
" ",
{
"type": "Emphasis",
"content": [
Expand Down Expand Up @@ -11146,6 +11149,7 @@
"(A)"
]
},
" ",
{
"type": "Emphasis",
"content": [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5629,7 +5629,7 @@
"type": "Periodical",
"name": "Annu. Rev. Immunol."
},
"title": "A. C.Hayday, P.Vantourout, Annu. Rev. Immunol., in press, doi:10.1146/annurev-immunol-102819-023144.",
"title": "A. C. Hayday, P. Vantourout, Annu. Rev. Immunol., in press, doi:10.1146/annurev-immunol-102819-023144.",
"meta": {
"label": "46."
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3538,7 +3538,7 @@
},
"volumeNumber": 29
},
"title": "SteckK, WalkerSJ, ItskovPM, BaltazarC, MoreiraJ-M, RibeiroC.2018. Internal amino acid state modulates yeast taste neurons to support protein homeostasis in Drosophila29."
"title": "Steck K, Walker SJ, Itskov PM, Baltazar C, Moreira J-M, Ribeiro C. 2018. Internal amino acid state modulates yeast taste neurons to support protein homeostasis in Drosophila 29."
},
{
"type": "Article",
Expand Down Expand Up @@ -6816,6 +6816,7 @@
}
]
},
" ",
{
"type": "Emphasis",
"content": [
Expand Down
28 changes: 28 additions & 0 deletions src/codecs/jats/__file_snapshots__/spaces-around-marks.jats.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.1 20151215//EN" "JATS-archivearticle1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<article-meta>
<title-group>
<article-title/>
</title-group>
<contrib-group/>
</article-meta>
</front>
<body>
<sec>
<p>Test that spaces around marks are retained. A regression test based on a published example</p>
<list list-type="order">
<list-item>
<p>
<bold>
<italic>Space after</italic>
</bold>
<italic>space before and after</italic>
</p>
</list-item>
</list>
</sec>
</body>
<back/>
</article>
26 changes: 26 additions & 0 deletions src/codecs/jats/__file_snapshots__/spaces-around-marks.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
type: Article
content:
- type: Paragraph
content:
- >-
Test that spaces around marks are retained. A regression test based on a
published example
- type: List
items:
- type: ListItem
content:
- type: Paragraph
content:
- type: Strong
content:
- type: Emphasis
content:
- Space after
- ' '
- type: Emphasis
content:
- space before and after
- ' '
order: Ascending
meta:
listType: roman-lower
11 changes: 11 additions & 0 deletions src/codecs/jats/__fixtures__/spaces-around-marks.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?xml version="1.0"?>
<article>
<body>
<p>Test that spaces around marks are retained. A regression test based on a published example</p>
<list list-type="roman-lower">
<list-item>
<p><bold><italic>Space after</italic></bold> <italic>space before and after</italic> </p>
</list-item>
</list>
</body>
</article>
31 changes: 27 additions & 4 deletions src/codecs/jats/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,16 @@ type Content = stencila.InlineContent | stencila.BlockContent
const DOCTYPE =
'article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD v1.1 20151215//EN" "JATS-archivearticle1.dtd"'

/**
 * Placeholder character used to work around a bug in xml-js, which
 * strips whitespace between elements:
 * https://github.com/nashwaan/xml-js/issues/177
 *
 * Before parsing, a single space between two tags (`> <`) is swapped for
 * this character; when text is decoded it is converted back to a space.
 *
 * U+2800 is an invisible character, but xml-js does not treat it as
 * whitespace:
 * https://unicode-explorer.com/c/2800
 */
const SPACER = '\u2800'

export class JatsCodec extends Codec implements Codec {
/**
* Media types that this codec will match.
Expand Down Expand Up @@ -110,8 +120,13 @@ export class JatsCodec extends Codec implements Codec {
public readonly decode = async (
  file: vfile.VFile
): Promise<stencila.Article | Content[]> => {
  // Dump the file's contents so they can be pre-processed as a string.
  const source = await vfile.dump(file)

  // xml-js drops a lone space sitting between two tags, so swap each
  // `> <` for a placeholder that survives parsing (see notes on SPACER).
  // Only the exact single-space form is matched; other inter-element
  // whitespace (newlines, indentation) is left untouched.
  const spaced = source.replaceAll('> <', `>${SPACER}<`)

  // Parse the patched XML and decode the resulting document tree.
  return decodeDocument(xml.load(spaced))
}

Expand Down Expand Up @@ -1387,6 +1402,9 @@ export function decodeReference(
title = textOrUndefined(elem)
}

// Remove any space placeholders
title = title?.replaceAll(SPACER, ' ')

let publisher: stencila.Organization | undefined
const publisherName = textOrUndefined(child(elem, 'publisher-name'))
if (publisherName !== undefined) {
Expand Down Expand Up @@ -1612,7 +1630,8 @@ function encodeBody(nodes: stencila.Node[], state: EncodeState): xml.Element {
* Decode a JATS element to an array of Stencila `Node`s
*/
function decodeElement(elem: xml.Element, state: DecodeState): stencila.Node[] {
if (elem.type === 'text') return [elem.text ?? '']
if (elem.type === 'text')
return [elem.text?.toString().replace(SPACER, ' ') ?? '']
switch (elem.name) {
case 'alternatives':
return decodeAlternatives(elem, state)
Expand Down Expand Up @@ -1694,7 +1713,11 @@ function encodeNode(node: stencila.Node, state: EncodeState): xml.Element[] {
case 'Strong':
return encodeMark(node as stencila.Strong, state, 'bold')
case 'NontextualAnnotation':
return encodeMark(node as stencila.NontextualAnnotation, state, 'underline')
return encodeMark(
node as stencila.NontextualAnnotation,
state,
'underline'
)
case 'Superscript':
return encodeMark(node as stencila.Superscript, state, 'sup')
case 'Subscript':
Expand Down
1 change: 1 addition & 0 deletions src/codecs/jats/jats.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ test.each([
'nested-list.xml',
'supplementary-material.xml',
'labelled-list-items.xml',
'spaces-around-marks.xml',
'elife-30274-v1',
'elife-43154-v2',
'elife-46472-v3',
Expand Down

0 comments on commit a34678c

Please sign in to comment.