Skip to content

Commit

Permalink
Merge pull request #1780 from openzim/treatment-sequence
Browse files Browse the repository at this point in the history
Treat hidden DOM elements before treating media files.
  • Loading branch information
kelson42 authored Feb 18, 2023
2 parents 993cdd9 + 08d56d8 commit 89abe74
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 11 deletions.
24 changes: 13 additions & 11 deletions src/util/saveArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,19 @@ async function processArticleHtml(
let mediaDependencies: Array<{ url: string; path: string }> = []
let subtitles: Array<{ url: string; path: string }> = []
let doc = domino.createDocument(html)

const ruRet = await rewriteUrlsOfDoc(doc, articleId, redisStore, mw, dump)
doc = ruRet.doc
mediaDependencies = mediaDependencies.concat(
ruRet.mediaDependencies
.filter((a) => a)
.map((url) => {
const path = getMediaBase(url, false)
return { url, path }
}),
)
doc = applyOtherTreatments(doc, dump)

const tmRet = await treatMedias(doc, mw, dump, articleId, webp, redisStore)
doc = tmRet.doc

Expand All @@ -454,17 +467,6 @@ async function processArticleHtml(
return { url, path }
}),
)
const ruRet = await rewriteUrlsOfDoc(doc, articleId, redisStore, mw, dump)
doc = ruRet.doc
mediaDependencies = mediaDependencies.concat(
ruRet.mediaDependencies
.filter((a) => a)
.map((url) => {
const path = getMediaBase(url, false)
return { url, path }
}),
)
doc = applyOtherTreatments(doc, dump)

if (!dump.isMainPage(articleId) && dump.customProcessor?.preProcessArticle) {
doc = await dump.customProcessor.preProcessArticle(articleId, doc)
Expand Down
38 changes: 38 additions & 0 deletions test/e2e/treatMedia.e2e.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import * as mwoffliner from '../../src/mwoffliner.lib.js'
import { execa } from 'execa'
import rimraf from 'rimraf'
import { zimdumpAvailable, zimdump } from '../util.js'
import 'dotenv/config'
import { jest } from '@jest/globals'

// Raise Jest's default timeout to 20 s (value is in milliseconds): the test in this
// file runs a full mwoffliner scrape against https://en.wikipedia.org.
jest.setTimeout(20000)

/**
 * End-to-end check of the article treatment sequence.
 *
 * Scrapes the single article `Read_my_lips:_no_new_taxes` from
 * https://en.wikipedia.org into a uniquely named output directory, then asserts
 * that the resulting ZIM has no `I/George_Bush_1988_No_New_Taxes.ogg` entry
 * (per the test name, that media file is referenced from a hidden element and
 * therefore must not be downloaded).
 */
describe('treatment test', () => {
  // Millisecond timestamp makes the output directory unique per run.
  const now = new Date()
  const testId = `mwo-test-${+now}`

  const articleList = 'Read_my_lips:_no_new_taxes'
  const parameters = {
    mwUrl: 'https://en.wikipedia.org',
    // NOTE(review): this address looks obfuscated/placeholder-like — confirm it is the intended value.
    adminEmail: '[email protected]',
    articleList,
    outputDirectory: testId,
    redis: process.env.REDIS,
  }

  test('media file from hidden element should not be downloaded', async () => {
    try {
      // Flush Redis before scraping so the run does not see leftover keys.
      await execa('redis-cli flushall', { shell: true })

      const outFiles = await mwoffliner.execute(parameters)
      // Created 1 output
      expect(outFiles).toHaveLength(1)

      if (await zimdumpAvailable()) {
        // zimdump is expected to reject with 'Entry not found' because the media must be absent from the ZIM.
        await expect(zimdump(`list --url "I/George_Bush_1988_No_New_Taxes.ogg" ${outFiles[0].outFile}`)).rejects.toThrow('Entry not found')
      } else {
        console.log('Zimdump not installed, skipping test')
      }
    } finally {
      // Always remove the output directory, even when the scrape or an assertion
      // fails; otherwise failed runs leave mwo-test-* directories behind.
      rimraf.sync(`./${testId}`)
    }
  })
})

0 comments on commit 89abe74

Please sign in to comment.