Skip to content

Commit

Permalink
Merge pull request #1919 from openzim/namespace-blacklist
Browse files Browse the repository at this point in the history
Don't mirror 'Story' namespace
  • Loading branch information
kelson42 authored Oct 4, 2023
2 parents 190cd26 + 4b29977 commit fc79938
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 12 deletions.
8 changes: 6 additions & 2 deletions src/MediaWiki.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import ApiURLDirector from './util/builders/url/api.director.js'
import DesktopURLDirector from './util/builders/url/desktop.director.js'
import VisualEditorURLDirector from './util/builders/url/visual-editor.director.js'
import { checkApiAvailability } from './util/mw-api.js'
import { BLACKLISTED_NS } from './util/const.js'

export interface QueryOpts {
action: string
Expand All @@ -34,7 +35,6 @@ class MediaWiki {
}

public metaData: MWMetaData
public _base: string
public baseUrl: URL
public getCategories: boolean
public namespaces: MWNamespaces = {}
Expand Down Expand Up @@ -227,18 +227,22 @@ class MediaWiki {
const num = entry.id
const allowedSubpages = 'subpages' in entry
const isContent = type === 'namespaces' ? !!(entry.content || util.contains(addNamespaces, num)) : !!(entry.content !== undefined || util.contains(addNamespaces, num))
const isBlacklisted = BLACKLISTED_NS.includes(name)
const canonical = entry.canonical ? entry.canonical : ''
const details = { num, allowedSubpages, isContent }

/* Namespaces in local language */
this.namespaces[util.lcFirst(name)] = details
this.namespaces[util.ucFirst(name)] = details

/* Namespaces in English (if available) */
if (canonical) {
this.namespaces[util.lcFirst(canonical)] = details
this.namespaces[util.ucFirst(canonical)] = details
}

/* Is content to mirror */
if (isContent) {
if (isContent && !isBlacklisted) {
this.namespacesToMirror.push(name)
}
})
Expand Down
1 change: 1 addition & 0 deletions src/util/const.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ export const LOAD_PHP = /script.src = ".*load\.php.*";/
export const RULE_TO_REDIRECT = /window\.top !== window\.self/
export const WEBP_HANDLER_URL = 'https://gist.githubusercontent.com/rgaudin/60bb9cc6f187add506584258028b8ee1/raw/9d575b8e25d67eed2a9c9a91d3e053a0062d2fc7/web-handler.js'
export const MAX_FILE_DOWNLOAD_RETRIES = 5
// Namespace names that must never be mirrored.
// The 'Story' Wikipedia namespace is content, but it is not ingestible by Parsoid.
// See https://github.com/openzim/mwoffliner/issues/1853
export const BLACKLISTED_NS = ['Story']
48 changes: 38 additions & 10 deletions test/unit/mwApi.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,26 +9,32 @@ import { jest } from '@jest/globals'

jest.setTimeout(10000)

describe('mwApi', () => {
beforeAll(startRedis)
afterAll(stopRedis)
beforeAll(async () => {
MediaWiki.reset()
await startRedis()
})
afterAll(stopRedis)

/**
 * Runs the MediaWiki initialisation sequence used by the suites in this file.
 *
 * Each call is awaited before the next one starts, in this order: metadata,
 * coordinates check, Wikimedia desktop REST API check, VisualEditor API check,
 * and finally namespace retrieval.
 *
 * @param downloader - Downloader handed to the MediaWiki calls that take one.
 */
const initMW = async (downloader: Downloader): Promise<void> => {
  // Metadata first, then the capability checks.
  await MediaWiki.getMwMetaData(downloader)
  await MediaWiki.hasCoordinates(downloader)
  await MediaWiki.hasWikimediaDesktopRestApi()
  await MediaWiki.hasVisualEditorApi()

  // Namespaces last; the first argument is an empty list.
  await MediaWiki.getNamespaces([], downloader)
}

describe('mwApi', () => {
let downloader: Downloader

beforeEach(async () => {
await RedisStore.articleDetailXId.flush()

MediaWiki.base = 'https://en.wikipedia.org'
MediaWiki.getCategories = true

downloader = new Downloader({ uaString: `${config.userAgent} ([email protected])`, speed: 1, reqTimeout: 1000 * 60, webp: false, optimisationCacheUrl: '' })

await MediaWiki.getMwMetaData(downloader)
await MediaWiki.hasCoordinates(downloader)
await MediaWiki.hasWikimediaDesktopRestApi()
await MediaWiki.hasVisualEditorApi()

await MediaWiki.getNamespaces([], downloader)
await initMW(downloader)
})

test('MWApi Article Ids', async () => {
Expand Down Expand Up @@ -116,3 +122,25 @@ describe('mwApi', () => {
expect(interWikiTitle).toBeNull()
})
})

// Checks that a blacklisted namespace ('Story') is not in MediaWiki.namespacesToMirror
// after initialisation against id.wikipedia.org.
describe('Test blacklisted NSs', () => {
  let downloader: Downloader

  beforeEach(async () => {
    // Start every test from an empty article-detail store.
    await RedisStore.articleDetailXId.flush()

    // Point the shared MediaWiki object at the Indonesian Wikipedia.
    MediaWiki.base = 'https://id.wikipedia.org'
    MediaWiki.getCategories = true

    downloader = new Downloader({
      uaString: `${config.userAgent} ([email protected])`,
      speed: 1,
      reqTimeout: 1000 * 60,
      webp: false,
      optimisationCacheUrl: '',
    })

    await initMW(downloader)
  })

  test('Prevent blacklisted namespaces to mirroring', async () => {
    // One title in the 'Story' namespace plus one ordinary article title.
    const articleIds = ['Story:Satelit_Oberon', 'London']
    await getArticleIds(downloader, 'Main_Page', articleIds)

    expect(MediaWiki.namespacesToMirror).not.toContain('Story')
  })
})

0 comments on commit fc79938

Please sign in to comment.