diff --git a/.changeset/five-countries-wait.md b/.changeset/five-countries-wait.md new file mode 100644 index 000000000..78fcd0a47 --- /dev/null +++ b/.changeset/five-countries-wait.md @@ -0,0 +1,5 @@ +--- +'citation-js-utils': patch +--- + +Get year from citation issued literal if it is not parsed diff --git a/.changeset/good-cameras-hammer.md b/.changeset/good-cameras-hammer.md new file mode 100644 index 000000000..0c329cc08 --- /dev/null +++ b/.changeset/good-cameras-hammer.md @@ -0,0 +1,6 @@ +--- +'citation-js-utils': patch +'myst-cli': patch +--- + +Pull url from citation data and add to citation node diff --git a/.changeset/mean-walls-hear.md b/.changeset/mean-walls-hear.md new file mode 100644 index 000000000..12472fc59 --- /dev/null +++ b/.changeset/mean-walls-hear.md @@ -0,0 +1,5 @@ +--- +'myst-cli': patch +--- + +Add better warning message for valid dois without bibtex diff --git a/.changeset/ninety-pots-smash.md b/.changeset/ninety-pots-smash.md new file mode 100644 index 000000000..9a944a206 --- /dev/null +++ b/.changeset/ninety-pots-smash.md @@ -0,0 +1,7 @@ +--- +'myst-spec-ext': patch +'myst-common': patch +'myst-cli': patch +--- + +Add enumerator to citations and cite nodes diff --git a/.changeset/tough-cycles-scream.md b/.changeset/tough-cycles-scream.md new file mode 100644 index 000000000..b36128770 --- /dev/null +++ b/.changeset/tough-cycles-scream.md @@ -0,0 +1,6 @@ +--- +'myst-common': patch +'myst-cli': patch +--- + +Add cli warnings for invalid citation labels diff --git a/.changeset/weak-games-poke.md b/.changeset/weak-games-poke.md new file mode 100644 index 000000000..e95b2f608 --- /dev/null +++ b/.changeset/weak-games-poke.md @@ -0,0 +1,5 @@ +--- +'citation-js-utils': patch +--- + +Stop removing urls from citation html diff --git a/packages/citation-js-utils/src/index.ts b/packages/citation-js-utils/src/index.ts index 6af2fcb2e..40448b217 100644 --- a/packages/citation-js-utils/src/index.ts +++ b/packages/citation-js-utils/src/index.ts @@ -12,7 +12,7 @@ export type CitationJson = { type?: 'article-journal' | string; id: string; author?: { given: string; family: string }[]; - issued?: { 'date-parts': number[][] }; + issued?: { 'date-parts'?: number[][]; literal?: string }; publisher?: string; title?: string; 'citation-key'?: string; @@ -73,12 +73,20 @@ const defaultString: OutputOptions = { style: CitationJSStyles.apa, }; +export function yearFromCitation(data: CitationJson) { + let year: number | string | undefined = data.issued?.['date-parts']?.[0]?.[0]; + if (year) return year; + year = data.issued?.['literal']?.match(/\b[12][0-9]{3}\b/)?.[0]; + if (year) return year; + return 'n.d.'; +} + export function getInlineCitation(data: CitationJson, kind: InlineCite, opts?: InlineOptions) { let authors = data.author; if (!authors || authors.length === 0) { authors = data.editor; } - const year = data.issued?.['date-parts']?.[0]?.[0]; + const year = yearFromCitation(data); const prefix = opts?.prefix ? `${opts.prefix} ` : ''; const suffix = opts?.suffix ? `, ${opts.suffix}` : ''; let yearPart = kind === InlineCite.t ? ` (${year}${suffix})` : `, ${year}${suffix}`; @@ -120,23 +128,47 @@ export type CitationRenderer = Record< render: (style?: CitationJSStyles) => string; inline: (kind?: InlineCite, opts?: InlineOptions) => InlineNode[]; getDOI: () => string | undefined; + getURL: () => string | undefined; cite: CitationJson; } >; -function wrapWithDoiAnchorTag(doiStr: string) { - if (!doiStr) return ''; - return `${doiStr}`; +function doiUrl(doi?: string) { + return doi ? `https://doi.org/${doi}` : undefined; +} + +function wrapWithAnchorTag(url: string, text?: string) { + if (!url) return ''; + return `${text ?? url}`; +} + +function wrapWithDoiAnchorTag(doi?: string) { + const url = doiUrl(doi); + if (!url) return ''; + return wrapWithAnchorTag(url, doi); } const URL_REGEX = - /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&//=]*)/; + /https?:\/\/(www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&//=]*)/g; + +function replaceUrlsWithAnchorElement(str?: string, doi?: string) { + if (!str) return ''; + const matches = [...str.matchAll(URL_REGEX)]; + let newStr = str; + matches.forEach((match) => { + if (doi && match[0].includes(doi)) { + newStr = newStr.replace(match[0], wrapWithDoiAnchorTag(doi)); + } else { + newStr = newStr.replace(match[0], wrapWithAnchorTag(match[0])); + } + }); + return newStr; +} -function replaceDoiWithAnchorElement(str: string, doi: string) { - if (!str) return str; - const match = str.match(URL_REGEX); - if (!match) return str; - return str.replace(URL_REGEX, wrapWithDoiAnchorTag(doi)); +export function firstNonDoiUrl(str?: string, doi?: string) { + if (!str) return; + const matches = [...str.matchAll(URL_REGEX)]; + return matches.map((match) => match[0]).find((match) => !doi || !match.includes(doi)); } export async function getCitations(bibtex: string): Promise { @@ -156,7 +188,7 @@ export async function getCitations(bibtex: string): Promise { return getInlineCitation(c, kind, opts); }, render(style?: CitationJSStyles) { - return replaceDoiWithAnchorElement( + return replaceUrlsWithAnchorElement( cleanRef(cite.set(c).get({ ...defaultString, style: style ?? CitationJSStyles.apa })), c.DOI, ); @@ -164,6 +196,9 @@ export async function getCitations(bibtex: string): Promise { getDOI(): string | undefined { return c.DOI || undefined; }, + getURL(): string | undefined { + return firstNonDoiUrl(cleanRef(cite.set(c).get(defaultString)), c.DOI) ?? doiUrl(c.DOI); + }, cite: c, }, ]; diff --git a/packages/citation-js-utils/tests/basic.spec.ts b/packages/citation-js-utils/tests/basic.spec.ts index 6c814ee56..d5bc1e659 100644 --- a/packages/citation-js-utils/tests/basic.spec.ts +++ b/packages/citation-js-utils/tests/basic.spec.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from 'vitest'; -import { getCitations, CitationJSStyles } from '../src'; +import { getCitations, CitationJSStyles, yearFromCitation, firstNonDoiUrl } from '../src'; import { bibtex, doiInNote, @@ -33,3 +33,62 @@ describe('Test reference rendering', () => { expect(citations['cury2020sparse'].getDOI()).toBe(TEST_DOI_IN_OTHER_FIELD); }); }); + +describe('yearFromCitation', () => { + it('date-parts year is returned', async () => { + const data = { id: 'id', issued: { 'date-parts': [[2020, 1, 1]] } }; + expect(yearFromCitation(data)).toEqual(2020); + }); + it('date-parts year is prioritized', async () => { + const data = { id: 'id', issued: { 'date-parts': [[2020, 1, 1]], literal: '1999' } }; + expect(yearFromCitation(data)).toEqual(2020); + }); + it('literal is used', async () => { + const data = { id: 'id', issued: { literal: '2020' } }; + expect(yearFromCitation(data)).toEqual('2020'); + }); + it('literal is parses from string', async () => { + const data = { id: 'id', issued: { literal: 'Accessed 2020 Jan 1' } }; + expect(yearFromCitation(data)).toEqual('2020'); + }); + it('literal is parses from string with comma', async () => { + const data = { id: 'id', issued: { literal: 'Accessed 2020, Jan 1' } }; + expect(yearFromCitation(data)).toEqual('2020'); + }); + it('literal is does not parse longer number', async () => { + const data = { id: 'id', issued: { literal: 'Accessed 202020' } }; + expect(yearFromCitation(data)).toEqual('n.d.'); + }); + it('literal is does not parse as part of word', async () => { + const data = { id: 'id', issued: { literal: 'Accessed a2020' } }; + expect(yearFromCitation(data)).toEqual('n.d.'); + }); + it('no date returns n.d.', async () => { + const data = { id: 'id' }; + expect(yearFromCitation(data)).toEqual('n.d.'); + }); +}); + +describe('firstNonDoiUrl', () => { + it('no url returns undefined', async () => { + expect(firstNonDoiUrl('my citation', 'abc123')).toEqual(undefined); + }); + it('one url returns url', async () => { + expect(firstNonDoiUrl('my citation https://example.com', 'abc123')).toEqual( + 'https://example.com', + ); + }); + it('two urls returns first url', async () => { + expect( + firstNonDoiUrl('my citation https://example.com/a and https://example.com/b', 'abc123'), + ).toEqual('https://example.com/a'); + }); + it('doi urls is skipped', async () => { + expect(firstNonDoiUrl('my citation https://example.com/abc123', 'abc123')).toEqual(undefined); + }); + it('url after doi url is returned', async () => { + expect( + firstNonDoiUrl('my citation https://example.com/abc123 and https://example.com/b', 'abc123'), + ).toEqual('https://example.com/b'); + }); +}); diff --git a/packages/myst-cli/src/process/mdast.ts b/packages/myst-cli/src/process/mdast.ts index bd8dc7358..6162c6f16 100644 --- a/packages/myst-cli/src/process/mdast.ts +++ b/packages/myst-cli/src/process/mdast.ts @@ -232,7 +232,7 @@ export async function transformMdast( transformRenderInlineExpressions(mdast, vfile); await transformOutputsToCache(session, mdast, kind, { minifyMaxCharacters }); transformFilterOutputStreams(mdast, vfile, frontmatter.settings); - transformCitations(mdast, fileCitationRenderer, references); + transformCitations(session, file, mdast, fileCitationRenderer, references); await unified() .use(codePlugin, { lang: frontmatter?.kernelspec?.language }) .use(footnotesPlugin) // Needs to happen near the end diff --git a/packages/myst-cli/src/transforms/citations.spec.ts b/packages/myst-cli/src/transforms/citations.spec.ts new file mode 100644 index 000000000..7f11b7b17 --- /dev/null +++ b/packages/myst-cli/src/transforms/citations.spec.ts @@ -0,0 +1,92 @@ +import { describe, expect, it } from 'vitest'; +import { Session } from '../session'; +import { transformCitations } from './citations'; +import type { CitationRenderer } from 'citation-js-utils'; +import type { References } from 'myst-common'; + +const RENDERER: CitationRenderer = { + author1: { + render: () => '', + inline: () => { + return [{ type: 'text', value: 'inline 1' }]; + }, + getDOI: () => 'abc123', + getURL: () => 'https://example.com', + cite: { id: 'my-cite-1' }, + }, + author2: { + render: () => '', + inline: () => { + return [{ type: 'text', value: 'inline 2' }]; + }, + getDOI: () => undefined, + getURL: () => undefined, + cite: { id: 'my-cite-2' }, + }, +}; + +describe('transformCitations', () => { + it('citation transforms', async () => { + const mdast: any = { + type: 'root', + children: [ + { + type: 'cite', + label: 'author1', + }, + ], + }; + const references: References = {}; + transformCitations(new Session(), '', mdast, RENDERER, references); + expect(mdast.children[0].children).toEqual([{ type: 'text', value: 'inline 1' }]); + expect(mdast.children[0].enumerator).toEqual('1'); + expect(references.cite?.order).toEqual(['author1']); + expect(references.cite?.data?.author1).toEqual({ + label: 'author1', + doi: 'abc123', + url: 'https://example.com', + enumerator: '1', + html: '', + }); + }); + it('multiple citations transform', async () => { + const mdast: any = { + type: 'root', + children: [ + { + type: 'cite', + label: 'author2', + }, + { + type: 'cite', + label: 'author1', + }, + { + type: 'cite', + label: 'author2', + }, + ], + }; + const references: References = {}; + transformCitations(new Session(), '', mdast, RENDERER, references); + expect(mdast.children[0].children).toEqual([{ type: 'text', value: 'inline 2' }]); + expect(mdast.children[0].enumerator).toEqual('1'); + expect(mdast.children[1].children).toEqual([{ type: 'text', value: 'inline 1' }]); + expect(mdast.children[1].enumerator).toEqual('2'); + expect(mdast.children[2].children).toEqual([{ type: 'text', value: 'inline 2' }]); + expect(mdast.children[2].enumerator).toEqual('1'); + expect(references.cite?.order).toEqual(['author2', 'author1']); + expect(references.cite?.data?.author1).toEqual({ + label: 'author1', + doi: 'abc123', + url: 'https://example.com', + enumerator: '2', + html: '', + }); + expect(references.cite?.data?.author2).toEqual({ + label: 'author2', + enumerator: '1', + html: '', + }); + }); +}); diff --git a/packages/myst-cli/src/transforms/citations.ts b/packages/myst-cli/src/transforms/citations.ts index 80161379f..f709e0f5a 100644 --- a/packages/myst-cli/src/transforms/citations.ts +++ b/packages/myst-cli/src/transforms/citations.ts @@ -1,32 +1,45 @@ import type { CitationRenderer } from 'citation-js-utils'; import { InlineCite } from 'citation-js-utils'; +import { RuleId } from 'myst-common'; import type { GenericNode, GenericParent, References } from 'myst-common'; import type { StaticPhrasingContent } from 'myst-spec'; import type { Cite } from 'myst-spec-ext'; import { selectAll } from 'unist-util-select'; +import type { ISession } from '../session/types.js'; +import { addWarningForFile } from '../utils/addWarningForFile.js'; function pushCite( references: Pick, citeRenderer: CitationRenderer, label: string, -) { +): string { if (!references.cite) { references.cite = { order: [], data: {} }; } - if (!references.cite?.data[label]) { + if (!references.cite.data[label]) { references.cite.order.push(label); + references.cite.data[label] = { + label, + enumerator: `${references.cite.order.length}`, + doi: citeRenderer[label]?.getDOI(), + html: citeRenderer[label]?.render(), + url: citeRenderer[label]?.getURL(), + }; } - references.cite.data[label] = { - // TODO: this number isn't right? Should be the last time it was seen, not the current size. - number: references.cite.order.length, - doi: citeRenderer[label]?.getDOI(), - html: citeRenderer[label]?.render(), - }; + return references.cite.data[label].enumerator; } -function addCitationChildren(cite: Cite, renderer: CitationRenderer): boolean { +function addCitationChildren( + session: ISession, + file: string, + cite: Cite, + renderer: CitationRenderer, +): boolean { const render = renderer[cite.label as string]; if (!render) { + addWarningForFile(session, file, `Citation not found for label: ${cite.label}`, 'error', { + ruleId: RuleId.citationLabelExists, + }); cite.error = 'not found'; return false; } @@ -38,6 +51,15 @@ function addCitationChildren(cite: Cite, renderer: CitationRenderer): boolean { partial: cite.partial, }) as StaticPhrasingContent[]; } catch (error) { + addWarningForFile( + session, + file, + `Citation failed to render for label: ${cite.label}`, + 'error', + { + ruleId: RuleId.citationRenders, + }, + ); cite.error = 'rendering error'; return false; } @@ -51,6 +73,8 @@ function hasChildren(node: GenericNode) { } export function transformCitations( + session: ISession, + file: string, mdast: GenericParent, renderer: CitationRenderer, references: Pick, @@ -59,7 +83,7 @@ export function transformCitations( citations.forEach((cite) => { const citeLabel = cite.label as string; // push cites in order of appearance in the document - const success = addCitationChildren(cite, renderer); - if (success) pushCite(references, renderer, citeLabel); + const success = addCitationChildren(session, file, cite, renderer); + if (success) cite.enumerator = pushCite(references, renderer, citeLabel); }); } diff --git a/packages/myst-cli/src/transforms/dois.ts b/packages/myst-cli/src/transforms/dois.ts index f333514e4..d58c1fe40 100644 --- a/packages/myst-cli/src/transforms/dois.ts +++ b/packages/myst-cli/src/transforms/dois.ts @@ -13,28 +13,38 @@ import type { SingleCitationRenderer } from './types.js'; import type { VFile } from 'vfile'; import type { ISession } from '../session/types.js'; -function doiCacheFile(session: ISession, normalizedDoi: string) { +function doiBibtexCacheFile(session: ISession, normalizedDoi: string) { const filename = `doi-${computeHash(normalizedDoi)}.bib`; const cacheFolder = join(session.buildPath(), 'cache'); if (!fs.existsSync(cacheFolder)) fs.mkdirSync(cacheFolder, { recursive: true }); return join(cacheFolder, filename); } +function doiResolvesCacheFile(session: ISession, normalizedDoi: string) { + const filename = `doi-${computeHash(normalizedDoi)}.txt`; + const cacheFolder = join(session.buildPath(), 'cache'); + if (!fs.existsSync(cacheFolder)) fs.mkdirSync(cacheFolder, { recursive: true }); + return join(cacheFolder, filename); +} + +/** + * Fetch bibtex entry for doi from doi.org using application/x-bibtex accept header + */ export async function getDoiOrgBibtex( session: ISession, doiString: string, ): Promise { const normalizedDoi = doi.normalize(doiString); - if (!doi.validate(doiString) || !normalizedDoi) return null; - const cachePath = doiCacheFile(session, normalizedDoi); + const url = doi.buildUrl(normalizedDoi); + if (!doi.validate(doiString) || !normalizedDoi || !url) return null; + const cachePath = doiBibtexCacheFile(session, normalizedDoi); if (fs.existsSync(cachePath)) { const bibtex = fs.readFileSync(cachePath).toString(); - session.log.debug(`Loaded cached reference information doi:${normalizedDoi}`); + session.log.debug(`Loaded cached reference bibtex for doi:${normalizedDoi}`); return bibtex; } const toc = tic(); - session.log.debug('Fetching DOI information from doi.org'); - const url = `https://doi.org/${normalizedDoi}`; + session.log.debug('Fetching DOI bibtex from doi.org'); const response = await session .fetch(url, { headers: [['Accept', 'application/x-bibtex']], @@ -44,16 +54,44 @@ export async function getDoiOrgBibtex( return null; }); if (!response || !response.ok) { - session.log.debug(`doi.org fetch failed for ${doiString}}`); + session.log.debug(`doi.org fetch failed for ${doiString}`); return null; } const bibtex = await response.text(); - session.log.debug(toc(`Fetched reference information doi:${normalizedDoi} in %s`)); - session.log.debug(`Saving doi to cache ${cachePath}`); + session.log.debug(toc(`Fetched reference bibtex for doi:${normalizedDoi} in %s`)); + session.log.debug(`Saving doi bibtex to cache ${cachePath}`); fs.writeFileSync(cachePath, bibtex); return bibtex; } +/** + * Fetch doi from doi.org to see if it resolves + */ +export async function doiOrgResolves(session: ISession, doiString: string): Promise { + const normalizedDoi = doi.normalize(doiString); + const url = doi.buildUrl(normalizedDoi); + if (!doi.validate(doiString) || !normalizedDoi || !url) return false; + const cachePath = doiResolvesCacheFile(session, normalizedDoi); + if (fs.existsSync(cachePath)) { + session.log.debug(`Loaded cached resolution result for doi:${normalizedDoi}`); + return true; + } + const toc = tic(); + session.log.debug('Resolving doi existence from doi.org'); + const response = await session.fetch(url).catch(() => { + session.log.debug(`Request to ${url} failed.`); + return null; + }); + if (!response || !response.ok) { + session.log.debug(`doi.org fetch failed for ${doiString}`); + return false; + } + session.log.debug(toc(`Resolved doi existence for doi:${normalizedDoi} in %s`)); + session.log.debug(`Saving resolution result to cache ${cachePath}`); + fs.writeFileSync(cachePath, 'ok'); + return true; +} + export async function getCitation( session: ISession, vfile: VFile, @@ -63,9 +101,20 @@ export async function getCitation( if (!doi.validate(doiString)) return null; const bibtex = await getDoiOrgBibtex(session, doiString); if (!bibtex) { - fileWarn(vfile, `Could not find DOI from link: ${doiString} as ${doi.normalize(doiString)}`, { + const resolves = await doiOrgResolves(session, doiString); + const normalizedDoi = doi.normalize(doiString); + let message: string; + let note: string | undefined; + if (resolves) { + message = `No bibtex available from doi.org for doi:${normalizedDoi}`; + note = `To resolve this error, visit ${doi.buildUrl(normalizedDoi)} and add citation info to .bib file`; + } else { + message = `Could not find DOI from link: ${doiString} as ${normalizedDoi}`; + } + fileWarn(vfile, message, { node, ruleId: RuleId.doiLinkValid, + note, }); return null; } diff --git a/packages/myst-common/src/ruleids.ts b/packages/myst-common/src/ruleids.ts index 4ad643831..591f0c012 100644 --- a/packages/myst-common/src/ruleids.ts +++ b/packages/myst-common/src/ruleids.ts @@ -80,6 +80,8 @@ export enum RuleId { // Citation rules citationIsUnique = 'citation-is-unique', bibFileExists = 'bib-file-exists', + citationRenders = 'citation-renders', + citationLabelExists = 'citation-label-exists', // Code rules codeMetadataLifted = 'code-metadata-lifted', codeMetatagsValid = 'code-metatags-valid', diff --git a/packages/myst-common/src/types.ts b/packages/myst-common/src/types.ts index 20cba656a..8ad29e144 100644 --- a/packages/myst-common/src/types.ts +++ b/packages/myst-common/src/types.ts @@ -20,7 +20,10 @@ export type GenericParent = Record> = export type Citations = { order: string[]; - data: Record; + data: Record< + string, + { label: string; html: string; enumerator: string; doi?: string; url?: string } + >; }; export enum NotebookCell { diff --git a/packages/myst-spec-ext/src/types.ts b/packages/myst-spec-ext/src/types.ts index d14c97024..a253a0a24 100644 --- a/packages/myst-spec-ext/src/types.ts +++ b/packages/myst-spec-ext/src/types.ts @@ -141,6 +141,7 @@ export type Cite = { prefix?: string; suffix?: string; partial?: 'author' | 'year'; + enumerator?: string; }; export type CiteGroup = {