diff --git a/docs/source/_data/sidebar.yml b/docs/source/_data/sidebar.yml index bf6affcb97..6aa1d0c223 100644 --- a/docs/source/_data/sidebar.yml +++ b/docs/source/_data/sidebar.yml @@ -29,6 +29,7 @@ filters: overview: overview.html abs: abs.html append: append.html + array_to_sentence_string: array_to_sentence_string.html at_least: at_least.html at_most: at_most.html capitalize: capitalize.html @@ -63,6 +64,7 @@ filters: modulo: modulo.html newline_to_br: newline_to_br.html normalize_whitespace: normalize_whitespace.html + number_of_words: number_of_words.html plus: plus.html pop: pop.html push: push.html diff --git a/docs/source/filters/array_to_sentence_string.md b/docs/source/filters/array_to_sentence_string.md new file mode 100644 index 0000000000..32f25f7ad9 --- /dev/null +++ b/docs/source/filters/array_to_sentence_string.md @@ -0,0 +1,27 @@ +--- +title: array_to_sentence_string +--- + +{% since %}v10.13.0{% endsince %} + +Convert an array into a sentence. Useful for listing tags. Optional argument for connector. + +Input +```liquid +{{ "foo,bar,baz" | split: "," | array_to_sentence_string }} +``` + +Output +```text +foo, bar, and baz +``` + +Input +```liquid +{{ "foo,bar,baz" | split: "," | array_to_sentence_string: "or" }} +``` + +Output +```text +foo, bar, or baz +``` diff --git a/docs/source/filters/number_of_words.md b/docs/source/filters/number_of_words.md new file mode 100644 index 0000000000..46b39275af --- /dev/null +++ b/docs/source/filters/number_of_words.md @@ -0,0 +1,49 @@ +--- +title: number_of_words +--- + +{% since %}v10.13.0{% endsince %} + +Count the number of words in some text. This filter takes an optional argument to control the handling of Chinese-Japanese-Korean (CJK) characters in the input string: +- Passing `'cjk'` as the argument will count every CJK character detected as one word irrespective of being separated by whitespace. 
+- Passing `'auto'` (auto-detect) works similarly to `'cjk'` but is more performant if the filter is used on a variable string that may or may not contain CJK characters. + +Input +```liquid +{{ "Hello world!" | number_of_words }} +``` + +Output +```text +2 +``` + +Input +```liquid +{{ "你好hello世界world" | number_of_words }} +``` + +Output +```text +1 +``` + +Input +```liquid +{{ "你好hello世界world" | number_of_words: "cjk" }} +``` + +Output +```text +6 +``` + +Input +```liquid +{{ "你好hello世界world" | number_of_words: "auto" }} +``` + +Output +```text +6 +``` diff --git a/docs/source/filters/overview.md b/docs/source/filters/overview.md index c531cd281c..447715e8cb 100644 --- a/docs/source/filters/overview.md +++ b/docs/source/filters/overview.md @@ -10,7 +10,7 @@ There's 40+ filters supported by LiquidJS. These filters can be categorized into Categories | Filters --- | --- Math | plus, minus, modulo, times, floor, ceil, round, divided_by, abs, at_least, at_most -String | append, prepend, capitalize, upcase, downcase, strip, lstrip, rstrip, strip_newlines, split, replace, replace_first, replace_last,remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace +String | append, prepend, capitalize, upcase, downcase, strip, lstrip, rstrip, strip_newlines, split, replace, replace_first, replace_last, remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace, number_of_words, array_to_sentence_string HTML/URI | escape, escape_once, url_encode, url_decode, strip_html, newline_to_br, xml_escape, cgi_escape, uri_escape Array | slice, map, sort, sort_natural, uniq, where, where_exp, group_by, group_by_exp, find, find_exp, first, last, join, reverse, concat, compact, size, push, pop, shift, unshift Date | date, date_to_xmlschema, date_to_rfc822, date_to_string, date_to_long_string diff --git a/docs/source/zh-cn/filters/array_to_sentence_string.md b/docs/source/zh-cn/filters/array_to_sentence_string.md new file mode 100644 index
0000000000..3e1d8b5378 --- /dev/null +++ b/docs/source/zh-cn/filters/array_to_sentence_string.md @@ -0,0 +1,27 @@ +--- +title: array_to_sentence_string +--- + +{% since %}v10.13.0{% endsince %} + +把数组转化为句子,用于做标签列表。有一个可选的连接词参数。 + +输入 +```liquid +{{ "foo,bar,baz" | split: "," | array_to_sentence_string }} +``` + +输出 +```text +foo, bar, and baz +``` + +输入 +```liquid +{{ "foo,bar,baz" | split: "," | array_to_sentence_string: "or" }} +``` + +输出 +```text +foo, bar, or baz +``` diff --git a/docs/source/zh-cn/filters/number_of_words.md b/docs/source/zh-cn/filters/number_of_words.md new file mode 100644 index 0000000000..2e239b8766 --- /dev/null +++ b/docs/source/zh-cn/filters/number_of_words.md @@ -0,0 +1,49 @@ +--- +title: number_of_words +--- + +{% since %}v10.13.0{% endsince %} + +计算文本中的单词数。此过滤器接受一个可选参数,用于控制输入字符串中汉字-日语-韩语(CJK)字符的处理方式: +- `'cjk'`:将每个检测到的 CJK 字符计为一个单词,无论是否由空格分隔。 +- `'auto'`:与 `'cjk'` 类似,但如果过滤器用于可能包含或不包含 CJK 字符的字符串,则性能更好。 + +输入 +```liquid +{{ "Hello world!" | number_of_words }} +``` + +输出 +```text +2 +``` + +输入 +```liquid +{{ "你好hello世界world" | number_of_words }} +``` + +输出 +```text +1 +``` + +输入 +```liquid +{{ "你好hello世界world" | number_of_words: "cjk" }} +``` + +输出 +```text +6 +``` + +输入 +```liquid +{{ "你好hello世界world" | number_of_words: "auto" }} +``` + +输出 +```text +6 +``` diff --git a/docs/source/zh-cn/filters/overview.md b/docs/source/zh-cn/filters/overview.md index 58b08b5323..97d6b21342 100644 --- a/docs/source/zh-cn/filters/overview.md +++ b/docs/source/zh-cn/filters/overview.md @@ -10,7 +10,7 @@ LiquidJS 共支持 40+ 个过滤器,可以分为如下几类: 类别 | 过滤器 --- | --- 数学 | plus, minus, modulo, times, floor, ceil, round, divided_by, abs, at_least, at_most -字符串 | append, prepend, capitalize, upcase, downcase, strip, lstrip, rstrip, strip_newlines, split, replace, replace_first, replace_last, remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace +字符串 | append, prepend, capitalize, upcase, downcase, strip, lstrip, rstrip, strip_newlines, split, 
replace, replace_first, replace_last, remove, remove_first, remove_last, truncate, truncatewords, normalize_whitespace, number_of_words, array_to_sentence_string HTML/URI | escape, escape_once, url_encode, url_decode, strip_html, newline_to_br, xml_escape, cgi_escape, uri_escape 数组 | slice, map, sort, sort_natural, uniq, where, where_exp, group_by, group_by_exp, find, find_exp, first, last, join, reverse, concat, compact, size, push, pop, shift, unshift 日期 | date, date_to_xmlschema, date_to_rfc822, date_to_string, date_to_long_string diff --git a/src/filters/string.ts b/src/filters/string.ts index 1bdc315d81..0a431b352a 100644 --- a/src/filters/string.ts +++ b/src/filters/string.ts @@ -3,8 +3,20 @@ * * * prefer stringify() to String() since `undefined`, `null` should eval '' */ + +// Han (Chinese) characters: \u4E00-\u9FFF +// Additional Han characters: \uF900-\uFAFF (CJK Compatibility Ideographs) +// Additional Han characters: \u3400-\u4DBF (CJK Unified Ideographs Extension A) +// Katakana (Japanese): \u30A0-\u30FF +// Hiragana (Japanese): \u3040-\u309F +// Hangul (Korean): \uAC00-\uD7AF import { assert, escapeRegExp, stringify } from '../util' +const rCJKWord = /[\u4E00-\u9FFF\uF900-\uFAFF\u3400-\u4DBF\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7AF]/gu + +// Non-CJK word: a run of characters that are neither CJK nor whitespace +const rNonCJKWord = /[^\u4E00-\u9FFF\uF900-\uFAFF\u3400-\u4DBF\u3040-\u309F\u30A0-\u30FF\uAC00-\uD7AF\s]+/gu + export function append (v: string, arg: string) { assert(arguments.length === 2, 'append expect 2 arguments') return stringify(v) + stringify(arg) @@ -32,16 +44,16 @@ export function upcase (str: string) { } export function remove (v: string, arg: string) { - return stringify(v).split(String(arg)).join('') + return stringify(v).split(stringify(arg)).join('') } export function remove_first (v: string, l: string) { - return stringify(v).replace(String(l), '') + return stringify(v).replace(stringify(l), '') } export function remove_last (v:
string, l: string) { const str = stringify(v) - const pattern = String(l) + const pattern = stringify(l) const index = str.lastIndexOf(pattern) if (index === -1) return str return str.substring(0, index) + str.substring(index + pattern.length) @@ -56,7 +68,7 @@ export function rstrip (str: string, chars?: string) { } export function split (v: string, arg: string) { - const arr = stringify(v).split(String(arg)) + const arr = stringify(v).split(stringify(arg)) // align to ruby split, which is the behavior of shopify/liquid // see: https://ruby-doc.org/core-2.4.0/String.html#method-i-split while (arr.length && arr[arr.length - 1] === '') arr.pop() @@ -83,19 +95,19 @@ export function capitalize (str: string) { } export function replace (v: string, pattern: string, replacement: string) { - return stringify(v).split(String(pattern)).join(replacement) + return stringify(v).split(stringify(pattern)).join(replacement) } export function replace_first (v: string, arg1: string, arg2: string) { - return stringify(v).replace(String(arg1), arg2) + return stringify(v).replace(stringify(arg1), arg2) } export function replace_last (v: string, arg1: string, arg2: string) { const str = stringify(v) - const pattern = String(arg1) + const pattern = stringify(arg1) const index = str.lastIndexOf(pattern) if (index === -1) return str - const replacement = String(arg2) + const replacement = stringify(arg2) return str.substring(0, index) + replacement + str.substring(index + pattern.length) } @@ -117,3 +129,34 @@ export function normalize_whitespace (v: string) { v = stringify(v) return v.replace(/\s+/g, ' ') } + +export function number_of_words (input: string, mode?: 'cjk' | 'auto') { + input = stringify(input).trim() + if (!input) return 0 + switch (mode) { + case 'cjk': + // Count CJK characters and words + return (input.match(rCJKWord) || []).length + (input.match(rNonCJKWord) || []).length + case 'auto': { + // Match once and branch on the result: avoids a second scan and calling test() on a stateful /g regex + const cjk = input.match(rCJKWord)
+ return cjk ? cjk.length + (input.match(rNonCJKWord) || []).length : input.split(/\s+/).length + } + default: + // Count words only + return input.split(/\s+/).length + } +} + +export function array_to_sentence_string (array: unknown[], connector = 'and') { + switch (array.length) { + case 0: + return '' + case 1: + return array[0] + case 2: + return `${array[0]} ${connector} ${array[1]}` + default: + return `${array.slice(0, -1).join(', ')}, ${connector} ${array[array.length - 1]}` + } +} diff --git a/test/integration/filters/string.spec.ts b/test/integration/filters/string.spec.ts index 3cb494da5a..108e46989f 100644 --- a/test/integration/filters/string.spec.ts +++ b/test/integration/filters/string.spec.ts @@ -238,4 +238,100 @@ describe('filters/string', function () { expect(liquid.parseAndRenderSync('{{ "a \n b c" | normalize_whitespace }}')).toEqual('a b c') }) }) + describe('number_of_words', () => { + it('should count words of Latin sentence', async () => { + const html = await liquid.parseAndRender('{{ "I\'m not hungry" | number_of_words: "auto"}}') + expect(html).toEqual('3') + }) + + it('should count words of mixed sentence', async () => { + const html = await liquid.parseAndRender('{{ "Hello world!"
| number_of_words }}') + expect(html).toEqual('2') + }) + + it('should count words of CJK sentence', async () => { + const html = await liquid.parseAndRender('{{ "你好hello世界world" | number_of_words }}') + expect(html).toEqual('1') + }) + + it('should count words of CJK sentence with mode "cjk"', async () => { + const html = await liquid.parseAndRender('{{ "你好hello世界world" | number_of_words: "cjk" }}') + expect(html).toEqual('6') + }) + + it('should count words of CJK sentence with mode "auto"', async () => { + const html = await liquid.parseAndRender('{{ "你好hello世界world" | number_of_words: "auto" }}') + expect(html).toEqual('6') + }) + it('should handle empty input', async () => { + const html = await liquid.parseAndRender('{{ "" | number_of_words }}') + expect(html).toEqual('0') + }) + + it('should handle input with only whitespace', async () => { + const html = await liquid.parseAndRender('{{ " " | number_of_words }}') + expect(html).toEqual('0') + }) + + it('should count words with punctuation marks', async () => { + const html = await liquid.parseAndRender('{{ "Hello! This is a test." | number_of_words }}') + expect(html).toEqual('5') + }) + + it('should count words with special characters', async () => { + const html = await liquid.parseAndRender('{{ "This is a test with special characters: !@#$%^&*()-_+=`~[]{};:\'\\"\\|<,>.?/" | number_of_words }}') + expect(html).toEqual('8') + }) + + it('should count words with multiple spaces between words', async () => { + const html = await liquid.parseAndRender('{{ " Hello world!
" | number_of_words }}') + expect(html).toEqual('2') + }) + + it('should count words with mixed CJK characters', async () => { + const html = await liquid.parseAndRender('{{ "你好こんにちは안녕하세요" | number_of_words: "cjk" }}') + expect(html).toEqual('12') + }) + }) + describe('array_to_sentence_string', () => { + it('should handle an empty array', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: [] }) + expect(html).toEqual('') + }) + + it('should handle an array with one element', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ['apple'] }) + expect(html).toEqual('apple') + }) + + it('should handle an array with two elements', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ['apple', 'banana'] }) + expect(html).toEqual('apple and banana') + }) + + it('should handle an array with more than two elements', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ['apple', 'banana', 'orange'] }) + expect(html).toEqual('apple, banana, and orange') + }) + + it('should handle an array with custom connector', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string: "or" }}', { arr: ['apple', 'banana', 'orange'] }) + expect(html).toEqual('apple, banana, or orange') + }) + + it('should handle an array of numbers', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: [1, 2, 3] }) + expect(html).toEqual('1, 2, and 3') + }) + + it('should handle an array of mixed types', async () => { + const html = await liquid.parseAndRender('{{ arr | array_to_sentence_string }}', { arr: ['apple', 2, 'orange'] }) + expect(html).toEqual('apple, 2, and orange') + }) + + it('should handle an array produced by the split filter', async () => { + const html = await liquid.parseAndRender('{{ "foo,bar,baz" | split: "," |
array_to_sentence_string }}') + expect(html).toEqual('foo, bar, and baz') + }) + }) })