Skip to content

Commit

Permalink
Add support for recognizing word locales in word operations (microsof…
Browse files Browse the repository at this point in the history
…t#50045) (microsoft#203605)

* Add support for recognizing word locales in word operations (microsoft#50045)

* Move intlSegmenterLocales in the WordCharacterClassifier class

* Rerun compiler

* Renames

* Avoid duplicating code

---------

Co-authored-by: Alex Dima <[email protected]>
  • Loading branch information
2 people authored and chen-ky committed Mar 18, 2024
1 parent 96d7c32 commit d1e22da
Show file tree
Hide file tree
Showing 14 changed files with 329 additions and 74 deletions.
4 changes: 2 additions & 2 deletions src/vs/editor/browser/controller/textAreaHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ export class TextAreaHandler extends ViewPart {
private _getAndroidWordAtPosition(position: Position): [string, number] {
const ANDROID_WORD_SEPARATORS = '`~!@#$%^&*()-=+[{]}\\|;:",.<>/?';
const lineContent = this._context.viewModel.getLineContent(position.lineNumber);
const wordSeparators = getMapForWordSeparators(ANDROID_WORD_SEPARATORS);
const wordSeparators = getMapForWordSeparators(ANDROID_WORD_SEPARATORS, []);

let goingLeft = true;
let startColumn = position.column;
Expand Down Expand Up @@ -530,7 +530,7 @@ export class TextAreaHandler extends ViewPart {

private _getWordBeforePosition(position: Position): string {
const lineContent = this._context.viewModel.getLineContent(position.lineNumber);
const wordSeparators = getMapForWordSeparators(this._context.configuration.options.get(EditorOption.wordSeparators));
const wordSeparators = getMapForWordSeparators(this._context.configuration.options.get(EditorOption.wordSeparators), []);

let column = position.column;
let distance = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ export class CodeEditorWidget extends Disposable implements editorBrowser.ICodeE
if (!this._modelData) {
return null;
}
return WordOperations.getWordAtPosition(this._modelData.model, this._configuration.options.get(EditorOption.wordSeparators), position);
return WordOperations.getWordAtPosition(this._modelData.model, this._configuration.options.get(EditorOption.wordSeparators), this._configuration.options.get(EditorOption.wordSegmenterLocales), position);
}

public getValue(options: { preserveBOM: boolean; lineEnding: string } | null = null): string {
Expand Down
66 changes: 66 additions & 0 deletions src/vs/editor/common/config/editorOptions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,13 @@ export interface IEditorOptions {
* Defaults to empty array.
*/
rulers?: (number | IRulerOption)[];
/**
* Locales used for segmenting lines into words when doing word related navigations or operations.
*
* Specify the BCP 47 language tag of the word you wish to recognize (e.g., ja, zh-CN, zh-Hant-TW, etc.).
* Defaults to empty array
*/
wordSegmenterLocales?: string | string[];
/**
* A string containing the word separators used when doing word navigation.
* Defaults to `~!@#$%^&*()-=+[{]}\\|;:\'",.<>/?
Expand Down Expand Up @@ -4895,6 +4902,63 @@ class SmartSelect extends BaseEditorOption<EditorOption.smartSelect, ISmartSelec
}
}

//#endregion

//#region wordSegmenterLocales

/**
 * Editor option listing the locales used for segmenting lines into words
 * when doing word related navigations or operations.
 *
 * Accepts either a single BCP 47 language tag or an array of them
 * (e.g., ja, zh-CN, zh-Hant-TW) and always validates to an array.
 */
class WordSegmenterLocales extends BaseEditorOption<EditorOption.wordSegmenterLocales, string | string[], string[]> {
	constructor() {
		// The same description is attached to both accepted schema shapes.
		const description = nls.localize('wordSegmenterLocales', "Locales to be used for word segmentation when doing word related navigations or operations. Specify the BCP 47 language tag of the word you wish to recognize (e.g., ja, zh-CN, zh-Hant-TW, etc.).");
		const noLocales: string[] = [];

		super(
			EditorOption.wordSegmenterLocales,
			'wordSegmenterLocales',
			noLocales,
			{
				anyOf: [
					{ description, type: 'string' },
					{ description, type: 'array', items: { type: 'string' } }
				]
			}
		);
	}

	/**
	 * Normalizes the raw setting value to a list of usable locales.
	 *
	 * @param input The raw value: a single string is treated as a one-element list.
	 * @returns The string entries for which `Intl.Segmenter.supportedLocalesOf`
	 * reports support; the option's default value when `input` is neither a
	 * string nor an array.
	 */
	public validate(input: any): string[] {
		const candidates = typeof input === 'string' ? [input] : input;
		if (!Array.isArray(candidates)) {
			return this.defaultValue;
		}

		return candidates.filter((candidate): candidate is string => {
			if (typeof candidate !== 'string') {
				return false;
			}
			try {
				return Intl.Segmenter.supportedLocalesOf(candidate).length > 0;
			} catch {
				// An entry that makes the lookup throw is treated as invalid and dropped.
				return false;
			}
		});
	}
}


//#endregion

//#region wrappingIndent
Expand Down Expand Up @@ -5288,6 +5352,7 @@ export const enum EditorOption {
useShadowDOM,
useTabStops,
wordBreak,
wordSegmenterLocales,
wordSeparators,
wordWrap,
wordWrapBreakAfterCharacters,
Expand Down Expand Up @@ -6005,6 +6070,7 @@ export const EditorOptions = {
description: nls.localize('wordBreak', "Controls the word break rules used for Chinese/Japanese/Korean (CJK) text.")
}
)),
wordSegmenterLocales: register(new WordSegmenterLocales()),
wordSeparators: register(new EditorStringOption(
EditorOption.wordSeparators, 'wordSeparators', USUAL_WORD_SEPARATORS,
{ description: nls.localize('wordSeparators', "Characters that will be used as word separators when doing word related navigations or operations.") }
Expand Down
92 changes: 80 additions & 12 deletions src/vs/editor/common/core/wordCharacterClassifier.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
*--------------------------------------------------------------------------------------------*/

import { CharCode } from 'vs/base/common/charCode';
import { LRUCache } from 'vs/base/common/map';
import { CharacterClassifier } from 'vs/editor/common/core/characterClassifier';

export const enum WordCharacterClass {
Expand All @@ -14,8 +15,19 @@ export const enum WordCharacterClass {

export class WordCharacterClassifier extends CharacterClassifier<WordCharacterClass> {

constructor(wordSeparators: string) {
public readonly intlSegmenterLocales: Intl.UnicodeBCP47LocaleIdentifier[];
private readonly _segmenter: Intl.Segmenter | null = null;
private _cachedLine: string | null = null;
private _cachedSegments: IntlWordSegmentData[] = [];

constructor(wordSeparators: string, intlSegmenterLocales: Intl.UnicodeBCP47LocaleIdentifier[]) {
super(WordCharacterClass.Regular);
this.intlSegmenterLocales = intlSegmenterLocales;
if (this.intlSegmenterLocales.length > 0) {
this._segmenter = new Intl.Segmenter(this.intlSegmenterLocales, { granularity: 'word' });
} else {
this._segmenter = null;
}

for (let i = 0, len = wordSeparators.length; i < len; i++) {
this.set(wordSeparators.charCodeAt(i), WordCharacterClass.WordSeparator);
Expand All @@ -25,18 +37,74 @@ export class WordCharacterClassifier extends CharacterClassifier<WordCharacterCl
this.set(CharCode.Tab, WordCharacterClass.Whitespace);
}

}
/**
 * Finds the last word-like segment on `line` that starts at or before `offset`.
 *
 * Segments are visited in the order returned by `_getIntlSegmenterWordsOnLine`;
 * the scan stops at the first segment that starts after `offset`.
 *
 * @returns The matching segment, or `null` when no segment starts at or before `offset`.
 */
public findPrevIntlWordBeforeOrAtOffset(line: string, offset: number): IntlWordSegmentData | null {
	const words = this._getIntlSegmenterWordsOnLine(line);
	let lastMatch: IntlWordSegmentData | null = null;
	for (const word of words) {
		if (word.index > offset) {
			return lastMatch;
		}
		lastMatch = word;
	}
	return lastMatch;
}

/**
 * Finds the first word-like segment on `lineContent` that starts at or after `offset`.
 *
 * @returns The matching segment, or `null` when every segment starts before `offset`.
 */
public findNextIntlWordAtOrAfterOffset(lineContent: string, offset: number): IntlWordSegmentData | null {
	const words = this._getIntlSegmenterWordsOnLine(lineContent);
	// Negated "<" (rather than ">=") keeps the exact comparison the skip-loop form uses.
	const firstMatch = words.find(word => !(word.index < offset));
	return firstMatch ?? null;
}

private _getIntlSegmenterWordsOnLine(line: string): IntlWordSegmentData[] {
if (!this._segmenter) {
return [];
}

// Check if the line has changed from the previous call
if (this._cachedLine === line) {
return this._cachedSegments;
}

function once<R>(computeFn: (input: string) => R): (input: string) => R {
const cache: { [key: string]: R } = {}; // TODO@Alex unbounded cache
return (input: string): R => {
if (!cache.hasOwnProperty(input)) {
cache[input] = computeFn(input);
// Update the cache with the new line
this._cachedLine = line;
this._cachedSegments = this._filterWordSegments(this._segmenter.segment(line));

return this._cachedSegments;
}

/**
 * Collects, in iteration order, the segments that `_isWordLike` accepts.
 *
 * @param segments The segmenter output to filter.
 * @returns A new array containing only the word-like segments.
 */
private _filterWordSegments(segments: Intl.Segments): IntlWordSegmentData[] {
	return Array.from(segments).filter(
		(candidate): candidate is IntlWordSegmentData => this._isWordLike(candidate)
	);
}

private _isWordLike(segment: Intl.SegmentData): segment is IntlWordSegmentData {
if (segment.isWordLike) {
return true;
}
return cache[input];
};
return false;
}
}

/**
 * An `Intl.SegmentData` entry whose `isWordLike` flag is known to be `true`.
 */
export interface IntlWordSegmentData extends Intl.SegmentData {
isWordLike: true;
}

export const getMapForWordSeparators = once<WordCharacterClassifier>(
(input) => new WordCharacterClassifier(input)
);
// Classifiers are reused across calls, keyed by separators + locales.
// NOTE(review): 10 is presumably the LRU capacity — confirm against LRUCache.
const wordClassifierCache = new LRUCache<string, WordCharacterClassifier>(10);

/**
 * Returns a `WordCharacterClassifier` for the given separators and locales,
 * creating and caching one when no cached instance exists for that combination.
 *
 * @param wordSeparators Characters to classify as word separators.
 * @param intlSegmenterLocales Locales passed on to the classifier; may be empty.
 */
export function getMapForWordSeparators(wordSeparators: string, intlSegmenterLocales: Intl.UnicodeBCP47LocaleIdentifier[]): WordCharacterClassifier {
	const cacheKey = [wordSeparators, intlSegmenterLocales.join(',')].join('/');

	const cached = wordClassifierCache.get(cacheKey);
	if (cached) {
		return cached;
	}

	const classifier = new WordCharacterClassifier(wordSeparators, intlSegmenterLocales);
	wordClassifierCache.set(cacheKey, classifier);
	return classifier;
}
2 changes: 1 addition & 1 deletion src/vs/editor/common/cursor/cursorTypeOperations.ts
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,7 @@ export class TypeOperations {

// Do not auto-close ' or " after a word character
if (pair.open.length === 1 && (ch === '\'' || ch === '"') && autoCloseConfig !== 'always') {
const wordSeparators = getMapForWordSeparators(config.wordSeparators);
const wordSeparators = getMapForWordSeparators(config.wordSeparators, []);
if (lineBefore.length > 0) {
const characterBefore = lineBefore.charCodeAt(lineBefore.length - 1);
if (wordSeparators.get(characterBefore) === WordCharacterClass.Regular) {
Expand Down
40 changes: 36 additions & 4 deletions src/vs/editor/common/cursor/cursorWordOperations.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import * as strings from 'vs/base/common/strings';
import { EditorAutoClosingEditStrategy, EditorAutoClosingStrategy } from 'vs/editor/common/config/editorOptions';
import { CursorConfiguration, ICursorSimpleModel, SelectionStartKind, SingleCursorState } from 'vs/editor/common/cursorCommon';
import { DeleteOperations } from 'vs/editor/common/cursor/cursorDeleteOperations';
import { WordCharacterClass, WordCharacterClassifier, getMapForWordSeparators } from 'vs/editor/common/core/wordCharacterClassifier';
import { WordCharacterClass, WordCharacterClassifier, IntlWordSegmentData, getMapForWordSeparators } from 'vs/editor/common/core/wordCharacterClassifier';
import { Position } from 'vs/editor/common/core/position';
import { Range } from 'vs/editor/common/core/range';
import { Selection } from 'vs/editor/common/core/selection';
Expand Down Expand Up @@ -67,17 +67,29 @@ export class WordOperations {
return { start: start, end: end, wordType: wordType, nextCharClass: nextCharClass };
}

/**
 * Converts a word-like segment into an `IFindWordResult`.
 *
 * The result spans the segment's index up to index + segment length and is
 * always reported with `WordType.Regular`.
 */
private static _createIntlWord(intlWord: IntlWordSegmentData, nextCharClass: WordCharacterClass): IFindWordResult {
	const start = intlWord.index;
	const end = start + intlWord.segment.length;
	return { start, end, wordType: WordType.Regular, nextCharClass };
}

/**
 * Reads the content of the line at `position` from `model` and delegates the
 * search for the previous word to `_doFindPreviousWordOnLine`.
 */
private static _findPreviousWordOnLine(wordSeparators: WordCharacterClassifier, model: ICursorSimpleModel, position: Position): IFindWordResult | null {
	return this._doFindPreviousWordOnLine(
		model.getLineContent(position.lineNumber),
		wordSeparators,
		position
	);
}

private static _doFindPreviousWordOnLine(lineContent: string, wordSeparators: WordCharacterClassifier, position: Position): IFindWordResult | null {
let wordType = WordType.None;

const previousIntlWord = wordSeparators.findPrevIntlWordBeforeOrAtOffset(lineContent, position.column - 2);

for (let chIndex = position.column - 2; chIndex >= 0; chIndex--) {
const chCode = lineContent.charCodeAt(chIndex);
const chClass = wordSeparators.get(chCode);

if (previousIntlWord && chIndex === previousIntlWord.index) {
return this._createIntlWord(previousIntlWord, chClass);
}

if (chClass === WordCharacterClass.Regular) {
if (wordType === WordType.Separator) {
return this._createWord(lineContent, wordType, chClass, chIndex + 1, this._findEndOfWord(lineContent, wordSeparators, wordType, chIndex + 1));
Expand All @@ -103,11 +115,18 @@ export class WordOperations {
}

private static _findEndOfWord(lineContent: string, wordSeparators: WordCharacterClassifier, wordType: WordType, startIndex: number): number {

const nextIntlWord = wordSeparators.findNextIntlWordAtOrAfterOffset(lineContent, startIndex);

const len = lineContent.length;
for (let chIndex = startIndex; chIndex < len; chIndex++) {
const chCode = lineContent.charCodeAt(chIndex);
const chClass = wordSeparators.get(chCode);

if (nextIntlWord && chIndex === nextIntlWord.index + nextIntlWord.segment.length) {
return chIndex;
}

if (chClass === WordCharacterClass.Whitespace) {
return chIndex;
}
Expand All @@ -130,10 +149,16 @@ export class WordOperations {
let wordType = WordType.None;
const len = lineContent.length;

const nextIntlWord = wordSeparators.findNextIntlWordAtOrAfterOffset(lineContent, position.column - 1);

for (let chIndex = position.column - 1; chIndex < len; chIndex++) {
const chCode = lineContent.charCodeAt(chIndex);
const chClass = wordSeparators.get(chCode);

if (nextIntlWord && chIndex === nextIntlWord.index) {
return this._createIntlWord(nextIntlWord, chClass);
}

if (chClass === WordCharacterClass.Regular) {
if (wordType === WordType.Separator) {
return this._createWord(lineContent, wordType, chClass, this._findStartOfWord(lineContent, wordSeparators, wordType, chIndex - 1), chIndex);
Expand All @@ -159,10 +184,17 @@ export class WordOperations {
}

private static _findStartOfWord(lineContent: string, wordSeparators: WordCharacterClassifier, wordType: WordType, startIndex: number): number {

const previousIntlWord = wordSeparators.findPrevIntlWordBeforeOrAtOffset(lineContent, startIndex);

for (let chIndex = startIndex; chIndex >= 0; chIndex--) {
const chCode = lineContent.charCodeAt(chIndex);
const chClass = wordSeparators.get(chCode);

if (previousIntlWord && chIndex === previousIntlWord.index) {
return chIndex;
}

if (chClass === WordCharacterClass.Whitespace) {
return chIndex + 1;
}
Expand Down Expand Up @@ -689,8 +721,8 @@ export class WordOperations {
};
}

public static getWordAtPosition(model: ITextModel, _wordSeparators: string, position: Position): IWordAtPosition | null {
const wordSeparators = getMapForWordSeparators(_wordSeparators);
public static getWordAtPosition(model: ITextModel, _wordSeparators: string, _intlSegmenterLocales: string[], position: Position): IWordAtPosition | null {
const wordSeparators = getMapForWordSeparators(_wordSeparators, _intlSegmenterLocales);
const prevWord = WordOperations._findPreviousWordOnLine(wordSeparators, model, position);
if (prevWord && prevWord.wordType === WordType.Regular && prevWord.start <= position.column - 1 && position.column - 1 <= prevWord.end) {
return WordOperations._createWordAtPosition(model, position.lineNumber, prevWord);
Expand All @@ -703,7 +735,7 @@ export class WordOperations {
}

public static word(config: CursorConfiguration, model: ICursorSimpleModel, cursor: SingleCursorState, inSelectionMode: boolean, position: Position): SingleCursorState {
const wordSeparators = getMapForWordSeparators(config.wordSeparators);
const wordSeparators = getMapForWordSeparators(config.wordSeparators, config.wordSegmenterLocales);
const prevWord = WordOperations._findPreviousWordOnLine(wordSeparators, model, position);
const nextWord = WordOperations._findNextWordOnLine(wordSeparators, model, position);

Expand Down
3 changes: 3 additions & 0 deletions src/vs/editor/common/cursorCommon.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ export class CursorConfiguration {
public readonly surroundingPairs: CharacterMap;
public readonly blockCommentStartToken: string | null;
public readonly shouldAutoCloseBefore: { quote: (ch: string) => boolean; bracket: (ch: string) => boolean; comment: (ch: string) => boolean };
public readonly wordSegmenterLocales: string[];

private readonly _languageId: string;
private _electricChars: { [key: string]: boolean } | null;
Expand All @@ -97,6 +98,7 @@ export class CursorConfiguration {
|| e.hasChanged(EditorOption.useTabStops)
|| e.hasChanged(EditorOption.fontInfo)
|| e.hasChanged(EditorOption.readOnly)
|| e.hasChanged(EditorOption.wordSegmenterLocales)
);
}

Expand Down Expand Up @@ -134,6 +136,7 @@ export class CursorConfiguration {
this.autoClosingOvertype = options.get(EditorOption.autoClosingOvertype);
this.autoSurround = options.get(EditorOption.autoSurround);
this.autoIndent = options.get(EditorOption.autoIndent);
this.wordSegmenterLocales = options.get(EditorOption.wordSegmenterLocales);

this.surroundingPairs = {};
this._electricChars = null;
Expand Down
2 changes: 1 addition & 1 deletion src/vs/editor/common/model/textModelSearch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ export class SearchParams {
canUseSimpleSearch = this.matchCase;
}

return new SearchData(regex, this.wordSeparators ? getMapForWordSeparators(this.wordSeparators) : null, canUseSimpleSearch ? this.searchString : null);
return new SearchData(regex, this.wordSeparators ? getMapForWordSeparators(this.wordSeparators, []) : null, canUseSimpleSearch ? this.searchString : null);
}
}

Expand Down
Loading

0 comments on commit d1e22da

Please sign in to comment.