Skip to content

Commit

Permalink
feat: validate delimiter tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
yjl9903 committed Apr 11, 2023
1 parent 79fdc2a commit 530e296
Show file tree
Hide file tree
Showing 3 changed files with 165 additions and 16 deletions.
91 changes: 78 additions & 13 deletions packages/anitomy/src/token.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
/**
 * A window into a source string, described by a start offset and a length.
 *
 * The full source string is kept as-is; only `offset` and `size` select
 * the visible portion.
 */
export class TextRange {
  /**
   * @param text   The complete source string this range refers to.
   * @param offset Index in `text` at which the range starts.
   * @param size   Number of characters covered by the range.
   */
  public constructor(
    public text: string,
    public offset: number,
    public size: number
  ) {}

  /**
   * Create another range over the same source string.
   *
   * @param offset Start index of the new range.
   * @param size   Length of the new range.
   * @returns A new `TextRange` sharing this range's `text`.
   */
  public fork(offset: number, size: number) {
    const source = this.text;
    return new TextRange(source, offset, size);
  }

  /**
   * @returns The slice of `text` covered by this range.
   */
  public toString() {
    const start = this.offset;
    const end = start + this.size;
    return this.text.slice(start, end);
  }
}

export enum TokenCategory {
Unknown = 'Unknown',
Bracket = 'Bracket',
Expand All @@ -6,6 +28,25 @@ export enum TokenCategory {
Invalid = 'Invalid'
}

/**
 * Filters used when searching for tokens.
 *
 * The numeric values are spelled out because `checkTokenFlags` detects
 * category flags with a range comparison (`Bracket <= f <= NotValid`), so the
 * category members must stay contiguous and in this order.
 */
export enum TokenFlag {
  /** No filter. */
  None = 0,

  // Category filters: each category has a positive and a negated form.
  Bracket = 1,
  NotBracket = 2,
  Delimiter = 3,
  NotDelimiter = 4,
  Identifier = 5,
  NotIdentifier = 6,
  Unknown = 7,
  NotUnknown = 8,
  Valid = 9,
  NotValid = 10,

  // Enclosure filters: whether the token is enclosed in some bracket (e.g. [ ]).
  Enclosed = 11,
  NotEnclosed = 12
}

export interface Token {
category: TokenCategory;

Expand All @@ -14,24 +55,48 @@ export interface Token {
enclosed: boolean;
}

/**
 * Decide whether `token` satisfies the given search flags.
 *
 * Enclosure flags (`Enclosed` / `NotEnclosed`) are checked first and must
 * match `token.enclosed`; if both are passed, `Enclosed` wins. When no
 * category flag is present the enclosure result alone decides. Otherwise the
 * token matches as soon as ANY category flag pair is satisfied.
 *
 * @param token Token to test.
 * @param flags Flags to test against; an empty list matches every token.
 * @returns `true` when the token passes the enclosure check and, if category
 *          flags were given, at least one category check.
 */
function checkTokenFlags(token: Token, flags: TokenFlag[]) {
  // Make sure token is the correct closure
  if (flags.some((f) => f === TokenFlag.Enclosed || f === TokenFlag.NotEnclosed)) {
    const success = flags.includes(TokenFlag.Enclosed) === token.enclosed;
    if (!success) return false; // Not enclosed correctly (e.g. enclosed when we're looking for non-enclosed).
  }

  // Make sure token is the correct category.
  // Relies on the category members of TokenFlag being numerically contiguous.
  if (!flags.some((f) => TokenFlag.Bracket <= f && f <= TokenFlag.NotValid)) {
    return true;
  }

  // [flag meaning "is category", flag meaning "is not category", category].
  // For the last row "is Invalid" is spelled NotValid and "is not Invalid" is Valid.
  const tasks: [TokenFlag, TokenFlag, TokenCategory][] = [
    [TokenFlag.Bracket, TokenFlag.NotBracket, TokenCategory.Bracket],
    [TokenFlag.Delimiter, TokenFlag.NotDelimiter, TokenCategory.Delimiter],
    [TokenFlag.Identifier, TokenFlag.NotIdentifier, TokenCategory.Identifier],
    [TokenFlag.Unknown, TokenFlag.NotUnknown, TokenCategory.Unknown],
    [TokenFlag.NotValid, TokenFlag.Valid, TokenCategory.Invalid]
  ];
  for (const [fe, fn, c] of tasks) {
    // The positive flag takes precedence over its negation when both are given.
    const success = flags.includes(fe)
      ? token.category === c
      : flags.includes(fn) && token.category !== c;
    if (success) return true;
  }
  return false;
}

/**
 * Find the first token after `position` that satisfies `flags`.
 *
 * @param tokens   Token list to search.
 * @param position Index to start from (exclusive); the search begins at `position + 1`.
 * @param flags    Flags the token must satisfy (see `checkTokenFlags`).
 * @returns Index of the matching token, or `tokens.length` when there is none.
 */
export function findNextToken(tokens: Token[], position: number, ...flags: TokenFlag[]) {
  for (let i = position + 1; i < tokens.length; i++) {
    if (checkTokenFlags(tokens[i], flags)) {
      return i;
    }
  }
  return tokens.length;
}

/**
 * Find the closest token before `position` that satisfies `flags`.
 *
 * @param tokens   Token list to search.
 * @param position Index to start from (exclusive); the search begins at `position - 1`.
 * @param flags    Flags the token must satisfy (see `checkTokenFlags`).
 * @returns Index of the matching token, or `-1` when there is none.
 */
export function findPrevToken(tokens: Token[], position: number, ...flags: TokenFlag[]) {
  for (let i = position - 1; i >= 0; i--) {
    if (checkTokenFlags(tokens[i], flags)) {
      return i;
    }
  }
  return -1;
}
86 changes: 83 additions & 3 deletions packages/anitomy/src/tokenizer.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import type { AnitomyOptions, ParsedResult } from './types';

import { KeywordManager } from './keyword';
import { TextRange, Token, TokenCategory } from './token';
import { TextRange, Token, TokenCategory, TokenFlag, findNextToken, findPrevToken } from './token';
import { isNumericString } from './utils';

const Brackets: Array<[string, string]> = [
['(', ')'],
Expand Down Expand Up @@ -143,8 +144,87 @@ export function tokenize(filename: string, options: AnitomyOptions) {
const token = tokens[i];
if (token.category !== TokenCategory.Delimiter) continue;
const delimiter = token.content[0];

const prevToken = findPrevToken(tokens, i, TokenFlag.Valid);
let nextToken = findNextToken(tokens, i, TokenFlag.Valid);

// Check for single-character tokens to prevent splitting group names, keywords, episode numbers, etc.
if (![' ', '_'].includes(delimiter)) {
// Single character token
if (isSingleCharacterToken(prevToken)) {
appendTokenTo(token, tokens[prevToken]);

while (isUnknownToken(nextToken)) {
appendTokenTo(tokens[nextToken], tokens[prevToken]);
nextToken = findNextToken(tokens, i, TokenFlag.Valid);
if (!isDelimiterToken(nextToken) || tokens[nextToken].content[0] !== delimiter)
continue;
appendTokenTo(tokens[nextToken], tokens[prevToken]);
nextToken = findNextToken(tokens, nextToken, TokenFlag.Valid);
}

continue;
}

if (isSingleCharacterToken(nextToken)) {
appendTokenTo(token, tokens[prevToken]);
appendTokenTo(tokens[nextToken], tokens[prevToken]);
continue;
}
}

// Check for adjacent delimiters
if (isUnknownToken(prevToken) && isDelimiterToken(nextToken)) {
const nextDelimiter = tokens[nextToken].content[0];
if (delimiter !== nextDelimiter && delimiter !== ',') {
if (delimiter === ' ' || nextDelimiter === '_') {
appendTokenTo(token, tokens[prevToken]);
}
}
} else if (isDelimiterToken(prevToken) && isDelimiterToken(nextToken)) {
const prevDelimiter = tokens[prevToken].content[0];
const nextDelimiter = tokens[nextToken].content[0];
if (prevDelimiter === nextDelimiter && prevDelimiter != delimiter) {
token.category = TokenCategory.Unknown; // e.g. "& in "_&_"
}
}

// Check for other special cases
if (!['&', '+'].includes(delimiter)) continue;
if (!isUnknownToken(prevToken) || !isUnknownToken(nextToken)) continue;
if (
!isNumericString(tokens[prevToken].content) ||
!isNumericString(tokens[nextToken].content)
) {
continue;
}
appendTokenTo(token, tokens[prevToken]);
appendTokenTo(tokens[nextToken], tokens[prevToken]); // e.g. 01+02
}

// Remove invalid tokens
tokens.splice(0, tokens.length, ...tokens.filter((t) => t.category !== TokenCategory.Invalid));

/**
 * Check that `idx` is a usable index into `list`.
 *
 * @param list Array whose bounds are checked.
 * @param idx  Candidate index.
 * @returns `true` when `0 <= idx < list.length`.
 */
function inRange<T>(list: T[], idx: number) {
  if (idx < 0) {
    return false;
  }
  return idx < list.length;
}

/**
 * Whether `idx` points at an existing token whose category is Delimiter.
 * Out-of-range indices yield `false`.
 */
function isDelimiterToken(idx: number) {
  if (!inRange(tokens, idx)) {
    return false;
  }
  return tokens[idx].category === TokenCategory.Delimiter;
}

/**
 * Whether `idx` points at an existing token whose category is Unknown.
 * Out-of-range indices yield `false`.
 */
function isUnknownToken(idx: number) {
  if (!inRange(tokens, idx)) {
    return false;
  }
  return tokens[idx].category === TokenCategory.Unknown;
}

/**
 * Whether `idx` points at an Unknown token consisting of exactly one
 * character other than '-'.
 *
 * The bounds/category check runs before the token is dereferenced: the
 * index comes from findPrevToken / findNextToken, which return -1 and
 * `tokens.length` respectively when nothing matches, and reading
 * `tokens[idx].content` for such an index would throw.
 *
 * @param idx Token index; may be out of range.
 * @returns `false` for out-of-range indices or non-matching tokens.
 */
function isSingleCharacterToken(idx: number) {
  if (!isUnknownToken(idx)) return false;
  const content = tokens[idx].content;
  return content.length === 1 && content !== '-';
}

/**
 * Merge `src` into `dst`: the content of `src` is appended to `dst`, and
 * `src` is marked Invalid.
 *
 * @param src Token being absorbed.
 * @param dst Token that receives the content.
 */
function appendTokenTo(src: Token, dst: Token) {
  const merged = dst.content + src.content;
  dst.content = merged;
  src.category = TokenCategory.Invalid;
}
}
}

function findPrevToken(tokens: Token[], position: number) {}
4 changes: 4 additions & 0 deletions packages/anitomy/src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import type { ParsedResult } from './types';

/**
 * Check whether `text` is a non-empty run of decimal digits (0-9) and
 * nothing else.
 *
 * @param text String to test.
 * @returns `true` for strings such as "01" or "123"; `false` for the empty
 *          string or anything containing a non-digit character.
 */
export function isNumericString(text: string) {
  const digitsOnly = /^\d+$/;
  return digitsOnly.test(text);
}

export function mergeResult(source: ParsedResult, income: ParsedResult) {
return {
...source,
Expand Down

0 comments on commit 530e296

Please sign in to comment.