Spaces:

huggingfacejs
/

inference-widgets

Running on CPU Upgrade

inference-widgets / packages /jinja /src /lexer.ts

machineuser

Sync widgets demo

32d7cd6 almost 2 years ago

9.95 kB

	/**
	 * The catalogue of token kinds the lexer can attach to a `Token`.
	 *
	 * Each entry maps a name to the string stored on the token. Every name maps to
	 * itself, except `Not`, which is an alias for the `UnaryOperator` string.
	 */
	export const TOKEN_TYPES = Object.freeze({
		/** The text between Jinja statements or expressions. */
		Text: "Text",

		/** e.g., 123 */
		NumericLiteral: "NumericLiteral",
		/** true or false */
		BooleanLiteral: "BooleanLiteral",
		/** 'string' */
		StringLiteral: "StringLiteral",
		/** Variables, functions, etc. */
		Identifier: "Identifier",
		/** = */
		Equals: "Equals",
		/** ( */
		OpenParen: "OpenParen",
		/** ) */
		CloseParen: "CloseParen",
		/** {% */
		OpenStatement: "OpenStatement",
		/** %} */
		CloseStatement: "CloseStatement",
		/** {{ */
		OpenExpression: "OpenExpression",
		/** }} */
		CloseExpression: "CloseExpression",
		/** [ */
		OpenSquareBracket: "OpenSquareBracket",
		/** ] */
		CloseSquareBracket: "CloseSquareBracket",
		/** { */
		OpenCurlyBracket: "OpenCurlyBracket",
		/** } */
		CloseCurlyBracket: "CloseCurlyBracket",
		/** , */
		Comma: "Comma",
		/** . */
		Dot: "Dot",
		/** : */
		Colon: "Colon",
		/** | */
		Pipe: "Pipe",

		/** () */
		CallOperator: "CallOperator",
		/** + - */
		AdditiveBinaryOperator: "AdditiveBinaryOperator",
		/** * / % */
		MultiplicativeBinaryOperator: "MultiplicativeBinaryOperator",
		/** < > <= >= == != */
		ComparisonBinaryOperator: "ComparisonBinaryOperator",
		/** A prefix `-` or `+`; the `Not` entry below shares this value. */
		UnaryOperator: "UnaryOperator",

		// Keywords
		Set: "Set",
		If: "If",
		For: "For",
		In: "In",
		Is: "Is",
		NotIn: "NotIn",
		Else: "Else",
		EndIf: "EndIf",
		ElseIf: "ElseIf",
		EndFor: "EndFor",
		And: "And",
		Or: "Or",
		/** Alias: `not` is lexed as a unary operator rather than as its own kind. */
		Not: "UnaryOperator",
	} as const);

	/**
	 * Union of the property names of `TOKEN_TYPES`.
	 *
	 * Note that this is derived from the keys, not the values: `"Not"` is a member of
	 * this union even though `TOKEN_TYPES.Not` holds the string `"UnaryOperator"`.
	 */
	export type TokenType = keyof typeof TOKEN_TYPES;

	/**
	 * Reserved words, keyed by their spelling in a template, mapped to the token type
	 * the lexer assigns to them. Words that are not own properties of this table are
	 * lexed as identifiers.
	 */
	const KEYWORDS = Object.freeze({
		// Statement keywords
		set: TOKEN_TYPES.Set,
		if: TOKEN_TYPES.If,
		elif: TOKEN_TYPES.ElseIf,
		else: TOKEN_TYPES.Else,
		endif: TOKEN_TYPES.EndIf,
		for: TOKEN_TYPES.For,
		endfor: TOKEN_TYPES.EndFor,

		// Operator keywords
		in: TOKEN_TYPES.In,
		is: TOKEN_TYPES.Is,
		and: TOKEN_TYPES.And,
		or: TOKEN_TYPES.Or,
		not: TOKEN_TYPES.Not,
		// Two-word operator; the lexer produces it by merging a `not` token with a following `in`
		"not in": TOKEN_TYPES.NotIn,

		// Boolean literals
		true: TOKEN_TYPES.BooleanLiteral,
		false: TOKEN_TYPES.BooleanLiteral,
	});

	/**
	 * One lexical unit of a template: the text it was built from plus its kind.
	 */
	export class Token {
		/** The raw value as seen inside the source code. */
		public value: string;

		/** The type of token. */
		public type: TokenType;

		/**
		 * Constructs a new Token.
		 * @param value The raw value as seen inside the source code.
		 * @param type The type of token.
		 */
		constructor(value: string, type: TokenType) {
			this.value = value;
			this.type = type;
		}
	}

	/**
	 * Reports whether `char` contains a word character: an ASCII letter, an ASCII
	 * digit or an underscore.
	 */
	function isWord(char: string): boolean {
		const wordCharacter = /[A-Za-z0-9_]/;
		return wordCharacter.test(char);
	}

	/**
	 * Reports whether `char` contains an ASCII decimal digit (0 through 9).
	 */
	function isInteger(char: string): boolean {
		const asciiDigit = /\d/;
		return asciiDigit.test(char);
	}

	/**
	 * Fixed character sequences and the token type each one produces.
	 *
	 * The lexer walks this list from top to bottom and takes the first sequence that
	 * matches at the cursor, so order matters: a two-character sequence must appear
	 * before any single-character sequence that is a prefix of it (e.g. `<=` before `<`).
	 */
	const ORDERED_MAPPING_TABLE: [string, TokenType][] = [
		// Statement and expression delimiters
		["{%", TOKEN_TYPES.OpenStatement],
		["%}", TOKEN_TYPES.CloseStatement],
		["{{", TOKEN_TYPES.OpenExpression],
		["}}", TOKEN_TYPES.CloseExpression],

		// Brackets
		["(", TOKEN_TYPES.OpenParen],
		[")", TOKEN_TYPES.CloseParen],
		["{", TOKEN_TYPES.OpenCurlyBracket],
		["}", TOKEN_TYPES.CloseCurlyBracket],
		["[", TOKEN_TYPES.OpenSquareBracket],
		["]", TOKEN_TYPES.CloseSquareBracket],

		// Punctuation
		[",", TOKEN_TYPES.Comma],
		[".", TOKEN_TYPES.Dot],
		[":", TOKEN_TYPES.Colon],
		["|", TOKEN_TYPES.Pipe],

		// Comparison operators (two-character forms first)
		...["<=", ">=", "==", "!=", "<", ">"].map(
			(op): [string, TokenType] => [op, TOKEN_TYPES.ComparisonBinaryOperator]
		),

		// Arithmetic operators
		...["+", "-"].map((op): [string, TokenType] => [op, TOKEN_TYPES.AdditiveBinaryOperator]),
		...["*", "/", "%"].map((op): [string, TokenType] => [op, TOKEN_TYPES.MultiplicativeBinaryOperator]),

		// Assignment operator (after `==` so that comparison wins)
		["=", TOKEN_TYPES.Equals],
	];

	/**
	 * Lookup from the character that follows a backslash to the character it stands for.
	 * A character that is absent from this map is not a recognised escape.
	 */
	const ESCAPE_CHARACTERS = new Map<string, string>(
		Object.entries({
			n: "\n", // New line
			t: "\t", // Horizontal tab
			r: "\r", // Carriage return
			b: "\b", // Backspace
			f: "\f", // Form feed
			v: "\v", // Vertical tab
			"'": "'", // Single quote
			'"': '"', // Double quote
			"\\": "\\", // Backslash
		})
	);

	/**
	 * Optional whitespace-handling switches read by `preprocess`.
	 * Each one is applied only when it is set to a truthy value.
	 */
	export interface PreprocessOptions {
	// When set, a newline that directly follows a `%}` or `#}` is removed.
	trim_blocks?: boolean;
	// When set, spaces and tabs between the start of a line and a `{%` or `{#` are removed.
	lstrip_blocks?: boolean;
	}

	/**
	 * Normalises a template before lexing: strips comments and applies whitespace control.
	 *
	 * Follows https://jinja.palletsprojects.com/en/3.0.x/templates/#whitespace-control
	 *
	 * @param template The raw template source.
	 * @param options Optional `lstrip_blocks` / `trim_blocks` switches.
	 * @returns The template with comments removed, whitespace trimmed as requested, and
	 * the `-` markers of `{%-`, `-%}`, `{{-` and `-}}` removed together with the
	 * whitespace they point at.
	 */
	function preprocess(template: string, options: PreprocessOptions = {}): string {
		const { lstrip_blocks: lstripBlocks, trim_blocks: trimBlocks } = options;

		// Default configuration: drop exactly one trailing newline, keep all other whitespace.
		let result = template.endsWith("\n") ? template.slice(0, -1) : template;

		// Collapse every comment to an empty placeholder so that comment contents cannot
		// interfere with the block-level options below.
		result = result.replace(/{#.*?#}/gs, "{##}");

		// lstrip_blocks: remove spaces/tabs between the start of a line and a block or comment.
		// Nothing is stripped when other characters precede the tag on that line.
		if (lstripBlocks) {
			result = result.replace(/^[ \t]*({[#%])/gm, "$1");
		}

		// trim_blocks: remove the first newline after a block or comment tag.
		if (trimBlocks) {
			result = result.replace(/([#%]})\n/g, "$1");
		}

		// Applied in order: drop comment placeholders, then resolve the `-` whitespace markers.
		const finalPasses: [RegExp, string][] = [
			[/{##}/g, ""],
			[/-%}\s*/g, "%}"],
			[/\s*{%-/g, "{%"],
			[/-}}\s*/g, "}}"],
			[/\s*{{-/g, "{{"],
		];
		for (const [pattern, replacement] of finalPasses) {
			result = result.replace(pattern, replacement);
		}
		return result;
	}

	/**
	 * Generate a list of tokens from a source string.
	 *
	 * The source is first passed through `preprocess`, then scanned from left to right.
	 * Text outside of `{{ ... }}` and `{% ... %}` becomes `Text` tokens; inside those
	 * delimiters whitespace is skipped and the remaining input is split into operators,
	 * literals, keywords and identifiers.
	 *
	 * @param source The template source.
	 * @param options Whitespace options forwarded to `preprocess`.
	 * @returns The tokens, in source order.
	 * @throws {SyntaxError} On an unexpected character, on an unrecognised escape sequence,
	 * or when the input ends while whitespace, a number, a word or a string is being read.
	 */
	export function tokenize(source: string, options: PreprocessOptions = {}): Token[] {
		const tokens: Token[] = [];
		const src: string = preprocess(source, options);

		let cursorPosition = 0;

		// Number of `{` tokens not yet matched by a `}` inside the current statement or
		// expression. While it is positive, "}}" is read as two closing brackets instead
		// of as the end of an expression, so nested object literals lex correctly.
		let curlyBracketDepth = 0;

		/**
		 * Consumes characters for as long as `predicate` accepts the character under the
		 * cursor and returns them. A backslash that the predicate accepts starts an escape
		 * sequence, which is replaced by the character it denotes.
		 */
		const consumeWhile = (predicate: (char: string) => boolean): string => {
			let str = "";
			while (predicate(src[cursorPosition])) {
				// Check for escaped characters
				if (src[cursorPosition] === "\\") {
					// Consume the backslash
					++cursorPosition;
					// Check for end of input
					if (cursorPosition >= src.length) throw new SyntaxError("Unexpected end of input");

					// Add the escaped character
					const escaped = src[cursorPosition++];
					const unescaped = ESCAPE_CHARACTERS.get(escaped);
					if (unescaped === undefined) {
						throw new SyntaxError(`Unexpected escaped character: ${escaped}`);
					}
					str += unescaped;
					continue;
				}

				str += src[cursorPosition++];
				// A run may never extend to the very end of the input: every statement or
				// expression still needs its closing delimiter.
				if (cursorPosition >= src.length) throw new SyntaxError("Unexpected end of input");
			}
			return str;
		};

		// Build each token until end of input
		main: while (cursorPosition < src.length) {
			// First, consume all text that is outside of a Jinja statement or expression
			const lastTokenType = tokens.at(-1)?.type;
			if (
				lastTokenType === undefined ||
				lastTokenType === TOKEN_TYPES.CloseStatement ||
				lastTokenType === TOKEN_TYPES.CloseExpression
			) {
				let text = "";
				while (
					cursorPosition < src.length &&
					// Keep going until we hit the next Jinja statement or expression
					!(src[cursorPosition] === "{" && (src[cursorPosition + 1] === "%" || src[cursorPosition + 1] === "{"))
				) {
					// Consume text
					text += src[cursorPosition++];
				}

				// There is some text to add
				if (text.length > 0) {
					tokens.push(new Token(text, TOKEN_TYPES.Text));
					continue;
				}
			}

			// Consume (and ignore) all whitespace inside Jinja statements or expressions
			consumeWhile((c) => /\s/.test(c));

			// Handle multi-character tokens
			const char = src[cursorPosition];

			// Check for unary operators.
			// No token has been pushed since `lastTokenType` was read, so it is still current.
			if (char === "-" || char === "+") {
				if (lastTokenType === TOKEN_TYPES.Text || lastTokenType === undefined) {
					throw new SyntaxError(`Unexpected character: ${char}`);
				}
				switch (lastTokenType) {
					case TOKEN_TYPES.Identifier:
					case TOKEN_TYPES.NumericLiteral:
					case TOKEN_TYPES.BooleanLiteral:
					case TOKEN_TYPES.StringLiteral:
					case TOKEN_TYPES.CloseParen:
					case TOKEN_TYPES.CloseSquareBracket:
						// Part of a binary operator
						// a - 1, 1 - 1, true - 1, "apple" - 1, (1) - 1, a[1] - 1
						// Continue parsing normally
						break;

					default: {
						// Is part of a unary operator
						// (-1), [-1], (1 + -1), not -1, -apple
						++cursorPosition; // consume the unary operator

						// Check for numbers following the unary operator
						const num = consumeWhile(isInteger);
						tokens.push(
							new Token(`${char}${num}`, num.length > 0 ? TOKEN_TYPES.NumericLiteral : TOKEN_TYPES.UnaryOperator)
						);
						continue;
					}
				}
			}

			// Try to match one of the tokens in the mapping table
			for (const [sequence, type] of ORDERED_MAPPING_TABLE) {
				// Inside an object literal, "}}" closes two brackets rather than the expression,
				// e.g. {{ {"a": {"b": 1}} }}
				if (sequence === "}}" && curlyBracketDepth > 0) {
					continue;
				}

				if (src.startsWith(sequence, cursorPosition)) {
					tokens.push(new Token(sequence, type));

					// Keep the bracket depth in step with the tokens just produced
					if (type === TOKEN_TYPES.OpenStatement || type === TOKEN_TYPES.OpenExpression) {
						curlyBracketDepth = 0;
					} else if (type === TOKEN_TYPES.OpenCurlyBracket) {
						++curlyBracketDepth;
					} else if (type === TOKEN_TYPES.CloseCurlyBracket) {
						// Never go negative on a stray "}"
						curlyBracketDepth = Math.max(0, curlyBracketDepth - 1);
					}

					cursorPosition += sequence.length;
					continue main;
				}
			}

			if (char === "'" || char === '"') {
				++cursorPosition; // Skip the opening quote
				const str = consumeWhile((c) => c !== char);
				tokens.push(new Token(str, TOKEN_TYPES.StringLiteral));
				++cursorPosition; // Skip the closing quote
				continue;
			}

			if (isInteger(char)) {
				const num = consumeWhile(isInteger);
				tokens.push(new Token(num, TOKEN_TYPES.NumericLiteral));
				continue;
			}
			if (isWord(char)) {
				const word = consumeWhile(isWord);

				// Check for special/reserved keywords
				// NOTE: We use Object.hasOwn() to avoid matching `.toString()` and other Object methods
				const type = Object.hasOwn(KEYWORDS, word) ? KEYWORDS[word as keyof typeof KEYWORDS] : TOKEN_TYPES.Identifier;

				// Special case of not in:
				// If the previous token was a "not", and this token is "in"
				// then we want to combine them into a single token.
				// TOKEN_TYPES.Not shares its value with UnaryOperator, so the raw value is
				// checked as well; otherwise a unary "-" or "+" before "in" would be merged too.
				const previous = tokens.at(-1);
				if (type === TOKEN_TYPES.In && previous?.type === TOKEN_TYPES.Not && previous.value === "not") {
					tokens.pop();
					tokens.push(new Token("not in", TOKEN_TYPES.NotIn));
				} else {
					tokens.push(new Token(word, type));
				}

				continue;
			}

			throw new SyntaxError(`Unexpected character: ${char}`);
		}
		return tokens;
	}