first commit

This commit is contained in:
douboer
2026-01-14 13:05:57 +08:00
commit 0a7d2c5c20
1224 changed files with 1045848 additions and 0 deletions

81
node_modules/entities/src/decode-codepoint.ts generated vendored Normal file
View File

@@ -0,0 +1,81 @@
// Adapted from https://github.com/mathiasbynens/he/blob/36afe179392226cf1b6ccdb16ebbb7a5a844d93a/src/he.js#L106-L134
/**
 * Numeric character references that are remapped to a different code point.
 *
 * Keys are the code points as written in the reference, values are the code
 * points that are produced instead.
 */
const decodeMap = new Map<number, number>([
    // NULL becomes U+FFFD REPLACEMENT CHARACTER.
    [0x00, 0xff_fd],
    // C1 control character references and their replacements.
    [0x80, 0x20_ac], // EURO SIGN
    [0x82, 0x20_1a], // SINGLE LOW-9 QUOTATION MARK
    [0x83, 0x01_92], // LATIN SMALL LETTER F WITH HOOK
    [0x84, 0x20_1e], // DOUBLE LOW-9 QUOTATION MARK
    [0x85, 0x20_26], // HORIZONTAL ELLIPSIS
    [0x86, 0x20_20], // DAGGER
    [0x87, 0x20_21], // DOUBLE DAGGER
    [0x88, 0x02_c6], // MODIFIER LETTER CIRCUMFLEX ACCENT
    [0x89, 0x20_30], // PER MILLE SIGN
    [0x8a, 0x01_60], // LATIN CAPITAL LETTER S WITH CARON
    [0x8b, 0x20_39], // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    [0x8c, 0x01_52], // LATIN CAPITAL LIGATURE OE
    [0x8e, 0x01_7d], // LATIN CAPITAL LETTER Z WITH CARON
    [0x91, 0x20_18], // LEFT SINGLE QUOTATION MARK
    [0x92, 0x20_19], // RIGHT SINGLE QUOTATION MARK
    [0x93, 0x20_1c], // LEFT DOUBLE QUOTATION MARK
    [0x94, 0x20_1d], // RIGHT DOUBLE QUOTATION MARK
    [0x95, 0x20_22], // BULLET
    [0x96, 0x20_13], // EN DASH
    [0x97, 0x20_14], // EM DASH
    [0x98, 0x02_dc], // SMALL TILDE
    [0x99, 0x21_22], // TRADE MARK SIGN
    [0x9a, 0x01_61], // LATIN SMALL LETTER S WITH CARON
    [0x9b, 0x20_3a], // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    [0x9c, 0x01_53], // LATIN SMALL LIGATURE OE
    [0x9e, 0x01_7e], // LATIN SMALL LETTER Z WITH CARON
    [0x9f, 0x01_78], // LATIN CAPITAL LETTER Y WITH DIAERESIS
]);
/**
 * Creates a string from one or more Unicode code points.
 *
 * Uses the native `String.fromCodePoint` when it exists and falls back to a
 * small polyfill otherwise. The fallback converts every argument (not just the
 * first one) and returns an empty string when called without arguments, so it
 * honours the variadic signature declared here.
 *
 * Unlike the native function, the fallback does not validate its input.
 *
 * @param codePoints The code points to convert.
 * @returns The string made up of the given code points, in order.
 */
export const fromCodePoint: (...codePoints: number[]) => string =
    // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition, n/no-unsupported-features/es-builtins
    String.fromCodePoint ??
    ((...codePoints: number[]): string => {
        let output = "";
        for (let codePoint of codePoints) {
            if (codePoint > 0xff_ff) {
                // Astral code point: emit the high surrogate now and turn
                // `codePoint` into the matching low surrogate.
                codePoint -= 0x1_00_00;
                output += String.fromCharCode(
                    ((codePoint >>> 10) & 0x3_ff) | 0xd8_00,
                );
                codePoint = 0xdc_00 | (codePoint & 0x3_ff);
            }
            output += String.fromCharCode(codePoint);
        }
        return output;
    });
/**
 * Maps a numeric character reference to the code point that should actually
 * be emitted.
 *
 * Surrogates (U+D800–U+DFFF) and values above U+10FFFF become U+FFFD. Code
 * points listed in `decodeMap` are swapped for their mapped value. Everything
 * else is returned unchanged.
 *
 * @param codePoint The code point taken from the character reference.
 * @returns The code point to emit.
 */
export function replaceCodePoint(codePoint: number): number {
    const isSurrogate = codePoint >= 0xd8_00 && codePoint <= 0xdf_ff;
    const isBeyondUnicode = codePoint > 0x10_ff_ff;

    if (isSurrogate || isBeyondUnicode) {
        return 0xff_fd;
    }

    const mapped = decodeMap.get(codePoint);
    return mapped ?? codePoint;
}
/**
 * Runs the code point through `replaceCodePoint` and converts the outcome to
 * a string.
 *
 * @deprecated Use `fromCodePoint(replaceCodePoint(codePoint))` instead.
 * @param codePoint The code point to decode.
 * @returns The string for the (possibly replaced) code point.
 */
export function decodeCodePoint(codePoint: number): string {
    const effectiveCodePoint = replaceCodePoint(codePoint);
    return fromCodePoint(effectiveCodePoint);
}

363
node_modules/entities/src/decode.spec.ts generated vendored Normal file
View File

@@ -0,0 +1,363 @@
import { describe, expect, it, vitest } from "vitest";
import * as entities from "./decode.js";
describe("Decode test", () => {
    // Inputs for which the XML and the HTML decoder must produce the same output.
    const sharedCases = [
        { input: "&amp;amp;", output: "&amp;" },
        { input: "&amp;#38;", output: "&#38;" },
        { input: "&amp;#x26;", output: "&#x26;" },
        { input: "&amp;#X26;", output: "&#X26;" },
        { input: "&#38;#38;", output: "&#38;" },
        { input: "&#x26;#38;", output: "&#38;" },
        { input: "&#X26;#38;", output: "&#38;" },
        { input: "&#x3a;", output: ":" },
        { input: "&#x3A;", output: ":" },
        { input: "&#X3a;", output: ":" },
        { input: "&#X3A;", output: ":" },
        { input: "&#", output: "&#" },
        { input: "&>", output: "&>" },
        { input: "id=770&#anchor", output: "id=770&#anchor" },
    ];

    for (const testcase of sharedCases) {
        const { input, output } = testcase;

        it(`should XML decode ${input}`, () => {
            expect(entities.decodeXML(input)).toBe(output);
        });

        it(`should HTML decode ${input}`, () => {
            expect(entities.decodeHTML(input)).toBe(output);
        });
    }

    it("should HTML decode partial legacy entity", () => {
        // Strict mode leaves the text alone; the default mode decodes the `&times` prefix.
        const strictResult = entities.decodeHTMLStrict("&timesbar");
        expect(strictResult).toBe("&timesbar");

        const legacyResult = entities.decodeHTML("&timesbar");
        expect(legacyResult).toBe("×bar");
    });

    it("should HTML decode legacy entities according to spec", () => {
        const decoded = entities.decodeHTML("?&image_uri=1&=2&image=3");
        expect(decoded).toBe("?&image_uri=1&=2&image=3");
    });

    it("should back out of legacy entities", () => {
        expect(entities.decodeHTML("&ampa")).toBe("&a");
    });

    it("should not parse numeric entities in strict mode", () => {
        expect(entities.decodeHTMLStrict("&#55")).toBe("&#55");
    });

    it("should parse &nbsp followed by < (#852)", () => {
        expect(entities.decodeHTML("&nbsp<")).toBe("\u00A0<");
    });

    it("should decode trailing legacy entities", () => {
        const decoded = entities.decodeHTML("&timesbar;&timesbar");
        expect(decoded).toBe("⨱×bar");
    });

    it("should decode multi-byte entities", () => {
        const decoded = entities.decodeHTML("&NotGreaterFullEqual;");
        expect(decoded).toBe("≧̸");
    });

    it("should not decode legacy entities followed by text in attribute mode", () => {
        const { Attribute } = entities.DecodingMode;

        // Explicit mode argument on the generic HTML decoder.
        expect(entities.decodeHTML("&not", Attribute)).toBe("¬");
        expect(entities.decodeHTML("&noti", Attribute)).toBe("&noti");
        expect(entities.decodeHTML("&not=", Attribute)).toBe("&not=");

        // Dedicated attribute decoder.
        expect(entities.decodeHTMLAttribute("&notp")).toBe("&notp");
        expect(entities.decodeHTMLAttribute("&notP")).toBe("&notP");
        expect(entities.decodeHTMLAttribute("&not3")).toBe("&not3");
    });
});
/*
 * Tests for the streaming `EntityDecoder` API.
 *
 * Each test builds a decoder around the HTML decode tree with a mocked
 * emit callback, then checks both the value returned by `write`/`end`
 * (number of consumed characters, or -1 while the entity is incomplete)
 * and the arguments the callback received (code point, consumed count).
 */
describe("EntityDecoder", () => {
it("should decode decimal entities", () => {
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
);
// The entity is split over two writes; the first one cannot complete it.
expect(decoder.write("&#5", 1)).toBe(-1);
expect(decoder.write("8;", 0)).toBe(5);
expect(callback).toHaveBeenCalledTimes(1);
expect(callback).toHaveBeenCalledWith(":".charCodeAt(0), 5);
});
it("should decode hex entities", () => {
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
);
expect(decoder.write("&#x3a;", 1)).toBe(6);
expect(callback).toHaveBeenCalledTimes(1);
expect(callback).toHaveBeenCalledWith(":".charCodeAt(0), 6);
});
it("should decode named entities", () => {
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
);
expect(decoder.write("&amp;", 1)).toBe(5);
expect(callback).toHaveBeenCalledTimes(1);
expect(callback).toHaveBeenCalledWith("&".charCodeAt(0), 5);
});
it("should decode legacy entities", () => {
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
);
decoder.startEntity(entities.DecodingMode.Legacy);
// Without a semicolon nothing is emitted until `end()` flushes the entity.
expect(decoder.write("&amp", 1)).toBe(-1);
expect(callback).toHaveBeenCalledTimes(0);
expect(decoder.end()).toBe(4);
expect(callback).toHaveBeenCalledTimes(1);
expect(callback).toHaveBeenCalledWith("&".charCodeAt(0), 4);
});
it("should decode named entity written character by character", () => {
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
);
// The leading "&" is not written; every chunk starts at offset 0.
for (const c of "amp") {
expect(decoder.write(c, 0)).toBe(-1);
}
expect(decoder.write(";", 0)).toBe(5);
expect(callback).toHaveBeenCalledTimes(1);
expect(callback).toHaveBeenCalledWith("&".charCodeAt(0), 5);
});
it("should decode numeric entity written character by character", () => {
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
);
for (const c of "#x3a") {
expect(decoder.write(c, 0)).toBe(-1);
}
expect(decoder.write(";", 0)).toBe(6);
expect(callback).toHaveBeenCalledTimes(1);
expect(callback).toHaveBeenCalledWith(":".charCodeAt(0), 6);
});
it("should decode hex entities across several chunks", () => {
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
);
// The chunks together spell "#xcfffd".
for (const chunk of ["#x", "cf", "ff", "d"]) {
expect(decoder.write(chunk, 0)).toBe(-1);
}
expect(decoder.write(";", 0)).toBe(9);
expect(callback).toHaveBeenCalledTimes(1);
expect(callback).toHaveBeenCalledWith(0xc_ff_fd, 9);
});
it("should not fail if nothing is written", () => {
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
);
expect(decoder.end()).toBe(0);
expect(callback).toHaveBeenCalledTimes(0);
});
/*
* Focused tests exercising early exit paths inside a compact run in the real trie.
* Discovered prefix: "zi" followed by compact run "grarr"; mismatching inside this run should
* return 0 with no emission (result still 0).
*/
describe("compact run mismatches", () => {
it("first run character mismatch returns 0", () => {
const callback = vitest.fn();
const d = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
);
d.startEntity(entities.DecodingMode.Strict);
// After '&': correct prefix 'zi', wrong first run char 'X' (expected 'g').
expect(d.write("ziXgrar", 0)).toBe(0);
expect(callback).not.toHaveBeenCalled();
});
it("mismatch after one correct run char returns 0", () => {
const callback = vitest.fn();
const d = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
);
d.startEntity(entities.DecodingMode.Strict);
// 'zig' matches prefix + first run char; next char 'X' mismatches expected 'r'.
expect(d.write("zigXarr", 0)).toBe(0);
expect(callback).not.toHaveBeenCalled();
});
it("mismatch after two correct run chars returns 0", () => {
const callback = vitest.fn();
const d = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
);
d.startEntity(entities.DecodingMode.Strict);
// 'zigr' matches prefix + first two run chars; next char 'X' mismatches expected 'a'.
expect(d.write("zigrXrr", 0)).toBe(0);
expect(callback).not.toHaveBeenCalled();
});
});
// Checks which `EntityErrorProducer` hooks fire, and how often.
describe("errors", () => {
it("should produce an error for a named entity without a semicolon", () => {
const errorHandlers = {
missingSemicolonAfterCharacterReference: vitest.fn(),
absenceOfDigitsInNumericCharacterReference: vitest.fn(),
validateNumericCharacterReference: vitest.fn(),
};
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
errorHandlers,
);
// Properly terminated entity: no missing-semicolon error expected.
decoder.startEntity(entities.DecodingMode.Legacy);
expect(decoder.write("&amp;", 1)).toBe(5);
expect(callback).toHaveBeenCalledTimes(1);
expect(callback).toHaveBeenCalledWith("&".charCodeAt(0), 5);
expect(
errorHandlers.missingSemicolonAfterCharacterReference,
).toHaveBeenCalledTimes(0);
// Same decoder, reused for an entity that lacks the semicolon.
decoder.startEntity(entities.DecodingMode.Legacy);
expect(decoder.write("&amp", 1)).toBe(-1);
expect(decoder.end()).toBe(4);
expect(callback).toHaveBeenCalledTimes(2);
expect(callback).toHaveBeenLastCalledWith("&".charCodeAt(0), 4);
expect(
errorHandlers.missingSemicolonAfterCharacterReference,
).toHaveBeenCalledTimes(1);
});
it("should produce an error for a numeric entity without a semicolon", () => {
const errorHandlers = {
missingSemicolonAfterCharacterReference: vitest.fn(),
absenceOfDigitsInNumericCharacterReference: vitest.fn(),
validateNumericCharacterReference: vitest.fn(),
};
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
errorHandlers,
);
decoder.startEntity(entities.DecodingMode.Legacy);
expect(decoder.write("&#x3a", 1)).toBe(-1);
expect(decoder.end()).toBe(5);
expect(callback).toHaveBeenCalledTimes(1);
expect(callback).toHaveBeenCalledWith(0x3a, 5);
expect(
errorHandlers.missingSemicolonAfterCharacterReference,
).toHaveBeenCalledTimes(1);
expect(
errorHandlers.absenceOfDigitsInNumericCharacterReference,
).toHaveBeenCalledTimes(0);
// The validation hook receives the parsed code point.
expect(
errorHandlers.validateNumericCharacterReference,
).toHaveBeenCalledTimes(1);
expect(
errorHandlers.validateNumericCharacterReference,
).toHaveBeenCalledWith(0x3a);
});
it("should produce an error for numeric entities without digits", () => {
const errorHandlers = {
missingSemicolonAfterCharacterReference: vitest.fn(),
absenceOfDigitsInNumericCharacterReference: vitest.fn(),
validateNumericCharacterReference: vitest.fn(),
};
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
errorHandlers,
);
decoder.startEntity(entities.DecodingMode.Legacy);
expect(decoder.write("&#", 1)).toBe(-1);
expect(decoder.end()).toBe(0);
expect(callback).toHaveBeenCalledTimes(0);
expect(
errorHandlers.missingSemicolonAfterCharacterReference,
).toHaveBeenCalledTimes(0);
expect(
errorHandlers.absenceOfDigitsInNumericCharacterReference,
).toHaveBeenCalledTimes(1);
// The hook is given the consumed-character count, here 2 for "&#".
expect(
errorHandlers.absenceOfDigitsInNumericCharacterReference,
).toHaveBeenCalledWith(2);
expect(
errorHandlers.validateNumericCharacterReference,
).toHaveBeenCalledTimes(0);
});
it("should produce an error for hex entities without digits", () => {
const errorHandlers = {
missingSemicolonAfterCharacterReference: vitest.fn(),
absenceOfDigitsInNumericCharacterReference: vitest.fn(),
validateNumericCharacterReference: vitest.fn(),
};
const callback = vitest.fn();
const decoder = new entities.EntityDecoder(
entities.htmlDecodeTree,
callback,
errorHandlers,
);
decoder.startEntity(entities.DecodingMode.Legacy);
expect(decoder.write("&#x", 1)).toBe(-1);
expect(decoder.end()).toBe(0);
expect(callback).toHaveBeenCalledTimes(0);
expect(
errorHandlers.missingSemicolonAfterCharacterReference,
).toHaveBeenCalledTimes(0);
expect(
errorHandlers.absenceOfDigitsInNumericCharacterReference,
).toHaveBeenCalledTimes(1);
expect(
errorHandlers.validateNumericCharacterReference,
).toHaveBeenCalledTimes(0);
});
});
});

676
node_modules/entities/src/decode.ts generated vendored Normal file
View File

@@ -0,0 +1,676 @@
import { fromCodePoint, replaceCodePoint } from "./decode-codepoint.js";
import { htmlDecodeTree } from "./generated/decode-data-html.js";
import { xmlDecodeTree } from "./generated/decode-data-xml.js";
import { BinTrieFlags } from "./internal/bin-trie-flags.js";
/** Character codes of the ASCII characters the decoder has to recognise. */
const enum CharCodes {
    // Punctuation
    NUM = 35, // "#"
    SEMI = 59, // ";"
    EQUALS = 61, // "="

    // Digits
    ZERO = 48, // "0"
    NINE = 57, // "9"

    // Upper case letters
    UPPER_A = 65, // "A"
    UPPER_F = 70, // "F"
    UPPER_Z = 90, // "Z"

    // Lower case letters
    LOWER_A = 97, // "a"
    LOWER_F = 102, // "f"
    LOWER_X = 120, // "x"
    LOWER_Z = 122, // "z"
}
/**
 * OR-ing an upper case ASCII letter's character code with this bit yields the
 * code of the matching lower case letter.
 */
const TO_LOWER_BIT = 0x20;
/** Whether `code` is the character code of an ASCII digit ("0"–"9"). */
function isNumber(code: number): boolean {
    if (code < CharCodes.ZERO) {
        return false;
    }
    return code <= CharCodes.NINE;
}
/**
 * Whether `code` is the character code of a hexadecimal letter ("a"–"f" or
 * "A"–"F"). Digits are not covered here; see `isNumber` for those.
 */
function isHexadecimalCharacter(code: number): boolean {
    const isUpperHexLetter =
        code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_F;
    if (isUpperHexLetter) {
        return true;
    }
    return code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_F;
}
/** Whether `code` is the character code of an ASCII letter or digit. */
function isAsciiAlphaNumeric(code: number): boolean {
    const isUpperCaseLetter =
        code >= CharCodes.UPPER_A && code <= CharCodes.UPPER_Z;
    if (isUpperCaseLetter) {
        return true;
    }

    const isLowerCaseLetter =
        code >= CharCodes.LOWER_A && code <= CharCodes.LOWER_Z;
    if (isLowerCaseLetter) {
        return true;
    }

    return isNumber(code);
}
/**
 * Tells whether a character following an unterminated entity inside an
 * attribute value prevents that entity from being decoded.
 *
 * That is the case for "=" and for ASCII letters and digits. Such attribute
 * values are left untouched and should not produce a parser error.
 * See the example in https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
 *
 * @param code Character code of the character right after the entity.
 * @returns `true` if the entity must not be decoded.
 */
function isEntityInAttributeInvalidEnd(code: number): boolean {
    if (code === CharCodes.EQUALS) {
        return true;
    }
    return isAsciiAlphaNumeric(code);
}
/** The states of the `EntityDecoder` state machine. */
const enum EntityDecoderState {
    /** Nothing has been read yet. */
    EntityStart = 0,
    /** A "#" was read; decimal vs. hexadecimal is still undecided. */
    NumericStart = 1,
    /** Reading the digits of a decimal numeric entity. */
    NumericDecimal = 2,
    /** Reading the digits of a hexadecimal numeric entity. */
    NumericHex = 3,
    /** Reading the name of a named entity. */
    NamedEntity = 4,
}
/**
* Controls how the decoder treats entities that are not terminated by a
* semicolon.
*/
export enum DecodingMode {
/**
* Entities in text nodes that can end with any character.
* Entities without a trailing semicolon are still decoded where the
* decode tree allows it.
*/
Legacy = 0,
/** Only allow entities terminated with a semicolon. */
Strict = 1,
/**
* Entities in attributes have limitations on ending characters.
* An unterminated named entity followed by `=` or an ASCII alphanumeric
* character is left undecoded.
*/
Attribute = 2,
}
/**
* Producers for character reference errors as defined in the HTML spec.
*
* An implementation can be handed to `EntityDecoder`, which calls these hooks
* while decoding.
*/
export interface EntityErrorProducer {
/** Called when an entity is emitted although it was not terminated by `;`. */
missingSemicolonAfterCharacterReference(): void;
/**
* Called when a numeric entity does not contain any digits.
*
* @param consumedCharacters The decoder's count of consumed characters at
* the point where the error was detected.
*/
absenceOfDigitsInNumericCharacterReference(
consumedCharacters: number,
): void;
/**
* Called for every numeric entity that is emitted, so the value can be
* validated.
*
* @param code The code point as parsed from the entity, before any
* replacement is applied.
*/
validateNumericCharacterReference(code: number): void;
}
/**
* Token decoder with support of writing partial entities.
*
* An entity may be fed in through several `write` calls: `write` returns -1
* for as long as the entity is incomplete, and `end` flushes whatever is
* pending once there is no more input. Use `startEntity` to reset the
* instance before decoding another entity.
*/
export class EntityDecoder {
constructor(
/** The tree used to decode entities. */
// biome-ignore lint/correctness/noUnusedPrivateClassMembers: False positive
private readonly decodeTree: Uint16Array,
/**
* The function that is called when a codepoint is decoded.
*
* For named entities whose value consists of two codepoints, this is
* called twice, once per codepoint, with the same `consumed` value.
*
* @param cp The decoded codepoint.
* @param consumed The number of characters consumed by the decoder.
*/
private readonly emitCodePoint: (cp: number, consumed: number) => void,
/** An object that is used to produce errors. */
private readonly errors?: EntityErrorProducer | undefined,
) {}
/** The current state of the decoder. */
private state = EntityDecoderState.EntityStart;
/**
* Characters that were consumed while parsing an entity.
*
* Starts at 1: the decoders in this file start writing right after the
* leading `&`, and the value reported back is counted from that `&`.
*/
private consumed = 1;
/**
* The result of the entity.
*
* Either the decode tree index of the value of a named entity, or the
* codepoint of a numeric entity. 0 means that no named entity has been
* matched yet.
*/
private result = 0;
/** The current index in the decode tree. */
private treeIndex = 0;
/**
* The number of characters that were consumed in excess, i.e. read since
* the last named entity match was recorded in `result`.
*/
private excess = 1;
/** The mode in which the decoder is operating. */
private decodeMode = DecodingMode.Strict;
/**
* Resets the instance to make it reusable.
*
* @param decodeMode The mode to use for the next entity.
*/
startEntity(decodeMode: DecodingMode): void {
this.decodeMode = decodeMode;
this.state = EntityDecoderState.EntityStart;
this.result = 0;
this.treeIndex = 0;
this.excess = 1;
this.consumed = 1;
}
/**
* Write an entity to the decoder. This can be called multiple times with partial entities.
* If the entity is incomplete, the decoder will return -1.
*
* Mirrors the implementation of `getDecoder`, but with the ability to stop decoding if the
* entity is incomplete, and resume when the next string is written.
*
* @param input The string containing the entity (or a continuation of the entity).
* @param offset The offset at which the entity begins. Should be 0 if this is not the first call.
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
*/
write(input: string, offset: number): number {
switch (this.state) {
case EntityDecoderState.EntityStart: {
// A leading "#" selects the numeric path; anything else is treated as a name.
if (input.charCodeAt(offset) === CharCodes.NUM) {
this.state = EntityDecoderState.NumericStart;
this.consumed += 1;
return this.stateNumericStart(input, offset + 1);
}
this.state = EntityDecoderState.NamedEntity;
return this.stateNamedEntity(input, offset);
}
case EntityDecoderState.NumericStart: {
return this.stateNumericStart(input, offset);
}
case EntityDecoderState.NumericDecimal: {
return this.stateNumericDecimal(input, offset);
}
case EntityDecoderState.NumericHex: {
return this.stateNumericHex(input, offset);
}
case EntityDecoderState.NamedEntity: {
return this.stateNamedEntity(input, offset);
}
}
}
/**
* Switches between the numeric decimal and hexadecimal states.
*
* Equivalent to the `Numeric character reference state` in the HTML spec.
*
* @param input The string containing the entity (or a continuation of the entity).
* @param offset The current offset.
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
*/
private stateNumericStart(input: string, offset: number): number {
if (offset >= input.length) {
return -1;
}
// Setting TO_LOWER_BIT lets a single comparison accept both "x" and "X".
if ((input.charCodeAt(offset) | TO_LOWER_BIT) === CharCodes.LOWER_X) {
this.state = EntityDecoderState.NumericHex;
this.consumed += 1;
return this.stateNumericHex(input, offset + 1);
}
this.state = EntityDecoderState.NumericDecimal;
return this.stateNumericDecimal(input, offset);
}
/**
* Parses a hexadecimal numeric entity.
*
* Equivalent to the `Hexadecimal character reference state` in the HTML spec.
*
* @param input The string containing the entity (or a continuation of the entity).
* @param offset The current offset.
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
*/
private stateNumericHex(input: string, offset: number): number {
while (offset < input.length) {
const char = input.charCodeAt(offset);
if (isNumber(char) || isHexadecimalCharacter(char)) {
// Convert hex digit to value (0-15); 'a'/'A' -> 10.
const digit =
char <= CharCodes.NINE
? char - CharCodes.ZERO
: (char | TO_LOWER_BIT) - CharCodes.LOWER_A + 10;
this.result = this.result * 16 + digit;
this.consumed++;
offset++;
} else {
// First non-digit ends the entity; 3 is the length of the "&#x" prefix.
return this.emitNumericEntity(char, 3);
}
}
return -1; // Incomplete entity
}
/**
* Parses a decimal numeric entity.
*
* Equivalent to the `Decimal character reference state` in the HTML spec.
*
* @param input The string containing the entity (or a continuation of the entity).
* @param offset The current offset.
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
*/
private stateNumericDecimal(input: string, offset: number): number {
while (offset < input.length) {
const char = input.charCodeAt(offset);
if (isNumber(char)) {
this.result = this.result * 10 + (char - CharCodes.ZERO);
this.consumed++;
offset++;
} else {
// First non-digit ends the entity; 2 is the length of the "&#" prefix.
return this.emitNumericEntity(char, 2);
}
}
return -1; // Incomplete entity
}
/**
* Validate and emit a numeric entity.
*
* Implements the logic from the `Hexadecimal character reference start
* state` and `Numeric character reference end state` in the HTML spec.
*
* @param lastCp The last code point of the entity. Used to see if the
* entity was terminated with a semicolon.
* @param expectedLength The minimum number of characters that should be
* consumed. Used to validate that at least one digit
* was consumed.
* @returns The number of characters that were consumed, or 0 if no entity
* was emitted.
*/
private emitNumericEntity(lastCp: number, expectedLength: number): number {
// Ensure we consumed at least one digit.
if (this.consumed <= expectedLength) {
this.errors?.absenceOfDigitsInNumericCharacterReference(
this.consumed,
);
return 0;
}
// Figure out if this is a legit end of the entity
if (lastCp === CharCodes.SEMI) {
this.consumed += 1;
} else if (this.decodeMode === DecodingMode.Strict) {
// Strict mode does not accept an entity without a semicolon.
return 0;
}
// The replaced code point is emitted; the error hook below gets the raw value.
this.emitCodePoint(replaceCodePoint(this.result), this.consumed);
if (this.errors) {
if (lastCp !== CharCodes.SEMI) {
this.errors.missingSemicolonAfterCharacterReference();
}
this.errors.validateNumericCharacterReference(this.result);
}
return this.consumed;
}
/**
* Parses a named entity.
*
* Equivalent to the `Named character reference state` in the HTML spec.
*
* @param input The string containing the entity (or a continuation of the entity).
* @param offset The current offset.
* @returns The number of characters that were consumed, or -1 if the entity is incomplete.
*/
private stateNamedEntity(input: string, offset: number): number {
const { decodeTree } = this;
let current = decodeTree[this.treeIndex];
// The length is the number of bytes of the value, including the current byte.
let valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
while (offset < input.length) {
// Handle compact runs (possibly inline): valueLength == 0 and FLAG13 set.
if (valueLength === 0 && (current & BinTrieFlags.FLAG13) !== 0) {
const runLength =
(current & BinTrieFlags.BRANCH_LENGTH) >> 7; /* 2..63 */
const firstChar = current & BinTrieFlags.JUMP_TABLE;
// Fast-fail if we don't have enough remaining input for the full run (incomplete entity)
if (offset + runLength > input.length) return -1;
// Verify first char
if (input.charCodeAt(offset) !== firstChar) {
return this.result === 0
? 0
: this.emitNotTerminatedNamedEntity();
}
offset++;
this.excess++;
// Remaining characters after the first
const remaining = runLength - 1;
// Iterate over packed 2-char words (low byte first, then high byte)
for (let runPos = 1; runPos < runLength; runPos += 2) {
const packedWord =
decodeTree[this.treeIndex + 1 + ((runPos - 1) >> 1)];
const low = packedWord & 0xff;
if (input.charCodeAt(offset) !== low) {
return this.result === 0
? 0
: this.emitNotTerminatedNamedEntity();
}
offset++;
this.excess++;
const high = (packedWord >> 8) & 0xff;
// The high byte is only part of the run if the run has another character left.
if (runPos + 1 < runLength) {
if (input.charCodeAt(offset) !== high) {
return this.result === 0
? 0
: this.emitNotTerminatedNamedEntity();
}
offset++;
this.excess++;
}
}
// Skip the run header and its packed words to reach the next node.
this.treeIndex += 1 + ((remaining + 1) >> 1);
current = decodeTree[this.treeIndex];
valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
}
if (offset >= input.length) break;
const char = input.charCodeAt(offset);
/*
* Implicit semicolon handling for nodes that require a semicolon but
* don't have an explicit ';' branch stored in the trie. If we have
* a value on the current node, it requires a semicolon, and the
* current input character is a semicolon, emit the entity using the
* current node (without descending further).
*/
if (
char === CharCodes.SEMI &&
valueLength !== 0 &&
(current & BinTrieFlags.FLAG13) !== 0
) {
return this.emitNamedEntityData(
this.treeIndex,
valueLength,
this.consumed + this.excess,
);
}
this.treeIndex = determineBranch(
decodeTree,
current,
this.treeIndex + Math.max(1, valueLength),
char,
);
// A negative index means the trie has no branch for this character.
if (this.treeIndex < 0) {
return this.result === 0 ||
// If we are parsing an attribute
(this.decodeMode === DecodingMode.Attribute &&
// We shouldn't have consumed any characters after the entity,
(valueLength === 0 ||
// And there should be no invalid characters.
isEntityInAttributeInvalidEnd(char)))
? 0
: this.emitNotTerminatedNamedEntity();
}
current = decodeTree[this.treeIndex];
valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
// If the branch is a value, store it and continue
if (valueLength !== 0) {
// If the entity is terminated by a semicolon, we are done.
if (char === CharCodes.SEMI) {
return this.emitNamedEntityData(
this.treeIndex,
valueLength,
this.consumed + this.excess,
);
}
// If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
if (
this.decodeMode !== DecodingMode.Strict &&
(current & BinTrieFlags.FLAG13) === 0
) {
// Record the match and fold the characters read so far into `consumed`.
this.result = this.treeIndex;
this.consumed += this.excess;
this.excess = 0;
}
}
// Increment offset & excess for next iteration
offset++;
this.excess++;
}
return -1;
}
/**
* Emit a named entity that was not terminated with a semicolon.
*
* Emits the entity recorded in `result` and reports the missing semicolon.
*
* @returns The number of characters consumed.
*/
private emitNotTerminatedNamedEntity(): number {
const { result, decodeTree } = this;
const valueLength =
(decodeTree[result] & BinTrieFlags.VALUE_LENGTH) >> 14;
this.emitNamedEntityData(result, valueLength, this.consumed);
this.errors?.missingSemicolonAfterCharacterReference();
return this.consumed;
}
/**
* Emit a named entity.
*
* @param result The index of the entity in the decode tree.
* @param valueLength The number of bytes in the entity.
* @param consumed The number of characters consumed.
*
* @returns The number of characters consumed.
*/
private emitNamedEntityData(
result: number,
valueLength: number,
consumed: number,
): number {
const { decodeTree } = this;
// With a length of 1 the value is stored in the node itself, next to the flag bits.
this.emitCodePoint(
valueLength === 1
? decodeTree[result] &
~(BinTrieFlags.VALUE_LENGTH | BinTrieFlags.FLAG13)
: decodeTree[result + 1],
consumed,
);
if (valueLength === 3) {
// For multi-byte values, we need to emit the second byte.
this.emitCodePoint(decodeTree[result + 2], consumed);
}
return consumed;
}
/**
* Signal to the parser that the end of the input was reached.
*
* Remaining data will be emitted and relevant errors will be produced.
*
* @returns The number of characters consumed.
*/
end(): number {
switch (this.state) {
case EntityDecoderState.NamedEntity: {
// Emit a named entity if we have one.
// In attribute mode, only if nothing was matched past the recorded entity.
return this.result !== 0 &&
(this.decodeMode !== DecodingMode.Attribute ||
this.result === this.treeIndex)
? this.emitNotTerminatedNamedEntity()
: 0;
}
// Otherwise, emit a numeric entity if we have one.
case EntityDecoderState.NumericDecimal: {
return this.emitNumericEntity(0, 2);
}
case EntityDecoderState.NumericHex: {
return this.emitNumericEntity(0, 3);
}
case EntityDecoderState.NumericStart: {
// Only "&#" was seen, so there are no digits to emit.
this.errors?.absenceOfDigitsInNumericCharacterReference(
this.consumed,
);
return 0;
}
case EntityDecoderState.EntityStart: {
// Return 0 if we have no entity.
return 0;
}
}
}
}
/**
 * Builds a string decoder on top of an `EntityDecoder` for the given tree.
 *
 * The returned function copies the text between entities verbatim and lets
 * the entity decoder handle every `&` it finds.
 *
 * @param decodeTree The decode tree.
 * @returns A function that decodes entities in a string.
 */
function getDecoder(decodeTree: Uint16Array) {
    // Output of the call in progress; the emit callback appends to it.
    let buffer = "";
    const entityDecoder = new EntityDecoder(decodeTree, (codePoint) => {
        buffer += fromCodePoint(codePoint);
    });

    return function decodeWithTrie(
        input: string,
        decodeMode: DecodingMode,
    ): string {
        // Everything before this index has already been copied or decoded.
        let copiedUpTo = 0;
        let ampersandIndex = input.indexOf("&", 0);

        while (ampersandIndex >= 0) {
            buffer += input.slice(copiedUpTo, ampersandIndex);
            entityDecoder.startEntity(decodeMode);

            // Start right after the "&".
            const consumed = entityDecoder.write(input, ampersandIndex + 1);

            if (consumed < 0) {
                // The input ended inside the entity; flush what is pending.
                copiedUpTo = ampersandIndex + entityDecoder.end();
                break;
            }

            copiedUpTo = ampersandIndex + consumed;
            // Nothing consumed means no entity: step over this "&".
            const searchFrom = consumed === 0 ? copiedUpTo + 1 : copiedUpTo;
            ampersandIndex = input.indexOf("&", searchFrom);
        }

        const decoded = buffer + input.slice(copiedUpTo);
        // Reset so the finished string is not kept alive by the closure.
        buffer = "";
        return decoded;
    };
}
/**
 * Finds the child of a trie node that belongs to the given character.
 *
 * Depending on the node header, the children are stored as a single inline
 * branch, as a jump table, or as a sorted dictionary of packed keys.
 *
 * @param decodeTree The trie.
 * @param current The current node.
 * @param nodeIndex The index right after the current node and its value.
 * @param char The current character.
 * @returns The index of the next node, or -1 if no branch is taken.
 */
export function determineBranch(
    decodeTree: Uint16Array,
    current: number,
    nodeIndex: number,
    char: number,
): number {
    const branchCount = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
    const jumpOffset = current & BinTrieFlags.JUMP_TABLE;

    // Single branch: the only accepted character is stored in the jump offset.
    if (branchCount === 0) {
        if (jumpOffset === 0 || char !== jumpOffset) {
            return -1;
        }
        return nodeIndex;
    }

    // Jump table: one slot per character, starting at `jumpOffset`.
    if (jumpOffset) {
        const tableSlot = char - jumpOffset;
        if (tableSlot < 0 || tableSlot >= branchCount) {
            return -1;
        }
        return decodeTree[nodeIndex + tableSlot] - 1;
    }

    /*
     * Packed dictionary: two keys per uint16, low byte first, followed by one
     * destination per key. Binary search over the `branchCount` sorted keys.
     */
    const keyWordCount = (branchCount + 1) >> 1;
    let low = 0;
    let high = branchCount - 1;

    while (low <= high) {
        const middle = (low + high) >>> 1;
        const keyWord = decodeTree[nodeIndex + (middle >> 1)];
        const shift = (middle & 1) * 8;
        const key = (keyWord >> shift) & 0xff;

        if (key < char) {
            low = middle + 1;
            continue;
        }
        if (key > char) {
            high = middle - 1;
            continue;
        }
        return decodeTree[nodeIndex + keyWordCount + middle];
    }

    return -1;
}
// Module-level decoders shared by the exported decode functions below:
// one built from the HTML decode tree, one from the XML decode tree.
const htmlDecoder = /* #__PURE__ */ getDecoder(htmlDecodeTree);
const xmlDecoder = /* #__PURE__ */ getDecoder(xmlDecodeTree);
/**
 * Decodes the entities in an HTML string.
 *
 * @param htmlString The string to decode.
 * @param mode The decoding mode; defaults to `DecodingMode.Legacy`.
 * @returns The decoded string.
 */
export function decodeHTML(
    htmlString: string,
    mode: DecodingMode = DecodingMode.Legacy,
): string {
    const decoded = htmlDecoder(htmlString, mode);
    return decoded;
}
/**
 * Decodes the entities in an HTML attribute value, using
 * `DecodingMode.Attribute`.
 *
 * @param htmlAttribute The string to decode.
 * @returns The decoded string.
 */
export function decodeHTMLAttribute(htmlAttribute: string): string {
    const attributeMode = DecodingMode.Attribute;
    return htmlDecoder(htmlAttribute, attributeMode);
}
/**
 * Decodes the entities in an HTML string, accepting only entities that are
 * terminated by a semicolon (`DecodingMode.Strict`).
 *
 * @param htmlString The string to decode.
 * @returns The decoded string.
 */
export function decodeHTMLStrict(htmlString: string): string {
    const strictMode = DecodingMode.Strict;
    return htmlDecoder(htmlString, strictMode);
}
/**
 * Decodes the entities in an XML string, accepting only entities that are
 * terminated by a semicolon (`DecodingMode.Strict`).
 *
 * @param xmlString The string to decode.
 * @returns The decoded string.
 */
export function decodeXML(xmlString: string): string {
    const strictMode = DecodingMode.Strict;
    return xmlDecoder(xmlString, strictMode);
}
export {
decodeCodePoint,
fromCodePoint,
replaceCodePoint,
} from "./decode-codepoint.js";
// Re-export for use by eg. htmlparser2
export { htmlDecodeTree } from "./generated/decode-data-html.js";
export { xmlDecodeTree } from "./generated/decode-data-xml.js";

78
node_modules/entities/src/encode.spec.ts generated vendored Normal file
View File

@@ -0,0 +1,78 @@
import { describe, expect, it } from "vitest";
import * as entities from "./index.js";
/*
 * Round-trip tests: every input is encoded as XML and as HTML5, the encoded
 * form is compared against the expected string, and decoding it again has to
 * give back the original input.
 */
describe("Encode->decode test", () => {
    const testcases = [
        {
            input: "asdf & ÿ ü '",
            xml: "asdf &amp; &#xff; &#xfc; &apos;",
            html: "asdf &amp; &yuml; &uuml; &apos;",
        },
        {
            input: "&#38;",
            xml: "&amp;#38;",
            html: "&amp;&num;38&semi;",
        },
    ];

    for (const { input, xml, html } of testcases) {
        const encodedXML = entities.encodeXML(input);
        it(`should XML encode ${input}`, () => expect(encodedXML).toBe(xml));
        it(`should default to XML encode ${input}`, () =>
            expect(entities.encode(input)).toBe(xml));
        it(`should XML decode ${encodedXML}`, () =>
            expect(entities.decodeXML(encodedXML)).toBe(input));
        // These two exercise the decoders, so the titles say "decode".
        it(`should default to XML decode ${encodedXML}`, () =>
            expect(entities.decode(encodedXML)).toBe(input));
        it(`should default strict to XML decode ${encodedXML}`, () =>
            expect(entities.decodeStrict(encodedXML)).toBe(input));

        const encodedHTML5 = entities.encodeHTML5(input);
        it(`should HTML5 encode ${input}`, () =>
            expect(encodedHTML5).toBe(html));
        it(`should HTML5 decode ${encodedHTML5}`, () =>
            expect(entities.decodeHTML(encodedHTML5)).toBe(input));
    }

    // Does not depend on the test cases above, so it is registered once
    // instead of once per loop iteration.
    it("should encode emojis", () =>
        expect(entities.encodeHTML5("😄🍾🥳💥😇")).toBe(
            "&#x1f604;&#x1f37e;&#x1f973;&#x1f4a5;&#x1f607;",
        ));

    it("should encode data URIs (issue #16)", () => {
        const data =
            "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAALAAABAAEAAAIBRAA7";
        expect(entities.decode(entities.encode(data))).toBe(data);
    });

    it("should HTML encode all ASCII characters", () => {
        // Every ASCII character has to survive an encode/decode round trip.
        for (let index = 0; index < 128; index++) {
            const char = String.fromCharCode(index);
            const encoded = entities.encodeHTML(char);
            const decoded = entities.decodeHTML(encoded);
            expect(decoded).toBe(char);
        }
    });

    it("should encode trailing parts of entities", () =>
        expect(entities.encodeHTML("\uD835")).toBe("&#xd835;"));

    it("should encode surrogate pair with first surrogate equivalent of entity, without corresponding entity", () =>
        expect(entities.encodeHTML("\u{1D4A4}")).toBe("&#x1d4a4;"));
});
// Tests for `encodeNonAsciiHTML`: plain ASCII such as `#` and `!` stays as-is,
// while markup characters and everything outside ASCII is turned into entities.
describe("encodeNonAsciiHTML", () => {
it("should encode all non-ASCII characters", () =>
expect(entities.encodeNonAsciiHTML("<test> #123! übermaßen")).toBe(
"&lt;test&gt; #123! &uuml;berma&szlig;en",
));
// Astral code points are emitted as a single numeric reference each.
it("should encode emojis", () =>
expect(entities.encodeNonAsciiHTML("😄🍾🥳💥😇")).toBe(
"&#x1f604;&#x1f37e;&#x1f973;&#x1f4a5;&#x1f607;",
));
// Each sign is followed by U+FE0F, which is encoded as its own reference.
it("should encode chars above surrogates", () =>
expect(entities.encodeNonAsciiHTML("♒️♓️♈️♉️♊️♋️♌️♍️♎️♏️♐️♑️")).toBe(
"&#x2652;&#xfe0f;&#x2653;&#xfe0f;&#x2648;&#xfe0f;&#x2649;&#xfe0f;&#x264a;&#xfe0f;&#x264b;&#xfe0f;&#x264c;&#xfe0f;&#x264d;&#xfe0f;&#x264e;&#xfe0f;&#x264f;&#xfe0f;&#x2650;&#xfe0f;&#x2651;&#xfe0f;",
));
});

93
node_modules/entities/src/encode.ts generated vendored Normal file
View File

@@ -0,0 +1,93 @@
import { getCodePoint, XML_BITSET_VALUE } from "./escape.js";
import { htmlTrie } from "./generated/encode-html.js";
/**
* We store the characters to consider as a compact bitset for fast lookups.
*
* Each 32-bit word covers 32 consecutive ASCII code points: for a code point
* `c < 0x80`, bit `c & 31` of word `c >>> 5` is set when `c` must be encoded.
*/
const HTML_BITSET = /* #__PURE__ */ new Uint32Array([
0x16_00, // Bits for 09,0A,0C
0xfc_00_ff_fe, // 32..63 -> 21-2D (minus space), 2E,2F,3A-3F
0xf8_00_00_01, // 64..95 -> 40, 5B-5F
0x38_00_00_01, // 96..127-> 60, 7B-7D
]);
/**
* Bitset in the same four-word layout as `HTML_BITSET`, flagging only the
* characters in `XML_BITSET_VALUE`; all of them live in the 32..63 word.
*/
const XML_BITSET = /* #__PURE__ */ new Uint32Array([0, XML_BITSET_VALUE, 0, 0]);
/**
* Encodes all characters in the input using HTML entities. This includes
* characters that are valid ASCII characters in HTML documents, such as `#`.
*
* To get a more compact output, consider using the `encodeNonAsciiHTML`
* function, which will only encode characters that are not valid in HTML
* documents, as well as non-ASCII characters.
*
* If a character has no equivalent entity, a numeric hexadecimal reference
* (eg. `&#xfc;`) will be used.
*
* @param input The string to encode.
* @returns The encoded string; the input itself when nothing needed encoding.
*/
export function encodeHTML(input: string): string {
return encodeHTMLTrieRe(HTML_BITSET, input);
}
/**
* Encodes all non-ASCII characters, as well as characters not valid in HTML
* documents using HTML entities. This function will not encode characters that
* are valid in HTML documents, such as `#`.
*
* The ASCII characters that get encoded are the ones flagged in `XML_BITSET`.
*
* If a character has no equivalent entity, a numeric hexadecimal reference
* (eg. `&#xfc;`) will be used.
*
* @param input The string to encode.
* @returns The encoded string; the input itself when nothing needed encoding.
*/
export function encodeNonAsciiHTML(input: string): string {
return encodeHTMLTrieRe(XML_BITSET, input);
}
/**
 * Shared worker for the HTML encoders.
 *
 * Scans `input` one UTF-16 code unit at a time. ASCII units whose bit is not
 * set in `bitset` are left alone; every other unit is replaced by the named
 * entity found in `htmlTrie` (which may consume two code units) or, failing
 * that, by a hexadecimal numeric reference for the full code point.
 *
 * @param bitset Four-word bitset flagging the ASCII characters to encode.
 * @param input The string to encode.
 * @returns The encoded string, or `input` itself when nothing was replaced.
 */
function encodeHTMLTrieRe(bitset: Uint32Array, input: string): string {
    const { length } = input;
    // Stays `undefined` until the first replacement so that untouched
    // strings can be returned without copying.
    let out: string | undefined;
    // Index just past the most recently replaced character.
    let last = 0;

    for (let index = 0; index < length; index++) {
        const char = input.charCodeAt(index);

        // ASCII character that is not flagged for encoding: nothing to do.
        if (char < 0x80 && ((bitset[char >>> 5] >>> char) & 1) === 0) {
            continue;
        }

        // Copy the untouched text between the previous replacement and here.
        out = (out ?? "") + input.substring(last, index);

        const entry = htmlTrie.get(char);
        let replacement = typeof entry === "string" ? entry : undefined;

        if (typeof entry === "object") {
            // Some entities span two code units; prefer those when they match.
            if (index + 1 < length) {
                const nextChar = input.charCodeAt(index + 1);
                const pairValue =
                    typeof entry.next === "number"
                        ? entry.next === nextChar
                            ? entry.nextValue
                            : undefined
                        : entry.next.get(nextChar);

                if (pairValue !== undefined) {
                    out += pairValue;
                    index++; // The second code unit has been consumed too.
                    last = index + 1;
                    continue;
                }
            }

            replacement = entry.value;
        }

        if (replacement === undefined) {
            // No named entity: emit a numeric reference for the code point.
            const cp = getCodePoint(input, index);
            out += `&#x${cp.toString(16)};`;
            // A code point different from the code unit means a surrogate
            // pair was read, so skip its trailing half.
            if (cp !== char) index++;
        } else {
            out += replacement;
        }

        last = index + 1;
    }

    if (out === undefined) return input;
    return last < length ? out + input.slice(last) : out;
}

14
node_modules/entities/src/escape.spec.ts generated vendored Normal file
View File

@@ -0,0 +1,14 @@
import { describe, expect, it } from "vitest";
import * as entities from "./index.js";
// Tests for the HTML escaping helpers. Attribute escaping leaves `<` and `>`
// alone but escapes `"`; text escaping does the opposite. Both escape `&` and
// the non-breaking space (U+00A0).
describe("escape HTML", () => {
it("should escape HTML attribute values", () =>
expect(entities.escapeAttribute('<a " attr > & value \u00A0!')).toBe(
"<a &quot; attr > &amp; value &nbsp;!",
));
it("should escape HTML text", () =>
expect(entities.escapeText('<a " text > & value \u00A0!')).toBe(
'&lt;a " text &gt; &amp; value &nbsp;!',
));
});

161
node_modules/entities/src/escape.ts generated vendored Normal file
View File

@@ -0,0 +1,161 @@
/**
* Maps the char codes of the five XML special characters
* (`"`, `&`, `'`, `<`, `>`) to their named entities.
*/
const xmlCodeMap = new Map([
[34, "&quot;"],
[38, "&amp;"],
[39, "&apos;"],
[60, "&lt;"],
[62, "&gt;"],
]);
/**
 * Fallback for engines without `String.prototype.codePointAt`.
 *
 * A high surrogate is only combined with the following code unit when that
 * unit is a low surrogate. A lone or unpaired surrogate is returned as-is,
 * matching the native method (previously this produced `NaN` or a bogus
 * code point).
 *
 * @see http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
 * @param input The string to read from.
 * @param index Index of the (first) code unit of the code point.
 * @returns The code point starting at `index`.
 */
function codePointAtFallback(input: string, index: number): number {
    const first = input.charCodeAt(index);

    if ((first & 0xfc_00) === 0xd8_00) {
        // `charCodeAt` yields NaN past the end; NaN fails the mask check.
        const second = input.charCodeAt(index + 1);
        if ((second & 0xfc_00) === 0xdc_00) {
            return (first - 0xd8_00) * 0x4_00 + second - 0xdc_00 + 0x1_00_00;
        }
    }

    return first;
}

/**
 * Returns the Unicode code point that starts at `index` in `c`.
 *
 * For compatibility with node < 4, `codePointAt` is wrapped: the native
 * method is used when available, otherwise a surrogate-aware fallback.
 */
export const getCodePoint: (c: string, index: number) => number =
    // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
    String.prototype.codePointAt == null
        ? codePointAtFallback
        : (input: string, index: number): number => input.codePointAt(index)!;
/**
* Bitset for ASCII characters that need to be escaped in XML.
*
* Covers code points 32..63 only: bit `c - 32` is set for each flagged
* character `c`.
*/
export const XML_BITSET_VALUE = 0x50_00_00_c4; // 32..63 -> 34 ("),38 (&),39 ('),60 (<),62 (>)
/**
 * Encodes the five XML special characters (`"`, `&`, `'`, `<`, `>`) as named
 * entities and every non-ASCII character as a numeric hexadecimal reference
 * (eg. `&#xfc;`). Works with a bitset scan rather than a RegExp.
 *
 * @param input The string to encode.
 * @returns The encoded string, or `input` itself when nothing was replaced.
 */
export function encodeXML(input: string): string {
    const { length } = input;
    // Stays `undefined` until the first replacement so that untouched
    // strings can be returned without copying.
    let out: string | undefined;
    // Index just past the most recently replaced character.
    let last = 0;

    for (let index = 0; index < length; index++) {
        const char = input.charCodeAt(index);

        // The bitset only describes code points 32..63.
        const isSpecial =
            char >= 32 && char < 64 && ((XML_BITSET_VALUE >>> char) & 1) !== 0;

        // Any other ASCII character is copied through unchanged.
        if (!isSpecial && char < 0x80) continue;

        // Copy the untouched text between the previous replacement and here.
        out = (out ?? "") + input.substring(last, index);

        if (isSpecial) {
            // Known replacement
            out += xmlCodeMap.get(char)!;
        } else {
            // Non-ASCII: numeric reference for the whole code point.
            const cp = getCodePoint(input, index);
            out += `&#x${cp.toString(16)};`;
            if (cp !== char) index++; // Skip trailing surrogate
        }

        last = index + 1;
    }

    if (out === undefined) return input;
    return last < length ? out + input.slice(last) : out;
}
/**
* Alias of `encodeXML`: encodes the XML special characters as named entities
* and all non-ASCII characters as numeric hexadecimal references
* (eg. `&#xfc;`).
*
* Have a look at `escapeUTF8` if you want a more concise output at the expense
* of reduced transportability.
*
* @param data String to escape.
*/
export const escape: typeof encodeXML = encodeXML;
/**
 * Builds an escaping function from a character-matching regular expression
 * and a lookup table of replacements.
 *
 * @param regex Global regular expression matching the single characters to
 * escape.
 * @param map Replacement entity for each matched character, keyed by the
 * character's char code.
 *
 * @returns Function that replaces every character matched by `regex` with
 * its entry in `map` and leaves the rest of the string untouched.
 */
function getEscaper(
    regex: RegExp,
    map: Map<number, string>,
): (data: string) => string {
    return function escape(data: string): string {
        let escaped = "";
        // Start of the text that has not been copied to `escaped` yet.
        let cursor = 0;

        for (
            let found = regex.exec(data);
            found !== null;
            found = regex.exec(data)
        ) {
            const { index } = found;

            // Text before the match, then the replacement; every matched
            // character is expected to have an entry in the map.
            escaped += data.substring(cursor, index);
            escaped += map.get(found[0].charCodeAt(0))!;

            // Every match will be of length 1
            cursor = index + 1;
        }

        return escaped + data.substring(cursor);
    };
}
/**
* Encodes all characters not valid in XML documents using XML entities.
* Only `"`, `&`, `'`, `<` and `>` are replaced; every other character,
* including non-ASCII ones, is kept as-is.
*
* Note that the output will be character-set dependent.
*
* @param data String to escape.
* @returns The escaped string.
*/
export const escapeUTF8: (data: string) => string = /* #__PURE__ */ getEscaper(
/["&'<>]/g,
xmlCodeMap,
);
/**
* Encodes all characters that have to be escaped in HTML attributes,
* following {@link https://html.spec.whatwg.org/multipage/parsing.html#escapingString}.
*
* Replaces `"`, `&` and the non-breaking space (U+00A0).
*
* @param data String to escape.
* @returns The escaped string.
*/
export const escapeAttribute: (data: string) => string =
/* #__PURE__ */ getEscaper(
/["&\u00A0]/g,
new Map([
[34, "&quot;"],
[38, "&amp;"],
[160, "&nbsp;"],
]),
);
/**
* Encodes all characters that have to be escaped in HTML text,
* following {@link https://html.spec.whatwg.org/multipage/parsing.html#escapingString}.
*
* Replaces `&`, `<`, `>` and the non-breaking space (U+00A0).
*
* @param data String to escape.
* @returns The escaped string.
*/
export const escapeText: (data: string) => string = /* #__PURE__ */ getEscaper(
/[&<>\u00A0]/g,
new Map([
[38, "&amp;"],
[60, "&lt;"],
[62, "&gt;"],
[160, "&nbsp;"],
]),
);

10
node_modules/entities/src/generated/.eslintrc.json generated vendored Normal file
View File

@@ -0,0 +1,10 @@
{
"rules": {
"multiline-comment-style": 0,
"capitalized-comments": 0,
"unicorn/escape-case": 0,
"unicorn/no-hex-escape": 0,
"unicorn/numeric-separators-style": 0,
"unicorn/prefer-spread": 0
}
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,6 @@
// Generated using scripts/write-decode-map.ts
import { decodeBase64 } from "../internal/decode-shared.js";
// Decode data for the XML entities, stored as a base64 string and unpacked
// into 16-bit words by `decodeBase64`. Generated data: do not edit by hand.
export const xmlDecodeTree: Uint16Array = /* #__PURE__ */ decodeBase64(
"AAJhZ2xxBwARABMAFQBtAg0AAAAAAA8AcAAmYG8AcwAnYHQAPmB0ADxg9SFvdCJg",
);

17
node_modules/entities/src/generated/encode-html.ts generated vendored Normal file

File diff suppressed because one or more lines are too long

125
node_modules/entities/src/index.spec.ts generated vendored Normal file
View File

@@ -0,0 +1,125 @@
import { readFileSync } from "node:fs";
import { describe, expect, it } from "vitest";
import legacy from "../maps/legacy.json" with { type: "json" };
import * as entities from "./index.js";
// Names of the entity map files under `../maps/`; an entry's index is used as
// the numeric entity level passed to `decode`/`encode` in the tests below.
const levels = ["xml", "entities"];
// Data-driven tests: each entity map is loaded from disk and every entity in
// it is checked at its own level and at every higher level.
describe("Documents", () => {
// Pairs of [level index, parsed entity map].
const levelDocuments = levels
.map((name) => new URL(`../maps/${name}.json`, import.meta.url))
.map((url) => JSON.parse(readFileSync(url, "utf8")))
.map((document, index) => [index, document]);
for (const [level, document] of levelDocuments) {
describe("Decode", () => {
it(levels[level], () => {
for (const entity of Object.keys(document)) {
// Both call styles are covered: a bare level and an options object.
for (let l = level; l < levels.length; l++) {
expect(entities.decode(`&${entity};`, l)).toBe(
document[entity],
);
expect(
entities.decode(`&${entity};`, { level: l }),
).toBe(document[entity]);
}
}
});
});
describe("Decode strict", () => {
it(levels[level], () => {
for (const entity of Object.keys(document)) {
for (let l = level; l < levels.length; l++) {
expect(entities.decodeStrict(`&${entity};`, l)).toBe(
document[entity],
);
expect(
entities.decode(`&${entity};`, {
level: l,
mode: entities.DecodingMode.Strict,
}),
).toBe(document[entity]);
}
}
});
});
describe("Encode", () => {
it(levels[level], () => {
for (const entity of Object.keys(document)) {
// Round trip: encoding then decoding must give back the character.
for (let l = level; l < levels.length; l++) {
const encoded = entities.encode(document[entity], l);
const decoded = entities.decode(encoded, l);
expect(decoded).toBe(document[entity]);
}
}
});
it("should only encode non-ASCII values if asked", () =>
expect(
entities.encode("Great #'s of 🎁", {
level,
mode: entities.EncodingMode.ASCII,
}),
).toBe("Great #&apos;s of &#x1f381;"));
});
}
describe("Legacy", () => {
const legacyMap: Record<string, string> = legacy;
it("should decode", () => {
// Legacy entities are written without the trailing semicolon here.
for (const entity of Object.keys(legacyMap)) {
expect(entities.decodeHTML(`&${entity}`)).toBe(
legacyMap[entity],
);
expect(
entities.decodeStrict(`&${entity}`, {
level: entities.EntityLevel.HTML,
mode: entities.DecodingMode.Legacy,
}),
).toBe(legacyMap[entity]);
}
});
});
});
// Pairs of [hex code point, expected string]; each string is a surrogate pair.
const astral = [
["1d306", "\uD834\uDF06"],
["1d11e", "\uD834\uDD1E"],
];
// Pairs of [hex value, expected string] for numeric references that do not
// decode to the code point they name. These are only tested for decoding.
const astralSpecial = [
["80", "\u20AC"],
["110000", "\uFFFD"],
];
// Numeric references for code points outside the Basic Multilingual Plane.
describe("Astral entities", () => {
for (const [c, value] of astral) {
it(`should decode ${value}`, () =>
expect(entities.decode(`&#x${c};`)).toBe(value));
it(`should encode ${value}`, () =>
expect(entities.encode(value)).toBe(`&#x${c};`));
it(`should escape ${value}`, () =>
expect(entities.escape(value)).toBe(`&#x${c};`));
}
// Decode-only cases: the result is a replacement, so there is no round trip.
for (const [c, value] of astralSpecial) {
it(`should decode special \\u${c}`, () =>
expect(entities.decode(`&#x${c};`)).toBe(value));
}
});
// Tests for the `escape` / `escapeUTF8` helpers.
describe("Escape", () => {
    it("should always decode ASCII chars", () => {
        // Cover the whole ASCII range, U+0000 through U+007F inclusive
        // (the previous `< 0x7f` bound skipped DEL).
        for (let index = 0; index <= 0x7f; index++) {
            const c = String.fromCharCode(index);
            expect(entities.decodeXML(entities.escape(c))).toBe(c);
        }
    });

    it("should keep UTF8 characters", () =>
        expect(entities.escapeUTF8('ß < "ü"')).toBe(`ß &lt; &quot;ü&quot;`));
});

187
node_modules/entities/src/index.ts generated vendored Normal file
View File

@@ -0,0 +1,187 @@
import { DecodingMode, decodeHTML, decodeXML } from "./decode.js";
import { encodeHTML, encodeNonAsciiHTML } from "./encode.js";
import {
encodeXML,
escapeAttribute,
escapeText,
escapeUTF8,
} from "./escape.js";
/**
* The level of entities to support.
*
* `decode` and `encode` also accept one of these values directly in place of
* an options object.
*/
export enum EntityLevel {
/** Support only XML entities. */
XML = 0,
/** Support HTML entities, which are a superset of XML entities. */
HTML = 1,
}
/**
* Output format used by `encode`. The `UTF8`, `Attribute` and `Text` modes
* map to a fixed escaping function; `ASCII` and `Extensive` additionally
* depend on the selected entity level.
*/
export enum EncodingMode {
/**
* The output is UTF-8 encoded. Only characters that need escaping within
* XML will be escaped.
*/
UTF8,
/**
* The output consists only of ASCII characters. Characters that need
* escaping within HTML, and characters that aren't ASCII characters will
* be escaped.
*/
ASCII,
/**
* Encode all characters that have an equivalent entity, as well as all
* characters that are not ASCII characters.
*/
Extensive,
/**
* Encode all characters that have to be escaped in HTML attributes,
* following {@link https://html.spec.whatwg.org/multipage/parsing.html#escapingString}.
*/
Attribute,
/**
* Encode all characters that have to be escaped in HTML text,
* following {@link https://html.spec.whatwg.org/multipage/parsing.html#escapingString}.
*/
Text,
}
/**
* Options for `decode` and `decodeStrict`.
*/
export interface DecodingOptions {
/**
* The level of entities to support.
* @default {@link EntityLevel.XML}
*/
level?: EntityLevel;
/**
* Decoding mode. If `Legacy`, will support legacy entities not terminated
* with a semicolon (`;`).
*
* Always `Strict` for XML. For HTML, set this to `DecodingMode.Attribute`
* if you are parsing an attribute value.
*
* The deprecated `decodeStrict` function defaults this to `Strict`.
*
* @default {@link DecodingMode.Legacy}
*/
mode?: DecodingMode | undefined;
}
/**
 * Decodes a string with entities.
 *
 * XML decoding is used unless the HTML level is requested. The decoding
 * mode is only forwarded for HTML and only when an options object is given.
 *
 * @param input String to decode.
 * @param options Decoding options, or a bare entity level.
 * @returns The decoded string.
 */
export function decode(
    input: string,
    options: DecodingOptions | EntityLevel = EntityLevel.XML,
): string {
    let level: EntityLevel | undefined;
    let mode: DecodingMode | undefined;

    if (typeof options === "number") {
        // Shorthand form: only the level was given.
        level = options;
    } else {
        level = options.level;
        if (typeof options === "object") mode = options.mode;
    }

    if (level !== EntityLevel.HTML) return decodeXML(input);

    return decodeHTML(input, mode);
}
/**
 * Decodes a string with entities. Does not allow missing trailing semicolons for entities.
 *
 * Behaves like `decode`, except that a missing `mode` defaults to
 * `DecodingMode.Strict`. The default is applied to a copy, so the caller's
 * `options` object is never modified.
 *
 * @param input String to decode.
 * @param options Decoding options, or a bare entity level.
 * @returns The decoded string.
 * @deprecated Use `decode` with the `mode` set to `Strict`.
 */
export function decodeStrict(
    input: string,
    options: DecodingOptions | EntityLevel = EntityLevel.XML,
): string {
    const normalizedOptions: DecodingOptions =
        typeof options === "number" ? { level: options } : options;

    // Build a fresh object instead of assigning to `normalizedOptions.mode`,
    // which may be the object the caller passed in.
    return decode(input, {
        ...normalizedOptions,
        mode: normalizedOptions.mode ?? DecodingMode.Strict,
    });
}
/**
* Options for `encode`.
*/
export interface EncodingOptions {
/**
* The level of entities to support. Only consulted by `encode` for the
* `ASCII` and `Extensive` modes.
* @default {@link EntityLevel.XML}
*/
level?: EntityLevel;
/**
* Output format.
* @default {@link EncodingMode.Extensive}
*/
mode?: EncodingMode;
}
/**
 * Encodes a string with entities.
 *
 * The `UTF8`, `Attribute` and `Text` modes each map to one escaping function
 * regardless of level. For `ASCII` and `Extensive` (the default, also used
 * for any unrecognised mode) the level selects between the HTML and the XML
 * encoder.
 *
 * @param input String to encode.
 * @param options Encoding options, or a bare entity level.
 * @returns The encoded string.
 */
export function encode(
    input: string,
    options: EncodingOptions | EntityLevel = EntityLevel.XML,
): string {
    const { mode = EncodingMode.Extensive, level = EntityLevel.XML } =
        typeof options === "number" ? { level: options } : options;

    // Modes that do not depend on the entity level.
    if (mode === EncodingMode.UTF8) return escapeUTF8(input);
    if (mode === EncodingMode.Attribute) return escapeAttribute(input);
    if (mode === EncodingMode.Text) return escapeText(input);

    // XML has a single encoder, shared by the ASCII and Extensive modes.
    if (level !== EntityLevel.HTML) return encodeXML(input);

    return mode === EncodingMode.ASCII
        ? encodeNonAsciiHTML(input)
        : encodeHTML(input);
}
export {
DecodingMode,
decodeHTML,
// Legacy aliases (deprecated)
decodeHTML as decodeHTML4,
decodeHTML as decodeHTML5,
decodeHTMLAttribute,
decodeHTMLStrict,
decodeHTMLStrict as decodeHTML4Strict,
decodeHTMLStrict as decodeHTML5Strict,
decodeXML,
decodeXML as decodeXMLStrict,
EntityDecoder,
} from "./decode.js";
export {
encodeHTML,
// Legacy aliases (deprecated)
encodeHTML as encodeHTML4,
encodeHTML as encodeHTML5,
encodeNonAsciiHTML,
} from "./encode.js";
export {
encodeXML,
escape,
escapeAttribute,
escapeText,
escapeUTF8,
} from "./escape.js";

16
node_modules/entities/src/internal/bin-trie-flags.ts generated vendored Normal file
View File

@@ -0,0 +1,16 @@
/**
* Bit flags & masks for the binary trie encoding used for entity decoding.
*
* Bit layout (16 bits total):
* 15..14 VALUE_LENGTH (+1 encoding; 0 => no value)
* 13 FLAG13. If valueLength>0: semicolon required flag (implicit ';').
* If valueLength==0: compact run flag.
* 12..7 BRANCH_LENGTH Branch length (0 => single branch in 6..0 if jumpOffset==char) OR run length (when compact run)
* 6..0 JUMP_TABLE Jump offset (jump table) OR single-branch char code OR first run char
*
* The four masks are disjoint and together cover all 16 bits.
*/
export enum BinTrieFlags {
// Bits 15..14
VALUE_LENGTH = 0b1100_0000_0000_0000,
// Bit 13
FLAG13 = 0b0010_0000_0000_0000,
// Bits 12..7
BRANCH_LENGTH = 0b0001_1111_1000_0000,
// Bits 6..0
JUMP_TABLE = 0b0000_0000_0111_1111,
}

30
node_modules/entities/src/internal/decode-shared.ts generated vendored Normal file
View File

@@ -0,0 +1,30 @@
/**
 * Shared base64 decode helper for generated decode data.
 *
 * Uses the global `atob` when it exists and otherwise falls back to
 * `Buffer`. The decoded bytes are read as little-endian 16-bit words; a
 * trailing odd byte is ignored.
 *
 * @param input Base64-encoded data.
 * @returns The decoded 16-bit words.
 */
export function decodeBase64(input: string): Uint16Array {
    let binary: string;

    // eslint-disable-next-line n/no-unsupported-features/node-builtins
    if (typeof atob === "function") {
        // Browser (and Node >=16)
        // eslint-disable-next-line n/no-unsupported-features/node-builtins
        binary = atob(input);
        // eslint-disable-next-line n/no-unsupported-features/node-builtins
    } else if (typeof Buffer.from === "function") {
        // Older Node versions (<16)
        // eslint-disable-next-line n/no-unsupported-features/node-builtins
        binary = Buffer.from(input, "base64").toString("binary");
    } else {
        // eslint-disable-next-line unicorn/no-new-buffer, n/no-deprecated-api
        binary = new Buffer(input, "base64").toString("binary");
    }

    // Two bytes per word; an unpaired final byte is dropped.
    const wordCount = binary.length >>> 1;
    const out = new Uint16Array(wordCount);

    for (let word = 0; word < wordCount; word++) {
        const offset = word * 2;
        const lo = binary.charCodeAt(offset);
        const hi = binary.charCodeAt(offset + 1);
        out[word] = lo | (hi << 8);
    }

    return out;
}

121
node_modules/entities/src/internal/encode-shared.ts generated vendored Normal file
View File

@@ -0,0 +1,121 @@
/**
* A node inside the encoding trie used by `encode.ts`.
*
* There are two physical shapes to minimize allocations and lookup cost:
*
* 1. Leaf node (string)
* - A plain string (already in the form `"&name;"`).
* - Represents a terminal match with no children.
*
* 2. Branch / value node (object)
* - Used when at least one entity continues with a second code unit.
* - `next` is either the single possible following code unit (with its
* entity in `nextValue`) or a map from following code units to nodes.
*/
export type EncodeTrieNode =
| string
| {
/**
* Entity value for the current code point sequence (wrapped: `&...;`).
* Present when the path to this node itself is a valid named entity.
*/
value: string | undefined;
/** If a number, the next code unit of the only next character. */
next: number | Map<number, EncodeTrieNode>;
/** If next is a number, `nextValue` contains the entity value. */
nextValue?: string;
};
/**
 * Parse a compact encode trie string into a Map structure used for encoding.
 *
 * Format per entry (ascending code points using delta encoding):
 * <diffBase36>[&name;][{<children>}] -- diff omitted when 0
 * Where diff = currentKey - previousKey - 1 (first entry stores absolute key).
 * `&name;` is the entity value (already wrapped); a following `{` denotes children.
 *
 * Children use the same encoding, must each carry a value, and may not have
 * children of their own.
 *
 * @param serialized The serialized trie.
 * @returns Map from the first code unit to its trie node.
 * @throws Error when the serialized data is malformed.
 */
export function parseEncodeTrie(
    serialized: string,
): Map<number, EncodeTrieNode> {
    const result = new Map<number, EncodeTrieNode>();
    const end = serialized.length;
    let pos = 0;

    /** True for the char codes of `0`-`9` and `a`-`z`. */
    const isBase36Digit = (code: number): boolean =>
        (code >= 48 && code <= 57) || (code >= 97 && code <= 122);

    /** Reads a base-36 number at `pos`; an absent number counts as 0. */
    const readDiff = (): number => {
        const begin = pos;
        while (pos < end && isBase36Digit(serialized.charCodeAt(pos))) pos++;
        return pos === begin
            ? 0
            : Number.parseInt(serialized.slice(begin, pos), 36);
    };

    /** Reads an `&...;` value at `pos`, returning it with its delimiters. */
    const readEntity = (): string => {
        if (serialized[pos] !== "&") {
            throw new Error(`Child entry missing value near index ${pos}`);
        }
        const begin = pos;
        const semicolon = serialized.indexOf(";", begin + 1);
        if (semicolon === -1) {
            throw new Error(`Unterminated entity starting at index ${begin}`);
        }
        pos = semicolon + 1;
        return serialized.slice(begin, pos);
    };

    /** Children may not open a block of their own. */
    const rejectNestedBlock = (): void => {
        if (serialized[pos] === "{") {
            throw new Error("Unexpected nested '{' beyond depth 2");
        }
    };

    // Starting from -1 makes the first entry's diff its absolute key.
    let previousKey = -1;

    while (pos < end) {
        const key = previousKey + readDiff() + 1;
        const value = serialized[pos] === "&" ? readEntity() : undefined;

        if (serialized[pos] === "{") {
            pos++; // Skip '{'

            // The first child stores its absolute key.
            let childKey = readDiff();
            const firstValue = readEntity();
            rejectNestedBlock();

            if (serialized[pos] === "}") {
                // Exactly one child: store it inline instead of in a Map.
                pos++; // Skip '}'
                result.set(key, { value, next: childKey, nextValue: firstValue });
            } else {
                const children = new Map<number, EncodeTrieNode>([
                    [childKey, firstValue],
                ]);

                while (pos < end && serialized[pos] !== "}") {
                    childKey += readDiff() + 1;
                    const childValue = readEntity();
                    rejectNestedBlock();
                    children.set(childKey, childValue);
                }

                if (serialized[pos] !== "}") {
                    throw new Error("Unterminated child block");
                }
                pos++; // Skip '}'
                result.set(key, { value, next: children });
            }
        } else if (value === undefined) {
            // Neither a value nor children: the entry is empty.
            throw new Error(
                `Malformed encode trie: missing value at index ${pos}`,
            );
        } else {
            result.set(key, value);
        }

        previousKey = key;
    }

    return result;
}