encode.ts 3.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. import { getCodePoint, XML_BITSET_VALUE } from "./escape.js";
  2. import { htmlTrie } from "./generated/encode-html.js";
  3. /**
  4. * We store the characters to consider as a compact bitset for fast lookups.
  5. */
  6. const HTML_BITSET = /* #__PURE__ */ new Uint32Array([
  7. 0x16_00, // Bits for 09,0A,0C
  8. 0xfc_00_ff_fe, // 32..63 -> 21-2D (minus space), 2E,2F,3A-3F
  9. 0xf8_00_00_01, // 64..95 -> 40, 5B-5F
  10. 0x38_00_00_01, // 96..127-> 60, 7B-7D
  11. ]);
  12. const XML_BITSET = /* #__PURE__ */ new Uint32Array([0, XML_BITSET_VALUE, 0, 0]);
  13. /**
  14. * Encodes all characters in the input using HTML entities. This includes
  15. * characters that are valid ASCII characters in HTML documents, such as `#`.
  16. *
  17. * To get a more compact output, consider using the `encodeNonAsciiHTML`
  18. * function, which will only encode characters that are not valid in HTML
  19. * documents, as well as non-ASCII characters.
  20. *
  21. * If a character has no equivalent entity, a numeric hexadecimal reference
  22. * (eg. `ü`) will be used.
  23. */
  24. export function encodeHTML(input: string): string {
  25. return encodeHTMLTrieRe(HTML_BITSET, input);
  26. }
  27. /**
  28. * Encodes all non-ASCII characters, as well as characters not valid in HTML
  29. * documents using HTML entities. This function will not encode characters that
  30. * are valid in HTML documents, such as `#`.
  31. *
  32. * If a character has no equivalent entity, a numeric hexadecimal reference
  33. * (eg. `ü`) will be used.
  34. */
  35. export function encodeNonAsciiHTML(input: string): string {
  36. return encodeHTMLTrieRe(XML_BITSET, input);
  37. }
  38. function encodeHTMLTrieRe(bitset: Uint32Array, input: string): string {
  39. let out: string | undefined;
  40. let last = 0; // Start of the next untouched slice.
  41. const { length } = input;
  42. for (let index = 0; index < length; index++) {
  43. const char = input.charCodeAt(index);
  44. // Skip ASCII characters that don't need encoding
  45. if (char < 0x80 && !((bitset[char >>> 5] >>> char) & 1)) {
  46. continue;
  47. }
  48. if (out === undefined) out = input.substring(0, index);
  49. else if (last !== index) out += input.substring(last, index);
  50. let node = htmlTrie.get(char);
  51. if (typeof node === "object") {
  52. if (index + 1 < length) {
  53. const nextChar = input.charCodeAt(index + 1);
  54. const value =
  55. typeof node.next === "number"
  56. ? node.next === nextChar
  57. ? node.nextValue
  58. : undefined
  59. : node.next.get(nextChar);
  60. if (value !== undefined) {
  61. out += value;
  62. index++;
  63. last = index + 1;
  64. continue;
  65. }
  66. }
  67. node = node.value;
  68. }
  69. if (node === undefined) {
  70. const cp = getCodePoint(input, index);
  71. out += `&#x${cp.toString(16)};`;
  72. if (cp !== char) index++;
  73. last = index + 1;
  74. } else {
  75. out += node;
  76. last = index + 1;
  77. }
  78. }
  79. if (out === undefined) return input;
  80. if (last < length) out += input.substr(last);
  81. return out;
  82. }