summaryrefslogtreecommitdiff
path: root/school/node_modules/parse5/lib/tokenizer/preprocessor.js
diff options
context:
space:
mode:
author Minteck <contact@minteck.org> 2023-01-10 14:54:04 +0100
committer Minteck <contact@minteck.org> 2023-01-10 14:54:04 +0100
commit 99c1d9af689e5325f3cf535c4007b3aeb8325229 (patch)
tree e663b3c2ebdbd67c818ac0c5147f0ce1d2463cda /school/node_modules/parse5/lib/tokenizer/preprocessor.js
parent 9871b03912fc28ad38b4037ebf26a78aa937baba (diff)
download pluralconnect-99c1d9af689e5325f3cf535c4007b3aeb8325229.tar.gz
pluralconnect-99c1d9af689e5325f3cf535c4007b3aeb8325229.tar.bz2
pluralconnect-99c1d9af689e5325f3cf535c4007b3aeb8325229.zip
Update - This is an automated commit
Diffstat (limited to 'school/node_modules/parse5/lib/tokenizer/preprocessor.js')
-rw-r--r-- school/node_modules/parse5/lib/tokenizer/preprocessor.js 159
1 files changed, 159 insertions, 0 deletions
diff --git a/school/node_modules/parse5/lib/tokenizer/preprocessor.js b/school/node_modules/parse5/lib/tokenizer/preprocessor.js
new file mode 100644
index 0000000..26fde48
--- /dev/null
+++ b/school/node_modules/parse5/lib/tokenizer/preprocessor.js
@@ -0,0 +1,159 @@
+'use strict';
+
+const unicode = require('../common/unicode');
+const ERR = require('../common/error-codes');
+
+//Aliases: `$` is shorthand for the code point table exported by ../common/unicode
+const $ = unicode.CODE_POINTS;
+
+//Const: initial value of `bufferWaterline` (1 << 16 = 65536); dropParsedChunk() trims only once `pos` exceeds it
+const DEFAULT_BUFFER_WATERLINE = 1 << 16;
+
+//Preprocessor
+//NOTE: HTML input preprocessing
+//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream)
+class Preprocessor { // Cursor over buffered HTML text: advance() folds CR / CR LF into LF, merges surrogate pairs and reports suspicious code points via _err()
+ constructor() {
+ this.html = null; // buffered input text; stays null until the first write()
+
+ this.pos = -1; // index of the current code unit in html; -1 means nothing has been consumed yet
+ this.lastGapPos = -1; // position of the most recently recorded gap (see _addGap); -1 means none
+ this.lastCharPos = -1; // index of the last code unit in html (html.length - 1 after write())
+
+ this.gapStack = []; // earlier lastGapPos values, popped again by retreat()
+
+ this.skipNextNewLine = false; // true right after a CR so that a directly following LF is dropped
+
+ this.lastChunkWritten = false; // copied from the isLastChunk argument of write()
+ this.endOfChunkHit = false; // set when the buffer ran out while lastChunkWritten was still false
+ this.bufferWaterline = DEFAULT_BUFFER_WATERLINE; // threshold that pos must exceed before dropParsedChunk() trims
+ }
+ // Error hook: invoked with an ERR code by the methods below; does nothing in this class.
+ _err() {
+ // NOTE: error reporting is a no-op by default. Enabled by mixin.
+ }
+ // Records the current pos as a gap (a code unit advance() consumed without returning it) so retreat() can step over it.
+ _addGap() {
+ this.gapStack.push(this.lastGapPos); // keep the previous gap so retreat() can restore it
+ this.lastGapPos = this.pos;
+ }
+ // Resolves the surrogate code unit `cp` found at pos: combined code point, $.EOF when the pair may continue in a later chunk, or `cp` itself (after reporting) when isolated.
+ _processSurrogate(cp) {
+ //NOTE: try to peek a surrogate pair; only possible when pos is not the last buffered code unit
+ if (this.pos !== this.lastCharPos) {
+ const nextCp = this.html.charCodeAt(this.pos + 1);
+
+ if (unicode.isSurrogatePair(nextCp)) {
+ //NOTE: we have a surrogate pair. Consume the second code unit and recalculate the code point.
+ this.pos++;
+
+ //NOTE: mark the second code unit as a gap so that retreat() steps back over both halves of the pair
+ this._addGap();
+
+ return unicode.getSurrogatePairCodePoint(cp, nextCp);
+ }
+ }
+
+ //NOTE: we are at the end of a chunk, therefore we can't infer surrogate pair yet. pos is left on the surrogate.
+ else if (!this.lastChunkWritten) {
+ this.endOfChunkHit = true;
+ return $.EOF;
+ }
+
+ //NOTE: isolated surrogate: report it and pass the code unit through unchanged
+ this._err(ERR.surrogateInInputStream);
+
+ return cp;
+ }
+ // Discards the part of html that precedes pos once pos exceeds bufferWaterline; the gap history is reset.
+ dropParsedChunk() {
+ if (this.pos > this.bufferWaterline) {
+ this.lastCharPos -= this.pos; // re-base onto the shortened buffer
+ this.html = this.html.substring(this.pos); // keeps the code unit at pos and everything after it
+ this.pos = 0; // same code unit as before, now at index 0
+ this.lastGapPos = -1; // recorded gaps referred to offsets in the old buffer
+ this.gapStack = [];
+ }
+ }
+ // Appends `chunk` to the buffer (or adopts it when html is still null or empty) and stores `isLastChunk` in lastChunkWritten.
+ write(chunk, isLastChunk) {
+ if (this.html) {
+ this.html += chunk;
+ } else {
+ this.html = chunk;
+ }
+
+ this.lastCharPos = this.html.length - 1;
+ this.endOfChunkHit = false; // fresh input is available again
+ this.lastChunkWritten = isLastChunk;
+ }
+ // Splices `chunk` into html immediately after the code unit at pos, then recomputes lastCharPos and clears endOfChunkHit.
+ insertHtmlAtCurrentPos(chunk) {
+ this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1, this.html.length);
+
+ this.lastCharPos = this.html.length - 1;
+ this.endOfChunkHit = false;
+ }
+ // Moves pos forward and returns the next preprocessed code point, or $.EOF when the buffered input runs out.
+ advance() {
+ this.pos++;
+
+ if (this.pos > this.lastCharPos) {
+ this.endOfChunkHit = !this.lastChunkWritten; // flagged only while further chunks may still be written
+ return $.EOF;
+ }
+
+ let cp = this.html.charCodeAt(this.pos);
+
+ //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
+ //must be ignored.
+ if (this.skipNextNewLine && cp === $.LINE_FEED) {
+ this.skipNextNewLine = false;
+ this._addGap(); // the dropped LF becomes a gap for retreat()
+ return this.advance(); // continue with the code unit after the dropped LF
+ }
+
+ //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
+ if (cp === $.CARRIAGE_RETURN) {
+ this.skipNextNewLine = true;
+ return $.LINE_FEED;
+ }
+
+ this.skipNextNewLine = false;
+
+ if (unicode.isSurrogate(cp)) {
+ cp = this._processSurrogate(cp); // may consume one more code unit or yield $.EOF
+ }
+
+ //OPTIMIZATION: first check if code point is in the common allowed
+ //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
+ //before going into detailed performance cost validation.
+ const isCommonValidRange =
+ (cp > 0x1f && cp < 0x7f) || cp === $.LINE_FEED || cp === $.CARRIAGE_RETURN || (cp > 0x9f && cp < 0xfdd0);
+
+ if (!isCommonValidRange) {
+ this._checkForProblematicCharacters(cp);
+ }
+
+ return cp;
+ }
+ // Reports a control-character or noncharacter error for `cp`, as classified by the unicode helpers; otherwise does nothing.
+ _checkForProblematicCharacters(cp) {
+ if (unicode.isControlCodePoint(cp)) {
+ this._err(ERR.controlCharacterInInputStream);
+ } else if (unicode.isUndefinedCodePoint(cp)) {
+ this._err(ERR.noncharacterInInputStream);
+ }
+ }
+ // Steps pos back by one, or by two when pos sits on the most recent gap (restoring the previous gap from gapStack).
+ retreat() {
+ if (this.pos === this.lastGapPos) {
+ this.lastGapPos = this.gapStack.pop(); // the previous gap becomes the active one again
+ this.pos--; // step over the gap code unit itself
+ }
+
+ this.pos--;
+ }
+}
+
+module.exports = Preprocessor;