diff options
author | Minteck <contact@minteck.org> | 2023-01-10 14:54:04 +0100 |
---|---|---|
committer | Minteck <contact@minteck.org> | 2023-01-10 14:54:04 +0100 |
commit | 99c1d9af689e5325f3cf535c4007b3aeb8325229 (patch) | |
tree | e663b3c2ebdbd67c818ac0c5147f0ce1d2463cda /school/node_modules/parse5/lib/tokenizer/preprocessor.js | |
parent | 9871b03912fc28ad38b4037ebf26a78aa937baba (diff) | |
download | pluralconnect-99c1d9af689e5325f3cf535c4007b3aeb8325229.tar.gz pluralconnect-99c1d9af689e5325f3cf535c4007b3aeb8325229.tar.bz2 pluralconnect-99c1d9af689e5325f3cf535c4007b3aeb8325229.zip |
Update - This is an automated commit
Diffstat (limited to 'school/node_modules/parse5/lib/tokenizer/preprocessor.js')
-rw-r--r-- | school/node_modules/parse5/lib/tokenizer/preprocessor.js | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/school/node_modules/parse5/lib/tokenizer/preprocessor.js b/school/node_modules/parse5/lib/tokenizer/preprocessor.js new file mode 100644 index 0000000..26fde48 --- /dev/null +++ b/school/node_modules/parse5/lib/tokenizer/preprocessor.js @@ -0,0 +1,159 @@ +'use strict'; + +const unicode = require('../common/unicode'); +const ERR = require('../common/error-codes'); + +//Aliases +const $ = unicode.CODE_POINTS; + +//Const +const DEFAULT_BUFFER_WATERLINE = 1 << 16; + +//Preprocessor +//NOTE: HTML input preprocessing +//(see: http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream) +class Preprocessor { + constructor() { + this.html = null; + + this.pos = -1; + this.lastGapPos = -1; + this.lastCharPos = -1; + + this.gapStack = []; + + this.skipNextNewLine = false; + + this.lastChunkWritten = false; + this.endOfChunkHit = false; + this.bufferWaterline = DEFAULT_BUFFER_WATERLINE; + } + + _err() { + // NOTE: err reporting is noop by default. Enabled by mixin. + } + + _addGap() { + this.gapStack.push(this.lastGapPos); + this.lastGapPos = this.pos; + } + + _processSurrogate(cp) { + //NOTE: try to peek a surrogate pair + if (this.pos !== this.lastCharPos) { + const nextCp = this.html.charCodeAt(this.pos + 1); + + if (unicode.isSurrogatePair(nextCp)) { + //NOTE: we have a surrogate pair. Peek pair character and recalculate code point. + this.pos++; + + //NOTE: add gap that should be avoided during retreat + this._addGap(); + + return unicode.getSurrogatePairCodePoint(cp, nextCp); + } + } + + //NOTE: we are at the end of a chunk, therefore we can't infer surrogate pair yet. + else if (!this.lastChunkWritten) { + this.endOfChunkHit = true; + return $.EOF; + } + + //NOTE: isolated surrogate + this._err(ERR.surrogateInInputStream); + + return cp; + } + + dropParsedChunk() { + if (this.pos > this.bufferWaterline) { + this.lastCharPos -= this.pos; + this.html = this.html.substring(this.pos); + this.pos = 0; + this.lastGapPos = -1; + this.gapStack = []; + } + } + + write(chunk, isLastChunk) { + if (this.html) { + this.html += chunk; + } else { + this.html = chunk; + } + + this.lastCharPos = this.html.length - 1; + this.endOfChunkHit = false; + this.lastChunkWritten = isLastChunk; + } + + insertHtmlAtCurrentPos(chunk) { + this.html = this.html.substring(0, this.pos + 1) + chunk + this.html.substring(this.pos + 1, this.html.length); + + this.lastCharPos = this.html.length - 1; + this.endOfChunkHit = false; + } + + advance() { + this.pos++; + + if (this.pos > this.lastCharPos) { + this.endOfChunkHit = !this.lastChunkWritten; + return $.EOF; + } + + let cp = this.html.charCodeAt(this.pos); + + //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character + //must be ignored. + if (this.skipNextNewLine && cp === $.LINE_FEED) { + this.skipNextNewLine = false; + this._addGap(); + return this.advance(); + } + + //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters + if (cp === $.CARRIAGE_RETURN) { + this.skipNextNewLine = true; + return $.LINE_FEED; + } + + this.skipNextNewLine = false; + + if (unicode.isSurrogate(cp)) { + cp = this._processSurrogate(cp); + } + + //OPTIMIZATION: first check if code point is in the common allowed + //range (ASCII alphanumeric, whitespaces, big chunk of BMP) + //before going into detailed performance cost validation. + const isCommonValidRange = + (cp > 0x1f && cp < 0x7f) || cp === $.LINE_FEED || cp === $.CARRIAGE_RETURN || (cp > 0x9f && cp < 0xfdd0); + + if (!isCommonValidRange) { + this._checkForProblematicCharacters(cp); + } + + return cp; + } + + _checkForProblematicCharacters(cp) { + if (unicode.isControlCodePoint(cp)) { + this._err(ERR.controlCharacterInInputStream); + } else if (unicode.isUndefinedCodePoint(cp)) { + this._err(ERR.noncharacterInInputStream); + } + } + + retreat() { + if (this.pos === this.lastGapPos) { + this.lastGapPos = this.gapStack.pop(); + this.pos--; + } + + this.pos--; + } +} + +module.exports = Preprocessor; |