diff options
author | RaindropsSys <contact@minteck.org> | 2023-04-06 22:18:28 +0200 |
---|---|---|
committer | RaindropsSys <contact@minteck.org> | 2023-04-06 22:18:28 +0200 |
commit | 83354b2b88218090988dd6e526b0a2505b57e0f1 (patch) | |
tree | e3c73c38a122a78bb7e66fbb99056407edd9d4b9 /includes/external/addressbook/node_modules/htmlparser2/lib/Tokenizer.js | |
parent | 47b8f2299a483024c4a6a8876af825a010954caa (diff) | |
download | pluralconnect-83354b2b88218090988dd6e526b0a2505b57e0f1.tar.gz pluralconnect-83354b2b88218090988dd6e526b0a2505b57e0f1.tar.bz2 pluralconnect-83354b2b88218090988dd6e526b0a2505b57e0f1.zip |
Updated 5 files and added 1110 files (automated)
Diffstat (limited to 'includes/external/addressbook/node_modules/htmlparser2/lib/Tokenizer.js')
-rw-r--r-- | includes/external/addressbook/node_modules/htmlparser2/lib/Tokenizer.js | 938 |
1 files changed, 938 insertions, 0 deletions
diff --git a/includes/external/addressbook/node_modules/htmlparser2/lib/Tokenizer.js b/includes/external/addressbook/node_modules/htmlparser2/lib/Tokenizer.js new file mode 100644 index 0000000..43863cc --- /dev/null +++ b/includes/external/addressbook/node_modules/htmlparser2/lib/Tokenizer.js @@ -0,0 +1,938 @@ +"use strict"; +Object.defineProperty(exports, "__esModule", { value: true }); +exports.QuoteType = void 0; +var decode_js_1 = require("entities/lib/decode.js"); +var CharCodes; +(function (CharCodes) { + CharCodes[CharCodes["Tab"] = 9] = "Tab"; + CharCodes[CharCodes["NewLine"] = 10] = "NewLine"; + CharCodes[CharCodes["FormFeed"] = 12] = "FormFeed"; + CharCodes[CharCodes["CarriageReturn"] = 13] = "CarriageReturn"; + CharCodes[CharCodes["Space"] = 32] = "Space"; + CharCodes[CharCodes["ExclamationMark"] = 33] = "ExclamationMark"; + CharCodes[CharCodes["Number"] = 35] = "Number"; + CharCodes[CharCodes["Amp"] = 38] = "Amp"; + CharCodes[CharCodes["SingleQuote"] = 39] = "SingleQuote"; + CharCodes[CharCodes["DoubleQuote"] = 34] = "DoubleQuote"; + CharCodes[CharCodes["Dash"] = 45] = "Dash"; + CharCodes[CharCodes["Slash"] = 47] = "Slash"; + CharCodes[CharCodes["Zero"] = 48] = "Zero"; + CharCodes[CharCodes["Nine"] = 57] = "Nine"; + CharCodes[CharCodes["Semi"] = 59] = "Semi"; + CharCodes[CharCodes["Lt"] = 60] = "Lt"; + CharCodes[CharCodes["Eq"] = 61] = "Eq"; + CharCodes[CharCodes["Gt"] = 62] = "Gt"; + CharCodes[CharCodes["Questionmark"] = 63] = "Questionmark"; + CharCodes[CharCodes["UpperA"] = 65] = "UpperA"; + CharCodes[CharCodes["LowerA"] = 97] = "LowerA"; + CharCodes[CharCodes["UpperF"] = 70] = "UpperF"; + CharCodes[CharCodes["LowerF"] = 102] = "LowerF"; + CharCodes[CharCodes["UpperZ"] = 90] = "UpperZ"; + CharCodes[CharCodes["LowerZ"] = 122] = "LowerZ"; + CharCodes[CharCodes["LowerX"] = 120] = "LowerX"; + CharCodes[CharCodes["OpeningSquareBracket"] = 91] = "OpeningSquareBracket"; +})(CharCodes || (CharCodes = {})); +/** All the states the tokenizer can be in. */ +var State; +(function (State) { + State[State["Text"] = 1] = "Text"; + State[State["BeforeTagName"] = 2] = "BeforeTagName"; + State[State["InTagName"] = 3] = "InTagName"; + State[State["InSelfClosingTag"] = 4] = "InSelfClosingTag"; + State[State["BeforeClosingTagName"] = 5] = "BeforeClosingTagName"; + State[State["InClosingTagName"] = 6] = "InClosingTagName"; + State[State["AfterClosingTagName"] = 7] = "AfterClosingTagName"; + // Attributes + State[State["BeforeAttributeName"] = 8] = "BeforeAttributeName"; + State[State["InAttributeName"] = 9] = "InAttributeName"; + State[State["AfterAttributeName"] = 10] = "AfterAttributeName"; + State[State["BeforeAttributeValue"] = 11] = "BeforeAttributeValue"; + State[State["InAttributeValueDq"] = 12] = "InAttributeValueDq"; + State[State["InAttributeValueSq"] = 13] = "InAttributeValueSq"; + State[State["InAttributeValueNq"] = 14] = "InAttributeValueNq"; + // Declarations + State[State["BeforeDeclaration"] = 15] = "BeforeDeclaration"; + State[State["InDeclaration"] = 16] = "InDeclaration"; + // Processing instructions + State[State["InProcessingInstruction"] = 17] = "InProcessingInstruction"; + // Comments & CDATA + State[State["BeforeComment"] = 18] = "BeforeComment"; + State[State["CDATASequence"] = 19] = "CDATASequence"; + State[State["InSpecialComment"] = 20] = "InSpecialComment"; + State[State["InCommentLike"] = 21] = "InCommentLike"; + // Special tags + State[State["BeforeSpecialS"] = 22] = "BeforeSpecialS"; + State[State["SpecialStartSequence"] = 23] = "SpecialStartSequence"; + State[State["InSpecialTag"] = 24] = "InSpecialTag"; + State[State["BeforeEntity"] = 25] = "BeforeEntity"; + State[State["BeforeNumericEntity"] = 26] = "BeforeNumericEntity"; + State[State["InNamedEntity"] = 27] = "InNamedEntity"; + State[State["InNumericEntity"] = 28] = "InNumericEntity"; + State[State["InHexEntity"] = 29] = "InHexEntity"; +})(State || (State = {})); +function isWhitespace(c) { + return (c === CharCodes.Space || + c === CharCodes.NewLine || + c === CharCodes.Tab || + c === CharCodes.FormFeed || + c === CharCodes.CarriageReturn); +} +function isEndOfTagSection(c) { + return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c); +} +function isNumber(c) { + return c >= CharCodes.Zero && c <= CharCodes.Nine; +} +function isASCIIAlpha(c) { + return ((c >= CharCodes.LowerA && c <= CharCodes.LowerZ) || + (c >= CharCodes.UpperA && c <= CharCodes.UpperZ)); +} +function isHexDigit(c) { + return ((c >= CharCodes.UpperA && c <= CharCodes.UpperF) || + (c >= CharCodes.LowerA && c <= CharCodes.LowerF)); +} +var QuoteType; +(function (QuoteType) { + QuoteType[QuoteType["NoValue"] = 0] = "NoValue"; + QuoteType[QuoteType["Unquoted"] = 1] = "Unquoted"; + QuoteType[QuoteType["Single"] = 2] = "Single"; + QuoteType[QuoteType["Double"] = 3] = "Double"; +})(QuoteType = exports.QuoteType || (exports.QuoteType = {})); +/** + * Sequences used to match longer strings. + * + * We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End + * sequences with an increased offset. + */ +var Sequences = { + Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), + CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]), + CommentEnd: new Uint8Array([0x2d, 0x2d, 0x3e]), + ScriptEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74]), + StyleEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]), + TitleEnd: new Uint8Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]), // `</title` +}; +var Tokenizer = /** @class */ (function () { + function Tokenizer(_a, cbs) { + var _b = _a.xmlMode, xmlMode = _b === void 0 ? false : _b, _c = _a.decodeEntities, decodeEntities = _c === void 0 ? true : _c; + this.cbs = cbs; + /** The current state the tokenizer is in. */ + this.state = State.Text; + /** The read buffer. */ + this.buffer = ""; + /** The beginning of the section that is currently being read. */ + this.sectionStart = 0; + /** The index within the buffer that we are currently looking at. */ + this.index = 0; + /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */ + this.baseState = State.Text; + /** For special parsing behavior inside of script and style tags. */ + this.isSpecial = false; + /** Indicates whether the tokenizer has been paused. */ + this.running = true; + /** The offset of the current buffer. */ + this.offset = 0; + this.currentSequence = undefined; + this.sequenceIndex = 0; + this.trieIndex = 0; + this.trieCurrent = 0; + /** For named entities, the index of the value. For numeric entities, the code point. */ + this.entityResult = 0; + this.entityExcess = 0; + this.xmlMode = xmlMode; + this.decodeEntities = decodeEntities; + this.entityTrie = xmlMode ? decode_js_1.xmlDecodeTree : decode_js_1.htmlDecodeTree; + } + Tokenizer.prototype.reset = function () { + this.state = State.Text; + this.buffer = ""; + this.sectionStart = 0; + this.index = 0; + this.baseState = State.Text; + this.currentSequence = undefined; + this.running = true; + this.offset = 0; + }; + Tokenizer.prototype.write = function (chunk) { + this.offset += this.buffer.length; + this.buffer = chunk; + this.parse(); + }; + Tokenizer.prototype.end = function () { + if (this.running) + this.finish(); + }; + Tokenizer.prototype.pause = function () { + this.running = false; + }; + Tokenizer.prototype.resume = function () { + this.running = true; + if (this.index < this.buffer.length + this.offset) { + this.parse(); + } + }; + /** + * The current index within all of the written data. + */ + Tokenizer.prototype.getIndex = function () { + return this.index; + }; + /** + * The start of the current section. + */ + Tokenizer.prototype.getSectionStart = function () { + return this.sectionStart; + }; + Tokenizer.prototype.stateText = function (c) { + if (c === CharCodes.Lt || + (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))) { + if (this.index > this.sectionStart) { + this.cbs.ontext(this.sectionStart, this.index); + } + this.state = State.BeforeTagName; + this.sectionStart = this.index; + } + else if (this.decodeEntities && c === CharCodes.Amp) { + this.state = State.BeforeEntity; + } + }; + Tokenizer.prototype.stateSpecialStartSequence = function (c) { + var isEnd = this.sequenceIndex === this.currentSequence.length; + var isMatch = isEnd + ? // If we are at the end of the sequence, make sure the tag name has ended + isEndOfTagSection(c) + : // Otherwise, do a case-insensitive comparison + (c | 0x20) === this.currentSequence[this.sequenceIndex]; + if (!isMatch) { + this.isSpecial = false; + } + else if (!isEnd) { + this.sequenceIndex++; + return; + } + this.sequenceIndex = 0; + this.state = State.InTagName; + this.stateInTagName(c); + }; + /** Look for an end tag. For <title> tags, also decode entities. */ + Tokenizer.prototype.stateInSpecialTag = function (c) { + if (this.sequenceIndex === this.currentSequence.length) { + if (c === CharCodes.Gt || isWhitespace(c)) { + var endOfText = this.index - this.currentSequence.length; + if (this.sectionStart < endOfText) { + // Spoof the index so that reported locations match up. + var actualIndex = this.index; + this.index = endOfText; + this.cbs.ontext(this.sectionStart, endOfText); + this.index = actualIndex; + } + this.isSpecial = false; + this.sectionStart = endOfText + 2; // Skip over the `</` + this.stateInClosingTagName(c); + return; // We are done; skip the rest of the function. + } + this.sequenceIndex = 0; + } + if ((c | 0x20) === this.currentSequence[this.sequenceIndex]) { + this.sequenceIndex += 1; + } + else if (this.sequenceIndex === 0) { + if (this.currentSequence === Sequences.TitleEnd) { + // We have to parse entities in <title> tags. + if (this.decodeEntities && c === CharCodes.Amp) { + this.state = State.BeforeEntity; + } + } + else if (this.fastForwardTo(CharCodes.Lt)) { + // Outside of <title> tags, we can fast-forward. + this.sequenceIndex = 1; + } + } + else { + // If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`. + this.sequenceIndex = Number(c === CharCodes.Lt); + } + }; + Tokenizer.prototype.stateCDATASequence = function (c) { + if (c === Sequences.Cdata[this.sequenceIndex]) { + if (++this.sequenceIndex === Sequences.Cdata.length) { + this.state = State.InCommentLike; + this.currentSequence = Sequences.CdataEnd; + this.sequenceIndex = 0; + this.sectionStart = this.index + 1; + } + } + else { + this.sequenceIndex = 0; + this.state = State.InDeclaration; + this.stateInDeclaration(c); // Reconsume the character + } + }; + /** + * When we wait for one specific character, we can speed things up + * by skipping through the buffer until we find it. + * + * @returns Whether the character was found. + */ + Tokenizer.prototype.fastForwardTo = function (c) { + while (++this.index < this.buffer.length + this.offset) { + if (this.buffer.charCodeAt(this.index - this.offset) === c) { + return true; + } + } + /* + * We increment the index at the end of the `parse` loop, + * so set it to `buffer.length - 1` here. + * + * TODO: Refactor `parse` to increment index before calling states. + */ + this.index = this.buffer.length + this.offset - 1; + return false; + }; + /** + * Comments and CDATA end with `-->` and `]]>`. + * + * Their common qualities are: + * - Their end sequences have a distinct character they start with. + * - That character is then repeated, so we have to check multiple repeats. + * - All characters but the start character of the sequence can be skipped. + */ + Tokenizer.prototype.stateInCommentLike = function (c) { + if (c === this.currentSequence[this.sequenceIndex]) { + if (++this.sequenceIndex === this.currentSequence.length) { + if (this.currentSequence === Sequences.CdataEnd) { + this.cbs.oncdata(this.sectionStart, this.index, 2); + } + else { + this.cbs.oncomment(this.sectionStart, this.index, 2); + } + this.sequenceIndex = 0; + this.sectionStart = this.index + 1; + this.state = State.Text; + } + } + else if (this.sequenceIndex === 0) { + // Fast-forward to the first character of the sequence + if (this.fastForwardTo(this.currentSequence[0])) { + this.sequenceIndex = 1; + } + } + else if (c !== this.currentSequence[this.sequenceIndex - 1]) { + // Allow long sequences, eg. --->, ]]]> + this.sequenceIndex = 0; + } + }; + /** + * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name. + * + * XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar). + * We allow anything that wouldn't end the tag. + */ + Tokenizer.prototype.isTagStartChar = function (c) { + return this.xmlMode ? !isEndOfTagSection(c) : isASCIIAlpha(c); + }; + Tokenizer.prototype.startSpecial = function (sequence, offset) { + this.isSpecial = true; + this.currentSequence = sequence; + this.sequenceIndex = offset; + this.state = State.SpecialStartSequence; + }; + Tokenizer.prototype.stateBeforeTagName = function (c) { + if (c === CharCodes.ExclamationMark) { + this.state = State.BeforeDeclaration; + this.sectionStart = this.index + 1; + } + else if (c === CharCodes.Questionmark) { + this.state = State.InProcessingInstruction; + this.sectionStart = this.index + 1; + } + else if (this.isTagStartChar(c)) { + var lower = c | 0x20; + this.sectionStart = this.index; + if (!this.xmlMode && lower === Sequences.TitleEnd[2]) { + this.startSpecial(Sequences.TitleEnd, 3); + } + else { + this.state = + !this.xmlMode && lower === Sequences.ScriptEnd[2] + ? State.BeforeSpecialS + : State.InTagName; + } + } + else if (c === CharCodes.Slash) { + this.state = State.BeforeClosingTagName; + } + else { + this.state = State.Text; + this.stateText(c); + } + }; + Tokenizer.prototype.stateInTagName = function (c) { + if (isEndOfTagSection(c)) { + this.cbs.onopentagname(this.sectionStart, this.index); + this.sectionStart = -1; + this.state = State.BeforeAttributeName; + this.stateBeforeAttributeName(c); + } + }; + Tokenizer.prototype.stateBeforeClosingTagName = function (c) { + if (isWhitespace(c)) { + // Ignore + } + else if (c === CharCodes.Gt) { + this.state = State.Text; + } + else { + this.state = this.isTagStartChar(c) + ? State.InClosingTagName + : State.InSpecialComment; + this.sectionStart = this.index; + } + }; + Tokenizer.prototype.stateInClosingTagName = function (c) { + if (c === CharCodes.Gt || isWhitespace(c)) { + this.cbs.onclosetag(this.sectionStart, this.index); + this.sectionStart = -1; + this.state = State.AfterClosingTagName; + this.stateAfterClosingTagName(c); + } + }; + Tokenizer.prototype.stateAfterClosingTagName = function (c) { + // Skip everything until ">" + if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { + this.state = State.Text; + this.baseState = State.Text; + this.sectionStart = this.index + 1; + } + }; + Tokenizer.prototype.stateBeforeAttributeName = function (c) { + if (c === CharCodes.Gt) { + this.cbs.onopentagend(this.index); + if (this.isSpecial) { + this.state = State.InSpecialTag; + this.sequenceIndex = 0; + } + else { + this.state = State.Text; + } + this.baseState = this.state; + this.sectionStart = this.index + 1; + } + else if (c === CharCodes.Slash) { + this.state = State.InSelfClosingTag; + } + else if (!isWhitespace(c)) { + this.state = State.InAttributeName; + this.sectionStart = this.index; + } + }; + Tokenizer.prototype.stateInSelfClosingTag = function (c) { + if (c === CharCodes.Gt) { + this.cbs.onselfclosingtag(this.index); + this.state = State.Text; + this.baseState = State.Text; + this.sectionStart = this.index + 1; + this.isSpecial = false; // Reset special state, in case of self-closing special tags + } + else if (!isWhitespace(c)) { + this.state = State.BeforeAttributeName; + this.stateBeforeAttributeName(c); + } + }; + Tokenizer.prototype.stateInAttributeName = function (c) { + if (c === CharCodes.Eq || isEndOfTagSection(c)) { + this.cbs.onattribname(this.sectionStart, this.index); + this.sectionStart = -1; + this.state = State.AfterAttributeName; + this.stateAfterAttributeName(c); + } + }; + Tokenizer.prototype.stateAfterAttributeName = function (c) { + if (c === CharCodes.Eq) { + this.state = State.BeforeAttributeValue; + } + else if (c === CharCodes.Slash || c === CharCodes.Gt) { + this.cbs.onattribend(QuoteType.NoValue, this.index); + this.state = State.BeforeAttributeName; + this.stateBeforeAttributeName(c); + } + else if (!isWhitespace(c)) { + this.cbs.onattribend(QuoteType.NoValue, this.index); + this.state = State.InAttributeName; + this.sectionStart = this.index; + } + }; + Tokenizer.prototype.stateBeforeAttributeValue = function (c) { + if (c === CharCodes.DoubleQuote) { + this.state = State.InAttributeValueDq; + this.sectionStart = this.index + 1; + } + else if (c === CharCodes.SingleQuote) { + this.state = State.InAttributeValueSq; + this.sectionStart = this.index + 1; + } + else if (!isWhitespace(c)) { + this.sectionStart = this.index; + this.state = State.InAttributeValueNq; + this.stateInAttributeValueNoQuotes(c); // Reconsume token + } + }; + Tokenizer.prototype.handleInAttributeValue = function (c, quote) { + if (c === quote || + (!this.decodeEntities && this.fastForwardTo(quote))) { + this.cbs.onattribdata(this.sectionStart, this.index); + this.sectionStart = -1; + this.cbs.onattribend(quote === CharCodes.DoubleQuote + ? QuoteType.Double + : QuoteType.Single, this.index); + this.state = State.BeforeAttributeName; + } + else if (this.decodeEntities && c === CharCodes.Amp) { + this.baseState = this.state; + this.state = State.BeforeEntity; + } + }; + Tokenizer.prototype.stateInAttributeValueDoubleQuotes = function (c) { + this.handleInAttributeValue(c, CharCodes.DoubleQuote); + }; + Tokenizer.prototype.stateInAttributeValueSingleQuotes = function (c) { + this.handleInAttributeValue(c, CharCodes.SingleQuote); + }; + Tokenizer.prototype.stateInAttributeValueNoQuotes = function (c) { + if (isWhitespace(c) || c === CharCodes.Gt) { + this.cbs.onattribdata(this.sectionStart, this.index); + this.sectionStart = -1; + this.cbs.onattribend(QuoteType.Unquoted, this.index); + this.state = State.BeforeAttributeName; + this.stateBeforeAttributeName(c); + } + else if (this.decodeEntities && c === CharCodes.Amp) { + this.baseState = this.state; + this.state = State.BeforeEntity; + } + }; + Tokenizer.prototype.stateBeforeDeclaration = function (c) { + if (c === CharCodes.OpeningSquareBracket) { + this.state = State.CDATASequence; + this.sequenceIndex = 0; + } + else { + this.state = + c === CharCodes.Dash + ? State.BeforeComment + : State.InDeclaration; + } + }; + Tokenizer.prototype.stateInDeclaration = function (c) { + if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { + this.cbs.ondeclaration(this.sectionStart, this.index); + this.state = State.Text; + this.sectionStart = this.index + 1; + } + }; + Tokenizer.prototype.stateInProcessingInstruction = function (c) { + if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { + this.cbs.onprocessinginstruction(this.sectionStart, this.index); + this.state = State.Text; + this.sectionStart = this.index + 1; + } + }; + Tokenizer.prototype.stateBeforeComment = function (c) { + if (c === CharCodes.Dash) { + this.state = State.InCommentLike; + this.currentSequence = Sequences.CommentEnd; + // Allow short comments (eg. <!-->) + this.sequenceIndex = 2; + this.sectionStart = this.index + 1; + } + else { + this.state = State.InDeclaration; + } + }; + Tokenizer.prototype.stateInSpecialComment = function (c) { + if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) { + this.cbs.oncomment(this.sectionStart, this.index, 0); + this.state = State.Text; + this.sectionStart = this.index + 1; + } + }; + Tokenizer.prototype.stateBeforeSpecialS = function (c) { + var lower = c | 0x20; + if (lower === Sequences.ScriptEnd[3]) { + this.startSpecial(Sequences.ScriptEnd, 4); + } + else if (lower === Sequences.StyleEnd[3]) { + this.startSpecial(Sequences.StyleEnd, 4); + } + else { + this.state = State.InTagName; + this.stateInTagName(c); // Consume the token again + } + }; + Tokenizer.prototype.stateBeforeEntity = function (c) { + // Start excess with 1 to include the '&' + this.entityExcess = 1; + this.entityResult = 0; + if (c === CharCodes.Number) { + this.state = State.BeforeNumericEntity; + } + else if (c === CharCodes.Amp) { + // We have two `&` characters in a row. Stay in the current state. + } + else { + this.trieIndex = 0; + this.trieCurrent = this.entityTrie[0]; + this.state = State.InNamedEntity; + this.stateInNamedEntity(c); + } + }; + Tokenizer.prototype.stateInNamedEntity = function (c) { + this.entityExcess += 1; + this.trieIndex = (0, decode_js_1.determineBranch)(this.entityTrie, this.trieCurrent, this.trieIndex + 1, c); + if (this.trieIndex < 0) { + this.emitNamedEntity(); + this.index--; + return; + } + this.trieCurrent = this.entityTrie[this.trieIndex]; + var masked = this.trieCurrent & decode_js_1.BinTrieFlags.VALUE_LENGTH; + // If the branch is a value, store it and continue + if (masked) { + // The mask is the number of bytes of the value, including the current byte. + var valueLength = (masked >> 14) - 1; + // If we have a legacy entity while parsing strictly, just skip the number of bytes + if (!this.allowLegacyEntity() && c !== CharCodes.Semi) { + this.trieIndex += valueLength; + } + else { + // Add 1 as we have already incremented the excess + var entityStart = this.index - this.entityExcess + 1; + if (entityStart > this.sectionStart) { + this.emitPartial(this.sectionStart, entityStart); + } + // If this is a surrogate pair, consume the next two bytes + this.entityResult = this.trieIndex; + this.trieIndex += valueLength; + this.entityExcess = 0; + this.sectionStart = this.index + 1; + if (valueLength === 0) { + this.emitNamedEntity(); + } + } + } + }; + Tokenizer.prototype.emitNamedEntity = function () { + this.state = this.baseState; + if (this.entityResult === 0) { + return; + } + var valueLength = (this.entityTrie[this.entityResult] & decode_js_1.BinTrieFlags.VALUE_LENGTH) >> + 14; + switch (valueLength) { + case 1: { + this.emitCodePoint(this.entityTrie[this.entityResult] & + ~decode_js_1.BinTrieFlags.VALUE_LENGTH); + break; + } + case 2: { + this.emitCodePoint(this.entityTrie[this.entityResult + 1]); + break; + } + case 3: { + this.emitCodePoint(this.entityTrie[this.entityResult + 1]); + this.emitCodePoint(this.entityTrie[this.entityResult + 2]); + } + } + }; + Tokenizer.prototype.stateBeforeNumericEntity = function (c) { + if ((c | 0x20) === CharCodes.LowerX) { + this.entityExcess++; + this.state = State.InHexEntity; + } + else { + this.state = State.InNumericEntity; + this.stateInNumericEntity(c); + } + }; + Tokenizer.prototype.emitNumericEntity = function (strict) { + var entityStart = this.index - this.entityExcess - 1; + var numberStart = entityStart + 2 + Number(this.state === State.InHexEntity); + if (numberStart !== this.index) { + // Emit leading data if any + if (entityStart > this.sectionStart) { + this.emitPartial(this.sectionStart, entityStart); + } + this.sectionStart = this.index + Number(strict); + this.emitCodePoint((0, decode_js_1.replaceCodePoint)(this.entityResult)); + } + this.state = this.baseState; + }; + Tokenizer.prototype.stateInNumericEntity = function (c) { + if (c === CharCodes.Semi) { + this.emitNumericEntity(true); + } + else if (isNumber(c)) { + this.entityResult = this.entityResult * 10 + (c - CharCodes.Zero); + this.entityExcess++; + } + else { + if (this.allowLegacyEntity()) { + this.emitNumericEntity(false); + } + else { + this.state = this.baseState; + } + this.index--; + } + }; + Tokenizer.prototype.stateInHexEntity = function (c) { + if (c === CharCodes.Semi) { + this.emitNumericEntity(true); + } + else if (isNumber(c)) { + this.entityResult = this.entityResult * 16 + (c - CharCodes.Zero); + this.entityExcess++; + } + else if (isHexDigit(c)) { + this.entityResult = + this.entityResult * 16 + ((c | 0x20) - CharCodes.LowerA + 10); + this.entityExcess++; + } + else { + if (this.allowLegacyEntity()) { + this.emitNumericEntity(false); + } + else { + this.state = this.baseState; + } + this.index--; + } + }; + Tokenizer.prototype.allowLegacyEntity = function () { + return (!this.xmlMode && + (this.baseState === State.Text || + this.baseState === State.InSpecialTag)); + }; + /** + * Remove data that has already been consumed from the buffer. + */ + Tokenizer.prototype.cleanup = function () { + // If we are inside of text or attributes, emit what we already have. + if (this.running && this.sectionStart !== this.index) { + if (this.state === State.Text || + (this.state === State.InSpecialTag && this.sequenceIndex === 0)) { + this.cbs.ontext(this.sectionStart, this.index); + this.sectionStart = this.index; + } + else if (this.state === State.InAttributeValueDq || + this.state === State.InAttributeValueSq || + this.state === State.InAttributeValueNq) { + this.cbs.onattribdata(this.sectionStart, this.index); + this.sectionStart = this.index; + } + } + }; + Tokenizer.prototype.shouldContinue = function () { + return this.index < this.buffer.length + this.offset && this.running; + }; + /** + * Iterates through the buffer, calling the function corresponding to the current state. + * + * States that are more likely to be hit are higher up, as a performance improvement. + */ + Tokenizer.prototype.parse = function () { + while (this.shouldContinue()) { + var c = this.buffer.charCodeAt(this.index - this.offset); + switch (this.state) { + case State.Text: { + this.stateText(c); + break; + } + case State.SpecialStartSequence: { + this.stateSpecialStartSequence(c); + break; + } + case State.InSpecialTag: { + this.stateInSpecialTag(c); + break; + } + case State.CDATASequence: { + this.stateCDATASequence(c); + break; + } + case State.InAttributeValueDq: { + this.stateInAttributeValueDoubleQuotes(c); + break; + } + case State.InAttributeName: { + this.stateInAttributeName(c); + break; + } + case State.InCommentLike: { + this.stateInCommentLike(c); + break; + } + case State.InSpecialComment: { + this.stateInSpecialComment(c); + break; + } + case State.BeforeAttributeName: { + this.stateBeforeAttributeName(c); + break; + } + case State.InTagName: { + this.stateInTagName(c); + break; + } + case State.InClosingTagName: { + this.stateInClosingTagName(c); + break; + } + case State.BeforeTagName: { + this.stateBeforeTagName(c); + break; + } + case State.AfterAttributeName: { + this.stateAfterAttributeName(c); + break; + } + case State.InAttributeValueSq: { + this.stateInAttributeValueSingleQuotes(c); + break; + } + case State.BeforeAttributeValue: { + this.stateBeforeAttributeValue(c); + break; + } + case State.BeforeClosingTagName: { + this.stateBeforeClosingTagName(c); + break; + } + case State.AfterClosingTagName: { + this.stateAfterClosingTagName(c); + break; + } + case State.BeforeSpecialS: { + this.stateBeforeSpecialS(c); + break; + } + case State.InAttributeValueNq: { + this.stateInAttributeValueNoQuotes(c); + break; + } + case State.InSelfClosingTag: { + this.stateInSelfClosingTag(c); + break; + } + case State.InDeclaration: { + this.stateInDeclaration(c); + break; + } + case State.BeforeDeclaration: { + this.stateBeforeDeclaration(c); + break; + } + case State.BeforeComment: { + this.stateBeforeComment(c); + break; + } + case State.InProcessingInstruction: { + this.stateInProcessingInstruction(c); + break; + } + case State.InNamedEntity: { + this.stateInNamedEntity(c); + break; + } + case State.BeforeEntity: { + this.stateBeforeEntity(c); + break; + } + case State.InHexEntity: { + this.stateInHexEntity(c); + break; + } + case State.InNumericEntity: { + this.stateInNumericEntity(c); + break; + } + default: { + // `this._state === State.BeforeNumericEntity` + this.stateBeforeNumericEntity(c); + } + } + this.index++; + } + this.cleanup(); + }; + Tokenizer.prototype.finish = function () { + if (this.state === State.InNamedEntity) { + this.emitNamedEntity(); + } + // If there is remaining data, emit it in a reasonable way + if (this.sectionStart < this.index) { + this.handleTrailingData(); + } + this.cbs.onend(); + }; + /** Handle any trailing data. */ + Tokenizer.prototype.handleTrailingData = function () { + var endIndex = this.buffer.length + this.offset; + if (this.state === State.InCommentLike) { + if (this.currentSequence === Sequences.CdataEnd) { + this.cbs.oncdata(this.sectionStart, endIndex, 0); + } + else { + this.cbs.oncomment(this.sectionStart, endIndex, 0); + } + } + else if (this.state === State.InNumericEntity && + this.allowLegacyEntity()) { + this.emitNumericEntity(false); + // All trailing data will have been consumed + } + else if (this.state === State.InHexEntity && + this.allowLegacyEntity()) { + this.emitNumericEntity(false); + // All trailing data will have been consumed + } + else if (this.state === State.InTagName || + this.state === State.BeforeAttributeName || + this.state === State.BeforeAttributeValue || + this.state === State.AfterAttributeName || + this.state === State.InAttributeName || + this.state === State.InAttributeValueSq || + this.state === State.InAttributeValueDq || + this.state === State.InAttributeValueNq || + this.state === State.InClosingTagName) { + /* + * If we are currently in an opening or closing tag, us not calling the + * respective callback signals that the tag should be ignored. + */ + } + else { + this.cbs.ontext(this.sectionStart, endIndex); + } + }; + Tokenizer.prototype.emitPartial = function (start, endIndex) { + if (this.baseState !== State.Text && + this.baseState !== State.InSpecialTag) { + this.cbs.onattribdata(start, endIndex); + } + else { + this.cbs.ontext(start, endIndex); + } + }; + Tokenizer.prototype.emitCodePoint = function (cp) { + if (this.baseState !== State.Text && + this.baseState !== State.InSpecialTag) { + this.cbs.onattribentity(cp); + } + else { + this.cbs.ontextentity(cp); + } + }; + return Tokenizer; +}()); +exports.default = Tokenizer; +//# sourceMappingURL=Tokenizer.js.map
\ No newline at end of file |