Skip to content

Commit 790c756

Browse files
authored
fix(tokenizer): Drop chunks after emitting tokens (#432)
1 parent: 6e7b230 · commit: 790c756

File tree

4 files changed

+44
-15
lines changed

4 files changed

+44
-15
lines changed

packages/parse5-html-rewriting-stream/test/rewriting-stream.test.ts

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { outdent } from 'outdent';
33
import { RewritingStream } from '../lib/index.js';
44
import { loadSAXParserTestData } from 'parse5-test-utils/utils/load-sax-parser-test-data.js';
55
import { getStringDiffMsg, writeChunkedToStream, WritableStreamStub } from 'parse5-test-utils/utils/common.js';
6+
import { finished } from 'node:stream';
67

78
const srcHtml = outdent`
89
<!DOCTYPE html "">
@@ -17,6 +18,9 @@ const srcHtml = outdent`
1718
</html>
1819
`;
1920

21+
const LONG_TEXT = 'a'.repeat((1 << 16) + 1);
22+
const LONG_TEXT_WITH_COMMENT = `${'a'.repeat((1 << 16) - 5)}<!-- comment -->`;
23+
2024
function createRewriterTest({
2125
src,
2226
expected,
@@ -28,13 +32,17 @@ function createRewriterTest({
2832
expected: string;
2933
assignTokenHandlers?: (rewriter: RewritingStream) => void;
3034
}) {
31-
return (done: () => void): void => {
35+
return (done: (err?: unknown) => void): void => {
3236
const rewriter = new RewritingStream();
3337
const writable = new WritableStreamStub();
3438

35-
writable.once('finish', () => {
36-
assert.ok(writable.writtenData === expected, getStringDiffMsg(writable.writtenData, expected));
37-
done();
39+
finished(writable, () => {
40+
try {
41+
assert.ok(writable.writtenData === expected, getStringDiffMsg(writable.writtenData, expected));
42+
done();
43+
} catch (error) {
44+
done(error);
45+
}
3846
});
3947

4048
rewriter.pipe(writable);
@@ -305,4 +313,20 @@ describe('RewritingStream', () => {
305313

306314
assert.throws(() => stream.write(buf), TypeError);
307315
});
316+
317+
it(
318+
'Should pass long text correctly (GH-292)',
319+
createRewriterTest({
320+
src: LONG_TEXT,
321+
expected: LONG_TEXT,
322+
})
323+
);
324+
325+
it(
326+
'Should emit comment after text correctly',
327+
createRewriterTest({
328+
src: LONG_TEXT_WITH_COMMENT,
329+
expected: LONG_TEXT_WITH_COMMENT,
330+
})
331+
);
308332
});

packages/parse5-sax-parser/lib/index.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,10 @@ export class SAXParser extends Transform implements TokenHandler {
156156
};
157157
}
158158
}
159+
160+
if (this.tokenizer.preprocessor.willDropParsedChunk()) {
161+
this._emitPendingText();
162+
}
159163
}
160164

161165
/** @internal */

packages/parse5/lib/tokenizer/index.ts

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -465,16 +465,22 @@ export class Tokenizer {
465465

466466
this.handler.onEndTag(ct);
467467
}
468+
469+
this.preprocessor.dropParsedChunk();
468470
}
469471

470472
private emitCurrentComment(ct: CommentToken): void {
471473
this.prepareToken(ct);
472474
this.handler.onComment(ct);
475+
476+
this.preprocessor.dropParsedChunk();
473477
}
474478

475479
private emitCurrentDoctype(ct: DoctypeToken): void {
476480
this.prepareToken(ct);
477481
this.handler.onDoctype(ct);
482+
483+
this.preprocessor.dropParsedChunk();
478484
}
479485

480486
private _emitCurrentCharacterToken(nextLocation: Location | null): void {
@@ -536,6 +542,7 @@ export class Tokenizer {
536542
if (this.currentCharacterToken.type !== type) {
537543
this.currentLocation = this.getCurrentLocation(0);
538544
this._emitCurrentCharacterToken(this.currentLocation);
545+
this.preprocessor.dropParsedChunk();
539546
} else {
540547
this.currentCharacterToken.chars += ch;
541548
return;
@@ -969,8 +976,6 @@ export class Tokenizer {
969976
// Data state
970977
//------------------------------------------------------------------
971978
private _stateData(cp: number): void {
972-
this.preprocessor.dropParsedChunk();
973-
974979
switch (cp) {
975980
case $.LESS_THAN_SIGN: {
976981
this.state = State.TAG_OPEN;
@@ -999,8 +1004,6 @@ export class Tokenizer {
9991004
// RCDATA state
10001005
//------------------------------------------------------------------
10011006
private _stateRcdata(cp: number): void {
1002-
this.preprocessor.dropParsedChunk();
1003-
10041007
switch (cp) {
10051008
case $.AMPERSAND: {
10061009
this.returnState = State.RCDATA;
@@ -1029,8 +1032,6 @@ export class Tokenizer {
10291032
// RAWTEXT state
10301033
//------------------------------------------------------------------
10311034
private _stateRawtext(cp: number): void {
1032-
this.preprocessor.dropParsedChunk();
1033-
10341035
switch (cp) {
10351036
case $.LESS_THAN_SIGN: {
10361037
this.state = State.RAWTEXT_LESS_THAN_SIGN;
@@ -1054,8 +1055,6 @@ export class Tokenizer {
10541055
// Script data state
10551056
//------------------------------------------------------------------
10561057
private _stateScriptData(cp: number): void {
1057-
this.preprocessor.dropParsedChunk();
1058-
10591058
switch (cp) {
10601059
case $.LESS_THAN_SIGN: {
10611060
this.state = State.SCRIPT_DATA_LESS_THAN_SIGN;
@@ -1079,8 +1078,6 @@ export class Tokenizer {
10791078
// PLAINTEXT state
10801079
//------------------------------------------------------------------
10811080
private _statePlaintext(cp: number): void {
1082-
this.preprocessor.dropParsedChunk();
1083-
10841081
switch (cp) {
10851082
case $.NULL: {
10861083
this._err(ERR.unexpectedNullCharacter);

packages/parse5/lib/tokenizer/preprocessor.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,12 @@ export class Preprocessor {
9797
return cp;
9898
}
9999

100+
public willDropParsedChunk(): boolean {
101+
return this.pos > this.bufferWaterline;
102+
}
103+
100104
public dropParsedChunk(): void {
101-
if (this.pos > this.bufferWaterline) {
105+
if (this.willDropParsedChunk()) {
102106
this.html = this.html.substring(this.pos);
103107
this.lineStartPos -= this.pos;
104108
this.droppedBufferSize += this.pos;

0 commit comments

Comments (0)