From 9f491b95cf91f9e93d26ee9ae6ebab2df498dee3 Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Fri, 18 Jul 2025 11:42:19 -0400 Subject: [PATCH 1/5] Add special tokens in text-generation pipeline if tokenizer requires --- src/pipelines.js | 9 ++++++--- src/tokenizers.js | 3 +++ tests/pipelines/test_pipelines_text_generation.js | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/pipelines.js b/src/pipelines.js index cc7b0e185..6304ce445 100644 --- a/src/pipelines.js +++ b/src/pipelines.js @@ -996,6 +996,11 @@ export class TextGenerationPipeline extends (/** @type {new (options: TextPipeli let isBatched = false; let isChatInput = false; + // By default, do not add special tokens, unless the tokenizer specifies otherwise + let add_special_tokens = generate_kwargs.add_special_tokens + ?? (this.tokenizer.add_bos_token || this.tokenizer.add_eos_token) + ?? false; + // Normalize inputs /** @type {string[]} */ let inputs; @@ -1021,11 +1026,9 @@ export class TextGenerationPipeline extends (/** @type {new (options: TextPipeli add_generation_prompt: true, }) )); + add_special_tokens = false; // Chat template handles this already } - // By default, do not add special tokens - const add_special_tokens = generate_kwargs.add_special_tokens ?? false; - // By default, return full text const return_full_text = isChatInput ? false diff --git a/src/tokenizers.js b/src/tokenizers.js index 965595747..2e617f93b 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -2659,6 +2659,9 @@ export class PreTrainedTokenizer extends Callable { this.padding_side = tokenizerConfig.padding_side; } + this.add_bos_token = tokenizerConfig.add_bos_token; + this.add_eos_token = tokenizerConfig.add_eos_token; + this.legacy = false; this.chat_template = tokenizerConfig.chat_template ?? 
null; diff --git a/tests/pipelines/test_pipelines_text_generation.js b/tests/pipelines/test_pipelines_text_generation.js index 085808263..42f23e910 100644 --- a/tests/pipelines/test_pipelines_text_generation.js +++ b/tests/pipelines/test_pipelines_text_generation.js @@ -20,7 +20,7 @@ export default () => { describe("batch_size=1", () => { const text_input = "hello"; - const generated_text_target = "erdingsAndroid Load"; + const generated_text_target = "erdingsdelete mely"; const text_target = [{ generated_text: text_input + generated_text_target }]; const new_text_target = [{ generated_text: generated_text_target }]; From 3164856352deb1496abfc8a3a1a83d108510bee3 Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Fri, 18 Jul 2025 18:39:43 -0400 Subject: [PATCH 2/5] Fix logits processors tests --- tests/utils/logits_process.test.js | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/utils/logits_process.test.js b/tests/utils/logits_process.test.js index 38ca613d0..a4b110767 100644 --- a/tests/utils/logits_process.test.js +++ b/tests/utils/logits_process.test.js @@ -35,17 +35,17 @@ describe("Logits Processors", () => { async () => { const text_input = "hello"; - const generated_text_target = " Bert explicit wed digasset"; + const generated_text_target = "\uff0d Giuseppeitte natoud"; const text_target = [{ generated_text: text_input + generated_text_target }]; const output = await pipe(text_input, { max_new_tokens: 5, bad_words_ids: [ - // default: [22172n, 18547n, 8136n, 16012n, 28064n, 11361n] + // default: [1n, 22172n, 18547n, 8143n, 22202n, 9456n, 17213n] [18547], - // block #1: [22172n, 16662n, 6261n, 18916n, 29109n, 799n] - [6261, 18916], + // block #1: [1n, 22172n, 31583n, 18824n, 16621n, 8136n, 16012n] + [18824, 16621], ], }); compare(output, text_target); @@ -58,22 +58,22 @@ describe("Logits Processors", () => { async () => { const text_input = "hello"; - const generated_text_target = "erdingsdeletearus)?nor"; + const generated_text_target = "erdingsdelete войsequ族"; const text_target = [{ generated_text: text_input + generated_text_target }]; // Construct long list of bad words const bad_words_ids = []; - // default: [22172n, 18547n, 8136n, 16012n, 28064n, 11361n] + // default: [1n, 22172n, 18547n, 8143n, 22202n, 9456n, 17213n] for (let i = 0; i < 100000; ++i) { bad_words_ids.push([i * 2]); // block all even numbers } - // block #1: [22172n, 18547n, 8143n, 30327n, 20061n, 18193n] + // block #1: [1n, 22172n, 18547n, 8143n, 30327n, 624n, 2806n, 2004n] bad_words_ids.push([8143, 30327]); - // block #2: [22172n, 18547n, 8143n, 29485n, 3799n, 29331n] + // block #2: [1n, 22172n, 18547n, 8143n, 29485n, 3799n, 29331n] bad_words_ids.push([18547, 8143, 29485]); - // block #3: [22172n, 18547n, 8143n, 26465n, 6877n, 15459n] + // block #3: [1n, 22172n, 18547n, 8143n, 7587n, 6831n, 30999n] const output = await pipe(text_input, { max_new_tokens: 5, bad_words_ids }); compare(output, text_target); }, @@ -85,19 +85,19 @@ describe("Logits Processors", () => { async () => { const text_input = "this is a test"; - const generated_text_target = "кт México constructed lake user"; + const generated_text_target = "кт México constructed lake års"; const text_target = [{ generated_text: text_input + generated_text_target }]; const output = await pipe(text_input, { max_new_tokens: 5, bad_words_ids: [ - // default: [445n, 338n, 263n, 1243n, 3931n, 14756n, 7811n, 21645n, 16426n] + // default: [1n, 445n, 338n, 263n, 1243n, 
3931n, 14756n, 7811n, 21645n, 31252n] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3931], // should never trigger (longer than input sequence) - // block #1: [445n, 338n, 263n, 1243n, 3931n, 14756n, 7811n, 21645n, 16426n] + // block #1: [1n, 445n, 338n, 263n, 1243n, 3931n, 14756n, 7811n, 21645n, 31252n] [3931, 14756, 7811], - // result: [445n, 338n, 263n, 1243n, 3931n, 14756n, 13319n, 19437n, 1404n] + // result: [445n, 338n, 263n, 1243n, 3931n, 14756n, 13319n, 19437n, 21948n] ], }); compare(output, text_target); From f4c2d59bfe9cd35f02746b871652b277ccd0ee1f Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Fri, 18 Jul 2025 19:14:46 -0400 Subject: [PATCH 3/5] Update bundles.test.js --- tests/bundles.test.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/bundles.test.js b/tests/bundles.test.js index 3759e1706..1bc04117d 100644 --- a/tests/bundles.test.js +++ b/tests/bundles.test.js @@ -9,7 +9,7 @@ const result = await generator("hello", { max_new_tokens: 3, return_full_text: f process.stdout.write(result[0].generated_text); `; -const TARGET_OUTPUT = "erdingsAndroid Load"; +const TARGET_OUTPUT = "erdingsdelete mely"; const wrap_async_iife = (code) => `(async function() { ${code} })();`; @@ -17,9 +17,9 @@ const check = (code, module = false) => { const args = ["-e", code]; if (module) args.push("--input-type=module"); const { status, stdout, stderr } = spawnSync("node", args); - expect(stderr.toString()).toBe(""); // No warnings or errors are printed - expect(stdout.toString()).toBe(TARGET_OUTPUT); // The output should match - expect(status).toBe(0); // The process should exit cleanly + expect(stderr.toString()).toEqual(""); // No warnings or errors are printed + expect(stdout.toString()).toEqual(TARGET_OUTPUT); // The output should match + expect(status).toEqual(0); // The process should exit cleanly }; describe("Testing the bundle", () => { From fa2f3349cb0c858df068bb76dcfb2e6d9abac2d2 Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Fri, 18 Jul 2025 19:23:30 -0400 Subject: [PATCH 4/5] Update comment --- tests/utils/logits_process.test.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/logits_process.test.js b/tests/utils/logits_process.test.js index a4b110767..1061b1d58 100644 --- a/tests/utils/logits_process.test.js +++ b/tests/utils/logits_process.test.js @@ -97,7 +97,7 @@ describe("Logits Processors", () => { // block #1: [1n, 445n, 338n, 263n, 1243n, 3931n, 14756n, 7811n, 21645n, 31252n] [3931, 14756, 7811], - // result: [445n, 338n, 263n, 1243n, 3931n, 14756n, 13319n, 19437n, 21948n] + // result: [1n, 445n, 338n, 263n, 1243n, 3931n, 14756n, 13319n, 19437n, 21948n] ], }); compare(output, text_target); From 3501a774e06cdb990574b1d4deff5a8019f825e7 Mon Sep 17 00:00:00 2001 From: Joshua Lochner <26504141+xenova@users.noreply.github.com> Date: Fri, 18 Jul 2025 19:23:48 -0400 Subject: [PATCH 5/5] Formatting --- tests/tokenizers.test.js | 79 ++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 23 deletions(-) diff --git a/tests/tokenizers.test.js b/tests/tokenizers.test.js index d815d2fc7..5f7a4a138 100644 --- a/tests/tokenizers.test.js +++ b/tests/tokenizers.test.js @@ -54,7 +54,6 @@ describe("Tokenizer padding/truncation", () => { }, MAX_TOKENIZER_LOAD_TIME); describe("return_tensor=false (jagged array)", () => { - test("jagged array output when return_tensor is false", () => { const output = tokenizer(inputs, { 
return_tensor: false, @@ -105,7 +104,6 @@ describe("Tokenizer padding/truncation", () => { compare(output, expected); }); - test("No padding, max_length=3 (implicit truncation strategy)", () => { const output = tokenizer(inputs_2, { padding: false, @@ -129,9 +127,18 @@ describe("Tokenizer padding/truncation", () => { return_tensor: false, }); const expected = { - input_ids: [[1037, 0, 0, 0, 0], [1038, 1039, 1040, 1041, 1042]], - token_type_ids: [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], - attention_mask: [[1, 0, 0, 0, 0], [1, 1, 1, 1, 1]], + input_ids: [ + [1037, 0, 0, 0, 0], + [1038, 1039, 1040, 1041, 1042], + ], + token_type_ids: [ + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + ], + attention_mask: [ + [1, 0, 0, 0, 0], + [1, 1, 1, 1, 1], + ], }; compare(output, expected); }); @@ -161,48 +168,75 @@ describe("Tokenizer padding/truncation", () => { return_tensor: false, }); const expected = { - input_ids: [[1037, 0, 0], [1038, 1039, 1040]], - token_type_ids: [[0, 0, 0], [0, 0, 0]], - attention_mask: [[1, 0, 0], [1, 1, 1]], + input_ids: [ + [1037, 0, 0], + [1038, 1039, 1040], + ], + token_type_ids: [ + [0, 0, 0], + [0, 0, 0], + ], + attention_mask: [ + [1, 0, 0], + [1, 1, 1], + ], }; compare(output, expected); }); test("Padding 'max_length' without truncation, max_length=3", () => { const output = tokenizer(inputs_2, { - padding: 'max_length', + padding: "max_length", truncation: false, max_length: 3, add_special_tokens: false, return_tensor: false, }); const expected = { - input_ids: [[1037, 0, 0], [1038, 1039, 1040, 1041, 1042]], - token_type_ids: [[0, 0, 0], [0, 0, 0, 0, 0]], - attention_mask: [[1, 0, 0], [1, 1, 1, 1, 1]], + input_ids: [ + [1037, 0, 0], + [1038, 1039, 1040, 1041, 1042], + ], + token_type_ids: [ + [0, 0, 0], + [0, 0, 0, 0, 0], + ], + attention_mask: [ + [1, 0, 0], + [1, 1, 1, 1, 1], + ], }; compare(output, expected); }); test("Padding 'max_length' with truncation, max_length=3", () => { const output = tokenizer(inputs_2, { - padding: 'max_length', + padding: "max_length", truncation: true, max_length: 3, add_special_tokens: false, return_tensor: false, }); const expected = { - input_ids: [[1037, 0, 0], [1038, 1039, 1040]], - token_type_ids: [[0, 0, 0], [0, 0, 0]], - attention_mask: [[1, 0, 0], [1, 1, 1]], + input_ids: [ + [1037, 0, 0], + [1038, 1039, 1040], + ], + token_type_ids: [ + [0, 0, 0], + [0, 0, 0], + ], + attention_mask: [ + [1, 0, 0], + [1, 1, 1], + ], }; compare(output, expected); }); test("Padding 'max_length' without truncation and max_length=null", () => { const output = tokenizer(inputs_2, { - padding: 'max_length', + padding: "max_length", truncation: false, max_length: null, add_special_tokens: false, @@ -211,15 +245,15 @@ describe("Tokenizer padding/truncation", () => { const expected = { input_ids: [ [1037, ...Array(511).fill(0)], - [1038, 1039, 1040, 1041, 1042, ...Array(507).fill(0)] + [1038, 1039, 1040, 1041, 1042, ...Array(507).fill(0)], ], token_type_ids: [ [0, ...Array(511).fill(0)], - [0, 0, 0, 0, 0, ...Array(507).fill(0)] + [0, 0, 0, 0, 0, ...Array(507).fill(0)], ], attention_mask: [ [1, ...Array(511).fill(0)], - [1, 1, 1, 1, 1, ...Array(507).fill(0)] + [1, 1, 1, 1, 1, ...Array(507).fill(0)], ], }; compare(output, expected); @@ -227,7 +261,6 @@ describe("Tokenizer padding/truncation", () => { }); describe("return_tensor=true", () => { - test("throws error when tensor output is requested for a jagged array", () => { expect(() => tokenizer(inputs)).toThrow("Unable to create tensor"); }); @@ -329,7 +362,7 @@ describe("Tokenizer padding/truncation", () => { 
test("padding:'max_length' pads to the specified max_length", () => { const { input_ids, attention_mask, token_type_ids } = tokenizer(inputs, { - padding: 'max_length', + padding: "max_length", truncation: true, add_special_tokens: false, max_length: 3, @@ -347,7 +380,7 @@ describe("Tokenizer padding/truncation", () => { [0n, 0n, 0n], ]); }); - }) + }); }); describe("Token type ids", () => {