Skip to content

Commit c41c5de

Browse files
authored
feat: add left padding support to tokenizers (#1424)
1 parent 9ac7b67 commit c41c5de

2 files changed

Lines changed: 19 additions & 7 deletions

File tree

src/tokenizers/tokenizer.cpp

Lines changed: 18 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -107,13 +107,24 @@ void Tokenizer::pad_tokens(std::vector<int>& tokens,
107 107

108 108
if (final_length > out_tokens.size()) {
109 109
const size_t pad_count = final_length - out_tokens.size();
110-
out_tokens.insert(out_tokens.end(), pad_count, PAD_TOKEN_ID);
111-
112-
if (use_weights) {
113-
out_weights.insert(out_weights.end(), pad_count, 1.0f);
114-
}
115-
if (use_mask) {
116-
out_mask.insert(out_mask.end(), pad_count, 0.0f);
110+
if (pad_left) {
111+
out_tokens.insert(out_tokens.begin(), pad_count, PAD_TOKEN_ID);
112+
113+
if (use_weights) {
114+
out_weights.insert(out_weights.begin(), pad_count, 1.0f);
115+
}
116+
if (use_mask) {
117+
out_mask.insert(out_mask.begin(), pad_count, 0.0f);
118+
}
119+
} else {
120+
out_tokens.insert(out_tokens.end(), pad_count, PAD_TOKEN_ID);
121+
122+
if (use_weights) {
123+
out_weights.insert(out_weights.end(), pad_count, 1.0f);
124+
}
125+
if (use_mask) {
126+
out_mask.insert(out_mask.end(), pad_count, 0.0f);
127+
}
117 128
}
118 129
}
119 130
};

src/tokenizers/tokenizer.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ class Tokenizer {
14 14
std::vector<std::string> special_tokens;
15 15
bool add_bos_token = false;
16 16
bool add_eos_token = false;
17+
bool pad_left = false;
17 18
std::string end_of_word_suffix;
18 19

19 20
virtual std::string decode_token(int token_id) const = 0;

0 commit comments

Comments
 (0)