I saw this function, but it is never called (galai/galai/model.py, lines 143 to 172 at 3a724f5):
```python
def _set_tokenizer(self, tokenizer_path: str):
    """
    Configures the tokenizer for the model

    Parameters
    ----------
    tokenizer_path : str
        Path for the tokenizer (str)
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # setup padding
    tokenizer.pad_token_id = 1
    tokenizer.pad_token = "<pad>"
    tokenizer.padding_side = "left"

    # setup truncation
    tokenizer.truncation_side = "left"

    # setup special tokens
    tokenizer.bos_token_id = 0
    tokenizer.bos_token = "<s>"

    tokenizer.eos_token_id = 2
    tokenizer.eos_token = "</s>"

    tokenizer.unk_token = "<unk>"
    tokenizer.unk_token_id = 3

    self.tokenizer = tokenizer
```
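As a side note, here is roughly what the left padding configured above would do in practice, if the function were called (a hypothetical sketch; the checkpoint name and inputs are mine, not from the repo):

```python
from transformers import AutoTokenizer

# Assumed checkpoint name; any GALACTICA tokenizer should illustrate the same point
tok = AutoTokenizer.from_pretrained("facebook/galactica-125m")
tok.pad_token_id = 1
tok.pad_token = "<pad>"
tok.padding_side = "left"

# With padding_side="left", the shorter sequence is padded at the front with id 1,
# so both prompts end at the same position (useful for batched generation)
batch = tok(["short", "a much longer example input"], padding=True)
print(batch["input_ids"])
```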
But I'm quite confused: it seems we never use the bos and eos tokens. Even in transformers, they are None if we print out the bos, eos, and pad tokens. Do we actually use bos and eos or not?
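For reference, this is roughly the check I mean (again a minimal sketch, with an assumed checkpoint name):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-125m")

# All three come back as None for me, since _set_tokenizer is never called
# and the checkpoint's tokenizer config leaves them unset
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)

# Nothing that looks like a bos/eos id shows up around the encoded text either
print(tokenizer("hello world").input_ids)
```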