# Override the Llama 3 pre-tokenizer so that runs of digits are chunked into
# groups of three counted from the RIGHT (e.g. thousands groups), then print
# the resulting tokens for a sample number.
from transformers import AutoTokenizer
from tokenizers import pre_tokenizers, Regex

# 1-3 digits that are followed only by complete groups of three digits up to
# a word boundary. The lookahead forces the short (1-2 digit) group to be the
# leftmost one, so "42069" is isolated as "42" + "069" rather than "420" + "69".
_DIGIT_GROUP_PATTERN = r"\d{1,3}(?=(\d{3})*\b)"

# General split pattern applied after the digit grouping: contractions,
# letter runs, digit runs of up to three, punctuation runs, and whitespace.
# NOTE(review): this looks like the stock Llama 3 split regex - confirm against
# the model's tokenizer.json if the checkpoint changes.
_SPLIT_PATTERN = (
    r"(?i:'s|'t|'re|'ve|'m|'ll|'d)"
    r"|[^\r\n\p{L}\p{N}]?\p{L}+"
    r"|\p{N}{1,3}"
    r"| ?[^\s\p{L}\p{N}]+[\r\n]*"
    r"|\s*[\r\n]+"
    r"|\s+(?!\S)"
    r"|\s+"
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")

# Replace the pre-tokenizer on the underlying `tokenizers` object. The stage
# order matters: digit groups are isolated first, so the later `\p{N}{1,3}`
# alternative only ever sees pieces that are already at most three digits long.
tokenizer._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [
        pre_tokenizers.Split(
            pattern=Regex(_DIGIT_GROUP_PATTERN),
            behavior="isolated",
            invert=False,
        ),
        pre_tokenizers.Split(
            pattern=Regex(_SPLIT_PATTERN),
            behavior="isolated",
            invert=False,
        ),
        # use_regex=False: splitting is already handled by the two stages above.
        pre_tokenizers.ByteLevel(
            add_prefix_space=False,
            trim_offsets=True,
            use_regex=False,
        ),
    ]
)

print(tokenizer.tokenize("42069"))