Search notes:

Python library: tiktoken

tiktoken is a fast BPE (Byte pair encoding) tokenizer for use with OpenAI's models.
A feature of byte pair encoders is that they can encode any arbitrary string. If the encoder encounters a word that is not present in its vocabulary, it breaks the word down into tokens that it understands.

Use different encoders to encode the same text

import tiktoken

from tiktoken_ext.openai_public import ENCODING_CONSTRUCTORS

def tokenize(encName):
    """Print the token ids a given encoding produces for a fixed SQL sample.

    The encoding name is printed first, then the list of token ids returned
    by ``encode_ordinary``, then an empty line as a separator.

    Args:
        encName: Key into ``ENCODING_CONSTRUCTORS`` (for example ``'gpt2'``).
            An unknown name makes that lookup fail after the name has
            already been printed.
    """
    # Sample text to tokenize; the leading newline and trailing indentation
    # are part of the input and therefore affect the resulting tokens.
    sample_sql = '''
select
   o.id,
   o.order_date,
   o.amount,
   i.article_nr,
   i.price
from
   orders  o                         join
   items   i o o.id, items.order_id
    '''

    print(encName)

    # Each constructor returns a dict of parameters describing the encoding;
    # only the four entries below are handed to tiktoken.Encoding.
    params = ENCODING_CONSTRUCTORS[encName]()
    encoder = tiktoken.Encoding(
        params['name'],
        pat_str=params['pat_str'],
        mergeable_ranks=params['mergeable_ranks'],
        special_tokens=params['special_tokens'],
    )

    print(encoder.encode_ordinary(sample_sql))
    print('')

#   --------------------------------------------------------

# Run the same sample text through each encoding, in the listed order, so
# the printed token lists can be compared.
for encoding_name in ('gpt2', 'r50k_base', 'p50k_base', 'cl100k_base'):
    tokenize(encoding_name)

Encode, then decode

import tiktoken
from tiktoken_ext.openai_public import ENCODING_CONSTRUCTORS

# Fetch the parameter dict for the 'gpt2' encoding by calling its constructor.
encDict = ENCODING_CONSTRUCTORS['gpt2']()

# Build the encoder from the four entries of the parameter dict.
enc = tiktoken.Encoding(
    encDict['name'],
    pat_str=encDict['pat_str'],
    mergeable_ranks=encDict['mergeable_ranks'],
    special_tokens=encDict['special_tokens'],
)

# A small Python snippet used as the text to encode.
sample_code = '''
def F(txt):
    print(txt)
'''

# Encode the text to token ids and show them ...
tokens = enc.encode(sample_code)
print(tokens)

# ... then decode the ids back to text and show the result.
print(enc.decode(tokens))

See also

tiktoken is a prerequisite for nanoGPT

Index