Search notes:

Python library: sentencepiece

A Python wrapper for SentencePiece.

from sentencepiece import SentencePieceProcessor

tokenizer = SentencePieceProcessor('tokenizer.model')
vocabSize = tokenizer.vocab_size()
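
A minimal round-trip sketch: encode() accepts an out_type argument to choose between ids and pieces (the exact segmentation depends on the model file):

from sentencepiece import SentencePieceProcessor

tokenizer = SentencePieceProcessor('tokenizer.model')

ids = tokenizer.encode('Hello world')                   # list of vocab ids
pieces = tokenizer.encode('Hello world', out_type=str)  # list of pieces; word-initial pieces carry the ▁ marker
text = tokenizer.decode(ids)                            # back to 'Hello world'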

SentencePieceProcessor

_add_bos bool object
_add_eos bool object
_alpha float object
bos_id()
CalculateEntropy()
_CalculateEntropy()
calculate_entropy()
_CalculateEntropyBatch()
Decode()
decode()
DecodeIds()
_DecodeIds()
decode_ids()
DecodeIdsAsImmutableProto()
_DecodeIdsAsImmutableProto()
decode_ids_as_immutable_proto()
_DecodeIdsAsImmutableProtoBatch()
DecodeIdsAsSerializedProto()
_DecodeIdsAsSerializedProto()
decode_ids_as_serialized_proto()
_DecodeIdsAsSerializedProtoBatch()
_DecodeIdsBatch()
DecodePieces()
_DecodePieces()
decode_pieces()
DecodePiecesAsImmutableProto()
_DecodePiecesAsImmutableProto()
decode_pieces_as_immutable_proto()
_DecodePiecesAsImmutableProtoBatch()
DecodePiecesAsSerializedProto()
_DecodePiecesAsSerializedProto()
decode_pieces_as_serialized_proto()
_DecodePiecesAsSerializedProtoBatch()
_DecodePiecesBatch()
Detokenize()
detokenize()
_emit_unk_piece bool object
_enable_sampling bool object
Encode()
encode()
EncodeAsIds()
_EncodeAsIds()
encode_as_ids()
_EncodeAsIdsBatch()
EncodeAsImmutableProto()
_EncodeAsImmutableProto()
encode_as_immutable_proto()
_EncodeAsImmutableProtoBatch()
EncodeAsPieces()
_EncodeAsPieces()
encode_as_pieces()
_EncodeAsPiecesBatch()
EncodeAsSerializedProto()
_EncodeAsSerializedProto()
encode_as_serialized_proto()
_EncodeAsSerializedProtoBatch()
eos_id()
GetPieceSize()
get_piece_size()
GetScore()
get_score() Returns the given id's score (usually the emission log probability under the unigram language model).
IdToPiece()
id_to_piece() Returns a string representation of the token with the given id. Compare with piece_to_id().
Init()
init()
IsByte()
is_byte()
IsControl()
is_control() True if the given id is a control token (e.g. <s>, </s>).
IsUnknown()
is_unknown() True if the given id is the unknown token (e.g. <unk>).
IsUnused()
is_unused()
Load()
load()
LoadFromFile()
load_from_file()
LoadFromSerializedProto()
load_from_serialized_proto()
LoadVocabulary()
load_vocabulary()
NBestEncode()
nbest_encode()
NBestEncodeAsIds()
_NBestEncodeAsIds()
nbest_encode_as_ids()
NBestEncodeAsImmutableProto()
_NBestEncodeAsImmutableProto()
nbest_encode_as_immutable_proto()
NBestEncodeAsPieces()
_NBestEncodeAsPieces()
nbest_encode_as_pieces()
NBestEncodeAsSerializedProto()
_NBestEncodeAsSerializedProto()
nbest_encode_as_serialized_proto()
_nbest_size int object
_num_threads int object
_out_type ?
pad_id()
piece_size()
PieceToId()
piece_to_id() Returns the vocab id for the given string. Compare with id_to_piece().
ResetVocabulary()
reset_vocabulary()
_reverse bool object
SampleEncodeAndScore()
sample_encode_and_score()
SampleEncodeAndScoreAsIds()
_SampleEncodeAndScoreAsIds()
sample_encode_and_score_as_ids()
SampleEncodeAndScoreAsImmutableProto()
_SampleEncodeAndScoreAsImmutableProto()
sample_encode_and_score_as_immutable_proto()
SampleEncodeAndScoreAsPieces()
_SampleEncodeAndScoreAsPieces()
sample_encode_and_score_as_pieces()
SampleEncodeAndScoreAsSerializedProto()
_SampleEncodeAndScoreAsSerializedProto()
sample_encode_and_score_as_serialized_proto()
SampleEncodeAsIds()
sample_encode_as_ids()
SampleEncodeAsImmutableProto()
sample_encode_as_immutable_proto()
SampleEncodeAsPieces()
sample_encode_as_pieces()
SampleEncodeAsSerializedProto()
sample_encode_as_serialized_proto()
serialized_model_proto()
SetDecodeExtraOptions()
set_decode_extra_options()
SetEncodeExtraOptions()
set_encode_extra_options() Sets extra options (for example 'bos:eos' to add <s> and </s>, 'reverse' to reverse the input, 'reverse:bos:eos' to do both, …); see the sketch after this list.
SetVocabulary()
set_vocabulary()
this SwigPyObject object
thisown bool object
Tokenize()
tokenize()
unk_id()
vocab_size()
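
A short sketch exercising several of the calls listed above. Special-token ids, lookups, and scores are all model-dependent, and the piece '▁the' is only an illustrative guess:

from sentencepiece import SentencePieceProcessor

tokenizer = SentencePieceProcessor('tokenizer.model')

# Special-token ids; pad_id() is commonly -1 when no pad token was trained
print(tokenizer.unk_id(), tokenizer.bos_id(), tokenizer.eos_id(), tokenizer.pad_id())

# Piece/id lookups and the unigram emission log probability
someId = tokenizer.piece_to_id('▁the')
print(someId, tokenizer.id_to_piece(someId), tokenizer.get_score(someId))

# Token classification
print(tokenizer.is_control(tokenizer.bos_id()))   # True: <s> is a control token
print(tokenizer.is_unknown(tokenizer.unk_id()))   # True

# 'bos:eos' makes every following encode() wrap the output in <s> … </s>
tokenizer.set_encode_extra_options('bos:eos')
print(tokenizer.encode('Hello world'))

The full example below combines is_unknown(), is_control(), is_byte(), id_to_piece() and get_score() to dump a whole vocabulary with scores.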
import sys
from sentencepiece import SentencePieceProcessor

tokenizer = SentencePieceProcessor('LLaMA/tokenizer.model')

print(f'vocab size: {tokenizer.vocab_size()}')

with open('vocab.txt', 'w') as vocabFile:
    for vocabId in range(tokenizer.vocab_size()):
        vocabFile.write(f'{vocabId:>6} ')

        if tokenizer.is_unknown(vocabId):
            piece = '⁇'
        elif tokenizer.is_control(vocabId):
            piece = tokenizer.id_to_piece(vocabId) + ' (control)'
        elif tokenizer.is_byte(vocabId):
            # byte-fallback pieces look like '<0x0A>': always 6 characters
            piece = tokenizer.id_to_piece(vocabId)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
#           byte_value = int(piece[3:-1], 16)
        else:
            piece = tokenizer.id_to_piece(vocabId)
#           piece = tokenizer.id_to_piece(vocabId).replace("\u2581", " ") # .encode() # u2581 = ▁ (Lower One Eighth Block)

        vocabFile.write(f'{piece:<25}')

        score = tokenizer.get_score(vocabId)
        vocabFile.write(f'{score:14.1f}')
        vocabFile.write('\n')
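
The sampling members above (_enable_sampling, _alpha, _nbest_size, and the SampleEncode*/NBestEncode* families) support subword regularization. A minimal sketch, assuming a unigram model; the positional nbest_size argument to nbest_encode_as_pieces() is an assumption:

from sentencepiece import SentencePieceProcessor

tokenizer = SentencePieceProcessor('tokenizer.model')

# Sample a different segmentation on each call; nbest_size=-1 samples from all
# candidates, and alpha sharpens (higher) or flattens (lower) the distribution
for _ in range(3):
    print(tokenizer.encode('Hello world', out_type=str,
                           enable_sampling=True, alpha=0.1, nbest_size=-1))

# The n best segmentations (nbest size passed positionally; assumed signature)
print(tokenizer.nbest_encode_as_pieces('Hello world', 3))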
