Open
Labels: 🐛 bug (Something isn't working)
Description
CI fails with dev dependencies: https://github.com/huggingface/trl/actions/runs/19767041653/job/56642242755
ValueError: Could not load tokenizer from trl-internal-testing/tiny-BloomForCausalLM. No tokenizer class could be determined and no SentencePiece model found.
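All four failures below come from the same call, so this should reproduce outside the test suite with just the following two lines (assuming the same dev install of transformers that the CI job uses):

from transformers import AutoTokenizer

# Same call the failing tests make; raises ValueError with dev dependencies
tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-BloomForCausalLM")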
FAILED tests/test_dataset_formatting.py::TestCloneChatTemplate::test_clone - ValueError: Could not load tokenizer from trl-internal-testing/tiny-BloomForCausalLM. No tokenizer class could be determined and no SentencePiece model found.
FAILED tests/test_dataset_formatting.py::TestCloneChatTemplate::test_clone_with_resize - ValueError: Could not load tokenizer from trl-internal-testing/tiny-BloomForCausalLM. No tokenizer class could be determined and no SentencePiece model found.
FAILED tests/test_dataset_formatting.py::TestCloneChatTemplate::test_clone_with_resize_and_extra_tokens_already_in_vocab - ValueError: Could not load tokenizer from trl-internal-testing/tiny-BloomForCausalLM. No tokenizer class could be determined and no SentencePiece model found.
FAILED tests/test_dataset_formatting.py::TestCloneChatTemplate::test_apply_new_chat_template - ValueError: Could not load tokenizer from trl-internal-testing/tiny-BloomForCausalLM. No tokenizer class could be determined and no SentencePiece model found.

Stacktrace:
    def test_clone(self):
        # This tokenizer doesn't have a chat_template by default
>       tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-BloomForCausalLM")

tests/test_dataset_formatting.py:165:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

.venv/lib/python3.12/site-packages/transformers/models/auto/tokenization_auto.py:1110: in from_pretrained
    return _try_load_tokenizer_with_fallbacks(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

tokenizer_class = None
pretrained_model_name_or_path = 'trl-internal-testing/tiny-BloomForCausalLM'
inputs = ()
kwargs = {'_commit_hash': 'fb6f154da5e58e70d7a62bafe35c0f0da99691cb', '_from_auto': True}
def _try_load_tokenizer_with_fallbacks(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs):
    """
    Try to load a tokenizer with backend selection.

    This function routes to the appropriate backend based on the 'backend' parameter:
    - "tokenizers" (default): Uses HuggingFace tokenizers library backend
    - "sentencepiece": Uses SentencePiece backend

    For the tokenizers backend, attempts to load with the following priority:
    1. If tokenizer.json exists, load directly
    2. If any .model file (SPM) exists, try extracting vocab and merges
    3. If vocab.json and merges.txt exist, load with those
    4. Fallback to SentencePieceBackend if available

    Args:
        tokenizer_class: The tokenizer class to instantiate (can be None)
        pretrained_model_name_or_path: Path or model id
        inputs: Additional positional arguments for tokenizer init
        kwargs: Additional keyword arguments (may include 'backend' parameter, defaults to "tokenizers")

    Returns:
        An instantiated tokenizer object

    Raises:
        ValueError: If no tokenizer could be loaded
    """
    # Extract the backend parameter - default to "tokenizers" to prioritize tokenizers backend
    backend = kwargs.pop("backend", "tokenizers")

    # Validate backend parameter
    if backend not in ["sentencepiece", "tokenizers"]:
        logger.warning(
            f"Invalid backend '{backend}' specified. Valid options are 'tokenizers' or 'sentencepiece'. "
            "Defaulting to 'tokenizers' backend."
        )
        backend = "tokenizers"

    # Route to SentencePiece backend if requested
    if backend == "sentencepiece":
        if SentencePieceBackend is None:
            raise ValueError(
                "SentencePiece backend was requested but sentencepiece is not installed. "
                "Please install it with: pip install sentencepiece"
            )
        logger.info("Loading tokenizer with SentencePiece backend")
        # Track files loaded for SentencePiece backend
        spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
        files_loaded = [spm_file] if spm_file else []
        kwargs["backend"] = "sentencepiece"
        kwargs["files_loaded"] = files_loaded
        # Resolve the SPM file path and pass it as vocab_file
        if spm_file is not None:
            resolved_vocab_file = cached_file(
                pretrained_model_name_or_path,
                spm_file,
                cache_dir=kwargs.get("cache_dir"),
                force_download=kwargs.get("force_download", False),
                proxies=kwargs.get("proxies"),
                token=kwargs.get("token"),
                revision=kwargs.get("revision"),
                local_files_only=kwargs.get("local_files_only", False),
                subfolder=kwargs.get("subfolder", ""),
            )
            kwargs["vocab_file"] = resolved_vocab_file
        if isinstance(tokenizer_class, type) and issubclass(tokenizer_class, SentencePieceBackend):
            logger.info("Loading tokenizer with SentencePiece backend using tokenizer class")
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
        return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

    # Route to tokenizers backend (default)
    if backend == "tokenizers":
        if tokenizer_class is not None:
            # Check if tokenizer_class inherits from PreTrainedTokenizer (but not from TokenizersBackend/SentencePieceBackend)
            # These are edge cases with custom logic (e.g., BioGptTokenizer with Moses tokenization)
            from ...tokenization_python import PreTrainedTokenizer

            # Build list of backend classes to check against
            backend_classes = [TokenizersBackend] if TokenizersBackend else []
            if SentencePieceBackend:
                backend_classes.append(SentencePieceBackend)

            # Check if it's a custom PreTrainedTokenizer (not a backend class)
            is_custom_pre_trained = (
                isinstance(tokenizer_class, type)
                and issubclass(tokenizer_class, PreTrainedTokenizer)
                and not any(issubclass(tokenizer_class, bc) for bc in backend_classes)
                and tokenizer_class.__name__ not in ("PythonBackend", "PreTrainedTokenizer")
            )

            # Check if it's a completely custom tokenizer (not PreTrainedTokenizer, not backend class)
            # e.g., MistralCommonBackend which has its own from_pretrained logic
            inherits_from_backend = isinstance(tokenizer_class, type) and any(
                bc and issubclass(tokenizer_class, bc) for bc in backend_classes
            )
            is_completely_custom = (
                isinstance(tokenizer_class, type)
                and not issubclass(tokenizer_class, PythonBackend)
                and not inherits_from_backend
            )

            if is_custom_pre_trained:
                logger.info("Loading tokenizer with custom PreTrainedTokenizer backend (edge case)")
                # Track the backend type for custom tokenizers
                kwargs["backend"] = "custom"
                kwargs["files_loaded"] = []  # Custom tokenizers may load various files
                return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            if is_completely_custom:
                # For completely custom tokenizers (like MistralCommonBackend), try calling from_pretrained directly
                logger.info("Loading tokenizer with custom tokenizer class (non-PreTrainedTokenizer)")
                # Filter out AutoTokenizer-specific kwargs that custom tokenizers don't accept
                custom_kwargs = {k: v for k, v in kwargs.items() if k not in ["backend", "files_loaded"]}
                custom_kwargs["_from_auto"] = True  # Signal that this is called from AutoTokenizer
                return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **custom_kwargs)

            if TokenizersBackend is None:
                raise ValueError(
                    "Tokenizers backend is the default but tokenizers library is not installed. "
                    "Please install it with: pip install tokenizers"
                )
            logger.info("Loading tokenizer with tokenizers backend")
            try:
                return _load_tokenizers_backend(tokenizer_class, pretrained_model_name_or_path, inputs, kwargs)
            except ValueError as e:
                # If tokenizers backend fails, try falling back to SentencePiece backend if available
                spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
                if spm_file is not None and SentencePieceBackend is not None:
                    logger.info(
                        f"Tokenizers backend failed: {e}. "
                        f"Falling back to SentencePieceBackend since {spm_file} file was found."
                    )
                    files_loaded = [spm_file]
                    kwargs["backend"] = "sentencepiece"
                    kwargs["files_loaded"] = files_loaded
                    # Resolve the SPM file path and pass it as vocab_file
                    resolved_vocab_file = cached_file(
                        pretrained_model_name_or_path,
                        spm_file,
                        cache_dir=kwargs.get("cache_dir"),
                        force_download=kwargs.get("force_download", False),
                        proxies=kwargs.get("proxies"),
                        token=kwargs.get("token"),
                        revision=kwargs.get("revision"),
                        local_files_only=kwargs.get("local_files_only", False),
                        subfolder=kwargs.get("subfolder", ""),
                    )
                    kwargs["vocab_file"] = resolved_vocab_file
                    if tokenizer_class is not None and issubclass(tokenizer_class, SentencePieceBackend):
                        logger.info(
                            "Falling back to SentencePiece backend using tokenizer class that inherits from SentencePieceBackend."
                        )
                        return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                    return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                # If no fallback available, try calling tokenizer class directly as last resort
                if hasattr(tokenizer_class, "from_pretrained"):
                    logger.info(
                        f"Tokenizers backend failed: {e}. Trying to load tokenizer directly from tokenizer class."
                    )
                    # Filter out AutoTokenizer-specific kwargs that custom tokenizers don't accept
                    custom_kwargs = {k: v for k, v in kwargs.items() if k not in ["backend", "files_loaded"]}
                    custom_kwargs["_from_auto"] = True  # Signal that this is called from AutoTokenizer
                    return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **custom_kwargs)
                # Re-raise if no fallback options available
                raise

        # If no tokenizer class but tokenizers backend requested, fall back to SentencePiece if available
        spm_file = _find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs)
        if spm_file is not None and SentencePieceBackend is not None:
            logger.info(
                f"Tokenizers backend was requested but no tokenizer class found. "
                f"Falling back to SentencePieceBackend since {spm_file} file was found."
            )
            files_loaded = [spm_file]
            kwargs["backend"] = "sentencepiece"
            kwargs["files_loaded"] = files_loaded
            # Resolve the SPM file path and pass it as vocab_file
            resolved_vocab_file = cached_file(
                pretrained_model_name_or_path,
                spm_file,
                cache_dir=kwargs.get("cache_dir"),
                force_download=kwargs.get("force_download", False),
                proxies=kwargs.get("proxies"),
                token=kwargs.get("token"),
                revision=kwargs.get("revision"),
                local_files_only=kwargs.get("local_files_only", False),
                subfolder=kwargs.get("subfolder", ""),
            )
            kwargs["vocab_file"] = resolved_vocab_file
            if (
                tokenizer_class is not None
                and SentencePieceBackend is not None
                and issubclass(tokenizer_class, SentencePieceBackend)
            ):
                logger.info(
                    "Falling back to SentencePiece backend using tokenizer class that inherits from SentencePieceBackend."
                )
                return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            return SentencePieceBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

>       raise ValueError(
            f"Could not load tokenizer from {pretrained_model_name_or_path}. "
            "No tokenizer class could be determined and no SentencePiece model found."
        )
E       ValueError: Could not load tokenizer from trl-internal-testing/tiny-BloomForCausalLM. No tokenizer class could be determined and no SentencePiece model found.
.venv/lib/python3.12/site-packages/transformers/models/auto/tokenization_auto.py:800: ValueError
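From the locals in the traceback, tokenizer_class is None, and the repo evidently ships no SentencePiece .model file, so every fallback branch in _try_load_tokenizer_with_fallbacks is skipped and the final raise is reached. Passing backend="sentencepiece" would not help for the same reason: the error itself says no SentencePiece model was found. One hypothetical diagnostic (not a fix, and assuming BloomTokenizerFast is still exported on the dev branch) is to bypass AutoTokenizer's resolution and name the concrete class; if this loads, the files on the Hub are intact and the regression is isolated to the new backend-selection path:

# Hypothetical diagnostic, not a fix: skip AutoTokenizer's backend resolution
# by loading with the concrete tokenizer class directly.
from transformers import BloomTokenizerFast

tokenizer = BloomTokenizerFast.from_pretrained("trl-internal-testing/tiny-BloomForCausalLM")
print(tokenizer("hello")["input_ids"])  # if this prints token ids, the Hub files are fine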