Some checks failed
Self-hosted runner (nightly-past-ci-caller) / Get number (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.11 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.10 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.9 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.8 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.7 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.6 (push) Has been cancelled
Self-hosted runner (nightly-past-ci-caller) / TensorFlow 2.5 (push) Has been cancelled
Self-hosted runner (benchmark) / Benchmark (aws-g5-4xlarge-cache) (push) Has been cancelled
Build documentation / build (push) Has been cancelled
Build documentation / build_other_lang (push) Has been cancelled
CodeQL Security Analysis / CodeQL Analysis (push) Has been cancelled
New model PR merged notification / Notify new model (push) Has been cancelled
PR CI / pr-ci (push) Has been cancelled
Slow tests on important models (on Push - A10) / Get all modified files (push) Has been cancelled
Secret Leaks / trufflehog (push) Has been cancelled
Update Transformers metadata / build_and_package (push) Has been cancelled
Slow tests on important models (on Push - A10) / Model CI (push) Has been cancelled
Check Tiny Models / Check tiny models (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Model CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Pipeline CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Example CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / DeepSpeed CI (push) Has been cancelled
Self-hosted runner (Intel Gaudi3 scheduled CI caller) / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI - Flash Attn / Setup (push) Has been cancelled
Nvidia CI - Flash Attn / Model CI (push) Has been cancelled
Nvidia CI / Setup (push) Has been cancelled
Nvidia CI / Model CI (push) Has been cancelled
Nvidia CI / Torch pipeline CI (push) Has been cancelled
Nvidia CI / Example CI (push) Has been cancelled
Nvidia CI / Trainer/FSDP CI (push) Has been cancelled
Nvidia CI / DeepSpeed CI (push) Has been cancelled
Nvidia CI / Quantization CI (push) Has been cancelled
Nvidia CI / Kernels CI (push) Has been cancelled
Doctests / Setup (push) Has been cancelled
Doctests / Call doctest jobs (push) Has been cancelled
Doctests / Send results to webhook (push) Has been cancelled
Extras Smoke Test / Get supported Python versions (push) Has been cancelled
Extras Smoke Test / Test extras on Python ${{ matrix.python-version }} (push) Has been cancelled
Extras Smoke Test / Check Slack token availability (push) Has been cancelled
Extras Smoke Test / Notify failures to Slack (push) Has been cancelled
Self-hosted runner (AMD scheduled CI caller) / Trigger Scheduled AMD CI (push) Has been cancelled
Stale Bot / Close Stale Issues (push) Has been cancelled
252 lines
11 KiB
TOML
252 lines
11 KiB
TOML
# Copyright 2021 The HuggingFace Team. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# This file can carry repo-local rule overrides for faster iteration between
|
|
# `transformers-mlinter` releases.
|
|
# Keep it synced with the upstream package's rules.toml when possible so local
|
|
# behavior does not drift from the published checker longer than necessary.
|
|
|
|
version = 1
|
|
|
|
[rules.TRF001]
|
|
description = "Class-level config_class on <Model>PreTrainedModel should match <Model>Config naming."
|
|
default_enabled = true
|
|
allowlist_models = ["qwen3_omni_moe"]
|
|
|
|
[rules.TRF001.explanation]
|
|
what_it_does = "Checks naming consistency between <Model>PreTrainedModel and config_class."
|
|
why_bad = "Mismatched config_class can break loading, auto classes, and developer expectations."
|
|
diff = '''
|
|
class AcmePreTrainedModel(PreTrainedModel):
|
|
- config_class = WileConfig
|
|
+ config_class = AcmeConfig
|
|
'''
|
|
|
|
[rules.TRF002]
|
|
description = "base_model_prefix should be a non-empty canonical string when defined on PreTrainedModel classes."
|
|
default_enabled = true
|
|
allowlist_models = ["lighton_ocr"]
|
|
|
|
[rules.TRF002.explanation]
|
|
what_it_does = "Checks that base_model_prefix, when set, is a non-empty, whitespace-free string literal."
|
|
why_bad = "Invalid prefixes can break weight loading key mapping and base model access patterns."
|
|
diff = '''
|
|
class AcmePreTrainedModel(PreTrainedModel):
|
|
- base_model_prefix = ""
|
|
+ base_model_prefix = "model"
|
|
'''
|
|
|
|
[rules.TRF003]
|
|
description = "forward() should use capture_output/can_return_tuple decorators instead of manual return_dict branching."
|
|
default_enabled = false
|
|
allowlist_models = []
|
|
|
|
[rules.TRF003.explanation]
|
|
what_it_does = "Detects forward methods that use the old 'if not return_dict: return (x,)' pattern."
|
|
why_bad = "The old return_dict branching pattern is error-prone and verbose. Use the capture_output or can_return_tuple decorators instead."
|
|
diff = '''
|
|
-def forward(self, x, return_dict=None):
|
|
- if not return_dict:
|
|
- return (x,)
|
|
- return AcmeModelOutput(last_hidden_state=x)
|
|
+@can_return_tuple
|
|
+def forward(self, x):
|
|
+ return AcmeModelOutput(last_hidden_state=x)
|
|
'''
|
|
|
|
[rules.TRF004]
|
|
description = "Models must never override tie_weights. Use _tied_weights_keys instead."
|
|
default_enabled = true
|
|
allowlist_models = ["data2vec", "hubert", "sew", "sew_d", "unispeech", "unispeech_sat", "wav2vec2", "wav2vec2_conformer", "wavlm"]
|
|
|
|
[rules.TRF004.explanation]
|
|
what_it_does = "Checks that no model class defines a tie_weights method."
|
|
why_bad = "Overriding tie_weights can break loading, device_map computation, and saving. Use the _tied_weights_keys class attribute to declare tied weights instead."
|
|
diff = '''
|
|
-def tie_weights(self):
|
|
- self.lm_head.weight = self.emb.weight
|
|
+class AcmeForCausalLM(AcmePreTrainedModel):
|
|
+ _tied_weights_keys = ["lm_head.weight"]
|
|
'''
|
|
|
|
[rules.TRF005]
|
|
description = "_no_split_modules, when defined, should be a list/tuple of non-empty strings."
|
|
default_enabled = true
|
|
allowlist_models = ["d_fine", "deformable_detr", "glm46v", "lw_detr", "pp_doclayout_v3", "rt_detr", "rt_detr_v2", "voxtral", "voxtral_realtime"]
|
|
|
|
[rules.TRF005.explanation]
|
|
what_it_does = "Checks the shape of _no_split_modules when present."
|
|
why_bad = "Malformed values can break device-map partitioning and sharding behavior."
|
|
diff = '''
|
|
-_no_split_modules = [SomeLayerClass, ""]
|
|
+_no_split_modules = ["AcmeDecoderLayer", "AcmeAttention"]
|
|
'''
|
|
|
|
[rules.TRF006]
|
|
description = "forward with cache arguments should reference cache control/state variables consistently."
|
|
default_enabled = true
|
|
allowlist_models = ["chinese_clip", "evolla", "idefics2", "llama4"]
|
|
|
|
[rules.TRF006.explanation]
|
|
what_it_does = "Checks that forward methods exposing cache arguments in their signature actually use those arguments in the method body."
|
|
why_bad = "Unused cache arguments can indicate incomplete caching support and inconsistent API behavior."
|
|
diff = '''
|
|
def forward(self, x, past_key_values=None, use_cache=False):
|
|
+ if use_cache:
|
|
+ ...
|
|
return x
|
|
'''
|
|
|
|
[rules.TRF007]
|
|
description = "self.post_init() in __init__ should remain at the end of initialization for PreTrainedModel classes."
|
|
default_enabled = true
|
|
allowlist_models = ["distilbert", "lxmert", "mt5", "pix2struct", "pop2piano", "switch_transformers", "t5"]
|
|
|
|
[rules.TRF007.explanation]
|
|
what_it_does = "Checks for self attribute assignments after self.post_init() in __init__."
|
|
why_bad = "Mutating model structure after post_init can bypass intended initialization/finalization logic."
|
|
diff = '''
|
|
def __init__(self, config):
|
|
...
|
|
- self.post_init()
|
|
- self.proj = nn.Linear(...)
|
|
+ self.proj = nn.Linear(...)
|
|
+ self.post_init()
|
|
'''
|
|
|
|
[rules.TRF008]
|
|
description = "Doc decorators on PreTrainedModel classes should avoid empty add_start_docstrings usage."
|
|
default_enabled = true
|
|
|
|
[rules.TRF008.explanation]
|
|
what_it_does = "Checks add_start_docstrings usage on model classes for non-empty docstring arguments."
|
|
why_bad = "Empty decorator usage produces unclear docs and weakens generated API documentation quality."
|
|
diff = '''
|
|
-@add_start_docstrings("")
|
|
+@add_start_docstrings("The Acme model.")
|
|
class AcmeModel(AcmePreTrainedModel):
|
|
...
|
|
'''
|
|
|
|
[rules.TRF009]
|
|
description = "modeling_<name>.py should avoid importing implementation code from another model package."
|
|
default_enabled = true
|
|
allowlist_models = ["dpr", "maskformer", "sam3_video", "vision_text_dual_encoder"]
|
|
|
|
[rules.TRF009.explanation]
|
|
what_it_does = "Checks modeling files for cross-model imports such as transformers.models.other_model.* or from ..other_model.* imports."
|
|
why_bad = "Cross-model implementation imports violate the single-file policy and make model behavior harder to inspect and maintain."
|
|
diff = '''
|
|
-from transformers.models.llama.modeling_llama import LlamaAttention
|
|
+# Keep implementation local to this file.
|
|
+# If reusing code, copy it with a # Copied from comment.
|
|
'''
|
|
|
|
[rules.TRF010]
|
|
description = "Direct config definitions must use @strict(accept_kwargs=True)."
|
|
default_enabled = true
|
|
allowlist_models = ["nemotron_h", "vibevoice_asr"]
|
|
|
|
[rules.TRF010.explanation]
|
|
what_it_does = "Checks direct PreTrainedConfig/PretrainedConfig subclasses in configuration_*.py and modular_*.py for an explicit @strict(accept_kwargs=True) decorator."
|
|
why_bad = "Without strict, new config classes miss the repo's runtime type-validation contract and drift from the dataclass-based config standard."
|
|
diff = '''
|
|
+@strict(accept_kwargs=True)
|
|
class AcmeConfig(PreTrainedConfig):
|
|
...
|
|
'''
|
|
|
|
[rules.TRF011]
|
|
description = "forward() must not access non-nn.Module attributes on submodules (breaks pipeline parallelism with Identity replacement)."
|
|
default_enabled = true
|
|
allowlist_models = []
|
|
|
|
[rules.TRF011.explanation]
|
|
what_it_does = "In forward() methods of PreTrainedModel subclasses, checks for attribute accesses on submodules that would not exist on torch.nn.Identity. This includes attribute accesses on loop variables iterating over self.layers, and self.<submodule>.<attr> chains where <attr> is not a standard nn.Module attribute."
|
|
why_bad = "Pipeline parallelism may replace any submodule with torch.nn.Identity. Accessing custom attributes (e.g. decoder_layer.attention_type) on a replaced module raises AttributeError at runtime. Per-layer metadata should be read from self.config instead."
|
|
diff = '''
|
|
def forward(self, ...):
|
|
- for decoder_layer in self.layers:
|
|
+ for i, decoder_layer in enumerate(self.layers):
|
|
hidden_states = decoder_layer(
|
|
hidden_states,
|
|
- attention_mask=causal_mask_mapping[decoder_layer.attention_type],
|
|
+ attention_mask=causal_mask_mapping[self.config.layer_types[i]],
|
|
)
|
|
'''
|
|
|
|
[rules.TRF012]
|
|
description = "_init_weights must use init primitives, not in-place operations on module weights."
|
|
default_enabled = true
|
|
allowlist_models = []
|
|
|
|
[rules.TRF012.explanation]
|
|
what_it_does = "Checks that _init_weights(self, module) does not use in-place operations (e.g. .normal_(), .zero_()) directly on module weights."
|
|
why_bad = "We rely on internal flags set on parameters to track whether they need re-initialization. In-place ops bypass this mechanism. Use the `init` primitives instead."
|
|
diff = '''
|
|
+from transformers import initialization as init
|
|
+
|
|
def _init_weights(self, module):
|
|
- module.weight.normal_(mean=0.0, std=0.02)
|
|
+ init.normal_(module.weight, mean=0.0, std=0.02)
|
|
'''
|
|
|
|
[rules.TRF013]
|
|
description = "PreTrainedModel __init__ must call self.post_init()."
|
|
default_enabled = true
|
|
allowlist_models = []
|
|
|
|
[rules.TRF013.explanation]
|
|
what_it_does = "Checks that every PreTrainedModel subclass with an __init__ method calls self.post_init(). In modular files, calling super().__init__() is also accepted since it propagates post_init from the parent."
|
|
why_bad = "post_init performs essential finalization (weight initialization, gradient checkpointing setup, etc.). Omitting it causes subtle runtime bugs."
|
|
diff = '''
|
|
class AcmeModel(AcmePreTrainedModel):
|
|
def __init__(self, config):
|
|
super().__init__(config)
|
|
self.layers = nn.ModuleList(...)
|
|
+ self.post_init()
|
|
'''
|
|
|
|
[rules.TRF014]
|
|
description = "`trust_remote_code` should never be used in native model integrations."
|
|
default_enabled = true
|
|
allowlist_models = []
|
|
|
|
[rules.TRF014.explanation]
|
|
what_it_does = "Checks whether `trust_remote_code` is passed or used in code (e.g. as kwarg) within native model integration files."
|
|
why_bad = "`trust_remote_code` allows loading and executing arbitrary remote code, including binaries, which should only be a power feature for users, not a standard use-case. Native integrations must not depend on it, as remote code cannot be reviewed or maintained within transformers."
|
|
diff = '''
|
|
class AcmeModel(AcmePreTrainedModel):
|
|
def __init__(self, config):
|
|
super().__init__(config)
|
|
- self.model = AutoModel.from_pretrained(..., trust_remote_code=True)
|
|
+ self.model = AutoModel.from_pretrained(...)
|
|
'''
|
|
|
|
[rules.TRF015]
|
|
description = "Models with non-empty _tied_weights_keys must have tie_word_embeddings in their Config."
|
|
default_enabled = true
|
|
allowlist_models = []
|
|
|
|
[rules.TRF015.explanation]
|
|
what_it_does = "When a PreTrainedModel subclass defines _tied_weights_keys as a non-empty collection, checks that the corresponding configuration file declares a tie_word_embeddings field."
|
|
why_bad = "Without tie_word_embeddings in the config, users cannot control weight tying behavior. The model ties weights unconditionally, breaking serialization round-trips and preventing fine-tuning with untied heads."
|
|
diff = '''
|
|
# configuration_foo.py
|
|
@strict(accept_kwargs=True)
|
|
class FooConfig(PreTrainedConfig):
|
|
hidden_size: int = 768
|
|
+ tie_word_embeddings: bool = True
|
|
'''
|