# transformers/utils/rules.toml

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This file can carry repo-local rule overrides for faster iteration between
# `transformers-mlinter` releases.
# Keep it synced with the upstream package's rules.toml when possible so local
# behavior does not drift from the published checker longer than necessary.

# NOTE(review): presumably the format/schema version of this rules file as read
# by the checker, not a package version — confirm before changing.
version = 1

# TRF001: config_class naming consistency on <Model>PreTrainedModel classes.
[rules.TRF001]
description = "Class-level config_class on <Model>PreTrainedModel should match <Model>Config naming."
default_enabled = true
# NOTE(review): presumably the models exempted from this rule — confirm how the
# checker applies allowlist_models.
allowlist_models = ["qwen3_omni_moe"]

[rules.TRF001.explanation]
what_it_does = "Checks naming consistency between <Model>PreTrainedModel and config_class."
why_bad = "Mismatched config_class can break loading, auto classes, and developer expectations."
# Example fix stored verbatim in a multi-line literal string. The first column
# of each line is a diff marker: space for context, minus for removed, plus for
# added.
diff = '''
 class AcmePreTrainedModel(PreTrainedModel):
-    config_class = WileConfig
+    config_class = AcmeConfig
'''

# TRF002: validates base_model_prefix on PreTrainedModel classes when it is set.
[rules.TRF002]
description = "base_model_prefix should be a non-empty canonical string when defined on PreTrainedModel classes."
default_enabled = true
# NOTE(review): lighton_ocr is presumably exempt from this check — confirm how
# the checker applies allowlist_models.
allowlist_models = ["lighton_ocr"]

[rules.TRF002.explanation]
what_it_does = "Checks that base_model_prefix, when set, is a non-empty, whitespace-free string literal."
why_bad = "Invalid prefixes can break weight loading key mapping and base model access patterns."
# Example fix, kept verbatim in a multi-line literal string: the empty prefix is
# replaced by a non-empty one.
diff = '''
 class AcmePreTrainedModel(PreTrainedModel):
-    base_model_prefix = ""
+    base_model_prefix = "model"
'''

# TRF003: flags manual return_dict branching in forward().
[rules.TRF003]
description = "forward() should use capture_output/can_return_tuple decorators instead of manual return_dict branching."
# Shipped disabled, unlike the neighbouring rules in this file.
default_enabled = false
# No models are allowlisted for this rule.
allowlist_models = []

[rules.TRF003.explanation]
what_it_does = "Detects forward methods that use the old 'if not return_dict: return (x,)' pattern."
why_bad = "The old return_dict branching pattern is error-prone and verbose. Use the capture_output or can_return_tuple decorators instead."
# Example fix stored verbatim: the whole old forward() is removed and replaced
# by a decorated version without the return_dict parameter.
diff = '''
-def forward(self, x, return_dict=None):
-    if not return_dict:
-        return (x,)
-    return AcmeModelOutput(last_hidden_state=x)
+@can_return_tuple
+def forward(self, x):
+    return AcmeModelOutput(last_hidden_state=x)
'''

# TRF004: forbids defining tie_weights on model classes.
[rules.TRF004]
description = "Models must never override tie_weights. Use _tied_weights_keys instead."
default_enabled = true
# One model per line so additions and removals show up as single-line diffs.
allowlist_models = [
    "data2vec",
    "hubert",
    "sew",
    "sew_d",
    "unispeech",
    "unispeech_sat",
    "wav2vec2",
    "wav2vec2_conformer",
    "wavlm",
]

[rules.TRF004.explanation]
what_it_does = "Checks that no model class defines a tie_weights method."
why_bad = "Overriding tie_weights leads to bad consequences for loading, device_map computation, and saving. Use _tied_weights_keys class attribute to declare tied weights instead."
# Example fix stored verbatim in a multi-line literal string; the first column
# of each line is the diff marker.
diff = '''
-def tie_weights(self):
-    self.lm_head.weight = self.emb.weight
+class AcmeForCausalLM(AcmePreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
'''

# TRF005: validates the shape of _no_split_modules when it is defined.
[rules.TRF005]
description = "_no_split_modules, when defined, should be a list/tuple of non-empty strings."
default_enabled = true
# One model per line so additions and removals show up as single-line diffs.
allowlist_models = [
    "d_fine",
    "deformable_detr",
    "glm46v",
    "lw_detr",
    "pp_doclayout_v3",
    "rt_detr",
    "rt_detr_v2",
    "voxtral",
    "voxtral_realtime",
]

[rules.TRF005.explanation]
what_it_does = "Checks the shape of _no_split_modules when present."
why_bad = "Malformed values can break device-map partitioning and sharding behavior."
# Example fix stored verbatim: a non-string entry and an empty string are
# replaced by two non-empty class-name strings.
diff = '''
-_no_split_modules = [SomeLayerClass, ""]
+_no_split_modules = ["AcmeDecoderLayer", "AcmeAttention"]
'''

# TRF006: cache arguments exposed by forward() should be used in its body.
[rules.TRF006]
description = "forward with cache arguments should reference cache control/state variables consistently."
default_enabled = true
# NOTE(review): presumably the models exempted from this rule — confirm how the
# checker applies allowlist_models.
allowlist_models = ["chinese_clip", "evolla", "idefics2", "llama4"]

[rules.TRF006.explanation]
what_it_does = "Checks forward signatures that expose cache arguments for usage of those arguments in method body."
why_bad = "Unused cache arguments can indicate incomplete caching support and inconsistent API behavior."
# Example fix stored verbatim: the added lines make the body reference
# use_cache; lines starting with a space are unchanged context.
diff = '''
 def forward(self, x, past_key_values=None, use_cache=False):
+    if use_cache:
+        ...
     return x
'''

# TRF007: self.post_init() must stay the last step of __init__.
[rules.TRF007]
description = "self.post_init() in __init__ should remain at the end of initialization for PreTrainedModel classes."
default_enabled = true
# One model per line so additions and removals show up as single-line diffs.
allowlist_models = [
    "distilbert",
    "lxmert",
    "mt5",
    "pix2struct",
    "pop2piano",
    "switch_transformers",
    "t5",
]

[rules.TRF007.explanation]
what_it_does = "Checks for self attribute assignments after self.post_init() in __init__."
why_bad = "Mutating model structure after post_init can bypass intended initialization/finalization logic."
# Example fix stored verbatim: the attribute assignment is moved ahead of the
# self.post_init() call.
diff = '''
 def __init__(self, config):
     ...
-    self.post_init()
-    self.proj = nn.Linear(...)
+    self.proj = nn.Linear(...)
+    self.post_init()
'''

# TRF008: rejects empty add_start_docstrings usage on model classes.
[rules.TRF008]
description = "Doc decorators on PreTrainedModel classes should avoid empty add_start_docstrings usage."
default_enabled = true
# Explicit empty allowlist, matching the other rule tables in this file, which
# all declare allowlist_models even when no model is exempt.
allowlist_models = []

[rules.TRF008.explanation]
what_it_does = "Checks add_start_docstrings usage on model classes for non-empty docstring arguments."
why_bad = "Empty decorator usage produces unclear docs and weakens generated API documentation quality."
# Example fix stored verbatim in a multi-line literal string: the empty
# decorator argument is replaced by a real description.
diff = '''
-@add_start_docstrings("")
+@add_start_docstrings("The Acme model.")
 class AcmeModel(AcmePreTrainedModel):
     ...
'''

# TRF009: modeling files should not import implementation code from other
# model packages.
[rules.TRF009]
description = "modeling_<name>.py should avoid importing implementation code from another model package."
default_enabled = true
# NOTE(review): presumably the models exempted from this rule — confirm how the
# checker applies allowlist_models.
allowlist_models = ["dpr", "maskformer", "sam3_video", "vision_text_dual_encoder"]

[rules.TRF009.explanation]
what_it_does = "Checks modeling files for cross-model imports such as transformers.models.other_model.* or from ..other_model.* imports."
why_bad = "Cross-model implementation imports violate the single-file policy and make model behavior harder to inspect and maintain."
# The hash-prefixed lines inside the string below are part of the value (Python
# comments in the example), not TOML comments; keep them intact.
diff = '''
-from transformers.models.llama.modeling_llama import LlamaAttention
+# Keep implementation local to this file.
+# If reusing code, copy it with a # Copied from comment.
'''

# TRF010: direct config subclasses must carry @strict(accept_kwargs=True).
[rules.TRF010]
description = "Direct config definitions must use @strict(accept_kwargs=True)."
default_enabled = true
# NOTE(review): presumably the models exempted from this rule — confirm how the
# checker applies allowlist_models.
allowlist_models = ["nemotron_h", "vibevoice_asr"]

[rules.TRF010.explanation]
what_it_does = "Checks direct PreTrainedConfig/PretrainedConfig subclasses in configuration_*.py and modular_*.py for an explicit @strict(accept_kwargs=True) decorator."
why_bad = "Without strict, new config classes miss the repo's runtime type-validation contract and drift from the dataclass-based config standard."
# Example fix stored verbatim: only the decorator line is added; the class
# lines are unchanged context.
diff = '''
+@strict(accept_kwargs=True)
 class AcmeConfig(PreTrainedConfig):
     ...
'''

# TRF011: forward() must not read custom attributes from submodules, since a
# submodule may be swapped for torch.nn.Identity.
[rules.TRF011]
description = "forward() must not access non-nn.Module attributes on submodules (breaks pipeline parallelism with Identity replacement)."
default_enabled = true
# No models are allowlisted for this rule.
allowlist_models = []

[rules.TRF011.explanation]
what_it_does = "In forward() methods of PreTrainedModel subclasses, checks for attribute accesses on submodules that would not exist on torch.nn.Identity. This includes attribute accesses on loop variables iterating over self.layers, and self.<submodule>.<attr> chains where <attr> is not a standard nn.Module attribute."
why_bad = "Pipeline parallelism may replace any submodule with torch.nn.Identity. Accessing custom attributes (e.g. decoder_layer.attention_type) on a replaced module raises AttributeError at runtime. Per-layer metadata should be read from self.config instead."
# Example fix stored verbatim: the per-layer attribute lookup is replaced by an
# index into self.config.layer_types, using enumerate to get the layer index.
diff = '''
 def forward(self, ...):
-    for decoder_layer in self.layers:
+    for i, decoder_layer in enumerate(self.layers):
         hidden_states = decoder_layer(
             hidden_states,
-            attention_mask=causal_mask_mapping[decoder_layer.attention_type],
+            attention_mask=causal_mask_mapping[self.config.layer_types[i]],
         )
'''

# TRF012: _init_weights must go through the init primitives rather than
# in-place tensor methods.
[rules.TRF012]
description = "_init_weights must use init primitives, not in-place operations on module weights."
default_enabled = true
# No models are allowlisted for this rule.
allowlist_models = []

[rules.TRF012.explanation]
what_it_does = "Checks that _init_weights(self, module) does not use in-place operations (e.g. .normal_(), .zero_()) directly on module weights."
why_bad = "We rely on internal flags set on parameters to track whether they need re-initialization. In-place ops bypass this mechanism. Use the `init` primitives instead."
# Example fix stored verbatim. The line holding only a plus sign is an added
# blank line in the example and must be kept.
diff = '''
+from transformers import initialization as init
+
 def _init_weights(self, module):
-    module.weight.normal_(mean=0.0, std=0.02)
+    init.normal_(module.weight, mean=0.0, std=0.02)
'''

# TRF013: every PreTrainedModel __init__ must call self.post_init().
[rules.TRF013]
description = "PreTrainedModel __init__ must call self.post_init()."
default_enabled = true
# No models are allowlisted for this rule.
allowlist_models = []

[rules.TRF013.explanation]
what_it_does = "Checks that every PreTrainedModel subclass with an __init__ method calls self.post_init(). In modular files, calling super().__init__() is also accepted since it propagates post_init from the parent."
why_bad = "post_init performs essential finalization (weight initialization, gradient checkpointing setup, etc.). Omitting it causes subtle runtime bugs."
# Example fix stored verbatim: the only added line is the trailing
# self.post_init() call.
diff = '''
 class AcmeModel(AcmePreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.layers = nn.ModuleList(...)
+        self.post_init()
'''

# TRF014: native model integrations must not pass or use trust_remote_code.
[rules.TRF014]
description = "`trust_remote_code` should never be used in native model integrations."
default_enabled = true
# No models are allowlisted for this rule.
allowlist_models = []

[rules.TRF014.explanation]
what_it_does = "Checks whether `trust_remote_code` is passed or used in code (e.g. as kwarg) within native model integration files."
why_bad = "`trust_remote_code` allows arbitrary loading, including binaries, which should only be a power feature for users, not a standard use-case. Native integrations must not depend on it, as remote code cannot be reviewed or maintained within transformers."
# Example fix stored verbatim: the trust_remote_code=True keyword is dropped
# from the from_pretrained call.
diff = '''
 class AcmeModel(AcmePreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        self.model = AutoModel.from_pretrained(..., trust_remote_code=True)
+        self.model = AutoModel.from_pretrained(...)
'''

# TRF015: a model with non-empty _tied_weights_keys needs tie_word_embeddings
# declared in its config.
[rules.TRF015]
description = "Models with non-empty _tied_weights_keys must have tie_word_embeddings in their Config."
default_enabled = true
# No models are allowlisted for this rule.
allowlist_models = []

[rules.TRF015.explanation]
what_it_does = "When a PreTrainedModel subclass defines _tied_weights_keys as a non-empty collection, checks that the corresponding configuration file declares a tie_word_embeddings field."
why_bad = "Without tie_word_embeddings in the config, users cannot control weight tying behavior. The model ties weights unconditionally, breaking serialization round-trips and preventing fine-tuning with untied heads."
# The hash-prefixed first line inside the string below is part of the value (a
# Python comment naming the example file), not a TOML comment.
diff = '''
 # configuration_foo.py
 @strict(accept_kwargs=True)
 class FooConfig(PreTrainedConfig):
     hidden_size: int = 768
+    tie_word_embeddings: bool = True
'''