Conversation
| config_file_path = Path(__file__).parent / "test_yaml_configs/llama3_config_initalization.yaml" | ||
| n_layer = 4 | ||
| n_embd = 256 | ||
| model = self._get_components(config_file_path=config_file_path, has_bias=has_bias) |
There was a problem hiding this comment.
Please turn this into a @pytest.fixture(scope="module") and the tests into true, separate tests.
| ), | ||
| ComponentEntity( | ||
| "model_initialization", | ||
| "llama3_like", |
There was a problem hiding this comment.
| "llama3_like", | |
| "gpt2_llama3_like", |
It should be clear that this component only works for our gpt2 model.
|
|
||
| self.regex_to_init = { | ||
| # embedding weights | ||
| r"transformer\.wte\.weight": partial(nn.init.normal_, mean=0.0, std=1), |
There was a problem hiding this comment.
Is this really gonna be std=1?
There was a problem hiding this comment.
Similar in function to torchtitan's weight init `nn.init.normal_(self.tok_embeddings.weight)` (you can see it here). `mean=0.0, std=1` are then unnecessary, right? Since they are the default values as per the documentation.
| match_count += 1 | ||
| hits[weight_regex] += 1 | ||
| if match_count == 0: | ||
| logger.warning(f"Parameter {parameter_name} did not match any regex for initialization") |
There was a problem hiding this comment.
should we add a flag which turns this into an error?
| b=2, | ||
| ), | ||
| } | ||
| if bias: |
There was a problem hiding this comment.
Shouldn't this depend on whether the given model has biases or not?
There was a problem hiding this comment.
Adding to that: for llama, titan has `bias` set to `False` for the attention linears — see here.
| b=2, | ||
| ), | ||
| # SwiGLU | ||
| r"transformer\.h\.\w+\.mlp\.(W)\.weight": partial( |
There was a problem hiding this comment.
why the switch to \w+ from \d+ here and below?
|
|
||
| self.regex_to_init = { | ||
| # embedding weights | ||
| r"transformer\.wte\.weight": partial(nn.init.normal_, mean=0.0, std=1), |
There was a problem hiding this comment.
Similar in function to torchtitan's weight init `nn.init.normal_(self.tok_embeddings.weight)` (you can see it here). `mean=0.0, std=1` are then unnecessary, right? Since they are the default values as per the documentation.
| b=2, | ||
| ), | ||
| # final attention projection in attention block | ||
| r"transformer\.h\.\d+\.attn\.c_proj\.weight": partial( |
There was a problem hiding this comment.
This corresponds to the following, right? But there you can see that for the out projection it is `std=init_std`, which can be initialized differently and defaults to depth init — because here we pass `weight_init_std`, which defaults to `depth_init` in titan (here). If we don't want depth init, then it matches the scaled out-projection logic used when `depth_init` is `False` in titan.
| b=2, | ||
| ), | ||
| } | ||
| if bias: |
There was a problem hiding this comment.
Adding to that: for llama, titan has `bias` set to `False` for the attention linears — see here.
| def __init__(self, num_layers: int, n_embd: int, bias: bool) -> None: | ||
| super().__init__() | ||
|
|
||
| self.regex_to_init = { |
There was a problem hiding this comment.
We also need regex patterns for attention_norm, ffn_norm, and the final lm_head_norm, no? Something like:
r"transformer\.h\.\d+\.(attention_norm|ffn_norm)\.weight": nn.init.ones_,
r"transformer\.lm_head_norm\.weight": nn.init.ones_,
What does this PR do?
This PR ..
General Changes
Breaking Changes
Checklist before submitting final PR
- All tests pass (`python tests/tests.py`)
- `CHANGELOG_DEV.md` is updated