Spaces:
Runtime error
Runtime error
| import torch | |
def expand_t(t, x):
    """Reshape the time vector ``t`` so that it broadcasts against ``x``.

    Args:
        t: [bsz,], time vector
        x: [bsz, ...], data point

    Returns:
        A view of ``t`` shaped [bsz, 1, ..., 1], with one singleton axis for
        every non-batch axis of ``x``.
    """
    # One trailing singleton axis per non-batch axis of x.
    trailing_ones = (1,) * (x.dim() - 1)
    return t.view(t.size(0), *trailing_ones)
def randn_tensor(shape, noise_repeat, device, dtype=torch.float32):
    """Sample Gaussian noise whose batch axis tiles ``noise_repeat`` independent draws.

    A block of ``noise_repeat`` samples is drawn from a standard normal and
    tiled along dim 0 until it fills the batch, so sample ``i`` is identical
    to sample ``i + noise_repeat``.

    Args:
        shape: full output shape with the batch size first; any sequence of
            ints (tuple, list, ``torch.Size``) of any rank >= 1.
        noise_repeat: number of independent samples to draw; must divide
            ``shape[0]``.
        device: device on which the noise is created.
        dtype: dtype of the noise (default ``torch.float32``).

    Returns:
        A tensor of shape ``shape``.

    Raises:
        ValueError: if ``shape[0]`` is not divisible by ``noise_repeat``.
    """
    # Normalise so that list shapes can be concatenated with a tuple below.
    shape = tuple(shape)
    bsz = shape[0]
    if bsz % noise_repeat != 0:
        raise ValueError(f"Batch size ({bsz}) must be divisible by noise repeat ({noise_repeat})")
    base = torch.randn((noise_repeat,) + shape[1:], device=device, dtype=dtype)
    # Tensor.repeat needs one factor per axis: tile only the batch axis and
    # leave every other axis untouched, whatever the rank of ``shape`` is.
    repeats = (bsz // noise_repeat,) + (1,) * (len(shape) - 1)
    return base.repeat(*repeats)
def rotate_half(x):
    """Swap the two halves of the last axis, negating the half moved to the front.

    The last axis is split at ``x.shape[-1] // 2``; the result is
    ``concat(-second_half, first_half)`` along that axis.
    """
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    return torch.cat((-back, front), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
    """Apply rotary position embedding to the query and key tensors.

    Args:
        q: query tensor.
        k: key tensor.
        cos: cosine table; receives a singleton axis at ``unsqueeze_dim``
            before being multiplied with ``q`` and ``k``.
        sin: sine table; treated the same way as ``cos``.
        unsqueeze_dim: position at which the singleton axis is inserted into
            ``cos`` and ``sin`` (presumably the heads axis of ``q``/``k`` --
            confirm against the caller's layout).

    Returns:
        Tuple ``(q_embed, k_embed)`` where each is
        ``states * cos + rotate_half(states) * sin``.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)

    def _rotate(states):
        # Same rotation formula for queries and keys.
        return (states * cos_b) + (rotate_half(states) * sin_b)

    return _rotate(q), _rotate(k)
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat every key/value head ``n_rep`` times along the head axis.

    Equivalent to ``torch.repeat_interleave(hidden_states, repeats=n_rep, dim=1)``:
    the input goes from (batch, num_key_value_heads, seqlen, head_dim) to
    (batch, num_key_value_heads * n_rep, seqlen, head_dim). When ``n_rep`` is 1
    the input tensor itself is returned.
    """
    # Unpack before the early return so a non-4D input is rejected in every case.
    bsz, kv_heads, seq_len, dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis right after the head axis, broadcast it to n_rep,
    # then fold it into the head axis.
    expanded = hidden_states.unsqueeze(2).expand(bsz, kv_heads, n_rep, seq_len, dim)
    return expanded.reshape(bsz, kv_heads * n_rep, seq_len, dim)
def identity(input: torch.Tensor, *args, **kwargs) -> torch.Tensor:
    """Return ``input`` unchanged; extra positional and keyword arguments are accepted and ignored."""
    return input
def rms_norm(
    input: torch.Tensor,
    normalized_shape: torch.Size,
    eps: float = 1e-6,
) -> torch.Tensor:
    """Scale ``input`` by the reciprocal root-mean-square of its trailing axes.

    The statistics are computed in float32 and the result is cast back to the
    dtype of ``input``. No learnable scale is applied.

    Args:
        input: tensor to normalize.
        normalized_shape: only its length is used; it gives the number of
            trailing axes of ``input`` over which the mean square is taken.
        eps: added to the mean square before the reciprocal square root.

    Returns:
        Tensor with the same shape and dtype as ``input``.
    """
    orig_dtype = input.dtype
    n_norm = len(normalized_shape)
    x32 = input.to(torch.float32)
    # Collapse the normalized axes into one and average the squares over it.
    mean_sq = x32.flatten(-n_norm).pow(2).mean(dim=-1)
    # Re-append one singleton axis per normalized axis so it broadcasts back.
    keep_idx = (Ellipsis,) + (None,) * n_norm
    normed = x32 * torch.rsqrt(mean_sq[keep_idx] + eps)
    return normed.to(orig_dtype)
def layer_norm(
    input: torch.Tensor,
    normalized_shape: torch.Size,
    eps: float = 1e-6,
) -> torch.Tensor:
    """Standardize ``input`` over its trailing axes (zero mean, unit variance).

    The statistics are computed in float32 and the result is cast back to the
    dtype of ``input``. The variance is the mean of squared deviations, and no
    learnable scale or bias is applied.

    Args:
        input: tensor to normalize.
        normalized_shape: only its length is used; it gives the number of
            trailing axes of ``input`` over which mean and variance are taken.
        eps: added to the variance before the reciprocal square root.

    Returns:
        Tensor with the same shape and dtype as ``input``.
    """
    orig_dtype = input.dtype
    n_norm = len(normalized_shape)
    # Index that re-appends one singleton axis per normalized axis.
    keep_idx = (Ellipsis,) + (None,) * n_norm
    x32 = input.to(torch.float32)
    mu = x32.flatten(-n_norm).mean(dim=-1)[keep_idx]
    centered = x32 - mu
    var = centered.flatten(-n_norm).pow(2).mean(dim=-1)[keep_idx]
    return (centered * torch.rsqrt(var + eps)).to(orig_dtype)