Update V2.py

--- a/V2.py
+++ b/V2.py
@@ -157,57 +157,64 @@ class HyperConv1D(layers.Layer):
 
     def call(self, x, training=None):
         x_in = x
-        x_dtype = x.dtype
-
-        # 1) input projection
-        x_proj = self.input_proj(x)
+        x_dtype = x.dtype  # store the input dtype
 
+        # 1) input projection
+        x_proj = self.input_proj(x)  # (B, L, D)
+
         B = tf.shape(x_proj)[0]
         L = tf.shape(x_proj)[1]
         D = self.d_model
         pad = (self.k - 1) // 2
 
+        # ------------------------------
+        # 2) DynamicConv local mixing
+        # ------------------------------
+        # create the kernels, then match x_proj's dtype
+        kernels = self.kernel_generator(self.dynamic_dense(x_proj))
+        kernels = tf.cast(kernels, x_proj.dtype)
         kernels = tf.nn.softmax(kernels, axis=-1)
 
+        # padding & patch extraction
         x_pad = tf.pad(x_proj, [[0,0],[pad,pad],[0,0]])
         x_pad_4d = tf.expand_dims(x_pad, axis=1) # (B,1,L+k-1,D)
         patches = tf.image.extract_patches(
+            images=x_pad_4d,
+            sizes=[1,1,self.k,1],
+            strides=[1,1,1,1],
+            rates=[1,1,1,1],
+            padding='VALID'
+        )
         patches = tf.reshape(patches, [B, L, self.k, D])
+
+        # match the kernels' shape
         kernels_exp = tf.expand_dims(kernels, axis=-1)
         out_local = tf.reduce_sum(patches * kernels_exp, axis=2) # (B,L,D)
         out_local = self.dynamic_proj(out_local)
 
+        # ------------------------------
+        # 3) Hyper scaling
+        # ------------------------------
         h = self.hyper(x_proj)
         global_z = self.attn_pool(h)
         global_z = tf.nn.softmax(global_z, axis=1)
         global_z = tf.reduce_sum(h * global_z, axis=1)
 
         scale = tf.expand_dims(tf.nn.sigmoid(self.scale_dense(global_z)), 1)
+        scale = tf.cast(scale, x_proj.dtype)  # match dtype
         out_local = out_local * scale
 
+        # ------------------------------
+        # 4) Residual + SiLU + LayerNorm
+        # ------------------------------
         out = x_proj + out_local
         out = tf.nn.silu(out)
         out = self.norm(out)
         out = self.dropout(out, training=training)
 
         return tf.cast(out, x_dtype)
+
+
 class L2NormLayer(layers.Layer):
     def __init__(self, axis=1, epsilon=1e-10, **kwargs):
         super().__init__(**kwargs)
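
Note on what this hunk computes: the DynamicConv section derives one k-wide weight vector per position, extracts the matching k-wide window of x_proj via tf.pad + tf.image.extract_patches, and takes the weighted sum. The snippet below is a minimal standalone sketch of that local-mixing step only; it is not part of V2.py, and the function name, toy shapes, and random inputs are purely illustrative.

import tensorflow as tf

def demo_dynamic_local_mix(x_proj, kernels, k):
    # Pad the sequence, pull out a k-wide window at every position with
    # tf.image.extract_patches, then combine the window with per-position
    # softmax weights -- the same local mixing the hunk performs.
    B = tf.shape(x_proj)[0]
    L = tf.shape(x_proj)[1]
    D = tf.shape(x_proj)[2]
    pad = (k - 1) // 2

    kernels = tf.nn.softmax(kernels, axis=-1)                 # (B, L, k)
    x_pad = tf.pad(x_proj, [[0, 0], [pad, pad], [0, 0]])      # (B, L+k-1, D)
    x_pad_4d = tf.expand_dims(x_pad, axis=1)                  # (B, 1, L+k-1, D)
    patches = tf.image.extract_patches(
        images=x_pad_4d,
        sizes=[1, 1, k, 1],
        strides=[1, 1, 1, 1],
        rates=[1, 1, 1, 1],
        padding='VALID')                                      # (B, 1, L, k*D)
    patches = tf.reshape(patches, [B, L, k, D])
    return tf.reduce_sum(patches * tf.expand_dims(kernels, -1), axis=2)  # (B, L, D)

# Toy shapes: the output keeps the (B, L, D) shape of the projected input.
x = tf.random.normal([2, 10, 8])
w = tf.random.normal([2, 10, 7])   # unnormalized kernels, one k-vector per position
print(demo_dynamic_local_mix(x, w, k=7).shape)   # (2, 10, 8)

Extracting patches this way avoids materializing an explicit (L, L) mixing matrix; each position only ever sees its k neighbours.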
@@ -223,7 +230,7 @@ class SentenceEncoder(Model):
         self.embed = layers.Embedding(vocab_size, embed_dim)
         self.pos_embed = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
         self.dropout = layers.Dropout(dropout_rate)
-        self.blocks = [HyperConv1D(d_model=embed_dim, k=7,
+        self.blocks = [HyperConv1D(d_model=embed_dim, k=7, hyper_dim=256) for _ in range(4)]
         self.attn_pool = layers.Dense(1)
         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
         self.latent = layers.Dense(latent_dim, activation=None)
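
The tf.cast calls added in the first hunk (kernels and scale are cast to x_proj.dtype, and the block still returns tf.cast(out, x_dtype)) read like guards for running HyperConv1D under a Keras mixed-precision policy; that motivation is an inference from the diff, not something it states. The sketch below reproduces the pattern in isolation, with illustrative layer names and a scale branch deliberately kept in float32.

import tensorflow as tf
from tensorflow.keras import layers, mixed_precision

mixed_precision.set_global_policy('mixed_float16')

proj = layers.Dense(8)                           # emits float16 under the policy
scale_dense = layers.Dense(8, dtype='float32')   # branch deliberately kept in float32

x = tf.random.normal([2, 10, 4])                 # float32 input
x_dtype = x.dtype                                # remember the caller's dtype
x_proj = proj(x)                                 # float16
scale = tf.nn.sigmoid(scale_dense(tf.reduce_mean(x_proj, axis=1)))   # float32
scale = tf.cast(scale, x_proj.dtype)             # align dtypes, as the commit does
out = x_proj * tf.expand_dims(scale, axis=1)     # float16 * float16: no dtype error
out = tf.cast(out, x_dtype)                      # hand the original dtype back
print(out.dtype)                                 # float32

mixed_precision.set_global_policy('float32')     # reset the global policy

Without the explicit cast, the float16 * float32 multiply raises a dtype-mismatch error in TensorFlow, which is exactly what aligning everything to x_proj.dtype avoids.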