README.md (2 additions, 0 deletions)
@@ -14,6 +14,8 @@ Update: The choice of the norm or gating (still need to ablate to figure out whi
Update: Nevermind, MLP attention seems to be working, but about the same as dot product attention.

+Update: By using the negative of the euclidean distance in place of the dot product for the higher types in dot product attention, I now see results that are far better than before, as well as better than MLP attention. My conclusion is that the choice of norm and gating is contributing far more to the results in the paper than MLP attention.
+
<a href="https://wandb.ai/lucidrains/equiformer/reports/equiformer-and-mlp-attention---VmlldzozMDQwMTY3?accessToken=xmj0a1c80m8hehylrmbr0hndka8kk1vxmdrmvtmy7r1qgphtnuhq1643cb76zgfo">Running experiment, denoising residue positions in protein sequence</a>
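The update above swaps the dot-product similarity for the negative euclidean distance when comparing features. Below is a minimal sketch of that idea, not the repository's actual implementation; all shapes and names are illustrative assumptions.

```python
# Minimal sketch: negative euclidean distance as the attention similarity,
# compared against plain dot product. Shapes and names are assumed for illustration.
import torch

b, n, d = 2, 32, 64                          # batch, sequence length, feature dim (assumed)
q = torch.randn(b, n, d)
k = torch.randn(b, n, d)

# standard dot product similarity
dot_sim = torch.einsum('b i d, b j d -> b i j', q, k)

# negative euclidean distance as the similarity instead (the "-cdist" variant)
# note: -||q - k||^2 = 2 q.k - ||q||^2 - ||k||^2, so this is dot product
# similarity penalized by the squared norms of the queries and keys
neg_dist_sim = -torch.cdist(q, k)            # (b, n, n)

# either set of logits can then be normalized into attention weights
attn = neg_dist_sim.softmax(dim = -1)
```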
single_headed_kv = False,                 # whether to use single-headed key / values for dot product attention, to save on memory and compute
ff_include_htype_norms = False,           # whether the type0 projection should also involve the norms of all higher types in the feedforward's first projection; this allows all higher types to be gated by the other type norms
-dot_product_attention = True,
-dot_product_attention_use_cdist_sim = True,
+dot_product_attention = True,             # set to False to use MLP attention as proposed in the paper, but dot product attention with -cdist similarity is still far better, and I haven't even rotated distances (rotary embeddings) into the type 0 features yet
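For context, a hedged usage sketch of where these flags might be passed when constructing the model. Only single_headed_kv, ff_include_htype_norms, and dot_product_attention come from this diff; every other hyperparameter and the forward call are illustrative assumptions, not the library's documented API.

```python
# Hypothetical usage sketch - only the three commented flags are taken from this diff;
# all other hyperparameters and the forward call are assumptions.
import torch
from equiformer_pytorch import Equiformer

model = Equiformer(
    num_tokens = 24,                     # assumed
    dim = (32, 16, 8),                   # assumed per-degree feature dimensions
    depth = 2,                           # assumed
    single_headed_kv = False,            # single-headed key / values for dot product attention (saves memory and compute)
    ff_include_htype_norms = False,      # let higher-type norms participate in the feedforward's type0 projection
    dot_product_attention = True         # False would switch to MLP attention as proposed in the paper
)

feats = torch.randint(0, 24, (1, 32))    # assumed token ids
coors = torch.randn(1, 32, 3)            # 3d coordinates
mask = torch.ones(1, 32).bool()

out = model(feats, coors, mask = mask)   # assumed forward signature
```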