From cc1b14bc106a0aedbe9049b11d3b7ea7fe88a650 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Sun, 20 Oct 2024 10:54:30 +0200 Subject: [PATCH 1/8] tests with disabled native frontend extension --- .github/workflows/main.yml | 2 ++ returnn/frontend/_backend.py | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4b10f72f4..207dc2b1b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -350,6 +350,7 @@ jobs: - TEST=rf_array - TEST=rf_attention - TEST=rf_base + - TEST=rf_base RETURNN_FRONTEND_NATIVE=0 - TEST=rf_cond - TEST=rf_const - TEST=rf_container @@ -360,6 +361,7 @@ jobs: - TEST=rf_label_smoothing - TEST=rf_loop - TEST=rf_math + - TEST=rf_math RETURNN_FRONTEND_NATIVE=0 - TEST=rf_normalization - TEST=rf_piecewise_linear - TEST=rf_rec diff --git a/returnn/frontend/_backend.py b/returnn/frontend/_backend.py index 41f14f59f..b5653861e 100644 --- a/returnn/frontend/_backend.py +++ b/returnn/frontend/_backend.py @@ -3,6 +3,8 @@ """ from __future__ import annotations + +import os from typing import Optional, Any, Union, TypeVar, Generic, Type, Callable, Sequence, Dict, Tuple, List import contextlib import numpy @@ -1446,10 +1448,11 @@ def select_backend_torch(): global_backend.__class__ = backend BehaviorVersion.set_min_behavior_version(16) - from returnn.frontend import _native + if os.environ.get("RETURNN_FRONTEND_NATIVE", "").strip() in ("", "1"): + from returnn.frontend import _native - _native.setup() - _native.setup_torch() + _native.setup() + _native.setup_torch() def get_backend_by_tensor(tensor: Tensor, *, fallback: Optional[T2] = None) -> Union[Type[Backend[T]], T2]: From a769b47267f570c44f596ff1e637959a8719dd19 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Sun, 20 Oct 2024 10:57:27 +0200 Subject: [PATCH 2/8] tests for eq, neq, invalid broadcasting --- tests/test_rf_math.py | 60 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/tests/test_rf_math.py b/tests/test_rf_math.py index 5f096de8c..abdb077c3 100644 --- a/tests/test_rf_math.py +++ b/tests/test_rf_math.py @@ -30,6 +30,66 @@ def _forward_step(*, model: _Net, extern_data: TensorDict): run_model(extern_data, lambda *, epoch, step: _Net(), _forward_step) +def test_eq(): + time_dim = Dim(Tensor("time", [batch_dim], dtype="int32")) + in_dim = Dim(7, name="in") + extern_data = TensorDict( + { + "a": Tensor("a", [batch_dim, time_dim, in_dim], dtype="int32"), + "b": Tensor("b", [batch_dim, in_dim], dtype="int32"), + } + ) + + # noinspection PyShadowingNames + def _forward_step(*, extern_data: TensorDict, **_kwargs): + out = extern_data["a"] == extern_data["b"] + out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim)) + + run_model(extern_data, lambda **_kwargs: rf.Module(), _forward_step) + + +def test_neq(): + time_dim = Dim(Tensor("time", [batch_dim], dtype="int32")) + in_dim = Dim(7, name="in") + extern_data = TensorDict( + { + "a": Tensor("a", [batch_dim, time_dim, in_dim], dtype="int32"), + "b": Tensor("b", [batch_dim, in_dim], dtype="int32"), + } + ) + + # noinspection PyShadowingNames + def _forward_step(*, extern_data: TensorDict, **_kwargs): + out = extern_data["a"] != extern_data["b"] + out.mark_as_default_output(shape=(batch_dim, time_dim, in_dim)) + + run_model(extern_data, lambda **_kwargs: rf.Module(), _forward_step) + + +def test_neq_broadcast_exception(): + time_dim = Dim(Tensor("time", [batch_dim], dtype="int32")) + other_time_dim = Dim(Tensor("other_time", 
[batch_dim], dtype="int32")) + extern_data = TensorDict( + { + "a": Tensor("a", [batch_dim, time_dim], dtype="int32"), + "b": Tensor("b", [batch_dim, other_time_dim], dtype="int32"), + } + ) + + # noinspection PyShadowingNames + def _forward_step(*, extern_data: TensorDict, **_kwargs): + a, b = extern_data["a"], extern_data["b"] + try: + _ = a != b + except ValueError as e: + assert "require explicit allow_broadcast_all_sources=True" in str(e) + else: + raise Exception("Expected ValueError") + a.mark_as_default_output(shape=(batch_dim, time_dim)) + + run_model(extern_data, lambda **_kwargs: rf.Module(), _forward_step) + + def test_squared_difference(): time_dim = Dim(Tensor("time", [batch_dim], dtype="int32")) in_dim = Dim(7, name="in") From 2b879ad90373a5fd10738a4ca2cb803194f40ee1 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Sun, 20 Oct 2024 11:03:12 +0200 Subject: [PATCH 3/8] skip native test when not enabled --- tests/test_rf_base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_rf_base.py b/tests/test_rf_base.py index 04f886281..fb12c7f53 100644 --- a/tests/test_rf_base.py +++ b/tests/test_rf_base.py @@ -4,6 +4,7 @@ from __future__ import annotations from typing import Tuple +import os from unittest import SkipTest import _setup_test_env # noqa import returnn.frontend as rf @@ -493,6 +494,9 @@ def test_build_from_dict_func(): def test_build_from_dict_func_native(): + if os.environ.get("RETURNN_FRONTEND_NATIVE", "").strip() not in ("", "1"): + raise SkipTest("RETURNN_FRONTEND_NATIVE not enabled") + from types import BuiltinFunctionType rf.select_backend_torch() # enables some of the native optimizations From 7669979f409c937d6e85627bf66a627d17f3d789 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Sun, 20 Oct 2024 21:14:29 +0200 Subject: [PATCH 4/8] small fix --- tests/test_rf_math.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_rf_math.py b/tests/test_rf_math.py index abdb077c3..d12e05f73 100644 --- a/tests/test_rf_math.py +++ b/tests/test_rf_math.py @@ -82,7 +82,10 @@ def _forward_step(*, extern_data: TensorDict, **_kwargs): try: _ = a != b except ValueError as e: - assert "require explicit allow_broadcast_all_sources=True" in str(e) + print("Got exception:", e) + assert "require explicit allow_broadcast_all_sources=True" in str(e) or "require broadcasting to" in str( + e + ), f"exception unexpected: {e}" else: raise Exception("Expected ValueError") a.mark_as_default_output(shape=(batch_dim, time_dim)) From 2d60c18000f6249c979883711da370a978fee261 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 21 Oct 2024 09:09:29 +0200 Subject: [PATCH 5/8] small fix --- tests/test_rf_math.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_rf_math.py b/tests/test_rf_math.py index d12e05f73..7467df179 100644 --- a/tests/test_rf_math.py +++ b/tests/test_rf_math.py @@ -81,13 +81,13 @@ def _forward_step(*, extern_data: TensorDict, **_kwargs): a, b = extern_data["a"], extern_data["b"] try: _ = a != b - except ValueError as e: + except Exception as e: print("Got exception:", e) assert "require explicit allow_broadcast_all_sources=True" in str(e) or "require broadcasting to" in str( e ), f"exception unexpected: {e}" else: - raise Exception("Expected ValueError") + raise Exception("Expected exception for invalid broadcasting") a.mark_as_default_output(shape=(batch_dim, time_dim)) run_model(extern_data, lambda **_kwargs: rf.Module(), _forward_step) From c24bf7044f70db5837a99ddec1538126864975a5 Mon Sep 17 
00:00:00 2001 From: Albert Zeyer Date: Mon, 21 Oct 2024 10:20:06 +0200 Subject: [PATCH 6/8] RF native compareOrCombine, fix broadcast check Note: Before this fix, broadcasting was just always allowed! --- returnn/frontend/_native/py_utils.hpp | 2 +- returnn/frontend/_native/tensor_ops.cpp | 51 ++++--------------------- 2 files changed, 8 insertions(+), 45 deletions(-) diff --git a/returnn/frontend/_native/py_utils.hpp b/returnn/frontend/_native/py_utils.hpp index 119f31106..28afd925b 100644 --- a/returnn/frontend/_native/py_utils.hpp +++ b/returnn/frontend/_native/py_utils.hpp @@ -31,7 +31,7 @@ class PyObjectScopedRef { Sequence interface contract: - Holds borrowed reference to PyObject*. -- Copy object itself is supposed to be fast, small object. +- Copy PyTupleOrListStaticRef/PyTupleOrListRef itself is supposed to be fast, small object. Methods: diff --git a/returnn/frontend/_native/tensor_ops.cpp b/returnn/frontend/_native/tensor_ops.cpp index 082deaf57..a9851fc2d 100644 --- a/returnn/frontend/_native/tensor_ops.cpp +++ b/returnn/frontend/_native/tensor_ops.cpp @@ -253,41 +253,6 @@ static bool _isSeqSubsetReorderFast(ASeqT subset, BSeqT superset, std::vector superSize) - return false; - std::vector subsetTaken(subSize, false); - for(int j = 0; j < superSize; ++j) { - PyObject* b_ = PyList_GET_ITEM(supersetList, j); - int i = 0; - for(; i < subSize; ++i) { - if(subsetTaken[i]) continue; - PyObject* a_ = PyTuple_GET_ITEM(subsetTuple, i); - if(a_ == b_) break; - } - if(i == subSize) { // not found, try again using rich compare - for(; i < subSize; ++i) { - if(subsetTaken[i]) continue; - PyObject* a_ = PyTuple_GET_ITEM(subsetTuple, i); - int eq = PyObject_RichCompareBool(a_, b_, Py_EQ); - if(eq < 0) { error = true; return false; } - if(eq) break; - } - } - if(i < subSize) - subsetTaken[i] = true; - } - for(int i = 0; i < subSize; ++i) { - if(!subsetTaken[i]) - return false; - } - return true; -} - PyObject* pyTensorCopy(PyObject *self, PyObject *args, PyObject *kwargs) { static const char *kwlist[] = { "tensor", "name", NULL }; PyObject* tensor; @@ -1203,6 +1168,8 @@ static PyObject* compareOrCombine( // collect all dims PyObjectScopedRef allDims = PyList_New(0); if(!allDims) return NULL; + bool aDimsHaveAll = true; + bool bDimsHaveAll = true; for(int i = 0; i < aDimsSeq.size() + bDimsSeq.size(); ++i) { PyObject* dim = i < aDimsSeq.size() ? @@ -1221,8 +1188,10 @@ static PyObject* compareOrCombine( // and this allows for a faster path. 
int aDimsCount = PySequence_Count(aDims, dim); if(aDimsCount < 0) return NULL; + if(aDimsCount == 0) aDimsHaveAll = false; int bDimsCount = PySequence_Count(bDims, dim); if(bDimsCount < 0) return NULL; + if(bDimsCount == 0) bDimsHaveAll = false; if(aDimsCount <= 1 && bDimsCount <= 1) { if(PyList_Append(allDims, dim) < 0) return NULL; continue; @@ -1249,17 +1218,11 @@ static PyObject* compareOrCombine( } PyTupleOrListStaticRef allDimsSeq(allDims); - // check if all dims are in a and b, or whether we need allowBroadcastAllSources - bool error = false; - bool aDimsIsSubset = _isTupleSubsetReorderList(aDims, allDims, error); - if(error) return NULL; - bool bDimsIsSubset = _isTupleSubsetReorderList(bDims, allDims, error); - if(error) return NULL; - if(!aDimsIsSubset && !bDimsIsSubset) { - if(!allowBroadcastAllSources) { + if(!allowBroadcastAllSources) { + if(!aDimsHaveAll && !bDimsHaveAll) { PyErr_Format( PyExc_ValueError, - "compareOrCombine: sources %R %R not allowed with allow_broadcast_all_sources=False", + "compareOrCombine: sources %R %R not allowed, require explicit allow_broadcast_all_sources=True", a, b); return NULL; } From 9146433196261bb5af2924d4d7977d7ccdb266c7 Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 21 Oct 2024 10:36:59 +0200 Subject: [PATCH 7/8] RF sequence_mask, fix, require broadcasting --- returnn/frontend/array_.py | 5 ++++- returnn/frontend/math_.py | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/returnn/frontend/array_.py b/returnn/frontend/array_.py index 6b605cec0..2e29fb139 100644 --- a/returnn/frontend/array_.py +++ b/returnn/frontend/array_.py @@ -615,7 +615,10 @@ def sequence_mask(dims: Union[Dim, Sequence[Dim]], *, device: Optional[str] = No return rf.constant(True, dims=()) mask = True for dim in dyn_dims: - mask = rf.opt_logical_and(mask, dim.get_mask(dim_order=dims, device=device)) + mask = rf.opt_logical_and( + mask, dim.get_mask(dim_order=dims, device=device), allow_broadcast_all_sources=True, dim_order=dims + ) + assert isinstance(mask, Tensor) return mask diff --git a/returnn/frontend/math_.py b/returnn/frontend/math_.py index 36e1ae981..db18cd791 100644 --- a/returnn/frontend/math_.py +++ b/returnn/frontend/math_.py @@ -347,11 +347,23 @@ def opt_logical_or(a: Union[Tensor, bool], b: Union[Tensor, bool]) -> Union[Tens @overload -def opt_logical_and(a: bool, b: bool) -> bool: +def opt_logical_and( + a: bool, + b: bool, + *, + allow_broadcast_all_sources: Optional[bool] = None, + dim_order: Optional[Sequence[Dim]] = None, +) -> bool: """logical and""" -def opt_logical_and(a: Union[Tensor, bool], b: Union[Tensor, bool]) -> Union[Tensor, bool]: +def opt_logical_and( + a: Union[Tensor, bool], + b: Union[Tensor, bool], + *, + allow_broadcast_all_sources: Optional[bool] = None, + dim_order: Optional[Sequence[Dim]] = None, +) -> Union[Tensor, bool]: """logical and""" if isinstance(a, bool): if not a: @@ -361,7 +373,7 @@ def opt_logical_and(a: Union[Tensor, bool], b: Union[Tensor, bool]) -> Union[Ten if not b: return False return a - return combine(a, "logical_and", b) + return combine(a, "logical_and", b, allow_broadcast_all_sources=allow_broadcast_all_sources, dim_order=dim_order) def is_finite(a: Tensor) -> Tensor: From 437fdc6885469e7d446cf26ae0698bed181181af Mon Sep 17 00:00:00 2001 From: Albert Zeyer Date: Mon, 21 Oct 2024 10:41:36 +0200 Subject: [PATCH 8/8] Tensor copy_masked, small fix for broadcasting --- returnn/tensor/_tensor_extra.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/returnn/tensor/_tensor_extra.py b/returnn/tensor/_tensor_extra.py index 318caddc9..aad29d6f9 100644 --- a/returnn/tensor/_tensor_extra.py +++ b/returnn/tensor/_tensor_extra.py @@ -2969,10 +2969,10 @@ def copy_masked( import returnn.frontend as rf - mask = None + mask = True for axis in axes: mask_ = self._dims[axis].get_mask(dim_order=self.dims, device=self.device) - mask = rf.logical_and(mask, mask_) if mask is not None else mask_ + mask = rf.opt_logical_and(mask, mask_, allow_broadcast_all_sources=True, dim_order=self.dims) assert isinstance(mask, _t.Tensor) res = rf.where(mask, self, mask_value) if use_padding_info:
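Behavior sketch (not part of the patch series, written in the style of the tests added in PATCH 2): after PATCH 6, the native compareOrCombine path raises when neither source already carries all dims of the combined result (so both sources would need broadcasting), unless allow_broadcast_all_sources=True is passed explicitly; PATCH 7 and 8 then make sequence_mask, opt_logical_and and Tensor.copy_masked request that broadcast explicitly. The test below condenses the allowed path. It relies on the module-level imports of tests/test_rf_math.py (rf, Tensor, Dim, TensorDict, batch_dim, run_model) and assumes rf.compare accepts the same allow_broadcast_all_sources keyword as the combine() call shown in PATCH 7 (both go through compareOrCombine); treat it as an illustration under those assumptions, not as code from the patches.

def test_neq_broadcast_all_sources():
    # Sketch: same dim setup as test_neq_broadcast_exception, but the broadcast
    # over all sources is requested explicitly, so no exception is expected.
    time_dim = Dim(Tensor("time", [batch_dim], dtype="int32"))
    other_time_dim = Dim(Tensor("other_time", [batch_dim], dtype="int32"))
    extern_data = TensorDict(
        {
            "a": Tensor("a", [batch_dim, time_dim], dtype="int32"),
            "b": Tensor("b", [batch_dim, other_time_dim], dtype="int32"),
        }
    )

    # noinspection PyShadowingNames
    def _forward_step(*, extern_data: TensorDict, **_kwargs):
        a, b = extern_data["a"], extern_data["b"]
        # allow_broadcast_all_sources is assumed here, mirroring combine() as changed in PATCH 7.
        out = rf.compare(a, "not_equal", b, allow_broadcast_all_sources=True)
        assert set(out.dims) == {batch_dim, time_dim, other_time_dim}
        a.mark_as_default_output(shape=(batch_dim, time_dim))

    run_model(extern_data, lambda **_kwargs: rf.Module(), _forward_step)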