Missing entries in matrix assembled from facet submesh custom kernel

I created a custom kernel using numba and ffcx for a facet submesh. The resulting matrix seems to be missing entries. The matrix should have blocks of the form

[[1.         0.5        0.         0.         0.         0.        ]
 [0.5        1.         0.         0.         0.         0.        ]
 [0.         0.         1.41421356 0.70710678 0.         0.        ]
 [0.         0.         0.70710678 1.41421356 0.         0.        ]
 [0.         0.         0.         0.         1.         0.5       ]
 [0.         0.         0.         0.         0.5        1.        ]]

but some entries are missing. Here’s the MWE which adapts the kernel from dolfinx_hdg/python/demo/hdg_poisson.py at main · jpdean/dolfinx_hdg · GitHub.

import dolfinx
from mpi4py import MPI
from dolfinx import mesh, fem, jit, io, default_scalar_type, default_real_type
from dolfinx.cpp.mesh import cell_num_entities
import numpy as np
import ufl
import dolfinx.fem.petsc as petsc
from ufl import inner, grad, dot, div
from petsc4py import PETSc
from dolfinx.fem import IntegralType
import cffi
import numba
from dolfinx.cpp.fem import Form_float64
import sys

import gmsh
import json

from ffcx.codegeneration.utils import empty_void_pointer
from ffcx.codegeneration.utils import numba_ufcx_kernel_signature as ufcx_signature

# import custom_assembly

dtype = default_scalar_type
rtype = default_real_type

Print = PETSc.Sys.Print
nptype = 'float64'
ffcxtype = 'double'
null64 = np.zeros(0, dtype=np.float64)
null32 = np.zeros(0, dtype=np.int32)
null8 = np.zeros(0, dtype=np.uint8)

ffi = cffi.FFI()

def compute_cell_boundary_facets(msh: dolfinx.mesh.Mesh) -> np.ndarray:
    """Integration entities for integrals around all cell boundaries.

    Returns flat array of (cell_index, local_facet_index) pairs, as
    required by ``ufl.Measure("ds", subdomain_data=...)``.
    (Copied verbatim from demo_hdg.py.)
    """
    tdim = msh.topology.dim
    fdim = tdim - 1
    n_f = cell_num_entities(msh.topology.cell_type, fdim)
    n_c = msh.topology.index_map(tdim).size_local
    return np.vstack(
        (np.repeat(np.arange(n_c), n_f), np.tile(np.arange(n_f), n_c))
    ).T.flatten()


def get_kernel(ufl_form, comm):
    """JIT-compile ufl_form and return its cell-integral tabulate_tensor fn."""
    ufcx_form, _, _ = jit.ffcx_jit(
        comm, ufl_form, form_compiler_options={"scalar_type": ffcxtype}
    )
    attr = f"tabulate_tensor_{nptype}"
    return getattr(ufcx_form.form_integrals[0], attr)


comm = MPI.COMM_WORLD

n = 2
msh = mesh.create_unit_square(comm, n, n, ghost_mode=mesh.GhostMode.none)

tdim = msh.topology.dim
fdim = tdim - 1
num_cell_facets = cell_num_entities(msh.topology.cell_type, fdim)
msh.topology.create_entities(fdim)
facet_imap = msh.topology.index_map(fdim)
num_facets = facet_imap.size_local + facet_imap.num_ghosts
facets = np.arange(num_facets, dtype=np.int32)

facet_mesh, facet_mesh_emap = mesh.create_submesh(msh, fdim, facets)[0:2]

k = 1
V = fem.functionspace(msh, ("Discontinuous Lagrange", k))
Vbar = fem.functionspace(facet_mesh, ("Discontinuous Lagrange", k))

V_dim = V.element.space_dimension
Vbar_dim = Vbar.element.space_dimension

u = ufl.TrialFunction(V)
v = ufl.TestFunction(V)
ubar = ufl.TrialFunction(Vbar)
vbar = ufl.TestFunction(Vbar)

h = ufl.CellDiameter(msh)
n = ufl.FacetNormal(msh)
gamma = 6

dx_c = ufl.Measure("dx", domain=msh)
cell_boundary_facets = compute_cell_boundary_facets(msh)
cell_boundaries = 1  # tag value

ds_c = ufl.Measure("ds", subdomain_data=[(cell_boundaries, cell_boundary_facets)],
                   domain=msh,)

a_11 = gamma * inner(ubar, vbar) * ds_c(cell_boundaries)

kernel11 = get_kernel(a_11, comm)

num_dofs_g = V.dofmap.index_map.size_global

@numba.njit(fastmath=True)
def compute_A11(coords):
    A11 = np.zeros((num_cell_facets * Vbar_dim,
                    num_cell_facets * Vbar_dim),
                   dtype=PETSc.ScalarType)
    A11_f = np.zeros((Vbar_dim, Vbar_dim),
                     dtype=PETSc.ScalarType)

    entity_local_index = np.zeros((1), dtype=np.int32)
    coeffs = np.zeros(0, dtype=dtype)
    consts = np.zeros(0, dtype=dtype)

    for local_f in range(num_cell_facets):
        A11_f.fill(0.0)
        entity_local_index[0] = local_f

        kernel11(ffi.from_buffer(A11_f),
                  ffi.from_buffer(coeffs),
                  ffi.from_buffer(consts),
                  ffi.from_buffer(coords),
                  ffi.from_buffer(entity_local_index),
                  ffi.from_buffer(null8),
                  empty_void_pointer(),
                  )
        # Insert in correct location
        start = local_f * Vbar_dim
        end = start + Vbar_dim
        A11[start:end, start:end] += A11_f[:, :]
    return A11

x_dofs = V.mesh.geometry.dofmap
x = Vbar.mesh.geometry.x
geometry = x[x_dofs][2]
A11_loc = compute_A11(geometry)
Print(A11_loc)

c_signature = numba.types.void(
        numba.types.CPointer(numba.typeof(PETSc.ScalarType())),
        numba.types.CPointer(numba.typeof(PETSc.ScalarType())),
        numba.types.CPointer(numba.typeof(PETSc.ScalarType())),
        numba.types.CPointer(numba.types.double),
        numba.types.CPointer(numba.types.int32),
        numba.types.CPointer(numba.types.uint8))

@numba.cfunc(c_signature, nopython=True, fastmath=True)
def tabulate_tensor_a(A_, w_, c_, coords_, entity_local_index, permutation=ffi.NULL):
    A_local = numba.carray(A_, (num_cell_facets * Vbar_dim,
                                num_cell_facets * Vbar_dim),
                           dtype=PETSc.ScalarType)
    coords = numba.carray(coords_, (num_dofs_g, 3), dtype=PETSc.ScalarType)

    A_local += compute_A11(coords)


cells = np.arange(msh.topology.index_map(tdim).size_local)
facets = np.arange(facet_mesh.topology.index_map(fdim).size_local)

integrals_a = {fem.IntegralType.cell: [(0, tabulate_tensor_a.address, cells, np.array([], dtype=np.int8))]}
formtype = fem.form_cpp_class(dtype)
form_type_compiled = formtype([Vbar._cpp_object, Vbar._cpp_object], integrals_a, [], [], False, mesh=msh._cpp_object, entity_maps=[facet_mesh_emap._cpp_object])
a_cond = fem.Form(form_type_compiled)
A = petsc.assemble_matrix(a_cond, bcs=[])
A.assemble()
A.view()