Assembly of a row matrix for an integral constraint

Dear users,

I found a performance issue when assembling a row matrix A of global size (1, N). These kind of matrices arise naturally when imposing a global constraint for singular problems, see for instance, Real function spaces — scifem

As it is currently implemented, the assembly of the row matrix for real spaces is carried out in a single core and then distributed. It would be much faster to assemble the transpose matrix A^T of global size (N, 1) and then transpose it.
First I would like to share a minimal example that reproduces the performance issue (see below). Then, I would like to ask you for help in order to create and assemble the transpose (A^T)^T .
I have tried with A_T_T = PETSc.Mat().createTranspose(A_T)[1] but it seems this is not compatible with a following Mat.NEST and LU solver, I get kspConvergedReason = -11 or OUT of Memory. Any idea? [2]

from mpi4py import MPI
from petsc4py import PETSc
import dolfinx.fem.petsc

import numpy as np
from scifem import create_real_functionspace
import ufl
from timeit import default_timer as timer

comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size

M_DG = 150
mesh_DG = dolfinx.mesh.create_unit_square(MPI.COMM_WORLD, M_DG, M_DG, dolfinx.mesh.CellType.triangle, dtype=np.float64)
num_cells = mesh_DG.topology.index_map(mesh_DG.topology.dim).size_global

def assemble_a10(R, V):
    W = ufl.MixedFunctionSpace(V, R)
    u, lmbda = ufl.TrialFunctions(W)
    du, dl = ufl.TestFunctions(W)
    a10 = ufl.inner(u, dl) * ufl.dx
    A_10 = dolfinx.fem.petsc.assemble_matrix(dolfinx.fem.form(a10))
    A_10.assemble()
    return A_10
    
def assemble_a10_transpose(R, V):
    W = ufl.MixedFunctionSpace(V, R)
    u, lmbda = ufl.TrialFunctions(W)
    du, dl = ufl.TestFunctions(W)
    # Assemble transpose: inner
    a01 = ufl.inner(lmbda, du) * ufl.dx
    A_01 = dolfinx.fem.petsc.assemble_matrix(dolfinx.fem.form(a01))
    A_01.assemble()
    A_10 = PETSc.Mat().createTranspose(A_01)   # MATTRANSPOSEVIRTUAL
    return A_10
    
V_DG = dolfinx.fem.functionspace(mesh_DG, ("DG", 1))
R_DG = create_real_functionspace(mesh_DG)

num_dofs_global_DG = V_DG.dofmap.index_map.size_global * V_DG.dofmap.index_map_bs

if MPI.COMM_WORLD.rank ==0:
    print(f"Number of dofs global for DG: {num_dofs_global_DG}")
    
# ============================================================================
# Allreduce to get min/max/avg times across all ranks
# ============================================================================
if rank == 0:
    print("\n--- Load balance statistics ---")
 
comm.Barrier()
t0 = timer()
A_DG = assemble_a10(R_DG, V_DG)
comm.Barrier()
t1 = timer()
local_time = t1 - t0
 
# Gather statistics across ranks
times = comm.allgather(local_time)
times_min = min(times)
times_max = max(times)
times_avg = sum(times) / len(times)
 
if rank == 0:
    print(f"DG: min={times_min:.6f}s, max={times_max:.6f}s, avg={times_avg:.6f}s, imbalance={(times_max/times_min - 1)*100:.1f}%")


comm.Barrier()
t0 = timer()
A_DG_T_T = assemble_a10_transpose(R_DG, V_DG)
comm.Barrier()
t1 = timer()
local_time = t1 - t0

# Gather statistics across ranks
times = comm.allgather(local_time)
times_min = min(times)
times_max = max(times)
times_avg = sum(times) / len(times)
 
if rank == 0:
    print(f"DG transpose: min={times_min:.6f}s, max={times_max:.6f}s, avg={times_avg:.6f}s, imbalance={(times_max/times_min - 1)*100:.1f}%")


The output of the previous file with the command mpirun -np 6 python3 test_poisson.py is:

Number of dofs global for DG: 135000

--- Load balance statistics ---
DG: min=0.960112s, max=0.960114s, avg=0.960113s, imbalance=0.0%
DG transpose: min=0.010416s, max=0.010417s, avg=0.010417s, imbalance=0.0%


  1. assembling (A^T)^T with A_T.transpose() is much slower, maybe we need to allocate first the matrix? ↩︎

  2. Maybe like in fem.petsc with Asub = A.getLocalSubMatrix(is0[i], is1[j]) and then filling the values? ↩︎