Dear users,
I found a performance issue when assembling a row matrix A of global size (1, N). These kind of matrices arise naturally when imposing a global constraint for singular problems, see for instance, Real function spaces — scifem
As it is currently implemented, the assembly of the row matrix for real spaces is carried out in a single core and then distributed. It would be much faster to assemble the transpose matrix A^T of global size (N, 1) and then transpose it.
First I would like to share a minimal example that reproduces the performance issue (see below). Then, I would like to ask you for help in order to create and assemble the transpose (A^T)^T .
I have tried with A_T_T = PETSc.Mat().createTranspose(A_T)[1] but it seems this is not compatible with a following Mat.NEST and LU solver, I get kspConvergedReason = -11 or OUT of Memory. Any idea? [2]
from mpi4py import MPI
from petsc4py import PETSc
import dolfinx.fem.petsc
import numpy as np
from scifem import create_real_functionspace
import ufl
from timeit import default_timer as timer
comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size
M_DG = 150
mesh_DG = dolfinx.mesh.create_unit_square(MPI.COMM_WORLD, M_DG, M_DG, dolfinx.mesh.CellType.triangle, dtype=np.float64)
num_cells = mesh_DG.topology.index_map(mesh_DG.topology.dim).size_global
def assemble_a10(R, V):
W = ufl.MixedFunctionSpace(V, R)
u, lmbda = ufl.TrialFunctions(W)
du, dl = ufl.TestFunctions(W)
a10 = ufl.inner(u, dl) * ufl.dx
A_10 = dolfinx.fem.petsc.assemble_matrix(dolfinx.fem.form(a10))
A_10.assemble()
return A_10
def assemble_a10_transpose(R, V):
W = ufl.MixedFunctionSpace(V, R)
u, lmbda = ufl.TrialFunctions(W)
du, dl = ufl.TestFunctions(W)
# Assemble transpose: inner
a01 = ufl.inner(lmbda, du) * ufl.dx
A_01 = dolfinx.fem.petsc.assemble_matrix(dolfinx.fem.form(a01))
A_01.assemble()
A_10 = PETSc.Mat().createTranspose(A_01) # MATTRANSPOSEVIRTUAL
return A_10
V_DG = dolfinx.fem.functionspace(mesh_DG, ("DG", 1))
R_DG = create_real_functionspace(mesh_DG)
num_dofs_global_DG = V_DG.dofmap.index_map.size_global * V_DG.dofmap.index_map_bs
if MPI.COMM_WORLD.rank ==0:
print(f"Number of dofs global for DG: {num_dofs_global_DG}")
# ============================================================================
# Allreduce to get min/max/avg times across all ranks
# ============================================================================
if rank == 0:
print("\n--- Load balance statistics ---")
comm.Barrier()
t0 = timer()
A_DG = assemble_a10(R_DG, V_DG)
comm.Barrier()
t1 = timer()
local_time = t1 - t0
# Gather statistics across ranks
times = comm.allgather(local_time)
times_min = min(times)
times_max = max(times)
times_avg = sum(times) / len(times)
if rank == 0:
print(f"DG: min={times_min:.6f}s, max={times_max:.6f}s, avg={times_avg:.6f}s, imbalance={(times_max/times_min - 1)*100:.1f}%")
comm.Barrier()
t0 = timer()
A_DG_T_T = assemble_a10_transpose(R_DG, V_DG)
comm.Barrier()
t1 = timer()
local_time = t1 - t0
# Gather statistics across ranks
times = comm.allgather(local_time)
times_min = min(times)
times_max = max(times)
times_avg = sum(times) / len(times)
if rank == 0:
print(f"DG transpose: min={times_min:.6f}s, max={times_max:.6f}s, avg={times_avg:.6f}s, imbalance={(times_max/times_min - 1)*100:.1f}%")
The output of the previous file with the command mpirun -np 6 python3 test_poisson.py is:
Number of dofs global for DG: 135000
--- Load balance statistics ---
DG: min=0.960112s, max=0.960114s, avg=0.960113s, imbalance=0.0%
DG transpose: min=0.010416s, max=0.010417s, avg=0.010417s, imbalance=0.0%