Parallelisation, petsc4py, slepc4py, and dolfinx

Hello dolfinx users. I have a problem that requires computing eigenvalues, so the dolfinx wrappers and petsc4py alone are not enough; I need to leverage slepc4py.

Physically, I want to do resolvent analysis at frequency 0 on a simple shear flow. This requires linearising Navier-Stokes, but also using an extensor matrix and setting boundary conditions manually, outside of the FEniCSx wrappers.

Mathematically, I’m solving a generalised eigenvalue problem of the form B^HJ^{-H}NJ^{-1}Bf=\sigma Mf

B is an extensor so that Bf=\left[\begin{array}{c}f\\0\end{array}\right]

J is the Jacobian of Navier-Stokes with a trivial baseflow.

N is a norm matrix that satisfies : q^HNq=[u^H\;p^H]N\left[\begin{array}{c}u\\p\end{array}\right]=\int |u|^2 dx

Likewise, M is a norm matrix. f^HMf=\int |f|^2 dx
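
Putting these together (a restatement in the same notation, for readability): writing R=J^{-1}B for the solution operator restricted to forcings, the problem above reads B^HJ^{-H}NJ^{-1}Bf=R^HNRf=\sigma Mf, so the leading eigenvector f is the optimal forcing with respect to the M norm, and the corresponding response is recovered as J^{-1}Bf. This is exactly the chain of operations the matrix-free operator in the code below applies.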

I’ve had mixed success - my code runs in serial and produces a nice graph :

However, the same code breaks apart in parallel :

Here’s my MWE. Apologies in advance for length, I did my best to boil it down.

import ufl
import numpy as np
import dolfinx as dfx
from dolfinx.io import XDMFFile
from petsc4py import PETSc as pet
from slepc4py import SLEPc as slp
from mpi4py.MPI import COMM_WORLD as comm

mesh=dfx.UnitSquareMesh(comm,100,100)

FE_vector_1=ufl.VectorElement("Lagrange",mesh.ufl_cell(),1)
FE_vector_2=ufl.VectorElement("Lagrange",mesh.ufl_cell(),2)
FE_scalar  =ufl.FiniteElement("Lagrange",mesh.ufl_cell(),1)
# Taylor Hood elements
V =dfx.FunctionSpace(mesh,FE_vector_1)
TH=dfx.FunctionSpace(mesh,FE_vector_2*FE_scalar)
# Extract velocities only
U, _ = TH.sub(0).collapse(collapsed_dofs=True)

q,l=ufl.TrialFunction(TH),ufl.TestFunction(TH)
ub=dfx.Function(V)
w,z=ufl.TrialFunction(U), ufl.TestFunction(U)
u,p=ufl.split(q); v,s=ufl.split(l)

# Baseflow: plane shear u = (-1 + 2*y, 0)
if comm.size==1:
	x=mesh.geometry.x[:,:2]
	ub.vector[::2]=-1+2*x[:,1]
	viewer = pet.Viewer().createMPIIO("sanity_check_baseflow.dat", 'w', comm)
	ub.vector.view(viewer)
else:
	viewer = pet.Viewer().createMPIIO("sanity_check_baseflow.dat", 'r', comm)
	ub.vector.load(viewer)

# Mass conservation
J_form  = ufl.inner(ufl.div(u), s)
# Momentum (different test functions and IBP)
J_form += ufl.inner(ufl.grad(ub)*u,  	 v)  	 # Convection
J_form += ufl.inner(ufl.grad(u)*ub, 	 v)
J_form += ufl.inner(ufl.grad(u),ufl.grad(v))/100 # Diffusion (Re=100)
J_form -= ufl.inner(p,		    ufl.div( v)) 	 # Pressure
J_form *= ufl.dx
# Velocity norm
N_form = ufl.inner(u,v)*ufl.dx
# Forcing norm
M_form = ufl.inner(w,z)*ufl.dx
# Extensor
B_form = ufl.inner(w,v)*ufl.dx

# Assembling matrices
J = dfx.fem.assemble_matrix(J_form); J.assemble() # Jacobian
N = dfx.fem.assemble_matrix(N_form); N.assemble() # Mass matrix for q
M = dfx.fem.assemble_matrix(M_form); M.assemble() # Mass matrix for f
B = dfx.fem.assemble_matrix(B_form); B.assemble() # Extensor

# Boundary conditions
def border(x): return np.isclose(x[0],0)+np.isclose(x[0],1)+np.isclose(x[1],0)+np.isclose(x[1],1)
dofs,_=dfx.fem.locate_dofs_geometrical((TH.sub(0),U),border)
J.zeroRows(dofs); B.zeroRows(dofs,0)

# Sizes
n=B.getSize()[1]
n_local=B.getLocalSize()[1]

def setupKSP(KSP):
	KSP.setType('preonly')
	# Preconditioner
	PC = KSP.getPC(); PC.setType('lu')
	PC.setFactorSolverType('mumps')
	KSP.setFromOptions()

# Compute solvers instead of inverses
KSPs = []
for Mat in [J,J.copy().hermitianTranspose()]:
	# Krylov subspace
	KSP = pet.KSP().create(comm)
	KSP.setOperators(Mat)
	setupKSP(KSP)
	KSPs.append(KSP)

w,z = J.getVecs()
# Resolvent operator
class LHS_class:
	def mult(cls,A,x,y):
		B.mult(x,w)
		KSPs[0].solve(w,z)
		N.mult(z,w)
		KSPs[1].solve(w,z)
		B.multTranspose(z,y)

LHS=pet.Mat().create(comm)
LHS.setSizes([[n_local,n]]*2)
LHS.setType(pet.Mat.Type.PYTHON)
LHS.setPythonContext(LHS_class())
LHS.setUp()

# Eigensolver
EPS = slp.EPS(); EPS.create(comm)
EPS.setOperators(LHS,M) # Solve B^H*J^-H*N*J^-1*B*f=sigma^2*M*f (cheaper than a proper SVD)
EPS.setProblemType(slp.EPS.ProblemType.GHEP) # Specify that A is hermitian (by construction), & M is semi-positive
EPS.setDimensions(1,10) # Find k eigenvalues only with max number of Lanczos vectors
EPS.setTolerances(1e-6,100) # Set absolute tolerance and number of iterations
# Spectral transform
ST = EPS.getST(); ST.setType('shift'); ST.setShift(0)
# Krylov subspace
KSP = ST.getKSP()
setupKSP(KSP)
EPS.setFromOptions()
EPS.solve()

f=dfx.Function(U)
EPS.getEigenvector(0,f.vector)
q=dfx.Function(TH)
B.mult(f.vector,w)
KSPs[0].solve(w,q.vector)
u,_=q.split()
with XDMFFile(comm, "sanity_check_response.xdmf","w") as xdmf:
	xdmf.write_mesh(mesh)
	xdmf.write_function(u)

Running the above code in the dolfinx 3.1.0 docker container, in serial and then in parallel (complex mode), should reproduce the behaviour above.

I strongly suspect I’ve forgotten to ghost a vector or messed up my information-sharing between processors.

I couldn’t reproduce the issue with a single KSP so I’m stuck with considering how petsc4py and slepc4py interact…

Whenever you do a matrix-vector multiplication (and other PETSc operations) you need to update ghost dofs, see for instance: dolfinx/test_discrete_operators.py at 366435de1415bca336f5c8be91a3ea66c969b6a8 · FEniCS/dolfinx · GitHub
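
For concreteness, a minimal sketch of that ghost update, assuming (as in the revised MWE below) that w and z are dolfinx Functions whose PETSc vectors serve as work vectors and N is the assembled mass matrix:

from petsc4py import PETSc

# After a MatMult (or KSPSolve) only the locally owned entries of the result
# are up to date; the ghost entries on each process are stale until scattered.
N.mult(z.vector, w.vector)
w.vector.ghostUpdate(addv=PETSc.InsertMode.INSERT,
                     mode=PETSc.ScatterMode.FORWARD)  # push owned values to the ghosts
# Equivalently, through the dolfinx wrapper:
w.x.scatter_forward()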

I added .x.scatter_forward() everywhere; the result is different, but not really better…

import ufl
import numpy as np
import dolfinx as dfx
from dolfinx.io import XDMFFile
from petsc4py import PETSc as pet
from slepc4py import SLEPc as slp
from mpi4py.MPI import COMM_WORLD as comm

mesh=dfx.UnitSquareMesh(comm,100,100)

FE_vector_1=ufl.VectorElement("Lagrange",mesh.ufl_cell(),1)
FE_vector_2=ufl.VectorElement("Lagrange",mesh.ufl_cell(),2)
FE_scalar  =ufl.FiniteElement("Lagrange",mesh.ufl_cell(),1)
# Taylor Hood elements
V =dfx.FunctionSpace(mesh,FE_vector_1)
TH=dfx.FunctionSpace(mesh,FE_vector_2*FE_scalar)
# Extract velocities only
U, _ = TH.sub(0).collapse(collapsed_dofs=True)

q,l=ufl.TrialFunction(TH),ufl.TestFunction(TH)
ub=dfx.Function(V)
w,z=ufl.TrialFunction(U), ufl.TestFunction(U)
u,p=ufl.split(q); v,s=ufl.split(l)

# Baseflow: plane shear u = (-1 + 2*y, 0)
if comm.size==1:
	x=mesh.geometry.x[:,:2]
	ub.vector[::2]=-1+2*x[:,1]
	viewer = pet.Viewer().createMPIIO("sanity_check_baseflow.dat", 'w', comm)
	ub.vector.view(viewer)
else:
	viewer = pet.Viewer().createMPIIO("sanity_check_baseflow.dat", 'r', comm)
	ub.vector.load(viewer)

# Mass conservation
J_form  = ufl.inner(ufl.div(u), s)
# Momentum (different test functions and IBP)
J_form += ufl.inner(ufl.grad(ub)*u,  	 v)  	 # Convection
J_form += ufl.inner(ufl.grad(u)*ub, 	 v)
J_form += ufl.inner(ufl.grad(u),ufl.grad(v))/100 # Diffusion (Re=100)
J_form -= ufl.inner(p,		    ufl.div( v)) 	 # Pressure
J_form *= ufl.dx
# Velocity norm
N_form = ufl.inner(u,v)*ufl.dx
# Forcing norm
M_form = ufl.inner(w,z)*ufl.dx
# Extensor
B_form = ufl.inner(w,v)*ufl.dx

# Assembling matrices
J = dfx.fem.assemble_matrix(J_form); J.assemble() # Jacobian
N = dfx.fem.assemble_matrix(N_form); N.assemble() # Mass matrix for q
M = dfx.fem.assemble_matrix(M_form); M.assemble() # Mass matrix for f
B = dfx.fem.assemble_matrix(B_form); B.assemble() # Extensor

# Boundary conditions
def border(x): return np.isclose(x[0],0)+np.isclose(x[0],1)+np.isclose(x[1],0)+np.isclose(x[1],1)
dofs,_=dfx.fem.locate_dofs_geometrical((TH.sub(0),U),border)
J.zeroRows(dofs); B.zeroRows(dofs,0)

# Sizes
n=B.getSize()[1]
n_local=B.getLocalSize()[1]

def setupKSP(KSP):
	KSP.setType('preonly')
	# Preconditioner
	PC = KSP.getPC(); PC.setType('lu')
	PC.setFactorSolverType('mumps')
	KSP.setFromOptions()

# Compute solvers instead of inverses
KSPs = []
for Mat in [J,J.copy().hermitianTranspose()]:
	# Krylov subspace
	KSP = pet.KSP().create(comm)
	KSP.setOperators(Mat)
	setupKSP(KSP)
	KSPs.append(KSP)

w,z = dfx.Function(TH), dfx.Function(TH)
s = dfx.Function(U)
# Resolvent operator
class LHS_class:
	def mult(cls,A,x,y):
		B.mult(x,w.vector)
		w.x.scatter_forward()
		KSPs[0].solve(w.vector,z.vector)
		z.x.scatter_forward()
		N.mult(z.vector,w.vector)
		w.x.scatter_forward()
		KSPs[1].solve(w.vector,z.vector)
		z.x.scatter_forward()
		B.multTranspose(z.vector,s.vector)
		s.x.scatter_forward()
		s.vector.copy(y)

LHS=pet.Mat().create(comm)
LHS.setSizes([[n_local,n]]*2)
LHS.setType(pet.Mat.Type.PYTHON)
LHS.setPythonContext(LHS_class())
LHS.setUp()

# Eigensolver
EPS = slp.EPS(); EPS.create(comm)
EPS.setOperators(LHS,M) # Solve B^H*J^-H*N*J^-1*B*f=sigma^2*M*f (cheaper than a proper SVD)
EPS.setProblemType(slp.EPS.ProblemType.GHEP) # Specify that A is hermitian (by construction), & M is semi-positive
EPS.setDimensions(1,10) # Find k eigenvalues only with max number of Lanczos vectors
EPS.setTolerances(1e-6,100) # Set absolute tolerance and number of iterations
# Spectral transform
ST = EPS.getST(); ST.setType('shift'); ST.setShift(0)
# Krylov subspace
KSP = ST.getKSP()
setupKSP(KSP)
EPS.setFromOptions()
EPS.solve()

f=dfx.Function(U)
EPS.getEigenvector(0,f.vector)
q=dfx.Function(TH)
B.mult(f.vector,w.vector)
w.x.scatter_forward()
KSPs[0].solve(w.vector,q.vector)
q.x.scatter_forward()
u,_=q.split()
with XDMFFile(comm, "sanity_check_response.xdmf","w") as xdmf:
	xdmf.write_mesh(mesh)
	xdmf.write_function(u)

This gives a result that is still spurious, but different:

Is the EPSType consistent between serial and parallel runs?

Yes. EPS.getType() returns None in both cases right after creation, and krylovschur right before EPS.solve().

Ok, but using EPSView should tell you whether you’re actually using the same EPSType (and all the other important settings) in serial and parallel. Perhaps the default is different in each case, although the documentation says it should be krylovschur. It’s also not obvious, but is LHS actually Hermitian?
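
(A minimal way to make that comparison, assuming the EPS object from the MWE above; the defaults are only populated at setup, which is also why getType() returns None right after creation:)

EPS.setFromOptions()
EPS.setUp()                         # defaults (type, ST, KSP, ...) are filled in here
print("EPS type:", EPS.getType())   # krylovschur from this point on
EPS.view()                          # same report as the -eps_view command-line option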

Here’s the output for EPS.view() on one processor :

EPS Object: 1 MPI processes
  type: krylovschur
    50% of basis vectors kept after restart
    using the locking variant
  problem type: generalized hermitian eigenvalue problem
  selected portion of the spectrum: largest eigenvalues in magnitude
  postprocessing eigenvectors with purification
  number of eigenvalues (nev): 1
  number of column vectors (ncv): 10
  maximum dimension of projected problem (mpd): 10
  maximum number of iterations: 100
  tolerance: 1e-06
  convergence test: relative to the eigenvalue
BV Object: 1 MPI processes
  type: svec
  11 columns of global length 80802
  vector orthogonalization method: classical Gram-Schmidt
  orthogonalization refinement: if needed (eta: 0.7071)
  block orthogonalization method: GS
  non-standard inner product
  tolerance for definite inner product: 2.22045e-15
  inner product matrix:
  Mat Object: (st_) 1 MPI processes
    type: seqaij
    rows=80802, cols=80802, bs=2
    total: nonzeros=1846404, allocated nonzeros=1846404
    total number of mallocs used during MatSetValues calls=0
      using I-node routines: found 40399 nodes, limit used is 5
  doing matmult as a single matrix-matrix product
DS Object: 1 MPI processes
  type: hep
  solving the problem with: Implicit QR method (_steqr)
ST Object: 1 MPI processes
  type: shift
  shift: 0.
  number of matrices: 2
  all matrices have unknown nonzero pattern
  KSP Object: (st_) 1 MPI processes
    type: preonly
    maximum iterations=10000, initial guess is zero
    tolerances:  relative=1e-08, absolute=1e-50, divergence=10000.
    left preconditioning
    using NONE norm type for convergence test
  PC Object: (st_) 1 MPI processes
    type: lu
      out-of-place factorization
      tolerance for zero pivot 2.22045e-14
      matrix ordering: external
      factor fill ratio given 0., needed 0.
        Factored matrix follows:
          Mat Object: 1 MPI processes
            type: mumps
            rows=80802, cols=80802
            package used to perform factorization: mumps
            total: nonzeros=13033708, allocated nonzeros=13033708
              MUMPS run parameters:
                SYM (matrix type):                   0
                PAR (host participation):            1
                ICNTL(1) (output for error):         6
                ICNTL(2) (output of diagnostic msg): 0
                ICNTL(3) (output for global info):   0
                ICNTL(4) (level of printing):        0
                ICNTL(5) (input mat struct):         0
                ICNTL(6) (matrix prescaling):        7
                ICNTL(7) (sequential matrix ordering):7
                ICNTL(8) (scaling strategy):        77
                ICNTL(10) (max num of refinements):  0
                ICNTL(11) (error analysis):          0
                ICNTL(12) (efficiency control):                         1
                ICNTL(13) (sequential factorization of the root node):  0
                ICNTL(14) (percentage of estimated workspace increase): 20
                ICNTL(18) (input mat struct):                           0
                ICNTL(19) (Schur complement info):                      0
                ICNTL(20) (RHS sparse pattern):                         0
                ICNTL(21) (solution struct):                            0
                ICNTL(22) (in-core/out-of-core facility):               0
                ICNTL(23) (max size of memory can be allocated locally):0
                ICNTL(24) (detection of null pivot rows):               0
                ICNTL(25) (computation of a null space basis):          0
                ICNTL(26) (Schur options for RHS or solution):          0
                ICNTL(27) (blocking size for multiple RHS):             -32
                ICNTL(28) (use parallel or sequential ordering):        1
                ICNTL(29) (parallel ordering):                          0
                ICNTL(30) (user-specified set of entries in inv(A)):    0
                ICNTL(31) (factors is discarded in the solve phase):    0
                ICNTL(33) (compute determinant):                        0
                ICNTL(35) (activate BLR based factorization):           0
                ICNTL(36) (choice of BLR factorization variant):        0
                ICNTL(38) (estimated compression rate of LU factors):   333
                CNTL(1) (relative pivoting threshold):      0.01 
                CNTL(2) (stopping criterion of refinement): 1.49012e-08 
                CNTL(3) (absolute pivoting threshold):      0. 
                CNTL(4) (value of static pivoting):         -1. 
                CNTL(5) (fixation for null pivots):         0. 
                CNTL(7) (dropping parameter for BLR):       0. 
                RINFO(1) (local estimated flops for the elimination after analysis): 
                  [0] 2.61132e+09 
                RINFO(2) (local estimated flops for the assembly after factorization): 
                  [0]  2.0365e+07 
                RINFO(3) (local estimated flops for the elimination after factorization): 
                  [0]  2.61132e+09 
                INFO(15) (estimated size of (in MB) MUMPS internal data for running numerical factorization): 
                [0] 304
                INFO(16) (size of (in MB) MUMPS internal data used during numerical factorization): 
                  [0] 304
                INFO(23) (num of pivots eliminated on this processor after factorization): 
                  [0] 80802
                RINFOG(1) (global estimated flops for the elimination after analysis): 2.61132e+09 
                RINFOG(2) (global estimated flops for the assembly after factorization): 2.0365e+07 
                RINFOG(3) (global estimated flops for the elimination after factorization): 2.61132e+09 
                (RINFOG(12) RINFOG(13))*2^INFOG(34) (determinant): (0.,0.)*(2^0)
                INFOG(3) (estimated real workspace for factors on all processors after analysis): 13033708
                INFOG(4) (estimated integer workspace for factors on all processors after analysis): 1037876
                INFOG(5) (estimated maximum front size in the complete tree): 760
                INFOG(6) (number of nodes in the complete tree): 10847
                INFOG(7) (ordering option effectively used after analysis): 5
                INFOG(8) (structural symmetry in percent of the permuted matrix after analysis): 100
                INFOG(9) (total real/complex workspace to store the matrix factors after factorization): 13033708
                INFOG(10) (total integer space store the matrix factors after factorization): 1037876
                INFOG(11) (order of largest frontal matrix after factorization): 760
                INFOG(12) (number of off-diagonal pivots): 0
                INFOG(13) (number of delayed pivots after factorization): 0
                INFOG(14) (number of memory compress after factorization): 0
                INFOG(15) (number of steps of iterative refinement after solution): 0
                INFOG(16) (estimated size (in MB) of all MUMPS internal data for factorization after analysis: value on the most memory consuming processor): 304
                INFOG(17) (estimated size of all MUMPS internal data for factorization after analysis: sum over all processors): 304
                INFOG(18) (size of all MUMPS internal data allocated during factorization: value on the most memory consuming processor): 304
                INFOG(19) (size of all MUMPS internal data allocated during factorization: sum over all processors): 304
                INFOG(20) (estimated number of entries in the factors): 13033708
                INFOG(21) (size in MB of memory effectively used during factorization - value on the most memory consuming processor): 261
                INFOG(22) (size in MB of memory effectively used during factorization - sum over all processors): 261
                INFOG(23) (after analysis: value of ICNTL(6) effectively used): 0
                INFOG(24) (after analysis: value of ICNTL(12) effectively used): 1
                INFOG(25) (after factorization: number of pivots modified by static pivoting): 0
                INFOG(28) (after factorization: number of null pivots encountered): 0
                INFOG(29) (after factorization: effective number of entries in the factors (sum over all processors)): 13033708
                INFOG(30, 31) (after solution: size in Mbytes of memory used during solution phase): 279, 279
                INFOG(32) (after analysis: type of analysis done): 1
                INFOG(33) (value used for ICNTL(8)): 7
                INFOG(34) (exponent of the determinant if determinant is requested): 0
                INFOG(35) (after factorization: number of entries taking into account BLR factor compression - sum over all processors): 13033708
                INFOG(36) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - value on the most memory consuming processor): 0
                INFOG(37) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - sum over all processors): 0
                INFOG(38) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - value on the most memory consuming processor): 0
                INFOG(39) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - sum over all processors): 0
    linear system matrix = precond matrix:
    Mat Object: (st_) 1 MPI processes
      type: seqaij
      rows=80802, cols=80802, bs=2
      total: nonzeros=1846404, allocated nonzeros=1846404
      total number of mallocs used during MatSetValues calls=0
        using I-node routines: found 40399 nodes, limit used is 5

And on two processors:

EPS Object: 2 MPI processes
  type: krylovschur
    50% of basis vectors kept after restart
    using the locking variant
  problem type: generalized hermitian eigenvalue problem
  selected portion of the spectrum: largest eigenvalues in magnitude
  postprocessing eigenvectors with purification
  number of eigenvalues (nev): 1
  number of column vectors (ncv): 10
  maximum dimension of projected problem (mpd): 10
  maximum number of iterations: 100
  tolerance: 1e-06
  convergence test: relative to the eigenvalue
BV Object: 2 MPI processes
  type: svec
  11 columns of global length 80802
  vector orthogonalization method: classical Gram-Schmidt
  orthogonalization refinement: if needed (eta: 0.7071)
  block orthogonalization method: GS
  non-standard inner product
  tolerance for definite inner product: 2.22045e-15
  inner product matrix:
  Mat Object: (st_) 2 MPI processes
    type: mpiaij
    rows=80802, cols=80802, bs=2
    total: nonzeros=1846404, allocated nonzeros=1846404
    total number of mallocs used during MatSetValues calls=0
      using I-node (on process 0) routines: found 20197 nodes, limit used is 5
  doing matmult as a single matrix-matrix product
DS Object: 2 MPI processes
  type: hep
  parallel operation mode: REDUNDANT
  solving the problem with: Implicit QR method (_steqr)
ST Object: 2 MPI processes
  type: shift
  shift: 0.
  number of matrices: 2
  all matrices have unknown nonzero pattern
  KSP Object: (st_) 2 MPI processes
    type: preonly
    maximum iterations=10000, initial guess is zero
    tolerances:  relative=1e-08, absolute=1e-50, divergence=10000.
    left preconditioning
    using NONE norm type for convergence test
  PC Object: (st_) 2 MPI processes
    type: lu
      out-of-place factorization
      tolerance for zero pivot 2.22045e-14
      matrix ordering: external
      factor fill ratio given 0., needed 0.
        Factored matrix follows:
          Mat Object: 2 MPI processes
            type: mumps
            rows=80802, cols=80802
            package used to perform factorization: mumps
            total: nonzeros=13231812, allocated nonzeros=13231812
              MUMPS run parameters:
                SYM (matrix type):                   0
                PAR (host participation):            1
                ICNTL(1) (output for error):         6
                ICNTL(2) (output of diagnostic msg): 0
                ICNTL(3) (output for global info):   0
                ICNTL(4) (level of printing):        0
                ICNTL(5) (input mat struct):         0
                ICNTL(6) (matrix prescaling):        7
                ICNTL(7) (sequential matrix ordering):7
                ICNTL(8) (scaling strategy):        77
                ICNTL(10) (max num of refinements):  0
                ICNTL(11) (error analysis):          0
                ICNTL(12) (efficiency control):                         1
                ICNTL(13) (sequential factorization of the root node):  0
                ICNTL(14) (percentage of estimated workspace increase): 20
                ICNTL(18) (input mat struct):                           3
                ICNTL(19) (Schur complement info):                      0
                ICNTL(20) (RHS sparse pattern):                         0
                ICNTL(21) (solution struct):                            1
                ICNTL(22) (in-core/out-of-core facility):               0
                ICNTL(23) (max size of memory can be allocated locally):0
                ICNTL(24) (detection of null pivot rows):               0
                ICNTL(25) (computation of a null space basis):          0
                ICNTL(26) (Schur options for RHS or solution):          0
                ICNTL(27) (blocking size for multiple RHS):             -32
                ICNTL(28) (use parallel or sequential ordering):        1
                ICNTL(29) (parallel ordering):                          0
                ICNTL(30) (user-specified set of entries in inv(A)):    0
                ICNTL(31) (factors is discarded in the solve phase):    0
                ICNTL(33) (compute determinant):                        0
                ICNTL(35) (activate BLR based factorization):           0
                ICNTL(36) (choice of BLR factorization variant):        0
                ICNTL(38) (estimated compression rate of LU factors):   333
                CNTL(1) (relative pivoting threshold):      0.01 
                CNTL(2) (stopping criterion of refinement): 1.49012e-08 
                CNTL(3) (absolute pivoting threshold):      0. 
                CNTL(4) (value of static pivoting):         -1. 
                CNTL(5) (fixation for null pivots):         0. 
                CNTL(7) (dropping parameter for BLR):       0. 
                RINFO(1) (local estimated flops for the elimination after analysis): 
                  [0] 1.35456e+09 
                  [1] 1.41695e+09 
                RINFO(2) (local estimated flops for the assembly after factorization): 
                  [0]  1.05945e+07 
                  [1]  1.02145e+07 
                RINFO(3) (local estimated flops for the elimination after factorization): 
                  [0]  1.35456e+09 
                  [1]  1.41695e+09 
                INFO(15) (estimated size of (in MB) MUMPS internal data for running numerical factorization): 
                [0] 176
                [1] 178
                INFO(16) (size of (in MB) MUMPS internal data used during numerical factorization): 
                  [0] 176
                  [1] 178
                INFO(23) (num of pivots eliminated on this processor after factorization): 
                  [0] 40314
                  [1] 40488
                RINFOG(1) (global estimated flops for the elimination after analysis): 2.77151e+09 
                RINFOG(2) (global estimated flops for the assembly after factorization): 2.0809e+07 
                RINFOG(3) (global estimated flops for the elimination after factorization): 2.77151e+09 
                (RINFOG(12) RINFOG(13))*2^INFOG(34) (determinant): (0.,0.)*(2^0)
                INFOG(3) (estimated real workspace for factors on all processors after analysis): 13231812
                INFOG(4) (estimated integer workspace for factors on all processors after analysis): 1044364
                INFOG(5) (estimated maximum front size in the complete tree): 740
                INFOG(6) (number of nodes in the complete tree): 10914
                INFOG(7) (ordering option effectively used after analysis): 5
                INFOG(8) (structural symmetry in percent of the permuted matrix after analysis): -1
                INFOG(9) (total real/complex workspace to store the matrix factors after factorization): 13231812
                INFOG(10) (total integer space store the matrix factors after factorization): 1044364
                INFOG(11) (order of largest frontal matrix after factorization): 740
                INFOG(12) (number of off-diagonal pivots): 0
                INFOG(13) (number of delayed pivots after factorization): 0
                INFOG(14) (number of memory compress after factorization): 0
                INFOG(15) (number of steps of iterative refinement after solution): 0
                INFOG(16) (estimated size (in MB) of all MUMPS internal data for factorization after analysis: value on the most memory consuming processor): 178
                INFOG(17) (estimated size of all MUMPS internal data for factorization after analysis: sum over all processors): 354
                INFOG(18) (size of all MUMPS internal data allocated during factorization: value on the most memory consuming processor): 178
                INFOG(19) (size of all MUMPS internal data allocated during factorization: sum over all processors): 354
                INFOG(20) (estimated number of entries in the factors): 13231812
                INFOG(21) (size in MB of memory effectively used during factorization - value on the most memory consuming processor): 152
                INFOG(22) (size in MB of memory effectively used during factorization - sum over all processors): 303
                INFOG(23) (after analysis: value of ICNTL(6) effectively used): 0
                INFOG(24) (after analysis: value of ICNTL(12) effectively used): 1
                INFOG(25) (after factorization: number of pivots modified by static pivoting): 0
                INFOG(28) (after factorization: number of null pivots encountered): 0
                INFOG(29) (after factorization: effective number of entries in the factors (sum over all processors)): 13231812
                INFOG(30, 31) (after solution: size in Mbytes of memory used during solution phase): 152, 303
                INFOG(32) (after analysis: type of analysis done): 1
                INFOG(33) (value used for ICNTL(8)): 7
                INFOG(34) (exponent of the determinant if determinant is requested): 0
                INFOG(35) (after factorization: number of entries taking into account BLR factor compression - sum over all processors): 13231812
                INFOG(36) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - value on the most memory consuming processor): 0
                INFOG(37) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - sum over all processors): 0
                INFOG(38) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - value on the most memory consuming processor): 0
                INFOG(39) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - sum over all processors): 0
    linear system matrix = precond matrix:
    Mat Object: (st_) 2 MPI processes
      type: mpiaij
      rows=80802, cols=80802, bs=2
      total: nonzeros=1846404, allocated nonzeros=1846404
      total number of mallocs used during MatSetValues calls=0
        using I-node (on process 0) routines: found 20197 nodes, limit used is 5

I can’t see any meaningful difference.

As for the second part of your question, I’m pretty sure that LHS should be Hermitian. Morally, recall that \text{LHS}=B^HJ^{-H}NJ^{-1}B

LHS is defined in the code as a matrix-free operator that takes a vector x, computes w=Bx, solves Jz=w for z\in\mathbb{C}^m, sets w=Nz, solves J^Hz=w for z, and finally returns y=B^Tz.

I believe B is real, so B^T=B^H, and N is a norm matrix (q^HNq=\int|u|^2dx), so I think LHS is Hermitian.

I ran a quick check: N.isSymmetric(1e-14) returns True. So I’d say I have high confidence that LHS is Hermitian.
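
A minimal sketch of a more direct check on the matrix-free operator itself, using the names from the MWE above (LHS is the shell matrix): for a Hermitian operator, y^H(Ax) should equal the conjugate of x^H(Ay) for any pair of vectors.

import numpy as np

x, Ax = LHS.createVecs()
y, Ay = LHS.createVecs()
x.setRandom(); y.setRandom()
LHS.mult(x, Ax)                  # Ax = LHS applied to x
LHS.mult(y, Ay)                  # Ay = LHS applied to y
s1 = Ax.dot(y)                   # y^H (LHS x), PETSc's dot conjugates its argument
s2 = Ay.dot(x)                   # x^H (LHS y)
print(abs(s1 - np.conj(s2)))     # ~0 (up to solver tolerance) if LHS is Hermitian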

Ok, I’ve printed the result of the first matrix product (LHS_class.mult with x being basically noise) in both serial and parallel, and I think the inversion is wrong. The serial result already looks something like the final structure:

But the parallel result just breaks apart:

I suspected this to be the case. I’ll try reproducing this behaviour in a smaller MWE.
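
(As a quick way to make that comparison quantitative, assuming the names from the MWE above: apply the matrix-free operator to a deterministic input and compare norms, since the dof ordering itself differs between a serial and a parallel run.)

x, y = LHS.createVecs()
x.set(1.0)              # same input vector regardless of the partitioning
LHS.mult(x, y)
print(f"{comm.size} proc(s): ||LHS*1|| = {y.norm():.6e}")  # should agree across runs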

Ok, I think I’m on the right track to solving the problem. There were a couple of problems with the previous MWE: the viewer trick corrupts the data if it is read with a different number of processes than it was written with.

More importantly, my attempt at handling the boundary conditions ‘by hand’ using zeroRows fails miserably in parallel. The key takeaway, as far as I can see, is: trust the dolfinx developers, folks, they know their stuff.
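
As an aside, the viewer issue can be sidestepped entirely for this baseflow: a minimal sketch (same dolfinx API as in the MWEs above) that interpolates the analytical shear profile on every rank instead of dumping and reloading ub.vector:

# Shear baseflow u = (-1 + 2*y, 0), evaluated locally on each process,
# so no file I/O and no dependence on the number of processes.
ub = dfx.Function(V)
ub.interpolate(lambda x: np.vstack((-1 + 2*x[1], np.zeros_like(x[1]))))
ub.x.scatter_forward()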

A parallel inverter follows, if anyone is interested:

import ufl
import numpy as np
import dolfinx as dfx
from dolfinx.io import XDMFFile
from petsc4py import PETSc as pet
from mpi4py.MPI import COMM_WORLD as comm

n=100
mesh=dfx.UnitSquareMesh(comm,n,n)

FE_vector_1=ufl.VectorElement("Lagrange",mesh.ufl_cell(),1)
FE_vector_2=ufl.VectorElement("Lagrange",mesh.ufl_cell(),2)
FE_scalar  =ufl.FiniteElement("Lagrange",mesh.ufl_cell(),1)
# Taylor Hood elements
V =dfx.FunctionSpace(mesh,FE_vector_1)
TH=dfx.FunctionSpace(mesh,FE_vector_2*FE_scalar)
# Extract velocities only
U = TH.sub(0)
Uc, _ = TH.sub(0).collapse(collapsed_dofs=True)

q,l=ufl.TrialFunction(TH),ufl.TestFunction(TH)
w=ufl.TrialFunction(V)
f=dfx.Function(V)
u,p=ufl.split(q); v,s=ufl.split(l)

# Forcing: a localised swirl, interpolated identically on every rank
f.interpolate(lambda x: np.exp(-.5*((x[0]-.5)**2+(x[1]-.5)**2)/.2**2)*
						np.sin(np.pi*(np.vstack((x[1]-.5,.5-x[0])))))
f.x.scatter_forward()

# Mass conservation
J_form  = ufl.inner(ufl.div(u), s)
# Momentum (different test functions and IBP)
J_form += ufl.inner(ufl.grad(u),ufl.grad(v))/100 # Diffusion (Re=100)
J_form -= ufl.inner(p,		    ufl.div( v)) 	 # Pressure
J_form *= ufl.dx
# Extensor
B_form = ufl.inner(w,v)*ufl.dx

# Boundary conditions
def border(x): return np.isclose(x[0],0)+np.isclose(x[0],1)+np.isclose(x[1],0)+np.isclose(x[1],1)
dofs=dfx.fem.locate_dofs_geometrical((U,Uc),border)
zero=dfx.Function(Uc)
zero.interpolate(lambda x: np.zeros(x[:2,:].shape))
bcs = [dfx.DirichletBC(zero, dofs, U)]

# Assembling matrices
J = dfx.fem.assemble_matrix(J_form,bcs);   J.assemble() # Jacobian
B = dfx.fem.assemble_matrix(B_form,bcs,0); B.assemble() # Extensor

# Sizes
_, n	   = B.getSize()
_, n_local = B.getLocalSize()

# Krylov subspace
KSP = pet.KSP().create(comm)
KSP.setOperators(J)
KSP.setType('preonly')
# Preconditioner
PC = KSP.getPC(); PC.setType('lu')
PC.setFactorSolverType('mumps')
KSP.setFromOptions()

q,s=dfx.Function(TH),dfx.Function(TH)
B.mult(f.vector,s.vector)
s.x.scatter_forward()
KSP.solve(s.vector,q.vector)
q.x.scatter_forward()
u,_=q.split()
with XDMFFile(comm, f"sanity_check_response_dummy_{comm.size}.xdmf","w") as xdmf:
	xdmf.write_mesh(mesh)
	xdmf.write_function(u)
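
A quick sanity check, assuming the script above has just run: the norm of the computed response should agree between serial and parallel runs, since only the dof ordering differs.

print(f"{comm.size} proc(s): ||q|| = {q.vector.norm():.6e}")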