# Parallelisation, petsc4py, slepc4py, and dolfinx

Hello dolfinx users, I have a problem that requires me to compute eigenvalues, so dolfinx wrappers and petsc4py are not enough, I need to leverage slepc4py.

Physically, I want to do resolvant analysis at frequency 0 on a simple shear flow. This requires linearising Navier-Stokes, but also using an extensor matrix and manually setting boundary conditions outside of FENICsx wrappers.

Mathematically, I’m solving a generalised eigenvalue problem of the like of : B^HJ^{-1H}NJBf=\sigma Mf

B is an extensor so that Bf=\left[\begin{array}{c}f\\0\end{array}\right]

J is the Jacobian of Navier-Stokes with a trivial baseflow.

N is a norm matrix that satisfies : q^HNq=[u^H\;p^H]N\left[\begin{array}{c}u\\p\end{array}\right]=\int |u|^2 dx

Likewise, M is a norm matrix. f^HMf=\int |f|^2 dx

I’ve had mixed success - my code runs in serial and produces a nice graph :

However, the same code breaks apart in parallel :

Here’s my MWE. Apologies in advance for length, I did my best to boil it down.

import ufl
import numpy as np
import dolfinx as dfx
from dolfinx.io import XDMFFile
from petsc4py import PETSc as pet
from slepc4py import SLEPc as slp
from mpi4py.MPI import COMM_WORLD as comm

mesh=dfx.UnitSquareMesh(comm,100,100)

FE_vector_1=ufl.VectorElement("Lagrange",mesh.ufl_cell(),1)
FE_vector_2=ufl.VectorElement("Lagrange",mesh.ufl_cell(),2)
FE_scalar  =ufl.FiniteElement("Lagrange",mesh.ufl_cell(),1)
# Taylor Hood elements
V =dfx.FunctionSpace(mesh,FE_vector_1)
TH=dfx.FunctionSpace(mesh,FE_vector_2*FE_scalar)
# Extract velocities only
U, _ = TH.sub(0).collapse(collapsed_dofs=True)

q,l=ufl.TrialFunction(TH),ufl.TestFunction(TH)
ub=dfx.Function(V)
w,z=ufl.TrialFunction(U), ufl.TestFunction(U)
u,p=ufl.split(q); v,s=ufl.split(l)

# Baseflow
if comm.size==1:
x=mesh.geometry.x[:,:2]
ub.vector[::2]=-1+2*x[:,1]
viewer = pet.Viewer().createMPIIO("sanity_check_baseflow.dat", 'w', comm)
ub.vector.view(viewer)
else:
viewer = pet.Viewer().createMPIIO("sanity_check_baseflow.dat", 'r', comm)

# Mass csv
J_form  = ufl.inner(ufl.div(u), s)
# Momentum (different test functions and IBP)
J_form += ufl.inner(ufl.grad(ub)*u,  	 v)  	 # Convection
J_form -= ufl.inner(p,		    ufl.div( v)) 	 # Pressure
J_form *= ufl.dx
# Velocity norm
N_form = ufl.inner(u,v)*ufl.dx
# Forcing norm
M_form = ufl.inner(w,z)*ufl.dx
# Extensor
B_form = ufl.inner(w,v)*ufl.dx

# Assembling matrices
J = dfx.fem.assemble_matrix(J_form); J.assemble() # Jacobian
N = dfx.fem.assemble_matrix(N_form); N.assemble() # Mass matrix for q
M = dfx.fem.assemble_matrix(M_form); M.assemble() # Mass matrix for f
B = dfx.fem.assemble_matrix(B_form); B.assemble() # Extensor

# Boundary conditions
def border(x): return np.isclose(x[0],0)+np.isclose(x[0],1)+np.isclose(x[1],0)+np.isclose(x[1],1)
dofs,_=dfx.fem.locate_dofs_geometrical((TH.sub(0),U),border)
J.zeroRows(dofs); B.zeroRows(dofs,0)

# Sizes
n=B.getSize()[1]
n_local=B.getLocalSize()[1]

def setupKSP(KSP):
KSP.setType('preonly')
# Preconditioner
PC = KSP.getPC(); PC.setType('lu')
PC.setFactorSolverType('mumps')
KSP.setFromOptions()

# Compute solvers instead of inverses
KSPs = []
for Mat in [J,J.copy().hermitianTranspose()]:
# Krylov subspace
KSP = pet.KSP().create(comm)
KSP.setOperators(Mat)
setupKSP(KSP)
KSPs.append(KSP)

w,z = J.getVecs()
# Resolvent operator
class LHS_class:
def mult(cls,A,x,y):
B.mult(x,w)
KSPs[0].solve(w,z)
N.mult(z,w)
KSPs[1].solve(w,z)
B.multTranspose(z,y)

LHS=pet.Mat().create(comm)
LHS.setSizes([[n_local,n]]*2)
LHS.setType(pet.Mat.Type.PYTHON)
LHS.setPythonContext(LHS_class())
LHS.setUp()

# Eigensolver
EPS = slp.EPS(); EPS.create(comm)
EPS.setOperators(LHS,M) # Solve B^T*L^-1H*L^-1*B*f=sigma^2*M*f (cheaper than a proper SVD)
EPS.setProblemType(slp.EPS.ProblemType.GHEP) # Specify that A is hermitian (by construction), & M is semi-positive
EPS.setDimensions(1,10) # Find k eigenvalues only with max number of Lanczos vectors
EPS.setTolerances(1e-6,100) # Set absolute tolerance and number of iterations
# Spectral transform
ST = EPS.getST(); ST.setType('shift'); ST.setShift(0)
# Krylov subspace
KSP = ST.getKSP()
setupKSP(KSP)
EPS.setFromOptions()
EPS.solve()

f=dfx.Function(U)
EPS.getEigenvector(0,f.vector)
q=dfx.Function(TH)
B.mult(f.vector,w)
KSPs[0].solve(w,q.vector)
u,_=q.split()
with XDMFFile(comm, "sanity_check_response.xdmf","w") as xdmf:
xdmf.write_mesh(mesh)
xdmf.write_function(u)


Running the above code in the dolfinx 3.1.0 docker container in serial then in parallel (complex mode) should reproduce above behaviour.

I strongly suspect I’ve forgotten to ghost a vector or messed up my information-sharing between processors.

I couldn’t reproduce the issue with a single KSP so I’m stuck with considering how petsc4py and slepc4py interact…

Whenever you do matrix vector multiplication (and other petsc operations) you need to update ghost dofs, see for instance: dolfinx/test_discrete_operators.py at 366435de1415bca336f5c8be91a3ea66c969b6a8 · FEniCS/dolfinx · GitHub

I added .x.scatter_forward everywhere, it’s different but not really better…

import ufl
import numpy as np
import dolfinx as dfx
from dolfinx.io import XDMFFile
from petsc4py import PETSc as pet
from slepc4py import SLEPc as slp
from mpi4py.MPI import COMM_WORLD as comm

mesh=dfx.UnitSquareMesh(comm,100,100)

FE_vector_1=ufl.VectorElement("Lagrange",mesh.ufl_cell(),1)
FE_vector_2=ufl.VectorElement("Lagrange",mesh.ufl_cell(),2)
FE_scalar  =ufl.FiniteElement("Lagrange",mesh.ufl_cell(),1)
# Taylor Hood elements
V =dfx.FunctionSpace(mesh,FE_vector_1)
TH=dfx.FunctionSpace(mesh,FE_vector_2*FE_scalar)
# Extract velocities only
U, _ = TH.sub(0).collapse(collapsed_dofs=True)

q,l=ufl.TrialFunction(TH),ufl.TestFunction(TH)
ub=dfx.Function(V)
w,z=ufl.TrialFunction(U), ufl.TestFunction(U)
u,p=ufl.split(q); v,s=ufl.split(l)

# Baseflow
if comm.size==1:
x=mesh.geometry.x[:,:2]
ub.vector[::2]=-1+2*x[:,1]
viewer = pet.Viewer().createMPIIO("sanity_check_baseflow.dat", 'w', comm)
ub.vector.view(viewer)
else:
viewer = pet.Viewer().createMPIIO("sanity_check_baseflow.dat", 'r', comm)

# Mass csv
J_form  = ufl.inner(ufl.div(u), s)
# Momentum (different test functions and IBP)
J_form += ufl.inner(ufl.grad(ub)*u,  	 v)  	 # Convection
J_form -= ufl.inner(p,		    ufl.div( v)) 	 # Pressure
J_form *= ufl.dx
# Velocity norm
N_form = ufl.inner(u,v)*ufl.dx
# Forcing norm
M_form = ufl.inner(w,z)*ufl.dx
# Extensor
B_form = ufl.inner(w,v)*ufl.dx

# Assembling matrices
J = dfx.fem.assemble_matrix(J_form); J.assemble() # Jacobian
N = dfx.fem.assemble_matrix(N_form); N.assemble() # Mass matrix for q
M = dfx.fem.assemble_matrix(M_form); M.assemble() # Mass matrix for f
B = dfx.fem.assemble_matrix(B_form); B.assemble() # Extensor

# Boundary conditions
def border(x): return np.isclose(x[0],0)+np.isclose(x[0],1)+np.isclose(x[1],0)+np.isclose(x[1],1)
dofs,_=dfx.fem.locate_dofs_geometrical((TH.sub(0),U),border)
J.zeroRows(dofs); B.zeroRows(dofs,0)

# Sizes
n=B.getSize()[1]
n_local=B.getLocalSize()[1]

def setupKSP(KSP):
KSP.setType('preonly')
# Preconditioner
PC = KSP.getPC(); PC.setType('lu')
PC.setFactorSolverType('mumps')
KSP.setFromOptions()

# Compute solvers instead of inverses
KSPs = []
for Mat in [J,J.copy().hermitianTranspose()]:
# Krylov subspace
KSP = pet.KSP().create(comm)
KSP.setOperators(Mat)
setupKSP(KSP)
KSPs.append(KSP)

w,z = dfx.Function(TH), dfx.Function(TH)
s = dfx.Function(U)
# Resolvent operator
class LHS_class:
def mult(cls,A,x,y):
B.mult(x,w.vector)
w.x.scatter_forward()
KSPs[0].solve(w.vector,z.vector)
z.x.scatter_forward()
N.mult(z.vector,w.vector)
w.x.scatter_forward()
KSPs[1].solve(w.vector,z.vector)
z.x.scatter_forward()
B.multTranspose(z.vector,s.vector)
s.x.scatter_forward()
s.vector.copy(y)

LHS=pet.Mat().create(comm)
LHS.setSizes([[n_local,n]]*2)
LHS.setType(pet.Mat.Type.PYTHON)
LHS.setPythonContext(LHS_class())
LHS.setUp()

# Eigensolver
EPS = slp.EPS(); EPS.create(comm)
EPS.setOperators(LHS,M) # Solve B^T*L^-1H*L^-1*B*f=sigma^2*M*f (cheaper than a proper SVD)
EPS.setProblemType(slp.EPS.ProblemType.GHEP) # Specify that A is hermitian (by construction), & M is semi-positive
EPS.setDimensions(1,10) # Find k eigenvalues only with max number of Lanczos vectors
EPS.setTolerances(1e-6,100) # Set absolute tolerance and number of iterations
# Spectral transform
ST = EPS.getST(); ST.setType('shift'); ST.setShift(0)
# Krylov subspace
KSP = ST.getKSP()
setupKSP(KSP)
EPS.setFromOptions()
EPS.solve()

f=dfx.Function(U)
EPS.getEigenvector(0,f.vector)
q=dfx.Function(TH)
B.mult(f.vector,w.vector)
w.x.scatter_forward()
KSPs[0].solve(w.vector,q.vector)
q.x.scatter_forward()
u,_=q.split()
with XDMFFile(comm, "sanity_check_response.xdmf","w") as xdmf:
xdmf.write_mesh(mesh)
xdmf.write_function(u)


Gives the spurious but different :

is the EPSType consistent between serial and parallel runs?

Yes. EPS.getType() returns None in both cases right after creation, krylovschur right before EPS.solve()

ok, but using EPSView should tell you if you’re actually using the same EPSType (and all other important things) in serial and parallel. Perhaps the default is different in each case, although the documentation says it should be krylovschur. It’s also not obvious, but is LHS actually Hermitian?

Here’s the output for EPS.view() on one processor :

EPS Object: 1 MPI processes
type: krylovschur
50% of basis vectors kept after restart
using the locking variant
problem type: generalized hermitian eigenvalue problem
selected portion of the spectrum: largest eigenvalues in magnitude
postprocessing eigenvectors with purification
number of eigenvalues (nev): 1
number of column vectors (ncv): 10
maximum dimension of projected problem (mpd): 10
maximum number of iterations: 100
tolerance: 1e-06
convergence test: relative to the eigenvalue
BV Object: 1 MPI processes
type: svec
11 columns of global length 80802
vector orthogonalization method: classical Gram-Schmidt
orthogonalization refinement: if needed (eta: 0.7071)
block orthogonalization method: GS
non-standard inner product
tolerance for definite inner product: 2.22045e-15
inner product matrix:
Mat Object: (st_) 1 MPI processes
type: seqaij
rows=80802, cols=80802, bs=2
total: nonzeros=1846404, allocated nonzeros=1846404
total number of mallocs used during MatSetValues calls=0
using I-node routines: found 40399 nodes, limit used is 5
doing matmult as a single matrix-matrix product
DS Object: 1 MPI processes
type: hep
solving the problem with: Implicit QR method (_steqr)
ST Object: 1 MPI processes
type: shift
shift: 0.
number of matrices: 2
all matrices have unknown nonzero pattern
KSP Object: (st_) 1 MPI processes
type: preonly
maximum iterations=10000, initial guess is zero
tolerances:  relative=1e-08, absolute=1e-50, divergence=10000.
left preconditioning
using NONE norm type for convergence test
PC Object: (st_) 1 MPI processes
type: lu
out-of-place factorization
tolerance for zero pivot 2.22045e-14
matrix ordering: external
factor fill ratio given 0., needed 0.
Factored matrix follows:
Mat Object: 1 MPI processes
type: mumps
rows=80802, cols=80802
package used to perform factorization: mumps
total: nonzeros=13033708, allocated nonzeros=13033708
MUMPS run parameters:
SYM (matrix type):                   0
PAR (host participation):            1
ICNTL(1) (output for error):         6
ICNTL(2) (output of diagnostic msg): 0
ICNTL(3) (output for global info):   0
ICNTL(4) (level of printing):        0
ICNTL(5) (input mat struct):         0
ICNTL(6) (matrix prescaling):        7
ICNTL(7) (sequential matrix ordering):7
ICNTL(8) (scaling strategy):        77
ICNTL(10) (max num of refinements):  0
ICNTL(11) (error analysis):          0
ICNTL(12) (efficiency control):                         1
ICNTL(13) (sequential factorization of the root node):  0
ICNTL(14) (percentage of estimated workspace increase): 20
ICNTL(18) (input mat struct):                           0
ICNTL(19) (Schur complement info):                      0
ICNTL(20) (RHS sparse pattern):                         0
ICNTL(21) (solution struct):                            0
ICNTL(22) (in-core/out-of-core facility):               0
ICNTL(23) (max size of memory can be allocated locally):0
ICNTL(24) (detection of null pivot rows):               0
ICNTL(25) (computation of a null space basis):          0
ICNTL(26) (Schur options for RHS or solution):          0
ICNTL(27) (blocking size for multiple RHS):             -32
ICNTL(28) (use parallel or sequential ordering):        1
ICNTL(29) (parallel ordering):                          0
ICNTL(30) (user-specified set of entries in inv(A)):    0
ICNTL(31) (factors is discarded in the solve phase):    0
ICNTL(33) (compute determinant):                        0
ICNTL(35) (activate BLR based factorization):           0
ICNTL(36) (choice of BLR factorization variant):        0
ICNTL(38) (estimated compression rate of LU factors):   333
CNTL(1) (relative pivoting threshold):      0.01
CNTL(2) (stopping criterion of refinement): 1.49012e-08
CNTL(3) (absolute pivoting threshold):      0.
CNTL(4) (value of static pivoting):         -1.
CNTL(5) (fixation for null pivots):         0.
CNTL(7) (dropping parameter for BLR):       0.
RINFO(1) (local estimated flops for the elimination after analysis):
[0] 2.61132e+09
RINFO(2) (local estimated flops for the assembly after factorization):
[0]  2.0365e+07
RINFO(3) (local estimated flops for the elimination after factorization):
[0]  2.61132e+09
INFO(15) (estimated size of (in MB) MUMPS internal data for running numerical factorization):
[0] 304
INFO(16) (size of (in MB) MUMPS internal data used during numerical factorization):
[0] 304
INFO(23) (num of pivots eliminated on this processor after factorization):
[0] 80802
RINFOG(1) (global estimated flops for the elimination after analysis): 2.61132e+09
RINFOG(2) (global estimated flops for the assembly after factorization): 2.0365e+07
RINFOG(3) (global estimated flops for the elimination after factorization): 2.61132e+09
(RINFOG(12) RINFOG(13))*2^INFOG(34) (determinant): (0.,0.)*(2^0)
INFOG(3) (estimated real workspace for factors on all processors after analysis): 13033708
INFOG(4) (estimated integer workspace for factors on all processors after analysis): 1037876
INFOG(5) (estimated maximum front size in the complete tree): 760
INFOG(6) (number of nodes in the complete tree): 10847
INFOG(7) (ordering option effectively used after analysis): 5
INFOG(8) (structural symmetry in percent of the permuted matrix after analysis): 100
INFOG(9) (total real/complex workspace to store the matrix factors after factorization): 13033708
INFOG(10) (total integer space store the matrix factors after factorization): 1037876
INFOG(11) (order of largest frontal matrix after factorization): 760
INFOG(12) (number of off-diagonal pivots): 0
INFOG(13) (number of delayed pivots after factorization): 0
INFOG(14) (number of memory compress after factorization): 0
INFOG(15) (number of steps of iterative refinement after solution): 0
INFOG(16) (estimated size (in MB) of all MUMPS internal data for factorization after analysis: value on the most memory consuming processor): 304
INFOG(17) (estimated size of all MUMPS internal data for factorization after analysis: sum over all processors): 304
INFOG(18) (size of all MUMPS internal data allocated during factorization: value on the most memory consuming processor): 304
INFOG(19) (size of all MUMPS internal data allocated during factorization: sum over all processors): 304
INFOG(20) (estimated number of entries in the factors): 13033708
INFOG(21) (size in MB of memory effectively used during factorization - value on the most memory consuming processor): 261
INFOG(22) (size in MB of memory effectively used during factorization - sum over all processors): 261
INFOG(23) (after analysis: value of ICNTL(6) effectively used): 0
INFOG(24) (after analysis: value of ICNTL(12) effectively used): 1
INFOG(25) (after factorization: number of pivots modified by static pivoting): 0
INFOG(28) (after factorization: number of null pivots encountered): 0
INFOG(29) (after factorization: effective number of entries in the factors (sum over all processors)): 13033708
INFOG(30, 31) (after solution: size in Mbytes of memory used during solution phase): 279, 279
INFOG(32) (after analysis: type of analysis done): 1
INFOG(33) (value used for ICNTL(8)): 7
INFOG(34) (exponent of the determinant if determinant is requested): 0
INFOG(35) (after factorization: number of entries taking into account BLR factor compression - sum over all processors): 13033708
INFOG(36) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - value on the most memory consuming processor): 0
INFOG(37) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - sum over all processors): 0
INFOG(38) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - value on the most memory consuming processor): 0
INFOG(39) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - sum over all processors): 0
linear system matrix = precond matrix:
Mat Object: (st_) 1 MPI processes
type: seqaij
rows=80802, cols=80802, bs=2
total: nonzeros=1846404, allocated nonzeros=1846404
total number of mallocs used during MatSetValues calls=0
using I-node routines: found 40399 nodes, limit used is 5


And two :

EPS Object: 2 MPI processes
type: krylovschur
50% of basis vectors kept after restart
using the locking variant
problem type: generalized hermitian eigenvalue problem
selected portion of the spectrum: largest eigenvalues in magnitude
postprocessing eigenvectors with purification
number of eigenvalues (nev): 1
number of column vectors (ncv): 10
maximum dimension of projected problem (mpd): 10
maximum number of iterations: 100
tolerance: 1e-06
convergence test: relative to the eigenvalue
BV Object: 2 MPI processes
type: svec
11 columns of global length 80802
vector orthogonalization method: classical Gram-Schmidt
orthogonalization refinement: if needed (eta: 0.7071)
block orthogonalization method: GS
non-standard inner product
tolerance for definite inner product: 2.22045e-15
inner product matrix:
Mat Object: (st_) 2 MPI processes
type: mpiaij
rows=80802, cols=80802, bs=2
total: nonzeros=1846404, allocated nonzeros=1846404
total number of mallocs used during MatSetValues calls=0
using I-node (on process 0) routines: found 20197 nodes, limit used is 5
doing matmult as a single matrix-matrix product
DS Object: 2 MPI processes
type: hep
parallel operation mode: REDUNDANT
solving the problem with: Implicit QR method (_steqr)
ST Object: 2 MPI processes
type: shift
shift: 0.
number of matrices: 2
all matrices have unknown nonzero pattern
KSP Object: (st_) 2 MPI processes
type: preonly
maximum iterations=10000, initial guess is zero
tolerances:  relative=1e-08, absolute=1e-50, divergence=10000.
left preconditioning
using NONE norm type for convergence test
PC Object: (st_) 2 MPI processes
type: lu
out-of-place factorization
tolerance for zero pivot 2.22045e-14
matrix ordering: external
factor fill ratio given 0., needed 0.
Factored matrix follows:
Mat Object: 2 MPI processes
type: mumps
rows=80802, cols=80802
package used to perform factorization: mumps
total: nonzeros=13231812, allocated nonzeros=13231812
MUMPS run parameters:
SYM (matrix type):                   0
PAR (host participation):            1
ICNTL(1) (output for error):         6
ICNTL(2) (output of diagnostic msg): 0
ICNTL(3) (output for global info):   0
ICNTL(4) (level of printing):        0
ICNTL(5) (input mat struct):         0
ICNTL(6) (matrix prescaling):        7
ICNTL(7) (sequential matrix ordering):7
ICNTL(8) (scaling strategy):        77
ICNTL(10) (max num of refinements):  0
ICNTL(11) (error analysis):          0
ICNTL(12) (efficiency control):                         1
ICNTL(13) (sequential factorization of the root node):  0
ICNTL(14) (percentage of estimated workspace increase): 20
ICNTL(18) (input mat struct):                           3
ICNTL(19) (Schur complement info):                      0
ICNTL(20) (RHS sparse pattern):                         0
ICNTL(21) (solution struct):                            1
ICNTL(22) (in-core/out-of-core facility):               0
ICNTL(23) (max size of memory can be allocated locally):0
ICNTL(24) (detection of null pivot rows):               0
ICNTL(25) (computation of a null space basis):          0
ICNTL(26) (Schur options for RHS or solution):          0
ICNTL(27) (blocking size for multiple RHS):             -32
ICNTL(28) (use parallel or sequential ordering):        1
ICNTL(29) (parallel ordering):                          0
ICNTL(30) (user-specified set of entries in inv(A)):    0
ICNTL(31) (factors is discarded in the solve phase):    0
ICNTL(33) (compute determinant):                        0
ICNTL(35) (activate BLR based factorization):           0
ICNTL(36) (choice of BLR factorization variant):        0
ICNTL(38) (estimated compression rate of LU factors):   333
CNTL(1) (relative pivoting threshold):      0.01
CNTL(2) (stopping criterion of refinement): 1.49012e-08
CNTL(3) (absolute pivoting threshold):      0.
CNTL(4) (value of static pivoting):         -1.
CNTL(5) (fixation for null pivots):         0.
CNTL(7) (dropping parameter for BLR):       0.
RINFO(1) (local estimated flops for the elimination after analysis):
[0] 1.35456e+09
[1] 1.41695e+09
RINFO(2) (local estimated flops for the assembly after factorization):
[0]  1.05945e+07
[1]  1.02145e+07
RINFO(3) (local estimated flops for the elimination after factorization):
[0]  1.35456e+09
[1]  1.41695e+09
INFO(15) (estimated size of (in MB) MUMPS internal data for running numerical factorization):
[0] 176
[1] 178
INFO(16) (size of (in MB) MUMPS internal data used during numerical factorization):
[0] 176
[1] 178
INFO(23) (num of pivots eliminated on this processor after factorization):
[0] 40314
[1] 40488
RINFOG(1) (global estimated flops for the elimination after analysis): 2.77151e+09
RINFOG(2) (global estimated flops for the assembly after factorization): 2.0809e+07
RINFOG(3) (global estimated flops for the elimination after factorization): 2.77151e+09
(RINFOG(12) RINFOG(13))*2^INFOG(34) (determinant): (0.,0.)*(2^0)
INFOG(3) (estimated real workspace for factors on all processors after analysis): 13231812
INFOG(4) (estimated integer workspace for factors on all processors after analysis): 1044364
INFOG(5) (estimated maximum front size in the complete tree): 740
INFOG(6) (number of nodes in the complete tree): 10914
INFOG(7) (ordering option effectively used after analysis): 5
INFOG(8) (structural symmetry in percent of the permuted matrix after analysis): -1
INFOG(9) (total real/complex workspace to store the matrix factors after factorization): 13231812
INFOG(10) (total integer space store the matrix factors after factorization): 1044364
INFOG(11) (order of largest frontal matrix after factorization): 740
INFOG(12) (number of off-diagonal pivots): 0
INFOG(13) (number of delayed pivots after factorization): 0
INFOG(14) (number of memory compress after factorization): 0
INFOG(15) (number of steps of iterative refinement after solution): 0
INFOG(16) (estimated size (in MB) of all MUMPS internal data for factorization after analysis: value on the most memory consuming processor): 178
INFOG(17) (estimated size of all MUMPS internal data for factorization after analysis: sum over all processors): 354
INFOG(18) (size of all MUMPS internal data allocated during factorization: value on the most memory consuming processor): 178
INFOG(19) (size of all MUMPS internal data allocated during factorization: sum over all processors): 354
INFOG(20) (estimated number of entries in the factors): 13231812
INFOG(21) (size in MB of memory effectively used during factorization - value on the most memory consuming processor): 152
INFOG(22) (size in MB of memory effectively used during factorization - sum over all processors): 303
INFOG(23) (after analysis: value of ICNTL(6) effectively used): 0
INFOG(24) (after analysis: value of ICNTL(12) effectively used): 1
INFOG(25) (after factorization: number of pivots modified by static pivoting): 0
INFOG(28) (after factorization: number of null pivots encountered): 0
INFOG(29) (after factorization: effective number of entries in the factors (sum over all processors)): 13231812
INFOG(30, 31) (after solution: size in Mbytes of memory used during solution phase): 152, 303
INFOG(32) (after analysis: type of analysis done): 1
INFOG(33) (value used for ICNTL(8)): 7
INFOG(34) (exponent of the determinant if determinant is requested): 0
INFOG(35) (after factorization: number of entries taking into account BLR factor compression - sum over all processors): 13231812
INFOG(36) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - value on the most memory consuming processor): 0
INFOG(37) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - sum over all processors): 0
INFOG(38) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - value on the most memory consuming processor): 0
INFOG(39) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - sum over all processors): 0
linear system matrix = precond matrix:
Mat Object: (st_) 2 MPI processes
type: mpiaij
rows=80802, cols=80802, bs=2
total: nonzeros=1846404, allocated nonzeros=1846404
total number of mallocs used during MatSetValues calls=0
using I-node (on process 0) routines: found 20197 nodes, limit used is 5


I can’t see any meaningful difference.

As for the second part of your question, I’m pretty sure that LHS should be hermitian. Morally, recall that : \text{LHS}=B^HL^{-1H}NL^{-1}B

LHS is defined in the code as a matrix-free script that takes a vector x, computes w=Bx, solves z\in C^m|Lz=w, does w=Nz, computes z\in C^m|L^Hz=w, then finally y=B^Tz.

I believe B is real so B^T=B^H and N is a norm (q^HNq=\int|u|^2dx) so I think LHS is hermitian.

I ran a quick check, N.isSymmetric(1e-14) returns True. So I’d say I have high confidence that LHS is hermitian.

Ok I’ve printed the results of the first matrix product (LHS_class.mult with x being basically noise) in both serial and parallel. I think the inversion is wrong. The serial part looks already something like the final structure :

But the parallel part just breaks apart :

I suspected this to be the case. I’ll try reproducing this behaviour on a smaller MWE

Ok I think I’m on a the correct path to solving the problem. There were a couple of problems with the previous MWE ; the viewer trick corrupts the data if used with a different number of procs at reading/writing.

More importantly, my attempt and handling the boundary conditions ‘by hand’ using zeroRows fails miserably in parallel. The key take away from this that I can see is trust the dolfinx developers folks, they know their stuff.

A parallel inverter follows if anyone is interested :

import ufl
import numpy as np
import dolfinx as dfx
from dolfinx.io import XDMFFile
from petsc4py import PETSc as pet
from mpi4py.MPI import COMM_WORLD as comm

n=100
mesh=dfx.UnitSquareMesh(comm,n,n)

FE_vector_1=ufl.VectorElement("Lagrange",mesh.ufl_cell(),1)
FE_vector_2=ufl.VectorElement("Lagrange",mesh.ufl_cell(),2)
FE_scalar  =ufl.FiniteElement("Lagrange",mesh.ufl_cell(),1)
# Taylor Hood elements
V =dfx.FunctionSpace(mesh,FE_vector_1)
TH=dfx.FunctionSpace(mesh,FE_vector_2*FE_scalar)
# Extract velocities only
U = TH.sub(0)
Uc, _ = TH.sub(0).collapse(collapsed_dofs=True)

q,l=ufl.TrialFunction(TH),ufl.TestFunction(TH)
w=ufl.TrialFunction(V)
f=dfx.Function(V)
u,p=ufl.split(q); v,s=ufl.split(l)

# Baseflow
f.interpolate(lambda x: np.exp(-.5*((x[0]-.5)**2+(x[1]-.5)**2)/.2**2)*
np.sin(np.pi*(np.vstack((x[1]-.5,.5-x[0])))))
f.x.scatter_forward()

# Mass csv
J_form  = ufl.inner(ufl.div(u), s)
# Momentum (different test functions and IBP)
J_form -= ufl.inner(p,		    ufl.div( v)) 	 # Pressure
J_form *= ufl.dx
# Extensor
B_form = ufl.inner(w,v)*ufl.dx

# Boundary conditions
def border(x): return np.isclose(x[0],0)+np.isclose(x[0],1)+np.isclose(x[1],0)+np.isclose(x[1],1)
dofs=dfx.fem.locate_dofs_geometrical((U,Uc),border)
zero=dfx.Function(Uc)
zero.interpolate(lambda x: np.zeros(x[:2,:].shape))
bcs = [dfx.DirichletBC(zero, dofs, U)]

# Assembling matrices
J = dfx.fem.assemble_matrix(J_form,bcs);   J.assemble() # Jacobian
B = dfx.fem.assemble_matrix(B_form,bcs,0); B.assemble() # Extensor

# Sizes
_, n	   = B.getSize()
_, n_local = B.getLocalSize()

# Krylov subspace
KSP = pet.KSP().create(comm)
KSP.setOperators(J)
KSP.setType('preonly')
# Preconditioner
PC = KSP.getPC(); PC.setType('lu')
PC.setFactorSolverType('mumps')
KSP.setFromOptions()

q,s=dfx.Function(TH),dfx.Function(TH)
B.mult(f.vector,s.vector)
s.x.scatter_forward()
KSP.solve(s.vector,q.vector)
q.x.scatter_forward()
u,_=q.split()
with XDMFFile(comm, f"sanity_check_response_dummy_{comm.size}.xdmf","w") as xdmf:
xdmf.write_mesh(mesh)
xdmf.write_function(u)