Preconditioning appears not to happen

Hi all,

For a Navier-Stokes problem, I want to separately precondition the u block (Jacobi) and the p block (algebraic multigrid on the Schur complement), while using mixed spaces.

I believe I have everything set up correctly, and am (finally :face_exhaling:) no longer getting the PETSc warning "There are unused database options". But when I activate solver verbosity (by setting ksp_view), none of my own settings are mentioned in the output. Instead it mentions LU or MUMPS preconditioners (depending on whether I run with 1 or multiple MPI processes).

Here is an MWE of a lid-driven cavity:

from dolfinx import mesh, fem
from dolfinx.fem.petsc import NonlinearProblem
from dolfinx.nls.petsc import NewtonSolver

import ufl
from ufl import div, nabla_grad, dot, inner, outer, dx, sym
from basix.ufl import element, mixed_element

import numpy as np
from mpi4py import MPI
from petsc4py import PETSc

# User parameters
Re = 10 # Reynolds number
N = 5 # Elements in x & y
Pu = 2 # Polynomial order u
Pp = 1 # Polynomial order p

# Function space
L = 1 # Domain size
domain = mesh.create_rectangle(MPI.COMM_WORLD, [(0,0),(L,L)], [N,N], mesh.CellType.quadrilateral)
Ve = element("Lagrange", domain.basix_cell(), Pu, shape=(domain.geometry.dim,))
Qe = element("Lagrange", domain.basix_cell(), Pp)
W_el = mixed_element([Ve, Qe])
W = fem.functionspace(domain, W_el)    
V, WV_map = W.sub(0).collapse() # DOF map from W to V
Q, WQ_map = W.sub(1).collapse() # DOF map from W to Q

# Trial and test functions       
w_trial = ufl.TrialFunction(W)
v, q = ufl.TestFunctions(W)
u_p = fem.Function(W)
u, p = ufl.split(u_p)

# Navier-Stokes weak form
nu = fem.Constant(domain, PETSc.ScalarType(1/Re)) # Viscosity
B = - inner( outer(u,u), nabla_grad(v) )*dx \
    + dot( q, div(u) )*dx \
    - dot( div(v), p )*dx \
    + inner( 2*nu*sym(nabla_grad(u)), sym(nabla_grad(v)) )*dx

# Boundary conditions
def node_origin(x):
    return np.logical_and(np.isclose(x[0],0),np.isclose(x[1],0))
dof_origin_p = fem.locate_dofs_geometrical((W.sub(1), Q), node_origin)
bc_zero_p = fem.dirichletbc(fem.Function(Q), dof_origin_p, W.sub(1))
def walls(x):
    return np.isclose(x[0],0) | np.isclose(x[0], L) | np.isclose(x[1],0)
facets_wall = mesh.locate_entities_boundary(domain, domain.geometry.dim-1, walls)
dofs_walls = fem.locate_dofs_topological((W.sub(0), V), domain.geometry.dim-1, facets_wall)
bc_zero_u = fem.dirichletbc(fem.Function(V), dofs_walls, W.sub(0))
def lid(x):
    return np.isclose(x[1], L)
facets_lid = mesh.locate_entities_boundary(domain, domain.geometry.dim-1, lid)
dofs_lid = fem.locate_dofs_topological((W.sub(0), V), domain.geometry.dim-1, facets_lid)
u_unit = fem.Function(V)
u_unit.interpolate(lambda x: (np.ones(x.shape[1]),np.zeros(x.shape[1])) )
bc_unit_u = fem.dirichletbc(u_unit, dofs_lid, W.sub(0))
bcs = [bc_zero_p,bc_zero_u,bc_unit_u] 

##########
# Below is where the solver and its settings are
##########
J = ufl.derivative(B,u_p,w_trial)
ns_problem = NonlinearProblem(B,u_p,bcs=bcs,J=J)
newton_solver = NewtonSolver(domain.comm,ns_problem)

opts = PETSc.Options()

ksp = newton_solver.krylov_solver
option_prefix = ksp.getOptionsPrefix() 
opts[f"{option_prefix}ksp_type"] = "gmres"
opts[f"{option_prefix}ksp_pc_side"] = "right" # Use right preconditioning: ( K M^-1 ) M x = b. This preserves the residual.
opts[f"{option_prefix}ksp_view"] = None

pc = ksp.getPC() # Preconditioning object
pc.setType("fieldsplit") # Different preconditioning for different blocks

IS_u = PETSc.IS().createGeneral(WV_map, domain.comm)
pc.setFieldSplitIS(('u',IS_u))
IS_p = PETSc.IS().createGeneral(WQ_map, domain.comm)
pc.setFieldSplitIS(('p',IS_p))

ksp_u, ksp_p = pc.getFieldSplitSubKSP()
pc.setFieldSplitType(PETSc.PC.CompositeType.SCHUR)
pc.setFieldSplitSchurFactType(PETSc.PC.FieldSplitSchurFactType.UPPER)
pc.setFieldSplitSchurPreType(PETSc.PC.FieldSplitSchurPreType.SELFP)

option_prefix_u = ksp_u.getOptionsPrefix() 
opts[f"{option_prefix_u}ksp_type"] = "preonly" # Precondition only once in Newton iterations
opts[f"{option_prefix_u}pc_type"] = "jacobi"

option_prefix_p = ksp_p.getOptionsPrefix()
opts[f"{option_prefix_p}ksp_type"] = "preonly" # Precondition only once in Newton iterations
opts[f"{option_prefix_p}pc_type"] = "hypre" # Hypre (algebraic multigrid) preconditioning for p-block
opts[f"{option_prefix_p}pc_hypre_type"] = "boomeramg" # Choice of implementation
opts[f"{option_prefix_p}pc_hypre_boomeramg_coarsen_type"] = "pmis" # Grid coarsening strategy
opts[f"{option_prefix_p}pc_hypre_boomeramg_interp_type"] = "FF1" # Projection on to coarser grid
opts[f"{option_prefix_p}pc_hypre_boomeramg_strong_threshold"] = "0.5" # Coarsening limit

ksp_u.setFromOptions()
ksp_p.setFromOptions()
ksp.setFromOptions()

##########
# Above is where the solver and its settings are
##########

it,_ = newton_solver.solve(u_p)
print(f"Solved in {it} iterations.")

And this is the output for a single-core run:

KSP Object: (nls_solve_) 1 MPI process
  type: gmres
    restart=30, using Classical (unmodified) Gram-Schmidt Orthogonalization with no iterative refinement
    happy breakdown tolerance 1e-30
  maximum iterations=10000, initial guess is zero
  tolerances: relative=1e-05, absolute=1e-50, divergence=10000.
  right preconditioning
  using UNPRECONDITIONED norm type for convergence test
PC Object: (nls_solve_) 1 MPI process
  type: lu
   out-of-place factorization
    tolerance for zero pivot 2.22045e-14
    matrix ordering: nd
    factor fill ratio given 5., needed 1.47955
      Factored matrix follows:
        Mat Object: (nls_solve_) 1 MPI process
          type: seqaij
          rows=278, cols=278
          package used to perform factorization: petsc
          total: nonzeros=14328, allocated nonzeros=14328
            using I-node routines: found 103 nodes, limit used is 5
  linear system matrix = precond matrix:
  Mat Object: 1 MPI process
    type: seqaij
    rows=278, cols=278
    total: nonzeros=9684, allocated nonzeros=9684
    total number of mallocs used during MatSetValues calls=0
      using I-node routines: found 142 nodes, limit used is 5

Any help would be much appreciated!

Best,
Stein

The issue seems to be NewtonSolver-related. The same code (but for Stokes) with solver = LinearProblem(B, F, bcs=bcs) and ksp = solver._solver properly applies the preconditioning.
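
For reference, the Stokes version looks roughly like this (here B and F are the Stokes bilinear and linear forms, not the nonlinear residual from the MWE, and the fieldsplit options are set exactly as above):

from dolfinx.fem.petsc import LinearProblem
from petsc4py import PETSc

solver = LinearProblem(B, F, bcs=bcs) # B, F: Stokes bilinear/linear forms
ksp = solver._solver # underlying PETSc KSP (private attribute)
opts = PETSc.Options()
option_prefix = ksp.getOptionsPrefix()
opts[f"{option_prefix}ksp_type"] = "gmres"
opts[f"{option_prefix}ksp_pc_side"] = "right"
# ... same fieldsplit / sub-KSP settings as in the MWE ...
ksp.setFromOptions()
w_h = solver.solve()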

How do I make the Newton solver apply preconditioning settings?

One step forward, two steps back.

I managed to get both the linear and nonlinear solver to apply the preconditioner by starting the option setting code with: ksp.setOptionsPrefix(""). No idea why.

Everything seems to work, until I run with MPI on more than 1 core. Even for a tiny problem it says:

RuntimeError: Failed to successfully call PETSc function 'KSPSolve'. PETSc error code is: 55, Out of memory

So I would still much appreciate any help…

Here is the updated code snippet:

from dolfinx import mesh, fem
from dolfinx.fem.petsc import NonlinearProblem
from dolfinx.nls.petsc import NewtonSolver

import ufl
from ufl import div, nabla_grad, dot, inner, outer, dx, sym
from basix.ufl import element, mixed_element

import numpy as np
from mpi4py import MPI
from petsc4py import PETSc

# User parameters
Re = 10 # Reynolds number
N = 5 # Elements in x & y
Pu = 2 # Polynomial order u
Pp = 1 # Polynomial order p

# Function space
L = 1 # Domain size
domain = mesh.create_rectangle(MPI.COMM_WORLD, [(0,0),(L,L)], [N,N], mesh.CellType.quadrilateral)
Ve = element("Lagrange", domain.basix_cell(), Pu, shape=(domain.geometry.dim,))
Qe = element("Lagrange", domain.basix_cell(), Pp)
W_el = mixed_element([Ve, Qe])
W = fem.functionspace(domain, W_el)    
V, WV_map = W.sub(0).collapse() # DOF map from W to V
Q, WQ_map = W.sub(1).collapse() # DOF map from W to Q

# Trial and test functions       
w_trial = ufl.TrialFunction(W)
v, q = ufl.TestFunctions(W)
u_p = fem.Function(W)
u, p = ufl.split(u_p)

# Navier-Stokes weak form
nu = fem.Constant(domain, PETSc.ScalarType(1/Re)) # Viscosity
B = - inner( outer(u,u), nabla_grad(v) )*dx \
    + dot( q, div(u) )*dx \
    - dot( div(v), p )*dx \
    + inner( 2*nu*sym(nabla_grad(u)), sym(nabla_grad(v)) )*dx

# Boundary conditions
def node_origin(x):
    return np.logical_and(np.isclose(x[0],0),np.isclose(x[1],0))
dof_origin_p = fem.locate_dofs_geometrical((W.sub(1), Q), node_origin)
bc_zero_p = fem.dirichletbc(fem.Function(Q), dof_origin_p, W.sub(1))
def walls(x):
    return np.isclose(x[0],0) | np.isclose(x[0], L) | np.isclose(x[1],0)
facets_wall = mesh.locate_entities_boundary(domain, domain.geometry.dim-1, walls)
dofs_walls = fem.locate_dofs_topological((W.sub(0), V), domain.geometry.dim-1, facets_wall)
bc_zero_u = fem.dirichletbc(fem.Function(V), dofs_walls, W.sub(0))
def lid(x):
    return np.isclose(x[1], L)
facets_lid = mesh.locate_entities_boundary(domain, domain.geometry.dim-1, lid)
dofs_lid = fem.locate_dofs_topological((W.sub(0), V), domain.geometry.dim-1, facets_lid)
u_unit = fem.Function(V)
u_unit.interpolate(lambda x: (np.ones(x.shape[1]),np.zeros(x.shape[1])) )
bc_unit_u = fem.dirichletbc(u_unit, dofs_lid, W.sub(0))
bcs = [bc_zero_p,bc_zero_u,bc_unit_u] 

##########
# Below is where the solver and its settings are
##########
J = ufl.derivative(B,u_p,w_trial)
ns_problem = NonlinearProblem(B,u_p,bcs=bcs,J=J)
newton_solver = NewtonSolver(domain.comm,ns_problem)

ksp = newton_solver.krylov_solver
opts = PETSc.Options()

ksp.setOptionsPrefix("") # This fixes the issue with the nonlinear solver

option_prefix = ksp.getOptionsPrefix() 
opts[f"{option_prefix}ksp_type"] = "gmres"
opts[f"{option_prefix}ksp_pc_side"] = "right" # Use right preconditioning: ( K M^-1 ) M x = b. This preserves the residual.
opts[f"{option_prefix}ksp_view"] = None
    
IS_u = PETSc.IS().createGeneral(WV_map, domain.comm).sort()
IS_p = PETSc.IS().createGeneral(WQ_map, domain.comm).sort()
pc = ksp.getPC() # Preconditioning object
pc.setType("fieldsplit") # Different preconditioning for different blocks

pc.setFieldSplitIS(('u',IS_u),('p',IS_p))
ksp_u, ksp_p = pc.getFieldSplitSubKSP()
pc.setFieldSplitType(PETSc.PC.CompositeType.SCHUR)
pc.setFieldSplitSchurFactType(PETSc.PC.FieldSplitSchurFactType.UPPER)
pc.setFieldSplitSchurPreType(PETSc.PC.FieldSplitSchurPreType.SELFP)

option_prefix_u = ksp_u.getOptionsPrefix() 
opts[f"{option_prefix_u}ksp_type"] = "preonly" # Precondition only once in Newton iterations
opts[f"{option_prefix_u}pc_type"] = "jacobi"

option_prefix_p = ksp_p.getOptionsPrefix()
opts[f"{option_prefix_p}ksp_type"] = "preonly" # Precondition only once in Newton iterations
opts[f"{option_prefix_p}pc_type"] = "hypre" # Hypre (algebraic multigrid) preconditioning for p-block
opts[f"{option_prefix_p}pc_hypre_type"] = "boomeramg" # Choice of implementation
opts[f"{option_prefix_p}pc_hypre_boomeramg_coarsen_type"] = "pmis" # Grid coarsening strategy
opts[f"{option_prefix_p}pc_hypre_boomeramg_interp_type"] = "FF1" # Projection on to coarser grid
opts[f"{option_prefix_p}pc_hypre_boomeramg_strong_threshold"] = "0.5" # Coarsening limit

ksp_u.setFromOptions()
ksp_p.setFromOptions()
ksp.setFromOptions()

##########
# Above is where the solver and its settings are
##########

it,_ = newton_solver.solve(u_p)
print(f"Solved in {it} iterations.")

The problem was that WV_map and WQ_map contain process-local DOF numbers, whereas PETSc's index set requires global numbers. Secondly, the ghost DOFs need to be thrown out. The following tweak makes the code run in parallel:

W_indexmap = W.dofmap.index_map
globaldofs_u = np.setdiff1d( W_indexmap.local_to_global(np.array(WV_map)), W_indexmap.ghosts ) # getting the global velocity dofs that live on this core
globaldofs_p = np.setdiff1d( W_indexmap.local_to_global(np.array(WQ_map)), W_indexmap.ghosts ) # getting the global pressure dofs that live on this core
IS_u = PETSc.IS().createGeneral(np.array(globaldofs_u,dtype=np.int32), comm=domain.comm).sort()
IS_p = PETSc.IS().createGeneral(np.array(globaldofs_p,dtype=np.int32), comm=domain.comm).sort()

I'll post a small scaling study when I find the time.

I ran a little scaling study, focusing on the effect of increasing the polynomial order. I'm comparing three different solvers (the CASE parameter in the code below): 0) a MUMPS direct solve, 1) GMRes with a block-diagonal preconditioner, and 2) GMRes with a block Schur preconditioner.

To keep the test case relevant, I am considering a 3D domain (a cube) and I added an advective term to the Stokes equations to mimic a Newton iteration in a Navier-Stokes solve. I've added the code below.

For weak scaling I obtain:

[weak-scaling plot]

with the wall-clock runtime of the solve step on the y-axis and the DOF count on the x-axis. Each marker corresponds to an increasing core count: 1, 2, 4, 8, 16, 32 and 48. I've tried to stick to roughly 100k DOFs per core.

For the higher-order cases, the runtime was severely limited by the assembly/integration rather than by the solve. That makes strong scaling the more important metric:

[strong-scaling plot]

with the number of cores on the x-axis. It doesn't scale all that well, but that's due to the limited problem size of 800k DOFs. I'm just happy to see that GMRes outperforms MUMPS in all cases.

Here is the code:

import os 
from dolfinx import mesh, fem
import dolfinx
from dolfinx.fem.petsc import assemble_matrix, assemble_vector, NonlinearProblem, LinearProblem
from dolfinx.nls.petsc import NewtonSolver

import ufl
from ufl import div, nabla_grad, dot, inner, outer, dx, sym
from basix.ufl import element, mixed_element

from mpi4py import MPI
from petsc4py import PETSc
import numpy as np
import time
import pickle

def main(outputdir:str = "", # Directory for storing output
         CASE:int = 0, # 0: MUMPS, 1: GMRes with block diagonal precon, 2: GMRes with block Schur precon
         Re:float = 10, # Reynolds number. With unit domain size and unit advective velocity, this is 1/nu
         N:int =  10, # Elements in x & y
         Pu:int = 2, # Polynomial order u
         Pp:int = 1, # Polynomial order p
    ):
     # Function space
     L = 1 # Domain size
     domain = mesh.create_box(MPI.COMM_WORLD, [(0,0,0),(L,L,L)], [N,N,N], mesh.CellType.hexahedron)
     Ve = element("Lagrange", domain.basix_cell(), Pu, shape=(domain.geometry.dim,))
     Qe = element("Lagrange", domain.basix_cell(), Pp)
     W_el = mixed_element([Ve, Qe])
     W = fem.functionspace(domain, W_el)
     V, WV_map = W.sub(0).collapse() # DOF map from W to V
     Q, WQ_map = W.sub(1).collapse() # DOF map from W to Q

     # Trial and test functions       
     w_trial = ufl.TrialFunction(W)
     u, p = ufl.split(w_trial)
     v, q = ufl.TestFunctions(W)
     u_p = fem.Function(W)

     # Physical parameters
     nu = fem.Constant(domain, PETSc.ScalarType(1/Re)) # Viscosity
     u_adv = fem.Function(V) # Dummy advection velocity, to mimic a Navier-Stokes solve
     u_adv.interpolate( lambda x: ( np.sin(2*np.pi*x[0,:]/L)*np.cos(2*np.pi*x[1,:]/L)*np.sin(np.pi*x[2,:]/L), \
                                   -np.cos(2*np.pi*x[0,:]/L)*np.sin(2*np.pi*x[1,:]/L)*np.sin(np.pi*x[2,:]/L),
                                    0*x[0,:] )   )
     
     # Linearized Navier-Stokes weak form
     B = inner( dot(u_adv,nabla_grad(u)), v )*dx \
       + inner( 2*nu*sym(nabla_grad(u)), sym(nabla_grad(v)) )*dx \
       - dot( q, div(u) )*dx \
       - dot( div(v), p )*dx 
     F = dot( q, fem.Constant(domain, PETSc.ScalarType(0)) )*dx
     P = inner( dot(u_adv,nabla_grad(u)), v )*dx \
       + inner( 2*nu*sym(nabla_grad(u)), sym(nabla_grad(v)) )*dx \
       + dot( 1/nu*q, p )*dx

     # Boundary conditions
     def node_origin(x):
          return np.logical_and(np.isclose(x[0],0),np.logical_and(np.isclose(x[1],0),np.isclose(x[2],0)))
     dof_origin_p = fem.locate_dofs_geometrical((W.sub(1), Q), node_origin)
     bc_zero_p = fem.dirichletbc(fem.Function(Q), dof_origin_p, W.sub(1))
     def walls(x):
          return np.isclose(x[0],0) | np.isclose(x[0], L) | np.isclose(x[1],0) | np.isclose(x[2], 0) | np.isclose(x[2], L)
     facets_wall = mesh.locate_entities_boundary(domain, domain.geometry.dim-1, walls)
     dofs_walls = fem.locate_dofs_topological((W.sub(0), V), domain.geometry.dim-1, facets_wall)
     bc_zero_u = fem.dirichletbc(fem.Function(V), dofs_walls, W.sub(0))
     def lid(x):
          return np.isclose(x[1], L)
     facets_lid = mesh.locate_entities_boundary(domain, domain.geometry.dim-1, lid)
     dofs_lid = fem.locate_dofs_topological((W.sub(0), V), domain.geometry.dim-1, facets_lid)
     u_unit = fem.Function(V)
     u_unit.interpolate(lambda x: (np.ones(x.shape[1]),np.zeros(x.shape[1]),np.zeros(x.shape[1])) )
     bc_unit_u = fem.dirichletbc(u_unit, dofs_lid, W.sub(0))
     bcs = [bc_zero_p,bc_zero_u,bc_unit_u] 

     # Assemble matrices
     Bmat = assemble_matrix(fem.form(B), bcs=bcs)
     Bmat.assemble()
     Pmat = assemble_matrix(fem.form(P), bcs=bcs)
     Pmat.assemble()
     Fvec = assemble_vector(fem.form(F))
     fem.petsc.apply_lifting(Fvec, [fem.form(B)], bcs=[bcs])
     Fvec.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE)
     fem.petsc.set_bc(Fvec, bcs)

     ##########
     # Below is where the solver and its settings are
     ##########

     ksp = PETSc.KSP().create(domain.comm)
     if CASE == 1:
          ksp.setOperators(Bmat, Pmat)
     else:
          ksp.setOperators(Bmat)

     opts = PETSc.Options()
     ksp.setOptionsPrefix("")
     option_prefix = ksp.getOptionsPrefix() 
     
     if CASE == 0:
          # Direct solve
          opts[f"{option_prefix}ksp_type"] = "preonly"
          opts[f"{option_prefix}pc_type"] = "lu"
          opts[f"{option_prefix}pc_factor_mat_solver_type"] = "mumps"
     else:
          # GMRes with block-based preconditioning
          opts[f"{option_prefix}ksp_type"] = "gmres"
          opts[f"{option_prefix}ksp_pc_side"] = "right"

          W_indexmap = W.dofmap.index_map
          globaldofs_u = np.setdiff1d( W_indexmap.local_to_global(np.array(WV_map)), W_indexmap.ghosts ) # global velocity dofs owned by this core
          globaldofs_p = np.setdiff1d( W_indexmap.local_to_global(np.array(WQ_map)), W_indexmap.ghosts ) # global pressure dofs owned by this core

          IS_u = PETSc.IS().createGeneral(np.array(globaldofs_u,dtype=np.int32), comm=domain.comm).sort()
          IS_p = PETSc.IS().createGeneral(np.array(globaldofs_p,dtype=np.int32), comm=domain.comm).sort()

          pc = ksp.getPC()
          pc.setType("fieldsplit")

          pc.setFieldSplitIS(('u',IS_u),('p',IS_p))
          ksp_u, ksp_p = pc.getFieldSplitSubKSP()

          if CASE == 1:
               # Block-diagonal precondition, AMG on velocity, Jacobi on pressure mass
               pc.setFieldSplitType(PETSc.PC.CompositeType.ADDITIVE)
          elif CASE == 2:
               # Schur-based precondition, AMG on velocity, Jacobi on Schur complement.
               pc.setFieldSplitType(PETSc.PC.CompositeType.SCHUR)
               pc.setFieldSplitSchurFactType(PETSc.PC.FieldSplitSchurFactType.UPPER)
               pc.setFieldSplitSchurPreType(PETSc.PC.FieldSplitSchurPreType.SELFP)

          option_prefix_u = ksp_u.getOptionsPrefix() 
          opts[f"{option_prefix_u}ksp_type"] = "preonly" # Precondition only once in Newton iterations
          opts[f"{option_prefix_u}pc_type"] = "hypre" # Hypre (algebraic multigrid) preconditioning for p-block
          opts[f"{option_prefix_u}pc_hypre_type"] = "boomeramg" # Choice of implementation
          opts[f"{option_prefix_u}pc_hypre_boomeramg_coarsen_type"] = "pmis" # Grid coarsening strategy
          opts[f"{option_prefix_u}pc_hypre_boomeramg_interp_type"] = "FF1" # Projection on to coarser grid
          opts[f"{option_prefix_u}pc_hypre_boomeramg_strong_threshold"] = "0.5" # Coarsening limit

          option_prefix_p = ksp_p.getOptionsPrefix()
          opts[f"{option_prefix_p}ksp_type"] = "preonly" # Precondition only once in Newton iterations
          opts[f"{option_prefix_p}pc_type"] = "jacobi"

          ksp_u.setFromOptions()
          ksp_p.setFromOptions()

     ksp.setFromOptions()

     ##########
     # Above is where the solver and its settings are
     ##########

     start = time.time()
     ksp.solve(Fvec, u_p.x.petsc_vec)
     end = time.time()
     u_sol,p_sol = [ f.collapse() for f in u_p.split() ]

     uL2 = fem.assemble_scalar( fem.form(ufl.inner(u_sol, u_sol) * ufl.dx) )
     uL2 = domain.comm.allreduce(uL2, op=MPI.SUM)
     if MPI.COMM_WORLD.rank == 0:
          print('runtime:', end - start)
          print('dofs:', (N*Pu-1)**3*3+(N*Pp+1)**3)
          print('L2 norm:', uL2)

          postprocessing_data = {'case':CASE,'Re':Re,'N':N,'Pu':Pu,'Pp':Pp,'time':float(end-start),'dofs':(N*Pu-1)**3*3+(N*Pp+1)**3}
          with open(outputdir+'/postprocessdata.pickle', 'wb') as handle:
               pickle.dump(postprocessing_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

While the Schur-based preconditioner performs quite well, it doesn't show perfect weak scaling. I believe that is to be expected due to the 'selfp' option. Better would be to combine the two iterative approaches: approximate the Schur complement with the pressure mass matrix, while retaining the 'upper' block factorization (see the sketch at the end of this post). That should give near-perfect (weak) scaling. I believe this is done in:

Those implementations are a bit too invasive for me right now, but if someone is interested in implementing a light-weight version thereof, I'd be interested to be kept in the loop.
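
For reference, here is a rough, untested sketch of what I have in mind, reusing pc, Pmat and IS_p from the CASE 2 branch above (Pmat already contains the 1/nu-scaled pressure mass term, so its pressure block can serve as a user-supplied Schur-complement approximation):

# Untested sketch: keep the UPPER Schur factorization, but hand PETSc a
# user-supplied Schur-complement approximation, namely the (1/nu)-scaled
# pressure mass block extracted from Pmat. Assumes pc, Pmat and IS_p exist
# as in the CASE 2 branch of the script above.
pc.setFieldSplitType(PETSc.PC.CompositeType.SCHUR)
pc.setFieldSplitSchurFactType(PETSc.PC.FieldSplitSchurFactType.UPPER)
Smat = Pmat.createSubMatrix(IS_p, IS_p) # (1/nu)-scaled pressure mass block
pc.setFieldSplitSchurPreType(PETSc.PC.FieldSplitSchurPreType.USER, Smat)
# The p-block sub-KSP can then stay cheap, e.g. preonly + jacobi as in CASE 2.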