Obtaining nested Jacobian matrix from nonlinear residual and fieldsplit preconditioning

Hi all,

I have a nonlinear multiphysics problem and my goal is to build a good preconditioner to solve the problem efficiently. I have some ideas on how to do it, but I struggle a bit with the implementation.

I’ll formulate a minimal example that captures the main features of my original system of equations (nonlinear and coupled), and provide working DOLFINx code.

For simplicity, let us consider a single domain \Omega bounded by the Dirichlet boundary \Gamma_{D}. I then consider N Poisson equations, coupled through the functions f_{i}:

\begin{align} -\Delta u_{i} &= f_{i}(u_{1},\dots,u_{N}) \quad &&\text{in} \quad \Omega \\ u_{i} &= 0 \quad &&\text{on} \quad \Gamma_{D} \end{align}

For N=3 (i.e., three equations), the following minimal code solves the problem:

from mpi4py import MPI
from petsc4py import PETSc
import numpy as np
import ufl
import basix.ufl
import dolfinx
from dolfinx import fem, default_scalar_type
from dolfinx.fem.petsc import NonlinearProblem
from dolfinx.nls.petsc import NewtonSolver

rank = MPI.COMM_WORLD.rank

# --- Computational domain ---
# Mesh
mesh_size = 200
domain = dolfinx.mesh.create_unit_square(MPI.COMM_WORLD, mesh_size, mesh_size)
# Normal vector
n = ufl.FacetNormal(domain)

# --- Mixed element space
el_u = basix.ufl.element("Lagrange", domain.basix_cell(), 1)
el_mixed = basix.ufl.mixed_element([el_u] * 3)
W = dolfinx.fem.functionspace(domain, el_mixed)

# Test functions
dw = ufl.TestFunctions(W)
v0 = dw[0]  
v1 = dw[1]  
v2 = dw[2]  

# Solution vector
uh = dolfinx.fem.Function(W)
u_split = ufl.split(uh)
uh0 = u_split[0]
uh1 = u_split[1]
uh2 = u_split[2]

# --- Distributed forcing and boundary conditions ---
# Distributed forcing
f = fem.Constant(domain, default_scalar_type(-6))
# Homogeneous Dirichlet, system 0
W0 = W.sub(0)
V0, V0_to_W0 = W0.collapse()
uh0_D = fem.Function(V0)
uh0_D.x.array[:] = 0
# Homogeneous Dirichlet, system 1
W1 = W.sub(1)
V1, V1_to_W1 = W1.collapse()
uh1_D = fem.Function(V1)
uh1_D.x.array[:] = 0
# Homogeneous Dirichlet, system 2
W2 = W.sub(2)
V2, V2_to_W2 = W2.collapse()
uh2_D = fem.Function(V2)
uh2_D.x.array[:] = 0

# Penalty parameters
h = 2 * ufl.Circumradius(domain)
alpha = fem.Constant(domain, default_scalar_type(10))

# --- Weak form ---
# Bilinear form
# Block 0
a = ufl.inner(ufl.grad(uh0), ufl.grad(v0)) * ufl.dx      # grad(u),grad(v)
a += -ufl.inner(ufl.dot(ufl.grad(uh0), n), v0) * ufl.ds  # Consistency term
a += -ufl.inner(ufl.dot(ufl.grad(v0), n), uh0) * ufl.ds  # Symmetry term
a += alpha/h*ufl.inner(uh0, v0) * ufl.ds                 # Penalty term
# Block 1
a += ufl.inner(ufl.grad(uh1), ufl.grad(v1)) * ufl.dx     # grad(u),grad(v)
a += -ufl.inner(ufl.dot(ufl.grad(uh1), n), v1) * ufl.ds  # Consistency term
a += -ufl.inner(ufl.dot(ufl.grad(v1), n), uh1) * ufl.ds  # Symmetry term
a += alpha/h*ufl.inner(uh1, v1) * ufl.ds                 # Penalty term
# Block 2
a += ufl.inner(ufl.grad(uh2), ufl.grad(v2)) * ufl.dx     # grad(u),grad(v)
a += -ufl.inner(ufl.dot(ufl.grad(uh2), n), v2) * ufl.ds  # Consistency term
a += -ufl.inner(ufl.dot(ufl.grad(v2), n), uh2) * ufl.ds  # Symmetry term
a += alpha/h*ufl.inner(uh2, v2) * ufl.ds                 # Penalty term

# Linear form (note: the coupling terms below depend on uh, so the residual is nonlinear)
# Block 0
f0 = uh1 + uh2
L = ufl.inner(f, v0) * ufl.dx                               # Forcing term
L += ufl.inner(f0*f0, v0) * ufl.dx                          # Coupling term
L += -ufl.inner(ufl.dot(ufl.grad(v0), n), uh0_D) * ufl.ds   # Symmetry term                
L += alpha/h*ufl.inner(uh0_D, v0) * ufl.ds                  # Penalty term  
# Block 1
f1 = uh0 + uh2
L += ufl.inner(f1*f1, v1) * ufl.dx                          # Coupling term
L += -ufl.inner(ufl.dot(ufl.grad(v1), n), uh1_D) * ufl.ds   # Symmetry term                 
L += alpha/h*ufl.inner(uh1_D, v1) * ufl.ds                  # Penalty term
# Block 2
f2 = uh0 + uh1
L += ufl.inner(f2*f2, v2) * ufl.dx                          # Coupling term
L += -ufl.inner(ufl.dot(ufl.grad(v2), n), uh2_D) * ufl.ds   # Symmetry term                
L += alpha/h*ufl.inner(uh2_D, v2) * ufl.ds                  # Penalty term      

# Residual
F = a - L

# --- Solver setup --- 
# Nonlinear solver setup
problem = NonlinearProblem(F, uh, bcs=[])
solver = NewtonSolver(MPI.COMM_WORLD, problem)
solver.convergence_criterion = "residual"
solver.rtol = 1e-6
solver.atol = 1e-8
solver.max_it = 100
solver.report = True

# Linear solver setup
ksp = solver.krylov_solver
opts = PETSc.Options()
option_prefix = ksp.getOptionsPrefix()
opts[f"{option_prefix}ksp_type"] = "gmres"
opts[f"{option_prefix}ksp_rtol"] = 1.0e-8         # relative tolerance
opts[f"{option_prefix}ksp_atol"] = 1.0e-6         # absolute tolerance
opts[f"{option_prefix}pc_type"] = "gamg"
ksp.setFromOptions()

# --- Solve problem ---
if rank == 0:
    print("solving", flush=True)
import time
start_time = time.perf_counter()
n_iter, converged = solver.solve(uh)
end_time = time.perf_counter()
assert (converged)
if rank == 0:
    print(f"Number of interations: {n_iter:d}")
    print(f"Took {end_time - start_time:.6f} seconds")

Note that I have used Nitsche’s method to impose the Dirichlet boundary conditions weakly.

As you can check, the computational time scales poorly as I increase the number of processors and the number of DOFs, which I expect is due to solving this as a monolithic problem.

Because of the block structure of multiphysics problems, I’m thinking of building a block preconditioner using fieldsplit in PETSc. My guess is that something along the lines of this Stokes Example could work well.

However, my problem is nonlinear, so I’d have to build a nested Jacobian matrix and feed it to the KSP solver, along with a preconditioner.
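Roughly, the kind of setup I have in mind (loosely following that Stokes Example) would hand a nest operator and its index sets to a fieldsplit preconditioner. A sketch, where J_nest and P_nest are assumed to be already assembled PETSc nest matrices (e.g. from dolfinx.fem.petsc.assemble_matrix_nest on blocked forms), and the field names are arbitrary:

# Sketch only: J_nest / P_nest are assumed to be assembled MATNEST matrices
nested_IS = J_nest.getNestISs()  # (row ISs, column ISs) of the nest blocks

ksp = PETSc.KSP().create(domain.comm)
ksp.setOperators(J_nest, P_nest)
ksp.setType("gmres")

pc = ksp.getPC()
pc.setType("fieldsplit")
pc.setFieldSplitType(PETSc.PC.CompositeType.ADDITIVE)
pc.setFieldSplitIS(("u0", nested_IS[0][0]),
                   ("u1", nested_IS[0][1]),
                   ("u2", nested_IS[0][2]))
pc.setUp()

# AMG on each diagonal block
for sub_ksp in pc.getFieldSplitSubKSP():
    sub_ksp.setType("preonly")
    sub_ksp.getPC().setType("hypre")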

Is there a way of obtaining a nested Jacobian from the residual F in my minimal code using ufl.derivative, or is it necessary to write down the residual in blocks first?

As a final note, I’m using the following DOLFINx version:

DOLFINx version: 0.9.0, based on git commit 52b6ca576a5807f3a830b2de8cad6c00a6f729fc of GitHub - FEniCS/dolfinx: Next generation FEniCS problem solving environment

Thanks in advance.

I would wait for the v0.10 release, or use DOLFINx nightly (through Docker or Spack), as NonlinearProblem has been extended to nested systems there, as illustrated for instance in the diff: Use snes solver dolfinx instead of scifem by jorgensd · Pull Request #32 · METHODS-Group/ProximalGalerkin · GitHub

or in dolfinx/python/test/unit/fem/test_petsc_nonlinear_assembler.py at main · FEniCS/dolfinx · GitHub

Thank you, @dokken.

Is there a date for the v0.10 release?

We were hoping for this summer, but vacation has delayed it a bit. Hopefully soon!


Maybe this is beside the question you’re asking, but block preconditioning with fieldsplit is already very doable for mixed spaces, also for nonlinear problems. See for example:

Other resources that I found quite useful are Rafinex-external-RIFLE / FEniCSx-pctools · GitLab and ambit/src/ambit_fe/solver/preconditioner.py at master · marchirschvogel/ambit · GitHub


@Stein I agree it is doable, and both your solution and the PC-tools solution do indeed work.
I just figured it is a lot less coding if one moves onto main, as there the field splits are set up automatically if one uses blocked assembly with kind="nest".


Thank you, @Stein. I’ve been working on the approach you suggested, and I managed to reproduce my minimal example using block preconditioning with fieldsplit. In particular, I use an additive (block-diagonal) fieldsplit. This is the updated example:

from mpi4py import MPI
from petsc4py import PETSc
import numpy as np
import ufl
import basix.ufl
import dolfinx
from dolfinx import fem, default_scalar_type
from dolfinx.fem.petsc import NonlinearProblem, assemble_matrix
from dolfinx.nls.petsc import NewtonSolver
import time

rank = MPI.COMM_WORLD.rank

# --- Computational domain ---
# Mesh
mesh_size = 300
domain = dolfinx.mesh.create_unit_square(MPI.COMM_WORLD, mesh_size, mesh_size)
# Normal vector
n = ufl.FacetNormal(domain)

# --- Mixed element space ---
el_u = basix.ufl.element("Lagrange", domain.basix_cell(), 1)
el_mixed = basix.ufl.mixed_element([el_u] * 3)
W = dolfinx.fem.functionspace(domain, el_mixed)

# Test functions
dw = ufl.TestFunctions(W)
v0, v1, v2 = dw[0], dw[1], dw[2]

# Solution vector
uh = dolfinx.fem.Function(W)
uh0, uh1, uh2 = ufl.split(uh)

# --- Distributed forcing and boundary conditions ---
# Distributed forcing
f = fem.Constant(domain, default_scalar_type(-6))
# Homogeneous Dirichlet, system 0
W0 = W.sub(0)
V0, V0_to_W0 = W0.collapse()
uh0_D = fem.Function(V0)
uh0_D.x.array[:] = 0
# Homogeneous Dirichlet, system 1
W1 = W.sub(1)
V1, V1_to_W1 = W1.collapse()
uh1_D = fem.Function(V1)
uh1_D.x.array[:] = 0
# Homogeneous Dirichlet, system 2
W2 = W.sub(2)
V2, V2_to_W2 = W2.collapse()
uh2_D = fem.Function(V2)
uh2_D.x.array[:] = 0

# Penalty parameters
h = 2 * ufl.Circumradius(domain)
alpha = fem.Constant(domain, default_scalar_type(10))

# --- Weak form ---
# Bilinear form
# Block 0
a = ufl.inner(ufl.grad(uh0), ufl.grad(v0)) * ufl.dx      # grad(u),grad(v)
a += -ufl.inner(ufl.dot(ufl.grad(uh0), n), v0) * ufl.ds  # Consistency term
a += -ufl.inner(ufl.dot(ufl.grad(v0), n), uh0) * ufl.ds  # Symmetry term
a += alpha/h*ufl.inner(uh0, v0) * ufl.ds                 # Penalty term
# Block 1
a += ufl.inner(ufl.grad(uh1), ufl.grad(v1)) * ufl.dx     # grad(u),grad(v)
a += -ufl.inner(ufl.dot(ufl.grad(uh1), n), v1) * ufl.ds  # Consistency term
a += -ufl.inner(ufl.dot(ufl.grad(v1), n), uh1) * ufl.ds  # Symmetry term
a += alpha/h*ufl.inner(uh1, v1) * ufl.ds                 # Penalty term
# Block 2
a += ufl.inner(ufl.grad(uh2), ufl.grad(v2)) * ufl.dx     # grad(u),grad(v)
a += -ufl.inner(ufl.dot(ufl.grad(uh2), n), v2) * ufl.ds  # Consistency term
a += -ufl.inner(ufl.dot(ufl.grad(v2), n), uh2) * ufl.ds  # Symmetry term
a += alpha/h*ufl.inner(uh2, v2) * ufl.ds                 # Penalty term

# Linear form (note: the coupling terms below depend on uh, so the residual is nonlinear)
# Block 0
f0 = uh1 + uh2
L = ufl.inner(f, v0) * ufl.dx                               # Forcing term
L += ufl.inner(f0*f0, v0) * ufl.dx                          # Coupling term
L += -ufl.inner(ufl.dot(ufl.grad(v0), n), uh0_D) * ufl.ds   # Symmetry term                
L += alpha/h*ufl.inner(uh0_D, v0) * ufl.ds                  # Penalty term  
# Block 1
f1 = uh0 + uh2
L += ufl.inner(f1*f1, v1) * ufl.dx                          # Coupling term
L += -ufl.inner(ufl.dot(ufl.grad(v1), n), uh1_D) * ufl.ds   # Symmetry term                 
L += alpha/h*ufl.inner(uh1_D, v1) * ufl.ds                  # Penalty term
# Block 2
f2 = uh0 + uh1
L += ufl.inner(f2*f2, v2) * ufl.dx                          # Coupling term
L += -ufl.inner(ufl.dot(ufl.grad(v2), n), uh2_D) * ufl.ds   # Symmetry term                
L += alpha/h*ufl.inner(uh2_D, v2) * ufl.ds                  # Penalty term      

# Residual
F = a - L

# --- Assemble Jacobian and attach to KSP ---
Trial = ufl.TrialFunction(W)
J_form = ufl.derivative(F, uh, Trial)
J = assemble_matrix(fem.form(J_form), bcs=[])
J.assemble()

# --- Assemble block preconditioner ---
w_trial = ufl.TrialFunction(W)
du0, du1, du2 = ufl.split(w_trial)
P_form = ufl.inner(ufl.grad(du0), ufl.grad(v0)) * ufl.dx \
    + ufl.inner(ufl.grad(du1), ufl.grad(v1)) * ufl.dx \
    + ufl.inner(ufl.grad(du2), ufl.grad(v2)) * ufl.dx
P = assemble_matrix(fem.form(P_form), bcs=[])
P.assemble()

# --- Nonlinear problem and Newton solver ---
problem = NonlinearProblem(F, uh, bcs=[])
solver = NewtonSolver(MPI.COMM_WORLD, problem)
solver.convergence_criterion = "residual"
solver.rtol = 1e-6
solver.atol = 1e-8
solver.max_it = 50
solver.report = True

uh.x.array[:] = 0.0  # initial guess

# --- Programmatic PETSc FieldSplit PC ---
ksp = solver.krylov_solver

# Build globally-owned lists of indices for each field using collapse maps
W_indexmap = W.dofmap.index_map
idx0 = np.array(V0_to_W0, dtype=np.int32)
idx1 = np.array(V1_to_W1, dtype=np.int32)
idx2 = np.array(V2_to_W2, dtype=np.int32)

gidx0 = np.setdiff1d(W_indexmap.local_to_global(idx0), W_indexmap.ghosts).astype(np.int32)
gidx1 = np.setdiff1d(W_indexmap.local_to_global(idx1), W_indexmap.ghosts).astype(np.int32)
gidx2 = np.setdiff1d(W_indexmap.local_to_global(idx2), W_indexmap.ghosts).astype(np.int32)

if rank == 0:
    print("Local owned field sizes:", gidx0.size, gidx1.size, gidx2.size)
    print("Global mixed dofs:", W_indexmap.size_global)

IS0 = PETSc.IS().createGeneral(gidx0, comm=MPI.COMM_WORLD).sort()
IS1 = PETSc.IS().createGeneral(gidx1, comm=MPI.COMM_WORLD).sort()
IS2 = PETSc.IS().createGeneral(gidx2, comm=MPI.COMM_WORLD).sort()

pc = ksp.getPC()
pc.setType("fieldsplit")
pc.setFieldSplitIS(("f0", IS0), ("f1", IS1), ("f2", IS2))

# Set solver and preconditioner
ksp.setOperators(J, P)
pc.setUp()

# --- Configure sub-KSPs (preonly + hypre) ---
subs = pc.getFieldSplitSubKSP()
if rank == 0:
    print("Number of sub-KSPs:", len(subs))

for subksp in subs:
    subksp.setType("preonly")
    subpc = subksp.getPC()
    subpc.setType("hypre")
    subpc.setHYPREType("boomeramg")
    subksp.setFromOptions()

# --- Configure outer KSP ---
ksp.setType("gmres")
ksp.setTolerances(rtol=1e-8, atol=1e-6)
ksp.setFromOptions()
pc.setFromOptions()

if rank == 0:
    print("FieldSplit PC configured. Starting Newton solve...")

# --- Solve nonlinear problem ---
t0 = time.time()
n_iter, converged = solver.solve(uh)
t1 = time.time()

if not converged:
    raise RuntimeError("Newton failed to converge")

if rank == 0:
    print(f"Newton finished: linear solves (Newton iterations) = {n_iter}")
    print("Elapsed time:", t1 - t0)

# Postprocess: L2 norms of split fields
uh0_f, uh1_f, uh2_f = uh.split()
L20 = fem.assemble_scalar(fem.form(ufl.inner(uh0_f, uh0_f) * ufl.dx))
L21 = fem.assemble_scalar(fem.form(ufl.inner(uh1_f, uh1_f) * ufl.dx))
L22 = fem.assemble_scalar(fem.form(ufl.inner(uh2_f, uh2_f) * ufl.dx))
total = MPI.COMM_WORLD.allreduce(L20 + L21 + L22, op=MPI.SUM)
if rank == 0:
    print("Total L2:", total)

When I run it on a single processor, it solves the problem faster than the approach above (i.e., the monolithic GMRES + GAMG linear solver). However, it does not scale well as I increase the number of processors: I get a very modest speedup with 2 or 4 processors, and it gets slower as I increase the count further.

Do you know if I’m missing something?

Also, @dokken, is my approach similar to the upcoming NonlinearProblem kind="nest" capability?

Regarding your parallel performance: How big are the meshes that you’re trying to do parallel computations with? If your mesh isn’t big enough, increasing the number of processors will not lead to any speedup, and might actually slow down your computations due to the increase in communication overhead.

It is somewhat similar. The main difference will be that one would use ufl.MixedFunctionSpace and ufl.extract_blocks instead of basix.ufl.mixed_element. Slightly different assembly routines will be called under the hood, and you wouldn’t have to make the index sets by hand, as illustrated in

FEniCSx PCtools is also a good tool for doing the wrapping. It supports slightly more advanced splits than what a PETSc nest matrix does; see: Solver configuration at runtime — FEniCSx-pctools 0.9.0 documentation

One can of course restructure the fields in DOLFINx to make suitable splits, but I get their argument.
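For concreteness, a rough sketch of the ufl.MixedFunctionSpace / ufl.extract_blocks formulation mentioned above, written for a stripped-down residual with the same coupling structure as the example in this thread (Nitsche terms omitted; the names F_blocks, J_blocks, etc. are just illustrative, and how these block forms are then handed to the nightly NonlinearProblem / nest assembly is not shown here):

from mpi4py import MPI
import dolfinx
import ufl

domain = dolfinx.mesh.create_unit_square(MPI.COMM_WORLD, 16, 16)
V0 = dolfinx.fem.functionspace(domain, ("Lagrange", 1))
V1 = dolfinx.fem.functionspace(domain, ("Lagrange", 1))
V2 = dolfinx.fem.functionspace(domain, ("Lagrange", 1))
W = ufl.MixedFunctionSpace(V0, V1, V2)

u = [dolfinx.fem.Function(Vi) for Vi in (V0, V1, V2)]  # one Function per field
v = ufl.TestFunctions(W)
du = ufl.TrialFunctions(W)

# Stripped-down residual with the coupling f_i = (sum of the other fields)^2
blocks = []
for i in range(3):
    f_i = (u[(i + 1) % 3] + u[(i + 2) % 3]) ** 2
    blocks.append(ufl.inner(ufl.grad(u[i]), ufl.grad(v[i])) * ufl.dx
                  - ufl.inner(f_i, v[i]) * ufl.dx)
F = blocks[0] + blocks[1] + blocks[2]

# Gateaux derivative with respect to each unknown, then split into blocks
J = (ufl.derivative(F, u[0], du[0])
     + ufl.derivative(F, u[1], du[1])
     + ufl.derivative(F, u[2], du[2]))
F_blocks = ufl.extract_blocks(F)  # list: one residual form per field
J_blocks = ufl.extract_blocks(J)  # 3x3 nested list of Jacobian blocks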

Profiling steps that I would suggest are:

  • Check the DOLFINx logs to see whether solver time or assembly time is dominant. A sufficiently large problem size is indeed one factor, as @spcvanschie mentions.

  • Add a ksp.setMonitor(ksp_monitor) to check the required number of Krylov iterations per linear solve.

  • See how both change with the number of cores and the mesh size.

As a proof of concept of your preconditioning, you could change boomeramg to lu (a sketch of this follows below). You would then hope that the iteration count does not change with mesh size or number of cores.

→ If that is the case, and the computational mesh is large enough, then you’d have to fiddle with the boomeramg parameters.

→ If that is not the case, then your preconditioning approach may not be suitable. A block-diagonal preconditioner is in general not great if the block interaction is strong. You’d either need some sort of upper/lower Schur-type preconditioner, or you’d have to move your multiphysics problem from a monolithic solve to an iterative solve. Or maybe some sort of relaxation of the block interaction, reduced each Newton iteration, is an option?
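A minimal sketch of the boomeramg-to-lu proof of concept suggested above, reusing the fieldsplit pc object from the code earlier in the thread (in parallel this assumes a PETSc build with a distributed direct solver such as MUMPS):

# Proof of concept: exact solves per block instead of boomeramg, to check
# whether the outer GMRES iteration count stays flat with mesh size and cores
for subksp in pc.getFieldSplitSubKSP():
    subksp.setType("preonly")
    subpc = subksp.getPC()
    subpc.setType("lu")
    # In parallel a distributed direct solver is needed, e.g.:
    # subpc.setFactorSolverType("mumps")  # assumes PETSc built with MUMPS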

I did a deeper analysis in which I monitored both the Newton iterations and the inner linear (KSP) iterations, as @Stein suggested.

However, what I’ve just noticed from ksp.view() is that my preconditioner is not even being applied:

KSP Object: (nls_solve_) 1 MPI process
  type: preonly
  maximum iterations=10000, initial guess is zero
  tolerances: relative=1e-08, absolute=1e-06, divergence=10000.
  left preconditioning
  using NONE norm type for convergence test
PC Object: (nls_solve_) 1 MPI process
  type: lu
    out-of-place factorization
    tolerance for zero pivot 2.22045e-14
    matrix ordering: nd
    factor fill ratio given 5., needed 11.5854
      Factored matrix follows:
        Mat Object: (nls_solve_) 1 MPI process
          type: seqaij
          rows=271803, cols=271803
          package used to perform factorization: petsc
          total: nonzeros=65876901, allocated nonzeros=65876901
            using I-node routines: found 90601 nodes, limit used is 5
  linear system matrix = precond matrix:
  Mat Object: 1 MPI process
    type: seqaij
    rows=271803, cols=271803
    total: nonzeros=5686209, allocated nonzeros=5686209
    total number of mallocs used during MatSetValues calls=0
      using I-node routines: found 90607 nodes, limit used is 5

For some reason, it’s using LU factorization, which explains the bad scaling behaviour I was seeing. Do you know what I’m doing wrong?

@dokken thanks for sharing the FEniCSx PCtools, it may be useful for my problem.

That was precisely the issue raised in the post that I referenced earlier “precondition-appears-not-to-happen” :smiley: See response 3 to that post.

Something is off with the PETSc options prefix when using the NewtonSolver. You’ll have to set:

ksp.setOptionsPrefix("")

Brilliant! You indeed raised that issue in the post…
Thank you, the preconditioner is now super fast :slight_smile:

I’ll leave the solver settings below in case they’re useful for someone else:

# --- Nonlinear problem and Newton solver ---
problem = NonlinearProblem(F, uh, bcs=[])
solver = NewtonSolver(MPI.COMM_WORLD, problem)
solver.convergence_criterion = "residual"
solver.rtol = 1e-6
solver.atol = 1e-8
solver.max_it = 50
solver.report = True

uh.x.array[:] = 0.0  # initial guess

# --- Programmatic PETSc FieldSplit PC ---
ksp = solver.krylov_solver
ksp.setType("gmres")
ksp.setTolerances(rtol=1e-8, atol=1e-6)
linear_iters = []

def ksp_monitor(ksp, its, rnorm):
    if its == 0:
        # Starting a new Newton iteration
        linear_iters.append(0)
    linear_iters[-1] += 1
    print(f"[Rank {rank}] Newton step {len(linear_iters)}, "
          f"KSP iter {its}, residual = {rnorm:.3e}", flush=True)

# Attach the monitor to the outer Krylov solver
ksp.setMonitor(ksp_monitor)

# Build globally-owned lists of indices for each field using collapse maps
W_indexmap = W.dofmap.index_map
idx0 = np.array(V0_to_W0, dtype=np.int32)
idx1 = np.array(V1_to_W1, dtype=np.int32)
idx2 = np.array(V2_to_W2, dtype=np.int32)

gidx0 = np.setdiff1d(W_indexmap.local_to_global(idx0), W_indexmap.ghosts).astype(np.int32)
gidx1 = np.setdiff1d(W_indexmap.local_to_global(idx1), W_indexmap.ghosts).astype(np.int32)
gidx2 = np.setdiff1d(W_indexmap.local_to_global(idx2), W_indexmap.ghosts).astype(np.int32)

if rank == 0:
    print("Local owned field sizes:", gidx0.size, gidx1.size, gidx2.size, flush=True)
    print("Global mixed dofs:", W_indexmap.size_global, flush=True)

IS0 = PETSc.IS().createGeneral(gidx0, comm=MPI.COMM_WORLD).sort()
IS1 = PETSc.IS().createGeneral(gidx1, comm=MPI.COMM_WORLD).sort()
IS2 = PETSc.IS().createGeneral(gidx2, comm=MPI.COMM_WORLD).sort()

# Set solver operators (J is the Jacobian matrix assembled earlier in the script)
ksp.setOperators(J, J)

pc = ksp.getPC()
pc.setType("fieldsplit")
pc.setFieldSplitIS(("f0", IS0), ("f1", IS1), ("f2", IS2))
pc.setFieldSplitType(PETSc.PC.CompositeType.ADDITIVE)
pc.setReusePreconditioner(True)

# --- Configure sub-KSPs (preonly + hypre) ---
subs = pc.getFieldSplitSubKSP()
if rank == 0:
    print("Number of sub-KSPs:", len(subs), flush=True)

for subksp in subs:
    subksp.setType("preonly")
    subpc = subksp.getPC()
    subpc.setType("hypre")
    subpc.setHYPREType("boomeramg")
    #subksp.setFromOptions()

ksp.setOptionsPrefix("")
pc.setUp()

# --- Configure outer KSP ---
ksp.setFromOptions()
pc.setFromOptions()
