Speeding up Navier–Stokes code

Hey guys, I want to carry out fluid flow simulations in FEniCS. Here are the version specs:

DOLFINx version: 0.10.0.post2
PETSc version: (3, 15, 5)
petsc4py version: 3.15.1
Basix version: 0.10.0
UFL version: 2025.2.0.post0
MPI Vendor: ('Open MPI', (4, 1, 2))

To begin with, I am trying to set up simple benchmarks. I am a complete noob at FEniCS at the moment and have little idea of its inner workings. Presently, I am translating my MATLAB and non-PETSc C++ FEM codes into it. Below is a 3D driven-cavity example using GAMG. The Krylov and Picard iterations are converging as expected. I am unsure whether the code reassembles the non-convective terms inside the loop, or whether it recognises the repeated work automatically. I tried explicitly assembling the linear parts separately and adding them in, but I am running into PETSc issues: the nonzero structure of the matrix seems to change during assembly, so the pre-assembled linear parts do not simply add onto the freshly assembled nonlinear pieces. Are there any tutorials, examples, or suggestions that could help me run the code faster? Thanks

from mpi4py import MPI
from petsc4py import PETSc
import numpy as np
import os
from pathlib import Path

# DOLFINx Imports
import dolfinx
from dolfinx.fem import (
    Constant, Function, functionspace, dirichletbc, 
    locate_dofs_geometrical, form, extract_function_spaces
)
from dolfinx.fem.petsc import (
    create_matrix, create_vector, assemble_matrix, assemble_vector, 
    apply_lifting, set_bc
)
from dolfinx.mesh import create_unit_cube, CellType
from dolfinx.io import VTKFile, XDMFFile
from basix.ufl import element, mixed_element
from ufl import (
    TestFunctions, TrialFunctions, split, dot, div, grad, dx, inner, 
    sqrt, CellDiameter, lhs, rhs
)

# -------------------------------------------------------------------
# 1. Simulation Parameters
# -------------------------------------------------------------------
N = 16                   # Mesh resolution (N x N x N cells)
Re = 1000.0              # Reynolds number
T_end = 10.0             # Final simulation time

# --- IO Controls ---
output_freq = 5          # Write results every output_freq steps
results_folder = Path("results_3d_cavity_final")
checkpoint_xdmf_path = results_folder / "checkpoint.xdmf"
checkpoint_meta_path = results_folder / "checkpoint_meta.npy"

# --- Adaptive Time Stepping ---
Target_CFL = 1           # Target CFL number for the adaptive time step
Max_dt = 0.1             # Largest allowed time step
Min_dt = 1e-5            # Smallest allowed time step
WARMUP_STEPS = 3
WARMUP_DT_START = 1e-5
# Ramp the lid velocity over the first 0.1 seconds to prevent shock
RAMP_DURATION = 0.1 

# --- Solver Tolerances ---
PROD_MAX_PICARD = 10     # Max Picard iterations per time step
PROD_MAX_KSP = 100       # Max Krylov iterations per linear solve
PROD_KSP_TOL = 1e-8      # Relative tolerance for the Krylov solver

# Convergence Criteria
STAGNATION_TOL = 1e-4    # Relative change between successive Picard iterates
NONLIN_RES_TOL = 1e-4    # Norm of the true nonlinear residual

picard_relaxation = 1    # 1.0 = no under-relaxation
ENABLE_SUPG = 1.0        # Set to 0.0 to disable SUPG stabilization
ENABLE_PSPG = 1.0        # Set to 0.0 to disable PSPG stabilization

comm = MPI.COMM_WORLD

# -------------------------------------------------------------------
# 2. Mesh & Functions
# -------------------------------------------------------------------
if comm.rank == 0:
    print(f"Generating 3D Mesh ({N}x{N}x{N})...", flush=True)

mesh = create_unit_cube(comm, N, N, N, cell_type=CellType.tetrahedron)
h_min = 1.0 / N          # Uniform cell size of the structured mesh
dt_init = 0.5 * h_min    # Time step reached at the end of the warm-up

# Spaces
P2_v = element("Lagrange", mesh.basix_cell(), 2, shape=(mesh.geometry.dim,))
P1_s = element("Lagrange", mesh.basix_cell(), 1)
Mixed_Elem = mixed_element([P2_v, P1_s])
W = functionspace(mesh, Mixed_Elem)

# Functions
w = Function(W)           # Current solution (output of each linear solve)
w_k = Function(W)         # Previous Picard iterate (linearization point)
w0 = Function(W)          # Solution at time step n
w00 = Function(W)         # Solution at time step n-1 (for BDF2)

(u_trial, p_trial) = TrialFunctions(W)
(v, q) = TestFunctions(W)

u_k, p_k = split(w_k)     # Linearization velocity/pressure
u0, p0 = split(w0)
u00, p00 = split(w00)

# Physical Constants
rho = 1.0
mu = 1.0 / Re
nu = Constant(mesh, PETSc.ScalarType(mu / rho))
dt = Constant(mesh, PETSc.ScalarType(WARMUP_DT_START)) 
f_body = Constant(mesh, PETSc.ScalarType((0.0, 0.0, 0.0)))

# -------------------------------------------------------------------
# 3. Boundary Conditions
# -------------------------------------------------------------------
V_sub, _ = W.sub(0).collapse()
Q_sub, _ = W.sub(1).collapse()

# A. Wall BC (Value = 0.0)
u_walls = Function(V_sub)
u_walls.x.array[:] = 0.0
dofs_walls = locate_dofs_geometrical(
    (W.sub(0), V_sub),
    lambda x: np.isclose(x[0], 0) | np.isclose(x[0], 1)
            | np.isclose(x[2], 0) | np.isclose(x[2], 1)
            | np.isclose(x[1], 0))
bc_walls = dirichletbc(u_walls, dofs_walls, W.sub(0))

# B. Lid BC (Value = Ramped from 0.0 to 1.0)
u_lid = Function(V_sub)
u_lid.interpolate(lambda x: np.vstack((np.ones_like(x[0]), np.zeros_like(x[0]), np.zeros_like(x[0]))))
dofs_lid = locate_dofs_geometrical((W.sub(0), V_sub), lambda x: np.isclose(x[1], 1.0))
bc_lid = dirichletbc(u_lid, dofs_lid, W.sub(0))

# C. Pressure Pin (Value = 0.0)
dofs_p = locate_dofs_geometrical((W.sub(1), Q_sub), lambda x: (np.isclose(x[0],0)&np.isclose(x[1],0)&np.isclose(x[2],0)))
bc_p = dirichletbc(Function(Q_sub), dofs_p, W.sub(1))

# --- REAL BCs (Applied to System) ---
bcs = [bc_lid, bc_walls, bc_p]

# --- ZERO BCs (Applied ONLY to Residual Vector check) ---
# We need a BC that targets the Lid DOFs but forces them to 0.0
u_zero_func = Function(V_sub)
u_zero_func.x.array[:] = 0.0
bc_lid_zero = dirichletbc(u_zero_func, dofs_lid, W.sub(0))

# The walls and pin are already 0.0, so we can reuse them.
# This list is used solely to zero out the "Reaction Forces" in the residual check.
bcs_zero = [bc_lid_zero, bc_walls, bc_p]

# -------------------------------------------------------------------
# 4. Form Helper Functions
# -------------------------------------------------------------------
h = CellDiameter(mesh)

def get_tau(u_vec, dt_val):
    u_mag = sqrt(dot(u_vec, u_vec) + 1e-10)
    return 1.0 / sqrt( (2.0/dt_val)**2 + (2.0*u_mag/h)**2 + (4.0*nu/h**2)**2 )

# -------------------------------------------------------------------
# 5. Linearized System Forms
# -------------------------------------------------------------------
tau_lin = get_tau(u_k, dt)

# --- BDF1 Linear ---
dt_inv = 1.0/dt
mom_lhs_bdf1 = dt_inv * u_trial + dot(grad(u_trial),u_k) + grad(p_trial)
mom_rhs_bdf1 = f_body + dt_inv * u0

F_lin_bdf1 = inner(dt_inv * u_trial, v)*dx \
           + inner(dot(grad(u_trial),u_k), v)*dx \
           + nu * inner(grad(u_trial), grad(v))*dx \
           - inner(p_trial, div(v))*dx + inner(div(u_trial), q)*dx \
           - inner(f_body + dt_inv * u0, v)*dx

F_lin_bdf1 += ENABLE_SUPG * inner(mom_lhs_bdf1 - mom_rhs_bdf1, tau_lin * dot(grad(v),u_k)) * dx
F_lin_bdf1 += ENABLE_PSPG * inner(mom_lhs_bdf1 - mom_rhs_bdf1, tau_lin * grad(q)) * dx

# --- BDF2 Linear ---
bdf2_fac = 1.0 / (2.0*dt)
mom_lhs_bdf2 = 3.0*bdf2_fac * u_trial + dot(grad(u_trial),u_k) + grad(p_trial)
mom_rhs_bdf2 = f_body + bdf2_fac * (4.0*u0 - u00)

F_lin_bdf2 = inner(3.0*bdf2_fac * u_trial, v)*dx \
           + inner(dot(grad(u_trial),u_k), v)*dx \
           + nu * inner(grad(u_trial), grad(v))*dx \
           - inner(p_trial, div(v))*dx + inner(div(u_trial), q)*dx \
           - inner(f_body + bdf2_fac * (4.0*u0 - u00), v)*dx

F_lin_bdf2 += ENABLE_SUPG * inner(mom_lhs_bdf2 - mom_rhs_bdf2, tau_lin * dot(grad(v),u_k)) * dx
F_lin_bdf2 += ENABLE_PSPG * inner(mom_lhs_bdf2 - mom_rhs_bdf2, tau_lin * grad(q)) * dx

# Use standard 'form' and imported 'lhs/rhs'
a_bdf1, L_bdf1 = form(lhs(F_lin_bdf1)), form(rhs(F_lin_bdf1))
a_bdf2, L_bdf2 = form(lhs(F_lin_bdf2)), form(rhs(F_lin_bdf2))

# -------------------------------------------------------------------
# 6. NON-LINEAR Residual Forms
# -------------------------------------------------------------------
u_curr, p_curr = split(w)
tau_nonlin = get_tau(u_curr, dt)

# --- BDF1 ---
mom_strong_bdf1 = dt_inv*(u_curr - u0) + dot(grad(u_curr),u_curr) + grad(p_curr) - f_body
R_bdf1 = inner(dt_inv*(u_curr - u0), v)*dx + inner(dot(grad(u_curr),u_curr), v)*dx + nu * inner(grad(u_curr), grad(v))*dx \
       - inner(p_curr, div(v))*dx + inner(div(u_curr), q)*dx - inner(f_body, v)*dx
R_bdf1 += ENABLE_SUPG * inner(mom_strong_bdf1, tau_nonlin * dot(grad(v),u_curr)) * dx
R_bdf1 += ENABLE_PSPG * inner(mom_strong_bdf1, tau_nonlin * grad(q)) * dx

# --- BDF2 ---
mom_strong_bdf2 = bdf2_fac*(3.0*u_curr - 4.0*u0 + u00) + dot(grad(u_curr),u_curr) + grad(p_curr) - f_body
R_bdf2 = inner(bdf2_fac*(3.0*u_curr - 4.0*u0 + u00), v)*dx + inner(dot(grad(u_curr),u_curr), v)*dx + nu * inner(grad(u_curr), grad(v))*dx \
       - inner(p_curr, div(v))*dx + inner(div(u_curr), q)*dx - inner(f_body, v)*dx
R_bdf2 += ENABLE_SUPG * inner(mom_strong_bdf2, tau_nonlin * dot(grad(v),u_curr)) * dx
R_bdf2 += ENABLE_PSPG * inner(mom_strong_bdf2, tau_nonlin * grad(q)) * dx

Res_Form_BDF1 = form(R_bdf1)
Res_Form_BDF2 = form(R_bdf2)

# -------------------------------------------------------------------
# 7. Solver Setup
# -------------------------------------------------------------------
A = create_matrix(a_bdf1)   # Reused for both BDF1 and BDF2 (identical sparsity)
b = create_vector(extract_function_spaces(L_bdf1))
res_vector = create_vector(extract_function_spaces(L_bdf1))

ksp = PETSc.KSP().create(mesh.comm)
ksp.setOperators(A)
ksp.setType("gmres")
ksp.setTolerances(rtol=PROD_KSP_TOL, max_it=PROD_MAX_KSP)
pc = ksp.getPC()
pc.setType("gamg") 
pc.setGAMGType("agg")

opts = PETSc.Options()
opts["pc_gamg_sym_graph"] = True       # Fixes "un-symmetric graph" error
opts["pc_gamg_square_graph"] = 1       # Improves robustness for non-symmetric/convective systems
opts["pc_gamg_agg_nsmooths"] = 1       # (Optional) Smoothens aggregation
ksp.setFromOptions()

# IO
results_folder.mkdir(exist_ok=True, parents=True)
file_u = VTKFile(comm, results_folder / "velocity.pvd", "w")
file_s = VTKFile(comm, results_folder / "pressure.pvd", "w")
u_vis = Function(V_sub, name="Velocity")
p_vis = Function(Q_sub, name="Pressure")

# -------------------------------------------------------------------
# 8. Optimized Time Loop
# -------------------------------------------------------------------
t = 0.0
step = 0

# Helper to scale lid velocity during the start-up ramp
def update_lid_bc(current_time):
    scale = min(current_time / RAMP_DURATION, 1.0)
    # Update the Function used by the real lid BC in-place
    u_lid.interpolate(lambda x: np.vstack((np.full_like(x[0], scale), np.zeros_like(x[0]), np.zeros_like(x[0]))))
    u_lid.x.scatter_forward()

if comm.rank == 0:
    print(f"{'Step':<5} | {'Time':<7} | {'dt':<9} | {'Mode':<6} | {'Picard':<6} | {'KSP':<4} | {'Diff(Rel)':<10} | {'NL-Res(Abs)':<11}", flush=True)

while t < T_end:
    step += 1
    
    # 1. Update Time Step & Mode
    if step <= WARMUP_STEPS:
        current_a, current_L = a_bdf1, L_bdf1
        current_Res_Form = Res_Form_BDF1
        ramp = step / WARMUP_STEPS
        dt.value = WARMUP_DT_START + ramp * (dt_init - WARMUP_DT_START)
        min_picard = 1
        solver_mode = "Warm"
    else:
        current_a, current_L = a_bdf2, L_bdf2
        current_Res_Form = Res_Form_BDF2
        
        # CFL Update
        u_vis.interpolate(w.sub(0))
        u_flat = u_vis.x.array.reshape((-1, 3))
        u_max = comm.allreduce(np.max(np.linalg.norm(u_flat, axis=1)), op=MPI.MAX)
        dt.value = max(min(Target_CFL * h_min / max(u_max, 1e-6), Max_dt), Min_dt)
        min_picard = 2
        solver_mode = "BDF2"

    t += dt.value
    
    # 2. Ramp BCs
    update_lid_bc(t)
    
    # Predictor
    w_k.x.array[:] = w0.x.array[:]
    w_k.x.scatter_forward() 
    
    rel_diff = 1.0
    nonlin_res_norm = 1.0
    
    # --- Picard Loop ---
    for i in range(PROD_MAX_PICARD):
        # A. Linear Solve
        A.zeroEntries()
        assemble_matrix(A, current_a, bcs=bcs)
        A.assemble()
        
        with b.localForm() as loc_b: loc_b.set(0)
        assemble_vector(b, current_L)
        apply_lifting(b, [current_a], [bcs])
        b.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE)
        set_bc(b, bcs)
        
        ksp.solve(b, w.x.petsc_vec)
        w.x.scatter_forward()
        
        # B. Calculate Relative Update Difference
        diff_sq = comm.allreduce(np.sum((w.x.array - w_k.x.array)**2), op=MPI.SUM)
        norm_sq = comm.allreduce(np.sum(w.x.array**2), op=MPI.SUM)
        rel_diff = np.sqrt(diff_sq) / (np.sqrt(norm_sq) + 1e-10)
        
        # C. Calculate True Non-Linear Residual
        # 1. Assemble the vector (Integral of the Strong Form)
        with res_vector.localForm() as loc_r: loc_r.set(0)
        assemble_vector(res_vector, current_Res_Form)
        
        # 2. Ghost Update (Sum contributions from parallel partitions)
        res_vector.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE)
        
        # 3. Force boundary residuals to zero
        set_bc(res_vector, bcs_zero)
        
        nonlin_res_norm = res_vector.norm()
        
        # D. Update Guess
        w_k.x.array[:] = picard_relaxation * w.x.array[:] + (1.0 - picard_relaxation) * w_k.x.array[:]
        w_k.x.scatter_forward()
        
        # E. Check Convergence
        if i >= (min_picard - 1):
            if (rel_diff < STAGNATION_TOL) and (nonlin_res_norm < NONLIN_RES_TOL):
                break
            
            # Stagnation Safety (If solution stops changing, we trust it)
            if rel_diff < 1e-9: 
                break
    
    # Update History
    w00.x.array[:] = w0.x.array[:]
    w0.x.array[:] = w.x.array[:]
    
    if comm.rank == 0:
        print(f"{step:<5} | {t:<7.4f} | {dt.value:<9.2e} | {solver_mode:<6} | {i+1:<6} | {ksp.getIterationNumber():<4} | {rel_diff:<10.2e} | {nonlin_res_norm:<11.2e}", flush=True)

    # Output
    if step % output_freq == 0 or step == WARMUP_STEPS:
        p_vis.interpolate(w.sub(1))
        u_vis.interpolate(w.sub(0))
        file_u.write_function([u_vis], t)
        file_s.write_function([p_vis], t)
        
        with XDMFFile(comm, checkpoint_xdmf_path, "w") as xdmf:
            xdmf.write_mesh(mesh)
            u_out = Function(functionspace(mesh, ("Lagrange", 1, (mesh.geometry.dim,))))
            u_out.interpolate(w.sub(0))
            u_out.name = "velocity"
            xdmf.write_function(u_out)
            p_out = w.sub(1).collapse()
            p_out.name = "pressure"
            xdmf.write_function(p_out)
        
        if comm.rank == 0:
            np.save(checkpoint_meta_path, {'t': t, 'step': step, 'dt': dt.value})

if comm.rank == 0:
    print("Done.", flush=True)

This is a nice resource: Efficient usage of the Unified Form Language — FEniCS Workshop

Looking briefly at your code, I don’t see any glaringly obvious performance problems. You’re nicely defining your forms outside of your loops, have an output_freq flag, etc.

I’d do a little profiling: wrap a timer around the solve step and confirm that >95% of your time is spent there (a minimal timing sketch follows after this list). If that is not the case, then there may be some FEniCS code to optimize. If it is, then you can consider one of three things:

  1. Is the number of nonlinear iterations perhaps excessive? For mild physical/discrete parameters (time step, Reynolds number, geometrical complexity, mesh regularity/refinement), I’d expect you’d need around 4 iterations. If you need significantly more…

1-a: This could be because you’re using home-made Picard iterations as opposed to PETSc’s Scalable Nonlinear Equations Solver (SNES), which is wrapped in FEniCS. With the latter you can use Newton with line search, which likely reduces the iteration count; a rough sketch follows below.
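
For reference, here is a rough sketch of what driving SNES from dolfinx can look like, following the residual/Jacobian callback pattern used in the DOLFINx test suite and reusing the forms from the posted code. This is a sketch rather than an official API, and details such as the apply_lifting argument order may differ between versions:

# Sketch: Newton with line search via petsc4py's SNES, reusing the forms above.
from ufl import derivative

J_newton = form(derivative(R_bdf1, w))   # exact Jacobian of the BDF1 residual
A_newton = create_matrix(J_newton)
b_newton = w.x.petsc_vec.duplicate()

class SNESProblem:
    def __init__(self, F_form, J_form, soln, bcs):
        self.F_form, self.J_form, self.soln, self.bcs = F_form, J_form, soln, bcs

    def F(self, snes, x, b):
        # Copy the current SNES iterate into the Function before assembling
        x.ghostUpdate(addv=PETSc.InsertMode.INSERT, mode=PETSc.ScatterMode.FORWARD)
        x.copy(self.soln.x.petsc_vec)
        self.soln.x.scatter_forward()
        with b.localForm() as loc:
            loc.set(0.0)
        assemble_vector(b, self.F_form)
        apply_lifting(b, [self.J_form], [self.bcs], [x], -1.0)
        b.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE)
        set_bc(b, self.bcs, x, -1.0)

    def J(self, snes, x, A, P):
        A.zeroEntries()
        assemble_matrix(A, self.J_form, bcs=self.bcs)
        A.assemble()

problem = SNESProblem(Res_Form_BDF1, J_newton, w, bcs)
snes = PETSc.SNES().create(mesh.comm)
snes.setType("newtonls")              # Newton with line search
snes.setFunction(problem.F, b_newton)
snes.setJacobian(problem.J, A_newton)
snes.setFromOptions()
snes.solve(None, w.x.petsc_vec)
w.x.scatter_forward()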

1-b: Or because your weak form may not be entirely appropriate. I see you’re using Taylor-Hood elements and SUPG, so the basic ingredients are there. I’d recommend making the stabilization parameter in SUPG explicit, so that non-linearities don’t proliferate.
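
In the posted code that could be as simple as freezing tau at the old time level (a sketch using the existing get_tau helper and u0):

# Sketch: evaluate the stabilization parameter at the previous time step so it
# stays fixed during the nonlinear iterations
tau_expl = get_tau(u0, dt)   # instead of get_tau(u_k, dt) / get_tau(u_curr, dt)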

  2. If each solve is just taking too long, then you’ll have to go back to the drawing board for preconditioning your iterative solver. Indeed, I’d not expect GAMG to perform all that well for saddle-point problems. See FEniCSx4Flow / Projects / Preconditioning · GitLab for some snippets of preconditioning code, and the README for some theory. This is quite a rabbit hole though…
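
As for the timing sketch promised above: a minimal pattern using MPI wall-clock time around the posted ksp.solve call (dolfinx also ships timers, but this is the least intrusive):

# Sketch: measure the fraction of wall time spent in the Krylov solve
from mpi4py import MPI

t_solve = 0.0
t_total = -MPI.Wtime()

# ... inside the time/Picard loops, wrap only the linear solve:
t0 = MPI.Wtime()
ksp.solve(b, w.x.petsc_vec)
t_solve += MPI.Wtime() - t0
# ...

t_total += MPI.Wtime()
if comm.rank == 0:
    print(f"KSP solve: {t_solve:.2f} s of {t_total:.2f} s total "
          f"({t_solve / max(t_total, 1e-12):.0%})")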

Hi Stein, thanks for your detailed answer.

The Picard/Newton iterations and GAMG iterations are as expected: about 5-6 Picard iterations for Re = 2000 at CFL 10, and 10 or fewer GAMG iterations per Picard iteration.

It seems I pasted in the P2-P1 Taylor-Hood version; I have several variants of this code. And yes, GAMG will indeed not work on a P2-P1 formulation as is: it is the PSPG term that fills the otherwise-null pressure block with a pressure-Poisson-like operator. Additionally, SUPG helps cluster the eigenmodes; overall, the two make the matrix invertible and much friendlier to AMG.

It seems RAM is the big issue: the RAM consumed in a P1-P1 case, for example, is also very high, almost 10 times the requirement of a comparable FVM code. The deal.II matrix-free implementation is faster, but I find FEniCS has some benefits, mainly the ease of setup. If the code isn’t the issue, I believe I will eventually have to go back to deal.II.

Those are indeed the numbers I’d hope for; I don’t think you can do much better than that. Did you confirm that most of the execution time is spent in those ~10 solves? That’s something that’s going to be a challenge to improve on (maybe BoomerAMG? I don’t know how its performance compares).
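
For what it’s worth, trying BoomerAMG in the posted code should only require swapping the PC (assuming your PETSc build includes hypre; otherwise this errors out):

# Sketch: hypre/BoomerAMG in place of GAMG
pc = ksp.getPC()
pc.setType("hypre")
pc.setHYPREType("boomeramg")
ksp.setFromOptions()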

Hmm, this is comparing apples and oranges a bit, though. Compared to FVM on the same grid, FEM is definitely more expensive, but likely also considerably more accurate. That’s often difficult to appreciate; the practitioner’s perspective is to just throw the most refined mesh possible at a problem.

Of course, comparing to a matrix-free implementation is quite tricky on multiple fronts. This is possible in FEniCS too (Matrix-free conjugate gradient solver for the Poisson equation — DOLFINx 0.10.0.post1 documentation)
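
The core pattern of that demo is roughly the sketch below: a petsc4py shell matrix whose mult assembles the action of the bilinear form instead of the matrix itself. Here V, a_ufl, ksp and comm are placeholders for your own space, UFL form, solver and communicator, and boundary-condition handling is omitted:

# Sketch: matrix-free operator via a PETSc shell ("Python") matrix
import ufl
from petsc4py import PETSc
from dolfinx.fem import Function, form
from dolfinx.fem.petsc import assemble_vector

u_in = Function(V)                           # receives the Krylov input vector
action_form = form(ufl.action(a_ufl, u_in))  # A*x expressed as a linear form

class MatFreeOperator:
    def mult(self, mat, x, y):
        # y = A x, computed by assembling the action of the form
        x.copy(u_in.x.petsc_vec)
        u_in.x.scatter_forward()
        with y.localForm() as y_loc:
            y_loc.set(0.0)
        assemble_vector(y, action_form)
        y.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE)

n_local = u_in.x.petsc_vec.getLocalSize()
n_global = u_in.x.petsc_vec.getSize()
A_shell = PETSc.Mat().createPython(((n_local, n_global), (n_local, n_global)),
                                   MatFreeOperator(), comm=comm)
A_shell.setUp()
ksp.setOperators(A_shell)   # Krylov methods only ever need mat-vec products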

With respect to matrix-free solvers, one can consider the PR Add matrix free demo using PETSc by jorgensd · Pull Request #3938 · FEniCS/dolfinx · GitHub, which has a demo showing how to use DOLFINx with a matrix-free solver wrapped in PETSc.

Thanks Stein,
I wasn’t aware of the matrix-free support in FEniCS. In deal.II this took 3x the RAM and 2x the time of FVM for my industry problems, which is a workable sweet spot. I find it quite flexible and certainly more accurate, as you mentioned. I will also profile the code in more depth, as you suggested.

That said, I thank you for the help, sir :folded_hands:. I didn’t recognise it was the same person whose YouTube lectures helped me so much :folded_hands:. I am really honoured and privileged to meet you, if nothing else, on a chat.

Thanks Dokken, that clears it up, I suppose :+1:. I have what I was looking for :slight_smile:

If you end up porting your code to matrix-free, then let us know your findings! I’d be very curious.

Hello Professor,
I did try a matrix-free approach in FEniCS, but it seems I have to use AMG, which blows up the RAM anyway. Firedrake has very similar UFL but built-in ‘matfree’ and ‘MeshHierarchy’ support, and it is easy to port FEniCS code to Firedrake. A geometric multigrid with a monolithic semi-implicit approach, at 1 M DOFs with P1-P1 tets on a unit cube, lands in the zone of 6 GB RAM on 16 cores, which is pretty decent, since operator-split OpenFOAM takes about 2.3 GB RAM with a hex mesh (though it is using AMG). Firedrake with AMG lands at 15 GB for the same problem.
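
For anyone following along, the Firedrake setup I mean looks roughly like the sketch below (from memory; the level-smoother options follow Firedrake’s matrix-free documentation and are illustrative, not tuned):

# Sketch: geometric multigrid + matrix-free in Firedrake
from firedrake import *

base = UnitCubeMesh(16, 16, 16)
hierarchy = MeshHierarchy(base, 2)   # two uniform refinements for GMG
mesh = hierarchy[-1]

# ... build W, the nonlinear residual F and bcs with the same UFL as in FEniCS ...

solve(F == 0, w, bcs=bcs, solver_parameters={
    "mat_type": "matfree",                                # no assembled global matrix
    "ksp_type": "fgmres",
    "pc_type": "mg",                                      # GMG over the hierarchy
    "mg_levels_ksp_type": "chebyshev",
    "mg_levels_pc_type": "python",
    "mg_levels_pc_python_type": "firedrake.AssembledPC",  # assemble level operators only
    "mg_levels_assembled_pc_type": "sor",
})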

I am not sure about the differences in the core architecture of FEniCS and Firedrake, being a beginner, since both use the same UFL. Does FEniCS have a ready-made function that prepares a mesh hierarchy for geometric multigrid? I am sure someone must have implemented it.

I actually believe that it does not… I believe a plan was in place, but it did not get completed.

In any case, thanks for making your code openly accessible! I’ll be sure to study your implementation at some point.