tabulate_tensor_integral becomes a performance hotspot when a simulation moves from 2D to 3D

The Intel VTune profiler was used to measure the performance of FEniCSx programs. In the 2D cases the cost related to the linear solver dominates the simulation, as expected, while in 3D tabulate_tensor_integral becomes dominant. This is a common situation across different codes.

Questions:

  1. Why does this happen? In 3D it is not the linear solve but the integration (assembly) step that takes most of the time.
  2. Is it possible to speed up this part of the 3D simulation?

To demonstrate this, I adapted the official Navier–Stokes demo (Divergence conforming discontinuous Galerkin method for the Navier–Stokes equations — DOLFINx 0.9.0 documentation) and attached it at the end. The code can be run in either 2D or 3D by setting ndims = 2 or 3. The mesh resolution is 64^2 (= 4,096) in 2D and 16^3 (= 4,096) in 3D, so the problem sizes are comparable. tabulate_tensor_integral accounts for 3.8% and roughly 30% of the total run time, respectively.

To run the v0.9.0 code with and without vtune:

vtune -collect hotspots -knob sampling-mode=hw -knob enable-stack-collection=true numactl --physcpubind=1 python demo_navier-stokes.py

python demo_navier-stokes.py

The 2D and 3D VTune profiling results and the MWE are attached below.

# demo_navier-stokes.py

import os
os.environ["OMP_NUM_THREADS"] = "1"

from mpi4py import MPI

import numpy as np

from dolfinx import default_real_type, fem, io, mesh
from dolfinx.fem.petsc import assemble_matrix_block, assemble_vector_block
from ufl import (
    CellDiameter,
    FacetNormal,
    MixedFunctionSpace,
    TestFunctions,
    TrialFunctions,
    avg,
    conditional,
    div,
    dot,
    dS,
    ds,
    dx,
    extract_blocks,
    grad,
    gt,
    inner,
    outer,
)

try:
    from petsc4py import PETSc

    import dolfinx

    if not dolfinx.has_petsc:
        print("This demo requires DOLFINx to be compiled with PETSc enabled.")
        exit(0)
except ModuleNotFoundError:
    print("This demo requires petsc4py.")
    exit(0)


if np.issubdtype(PETSc.ScalarType, np.complexfloating):  # type: ignore
    print("Demo should only be executed with DOLFINx real mode")
    exit(0)


# +
def norm_L2(comm, v):
    """Compute the L2(Ω)-norm of v"""
    return np.sqrt(comm.allreduce(fem.assemble_scalar(fem.form(inner(v, v) * dx)), op=MPI.SUM))


def domain_average(msh, v):
    """Compute the average of a function over the domain"""
    vol = msh.comm.allreduce(
        fem.assemble_scalar(fem.form(fem.Constant(msh, default_real_type(1.0)) * dx)), op=MPI.SUM
    )
    return (1 / vol) * msh.comm.allreduce(fem.assemble_scalar(fem.form(v * dx)), op=MPI.SUM)


def u_e_expr(x):
    """Expression for the exact velocity solution to Kovasznay flow"""
    result = [
        1 - np.exp((Re / 2 - np.sqrt(Re**2 / 4 + 4 * np.pi**2)) * x[0])
        * np.cos(2 * np.pi * x[1]),
        (Re / 2 - np.sqrt(Re**2 / 4 + 4 * np.pi**2))
        / (2 * np.pi)
        * np.exp((Re / 2 - np.sqrt(Re**2 / 4 + 4 * np.pi**2)) * x[0])
        * np.sin(2 * np.pi * x[1]),
    ]
    if ndims == 3:
        result.append(0 * x[2])

    return np.vstack(result)


def p_e_expr(x):
    """Expression for the exact pressure solution to Kovasznay flow"""
    return (1 / 2) * (1 - np.exp(2 * (Re / 2 - np.sqrt(Re**2 / 4 + 4 * np.pi**2)) * x[0]))


def f_expr(x):
    """Expression for the applied force (zero in this demo, with ndims components)"""
    return np.zeros((ndims, x.shape[1]))



num_time_steps = 3
t_end = 10
Re = 25  # Reynolds Number
k = 1  # Polynomial degree
vis = False  # if to visualize the result

ndims = 2
if ndims == 2:
    n = 64
    msh = mesh.create_unit_square(MPI.COMM_WORLD, n, n)
elif ndims == 3:
    n = 16
    msh = mesh.create_unit_cube(MPI.COMM_WORLD, n, n, n)
    
# Function spaces for the velocity and for the pressure
V = fem.functionspace(msh, ("Raviart-Thomas", k + 1))
Q = fem.functionspace(msh, ("Discontinuous Lagrange", k))
VQ = MixedFunctionSpace(V, Q)

# Function space for visualising the velocity field
gdim = msh.geometry.dim
W = fem.functionspace(msh, ("Discontinuous Lagrange", k + 1, (gdim,)))

# Define trial and test functions
u, p = TrialFunctions(VQ)
v, q = TestFunctions(VQ)

delta_t = fem.Constant(msh, default_real_type(t_end / num_time_steps))
alpha = fem.Constant(msh, default_real_type(6.0 * k**2))

h = CellDiameter(msh)
n = FacetNormal(msh)

def jump(phi, n):
    return outer(phi("+"), n("+")) + outer(phi("-"), n("-"))

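# Bilinear form for the Stokes operator: viscous term with symmetric interior
# penalty DG terms on interior and exterior facets; the pressure/velocity
# coupling terms are subtracted below.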
a = (1.0 / Re) * (
    inner(grad(u), grad(v)) * dx
    - inner(avg(grad(u)), jump(v, n)) * dS
    - inner(jump(u, n), avg(grad(v))) * dS
    + (alpha / avg(h)) * inner(jump(u, n), jump(v, n)) * dS
    - inner(grad(u), outer(v, n)) * ds
    - inner(outer(u, n), grad(v)) * ds
    + (alpha / h) * inner(outer(u, n), outer(v, n)) * ds
)
a -= inner(p, div(v)) * dx
a -= inner(div(u), q) * dx

a_blocked = fem.form(extract_blocks(a))

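# Linear form: body force plus boundary terms involving the Dirichlet velocity data u_D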
f = fem.Function(W)
u_D = fem.Function(V)
u_D.interpolate(u_e_expr)
L = inner(f, v) * dx + (1 / Re) * (
    -inner(outer(u_D, n), grad(v)) * ds + (alpha / h) * inner(outer(u_D, n), outer(v, n)) * ds
)
L += inner(fem.Constant(msh, default_real_type(0.0)), q) * dx
L_blocked = fem.form(extract_blocks(L))

# Boundary conditions
boundary_facets = mesh.exterior_facet_indices(msh.topology)
boundary_vel_dofs = fem.locate_dofs_topological(V, msh.topology.dim - 1, boundary_facets)
bc_u = fem.dirichletbc(u_D, boundary_vel_dofs)
bcs = [bc_u]

# Assemble Stokes problem
A = assemble_matrix_block(a_blocked, bcs=bcs)
A.assemble()
b = assemble_vector_block(L_blocked, a_blocked, bcs=bcs)

# Create and configure solver
ksp = PETSc.KSP().create(msh.comm)  # type: ignore
ksp.setOperators(A)
ksp.setType("preonly")
ksp.getPC().setType("lu")
ksp.getPC().setFactorSolverType("mumps")
opts = PETSc.Options()  # type: ignore
opts["mat_mumps_icntl_14"] = 80  # Increase MUMPS working memory
opts["mat_mumps_icntl_24"] = 1  # Option to support solving a singular matrix (pressure nullspace)
opts["mat_mumps_icntl_25"] = 0  # Option to support solving a singular matrix (pressure nullspace)
opts["ksp_error_if_not_converged"] = 1
ksp.setFromOptions()

# Solve Stokes for initial condition
x = A.createVecRight()
try:
    ksp.solve(b, x)
except PETSc.Error as e:  # type: ignore
    if e.ierr == 92:
        print("The required PETSc solver/preconditioner is not available. Exiting.")
        print(e)
        exit(0)
    else:
        raise e

# Split the solution
u_h = fem.Function(V)
p_h = fem.Function(Q)
p_h.name = "p"
offset = V.dofmap.index_map.size_local * V.dofmap.index_map_bs
u_h.x.array[:offset] = x.array_r[:offset]
u_h.x.scatter_forward()
p_h.x.array[: (len(x.array_r) - offset)] = x.array_r[offset:]
p_h.x.scatter_forward()

# Subtract the average of the pressure since it is only determined up to
# a constant
p_h.x.array[:] -= domain_average(msh, p_h)

# Write initial condition to file
t = 0.0

if vis:
    u_vis = fem.Function(W)
    u_vis.name = "u"
    u_vis.interpolate(u_h)

    try:
        u_file = io.VTXWriter(msh.comm, "u.bp", u_vis, engine = "BP4")
        p_file = io.VTXWriter(msh.comm, "p.bp", p_h, engine = "BP4")
        u_file.write(t)
        p_file.write(t)
    except AttributeError:
        print("File output requires ADIOS2.")

# Create function to store solution and previous time step
u_n = fem.Function(V)
u_n.x.array[:] = u_h.x.array

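# Extend the forms for the time-dependent problem: add the time derivative and
# the convective term with an upwind DG flux (lmbda selects the upwind side).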
lmbda = conditional(gt(dot(u_n, n), 0), 1, 0)
u_uw = lmbda("+") * u("+") + lmbda("-") * u("-")
a += (
    inner(u / delta_t, v) * dx
    - inner(u, div(outer(v, u_n))) * dx
    + inner((dot(u_n, n))("+") * u_uw, v("+")) * dS
    + inner((dot(u_n, n))("-") * u_uw, v("-")) * dS
    + inner(dot(u_n, n) * lmbda * u, v) * ds
)

a_blocked = fem.form(extract_blocks(a))

L += inner(u_n / delta_t, v) * dx - inner(dot(u_n, n) * (1 - lmbda) * u_D, v) * ds
L_blocked = fem.form(extract_blocks(L))

# Time stepping loop
for step in range(num_time_steps):
    print(f"Solving time step {step}", flush=True)

    t += delta_t.value

    A.zeroEntries()
    fem.petsc.assemble_matrix_block(A, a_blocked, bcs=bcs)  # type: ignore
    A.assemble()

    with b.localForm() as b_loc:
        b_loc.set(0)
    fem.petsc.assemble_vector_block(b, L_blocked, a_blocked, bcs=bcs)  # type: ignore

    # Compute solution
    ksp.solve(b, x)

    u_h.x.array[:offset] = x.array_r[:offset]
    u_h.x.scatter_forward()
    p_h.x.array[: (len(x.array_r) - offset)] = x.array_r[offset:]
    p_h.x.scatter_forward()
    p_h.x.array[:] -= domain_average(msh, p_h)

    # Update u_n
    u_n.x.array[:] = u_h.x.array
    
    if vis:
        u_vis.interpolate(u_h)

        # Write to file
        try:
            u_file.write(t)
            p_file.write(t)
        except NameError:
            pass

if vis:
    try:
        u_file.close()
        p_file.close()
    except NameError:
        pass

# Function spaces for exact velocity and pressure
V_e = fem.functionspace(msh, ("Lagrange", k + 3, (gdim,)))
Q_e = fem.functionspace(msh, ("Lagrange", k + 2))

u_e = fem.Function(V_e)
u_e.interpolate(u_e_expr)

# Compute errors
e_u = norm_L2(msh.comm, u_h - u_e)
e_div_u = norm_L2(msh.comm, div(u_h))


if msh.comm.rank == 0:
    print(f"e_u = {e_u}")
    print(f"e_div_u = {e_div_u}")

# This scheme conserves mass exactly, so check this
assert np.isclose(e_div_u, 0.0, atol=float(1.0e5 * np.finfo(default_real_type).eps))

Two things change in 3D: you get more degrees of freedom per element (a larger local element matrix) and a different quadrature rule (more points are required to compute the numerical integral).
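For example, the degree-2 Raviart-Thomas velocity element used here has 8 DOFs per cell on a triangle but 15 on a tetrahedron. A minimal sketch to compare the default quadrature point counts (assuming basix is importable, as it is alongside DOLFINx; degree 4 is only an illustrative choice):

import basix

# Count the points of the default quadrature rule for the same degree in 2D and 3D
for name, cell in [("triangle", basix.CellType.triangle),
                   ("tetrahedron", basix.CellType.tetrahedron)]:
    _, weights = basix.make_quadrature(cell, 4)  # degree-4 rule as an example
    print(f"{name}: {len(weights)} quadrature points")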

You can control the latter, see for instance:
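A minimal sketch, assuming the msh and k variables from the MWE above (2 * (k + 1) is only an illustrative value; the appropriate degree depends on your forms): set the quadrature degree through the metadata of the UFL integration measures and build the forms with those measures.

import ufl

# Fix the quadrature degree instead of letting FFCx estimate it
md = {"quadrature_degree": 2 * (k + 1)}
dx_q = ufl.Measure("dx", domain=msh, metadata=md)
ds_q = ufl.Measure("ds", domain=msh, metadata=md)
dS_q = ufl.Measure("dS", domain=msh, metadata=md)
# ...then define a and L with dx_q, ds_q and dS_q in place of dx, ds and dS
# before calling fem.form(extract_blocks(...))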

Good to know. So the default switches to the gauss_jacobi rule for quadrature_degree > 15? Is there a suggested way to choose quadrature_degree?

It depends on your integrals. For most cases one doesn't need more than 2P, where P is the polynomial degree of the highest-degree basis function in the space.

This varies a bit, but it can be estimated for any PDE. The required degree goes up if your mesh is non-affine.