I used the Intel VTune profiler to measure the performance of FEniCSx programs. In the 2D cases the linear-solver-related cost dominates the simulation, as expected, while in 3D tabulate_tensor_integral becomes dominant. I see the same pattern across different codes.
Questions:
- Why does this happen? In 3D it is not solving the linear system but the integration (assembly) step that takes most of the time.
- Is it possible to speed up this part in the 3D simulation? (A compile-flag idea is sketched right after this list.)
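Regarding the second question, the only knob I am aware of is passing extra optimisation flags to the FFCx JIT when the forms are compiled, e.g. replacing the fem.form(...) calls in the MWE below with something like the following. This is only a sketch; the flag values are an example and I have not measured whether it actually helps here:

# Sketch: ask the FFCx JIT to build the generated tabulate_tensor kernels with
# more aggressive compiler flags (example values, not a recommendation)
jit_options = {"cffi_extra_compile_args": ["-O3", "-march=native"]}
a_blocked = fem.form(extract_blocks(a), jit_options=jit_options)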
To demonstrate this, I adapted the official Navier-Stokes demo (Divergence conforming discontinuous Galerkin method for the Navier–Stokes equations — DOLFINx 0.9.0 documentation) and attached it at the end. The code can be run in either 2D or 3D by setting ndims = 2 or ndims = 3. Its size is 64^2 (= 4,096) in 2D and 16^3 (= 4,096) in 3D, i.e. the number of mesh subdivisions is kept the same. tabulate_tensor_integral takes 3.8% and around 30% of the total time, respectively.
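The split can also be cross-checked without VTune by timing the assembly and solve calls directly; a minimal sketch using the objects set up in the MWE below:

import time

t0 = time.perf_counter()
A.zeroEntries()
fem.petsc.assemble_matrix_block(A, a_blocked, bcs=bcs)  # runs the generated tabulate_tensor kernels
A.assemble()
t1 = time.perf_counter()
ksp.solve(b, x)
t2 = time.perf_counter()
print(f"assembly: {t1 - t0:.3f} s, solve: {t2 - t1:.3f} s")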
To run the v0.9.0 code with and without vtune:
vtune -collect hotspots -knob sampling-mode=hw -knob enable-stack-collection=true numactl --physcpubind=1 python demo_navier-stokes.py
python demo_navier-stokes.py
The 2D and 3D VTune profiling results and the MWE are as follows:
# demo_navier-stokes.py
import os
os.environ["OMP_NUM_THREADS"] = "1"
from mpi4py import MPI
import numpy as np
from dolfinx import default_real_type, fem, io, mesh
from dolfinx.fem.petsc import assemble_matrix_block, assemble_vector_block
from ufl import (
    CellDiameter,
    FacetNormal,
    MixedFunctionSpace,
    TestFunctions,
    TrialFunctions,
    avg,
    conditional,
    div,
    dot,
    dS,
    ds,
    dx,
    extract_blocks,
    grad,
    gt,
    inner,
    outer,
)
try:
    from petsc4py import PETSc

    import dolfinx

    if not dolfinx.has_petsc:
        print("This demo requires DOLFINx to be compiled with PETSc enabled.")
        exit(0)
except ModuleNotFoundError:
    print("This demo requires petsc4py.")
    exit(0)

if np.issubdtype(PETSc.ScalarType, np.complexfloating):  # type: ignore
    print("Demo should only be executed with DOLFINx real mode")
    exit(0)
def norm_L2(comm, v):
    """Compute the L2(Ω)-norm of v"""
    return np.sqrt(comm.allreduce(fem.assemble_scalar(fem.form(inner(v, v) * dx)), op=MPI.SUM))


def domain_average(msh, v):
    """Compute the average of a function over the domain"""
    vol = msh.comm.allreduce(
        fem.assemble_scalar(fem.form(fem.Constant(msh, default_real_type(1.0)) * dx)), op=MPI.SUM
    )
    return (1 / vol) * msh.comm.allreduce(fem.assemble_scalar(fem.form(v * dx)), op=MPI.SUM)
def u_e_expr(x):
    """Expression for the exact velocity solution to Kovasznay flow"""
    result = [
        1 - np.exp((Re / 2 - np.sqrt(Re**2 / 4 + 4 * np.pi**2)) * x[0])
        * np.cos(2 * np.pi * x[1]),
        (Re / 2 - np.sqrt(Re**2 / 4 + 4 * np.pi**2))
        / (2 * np.pi)
        * np.exp((Re / 2 - np.sqrt(Re**2 / 4 + 4 * np.pi**2)) * x[0])
        * np.sin(2 * np.pi * x[1]),
    ]
    if ndims == 3:
        result.append(0 * x[2])
    return np.vstack(result)


def p_e_expr(x):
    """Expression for the exact pressure solution to Kovasznay flow"""
    return (1 / 2) * (1 - np.exp(2 * (Re / 2 - np.sqrt(Re**2 / 4 + 4 * np.pi**2)) * x[0]))


def f_expr(x):
    """Expression for the applied force"""
    return np.vstack((np.zeros_like(x[0]), np.zeros_like(x[0])))
num_time_steps = 3
t_end = 10
Re = 25 # Reynolds Number
k = 1 # Polynomial degree
vis = False  # whether to visualise the result
ndims = 2  # or 3
if ndims == 2:
    n = 64
    msh = mesh.create_unit_square(MPI.COMM_WORLD, n, n)
elif ndims == 3:
    n = 16
    msh = mesh.create_unit_cube(MPI.COMM_WORLD, n, n, n)
# Function spaces for the velocity and for the pressure
V = fem.functionspace(msh, ("Raviart-Thomas", k + 1))
Q = fem.functionspace(msh, ("Discontinuous Lagrange", k))
VQ = MixedFunctionSpace(V, Q)
# Function space for visualising the velocity field
gdim = msh.geometry.dim
W = fem.functionspace(msh, ("Discontinuous Lagrange", k + 1, (gdim,)))
# Define trial and test functions
u, p = TrialFunctions(VQ)
v, q = TestFunctions(VQ)
delta_t = fem.Constant(msh, default_real_type(t_end / num_time_steps))
alpha = fem.Constant(msh, default_real_type(6.0 * k**2))
h = CellDiameter(msh)
n = FacetNormal(msh)
def jump(phi, n):
    return outer(phi("+"), n("+")) + outer(phi("-"), n("-"))
a = (1.0 / Re) * (
    inner(grad(u), grad(v)) * dx
    - inner(avg(grad(u)), jump(v, n)) * dS
    - inner(jump(u, n), avg(grad(v))) * dS
    + (alpha / avg(h)) * inner(jump(u, n), jump(v, n)) * dS
    - inner(grad(u), outer(v, n)) * ds
    - inner(outer(u, n), grad(v)) * ds
    + (alpha / h) * inner(outer(u, n), outer(v, n)) * ds
)
a -= inner(p, div(v)) * dx
a -= inner(div(u), q) * dx
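# fem.form() below runs FFCx code generation and JIT-compiles the tabulate_tensor kernels for these forms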
a_blocked = fem.form(extract_blocks(a))
f = fem.Function(W)
u_D = fem.Function(V)
u_D.interpolate(u_e_expr)
L = inner(f, v) * dx + (1 / Re) * (
    -inner(outer(u_D, n), grad(v)) * ds + (alpha / h) * inner(outer(u_D, n), outer(v, n)) * ds
)
L += inner(fem.Constant(msh, default_real_type(0.0)), q) * dx
L_blocked = fem.form(extract_blocks(L))
# Boundary conditions
boundary_facets = mesh.exterior_facet_indices(msh.topology)
boundary_vel_dofs = fem.locate_dofs_topological(V, msh.topology.dim - 1, boundary_facets)
bc_u = fem.dirichletbc(u_D, boundary_vel_dofs)
bcs = [bc_u]
# Assemble Stokes problem
A = assemble_matrix_block(a_blocked, bcs=bcs)
A.assemble()
b = assemble_vector_block(L_blocked, a_blocked, bcs=bcs)
# Create and configure solver
ksp = PETSc.KSP().create(msh.comm) # type: ignore
ksp.setOperators(A)
ksp.setType("preonly")
ksp.getPC().setType("lu")
ksp.getPC().setFactorSolverType("mumps")
opts = PETSc.Options() # type: ignore
opts["mat_mumps_icntl_14"] = 80 # Increase MUMPS working memory
opts["mat_mumps_icntl_24"] = 1 # Option to support solving a singular matrix (pressure nullspace)
opts["mat_mumps_icntl_25"] = 0 # Option to support solving a singular matrix (pressure nullspace)
opts["ksp_error_if_not_converged"] = 1
ksp.setFromOptions()
# Solve Stokes for initial condition
x = A.createVecRight()
try:
    ksp.solve(b, x)
except PETSc.Error as e:  # type: ignore
    if e.ierr == 92:
        print("The required PETSc solver/preconditioner is not available. Exiting.")
        print(e)
        exit(0)
    else:
        raise e
# Split the solution
u_h = fem.Function(V)
p_h = fem.Function(Q)
p_h.name = "p"
offset = V.dofmap.index_map.size_local * V.dofmap.index_map_bs
u_h.x.array[:offset] = x.array_r[:offset]
u_h.x.scatter_forward()
p_h.x.array[: (len(x.array_r) - offset)] = x.array_r[offset:]
p_h.x.scatter_forward()
# Subtract the average of the pressure since it is only determined up to
# a constant
p_h.x.array[:] -= domain_average(msh, p_h)
# Write initial condition to file
t = 0.0
if vis:
    u_vis = fem.Function(W)
    u_vis.name = "u"
    u_vis.interpolate(u_h)
    try:
        u_file = io.VTXWriter(msh.comm, "u.bp", u_vis, engine="BP4")
        p_file = io.VTXWriter(msh.comm, "p.bp", p_h, engine="BP4")
        u_file.write(t)
        p_file.write(t)
    except AttributeError:
        print("File output requires ADIOS2.")
# Create function to store solution and previous time step
u_n = fem.Function(V)
u_n.x.array[:] = u_h.x.array
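# Upwind indicator: lmbda is 1 where the flow leaves a cell (dot(u_n, n) > 0) and 0 otherwise;
# u_uw is the upwind trace of u on interior facets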
lmbda = conditional(gt(dot(u_n, n), 0), 1, 0)
u_uw = lmbda("+") * u("+") + lmbda("-") * u("-")
a += (
    inner(u / delta_t, v) * dx
    - inner(u, div(outer(v, u_n))) * dx
    + inner((dot(u_n, n))("+") * u_uw, v("+")) * dS
    + inner((dot(u_n, n))("-") * u_uw, v("-")) * dS
    + inner(dot(u_n, n) * lmbda * u, v) * ds
)
a_blocked = fem.form(extract_blocks(a))
L += inner(u_n / delta_t, v) * dx - inner(dot(u_n, n) * (1 - lmbda) * u_D, v) * ds
L_blocked = fem.form(extract_blocks(L))
# Time stepping loop
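# Note: each step re-assembles A and b below; these assemble_* calls invoke the generated
# tabulate_tensor_* kernels, which is where the 3D run spends roughly 30% of its time in VTune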
for n in range(num_time_steps):
    print(f"solve {n}-th step", flush=True)
    t += delta_t.value

    A.zeroEntries()
    fem.petsc.assemble_matrix_block(A, a_blocked, bcs=bcs)  # type: ignore
    A.assemble()

    with b.localForm() as b_loc:
        b_loc.set(0)
    fem.petsc.assemble_vector_block(b, L_blocked, a_blocked, bcs=bcs)  # type: ignore

    # Compute solution
    ksp.solve(b, x)
    u_h.x.array[:offset] = x.array_r[:offset]
    u_h.x.scatter_forward()
    p_h.x.array[: (len(x.array_r) - offset)] = x.array_r[offset:]
    p_h.x.scatter_forward()
    p_h.x.array[:] -= domain_average(msh, p_h)

    # Update u_n
    u_n.x.array[:] = u_h.x.array

    if vis:
        u_vis.interpolate(u_h)
        # Write to file
        try:
            u_file.write(t)
            p_file.write(t)
        except NameError:
            pass

if vis:
    try:
        u_file.close()
        p_file.close()
    except NameError:
        pass
# Function spaces for exact velocity and pressure
V_e = fem.functionspace(msh, ("Lagrange", k + 3, (gdim,)))
Q_e = fem.functionspace(msh, ("Lagrange", k + 2))
u_e = fem.Function(V_e)
u_e.interpolate(u_e_expr)
# Compute errors
e_u = norm_L2(msh.comm, u_h - u_e)
e_div_u = norm_L2(msh.comm, div(u_h))
if msh.comm.rank == 0:
    print(f"e_u = {e_u}")
    print(f"e_div_u = {e_div_u}")
# This scheme conserves mass exactly, so check this
assert np.isclose(e_div_u, 0.0, atol=float(1.0e5 * np.finfo(default_real_type).eps))