Im not sure about your timings here (as you haven’t supplied them). The problem with its default settings is quite time consuming.
You could have a look at JIT Parameters and visualization using Pandas — FEniCSx tutorial
which shows you how to change the JIT parameters that changes assembly speed:
import time
import dolfinx
import numpy as np
import ufl
from mpi4py import MPI
from petsc4py import PETSc
cffi_options = ["-Ofast", "-g0",ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss "-march=native"]
jit_params = {"cffi_extra_compile_args": cffi_options, "cffi_libraries": ["m"]}
nx = 400
ny = nx
lx, ly = 3, 3
dx = lx/nx
mesh = dolfinx.mesh.create_rectangle(MPI.COMM_WORLD, [np.array([0, 0]), np.array([lx, ly])],
[nx, ny], dolfinx.mesh.CellType.triangle)
T = 0.0003
t = 0
dt = 0.02*(dx)
nt = int(T/dt)
print(f"Num time-steps: {nt} NX: {nx}, NY: {ny}")
isDG = False # Switch between continuous and discontinuous Galerkin
order = 1
q = ufl.VectorElement("CG", mesh.ufl_cell(), order, 3)
if isDG:
q = ufl.VectorElement("DG", mesh.ufl_cell(), order, 3)
V_q = dolfinx.fem.FunctionSpace(mesh, q)
tst_q = ufl.TestFunction(V_q)
sol_q = ufl.TrialFunction(V_q)
sol_q0 = dolfinx.fem.Function(V_q)
sol_qtmp = dolfinx.fem.Function(V_q)
source = dolfinx.fem.Function(V_q)
Ax_mat = np.zeros((3, 3))
Ax_mat[0][2] = 1
Ax_mat[2][0] = 1
Ay_mat = np.zeros((3, 3))
Ay_mat[1][2] = 1
Ay_mat[2][1] = 1
V_M = dolfinx.fem.TensorFunctionSpace(mesh, ("DG", 0), (3, 3))
Ax = dolfinx.fem.Function(V_M)
Ay = dolfinx.fem.Function(V_M)
nbr_cells = len(mesh.topology.original_cell_index)
Ax.x.array[:] = np.tile(Ax_mat.flatten(), nbr_cells)
Ay.x.array[:] = np.tile(Ay_mat.flatten(), nbr_cells)
Mute_str_mat = [
1, 0, 0,
0, 1, 0,
0, 0, 0]
Mute_str = dolfinx.fem.Function(V_M)
Mute_str.x.array[:] = np.tile(Mute_str_mat, nbr_cells)
Fx = Ax*sol_q0
Fy = Ay*sol_q0
n = ufl.FacetNormal(mesh)
Fn_f = n[0]('+')*ufl.avg(Fx)+n[1]('+')*ufl.avg(Fy) - \
0.5*ufl.jump(sol_q0) # Lax-Fridrichs Flux
Fn_b = n[0]*(Ax*(Mute_str*sol_q0))+n[1] * \
(Ay*(Mute_str*sol_q0)) # stress free boundary condition
a0 = ufl.dot(sol_q, tst_q)*ufl.dx
l0 = ufl.dot(source, tst_q)*ufl.dx \
- (ufl.dot(Fx, tst_q.dx(0))*ufl.dx + ufl.dot(Fy, tst_q.dx(1))*ufl.dx) \
+ ufl.dot(Fn_b, tst_q) * ufl.ds
if isDG:
l0 = ufl.dot(source, tst_q) * ufl.dx \
- (ufl.dot(Fx, tst_q.dx(0)) * ufl.dx + ufl.dot(Fy, tst_q.dx(1)) * ufl.dx) \
+ ufl.dot(Fn_b, tst_q) * ufl.ds + \
ufl.dot(Fn_f, ufl.jump(tst_q)) * ufl.dS
L0 = ufl.dot(sol_q0, tst_q)*ufl.dx + dt*l0
form_a0 = dolfinx.fem.form(a0, jit_params=jit_params)
form_L0 = dolfinx.fem.form(L0, jit_params=jit_params)
A0 = dolfinx.fem.petsc.assemble_matrix(form_a0)
b0 = dolfinx.fem.petsc.create_vector(form_L0)
A0.assemble()
solver_q0 = PETSc.KSP().create(mesh.comm)
solver_q0.setOperators(A0)
solver_q0.setType(PETSc.KSP.Type.PREONLY)
solver_q0.getPC().setType(PETSc.PC.Type.LU)
print(f"Is DG: {isDG}")
total_start = time.perf_counter()
for i in range(nt):
start_b = time.perf_counter()
with b0.localForm() as loc_b0:
loc_b0.set(0)
dolfinx.fem.petsc.assemble_vector(b0, form_L0)
end_b = time.perf_counter()
print("assemble b: Time elapsed ", end_b - start_b, " seconds")
# Solve linear problem
start = time.perf_counter()
solver_q0.solve(b0, sol_qtmp.vector)
end = time.perf_counter()
print("solve: Time elapsed ", end - start, " seconds")
sol_qtmp.x.scatter_forward()
sol_q0.x.array[:] = sol_qtmp.x.array
total_end = time.perf_counter()
print(f"Jit Params: {jit_params}")
print(f"Total solve time: {total_end-total_start}")
Output for various options:
root@dokken-XPS-9320:~/shared# python3 mwe.py
Num time-steps: 4 NX: 800, NY: 800
Is DG: True
assemble b: Time elapsed 2.3898451939999177 seconds
solve: Time elapsed 7.893928467000023 seconds
assemble b: Time elapsed 2.3704275959998995 seconds
solve: Time elapsed 0.2115441450000617 seconds
assemble b: Time elapsed 2.4157444520001263 seconds
solve: Time elapsed 0.28561676099980104 seconds
assemble b: Time elapsed 2.883325651999712 seconds
solve: Time elapsed 0.29582896099964273 seconds
Jit Params: {'cffi_extra_compile_args': ['-O2', '-g0'], 'cffi_libraries': ['m']}
Total solve time: 18.789699413000108
root@dokken-XPS-9320:~/shared# python3 mwe.py
Num time-steps: 4 NX: 800, NY: 800
Is DG: True
assemble b: Time elapsed 2.1697077180001543 seconds
solve: Time elapsed 6.109729160999905 seconds
assemble b: Time elapsed 2.06873426500033 seconds
solve: Time elapsed 0.2866915140002675 seconds
assemble b: Time elapsed 2.0706592950000413 seconds
solve: Time elapsed 0.2797495400000116 seconds
assemble b: Time elapsed 2.0893385489998764 seconds
solve: Time elapsed 0.28623593199972674 seconds
Jit Params: {'cffi_extra_compile_args': ['-Ofast', '-g0', '-march=native'], 'cffi_libraries': ['m']}
Total solve time: 15.399842486000125
Num time-steps: 4 NX: 1000, NY: 1000
Is DG: True
assemble b: Time elapsed 3.7569394560000546 seconds
solve: Time elapsed 9.479718907000006 seconds
assemble b: Time elapsed 3.198316194000199 seconds
solve: Time elapsed 0.3721905079996759 seconds
assemble b: Time elapsed 3.1768408249999993 seconds
solve: Time elapsed 0.3744846109998434 seconds
assemble b: Time elapsed 3.140865130000293 seconds
solve: Time elapsed 0.373734142000103 seconds
Jit Params: {'cffi_extra_compile_args': ['-Ofast', '-g0', '-march=native'], 'cffi_libraries': ['m']}
Total solve time: 23.917673111000113
Num time-steps: 4 NX: 1000, NY: 1000
Is DG: True
assemble b: Time elapsed 3.5047534059999634 seconds
solve: Time elapsed 9.418209662000208 seconds
assemble b: Time elapsed 3.437472118000187 seconds
solve: Time elapsed 0.3693447730001935 seconds
assemble b: Time elapsed 3.450379697000244 seconds
solve: Time elapsed 0.3727702920000411 seconds
assemble b: Time elapsed 3.416163740999764 seconds
solve: Time elapsed 0.37302657599957456 seconds
Jit Params: {'cffi_extra_compile_args': ['-O2', '-g0'], 'cffi_libraries': ['m']}
Total solve time: 24.38624407599991
As you can see, by using “-march=native” and “-Ofast”, you gain about a second for 4 time steps.
The runtime for setting up the first CG LU factorization is very long, so it takes quite alot of time steps for the DG assembly to be slower.
You could also play around with using quadrilateral elements, as it would reduce the number of dofs in your problem (at least if you are considering unit squares)