# Efficient implementation of DG for acoustic wave equation

Hi all,

I’m trying to model acoustic waves using DG. I have roughly followed the DG discretization described in this thesis. My implementation seems to work. The problem is that it runs slower than a CG version of the same code, for practical number of time steps (>1000). DG’s mass matrix is block diagonal, so its inversion is considerably faster than CG’s mass matrix inversion. However, the DG formulation has an additional term (integral of numerical fluxes on internal faces) whose assembly at each iteration seems to be dominating DG’s running time, making it less time efficient than CG. My question: is there a more efficient way of assembling that flux vector that I may not be aware of? You’ll find a MWE below, where you can switch between CG and DG by changing the `isDG` variable.

Kind regards,

``````
from mpi4py import MPI
import numpy as np
import dolfinx
import ufl
from petsc4py import PETSc
import time

nx=1000
ny=nx

lx,ly=3,3
dx=lx/nx
mesh = dolfinx.mesh.create_rectangle(MPI.COMM_WORLD, [np.array([0, 0]), np.array([lx, ly])],
[nx, ny], dolfinx.mesh.CellType.triangle)

T=0.0003
t = 0
dt = 0.02*(dx)
nt=int(T/dt)

isDG=False #Switch between continuous and discontinuous Galerkin

order=1
q = ufl.VectorElement("CG", mesh.ufl_cell(), order, 3)
if isDG:
q = ufl.VectorElement("DG", mesh.ufl_cell(), order, 3)

V_q = dolfinx.fem.FunctionSpace(mesh, q)

tst_q=ufl.TestFunction(V_q)
sol_q=ufl.TrialFunction(V_q)

sol_q0=dolfinx.fem.Function(V_q)
sol_qtmp=dolfinx.fem.Function(V_q)
source=dolfinx.fem.Function(V_q)

Ax_mat=np.zeros((3,3))
Ax_mat[0][2]=1
Ax_mat[2][0]=1

Ay_mat=np.zeros((3,3))
Ay_mat[1][2]=1
Ay_mat[2][1]=1

V_M = dolfinx.fem.TensorFunctionSpace(mesh, ("DG", 0),(3,3))
Ax=dolfinx.fem.Function(V_M)
Ay=dolfinx.fem.Function(V_M)

nbr_cells=len(mesh.topology.original_cell_index)
Ax.x.array[:]= np.tile(Ax_mat.flatten(),nbr_cells)
Ay.x.array[:]= np.tile(Ay_mat.flatten(),nbr_cells)

Mute_str_mat=[\
1,0,0, \
0,1,0, \
0,0,0]
Mute_str=dolfinx.fem.Function(V_M)
Mute_str.x.array[:]=np.tile(Mute_str_mat,nbr_cells)

Fx=Ax*sol_q0
Fy=Ay*sol_q0

n=ufl.FacetNormal(mesh)

Fn_f=n[0]('+')*ufl.avg(Fx)+n[1]('+')*ufl.avg(Fy)-0.5*ufl.jump(sol_q0) # Lax-Fridrichs Flux

Fn_b=n[0]*(Ax*(Mute_str*sol_q0))+n[1]*(Ay*(Mute_str*sol_q0)) # stress free boundary condition

a0= ufl.dot(sol_q,tst_q)*ufl.dx
l0= ufl.dot(source,tst_q)*ufl.dx \
-( ufl.dot(Fx,tst_q.dx(0))*ufl.dx + ufl.dot(Fy,tst_q.dx(1))*ufl.dx )  \
+ ufl.dot(Fn_b, tst_q) * ufl.ds

if isDG:
l0 = ufl.dot(source, tst_q) * ufl.dx \
- (ufl.dot(Fx, tst_q.dx(0)) * ufl.dx + ufl.dot(Fy, tst_q.dx(1)) * ufl.dx) \
+ ufl.dot(Fn_b, tst_q) * ufl.ds + ufl.dot(Fn_f, ufl.jump(tst_q)) * ufl.dS

L0= ufl.dot(sol_q0,tst_q)*ufl.dx +  dt*l0

form_a0=dolfinx.fem.form(a0)
form_L0=dolfinx.fem.form(L0)

A0=dolfinx.fem.petsc.assemble_matrix(form_a0)
b0=dolfinx.fem.petsc.create_vector(form_L0)

A0.assemble()

solver_q0 = PETSc.KSP().create(mesh.comm)
solver_q0.setOperators(A0)
solver_q0.setType(PETSc.KSP.Type.PREONLY)
solver_q0.getPC().setType(PETSc.PC.Type.LU)

for i in range(nt):

start_b = time.time()
with b0.localForm() as loc_b0:
loc_b0.set(0)
dolfinx.fem.petsc.assemble_vector(b0, form_L0)
print("assemble b: Time elapsed ",time.time() - start_b," seconds")
# Solve linear problem
start = time.time()
solver_q0.solve(b0, sol_qtmp.vector)
print("solve: Time elapsed ",time.time() - start," seconds")
sol_qtmp.x.scatter_forward()

sol_q0.x.array[:] = sol_qtmp.x.array

``````

Im not sure about your timings here (as you haven’t supplied them). The problem with its default settings is quite time consuming.
You could have a look at JIT Parameters and visualization using Pandas — FEniCSx tutorial
which shows you how to change the JIT parameters that changes assembly speed:

``````import time

import dolfinx
import numpy as np
import ufl
from mpi4py import MPI
from petsc4py import PETSc

cffi_options = ["-Ofast", "-g0",ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss "-march=native"]
jit_params = {"cffi_extra_compile_args": cffi_options, "cffi_libraries": ["m"]}

nx = 400
ny = nx

lx, ly = 3, 3
dx = lx/nx
mesh = dolfinx.mesh.create_rectangle(MPI.COMM_WORLD, [np.array([0, 0]), np.array([lx, ly])],
[nx, ny], dolfinx.mesh.CellType.triangle)

T = 0.0003
t = 0
dt = 0.02*(dx)
nt = int(T/dt)
print(f"Num time-steps: {nt} NX: {nx}, NY: {ny}")
isDG = False  # Switch between continuous and discontinuous Galerkin

order = 1
q = ufl.VectorElement("CG", mesh.ufl_cell(), order, 3)
if isDG:
q = ufl.VectorElement("DG", mesh.ufl_cell(), order, 3)

V_q = dolfinx.fem.FunctionSpace(mesh, q)

tst_q = ufl.TestFunction(V_q)
sol_q = ufl.TrialFunction(V_q)

sol_q0 = dolfinx.fem.Function(V_q)
sol_qtmp = dolfinx.fem.Function(V_q)
source = dolfinx.fem.Function(V_q)

Ax_mat = np.zeros((3, 3))
Ax_mat[0][2] = 1
Ax_mat[2][0] = 1

Ay_mat = np.zeros((3, 3))
Ay_mat[1][2] = 1
Ay_mat[2][1] = 1

V_M = dolfinx.fem.TensorFunctionSpace(mesh, ("DG", 0), (3, 3))
Ax = dolfinx.fem.Function(V_M)
Ay = dolfinx.fem.Function(V_M)

nbr_cells = len(mesh.topology.original_cell_index)
Ax.x.array[:] = np.tile(Ax_mat.flatten(), nbr_cells)
Ay.x.array[:] = np.tile(Ay_mat.flatten(), nbr_cells)

Mute_str_mat = [
1, 0, 0,
0, 1, 0,
0, 0, 0]
Mute_str = dolfinx.fem.Function(V_M)
Mute_str.x.array[:] = np.tile(Mute_str_mat, nbr_cells)

Fx = Ax*sol_q0
Fy = Ay*sol_q0

n = ufl.FacetNormal(mesh)

Fn_f = n[0]('+')*ufl.avg(Fx)+n[1]('+')*ufl.avg(Fy) - \
0.5*ufl.jump(sol_q0)  # Lax-Fridrichs Flux

Fn_b = n[0]*(Ax*(Mute_str*sol_q0))+n[1] * \
(Ay*(Mute_str*sol_q0))  # stress free boundary condition

a0 = ufl.dot(sol_q, tst_q)*ufl.dx
l0 = ufl.dot(source, tst_q)*ufl.dx \
- (ufl.dot(Fx, tst_q.dx(0))*ufl.dx + ufl.dot(Fy, tst_q.dx(1))*ufl.dx)  \
+ ufl.dot(Fn_b, tst_q) * ufl.ds

if isDG:
l0 = ufl.dot(source, tst_q) * ufl.dx \
- (ufl.dot(Fx, tst_q.dx(0)) * ufl.dx + ufl.dot(Fy, tst_q.dx(1)) * ufl.dx) \
+ ufl.dot(Fn_b, tst_q) * ufl.ds + \
ufl.dot(Fn_f, ufl.jump(tst_q)) * ufl.dS

L0 = ufl.dot(sol_q0, tst_q)*ufl.dx + dt*l0

form_a0 = dolfinx.fem.form(a0, jit_params=jit_params)
form_L0 = dolfinx.fem.form(L0, jit_params=jit_params)

A0 = dolfinx.fem.petsc.assemble_matrix(form_a0)
b0 = dolfinx.fem.petsc.create_vector(form_L0)

A0.assemble()

solver_q0 = PETSc.KSP().create(mesh.comm)
solver_q0.setOperators(A0)
solver_q0.setType(PETSc.KSP.Type.PREONLY)
solver_q0.getPC().setType(PETSc.PC.Type.LU)
print(f"Is DG: {isDG}")
total_start = time.perf_counter()
for i in range(nt):

start_b = time.perf_counter()
with b0.localForm() as loc_b0:
loc_b0.set(0)
dolfinx.fem.petsc.assemble_vector(b0, form_L0)
end_b = time.perf_counter()
print("assemble b: Time elapsed ", end_b - start_b, " seconds")
# Solve linear problem
start = time.perf_counter()
solver_q0.solve(b0, sol_qtmp.vector)
end = time.perf_counter()
print("solve: Time elapsed ", end - start, " seconds")
sol_qtmp.x.scatter_forward()

sol_q0.x.array[:] = sol_qtmp.x.array
total_end = time.perf_counter()
print(f"Jit Params: {jit_params}")
print(f"Total solve time: {total_end-total_start}")
``````

Output for various options:

``````root@dokken-XPS-9320:~/shared# python3 mwe.py
Num time-steps: 4 NX: 800, NY: 800
Is DG: True
assemble b: Time elapsed  2.3898451939999177  seconds
solve: Time elapsed  7.893928467000023  seconds
assemble b: Time elapsed  2.3704275959998995  seconds
solve: Time elapsed  0.2115441450000617  seconds
assemble b: Time elapsed  2.4157444520001263  seconds
solve: Time elapsed  0.28561676099980104  seconds
assemble b: Time elapsed  2.883325651999712  seconds
solve: Time elapsed  0.29582896099964273  seconds
Jit Params: {'cffi_extra_compile_args': ['-O2', '-g0'], 'cffi_libraries': ['m']}
Total solve time: 18.789699413000108
root@dokken-XPS-9320:~/shared# python3 mwe.py
Num time-steps: 4 NX: 800, NY: 800
Is DG: True
assemble b: Time elapsed  2.1697077180001543  seconds
solve: Time elapsed  6.109729160999905  seconds
assemble b: Time elapsed  2.06873426500033  seconds
solve: Time elapsed  0.2866915140002675  seconds
assemble b: Time elapsed  2.0706592950000413  seconds
solve: Time elapsed  0.2797495400000116  seconds
assemble b: Time elapsed  2.0893385489998764  seconds
solve: Time elapsed  0.28623593199972674  seconds
Jit Params: {'cffi_extra_compile_args': ['-Ofast', '-g0', '-march=native'], 'cffi_libraries': ['m']}
Total solve time: 15.399842486000125
``````
``````Num time-steps: 4 NX: 1000, NY: 1000
Is DG: True
assemble b: Time elapsed  3.7569394560000546  seconds
solve: Time elapsed  9.479718907000006  seconds
assemble b: Time elapsed  3.198316194000199  seconds
solve: Time elapsed  0.3721905079996759  seconds
assemble b: Time elapsed  3.1768408249999993  seconds
solve: Time elapsed  0.3744846109998434  seconds
assemble b: Time elapsed  3.140865130000293  seconds
solve: Time elapsed  0.373734142000103  seconds
Jit Params: {'cffi_extra_compile_args': ['-Ofast', '-g0', '-march=native'], 'cffi_libraries': ['m']}
Total solve time: 23.917673111000113
``````
``````Num time-steps: 4 NX: 1000, NY: 1000
Is DG: True
assemble b: Time elapsed  3.5047534059999634  seconds
solve: Time elapsed  9.418209662000208  seconds
assemble b: Time elapsed  3.437472118000187  seconds
solve: Time elapsed  0.3693447730001935  seconds
assemble b: Time elapsed  3.450379697000244  seconds
solve: Time elapsed  0.3727702920000411  seconds
assemble b: Time elapsed  3.416163740999764  seconds
solve: Time elapsed  0.37302657599957456  seconds
Jit Params: {'cffi_extra_compile_args': ['-O2', '-g0'], 'cffi_libraries': ['m']}
Total solve time: 24.38624407599991
``````

As you can see, by using “-march=native” and “-Ofast”, you gain about a second for 4 time steps.

The runtime for setting up the first CG LU factorization is very long, so it takes quite alot of time steps for the DG assembly to be slower.

You could also play around with using quadrilateral elements, as it would reduce the number of dofs in your problem (at least if you are considering unit squares)

Thank you, dokken, for the timely response and the JIT tip. I’ll take a closer look into JIT parameters.

Happy New Year!