Efficient implementation of DG for acoustic wave equation

Im not sure about your timings here (as you haven’t supplied them). The problem with its default settings is quite time consuming.
You could have a look at JIT Parameters and visualization using Pandas — FEniCSx tutorial
which shows you how to change the JIT parameters that changes assembly speed:

import time

import dolfinx
import numpy as np
import ufl
from mpi4py import MPI
from petsc4py import PETSc

cffi_options = ["-Ofast", "-g0",ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss "-march=native"]
jit_params = {"cffi_extra_compile_args": cffi_options, "cffi_libraries": ["m"]}

nx = 400
ny = nx

lx, ly = 3, 3
dx = lx/nx
mesh = dolfinx.mesh.create_rectangle(MPI.COMM_WORLD, [np.array([0, 0]), np.array([lx, ly])],
                                     [nx, ny], dolfinx.mesh.CellType.triangle)

T = 0.0003
t = 0
dt = 0.02*(dx)
nt = int(T/dt)
print(f"Num time-steps: {nt} NX: {nx}, NY: {ny}")
isDG = False  # Switch between continuous and discontinuous Galerkin

order = 1
q = ufl.VectorElement("CG", mesh.ufl_cell(), order, 3)
if isDG:
    q = ufl.VectorElement("DG", mesh.ufl_cell(), order, 3)

V_q = dolfinx.fem.FunctionSpace(mesh, q)

tst_q = ufl.TestFunction(V_q)
sol_q = ufl.TrialFunction(V_q)

sol_q0 = dolfinx.fem.Function(V_q)
sol_qtmp = dolfinx.fem.Function(V_q)
source = dolfinx.fem.Function(V_q)

Ax_mat = np.zeros((3, 3))
Ax_mat[0][2] = 1
Ax_mat[2][0] = 1

Ay_mat = np.zeros((3, 3))
Ay_mat[1][2] = 1
Ay_mat[2][1] = 1

V_M = dolfinx.fem.TensorFunctionSpace(mesh, ("DG", 0), (3, 3))
Ax = dolfinx.fem.Function(V_M)
Ay = dolfinx.fem.Function(V_M)

nbr_cells = len(mesh.topology.original_cell_index)
Ax.x.array[:] = np.tile(Ax_mat.flatten(), nbr_cells)
Ay.x.array[:] = np.tile(Ay_mat.flatten(), nbr_cells)

Mute_str_mat = [
    1, 0, 0,
    0, 1, 0,
    0, 0, 0]
Mute_str = dolfinx.fem.Function(V_M)
Mute_str.x.array[:] = np.tile(Mute_str_mat, nbr_cells)

Fx = Ax*sol_q0
Fy = Ay*sol_q0

n = ufl.FacetNormal(mesh)

Fn_f = n[0]('+')*ufl.avg(Fx)+n[1]('+')*ufl.avg(Fy) - \
    0.5*ufl.jump(sol_q0)  # Lax-Fridrichs Flux

Fn_b = n[0]*(Ax*(Mute_str*sol_q0))+n[1] * \
    (Ay*(Mute_str*sol_q0))  # stress free boundary condition

a0 = ufl.dot(sol_q, tst_q)*ufl.dx
l0 = ufl.dot(source, tst_q)*ufl.dx \
    - (ufl.dot(Fx, tst_q.dx(0))*ufl.dx + ufl.dot(Fy, tst_q.dx(1))*ufl.dx)  \
    + ufl.dot(Fn_b, tst_q) * ufl.ds

if isDG:
    l0 = ufl.dot(source, tst_q) * ufl.dx \
        - (ufl.dot(Fx, tst_q.dx(0)) * ufl.dx + ufl.dot(Fy, tst_q.dx(1)) * ufl.dx) \
        + ufl.dot(Fn_b, tst_q) * ufl.ds + \
        ufl.dot(Fn_f, ufl.jump(tst_q)) * ufl.dS


L0 = ufl.dot(sol_q0, tst_q)*ufl.dx + dt*l0

form_a0 = dolfinx.fem.form(a0, jit_params=jit_params)
form_L0 = dolfinx.fem.form(L0, jit_params=jit_params)

A0 = dolfinx.fem.petsc.assemble_matrix(form_a0)
b0 = dolfinx.fem.petsc.create_vector(form_L0)

A0.assemble()

solver_q0 = PETSc.KSP().create(mesh.comm)
solver_q0.setOperators(A0)
solver_q0.setType(PETSc.KSP.Type.PREONLY)
solver_q0.getPC().setType(PETSc.PC.Type.LU)
print(f"Is DG: {isDG}")
total_start = time.perf_counter()
for i in range(nt):

    start_b = time.perf_counter()
    with b0.localForm() as loc_b0:
        loc_b0.set(0)
    dolfinx.fem.petsc.assemble_vector(b0, form_L0)
    end_b = time.perf_counter()
    print("assemble b: Time elapsed ", end_b - start_b, " seconds")
    # Solve linear problem
    start = time.perf_counter()
    solver_q0.solve(b0, sol_qtmp.vector)
    end = time.perf_counter()
    print("solve: Time elapsed ", end - start, " seconds")
    sol_qtmp.x.scatter_forward()

    sol_q0.x.array[:] = sol_qtmp.x.array
total_end = time.perf_counter()
print(f"Jit Params: {jit_params}")
print(f"Total solve time: {total_end-total_start}")

Output for various options:

root@dokken-XPS-9320:~/shared# python3 mwe.py 
Num time-steps: 4 NX: 800, NY: 800
Is DG: True
assemble b: Time elapsed  2.3898451939999177  seconds
solve: Time elapsed  7.893928467000023  seconds
assemble b: Time elapsed  2.3704275959998995  seconds
solve: Time elapsed  0.2115441450000617  seconds
assemble b: Time elapsed  2.4157444520001263  seconds
solve: Time elapsed  0.28561676099980104  seconds
assemble b: Time elapsed  2.883325651999712  seconds
solve: Time elapsed  0.29582896099964273  seconds
Jit Params: {'cffi_extra_compile_args': ['-O2', '-g0'], 'cffi_libraries': ['m']}
Total solve time: 18.789699413000108
root@dokken-XPS-9320:~/shared# python3 mwe.py 
Num time-steps: 4 NX: 800, NY: 800
Is DG: True
assemble b: Time elapsed  2.1697077180001543  seconds
solve: Time elapsed  6.109729160999905  seconds
assemble b: Time elapsed  2.06873426500033  seconds
solve: Time elapsed  0.2866915140002675  seconds
assemble b: Time elapsed  2.0706592950000413  seconds
solve: Time elapsed  0.2797495400000116  seconds
assemble b: Time elapsed  2.0893385489998764  seconds
solve: Time elapsed  0.28623593199972674  seconds
Jit Params: {'cffi_extra_compile_args': ['-Ofast', '-g0', '-march=native'], 'cffi_libraries': ['m']}
Total solve time: 15.399842486000125
Num time-steps: 4 NX: 1000, NY: 1000
Is DG: True
assemble b: Time elapsed  3.7569394560000546  seconds
solve: Time elapsed  9.479718907000006  seconds
assemble b: Time elapsed  3.198316194000199  seconds
solve: Time elapsed  0.3721905079996759  seconds
assemble b: Time elapsed  3.1768408249999993  seconds
solve: Time elapsed  0.3744846109998434  seconds
assemble b: Time elapsed  3.140865130000293  seconds
solve: Time elapsed  0.373734142000103  seconds
Jit Params: {'cffi_extra_compile_args': ['-Ofast', '-g0', '-march=native'], 'cffi_libraries': ['m']}
Total solve time: 23.917673111000113
Num time-steps: 4 NX: 1000, NY: 1000
Is DG: True
assemble b: Time elapsed  3.5047534059999634  seconds
solve: Time elapsed  9.418209662000208  seconds
assemble b: Time elapsed  3.437472118000187  seconds
solve: Time elapsed  0.3693447730001935  seconds
assemble b: Time elapsed  3.450379697000244  seconds
solve: Time elapsed  0.3727702920000411  seconds
assemble b: Time elapsed  3.416163740999764  seconds
solve: Time elapsed  0.37302657599957456  seconds
Jit Params: {'cffi_extra_compile_args': ['-O2', '-g0'], 'cffi_libraries': ['m']}
Total solve time: 24.38624407599991

As you can see, by using “-march=native” and “-Ofast”, you gain about a second for 4 time steps.

The runtime for setting up the first CG LU factorization is very long, so it takes quite alot of time steps for the DG assembly to be slower.

You could also play around with using quadrilateral elements, as it would reduce the number of dofs in your problem (at least if you are considering unit squares)