Assembly timings for hexahedral elements slow

Hi,
I was testing timings for assembling a hyperelastic form (in serial) and noticed that the timings for hex and tet elements (simple first order trilinear and linear elements respectively) were significantly different. While the number of degrees of freedom are the same (in this case) the assembly timings aren’t. Is such a big difference expected?

Note that I’m yet to try in parallel, but I was a bit thrown off with this big a difference. Anything I missed below that is causing such a huge difference in the run-times?

This is on dolfinx:nightly image from docker with commit-hash: 9d4002772ed9650a6720cf092948a2fd5f2f20cb

Timings
# Assembly Runtimes (unit-cube)

| DOF (tet) | DOF (hex) |             tet                  |           hex                   |
| --------- |  -------  | -------------------------------- | ------------------------------- |
|     81    |      81   |             0.00061              |             0.01417             |
|    375    |     375   |             0.00222              |             0.10319             |
|   2187    |    2187   |             0.01302              |             0.85903             |
|  14739    |   14739   |             0.11389              |             7.12959             |

speedupHexTet

Testing script
from timeit import timeit

import matplotlib.pyplot as plt
import numpy as np
from mpi4py import MPI

from dolfinx.mesh import create_unit_cube, CellType
from ufl import (
    Measure,
    grad,
    tr,
    Identity,
    det,
    TrialFunction,
    TestFunction,
    ln,
    derivative,
)
from basix.ufl import element
from dolfinx.log import LogLevel, set_output_file, set_log_level
from dolfinx import default_scalar_type
from dolfinx import git_commit_hash
from dolfinx.fem import (
    form,
    petsc,
    Function,
    functionspace,
)
import os

set_log_level(LogLevel.WARNING)

print(f"hash: {git_commit_hash}")
results_dir = os.path.join(os.getcwd(), "timingsDolfinxAssembly")
os.makedirs(results_dir, exist_ok=True)
set_output_file(os.path.join(results_dir, "logTimingsHex.txt"))


def load_mesh(n, cell_type):
    mesh_dolfinx = create_unit_cube(MPI.COMM_WORLD, n, n, n, cell_type=cell_type)
    return mesh_dolfinx


def pre_hyperelastic_dolfinx(mesh, **kwargs):
    quadrature_degree = 1
    ord = 1
    elemu = element("P", mesh.basix_cell(), ord, shape=(3,))
    V = functionspace(
        mesh, elemu, form_compiler_options={"quadrature_degree": quadrature_degree}
    )
    u = Function(V, name="u")
    v = TestFunction(V)
    du = TrialFunction(V)
    F = Identity(len(u)) + grad(u)
    C = F.T * F
    Ic = tr(C)
    J = det(F)
    dx = Measure("dx", domain=mesh)
    mu, lmbda = default_scalar_type(1.0), default_scalar_type(2.0)
    psi = 1 / 2 * (mu * (Ic - 3)) - mu * ln(J) + lmbda / 2 * ln(J) ** 2
    stress = derivative(psi, u, v) * dx
    jacobian = derivative(stress, u, du)
    return jacobian


def assemble_hyper_dolfinx(jacobian_form):
    A = petsc.create_matrix(jacobian_form)
    A.zeroEntries()
    petsc.assemble_matrix(A, jacobian_form)
    A.assemble()
    # return A


cache_dir = results_dir

print("# Assembly Runtimes")
print("")
print(
    "| DOF (tet) | DOF (hex) |             tet                  |           hex                   |",
)
print(
    "| --------- |  -------  | -------------------------------- | ------------------------------- |",
)
points_per_axis = np.array([2, 4, 8, 16])

number = 3


runtimes = np.zeros((len(points_per_axis), 3))

for i, n in enumerate(points_per_axis):
    mesh_hex = load_mesh(n, cell_type=CellType.hexahedron)
    mesh_tet = load_mesh(n, cell_type=CellType.tetrahedron)
    jacobian_hex = pre_hyperelastic_dolfinx(mesh_hex)
    jacobian_tet = pre_hyperelastic_dolfinx(mesh_tet)
    jacobian_form_tet = form(
        jacobian_tet,
        jit_options={
            "cffi_extra_compile_args": ["-Ofast"],
            "cache_dir": cache_dir,
            "cffi_libraries": ["m"],
        },
    )
    jacobian_form_hex = form(
        jacobian_hex,
        jit_options={
            "cffi_extra_compile_args": ["-Ofast"],
            "cache_dir": cache_dir,
            "cffi_libraries": ["m"],
        },
    )
    time_hyperelastic_dolfinx_tet = (
        timeit(lambda: assemble_hyper_dolfinx(jacobian_form_tet), number=number)
        / number
    )
    time_hyperelastic_dolfinx_hex = (
        timeit(lambda: assemble_hyper_dolfinx(jacobian_form_hex), number=number)
        / number
    )
    runtimes[i] = (
        time_hyperelastic_dolfinx_tet,
        time_hyperelastic_dolfinx_hex,
        mesh_tet.geometry.x.shape[0],
    )

    print(
        f"|{mesh_tet.geometry.x.shape[0] * 3:7d}    | {mesh_hex.geometry.x.shape[0] * 3:7d}   | {runtimes[i][0]:19.5f}              | {runtimes[i][1]:19.5f}             |",
    )

fig, ax = plt.subplots()
ax.plot(runtimes[:, -1] * 3.0, runtimes[:, 1] / runtimes[:, 0])
ax.set_xlabel("DOF")
ax.set_ylabel("Speedup")
ax.set_title("Speedup Hex/Tet")
ax.grid()
fig.tight_layout()
fig.savefig(os.path.join(results_dir, "speedupHexTet.png"))
plt.close()
# list_timings(MPI.COMM_WORLD, [TimingType.wall])

You are setting the quadrature degree in the wrong place (function space), not in the measure.
If you fix this:

def pre_hyperelastic_dolfinx(mesh, **kwargs):
    quadrature_degree = 4
    ord = 1
    elemu = element("P", mesh.basix_cell(), ord, shape=(3,))
    V = functionspace(
        mesh, elemu,
    )
    u = Function(V, name="u")
    v = TestFunction(V)
    du = TrialFunction(V)
    F = Identity(len(u)) + grad(u)
    C = F.T * F
    Ic = tr(C)
    J = det(F)
    dx = Measure("dx", domain=mesh, metadata={
        "quadrature_degree": quadrature_degree})
    mu, lmbda = default_scalar_type(1.0), default_scalar_type(2.0)
    psi = 1 / 2 * (mu * (Ic - 3)) - mu * ln(J) + lmbda / 2 * ln(J) ** 2
    stress = derivative(psi, u, v) * dx
    jacobian = derivative(stress, u, du)
    return jacobian

See results in next post

2 Likes

Quadrature degree 4:

| DOF (tet) | DOF (hex) |             tet                  |           hex                   |
| --------- |  -------  | -------------------------------- | ------------------------------- |
|     81    |      81   |             0.00071              |             0.00067             |
|    375    |     375   |             0.00372              |             0.00317             |
|   2187    |    2187   |             0.02831              |             0.02181             |
|  14739    |   14739   |             0.21745              |             0.17804             |

Quadrature degree 10

# Assembly Runtimes

| DOF (tet) | DOF (hex) |             tet                  |           hex                   |
| --------- |  -------  | -------------------------------- | ------------------------------- |
|     81    |      81   |             0.00166              |             0.00234             |
|    375    |     375   |             0.01043              |             0.01586             |
|   2187    |    2187   |             0.07922              |             0.12426             |
|  14739    |   14739   |             0.57804              |             0.95504             |

as there are more quadrature points in hexahedral elements than tetrahedral elements, and the jacobian is computed at every quadrature point for hexes, as we support non-affine hexahedral elements.

2 Likes

Thanks Jorgen! This makes sense.