Speed up (block) matrix assembly by merging SparsityPattern of the blocks

I would like to speed up code that assembles a jacobian matrix built from several bilinear and linear forms.

The bottleneck in the code below is the initial assembly of the
jacobian, because the sparsity pattern of the full matrix A is not
known in advance, although it is known for each of the contained
bilinear and linear forms.

Is it possible to somehow merge the sparsity patterns of the block
matrices and the block vectors and use the result to create the matrix A
with e.g. dolfinx.fem.petsc.create_matrix?

Since my jacobian consists of both bilinear and linear forms, I can’t use
dolfinx.fem.petsc.create_matrix_block, right?

import time

import numpy as np
import ufl
from dolfinx import fem, mesh
from dolfinx.fem.petsc import create_matrix, create_vector
from mpi4py import MPI
from petsc4py import PETSc
from ufl import ds, dx, inner, nabla_grad

Print = PETSc.Sys.Print


def ass_linear_form_into_vec(vec, form):
    fem.petsc.assemble_vector(vec, fem.form(form))
    vec.assemble()


def ass_bilinear_form(mat, form, bcs, diagonal):
    # TODO ask if there is a public interface for assembling matrices into an
    # existing PETSc matrix.
    fem.petsc._assemble_matrix_mat(
        mat,
        fem.form(form),
        bcs=bcs,
        diagonal=diagonal,
    )
    mat.assemble()


def main():
    msh = mesh.create_unit_interval(MPI.COMM_WORLD, nx=3000)
    V = fem.FunctionSpace(msh, ("Lagrange", 3))

    u = ufl.TrialFunction(V)
    v = ufl.TestFunction(V)

    bcs = []

    L = fem.petsc.assemble_matrix(
        fem.form(-inner(nabla_grad(u), nabla_grad(v)) * dx),
        bcs=bcs,
        diagonal=PETSc.ScalarType(0),
    )
    n = L.getSize()[0]

    et = PETSc.Vec().createSeq(n)
    et.setValue(500, 1.0)
    df_dv = et.array.real
    dg_dw = et.array.real

    A = PETSc.Mat().create(MPI.COMM_WORLD)
    # A.setOption(PETSc.Mat.Option.HERMITIAN, False)
    N = 2 * n + 2
    A.setSizes([N, N])
    A.setUp()

    b = fem.Function(V)
    b.x.array[:] = 1.234

    # create bilinear and linear forms

    bilin_form = -inner(nabla_grad(u), nabla_grad(v)) * dx

    mat_dFRe_dv = create_matrix(fem.form(bilin_form))
    mat_dFRe_dw = create_matrix(fem.form(bilin_form))
    mat_dFIm_dv = create_matrix(fem.form(bilin_form))
    mat_dFIm_dw = create_matrix(fem.form(bilin_form))

    ass_bilinear_form(mat_dFRe_dv, bilin_form, bcs=bcs, diagonal=1.0)
    ass_bilinear_form(mat_dFRe_dw, bilin_form, bcs=bcs, diagonal=0.0)
    ass_bilinear_form(mat_dFIm_dv, bilin_form, bcs=bcs, diagonal=0.0)
    ass_bilinear_form(mat_dFIm_dw, bilin_form, bcs=bcs, diagonal=1.0)

    lin_form = inner(b, v) * dx

    vec_dFRe_dk = create_vector(fem.form(lin_form))
    vec_dFRe_ds = create_vector(fem.form(lin_form))
    vec_dFIm_dk = create_vector(fem.form(lin_form))
    vec_dFIm_ds = create_vector(fem.form(lin_form))
    ass_linear_form_into_vec(vec_dFRe_dk, lin_form)
    fem.set_bc(vec_dFRe_dk, bcs)
    ass_linear_form_into_vec(vec_dFRe_ds, lin_form)
    fem.set_bc(vec_dFRe_ds, bcs)
    ass_linear_form_into_vec(vec_dFIm_dk, lin_form)
    fem.set_bc(vec_dFIm_dk, bcs)
    ass_linear_form_into_vec(vec_dFIm_ds, lin_form)
    fem.set_bc(vec_dFIm_ds, bcs)

    Print("Fill real jacobian with values")
    t0 = time.monotonic()

    # TODO is it possible to determine the sparsity pattern of the whole jacobian A?

    # the jacobian consists of four block matrices (bilinear forms), four
    # column vectors (linear forms) and two row vectors
    block_mats = [
        # matrix, row offset, col offset
        (mat_dFRe_dv, 0, 0),
        (mat_dFRe_dw, 0, n),
        (mat_dFIm_dv, n, 0),
        (mat_dFIm_dw, n, n),
    ]

    def insert_values(target_matrix, block_matrix, row_offset, col_offset):
        rows_ind, cols, values = block_matrix.getValuesCSR()

        cols_to = cols + col_offset

        rows_ind_to = np.empty(
            N + 1, dtype=np.int32
        )  # N + 1 entries: a CSR row-pointer array has one entry per row plus one
        _idx = 0
        rows_ind_to[_idx : _idx + row_offset] = 0
        _idx = _idx + row_offset
        rows_ind_to[_idx : _idx + len(rows_ind)] = rows_ind
        _idx = _idx + len(rows_ind)
        rows_ind_to[_idx:] = len(values)

        t_setCSR_0 = time.monotonic()
        target_matrix.setValuesCSR(
            rows_ind_to, cols_to, values, addv=PETSc.InsertMode.ADD
        )
        Print(f"single setValuesCSR call took {time.monotonic()-t_setCSR_0:.2f}s")

    t_insert_0 = time.monotonic()
    for block_mat, row_offset, col_offset in block_mats:
        insert_values(A, block_mat, row_offset, col_offset)
    Print(f"insert_values() loop took {time.monotonic()-t_insert_0:.2e}s")

    # column vectors
    A.setValues(
        range(n),
        2 * n,
        vec_dFRe_dk,
        addv=PETSc.InsertMode.ADD,
    )
    A.setValues(
        range(n),
        2 * n + 1,
        vec_dFRe_ds,
        addv=PETSc.InsertMode.ADD,
    )
    A.setValues(
        range(n, 2 * n),
        2 * n,
        vec_dFIm_dk,
        addv=PETSc.InsertMode.ADD,
    )
    A.setValues(
        range(n, 2 * n),
        2 * n + 1,
        vec_dFIm_ds,
        addv=PETSc.InsertMode.ADD,
    )

    # row vectors
    A.setValues(
        2 * n,
        range(n),
        df_dv,
        addv=PETSc.InsertMode.ADD,
    )
    A.setValues(
        2 * n + 1,
        range(n, 2 * n),
        dg_dw,
        addv=PETSc.InsertMode.ADD,
    )

    print(
        f"insert_values loop + A.setValues for linear forms took "
        f"{time.monotonic()-t_insert_0:.2e}s"
    )

    # scalars
    # this is needed so that PETSc doesn't
    # complain that the diagonal entry is missing
    # (see https://lists.mcs.anl.gov/pipermail/petsc-users/2016-October/030704.html)
    A.setValue(2 * n, 2 * n, 0j, addv=PETSc.InsertMode.ADD)
    A.setValue(2 * n + 1, 2 * n + 1, 0j, addv=PETSc.InsertMode.ADD)

    t1 = time.monotonic()
    A.assemble()
    print(
        f"fill+assembly of J took {time.monotonic()-t0:.2e}s "
        f"(assemble {time.monotonic()-t1:.2e}s)"
    )


if __name__ == "__main__":
    main()

The current output of this code block is

Fill real jacobian with values
single setValuesCSR call took 2.79s
single setValuesCSR call took 9.57s
single setValuesCSR call took 6.47s
single setValuesCSR call took 16.46s
insert_values() loop took 3.53e+01s
insert_values loop + A.setValues for linear forms took 3.91e+01s
fill+assembly of J took 3.91e+01s (assemble 9.00e-04s)

For this you should be able to just call

fem.petsc.assemble_matrix(
    mat,
    fem.form(form),
    bcs=bcs,
    diagonal=diagonal,
)

and, as shown in the snippet below, you can create your sparsity pattern yourself (and pass that into the create_matrix function):

sp = fem.create_sparsity_pattern(form)   # pattern of the compiled form
sp.insert_diagonal(zero_blocks)          # add the extra (here: diagonal) entries
sp.assemble()
A = cpp.la.petsc.create_matrix(mesh.comm, sp)

Here I used insert_diagonal as I only wanted diagonal entries, but you can use sp.insert to insert combinations of row and column entries: https://github.com/FEniCS/dolfinx/blob/main/python/dolfinx/wrappers/la.cpp#L254-L263
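For completeness, here is a minimal sketch of that route (untested), reusing the bilin_form and msh from the original script; extra_rows/extra_cols are placeholder index arrays for the additional couplings you want preallocated, and the insert/assemble calls follow the wrapper linked above:

import numpy as np
from dolfinx import cpp, fem

a = fem.form(bilin_form)             # compiled bilinear form
sp = fem.create_sparsity_pattern(a)  # pattern of the bilinear form alone

# preallocate extra couplings (inside the existing index map)
extra_rows = np.arange(10, dtype=np.int32)  # placeholder row indices
extra_cols = np.arange(10, dtype=np.int32)  # placeholder column indices
sp.insert(extra_rows, extra_cols)

sp.assemble()
A = cpp.la.petsc.create_matrix(msh.comm, sp)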


For this you should be able to just call fem.petsc.assemble_matrix

Thx, this works! (Don’t know why I overlooked this public function)
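For reference, the helper from the original script then becomes (same arguments, only the private fem.petsc._assemble_matrix_mat call is replaced by the public function):

from dolfinx import fem
from dolfinx.fem import petsc  # makes fem.petsc available


def ass_bilinear_form(mat, form, bcs, diagonal):
    # assemble the bilinear form into the preallocated PETSc matrix
    petsc.assemble_matrix(mat, fem.form(form), bcs=bcs, diagonal=diagonal)
    mat.assemble()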

you can create your sparsity pattern (and send that into the create_matrix function)

AFAICS I can’t increase the size of the matrix simply by calling sp.insert_diagonal with row indices that are outside of the index map of sp.

I’ve managed to simplify my code a bit by creating a block matrix for the 4 bilinear forms, but I still need to extend this matrix by two column vectors and two row vectors.

I would probably extend the index map before sending it into the sparsity pattern initializer.

But I guess, since you are adding more rows and columns to the matrices, why not let each of these rows/columns be a separate (rectangular) matrix?

Maybe I’m missing the point, as you tried to avoid this. There is a lot going on in your code, which makes it hard to give guidance.

I think @cdaversin has done something similar to this.

I don’t have any experience with the insert_diagonal function, but indeed I have been working on an application where I needed to add rows and cols to an assembled matrix manually. I have only used petsc4py features to do so, copying the matrix entries (from the matrix assembled by dolfinx) into a bigger PETSc matrix and then added additional entries there.
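For illustration, a minimal petsc4py-only sketch of that copy (the names big/small and the offsets are placeholders; any additional bordering rows/columns would be filled with further setValues calls afterwards):

from petsc4py import PETSc


def embed(big, small, row_offset, col_offset):
    # copy every entry of `small` into `big`, shifted by the given offsets
    indptr, cols, vals = small.getValuesCSR()
    for row in range(small.getSize()[0]):
        start, stop = indptr[row], indptr[row + 1]
        big.setValues(
            row + row_offset,
            cols[start:stop] + col_offset,
            vals[start:stop],
            addv=PETSc.InsertMode.ADD,
        )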

Thx so far for the feedback!

I think extending the index map is also not easily possible; at least I don’t know how to do it without writing C++ code.

The cleanest solution is IMO to use create_matrix_block and add support for linear forms to it. I’ll create a ticket for this on GitHub.

I have a version of the code now that creates the big matrix with a specified CSR format, which is much faster! (a couple of ms instead of 30s)

A = PETSc.Mat().createAIJWithArrays(
    (N, N),
    (final_rows_ind, final_cols, final_values),
)
A.zeroEntries()  # throw away the final_values

The final_values array only contains dummy values, but it is required by the createAIJWithArrays constructor. Do you know if it is possible to set the CSR pattern for preallocation of the matrix without specifying final_values?

I can post a full version of the script later after I’ve verified that it works.
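Side note: if I am not mistaken, petsc4py also accepts a CSR structure without a values array, so the dummy final_values could be dropped. A sketch (untested), reusing the final_rows_ind/final_cols arrays from above:

from petsc4py import PETSc

# preallocate with the CSR structure only; no values array needed
A = PETSc.Mat().createAIJ((N, N), csr=(final_rows_ind, final_cols))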

The optimized code works so far, but only when I don’t have Dirichlet BCs. This issue is not related to the createAIJWithArrays code that I’ve posted above, but to the way I assemble the jacobian.

If I assemble the individual blocks like in

fem.petsc.assemble_matrix(mat_A, A, bcs=bcs, diagonal=1.0)
fem.petsc.assemble_matrix(mat_B, B, bcs=bcs, diagonal=0.0)
fem.petsc.assemble_matrix(mat_C, C, bcs=bcs, diagonal=0.0)
fem.petsc.assemble_matrix(mat_D, D, bcs=bcs, diagonal=1.0)

and then insert them into my jacobian, everything works (with Dirichlet and without Dirichlet BC).

But when I assemble the blocks into a single block matrix (called mat_big), the code only works when I don’t have Dirichlet BCs.

fem.petsc.assemble_matrix_block(
    mat_big,
    [[A, B], [C, D]],
    bcs=bcs,
    diagonal=1.0,
)

I guess that the BC is not correctly applied to the off-diagonal blocks B & C.

Do you know what I have to do to apply the BC to the off-diagonal blocks B & C with diagonal=0?

The last post doesn’t have enough detail for me to respond precisely, but if you assemble a block matrix you need to make sure each block has its own test and trial function space, otherwise DOLFINx cannot tell which blocks are diagonal and which are off-diagonal. Note the use of the clone() method in dolfinx/test_assembler.py at main · FEniCS/dolfinx · GitHub to create a new function space for use in a blocked matrix.
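A minimal sketch of that pattern for a 2x2 block matrix (untested), reusing the mesh and element from the original script:

import ufl
from dolfinx import fem, mesh
from dolfinx.fem.petsc import assemble_matrix_block, create_matrix_block
from mpi4py import MPI
from ufl import dx, inner, nabla_grad

msh = mesh.create_unit_interval(MPI.COMM_WORLD, nx=3000)
V = fem.FunctionSpace(msh, ("Lagrange", 3))
W = V.clone()  # distinct space object, so DOLFINx can tell the blocks apart

u, p = ufl.TrialFunction(V), ufl.TrialFunction(W)
v, q = ufl.TestFunction(V), ufl.TestFunction(W)

a = fem.form(
    [
        [-inner(nabla_grad(u), nabla_grad(v)) * dx,
         -inner(nabla_grad(p), nabla_grad(v)) * dx],
        [-inner(nabla_grad(u), nabla_grad(q)) * dx,
         -inner(nabla_grad(p), nabla_grad(q)) * dx],
    ]
)

mat_big = create_matrix_block(a)
assemble_matrix_block(mat_big, a, bcs=[], diagonal=1.0)
mat_big.assemble()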


It seems as if there are two ways in which a block-like matrix can be assembled: either with a separate function space for each block, or with a single function space built from a MixedElement.

In the former case the full matrix (covering all function spaces) has to be assembled using the assemble_matrix_block function, and in the latter case by using the non-block version assemble_matrix.

Another difference between the two approaches is that in the latter case the full form can be defined by summing up all the forms on the different sub-spaces (e.g. inner(trialfunctions[i], testfunctions[j]) * dx), whereas in the former case a 2D array of forms needs to be passed to assemble_matrix_block.

Is there a preference for one approach over the other, and are there use cases where one is better than the other?
For my use-case I need to be able to assemble matrices where the number of blocks is a function parameter.

edit: The approach with the MixedElement allows defining different BCs on the different sub-spaces, which is not possible when assemble_matrix_block is used.
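For reference, a minimal sketch of the MixedElement variant (untested), assuming the legacy ufl.FiniteElement/ufl.MixedElement API that matches the fem.FunctionSpace calls used above:

import ufl
from dolfinx import fem, mesh
from dolfinx.fem.petsc import assemble_matrix
from mpi4py import MPI
from ufl import dx, inner, nabla_grad

msh = mesh.create_unit_interval(MPI.COMM_WORLD, nx=3000)
el = ufl.FiniteElement("Lagrange", msh.ufl_cell(), 3)
V = fem.FunctionSpace(msh, ufl.MixedElement([el, el]))

trials = ufl.TrialFunctions(V)  # components of the mixed trial function
tests = ufl.TestFunctions(V)

# sum the sub-forms over all sub-space combinations into one monolithic form
a = None
for u_i in trials:
    for v_j in tests:
        term = -inner(nabla_grad(u_i), nabla_grad(v_j)) * dx
        a = term if a is None else a + term

A = assemble_matrix(fem.form(a))
A.assemble()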