Conspicuous speedups in parallel computing

I am testing the code from Implementation — FEniCSx tutorial (jsdokken.com). Interestingly, I found that the serial run takes 69.525 seconds, while the parallel run with two processes takes only 3.111 seconds. Why do we get such a huge speedup, roughly 20 times, from just two processes?

Here is the MWE:

# Scaled variable
L = 1
W = 0.2
mu = 1
rho = 1
delta = W/L
gamma = 0.4*delta**2
beta = 1.25
lambda_ = beta
g = gamma

import numpy as np
import ufl

from mpi4py import MPI
from petsc4py.PETSc import ScalarType

from dolfinx import mesh, fem, plot, io

domain = mesh.create_box(MPI.COMM_WORLD, [np.array([0,0,0]), np.array([L, W, W])],
                  [40,20,20], cell_type=mesh.CellType.hexahedron)
V = fem.VectorFunctionSpace(domain, ("CG", 1))

def clamped_boundary(x):
    return np.isclose(x[0], 0)

fdim = domain.topology.dim - 1
boundary_facets = mesh.locate_entities_boundary(domain, fdim, clamped_boundary)

u_D = np.array([0,0,0], dtype=ScalarType)
bc = fem.dirichletbc(u_D, fem.locate_dofs_topological(V, fdim, boundary_facets), V)

T = fem.Constant(domain, ScalarType((0, 0, 0)))

ds = ufl.Measure("ds", domain=domain)

def epsilon(u):
    return ufl.sym(ufl.grad(u)) # Equivalent to 0.5*(ufl.nabla_grad(u) + ufl.nabla_grad(u).T)
def sigma(u):
    return lambda_ * ufl.nabla_div(u) * ufl.Identity(len(u)) + 2*mu*epsilon(u)

u = ufl.TrialFunction(V)
v = ufl.TestFunction(V)
f = fem.Constant(domain, ScalarType((0, 0, -rho*g)))
a = ufl.inner(sigma(u), epsilon(v)) * ufl.dx
L = ufl.dot(f, v) * ufl.dx + ufl.dot(T, v) * ds

import time
problem = fem.petsc.LinearProblem(a, L, bcs=[bc], petsc_options={"ksp_type": "preonly", "pc_type": "lu"})
start = time.time()
uh = problem.solve()
end = time.time()
print(f"{end-start:.3f}", flush=True)

This is because PETSc uses different LU solvers under the hood in serial and in parallel when no solver package is specified: in serial it falls back to its own built-in LU factorization, while in parallel it picks a distributed package such as MUMPS (you can see this in the PC views further down). Consider the following:

# Scaled variable
L = 1
W = 0.2
mu = 1
rho = 1
delta = W/L
gamma = 0.4*delta**2
beta = 1.25
lambda_ = beta
g = gamma

import numpy as np
import ufl
import time
import dolfinx.fem.petsc
from mpi4py import MPI
from petsc4py import PETSc
from dolfinx import mesh, fem, plot, io

ScalarType = PETSc.ScalarType
domain = mesh.create_box(MPI.COMM_WORLD, [np.array([0,0,0]), np.array([L, W, W])],
                  [40,20,20], cell_type=mesh.CellType.hexahedron)
V = fem.VectorFunctionSpace(domain, ("CG", 1))

def clamped_boundary(x):
    return np.isclose(x[0], 0)

fdim = domain.topology.dim - 1
boundary_facets = mesh.locate_entities_boundary(domain, fdim, clamped_boundary)

u_D = np.array([0,0,0], dtype=ScalarType)
bc = fem.dirichletbc(u_D, fem.locate_dofs_topological(V, fdim, boundary_facets), V)

T = fem.Constant(domain, ScalarType((0, 0, 0)))

ds = ufl.Measure("ds", domain=domain)

def epsilon(u):
    return ufl.sym(ufl.grad(u)) # Equivalent to 0.5*(ufl.nabla_grad(u) + ufl.nabla_grad(u).T)
def sigma(u):
    return lambda_ * ufl.nabla_div(u) * ufl.Identity(len(u)) + 2*mu*epsilon(u)

u = ufl.TrialFunction(V)
v = ufl.TestFunction(V)
f = fem.Constant(domain, ScalarType((0, 0, -rho*g)))
a = ufl.inner(sigma(u), epsilon(v)) * ufl.dx
L = ufl.dot(f, v) * ufl.dx + ufl.dot(T, v) * ds

# Compile forms
af = fem.form(a)
Lf = fem.form(L)

# Assemble LHS
start_A = time.perf_counter()
A = fem.petsc.assemble_matrix(af, bcs=[bc])
A.assemble()
end_A = time.perf_counter()
print(f"Assemble A: {end_A-start_A:.2e}", flush=True)

# Assemble RHS
start_b = time.perf_counter()
b = fem.petsc.assemble_vector(Lf)
fem.petsc.apply_lifting(b, [af], [[bc]])
b.ghostUpdate(addv=PETSc.InsertMode.ADD_VALUES, mode=PETSc.ScatterMode.REVERSE)
fem.petsc.set_bc(b, [bc])
end_b = time.perf_counter()
print(f"Assemble b: {end_b-start_b:.2e}", flush=True)

# Define solver
solver = PETSc.KSP().create(domain.comm)
solver.setOperators(A)
solver.setType(PETSc.KSP.Type.PREONLY)
solver.getPC().setType(PETSc.PC.Type.LU)
solver.getPC().setFactorSolverType("mumps")

# Solve problem
uh = fem.Function(V)
start_ksp = time.perf_counter()
solver.solve(b, uh.vector)
end_ksp = time.perf_counter()
print(f"{solver.getConvergedReason()=}", flush=True)
print(f"KSP solve: {end_ksp - start_ksp:.2e}", flush=True)
solver.destroy()
A.destroy()
b.destroy()


# Same solve through LinearProblem, now requesting MUMPS explicitly
problem = fem.petsc.LinearProblem(a, L, bcs=[bc], petsc_options={"ksp_type": "preonly", "pc_type": "lu",
"pc_factor_mat_solver_type": "mumps"})
start = time.perf_counter()
uh = problem.solve()
end = time.perf_counter()
print(f"LinearProblem {end-start:.3f}", flush=True)

Which yields

Assemble A: 3.08e-01
Assemble b: 1.65e-02
solver.getConvergedReason()=4
KSP solve: 2.62e+00
LinearProblem 2.978

in serial and

Assemble A: 1.76e-01
Assemble A: 1.76e-01
Assemble b: 1.14e-02
Assemble b: 1.14e-02
solver.getConvergedReason()=4
KSP solve: 1.84e+00
solver.getConvergedReason()=4
KSP solve: 1.84e+00
LinearProblem 2.055
LinearProblem 2.055

in parallel.
However, if you remove "pc_factor_mat_solver_type": "mumps" from the options, you get the timings you shared.
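
You can also check which factorization package PETSc actually picked without reading the full PC view below. A minimal sketch, not part of the original script, reusing the problem object from above:

pc = problem.solver.getPC()
# Query the factorization package chosen for the LU preconditioner. Without
# "pc_factor_mat_solver_type" this typically reports "petsc" in serial and a
# distributed package such as "mumps" in parallel.
print(f"Factor solver package: {pc.getFactorSolverType()}", flush=True)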

If you add problem.solver.getPC().view() at the end of the script, you can see the output below.
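
For placement, a short sketch using the same problem object (my wording, not part of the original script):

uh = problem.solve()
# Print the full preconditioner configuration, including the factorization
# package and, for MUMPS, its ICNTL/CNTL parameters
problem.solver.getPC().view()
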
Serial:

type: lu
    out-of-place factorization
    tolerance for zero pivot 2.22045e-14
    matrix ordering: nd
    factor fill ratio given 5., needed 20.4851
      Factored matrix follows:
        Mat Object: 1 MPI processes
          type: seqaij
          rows=54243, cols=54243, bs=3
          package used to perform factorization: petsc
          total: nonzeros=83008971, allocated nonzeros=83008971
            using I-node routines: found 17882 nodes, limit used is 5
  linear system matrix = precond matrix:
  Mat Object: (dolfinx_solve_140369687918432) 1 MPI processes
    type: seqaij
    rows=54243, cols=54243, bs=3
    total: nonzeros=4052169, allocated nonzeros=4052169
    total number of mallocs used during MatSetValues calls=0
      using I-node routines: found 18081 nodes, limit used is 5

Parallel:

PC Object: (dolfinx_solve_140718610168272) 2 MPI processes
  type: lu
    out-of-place factorization
    tolerance for zero pivot 2.22045e-14
    matrix ordering: external
    factor fill ratio given 0., needed 0.
      Factored matrix follows:
        Mat Object: 2 MPI processes
          type: mumps
          rows=54243, cols=54243
          package used to perform factorization: mumps
          total: nonzeros=78221205, allocated nonzeros=78221205
            MUMPS run parameters:
              SYM (matrix type):                   0
              PAR (host participation):            1
              ICNTL(1) (output for error):         6
              ICNTL(2) (output of diagnostic msg): 0
              ICNTL(3) (output for global info):   0
              ICNTL(4) (level of printing):        0
              ICNTL(5) (input mat struct):         0
              ICNTL(6) (matrix prescaling):        7
              ICNTL(7) (sequential matrix ordering):7
              ICNTL(8) (scaling strategy):        77
              ICNTL(10) (max num of refinements):  0
              ICNTL(11) (error analysis):          0
              ICNTL(12) (efficiency control):                         1
              ICNTL(13) (sequential factorization of the root node):  0
              ICNTL(14) (percentage of estimated workspace increase): 20
              ICNTL(18) (input mat struct):                           3
              ICNTL(19) (Schur complement info):                      0
              ICNTL(20) (RHS sparse pattern):                         10
              ICNTL(21) (solution struct):                            1
              ICNTL(22) (in-core/out-of-core facility):               0
              ICNTL(23) (max size of memory can be allocated locally):0
              ICNTL(24) (detection of null pivot rows):               0
              ICNTL(25) (computation of a null space basis):          0
              ICNTL(26) (Schur options for RHS or solution):          0
              ICNTL(27) (blocking size for multiple RHS):             -32
              ICNTL(28) (use parallel or sequential ordering):        1
              ICNTL(29) (parallel ordering):                          0
              ICNTL(30) (user-specified set of entries in inv(A)):    0
              ICNTL(31) (factors is discarded in the solve phase):    0
              ICNTL(33) (compute determinant):                        0
              ICNTL(35) (activate BLR based factorization):           0
              ICNTL(36) (choice of BLR factorization variant):        0
              ICNTL(38) (estimated compression rate of LU factors):   333
              CNTL(1) (relative pivoting threshold):      0.01 
              CNTL(2) (stopping criterion of refinement): 1.49012e-08 
              CNTL(3) (absolute pivoting threshold):      0. 
              CNTL(4) (value of static pivoting):         -1. 
              CNTL(5) (fixation for null pivots):         0. 
              CNTL(7) (dropping parameter for BLR):       0. 
              RINFO(1) (local estimated flops for the elimination after analysis): 
                [0] 4.79274e+10 
                [1] 5.09026e+10 
              RINFO(2) (local estimated flops for the assembly after factorization): 
                [0]  7.16551e+07 
                [1]  6.464e+07 
              RINFO(3) (local estimated flops for the elimination after factorization): 
                [0]  4.79274e+10 
                [1]  5.09026e+10 
              INFO(15) (estimated size of (in MB) MUMPS internal data for running numerical factorization): 
              [0] 495
              [1] 491
              INFO(16) (size of (in MB) MUMPS internal data used during numerical factorization): 
                [0] 495
                [1] 491
              INFO(23) (num of pivots eliminated on this processor after factorization): 
                [0] 29043
                [1] 25200
              RINFOG(1) (global estimated flops for the elimination after analysis): 9.88301e+10 
              RINFOG(2) (global estimated flops for the assembly after factorization): 1.36295e+08 
              RINFOG(3) (global estimated flops for the elimination after factorization): 9.88301e+10 
              (RINFOG(12) RINFOG(13))*2^INFOG(34) (determinant): (0.,0.)*(2^0)
              INFOG(3) (estimated real workspace for factors on all processors after analysis): 78221205
              INFOG(4) (estimated integer workspace for factors on all processors after analysis): 637590
              INFOG(5) (estimated maximum front size in the complete tree): 2583
              INFOG(6) (number of nodes in the complete tree): 985
              INFOG(7) (ordering option effectively used after analysis): 5
              INFOG(8) (structural symmetry in percent of the permuted matrix after analysis): -1
              INFOG(9) (total real/complex workspace to store the matrix factors after factorization): 78221205
              INFOG(10) (total integer space store the matrix factors after factorization): 637604
              INFOG(11) (order of largest frontal matrix after factorization): 2583
              INFOG(12) (number of off-diagonal pivots): 0
              INFOG(13) (number of delayed pivots after factorization): 0
              INFOG(14) (number of memory compress after factorization): 0
              INFOG(15) (number of steps of iterative refinement after solution): 0
              INFOG(16) (estimated size (in MB) of all MUMPS internal data for factorization after analysis: value on the most memory consuming processor): 495
              INFOG(17) (estimated size of all MUMPS internal data for factorization after analysis: sum over all processors): 986
              INFOG(18) (size of all MUMPS internal data allocated during factorization: value on the most memory consuming processor): 495
              INFOG(19) (size of all MUMPS internal data allocated during factorization: sum over all processors): 986
              INFOG(20) (estimated number of entries in the factors): 78221205
              INFOG(21) (size in MB of memory effectively used during factorization - value on the most memory consuming processor): 423
              INFOG(22) (size in MB of memory effectively used during factorization - sum over all processors): 811
              INFOG(23) (after analysis: value of ICNTL(6) effectively used): 0
              INFOG(24) (after analysis: value of ICNTL(12) effectively used): 1
              INFOG(25) (after factorization: number of pivots modified by static pivoting): 0
              INFOG(28) (after factorization: number of null pivots encountered): 0
              INFOG(29) (after factorization: effective number of entries in the factors (sum over all processors)): 78221205
              INFOG(30, 31) (after solution: size in Mbytes of memory used during solution phase): 480, 953
              INFOG(32) (after analysis: type of analysis done): 1
              INFOG(33) (value used for ICNTL(8)): 7
              INFOG(34) (exponent of the determinant if determinant is requested): 0
              INFOG(35) (after factorization: number of entries taking into account BLR factor compression - sum over all processors): 78221205
              INFOG(36) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - value on the most memory consuming processor): 0
              INFOG(37) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - sum over all processors): 0
              INFOG(38) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - value on the most memory consuming processor): 0
              INFOG(39) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - sum over all processors): 0
  linear system matrix = precond matrix:
  Mat Object: (dolfinx_solve_140718610168272) 2 MPI processes
    type: mpiaij
    rows=54243, cols=54243, bs=3
    total: nonzeros=4052169, allocated nonzeros=4052169
    total number of mallocs used during MatSetValues calls=0
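
One more note on the measurements themselves: in an MPI run every rank executes the print statements, which is why each timing line appears twice in the parallel output above. To get a single number, start all ranks together and report the slowest one; a minimal sketch, not part of the original scripts, reusing the problem object:

from mpi4py import MPI
import time

comm = MPI.COMM_WORLD
comm.Barrier()                      # start the timer on all ranks together
start = time.perf_counter()
uh = problem.solve()
elapsed = time.perf_counter() - start
# The wall-clock time of the run is bounded by the slowest rank
max_elapsed = comm.allreduce(elapsed, op=MPI.MAX)
if comm.rank == 0:
    print(f"Solve (max over {comm.size} ranks): {max_elapsed:.3f} s", flush=True)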