Problem solving in parallel slower than in serial

This was due to garbage collection of PETSc objects, see: Add explicit destruction of PETSc Python objects by garth-wells · Pull Request #2560 · FEniCS/dolfinx · GitHub, which resolved the issue.
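If you are stuck on a build that predates that pull request, a possible workaround (a sketch, assuming a recent enough petsc4py that provides PETSc.garbage_cleanup) is to flush the accumulated PETSc garbage yourself inside the time loop:

import gc
from petsc4py import PETSc


def flush_petsc_garbage():
    # Finalize unreferenced petsc4py wrappers on the Python side
    gc.collect()
    # Destroy PETSc objects queued for (collective) destruction;
    # available in recent petsc4py releases
    PETSc.garbage_cleanup()

Calling such a helper once per time step (or every few steps) keeps the number of live PETSc objects bounded, which is roughly what the fix above now does automatically.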

Please note that the problem you are solving isn’t very large, i.e. num dofs = 27 783.
In general I would suggest having between 50 000 and 100 000 dofs per process.
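To check how many dofs each rank actually owns, you can query the dofmap’s index map. A minimal sketch, using the same attributes as the script below (V is the function space defined there):

from mpi4py import MPI

# Number of dofs owned by this rank (excluding ghosts), times the block size
owned_dofs = V.dofmap.index_map.size_local * V.dofmap.index_map_bs
print(f"Rank {MPI.COMM_WORLD.rank}: {owned_dofs} owned dofs", flush=True)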

Finally, I cannot reproduce your results with the docker image dolfinx/dolfinx:nightly:

from ufl import TestFunction, TrialFunction, dx, inner, grad, system
from dolfinx.fem import VectorFunctionSpace, Function, Constant
import time
from petsc4py import PETSc
from dolfinx import mesh, fem
import dolfinx.fem.petsc  # explicit import so that fem.petsc.LinearProblem is available
from mpi4py import MPI
import numpy as np
L = 1
W = 0.2
diffusion_time_step = 0.1
end_time = 10
simu_start_time = 0


startT = time.time()

domain = mesh.create_box(MPI.COMM_WORLD, [np.array([0, 0, 0]), np.array([L, W, W])],
                         [20, 20, 20], cell_type=mesh.CellType.hexahedron)

V = VectorFunctionSpace(domain, ("CG", 1), dim=3)


p = TestFunction(V)
prc_prev = Function(V)
prc = TrialFunction(V)
prc_mid = 0.5*(prc + prc_prev)

dt = Constant(domain, diffusion_time_step)

D_p = Constant(domain, 2.4e3)
D_pr = Constant(domain, 2.4e-15)
kap = Constant(domain, 1e5)
kdp = Constant(domain, 1.86e-3)
psi_p = Constant(domain, 0.01)
psi_pr = Constant(domain, 1e-50)
psi_cp = Constant(domain, 1e-15)

# Supply term


def sup_func(x):
    return np.exp(-100*(x[0] % 0.1+x[1] % 0.1+x[2] % 0.1))


p_i = Function(V.sub(0).collapse()[0])
p_i.interpolate(sup_func)
p_sup = p_i

# Variational problem
F = (
    (- (prc[0] - prc_prev[0])*p[0]
     - (prc[1] - prc_prev[1])*p[1]
     - (prc[2] - prc_prev[2])*p[2])*dx()
    + (- dt*inner(D_p*grad(prc_mid[0]), grad(p[0])))*dx()
    + (+ dt*p_sup*p[0])*dx()
    + (+ dt*(- kap*prc_prev[0]*prc_mid[1]*p[0]
             - kap*prc_mid[0]*prc_prev[1]*p[1]
             + kap*prc_prev[0]*prc_prev[1]*p[2]
             + kdp*prc_prev[2]*p[0]
             + kdp*prc_prev[2]*p[1]
             - kdp*prc_mid[2]*p[2]))*dx()
    + (+ dt*(- psi_p*prc_mid[0]*p[0]
             - psi_pr*prc_mid[1]*p[1]
             - psi_cp*prc_mid[2]*p[2]))*dx()
)

# Create boundary condition
tdim = V.mesh.topology.dim
fdim = tdim - 1
domain.topology.create_connectivity(fdim, tdim)
boundary_facets = mesh.exterior_facet_indices(V.mesh.topology)
fixed_dofs_0 = fem.locate_dofs_topological(V.sub(0), fdim, boundary_facets)
fixed_dofs_1 = fem.locate_dofs_topological(V.sub(1), fdim, boundary_facets)
fixed_dofs_2 = fem.locate_dofs_topological(V.sub(2), fdim, boundary_facets)
bc0 = fem.dirichletbc(PETSc.ScalarType(0), fixed_dofs_0, V.sub(0))
bc1 = fem.dirichletbc(PETSc.ScalarType(0), fixed_dofs_1, V.sub(1))
bc2 = fem.dirichletbc(PETSc.ScalarType(0), fixed_dofs_2, V.sub(2))
# Define problem
(a, L) = system(F)
problem = fem.petsc.LinearProblem(a, L, bcs=[bc0, bc1, bc2], petsc_options={
                                  "ksp_type": "cg",
                                  "ksp_rtol": 1e-8, "ksp_atol": 1e-8})

# Solve problem
simu_time = simu_start_time
while simu_time < end_time:
    prc = problem.solve()
    prc.x.scatter_forward()
    prc_prev.x.array[:] = prc.x.array
    simu_time += dt.value


endT = time.time()
print(f"Num dofs = {V.dofmap.index_map.size_global * V.dofmap.index_map_bs}")
print(f"{mesh.comm.rank} The time of execution of above program is : {endT-startT}", flush=True)

Docker was run as:

docker pull dolfinx/dolfinx:nightly
docker run -ti --network=host -e DISPLAY=$DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix -v $(pwd):/fenics/shared -w /fenics/shared --rm --shm-size=512m dolfinx/dolfinx:nightly

gives me:

root@dokken-XPS-15-9560:/fenics/shared# python3 mwe.py 
Num dofs = 27783
0 The time of execution of above program is : 37.48247265815735

root@dokken-XPS-15-9560:/fenics/shared# mpirun -n 2 python3 mwe.py 
Num dofs = 27783
0 The time of execution of above program is : 22.22110080718994
Num dofs = 27783
1 The time of execution of above program is : 22.2211012840271

root@dokken-XPS-15-9560:/fenics/shared# mpirun -n 3 python3 mwe.py 
Num dofs = 27783
0 The time of execution of above program is : 14.786735773086548
Num dofs = 27783
1 The time of execution of above program is : 14.786738634109497
Num dofs = 27783
2 The time of execution of above program is : 14.786739587783813

root@dokken-XPS-15-9560:/fenics/shared# mpirun -n 4 python3 mwe.py 
Num dofs = 27783
0 The time of execution of above program is : 12.08518934249878
Num dofs = 27783
1 The time of execution of above program is : 12.085187435150146
Num dofs = 27783
2 The time of execution of above program is : 12.085189819335938
Num dofs = 27783
3 The time of execution of above program is : 12.085186004638672

As you can see, when going from 3 to 4 processes there is barely any improvement, as you have very few dofs on each process.

Also note that you haven’t specified a preconditioner, which is crucial for good performance.
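For reference, a preconditioner can be passed through petsc_options. Below is a sketch of the same LinearProblem call with an algebraic multigrid preconditioner; the hypre/BoomerAMG choice is only illustrative (it requires PETSc built with hypre) and is not tuned for your problem. Without a pc_type, PETSc falls back to its default (ILU in serial, block Jacobi with ILU in parallel).

problem = fem.petsc.LinearProblem(
    a, L, bcs=[bc0, bc1, bc2],
    petsc_options={
        "ksp_type": "cg",
        "pc_type": "hypre",            # or e.g. "gamg", "jacobi"
        "pc_hypre_type": "boomeramg",
        "ksp_rtol": 1e-8,
        "ksp_atol": 1e-8,
    },
)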