Hello,

I have been running a reaction-diffusion problem which is getting quite large. Before deploying it on an HPC, I played around with a few cores on my laptop but noticed that running time increases substantially with increasing number of cores used.

I have looked at the similar discussion topics below but could not solve my issue.

https://fenicsproject.discourse.group/t/mumps-in-parallel-slower-than-in-serial/662

https://fenicsproject.discourse.group/t/fenics-mpi-on-docker-inefficient/3510

Also, I get the following warning: [WARNING] yaksa: 3 leaked handle pool objects. I don’t understand it.

Would you have any suggestion as to how to reduce the computational time?

I am running dolfinx in a Singularity container.

I have reproduced the issue with the minimal example below (1 core= 24 sec, 2 cores=49 sec, 3 cores: 102 sec )

Many thanks for your help!

```
L = 1
W = 0.2
diffusion_time_step=0.1
end_time=10
simu_start_time=0
import numpy as np
import ufl
from mpi4py import MPI
from petsc4py.PETSc import ScalarType
from dolfinx import mesh, fem, plot, io
from dolfinx.fem import FunctionSpace,VectorFunctionSpace, Function,TensorFunctionSpace, Constant
from ufl import TestFunction, TrialFunction, dx, inner,grad, Measure,derivative,system
from petsc4py import PETSc
import time
startT = time.time()
domain = mesh.create_box(MPI.COMM_WORLD, [np.array([0,0,0]), np.array([L, W, W])],
[20,20,20], cell_type=mesh.CellType.hexahedron)
V=VectorFunctionSpace(domain, ("CG", 1), dim=3)
p = TestFunction(V)
prc_prev = Function(V)
prc = TrialFunction(V)
prc_mid = 0.5*(prc + prc_prev)
dt=Constant(domain, diffusion_time_step)
D_p = Constant(domain,2.4e3)
D_pr = Constant(domain,2.4e-15)
kap = Constant(domain,1e5)
kdp = Constant(domain,1.86e-3)
psi_p = Constant(domain,0.01)
psi_pr = Constant(domain,1e-50)
psi_cp = Constant(domain,1e-15)
# Supply term
def sup_func(x):
return np.exp(-100*(x[0]%0.1+x[1]%0.1+x[2]%0.1))
p_i = Function(V.sub(0).collapse()[0])
p_i.interpolate(sup_func)
p_sup = p_i
# Variational problem
F = (
(- (prc[0] - prc_prev[0])*p[0]
- (prc[1] - prc_prev[1])*p[1]
- (prc[2] - prc_prev[2])*p[2])*dx()
+ (- dt*inner(D_p*grad(prc_mid[0]), grad(p[0])))*dx()
+ (+ dt*p_sup*p[0])*dx()
+ (+ dt*(- kap*prc_prev[0]*prc_mid[1]*p[0]
- kap*prc_mid[0]*prc_prev[1]*p[1]
+ kap*prc_prev[0]*prc_prev[1]*p[2]
+ kdp*prc_prev[2]*p[0]
+ kdp*prc_prev[2]*p[1]
- kdp*prc_mid[2]*p[2]))*dx()
+ (+ dt*(- psi_p*prc_mid[0]*p[0]
- psi_pr*prc_mid[1]*p[1]
- psi_cp*prc_mid[2]*p[2]))*dx()
)
# Create boundary condition
tdim = V.mesh.topology.dim
fdim = tdim - 1
domain.topology.create_connectivity(fdim, tdim)
boundary_facets = mesh.exterior_facet_indices(V.mesh.topology)
fixed_dofs_0 = fem.locate_dofs_topological(V.sub(0), fdim, boundary_facets)
fixed_dofs_1 = fem.locate_dofs_topological(V.sub(1), fdim, boundary_facets)
fixed_dofs_2 = fem.locate_dofs_topological(V.sub(2), fdim, boundary_facets)
bc0 = fem.dirichletbc(PETSc.ScalarType(0), fixed_dofs_0, V.sub(0))
bc1 = fem.dirichletbc(PETSc.ScalarType(0), fixed_dofs_1, V.sub(1))
bc2 = fem.dirichletbc(PETSc.ScalarType(0), fixed_dofs_2, V.sub(2))
# Define problem
(a, L) = system(F)
problem=fem.petsc.LinearProblem(a, L, bcs=[bc0,bc1,bc2],petsc_options={"ksp_type": "cg", "ksp_rtol":1e-8, "ksp_atol":1e-8})
# Solve problem
simu_time=simu_start_time
while simu_time < end_time:
prc = problem.solve()
prc.x.scatter_forward()
prc_prev.x.array[:]=prc.x.array
simu_time += dt.value
endT = time.time()
print("The time of execution of above program is :",(endT-startT),flush=True)
```