Reproducibility in parallel programming

Dear community,

Recently, I found the results can not be reproduced in parallel programming. Here is the MWE:

import dolfinx
import ufl
from dolfinx.generation import RectangleMesh
from dolfinx.mesh import CellType
from dolfinx.fem import VectorFunctionSpace, FunctionSpace, Function, DirichletBC
from dolfinx.nls import NewtonSolver
from mpi4py import MPI
from petsc4py import PETSc
import numpy as np

# mesh and function space
mesh = RectangleMesh(MPI.COMM_WORLD, [[0, 0, 0], [1, 1, 0]], [200, 200], CellType.quadrilateral)
fdim = mesh.topology.dim - 1
V = VectorFunctionSpace(mesh, ("CG", 1))
S = FunctionSpace(mesh, ("DG", 0))
u = Function(V)  # test function
v = ufl.TestFunction(V)  # trial function

# kinematics
lambda_, mu = 0.5769, 0.3846
P = 2.0*mu*ufl.sym(ufl.grad(u)) + lambda_*ufl.tr(ufl.sym(ufl.grad(u)))*ufl.Identity(len(u))
dx = ufl.Measure("dx", metadata={"quadrature_degree": 4})
R = ufl.inner(ufl.grad(v), P)*dx

# boundary conditions
def left(x):
    return np.isclose(x[0], 0)
def right(x):
    return np.isclose(x[0], 1)

u1 = Function(V)
boundary_facets = dolfinx.mesh.locate_entities_boundary(mesh, fdim, left)
with u1.vector.localForm() as loc:
    loc.set(0)
bc1 = DirichletBC(u1, dolfinx.fem.locate_dofs_topological(
    V, fdim, boundary_facets))
u2 = Function(V)

boundary_facets = dolfinx.mesh.locate_entities_boundary(mesh, fdim, right)
with u2.vector.localForm() as loc:
    loc.set(1)
bc2 = DirichletBC(u2, dolfinx.fem.locate_dofs_topological(
    V, fdim, boundary_facets))
bcs = [bc1, bc2]

# FEniCSx built-in solver
problem = dolfinx.fem.NonlinearProblem(R, u, bcs)
solver = NewtonSolver(MPI.COMM_WORLD, problem)
solver.solve(u)
print(u.vector.norm())

If we run this code 10 times with 3 cores, we can see that the results are random:

167.69866130498036
167.69866130497905
167.69866130498082
167.69866130498008
167.6986613049795
167.69866130498286
167.69866130498218
167.69866130497863
167.69866130498133
167.69866130498036

Although these results are relatively close to each other, the small difference can still be accumulated to a much larger one in the end in time-stepping problems. I wonder if we have some ways to ensure reproducibility? Or at least if we can ensure reproducibility with the same number of cores?

I cannot reproduce this issue using the dolfinx/dolfinx:nightly docker container with the following adaptation:

import dolfinx
import ufl
from dolfinx.mesh import create_rectangle
from dolfinx.mesh import CellType
from dolfinx.fem import VectorFunctionSpace, FunctionSpace, Function, dirichletbc
from dolfinx.nls.petsc import NewtonSolver
from mpi4py import MPI
from petsc4py import PETSc
import numpy as np

# mesh and function space
mesh = create_rectangle(MPI.COMM_WORLD, [[0, 0], [1, 1]], [200, 200], cell_type=CellType.quadrilateral)
fdim = mesh.topology.dim - 1
V = VectorFunctionSpace(mesh, ("CG", 1))
S = FunctionSpace(mesh, ("DG", 0))
u = Function(V)  # test function
v = ufl.TestFunction(V)  # trial function

# kinematics
lambda_, mu = 0.5769, 0.3846
P = 2.0*mu*ufl.sym(ufl.grad(u)) + lambda_*ufl.tr(ufl.sym(ufl.grad(u)))*ufl.Identity(len(u))
dx = ufl.Measure("dx", metadata={"quadrature_degree": 4})
R = ufl.inner(ufl.grad(v), P)*dx

# boundary conditions
def left(x):
    return np.isclose(x[0], 0)
def right(x):
    return np.isclose(x[0], 1)

u1 = Function(V)
boundary_facets = dolfinx.mesh.locate_entities_boundary(mesh, fdim, left)
with u1.vector.localForm() as loc:
    loc.set(0)
bc1 = dirichletbc(u1, dolfinx.fem.locate_dofs_topological(
    V, fdim, boundary_facets))
u2 = Function(V)

boundary_facets = dolfinx.mesh.locate_entities_boundary(mesh, fdim, right)
with u2.vector.localForm() as loc:
    loc.set(1)
bc2 = dirichletbc(u2, dolfinx.fem.locate_dofs_topological(
    V, fdim, boundary_facets))
bcs = [bc1, bc2]

# FEniCSx built-in solver
problem = dolfinx.fem.petsc.NonlinearProblem(R, u, bcs)
solver = NewtonSolver(MPI.COMM_WORLD, problem)
solver.solve(u)
print(u.vector.norm())

But please note that the difference in norm is at the magnitude of 10^{-12}, which is fairly close to machine precision and lower than the default tolerances of the Newton Solver (and most likely the underlying ksp solver).