In the code below, when I use cell_type=mesh.CellType.hexahedron, the results are the same whether I run on one processor or multiple processors.
However, when I change to cell_type=mesh.CellType.tetrahedron, I get different results depending on the number of processors.
Could anyone help me understand why using component-wise Dirichlet conditions with cell_type=mesh.CellType.tetrahedron leads to inconsistent results when running with different numbers of processors?
Thanks!
from dolfinx import mesh, fem, io, default_scalar_type
from dolfinx.fem.petsc import LinearProblem
from mpi4py import MPI
import ufl
import numpy as np
L = 1
W = 0.2
mu = 1
rho = 1
delta = W / L
gamma = 0.4 * delta**2
beta = 1.25
lambda_ = beta
g = gamma
domain = mesh.create_box(
MPI.COMM_WORLD,
[np.array([0, 0, 0]), np.array([L, W, W])],
[20, 5, 5],
cell_type=mesh.CellType.tetrahedron,
# cell_type=mesh.CellType.hexahedron,
)
V = fem.functionspace(domain, ("Lagrange", 1, (domain.geometry.dim,)))
point_1_coords = np.array([0, 0, 0])
point_2_coords = np.array([0, W, 0])
point_3_coords = np.array([L, 0, 0])
def point_1(x):
return np.linalg.norm(x.T - point_1_coords, axis=1) <= 1e-6
def point_2(x):
return np.linalg.norm(x.T - point_2_coords, axis=1) <= 1e-6
def point_3(x):
return np.linalg.norm(x.T - point_3_coords, axis=1) <= 1e-6
boundary_vertex_1 = mesh.locate_entities_boundary(domain, 0, point_1)
boundary_vertex_2 = mesh.locate_entities_boundary(domain, 0, point_2)
boundary_vertex_3 = mesh.locate_entities_boundary(domain, 0, point_3)
# print(f"boundary_vertex_1: {boundary_vertex_1} on process {MPI.COMM_WORLD.rank}")
# print(f"boundary_vertex_2: {boundary_vertex_2} on process {MPI.COMM_WORLD.rank}")
# print(f"boundary_vertex_3: {boundary_vertex_3} on process {MPI.COMM_WORLD.rank}")
# Component wise constraints (for tetrahedron results depend on number of processors)
constraint_dofs_x = []
constraint_dofs_x.extend(fem.locate_dofs_topological(V.sub(0), 0, boundary_vertex_1))
constraint_dofs_y = []
constraint_dofs_y.extend(fem.locate_dofs_topological(V.sub(1), 0, boundary_vertex_1))
constraint_dofs_y.extend(fem.locate_dofs_topological(V.sub(1), 0, boundary_vertex_2))
constraint_dofs_z = []
constraint_dofs_z.extend(fem.locate_dofs_topological(V.sub(2), 0, boundary_vertex_1))
constraint_dofs_z.extend(fem.locate_dofs_topological(V.sub(2), 0, boundary_vertex_2))
constraint_dofs_z.extend(fem.locate_dofs_topological(V.sub(2), 0, boundary_vertex_3))
# print(f"constraint_dofs_x: {constraint_dofs_x} on process {MPI.COMM_WORLD.rank}")
# print(f"constraint_dofs_y: {constraint_dofs_y} on process {MPI.COMM_WORLD.rank}")
# print(f"constraint_dofs_z: {constraint_dofs_z} on process {MPI.COMM_WORLD.rank}")
bcx = fem.dirichletbc(
default_scalar_type(0), np.array(constraint_dofs_x, dtype=np.int32), V.sub(0)
)
bcy = fem.dirichletbc(
default_scalar_type(0), np.array(constraint_dofs_y, dtype=np.int32), V.sub(1)
)
bcz = fem.dirichletbc(
default_scalar_type(0), np.array(constraint_dofs_z, dtype=np.int32), V.sub(2)
)
bc = [bcx, bcy, bcz]
# # Constraints in all directions (works fine)
# u_D = np.array([0, 0, 0], dtype=default_scalar_type)
# bc1 = fem.dirichletbc(u_D, fem.locate_dofs_topological(V, 0, boundary_vertex_1), V)
# bc2 = fem.dirichletbc(u_D, fem.locate_dofs_topological(V, 0, boundary_vertex_2), V)
# bc3 = fem.dirichletbc(u_D, fem.locate_dofs_topological(V, 0, boundary_vertex_3), V)
# bc = [bc1, bc2, bc3]
T = fem.Constant(domain, default_scalar_type((0, 0, 0)))
ds = ufl.Measure("ds", domain=domain)
def epsilon(u):
return ufl.sym(
ufl.grad(u)
) # Equivalent to 0.5*(ufl.nabla_grad(u) + ufl.nabla_grad(u).T)
def sigma(u):
return lambda_ * ufl.nabla_div(u) * ufl.Identity(len(u)) + 2 * mu * epsilon(u)
u = ufl.TrialFunction(V)
v = ufl.TestFunction(V)
f = fem.Constant(domain, default_scalar_type((0, 0, -rho * g)))
a = ufl.inner(sigma(u), epsilon(v)) * ufl.dx
L = ufl.dot(f, v) * ufl.dx + ufl.dot(T, v) * ds
problem = LinearProblem(
a, L, bcs=bc, petsc_options={"ksp_type": "preonly", "pc_type": "lu"}
)
uh = problem.solve()
local_max = np.max(np.abs(uh.x.array))
print("local_max on rank", MPI.COMM_WORLD.rank, "= ", local_max)
global_max = MPI.COMM_WORLD.allreduce(local_max, op=MPI.MAX)
if MPI.COMM_WORLD.rank == 0:
print("global_max = ", global_max)
# with io.XDMFFile(domain.comm, "test_test_1.xdmf", "w") as xdmf:
# xdmf.write_mesh(domain)
# uh.name = "Deformation"
# xdmf.write_function(uh)
Could it be that your problem is simply ill-posed? Prescribing Dirichlet conditions at isolated boundary points is not a mathematically permitted operation (point values are not well-defined for H1 functions in 3D), and whether this manifests in the discrete problem can differ per element type, mesh refinement, linear solver, etc. For example, on my system your cell_type=mesh.CellType.hexahedron case with 1 process gives inf results.
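For reference, a minimal sketch of the usual well-posed alternative, which clamps an entire face instead of isolated points (the choice of the x = 0 face here is purely illustrative, not part of your problem setup):
# Hypothetical alternative: constrain all components on the whole x = 0 face
fdim = domain.topology.dim - 1
clamped_facets = mesh.locate_entities_boundary(domain, fdim, lambda x: np.isclose(x[0], 0.0))
clamped_dofs = fem.locate_dofs_topological(V, fdim, clamped_facets)
bc = [fem.dirichletbc(np.zeros(domain.geometry.dim, dtype=default_scalar_type), clamped_dofs, V)]
Clamping a whole face also removes all rigid body motions, so the resulting operator has no null space.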
That said, I am still wondering what happens when I use cell_type=mesh.CellType.tetrahedron and why the results differ depending on the number of processors.
With one process, I get
petsc4py.PETSc.Error: error code 71
[0] KSPSolve() at /usr/local/petsc/src/ksp/ksp/interface/itfunc.c:1089
[0] KSPSolve_Private() at /usr/local/petsc/src/ksp/ksp/interface/itfunc.c:838
[0] KSPSetUp() at /usr/local/petsc/src/ksp/ksp/interface/itfunc.c:427
[0] PCSetUp() at /usr/local/petsc/src/ksp/pc/interface/precon.c:1086
[0] PCSetUp_LU() at /usr/local/petsc/src/ksp/pc/impls/factor/lu/lu.c:121
[0] MatLUFactorNumeric() at /usr/local/petsc/src/mat/interface/matrix.c:3312
[0] MatLUFactorNumeric_SeqAIJ_Inode() at /usr/local/petsc/src/mat/impls/aij/seq/inode.c:1635
[0] MatPivotCheck() at /usr/local/petsc/include/petsc/private/matimpl.h:874
[0] MatPivotCheck_none() at /usr/local/petsc/include/petsc/private/matimpl.h:859
[0] Zero pivot in LU factorization: https://petsc.org/release/faq/#zeropivot
[0] Zero pivot row 2266 value 9.37531e-15 tolerance 2.22045e-14
on the tetrahedral grid.
One might then wonder why this happens in serial and not in parallel. That can be highlighted by inspecting the KSP/PC configuration after (trying) to solve the system, for instance as sketched below.
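A minimal sketch of one way to do this, assuming the LinearProblem instance from the script above is called problem and that it exposes its PETSc KSP as problem.solver (as recent DOLFINx versions do):
# Inspect which factorization backend PETSc chose, even if the solve fails
try:
    uh = problem.solve()
except Exception as exc:  # the serial tetrahedron run aborts with the zero-pivot error
    print(f"Solve failed: {exc}")
problem.solver.view()  # prints the KSP/PC configuration shown below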
In serial it yields:
PC Object: (dolfinx_linearproblem_131380451237840) 1 MPI process
type: lu
out-of-place factorization
tolerance for zero pivot 2.22045e-14
matrix ordering: nd
factor fill ratio given 5., needed 6.87672
Factored matrix follows:
Mat Object: (dolfinx_linearproblem_131380451237840) 1 MPI process
type: seqaij
rows=2268, cols=2268, bs=3
package used to perform factorization: petsc
total: nonzeros=567288, allocated nonzeros=567288
using I-node routines: found 752 nodes, limit used is 5
linear system matrix = precond matrix:
while in parallel:
PC Object: (dolfinx_linearproblem_125992758287040) 2 MPI processes
type: lu
out-of-place factorization
tolerance for zero pivot 2.22045e-14
matrix ordering: external
factor fill ratio given 0., needed 0.
Factored matrix follows:
Mat Object: (dolfinx_linearproblem_125992758287040) 2 MPI processes
type: mumps
rows=2268, cols=2268
package used to perform factorization: mumps
total: nonzeros=587574, allocated nonzeros=587574
MUMPS run parameters:
Use -dolfinx_linearproblem_125992758287040ksp_view ::ascii_info_detail to display information for all processes
RINFOG(1) (global estimated flops for the elimination after analysis): 8.9479e+07
RINFOG(2) (global estimated flops for the assembly after factorization): 450522.
RINFOG(3) (global estimated flops for the elimination after factorization): 8.9479e+07
(RINFOG(12) RINFOG(13))*2^INFOG(34) (determinant): (0.,0.)*(2^0)
INFOG(3) (estimated real workspace for factors on all processors after analysis): 587574
INFOG(4) (estimated integer workspace for factors on all processors after analysis): 15796
INFOG(5) (estimated maximum front size in the complete tree): 261
INFOG(6) (number of nodes in the complete tree): 68
INFOG(7) (ordering option effectively used after analysis): 2
INFOG(8) (structural symmetry in percent of the permuted matrix after analysis): -1
INFOG(9) (total real/complex workspace to store the matrix factors after factorization): 587574
INFOG(10) (total integer space store the matrix factors after factorization): 15796
INFOG(11) (order of largest frontal matrix after factorization): 261
INFOG(12) (number of off-diagonal pivots): 0
INFOG(13) (number of delayed pivots after factorization): 0
INFOG(14) (number of memory compress after factorization): 0
INFOG(15) (number of steps of iterative refinement after solution): 0
INFOG(16) (estimated size (in MB) of all MUMPS internal data for factorization after analysis: value on the most memory consuming processor): 10
INFOG(17) (estimated size of all MUMPS internal data for factorization after analysis: sum over all processors): 20
INFOG(18) (size of all MUMPS internal data allocated during factorization: value on the most memory consuming processor): 10
INFOG(19) (size of all MUMPS internal data allocated during factorization: sum over all processors): 20
INFOG(20) (estimated number of entries in the factors): 587574
INFOG(21) (size in MB of memory effectively used during factorization - value on the most memory consuming processor): 10
INFOG(22) (size in MB of memory effectively used during factorization - sum over all processors): 19
INFOG(23) (after analysis: value of ICNTL(6) effectively used): 0
INFOG(24) (after analysis: value of ICNTL(12) effectively used): 1
INFOG(25) (after factorization: number of pivots modified by static pivoting): 0
INFOG(28) (after factorization: number of null pivots encountered): 0
INFOG(29) (after factorization: effective number of entries in the factors (sum over all processors)): 587574
INFOG(30, 31) (after solution: size in Mbytes of memory used during solution phase): 9, 17
INFOG(32) (after analysis: type of analysis done): 1
INFOG(33) (value used for ICNTL(8)): 7
INFOG(34) (exponent of the determinant if determinant is requested): 0
INFOG(35) (after factorization: number of entries taking into account BLR factor compression - sum over all processors): 587574
INFOG(36) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - value on the most memory consuming processor): 0
INFOG(37) (after analysis: estimated size of all MUMPS internal data for running BLR in-core - sum over all processors): 0
INFOG(38) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - value on the most memory consuming processor): 0
INFOG(39) (after analysis: estimated size of all MUMPS internal data for running BLR out-of-core - sum over all processors): 0
linear system matrix = precond matrix:
which means that PETSc picks different factorization backends in serial and parallel.
The serial one (PETSc's default built-in LU) does not deal well with null spaces, while the parallel one, MUMPS, does.
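For instance, one can select MUMPS in serial as well through the PETSc options (this is what the full code further down does); note that this only changes the factorization backend, and the actual fix is to remove the null space, as described next:
options = {
    "ksp_type": "preonly",
    "pc_type": "lu",
    "ksp_error_if_not_converged": True,
    "pc_factor_mat_solver_type": "mumps",  # force MUMPS instead of PETSc's built-in LU
}
problem = LinearProblem(a, L, bcs=bc, petsc_options=options)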
However, as your operator has a null space (most likely the rigid body motions), you should remove it, for instance by attaching a PETSc null space to the assembled matrix:
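(A sketch of the key calls; the build_nullspace helper and the options dictionary are defined in the full code below.)
nsp = build_nullspace(V)  # the six rigid body modes of 3D elasticity
problem = LinearProblem(a, L, bcs=bc, petsc_options=options)
problem.A.setNullSpace(nsp)  # inform the factorization about the operator's null space
uh = problem.solve()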
which will give you consistent results regardless of the number of processes.
Full modified code:
from dolfinx import mesh, fem, io, default_scalar_type
from dolfinx.fem.petsc import LinearProblem
from mpi4py import MPI
import ufl
import numpy as np
import dolfinx
from petsc4py import PETSc
L = 1
W = 0.2
mu = 1
rho = 1
delta = W / L
gamma = 0.4 * delta**2
beta = 1.25
lambda_ = beta
g = gamma
domain = mesh.create_box(
MPI.COMM_WORLD,
[np.array([0, 0, 0]), np.array([L, W, W])],
[20, 5, 5],
cell_type=mesh.CellType.tetrahedron,
# cell_type=mesh.CellType.hexahedron,
)
V = fem.functionspace(domain, ("Lagrange", 1, (domain.geometry.dim,)))
point_1_coords = np.array([0, 0, 0])
point_2_coords = np.array([0, W, 0])
point_3_coords = np.array([L, 0, 0])
def point_1(x):
return np.linalg.norm(x.T - point_1_coords, axis=1) <= 1e-6
def point_2(x):
return np.linalg.norm(x.T - point_2_coords, axis=1) <= 1e-6
def point_3(x):
return np.linalg.norm(x.T - point_3_coords, axis=1) <= 1e-6
boundary_vertex_1 = mesh.locate_entities_boundary(domain, 0, point_1)
boundary_vertex_2 = mesh.locate_entities_boundary(domain, 0, point_2)
boundary_vertex_3 = mesh.locate_entities_boundary(domain, 0, point_3)
# print(f"boundary_vertex_1: {boundary_vertex_1} on process {MPI.COMM_WORLD.rank}")
# print(f"boundary_vertex_2: {boundary_vertex_2} on process {MPI.COMM_WORLD.rank}")
# print(f"boundary_vertex_3: {boundary_vertex_3} on process {MPI.COMM_WORLD.rank}")
# Component wise constraints (for tetrahedron results depend on number of processors)
constraint_dofs_x = []
constraint_dofs_x.extend(fem.locate_dofs_topological(V.sub(0), 0, boundary_vertex_1))
constraint_dofs_y = []
constraint_dofs_y.extend(fem.locate_dofs_topological(V.sub(1), 0, boundary_vertex_1))
constraint_dofs_y.extend(fem.locate_dofs_topological(V.sub(1), 0, boundary_vertex_2))
constraint_dofs_z = []
constraint_dofs_z.extend(fem.locate_dofs_topological(V.sub(2), 0, boundary_vertex_1))
constraint_dofs_z.extend(fem.locate_dofs_topological(V.sub(2), 0, boundary_vertex_2))
constraint_dofs_z.extend(fem.locate_dofs_topological(V.sub(2), 0, boundary_vertex_3))
# print(f"constraint_dofs_x: {constraint_dofs_x} on process {MPI.COMM_WORLD.rank}")
# print(f"constraint_dofs_y: {constraint_dofs_y} on process {MPI.COMM_WORLD.rank}")
# print(f"constraint_dofs_z: {constraint_dofs_z} on process {MPI.COMM_WORLD.rank}")
bcx = fem.dirichletbc(
default_scalar_type(0), np.array(constraint_dofs_x, dtype=np.int32), V.sub(0)
)
bcy = fem.dirichletbc(
default_scalar_type(0), np.array(constraint_dofs_y, dtype=np.int32), V.sub(1)
)
bcz = fem.dirichletbc(
default_scalar_type(0), np.array(constraint_dofs_z, dtype=np.int32), V.sub(2)
)
bc = [bcx, bcy, bcz]
# # Constraints in all directions (works fine)
# u_D = np.array([0, 0, 0], dtype=default_scalar_type)
# bc1 = fem.dirichletbc(u_D, fem.locate_dofs_topological(V, 0, boundary_vertex_1), V)
# bc2 = fem.dirichletbc(u_D, fem.locate_dofs_topological(V, 0, boundary_vertex_2), V)
# bc3 = fem.dirichletbc(u_D, fem.locate_dofs_topological(V, 0, boundary_vertex_3), V)
# bc = [bc1, bc2, bc3]
T = fem.Constant(domain, default_scalar_type((0, 0, 0)))
ds = ufl.Measure("ds", domain=domain)
def epsilon(u):
return ufl.sym(
ufl.grad(u)
) # Equivalent to 0.5*(ufl.nabla_grad(u) + ufl.nabla_grad(u).T)
def sigma(u):
return lambda_ * ufl.nabla_div(u) * ufl.Identity(len(u)) + 2 * mu * epsilon(u)
def build_nullspace(V: dolfinx.fem.FunctionSpace):
"""Build PETSc nullspace for 3D elasticity"""
dtype = dolfinx.default_scalar_type
# Create vectors that will span the nullspace
bs = V.dofmap.index_map_bs
length0 = V.dofmap.index_map.size_local
basis = [dolfinx.la.vector(V.dofmap.index_map, bs=bs, dtype=dtype) for i in range(6)]
b = [b.array for b in basis]
# Get dof indices for each subspace (x, y and z dofs)
dofs = [V.sub(i).dofmap.list.flatten() for i in range(3)]
# Set the three translational rigid body modes
for i in range(3):
b[i][dofs[i]] = 1.0
# Set the three rotational rigid body modes
x = V.tabulate_dof_coordinates()
dofs_block = V.dofmap.list.flatten()
x0, x1, x2 = x[dofs_block, 0], x[dofs_block, 1], x[dofs_block, 2]
b[3][dofs[0]] = -x1
b[3][dofs[1]] = x0
b[4][dofs[0]] = x2
b[4][dofs[2]] = -x0
b[5][dofs[2]] = x1
b[5][dofs[1]] = -x2
dolfinx.la.orthonormalize(basis)
basis_petsc = [
PETSc.Vec().createWithArray(x[: bs * length0], bsize=3, comm=V.mesh.comm) for x in b
]
return PETSc.NullSpace().create(vectors=basis_petsc)
nsp = build_nullspace(V)
u = ufl.TrialFunction(V)
v = ufl.TestFunction(V)
f = fem.Constant(domain, default_scalar_type((0, 0, -rho * g)))
a = ufl.inner(sigma(u), epsilon(v)) * ufl.dx
L = ufl.dot(f, v) * ufl.dx + ufl.dot(T, v) * ds
options = {"ksp_type": "preonly", "pc_type": "lu",
"ksp_error_if_not_converged": True,
"ksp_monitor": None,
"pc_factor_mat_solver_type": "mumps",}
uh = fem.Function(V, name="uh")
problem = LinearProblem(
a, L, u=uh, bcs=bc, petsc_options=options
)
problem.A.setNullSpace(nsp)
problem.solve()
local_max = np.max(np.abs(uh.x.array))
print("local_max on rank", MPI.COMM_WORLD.rank, "= ", local_max)
global_max = MPI.COMM_WORLD.allreduce(local_max, op=MPI.MAX)
if MPI.COMM_WORLD.rank == 0:
print("global_max = ", global_max)
# with io.XDMFFile(domain.comm, "test_test_1.xdmf", "w") as xdmf:
# xdmf.write_mesh(domain)
# uh.name = "Deformation"
# xdmf.write_function(uh)
You can also visualize the null space of your operator, as done in:
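Independently of that, one can at least verify programmatically that the rigid body modes span a null space of the assembled operator; a sketch, using the nsp and problem objects from the modified code above:
# Returns True if A @ v = 0 (up to tolerance) for every vector v in the null space basis
assert nsp.test(problem.A)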
Running the full modified code above also resolves the “instability”, yielding consistent results:
mpirun -n 1 → global_max = 0.05372279263348693
mpirun -n 2 → global_max = 0.05372260524282449
mpirun -n 4 → global_max = 0.05372268668671327
Since I see the same "package used to perform factorization: petsc" with cell_type=mesh.CellType.hexahedron, and that run is successful, I can only guess that the factorization is simply more stable with hexahedral elements.
PC Object: (dolfinx_solve_281472922395824) 1 MPI process
type: lu
out-of-place factorization
tolerance for zero pivot 2.22045e-14
matrix ordering: nd
factor fill ratio given 5., needed 4.62462
Factored matrix follows:
Mat Object: (dolfinx_solve_281472922395824) 1 MPI process
type: seqaij
rows=2268, cols=2268, bs=3
package used to perform factorization: petsc
total: nonzeros=649962, allocated nonzeros=649962
using I-node routines: found 732 nodes, limit used is 5
linear system matrix = precond matrix: