Parallel data not compiling together?

Hi all,

I am trying to parallelise a piece of code, which is an extension of the following MWE, using mpirun -n 10 … in the terminal. However, the XDMF file I get at the end is not the full combined solution; it seems to contain only one part of the mesh.

To me this suggests that only the part saved by one of the processes is ending up in the file. Using MPI with dolfinx is new to me and I'm not sure how to get the full final XDMF file written properly.

Any help would be appreciated :slight_smile:

#!/usr/bin/env python3

import numpy as np
import matplotlib.pyplot as plt
from mpi4py import MPI
from petsc4py import PETSc
from dolfinx import fem, mesh, default_scalar_type, log
from dolfinx.io import XDMFFile
import ufl
import dolfinx.fem.petsc
import dolfinx.nls.petsc
import time
start_time = time.time()

## Create mesh
L = 1.0  # Length of the rod
nx, ny, nz = 10, 10, 100  # Number of elements in x, y, and z directions
domain = mesh.create_box(MPI.COMM_WORLD, [np.array([0, 0, 0]), np.array(
    [0.1, 0.1, L])], [nx, ny, nz], cell_type=mesh.CellType.hexahedron)

## Create function spaces
V = fem.VectorFunctionSpace(domain, ("CG", 1))

fdim = domain.topology.dim - 1

# Physical constants
E = 200e9
nu = 0.27
mu = E / (2.0 * (1 + nu))
lmbda = E * nu / ((1.0 + nu) * (1.0 - 2.0 * nu))
rho = 7990

# Time integration parameters
am = 0.2
af = 0.4
g = 0.5 + af - am
alpha_m = fem.Constant(domain, am)
alpha_f = fem.Constant(domain, af)
gamma = fem.Constant(domain, g)
beta = fem.Constant(domain, (g + 0.5)**2 / 4.0)

T = 5.0e-4
Timp = 5.0e-4
Nsteps = 100
dt = fem.Constant(domain, T / Nsteps)

## Time dep. boundary conditions
Amp=1e-8 #amplitude of perturbation

def point1(x):
    return np.logical_and(np.logical_and(np.isclose(x[2], 0.0), np.isclose(x[1], 0)), np.isclose(x[0], 0.05))

def point2(x):
    return np.logical_and(np.logical_and(np.isclose(x[2], 0.0), np.isclose(x[1], 0.1)), np.isclose(x[0], 0.05))

def input1(t):
    return np.array([Amp*t/Timp, 0.0,0.0], dtype=default_scalar_type)

def input2(t):
    return np.array([-1*Amp*t/Timp, 0.0,0.0], dtype=default_scalar_type)

ti=0
bc_fun1=input1(ti)
bc_fun2=input2(ti)

point1_dofs = fem.locate_dofs_geometrical(V, point1) 
bc1 = fem.dirichletbc(bc_fun1, point1_dofs, V)

point2_dofs = fem.locate_dofs_geometrical(V, point2) 
bc2 = fem.dirichletbc(bc_fun2, point2_dofs, V)

# Functions and fields
u_ = ufl.TestFunction(V)
u = fem.Function(V, name='Displacement')
u_old = fem.Function(V)
v_old = fem.Function(V)
a_old = fem.Function(V)

# Measures
dx = ufl.Measure("dx", domain=domain)

## GA method functions
def sigma(r):
    return lmbda*ufl.nabla_div(r)*ufl.Identity(len(r)) + 2*mu*ufl.sym(ufl.grad(r))

def m(u, u_):
    return rho*ufl.inner(u, u_)*dx

def k(u, u_):
    return ufl.inner(sigma(u), ufl.grad(u_))*dx

## Update variables
def update_a(u, u_old, v_old, a_old, ufl_=True):
    if ufl_:
        dt_ = dt
        beta_ = beta
        return (u-u_old-dt_*v_old)/beta_/dt_**2 - (1-2*beta_)/2/beta_*a_old

    else:
        dt_ = float(dt)
        beta_ = float(beta)
        return (u-u_old-dt_*v_old)/beta_/dt_**2 - (1-2*beta_)/2/beta_*a_old

def update_v(a, u_old, v_old, a_old, ufl_=True):
    if ufl_:
        return v_old + dt*((1-gamma)*a_old + gamma*a)
    else:
        return v_old + float(dt) * ((1 - float(gamma)) * a_old + float(gamma)*a)

def update_fields(u, u_old, v_old, a_old):
    u_vec, u0_vec = u.x.array[:], u_old.x.array[:]
    v0_vec, a0_vec = v_old.x.array[:], a_old.x.array[:]

    a_vec = update_a(u_vec, u0_vec, v0_vec, a0_vec, ufl_=False)
    v_vec = update_v(a_vec, u0_vec, v0_vec, a0_vec, ufl_=False)

    v_old.x.array[:] = v_vec     #numpy.ndarray stored inside dolfinx.fem.function.Function
    a_old.x.array[:] = a_vec
    u_old.x.array[:] = u_vec

def avg(x_old, x_new, alpha):
    return alpha*x_old + (1-alpha)*x_new

a_new = update_a(u, u_old, v_old, a_old, ufl_=True)     #ufl.algebra.Sum
v_new = update_v(a_new, u_old, v_old, a_old, ufl_=True)

## Residual to solve
res = m(avg(a_old, a_new, alpha_m), u_) + k(avg(u_old, u, alpha_f), u_) 

## Create file
with XDMFFile(domain.comm, "output_xdmfs/rod_straight.xdmf", "w") as xdmf_file:
    xdmf_file.write_mesh(domain)
    xdmf_file.write_function(u, 0)

# Solve time to assess time spent in each component
solve_time = 0

# Write function at each time step
tt=0
for i in range(Nsteps):
    tt += float(dt)

    bc_fun1=input1(tt)
    bc_fun2=input2(tt)

    bc1 = fem.dirichletbc(bc_fun1, point1_dofs,V)
    bc2 = fem.dirichletbc(bc_fun2, point2_dofs,V)

    allbcs=[bc1,bc2] #set BCs

    problem = fem.petsc.NonlinearProblem(res, u, bcs=allbcs) #problem setup

    ## Solver setup
    solver = dolfinx.nls.petsc.NewtonSolver(MPI.COMM_WORLD, problem)

    solver.atol = 1e-8
    solver.rtol = 1e-8
    solver.convergence_criterion = "incremental"

    ksp = solver.krylov_solver
    opts = PETSc.Options()
    option_prefix = ksp.getOptionsPrefix()
    opts[f"{option_prefix}ksp_type"] = "cg"
    opts[f"{option_prefix}pc_type"] = "gamg"
    #opts[f"{option_prefix}pc_factor_mat_solver_type"] = "mumps" 
    ksp.setFromOptions()
    
    # log.set_log_level(log.LogLevel.INFO)    # Convergence info

    if MPI.COMM_WORLD.rank==0:
        print("Time = %s" % tt)
        s = time.perf_counter()

    num_its, converged = solver.solve(u)

    if MPI.COMM_WORLD.rank==0:
        e = time.perf_counter()
        solve_time += e-s
        print(solve_time)
        # print(f"Number of interations: {num_its:d}")

    assert converged
    
    u.x.scatter_forward()

    # Update old fields with new quantities
    update_fields(u, u_old, v_old, a_old)

    # Write the displacement to the XDMF file
    xdmf_file.write_function(u, tt)


if MPI.COMM_WORLD.rank==0:
    print("--- %s seconds runtime ---" % (time.time() - start_time))

The issue is your usage of the context manager here (oop - Explaining Python's '__enter__' and '__exit__' - Stack Overflow): the with block closes the XDMF file as soon as the block exits, yet the write_function calls inside your time loop still refer to that closed file. You should use:

xdmf_file = XDMFFile(domain.comm, "output_xdmfs/rod_straight.xdmf", "w")

xdmf_file.write_mesh(domain)
xdmf_file.write_function(u, 0)

# rest of code
# ...
xdmf_file.close()
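
If you prefer to keep the context-manager style, an equivalent fix is to move the whole time loop inside the with block, so the file is only closed after the final write. A minimal sketch of that structure, with the loop body abbreviated:

with XDMFFile(domain.comm, "output_xdmfs/rod_straight.xdmf", "w") as xdmf_file:
    xdmf_file.write_mesh(domain)
    xdmf_file.write_function(u, 0)

    tt = 0
    for i in range(Nsteps):
        tt += float(dt)
        # ... set up the BCs, solve the step and update the fields as in the MWE ...
        xdmf_file.write_function(u, tt)
# The file is closed automatically when the with block exits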

Thanks @dokken - works a charm.

One further question: I am running the code in parallel on 10 cores with a 3D mesh of ~70,000 points, i.e. ~200,000 dofs in total. In Docker Desktop I can see that, if I run too many time steps, the 'Container memory usage' increases linearly until it reaches the current limit (30 GB) and the run is terminated.

Is the only remedy to use more cores and to raise this limit through WSL? Or can I help matters by not setting up the preconditioner at every step, or perhaps by using a better solver/preconditioner combination?

Thanks in advance.

There are many things you can do to make your code use less memory (see also the sketch after the links below). See for instance: Form compilation — FEniCS Tutorial @ Sorbonne
or
PETSc NonlinearProblem running slow - #4 by dokken
Speed up repeated evaluation, derivative (and assembly) of form - #2 by dokken
Out of memory error in frequency for loop - #4 by nate
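
In your MWE specifically, a lot of the per-step growth is likely to come from rebuilding the Dirichlet BCs, the NonlinearProblem and the NewtonSolver inside the time loop, which recreates forms and PETSc objects at every step. Below is a minimal sketch (not a drop-in replacement) of one way to restructure the loop so these objects are created once and only the boundary values are updated in place. It assumes the same V, point1_dofs, point2_dofs, res, u, dt, Nsteps, Amp and Timp as in your MWE, and an xdmf_file that stays open for the whole loop.

# Sketch: build BCs, problem and solver once, reuse them every step
class BCValue:
    """Time-dependent boundary value: sign * Amp * t / Timp in the x-direction."""
    def __init__(self, sign):
        self.sign = sign
        self.t = 0.0
    def __call__(self, x):
        vals = np.zeros((3, x.shape[1]), dtype=default_scalar_type)
        vals[0] = self.sign * Amp * self.t / Timp
        return vals

bc_val1, bc_val2 = BCValue(1.0), BCValue(-1.0)
u_bc1, u_bc2 = fem.Function(V), fem.Function(V)
bc1 = fem.dirichletbc(u_bc1, point1_dofs)  # value is a Function in V, so V is not passed
bc2 = fem.dirichletbc(u_bc2, point2_dofs)

problem = fem.petsc.NonlinearProblem(res, u, bcs=[bc1, bc2])
solver = dolfinx.nls.petsc.NewtonSolver(MPI.COMM_WORLD, problem)
solver.atol = 1e-8
solver.rtol = 1e-8
solver.convergence_criterion = "incremental"

ksp = solver.krylov_solver
opts = PETSc.Options()
prefix = ksp.getOptionsPrefix()
opts[f"{prefix}ksp_type"] = "cg"
opts[f"{prefix}pc_type"] = "gamg"
ksp.setFromOptions()

tt = 0
for i in range(Nsteps):
    tt += float(dt)
    # Update the boundary values in place; no new BC/problem/solver objects
    bc_val1.t = tt
    bc_val2.t = tt
    u_bc1.interpolate(bc_val1)
    u_bc2.interpolate(bc_val2)

    num_its, converged = solver.solve(u)
    assert converged
    u.x.scatter_forward()

    update_fields(u, u_old, v_old, a_old)
    xdmf_file.write_function(u, tt)

Whether this alone keeps the memory flat depends on the rest of your code, but it removes the repeated form and solver construction that the linked posts discuss.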
