Parallel function saving and reading it back using PETSc or adios4dolfinx

I changed it to

and it still gives;

Mesh written to checkpoint_0.bp on comm 1/3

Mesh written to checkpoint_0.bp on comm 2/3

Mesh written to checkpoint_0.bp on comm 3/3

Mesh read from checkpoint_0.bp on comm 1/2
Function written to checkpoint_1.bp on comm 1/2

Mesh read from checkpoint_0.bp on comm 2/2
Function written to checkpoint_1.bp on comm 2/2

Traceback (most recent call last):
  File "demo.py", line 114, in <module>
    assert query_3.successful(), query_3.error
AssertionError: [<RemoteError[ ]:Exception(MPI_ERR_TRUNCATE: message truncated)>]
Stopping cluster <Cluster(cluster_id='1707334019-xzjr', profile='default', controller=<running>, engine_sets=['1707334020'])>

there is some problem during function reading. I did trying my tests again for v0.7.2. When I run mesh with;

# adios_mesh.py
from dolfinx.mesh import create_unit_square
import adios4dolfinx
from mpi4py import MPI
import shutil

mesh = create_unit_square(MPI.COMM_WORLD, 30, 30)
shutil.rmtree("mesh", ignore_errors=True)
adios4dolfinx.write_mesh(mesh, "mesh")

and write function with;

# adios_write.py
import numba
import numpy as np
from dolfinx.fem import (Function, FunctionSpace, locate_dofs_geometrical)
from dolfinx.mesh import locate_entities
from ufl import FiniteElement
from mpi4py import MPI
import dolfinx.fem
import dolfinx.io
import adios4dolfinx
import os
import pathlib
import shutil

def export_function(function: dolfinx.fem.Function, directory: str, filename: str) -> None:

    comm = function.function_space.mesh.comm
    visualization_directory = os.path.join(directory, filename + ".bp")
    checkpointing_directory = os.path.join(
        directory, filename + "_checkpoint.bp")

    shutil.rmtree(visualization_directory, ignore_errors=True)
    os.makedirs(visualization_directory, exist_ok=True)
    print(visualization_directory)
    # Export for visualization
    with dolfinx.io.VTXWriter(comm, visualization_directory, function, "bp4") as vtx_file:
        vtx_file.write(0)
    # Export for checkpointing
    adios4dolfinx.write_function(function, pathlib.Path(visualization_directory), "bp4")

mesh = adios4dolfinx.read_mesh(MPI.COMM_WORLD, "mesh", engine='BP4', ghost_mode=dolfinx.mesh.GhostMode.shared_facet)

v_cg = FiniteElement("CG", mesh.ufl_cell(), 2)
Q = FunctionSpace(mesh, v_cg)

def Omega_0(x):
    return x[1] <= 0.3

def Omega_1(x):
    return x[1] >= 0.7

kappa = Function(Q)
cells_0 = locate_entities(mesh, mesh.topology.dim, Omega_0)
cells_1 = locate_entities(mesh, mesh.topology.dim, Omega_1)

@numba.njit
def unroll_dofmap(dofs, bs):
    dofs_unrolled = np.zeros(bs*len(dofs), dtype=np.int32)
    for i, dof in enumerate(dofs):
        for b in range(bs):
            dofs_unrolled[i*bs+b] = dof*bs+b
    return dofs_unrolled

dofs_0 = locate_dofs_geometrical(Q, Omega_0)
dofs_1 = locate_dofs_geometrical(Q, Omega_1)
dofmap_bs = Q.dofmap.bs
kappa.x.array[unroll_dofmap(dofs_0, dofmap_bs)] = 1
kappa.x.array[unroll_dofmap(dofs_1, dofmap_bs)] = 0.1

export_function(kappa, "adios_dir", "test_export")

and finally testing the export with;

# assert.py
import numba
import numpy as np
from dolfinx.fem import (Function, FunctionSpace, locate_dofs_geometrical)
from dolfinx.mesh import create_unit_square, locate_entities
from ufl import FiniteElement
from mpi4py import MPI
import dolfinx.fem
import dolfinx.io
import adios4dolfinx
import os
import pathlib

mesh = adios4dolfinx.read_mesh(MPI.COMM_WORLD, "mesh", engine='BP4', ghost_mode=dolfinx.mesh.GhostMode.shared_facet)

v_cg = FiniteElement("CG", mesh.ufl_cell(), 2)
Q = FunctionSpace(mesh, v_cg)

def Omega_0(x):
    return x[1] <= 0.3

def Omega_1(x):
    return x[1] >= 0.7

kappa = Function(Q)
cells_0 = locate_entities(mesh, mesh.topology.dim, Omega_0)
cells_1 = locate_entities(mesh, mesh.topology.dim, Omega_1)

@numba.njit
def unroll_dofmap(dofs, bs):
    dofs_unrolled = np.zeros(bs*len(dofs), dtype=np.int32)
    for i, dof in enumerate(dofs):
        for b in range(bs):
            dofs_unrolled[i*bs+b] = dof*bs+b
    return dofs_unrolled

dofs_0 = locate_dofs_geometrical(Q, Omega_0)
dofs_1 = locate_dofs_geometrical(Q, Omega_1)
dofmap_bs = Q.dofmap.bs
kappa.x.array[unroll_dofmap(dofs_0, dofmap_bs)] = 1
kappa.x.array[unroll_dofmap(dofs_1, dofmap_bs)] = 0.1

def import_function(function_space: dolfinx.fem.FunctionSpace, directory: str, filename: str) -> dolfinx.fem.Function:

    function = dolfinx.fem.Function(function_space)
    checkpointing_directory = os.path.join(directory, filename + ".bp")
    adios4dolfinx.read_function(function, pathlib.Path(checkpointing_directory), "bp4")
    return function

kappa_import = import_function(Q, "adios_dir", "test_export")

print("original", np.count_nonzero(kappa.vector.array))
print("imported", np.count_nonzero(kappa_import.vector.array))

assert np.allclose(kappa.vector.array, kappa_import.vector.array)

gives;

Traceback (most recent call last):
  File "assert.py", line 50, in <module>
    kappa_import = import_function(Q, "adios_dir", "test_export")
  File "assert.py", line 47, in import_function
    adios4dolfinx.read_function(function, pathlib.Path(checkpointing_directory), "bp4")
  File "/home/ee331/Dev/Venvs/v073complex/lib/python3.8/site-packages/adios4dolfinx/checkpointing.py", line 337, in read_function
    inc_cells, inc_perms = send_and_recv_cell_perm(
  File "/home/ee331/Dev/Venvs/v073complex/lib/python3.8/site-packages/adios4dolfinx/comm_helpers.py", line 203, in send_and_recv_cell_perm
    mesh_to_data.Neighbor_alltoall(out_size, recv_size)
  File "mpi4py/MPI/Comm.pyx", line 2154, in mpi4py.MPI.Topocomm.Neighbor_alltoall
mpi4py.MPI.Exception: MPI_ERR_TRUNCATE: message truncated

what does
mesh_to_data.Neighbor_alltoall(out_size, recv_size)
do?

It communicates how much data should be passed from one process to the other.

Are you using openmpi or MPICH?
Could you also add a print-statement printing source and dest from:

with print(f"{MPI.COMM_WORLD.rank}, {source=} {dest=}", flush=True)

I am using openmpi.

it gives;

0, source=[0] dest=[0]

Again, for this code to work you should store the function and mesh in the same file.

Please also print (out_size) and (recv_size) prior to the all_to_all.

It seems that nothing changed. Here is another MWE;

# adios_write.py
import numba
import numpy as np
from dolfinx.fem import (Function, FunctionSpace, locate_dofs_geometrical)
from dolfinx.mesh import locate_entities
from ufl import FiniteElement
from mpi4py import MPI
import dolfinx.fem
import dolfinx.io
import adios4dolfinx
import os
import pathlib
import shutil

def export_function(function: dolfinx.fem.Function, directory: str, filename: str) -> None:

    comm = function.function_space.mesh.comm
    visualization_directory = os.path.join(directory, filename + ".bp")

    shutil.rmtree(visualization_directory, ignore_errors=True)
    os.makedirs(visualization_directory, exist_ok=True)
    print(visualization_directory)
    
    # Export for visualization
    with dolfinx.io.VTXWriter(comm, visualization_directory, function, "bp4") as vtx_file:
        vtx_file.write(0)
    # Export mesh
    adios4dolfinx.write_mesh(function.function_space.mesh, pathlib.Path(visualization_directory))
    # Export for checkpointing
    adios4dolfinx.write_function(function, pathlib.Path(visualization_directory), "bp4")


from dolfinx.mesh import create_unit_square

mesh_dolfinx = create_unit_square(MPI.COMM_WORLD, 30, 30)
shutil.rmtree("adios_dir/test_export.bp", ignore_errors=True)
adios4dolfinx.write_mesh(mesh_dolfinx, "adios_dir/test_export.bp")

mesh = adios4dolfinx.read_mesh(MPI.COMM_WORLD, "adios_dir/test_export.bp", engine='BP4', ghost_mode=dolfinx.mesh.GhostMode.shared_facet)

v_cg = FiniteElement("CG", mesh.ufl_cell(), 2)
Q = FunctionSpace(mesh, v_cg)

def Omega_0(x):
    return x[1] <= 0.3

def Omega_1(x):
    return x[1] >= 0.7

kappa = Function(Q)
cells_0 = locate_entities(mesh, mesh.topology.dim, Omega_0)
cells_1 = locate_entities(mesh, mesh.topology.dim, Omega_1)

@numba.njit
def unroll_dofmap(dofs, bs):
    dofs_unrolled = np.zeros(bs*len(dofs), dtype=np.int32)
    for i, dof in enumerate(dofs):
        for b in range(bs):
            dofs_unrolled[i*bs+b] = dof*bs+b
    return dofs_unrolled

dofs_0 = locate_dofs_geometrical(Q, Omega_0)
dofs_1 = locate_dofs_geometrical(Q, Omega_1)
dofmap_bs = Q.dofmap.bs
kappa.x.array[unroll_dofmap(dofs_0, dofmap_bs)] = 1
kappa.x.array[unroll_dofmap(dofs_1, dofmap_bs)] = 0.1

export_function(kappa, "adios_dir", "test_export")

then read and test it with;

# assert.py
import numba
import numpy as np
from dolfinx.fem import (Function, FunctionSpace, locate_dofs_geometrical)
from dolfinx.mesh import create_unit_square, locate_entities
from ufl import FiniteElement
from mpi4py import MPI
import dolfinx.fem
import dolfinx.io
import adios4dolfinx
import os
import pathlib

mesh = adios4dolfinx.read_mesh(MPI.COMM_WORLD, "adios_dir/test_export.bp", engine='BP4', ghost_mode=dolfinx.mesh.GhostMode.shared_facet)

v_cg = FiniteElement("CG", mesh.ufl_cell(), 2)
Q = FunctionSpace(mesh, v_cg)

def Omega_0(x):
    return x[1] <= 0.3

def Omega_1(x):
    return x[1] >= 0.7

kappa = Function(Q)
cells_0 = locate_entities(mesh, mesh.topology.dim, Omega_0)
cells_1 = locate_entities(mesh, mesh.topology.dim, Omega_1)

@numba.njit
def unroll_dofmap(dofs, bs):
    dofs_unrolled = np.zeros(bs*len(dofs), dtype=np.int32)
    for i, dof in enumerate(dofs):
        for b in range(bs):
            dofs_unrolled[i*bs+b] = dof*bs+b
    return dofs_unrolled

dofs_0 = locate_dofs_geometrical(Q, Omega_0)
dofs_1 = locate_dofs_geometrical(Q, Omega_1)
dofmap_bs = Q.dofmap.bs
kappa.x.array[unroll_dofmap(dofs_0, dofmap_bs)] = 1
kappa.x.array[unroll_dofmap(dofs_1, dofmap_bs)] = 0.1

def import_function(function_space: dolfinx.fem.FunctionSpace, directory: str, filename: str) -> dolfinx.fem.Function:

    function = dolfinx.fem.Function(function_space)
    checkpointing_directory = os.path.join(directory, filename + ".bp")
    adios4dolfinx.read_function(function, pathlib.Path(checkpointing_directory), "bp4")
    return function

kappa_import = import_function(Q, "adios_dir", "test_export")

print("original", np.count_nonzero(kappa.vector.array))
print("imported", np.count_nonzero(kappa_import.vector.array))

assert np.allclose(kappa.vector.array, kappa_import.vector.array)

gives

0, source=[0] dest=[0]
out_size, recv_size [1800] [0]
Traceback (most recent call last):
  File "assert.py", line 50, in <module>
    kappa_import = import_function(Q, "adios_dir", "test_export")
  File "assert.py", line 47, in import_function
    adios4dolfinx.read_function(function, pathlib.Path(checkpointing_directory), "bp4")
  File "/home/ee331/Dev/Venvs/v073complex/lib/python3.8/site-packages/adios4dolfinx/checkpointing.py", line 337, in read_function
    inc_cells, inc_perms = send_and_recv_cell_perm(
  File "/home/ee331/Dev/Venvs/v073complex/lib/python3.8/site-packages/adios4dolfinx/comm_helpers.py", line 204, in send_and_recv_cell_perm
    mesh_to_data.Neighbor_alltoall(out_size, recv_size)
  File "mpi4py/MPI/Comm.pyx", line 2154, in mpi4py.MPI.Topocomm.Neighbor_alltoall
mpi4py.MPI.Exception: MPI_ERR_TRUNCATE: message truncated

out_size is 1800 but recv_size is 0. Not sure how to interpret.

That makes sense to me, as you are filling in recv_size with the alltoall call.
There seems to be some issue with your MPI installation that I’ve not encountered myself.

You could simply try the following MWE (in serial):

from mpi4py import MPI
import numpy as np
comm = MPI.COMM_WORLD
assert comm.size == 1, "This should only run in serial"
dest_ranks = [0]
mesh_to_data = comm.Create_dist_graph(
    [0], [len(dest_ranks)], dest_ranks, reorder=False)
in_size = np.empty(1, dtype=np.int32)
out_size = np.array([10], dtype=np.int32)
mesh_to_data.Neighbor_alltoall(out_size, in_size)
print(in_size)

Thank you. What would you recommend then? Which version of openmpi/mpich are your using?

Did you run the snippet above? what did it return?

I’m using Mpich 4.1.2, as that is what comes with Docker.
What openmpi version are you using?

My openmpi version is

export OPENMPI_SERIES=4.1
export OPENMPI_PATCH=6

and relevant python binding is

mpi4py 3.1.5
This configuration can run all parallel stuff in dolfinx but sadly did not work for adios4dolfinx.

adios4dolfinx does nothing special. It uses standard MPI neighbourhood communicators.
The snippet I posted above is an illustration of what happens internally in adios4dolfinx in the call you are referring to.

It could be related to: Find cell tags from two overlapped meshes with different resolutions - #11 by dokken
but it is hard for me to work with when I cannot reproduce your error.

The issue was openmpi. I installed mpich from scratch and hence dolfinx and adios4dolfinx.

And the demo file runs fine, but I have another problem now. When I try to open the .bp file the paraview 5.12.0 RC1 says;

terminate called after throwing an instance of 'std::runtime_error'
  what():  ERROR: neither vtk.xml file or bp attribute was found in adios_dir/test_export.bp

I cannot view the solution.

Checkpoints are not possible to visualize in Paraview.

The reason for this is:

  1. they contain information specific to Dolfinx
  2. VTXReader in Paraview is missing features for global arrays. VTK support for unstructured grids with global arrays. Ā· Issue #3688 Ā· ornladios/ADIOS2 Ā· GitHub

The latter point stops any efficient implementation of N to M checkpointing (write with M processes, read with N), especially in cases where N<<M or N>>M

That’s the reason there is also have a VTXFile.write in my RBniCSx snippet.

Sorry guys @Ekrem_Ekici @dokken , I spoke too soon :wink:

I didn’t realize that yesterday I was running @dokken’s test in a terminal with adios4dolfinx before v0.7.2, rather than v0.7.2.
Now that I am actually importing v0.7.2 I can confirm that I have the same error as @Ekrem_Ekici

It seems to be specific about openmpi. I reported this at `v0.7.2` incompatible with openmpi Ā· Issue #55 Ā· jorgensd/adios4dolfinx Ā· GitHub

Thank you. Now, I am trying the simplify your demo further; Here is the minimal example;

# demo.py
from dolfinx.mesh import create_unit_square
import adios4dolfinx
from dolfinx.fem import Function, FunctionSpace
from ufl import FiniteElement
import numpy as np
from mpi4py import MPI
import dolfinx

path = "checkpoint0.bp"
path_out = "checkpoint1.bp"

mesh = create_unit_square(MPI.COMM_WORLD, 30, 30)
adios4dolfinx.write_mesh(mesh, path, engine='BP4')

def f(x):
    return x[0]**2+x[1]

mesh = adios4dolfinx.read_mesh(MPI.COMM_WORLD, path, engine='BP4', ghost_mode=dolfinx.mesh.GhostMode.shared_facet)

v_cg = FiniteElement("CG", mesh.ufl_cell(), 1)
V = FunctionSpace(mesh, v_cg)

kappa = Function(V)
kappa.interpolate(f)

adios4dolfinx.write_mesh(mesh, path_out, engine="BP4")
adios4dolfinx.write_function(kappa, path_out, engine="BP4")

kappa_import = Function(V)

adios4dolfinx.read_function(kappa_import, path_out, engine="BP4")

np.testing.assert_allclose(kappa.x.array, kappa_import.x.array, atol=1e-15)

It does work for single process but running it in parallel (2 procs) gives;

AssertionError: 
Not equal to tolerance rtol=1e-07, atol=1e-15

Mismatched elements: 523 / 523 (100%)
Max absolute difference: 1.41
Max relative difference: 68.375
 x: array([0.934444+0.j, 1.      +0.j, 1.033333+0.j, 0.967778+0.j,
       0.871111+0.j, 1.066667+0.j, 0.904444+0.j, 1.001111+0.j,
       0.81    +0.j, 1.1     +0.j, 0.843333+0.j, 0.937778+0.j,...
 y: array([0.09    +0.j, 0.111111+0.j, 0.144444+0.j, 0.071111+0.j,
       0.156667+0.j, 0.123333+0.j, 0.234444+0.j, 0.211111+0.j,
       0.137778+0.j, 0.19    +0.j, 0.211111+0.j, 0.073g.py:441: RuntimeWarning: mpi4py.MPI.File size changed

What is wrong here?

If you want to re-use the function inside the same script, you should use the snapshot checkpoints, not the recoverable checkpoint: checkpointing slides

Note that I’ve also fixed the openmpi issue (new release 0.7.3 at adios4dolfinx Ā· PyPI)

It does not work for separated files too. I build the mesh with;

# adios_mesh.py
from dolfinx.mesh import create_unit_square
import adios4dolfinx
from mpi4py import MPI
from pathlib import Path

path = Path("adios_dir/mesh.bp")
mesh = create_unit_square(MPI.COMM_WORLD, 30, 30)
adios4dolfinx.write_mesh(mesh, path, engine='BP4')

then read the mesh and write the function with;

# adios_write.py
from dolfinx.fem import (Function, FunctionSpace, locate_dofs_geometrical)
from ufl import FiniteElement
from mpi4py import MPI
import dolfinx.fem
import dolfinx.io
import adios4dolfinx
from pathlib import Path

def f(x):
    return x[0]**2+x[1]

path = Path("adios_dir/mesh.bp")
path_out = Path("adios_dir/checkpoint.bp")

mesh = adios4dolfinx.read_mesh(MPI.COMM_WORLD, path, engine='BP4', ghost_mode=dolfinx.mesh.GhostMode.shared_facet)

v_cg = FiniteElement("CG", mesh.ufl_cell(), 1)
V = FunctionSpace(mesh, v_cg)

kappa = Function(V)
kappa.interpolate(f)

adios4dolfinx.write_mesh(mesh, path_out, engine="BP4")
adios4dolfinx.write_function(kappa, path_out, engine="BP4")

then finally read that function with;

# adios_read.py
import dolfinx.fem
from dolfinx.fem import (Function, FunctionSpace)
from ufl import FiniteElement
from mpi4py import MPI
import adios4dolfinx
import dolfinx
import numpy as np
from pathlib import Path

def f(x):
    return x[0]**2+x[1]
    
path = Path("adios_dir/mesh.bp")
path_out = "adios_dir/checkpoint.bp"

mesh = adios4dolfinx.read_mesh(MPI.COMM_WORLD, path, engine='BP4', ghost_mode=dolfinx.mesh.GhostMode.shared_facet)

v_cg = FiniteElement("CG", mesh.ufl_cell(), 1)
V = FunctionSpace(mesh, v_cg)

kappa = Function(V)
kappa.interpolate(f)


kappa_import = Function(V)

adios4dolfinx.read_function(kappa_import, path_out, engine="BP4")

np.testing.assert_allclose(kappa.x.array, kappa_import.x.array, atol=1e-15)

gives;

AssertionError: 
Not equal to tolerance rtol=1e-07, atol=1e-15

Mismatched elements: 961 / 961 (100%)
Max absolute difference: 1.46555556
Max relative difference: 1319.
 x: array([9.344444e-01+0.j, 1.000000e+00+0.j, 1.033333e+00+0.j,
       9.677778e-01+0.j, 8.711111e-01+0.j, 1.066667e+00+0.j,
       9.044444e-01+0.j, 1.001111e+00+0.j, 8.100000e-01+0.j,...
 y: array([2.833333e-01+0.j, 2.500000e-01+0.j, 2.177778e-01+0.j,
       2.511111e-01+0.j, 3.166667e-01+0.j, 1.877778e-01+0.j,
       2.844444e-01+0.j, 2.211111e-01+0.j, 3.500000e-01+0.j,...

with the configuration below;

python3 adios_mesh.py
mpirun -np 2 python3 adios_write.py
mpirun -np 1 python3 adios_read.py

I almost copied everything from your demo as separate files but these files above are not working for some reason. Do they work as expected in your system?

Again, read the mesh from the checkpoint in the third file. That was the whole point of