Gather solutions in parallel in FEniCSX

It is important to write down what version of DOLFINx you are using.
For DOLFINx v0.9.0, the code works with:

import dolfinx
from mpi4py import MPI
import numpy as np


def f(x):
    return x[0] + 3 * x[1]


N = 4
P = 3
mesh = dolfinx.mesh.create_unit_square(MPI.COMM_WORLD, N, N)

pci = mesh.topology.original_cell_index
V = dolfinx.fem.functionspace(mesh, ("Lagrange", P))
u = dolfinx.fem.Function(V)
u.interpolate(f)
dofmap = V.dofmap.list

root = 0
ci = mesh.comm.gather(pci, root=root)
dofmaps = mesh.comm.gather(dofmap, root=root)
x = mesh.comm.gather(u.x.array, root=root)
if MPI.COMM_WORLD.rank == root:
    serial_mesh = dolfinx.mesh.create_unit_square(MPI.COMM_SELF, N, N)
    inverse_map = np.argsort(serial_mesh.topology.original_cell_index)
    Vs = dolfinx.fem.functionspace(serial_mesh, ("Lagrange", P))
    us = dolfinx.fem.Function(Vs)
    us.interpolate(f)
    us_array = us.x.array

    dofmap_s = Vs.dofmap.list
    num_dofs = dofmap_s.shape[1]
    for rank, (dofm, cell_indices, x_p) in enumerate(zip(dofmaps, ci, x)):
        print(f"DATA from rank {rank}:")
        for i, cell in enumerate(cell_indices):
            serial_cell = inverse_map[cell]
            serial_dofs = dofmap_s[serial_cell]
            parallel_dofs = dofm[i]
            for j in range(num_dofs):
                print(x_p[parallel_dofs[j]], us_array[serial_dofs[j]])
                assert np.isclose(x_p[parallel_dofs[j]], us_array[serial_dofs[j]])