MPI Test behaves unexpectedly. Reorder DoFs?

I am trying to run a single function in parallel while keeping the rest of the program serial. To that end I am using the mpi4py package. Here is a minimal working example:

from dolfin import *
import mpi4py
from mpi4py.futures import MPIPoolExecutor

mesh = RectangleMesh(mpi4py.MPI.COMM_WORLD, Point(0, 0), Point(1, 1), 2, 2)

V = FunctionSpace(mesh, "Lagrange", 1)

def f_parallel(numOfCores):
    # This part runs in parallel, executed by the spawned worker processes.
    # The argument is unused; it only drives executor.map below.

    dm = V.dofmap()
    local_range = dm.ownership_range()
    local_dim = local_range[1] - local_range[0]

    f_loc = Function(V)
    f_arr = f_loc.vector().get_local()
    coor = V.tabulate_dof_coordinates() 
    for i in range(local_dim):
        f_arr[i] = coor[i][0] # usually more complicated

    return [local_range, f_arr]

if __name__ == "__main__":
    # Only executed by the main process

    coor = V.tabulate_dof_coordinates() 

    f = Function(V)
    f.vector()[:] = -1
    num_cores = 3
    executor = MPIPoolExecutor()

    f_arr = f.vector().get_local()
    for result in executor.map(f_parallel, [1]*num_cores): # usually more complicated
        loc_range, arr = result
        print("loc_range: ", loc_range, "arr: ", arr)
        start = int(loc_range[0])
        end = int(loc_range[1])
        f_arr[start:end] = arr

    f.vector()[:] = f_arr

    # different order
    print(f_arr)
    print(coor[:,0])

    xdmf_2 = XDMFFile(mpi4py.MPI.COMM_WORLD, "solution_out.xdmf")
    xdmf_2.write_checkpoint(f, 'func_to_save', 0, append=False)
    xdmf_2.close()    

The program is run via

 mpiexec -n 1 -usize 4 python parallel_test.py
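
For context, and as far as I understand mpi4py.futures: the script itself runs as a single MPI process, and MPIPoolExecutor spawns up to usize - 1 worker processes. Each worker imports the main module, so the module-level mesh and V are built on the workers too, and since the spawned workers appear to share a communicator, the mesh gets distributed across them. A dolfin-free sketch of this spawn model (report is just an illustrative helper):

from mpi4py import MPI
from mpi4py.futures import MPIPoolExecutor

def report(_):
    # Runs on a spawned worker; any module-level code of the main script is
    # executed again when the worker imports it.
    comm = MPI.COMM_WORLD
    return comm.Get_rank(), comm.Get_size()

if __name__ == "__main__":
    with MPIPoolExecutor(max_workers=3) as executor:
        # with mpiexec -n 1 -usize 4, up to three workers can be spawned
        print(list(executor.map(report, range(3))))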

Output:

[0.  0.  0.  0.5 1.  0.5 1.  0.5 1. ]
[0.  0.  0.5 0.  0.5 1.  0.5 1.  1. ] 

The order of the DoFs on the workers does not seem to match the ordering on the main process. How can I fix this?
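
For reference, the mismatch can be made visible by having each worker also return its DoF coordinates and comparing them to the serial coordinates for the same global range. A quick diagnostic sketch, run the same way as the example above (f_coords is just an illustrative name):

from dolfin import *
import mpi4py
from mpi4py.futures import MPIPoolExecutor

mesh = RectangleMesh(mpi4py.MPI.COMM_WORLD, Point(0, 0), Point(1, 1), 2, 2)
V = FunctionSpace(mesh, "Lagrange", 1)

def f_coords(_):
    # Runs on a worker: return the owned range and the x-coordinates of the
    # owned dofs, in this worker's local ordering.
    dm = V.dofmap()
    start, end = dm.ownership_range()
    coor = V.tabulate_dof_coordinates()
    return start, end, coor[:end - start, 0]

if __name__ == "__main__":
    serial_x = V.tabulate_dof_coordinates()[:, 0]
    with MPIPoolExecutor() as executor:
        for start, end, worker_x in executor.map(f_coords, [1] * 3):
            # These two lines generally disagree: the dofs are renumbered
            # when the mesh is distributed across the workers.
            print(worker_x)
            print(serial_x[start:end])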

I am sure there is a more elegant way, but this led to the expected result:

from dolfin import *
import mpi4py
import numpy as np
from mpi4py.futures import MPIPoolExecutor

mesh = RectangleMesh(mpi4py.MPI.COMM_WORLD, Point(0, 0), Point(1, 1), 2, 2)

V = FunctionSpace(mesh, "Lagrange", 1)

def give_index(arr, x, y):
    # Linear search: return the serial dof index whose coordinates equal (x, y).
    # Relies on the tabulated coordinates being identical on every process.
    for line in arr:
        if line[0] == x and line[1] == y:
            return int(line[2])

def f_parallel(numOfCores):
    # This part runs in parallel, executed by the spawned worker processes.
    # The argument is unused; it only drives executor.map below.

    dm = V.dofmap()
    local_range = dm.ownership_range()
    local_dim = local_range[1] - local_range[0]

    f_loc = Function(V)
    f_arr = f_loc.vector().get_local()
    coor = V.tabulate_dof_coordinates() 

    for i in range(local_dim):
        f_arr[i] = coor[i][0] # usually more complicated

    # append the local values as an extra column next to the dof coordinates
    f_coor = np.concatenate((coor, f_arr.reshape(-1, 1)), axis=1)

    return f_coor

if __name__ == "__main__":
    # Only executed by the main process

    coor = V.tabulate_dof_coordinates() 
    indices = np.array([[i] for i in range(len(coor))])
    coor_indices = np.concatenate((coor, indices), axis = 1)

    f = Function(V)
    f_arr = f.vector().get_local()

    num_cores = 3
    executor = MPIPoolExecutor()
    
    for result in executor.map(f_parallel, [1]*num_cores): # usually more complicated
        for line in result:
            coor_x        = line[0]
            coor_y        = line[1]
            index         = give_index(coor_indices, coor_x, coor_y)
            f_arr[index]  = line[2]

    f.vector()[:] = f_arr

    # same order
    print(f_arr)
    print(coor[:,0])   

    xdmf_2 = XDMFFile(mpi4py.MPI.COMM_WORLD, "solution_out.xdmf")
    xdmf_2.write_checkpoint(f, 'func_to_save', 0, append=False)
    xdmf_2.close()

Output:

[0.  0.  0.5 0.  0.5 1.  0.5 1.  1. ]
[0.  0.  0.5 0.  0.5 1.  0.5 1.  1. ]
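
For what it is worth, the linear search in give_index could also be replaced by a dictionary keyed on (rounded) DoF coordinates, which avoids scanning coor_indices for every value. A sketch of that variant (build_index_map and the rounding precision are my own choices, not tested against the full script):

import numpy as np

def build_index_map(coor, ndigits=12):
    # Map rounded (x, y) coordinates to their serial dof index.  Rounding
    # guards against tiny floating point differences between processes.
    return {tuple(np.round(p, ndigits)): i for i, p in enumerate(coor)}

if __name__ == "__main__":
    # Standalone usage with synthetic coordinates; in the script above this
    # would be coor = V.tabulate_dof_coordinates().
    coor = np.array([[0.0, 0.0], [0.5, 0.0], [0.0, 0.5]])
    index_map = build_index_map(coor)
    # a worker result line (x, y, value) is then placed with a lookup
    # instead of the linear search
    x, y, value = 0.5, 0.0, 0.5
    print(index_map[tuple(np.round((x, y), 12))])  # prints 1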