Parallel Sparse PETSc Matrix Generation with dofs as rows and columns

Thanks for your response Dokken,

The code I have posted is working in my system interestingly.
Here is adjusted MWE upon your suggestion;

from petsc4py import PETSc
import dolfin as dolf
import numpy as np

dolf.parameters["mesh_partitioner"] = "ParMETIS"
dolf.parameters["ghost_mode"] = "shared_facet"
dolf.parameters["ghost_mode"] = "shared_vertex"  

def mshr(el):
    """
    This function handles creation of mesh, subdomains and boundaries
    """

    mesh = dolf.UnitIntervalMesh(el)

    def l_boundary_func(x, on_boundary):
        x = x[0]
        return on_boundary and dolf.near(x, 0.)

    def r_boundary_func(x, on_boundary):
        x = x[0]
        return on_boundary and dolf.near(x, 1.)

    boundaries = dolf.MeshFunction('size_t', mesh, mesh.topology().dim() - 1)

    l_boundary = dolf.AutoSubDomain(l_boundary_func)
    r_boundary = dolf.AutoSubDomain(r_boundary_func)

    l_boundary.mark(boundaries, 1)
    r_boundary.mark(boundaries, 2)

    # ________________________________________________________________________________

    def fl_subdomain_func(x):
        x = x[0]
        x_f = 0.25
        a_f = 0.025
        return x_f - a_f - dolf.DOLFIN_EPS <= x <= x_f + a_f + dolf.DOLFIN_EPS

    subdomains = dolf.MeshFunction('size_t', mesh, mesh.topology().dim())

    subdomains.set_all(1)

    fl_subdomain = dolf.AutoSubDomain(fl_subdomain_func)
    fl_subdomain.mark(subdomains, 0)

    return mesh, boundaries, subdomains

degree = 1

mesh, boundaries, subdomains = mshr(30)

CG = dolf.FiniteElement('CG', mesh.ufl_cell(), degree)
W = dolf.FunctionSpace(mesh, dolf.MixedElement([CG, CG]))

def assemble_a(fl):
    """
    This function returns vector a's nonzero dofs and their values as a list
    """

    # Assemble vector a for subdomain that has 
    # nonzero entries between 0.225<x<0.275
    (v_1, v_2) = dolf.TestFunction(W)
    dx = dolf.Measure('dx', subdomain_data=subdomains)
    V = dolf.FunctionSpace(mesh, 'CG', 1)
    const = dolf.interpolate(dolf.Constant(1), V)
    V_fl = dolf.assemble(const * dx(fl))
    a_1 =  dolf.assemble(v_1 / V_fl * dx(fl))
    a_2 =  dolf.assemble(v_2 / V_fl * dx(fl))

    aa1 = a_1.get_local() # convert to numpy array
    aa2 = a_2.get_local() # convert to numpy array
    indices1 = np.flatnonzero(aa1) #find nonzero indices
    indices2 = np.flatnonzero(aa2) #find nonzero indices
    aa1 = list(zip(indices1,aa1[indices1])) #zip nonzero indices with their values
    aa2 = list(zip(indices2,aa2[indices2])) #zip nonzero indices with their values
    a = (aa1, aa2)
    a = np.concatenate(a)
    return a

def assemble_b(x):
    """
    This function finds the basis function's derivative at point x
    and returns the relevant dof and derivative as a list
    """

    v = np.array([[1]])
    cell_index = mesh.bounding_box_tree().compute_first_entity_collision(dolf.Point(x))
    B = [np.array([]), np.array([])]
    if cell_index <= mesh.num_entities(1):
        cell = dolf.Cell(mesh, cell_index)
        B = []
        for j in range(2):
            # Evaluate derivatives for each function spaces
            dofmap = W.sub(j).dofmap()
            cell_dofs = dofmap.cell_dofs(cell_index)
            element = W.sub(j).element()
            d_dx = element.evaluate_basis_derivatives_all(1, x, cell.get_vertex_coordinates(), cell.orientation())
            d_dx = d_dx.reshape((len(cell_dofs), -1))
            d_dv = np.dot(d_dx, v)
            d_dv = d_dv[:, 0]
            
            my_list = []
            for i, dof in enumerate(cell_dofs):
                my_list.append([dofmap.tabulate_local_to_global_dofs()[dof], d_dv[i]])
            B.append(my_list)

    B = np.concatenate(B)
    return (*B, )

a = assemble_a(0)
b = assemble_b(0.2)

print("a: ",a)
print("b: ",b)
# COMPLEX MATRIX GENERATION

nnz = len(a)*len(b)

row = np.zeros(nnz)
col = np.zeros(nnz)
val = np.zeros(nnz)

for i, c in enumerate(a):
    for j, d in enumerate(b):
        row[i * len(b) + j] = c[0]
        col[i * len(b) + j] = d[0]
        val[i * len(b) + j] = c[1] * d[1]

row = row.astype(dtype='int32')
col = col.astype(dtype='int32')

global_size = W.dim()
istart, iend = W.dofmap().ownership_range()
local_size = iend - istart  # local size
mat = PETSc.Mat().create(PETSc.COMM_WORLD) 
mat.setSizes([(local_size, global_size), (local_size, global_size)])
mat.setType('aij') 
mat.setUp()
for i in range(len(row)):
    mat.setValue(row[i],col[i],val[i])
mat.assemblyBegin()
mat.assemblyEnd()

# CHECK CREATED MATRIX THAT HAS 16 ENTRIES
read_row = [item[0] for item in a]
read_col = [item[0] for item in b]
print("NONZERO MATRIX ENTRIES:(SHOULD BE 16 ENTRY)\n",mat.getValues(read_row,read_col))

When you run this script with 4 cores, it will be more obvious to see the problem;

a:  []
b:  ()
NONZERO MATRIX ENTRIES:(SHOULD BE 16 ENTRY)
 []
a:  [[14.   0.5]
 [15.   0.5]]
b:  ()
NONZERO MATRIX ENTRIES:(SHOULD BE 16 ENTRY)
 []
a:  []
b:  ()
NONZERO MATRIX ENTRIES:(SHOULD BE 16 ENTRY)
 []
a:  [[0.  0.5]
 [1.  0.5]]
b:  (array([  4., -30.]), array([ 2., 30.]), array([  5., -30.]), array([ 3., 30.]))
 [[-15.  15. -15.  15.]
 [-15.  15. -15.  15.]]

In the second core, one of the vectors becomes zero, hence the outer product results in zero. Because my subdomain (0.225<x<0.275) is not fall into same mesh with x=0.2(I evaluate gradient of basis function at that point) during mesh partition of parallel execution. So only 8 value could be obtained instead of 16 values.

The problem is about mesh partition. It looks llike my subdomain(0.225<x<0.275) and the point(x=0.2) I am evaluating gradient should be in the same mesh while parallelization. I have asked another question in here that may be more clear.