Parallel Sparse PETSc Matrix Generation with dofs as rows and columns

Hi there,

I’m trying to generate a parallel sparse PETSc matrix. I am able to calculate the nonzero rows and columns and their values. Here is the MWE:

from petsc4py import PETSc
import dolfin as dolf
import numpy as np

dolf.parameters["mesh_partitioner"] = "ParMETIS"

def mshr(el):

    mesh = dolf.UnitIntervalMesh(el)

    def l_boundary_func(x, on_boundary):
        x = x[0]
        return on_boundary and dolf.near(x, 0.)

    def r_boundary_func(x, on_boundary):
        x = x[0]
        return on_boundary and dolf.near(x, 1.)

    boundaries = dolf.MeshFunction('size_t', mesh, mesh.topology().dim() - 1)

    l_boundary = dolf.AutoSubDomain(l_boundary_func)
    r_boundary = dolf.AutoSubDomain(r_boundary_func)

    l_boundary.mark(boundaries, 1)
    r_boundary.mark(boundaries, 2)

    # ________________________________________________________________________________

    def fl_subdomain_func(x):
        x = x[0]
        x_f = 0.25
        a_f = 0.025
        return x_f - a_f - dolf.DOLFIN_EPS <= x <= x_f + a_f + dolf.DOLFIN_EPS

    subdomains = dolf.MeshFunction('size_t', mesh, mesh.topology().dim())

    subdomains.set_all(1)

    fl_subdomain = dolf.AutoSubDomain(fl_subdomain_func)
    fl_subdomain.mark(subdomains, 0)

    return mesh, boundaries, subdomains

degree = 1

mesh, boundaries, subdomains = mshr(30)

CG = dolf.FiniteElement('CG', mesh.ufl_cell(), degree)
W = dolf.FunctionSpace(mesh, dolf.MixedElement([CG, CG]))



def assemble_left_vector(fl):


    (v_1, v_2) = dolf.TestFunction(W)
    dx = dolf.Measure('dx', subdomain_data=subdomains)

    V = dolf.FunctionSpace(mesh, 'CG', 1)
    const = dolf.interpolate(dolf.Constant(1), V)
    V_fl = dolf.assemble(const * dx(fl))
    
    a_1 =  dolf.assemble(v_1 / V_fl * dx(fl))
    a_2 =  dolf.assemble(v_2 / V_fl * dx(fl))

    dofmap = W.dofmap()
    
    def helper_func(v, dofmap):
        indices = np.flatnonzero(v) # Returns the indices which are nonzero
        values = v[indices]
        my_list = []
        for index, value in zip(indices, values):
            my_list.append([dofmap.dofs()[index], value])
        v = np.array(my_list)
        return v
    
    a_1 = helper_func(a_1, dofmap)
    a_2 = helper_func(a_2, dofmap)

    a = (a_1, a_2)

    return a

def assemble_right_vector(x):

    v = np.array([[0, 0, 1]])  # row
    dimension = mesh.topology().dim()
    if dimension == 1:
        v = np.array([[1]])
    elif dimension == 2:
        v = np.array([[1, 0]])
    v = v.transpose()  # column
    b = [np.array([]), np.array([])]
    cell_index = mesh.bounding_box_tree().compute_first_entity_collision(dolf.Point(x))
    if cell_index <= mesh.num_entities(dimension):
        cell = dolf.Cell(mesh, cell_index)

        b = []

        for j in range(2):

            dofmap = W.sub(j).dofmap()
            cell_dofs = dofmap.cell_dofs(cell_index)
            element = W.sub(j).element()
            d_dx = element.evaluate_basis_derivatives_all(1, x, cell.get_vertex_coordinates(), cell.orientation())
            d_dx = d_dx.reshape((len(cell_dofs), -1))
            d_dv = np.dot(d_dx, v)
            d_dv = d_dv[:, 0]
            
            my_list = []
            for i, dof in enumerate(cell_dofs):
                my_list.append([dofmap.tabulate_local_to_global_dofs()[dof], d_dv[i]])
            my_vec = np.array(my_list)
            b.append(my_vec)
    else:
        # print("The cell for the subdomain is not in the mesh")
        pass
    return (*b, )

a = assemble_left_vector(0)
b = assemble_right_vector(0.2)

a = np.concatenate(a)
b = np.concatenate(b)

nnz = len(a)*len(b)

row = np.zeros(nnz)
col = np.zeros(nnz)
val = np.zeros(nnz)

for i, c in enumerate(a):
    for j, d in enumerate(b):
        row[i * len(b) + j] = c[0]
        col[i * len(b) + j] = d[0]
        val[i * len(b) + j] = c[1] * d[1]

row = row.astype(dtype='int32')
col = col.astype(dtype='int32')

global_size = W.dim()
istart, iend = W.dofmap().ownership_range()
local_size = iend - istart  # local size
print(iend,istart)
mat = PETSc.Mat().create(PETSc.COMM_WORLD) # MPI.COMM_SELF
mat.setSizes([(local_size, global_size), (local_size, global_size)])
mat.setType('aij') 
mat.setUp()
for i in range(len(row)):
    mat.setValue(row[i],col[i],val[i])
mat.assemblyBegin()
mat.assemblyEnd()

What I am doing is:

  1. Getting the nonzero rows and their values as vector a.
  2. Getting the nonzero columns and their values as vector b.
  3. The outer product of a and b gives the nonzero block of my PETSc matrix at the corresponding rows and columns, while all other entries are zero.

For example, when I run this code in serial, I get the a and b vectors:

A:  [[44.   0.5]
     [46.   0.5]
     [45.   0.5]
     [47.   0.5]]
B:  [[ 50. -30.]
     [ 48.  30.]
     [ 51. -30.]
     [ 49.  30.]]

Then, when I take the outer product of a and b, the nonzero elements of the PETSc matrix should be:

            48   49   50   51    # Nonzero columns
    44    [[15.  15. -15. -15.]
    45     [15.  15. -15. -15.]
    46     [15.  15. -15. -15.]
    47     [15.  15. -15. -15.]]
    # Nonzero rows: 44, 45, 46, 47

while the other entries are zero.
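
To make the intent concrete, here is a minimal standalone numpy sketch of that outer product, using the serial a and b shown above (illustration only, not part of the MWE):

import numpy as np

# Serial-run a and b from above: [global dof, value] pairs
a = np.array([[44., 0.5], [46., 0.5], [45., 0.5], [47., 0.5]])
b = np.array([[50., -30.], [48., 30.], [51., -30.], [49., 30.]])

rows = a[:, 0].astype('int32')      # nonzero rows: 44, 46, 45, 47
cols = b[:, 0].astype('int32')      # nonzero columns: 50, 48, 51, 49
block = np.outer(a[:, 1], b[:, 1])  # 4x4 block; reordering the columns to 48..51 gives the block above
print(rows, cols)
print(block)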

It works perfectly fine on 1 or 2 cores. However, when I use 4 cores, one of the vectors (a or b) becomes empty on some ranks due to the mesh partitioning, and the outer product then yields zero entries instead of the nonzero ones. So my matrix becomes wrong in parallel.

How can I overcome this problem?

Any help would be much appreciated.

The code you have posted here does not run, and produces the error message:

Traceback (most recent call last):
  File "petsc_matrix.py", line 120, in <module>
    a = assemble_left_vector(0)
  File "petsc_matrix.py", line 77, in assemble_left_vector
    a_1 = helper_func(a_1, dofmap)
  File "petsc_matrix.py", line 69, in helper_func
    indices = np.flatnonzero(v) # Returns the indices which are nonzero
  File "/usr/lib/python3/dist-packages/numpy/core/numeric.py", line 896, in flatnonzero
    return a.ravel().nonzero()[0]
AttributeError: 'dolfin.cpp.la.Vector' object has no attribute 'ravel'

Please make sure that your code runs when you post it. I would also strongly suggest that you either:
a) Document the helper functions assemble_left_vector, assemble_right_vector, and explain the purpose of these functions.
b) Remove all helper functions and make a program that runs line by line (as the helper functions are only called once) and document the different stages of the code.

I would also prefer that the output you are showing is reproducible by the code.

Thanks for your response, dokken.

Interestingly, the code I posted does work on my system.
Here is the adjusted MWE following your suggestions:

from petsc4py import PETSc
import dolfin as dolf
import numpy as np

dolf.parameters["mesh_partitioner"] = "ParMETIS"
dolf.parameters["ghost_mode"] = "shared_facet"
dolf.parameters["ghost_mode"] = "shared_vertex"  # note: this overrides the previous line

def mshr(el):
    """
    This function handles creation of mesh, subdomains and boundaries
    """

    mesh = dolf.UnitIntervalMesh(el)

    def l_boundary_func(x, on_boundary):
        x = x[0]
        return on_boundary and dolf.near(x, 0.)

    def r_boundary_func(x, on_boundary):
        x = x[0]
        return on_boundary and dolf.near(x, 1.)

    boundaries = dolf.MeshFunction('size_t', mesh, mesh.topology().dim() - 1)

    l_boundary = dolf.AutoSubDomain(l_boundary_func)
    r_boundary = dolf.AutoSubDomain(r_boundary_func)

    l_boundary.mark(boundaries, 1)
    r_boundary.mark(boundaries, 2)

    # ________________________________________________________________________________

    def fl_subdomain_func(x):
        x = x[0]
        x_f = 0.25
        a_f = 0.025
        return x_f - a_f - dolf.DOLFIN_EPS <= x <= x_f + a_f + dolf.DOLFIN_EPS

    subdomains = dolf.MeshFunction('size_t', mesh, mesh.topology().dim())

    subdomains.set_all(1)

    fl_subdomain = dolf.AutoSubDomain(fl_subdomain_func)
    fl_subdomain.mark(subdomains, 0)

    return mesh, boundaries, subdomains

degree = 1

mesh, boundaries, subdomains = mshr(30)

CG = dolf.FiniteElement('CG', mesh.ufl_cell(), degree)
W = dolf.FunctionSpace(mesh, dolf.MixedElement([CG, CG]))

def assemble_a(fl):
    """
    This function returns vector a's nonzero dofs and their values as a list
    """

    # Assemble vector a for subdomain that has 
    # nonzero entries between 0.225<x<0.275
    (v_1, v_2) = dolf.TestFunction(W)
    dx = dolf.Measure('dx', subdomain_data=subdomains)
    V = dolf.FunctionSpace(mesh, 'CG', 1)
    const = dolf.interpolate(dolf.Constant(1), V)
    V_fl = dolf.assemble(const * dx(fl))
    a_1 =  dolf.assemble(v_1 / V_fl * dx(fl))
    a_2 =  dolf.assemble(v_2 / V_fl * dx(fl))

    aa1 = a_1.get_local() # convert to numpy array
    aa2 = a_2.get_local() # convert to numpy array
    indices1 = np.flatnonzero(aa1) #find nonzero indices
    indices2 = np.flatnonzero(aa2) #find nonzero indices
    aa1 = list(zip(indices1,aa1[indices1])) #zip nonzero indices with their values
    aa2 = list(zip(indices2,aa2[indices2])) #zip nonzero indices with their values
    a = (aa1, aa2)
    a = np.concatenate(a)
    return a

def assemble_b(x):
    """
    This function finds the basis function's derivative at point x
    and returns the relevant dof and derivative as a list
    """

    v = np.array([[1]])
    cell_index = mesh.bounding_box_tree().compute_first_entity_collision(dolf.Point(x))
    B = [np.array([]), np.array([])]
    if cell_index <= mesh.num_entities(1):
        cell = dolf.Cell(mesh, cell_index)
        B = []
        for j in range(2):
            # Evaluate derivatives for each function spaces
            dofmap = W.sub(j).dofmap()
            cell_dofs = dofmap.cell_dofs(cell_index)
            element = W.sub(j).element()
            d_dx = element.evaluate_basis_derivatives_all(1, x, cell.get_vertex_coordinates(), cell.orientation())
            d_dx = d_dx.reshape((len(cell_dofs), -1))
            d_dv = np.dot(d_dx, v)
            d_dv = d_dv[:, 0]
            
            my_list = []
            for i, dof in enumerate(cell_dofs):
                my_list.append([dofmap.tabulate_local_to_global_dofs()[dof], d_dv[i]])
            B.append(my_list)

    B = np.concatenate(B)
    return (*B, )

a = assemble_a(0)
b = assemble_b(0.2)

print("a: ",a)
print("b: ",b)
# COMPLEX MATRIX GENERATION

nnz = len(a)*len(b)

row = np.zeros(nnz)
col = np.zeros(nnz)
val = np.zeros(nnz)

for i, c in enumerate(a):
    for j, d in enumerate(b):
        row[i * len(b) + j] = c[0]
        col[i * len(b) + j] = d[0]
        val[i * len(b) + j] = c[1] * d[1]

row = row.astype(dtype='int32')
col = col.astype(dtype='int32')

global_size = W.dim()
istart, iend = W.dofmap().ownership_range()
local_size = iend - istart  # local size
mat = PETSc.Mat().create(PETSc.COMM_WORLD) 
mat.setSizes([(local_size, global_size), (local_size, global_size)])
mat.setType('aij') 
mat.setUp()
for i in range(len(row)):
    mat.setValue(row[i],col[i],val[i])
mat.assemblyBegin()
mat.assemblyEnd()

# CHECK CREATED MATRIX THAT HAS 16 ENTRIES
read_row = [item[0] for item in a]
read_col = [item[0] for item in b]
print("NONZERO MATRIX ENTRIES:(SHOULD BE 16 ENTRY)\n",mat.getValues(read_row,read_col))

When you run this script with 4 cores, the problem becomes more obvious:

a:  []
b:  ()
NONZERO MATRIX ENTRIES:(SHOULD BE 16 ENTRY)
 []
a:  [[14.   0.5]
 [15.   0.5]]
b:  ()
NONZERO MATRIX ENTRIES:(SHOULD BE 16 ENTRY)
 []
a:  []
b:  ()
NONZERO MATRIX ENTRIES:(SHOULD BE 16 ENTRY)
 []
a:  [[0.  0.5]
 [1.  0.5]]
b:  (array([  4., -30.]), array([ 2., 30.]), array([  5., -30.]), array([ 3., 30.]))
 [[-15.  15. -15.  15.]
 [-15.  15. -15.  15.]]

On the second rank, one of the vectors comes out empty, hence the outer product gives nothing. This happens because my subdomain (0.225<x<0.275) does not fall into the same mesh partition as x=0.2 (the point at which I evaluate the gradient of the basis functions) during parallel execution. So only 8 values could be obtained instead of 16.

The problem is about mesh partitioning. It looks like my subdomain (0.225<x<0.275) and the point (x=0.2) at which I evaluate the gradient should end up on the same partition in parallel. I have asked another question here that may be clearer.

The key here is that you only access the local (owned) entries of the vector when you call get_local(), not the ghosted values. This means that you are ignoring several contributions to your matrix.
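
As a side illustration (a minimal, self-contained petsc4py sketch with a made-up ghost layout, only meant to show the owned/ghost distinction and the ghostUpdate pattern used in the dolfinx code further down):

from petsc4py import PETSc
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

local_size = 2
global_size = 2 * comm.Get_size()
# Each rank ghosts the first dof owned by the next rank (illustrative layout, needs >= 2 ranks)
ghosts = [] if comm.Get_size() == 1 else [(2 * (rank + 1)) % global_size]

vec = PETSc.Vec().createGhost(ghosts, size=(local_size, global_size), comm=comm)
vec.set(0.0)

# Add a contribution into the ghost slot of the local form ...
with vec.localForm() as lf:
    if ghosts:
        lf.array[local_size] = 1.0  # the entry after the owned block is the ghost slot

# ... and accumulate it onto the owning rank (same pattern as in the dolfinx code below)
vec.ghostUpdate(addv=PETSc.InsertMode.ADD_VALUES, mode=PETSc.ScatterMode.REVERSE)

print(rank, "owned entries:", vec.getArray())  # roughly what get_local() shows you

# Refresh the ghost copies from the owners and inspect owned + ghost entries
vec.ghostUpdate(addv=PETSc.InsertMode.INSERT_VALUES, mode=PETSc.ScatterMode.FORWARD)
with vec.localForm() as lf:
    print(rank, "owned + ghost entries:", lf.getArray())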

I do not know of an easy way of getting those ghost values in dolfin; however, here is a code that does the same thing in dolfinx.

from petsc4py import PETSc
from mpi4py import MPI
import dolfinx
import dolfinx.geometry
import basix
import numpy as np
import ufl
el = 30

mesh = dolfinx.UnitIntervalMesh(MPI.COMM_WORLD, el, dolfinx.cpp.mesh.GhostMode.shared_facet)
# mesh = dolfinx.UnitSquareMesh(MPI.COMM_WORLD, el, el)
#mesh = dolfinx.UnitCubeMesh(MPI.COMM_WORLD, el, el, el)


def fl_subdomain_func(x, eps=1e-16):
    x = x[0]
    x_f = 0.25
    a_f = 0.025
    return np.logical_and(x_f - a_f - eps <= x, x <= x_f + a_f + eps)

tdim = mesh.topology.dim
marked_cells = dolfinx.mesh.locate_entities(mesh, tdim, fl_subdomain_func)
fl = 1
subdomain = dolfinx.MeshTags(mesh, tdim, marked_cells, np.full(len(marked_cells), fl, dtype=np.int32))

degree = 1
CG = ufl.FiniteElement("CG", mesh.ufl_cell(), degree)
W = dolfinx.FunctionSpace(mesh, ufl.MixedElement([CG, CG]))

#This function returns a's nonzero dofs and their values as a list
v_1, v_2 = ufl.TestFunctions(W)
dx = ufl.Measure("dx", subdomain_data=subdomain)

V_fl = MPI.COMM_WORLD.allreduce(dolfinx.fem.assemble_scalar(dolfinx.Constant(mesh, 1)*dx(fl)), op=MPI.SUM)
b_1 = dolfinx.Function(W)
b_1.x.array[:] = 0
a_1 = dolfinx.fem.assemble_vector(b_1.vector, v_1/V_fl*dx(fl))
b_1.vector.ghostUpdate(addv=PETSc.InsertMode.ADD_VALUES, mode=PETSc.ScatterMode.REVERSE)
b_2 = dolfinx.Function(W)
b_2.x.array[:] = 0
a_2 = dolfinx.fem.assemble_vector(b_2.vector, v_2/V_fl*dx(fl))
b_2.vector.ghostUpdate(addv=PETSc.InsertMode.ADD_VALUES, mode=PETSc.ScatterMode.REVERSE)

# NOTE: This includes ghosted dofs
aa1 = b_1.x.array
aa2 = b_2.x.array
indices1 = np.flatnonzero(aa1)
indices2 = np.flatnonzero(aa2)

aa1 = list(zip(indices1, aa1[indices1]))
aa2 = list(zip(indices2, aa2[indices2]))
a =  np.concatenate((aa1, aa2))

print(MPI.COMM_WORLD.rank, a, W.dofmap.index_map.size_local)

point = np.array([[0.2, 0, 0]])
v = np.array([[0, 0, 1]]).T
if tdim == 1:
    v = np.array([[1]])
elif tdim == 2:
    v = np.array([[1, 0]]).T

# Finds the basis function's derivative at point x
# and returns the relevant dof and derivative as a list
bb_tree = dolfinx.geometry.BoundingBoxTree(mesh, tdim)
cell_candidates = dolfinx.geometry.compute_collisions_point(bb_tree, point[0])
# Choose one of the cells that contains the point
cell = dolfinx.geometry.select_colliding_cells(mesh, cell_candidates, point[0], 1)

# Data required for pull back of coordinate
gdim = mesh.geometry.dim
num_local_cells = mesh.topology.index_map(tdim).size_local
num_dofs_x = mesh.geometry.dofmap.links(0).size  # NOTE: Assumes same cell geometry in whole mesh
t_imap = mesh.topology.index_map(tdim)
num_cells = t_imap.size_local + t_imap.num_ghosts
x = mesh.geometry.x
x_dofs = mesh.geometry.dofmap.array.reshape(num_cells, num_dofs_x)
cell_geometry = np.zeros((num_dofs_x, gdim), dtype=np.float64)
points_ref = np.zeros((1, tdim))

# Data required for evaluation of derivative
ct = dolfinx.cpp.mesh.to_string(mesh.topology.cell_type)
element = basix.create_element(basix.finite_element.string_to_family(
        "Lagrange", ct), basix.cell.string_to_type(ct), degree, basix.LagrangeVariant.equispaced)
dofmaps = [W.sub(i).dofmap for i  in range(2)]
coordinate_element = basix.create_element(basix.finite_element.string_to_family(
        "Lagrange", ct), basix.cell.string_to_type(ct), 1, basix.LagrangeVariant.equispaced)

point_ref = None
B = []
if len(cell) > 0:
    cell = cell[0]
    # Only add contribution if cell is owned
    if cell < num_local_cells:
        # Map point in cell back to reference element
        cell_geometry[:] = x[x_dofs[cell], :gdim]
        point_ref = mesh.geometry.cmap.pull_back(point[:,:gdim], cell_geometry)
        dphi = coordinate_element.tabulate(1, point_ref)[1:,0,:]
        J = np.dot(cell_geometry.T, dphi.T)
        Jinv = np.linalg.inv(J)  
        for j in range(2):
            cell_dofs = dofmaps[j].cell_dofs(cell)
            global_dofs = dofmaps[j].index_map.local_to_global(cell_dofs)
            # Compute gradient on physical element by multiplying by J^(-T)
            d_dx = (Jinv.T @ element.tabulate(1, point_ref)[1:, 0, :]).T
            d_dv = np.dot(d_dx, v)[:, 0]
            for i in range(len(cell_dofs)):
                B.append([global_dofs[i], d_dv[i]])
    else:
        print(MPI.COMM_WORLD.rank, "Ghost", cell)

print(np.asarray(B))

Regarding your question about mesh partitioning, there is functionality to control this in dolfinx, and @IgorBaratta can probably provide you with an example of how to control it.
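
For example (a rough sketch using a newer dolfinx API; the exact keyword names vary between releases, so treat this as an illustration rather than a definitive recipe):

from mpi4py import MPI
from dolfinx.mesh import GhostMode, create_unit_interval

# ghost_mode controls which neighbouring cells are duplicated ("ghosted") on each rank;
# the mesh creation functions also accept a `partitioner` argument for swapping the
# cell-partitioning backend (SCOTCH, ParMETIS or KaHIP, depending on the dolfinx build).
mesh = create_unit_interval(MPI.COMM_WORLD, 30, ghost_mode=GhostMode.shared_facet)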


Dear @dokken,

Thanks for your interest. It looks like I need to migrate from dolfin to dolfinx.

Now, I am trying to install dolfinx on our Ubuntu 18.04 server. Since I do not have root access, the installation is very daunting. What would you suggest at this point?

What is the most convenient way for me to get dolfinx running on our Ubuntu 18.04 server?

Dear @IgorBaratta, could you please provide some examples of controlling parallel mesh partitioning in dolfinx?

Here is another MWE, which clearly shows the different results of serial and parallel runs:

from dolfinx.mesh import create_rectangle
from dolfinx.fem import Function, form, FunctionSpace
from dolfinx.fem.petsc import assemble_vector
from petsc4py import PETSc
from mpi4py import MPI
from ufl import as_vector, inner, grad, TrialFunction, TestFunction, dx
import numpy as np

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
nx = 8
ny= 1
mesh = create_rectangle(comm,[[0,0],[1,0.047]], [nx, ny])
ndim = mesh.geometry.dim

V = FunctionSpace(mesh, ("Lagrange", 1))
dofmaps = V.dofmap

def gaussian(x, x_ref, sigma):
    spatial = (x[0]-x_ref[0])**2 + (x[1]-x_ref[1])**2
    amplitude = 1/(sigma**ndim*(2*np.pi)**(ndim/2))
    spatial_term = np.exp(-1*spatial/(2*sigma**2))
    return amplitude*spatial_term

x_f = np.array([0.25, 0.])  # [m]
a_f = 0.02  # [m]

x_r = np.array([0.20, 0.])  # [m]
a_r = 0.01

w = Function(V)  
w.interpolate(lambda x: gaussian(x,x_r,a_r))

h = Function(V)  
h.interpolate(lambda x: gaussian(x,x_f,a_f))

phi_i = TrialFunction(V)
phi_j = TestFunction(V)

def indices_and_values(form):
    tol = 1E-5
    temp = Function(V)
    assemble_vector(temp.vector, form)
    temp.vector.ghostUpdate(addv=PETSc.InsertMode.ADD_VALUES, mode=PETSc.ScatterMode.REVERSE)
    packed = temp.x.array
    packed.real[abs(packed.real) < tol] = 0.0
    packed.imag[abs(packed.imag) < tol] = 0.0
    indices = np.array(np.flatnonzero(packed),dtype=np.int32)
    global_indices = dofmaps.index_map.local_to_global(indices)
    packed = list(zip(global_indices, packed[indices]))
    return packed

def assemble_left_vector():
    form_to_assemble = form(phi_i * h  *  dx)
    left_vector = indices_and_values(form_to_assemble)
    # Parallelization - scatter nonzeros to processes
    left_vector = comm.gather(left_vector, root=0)
    if rank == 0:
        left_vector = [j for i in left_vector for j in i]
        chunks = [[] for _ in range(size)]
        for i, chunk in enumerate(left_vector):
            chunks[i % size].append(chunk)
    else:
        left_vector = None
        chunks = None
    left_vector = comm.scatter(chunks, root=0)
    return left_vector

def assemble_right_vector():
    n_ref = as_vector([1,0])
    gradient_form = form(inner(n_ref,grad(phi_j))  * w * dx)
    right_vector = indices_and_values(gradient_form)
    # Parallelization - gather and broadcast nonzeros
    right_vector = comm.gather(right_vector, root=0)
    if right_vector:
        right_vector = [j for i in right_vector for j in i]
    else:
        right_vector=[]
    right_vector = comm.bcast(right_vector,root=0)

    return right_vector

A = assemble_left_vector()
B = assemble_right_vector()

row = [item[0] for item in A]
col = [item[0] for item in B]
row_vals = [item[1] for item in A]
col_vals = [item[1] for item in B]
product = np.outer(row_vals,col_vals)
val = product.flatten()

global_size, local_size = V.dofmap.index_map.size_global, V.dofmap.index_map.size_local

mat = PETSc.Mat().create(PETSc.COMM_WORLD)
mat.setSizes([(local_size, global_size), (local_size, global_size)])
mat.setType('mpiaij')
ONNZ = len(col)*np.ones(local_size,dtype=np.int32)
mat.setPreallocationNNZ([ONNZ, ONNZ])
mat.setUp()
mat.setValues(row, col, val)
mat.assemblyBegin()
mat.assemblyEnd()
mat.view()

A serial run gives:

  type: mpiaij
row 0:
row 1:
row 2: (3, -2.86058e-07)  (4, -2.86053e-07)  (6, 2.86058e-07)  (7, 2.86053e-07) 
row 3: (3, -5.09744e-06)  (4, -5.09735e-06)  (6, 5.09744e-06)  (7, 5.09736e-06) 
row 4: (3, -1.0767e-05)  (4, -1.07668e-05)  (6, 1.0767e-05)  (7, 1.07668e-05) 
row 5: (3, -2.7724e-05)  (4, -2.77236e-05)  (6, 2.7724e-05)  (7, 2.77236e-05) 
row 6: (3, -9.3367e-06)  (4, -9.33655e-06)  (6, 9.3367e-06)  (7, 9.33655e-06) 
row 7: (3, -4.52532e-06)  (4, -4.52525e-06)  (6, 4.52532e-06)  (7, 4.52525e-06) 
row 8:
row 9:
row 10:
row 11:
row 12:
row 13:
row 14:
row 15:
row 16:
row 17:

It runs OK with 2 or 3 processes, whereas a 4-process run gives:

  type: mpiaij
row 0:
row 1:
row 2: (3, -2.86058e-07)  (4, -2.86058e-07)  (5, 2.86058e-07)  (6, 2.86058e-07)  (7, 2.86053e-07) 
row 3: (3, -5.09744e-06)  (4, -5.09744e-06)  (5, 5.09744e-06)  (6, 5.09744e-06)  (7, 5.09736e-06) 
row 4: (3, -5.09744e-06)  (4, -5.09744e-06)  (5, 5.09744e-06)  (6, 5.09744e-06)  (7, 5.09736e-06) 
row 5: (3, -9.3367e-06)  (4, -9.3367e-06)  (5, 9.3367e-06)  (6, 9.3367e-06)  (7, 9.33655e-06) 
row 6: (3, -9.3367e-06)  (4, -9.3367e-06)  (5, 9.3367e-06)  (6, 9.3367e-06)  (7, 9.33655e-06) 
row 7: (3, -4.52532e-06)  (4, -4.52532e-06)  (5, 4.52532e-06)  (6, 4.52532e-06)  (7, 4.52525e-06) 
row 8:
row 9:
row 10:
row 11:
row 12:
row 13:
row 14:
row 15:
row 16:
row 17:

and a 5-process run gives:

  type: mpiaij
row 0: (1, -2.77236e-05)  (5, -2.7724e-05)  (6, 2.77236e-05)  (7, 2.7724e-05) 
row 1: (1, -5.72106e-07)  (5, -5.72116e-07)  (6, 5.72106e-07)  (7, 5.72116e-07) 
row 2:
row 3:
row 4: (1, -2.86053e-07)  (5, -2.86058e-07)  (6, 2.86053e-07)  (7, 2.86058e-07) 
row 5: (1, -4.8113e-06)  (5, -4.81138e-06)  (6, 4.8113e-06)  (7, 4.81138e-06) 
row 6: (1, -4.52525e-06)  (5, -4.52532e-06)  (6, 4.52525e-06)  (7, 4.52532e-06) 
row 7: (1, -9.33655e-06)  (5, -9.3367e-06)  (6, 9.33655e-06)  (7, 9.3367e-06) 
row 8:
row 9:
row 10:
row 11:
row 12:
row 13:
row 14:
row 15:
row 16:
row 17:

I wonder whether shared dofs are causing the problem?