Problem with solve() in parallel computing

I can solve it sequentially, but when I try to solve it in parallel with mpirun. The computing is stuck after the first iteration. It seems solve() cannot converge in parallel computing?
Here is my code, Thanks
from future import print_function
from fenics import *
from mshr import *
import numpy as np
import pdb
import gc
from sys import getsizeof

Tf = 0.02 # final time
num_steps = 4000 # number of time steps
dt1 = Tf / num_steps # time step size
mu = Constant(0.00329) # kinematic viscosity
rho =Constant(6165) # density
k=Constant(50.55)
Cp=Constant(759.75) # specific heat ( Change later)

mesh = RectangleMesh(Point(0.0,0.0), Point(0.003,0.001),350,100)
V = VectorElement(“CG”, mesh.ufl_cell(), 1)
Q = FiniteElement(“CG”, mesh.ufl_cell(), 1)

W=FunctionSpace(mesh, V* Q)
T = FunctionSpace(mesh, ‘CG’, 1)

dof_x = T.tabulate_dof_coordinates()
xT = dof_x[:, 0]
yT = dof_x[:, 1]
print ('Velocity dofs = ', T.dim()*2)
print ('Pressure dofs = ', T.dim())

boundary_parts = MeshFunction(“size_t”, mesh, mesh.topology().dim()-1)
tor=0.00001
top = AutoSubDomain(lambda x,on_bnd: x[1]>=0.001-tor and on_bnd)
bot = AutoSubDomain(lambda x,on_bnd: x[1]<=0.0+tor and on_bnd)
left = AutoSubDomain(lambda x,on_bnd: x[0]<=0.0+tor and on_bnd)
right = AutoSubDomain(lambda x,on_bnd: x[0]>=0.003-tor and on_bnd)

top.mark(boundary_parts, 1)
bot.mark(boundary_parts, 2)
left.mark(boundary_parts, 3)
right.mark(boundary_parts, 4)

ds1 = Measure(“ds”, subdomain_id=1,subdomain_data=boundary_parts)

top = DirichletBC(T, Constant(0), boundary_parts,1)
top_ind=np.fromiter(top.get_boundary_values().keys(),dtype=np.int)

bcu_wall1 = DirichletBC(W.sub(0), Constant((0, 0)), boundary_parts,2)
bcu_wall2 = DirichletBC(W.sub(0), Constant((0, 0)), boundary_parts,3)
bcu_wall3 = DirichletBC(W.sub(0), Constant((0, 0)), boundary_parts,4)
indd=np.fromiter(bcu_wall1.get_boundary_values().keys(),dtype=np.int)
bcu = [ bcu_wall1,bcu_wall2,bcu_wall3]

up0 = Function(W)
u0 = as_vector((up0[0], up0[1]))
u1 = as_vector((up0[0], up0[1]))
p0 = up0[2]
p1 = up0[2]
up1 = Function(W)
up = TrialFunction(W)
u = as_vector((up[0],up[1]))
p = up[2]
vq = TestFunction(W)
v = as_vector((vq[0],vq[1]))
q = vq[2]

Tu = TrialFunction(T)
Tv = TestFunction(T)

T_n = Function(T)
T_ = Function(T)
T_n.interpolate(Constant(300.0))
tor1=1.e-10
up0.interpolate(Constant([tor1,tor1,tor1]))

U = 0.5*(u0 + u)
n = FacetNormal(mesh)
dt = Constant(dt1)

LSpower=500*0.23
R=0.00045
t = 0
x_cord = Function(T)
x_cord.vector()[:]=np.ascontiguousarray(xT, dtype=np.float32)
y_cord = Function(T)
y_cord.vector()[:]=np.ascontiguousarray(yT, dtype=np.float32)

it=0
A_u1=None
AT=None
b_u1=None
bT=None
def dtang_T(T_n,T):
dtang = Expression((‘Tx’, ‘Ty’),Tx=project(T_n.dx(0),T),Ty=project(T_n.dx(1),T),degree=2)
return dtang

for n in range(num_steps):

t += dt1

F1 = rho*dot((u - u0) / dt, v)*dx + \
     rho*inner(v, dot(grad(u), u0))*dx \
     + (mu)*inner(grad(v), grad(u)+grad(u).T)*dx \
     - inner(div(v), p)*dx - inner(q, div(u))*dx- dot(-0.000303*dtang_T(T_n,T),v)*ds1
a_u1 = lhs(F1)
L_u1 = rhs(F1)


A_u1=assemble(a_u1, tensor=A_u1)
b_u1=assemble(L_u1, tensor=b_u1)
[bc.apply(A_u1,b_u1) for bc in bcu]
up1=Function(W)
if np.max(np.abs(b_u1.get_local()))<0.0000001:
    up1=Function(W)
else:
    solve(A_u1,up1.vector(), b_u1)
u0,p0=up1.split(deepcopy=True)


print ("it = %6d,   t = %12.6e" % (it,t))
ux,uy=u0.split(deepcopy=True)
u_mag=np.sqrt(ux.vector().get_local()**2+uy.vector().get_local()**2)
print ("%12.4e %12.4e" % (max(u_mag),min(u_mag)))

flux=project(LSpower/(np.pi*R**2)*exp(-2*((x_cord-0.0005-0.08*t)**2+(y_cord-0.001)**2)/R**2),T)
FT = k*inner(grad(Tu), grad(Tv))*dx+Cp*rho*inner(inner(grad(T_n),u0), Tv)*dx+Cp*rho*(1.0/dt)*inner(Tu-T_n, Tv)*dx  \
    -(flux*Tv*ds1)-0.8*5.67*10**(-8)*(Constant((300)**4)-T_n**4)*Tv*ds1
aT  = lhs(FT)
LT  = rhs(FT)

AT=assemble(aT,tensor=AT)
bT=assemble(LT,tensor=bT)

T_=Function(T)
solve(AT,T_.vector(), bT)
print ("%12.4e %12.4e" % (max(T_.vector().get_local()),min(T_.vector().get_local())))

assign(up0, up1)
assign(T_n, T_)

it+= 1