Unexpected behavior when run in parallel

Hello,
I would like to use the conjugate gradient method to solve a linear variational problem in parallel. As an example, here is a script for 1D transient heat conduction:

from dolfin import *
import sys

set_log_level(30)
parameters["linear_algebra_backend"] = "PETSc"
parameters["form_compiler"]["cpp_optimize"] = True
ffc_options = {"optimize": True,
               "eliminate_zeros": True,
               "precompute_basis_const": True,
               "precompute_ip_const": True}

comm = MPI.comm_world
comm_rank = MPI.rank(comm)

# Constants
lx = 1 # length of domain
k = 210 # thermal conductivity
cp = 900 # specific heat
rho = 2700 # density
T0 = 283 # initial temperature
T_left = 423 # temperature left boundary
Q_dot = 0 # uniform body heat source
t_end = 10 # time span

# Generate 1D mesh
nel = 3000
mesh = IntervalMesh(comm, nel, 0., lx)

V = FunctionSpace(mesh, "CG", 1)

# Boundary conditions
left_boundary = CompiledSubDomain("near(x[0], 0.0, tol) && on_boundary", tol=1e-7)
T_left_expr = Expression("T", degree=1, T=T_left)
bc_left = DirichletBC(V, T_left_expr, left_boundary)
bcs = [bc_left]

# Define functions
u = TrialFunction(V)
v = TestFunction(V)
u_ = Function(V, name="Temp")
u_n = Function(V, name="Temp previous")

T_init = Expression("T", degree=1, T=T0)
u_.interpolate(T_init)
u_n.interpolate(T_init)

# Time incrementation
dt = 0.001
num_steps = int(t_end/dt)

# Variational problem
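# Implicit (backward) Euler in time: rho*cp*(u - u_n)/dt = div(k*grad(u)) + Q,
# tested against v and multiplied through by dt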
Q_dot_expr = Expression("Q", degree=1, Q=Q_dot)
diff_eq = cp*rho*v*u*dx - cp*rho*v*u_n*dx + dt*k*dot(grad(v),grad(u))*dx - dt*v*Q_dot_expr*dx
a_diff = lhs(diff_eq)
L_diff = rhs(diff_eq)

# Output file
file_results = XDMFFile(comm,"transient_heat.xdmf")
file_results.parameters["flush_output"] = True
file_results.parameters["functions_share_mesh"] = True

n_out = 100 # number of time steps to write to file
out_freq = int(num_steps/n_out)

t = 0
step = 1
while step < num_steps + 1:
    t += dt
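    # Solve the implicit Euler step with a CG Krylov solver preconditioned
    # by algebraic multigrid (AMG)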
    solve(a_diff == L_diff, u_, bcs=bcs, solver_parameters={"linear_solver": "cg", "preconditioner": "amg"}, form_compiler_parameters=ffc_options)

    if step % out_freq == 0:
        if comm_rank==0:
            print("t = {:.3f}".format(t))
            sys.stdout.flush()
        file_results.write(u_, t)

    # Update functions
    u_n.assign(u_)
    
    step += 1

However, I have found that when certain preconditioners are used, specifically AMG, the solution shows strange behavior at the locations where the mesh is partitioned. You can see these dips in the plots below, which show the result when the script is run with 1 core, 2 cores, and 4 cores, respectively:
[Plots: temperature along the domain for 1, 2, and 4 MPI processes, with dips at the partition locations]
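For reference, the solver and preconditioner can also be selected explicitly through LinearVariationalSolver instead of the solve() call above. The following is only a sketch of that variant (I have not verified this exact form); it reuses a_diff, L_diff, u_, bcs, num_steps, dt and u_n from the script, and it assumes that preconditioner names such as "hypre_amg" and "petsc_amg" are available in the PETSc build, which can be checked with list_krylov_solver_preconditioners():

# Sketch only: same time loop with an explicitly configured solver, so the
# preconditioner can be varied (assumes "hypre_amg"/"petsc_amg" are listed by
# list_krylov_solver_preconditioners(); file output omitted for brevity)
problem = LinearVariationalProblem(a_diff, L_diff, u_, bcs)
solver = LinearVariationalSolver(problem)
solver.parameters["linear_solver"] = "cg"
solver.parameters["preconditioner"] = "hypre_amg"   # or "petsc_amg", "amg", ...
solver.parameters["krylov_solver"]["relative_tolerance"] = 1e-10

t = 0
for step in range(1, num_steps + 1):
    t += dt
    solver.solve()      # reassembles the forms with the current u_n
    u_n.assign(u_)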
When run with other solver and preconditioner combinations, this problem does not occur. I would appreciate any insight you may have into this. Thank you!