Error in assigning function values in Matrix-free solution in dolfinx

Dear all,

I am currently trying to follow a matrix-free approach to model a dynamic elasticity problem using DOLFINx 0.6.0. In each time step, I solve for the acceleration at each node by dividing the total force (which depends on the displacement at the previous time step) by the lumped mass vector, and then update the velocity and displacement fields accordingly. I am not sure whether this is the most efficient way to do it, but it seems to work for the specific problem I am interested in.
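
Written out, each explicit step amounts to (with M_lumped the lumped mass vector, F_int the internal force assembled from the previous displacement, and F_ext the external force):

a_new = (F_ext - F_int(u_old)) / M_lumped     (entrywise division)
v_new = v_old + dt * a_new
u_new = u_old + dt * v_new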

Here is my MWE:

import dolfinx
import ufl
import numpy as np
from mpi4py import MPI
from petsc4py import PETSc

from dolfinx import fem
from dolfinx.fem.petsc import (assemble_vector, set_bc)


Lx = 0.01
Ly = 0.1
meshsize = Lx/20.0
comm = MPI.COMM_WORLD

if comm.rank == 0:
    print("Number of Parallel Threads: ", comm.size)

nx = round(Lx/meshsize)
ny = round(Ly/meshsize)

# domain = dolfinx.mesh.create_unit_square(MPI.COMM_WORLD,nx,ny)
domain = dolfinx.mesh.create_rectangle(MPI.COMM_WORLD, 
                               [np.array([0.0,0.0]), np.array([Lx,Ly])], 
                               [nx, ny], dolfinx.mesh.CellType.quadrilateral,
                               ghost_mode = dolfinx.cpp.mesh.GhostMode.none)

# # Define material properties
young = 210.0e9
nu = 0.3
density = 7800.0
sigma0 = 17.5e3/Lx

lmbda_ = young/((1.0+nu)*(1.0-2.0*nu))
mu = young/(2.0*(1.0+nu))
kappa = lmbda_ + mu    # 2D (plane strain); for 3D: kappa = lmbda_ + 2.0/3.0*mu

t=0
dt = 1.0e-9
totalT = 2.0
numsteps = round(totalT/dt)
tsave = 0.1
nsave = round(tsave/dt)
nprint = 500

e0 = ufl.VectorElement("CG", domain.ufl_cell(), 1)
V = dolfinx.fem.FunctionSpace(domain, e0)                 # V = dolfinx.VectorFunctionSpace(mesh, ("CG", 1))

facet_indices, facet_markers = [], []
crack_indices, crack_markers = [], []
fdim = domain.topology.dim - 1

domain_boundaries = [(1, lambda x: np.isclose(x[0], 0.0)), 
                     (2, lambda x: np.isclose(x[0], Lx)), 
                     (3, lambda x: np.isclose(x[1], 0.0)),
                     (4, lambda x: np.isclose(x[1], Ly))]


for (marker, locator) in domain_boundaries:
   facets = dolfinx.mesh.locate_entities(domain, fdim, locator)
   facet_indices.append(facets)
   facet_markers.append(np.full(len(facets), marker))

facet_tags = dolfinx.mesh.meshtags(domain, fdim, np.array(np.hstack(facet_indices), dtype=np.int32),
                            np.array(np.hstack(facet_markers), dtype=np.int32))

domain.topology.create_connectivity(domain.topology.dim-1, domain.topology.dim)

ds = ufl.Measure("ds", domain=domain, subdomain_data=facet_tags)
n = ufl.FacetNormal(domain)  # normal


def top(x):
    return np.isclose(x[1], Ly)

def bot(x):
    return np.isclose(x[1], 0.0)


boundary_facets_top = dolfinx.mesh.locate_entities(domain, domain.topology.dim - 1,top)
boundary_facets_bot = dolfinx.mesh.locate_entities(domain, domain.topology.dim - 1,bot)

class MyExpression:
   def __init__(self):
       self.t = 0.0
   def eval(self, x):
       return np.full(x.shape[1], self.t)    # constant value self.t at every point (class not used further in this MWE)

bc_u =[]


boundary_dofs_botxy_disp = dolfinx.fem.locate_dofs_topological(V, domain.topology.dim-1, boundary_facets_bot)
disp_bot_val = dolfinx.fem.Constant(domain, (0.0, 0.0))
bc_bot_xy = dolfinx.fem.dirichletbc(disp_bot_val, boundary_dofs_botxy_disp, V)
bc_u.append(bc_bot_xy)


def epsilon(u):
   return ufl.sym(ufl.grad(u))

def sigma(u):
   temp2 = epsilon(u)
   return 2.0*mu*temp2 + lmbda_* ufl.tr(temp2)*ufl.Identity(len(u))


def project(expression, V):
    # L2-projection onto V: this solves a mass-matrix linear system on every call
    u_, v_ = ufl.TrialFunction(V), ufl.TestFunction(V)
    a_p = ufl.inner(u_, v_) * ufl.dx - ufl.inner(expression, v_) * ufl.dx
    projection = dolfinx.fem.petsc.LinearProblem(ufl.lhs(a_p), ufl.rhs(a_p))
    return projection.solve()

def DensityExpression(x):
    vals0 = np.zeros((2, x.shape[1]))
    vals0[0] = density
    vals0[1] = density
    return vals0

dens_vals_for_acc = dolfinx.fem.Function(V)
dens_vals_for_acc.interpolate(DensityExpression)

def Constant_one_expression(x):
    vals0 = np.zeros((2, x.shape[1]))
    vals0[0] = 1.0
    vals0[1] = 1.0
    return vals0

Constant_one = dolfinx.fem.Function(V)
Constant_one.interpolate(Constant_one_expression)

vel_, vel_n = fem.Function(V), fem.Function(V)
a_, a_n = fem.Function(V), fem.Function(V)
u_, u_n = fem.Function(V), fem.Function(V)
u = ufl.TrialFunction(V)
v = ufl.TestFunction(V)


with u_n.vector.localForm() as uold_loc:
    uold_loc.set(0)   
with vel_n.vector.localForm() as uold_loc:
    uold_loc.set(0)
with a_n.vector.localForm() as uold_loc:
    uold_loc.set(0) 

def Surf_Force_exp(x):
    vals = np.zeros((domain.geometry.dim, x.shape[1]))
    vals[1] = 0.0
    for i in range(x.shape[1]):
        if(np.isclose(x[1][i], Ly)):
            vals[0][i] = sigma0
    return vals

surf_force = dolfinx.fem.Function(V)
surf_force.interpolate(Surf_Force_exp)

# Lumped mass vector via the action of the mass form on the nodal density
# vector, i.e. M_lumped_i = sum_j M_ij * rho (row-sum lumping)
mass_form = ufl.inner(u,v)*ufl.dx 
mass_action_form = ufl.action(mass_form, dens_vals_for_acc)
L_lumped = dolfinx.fem.form(mass_action_form)
M_lumped = assemble_vector(L_lumped)
M_lumped.assemble()

B = dolfinx.fem.Constant(domain, PETSc.ScalarType((0.0, 0.0)))       # body force  N/mm/m


external_form = ufl.inner(B, v) * ufl.dx + ufl.inner(surf_force, v) * ds
F_external = assemble_vector(dolfinx.fem.form(external_form))
F_external.assemble()
F_external.ghostUpdate(PETSc.InsertMode.ADD_VALUES, PETSc.ScatterMode.REVERSE)
set_bc(F_external, bc_u)

i=0
filename_disp='./1_Result/Result_disp.pvd'
outfile_disp = dolfinx.io.VTKFile(MPI.COMM_WORLD,filename_disp,"w")

while i<=numsteps: 
    i+=1
    t = t + dt

    # Internal force from the displacement at the previous step.
    # NOTE: the UFL form is re-built and re-compiled in every iteration here.
    internal_form = ufl.inner(sigma(u_n), ufl.grad(v)) * ufl.dx
    F_internal = assemble_vector(dolfinx.fem.form(internal_form))
    F_internal.assemble()
    F_internal.ghostUpdate(PETSc.InsertMode.ADD_VALUES, PETSc.ScatterMode.REVERSE)
    set_bc(F_internal, bc_u)

    # nodal acceleration: a = (F_ext - F_int) / M_lumped (this is the line that fails in parallel)
    aaa = -(F_internal[:]-F_external[:])/M_lumped[:]
    a_.x.array[:] = aaa

    # update velocity and displacement as UFL expressions, then L2-project back onto V
    vel_ = vel_n + dt * a_
    u_ = u_n + dt * vel_
    u_n = project(u_, V)
    vel_n = project(vel_, V)

    if(i%100 == 0):
      if MPI.COMM_WORLD.rank == 0:
          print(f"TimeStep: {i}, Time: {t}", flush=True)

    if(i%200 == 0):
        a_n = project(a_, V)
        u_n.name = "Displacement"
        a_n.name = "Acceleration"
        vel_n.name = "Velocity"
        outfile_disp.write_function([u_n,a_n,vel_n],t)

# close the output file only after the time loop has finished (closing it inside
# the loop would invalidate later writes)
outfile_disp.close()

Everything seems to be working as expected when I run the code using one processor. However, when I increase the number of processors to 2 or more, I get the following error:

Traceback (most recent call last):
  File "script.py", line 185, in <module>
Traceback (most recent call last):
  File "script.py", line 185, in <module>
    aaa = -(F_internal[:]-F_external[:])/M_lumped[:]
  File "PETSc/Vec.pyx", line 107, in petsc4py.PETSc.Vec.__getitem__
    aaa = -(F_internal[:]-F_external[:])/M_lumped[:]
  File "PETSc/Vec.pyx", line 107, in petsc4py.PETSc.Vec.__getitem__
  File "PETSc/petscvec.pxi", line 430, in petsc4py.PETSc.vec_getitem
  File "PETSc/petscvec.pxi", line 382, in petsc4py.PETSc.vecgetvalues
  File "PETSc/petscvec.pxi", line 430, in petsc4py.PETSc.vec_getitem
petsc4py.PETSc.Error: error code 63
[0] VecGetValues() at /home/conda/feedstock_root/build_artifacts/petsc_1661243737224/work/src/vec/vec/interface/rvector.c:849
[0] VecGetValues_MPI() at /home/conda/feedstock_root/build_artifacts/petsc_1661243737224/work/src/vec/vec/impls/mpi/pdvec.c:848
[0] Argument out of range
[0] Can only get local values, trying 4216
  File "PETSc/petscvec.pxi", line 382, in petsc4py.PETSc.vecgetvalues
petsc4py.PETSc.Error: error code 63
[1] VecGetValues() at /home/conda/feedstock_root/build_artifacts/petsc_1661243737224/work/src/vec/vec/interface/rvector.c:849
[1] VecGetValues_MPI() at /home/conda/feedstock_root/build_artifacts/petsc_1661243737224/work/src/vec/vec/impls/mpi/pdvec.c:848
[1] Argument out of range
[1] Can only get local values, trying 0

Based on what I have learned so far from previous questions, I guess this is related to the distributed mesh and perhaps the ghost cells. However, since I am very new to FEniCS and could not find any matrix-free demo, I have not been able to solve the problem yet. I also suspect that the code might have some efficiency flaws. I would really appreciate it if someone could help me with:

  1. How can I solve the above error?
  2. If possible, how can I increase the efficiency of the code?

I installed dolfinx using anaconda.

As far as I understand, the : in PETSc.Vec()[:] requests values over the full global index range, but PETSc.Vec().getValues() can only return locally owned values (hence the "Can only get local values" error). You can technically pull only the locally owned values with

aaa = -(F_internal[...]-F_external[...])/M_lumped[...]

but then a_.x.array[:] = aaa would throw an error, since a_.x.array also includes the ghost entries and is therefore longer than the owned-only array.
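
Just to make the size mismatch concrete, you can compare the two lengths (using the names from your MWE):

n_owned = F_internal.getLocalSize()   # locally owned entries only
n_local = a_.x.array.size             # owned + ghost entries
# in parallel n_owned < n_local, so the owned-only array cannot fill a_.x.array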

I believe the fix to your problem is instead to use the localForm() of each Vec object, so that the ghost values are accounted for. Like so:

with F_internal.localForm() as lf1, F_external.localForm() as lf2, M_lumped.localForm() as lf3:
    a_.x.array[:] = (lf2 - lf1) / lf3

I distributed the negative sign for you, as it’s just extra clutter.

Thank you for the reply. Yes, I applied something similar: I took the internal_form definition out of the time loop and used .vector to assign the local values first and then updated the ghost nodes. For anyone interested, the following is what I changed in the code. I also noticed that the project function is expensive, so it is best to avoid it. I now perform all the computations on PETSc vectors and only assign the u_n values at the end of each iteration, which makes the code much faster (more than 100 times).

F_internal = assemble_vector(internal_form)   # internal_form is now compiled once, outside the time loop
F_internal.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE)

a_node = (F_external - F_internal)/M_lumped   # nodal acceleration via PETSc Vec arithmetic, no projection

vel_node = vel_n_node + a_node * dt
u_node = u_n_node + vel_node * dt
set_bc(u_node, bc_u)
vel_n_node = vel_node
u_n_node = u_node

# copy the owned values into u_n and refresh its ghosts so that the next
# assembly of F_internal sees the updated displacement
u_n.vector.array[:] = u_n_node.array
u_n.vector.ghostUpdate(addv=PETSc.InsertMode.INSERT, mode=PETSc.ScatterMode.FORWARD)
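
For completeness, the pieces outside the loop look roughly like this (slightly simplified; names as in the snippet above):

# compiled once, outside the time loop
internal_form = dolfinx.fem.form(ufl.inner(sigma(u_n), ufl.grad(v)) * ufl.dx)

# PETSc work vectors holding the nodal velocity and displacement
u_n_node = u_n.vector.copy()
vel_n_node = vel_n.vector.copy()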

I guess one other thing that could be done here is to take the assemble_vector call out of the loop and instead use some local computations. I am still trying to figure that part out.
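
In the meantime, a smaller step in that direction would be to stop creating a new PETSc vector every iteration and instead allocate it once with create_vector and re-assemble into it in place. A rough sketch (assuming internal_form is the compiled form from above; I have not verified this in parallel):

from dolfinx.fem.petsc import create_vector

# outside the loop: allocate the vector once from the compiled form
F_internal = create_vector(internal_form)

# inside the loop: zero the vector, re-assemble in place, then accumulate ghost contributions
with F_internal.localForm() as loc:
    loc.set(0.0)
assemble_vector(F_internal, internal_form)
F_internal.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE)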