Reducing the Number of MPI Communicators When Storing a List of Solutions at Different Time Steps

FEniCS Version: 2019.2.0.dev0

Problem Description:
I am solving the 1D heat equation, u_t - u_{xx} = f(x, t), with a manufactured solution, u(x, t) = \cos(t)\sin(\pi x), using a continuous Galerkin method in space-time. This means the weak form contains integrals over both space and time. I use Gaussian quadrature with Lagrange basis functions to handle the time integration, and FEniCS to handle the spatial integration.
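For reference, with the generalized solution u(x, t) = \cos(at)\sin(b\pi x) used in the code below, the corresponding forcing term (the f1 string in the MWE) is f(x, t) = u_t - u_{xx} = \sin(b\pi x)\,(b^2\pi^2\cos(at) - a\sin(at)); the MWE sets a = b = 1.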

I solve the heat equation and store the solution function at each time step in a list. I do this because I need this information later in my broader code to interpolate the solution at times between the time nodes. I have attached an MWE showing how this is done in my broader code.

Issue:
I have run into an issue where, in a broader code, I run out of MPI communicators, since you can only have 2048 in a single process. I found this thread mentioning this limit when initially investigating the error.

Goal
I want to reduce the number of MPI communicators needed to as few as possible while still generating exactly the same solution list as in the MWE.

MWE

from fenics import *
import numpy as np
set_log_level(LogLevel.WARNING)


def Forward_global_solver(t, u_initial, f, a_con, b_con, q, V, bc, Vh, Vh_true_deg):
    t_start = t[0]
    u_initial.t = t_start

    alpha_init = interpolate(u_initial, Vh)
    alpha_n = interpolate(u_initial, Vh)

    alpha_list = [alpha_init]

    for n in range(len(t) - 1):
        tL = t[n]
        tR = t[n + 1]

        alpha_loc = Forward_local_solver(alpha_n, f, a_con, b_con, tL, tR, q, V, bc, Vh_true_deg)

        for i in range(q):
            if q == 1:
                alpha_list.append(alpha_loc)
            else:
                alpha_list.append(alpha_loc.sub(i, deepcopy=True))
        if q == 1:
            alpha_n.assign(alpha_loc)
        else:
            alpha_n.assign(alpha_loc.sub(q - 1, deepcopy=True))

    return alpha_list


def Forward_local_solver(alpha_n, f, a_con, b_con, tL, tR, q, V, bc, Vh):
    (p, w) = np.polynomial.legendre.leggauss(3 * q)
    # Map the Gauss-Legendre points and weights from [-1, 1] to [tL, tR].
    p = ((tR - tL) / 2.0) * p + (tL + tR) / 2.0
    w = ((tR - tL) / 2.0) * w
    alpha_tri = TrialFunctions(V)
    beta = TestFunctions(V)
    F = Constant(0.0)
    for j in range(q):
        for k in range(len(p)):
            TLagr = Constant(TestLagrange(tL, tR, q, j, p[k]))
            f_k = interpolate(Expression(f, degree=6, t=p[k], a=a_con, b=b_con), Vh)
            p1 = Constant(0.0)
            for i in range(q + 1):
                Lagr = Constant(Lagrange(tL, tR, q, i, p[k]))
                dLagr = Constant(Lagrange_deriv(tL, tR, q, i, p[k]))
                if i == 0:
                    p1 += dLagr * alpha_n * beta[j] + Lagr * alpha_n.dx(0) * beta[j].dx(0)
                else:
                    p1 += dLagr * alpha_tri[i - 1] * beta[j] + Lagr * alpha_tri[i - 1].dx(0) * beta[j].dx(0)
            weight = Constant(w[k])
            F += weight * TLagr * (p1 - f_k * beta[j])
    F = F * dx
    a1, L = lhs(F), rhs(F)
    alpha_tri = Function(V)
    solve(a1 == L, alpha_tri, bc)

    return alpha_tri


def Create_vector_element_function_space(msh, basis, q, deg):
    if q < 1:
        raise Exception('Degree of temporal basis functions must be greater than or equal to 1')
    if q == 1:
        V = FunctionSpace(msh, basis, deg)
    else:
        element = Create_vector_element_multiple_single(q, deg)
        V = FunctionSpace(msh, element)

    return V


def Create_zero_Dirichlet_bc(q, V):
    if q < 1:
        raise Exception('Degree of temporal basis functions must be greater than or equal to 1')
    if q == 1:
        tup = (0.0)
    else:
        tup = ()
        tup_single_system = (0.0,)
        for i in range(q):
            tup = tup + tup_single_system
    u_D = Constant(tup)
    bc = DirichletBC(V, u_D, boundary_D)

    return bc


def boundary_D(x, on_boundary):
    return on_boundary


def Create_vector_element_multiple_single(q, deg):
    P = FiniteElement('CG', interval, deg)
    element_array = []
    for i in range(q):
        element_array.append(P)
    vector_element = MixedElement(element_array)

    return vector_element


# Evaluate the j-th Lagrange basis polynomial on degree+1 equispaced nodes in [initial, final] at x.
def Lagrange(initial, final, degree, j, x):
    I = np.linspace(initial, final, degree + 1)
    z = 1.0
    for k in range(degree + 1):
        if k != j:
            z *= (x - I[k]) / (I[j] - I[k])

    return z


# Evaluate the derivative of the j-th Lagrange basis polynomial at x.
def Lagrange_deriv(initial, final, degree, j, x):
    I = np.linspace(initial, final, degree + 1)
    z = 0.0
    for m in range(degree + 1):
        v = 1.0
        for k in range(degree + 1):
            if k != j and k != m:
                v *= (x - I[k]) / (I[j] - I[k])
        if m != j:
            z += (1.0 / (I[j] - I[m])) * v

    return z


# Evaluate the j-th temporal test basis polynomial (Lagrange polynomial on `degree` equispaced nodes) at x.
def TestLagrange(initial, final, degree, j, x):
    I = np.linspace(initial, final, degree)
    z = 1.0
    if degree > 1:
        for k in range(degree):
            if k != j:
                z *= (x - I[k]) / (I[j] - I[k])

    return z


# Set up constants for time/space meshes, degrees of basis functions.
t0 = 0.0
T = 1.0
N = 5
Nt = N
Nx = N
q1 = 1
deg1 = 1
deg_true = 6

# Create time and space meshes.
t1 = np.linspace(t0, T, Nt + 1)
mesh = UnitIntervalMesh(Nx)

# Create RHS functions and initial conditions for forward problem.
a = 1.0
b = 1.0
u_init = Expression('cos(a*t)*sin(b*pi*x[0])', degree=6, t=t0, a=a, b=b)
f1 = 'sin(b*pi*x[0])*(b*b*pi*pi*cos(a*t) - a*sin(a*t))'

# Create function spaces for non-vector functions.
Vh1 = FunctionSpace(mesh, 'CG', deg1)
Vh3 = FunctionSpace(mesh, 'CG', deg_true)

# Create vector spaces for solutions.
V1 = Create_vector_element_function_space(mesh, 'CG', q1, deg1)

# Create zero Dirichlet boundary conditions.
bc1 = Create_zero_Dirichlet_bc(q1, V1)

# Forward problem.
alpha = Forward_global_solver(t1, u_init, f1, a, b, q1, V1, bc1, Vh1, Vh3)

There are many things you can do in your code to make it more efficient and avoid duplicating MPI communicators, etc.

As your code is quite tedious to parse (you haven’t made it a minimal example that illustrates the issue), I’ll only refer you to previous posts:

and the posts referenced there.
In particular,
https://jsdokken.com/FEniCS23-tutorial/src/form_compilation.html
might be helpful as I go through the idea of creating data outside of loops very carefully.

I apologize about the MWE not actually being an MWE. I have gotten used to a much larger, more unwieldy code base, so what I in the moment thought was “minimal enough” is in fact not “minimal”. Once you pointed it out, I immediately realized how poor my attempt at an MWE was. In response, I have drastically reduced the number of lines in the code, and I hope my second attempt at an MWE is much more reasonable. Unlike the original, this much smaller MWE will now error out with the error I am experiencing, but it takes some time to run: a little under 15 minutes on my Ubuntu 22.04.3 machine with an AMD Ryzen 5800X3D. I have included the error, details and all, below the improved MWE.

I took a look at the links to previous posts that you provided, and the common thread for what the culprit might be is the use of Constant() inside the for loops. Instead, any use of Constant() should be done outside the loop, with the assign() function used to update the value, correct? Does this reduce the MPI communicator overhead?

Improved MWE

from fenics import *
import numpy as np
set_log_level(LogLevel.WARNING)

def boundary_D(x, on_boundary):
    return on_boundary

t1 = np.linspace(0.0, 1.0, 35001)
mesh = UnitIntervalMesh(5)
u_init = Expression('cos(a*t)*sin(b*pi*x[0])', degree=6, t=0.0, a=1.0, b=1.0)
Vh1 = FunctionSpace(mesh, 'CG', 1)
Vh3 = FunctionSpace(mesh, 'CG', 6)
P = FiniteElement('CG', interval, 1)
element = MixedElement([P, P])
V1 = FunctionSpace(mesh, element)
bc1 = DirichletBC(V1, Constant((0.0, 0.0)), boundary_D)
alpha_init = interpolate(u_init, Vh1)
alpha_n = interpolate(u_init, Vh1)
alpha = [alpha_init]
for n in range(len(t1) - 1):
    p = np.ones(6) # placeholder array for quadrature points
    w = np.ones(6) # placeholder array for quadrature weights
    alpha_loc = TrialFunctions(V1)
    beta = TestFunctions(V1)
    F = Constant(0.0)
    for j in range(2):
        for k in range(len(p)):
            c1 = Constant(1.0) # time-dependent. placeholder value used
            f_k = interpolate(Expression('sin(b*pi*x[0])*(b*b*pi*pi*cos(a*t) - a*sin(a*t))', degree=6, t=p[k], a=1.0, b=1.0), Vh3)
            p1 = Constant(0.0)
            for i in range(3):
                c2 = Constant(1.0) # time-dependent. placeholder value used.
                c3 = Constant(1.0) # time-dependent. placeholder value used.
                if i == 0:
                    p1 += c3 * alpha_n * beta[j] + c2 * alpha_n.dx(0) * beta[j].dx(0)
                else:
                    p1 += c3 * alpha_loc[i - 1] * beta[j] + c2 * alpha_loc[i - 1].dx(0) * beta[j].dx(0)
            weight = Constant(w[k])
            F += weight * c1 * (p1 - f_k * beta[j])
    F = F * dx
    a1, L = lhs(F), rhs(F)
    alpha_loc = Function(V1)
    solve(a1 == L, alpha_loc, bc1)
    for i in range(2):
        alpha.append(alpha_loc.sub(i, deepcopy=True))
    alpha_n.assign(alpha_loc.sub(1, deepcopy=True))

Resulting Error

Traceback (most recent call last):
  File "/home/owner/PycharmProjects/pythonProject/MWE.py", line 51, in <module>
  File "/usr/lib/petsc/lib/python3/dist-packages/dolfin/function/function.py", line 550, in sub
    self.cpp_object().sub(i),
RuntimeError: *** Error: Duplication of MPI communicator failed (MPI_Comm_dup

Anything relating to the variational form F should be declared outside of loops if possible.
This is possible in 99% of cases. The only time anything in a variational form should change inside a loop is when you use a new mesh at a time step.

In your example, you should also create placeholder functions and expressions for f_k outside the loop, as you can interpolate into an existing Function:

expr = Expression('sin(b*pi*x[0])*(b*b*pi*pi*cos(a*t) - a*sin(a*t))', degree=6, t=0.0, a=1.0, b=1.0)
f_k = Function(Vh3)
for i in range(10):
    expr.t = i
    f_k.interpolate(expr)

which avoids a ton of extra work converting strings to C++ code and allocating data for a new Function.
Similarly, all constants should be created outside the loop and updated with constant.assign() inside the loop, to avoid creating new objects with new ids that have to be parsed by UFL when generating assembly code.
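For instance, here is a minimal sketch of that pattern; the toy form and coefficient names are only illustrative, not taken from the code above. The Constants and the form are created once, and only their values change inside the loop:

from fenics import *

mesh = UnitIntervalMesh(5)
V = FunctionSpace(mesh, 'CG', 1)
u, v = TrialFunction(V), TestFunction(V)
u_n = interpolate(Expression('sin(pi*x[0])', degree=2), V)

# Created once, outside the loop: one Constant per time-dependent coefficient.
c_mass = Constant(1.0)
c_stiff = Constant(1.0)

# The form only references the Constant objects, so it is compiled once.
F = c_mass * u * v * dx + c_stiff * u.dx(0) * v.dx(0) * dx - u_n * v * dx
a1, L = lhs(F), rhs(F)

u_sol = Function(V)
for n in range(10):
    # Update the values in place; no new UFL objects are created, so nothing
    # has to be re-parsed or re-compiled.
    c_mass.assign(1.0 + 0.1 * n)
    c_stiff.assign(0.5 + 0.1 * n)
    solve(a1 == L, u_sol)
    u_n.assign(u_sol)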

I updated my non-MWE code to use the f_k changes that were recommended and it appears that something might be wrong in my code.

To track the changes in the output, I computed the L2 error norm using errornorm() for the numerical solution at the final time t = 1.0. Before the change, the L2 error was 0.013955854062120387. After the change, the L2 error was 0.05831788622693951. There appears to be some mistake, as my understanding is that we are effectively doing the same thing with the f_k interpolation, just in a smarter way, so I wouldn’t expect such a wild swing in the error.

I did notice a warning in my code (not sure how relevant it is), saying:

Class 'Function' does not define '__mul__', so the '*' operator cannot be used on its instances 

To be clear about how I changed the non-MWE code, I have added an updated MWE in which the structural changes I made are marked with the comment # changed line.

Updated MWE

from fenics import *
import numpy as np
set_log_level(LogLevel.WARNING)


def boundary_D(x, on_boundary):
    return on_boundary


t1 = np.linspace(0.0, 1.0, 50000)
mesh = UnitIntervalMesh(5)
u_init = Expression('cos(a*t)*sin(b*pi*x[0])', degree=6, t=0.0, a=1.0, b=1.0)
Vh1 = FunctionSpace(mesh, 'CG', 1)
Vh3 = FunctionSpace(mesh, 'CG', 6)
P = FiniteElement('CG', interval, 1)
element = MixedElement([P, P])
V1 = FunctionSpace(mesh, element)
bc1 = DirichletBC(V1, Constant((0.0, 0.0)), boundary_D)
alpha_init = interpolate(u_init, Vh1)
alpha_n = interpolate(u_init, Vh1)
alpha = [alpha_init]
f_expr = Expression('sin(b*pi*x[0])*(b*b*pi*pi*cos(a*t) - a*sin(a*t))', degree=6, t=0.0, a=1.0, b=1.0) # changed line
f_k = Function(Vh3) #changed line
for n in range(len(t1) - 1):
    p = np.ones(6)
    w = np.ones(6)
    alpha_loc = TrialFunctions(V1)
    beta = TestFunctions(V1)
    F = Constant(0.0)
    for j in range(2):
        for k in range(len(p)):
            c1 = Constant(1.0)
            f_expr.t = p[k] # changed line
            f_k.interpolate(f_expr) # changed line
            p1 = Constant(0.0)
            for i in range(3):
                c2 = Constant(1.0)
                c3 = Constant(1.0)
                if i == 0:
                    p1 += c3 * alpha_n * beta[j] + c2 * alpha_n.dx(0) * beta[j].dx(0)
                else:
                    p1 += c3 * alpha_loc[i - 1] * beta[j] + c2 * alpha_loc[i - 1].dx(0) * beta[j].dx(0)
            weight = Constant(w[k])
            F += weight * c1 * (p1 - f_k * beta[j])
    F = F * dx
    a1, L = lhs(F), rhs(F)
    alpha_loc = Function(V1)
    solve(a1 == L, alpha_loc, bc1)
    for i in range(2):
        alpha.append(alpha_loc.sub(i, deepcopy=True))
    alpha_n.assign(alpha_loc.sub(1, deepcopy=True))

Start by outputting f_k in the two cases, and see if it differs.
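For example, something like the following quick check works (continuing from the MWE above, where f_k is the interpolated forcing term; on a 1D mesh a Function can be called directly with a coordinate):

for x_pt in (0.25, 0.5, 0.75):
    print(x_pt, f_k(x_pt))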

I printed the values of f_k at x = 0.25, 0.5, and 0.75 right before the i-loop in both approaches. They output the exact same values.

Via some tinkering, I found out that if I only do a single time step, from 0.0 to 1.0, both approaches give the same error value. However, if I do more than one time step, the approaches give different error values.

Could this be some sort of memory bug in my code? It seems weird that it would be a memory issue, though, since evaluating f_k gives the same values in both cases.

Please try to reduce the number of loops and if statements. If you find a smaller example, it is more likely that I will have the bandwidth to help.

After some more experimentation, I was able to reduce the code further and still demonstrate the issue. I added some code to show how the solution at the final time varies depending on whether the new or old method is used. The two lines at the end compute and print the value of the spatial integral of the solution at the final time; this value changes significantly depending on the method used. Uncomment the line marked “original method” while commenting out the line marked “new method” to see the value for the original method. Do the reverse to see the value for the new method.

Updated MWE

from fenics import *
import numpy as np
set_log_level(LogLevel.WARNING)

def boundary_D(x, on_boundary):
    return on_boundary

t1 = np.linspace(0.0, 1.0, 2)
mesh = UnitIntervalMesh(5)
u_init = Expression('cos(t)*sin(pi*x[0])', degree=6, t=0.0)
f_expr = Expression('sin(pi * x[0]) * (pi * pi * cos(t) - sin(t))', degree=6, t=0.0)
V1 = FunctionSpace(mesh, 'CG', 1)
V2 = FunctionSpace(mesh, 'CG', 6)
bc1 = DirichletBC(V1, Constant(0.0), boundary_D)
alpha_n = interpolate(u_init, V1)
alpha = [alpha_n]
f_k = Function(V2)
p = np.array([0.0, 1.0])
alpha_loc = TrialFunction(V1)
beta = TestFunction(V1)
F = Constant(0.0)
for k in range(len(p)):
    f_expr.t = p[k]
    # f_k = interpolate(f_expr, V2)  # original method
    f_k.interpolate(f_expr)         # new method
    F += (alpha_n + alpha_loc - f_k) * beta + (alpha_n + alpha_loc).dx(0) * beta.dx(0)
F = F * dx
a, L = lhs(F), rhs(F)
alpha_loc = Function(V1)
solve(a == L, alpha_loc, bc1)
alpha.append(alpha_loc)
alpha_n.assign(alpha_loc)
integral_alpha_final_time = assemble(alpha[-1] * dx)
print('integral of alpha final time = ', integral_alpha_final_time)

The issue here is that you keep appending terms that reference f_k to F at each step of the k loop.
It is not surprising that the results differ: the original method actually uses a new function for each step, while the new method uses a single f_k, whose values will only be the latest ones set from f_expr.
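A minimal, self-contained illustration of the difference (using a placeholder expression rather than your actual forcing term):

from fenics import *

mesh = UnitIntervalMesh(5)
V = FunctionSpace(mesh, 'CG', 1)
f_expr = Expression('t', degree=0, t=0.0)  # placeholder time-dependent expression

# Original method: interpolate() returns a new Function each call, so the
# first term in the form keeps its own values.
f_a = interpolate(f_expr, V)          # holds 0.0 everywhere
f_expr.t = 1.0
f_b = interpolate(f_expr, V)          # holds 1.0 everywhere
F_old = (f_a + f_b) * dx

# New method: one shared Function, re-interpolated in place; every term in
# the form references the same object, which only keeps the last values set.
f_k = Function(V)
f_expr.t = 0.0
f_k.interpolate(f_expr)
F_new = f_k * dx
f_expr.t = 1.0
f_k.interpolate(f_expr)               # overwrites the values the first term used
F_new += f_k * dx

print(assemble(F_old))  # 0.0 + 1.0 = 1.0
print(assemble(F_new))  # 1.0 + 1.0 = 2.0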

One thing I note with your code is that the interpolation you’re doing isn’t needed, as you could instead use

def f(x, t):
    return sin(pi * x[0]) * (pi * pi * cos(t) - sin(t))

x = SpatialCoordinate(mesh)
for k in range(len(p)):
    F += (alpha_n + alpha_loc - f(x, t=p[k])) * beta + (alpha_n + alpha_loc).dx(0) * beta.dx(0)

which should give you a similar result to the first method, with higher accuracy (as f is now evaluated directly at the quadrature points, instead of being an approximation of the function in a 6th-order space).

I appreciate all the help so far!

As an update, using the function that you defined in my production code (not my MWE) is much slower to compile on the first run than the function approximation, though it is much faster on subsequent runs. However, it threw an error much sooner when I set my temporal mesh to 50000 time steps: it errored at time step 12762, much earlier than with the original attempt at f_k. The error is below.

Also, since it appears that my constants will suffer the same issue as my f_k function (assign() will change all the values in my form F), is there a different method to fix my MPI communicator issues? Or am I stuck with them? I don’t think I can get rid of the k loop, as that is how I am doing my time integration with quadrature.

OpenBLAS blas_thread_init: pthread_create failed for thread 13 of 16: Resource temporarily unavailable
OpenBLAS blas_thread_init: RLIMIT_NPROC 127654 current, 127654 max
OpenBLAS blas_thread_init: pthread_create failed for thread 14 of 16: Resource temporarily unavailable
OpenBLAS blas_thread_init: RLIMIT_NPROC 127654 current, 127654 max
OpenBLAS blas_thread_init: pthread_create failed for thread 15 of 16: Resource temporarily unavailable
OpenBLAS blas_thread_init: RLIMIT_NPROC 127654 current, 127654 max
Traceback (most recent call last):
  File "/home/owner/PycharmProjects/pythonProject/NewAdaptiveHeat.py", line 100, in <module>
    alpha = Forward_global_solver(t1, u_init, f1, a, b, q1, V1, bc1, Vh1, Vh3)
  File "/home/owner/PycharmProjects/pythonProject/NewAdaptiveHeatFunctionFile.py", line 45, in Forward_global_solver
    alpha_loc = Forward_local_solver(alpha_n, f, a, b, tL, tR, q, V, bc, Vh_true_deg)
  File "/home/owner/PycharmProjects/pythonProject/NewAdaptiveHeatFunctionFile.py", line 98, in Forward_local_solver
    solve(a == L, alpha, bc)
  File "/usr/lib/petsc/lib/python3/dist-packages/dolfin/fem/solving.py", line 233, in solve
    _solve_varproblem(*args, **kwargs)
  File "/usr/lib/petsc/lib/python3/dist-packages/dolfin/fem/solving.py", line 268, in _solve_varproblem
    problem = LinearVariationalProblem(eq.lhs, eq.rhs, u, bcs,
  File "/usr/lib/petsc/lib/python3/dist-packages/dolfin/fem/problem.py", line 58, in __init__
    L = Form(L, form_compiler_parameters=form_compiler_parameters)
  File "/usr/lib/petsc/lib/python3/dist-packages/dolfin/fem/form.py", line 43, in __init__
    ufc_form = ffc_jit(form, form_compiler_parameters=form_compiler_parameters,
  File "/usr/lib/petsc/lib/python3/dist-packages/dolfin/jit/jit.py", line 50, in mpi_jit
    return local_jit(*args, **kwargs)
  File "/usr/lib/petsc/lib/python3/dist-packages/dolfin/jit/jit.py", line 100, in ffc_jit
    return ffc.jit(ufl_form, parameters=p)
  File "/usr/lib/python3/dist-packages/ffc/jitcompiler.py", line 217, in jit
    module = jit_build(ufl_object, module_name, parameters)
  File "/usr/lib/python3/dist-packages/ffc/jitcompiler.py", line 130, in jit_build
    module, signature = dijitso.jit(jitable=ufl_object,
  File "/usr/lib/python3/dist-packages/dijitso/jit.py", line 165, in jit
    header, source, dependencies = generate(jitable, name, signature, params["generator"])
  File "/usr/lib/python3/dist-packages/ffc/jitcompiler.py", line 65, in jit_generate
    code_h, code_c, dependent_ufl_objects = compile_object(ufl_object,
  File "/usr/lib/python3/dist-packages/ffc/compiler.py", line 142, in compile_form
    return compile_ufl_objects(forms, "form", object_names,
  File "/usr/lib/python3/dist-packages/ffc/compiler.py", line 190, in compile_ufl_objects
    ir = compute_ir(analysis, prefix, parameters, jit)
  File "/usr/lib/python3/dist-packages/ffc/representation.py", line 182, in compute_ir
    irs = [_compute_integral_ir(fd, form_id, prefix, element_numbers, classnames, parameters, jit)
  File "/usr/lib/python3/dist-packages/ffc/representation.py", line 182, in <listcomp>
    irs = [_compute_integral_ir(fd, form_id, prefix, element_numbers, classnames, parameters, jit)
  File "/usr/lib/python3/dist-packages/ffc/representation.py", line 455, in _compute_integral_ir
    ir = r.compute_integral_ir(itg_data,
  File "/usr/lib/python3/dist-packages/ffc/uflacs/uflacsrepresentation.py", line 43, in compute_integral_ir
    ir = initialize_integral_ir("uflacs", itg_data, form_data, form_id)
  File "/usr/lib/python3/dist-packages/ffc/representationutils.py", line 203, in initialize_integral_ir
    "needs_oriented": needs_oriented_jacobian(form_data),
  File "/usr/lib/python3/dist-packages/ffc/representationutils.py", line 156, in needs_oriented_jacobian
    element = create_element(ufl_element)
  File "/usr/lib/python3/dist-packages/ffc/fiatinterface.py", line 104, in create_element
    elements = _extract_elements(ufl_element)
  File "/usr/lib/python3/dist-packages/ffc/fiatinterface.py", line 307, in _extract_elements
    elements += _extract_elements(sub_element, restriction_domain)
  File "/usr/lib/python3/dist-packages/ffc/fiatinterface.py", line 319, in _extract_elements
    elements += [create_element(ufl_element)]
  File "/usr/lib/python3/dist-packages/ffc/fiatinterface.py", line 100, in create_element
    element = _create_fiat_element(ufl_element)
  File "/usr/lib/python3/dist-packages/ffc/fiatinterface.py", line 198, in _create_fiat_element
    element = ElementClass(fiat_cell, degree)
  File "/usr/lib/python3/dist-packages/FIAT/lagrange.py", line 43, in __init__
    poly_set = polynomial_set.ONPolynomialSet(ref_el, degree)
  File "/usr/lib/python3/dist-packages/FIAT/polynomial_set.py", line 166, in __init__
    vinv = numpy.linalg.inv(v)
  File "<__array_function__ internals>", line 5, in inv
  File "/usr/lib/python3/dist-packages/numpy/linalg/linalg.py", line 545, in inv
    ainv = _umath_linalg.inv(a, signature=signature, extobj=extobj)
KeyboardInterrupt

I would probably use a UnitSquareMesh with quadrilateral elements of size (num_elements, num_time_steps) to represent your PDE, rather than having a unique function, constant, etc. for every time step.
Then u.dx(0) would be the derivative in the x-direction and u.dx(1) would be the temporal derivative for the finite element in question.
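A rough, illustrative sketch of that idea (this is only an outline of the setup, not your full space-time formulation; the form and the boundary/initial conditions are simplified):

from fenics import *

Nx, Nt = 5, 50   # spatial elements, time steps; the time interval here is [0, 1]

# One 2D quadrilateral mesh for the whole space-time slab: x[0] = space, x[1] = time.
st_mesh = UnitSquareMesh.create(Nx, Nt, CellType.Type.quadrilateral)
V = FunctionSpace(st_mesh, 'CG', 1)

u = TrialFunction(V)
v = TestFunction(V)
x = SpatialCoordinate(st_mesh)
f = sin(pi * x[0]) * (pi * pi * cos(x[1]) - sin(x[1]))   # forcing term, with t = x[1]

# u.dx(0) is the spatial derivative, u.dx(1) the temporal one, so a simple
# space-time Galerkin form of u_t - u_xx = f is assembled once for all steps.
a = u.dx(1) * v * dx + u.dx(0) * v.dx(0) * dx
L = f * v * dx

bc_space = DirichletBC(V, Constant(0.0), 'near(x[0], 0.0) || near(x[0], 1.0)')
bc_init = DirichletBC(V, Expression('sin(pi*x[0])', degree=4), 'near(x[1], 0.0)')

u_st = Function(V)
solve(a == L, u_st, [bc_space, bc_init])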