Assembly is very slow in legacy FEniCS / legacy DOLFIN

I defined a function that assembles the weak form:

from fenics import *
import numpy as np
from ufl_legacy import transpose
from mpi4py import MPI
from Compute_DC import compute_DC_constants
from compute_SUPG import compute_tau_supg


def assemble_weak_form(V_fluid, Y, Y_n, dt, A0_f, A1_f, A2_f, A1_sp, A2_sp,
                       K11, K12, K21, K22, A0_s, A1_s, A2_s,
                       markers, beta=1.0):
    """
    Assemble the weak form for FSI problem with 4x4 matrices in parallel
    """
    A = PETScMatrix()
    b = PETScVector()
    # Set up parallel environment
    parameters["linear_algebra_backend"] = "PETSc"
    parameters["ghost_mode"] = "shared_facet"
    parameters["form_compiler"]["optimize"] = True

    # Test and trial functions
    W = TestFunction(V_fluid)
    dY = TrialFunction(V_fluid)

    # Get mesh info
    mesh = V_fluid.mesh()
    n = FacetNormal(mesh)
    h = CellDiameter(mesh)

    # Define measures
    dx = Measure('dx', domain=mesh)
    dx_s = Measure('dx', subdomain_data=markers, subdomain_id=1)
    dS = Measure('dS', domain=mesh)

    # Initialize forms
    a = 0
    L = 0

    # Get gradients
    grad_W = grad(W)
    grad_Y = grad(Y)

    # 1. Mass terms
    a += (1 / dt) * inner(W, dot(A0_f, Y)) * dx
    a -= (1 / dt) * inner(W, dot(A0_f, Y)) * dx_s

    L += (1 / dt) * inner(W, dot(A0_f, Y_n)) * dx
    L -= (1 / dt) * inner(W, dot(A0_f, Y_n)) * dx_s
    L += (1 / dt) * inner(W, dot(A0_s, Y_n)) * dx_s

    # 2. Convective terms
    conv_term = dot(A1_f, grad_Y[:, 0]) + dot(A2_f, grad_Y[:, 1])
    a += inner(W, conv_term) * dx
    a -= inner(W, conv_term) * dx_s
    # 3. Diffusive terms
    F_diff_x = dot(K11, grad_Y[:, 0]) + dot(K12, grad_Y[:, 1])
    F_diff_y = dot(K21, grad_Y[:, 0]) + dot(K22, grad_Y[:, 1])
    a -= inner(grad_W[:, 0], F_diff_x) * dx
    a -= inner(grad_W[:, 1], F_diff_y) * dx
    a += inner(grad_W[:, 0], F_diff_x) * dx_s
    a += inner(grad_W[:, 1], F_diff_y) * dx_s
    # 4. SUPG stabilization
    F_diff_x_n = dot(K11, grad_Y[:, 0]) + dot(K12, grad_Y[:, 1])
    F_diff_y_n = dot(K21, grad_Y[:, 0]) + dot(K22, grad_Y[:, 1])
    div_F_diff = grad(F_diff_x_n)[:, 0] + grad(F_diff_y_n)[:, 1]

    Res_n = dot(A0_f, (Y - Y_n) / dt) + \
            dot(A1_f, grad_Y[:, 0]) + dot(A2_f, grad_Y[:, 1]) - \
            div_F_diff

    tau_SUPG = as_matrix(compute_tau_supg(dt, A1_sp, A2_sp, K11, K12, K21, K22, mesh))

    tau_Res = dot(tau_SUPG, Res_n)
    grad_W_SUPG_x = dot(transpose(A1_f), grad_W[:, 0])
    grad_W_SUPG_y = dot(transpose(A2_f), grad_W[:, 1])
    grad_W_SUPG = grad_W_SUPG_x + grad_W_SUPG_y

    B_st = inner(grad_W_SUPG, tau_Res) * dx
    B_st_s = inner(grad_W_SUPG, tau_Res) * dx_s
    a += (B_st - B_st_s)
    # 5. DC stabilization
    K_DC = compute_DC_constants(h, Res_n, Y)
    a += inner(grad_W, dot(K_DC, grad_Y)) * dx
    a -= inner(grad_W, dot(K_DC, grad_Y)) * dx_s

    # 6. Solid domain terms
    a += inner(W, dot(A1_s, grad_Y[:, 0]) + dot(A2_s, grad_Y[:, 1])) * dx_s

    # 7. Interface terms
    a += beta * h * inner(jump(dot(grad_W, n)), jump(dot(grad(dY), n))) * dS

    # Parallel assembly
    assemble(a, tensor=A)
    assemble(L, tensor=b)

    # Ensure proper synchronization
    A.apply('add')
    b.apply('add')

    return A, b

V_fluid is a mixed finite element space, and I define the coefficient matrices used above with code like the following:

def create_coefficient_matrices(V_p, V_T, p_f, u_f, T_f):
    # R and c_v are gas constants defined elsewhere in the module
    # Define thermodynamic expressions
    rho_f_expr = project(p_f / (R * T_f), V_p)
    v_expr = project(R * T_f / p_f, V_p)
    e_expr = project(c_v * T_f, V_T)
    h_expr = project(e_expr + p_f * v_expr, V_T)
    u_squared = project(0.5 * (u_f[0] ** 2 + u_f[1] ** 2), V_T)
    e_tot_expr = project(e_expr + u_squared, V_T)

    # Define thermodynamic coefficients
    beta_T_expr = project(1.0 / p_f, V_p)
    alpha_p_expr = project(1.0 / T_f, V_p)

    get_A0 = as_matrix([
            [rho_f_expr * beta_T_expr, Constant(0), Constant(0), -rho_f_expr * alpha_p_expr],
            [rho_f_expr * beta_T_expr * u_f[0], rho_f_expr, Constant(0),
             -rho_f_expr * alpha_p_expr * u_f[0]],
            [rho_f_expr * beta_T_expr * u_f[1], Constant(0), rho_f_expr,
             -rho_f_expr * alpha_p_expr * u_f[1]],
            [(e_expr - Constant(1)), rho_f_expr * u_f[0],
             rho_f_expr * u_f[1], Constant(0)]
        ])

I also have to compute the inverse square root of a matrix in the SUPG term, for which I wrote an iterative method by hand, and I also modified the source code to support 4x4 determinants, but the form is very slow to assemble.
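
The hand-written iteration looks roughly like the following (a minimal sketch of a Newton-Schulz iteration built only from UFL matrix products, so it avoids the 4x4 inverse; the normalization constant and iteration count are illustrative placeholders, not the exact values I use):

from fenics import Constant, Identity, dot, sqrt

def inv_sqrt_ufl(M, n_iter=5, scale=Constant(0.1)):
    # Approximate M^(-1/2) with a Newton-Schulz iteration using only
    # matrix products. `scale` must be chosen so the eigenvalues of
    # scale*M lie in (0, 3), otherwise the iteration can diverge.
    I4 = Identity(4)
    A = scale * M      # normalized matrix
    X = I4             # initial guess for A^(-1/2)
    for _ in range(n_iter):
        X = 0.5 * dot(X, 3 * I4 - dot(A, dot(X, X)))
    # undo the normalization: M^(-1/2) = sqrt(scale) * A^(-1/2)
    return sqrt(scale) * X

Every iteration grows the UFL expression tree geometrically, which I suspect is part of why the form becomes so expensive to compile and assemble.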

You should not use projections in this way.
Since the projections are all into the same finite element spaces, you can assemble the left-hand-side (mass) matrices once, and, if you use a direct solver, even factorize them only once, so that every subsequent projection reduces to a fast solve.
See for instance
https://simula-sscp.github.io/SSCP_2024_lectures/L12%20(FEniCS%20Intro)/L03_FEniCS_Darcy.html#advanced-reusable-projection
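
In your case that pattern would look roughly like this (a minimal sketch with made-up names; ReusableProjector is not a FEniCS class, just a small helper you would write yourself):

from fenics import (TrialFunction, TestFunction, Function,
                    inner, dx, assemble, LUSolver)

class ReusableProjector:
    # Assemble (and factorize) the projection mass matrix for V once;
    # every later projection into V is then an rhs assembly plus a cheap solve.
    def __init__(self, V):
        u = TrialFunction(V)
        self.v = TestFunction(V)
        self.V = V
        A = assemble(inner(u, self.v) * dx)
        self.solver = LUSolver(A)  # the factorization can be reused for repeated solves

    def __call__(self, expr):
        b = assemble(inner(expr, self.v) * dx)
        uh = Function(self.V)
        self.solver.solve(uh.vector(), b)
        return uh

You would then create, say, project_p = ReusableProjector(V_p) once, and call rho_f_expr = project_p(p_f / (R * T_f)) every time step, instead of project(...), which re-assembles and re-solves the mass-matrix system from scratch on every call.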

Please note that we do recommend switching to DOLFINx, as we no longer maintain legacy DOLFIN.

Also note that your code does not contain a reproducible example, so the answers you get will be rather generic, as one cannot reproduce the behavior. You have also not made any attempt to pinpoint which operations are slow.
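
A quick way to start pinpointing things in legacy DOLFIN is to wrap the individual steps in Timer objects and print the timing table afterwards, for example (a sketch only; the labels and the choice of what to time are up to you):

from fenics import Timer, list_timings, TimingClear, TimingType

t = Timer("compute tau_SUPG")
tau_SUPG = as_matrix(compute_tau_supg(dt, A1_sp, A2_sp, K11, K12, K21, K22, mesh))
t.stop()

t = Timer("assemble forms")
assemble(a, tensor=A)
assemble(L, tensor=b)
t.stop()

list_timings(TimingClear.keep, [TimingType.wall])

The table also includes DOLFIN's internal stages, so you can at least separate the one-off form-compilation (JIT) cost from the per-timestep assembly cost, and see whether the expensive part is the assembly itself or the expressions you build beforehand.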