Open
Description
The following code compares the mul!
speed of banded-block-banded matrices to the diagonal. For ChebyshevT
, the diagonal mul!
is mathematically only 3x for X and 9x for Y given the tridagonal-block-tridiagonal structure, but there appears to be a large overhead to it all. Moreover, mul!
with views
may spend a lot of time with the garbage collector and can be slower even though they require less data. I also don't see why mul!
with X
is not 3x faster than that with Y
. Finally, mul!
with an adjoint view is basically unusable (O(n^4) complexity?).
julia> using ClassicalOrthogonalPolynomials, MultivariateOrthogonalPolynomials, LinearAlgebra
julia> T² = RectPolynomial(ChebyshevT(), ChebyshevT());
julia> Jxt = jacobimatrix(Val(1), T²);
julia> Jyt = jacobimatrix(Val(2), T²);
julia> function jacobi_mul!(y, Z, x)
n = blocksize(Z, 1)
ZB1 = Z[Block(1), Block(1)]
k = size(ZB1, 1)
vx = view(x, 1:k)
vy = view(y, 1:k)
mul!(vy, ZB1, vx)
ZB2 = Z[Block(1), Block(2)]
vx = view(x, k+1:(k+size(ZB2, 2)))
mul!(vy, ZB2, vx, 1, 1)
Kstart = 1
Kstop = k
for j in 2:n-1
ZBjm1 = Z[Block(j), Block(j-1)]
vx = view(x, Kstart:Kstop)
ZBj = Z[Block(j), Block(j)]
Kstart = Kstop + 1
Kstop = Kstop + size(ZBj, 2)
vy = view(y, Kstart:Kstop)
mul!(vy, ZBjm1, vx)
vx = view(x, Kstart:Kstop)
mul!(vy, ZBj, vx, 1, 1)
ZBjp1 = Z[Block(j), Block(j+1)]
vx = view(x, (Kstop+1):(Kstop+size(ZBjp1, 2)))
mul!(vy, ZBjp1, vx, 1, 1)
end
ZBnm1 = Z[Block(n), Block(n-1)]
vx = view(x, Kstart:Kstop)
ZBn = Z[Block(n), Block(n)]
Kstart = Kstop + 1
Kstop = Kstop + size(ZBn, 2)
vy = view(y, Kstart:Kstop)
mul!(vy, ZBnm1, vx)
vx = view(x, Kstart:Kstop)
mul!(vy, ZBn, vx, 1, 1)
return y
end
jacobi_mul! (generic function with 1 method)
julia> function jacobi_t_mul!(y, Z, x)
n = blocksize(Z, 1)
ZB1 = Z[Block(1), Block(1)]'
k = size(ZB1, 1)
vx = view(x, 1:k)
vy = view(y, 1:k)
mul!(vy, ZB1, vx)
ZB2 = Z[Block(2), Block(1)]'
vx = view(x, k+1:(k+size(ZB2, 2)))
mul!(vy, ZB2, vx, 1, 1)
Kstart = 1
Kstop = k
for j in 2:n-1
ZBjm1 = Z[Block(j-1), Block(j)]'
vx = view(x, Kstart:Kstop)
ZBj = Z[Block(j), Block(j)]'
Kstart = Kstop + 1
Kstop = Kstop + size(ZBj, 2)
vy = view(y, Kstart:Kstop)
mul!(vy, ZBjm1, vx)
vx = view(x, Kstart:Kstop)
mul!(vy, ZBj, vx, 1, 1)
ZBjp1 = Z[Block(j+1), Block(j)]'
vx = view(x, (Kstop+1):(Kstop+size(ZBjp1, 2)))
mul!(vy, ZBjp1, vx, 1, 1)
end
ZBnm1 = Z[Block(n-1), Block(n)]'
vx = view(x, Kstart:Kstop)
ZBn = Z[Block(n), Block(n)]'
Kstart = Kstop + 1
Kstop = Kstop + size(ZBn, 2)
vy = view(y, Kstart:Kstop)
mul!(vy, ZBnm1, vx)
vx = view(x, Kstart:Kstop)
mul!(vy, ZBn, vx, 1, 1)
return y
end
jacobi_t_mul! (generic function with 1 method)
julia> for n in 50:50:1000
println("n = $n")
X = Jxt[Block.(1:n), Block.(1:n)]
Y = Jyt[Block.(1:n), Block.(1:n)]
DX = Diagonal(X)
x = Vector(X[:, 1])
y = zero(x)
@time mul!(y, X, x)
@time mul!(y, Y, x)
@time mul!(y, DX, x)
@time jacobi_mul!(y, X, x)
@time jacobi_mul!(y, Y, x)
VX = view(X, Block.(n÷2:n), Block.(n÷2:n))
x = Vector(VX[:, 1])
y = zero(x)
@time mul!(y, VX, x)
@time jacobi_mul!(y, VX, x)
@time jacobi_t_mul!(y, VX, x)
if n < 200
@time mul!(y, VX', x)
end
end
n = 50
0.000740 seconds (149 allocations: 9.312 KiB)
0.000273 seconds (149 allocations: 9.312 KiB)
0.000008 seconds
0.000189 seconds (148 allocations: 38.781 KiB)
0.000369 seconds (148 allocations: 100.453 KiB)
0.000096 seconds (2.84 k allocations: 178.906 KiB)
0.000131 seconds (152 allocations: 32.094 KiB)
0.000126 seconds (152 allocations: 32.094 KiB)
0.201908 seconds (1.91 M allocations: 494.370 MiB, 31.93% gc time)
n = 100
0.001709 seconds (299 allocations: 18.688 KiB)
0.000756 seconds (299 allocations: 18.688 KiB)
0.000007 seconds
0.003657 seconds (298 allocations: 139.031 KiB)
0.000647 seconds (298 allocations: 385.594 KiB)
0.000317 seconds (11.29 k allocations: 708.781 KiB)
0.001723 seconds (302 allocations: 110.172 KiB)
0.000451 seconds (302 allocations: 110.172 KiB)
3.623088 seconds (29.29 M allocations: 13.525 GiB, 35.57% gc time)
n = 150
0.002565 seconds (449 allocations: 28.062 KiB)
0.001368 seconds (449 allocations: 28.062 KiB)
0.000012 seconds
0.005007 seconds (448 allocations: 299.688 KiB)
0.001135 seconds (448 allocations: 848.469 KiB)
0.000696 seconds (25.36 k allocations: 1.553 MiB)
0.000674 seconds (452 allocations: 233.438 KiB)
0.000796 seconds (452 allocations: 233.438 KiB)
22.467557 seconds (146.26 M allocations: 91.525 GiB, 38.30% gc time)
n = 200
0.003845 seconds (599 allocations: 37.438 KiB)
0.002454 seconds (599 allocations: 37.438 KiB)
0.000019 seconds
0.001953 seconds (598 allocations: 522.750 KiB)
0.003532 seconds (598 allocations: 1.452 MiB)
0.001222 seconds (45.06 k allocations: 2.757 MiB)
0.001062 seconds (602 allocations: 403.406 KiB)
0.000965 seconds (602 allocations: 403.406 KiB)
n = 250
0.003841 seconds (749 allocations: 46.812 KiB)
0.005091 seconds (749 allocations: 46.812 KiB)
0.000040 seconds
0.002799 seconds (748 allocations: 808.344 KiB)
0.003726 seconds (748 allocations: 2.248 MiB)
0.002525 seconds (70.39 k allocations: 4.304 MiB)
0.002658 seconds (752 allocations: 620.469 KiB)
0.001539 seconds (752 allocations: 620.469 KiB)
n = 300
0.004683 seconds (899 allocations: 56.188 KiB)
0.004331 seconds (899 allocations: 56.188 KiB)
0.000057 seconds
0.004545 seconds (898 allocations: 1.127 MiB)
0.004530 seconds (898 allocations: 3.215 MiB)
0.002720 seconds (101.34 k allocations: 6.195 MiB)
0.003189 seconds (902 allocations: 883.484 KiB)
0.001960 seconds (902 allocations: 883.484 KiB)
n = 350
0.015783 seconds (1.05 k allocations: 65.562 KiB)
0.016362 seconds (1.05 k allocations: 65.562 KiB)
0.000076 seconds
0.008280 seconds (1.05 k allocations: 1.521 MiB)
0.009735 seconds (1.05 k allocations: 4.353 MiB)
0.003792 seconds (137.91 k allocations: 8.428 MiB)
0.002665 seconds (1.05 k allocations: 1.160 MiB)
0.002354 seconds (1.05 k allocations: 1.160 MiB)
n = 400
0.010599 seconds (1.20 k allocations: 74.938 KiB)
0.007430 seconds (1.20 k allocations: 74.938 KiB)
0.000078 seconds
0.006021 seconds (1.20 k allocations: 1.973 MiB)
0.026957 seconds (1.20 k allocations: 5.664 MiB, 70.43% gc time)
0.004860 seconds (180.11 k allocations: 11.006 MiB)
0.003017 seconds (1.20 k allocations: 1.500 MiB)
0.003652 seconds (1.20 k allocations: 1.500 MiB)
n = 450
0.009276 seconds (1.35 k allocations: 84.312 KiB)
0.009503 seconds (1.35 k allocations: 84.312 KiB)
0.000115 seconds
0.006383 seconds (1.35 k allocations: 2.482 MiB)
0.008038 seconds (1.35 k allocations: 7.146 MiB)
0.006218 seconds (227.94 k allocations: 13.927 MiB)
0.005178 seconds (1.35 k allocations: 1.881 MiB)
0.004215 seconds (1.35 k allocations: 1.881 MiB)
n = 500
0.012857 seconds (1.50 k allocations: 93.688 KiB)
0.011674 seconds (1.50 k allocations: 93.688 KiB)
0.000136 seconds
0.008604 seconds (1.50 k allocations: 3.048 MiB)
0.010344 seconds (1.50 k allocations: 8.799 MiB)
0.007599 seconds (281.39 k allocations: 17.191 MiB)
0.005681 seconds (1.50 k allocations: 2.306 MiB)
0.004769 seconds (1.50 k allocations: 2.306 MiB)
n = 550
0.014980 seconds (1.65 k allocations: 103.062 KiB)
0.017946 seconds (1.65 k allocations: 103.062 KiB)
0.000174 seconds
0.010327 seconds (1.65 k allocations: 3.671 MiB)
0.014607 seconds (1.65 k allocations: 10.625 MiB)
0.009169 seconds (340.46 k allocations: 20.798 MiB)
0.006178 seconds (1.65 k allocations: 2.773 MiB)
0.009445 seconds (1.65 k allocations: 2.773 MiB)
n = 600
0.016862 seconds (1.80 k allocations: 112.438 KiB)
0.018070 seconds (1.80 k allocations: 112.438 KiB)
0.000194 seconds
0.012115 seconds (1.80 k allocations: 4.351 MiB)
0.017187 seconds (1.80 k allocations: 12.622 MiB)
0.011016 seconds (405.16 k allocations: 24.748 MiB)
0.009576 seconds (1.80 k allocations: 3.282 MiB)
0.007794 seconds (1.80 k allocations: 3.282 MiB)
n = 650
0.019549 seconds (1.95 k allocations: 121.812 KiB)
0.020473 seconds (1.95 k allocations: 121.812 KiB)
0.000235 seconds
0.013520 seconds (1.95 k allocations: 5.089 MiB)
0.019936 seconds (1.95 k allocations: 14.790 MiB)
0.012865 seconds (475.49 k allocations: 29.042 MiB)
0.009241 seconds (1.95 k allocations: 3.835 MiB)
0.008665 seconds (1.95 k allocations: 3.835 MiB)
n = 700
0.026187 seconds (2.10 k allocations: 131.188 KiB)
0.023385 seconds (2.10 k allocations: 131.188 KiB)
0.000276 seconds
0.018192 seconds (2.10 k allocations: 5.884 MiB)
0.021707 seconds (2.15 k allocations: 17.127 MiB)
0.032264 seconds (551.44 k allocations: 33.679 MiB, 53.71% gc time)
0.010982 seconds (2.10 k allocations: 4.430 MiB)
0.009810 seconds (2.10 k allocations: 4.430 MiB)
n = 750
0.026024 seconds (2.25 k allocations: 140.562 KiB)
0.026325 seconds (2.25 k allocations: 140.562 KiB)
0.000324 seconds
0.075207 seconds (2.25 k allocations: 6.736 MiB, 73.08% gc time)
0.024082 seconds (2.45 k allocations: 19.627 MiB)
0.017025 seconds (633.01 k allocations: 38.659 MiB)
0.012634 seconds (2.25 k allocations: 5.069 MiB)
0.009810 seconds (2.25 k allocations: 5.069 MiB)
n = 800
0.033951 seconds (2.40 k allocations: 149.938 KiB)
0.033742 seconds (2.40 k allocations: 149.938 KiB)
0.000368 seconds
0.027009 seconds (2.40 k allocations: 7.646 MiB)
0.026405 seconds (2.75 k allocations: 22.299 MiB)
0.020540 seconds (720.21 k allocations: 43.984 MiB)
0.035980 seconds (2.40 k allocations: 5.750 MiB, 62.47% gc time)
0.012604 seconds (2.40 k allocations: 5.750 MiB)
n = 850
0.028899 seconds (2.55 k allocations: 159.312 KiB)
0.031767 seconds (2.55 k allocations: 159.312 KiB)
0.000406 seconds
0.039557 seconds (2.55 k allocations: 8.613 MiB, 45.35% gc time)
0.027419 seconds (3.05 k allocations: 25.143 MiB)
0.021903 seconds (813.04 k allocations: 49.651 MiB)
0.031004 seconds (2.55 k allocations: 6.474 MiB, 51.47% gc time)
0.035000 seconds (2.55 k allocations: 6.474 MiB)
n = 900
0.044860 seconds (2.70 k allocations: 168.688 KiB)
0.046641 seconds (2.70 k allocations: 168.688 KiB)
0.000514 seconds
0.026460 seconds (2.70 k allocations: 9.636 MiB)
0.039504 seconds (3.35 k allocations: 28.159 MiB)
0.026057 seconds (911.49 k allocations: 55.661 MiB)
0.015254 seconds (2.70 k allocations: 7.241 MiB)
0.016091 seconds (2.70 k allocations: 7.241 MiB)
n = 950
0.036002 seconds (2.85 k allocations: 178.062 KiB)
0.034280 seconds (2.85 k allocations: 178.062 KiB)
0.000477 seconds
0.026395 seconds (2.85 k allocations: 10.717 MiB)
0.057024 seconds (3.65 k allocations: 31.346 MiB, 38.59% gc time)
0.027497 seconds (1.02 M allocations: 62.015 MiB)
0.021707 seconds (2.85 k allocations: 8.051 MiB, 19.26% gc time)
0.016856 seconds (2.85 k allocations: 8.051 MiB)
n = 1000
0.063660 seconds (3.00 k allocations: 187.438 KiB, 31.73% gc time)
0.038649 seconds (3.00 k allocations: 187.438 KiB)
0.000567 seconds
0.028659 seconds (3.00 k allocations: 11.856 MiB)
0.040442 seconds (3.95 k allocations: 34.705 MiB)
0.081095 seconds (1.13 M allocations: 68.712 MiB, 59.31% gc time)
0.019885 seconds (3.00 k allocations: 8.904 MiB)
0.023380 seconds (3.00 k allocations: 8.904 MiB)
julia>
Metadata
Metadata
Assignees
Labels
No labels