DSP.jl/src/dspbase.jl at d0c90d49862cb5db8756f4050e5fa32adb2cbda3 · JuliaDSP/DSP.jl · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
# This file was formerly a part of Julia. License is MIT: https://julialang.org/license

const SMALL_FILT_CUTOFF = 66

"""
    filt(b::Union{AbstractVector,Number},
         a::Union{AbstractVector,Number},
         x::AbstractArray)

Apply filter described by vectors `a` and `b` to vector `x`.

Inputs that are `Number`s are treated as one-element `Vector`s.
"""
filt(b::Union{AbstractVector, Number}, a::Union{AbstractVector, Number}, x::AbstractArray{T}) where {T} =
    filt!(similar(x, promote_type(eltype(b), eltype(a), T)), b, a, x)

# in-place filtering: returns results in the out argument, which may shadow x
# (and does so by default)

"""
    filt!(out, b, a, x)

Same as [`filt`](@ref) but writes the result into the `out` argument, which may
alias the input `x` to modify it in-place.
"""
function filt!(out::AbstractArray, b::Union{AbstractVector, Number}, a::Union{AbstractVector, Number},
               x::AbstractArray{T}) where {T}
    isempty(b) && throw(ArgumentError("filter vector b must be non-empty"))
    isempty(a) && throw(ArgumentError("filter vector a must be non-empty"))
    a[1] == 0  && throw(ArgumentError("filter vector a[1] must be nonzero"))
    if size(x) != size(out)
        throw(ArgumentError(lazy"output size $(size(out)) must match input size $(size(x))"))
    end

    as = length(a)
    bs = length(b)
    sz = max(as, bs)

    iszero(size(x, 1)) && return out
    isone(sz) && return (k = b[1] / a[1]; @noinline mul!(out, x, k)) # Simple scaling without memory

    # Filter coefficient normalization
    if !isone(a[1])
        norml = a[1]
        a = @noinline broadcast(/, a, norml)
        b = @noinline broadcast(/, b, norml)
    end

    si = Vector{promote_type(eltype(b), eltype(a), T)}(undef, sz - 1)

    if as == 1 && bs <= SMALL_FILT_CUTOFF
        fill!(si, zero(eltype(si)))
        _small_filt_fir!(out, b, x, si, Val(bs))
    else
        for col in CartesianIndices(axes(x)[2:end])
            # Reset the filter state
            fill!(si, zero(eltype(si)))
            if as > 1
                _filt_iir!(out, b, a, x, si, col)
            else
                _filt_fir!(out, b, x, si, col)
            end
        end
    end
    return out
end

# Transposed direct form II
function _filt_iir!(out, b, a, x, si, col)
    silen = length(si)
    @inbounds for i in axes(x, 1)
        xi = x[i, col]
        val = muladd(xi, b[1], si[1])
        out[i, col] = val
        for j in 1:min(length(a), length(b), silen) - 1
            si[j] = muladd(val, -a[j+1], muladd(xi, b[j+1], si[j+1]))
        end
        if length(a) == length(b)
            si[silen] = muladd(xi, b[silen+1], -a[silen+1]*val)
        elseif length(a) > length(b)
            for j in length(b):silen-1
                si[j] = muladd(val, -a[j+1], si[j+1])
            end
            si[silen] = -a[silen+1]*val
        else
            for j in length(a):silen-1
                si[j] = muladd(xi, b[j+1], si[j+1])
            end
            si[silen] = xi*b[silen+1]
        end
    end
end

# Transposed direct form II
function _filt_fir!(out, b, x, si, col)
    silen = length(si)
    @inbounds for i in axes(x, 1)
        xi = x[i, col]
        out[i, col] = muladd(xi, b[1], si[1])
        for j=1:(silen-1)
            si[j] = muladd(xi, b[j+1], si[j+1])
        end
        si[silen] = b[silen+1] * xi
    end
end

#
# filt implementation for FIR filters
#

### NOTE ###
# Fragile because of the impact of @inbounds and checkbounds
# on the effects system

const SMALL_FILT_VECT_CUTOFF = 19

# Transposed direct form II
@generated function _filt_fir!(out, b::NTuple{N,T}, x, siarr, col, ::Val{StoreSI}) where {N,T,StoreSI}
    silen = N - 1
    si_end = Symbol(:si_, silen)

    quote
        N <= SMALL_FILT_VECT_CUTOFF && checkbounds(siarr, $silen)
        Base.@nextract $silen si siarr
        VECTORIZE_LARGER = N > SMALL_FILT_VECT_CUTOFF
        for i in axes(x, 1)
            xi = x[i, col]
            val = muladd(xi, b[1], si_1)
            if VECTORIZE_LARGER
                @inbounds out[i, col] = val
            end
            Base.@nexprs $(silen - 1) j -> (si_j = muladd(xi, b[j+1], si_{j + 1}))
            $si_end = xi * b[N]
            if !VECTORIZE_LARGER
                out[i, col] = val
            end
        end
        if StoreSI
            Base.@nexprs $silen j -> siarr[j] = si_j
        end
        return nothing
    end
end

# Convert array filter tap input to tuple for small-filtering
function _small_filt_fir!(
    out::AbstractArray, h::AbstractVector, x::AbstractArray,
        si::AbstractVector, ::Val{bs}) where {bs}

    bs < 2 && throw(ArgumentError("invalid tuple size"))
    length(h) != bs && throw(ArgumentError("length(h) does not match bs"))
    b = ntuple(j -> h[j], Val(bs))
    for col in CartesianIndices(axes(x)[2:end])
        _filt_fir!(out, b, x, si, col, Val(false))
    end
end

"""
    deconv(b,a) -> c

Construct vector `c` such that `b = conv(a,c) + r`.
Equivalent to polynomial division.
"""
function deconv(b::StridedVector{T}, a::StridedVector{T}) where T
    lb = size(b,1)
    la = size(a,1)
    if lb < la
        return [zero(T)]
    end
    lx = lb-la+1
    x = zeros(T, lx)
    x[1] = 1
    filt(b, a, x)
end


"""
    _zeropad!(padded::AbstractVector,
              u::AbstractVector,
              padded_axes = axes(padded),
              data_dest::Tuple = (1,),
              data_region = CartesianIndices(u))

Place the portion of `u` specified by `data_region` into `padded`, starting at
location `data_dest`. Sets the rest of `padded` to zero. This will mutate
`padded`. `padded_axes` must correspond to the axes of `padded`.

"""
@inline function _zeropad!(
    padded::AbstractVector,
    u::AbstractVector,
    padded_axes = axes(padded),
    data_dest::Tuple = (first(padded_axes[1]),),
    data_region = CartesianIndices(u),
)
    datasize = length(data_region)
    # Use axes to accommodate arrays that do not start at index 1
    data_first_i = first(data_region)[1]
    dest_first_i = data_dest[1]
    copyto!(padded, dest_first_i, u, data_first_i, datasize)
    padded[first(padded_axes[1]):dest_first_i - 1] .= 0
    padded[dest_first_i + datasize : end] .= 0

    padded
end

@inline function _zeropad!(
    padded::AbstractArray,
    u::AbstractArray,
    padded_axes = axes(padded),
    data_dest::Tuple = first.(padded_axes),
    data_region = CartesianIndices(u),
)
    fill!(padded, zero(eltype(padded)))
    dest_axes = UnitRange.(data_dest, data_dest .+ size(data_region) .- 1)
    dest_region = CartesianIndices(dest_axes)
    copyto!(padded, dest_region, u, data_region)

    padded
end

"""
    _zeropad(u, padded_size, [data_dest, data_region])

Creates and returns a new base-1 index array of size `padded_size`, with the
section of `u` specified by `data_region` copied into the region of the new
 array as specified by `data_dest`. All other values will be initialized to
 zero.

If either `data_dest` or `data_region` is not specified, then the defaults
described in [`_zeropad!`](@ref) will be used.
"""
function _zeropad(u, padded_size, args...)
    padded = similar(u, padded_size)
    _zeropad!(padded, u, axes(padded), args...)
end

function _zeropad_keep_offset(u, padded_size, u_axes, args...)
    ax_starts = first.(u_axes)
    new_axes = UnitRange.(ax_starts, ax_starts .+ padded_size .- 1)
    _zeropad!(similar(u, new_axes), u, args...)
end

function _zeropad_keep_offset(
    u, padded_size, ::NTuple{<:Any, Base.OneTo{Int}}, args...
)
    _zeropad(u, padded_size, args...)
end

"""
    _zeropad_keep_offset(u, padded_size, [data_dest, dat_region])

Like [`_zeropad`](@ref), but retains the first index of `u` when creating a new
array.
"""
function _zeropad_keep_offset(u, padded_size, args...)
    _zeropad_keep_offset(u, padded_size, axes(u), args...)
end

"""
Estimate the number of floating point multiplications per output sample for an
overlap-save algorithm with fft size `nfft`, and filter size `nb`.
"""
os_fft_complexity(nfft, nb) =  (nfft * log2(nfft) + nfft) / (nfft - nb + 1)

"""
Determine the length of FFT that minimizes the number of multiplications per
output sample for an overlap-save convolution of vectors of size `nb` and `nx`.
"""
function optimalfftfiltlength(nb, nx)
    nfull = nb + nx - 1

    # Step through possible nffts and find the nfft that minimizes complexity
    # Assumes that complexity is convex
    first_pow2 = ceil(Int, log2(nb))
    max_pow2 = ceil(Int, log2(nfull))
    prev_complexity = os_fft_complexity(2 ^ first_pow2, nb)
    pow2 = first_pow2 + 1
    while pow2 <= max_pow2
        new_complexity = os_fft_complexity(2 ^ pow2, nb)
        new_complexity > prev_complexity && break
        prev_complexity = new_complexity
        pow2 += 1
    end
    nfft = pow2 > max_pow2 ? 2 ^ max_pow2 : 2 ^ (pow2 - 1)

    if nfft > nfull
        # If nfft > nfull, then it's better to find next fast power
        nfft = nextfastfft(nfull)
    end

    nfft
end

"""
Prepare buffers and FFTW plans for convolution. The two buffers, tdbuff and
fdbuff may be an alias of each other, because complex convolution only requires
one buffer. The plans are mutating where possible, and the inverse plan is
unnormalized.
"""
@inline function os_prepare_conv(u::AbstractArray{T, N},
                                 nffts) where {T<:Real, N}
    tdbuff = similar(u, nffts)
    bufsize = ntuple(i -> i == 1 ? nffts[i] >> 1 + 1 : nffts[i], N)
    fdbuff = similar(u, Complex{T}, NTuple{N, Int}(bufsize))

    p = plan_rfft(tdbuff)
    ip = plan_brfft(fdbuff, nffts[1])

    tdbuff, fdbuff, p, ip
end

@inline function os_prepare_conv(u::AbstractArray{<:Complex}, nffts)
    buff = similar(u, nffts)

    p = plan_fft!(buff)
    ip = inv(p).p

    buff, buff, p, ip # Only one buffer for complex
end

"""
Transform the smaller convolution input to frequency domain, and return it in a
new array. However, the contents of `buff` may be modified.
"""
@inline function os_filter_transform!(buff::AbstractArray{<:Real}, p)
    p * buff
end

@inline function os_filter_transform!(buff::AbstractArray{<:Complex}, p!)
    copy(p! * buff) # p operates in place on buff
end

"""
Take a block of data, and convolve it with the smaller convolution input. This
may modify the contents of `tdbuff` and `fdbuff`, and the result will be in
`tdbuff`.
"""
@inline function os_conv_block!(tdbuff::AbstractArray{<:Real},
                                fdbuff::AbstractArray,
                                filter_fd,
                                p,
                                ip)
    mul!(fdbuff, p, tdbuff)
    fdbuff .*= filter_fd
    mul!(tdbuff, ip, fdbuff)
end

"Like the real version, but only operates on one buffer"
@inline function os_conv_block!(buff::AbstractArray{<:Complex},
                                ::AbstractArray, # Only one buffer for complex
                                filter_fd,
                                p!,
                                ip!)
    p! * buff # p! operates in place on buff
    buff .*= filter_fd
    ip! * buff # ip! operates in place on buff
end

# Used by `unsafe_conv_kern_os!` to handle blocks of input data that need to be padded.
#
# For a given number of edge dimensions, convolve all regions along the
# perimeter that have that number of edge dimensions
#
# For a 3d cube, if n_edges = 1, this means all faces. If n_edges = 2, then
# all edges. Finally, if n_edges = 3, then all corners.
#
# This needs to be a separate function for subsets to generate tuple elements,
# which is only the case if the number of edges is a Val{n} type. Iterating over
# the number of edges with Val{n} is inherently type unstable, so this function
# boundary allows dispatch to make efficient code for each number of edge
# dimensions.
function unsafe_conv_kern_os_edge!(
    # These arrays and buffers will be mutated
    out::AbstractArray{<:Any, N},
    tdbuff,
    fdbuff,
    perimeter_range,
    # Number of edge dimensions to pad and convolve
    n_edges::Val,
    # Data to be convolved
    u,
    filter_fd,
    # FFTW plans
    p,
    ip,
    # Sizes, ranges, and other pre-calculated constants
    #
    ## ranges listing center and edge blocks
    edge_ranges,
    center_block_ranges,
    ## size and axis information
    all_dims, # 1:N
    su,
    u_start,
    sv,
    nffts,
    out_start,
    out_stop,
    save_blocksize,
    sout_deficit, # How many output samples are missing if nffts > sout
    tdbuff_axes,
) where N
    # Iterate over all possible combinations of edge dimensions for a number of
    # edges
    #
    # For a 3d cube with n_edges = 1, this will specify the top and bottom faces
    # (edge_dims = (1,)), then the left and right faces (edge_dims = (2,)), then
    # the front and back faces (edge_dims = (3,))
    for edge_dims in subsets(all_dims, n_edges)
        # Specify a region on the perimeter by combining an edge block index for
        # each dimension on an edge, and the central blocks for dimensions not
        # on an edge.
        #
        # First make all entries equal to the center blocks:
        copyto!(perimeter_range, 1, center_block_ranges, 1, N)

        # For the dimensions chosen to be on an edge (edge_dims), get the
        # ranges of the blocks that would need to be padded (lie on an edge)
        # in that dimension.
        #
        # There can be one to two such ranges for each dimension, because with
        # some inputs sizes the whole convolution is just one range
        # (one edge block), or the padding will be needed at both the leading
        # and trailing side of that dimension
        selected_edge_ranges = getindex.(Ref(edge_ranges), edge_dims)

        # Visit each combination of edge ranges for the edge dimensions chosen.
        # For a 3d cube with n_edges = 1 and edge_dims = (1,), this will visit
        # the top face, and then the bottom face.
        for perimeter_edge_ranges in Iterators.ProductIterator(selected_edge_ranges)
            # The center region for non-edge dimensions has been specified above,
            # so finish specifying the region of the perimeter for this edge
            # block
            for (i, dim) in enumerate(edge_dims)
                perimeter_range[dim] = perimeter_edge_ranges[i]
            end

            # Region of block indices, not data indices!
            block_region = CartesianIndices(
                NTuple{N, UnitRange{Int}}(perimeter_range)
            )
            for block_pos in block_region
                # Figure out which portion of the input data should be transformed

                block_idx = convert(NTuple{N, Int}, block_pos)
                ## data_offset is NOT the beginning of the region that will be
                ## convolved, but is instead the beginning of the unaliased data.
                data_offset = save_blocksize .* (block_idx .- 1)
                ## How many zeros will need to be added before the data
                pad_before = max.(0, sv .- data_offset .- 1)
                data_ideal_stop = data_offset .+ save_blocksize
                ## How many zeros will need to be added after the data
                pad_after = max.(0, data_ideal_stop .- su)

                ## Data indices, not block indices
                data_region = CartesianIndices(
                    UnitRange.(
                        u_start .+ data_offset .- sv .+ pad_before .+ 1,
                        u_start .+ data_ideal_stop .- pad_after .- 1
                    )
                )

                # Convolve portion of input

                _zeropad!(tdbuff, u, tdbuff_axes, pad_before .+ 1, data_region)
                os_conv_block!(tdbuff, fdbuff, filter_fd, p, ip)

                # Save convolved result to output

                block_out_stop = min.(
                    out_start .+ data_offset .+ save_blocksize .- 1,
                    out_stop
                )
                block_out_region = CartesianIndices(
                    UnitRange.(out_start .+ data_offset, block_out_stop)
                )
                ## If the input could not fill tdbuff, account for that before
                ## copying the convolution result to the output
                u_deficit = max.(0, pad_after .- sv .+ 1)
                valid_buff_region = CartesianIndices(
                    UnitRange.(sv, nffts .- u_deficit .- sout_deficit)
                )
                copyto!(out, block_out_region, tdbuff, valid_buff_region)
            end
        end
    end
end

# Assumes u is larger than, or the same size as, v
# nfft should be greater than or equal to 2*sv-1
function unsafe_conv_kern_os!(out,
                        output_indices,
                        u::AbstractArray{<:Any, N},
                        v,
                        nffts) where N
    sout = size(out)
    su = size(u)
    sv = size(v)
    u_start = first.(axes(u))
    out_start = Tuple(first(output_indices))
    out_stop = Tuple(last(output_indices))
    ideal_save_blocksize = nffts .- sv .+ 1
    # Number of samples that are "missing" if the output is smaller than the
    # valid portion of the convolution
    sout_deficit = max.(0, ideal_save_blocksize .- sout)
    # Size of the valid portion of the convolution result
    save_blocksize = ideal_save_blocksize .- sout_deficit
    nblocks = cld.(sout, save_blocksize)

    # Pre-allocation
    tdbuff, fdbuff, p, ip = os_prepare_conv(out, nffts)
    tdbuff_axes = axes(tdbuff)

    # Transform the smaller filter
    _zeropad!(tdbuff, v)
    filter_fd = os_filter_transform!(tdbuff, p)
    filter_fd .*= 1 / prod(nffts) # Normalize once for brfft

    # block indices for center blocks, which need no padding
    first_center_blocks = cld.(sv .- 1, save_blocksize) .+ 1
    last_center_blocks = fld.(su, save_blocksize)
    center_block_ranges = UnitRange.(first_center_blocks, last_center_blocks)

    # block index ranges for blocks that need to be padded
    # Corresponds to the leading and trailing side of a dimension, or if there
    # are no center blocks, corresponds to the whole dimension
    edge_ranges = map(nblocks, first_center_blocks, last_center_blocks) do nblock, firstfull, lastfull
        lastfull > 1 ? [1:firstfull - 1, lastfull + 1 : nblock] : [1:nblock]
    end
    all_dims = 1:N
    val_dims = ntuple(Val, Val(N))
    # Buffer to store ranges of indices for a single region of the perimeter
    perimeter_range = Vector{UnitRange{Int}}(undef, N)

    # Convolve all blocks that require padding.
    #
    # This is accomplished by dividing the perimeter of the volume into
    # subsections, where the division is done by the number of edge dimensions.
    # For a 3d cube, this convolves the perimeter in the following order:
    #
    # Number of Edge Dimensions | Convolved Region
    # --------------------------+-----------------
    #                         1 | Faces of Cube
    #                         2 | Edges of Cube
    #                         3 | Corners of Cube
    #
    for n_edges in val_dims
        unsafe_conv_kern_os_edge!(
            # These arrays and buffers will be mutated
            out,
            tdbuff,
            fdbuff,
            perimeter_range,
            # Number of edge dimensions to pad and convolve
            n_edges,
            # Data to be convolved
            u,
            filter_fd,
            # FFTW plans
            p,
            ip,
            # Sizes, ranges, and other pre-calculated constants
            #
            ## ranges listing center and edge blocks
            edge_ranges,
            center_block_ranges,
            ## size and axis information
            all_dims, # 1:N
            su,
            u_start,
            sv,
            nffts,
            out_start,
            out_stop,
            save_blocksize,
            sout_deficit,
            tdbuff_axes) # How many output samples are missing if nffts > sout
    end

    tdbuff_region = CartesianIndices(tdbuff)
    # Portion of buffer with valid result of convolution
    valid_buff_region = CartesianIndices(UnitRange.(sv, nffts))
    # Iterate over block indices (not data indices) that do not need to be padded
    for block_pos in CartesianIndices(center_block_ranges)
        # Calculate portion of data to transform

        block_idx = convert(NTuple{N, Int}, block_pos)
        ## data_offset is NOT the beginning of the region that will be
        ## convolved, but is instead the beginning of the unaliased data.
        data_offset = save_blocksize .* (block_idx .- 1)
        data_stop = data_offset .+ save_blocksize
        data_region = CartesianIndices(
            UnitRange.(u_start .+ data_offset .- sv .+ 1, u_start .+ data_stop .- 1)
        )

        # Convolve this portion of the data

        copyto!(tdbuff, tdbuff_region, u, data_region)
        os_conv_block!(tdbuff, fdbuff, filter_fd, p, ip)

        # Save convolved result to output

        block_out_region = CartesianIndices(
            UnitRange.(data_offset .+ out_start, data_stop .+ out_start .- 1)
        )
        copyto!(out, block_out_region, tdbuff, valid_buff_region)
    end

    out
end

function _conv_kern_fft!(out::AbstractArray{T, N},
                         output_indices,
                         u::AbstractArray{<:Real, N},
                         v::AbstractArray{<:Real, N}) where {T<:Real, N}
    outsize = size(output_indices)
    nffts = nextfastfft(outsize)
    padded = _zeropad!(similar(u, T, nffts), u)
    p = plan_rfft(padded)
    uf = p * padded
    _zeropad!(padded, v)
    vf = p * padded
    uf .*= vf
    raw_out = irfft(uf, nffts[1])
    copyto!(out,
            output_indices,
            raw_out,
            CartesianIndices(outsize))
end
function _conv_kern_fft!(out::AbstractArray{T}, output_indices, u, v) where {T}
    outsize = size(output_indices)
    nffts = nextfastfft(outsize)
    upad = _zeropad!(similar(u, T, nffts), u)
    vpad = _zeropad!(similar(v, T, nffts), v)
    p! = plan_fft!(upad)
    ip! = inv(p!)
    p! * upad # Operates in place on upad
    p! * vpad
    upad .*= vpad
    ip! * upad
    copyto!(out,
            output_indices,
            upad,
            CartesianIndices(outsize))
end

function _conv_td!(out, output_indices, u::AbstractArray{<:Number, N}, v::AbstractArray{<:Number, N}) where {N}
    index_offset = first(CartesianIndices(u)) + first(CartesianIndices(v)) - first(output_indices)
    checkbounds(out, output_indices)
    fill!(out, zero(eltype(out)))
    if size(u, 1) ≤ size(v, 1) # choose more efficient iteration order
        for m in CartesianIndices(u), n in CartesianIndices(v)
            @inbounds out[n+m - index_offset] = muladd(u[m], v[n], out[n+m - index_offset])
        end
    else
        for n in CartesianIndices(v), m in CartesianIndices(u)
            @inbounds out[n+m - index_offset] = muladd(u[m], v[n], out[n+m - index_offset])
        end
    end
    return out
end

# whether the given axis are to be considered to carry an offset for `conv!` and `conv`
conv_axis_with_offset(::Base.OneTo) = false
conv_axis_with_offset(a::Any) = throw(ArgumentError(lazy"unsupported axis type $(typeof(a))"))

function conv_axes_with_offset(as::Tuple...)
    with_offset = ((map(a -> map(conv_axis_with_offset, a), as)...)...,)
    if !allequal(with_offset)
        throw(ArgumentError("cannot mix offset and non-offset axes"))
    end
    return !isempty(with_offset) && first(with_offset)
end

const FFTTypes = Union{Float32, Float64, ComplexF32, ComplexF64}

"""
    conv!(out, u, v; algorithm=:auto)

Convolution of two arrays `u` and `v` with the result stored in `out`. `out`
must be large enough to store the entire result; if it is even larger, the
excess entries will be zeroed.

`out`, `u`, and `v` can be N-dimensional arrays, with arbitrary indexing
offsets. If none of them has offset axes,
`size(out,d) ≥ size(u,d) + size(v,d) - 1` must hold. If both input and output
have offset axes, `firstindex(out,d) ≤ firstindex(u,d) + firstindex(v,d)` and
`lastindex(out,d) ≥ lastindex(u,d) + lastindex(v,d)` must hold (for d = 1,...,N).
A mix of offset and non-offset axes is not permitted.

The `algorithm` keyword allows choosing the algorithm to use:
* `:direct`: Evaluates the convolution sum in time domain.
* `:fft_simple`: Evaluates the convolution as a product in the frequency domain.
* `:fft_overlapsave`: Evaluates the convolution block-wise as a product in the
  frequency domain, overlapping the resulting blocks.
* `:fft`: Selects the faster of `:fft_simple` and `:fft_overlapsave` (as
   estimated from the input size).
* `:fast`: Selects the fastest of `:direct`, `:fft_simple` and
  `:fft_overlapsave` (as estimated from the input size).
* `:auto` (default): Equivalent to `:fast` if the data type is known to be
  suitable for FFT-based computation, equivalent to `:direct` otherwise.

!!! warning
    The choices made by `:fft`, `:fast`, and `:auto` are based on performance
    heuristics which may not result in the fastest algorithm in all cases. If
    best performance for a certain size/type combination is required, it is
    advised to do individual benchmarking and explicitly specify the desired
    algorithm.
"""
function conv!(
    out::AbstractArray{T, N},
    u::AbstractArray{<:Number, N},
    v::AbstractArray{<:Number, N};
    algorithm=:auto
) where {T<:Number, N}
    offset = conv_axes_with_offset(axes(out), axes(u), axes(v)) ? 0 : 1
    output_indices = CartesianIndices(map(axes(out), axes(u), axes(v)) do ao, au, av
        return (first(au)+first(av) : last(au)+last(av)) .- offset
    end)

    if algorithm === :auto
        algorithm = T <: FFTTypes ? :fast : :direct
    end
    if algorithm === :fast
        if length(u) * length(v) < 2^16 # TODO: better heuristic
            algorithm = :direct
        else
            algorithm = :fft
        end
    end
    if algorithm === :direct || any(isempty, (u, v)) # ensure correct handling of empty arrays
        return _conv_td!(out, output_indices, u, v)
    else
        if output_indices != CartesianIndices(out)
            fill!(out, zero(eltype(out)))
        end
        os_nffts = length(u) >= length(v) ? map(optimalfftfiltlength, size(v), size(u)) : map(optimalfftfiltlength, size(u), size(v))
        if algorithm === :fft
            if any(os_nffts .< size(output_indices))
                algorithm = :fft_overlapsave
            else
                algorithm = :fft_simple
            end
        end
        if algorithm === :fft_overlapsave
            # v should be smaller than u for good performance
            if length(u) >= length(v)
                return unsafe_conv_kern_os!(out, output_indices, u, v, os_nffts)
            else
                return unsafe_conv_kern_os!(out, output_indices, v, u, os_nffts)
            end
        elseif algorithm === :fft_simple
            return _conv_kern_fft!(out, output_indices, u, v)
        else
            throw(ArgumentError("algorithm must be :auto, :fast, :direct, :fft, :fft_simple, or :fft_overlapsave"))
        end
    end
end

function conv_output_axes(au::Tuple, av::Tuple)
    if conv_axes_with_offset(au, av)
        return map((au, av) -> first(au)+first(av):last(au)+last(av), au, av)
    else
        return map((au, av) -> Base.OneTo(last(au) + last(av) - 1), au, av)
    end
end

"""
    conv(u, v; algorithm)

Convolution of two arrays. A convolution algorithm is automatically chosen among
direct convolution, FFT, or FFT overlap-save, depending on the size of the
input, unless explicitly specified with the `algorithm` keyword argument; see
[`conv!`](@ref) for details.
"""
function conv(
    u::AbstractArray{Tu, N}, v::AbstractArray{Tv, N};  kwargs...
) where {Tu<:Number, Tv<:Number, N}
    T = promote_type(Tu, Tv)
    out_axes = conv_output_axes(axes(u), axes(v))
    out = similar(u, T, out_axes)
    return conv!(out, u, v; kwargs...)
end

function conv(A::AbstractArray{<:Number, M},
              B::AbstractArray{<:Number, N}; kwargs...) where {M, N}
    if (M < N)
        conv(cat(A, dims=N)::AbstractArray{eltype(A), N}, B; kwargs...)
    else
        @assert M > N
        conv(A, cat(B, dims=M)::AbstractArray{eltype(B), M}; kwargs...)
    end
end

"""
    conv(u,v,A)

2-D convolution of the matrix `A` with the 2-D separable kernel generated by
the vector `u` and row-vector `v`.
Uses 2-D FFT algorithm.
"""
function conv(u::AbstractVector{T}, v::Transpose{T,<:AbstractVector}, A::AbstractMatrix{T}) where T
    # Arbitrary indexing offsets not implemented
    if any(conv_axis_with_offset, (axes(u)..., axes(v)..., axes(A)...))
        throw(ArgumentError("offset axes not supported"))
    end
    m = length(u) + size(A, 1) - 1
    n = length(v) + size(A, 2) - 1
    B = zeros(T, m, n)
    B[CartesianIndices(A)] = A
    u = fft(_zeropad(u, m))
    vt = fft(_zeropad(transpose(v), n))
    C = ifft(fft(B) .*= u .* transpose(vt))
    if T <: Real
        return real(C)
    end
    return C
end


dsp_reverse(v::AbstractVector, ::Tuple{Base.OneTo}) = reverse(v)
function dsp_reverse(v::AbstractVector, vaxes::Tuple{AbstractUnitRange})
    vsize = length(v)
    reflected_start = - first(only(vaxes)) - vsize + 1
    reflected_axes = (reflected_start : reflected_start + vsize - 1,)
    out = similar(v, reflected_axes)
    copyto!(out, reflected_start, Iterators.reverse(v), 1, vsize)
end


"""
    xcorr(u; padmode::Symbol=:none, scaling::Symbol=:none)
    xcorr(u, v; padmode::Symbol=:none, scaling::Symbol=:none)

With two arguments, compute the cross-correlation of two vectors, by calculating
the similarity between `u` and `v` with various offsets of `v`. Delaying `u`
relative to `v` will shift the result to the right. If one argument is provided,
calculate `xcorr(u, u; kwargs...)`.

The size of the output depends on the `padmode` keyword argument: with `padmode =
:none` the length of the result will be `length(u) + length(v) - 1`, as with `conv`.
With `padmode = :longest`, the shorter of the arguments will be padded so they
are of equal length. This gives a result with length `2*max(length(u), length(v))-1`,
with the zero-lag condition at the center.

The keyword argument `scaling` can be provided. Possible arguments are the default
`:none` and `:biased`. `:biased` is valid only if the vectors have the same length,
or only one vector is provided, dividing the result by `length(u)`.

!!! note

    In this package, `xcorr` conjugates the second argument `v`, choosing the same
    convention as MATLAB's `xcorr` and `scipy.signal.correlate`. This differs from
    the inner product convention more prevalent in physics, that conjugates the
    first vector.
# Examples

```jldoctest
julia> xcorr([1,2,3],[1,2,3])
5-element Vector{Int64}:
  3
  8
 14
  8
  3
```
"""
function xcorr(
    u::AbstractVector, v::AbstractVector; padmode::Symbol=:none, scaling::Symbol=:none
)
    su = size(u, 1); sv = size(v, 1)

    if scaling == :biased && su != sv
        throw(DimensionMismatch("scaling only valid for vectors of same length"))
    end

    if padmode == :longest
        if su < sv
            u = _zeropad_keep_offset(u, sv)
        elseif sv < su
            v = _zeropad_keep_offset(v, su)
        end
    elseif padmode != :none
        throw(ArgumentError("padmode keyword argument must be either :none or :longest"))
    end

    res = conv(u, dsp_reverse(conj(v), axes(v)))
    if scaling == :biased
        res = _normalize!(res, su)
    end

    return res
end

_normalize!(x::AbstractArray{<:Integer}, sz::Int) = (x ./ sz)   # does not mutate x
_normalize!(x::AbstractArray, sz::Int) = (x ./= sz)

# TODO: write specialized (r/)fft-ed autocorrelation functions
xcorr(u::AbstractVector; kwargs...) = xcorr(u, u; kwargs...)