-
Notifications
You must be signed in to change notification settings - Fork 51
/
Copy pathfilter.c
1433 lines (1271 loc) · 52.7 KB
/
filter.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// filter using fast convolution (overlap-save) and the FFTW3 FFT package
// for the ka9q-radio 'radiod' program
// Generates transfer functions using Kaiser window
// Optional output decimation by integer factor
// Complex input and transfer functions, complex or real output
// Copyright 2017-2023, Phil Karn, KA9Q, [email protected]
//#define LIQUID 1 // Experimental use of parks-mcclellan in filter generation
#define FFT_PRIO 95 // Runs at very high real time priority
#define MYNEW 1
#define _GNU_SOURCE 1
#include <assert.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdbool.h>
#include <pthread.h>
#include <memory.h>
#include <complex.h>
#include <math.h>
#include <fftw3.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <errno.h>
#if LIQUID
// Otherwise genrates a bazillion warnings
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#include <liquid/liquid.h>
#pragma GCC diagnostic pop
#endif
#include "conf.h"
#include "misc.h"
#include "filter.h"
//#define FILTER_DEBUG 1 // turn on lots of printfs in the window creation code
// Settable from main
char const *Wisdom_file = "/var/lib/ka9q-radio/wisdom";
char const *System_wisdom_file = "/etc/fftw/wisdomf"; // only valid for float version
double FFTW_plan_timelimit = 30.0;
int N_worker_threads = 2;
int N_internal_threads = 1; // Usually most efficient
// Desired FFTW planning level
// If wisdom at this level is not present for some filter, the command to generate it will be logged and FFTW_MEASURE wisdom will be generated at runtime
int FFTW_planning_level = FFTW_PATIENT;
// FFTW3 doc strongly recommends doing your own locking around planning routines, so I now am
pthread_mutex_t FFTW_planning_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool FFTW_init = false;
// FFT job descriptor
struct fft_job {
struct fft_job *next;
unsigned int jobnum;
enum filtertype type;
fftwf_plan plan;
void *input;
void *output;
size_t input_dropsize; // byte counts to drop from cache when FFT finishes
pthread_mutex_t *completion_mutex; // protects completion_jobnum
pthread_cond_t *completion_cond; // Signaled when job is complete
unsigned int *completion_jobnum; // Written with jobnum when complete
bool terminate; // set to tell fft thread to quit
};
static struct fft_job *FFT_free_list; // List of spare job descriptors
#define NTHREADS_MAX 20 // More than I'll ever need
static struct {
pthread_mutex_t queue_mutex; // protects job_queue
pthread_cond_t queue_cond; // signaled when job put on job_queue
struct fft_job *job_queue;
pthread_t thread[NTHREADS_MAX]; // Worker threads
} FFT;
static inline int modulo(int x,int const m){
x = x < 0 ? x + m : x;
return x > m ? x - m : x;
}
static bool goodchoice(unsigned long n);
// Create fast convolution filters
// The filters are now in two parts, filter_in (the master) and filter_out (the slave)
// Filter_in holds the original time-domain input and its frequency domain version
// Filter_out holds the frequency response and decimation information for one of several output filters that can share the same input
// create_filter_input() parameters, shared by all slaves:
// L = input data blocksize
// M = impulse response duration
// in_type = REAL or COMPLEX
// Set up input (master) half of filter
int create_filter_input(struct filter_in *master,int const L,int const M, enum filtertype const in_type){
assert(master != NULL);
assert(master != (void *)-1);
if(master == NULL)
return -1;
assert(L > 0);
assert(M > 0);
int const N = L + M - 1;
int const bins = (in_type == COMPLEX) ? N : (N/2 + 1);
if(bins < 1)
return -1; // Unreasonably small - will segfault. Can happen if sample rate is garbled
if(!goodchoice(N))
fprintf(stdout,"create_filter_input(L=%d, M=%d): N=%d is not an efficient blocksize for FFTW3\n",L,M,N);
// It really should already be zeroed.
// If not, it is probably being reused and the dynamically allocated storage in it may not have been freed
#ifdef NDEBUG
memset(master,0,sizeof *master); // make sure it's clean, even if it causes a memory leak
#else
ASSERT_ZEROED(master,sizeof *master);
#endif
master->points = N;
// If there are no worker threads, do it inline
master->perform_inline = (N_worker_threads == 0);
for(int i=0; i < ND; i++){
master->fdomain[i] = lmalloc(sizeof(float complex) * bins);
master->completed_jobs[i] = (unsigned int)-1; // So startup won't drop any blocks
}
master->bins = bins;
master->in_type = in_type;
master->ilen = L;
master->impulse_length = M;
pthread_mutex_init(&master->filter_mutex,NULL);
pthread_cond_init(&master->filter_cond,NULL);
int old_prio = norealtime();
// FFTW itself always runs with a single thread since multithreading didn't seem to do much good
// But we have a set of worker threads operating on a job queue to allow a controlled number
// of independent FFTs to execute at the same time
if(!FFTW_init){
printf("FFTW version: %s\n", fftwf_version);
fftwf_init_threads();
bool sr = fftwf_import_system_wisdom();
fprintf(stdout,"fftwf_import_system_wisdom() %s\n",sr ? "succeeded" : "failed");
if(!sr){
if(access(System_wisdom_file,R_OK) == -1){ // Would really like to use AT_EACCESS flag
fprintf(stdout,"%s not readable: %s\n",System_wisdom_file,strerror(errno));
}
}
bool lr = fftwf_import_wisdom_from_filename(Wisdom_file);
fprintf(stdout,"fftwf_import_wisdom_from_filename(%s) %s\n",Wisdom_file,lr ? "succeeded" : "failed");
if(!lr){
if(access(Wisdom_file,R_OK) == -1){
fprintf(stdout,"%s not readable: %s\n",Wisdom_file,strerror(errno));
}
}
if(access(Wisdom_file,W_OK) == -1){
fprintf(stdout,"Warning: %s not writeable, exports will fail: %s\n",Wisdom_file,strerror(errno));
}
fftwf_set_timelimit(FFTW_plan_timelimit);
if(!sr && !lr)
fprintf(stdout,"No wisdom read, planning FFTs may take up to %'.0lf sec\n",FFTW_plan_timelimit);
// Start FFT worker thread(s) if not already running
pthread_mutex_init(&FFT.queue_mutex,NULL);
pthread_cond_init(&FFT.queue_cond,NULL);
if(N_worker_threads > NTHREADS_MAX){
fprintf(stdout,"fft-threads=%d too high, limiting to %d\n",N_worker_threads,NTHREADS_MAX);
N_worker_threads = NTHREADS_MAX;
}
for(int i=0;i < N_worker_threads;i++)
pthread_create(&FFT.thread[i],NULL,run_fft,NULL);
FFTW_init = true;
}
pthread_mutex_lock(&FFTW_planning_mutex);
fftwf_plan_with_nthreads(N_internal_threads);
switch(in_type){
default:
pthread_mutex_unlock(&FFTW_planning_mutex);
assert(0); // shouldn't happen
realtime(old_prio);
return -1;
case CROSS_CONJ:
case COMPLEX:
master->input_buffer_size = round_to_page(ND * N * sizeof(float complex));
// Allocate input_buffer_size bytes immediately followed by its mirror
master->input_buffer = mirror_alloc(master->input_buffer_size);
memset(master->input_buffer,0,master->input_buffer_size);
master->input_read_pointer.c = master->input_buffer; // FFT starts reading here
master->input_write_pointer.c = master->input_read_pointer.c + (M-1); // start writing here
master->input_read_pointer.r = NULL;
master->input_write_pointer.r = NULL;
master->fwd_plan = fftwf_plan_dft_1d(N, master->input_read_pointer.c, master->fdomain[0], FFTW_FORWARD, FFTW_WISDOM_ONLY|FFTW_planning_level);
if(master->fwd_plan == NULL){
suggest(FFTW_planning_level,N,FFTW_FORWARD,COMPLEX);
master->fwd_plan = fftwf_plan_dft_1d(N, master->input_read_pointer.c, master->fdomain[0], FFTW_FORWARD, FFTW_MEASURE);
}
if(fftwf_export_wisdom_to_filename(Wisdom_file) == 0)
fprintf(stdout,"fftwf_export_wisdom_to_filename(%s) of cof%d failed\n",Wisdom_file,N);
break;
case REAL:
master->input_buffer_size = round_to_page(ND * N * sizeof(float));
master->input_buffer = mirror_alloc(master->input_buffer_size);
memset(master->input_buffer,0,master->input_buffer_size);
master->input_read_pointer.r = master->input_buffer;
master->input_write_pointer.r = master->input_read_pointer.r + (M-1); // start writing here
master->input_read_pointer.c = NULL;
master->input_write_pointer.c = NULL;
master->fwd_plan = fftwf_plan_dft_r2c_1d(N, master->input_read_pointer.r, master->fdomain[0], FFTW_WISDOM_ONLY|FFTW_planning_level);
if(master->fwd_plan == NULL){
suggest(FFTW_planning_level,N,FFTW_FORWARD,REAL);
master->fwd_plan = fftwf_plan_dft_r2c_1d(N, master->input_read_pointer.r, master->fdomain[0], FFTW_MEASURE);
}
if(fftwf_export_wisdom_to_filename(Wisdom_file) == 0)
fprintf(stdout,"fftwf_export_wisdom_to_filename(%s) of rof%d failed\n",Wisdom_file,N);
break;
}
pthread_mutex_unlock(&FFTW_planning_mutex);
realtime(old_prio);
return 0;
}
/* create an instance of an output filter stage attached to some common master stage
Many output filters can share a common input, each with its own frequency shift, response, output sample rate and type
But all must operate at the same block rate and with M limited to the time domain overlap in the master
The number of IFFT frequency points N = L + M - 1; L = length of data, M = length of filter impulse response in time domain
In the master, L and M refer to the input sample rate Rm; in the slave they refer to the output rate Rs
I.e., Nm / Ns = Rm / Rs = Lm / Ls = (Mm - 1) / (Ms - 1), where M-1 is the filter 'order' (one less than the # of FIR taps)
master = pointer to associated shared master (input) filter
response = complex frequency response; may be NULL here and set later with set_filter()
This is set in the slave and can be different (indeed, this is the reason to have multiple slaves)
This is always complex, even if the input and/or output are real in the time domain
However the length is shorter for real output because the complex spectrum is symmetrical around DC
response array length = Ns = Ls + Ms - 1 when output is complex
= Ns/2 + 1 = (Ls + Ms - 1)/2+1 when output is real
Must be SIMD-aligned (e.g., allocated with fftw_alloc) and will be freed by delete_filter()
len = number of time domain points in output = Ls
out_type = REAL, COMPLEX, CROSS_CONJ (COMPLEX with special processing for ISB) or SPECTRUM (dummy for spectrum analyzer)
All demodulators currently require COMPLEX output because a complex exponential is applied to the time domain
output for fine frequency tuning
Before fine tuning was added, SSB(CW) could (and did) use the REAL mode since the imaginary component is unneeded
and the c2r IFFT is faster
Baseband FM audio filtering for de-emphasis and PL separation can use REAL output because there's no baseband fine tuning
If you provide your own filter response, ensure that it drops to nil well below the Nyquist rate
to prevent aliasing. Remember that decimation reduces the Nyquist rate by the decimation ratio.
The set_filter() function uses Kaiser windowing for this purpose
An output filter must not be used after its master is deleted, a segfault will occur
*/
int create_filter_output(struct filter_out *slave,struct filter_in * master,float complex * const response,int len, enum filtertype const out_type){
assert(master != NULL);
if(master == NULL)
return -1;
assert(slave != NULL);
if(slave == NULL)
return -1;
assert(out_type == SPECTRUM || len > 0);
// Should already be zeroed
// If not, it is probably being reused and the dynamically allocated storage in it may not have been freed
#ifdef NDEBUG
memset(slave,0,sizeof *slave); // make sure it's clean
#else
ASSERT_ZEROED(slave,sizeof *slave);
#endif
// Share all but output fft bins, response, output and output type
slave->master = master;
slave->out_type = out_type;
// N / L = Total FFT points / time domain points
int const N = master->ilen + master->impulse_length - 1;
int const L = master->ilen;
slave->response = response;
pthread_mutex_init(&slave->response_mutex,NULL);
switch(slave->out_type){
default:
case COMPLEX:
case CROSS_CONJ:
{
slave->olen = len;
ldiv_t x = ldiv((long)len * N,L);
if(x.rem != 0){
fprintf(stdout,"Invalid filter output length %d (fft size %ld) for input N=%d, L=%d\n",len,x.quot,N,L);
return -1;
}
slave->points = x.quot; // Total number of FFT points including overlap
slave->bins = x.quot;
slave->fdomain = lmalloc(sizeof(float complex) * slave->bins);
slave->output_buffer.c = lmalloc(sizeof(float complex) * slave->bins);
assert(slave->output_buffer.c != NULL);
slave->output_buffer.r = NULL; // catch erroneous references
slave->output.c = slave->output_buffer.c + slave->bins - len;
int old_prio = norealtime(); // Could this cause a priority inversion?
pthread_mutex_lock(&FFTW_planning_mutex);
fftwf_plan_with_nthreads(1); // IFFTs are always small, use only one internal thread
if((slave->rev_plan = fftwf_plan_dft_1d(slave->points,slave->fdomain,slave->output_buffer.c,FFTW_BACKWARD,FFTW_WISDOM_ONLY|FFTW_planning_level)) == NULL){
suggest(FFTW_planning_level,slave->bins,FFTW_BACKWARD,COMPLEX);
slave->rev_plan = fftwf_plan_dft_1d(slave->points,slave->fdomain,slave->output_buffer.c,FFTW_BACKWARD,FFTW_MEASURE);
}
if(fftwf_export_wisdom_to_filename(Wisdom_file) == 0)
fprintf(stdout,"fftwf_export_wisdom_to_filename(%s) failed\n",Wisdom_file);
pthread_mutex_unlock(&FFTW_planning_mutex);
realtime(old_prio);
}
break;
case SPECTRUM: // Like complex, but no IFFT or output time domain buffer
// Recent change: spectrum will read directly from master frequency bins
// Also don't set up an IFFT
break;
case REAL:
{
slave->olen = len;
ldiv_t x = ldiv((long)len * N,L);
if(x.rem != 0){
fprintf(stdout,"Invalid filter output length %d for input N=%d, L=%d\n",len,N,L);
return -1;
}
slave->points = x.quot;
slave->bins = slave->points / 2 + 1;
slave->fdomain = lmalloc(sizeof(float complex) * slave->bins);
assert(slave->fdomain != NULL);
slave->output_buffer.r = lmalloc(sizeof(float) * slave->points);
assert(slave->output_buffer.r != NULL);
slave->output_buffer.c = NULL;
slave->output.r = slave->output_buffer.r + slave->points - len;
int old_prio = norealtime();
pthread_mutex_lock(&FFTW_planning_mutex);
if((slave->rev_plan = fftwf_plan_dft_c2r_1d(slave->points,slave->fdomain,slave->output_buffer.r,FFTW_WISDOM_ONLY|FFTW_planning_level)) == NULL){
suggest(FFTW_planning_level,slave->points,FFTW_BACKWARD,REAL);
slave->rev_plan = fftwf_plan_dft_c2r_1d(slave->points,slave->fdomain,slave->output_buffer.r,FFTW_MEASURE);
}
if(fftwf_export_wisdom_to_filename(Wisdom_file) == 0)
fprintf(stdout,"fftwf_export_wisdom_to_filename(%s) failed\n",Wisdom_file);
pthread_mutex_unlock(&FFTW_planning_mutex);
realtime(old_prio);
}
break;
}
if(slave->out_type != SPECTRUM && !goodchoice(slave->points)){
int const ell = slave->olen;
int const overlap = slave->points / (slave->points - ell);
int const step = overlap;
for(int nn = slave->points + step; nn < master->ilen + master->impulse_length - 1; nn += step){
if(goodchoice(nn)){
int nell = nn * ell / slave->points;
int nm = nn - nell + 1;
fprintf(stdout,"create_filter_output: N=%d is not an efficient blocksize for FFTW3.",slave->points);
fprintf(stdout," Next good choice is N = %d (L=%d, M=%d); set samprate = %d * blockrate\n",nn,nell,nm,nell);
break;
}
}
}
slave->next_jobnum = master->next_jobnum;
return 0;
}
// Assist with choosing good blocksizes for FFTW3
static const int small_primes[6] = {2, 3, 5, 7, 11, 13};
static unsigned long factor_small_primes(unsigned long n, int exponents[6]);
/**
* Factor n into the primes 2,3,5,7,11,13.
* exponents[] should be an array of length 6, each slot will hold
* the exponent for 2,3,5,7,11,13 respectively.
*
* Return value:
* - If the returned value is 1, then n was fully factored by {2,3,5,7,11,13}.
* - Otherwise, the leftover (return value) is the part that couldn't
* be factored into those primes.
*/
static unsigned long factor_small_primes(unsigned long n, int exponents[6]){
// Initialize exponents
for (int i = 0; i < 6; i++)
exponents[i] = 0;
// Divide out each prime in turn
for (int i = 0; i < 6; i++) {
while (n % small_primes[i] == 0) {
exponents[i]++;
n /= small_primes[i];
}
}
return n; // The remainder is what's left
}
// Is this a good blocksize for FFTW3?
// Any number of factors of 2, 3, 7 plus one of either 11 or 13
static bool goodchoice(unsigned long n){
int exponents[6];
unsigned long r = factor_small_primes(n,exponents);
if(r != 1 || (exponents[4] + exponents[5] > 1))
return false;
else
return true;
}
// Worker thread(s) that actually execute FFTs
// Used for input FFTs since they tend to be large and CPU-consuming
// Lets the input thread process the next input block in parallel on another core
// Frees the input buffer and the job descriptor when done
void *run_fft(void *p){
pthread_detach(pthread_self());
pthread_setname("fft");
(void)p; // Unused
realtime(FFT_PRIO);
stick_core();
while(true){
// Get next job
pthread_mutex_lock(&FFT.queue_mutex);
while(FFT.job_queue == NULL)
pthread_cond_wait(&FFT.queue_cond,&FFT.queue_mutex);
struct fft_job *job = FFT.job_queue;
FFT.job_queue = job->next;
pthread_mutex_unlock(&FFT.queue_mutex);
if(job->input != NULL && job->output != NULL && job->plan != NULL){
switch(job->type){
case COMPLEX:
case CROSS_CONJ:
fftwf_execute_dft(job->plan,job->input,job->output);
break;
case REAL:
fftwf_execute_dft_r2c(job->plan,job->input,job->output);
break;
default:
break;
}
}
drop_cache(job->input,job->input_dropsize);
// Signal we're done with this job
if(job->completion_mutex)
pthread_mutex_lock(job->completion_mutex);
if(job->completion_jobnum)
*job->completion_jobnum = job->jobnum;
if(job->completion_cond)
pthread_cond_broadcast(job->completion_cond);
if(job->completion_mutex)
pthread_mutex_unlock(job->completion_mutex);
// Do NOT destroy job->completion_cond and completion_mutex here, they continue to exist
bool const terminate = job->terminate; // Don't use job pointer after free
// Put descriptor on free pool
pthread_mutex_lock(&FFT.queue_mutex);
job->next = FFT_free_list;
FFT_free_list = job;
pthread_mutex_unlock(&FFT.queue_mutex);
if(terminate)
break; // Terminate after this job
}
return NULL;
}
// Execute the input side of a filter:
// We use the FFTW3 functions that specify the input and output arrays
int execute_filter_input(struct filter_in * const f){
assert(f != NULL);
if(f == NULL)
return -1;
if(f->perform_inline){
// Just execute it here
int jobnum = f->next_jobnum++;
float complex *output = f->fdomain[jobnum % ND];
switch(f->in_type){
default:
case CROSS_CONJ:
case COMPLEX:
{
float complex *input = f->input_read_pointer.c;
f->input_read_pointer.c += f->ilen;
mirror_wrap((void *)&f->input_read_pointer.c,f->input_buffer,f->input_buffer_size);
fftwf_execute_dft(f->fwd_plan,input,output);
drop_cache(input,f->ilen * sizeof(float complex));
}
break;
case REAL:
{
float *input = f->input_read_pointer.r;
f->input_read_pointer.r += f->ilen;
mirror_wrap((void *)&f->input_read_pointer.r,f->input_buffer,f->input_buffer_size);
fftwf_execute_dft_r2c(f->fwd_plan,input,output);
drop_cache(input,f->ilen * sizeof(float));
}
break;
}
// Signal we're done with this job
pthread_mutex_lock(&f->filter_mutex);
f->completed_jobs[jobnum % ND] = jobnum;
pthread_cond_broadcast(&f->filter_cond);
pthread_mutex_unlock(&f->filter_mutex);
return 0;
}
// set up a job for the FFT worker threads and enqueue it
// Take one off the pool, if available
pthread_mutex_lock(&FFT.queue_mutex);
struct fft_job *job = FFT_free_list;
if(job != NULL){
FFT_free_list = job->next;
job->next = NULL;
}
pthread_mutex_unlock(&FFT.queue_mutex);
if(job == NULL)
job = calloc(1,sizeof(struct fft_job)); // Otherwise create a new one
// A descriptor from the free list won't be blank, but we set everything below
assert(job != NULL);
job->jobnum = f->next_jobnum++;
job->output = f->fdomain[job->jobnum % ND];
job->type = f->in_type;
job->plan = f->fwd_plan;
job->completion_mutex = &f->filter_mutex;
job->completion_jobnum = &f->completed_jobs[job->jobnum % ND];
job->completion_cond = &f->filter_cond;
job->terminate = false;
// Set up the job and next input buffer
// We're assuming that the time-domain pointers we're passing to the FFT are always aligned the same
// as we increment the FFT pointer by f->ilen (L) modulo the mirror buffer size.
// They seem to be as long as ilen (L) has several factors of 2. For the real->complex transform,
// each element is 4 bytes long, so if L is divisible by 8 then the pointers will be aligned to 64 bytes,
// the usual size of a cache line. For complex->complex transforms, L has to be divisible by 4.
switch(f->in_type){
default:
case CROSS_CONJ:
case COMPLEX:
job->input = f->input_read_pointer.c;
job->input_dropsize = f->ilen * sizeof(float complex);
f->input_read_pointer.c += f->ilen;
mirror_wrap((void *)&f->input_read_pointer.c,f->input_buffer,f->input_buffer_size);
break;
case REAL:
job->input = f->input_read_pointer.r;
job->input_dropsize = f->ilen * sizeof(float);
f->input_read_pointer.r += f->ilen;
mirror_wrap((void *)&f->input_read_pointer.r,f->input_buffer,f->input_buffer_size);
break;
}
assert(job->input != NULL); // Should already be allocated in create_filter_input, or in our last call
// Append job to worker queue, wake FFT worker thread
struct fft_job *jp_prev = NULL;
pthread_mutex_lock(&FFT.queue_mutex);
for(struct fft_job *jp = FFT.job_queue; jp != NULL; jp = jp->next)
jp_prev = jp;
if(jp_prev)
jp_prev->next = job;
else
FFT.job_queue = job; // Head of list
pthread_cond_signal(&FFT.queue_cond); // Alert only one FFT worker
pthread_mutex_unlock(&FFT.queue_mutex);
return 0;
}
/* Execute the output side of a filter:
1 - wait for a forward FFT job to complete
frequency domain data is in a circular queue ND buffers deep to tolerate scheduling jitter
2 - multiply the selected frequency bin range by the filter frequency response
This is the hard part; handle all combinations of real/complex input/output, wraparound, etc
3 - convert back to time domain with IFFT
'shift' is the number of FFT bins to shift *down*; a positive 'shift' means that a positive input
frequency will become zero frequency on output
*/
int execute_filter_output(struct filter_out * const slave,int const shift){
assert(slave != NULL);
if(slave == NULL)
return -1;
// We do have to modify the master's data structure, notably mutex locks
// So the derefenced pointer can't be const
struct filter_in * const master = slave->master;
assert(master != NULL);
if(master == NULL)
return -1;
assert(slave->out_type == SPECTRUM || (slave->rev_plan != NULL && slave->bins > 0));
assert(slave->out_type != NONE);
assert(master->in_type != NONE);
assert(master->fdomain != NULL);
assert(master->bins > 0);
// DC and positive frequencies up to nyquist frequency are same for all types
assert(slave->out_type == SPECTRUM || malloc_usable_size(slave->fdomain) >= slave->bins * sizeof(*slave->fdomain));
// Wait for new block of output data
pthread_mutex_lock(&master->filter_mutex);
int blocks_behind = master->completed_jobs[slave->next_jobnum % ND] - slave->next_jobnum;
if(blocks_behind >= ND){
// We've fallen too far behind. skip ahead to the oldest block still available
unsigned nextblock = master->completed_jobs[0];
for(int i=1; i < ND; i++){
if((int)(master->completed_jobs[i] - nextblock) < 0) // modular comparison
nextblock = master->completed_jobs[i];
}
slave->block_drops += nextblock - slave->next_jobnum;
slave->next_jobnum = nextblock;
}
while((int)(slave->next_jobnum - master->completed_jobs[slave->next_jobnum % ND]) > 0)
pthread_cond_wait(&master->filter_cond,&master->filter_mutex);
// We don't modify the master's output data, we create our own
float complex const * const fdomain = master->fdomain[slave->next_jobnum % ND];
// in case we just waited so long that the buffer wrapped, resynch
slave->next_jobnum = master->completed_jobs[slave->next_jobnum % ND] + 1;
pthread_mutex_unlock(&master->filter_mutex);
assert(fdomain != NULL); // Should always be master frequency data
// In spectrum mode we'll read directly from master. Don't forget the 3dB scale when the input is real
if(slave->fdomain == NULL || slave->response == NULL)
return 0;
/* Multiply the requested frequency segment by the frequency response
Although frequency domain data is always complex, this is complicated because
we have to handle the four combinations of the filter input and output time domain data
being either real or complex.
In ka9q-radio the input depends on the SDR front end, while the output is complex
(even for SSB) because of the fine tuning frequency shift after conversion
back to the time domain. So while real output is supported it is not well tested.
*/
pthread_mutex_lock(&slave->response_mutex); // Don't let it change while we're using it
if(master->in_type != REAL && slave->out_type != REAL){
// Complex -> complex (e.g., fobos (in VHF/UHF mode), funcube, airspyhf, sdrplay)
int wp = (slave->bins+1)/2; // most negative output bin
int rp = shift - slave->bins/2; // Start index in master, unwrapped = shift - # output bins
// Starting below master, zero output until we're in range. Rarely needed.
while(rp < -(master->bins+1)/2){
assert(wp >=0 && wp < slave->bins);
slave->fdomain[wp] = 0;
rp++;
if(++wp == (slave->bins+1)/2) // exhausted output buffer
goto done;
if(wp == slave->bins)
wp = 0; // Wrap to DC
}
if(rp < 0)
rp += master->bins; // Starts in negative region of master
if(rp < 0 || rp >= master->bins){
// Shift is out of range
// Zero any remaining output
while(wp != (slave->bins+1)/2){
assert(wp >=0 && wp < slave->bins);
slave->fdomain[wp++] = 0;
if(wp == slave->bins)
wp = 0; // Wrap to DC
}
goto done;
}
// The actual work is here
do {
assert(rp >= 0 && rp < master->bins);
assert(wp >=0 && wp < slave->bins);
slave->fdomain[wp] = fdomain[rp] * slave->response[wp];
if(++rp == master->bins)
rp = 0; // Master wrapped to DC
if(++wp == slave->bins)
wp = 0; // Slave wrapped to DC
} while (wp != (slave->bins+1)/2 && rp != (master->bins+1)/2); // Until we reach the top of the output or input
// Zero any remaining output. Rarely needed.
while(wp != (slave->bins+1)/2){
assert(wp >=0 && wp < slave->bins);
slave->fdomain[wp++] = 0;
if(wp == slave->bins)
wp = 0; // Wrap to DC
}
} else if(master->in_type != REAL && slave->out_type == REAL){
// Complex -> real UNTESTED! not used in ka9q-radio at present
for(int si=0; si < slave->bins; si++){
int const mi = si + shift;
float complex result = 0;
if(mi >= -master->bins/2 && mi < master->bins/2)
result = slave->response[si] * (fdomain[modulo(mi,master->bins)] + conjf(fdomain[modulo(master->bins - mi, master->bins)]));
slave->fdomain[si] = result;
}
} else if(master->in_type == REAL && slave->out_type == REAL){
// Real -> real (e.g. in wfm stereo decoding)
// shift is unlikely to be non-zero because of the frequency folding, but handle it anyway
for(int si=0; si < slave->bins; si++){ // All positive frequencies
int const mi = si + shift;
slave->fdomain[si] = (mi >= 0 && mi < master->bins) ? fdomain[mi] * slave->response[si] : 0;
}
} else if(master->in_type == REAL && slave->out_type != REAL){
/* Real->complex (e.g., rx888, fobos (direct sample mode), airspy R2)
This can be tricky. We treat the input as complex with Hermitian symmetry (both positive and negative spectra)
If shift >= 0, the input spectrum is positive and right-side up (e.g., rx888, fobos direct sampling)
If shift < 0, the input spectrum is negative and inverted (e.g., Airspy R2)
Don't cross input DC as this doesn't seem useful; just blank the output
For real inputs, set_filter scales +3dB to account for the half energy in the implicit negative spectrum
*/
int wp = (slave->bins+1)/2; // most negative output bin
if(shift >= 0){
// Right side up
int rp = shift - slave->bins/2; // Start index in master, unwrapped = shift - # output bins
// Pad start if necessary. Rarely needed
while(rp < 0){
assert(wp >=0 && wp < slave->bins);
slave->fdomain[wp] = 0;
if(++wp == (slave->bins+1)/2)
goto done; // Top of output
if(wp == slave->bins)
wp = 0; // wrap to DC
rp++;
}
// Actual work
while(rp < master->bins){
assert(wp >=0 && wp < slave->bins);
assert(rp >= 0 && rp < master->bins);
slave->fdomain[wp] = fdomain[rp] * slave->response[wp];
if(++wp == (slave->bins+1)/2)
goto done;
if(wp == slave->bins)
wp = 0; // Wrap to DC
rp++;
}
} else {
// Inverted spectrum
int rp = -(shift - slave->bins/2); // Start at high (negative) input frequency
// Pad start if necessary, rarely needed
while(rp >= master->bins){
assert(wp >=0 && wp < slave->bins);
slave->fdomain[wp] = 0;
if(++wp == (slave->bins+1)/2)
goto done; // Top of output
if(wp == slave->bins)
wp = 0; // wrap to DC
rp--;
}
// Actual work
while(rp >= 0){
assert(wp >=0 && wp < slave->bins);
assert(rp >= 0 && rp < master->bins);
slave->fdomain[wp] = conjf(fdomain[rp]) * slave->response[wp];
if(++wp == (slave->bins+1)/2)
goto done;
if(wp == slave->bins)
wp = 0; // Wrap to DC
rp--;
}
}
// Zero any remaining output. Rarely needed
while(wp != (slave->bins+1)/2){
assert(wp >=0 && wp < slave->bins);
slave->fdomain[wp] = 0;
if(++wp == slave->bins)
wp = 0; // Wrap DC
}
}
done:;
// Zero out Nyquist bin when N even
if((slave->bins & 1) == 0)
slave->fdomain[slave->bins/2] = 0;
pthread_mutex_unlock(&slave->response_mutex); // release response[]
if(slave->out_type == CROSS_CONJ){
// hack for ISB; forces negative frequencies onto I, positive onto Q
// Don't really know how to use this anymore; it's incompatible with fine tuning in the time domain
// Re-implementing ISB will probably require a filter for each sideband
// Also probably generates time domain ripple effects due to the sharp notch at DC
assert(malloc_usable_size(slave->fdomain) >= slave->bins * sizeof(*slave->fdomain));
for(int p=1,dn=slave->bins-1; p < slave->bins/2; p++,dn--){
assert(p >= 0 && p < slave->bins);
assert(dn >= 0 && dn < slave->bins);
float complex const pos = slave->fdomain[p];
float complex const neg = slave->fdomain[dn];
slave->fdomain[p] = pos + conjf(neg);
slave->fdomain[dn] = neg - conjf(pos);
}
slave->fdomain[0] = 0; // Must be a null at DC
}
// And finally back to the time domain
fftwf_execute(slave->rev_plan); // Note: c2r version destroys fdomain[], but it's not used again anyway
// Drop the cache in the first M-1 points of the time domain buffer that we'll discard
if(slave->out_type == REAL)
drop_cache(slave->output_buffer.r,(slave->points - slave->olen) * sizeof (float));
else
drop_cache(slave->output_buffer.c,(slave->points - slave->olen) * sizeof (float complex));
return 0;
}
int delete_filter_input(struct filter_in * master){
if(master == NULL)
return -1;
ASSERT_UNLOCKED(&master->filter_mutex);
pthread_mutex_destroy(&master->filter_mutex);
pthread_cond_destroy(&master->filter_cond);
if(master->fwd_plan != NULL)
fftwf_destroy_plan(master->fwd_plan);
master->fwd_plan = NULL;
mirror_free(&master->input_buffer,master->input_buffer_size); // Don't use free() !
for(int i=0; i < ND; i++)
FREE(master->fdomain[i]);
memset(master,0,sizeof(*master)); // Wipe it all
return 0;
}
int delete_filter_output(struct filter_out *slave){
if(slave == NULL)
return -1;
ASSERT_UNLOCKED(&slave->response_mutex);
pthread_mutex_destroy(&slave->response_mutex);
if(slave->rev_plan != NULL)
fftwf_destroy_plan(slave->rev_plan);
slave->rev_plan = NULL;
// Only one will be non-null but it doesn't hurt to free both
FREE(slave->output_buffer.c);
FREE(slave->output_buffer.r);
FREE(slave->response);
FREE(slave->fdomain);
memset(slave,0,sizeof(*slave)); // Wipe it all
return 0;
}
// Compute an entire Kaiser window
// More efficient than repeatedly calling kaiser(n,M,beta)
int make_kaiser(float * const window,int const M,float const beta){
assert(window != NULL);
if(window == NULL)
return -1;
// Precompute unchanging partial values
float const inv_denom = 1. / i0(beta); // Inverse of denominator
float const pc = 2.0 / (M-1);
// The window is symmetrical, so compute only half of it and mirror
// this won't compute the middle value in an odd-length sequence
for(int n = 0; n < M/2; n++){
float const p = pc * n - 1;
window[M-1-n] = window[n] = i0(beta * sqrtf(1-p*p)) * inv_denom;
}
// If sequence length is odd, middle value is unity
if(M & 1)
window[(M-1)/2] = 1; // The -1 is actually unnecessary
return 0;
}
/* Set up a filter with a specified complex bandpass response
Uses a Kaiser-windowed sinc function - new as of March 2025
This can occasionally be called with slave == NULL at startup, so don't abort
NB: 'low' and 'high' are *fractional* frequencies relative to the output sample rate, i.e., -0.5 < f < +0.5
If invoked on a demod that hasn't run yet, slave->master will be NULL so check for that and quit;
the filter should get set up when it actually starts (thanks N5TNL for bug report) */
int set_filter(struct filter_out * const slave,float low,float high,float const kaiser_beta){
if(slave == NULL || isnan(low) || isnan(high) || isnan(kaiser_beta) || slave->master == NULL)
return -1;
if(slave->out_type == REAL){
// Filter edges crossing DC not allowed for real output
low = fabsf(low);
high = fabsf(high);
}
// Swap if necessary
if(low > high){
float tmp = low;
low = high;
high = tmp;
}
// Limit filter range to Nyquist rate
low = low < -0.5 ? -0.5 : low > +0.5 ? +0.5 : low;
high = high < -0.5 ? -0.5 : high > +0.5 ? +0.5 : high;
// Total number of time domain points
int const N = slave->points;
int const L = slave->olen;
int const M = N - L + 1; // Length of impulse response in time domain
// Real lowpass filter with cutoff = 1/2 bandwidth
float bw2 = fabsf(high - low)/2;
if(bw2 == 0)
bw2 = 1; // Avoid assert failures
float center = (high + low)/2;
float kaiser_window[M];
make_kaiser(kaiser_window,M,kaiser_beta);
// Form complex impulse response by generating kaiser-windowed sinc pulse and shifting to desired center freq
float complex impulse[M];
float window_gain = 0;
for(int i = 0; i < M; i++){
float n = i - (float)(M-1)/2;
float r = kaiser_window[i] * 2 * bw2 * sinc(2 * bw2 * n);
window_gain += r;
impulse[i] = cispif(2 * center * n) * r;
#if FILTER_DEBUG
printf("impulse[%d] = %g + j%g\n",i,crealf(impulse[i]),cimagf(impulse[i]));
#endif
}
// gain corrections:
// 1. real inputs require +3dB for half the power in the implicit negative spectrum
// 2. the windowed sinc has some loss
// 3. The un-normalized forward FFT has an implicit power gain of N
float const gain = (slave->master->in_type == REAL ? M_SQRT2 : 1.0f)
/ (window_gain * slave->master->points);
assert(gain != 0 && isfinite(gain));
for(int i = 0; i < M; i++)
impulse[i] *= gain; // Normalize for the window gain
float complex * const response = lmalloc(N * sizeof(float complex));
assert(response != NULL);
pthread_mutex_lock(&FFTW_planning_mutex);
fftwf_plan_with_nthreads(1);
// Doesn't have to be fast, we're only doing this once
fftwf_plan fwd_filter_plan = fftwf_plan_dft_1d(N,response,response,FFTW_FORWARD,FFTW_ESTIMATE);
assert(fwd_filter_plan != NULL);
pthread_mutex_unlock(&FFTW_planning_mutex);
memcpy(response,impulse,M * sizeof *response);
memset(response+M,0,(N-M) * sizeof *response);
fftwf_execute(fwd_filter_plan);
#if FILTER_DEBUG
{
for(int i=0; i < N; i++)
printf("response[%d] = %g + j%g\n",i,__real__ response[i],__imag__ response[i]);
}
#endif
// Hot swap with existing response, if any, using mutual exclusion
pthread_mutex_lock(&slave->response_mutex);
float complex * const tmp = slave->response;
slave->response = response;
pthread_mutex_unlock(&slave->response_mutex);
free(tmp);
return 0;
}
int write_cfilter(struct filter_in *f, float complex const *buffer,int size){
if(f == NULL)
return -1;
if(sizeof(*buffer) * (f->wcnt + size) >= f->input_buffer_size)
return -1; // Write is so large it wrapped the input buffer. Should handle this more cleanly
// Even though writes can now wrap past the primary copy of the input buffer, their start should always be in it
assert((void *)(f->input_write_pointer.c) >= f->input_buffer);
assert((void *)(f->input_write_pointer.c) < f->input_buffer + f->input_buffer_size);
if(buffer != NULL)
memcpy(f->input_write_pointer.c, buffer, size * sizeof(*buffer));
f->input_write_pointer.c += size;
mirror_wrap((void *)&f->input_write_pointer.c, f->input_buffer, f->input_buffer_size);
f->wcnt += size;
bool executed = false;
while(f->wcnt >= f->ilen){
f->wcnt -= f->ilen;
execute_filter_input(f);
executed = true;
}
return executed;
}
int write_rfilter(struct filter_in *f, float const *buffer,int size){
if(f == NULL)
return -1;
if(sizeof(*buffer) * (f->wcnt + size) >= f->input_buffer_size)
return -1; // Write is so large it wrapped the input buffer. Should handle this more cleanly
// Even though writes can now wrap past the primary copy of the input buffer, their start should always be in it
assert((void *)(f->input_write_pointer.r) >= f->input_buffer);
assert((void *)(f->input_write_pointer.r) < f->input_buffer + f->input_buffer_size);