quda/include/quda_milc_interface.h at cb580939eab6811137ea92d2c57b18aabaf7ae4a · lattice/quda · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#ifndef _QUDA_MILC_INTERFACE_H
#define _QUDA_MILC_INTERFACE_H

#include <enum_quda.h>
#include <quda.h>

/**
 * @file    quda_milc_interface.h
 *
 * @section Description
 *
 * This header file defines the MILC interface, which enables easy
 * interfacing between QUDA and the MILC software package.
 */
#if defined(QUDA_TARGET_CUDA)
#if __COMPUTE_CAPABILITY__ >= 600
#define USE_QUDA_MANAGED 1
#endif
#endif

#ifdef __cplusplus
extern "C" {
#endif

  /**
   * Parameters related to the MILC site struct.
   *
   * Either `site` is set and the link / mom entries are located through the
   * byte offsets below, or `site` is not set and the `link` / `mom` pointers
   * are used instead.
   */
  typedef struct {
    void *site; /** Pointer to beginning of site array */
    void *link; /** Pointer to link field (only used if site is not set) */
    size_t link_offset; /** Offset to link entry in site struct (bytes) */
    void *mom; /** Pointer to mom field (only used if site is not set) */
    size_t mom_offset; /** Offset to mom entry in site struct (bytes) */
    size_t size; /** Size of site struct (bytes) */
  } QudaMILCSiteArg_t;

  /**
   * Parameters related to linear solvers.
   */
  typedef struct {
    int max_iter; /** Maximum number of iterations */
    QudaParity evenodd; /** Which parity are we working on? (options are QUDA_EVEN_PARITY, QUDA_ODD_PARITY, QUDA_INVALID_PARITY) */
    int mixed_precision; /** Whether to use mixed precision or not (1 - yes, 0 - no) */
    double boundary_phase[4]; /** Boundary conditions (four entries) */
    int make_resident_solution; /** Make the solution resident and don't copy back */
    int use_resident_solution; /** Use the resident solution */
    QudaInverterType solver_type; /** Type of solver to use */
    double tadpole; /** Tadpole improvement factor - set to 1.0 for
                        HISQ fermions since the tadpole factor is
                        baked into the links during their construction */
    double naik_epsilon; /** Naik epsilon parameter (HISQ fermions only). */
  } QudaInvertArgs_t;

  /**
   * Parameters related to deflated linear solvers.
   *
   * NOTE(review): tol_restart, prec_eigensolver, use_norm_op and use_pc carry
   * no documentation in this header; their meaning must be confirmed against
   * the implementation.
   */
  typedef struct {
    size_t struct_size; /** Size of this struct in bytes. Used to check that host application and QUDA see the same struct size **/
    double tol_restart; /** Undocumented here - TODO confirm meaning **/
    QudaPrecision prec_eigensolver; /** Undocumented here - presumably the eigensolver precision; TODO confirm **/
    int poly_deg; /** Degree of the Chebyshev polynomial **/
    double a_min; /** Range used in polynomial acceleration **/
    double a_max; /** Presumably the other end of the range described at a_min - TODO confirm **/
    QudaBoolean preserve_evals; /** Whether to preserve the evals or recompute them **/
    int n_ev;                   /** Size of the eigenvector search space **/
    int n_kr;                   /** Total size of Krylov space **/
    int n_conv;                 /** Number of requested converged eigenvectors **/
    int n_ev_deflate;           /** Number of requested converged eigenvectors to use in deflation **/
    double tol;                 /** Tolerance on the least well known eigenvalue's residual **/
    int max_restarts;           /** For IRLM/IRAM, quit after n restarts **/
    int batched_rotate;       /** For the Ritz rotation, the maximal number of extra vectors the solver may allocate **/
    int block_size;           /** For block method solvers, the block size **/
    char vec_infile[256];     /** Filename prefix where to load the null-space vectors (fixed 256-byte buffer) */
    char vec_outfile[256];    /** Filename prefix for where to save the null-space vectors (fixed 256-byte buffer) */
    QudaParity vec_in_parity; /** Parity of the incoming eigenvectors **/
    QudaPrecision save_prec;  /** The precision with which to save the vectors */
    QudaBoolean partfile;     /** Whether to save eigenvectors in QIO singlefile or partfile format */
    QudaBoolean io_parity_inflate; /** Whether to inflate single-parity eigenvector I/O **/
    QudaBoolean use_norm_op; /** Undocumented here - TODO confirm meaning **/
    QudaBoolean use_pc;      /** Undocumented here - TODO confirm meaning **/
    QudaEigType eig_type;            /** Type of eigensolver algorithm to employ **/
    QudaEigSpectrumType spectrum;    /** Which part of the spectrum to solve **/
    double qr_tol;                   /** Tolerance on the QR iteration **/
    QudaBoolean require_convergence; /** If true, the solver will error out if the convergence criteria are not met **/
    int check_interval;              /** For IRLM/IRAM, check every nth restart **/
    QudaBoolean use_dagger;          /** If use_dagger, use Mdag **/
    QudaBoolean compute_gamma5;     /** Performs the \gamma_5 OP solve by post multiplying the eigenvectors with \gamma_5
                                       before computing the eigenvalues */
    QudaBoolean compute_svd;        /** Performs an MdagM solve, then constructs the left and right SVD. **/
    QudaBoolean use_eigen_qr;       /** Use Eigen routines to eigensolve the upper Hessenberg via QR **/
    QudaBoolean use_poly_acc;       /** Use Polynomial Acceleration **/
    QudaBoolean arpack_check;       /** In the test function, cross check the device result against ARPACK **/
    char arpack_logfile[512];       /** For Arpack cross check, name of the Arpack logfile (fixed 512-byte buffer) **/
    int compute_evals_batch_size;   /** The batch size used when computing eigenvalues **/
    QudaBoolean preserve_deflation; /** Whether to preserve the deflation space between solves **/

  } QudaEigensolverArgs_t;

  /**
   * Parameters related to EigCG deflated solvers.
   *
   * NOTE(review): none of the fields are documented in this header; the
   * per-field meaning must be confirmed against the implementation.
   * Unlike QudaEigensolverArgs_t, vec_infile and vec_outfile are plain
   * pointers here, so this struct holds no storage for the strings.
   */

  typedef struct {
    QudaPrecision  prec_ritz;
    int nev;
    int max_search_dim;
    int deflation_grid;
    double tol_restart;

    int eigcg_max_restarts;
    int max_restart_num;
    double inc_tol;
    double eigenval_tol;

    QudaExtLibType   solver_ext_lib;
    QudaExtLibType   deflation_ext_lib;

    QudaFieldLocation location_ritz;
    QudaMemoryType    mem_type_ritz;

    char *vec_infile;  /* pointer only; string storage lives elsewhere */
    char *vec_outfile; /* pointer only; string storage lives elsewhere */

  } QudaEigArgs_t;


  /**
   * Parameters related to problem size and machine topology.
   *
   * Both arrays are referenced through pointers to const int; the number of
   * entries is not stated in this header.
   */
  typedef struct {
    const int* latsize; /** Local lattice dimensions */
    const int* machsize; /** Machine grid size */
    int device; /** GPU device number */
  } QudaLayout_t;


  /**
   * Parameters used to create a QUDA context.
   */
  typedef struct {
    QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */
    QudaLayout_t layout; /** Layout for QUDA to use */
  } QudaInitArgs_t; // passed by value to qudaInit()


  /**
   * Parameters for defining HISQ calculations
   */
  typedef struct {
    int reunit_allow_svd;         /** Allow SVD for reunitarization */
    int reunit_svd_only;          /** Force use of SVD for reunitarization */
    double reunit_svd_abs_error;  /** Absolute error bound for SVD to apply */
    double reunit_svd_rel_error;  /** Relative error bound for SVD to apply */
    double force_filter;          /** UV filter to apply to force */
  } QudaHisqParams_t;


  /**
   * Parameters for defining fat-link calculations
   */
  typedef struct {
    int su3_source;          /** Whether the incoming gauge field is SU(3) */
    int use_pinned_memory;   /** Whether to use page-locked memory in QUDA */
  } QudaFatLinkArgs_t;

  /**
   * Parameters for propagator contractions with FT.
   *
   * flops and dtime are described as return values, i.e. they are expected
   * to be filled in rather than supplied by the caller.
   */
  typedef struct {
    int n_mom;                 /** Number of sink momenta */
    int *mom_modes;            /** List of 4-component momenta as integers. Dimension 4*n_mom */
    QudaFFTSymmType *fft_type; /** The "parity" of the FT component */
    int *source_position;      /** The coordinate origin for the Fourier phases */
    double flops;              /** Return value */
    double dtime;              /** Return value */
  } QudaContractArgs_t;

  /**
   * Parameters for two-link Gaussian quark smearing.
   */
  typedef struct {
    int n_steps; /** Number of smearing steps to apply **/
    double width; /** The width of the Gaussian **/
    int compute_2link; /** If nonzero then compute two-link, otherwise reuse gaugeSmeared **/
    int delete_2link; /** If nonzero then delete two-link, otherwise keep two-link for future use **/
    int t0; /** Set if the input spinor is on a time slice **/
    int laplaceDim; /** Dimension of Laplacian **/
  } QudaTwoLinkQuarkSmearArgs_t;

  /**
   * Options for how the deflation space is obtained when loading it.
   * The last enumerator is an invalid marker equal to QUDA_INVALID_ENUM.
   **/
  typedef enum QudaMilcEigLoad_s {
    QUDA_MILC_EIG_LOAD,              /** Load this parity evecs from MILC **/
    QUDA_MILC_EIG_COMPUTE,           /** Compute this parity evecs (or load from file via QUDA) **/
    QUDA_MILC_EIG_FROM_OTHER_PARITY, /** Compute this parity evecs from the other parity **/
    QUDA_MILC_INVALID_EIG = QUDA_INVALID_ENUM
  } QudaMilcEigLoad;

  /**
   * Optional: Set the MPI Comm Handle if it is not MPI_COMM_WORLD
   *
   * @param[in] mycomm Pointer to an MPI_Comm handle, cast to a void *
   */
  void qudaSetMPICommHandle(void *mycomm);

  /**
   * Initialize the QUDA context.
   *
   * @param[in] input Meta data for the QUDA context (verbosity and layout),
   *                  passed by value
   */
  void qudaInit(QudaInitArgs_t input);

  /**
   * Set the local dimensions and machine topology for QUDA to use
   *
   * @param[in] layout Struct defining local dimensions and machine topology
   */
  void qudaSetLayout(QudaLayout_t layout);

  /**
   * Clean up the QUDA deflation space.
   *
   * Takes no arguments and returns nothing. Declared with an explicit
   * (void) parameter list so that C translation units see a real prototype
   * (an empty list in C leaves the parameters unspecified); the meaning is
   * unchanged for C++.
   */
  void qudaCleanUpDeflationSpace(void);

  /**
   * Destroy the QUDA context.
   *
   * Takes no arguments and returns nothing. Declared with an explicit
   * (void) parameter list so that C translation units see a real prototype
   * (an empty list in C leaves the parameters unspecified); the meaning is
   * unchanged for C++.
   */
  void qudaFinalize(void);

  /**
   * Allocate pinned memory suitable for CPU-GPU transfers
   *
   * @param[in] bytes The size of the requested allocation, in bytes
   * @return Pointer to allocated memory
   */
  void* qudaAllocatePinned(size_t bytes);

  /**
   * Free pinned memory
   *
   * @param[in] ptr Pointer to memory to be freed
   */
  void qudaFreePinned(void *ptr);

  /**
   * Allocate managed memory to reduce CPU-GPU transfers
   *
   * @param[in] bytes The size of the requested allocation, in bytes
   * @return Pointer to allocated memory
   */
  void *qudaAllocateManaged(size_t bytes);

  /**
   * Free managed memory
   *
   * @param[in] ptr Pointer to memory to be freed
   */
  void qudaFreeManaged(void *ptr);

  /**
   * Set the algorithms to use for HISQ fermion calculations, e.g.,
   * SVD parameters for reunitarization.
   *
   * @param[in] hisq_params Meta data describing the algorithms to use for HISQ fermions
   */
  void qudaHisqParamsInit(QudaHisqParams_t hisq_params);

  /**
   * Compute the fat and long links using the input gauge field.  All
   * fields passed here are host fields that must be preallocated.
   * The precision of all fields must match.
   *
   * @param[in] precision The precision of the fields
   * @param[in] fatlink_args Meta data for the algorithms to deploy
   * @param[in] act_path_coeff Array of coefficients for each path in the action (6 entries)
   * @param[in] inlink Host gauge field used for input
   * @param[out] fatlink Host fat-link field that is computed
   * @param[out] longlink Host long-link field that is computed
   */
  void qudaLoadKSLink(int precision,
		      QudaFatLinkArgs_t fatlink_args,
		      const double act_path_coeff[6],
		      void* inlink,
		      void* fatlink,
		      void* longlink);

  /**
   * Compute the fat links and unitarize using the input gauge field.
   * All fields passed here are host fields that must be
   * preallocated.  The precision of all fields must match.
   *
   * @param[in] precision The precision of the fields
   * @param[in] fatlink_args Meta data for the algorithms to deploy
   * @param[in] path_coeff Array of coefficients for each path in the action (6 entries)
   * @param[in] inlink Host gauge field used for input
   * @param[out] fatlink Host fat-link field that is computed
   * @param[out] ulink Host unitarized field that is computed
   */
  void qudaLoadUnitarizedLink(int precision,
			      QudaFatLinkArgs_t fatlink_args,
			      const double path_coeff[6],
			      void* inlink,
			      void* fatlink,
			      void* ulink);

  /**
   * Apply the forward/backward/symmetric shift for the spin-taste operator. All fields
   * passed and returned are host (CPU) fields in MILC order.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision     Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] links              Gauge field on the host
   * @param[in] source             Input spinor field
   * @param[out] solution          Output spinor field
   * @param[in] dir                Direction of application of the spin-taste operator
   * @param[in] sym                Kind of spin-taste operator (1 forward, 2 backward, 3 symmetric)
   * @param[in] reloadGaugeField   Should we transfer again the gauge field from the CPU to the GPU? (0 = false, anything else = true)
   */
  void qudaShift(int external_precision, int quda_precision, const void *const links, void *source, void *solution,
                 int dir, int sym, int reloadGaugeField);

  /**
   * Apply the spin-taste operator selected by the given spin and taste gamma
   * structures. All fields passed and returned are host (CPU) fields in MILC order.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision     Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] links              Gauge field on the host
   * @param[in] src                Input spinor field
   * @param[out] dst               Output spinor field
   * @param[in] spin               Spin gamma structure using MILC numbering
   * @param[in] taste              Taste gamma structure using MILC numbering
   * @param[in] reloadGaugeField   Should we transfer again the gauge field from the CPU to the GPU? (0 = false, anything else = true)
   */
  void qudaSpinTaste(int external_precision, int quda_precision, const void *const links, void *src, void *dst,
                     int spin, int taste, int reloadGaugeField);
  /**
   * Apply the improved staggered operator to a field. All fields
   * passed and returned are host (CPU) fields in MILC order.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision     Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] inv_args           Struct setting some solver metadata
   * @param[in] milc_fatlink       Fat-link field on the host
   * @param[in] milc_longlink      Long-link field on the host
   * @param[in] source             Right-hand side source field
   * @param[out] solution          Solution spinor field
   * @param num_iters              Not described in the original documentation;
   *                               NOTE(review): confirm how it is used for an operator application
   */
  void qudaDslash(int external_precision,
		  int quda_precision,
		  QudaInvertArgs_t inv_args,
		  const void* const milc_fatlink,
		  const void* const milc_longlink,
		  void* source,
		  void* solution,
		  int* num_iters);

  /**
   * Solve Ax=b using an improved staggered operator with a
   * domain-decomposition preconditioner.  All fields passed and
   * returned are host (CPU) fields in MILC order.  This
   * function requires that persistent gauge and clover fields have
   * been created prior.  This interface is experimental.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] mass Fermion mass parameter
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] target_residual Target residual
   * @param[in] target_fermilab_residual Target Fermilab residual
   * @param[in] domain_overlap Array specifying the overlap of the domains in each dimension
   * @param[in] fatlink Fat-link field on the host
   * @param[in] longlink Long-link field on the host
   * @param[in] source Right-hand side source field
   * @param[out] solution Solution spinor field
   * @param[out] final_residual True residual
   * @param[out] final_fermilab_residual True Fermilab residual
   * @param[out] num_iters Number of iterations taken
   */
  void qudaDDInvert(int external_precision,
		    int quda_precision,
		    double mass,
		    QudaInvertArgs_t inv_args,
		    double target_residual,
		    double target_fermilab_residual,
		    const int * const domain_overlap,
		    const void* const fatlink,
		    const void* const longlink,
		    void* source,
		    void* solution,
		    double* const final_residual,
		    double* const final_fermilab_residual,
		    int* num_iters);

  /**
   * Project the low modes off of a source of given parity.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] source Source vector(s)
   * @param[out] solution Solution vector(s)
   * @param[in] nvec Number of source/solution vectors
   * @param[in] n_evec Number of low modes to project off of the source vectors
   * @param[in] parity Parity to use
   */
  void qudaProject(int external_precision, void **source, void **solution, int nvec, int n_evec, QudaParity parity);

  /**
   * Get pointers to QUDA's deflation space objects.
   *
   * @param[out] evecs Pointer to eigenvectors (array of pointers)
   * @param[out] evals Pointer to eigenvalues
   * @param[in] parity Parity of the deflation space to return
   * @param[in] nvecs The number of eigenvectors
   */
  void qudaGetDeflationSpace(void **evecs, double *evals, QudaParity parity, int nvecs);

  /**
   * Load the deflation space (eigenvalues and eigenvectors) for a particular parity,
   * which is set in invargs.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] milc_fatlink Fat-link field on the host
   * @param[in] milc_longlink Long-link field on the host
   * @param[in] mass Quark mass
   * @param[in] invargs Struct containing information for the inverter
   * @param[in] eigargs Struct containing information for the eigensolver
   * @param[in] evecs Evecs coming from MILC
   * @param[in] loadtype How to obtain the deflation space; one of the QudaMilcEigLoad
   *                     values (load from MILC, compute or load from file via QUDA,
   *                     or compute from the other parity)
   */
  void qudaLoadDeflationSpace(int external_precision, int quda_precision, const void *const milc_fatlink,
                              const void *const milc_longlink, double mass, QudaInvertArgs_t invargs,
                              QudaEigensolverArgs_t eigargs, void **evecs, QudaMilcEigLoad loadtype);

  /**
   * Solve Ax=b for an improved staggered operator. All fields
   * passed and returned are host (CPU) fields in MILC order.  This
   * function requires that persistent gauge and clover fields have
   * been created prior.  This interface is experimental.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] mass Fermion mass parameter
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] target_residual Target residual
   * @param[in] target_fermilab_residual Target Fermilab residual
   * @param[in] milc_fatlink Fat-link field on the host
   * @param[in] milc_longlink Long-link field on the host
   * @param[in] source Right-hand side source field
   * @param[out] solution Solution spinor field
   * @param[out] final_resid True residual
   * @param[out] final_rel_resid True Fermilab residual
   * @param[out] num_iters Number of iterations taken
   */
  void qudaInvert(int external_precision,
		  int quda_precision,
		  double mass,
		  QudaInvertArgs_t inv_args,
		  double target_residual,
		  double target_fermilab_residual,
		  const void* const milc_fatlink,
		  const void* const milc_longlink,
		  void* source,
		  void* solution,
		  double* const final_resid,
		  double* const final_rel_resid,
		  int* num_iters);

  /**
   * Solve Ax=b with deflation for an improved staggered operator. All fields
   * passed and returned are host (CPU) fields in MILC order.  This
   * function requires that persistent gauge and clover fields have
   * been created prior.  This interface is experimental.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] mass Fermion mass parameter
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] eig_args Struct setting some eigensolver metadata
   * @param[in] target_residual Target residual
   * @param[in] target_fermilab_residual Target Fermilab residual
   * @param[in] milc_fatlink Fat-link field on the host
   * @param[in] milc_longlink Long-link field on the host
   * @param[in] source Right-hand side source field
   * @param[out] solution Solution spinor field
   * @param[out] final_resid True residual
   * @param[out] final_rel_resid True Fermilab residual
   * @param[out] num_iters Number of iterations taken
   */
  void qudaInvertDeflatable(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args,
                            QudaEigensolverArgs_t eig_args, double target_residual, double target_fermilab_residual,
                            const void *const milc_fatlink, const void *const milc_longlink, void *source,
                            void *solution, double *const final_resid, double *const final_rel_resid, int *num_iters);

  /**
   * Prepare a staggered/HISQ multigrid solve with given fat and
   * long links. All fields passed are host (CPU) fields
   * in MILC order. This function requires persistent gauge fields.
   * This interface is experimental.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] mass Fermion mass parameter
   * @param[in] inv_args Struct setting some solver metadata; required for tadpole, naik coeff
   * @param[in] milc_fatlink Fat-link field on the host
   * @param[in] milc_longlink Long-link field on the host
   * @param[in] mg_param_file Path to an input text file describing the MG solve, to be documented on QUDA wiki
   * @return Void pointer wrapping a pack of multigrid-related structures;
   *         qudaMultigridDestroy is the declared clean-up routine for it
   */
  void *qudaMultigridCreate(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args,
                            const void *const milc_fatlink, const void *const milc_longlink,
                            const char *const mg_param_file);

  /**
   * Solve Ax=b for an improved staggered operator using MG.
   * All fields passed and returned are host (CPU)
   * fields in MILC order.  This function requires that persistent
   * gauge and clover fields have been created prior. It also
   * requires a multigrid structure built by qudaMultigridCreate.
   * This interface is experimental.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] mass Fermion mass parameter
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] target_residual Target residual
   * @param[in] target_fermilab_residual Target Fermilab residual
   * @param[in] milc_fatlink Fat-link field on the host
   * @param[in] milc_longlink Long-link field on the host
   * @param[in] mg_pack_ptr MG preconditioner structure created by qudaMultigridCreate
   * @param[in] mg_rebuild_type whether to do a full (1) or thin (0) MG rebuild
   * @param[in] source Right-hand side source field
   * @param[out] solution Solution spinor field
   * @param[out] final_residual True residual
   * @param[out] final_fermilab_residual True Fermilab residual
   * @param[out] num_iters Number of iterations taken
   */
  void qudaInvertMG(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args,
                    double target_residual, double target_fermilab_residual, const void *const milc_fatlink,
                    const void *const milc_longlink, void *mg_pack_ptr, int mg_rebuild_type, void *source,
                    void *solution, double *const final_residual, double *const final_fermilab_residual, int *num_iters);

  /**
   * Solve Ax=b for an improved staggered operator using MG with many right-hand sides.
   * All fields passed and returned are host (CPU)
   * fields in MILC order.  This function requires that persistent
   * gauge and clover fields have been created prior. It also
   * requires a multigrid structure built by qudaMultigridCreate.
   * This interface is experimental.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] mass Fermion mass parameter
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] target_residual Target residual
   * @param[in] target_fermilab_residual Target Fermilab residual
   * @param[in] milc_fatlink Fat-link field on the host
   * @param[in] milc_longlink Long-link field on the host
   * @param[in] mg_pack_ptr MG preconditioner structure created by qudaMultigridCreate
   * @param[in] mg_rebuild_type whether to do a full (1) or thin (0) MG rebuild
   * @param[in] sourceArray Array of right-hand side source fields
   * @param[out] solutionArray Array of solution spinor fields
   * @param[out] final_residual True residual
   * @param[out] final_fermilab_residual True Fermilab residual
   * @param[out] num_iters Number of iterations taken
   * @param[in] num_src Number of source fields
   */
  void qudaInvertMsrcMG(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args,
                        double target_residual, double target_fermilab_residual, const void *const milc_fatlink,
                        const void *const milc_longlink, void *mg_pack_ptr, int mg_rebuild_type, void **sourceArray,
                        void **solutionArray, double *const final_residual, double *const final_fermilab_residual,
                        int *num_iters, int num_src);

  /**
   * Clean up a staggered/HISQ multigrid object, freeing all internal
   * fields and otherwise allocated memory.
   *
   * @param[in] mg_pack_ptr Void pointer mapping to the multigrid structure returned by qudaMultigridCreate
   */
  void qudaMultigridDestroy(void *mg_pack_ptr);

  /**
   * Solve Ax=b for an improved staggered operator with many right hand sides.
   * All fields passed and returned are host (CPU) fields in MILC order.
   * This function requires that persistent gauge and clover fields have
   * been created prior.  This interface is experimental.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] mass Fermion mass parameter
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] target_residual Target residual
   * @param[in] target_fermilab_residual Target Fermilab residual
   * @param[in] fatlink Fat-link field on the host
   * @param[in] longlink Long-link field on the host
   * @param[in] sourceArray Array of right-hand side source fields
   * @param[out] solutionArray Array of solution spinor fields
   * @param[out] final_residual True residual
   * @param[out] final_fermilab_residual True Fermilab residual
   * @param[out] num_iters Number of iterations taken
   * @param[in] num_src Number of source fields
   */
  void qudaInvertMsrc(int external_precision,
                      int quda_precision,
                      double mass,
                      QudaInvertArgs_t inv_args,
                      double target_residual,
                      double target_fermilab_residual,
                      const void* const fatlink,
                      const void* const longlink,
                      void** sourceArray,
                      void** solutionArray,
                      double* const final_residual,
                      double* const final_fermilab_residual,
                      int* num_iters,
                      int num_src);

  /**
   * Solve Ax=b with deflation for an improved staggered operator with many right hand sides.
   * All fields passed and returned are host (CPU) fields in MILC order.
   * This function requires that persistent gauge and clover fields have
   * been created prior.  This interface is experimental.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] mass Fermion mass parameter
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] eig_args Struct setting some eigensolver metadata
   * @param[in] target_residual Target residual
   * @param[in] target_fermilab_residual Target Fermilab residual
   * @param[in] fatlink Fat-link field on the host
   * @param[in] longlink Long-link field on the host
   * @param[in] sourceArray Array of right-hand side source fields
   * @param[out] solutionArray Array of solution spinor fields
   * @param[out] final_residual True residual
   * @param[out] final_fermilab_residual True Fermilab residual
   * @param[out] num_iters Number of iterations taken
   * @param[in] num_src Number of source fields
   */
  void qudaInvertMsrcDeflatable(int external_precision, int quda_precision, double mass, QudaInvertArgs_t inv_args,
                                QudaEigensolverArgs_t eig_args, double target_residual, double target_fermilab_residual,
                                const void *const fatlink, const void *const longlink, void **sourceArray,
                                void **solutionArray, double *const final_residual,
                                double *const final_fermilab_residual, int *num_iters, int num_src);

  /**
   * Solve for multiple shifts (e.g., masses) using an improved
   * staggered operator.  All fields passed and returned are host
   * (CPU) fields in MILC order.  This function requires that
   * persistent gauge and clover fields have been created prior.  When
   * a pure double-precision solver is requested no reliable updates
   * are used, else reliable updates are used with a reliable_delta
   * parameter of 0.1.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] num_offsets Number of shifts to solve for
   * @param[in] offset Array of shift offset values
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] target_residual Array of target residuals per shift
   * @param[in] target_fermilab_residual Array of target Fermilab residuals per shift
   * @param[in] milc_fatlink Fat-link field on the host
   * @param[in] milc_longlink Long-link field on the host
   * @param[in] source Right-hand side source field
   * @param[out] solutionArray Array of solution spinor fields
   * @param[out] final_residual Array of true residuals
   * @param[out] final_fermilab_residual Array of true Fermilab residuals
   * @param[out] num_iters Number of iterations taken
   */
  void qudaMultishiftInvert(
      int external_precision,
      int precision,
      int num_offsets,
      double* const offset,
      QudaInvertArgs_t inv_args,
      const double* target_residual,
      const double* target_fermilab_residual,
      const void* const milc_fatlink,
      const void* const milc_longlink,
      void* source,
      void** solutionArray,
      double* const final_residual,
      double* const final_fermilab_residual,
      int* num_iters);

  /**
   * Solve for a system with many RHS using an improved
   * staggered operator.
   * The solving procedure consists of two computation phases :
   * 1) incremental phase : call eigCG solver to accumulate low eigenmodes
   * 2) deflation phase : use computed eigenmodes to deflate a regular CG
   * All fields passed and returned are host (CPU) fields in MILC
   * order.  This function requires that persistent gauge and clover
   * fields have been created prior.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] mass Fermion mass parameter
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] target_residual Target residual
   * @param[in] target_fermilab_residual Target Fermilab residual
   * @param[in] fatlink Fat-link field on the host
   * @param[in] longlink Long-link field on the host
   * @param[in] source Right-hand side source field
   * @param[out] solution Solution spinor field
   * @param[in] eig_args Contains info about the deflation space
   * @param[in] rhs_idx Index of the current right-hand side (for bookkeeping)
   * @param[in] last_rhs_flag Whether this is the last right-hand side to solve
   * @param[out] final_residual True residual
   * @param[out] final_fermilab_residual True Fermilab residual
   * @param[out] num_iters Number of iterations taken
   */
  void qudaEigCGInvert(
      int external_precision,
      int quda_precision,
      double mass,
      QudaInvertArgs_t inv_args,
      double target_residual,
      double target_fermilab_residual,
      const void* const fatlink,
      const void* const longlink,
      void* source,
      void* solution,
      QudaEigArgs_t eig_args,
      const int rhs_idx,//current rhs
      const int last_rhs_flag,//is this the last rhs to solve?
      double* const final_residual,
      double* const final_fermilab_residual,
      int *num_iters);

  /**
   * Solve Ax=b using a Wilson-Clover operator.  All fields passed and
   * returned are host (CPU) fields in MILC order.  This function
   * creates the gauge and clover field from the host fields.
   * Reliable updates are used with a reliable_delta parameter of 0.1.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] kappa Kappa value
   * @param[in] clover_coeff Clover coefficient
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] target_residual Target residual
   * @param[in] target_fermilab_residual Target Fermilab residual
   * @param[in] milc_link Gauge field on the host
   * @param[in] milc_clover Clover field on the host
   * @param[in] milc_clover_inv Inverse clover on the host
   * @param[in] source Right-hand side source field
   * @param[out] solution Solution spinor field
   * @param[out] final_residual True residual returned by the solver
   * @param[out] final_fermilab_residual True Fermilab residual returned by the solver
   * @param[out] num_iters Number of iterations taken
   */
  void qudaCloverInvert(int external_precision,
			int quda_precision,
			double kappa,
			double clover_coeff,
			QudaInvertArgs_t inv_args,
			double target_residual,
			double target_fermilab_residual,
			const void* milc_link,
			void* milc_clover,
			void* milc_clover_inv,
			void* source,
			void* solution,
			double* const final_residual,
			double* const final_fermilab_residual,
			int* num_iters);

  /**
   * Solve for a system with many RHS using a Wilson-Clover operator.
   * The solving procedure consists of two computation phases :
   * 1) incremental phase : call eigCG solver to accumulate low eigenmodes
   * 2) deflation phase : use computed eigenmodes to deflate a regular CG
   * All fields passed and returned are host (CPU) fields in MILC
   * order.  This function requires that persistent gauge and clover
   * fields have been created prior.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] kappa Kappa value
   * @param[in] clover_coeff Clover coefficient
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] target_residual Target residual
   * @param[in] target_fermilab_residual Target Fermilab residual
   * @param[in] milc_link Gauge field on the host
   * @param[in] milc_clover Clover field on the host
   * @param[in] milc_clover_inv Inverse clover on the host
   * @param[in] source Right-hand side source field
   * @param[out] solution Solution spinor field
   * @param[in] eig_args Contains info about the deflation space
   * @param[in] rhs_idx Index of the current right-hand side (for bookkeeping)
   * @param[in] last_rhs_flag Whether this is the last right-hand side to solve
   * @param[out] final_residual True residual
   * @param[out] final_fermilab_residual True Fermilab residual
   * @param[out] num_iters Number of iterations taken
   */
  void qudaEigCGCloverInvert(
      int external_precision,
      int quda_precision,
      double kappa,
      double clover_coeff,
      QudaInvertArgs_t inv_args,
      double target_residual,
      double target_fermilab_residual,
      const void* milc_link,
      void* milc_clover,
      void* milc_clover_inv,
      void* source,
      void* solution,
      QudaEigArgs_t eig_args,
      const int rhs_idx,//current rhs
      const int last_rhs_flag,//is this the last rhs to solve?
      double* const final_residual,
      double* const final_fermilab_residual,
      int *num_iters);

  /**
   * Load the gauge field from the host.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] inv_args Metadata
   * @param[in] milc_link Base pointer to the host gauge field (regardless of dimensionality)
   */
  void qudaLoadGaugeField(int external_precision,
			  int quda_precision,
			  QudaInvertArgs_t inv_args,
			  const void* milc_link) ;

  /**
   * Free the gauge field allocated in QUDA.
   */
  void qudaFreeGaugeField();


  /**
   * Free the two-link field allocated in QUDA.
   */
  void qudaFreeTwoLink();

  /**
   * Load the clover field and its inverse from the host.  If null
   * pointers are passed, the clover field and / or its inverse will
   * be computed dynamically from the resident gauge field.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] inv_args Metadata
   * @param[in] milc_clover Pointer to host clover field.  If 0 then the
   * clover field is computed dynamically within QUDA.
   * @param[in] milc_clover_inv Pointer to host inverse clover field.  If
   * 0 then the inverse is computed dynamically within QUDA.
   * @param[in] solution_type The type of solution required  (mat, matpc)
   * @param[in] solve_type The solve type to use (normal/direct/preconditioning)
   * @param[in] clover_coeff Clover coefficient
   * @param[in] compute_trlog Whether to compute the trlog of the clover field when inverting
   * @param[out] trlog Array for storing the trlog (length two, one for each parity)
   */
  void qudaLoadCloverField(int external_precision,
			   int quda_precision,
			   QudaInvertArgs_t inv_args,
			   void* milc_clover,
			   void* milc_clover_inv,
			   QudaSolutionType solution_type,
			   QudaSolveType solve_type,
			   double clover_coeff,
			   int compute_trlog,
			   double *trlog) ;

  /**
   * Free the clover field allocated in QUDA.
   */
  void qudaFreeCloverField();

  /**
   * Solve for multiple shifts (e.g., masses) using a Wilson-Clover
   * operator with multi-shift CG.  All fields passed and returned
   * are host (CPU) fields in MILC order.  This function
   * requires that persistent gauge and clover fields have been
   * created prior.  When a pure double-precision solver is requested
   * no reliable updates are used, else reliable updates are used with
   * a reliable_delta parameter of 0.1.
   *
   * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single)
   * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single)
   * @param[in] num_offsets Number of shifts to solve for
   * @param[in] offset Array of shift offset values
   * @param[in] kappa Kappa value
   * @param[in] clover_coeff Clover coefficient
   * @param[in] inv_args Struct setting some solver metadata
   * @param[in] target_residual Array of target residuals per shift
   * @param[in] source Right-hand side source field
   * @param[out] solutionArray Array of solution spinor fields
   * @param[out] final_residual Array of true residuals
   * @param[out] num_iters Number of iterations taken
   */
  void qudaCloverMultishiftInvert(int external_precision,
      int quda_precision,
      int num_offsets,
      double* const offset,
      double kappa,
      double clover_coeff,
      QudaInvertArgs_t inv_args,
      const double* target_residual,
      void* source,
      void** solutionArray,
      double* const final_residual,
      int* num_iters
      );

  /**
   * Compute the fermion force for the HISQ quark action.  All fields
   * are host fields in MILC order, and the precision of these fields
   * must match.
   *
   * @param[in] precision       The precision of the fields
   * @param[in] num_terms       The number of quark fields
   * @param[in] num_naik_terms  The number of naik contributions
   * @param[in] dt              Integrating step size
   * @param[in] coeff           The coefficients multiplying the fermion fields in the outer product
   * @param[in] quark_field     The input fermion field.
   * @param[in] level2_coeff    The coefficients for the second level of smearing in the quark action.
   * @param[in] fat7_coeff      The coefficients for the first level of smearing (fat7) in the quark action.
   * @param[in] w_link          Unitarized link variables obtained by applying fat7 smearing and unitarization to the
   * original links.
   * @param[in] v_link          Fat7 link variables.
   * @param[in] u_link          SU(3) thin link variables.
   * @param[out] milc_momentum  The momentum contribution from the quark action.
   */
  void qudaHisqForce(int precision,
                     int num_terms,
                     int num_naik_terms,
                     double dt,
                     double** coeff,
                     void** quark_field,
		     const double level2_coeff[6],
		     const double fat7_coeff[6],
		     const void* const w_link,
		     const void* const v_link,
		     const void* const u_link,
		     void* const milc_momentum);

  /**
   * Compute the gauge force and update the momentum field.  All fields
   * here are CPU fields in MILC order, and their precisions should
   * match.
   *
   * @param[in] precision The precision of the field (2 - double, 1 - single)
   * @param[in] num_loop_types Number of loop types: 1, 2 or 3
   * @param[in] milc_loop_coeff Coefficients of the different loops in the Symanzik action
   * @param[in] eb3 The integration step size (for MILC this is dt*beta/3)
   * @param[in] arg Metadata for MILC's internal site struct array
   */
  void qudaGaugeForce(int precision,
		      int num_loop_types,
		      double milc_loop_coeff[3],
		      double eb3,
		      QudaMILCSiteArg_t *arg);

  /**
   * Compute the gauge force and update the momentum field.  All fields
   * here are CPU fields in MILC order, and their precisions should
   * match.
   *
   * @param[in] precision The precision of the field (2 - double, 1 - single)
   * @param[in] num_loop_types Number of loop types: 1, 2 or 3
   * @param[in] milc_loop_coeff Coefficients of the different loops in the Symanzik action
   * @param[in] eb3 The integration step size (for MILC this is dt*beta/3)
   * @param[in] arg Metadata for MILC's internal site struct array
   * @param[in] phase_in Whether staggered phases are applied
   */
  void qudaGaugeForcePhased(int precision, int num_loop_types, double milc_loop_coeff[3], double eb3,
                            QudaMILCSiteArg_t *arg, int phase_in);

  /**
   * Compute the real traces of gauge loops, with direct application to computing the gauge
   * action.  All fields here are CPU fields in MILC order, and their precisions should
   * match.
   *
   * @param[in] precision The precision of the field (2 - double, 1 - single)
   * @param[out] traces A pre-allocated buffer for computed traces of length 2 x num_paths to encode the real and
   * imaginary parts
   * @param[in] input_path_buf A double pointer of length num_paths x max_length containing loop paths
   * @param[in] path_length An array of length num_paths containing the lengths of each loop
   * @param[in] loop_coeff Coefficients for each individual loop
   * @param[in] num_paths The total number of paths that are computed
   * @param[in] max_length The maximum length across all loop paths
   * @param[in] factor An overall multiplicative factor applied to all traces
   * @param[in] arg Metadata for MILC's internal site struct array
   * @param[in] phase_in Whether staggered phases are applied
   */
  void qudaGaugeLoopTracePhased(int precision, double *traces, int **input_path_buf, int *path_length, double *loop_coeff,
                                int num_paths, int max_length, double factor, QudaMILCSiteArg_t *arg, int phase_in);

  /**
   * Compute the total, spatial, and temporal plaquette. All fields here are CPU fields in
   * MILC order, and their precisions should match
   *
   * @param[in] precision The precision of the field (2 - double, 1 - single)
   * @param[out] plaq Storage for the total, spatial, and temporal plaquette
   * @param[in] arg Metadata for MILC's internal site struct array
   * @param[in] phase_in whether staggered phases are applied