% references.bib -- bibliography for slinderman/stats305c (spring2026 branch).

% Neal (2012): arXiv preprint of the Hamiltonian Monte Carlo review.
% Typed as misc because a preprint has no journal (an article entry without
% a journal field triggers a missing-field warning); the arXiv identifier
% is carried by the eprint fields. The handbook-chapter version of the same
% work is keyed neal2012mcmc.
@misc{Neal2012-ev,
  author        = {Neal, Radford M},
  title         = {{MCMC} using {Hamiltonian} dynamics},
  abstract      = {Hamiltonian dynamics can be used to produce distant
                   proposals for the Metropolis algorithm, thereby avoiding the
                   slow exploration of the state space that results from the
                   diffusive behaviour of simple random-walk proposals. Though
                   originating in physics, Hamiltonian dynamics can be applied
                   to most problems with continuous state spaces by simply
                   introducing fictitious ``momentum'' variables. A key to its
                   usefulness is that Hamiltonian dynamics preserves volume,
                   and its trajectories can thus be used to define complex
                   mappings without the need to account for a hard-to-compute
                   Jacobian factor - a property that can be exactly maintained
                   even when the dynamics is approximated by discretizing time.
                   In this review, I discuss theoretical and practical aspects
                   of Hamiltonian Monte Carlo, and present some of its
                   variations, including using windows of states for deciding
                   on acceptance or rejection, computing trajectories using
                   fast approximations, tempering during the course of a
                   trajectory to handle isolated modes, and short-cut methods
                   that prevent useless trajectories from taking much
                   computation time.},
  month         = jun,
  year          = 2012,
  archivePrefix = {arXiv},
  primaryClass  = {stat.CO},
  eprint        = {1206.1901}
}


% Andrieu and Thoms (2008): tutorial on adaptive MCMC, Stat. Comput. 18(4).
@article{Andrieu2008-yc,
  author   = {Andrieu, Christophe and Thoms, Johannes},
  title    = {A tutorial on adaptive {MCMC}},
  journal  = {Stat. Comput.},
  volume   = {18},
  number   = {4},
  pages    = {343--373},
  month    = dec,
  year     = {2008},
  abstract = {We review adaptive Markov chain Monte Carlo algorithms (MCMC) as
              a mean to optimise their performance. Using simple toy examples
              we review their theoretical underpinnings, and in particular show
              why adaptive MCMC algorithms might fail when some fundamental
              properties are not satisfied. This leads to guidelines concerning
              the design of correct algorithms. We then review criteria and the
              useful framework of stochastic approximation, which allows one to
              systematically optimise generally used criteria, but also analyse
              the properties of adaptive MCMC algorithms. We then propose a
              series of novel adaptive algorithms which prove to be robust and
              reliable in practice. These algorithms are applied to artificial
              and high dimensional scenarios, but also to the classic mine
              disaster dataset inference problem.}
}


% Betancourt (2017): conceptual introduction to Hamiltonian Monte Carlo.
% arXiv preprint, so typed as misc (no journal exists to satisfy an article
% entry); the identifier is carried by the eprint fields. The same work is
% also keyed betancourt2017conceptual in this file.
@misc{Betancourt2017-ml,
  author        = {Betancourt, Michael},
  title         = {A Conceptual Introduction to {Hamiltonian} {Monte} {Carlo}},
  abstract      = {Hamiltonian Monte Carlo has proven a remarkable empirical
                   success, but only recently have we begun to develop a
                   rigorous understanding of why it performs so well on
                   difficult problems and how it is best applied in practice.
                   Unfortunately, that understanding is confined within the
                   mathematics of differential geometry which has limited its
                   dissemination, especially to the applied communities for
                   which it is particularly important. In this review I provide
                   a comprehensive conceptual account of these theoretical
                   foundations, focusing on developing a principled intuition
                   behind the method and its optimal implementations rather of
                   any exhaustive rigor. Whether a practitioner or a
                   statistician, the dedicated reader will acquire a solid
                   grasp of how Hamiltonian Monte Carlo works, when it
                   succeeds, and, perhaps most importantly, when it fails.},
  month         = jan,
  year          = 2017,
  archivePrefix = {arXiv},
  primaryClass  = {stat.ME},
  eprint        = {1701.02434}
}


% Hoffman and Gelman (2014): the No-U-Turn sampler (NUTS), JMLR 15(1).
% The abstract is a truncated excerpt; the ellipsis is written as \ldots so
% the file stays ASCII-only. The same paper is also keyed hoffman2014nuts.
@article{Hoffman2014-fl,
  author    = {Hoffman, Matthew D and Gelman, Andrew},
  title     = {The {No-U-Turn} sampler: adaptively setting path lengths in
               {Hamiltonian} {Monte} {Carlo}},
  abstract  = {Hamiltonian Monte Carlo (HMC) is a Markov chain Monte
               Carlo (MCMC) algorithm that avoids the random walk behavior and
               sensitivity to correlated parameters that plague many MCMC
               methods by taking a series of steps informed by first-order
               gradient \ldots},
  journal   = {Journal of Machine Learning Research},
  publisher = {JMLR.org},
  volume    = 15,
  number    = 1,
  pages     = {1593--1623},
  year      = 2014
}


% Kiselev, Andrews and Hemberg (2019): review of the challenges in
% unsupervised clustering of scRNA-seq data, Nat. Rev. Genet. 20(5).
@article{Kiselev2019-bt,
  author   = {Kiselev, Vladimir Yu and Andrews, Tallulah S and Hemberg, Martin},
  title    = {Challenges in unsupervised clustering of single-cell {RNA-seq}
              data},
  journal  = {Nat. Rev. Genet.},
  volume   = {20},
  number   = {5},
  pages    = {273--282},
  month    = may,
  year     = {2019},
  language = {en},
  abstract = {Single-cell RNA sequencing (scRNA-seq) allows researchers to
              collect large catalogues detailing the transcriptomes of
              individual cells. Unsupervised clustering is of central
              importance for the analysis of these data, as it is used to
              identify putative cell types. However, there are many challenges
              involved. We discuss why clustering is a challenging problem from
              a computational point of view and what aspects of the data make
              it challenging. We also consider the difficulties related to the
              biological interpretation and annotation of the identified
              clusters.}
}

% Orbanz: lecture notes on Bayesian nonparametrics (draft PDF, see url).
% Typed as misc because the notes are not a journal article; month uses the
% standard three-letter macro. NOTE(review): the key says 2012 but the year
% field says 2014 -- the key is kept so existing citations keep resolving.
@misc{orbanz2012lecture,
  author = {Orbanz, Peter},
  title  = {Lecture notes on {Bayesian} nonparametrics},
  month  = may,
  year   = {2014},
  url    = {http://www.gatsby.ucl.ac.uk/~porbanz/papers/porbanz_BNP_draft.pdf}
}

% Blei, Ng and Jordan (2003): latent Dirichlet allocation (LDA), JMLR 3.
% Publisher corrected from the mangled export string "JMLR. org".
@article{blei2003latent,
  author    = {Blei, David M and Ng, Andrew Y and Jordan, Michael I},
  title     = {Latent {Dirichlet} allocation},
  journal   = {Journal of Machine Learning Research},
  volume    = {3},
  pages     = {993--1022},
  year      = {2003},
  publisher = {JMLR.org}
}


% Blei, Kucukelbir and McAuliffe (2017): review of variational inference,
% JASA 112(518).
@article{Blei2017-yc,
  author    = {Blei, David M and Kucukelbir, Alp and McAuliffe, Jon D},
  title     = {Variational Inference: A Review for Statisticians},
  journal   = {Journal of the American Statistical Association},
  volume    = {112},
  number    = {518},
  pages     = {859--877},
  month     = apr,
  year      = {2017},
  publisher = {Taylor \& Francis}
}


% Boyd-Graber, Hu and Mimno (2017): monograph-length survey of topic model
% applications. The abstract is a truncated excerpt; the ellipsis is written
% as \ldots so the entry is ASCII-only. The journal name is spelled out and
% the registered-trademark macro dropped so the value does not depend on
% LaTeX symbol support; the issue range uses an en-dash (--).
@article{Boyd-Graber2017-qk,
  author    = {Boyd-Graber, Jordan and Hu, Yuening and Mimno, David},
  title     = {Applications of topic models},
  abstract  = {How can a single person understand what's going on in a
               collection of millions of documents? This is an increasingly
               common problem: sifting through an organization's e-mails,
               understanding a decade worth of newspapers, or characterizing a
               scientific field's \ldots},
  journal   = {Foundations and Trends in Information Retrieval},
  publisher = {Now Publishers},
  volume    = 11,
  number    = {2--3},
  pages     = {143--296},
  year      = 2017,
  language  = {en}
}

% Blei (2012): overview of probabilistic topic models, CACM 55(4).
% The export had the publisher's city fused into the publisher field; the
% two are split into publisher and address.
@article{blei2012probabilistic,
  author    = {Blei, David M},
  title     = {Probabilistic topic models},
  journal   = {Communications of the ACM},
  volume    = {55},
  number    = {4},
  pages     = {77--84},
  year      = {2012},
  publisher = {ACM},
  address   = {New York, NY, USA}
}

% Blei and Lafferty (2006): dynamic topic models, ICML 2006.
% Booktitle capitalised to match the other ICML proceedings entries in this
% file (booktitle is not recased by standard styles).
@inproceedings{blei2006dynamic,
  author    = {Blei, David M and Lafferty, John D},
  title     = {Dynamic topic models},
  booktitle = {Proceedings of the 23rd International Conference on Machine Learning},
  pages     = {113--120},
  year      = {2006}
}

% Wallach, Murray, Salakhutdinov and Mimno (2009): evaluation methods for
% topic models, ICML 2009.
@inproceedings{wallach2009evaluation,
  author    = "Wallach, Hanna M and Murray, Iain and Salakhutdinov, Ruslan and Mimno, David",
  title     = "Evaluation methods for topic models",
  booktitle = "Proceedings of the 26th Annual International Conference on Machine Learning",
  pages     = "1105--1112",
  year      = 2009
}


% Gopalan, Hofman and Blei (2013): Poisson factorization for recommendation.
% arXiv preprint, so typed as misc (there is no journal to satisfy an article
% entry); the identifier is carried by the eprint fields.
@misc{Gopalan2013-bc,
  author        = {Gopalan, Prem and Hofman, Jake M and Blei, David M},
  title         = {Scalable Recommendation with {Poisson} Factorization},
  month         = nov,
  year          = 2013,
  archivePrefix = {arXiv},
  primaryClass  = {cs.IR},
  eprint        = {1311.1704}
}


% Gopalan and Blei (2013): overlapping community detection in massive
% networks, PNAS 110(36).
@article{Gopalan2013-pz,
  author   = {Gopalan, Prem K and Blei, David M},
  title    = {Efficient discovery of overlapping communities in massive
              networks},
  journal  = {Proc. Natl. Acad. Sci. U. S. A.},
  volume   = {110},
  number   = {36},
  pages    = {14534--14539},
  month    = sep,
  year     = {2013},
  keywords = {Bayesian statistics; massive data; network analysis},
  language = {en}
}

% Hoffman, Blei, Wang and Paisley (2013): stochastic variational inference.
% The trailing period was removed from the title (styles add their own
% terminal punctuation) and the missing page range was added.
@article{hoffman2013stochastic,
  author  = {Hoffman, Matthew D and Blei, David M and Wang, Chong and Paisley, John},
  title   = {Stochastic variational inference},
  journal = {Journal of Machine Learning Research},
  volume  = {14},
  number  = {5},
  pages   = {1303--1347},
  year    = {2013}
}

% Neal (2011): "MCMC using Hamiltonian dynamics", chapter 5 of the Handbook
% of Markov Chain Monte Carlo. This is a chapter in an edited book, so it is
% typed as incollection; the exported volume/number/pages values (2, 11, 2)
% were placeholders and are replaced by the chapter and page range.
% The arXiv preprint of the same work is keyed Neal2012-ev.
@incollection{neal2012mcmc,
  author    = {Neal, Radford M},
  title     = {{MCMC} using {Hamiltonian} dynamics},
  booktitle = {Handbook of {Markov} Chain {Monte} {Carlo}},
  editor    = {Brooks, Steve and Gelman, Andrew and Jones, Galin L. and Meng, Xiao-Li},
  chapter   = {5},
  pages     = {113--162},
  year      = {2011},
  publisher = {CRC Press}
}

% Betancourt (2017): conceptual introduction to Hamiltonian Monte Carlo.
% arXiv preprint: the identifier is moved out of the journal field into the
% dedicated eprint fields and the entry is typed as misc. The same work is
% also keyed Betancourt2017-ml in this file.
@misc{betancourt2017conceptual,
  author        = {Betancourt, Michael},
  title         = {A conceptual introduction to {Hamiltonian} {Monte} {Carlo}},
  year          = {2017},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ME},
  eprint        = {1701.02434}
}

% Hoffman and Gelman (2014): the No-U-Turn sampler, JMLR 15(1).
% The same paper is also keyed Hoffman2014-fl in this file.
@article{hoffman2014nuts,
  author  = "Hoffman, Matthew D and Gelman, Andrew",
  title   = "The {No-U-Turn} sampler: adaptively setting path lengths in {H}amiltonian {M}onte {C}arlo",
  journal = "Journal of Machine Learning Research",
  volume  = 15,
  number  = 1,
  pages   = "1593--1623",
  year    = 2014
}

% Amari (1998): natural gradient learning, Neural Computation 10(2).
% Journal name capitalised; standard styles print journal names verbatim,
% so the lowercase "computation" would appear in the bibliography.
@article{amari1998natural,
  author    = {Amari, Shun-Ichi},
  title     = {Natural gradient works efficiently in learning},
  journal   = {Neural Computation},
  volume    = {10},
  number    = {2},
  pages     = {251--276},
  year      = {1998},
  publisher = {MIT Press}
}


% Erosheva, Fienberg and Joutard (2007): Grade of Membership / individual-level
% mixture models for disability data, Annals of Applied Statistics 1(2).
% The title was exported in ALL CAPS and is restored to title case; the
% publisher field held a web domain and now names the journal's publisher.
@article{Erosheva2007-vd,
  author    = {Erosheva, Elena A and Fienberg, Stephen E and Joutard, Cyrille},
  title     = {Describing Disability Through Individual-Level Mixture
               Models for Multivariate Binary Data},
  abstract  = {Data on functional disability are of widespread policy interest
               in the United States, especially with respect to planning for
               Medicare and Social Security for a growing population of elderly
               adults. We consider an extract of functional disability data
               from the National Long Term Care Survey (NLTCS) and attempt to
               develop disability profiles using variations of the Grade of
               Membership (GoM) model. We first describe GoM as an
               individual-level mixture model that allows individuals to have
               partial membership in several mixture components simultaneously.
               We then prove the equivalence between individual-level and
               population-level mixture models, and use this property to
               develop a Markov Chain Monte Carlo algorithm for Bayesian
               estimation of the model. We use our approach to analyze
               functional disability data from the NLTCS.},
  journal   = {Annals of Applied Statistics},
  publisher = {Institute of Mathematical Statistics},
  volume    = 1,
  number    = 2,
  pages     = {346--384},
  year      = 2007,
  language  = {en}
}

% Hofmann (1999): probabilistic latent semantic indexing (pLSI), SIGIR 1999.
% This is a conference paper, so it is typed as inproceedings with the full
% proceedings title in booktitle (the export had a truncated title in the
% journal field and a web domain as publisher). The abstract is a truncated
% excerpt; the ellipsis is written as \ldots so the entry is ASCII-only.
@inproceedings{Hofmann1999-de,
  author    = {Hofmann, Thomas},
  title     = {Probabilistic latent semantic indexing},
  booktitle = {Proceedings of the 22nd Annual International {ACM} {SIGIR}
               Conference on Research and Development in Information
               Retrieval},
  abstract  = {Probabilistic Latent Semantic Indexing is a novel
               approach to automated document indexing which is based on a
               statistical latent class model for factor analysis of count
               data. Fitted from a training corpus of text documents by a
               generalization of the Expectation \ldots},
  publisher = {ACM},
  pages     = {50--57},
  year      = 1999
}

% Deerwester, Dumais, Furnas, Landauer and Harshman (1990): latent semantic
% analysis/indexing via singular-value decomposition.
% The abstract had mis-encoded characters exported as "?" (quotation marks
% and the copyright sign) and a stray leading "Abstract"; these are repaired
% using LaTeX equivalents.
@article{Deerwester1990-kp,
  author    = {Deerwester, Scott and Dumais, Susan T and Furnas, George W and
               Landauer, Thomas K and Harshman, Richard},
  title     = {Indexing by latent semantic analysis},
  abstract  = {A new method for automatic indexing and retrieval is
               described. The approach is to take advantage of implicit
               higher-order structure in the association of terms with
               documents (``semantic structure'') in order to improve the
               detection of relevant documents on the basis of terms found in
               queries. The particular technique used is singular-value
               decomposition, in which a large term by document matrix is
               decomposed into a set of ca. 100 orthogonal factors from which
               the original matrix can be approximated by linear combination.
               Documents are represented by ca. 100 item vectors of factor
               weights. Queries are represented as pseudo-document vectors
               formed from weighted combinations of terms, and documents with
               supra-threshold cosine values are returned. Initial tests find
               this completely automatic method for retrieval to be promising.
               \copyright{} 1990 John Wiley \& Sons, Inc.},
  journal   = {Journal of the American Society for Information Science},
  publisher = {Wiley},
  volume    = 41,
  number    = 6,
  pages     = {391--407},
  month     = sep,
  year      = 1990,
  language  = {en}
}


% Pritchard, Stephens and Donnelly (2000): model-based inference of
% population structure from multilocus genotypes, Genetics 155(2).
% Repairs export artefacts in the abstract (the dash before "e.g." and the
% software URL, whose tilde had been spelled out as "approximately"), gives
% the authors' full first names, and replaces the web domain in publisher.
% NOTE(review): publisher reflects the journal's society publisher at the
% time of publication -- confirm if a current imprint is preferred.
@article{Pritchard2000-wm,
  author    = {Pritchard, Jonathan K and Stephens, Matthew and Donnelly, Peter},
  title     = {Inference of population structure using multilocus genotype data},
  abstract  = {We describe a model-based clustering method for using multilocus
               genotype data to infer population structure and assign
               individuals to populations. We assume a model in which there are
               K populations (where K may be unknown), each of which is
               characterized by a set of allele frequencies at each locus.
               Individuals in the sample are assigned (probabilistically) to
               populations, or jointly to two or more populations if their
               genotypes indicate that they are admixed. Our model does not
               assume a particular mutation process, and it can be applied to
               most of the commonly used genetic markers, provided that they
               are not closely linked. Applications of our method include
               demonstrating the presence of population structure, assigning
               individuals to populations, studying hybrid zones, and
               identifying migrants and admixed individuals. We show that the
               method can produce highly accurate assignments using modest
               numbers of loci---e.g., seven microsatellite loci in an example
               using genotype data from an endangered bird species. The
               software used for this article is available from
               http://www.stats.ox.ac.uk/\textasciitilde{}pritch/home.html.},
  journal   = {Genetics},
  publisher = {Genetics Society of America},
  volume    = 155,
  number    = 2,
  pages     = {945--959},
  month     = jun,
  year      = 2000,
  language  = {en}
}


% Airoldi, Blei, Fienberg and Xing (2008): mixed membership stochastic
% blockmodels, JMLR 9.
% Journal name spelled out to match the other JMLR entries in this file;
% abstract typo "probabilisic" corrected.
@article{Airoldi2008-rh,
  author   = {Airoldi, Edoardo M and Blei, David M and Fienberg, Stephen E and
              Xing, Eric P},
  title    = {Mixed Membership Stochastic Blockmodels},
  abstract = {Observations consisting of measurements on relationships for
              pairs of objects arise in many settings, such as protein
              interaction and gene regulatory networks, collections of
              author-recipient email, and social networks. Analyzing such data
              with probabilistic models can be delicate because the simple
              exchangeability assumptions underlying many boilerplate models no
              longer hold. In this paper, we describe a latent variable model
              of such data called the mixed membership stochastic blockmodel.
              This model extends blockmodels for relational data to ones which
              capture mixed membership latent relational structure, thus
              providing an object-specific low-dimensional representation. We
              develop a general variational inference algorithm for fast
              approximate posterior inference. We explore applications to
              social and protein interaction networks.},
  journal  = {Journal of Machine Learning Research},
  volume   = 9,
  pages    = {1981--2014},
  month    = sep,
  year     = 2008,
  language = {en}
}

% Bishop (2006): Pattern Recognition and Machine Learning textbook, Springer.
@book{bishop2006pattern,
  author    = "Bishop, Christopher M",
  title     = "Pattern recognition and machine learning",
  publisher = "Springer",
  year      = 2006
}

% Cremer, Li and Duvenaud (2018): inference suboptimality in variational
% autoencoders, ICML 2018.
@inproceedings{cremer2018inference,
  author       = "Cremer, Chris and Li, Xuechen and Duvenaud, David",
  title        = "Inference suboptimality in variational autoencoders",
  booktitle    = "International Conference on Machine Learning",
  pages        = "1078--1086",
  year         = 2018,
  organization = "PMLR"
}

% Williams and Rasmussen (1996): "Gaussian processes for regression".
% This is a paper in the NIPS 8 proceedings volume (published by MIT Press),
% not a standalone book, so it is typed as inproceedings with booktitle,
% editors and pages. The glued initials "KI" are separated so name parsing
% and abbreviation work.
% NOTE(review): if the intent was to cite the 2006 GPML textbook, that needs
% a separate book entry.
@inproceedings{williams1996gaussian,
  author    = {Williams, Christopher K. I. and Rasmussen, Carl Edward},
  title     = {{Gaussian} processes for regression},
  booktitle = {Advances in Neural Information Processing Systems 8},
  editor    = {Touretzky, David S. and Mozer, Michael C. and Hasselmo, Michael E.},
  pages     = {514--520},
  year      = {1996},
  publisher = {MIT Press}
}


% Storkey (1999): truncated covariance functions and Toeplitz methods for
% Gaussian processes, ICANN 99.
% Cleans export artefacts: pages no longer carry the "vol.1" suffix (the
% volume is already in its own field), the placeholder publisher "unknown"
% is replaced, the booktitle is un-mangled with the conference-publication
% number moved to note, and proper nouns in the title are brace-protected.
% NOTE(review): publisher taken from the "Conf. Publ. No. 470" series
% designation -- confirm.
@inproceedings{Storkey1999-wq,
  author    = {Storkey, Amos J},
  title     = {Truncated covariance matrices and {Toeplitz} methods in
               {Gaussian} processes},
  booktitle = {Ninth International Conference on Artificial Neural Networks
               ({ICANN} 99)},
  abstract  = {Gaussian processes are a limit extension of neural networks.
               Standard Gaussian process techniques use a squared exponential
               covariance function. Here, the use of truncated covariances is
               proposed. Such covariances have compact support. Their use
               speeds up matrix inversion and increases precision. Furthermore
               they allow the use of speedy, memory efficient Toeplitz
               inversion for high dimensional grid based Gaussian process
               predictors},
  publisher = {IEE},
  volume    = 1,
  pages     = {55--60},
  month     = feb,
  year      = 1999,
  note      = {Conf. Publ. No. 470}
}


% Cunningham, Shenoy and Sahani (2008): fast Gaussian process methods for
% point process intensity estimation, ICML 2008.
@inproceedings{Cunningham2008-zj,
  author    = {Cunningham, John P and Shenoy, Krishna V and Sahani, Maneesh},
  title     = {Fast {G}aussian process methods for point process intensity
               estimation},
  booktitle = {Proceedings of the 25th International Conference on Machine
               Learning},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  location  = {Helsinki, Finland},
  pages     = {192--199},
  month     = jul,
  year      = {2008},
  abstract  = {Point processes are difficult to analyze because they provide
               only a sparse and noisy observation of the intensity function
               driving the process. Gaussian Processes offer an attractive
               framework within which to infer underlying intensity functions.
               The result of this inference is a continuous function defined
               across time that is typically more amenable to analytical
               efforts. However, a naive implementation will become
               computationally infeasible in any problem of reasonable size,
               both in memory and run time requirements. We demonstrate problem
               specific methods for a class of renewal processes that eliminate
               the memory burden and reduce the solve time by orders of
               magnitude.}
}

% Duvenaud, Lloyd, Grosse, Tenenbaum and Ghahramani (2013): compositional
% kernel search for structure discovery, ICML 2013.
% The last author's name was reversed ("Zoubin, Ghahramani"), which made
% "Zoubin" the surname; it is corrected to "Ghahramani, Zoubin".
@inproceedings{duvenaud2013structure,
  author       = {Duvenaud, David and Lloyd, James and Grosse, Roger and Tenenbaum, Joshua and Ghahramani, Zoubin},
  title        = {Structure discovery in nonparametric regression through compositional kernel search},
  booktitle    = {International Conference on Machine Learning},
  pages        = {1166--1174},
  year         = {2013},
  organization = {PMLR}
}


% Neal (2003): slice sampling, Annals of Statistics 31(3).
% Fixes the abstract typo "ample" -> "sample" and drops the dangling
% trailing separator from the keywords list.
@article{Neal2003-zu,
  author    = {Neal, Radford M},
  title     = {Slice sampling},
  abstract  = {Markov chain sampling methods that adapt to characteristics of
               the distribution being sampled can be constructed using the
               principle that one can sample from a distribution by sampling
               uniformly from the region under the plot of its density
               function. A Markov chain that converges to this uniform
               distribution can be constructed by alternating uniform sampling
               in the vertical direction with uniform sampling from the
               horizontal ``slice'' defined by the current vertical position,
               or more generally, with some update that leaves the uniform
               distribution over this slice invariant. Such ``slice sampling''
               methods are easily implemented for univariate distributions, and
               can be used to sample from a multivariate distribution by
               updating each variable in turn. This approach is often easier to
               implement than Gibbs sampling and more efficient than simple
               Metropolis updates, due to the ability of slice sampling to
               adaptively choose the magnitude of changes made. It is therefore
               attractive for routine and automated use. Slice sampling methods
               that update all variables simultaneously are also possible.
               These methods can adaptively choose the magnitudes of changes
               made to each variable, based on the local properties of the
               density function. More ambitiously, such methods could
               potentially adapt to the dependencies between variables by
               constructing local quadratic approximations. Another approach is
               to improve sampling efficiency by suppressing random walks. This
               can be done for univariate slice sampling by ``overrelaxation,''
               and for multivariate slice sampling by ``reflection'' from the
               edges of the slice.},
  journal   = {Annals of Statistics},
  publisher = {Institute of Mathematical Statistics},
  volume    = 31,
  number    = 3,
  pages     = {705--767},
  month     = jun,
  year      = 2003,
  keywords  = {65C05; 65C60; Adaptive methods; auxiliary variables; dynamical
               methods; Gibbs sampling; Markov chain Monte Carlo; Metropolis
               algorithm; overrelaxation},
  language  = {en}
}


% Murray, Adams and MacKay (2010): elliptical slice sampling, AISTATS 2010.
% The publisher field held a web domain; publisher/series/volume now follow
% the same PMLR convention as the other AISTATS entry in this file
% (Titsias2009-ls). The nonstandard "conference" field, which only repeated
% booktitle, is dropped.
@inproceedings{Murray2010-zb,
  author    = {Murray, Iain and Adams, Ryan and MacKay, David},
  title     = {Elliptical slice sampling},
  booktitle = {Proceedings of the Thirteenth International Conference on
               Artificial Intelligence and Statistics},
  abstract  = {Many probabilistic models introduce strong dependencies between
               variables using a latent multivariate Gaussian distribution or
               a Gaussian process. We present a new Markov chain Monte Carlo
               algorithm for performing inference in models with multivariate
               Gaussian priors. Its key properties are: 1) it has simple,
               generic code applicable to many models, 2) it has no free
               parameters, 3) it works well for a variety of Gaussian process
               based models. These properties make our method ideal for use
               while model building, removing the need to spend time deriving
               and tuning updates for more complex algorithms.},
  publisher = {PMLR},
  volume    = 9,
  pages     = {541--548},
  series    = {Proceedings of Machine Learning Research},
  month     = mar,
  year      = 2010,
  language  = {en}
}


% Hensman, Fusi and Lawrence (2013): stochastic variational inference for
% Gaussian process models, UAI 2013.
@inproceedings{Hensman2013-cf,
  author    = {Hensman, James and Fusi, Nicol{\`o} and Lawrence, Neil D},
  title     = {Gaussian processes for Big data},
  booktitle = {Proceedings of the {Twenty-Ninth} Conference on Uncertainty in
               Artificial Intelligence},
  series    = {UAI'13},
  publisher = {AUAI Press},
  address   = {Arlington, Virginia, USA},
  location  = {Bellevue, WA},
  pages     = {282--290},
  month     = aug,
  year      = {2013},
  abstract  = {We introduce stochastic variational inference for Gaussian
               process models. This enables the application of Gaussian process
               (GP) models to data sets containing millions of data points. We
               show how GPs can be variationally decomposed to depend on a set
               of globally relevant inducing variables which factorize the
               model in the necessary manner to perform variational inference.
               Our approach is readily extended to models with non-Gaussian
               likelihoods and latent variable models based around Gaussian
               processes. We demonstrate the approach on a simple toy problem
               and two real world data sets.}
}


% Titsias (2009): variational learning of inducing variables for sparse
% Gaussian processes, AISTATS 2009 (PMLR volume 5).
% Fixes the booktitle typo "Twelth" -> "Twelfth" and brace-protects
% "Gaussian" so sentence-casing styles do not lowercase it.
@inproceedings{Titsias2009-ls,
  author    = {Titsias, Michalis},
  title     = {Variational Learning of Inducing Variables in Sparse {Gaussian}
               Processes},
  booktitle = {Proceedings of the Twelfth International Conference on Artificial
               Intelligence and Statistics},
  editor    = {van Dyk, David and Welling, Max},
  abstract  = {Sparse Gaussian process methods that use inducing variables
               require the selection of the inducing inputs and the kernel
               hyperparameters. We introduce a variational formulation for
               sparse approximations that jointly infers the inducing inputs
               and the kernel hyperparameters by maximizing a lower bound of
               the true log marginal likelihood. The key property of this
               formulation is that the inducing inputs are defined to be
               variational parameters which are selected by minimizing the
               Kullback-Leibler divergence between the variational distribution
               and the exact posterior distribution over the latent function
               values. We apply this technique to regression and we compare it
               with other approaches in the literature.},
  publisher = {PMLR},
  volume    = 5,
  pages     = {567--574},
  series    = {Proceedings of Machine Learning Research},
  year      = 2009,
  address   = {Hilton Clearwater Beach Resort, Clearwater Beach, Florida USA}
}


@ARTICLE{Miller2018-dv,
  title    = "Mixture models with a prior on the number of components",
  author   = "Miller, Jeffrey W and Harrison, Matthew T",
  abstract = "A natural Bayesian approach for mixture models with an unknown
              number of components is to take the usual finite mixture model
              with symmetric Dirichlet weights, and put a prior on the number
              of components---that is, to use a mixture of finite mixtures
              (MFM). The most commonly-used method of inference for MFMs is
              reversible jump Markov chain Monte Carlo, but it can be
              nontrivial to design good reversible jump moves, especially in
              high-dimensional spaces. Meanwhile, there are samplers for
              Dirichlet process mixture (DPM) models that are relatively simple
              and are easily adapted to new applications. It turns out that, in
              fact, many of the essential properties of DPMs are also exhibited
              by MFMs---an exchangeable partition distribution, restaurant
              process, random measure representation, and stick-breaking
              representation---and crucially, the MFM analogues are simple
              enough that they can be used much like the corresponding DPM
              properties. Consequently, many of the powerful methods developed
              for inference in DPMs can be directly applied to MFMs as well;
              this simplifies the implementation of MFMs and can substantially
              improve mixing. We illustrate with real and simulated data,
              including high-dimensional gene expression data used to
              discriminate cancer subtypes.",
  journal  = "Journal of the American Statistical Association",
  volume   =  113,
  number   =  521,
  pages    = "340--356",
  year     =  2018,
  keywords = "Bayesian; clustering; density estimation; model selection;
              nonparametric",
  language = "en"
}

@BOOK{neal1996bayesian,
  author    = "Neal, Radford M",
  title     = "Bayesian learning for neural networks",
  publisher = "Springer Science \& Business Media",
  year      =  1996
}


@MISC{Jacot2018-dl,
  title         = "Neural Tangent Kernel: Convergence and Generalization in
                   Neural Networks",
  author        = "Jacot, Arthur and Gabriel, Franck and Hongler, Cl{\'e}ment",
  abstract      = "At initialization, artificial neural networks (ANNs) are
                   equivalent to Gaussian processes in the infinite-width
                   limit, thus connecting them to kernel methods. We prove that
                   the evolution of an ANN during training can also be
                   described by a kernel: during gradient descent on the
                   parameters of an ANN, the network function $f_\theta$ (which
                   maps input vectors to output vectors) follows the kernel
                   gradient of the functional cost (which is convex, in
                   contrast to the parameter cost) w.r.t. a new kernel: the
                   Neural Tangent Kernel (NTK). This kernel is central to
                   describe the generalization features of ANNs. While the NTK
                   is random at initialization and varies during training, in
                   the infinite-width limit it converges to an explicit
                   limiting kernel and it stays constant during training. This
                   makes it possible to study the training of ANNs in function
                   space instead of parameter space. Convergence of the
                   training can then be related to the positive-definiteness of
                   the limiting NTK. We prove the positive-definiteness of the
                   limiting NTK when the data is supported on the sphere and
                   the non-linearity is non-polynomial. We then focus on the
                   setting of least-squares regression and show that in the
                   infinite-width limit, the network function $f_\theta$
                   follows a linear differential equation during training. The
                   convergence is fastest along the largest kernel principal
                   components of the input data with respect to the NTK, hence
                   suggesting a theoretical motivation for early stopping.
                   Finally we study the NTK numerically, observe its behavior
                   for wide networks, and compare it to the infinite-width
                   limit.",
  month         =  jun,
  year          =  2018,
  archivePrefix = "arXiv",
  primaryClass  = "cs.LG",
  eprint        = "1806.07572"
}

@MISC{Arora2019-uc,
  title         = "On Exact Computation with an Infinitely Wide Neural Net",
  author        = "Arora, Sanjeev and Du, Simon S and Hu, Wei and Li, Zhiyuan
                   and Salakhutdinov, Ruslan and Wang, Ruosong",
  abstract      = "How well does a classic deep net architecture like AlexNet
                   or VGG19 classify on a standard dataset such as CIFAR-10
                   when its width --- namely, number of channels in
                   convolutional layers, and number of nodes in fully-connected
                   internal layers --- is allowed to increase to infinity? Such
                   questions have come to the forefront in the quest to
                   theoretically understand deep learning and its mysteries
                   about optimization and generalization. They also connect
                   deep learning to notions such as Gaussian processes and
                   kernels. A recent paper [Jacot et al., 2018] introduced the
                   Neural Tangent Kernel (NTK) which captures the behavior of
                   fully-connected deep nets in the infinite width limit
                   trained by gradient descent; this object was implicit in
                   some other recent papers. An attraction of such ideas is
                   that a pure kernel-based method is used to capture the power
                   of a fully-trained deep net of infinite width. The current
                   paper gives the first efficient exact algorithm for
                   computing the extension of NTK to convolutional neural nets,
                   which we call Convolutional NTK (CNTK), as well as an
                   efficient GPU implementation of this algorithm. This results
                   in a significant new benchmark for the performance of a pure
                   kernel-based method on CIFAR-10, being $10\%$ higher than
                   the methods reported in [Novak et al., 2019], and only $6\%$
                   lower than the performance of the corresponding finite deep
                   net architecture (once batch normalization, etc. are turned
                   off). Theoretically, we also give the first non-asymptotic
                   proof showing that a fully-trained sufficiently wide net is
                   indeed equivalent to the kernel regression predictor using
                   NTK.",
  month         =  apr,
  year          =  2019,
  archivePrefix = "arXiv",
  primaryClass  = "cs.LG",
  eprint        = "1904.11955"
}

@MISC{Yang2019-uu,
  title         = "Scaling Limits of Wide Neural Networks with Weight Sharing:
                   Gaussian Process Behavior, Gradient Independence, and Neural
                   Tangent Kernel Derivation",
  author        = "Yang, Greg",
  abstract      = "Several recent trends in machine learning theory and
                   practice, from the design of state-of-the-art Gaussian
                   Process to the convergence analysis of deep neural nets
                   (DNNs) under stochastic gradient descent (SGD), have found
                   it fruitful to study wide random neural networks. Central to
                   these approaches are certain scaling limits of such
                   networks. We unify these results by introducing a notion of
                   a straightline \emph{tensor program} that can
                   express most neural network computations, and we
                   characterize its scaling limit when its tensors are large
                   and randomized. From our framework follows (1) the
                   convergence of random neural networks to Gaussian processes
                   for architectures such as recurrent neural networks,
                   convolutional neural networks, residual networks, attention,
                   and any combination thereof, with or without batch
                   normalization; (2) conditions under which the
                   \emph{gradient independence assumption} --
                   that weights in backpropagation can be assumed to be
                   independent from weights in the forward pass -- leads to
                   correct computation of gradient dynamics, and corrections
                   when it does not; (3) the convergence of the Neural Tangent
                   Kernel, a recently proposed kernel used to predict training
                   dynamics of neural networks under gradient descent, at
                   initialization for all architectures in (1) without batch
                   normalization. Mathematically, our framework is general
                   enough to rederive classical random matrix results such as
                   the semicircle and the Marchenko-Pastur laws, as well as
                   recent results in neural network Jacobian singular values.
                   We hope our work opens a way toward design of even stronger
                   Gaussian Processes, initialization schemes to avoid gradient
                   explosion/vanishing, and deeper understanding of SGD
                   dynamics in modern architectures.",
  month         =  feb,
  year          =  2019,
  archivePrefix = "arXiv",
  primaryClass  = "cs.NE",
  eprint        = "1902.04760"
}

@MISC{Novak2018-qq,
  title         = "Bayesian Deep Convolutional Networks with Many Channels are
                   Gaussian Processes",
  author        = "Novak, Roman and Xiao, Lechao and Lee, Jaehoon and Bahri,
                   Yasaman and Yang, Greg and Hron, Jiri and Abolafia, Daniel A
                   and Pennington, Jeffrey and Sohl-Dickstein, Jascha",
  abstract      = "There is a previously identified equivalence between wide
                   fully connected neural networks (FCNs) and Gaussian
                   processes (GPs). This equivalence enables, for instance,
                   test set predictions that would have resulted from a fully
                   Bayesian, infinitely wide trained FCN to be computed without
                   ever instantiating the FCN, but by instead evaluating the
                   corresponding GP. In this work, we derive an analogous
                   equivalence for multi-layer convolutional neural networks
                   (CNNs) both with and without pooling layers, and achieve
                   state of the art results on CIFAR10 for GPs without
                   trainable kernels. We also introduce a Monte Carlo method to
                   estimate the GP corresponding to a given neural network
                   architecture, even in cases where the analytic form has too
                   many terms to be computationally feasible. Surprisingly, in
                   the absence of pooling layers, the GPs corresponding to CNNs
                   with and without weight sharing are identical. As a
                   consequence, translation equivariance, beneficial in finite
                   channel CNNs trained with stochastic gradient descent (SGD),
                   is guaranteed to play no role in the Bayesian treatment of
                   the infinite channel limit - a qualitative difference
                   between the two regimes that is not present in the FCN case.
                   We confirm experimentally, that while in some scenarios the
                   performance of SGD-trained finite CNNs approaches that of
                   the corresponding GPs as the channel count increases, with
                   careful tuning SGD-trained CNNs can significantly outperform
                   their corresponding GPs, suggesting advantages from SGD
                   training compared to fully Bayesian parameter estimation.",
  month         =  oct,
  year          =  2018,
  archivePrefix = "arXiv",
  primaryClass  = "stat.ML",
  eprint        = "1810.05148"
}

@MISC{Lee2017-sm,
  title         = "Deep Neural Networks as Gaussian Processes",
  author        = "Lee, Jaehoon and Bahri, Yasaman and Novak, Roman and
                   Schoenholz, Samuel S and Pennington, Jeffrey and
                   Sohl-Dickstein, Jascha",
  abstract      = "It has long been known that a single-layer fully-connected
                   neural network with an i.i.d. prior over its parameters is
                   equivalent to a Gaussian process (GP), in the limit of
                   infinite network width. This correspondence enables exact
                   Bayesian inference for infinite width neural networks on
                   regression tasks by means of evaluating the corresponding
                   GP. Recently, kernel functions which mimic multi-layer
                   random neural networks have been developed, but only outside
                   of a Bayesian framework. As such, previous work has not
                   identified that these kernels can be used as covariance
                   functions for GPs and allow fully Bayesian prediction with a
                   deep neural network. In this work, we derive the exact
                   equivalence between infinitely wide deep networks and GPs.
                   We further develop a computationally efficient pipeline to
                   compute the covariance function for these GPs. We then use
                   the resulting GPs to perform Bayesian inference for wide
                   deep neural networks on MNIST and CIFAR-10. We observe that
                   trained neural network accuracy approaches that of the
                   corresponding GP with increasing layer width, and that the
                   GP uncertainty is strongly correlated with trained network
                   prediction error. We further find that test performance
                   increases as finite-width trained networks are made wider
                   and more similar to a GP, and thus that GP predictions
                   typically outperform those of finite-width networks. Finally
                   we connect the performance of these GPs to the recent theory
                   of signal propagation in random neural networks.",
  month         =  nov,
  year          =  2017,
  archivePrefix = "arXiv",
  primaryClass  = "stat.ML",
  eprint        = "1711.00165"
}

@MISC{De_G_Matthews2018-vo,
  title         = "Gaussian Process Behaviour in Wide Deep Neural Networks",
  author        = "de G. Matthews, Alexander G and Rowland, Mark and Hron, Jiri
                   and Turner, Richard E and Ghahramani, Zoubin",
  abstract      = "Whilst deep neural networks have shown great empirical
                   success, there is still much work to be done to understand
                   their theoretical properties. In this paper, we study the
                   relationship between random, wide, fully connected,
                   feedforward networks with more than one hidden layer and
                   Gaussian processes with a recursive kernel definition. We
                   show that, under broad conditions, as we make the
                   architecture increasingly wide, the implied random function
                   converges in distribution to a Gaussian process, formalising
                   and extending existing results by Neal (1996) to deep
                   networks. To evaluate convergence rates empirically, we use
                   maximum mean discrepancy. We then compare finite Bayesian
                   deep networks from the literature to Gaussian processes in
                   terms of the key predictive quantities of interest, finding
                   that in some cases the agreement can be very close. We
                   discuss the desirability of Gaussian process behaviour and
                   review non-Gaussian alternative models from the literature.",
  month         =  apr,
  year          =  2018,
  archivePrefix = "arXiv",
  primaryClass  = "stat.ML",
  eprint        = "1804.11271"
}

@MISC{Garriga-Alonso2018-jx,
  title         = "Deep Convolutional Networks as shallow Gaussian Processes",
  author        = "Garriga-Alonso, Adri{\`a} and Rasmussen, Carl Edward and
                   Aitchison, Laurence",
  abstract      = "We show that the output of a (residual) convolutional neural
                   network (CNN) with an appropriate prior over the weights and
                   biases is a Gaussian process (GP) in the limit of infinitely
                   many convolutional filters, extending similar results for
                   dense networks. For a CNN, the equivalent kernel can be
                   computed exactly and, unlike ``deep kernels'', has very few
                   parameters: only the hyperparameters of the original CNN.
                   Further, we show that this kernel has two properties that
                   allow it to be computed efficiently; the cost of evaluating
                   the kernel for a pair of images is similar to a single
                   forward pass through the original CNN with only one filter
                   per layer. The kernel equivalent to a 32-layer ResNet
                   obtains 0.84\% classification error on MNIST, a new record
                   for GPs with a comparable number of parameters.",
  month         =  aug,
  year          =  2018,
  archivePrefix = "arXiv",
  primaryClass  = "stat.ML",
  eprint        = "1808.05587"
}


@misc{Wilson_undated-sv,
  author       = {Wilson, Andrew Gordon},
  title        = {Bayesian neural networks from a Gaussian process perspective},
  year         = {2020},
  howpublished = {\url{http://gpss.cc/gpss20/slides/Wilson2020_part2.pdf}},
  note         = {Accessed: 2021-5-26}
}

@article{shahriari2015taking,
  title={Taking the human out of the loop: A review of {Bayesian} optimization},
  author={Shahriari, Bobak and Swersky, Kevin and Wang, Ziyu and Adams, Ryan P and de Freitas, Nando},
  journal={Proceedings of the IEEE},
  volume={104},
  number={1},
  pages={148--175},
  year={2015},
  publisher={IEEE}
}


@ARTICLE{Blei2014-gr,
  title     = "Build, Compute, Critique, Repeat: Data Analysis with Latent
               Variable Models",
  author    = "Blei, David M",
  abstract  = "We survey latent variable models for solving data-analysis
               problems. A latent variable model is a probabilistic model that
               encodes hidden patterns in the data. We uncover these patterns
               from their conditional distribution and use them to summarize
               data and form predictions. Latent variable models are important
               in many fields, including computational biology, natural
               language processing, and social network analysis. Our
               perspective is that models are developed iteratively: We build a
               model, use it to analyze data, assess how it succeeds and fails,
               revise it, and repeat. We describe how new research has
               transformed these essential activities. First, we describe
               probabilistic graphical models, a language for formulating
               latent variable models. Second, we describe mean field
               variational inference, a generic algorithm for approximating
               conditional distributions. Third, we describe how to use our
               analyses to solve problems: exploring the data, forming
               predictions, and pointing us in the direction of improved
               models.",
  journal   = "Annual Review of Statistics and Its Application",
  publisher = "Annual Reviews",
  volume    =  1,
  number    =  1,
  pages     = "203--232",
  month     =  jan,
  year      =  2014
}

@article{diaconis1980finite,
  title={Finite exchangeable sequences},
  author={Diaconis, Persi and Freedman, David},
  journal={The Annals of Probability},
  volume={8},
  number={4},
  pages={745--764},
  year={1980},
  publisher={JSTOR}
}

@article{diaconis1980finetti,
  title={de {Finetti}'s theorem for {Markov} chains},
  author={Diaconis, Persi and Freedman, David},
  journal={The Annals of Probability},
  volume={8},
  number={1},
  pages={115--130},
  year={1980},
  publisher={JSTOR}
}

@ARTICLE{aldous1981representations,
  author    = "Aldous, David J",
  title     = "Representations for partially exchangeable arrays of random variables",
  journal   = "Journal of Multivariate Analysis",
  publisher = "Elsevier",
  volume    =  11,
  number    =  4,
  pages     = "581--598",
  year      =  1981
}

@techreport{hoover1979relations,
  title={Relations on Probability Spaces and Arrays of Random Variables},
  author={Hoover, Douglas},
  institution={Institute for Advanced Study, Princeton, NJ},
  year={1979}
}

@article{geyer2011introduction,
  title={Introduction to {M}arkov chain {M}onte {C}arlo},
  author={Geyer, Charles J},