@INPROCEEDINGS{Arjovsky2017-ad,
title = "{W}asserstein Generative Adversarial Networks",
booktitle = "Proceedings of the 34th International Conference on Machine
Learning",
author = "Arjovsky, Martin and Chintala, Soumith and Bottou, L{\'e}on",
editor = "Precup, Doina and Teh, Yee Whye",
abstract = "We introduce a new algorithm named WGAN, an alternative to
traditional GAN training. In this new model, we show that we can
improve the stability of learning, get rid of problems like mode
collapse, and provide meaningful learning curves useful for
debugging and hyperparameter searches. Furthermore, we show that
the corresponding optimization problem is sound, and provide
extensive theoretical work highlighting the deep connections to
different distances between distributions.",
publisher = "PMLR",
volume = 70,
pages = "214--223",
series = "Proceedings of Machine Learning Research",
year = 2017
}
@ARTICLE{Radford2015-ci,
title = "Unsupervised Representation Learning with Deep Convolutional
Generative Adversarial Networks",
author = "Radford, Alec and Metz, Luke and Chintala, Soumith",
abstract = "In recent years, supervised learning with convolutional
networks (CNNs) has seen huge adoption in computer vision
applications. Comparatively, unsupervised learning with CNNs
has received less attention. In this work we hope to help
bridge the gap between the success of CNNs for supervised
learning and unsupervised learning. We introduce a class of
CNNs called deep convolutional generative adversarial
networks (DCGANs), that have certain architectural
constraints, and demonstrate that they are a strong
candidate for unsupervised learning. Training on various
image datasets, we show convincing evidence that our deep
convolutional adversarial pair learns a hierarchy of
representations from object parts to scenes in both the
generator and discriminator. Additionally, we use the
learned features for novel tasks - demonstrating their
applicability as general image representations.",
month = nov,
year = 2015,
archivePrefix = "arXiv",
eprint = "1511.06434",
primaryClass = "cs.LG",
arxivid = "1511.06434"
}
@ARTICLE{Lin2007-kc,
title = "Projected Gradient Methods for Nonnegative Matrix Factorization",
author = "Lin, Chih-Jen",
abstract = "Nonnegative matrix factorization (NMF) can be formulated as a
minimization problem with bound constraints. Although
bound-constrained optimization has been studied extensively in
both theory and practice, so far no study has formally applied
its techniques to NMF. In this letter, we propose two projected
gradient methods for NMF, both of which exhibit strong
optimization properties. We discuss efficient implementations and
demonstrate that one of the proposed methods converges faster
than the popular multiplicative update approach. A simple Matlab
code is also provided.",
journal = "Neural Comput.",
volume = 19,
number = 10,
pages = "2756--2779",
month = oct,
year = 2007,
issn = "0899-7667",
doi = "10.1162/neco.2007.19.10.2756"
}
@ARTICLE{Lee1999-ge,
title = "Learning the parts of objects by non-negative matrix
factorization",
author = "Lee, D D and Seung, H S",
abstract = "Is perception of the whole based on perception of its parts?
There is psychological and physiological evidence for parts-based
representations in the brain, and certain computational theories
of object recognition rely on such representations. But little is
known about how brains or computers might learn the parts of
objects. Here we demonstrate an algorithm for non-negative matrix
factorization that is able to learn parts of faces and semantic
features of text. This is in contrast to other methods, such as
principal components analysis and vector quantization, that learn
holistic, not parts-based, representations. Non-negative matrix
factorization is distinguished from the other methods by its use
of non-negativity constraints. These constraints lead to a
parts-based representation because they allow only additive, not
subtractive, combinations. When non-negative matrix factorization
is implemented as a neural network, parts-based representations
emerge by virtue of two properties: the firing rates of neurons
are never negative and synaptic strengths do not change sign.",
journal = "Nature",
volume = 401,
number = 6755,
pages = "788--791",
month = oct,
year = 1999,
language = "en",
issn = "0028-0836",
pmid = "10548103",
doi = "10.1038/44565"
}
@ARTICLE{Deerwester1990-tn,
title = "Indexing by latent semantic analysis",
author = "Deerwester, Scott and Dumais, Susan T and Furnas, George W and
Landauer, Thomas K and Harshman, Richard",
journal = "J. Am. Soc. Inf. Sci.",
publisher = "Wiley",
volume = 41,
number = 6,
pages = "391--407",
month = sep,
year = 1990,
language = "en",
issn = "0002-8231, 1097-4571",
doi = "10.1002/(sici)1097-4571(199009)41:6<391::aid-asi1>3.0.co;2-9"
}
@INPROCEEDINGS{Lee2000-ld,
title = "Algorithms for Non-negative Matrix Factorization",
booktitle = "Advances in Neural Information Processing Systems",
author = "Lee, Daniel and Seung, H Sebastian",
editor = "Leen, T and Dietterich, T and Tresp, V",
publisher = "MIT Press",
volume = 13,
year = 2000
}
@INPROCEEDINGS{Mikolov2013-ok,
title = "Efficient Estimation of Word Representations in Vector Space",
booktitle = "1st International Conference on Learning Representations, {ICLR}
2013, Scottsdale, Arizona, {USA}, May 2-4, 2013, Workshop Track
Proceedings",
author = "Mikolov, Tom{\'a}s and Chen, Kai and Corrado, Greg and Dean,
Jeffrey",
editor = "Bengio, Yoshua and LeCun, Yann",
year = 2013
}
@ARTICLE{Mikolov2013-yn,
title = "Distributed representations of words and phrases and their
compositionality",
author = "Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg
S and Dean, Jeff",
journal = "Adv. Neural Inf. Process. Syst.",
volume = 26,
year = 2013,
issn = "1049-5258"
}
@INPROCEEDINGS{Mikolov2013-zb,
title = "Linguistic Regularities in Continuous Space Word Representations",
booktitle = "Proceedings of the 2013 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics: Human
Language Technologies",
author = "Mikolov, Tomas and Yih, Wen-Tau and Zweig, Geoffrey",
publisher = "Association for Computational Linguistics",
pages = "746--751",
month = jun,
year = 2013,
address = "Atlanta, Georgia"
}
@ARTICLE{Van_der_Maaten2008-jw,
title = "Visualizing Data using {t-SNE}",
author = "van der Maaten, Laurens and Hinton, Geoffrey",
journal = "J. Mach. Learn. Res.",
volume = 9,
number = 86,
pages = "2579--2605",
year = 2008,
issn = "1532-4435, 1533-7928"
}
@INPROCEEDINGS{He2015-rc,
title = "Delving deep into rectifiers: Surpassing human-level
performance on {ImageNet} classification",
booktitle = "2015 {IEEE} International Conference on Computer Vision
({ICCV})",
author = "He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun,
Jian",
publisher = "IEEE",
month = dec,
year = 2015,
conference = "2015 IEEE International Conference on Computer Vision
(ICCV)",
location = "Santiago, Chile",
isbn = "9781467383912",
doi = "10.1109/iccv.2015.123"
}
@INPROCEEDINGS{Bottou94,
  title     = "Comparison of classifier methods: a case study in handwritten
               digit recognition",
  booktitle = "Proceedings of the 12th {IAPR} International Conference on
               Pattern Recognition, Vol. 3 - Conference C: Signal Processing
               (Cat. No.94CH3440-5)",
  author    = "Bottou, L. and Cortes, C. and Denker, J. S. and Drucker, H. and
               Guyon, I. and Jackel, L. D. and LeCun, Y. and Muller, U. A. and
               Sackinger, E. and Simard, P. and Vapnik, V.",
  volume    = 2,
  pages     = "77--82",
  year      = 1994,
  doi       = "10.1109/ICPR.1994.576879"
}
@ARTICLE{Hinton2006-yj,
title = "Reducing the dimensionality of data with neural networks",
author = "Hinton, G E and Salakhutdinov, R R",
abstract = "High-dimensional data can be converted to low-dimensional codes
by training a multilayer neural network with a small central
layer to reconstruct high-dimensional input vectors. Gradient
descent can be used for fine-tuning the weights in such
``autoencoder'' networks, but this works well only if the initial
weights are close to a good solution. We describe an effective
way of initializing the weights that allows deep autoencoder
networks to learn low-dimensional codes that work much better
than principal components analysis as a tool to reduce the
dimensionality of data.",
journal = "Science",
volume = 313,
number = 5786,
pages = "504--507",
month = jul,
year = 2006,
language = "en",
issn = "0036-8075, 1095-9203",
pmid = "16873662",
doi = "10.1126/science.1127647"
}
@ARTICLE{Michelucci2022-jm,
title = "An Introduction to Autoencoders",
author = "Michelucci, Umberto",
abstract = "In this article, we will look at autoencoders. This article
covers the mathematics and the fundamental concepts of
autoencoders. We will discuss what they are, what the
limitations are, the typical use cases, and we will look at
some examples. We will start with a general introduction to
autoencoders, and we will discuss the role of the activation
function in the output layer and the loss function. We will
then discuss what the reconstruction error is. Finally, we
will look at typical applications as dimensionality
reduction, classification, denoising, and anomaly detection.
This paper contains the notes of a PhD-level lecture on
autoencoders given in 2021.",
month = jan,
year = 2022,
archivePrefix = "arXiv",
eprint = "2201.03898",
primaryClass = "cs.LG",
arxivid = "2201.03898"
}
@INPROCEEDINGS{Szegedy2015-qi,
title = "Going deeper with convolutions",
booktitle = "2015 {IEEE} Conference on Computer Vision and Pattern
Recognition ({CVPR})",
author = "Szegedy, Christian and Liu, Wei and Jia, Yangqing and
Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir
and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich,
Andrew",
publisher = "IEEE",
pages = "1--9",
month = jun,
year = 2015,
conference = "2015 IEEE Conference on Computer Vision and Pattern
Recognition (CVPR)",
location = "Boston, MA, USA",
isbn = "9781467369640",
doi = "10.1109/cvpr.2015.7298594"
}
@INPROCEEDINGS{Deng2009-ei,
title = "{ImageNet}: A large-scale hierarchical image database",
booktitle = "2009 {IEEE} Conference on Computer Vision and Pattern
Recognition",
author = "Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and
Li, Kai and Fei-Fei, Li",
abstract = "The explosion of image data on the Internet has the potential to
foster more sophisticated and robust models and algorithms to
index, retrieve, organize and interact with images and
multimedia data. But exactly how such data can be harnessed and
organized remains a critical problem. We introduce here a new
database called ``ImageNet'', a large-scale ontology of images
built upon the backbone of the WordNet structure. ImageNet aims
to populate the majority of the 80,000 synsets of WordNet with
an average of 500--1000 clean and full resolution images. This
will result in tens of millions of annotated images organized by
the semantic hierarchy of WordNet. This paper offers a detailed
analysis of ImageNet in its current state: 12 subtrees with 5247
synsets and 3.2 million images in total. We show that ImageNet
is much larger in scale and diversity and much more accurate
than the current image datasets. Constructing such a large-scale
database is a challenging task. We describe the data collection
scheme with Amazon Mechanical Turk. Lastly, we illustrate the
usefulness of ImageNet through three simple applications in
object recognition, image classification and automatic object
clustering. We hope that the scale, accuracy, diversity and
hierarchical structure of ImageNet can offer unparalleled
opportunities to researchers in the computer vision community
and beyond.",
pages = "248--255",
month = jun,
year = 2009,
keywords = "Large-scale systems;Image
databases;Explosions;Internet;Robustness;Information
retrieval;Image retrieval;Multimedia databases;Ontologies;Spine",
issn = "1063-6919",
doi = "10.1109/CVPR.2009.5206848"
}
@ARTICLE{Simonyan2014-wx,
title = "Very Deep Convolutional Networks for {Large-Scale} Image
Recognition",
author = "Simonyan, Karen and Zisserman, Andrew",
abstract = "In this work we investigate the effect of the convolutional
network depth on its accuracy in the large-scale image
recognition setting. Our main contribution is a thorough
evaluation of networks of increasing depth using an
architecture with very small (3x3) convolution filters,
which shows that a significant improvement on the prior-art
configurations can be achieved by pushing the depth to 16-19
weight layers. These findings were the basis of our ImageNet
Challenge 2014 submission, where our team secured the first
and the second places in the localisation and classification
tracks respectively. We also show that our representations
generalise well to other datasets, where they achieve
state-of-the-art results. We have made our two
best-performing ConvNet models publicly available to
facilitate further research on the use of deep visual
representations in computer vision.",
month = sep,
year = 2014,
archivePrefix = "arXiv",
eprint = "1409.1556",
primaryClass = "cs.CV",
arxivid = "1409.1556"
}
@INCOLLECTION{Cun1990-yz,
title = "Handwritten digit recognition with a back-propagation network",
booktitle = "Advances in neural information processing systems 2",
author = "Cun, Y Le and Boser, B and Denker, J S and Howard, R E and
Habbard, W and Jackel, L D and Henderson, D",
publisher = "Morgan Kaufmann Publishers Inc.",
pages = "396--404",
month = jun,
year = 1990,
address = "San Francisco, CA, USA",
isbn = "9781558601000"
}
@ARTICLE{Lecun1998-im,
title = "Gradient-based learning applied to document recognition",
author = "Lecun, Y and Bottou, L and Bengio, Y and Haffner, P",
abstract = "Multilayer neural networks trained with the back-propagation
algorithm constitute the best example of a successful gradient
based learning technique. Given an appropriate network
architecture, gradient-based learning algorithms can be used to
synthesize a complex decision surface that can classify
high-dimensional patterns, such as handwritten characters, with
minimal preprocessing. This paper reviews various methods applied
to handwritten character recognition and compares them on a
standard handwritten digit recognition task. Convolutional neural
networks, which are specifically designed to deal with the
variability of 2D shapes, are shown to outperform all other
techniques. Real-life document recognition systems are composed
of multiple modules including field extraction, segmentation
recognition, and language modeling. A new learning paradigm,
called graph transformer networks (GTN), allows such multimodule
systems to be trained globally using gradient-based methods so as
to minimize an overall performance measure. Two systems for
online handwriting recognition are described. Experiments
demonstrate the advantage of global training, and the flexibility
of graph transformer networks. A graph transformer network for
reading a bank cheque is also described. It uses convolutional
neural network character recognizers combined with global
training techniques to provide record accuracy on business and
personal cheques. It is deployed commercially and reads several
million cheques per day.",
journal = "Proc. IEEE",
volume = 86,
number = 11,
pages = "2278--2324",
month = nov,
year = 1998,
keywords = "Neural networks;Pattern recognition;Machine learning;Optical
character recognition software;Character recognition;Feature
extraction;Multi-layer neural network;Optical computing;Hidden
Markov models;Principal component analysis",
issn = "1558-2256",
doi = "10.1109/5.726791"
}
@ARTICLE{LeCun1989-us,
title = "Backpropagation Applied to Handwritten Zip Code Recognition",
author = "LeCun, Y and Boser, B and Denker, J S and Henderson, D and
Howard, R E and Hubbard, W and Jackel, L D",
abstract = "The ability of learning networks to generalize can be greatly
enhanced by providing constraints from the task domain. This
paper demonstrates how such constraints can be integrated into a
backpropagation network through the architecture of the network.
This approach has been successfully applied to the recognition of
handwritten zip code digits provided by the U.S. Postal Service.
A single network learns the entire recognition operation, going
from the normalized image of the character to the final
classification.",
journal = "Neural Comput.",
volume = 1,
number = 4,
pages = "541--551",
month = dec,
year = 1989,
issn = "0899-7667",
doi = "10.1162/neco.1989.1.4.541"
}
@ARTICLE{Fukushima1980-zv,
title = "Neocognitron: a self organizing neural network model for a
mechanism of pattern recognition unaffected by shift in position",
author = "Fukushima, K",
abstract = "A neural network model for a mechanism of visual pattern
recognition is proposed in this paper. The network is
self-organized by ``learning without a teacher'', and acquires an
ability to recognize stimulus patterns based on the geometrical
similarity (Gestalt) of their shapes without affected by their
positions. This network is given a nickname ``neocognitron''.
After completion of self-organization, the network has a
structure similar to the hierarchy model of the visual nervous
system proposed by Hubel and Wiesel. The network consists of an
input layer (photoreceptor array) followed by a cascade
connection of a number of modular structures, each of which is
composed of two layers of cells connected in a cascade. The first
layer of each module consists of ``S-cells'', which show
characteristics similar to simple cells or lower order
hypercomplex cells, and the second layer consists of ``C-cells''
similar to complex cells or higher order hypercomplex cells. The
afferent synapses to each S-cell have plasticity and are
modifiable. The network has an ability of unsupervised learning:
We do not need any ``teacher'' during the process of
self-organization, and it is only needed to present a set of
stimulus patterns repeatedly to the input layer of the network.
The network has been simulated on a digital computer. After
repetitive presentation of a set of stimulus patterns, each
stimulus pattern has become to elicit an output only from one of
the C-cells of the last layer, and conversely, this C-cell has
become selectively responsive only to that stimulus pattern. That
is, none of the C-cells of the last layer responds to more than
one stimulus pattern. The response of the C-cells of the last
layer is not affected by the pattern's position at all. Neither
is it affected by a small change in shape nor in size of the
stimulus pattern.",
journal = "Biol. Cybern.",
volume = 36,
number = 4,
pages = "193--202",
year = 1980,
language = "en",
issn = "0340-1200",
pmid = "7370364",
doi = "10.1007/BF00344251"
}
@ARTICLE{Krizhevsky2012-nl,
title = "Imagenet classification with deep convolutional neural networks",
author = "Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E",
journal = "Adv. Neural Inf. Process. Syst.",
volume = 25,
year = 2012,
issn = "1049-5258"
}
@ARTICLE{2016-as,
  title    = "Research Trends in Deep Learning for Image Recognition:
              Developments of Convolutional Neural Networks and Their
              Applications (Frontiers of Neural Network Research)",
  author   = "Okatani, Takayuki",
  journal  = "Journal of the Japanese Society for Artificial Intelligence",
  volume   = 31,
  number   = 2,
  pages    = "169--179",
  year     = 2016,
  language = "ja",
  doi      = "10.11517/jjsai.31.2\_169"
}
@BOOK{Minsky2017-ab,
title = "Perceptrons: An introduction to computational geometry",
author = "Minsky, Marvin and Papert, Seymour A",
publisher = "The MIT Press",
year = 2017,
isbn = "9780262343930",
doi = "10.7551/mitpress/11301.001.0001"
}
@ARTICLE{2020NumPy-Array,
author = {Harris, Charles R. and Millman, K. Jarrod and van der Walt, Stéfan J and Gommers, Ralf and Virtanen, Pauli and Cournapeau, David and Wieser, Eric and Taylor, Julian and Berg, Sebastian and Smith, Nathaniel J. and Kern, Robert and Picus, Matti and Hoyer, Stephan and van Kerkwijk, Marten H. and Brett, Matthew and Haldane, Allan and Fernández del Río, Jaime and Wiebe, Mark and Peterson, Pearu and Gérard-Marchant, Pierre and Sheppard, Kevin and Reddy, Tyler and Weckesser, Warren and Abbasi, Hameer and Gohlke, Christoph and Oliphant, Travis E.},
title = {Array programming with {NumPy}},
journal = {Nature},
year = {2020},
volume = {585},
  pages = {357--362},
doi = {10.1038/s41586-020-2649-2}
}
@INCOLLECTION{NEURIPS2019_9015,
title = {PyTorch: An Imperative Style, High-Performance Deep Learning Library},
author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
booktitle = {Advances in Neural Information Processing Systems 32},
pages = {8024--8035},
year = {2019},
publisher = {Curran Associates, Inc.},
url = {http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf}
}
@ARTICLE{Hornik1989-cu,
title = "Multilayer feedforward networks are universal approximators",
author = "Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert",
abstract = "This paper rigorously establishes that standard multilayer
feedforward networks with as few as one hidden layer using
arbitrary squashing functions are capable of approximating any
Borel measurable function from one finite dimensional space to
another to any desired degree of accuracy, provided sufficiently
many hidden units are available. In this sense, multilayer
feedforward networks are a class of universal approximators.",
journal = "Neural Netw.",
volume = 2,
number = 5,
pages = "359--366",
month = jan,
year = 1989,
keywords = "Feedforward networks; Universal approximation; Mapping networks;
Network representation capability; Stone-Weierstrass Theorem;
Squashing functions; Sigma-Pi networks; Back-propagation networks",
issn = "0893-6080",
doi = "10.1016/0893-6080(89)90020-8"
}
@ARTICLE{Pinkus1999-hg,
title = "Approximation theory of the {MLP} model in neural networks",
author = "Pinkus, Allan",
abstract = "In this survey we discuss various approximation-theoretic
problems that arise in the multilayer feedforward perceptron
(MLP) model in neural networks. The MLP model is one of the more
popular and practical of the many neural network models.
Mathematically it is also one of the simpler models. Nonetheless
the mathematics of this model is not well understood, and many
of these problems are approximation-theoretic in character. Most
of the research we will discuss is of very recent vintage. We
will report on what has been done and on various unanswered
questions. We will not be presenting practical (algorithmic)
methods. We will, however, be exploring the capabilities and
limitations of this model.",
journal = "Acta Numer.",
publisher = "Cambridge University Press",
volume = 8,
pages = "143--195",
month = jan,
year = 1999,
issn = "0962-4929, 1474-0508",
doi = "10.1017/S0962492900002919"
}
@ARTICLE{Leshno1993-sj,
title = "Multilayer feedforward networks with a nonpolynomial activation
function can approximate any function",
author = "Leshno, Moshe and Lin, Vladimir Ya and Pinkus, Allan and
Schocken, Shimon",
abstract = "Several researchers characterized the activation function under
which multilayer feedforward networks can act as universal
approximators. We show that most of all the characterizations
that were reported thus far in the literature are special cases
of the following general result: A standard multilayer
feedforward network with a locally bounded piecewise continuous
activation function can approximate any continuous function to
any degree of accuracy if and only if the network's activation
function is not a polynomial. We also emphasize the important
role of the threshold, asserting that without it the last theorem
does not hold.",
journal = "Neural Netw.",
volume = 6,
number = 6,
pages = "861--867",
month = jan,
year = 1993,
keywords = "Multilayer feedforward networks; Activation functions; Role of
threshold; Universal approximation capabilities; ($\mu$)
approximation",
issn = "0893-6080",
doi = "10.1016/S0893-6080(05)80131-5"
}
@ARTICLE{Hornik1991-gh,
title = "Approximation capabilities of multilayer feedforward networks",
author = "Hornik, Kurt",
abstract = "We show that standard multilayer feedforward networks with as few
as a single hidden layer and arbitrary bounded and nonconstant
activation function are universal approximators with respect to
Lp($\mu$) performance criteria, for arbitrary finite input
environment measures $\mu$, provided only that sufficiently many
hidden units are available. If the activation function is
continuous, bounded and nonconstant, then continuous mappings can
be learned uniformly over compact input sets. We also give very
general conditions ensuring that networks with sufficiently
smooth activation functions are capable of arbitrarily accurate
approximation to a function and its derivatives.",
journal = "Neural Netw.",
volume = 4,
number = 2,
pages = "251--257",
month = jan,
year = 1991,
keywords = "Multilayer feedforward networks; Activation function; Universal
approximation capabilities; Input environment measure; ()
approximation; Uniform approximation; Sobolev spaces; Smooth
approximation",
issn = "0893-6080",
doi = "10.1016/0893-6080(91)90009-T"
}
@ARTICLE{Cybenko1989-dm,
title = "Approximation by superpositions of a sigmoidal function",
author = "Cybenko, G",
abstract = "In this paper we demonstrate that finite linear combinations of
compositions of a fixed, univariate function and a set of affine
functionals can uniformly approximate any continuous function
ofn real variables with support in the unit hypercube; only mild
conditions are imposed on the univariate function. Our results
settle an open question about representability in the class of
single hidden layer neural networks. In particular, we show that
arbitrary decision regions can be arbitrarily well approximated
by continuous feedforward neural networks with only a single
internal, hidden layer and any continuous sigmoidal
nonlinearity. The paper discusses approximation properties of
other possible types of nonlinearities that might be implemented
by artificial neural networks.",
journal = "Math. Control Signals Systems",
publisher = "Springer Science and Business Media LLC",
volume = 2,
number = 4,
pages = "303--314",
month = dec,
year = 1989,
language = "en",
issn = "0932-4194, 1435-568X",
doi = "10.1007/bf02551274"
}
@ARTICLE{McCulloch1943-py,
title = "A logical calculus of the ideas immanent in nervous activity",
author = "McCulloch, Warren S and Pitts, Walter",
abstract = "Because of the ``all-or-none'' character of nervous activity,
neural events and the relations among them can be treated by
means of propositional logic. It is found that the behavior of
every net can be described in these terms, with the addition of
more complicated logical means for nets containing circles; and
that for any logical expression satisfying certain conditions,
one can find a net behaving in the fashion it describes. It is
shown that many particular choices among possible
neurophysiological assumptions are equivalent, in the sense that
for every net behaving under one assumption, there exists another
net which behaves under the other and gives the same results,
although perhaps not in the same time. Various applications of
the calculus are discussed.",
journal = "Bull. Math. Biophys.",
volume = 5,
number = 4,
pages = "115--133",
month = dec,
year = 1943,
issn = "0007-4985, 1522-9602",
doi = "10.1007/BF02478259"
}
@ARTICLE{Rosenblatt1958-qg,
title = "The perceptron: A probabilistic model for information storage and
organization in the brain",
author = "Rosenblatt, F",
abstract = "To answer the questions of how information about the physical
world is sensed, in what form is information remembered, and how
does information retained in memory influence recognition and
behavior, a theory is developed for a hypothetical nervous system
called a perceptron. The theory serves as a bridge between
biophysics and psychology. It is possible to predict learning
curves from neurological variables and vice versa. The
quantitative statistical approach is fruitful in the
understanding of the organization of cognitive systems. 18
references. (PsycINFO Database Record (c) 2016 APA, all rights
reserved)",
journal = "Psychol. Rev.",
volume = 65,
number = 6,
pages = "386--408",
month = nov,
year = 1958,
issn = "0033-295X, 1939-1471",
doi = "10.1037/h0042519"
}
@ARTICLE{Hochreiter1997-mz,
title = "Long short-term memory",
author = "Hochreiter, S and Schmidhuber, J",
abstract = "Learning to store information over extended time intervals by
recurrent backpropagation takes a very long time, mostly because
of insufficient, decaying error backflow. We briefly review
Hochreiter's (1991) analysis of this problem, then address it by
introducing a novel, efficient, gradient-based method called long
short-term memory (LSTM). Truncating the gradient where this does
not do harm, LSTM can learn to bridge minimal time lags in excess
of 1000 discrete-time steps by enforcing constant error flow
through constant error carousels within special units.
Multiplicative gate units learn to open and close access to the
constant error flow. LSTM is local in space and time; its
computational complexity per time step and weight is O(1). Our
experiments with artificial data involve local, distributed,
real-valued, and noisy pattern representations. In comparisons
with real-time recurrent learning, back propagation through time,
recurrent cascade correlation, Elman nets, and neural sequence
chunking, LSTM leads to many more successful runs, and learns
much faster. LSTM also solves complex, artificial long-time-lag
tasks that have never been solved by previous recurrent network
algorithms.",
journal = "Neural Comput.",
volume = 9,
number = 8,
pages = "1735--1780",
month = nov,
year = 1997,
language = "en",
issn = "0899-7667",
pmid = "9377276",
doi = "10.1162/neco.1997.9.8.1735"
}
@INPROCEEDINGS{Reimers2019-uh,
title = "{Sentence-{BERT}}: Sentence Embeddings using {S}iamese
{{BERT}-Networks}",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing ({EMNLP-IJCNLP})",
author = "Reimers, Nils and Gurevych, Iryna",
abstract = "BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) has
set a new state-of-the-art performance on sentence-pair
regression tasks like semantic textual similarity (STS).
However, it requires that both sentences are fed into the
network, which causes a massive computational overhead: Finding
the most similar pair in a collection of 10,000 sentences
requires about 50 million inference computations
                 (\textasciitilde{}65 hours) with BERT. The
construction of BERT makes it unsuitable for semantic similarity
search as well as for unsupervised tasks like clustering. In
this publication, we present Sentence-BERT (SBERT), a
modification of the pretrained BERT network that use siamese and
triplet network structures to derive semantically meaningful
sentence embeddings that can be compared using
cosine-similarity. This reduces the effort for finding the most
similar pair from 65 hours with BERT / RoBERTa to about 5
seconds with SBERT, while maintaining the accuracy from BERT. We
evaluate SBERT and SRoBERTa on common STS tasks and transfer
learning tasks, where it outperforms other state-of-the-art
sentence embeddings methods.",
publisher = "Association for Computational Linguistics",
pages = "3982--3992",
month = nov,
year = 2019,
address = "Hong Kong, China",
doi = "10.18653/v1/D19-1410"
}
@ARTICLE{Blei2012-zn,
title = "Probabilistic topic models",
author = "Blei, David M",
abstract = "Surveying a suite of algorithms that offer a solution to
managing large document archives.",
journal = "Commun. ACM",
publisher = "Association for Computing Machinery",
volume = 55,
number = 4,
pages = "77--84",
month = apr,
year = 2012,
address = "New York, NY, USA",
issn = "0001-0782",
doi = "10.1145/2133806.2133826"
}
@INPROCEEDINGS{Devlin2019-pa,
title = "{BERT:} Pre-training of Deep Bidirectional Transformers for
Language Understanding",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics: Human
Language Technologies, Volume 1 (Long and Short Papers)",
author = "Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova,
Kristina",
abstract = "We introduce a new language representation model called BERT,
which stands for Bidirectional Encoder Representations from
Transformers. Unlike recent language representation models
(Peters et al., 2018a; Radford et al., 2018), BERT is designed
to pre-train deep bidirectional representations from unlabeled
text by jointly conditioning on both left and right context in
all layers. As a result, the pre-trained BERT model can be
fine-tuned with just one additional output layer to create
state-of-the-art models for a wide range of tasks, such as
question answering and language inference, without substantial
task-specific architecture modifications. BERT is conceptually
simple and empirically powerful. It obtains new state-of-the-art
results on eleven natural language processing tasks, including
pushing the GLUE score to 80.5 (7.7 point absolute improvement),
MultiNLI accuracy to 86.7\% (4.6\% absolute improvement), SQuAD
v1.1 question answering Test F1 to 93.2 (1.5 point absolute
improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute
improvement).",
publisher = "Association for Computational Linguistics",
pages = "4171--4186",
month = jun,
year = 2019,
address = "Minneapolis, Minnesota",
doi = "10.18653/v1/N19-1423"
}
@ARTICLE{Nandwani2021-ew,
title = "A review on sentiment analysis and emotion detection from text",
author = "Nandwani, Pansy and Verma, Rupali",
abstract = "Social networking platforms have become an essential means for
communicating feelings to the entire world due to rapid expansion
in the Internet era. Several people use textual content,
pictures, audio, and video to express their feelings or
viewpoints. Text communication via Web-based networking media, on
the other hand, is somewhat overwhelming. Every second, a massive
amount of unstructured data is generated on the Internet due to
social media platforms. The data must be processed as rapidly as
generated to comprehend human psychology, and it can be
accomplished using sentiment analysis, which recognizes polarity
in texts. It assesses whether the author has a negative,
positive, or neutral attitude toward an item, administration,
individual, or location. In some applications, sentiment analysis
is insufficient and hence requires emotion detection, which
determines an individual's emotional/mental state precisely. This
review paper provides understanding into levels of sentiment
analysis, various emotion models, and the process of sentiment
analysis and emotion detection from text. Finally, this paper
discusses the challenges faced during sentiment and emotion
analysis.",
journal = "Soc Netw Anal Min",
volume = 11,
number = 1,
pages = "81",
month = aug,
year = 2021,
keywords = "Affective computing; Natural language processing; Opinion mining;
Pre-processing; Word embedding",
language = "en",
issn = "1869-5450",
pmid = "34484462",
doi = "10.1007/s13278-021-00776-6",
pmc = "PMC8402961"
}
@ARTICLE{Almeida2019-dg,
title = "Word Embeddings: A Survey",
author = "Almeida, Felipe and Xex{\'e}o, Geraldo",
abstract = "The main recent strategies for building fixed-length, dense and
distributed representations for words, based on the
distributional hypothesis, are described, which are now commonly
called word embeddings. This work lists and describes the main
recent strategies for building fixed-length, dense and
distributed representations for words, based on the
distributional hypothesis. These representations are now commonly
called word embeddings and, in addition to encoding surprisingly
good syntactic and semantic information, have been proven useful
as extra features in many downstream NLP tasks.",
journal = "ArXiv",
year = 2019,
language = "en",
arxivid = "1901.09069"
}
@INPROCEEDINGS{plsi,
title = "Probabilistic latent semantic indexing",
booktitle = "Proceedings of the 22nd annual international {ACM} {SIGIR}
conference on Research and development in information retrieval",
author = "Hofmann, Thomas",
publisher = "Association for Computing Machinery",
pages = "50--57",
series = "SIGIR '99",
month = aug,
year = 1999,
address = "New York, NY, USA",
location = "Berkeley, California, USA",
isbn = "9781581130966",
doi = "10.1145/312624.312649"
}
@INPROCEEDINGS{Goodfellow2014-eg,
title = "Generative Adversarial Nets",
booktitle = "Advances in Neural Information Processing Systems",
author = "Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu,
Bing and Warde-Farley, David and Ozair, Sherjil and Courville,
Aaron and Bengio, Yoshua",
editor = "Ghahramani, Z and Welling, M and Cortes, C and Lawrence, N and
Weinberger, K Q",
publisher = "Curran Associates, Inc.",
volume = 27,
year = 2014
}
@ARTICLE{dropout,
title = "Dropout: A Simple Way to Prevent Neural Networks from Overfitting",
author = "Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and
Sutskever, Ilya and Salakhutdinov, Ruslan",
journal = "J. Mach. Learn. Res.",
volume = 15,
number = 56,
pages = "1929--1958",
year = 2014,
issn = "1532-4435, 1533-7928"
}
@ARTICLE{Murakami2022-ah,
title = "Investigating the Efficient Use of Word Embedding with
{Neural-Topic} Models for Interpretable Topics from Short Texts",
author = "Murakami, Riki and Chakraborty, Basabi",
abstract = "With the rapid proliferation of social networking sites (SNS),
automatic topic extraction from various text messages posted on
SNS are becoming an important source of information for
understanding current social trends or needs. Latent Dirichlet
Allocation (LDA), a probabilistic generative model, is one of the
popular topic models in the area of Natural Language Processing
(NLP) and has been widely used in information retrieval, topic
extraction, and document analysis. Unlike long texts from formal
documents, messages on SNS are generally short. Traditional topic
models such as LDA or pLSA (probabilistic latent semantic
analysis) suffer performance degradation for short-text analysis
due to a lack of word co-occurrence information in each short
text. To cope with this problem, various techniques are evolving
for interpretable topic modeling for short texts, pretrained word
embedding with an external corpus combined with topic models is
one of them. Due to recent developments of deep neural networks
(DNN) and deep generative models, neural-topic models (NTM) are
emerging to achieve flexibility and high performance in topic
modeling. However, there are very few research works on
neural-topic models with pretrained word embedding for generating
high-quality topics from short texts. In this work, in addition
to pretrained word embedding, a fine-tuning stage with an
original corpus is proposed for training neural-topic models in
order to generate semantically coherent, corpus-specific topics.
An extensive study with eight neural-topic models has been
completed to check the effectiveness of additional fine-tuning
and pretrained word embedding in generating interpretable topics
by simulation experiments with several benchmark datasets. The
extracted topics are evaluated by different metrics of topic
coherence and topic diversity. We have also studied the
performance of the models in classification and clustering tasks.
Our study concludes that though auxiliary word embedding with a
large external corpus improves the topic coherency of short
texts, an additional fine-tuning stage is needed for generating
more corpus-specific topics from short-text data.",
journal = "Sensors",
volume = 22,
number = 3,
month = jan,
year = 2022,
keywords = "coherent topic; fine-tuning; neural-topic model; pretrained word
embedding; short-text data",
language = "en",
issn = "1424-8220",
pmid = "35161598",
doi = "10.3390/s22030852",
pmc = "PMC8840106"
}
@ARTICLE{NSTM,
title = "Neural Topic Model via Optimal Transport",
author = "Zhao, He and Phung, Dinh and Huynh, Viet and Le, Trung and
Buntine, Wray",
abstract = "Recently, Neural Topic Models (NTMs) inspired by variational
autoencoders have obtained increasingly research interest
due to their promising results on text analysis. However, it
is usually hard for existing NTMs to achieve good document
representation and coherent/diverse topics at the same time.
Moreover, they often degrade their performance severely on
short documents. The requirement of reparameterisation could
also comprise their training quality and model flexibility.
To address these shortcomings, we present a new neural topic
model via the theory of optimal transport (OT).
Specifically, we propose to learn the topic distribution of