@INPROCEEDINGS{Arjovsky2017-ad,
title = "{W}asserstein Generative Adversarial Networks",
booktitle = "Proceedings of the 34th International Conference on Machine
Learning",
author = "Arjovsky, Martin and Chintala, Soumith and Bottou, L{\'e}on",
editor = "Precup, Doina and Teh, Yee Whye",
abstract = "We introduce a new algorithm named WGAN, an alternative to
traditional GAN training. In this new model, we show that we can
improve the stability of learning, get rid of problems like mode
collapse, and provide meaningful learning curves useful for
debugging and hyperparameter searches. Furthermore, we show that
the corresponding optimization problem is sound, and provide
extensive theoretical work highlighting the deep connections to
different distances between distributions.",
publisher = "PMLR",
volume = 70,
pages = "214--223",
series = "Proceedings of Machine Learning Research",
year = 2017
}
@ARTICLE{Radford2015-ci,
title = "Unsupervised Representation Learning with Deep Convolutional
Generative Adversarial Networks",
author = "Radford, Alec and Metz, Luke and Chintala, Soumith",
abstract = "In recent years, supervised learning with convolutional
networks (CNNs) has seen huge adoption in computer vision
applications. Comparatively, unsupervised learning with CNNs
has received less attention. In this work we hope to help
bridge the gap between the success of CNNs for supervised
learning and unsupervised learning. We introduce a class of
CNNs called deep convolutional generative adversarial
networks (DCGANs), that have certain architectural
constraints, and demonstrate that they are a strong
candidate for unsupervised learning. Training on various
image datasets, we show convincing evidence that our deep
convolutional adversarial pair learns a hierarchy of
representations from object parts to scenes in both the
generator and discriminator. Additionally, we use the
learned features for novel tasks - demonstrating their
applicability as general image representations.",
month = nov,
year = 2015,
archivePrefix = "arXiv",
eprint = "1511.06434",
primaryClass = "cs.LG",
arxivid = "1511.06434"
}
@ARTICLE{Lin2007-kc,
title = "Projected Gradient Methods for Nonnegative Matrix Factorization",
author = "Lin, Chih-Jen",
abstract = "Nonnegative matrix factorization (NMF) can be formulated as a
minimization problem with bound constraints. Although
bound-constrained optimization has been studied extensively in
both theory and practice, so far no study has formally applied
its techniques to NMF. In this letter, we propose two projected
gradient methods for NMF, both of which exhibit strong
optimization properties. We discuss efficient implementations and
demonstrate that one of the proposed methods converges faster
than the popular multiplicative update approach. A simple Matlab
code is also provided.",
journal = "Neural Comput.",
volume = 19,
number = 10,
pages = "2756--2779",
month = oct,
year = 2007,
issn = "0899-7667",
doi = "10.1162/neco.2007.19.10.2756"
}
@ARTICLE{Lee1999-ge,
title = "Learning the parts of objects by non-negative matrix
factorization",
author = "Lee, D D and Seung, H S",
abstract = "Is perception of the whole based on perception of its parts?
There is psychological and physiological evidence for parts-based
representations in the brain, and certain computational theories
of object recognition rely on such representations. But little is
known about how brains or computers might learn the parts of
objects. Here we demonstrate an algorithm for non-negative matrix
factorization that is able to learn parts of faces and semantic
features of text. This is in contrast to other methods, such as
principal components analysis and vector quantization, that learn
holistic, not parts-based, representations. Non-negative matrix
factorization is distinguished from the other methods by its use
of non-negativity constraints. These constraints lead to a
parts-based representation because they allow only additive, not
subtractive, combinations. When non-negative matrix factorization
is implemented as a neural network, parts-based representations
emerge by virtue of two properties: the firing rates of neurons
are never negative and synaptic strengths do not change sign.",
journal = "Nature",
volume = 401,
number = 6755,
pages = "788--791",
month = oct,
year = 1999,
language = "en",
issn = "0028-0836",
pmid = "10548103",
doi = "10.1038/44565"
}
@ARTICLE{Deerwester1990-tn,
title = "Indexing by latent semantic analysis",
author = "Deerwester, Scott and Dumais, Susan T and Furnas, George W and
Landauer, Thomas K and Harshman, Richard",
journal = "J. Am. Soc. Inf. Sci.",
publisher = "Wiley",
volume = 41,
number = 6,
pages = "391--407",
month = sep,
year = 1990,
language = "en",
issn = "0002-8231, 1097-4571",
doi = "10.1002/(sici)1097-4571(199009)41:6<391::aid-asi1>3.0.co;2-9"
}
@INPROCEEDINGS{Lee2000-ld,
title = "Algorithms for Non-negative Matrix Factorization",
booktitle = "Advances in Neural Information Processing Systems",
author = "Lee, Daniel and Seung, H Sebastian",
editor = "Leen, T and Dietterich, T and Tresp, V",
publisher = "MIT Press",
volume = 13,
year = 2000
}
@INPROCEEDINGS{Mikolov2013-ok,
title = "Efficient Estimation of Word Representations in Vector Space",
booktitle = "1st International Conference on Learning Representations, {ICLR}
2013, Scottsdale, Arizona, {USA}, May 2-4, 2013, Workshop Track
Proceedings",
author = "Mikolov, Tom{\'a}s and Chen, Kai and Corrado, Greg and Dean,
Jeffrey",
editor = "Bengio, Yoshua and LeCun, Yann",
year = 2013
}
@ARTICLE{Mikolov2013-yn,
title = "Distributed representations of words and phrases and their
compositionality",
author = "Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg
S and Dean, Jeff",
journal = "Adv. Neural Inf. Process. Syst.",
volume = 26,
year = 2013,
issn = "1049-5258"
}
@INPROCEEDINGS{Mikolov2013-zb,
title = "Linguistic Regularities in Continuous Space Word Representations",
booktitle = "Proceedings of the 2013 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics: Human
Language Technologies",
author = "Mikolov, Tomas and Yih, Wen-Tau and Zweig, Geoffrey",
publisher = "Association for Computational Linguistics",
pages = "746--751",
month = jun,
year = 2013,
address = "Atlanta, Georgia"
}
@ARTICLE{Van_der_Maaten2008-jw,
title = "Visualizing Data using {t-SNE}",
author = "van der Maaten, Laurens and Hinton, Geoffrey",
journal = "J. Mach. Learn. Res.",
volume = 9,
number = 86,
pages = "2579--2605",
year = 2008,
issn = "1532-4435, 1533-7928"
}
@INPROCEEDINGS{He2015-rc,
title = "Delving deep into rectifiers: Surpassing human-level
performance on {ImageNet} classification",
booktitle = "2015 {IEEE} International Conference on Computer Vision
({ICCV})",
author = "He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun,
Jian",
publisher = "IEEE",
month = dec,
year = 2015,
conference = "2015 IEEE International Conference on Computer Vision
(ICCV)",
location = "Santiago, Chile",
isbn = "9781467383912",
doi = "10.1109/iccv.2015.123"
}
@INPROCEEDINGS{Bottou94,
  title     = "Comparison of classifier methods: a case study in handwritten
               digit recognition",
  booktitle = "Proceedings of the 12th {IAPR} International Conference on
               Pattern Recognition, Vol. 3 - Conference C: Signal Processing
               (Cat. No.94CH3440-5)",
  author    = "Bottou, L. and Cortes, C. and Denker, J. S. and Drucker, H. and
               Guyon, I. and Jackel, L. D. and LeCun, Y. and Muller, U. A. and
               Sackinger, E. and Simard, P. and Vapnik, V.",
  volume    = 2,
  pages     = "77--82",
  year      = 1994,
  doi       = "10.1109/ICPR.1994.576879"
}
@ARTICLE{Hinton2006-yj,
title = "Reducing the dimensionality of data with neural networks",
author = "Hinton, G E and Salakhutdinov, R R",
abstract = "High-dimensional data can be converted to low-dimensional codes
by training a multilayer neural network with a small central
layer to reconstruct high-dimensional input vectors. Gradient
descent can be used for fine-tuning the weights in such
``autoencoder'' networks, but this works well only if the initial
weights are close to a good solution. We describe an effective
way of initializing the weights that allows deep autoencoder
networks to learn low-dimensional codes that work much better
than principal components analysis as a tool to reduce the
dimensionality of data.",
journal = "Science",
volume = 313,
number = 5786,
pages = "504--507",
month = jul,
year = 2006,
language = "en",
issn = "0036-8075, 1095-9203",
pmid = "16873662",
doi = "10.1126/science.1127647"
}
@ARTICLE{Michelucci2022-jm,
title = "An Introduction to Autoencoders",
author = "Michelucci, Umberto",
abstract = "In this article, we will look at autoencoders. This article
covers the mathematics and the fundamental concepts of
autoencoders. We will discuss what they are, what the
limitations are, the typical use cases, and we will look at
some examples. We will start with a general introduction to
autoencoders, and we will discuss the role of the activation
function in the output layer and the loss function. We will
then discuss what the reconstruction error is. Finally, we
will look at typical applications as dimensionality
reduction, classification, denoising, and anomaly detection.
This paper contains the notes of a PhD-level lecture on
autoencoders given in 2021.",
month = jan,
year = 2022,
archivePrefix = "arXiv",
eprint = "2201.03898",
primaryClass = "cs.LG",
arxivid = "2201.03898"
}
@INPROCEEDINGS{Szegedy2015-qi,
title = "Going deeper with convolutions",
booktitle = "2015 {IEEE} Conference on Computer Vision and Pattern
Recognition ({CVPR})",
author = "Szegedy, Christian and Liu, Wei and Jia, Yangqing and
Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir
and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich,
Andrew",
publisher = "IEEE",
pages = "1--9",
month = jun,
year = 2015,
conference = "2015 IEEE Conference on Computer Vision and Pattern
Recognition (CVPR)",
location = "Boston, MA, USA",
isbn = "9781467369640",
doi = "10.1109/cvpr.2015.7298594"
}
@INPROCEEDINGS{Deng2009-ei,
title = "{ImageNet}: A large-scale hierarchical image database",
booktitle = "2009 {IEEE} Conference on Computer Vision and Pattern
Recognition",
author = "Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and
Li, Kai and Fei-Fei, Li",
abstract = "The explosion of image data on the Internet has the potential to
foster more sophisticated and robust models and algorithms to
index, retrieve, organize and interact with images and
multimedia data. But exactly how such data can be harnessed and
organized remains a critical problem. We introduce here a new
database called ``ImageNet'', a large-scale ontology of images
built upon the backbone of the WordNet structure. ImageNet aims
to populate the majority of the 80,000 synsets of WordNet with
an average of 500--1000 clean and full resolution images. This
will result in tens of millions of annotated images organized by
the semantic hierarchy of WordNet. This paper offers a detailed
analysis of ImageNet in its current state: 12 subtrees with 5247
synsets and 3.2 million images in total. We show that ImageNet
is much larger in scale and diversity and much more accurate
than the current image datasets. Constructing such a large-scale
database is a challenging task. We describe the data collection
scheme with Amazon Mechanical Turk. Lastly, we illustrate the
usefulness of ImageNet through three simple applications in
object recognition, image classification and automatic object
clustering. We hope that the scale, accuracy, diversity and
hierarchical structure of ImageNet can offer unparalleled
opportunities to researchers in the computer vision community
and beyond.",
pages = "248--255",
month = jun,
year = 2009,
keywords = "Large-scale systems;Image
databases;Explosions;Internet;Robustness;Information
retrieval;Image retrieval;Multimedia databases;Ontologies;Spine",
issn = "1063-6919",
doi = "10.1109/CVPR.2009.5206848"
}
@ARTICLE{Simonyan2014-wx,
title = "Very Deep Convolutional Networks for {Large-Scale} Image
Recognition",
author = "Simonyan, Karen and Zisserman, Andrew",
abstract = "In this work we investigate the effect of the convolutional
network depth on its accuracy in the large-scale image
recognition setting. Our main contribution is a thorough
evaluation of networks of increasing depth using an
architecture with very small (3x3) convolution filters,
which shows that a significant improvement on the prior-art
configurations can be achieved by pushing the depth to 16-19
weight layers. These findings were the basis of our ImageNet
Challenge 2014 submission, where our team secured the first
and the second places in the localisation and classification
tracks respectively. We also show that our representations
generalise well to other datasets, where they achieve
state-of-the-art results. We have made our two
best-performing ConvNet models publicly available to
facilitate further research on the use of deep visual
representations in computer vision.",
month = sep,
year = 2014,
archivePrefix = "arXiv",
eprint = "1409.1556",
primaryClass = "cs.CV",
arxivid = "1409.1556"
}
@INCOLLECTION{Cun1990-yz,
title = "Handwritten digit recognition with a back-propagation network",
booktitle = "Advances in neural information processing systems 2",
author = "Cun, Y Le and Boser, B and Denker, J S and Howard, R E and
Habbard, W and Jackel, L D and Henderson, D",
publisher = "Morgan Kaufmann Publishers Inc.",
pages = "396--404",
month = jun,
year = 1990,
address = "San Francisco, CA, USA",
isbn = "9781558601000"
}
@ARTICLE{Lecun1998-im,
title = "Gradient-based learning applied to document recognition",
author = "Lecun, Y and Bottou, L and Bengio, Y and Haffner, P",
abstract = "Multilayer neural networks trained with the back-propagation
algorithm constitute the best example of a successful gradient
based learning technique. Given an appropriate network
architecture, gradient-based learning algorithms can be used to
synthesize a complex decision surface that can classify
high-dimensional patterns, such as handwritten characters, with
minimal preprocessing. This paper reviews various methods applied
to handwritten character recognition and compares them on a
standard handwritten digit recognition task. Convolutional neural
networks, which are specifically designed to deal with the
variability of 2D shapes, are shown to outperform all other
techniques. Real-life document recognition systems are composed
of multiple modules including field extraction, segmentation
recognition, and language modeling. A new learning paradigm,
called graph transformer networks (GTN), allows such multimodule
systems to be trained globally using gradient-based methods so as
to minimize an overall performance measure. Two systems for
online handwriting recognition are described. Experiments
demonstrate the advantage of global training, and the flexibility
of graph transformer networks. A graph transformer network for
reading a bank cheque is also described. It uses convolutional
neural network character recognizers combined with global
training techniques to provide record accuracy on business and
personal cheques. It is deployed commercially and reads several
million cheques per day.",
journal = "Proc. IEEE",
volume = 86,
number = 11,
pages = "2278--2324",
month = nov,
year = 1998,
keywords = "Neural networks;Pattern recognition;Machine learning;Optical
character recognition software;Character recognition;Feature
extraction;Multi-layer neural network;Optical computing;Hidden
Markov models;Principal component analysis",
issn = "1558-2256",
doi = "10.1109/5.726791"
}
@ARTICLE{LeCun1989-us,
title = "Backpropagation Applied to Handwritten Zip Code Recognition",
author = "LeCun, Y and Boser, B and Denker, J S and Henderson, D and
Howard, R E and Hubbard, W and Jackel, L D",
abstract = "The ability of learning networks to generalize can be greatly
enhanced by providing constraints from the task domain. This
paper demonstrates how such constraints can be integrated into a
backpropagation network through the architecture of the network.
This approach has been successfully applied to the recognition of
handwritten zip code digits provided by the U.S. Postal Service.
A single network learns the entire recognition operation, going
from the normalized image of the character to the final
classification.",
journal = "Neural Comput.",
volume = 1,
number = 4,
pages = "541--551",
month = dec,
year = 1989,
issn = "0899-7667",
doi = "10.1162/neco.1989.1.4.541"
}
@ARTICLE{Fukushima1980-zv,
title = "Neocognitron: a self organizing neural network model for a
mechanism of pattern recognition unaffected by shift in position",
author = "Fukushima, K",
abstract = "A neural network model for a mechanism of visual pattern
recognition is proposed in this paper. The network is
self-organized by ``learning without a teacher'', and acquires an
ability to recognize stimulus patterns based on the geometrical
similarity (Gestalt) of their shapes without affected by their
positions. This network is given a nickname ``neocognitron''.
After completion of self-organization, the network has a
structure similar to the hierarchy model of the visual nervous
system proposed by Hubel and Wiesel. The network consists of an
input layer (photoreceptor array) followed by a cascade
connection of a number of modular structures, each of which is
composed of two layers of cells connected in a cascade. The first
layer of each module consists of ``S-cells'', which show
characteristics similar to simple cells or lower order
hypercomplex cells, and the second layer consists of ``C-cells''
similar to complex cells or higher order hypercomplex cells. The
afferent synapses to each S-cell have plasticity and are
modifiable. The network has an ability of unsupervised learning:
We do not need any ``teacher'' during the process of
self-organization, and it is only needed to present a set of
stimulus patterns repeatedly to the input layer of the network.
The network has been simulated on a digital computer. After
repetitive presentation of a set of stimulus patterns, each
stimulus pattern has become to elicit an output only from one of
the C-cells of the last layer, and conversely, this C-cell has
become selectively responsive only to that stimulus pattern. That
is, none of the C-cells of the last layer responds to more than
one stimulus pattern. The response of the C-cells of the last
layer is not affected by the pattern's position at all. Neither
is it affected by a small change in shape nor in size of the
stimulus pattern.",
journal = "Biol. Cybern.",
volume = 36,
number = 4,
pages = "193--202",
year = 1980,
language = "en",
issn = "0340-1200",
pmid = "7370364",
doi = "10.1007/BF00344251"
}
@ARTICLE{Krizhevsky2012-nl,
title = "Imagenet classification with deep convolutional neural networks",
author = "Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E",
journal = "Adv. Neural Inf. Process. Syst.",
volume = 25,
year = 2012,
issn = "1049-5258"
}
@ARTICLE{2016-as,
  title    = "Research Trends in Deep Learning for Image Recognition:
              Developments of Convolutional Neural Networks and Their
              Applications (Frontiers of Neural Network Research)",
  author   = "Okatani, Takayuki",
  journal  = "Journal of the Japanese Society for Artificial Intelligence",
  volume   = 31,
  number   = 2,
  pages    = "169--179",
  year     = 2016,
  language = "ja",
  doi      = "10.11517/jjsai.31.2\_169"
}
@BOOK{Minsky2017-ab,
title = "Perceptrons: An introduction to computational geometry",
author = "Minsky, Marvin and Papert, Seymour A",
publisher = "The MIT Press",
year = 2017,
isbn = "9780262343930",
doi = "10.7551/mitpress/11301.001.0001"
}
@ARTICLE{2020NumPy-Array,
author = {Harris, Charles R. and Millman, K. Jarrod and van der Walt, Stéfan J and Gommers, Ralf and Virtanen, Pauli and Cournapeau, David and Wieser, Eric and Taylor, Julian and Berg, Sebastian and Smith, Nathaniel J. and Kern, Robert and Picus, Matti and Hoyer, Stephan and van Kerkwijk, Marten H. and Brett, Matthew and Haldane, Allan and Fernández del Río, Jaime and Wiebe, Mark and Peterson, Pearu and Gérard-Marchant, Pierre and Sheppard, Kevin and Reddy, Tyler and Weckesser, Warren and Abbasi, Hameer and Gohlke, Christoph and Oliphant, Travis E.},
title = {Array programming with {NumPy}},
journal = {Nature},
year = {2020},
volume = {585},
  pages = {357--362},
doi = {10.1038/s41586-020-2649-2}
}
@INCOLLECTION{NEURIPS2019_9015,
title = {PyTorch: An Imperative Style, High-Performance Deep Learning Library},
author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
booktitle = {Advances in Neural Information Processing Systems 32},
pages = {8024--8035},
year = {2019},
publisher = {Curran Associates, Inc.},
url = {http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf}
}
@ARTICLE{Hornik1989-cu,
title = "Multilayer feedforward networks are universal approximators",
author = "Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert",
abstract = "This paper rigorously establishes that standard multilayer
feedforward networks with as few as one hidden layer using
arbitrary squashing functions are capable of approximating any
Borel measurable function from one finite dimensional space to
another to any desired degree of accuracy, provided sufficiently
many hidden units are available. In this sense, multilayer
feedforward networks are a class of universal approximators.",
journal = "Neural Netw.",
volume = 2,
number = 5,
pages = "359--366",
month = jan,
year = 1989,
keywords = "Feedforward networks; Universal approximation; Mapping networks;
Network representation capability; Stone-Weierstrass Theorem;
Squashing functions; Sigma-Pi networks; Back-propagation networks",
issn = "0893-6080",
doi = "10.1016/0893-6080(89)90020-8"
}
@ARTICLE{Pinkus1999-hg,
title = "Approximation theory of the {MLP} model in neural networks",
author = "Pinkus, Allan",
abstract = "In this survey we discuss various approximation-theoretic
problems that arise in the multilayer feedforward perceptron
(MLP) model in neural networks. The MLP model is one of the more
popular and practical of the many neural network models.
Mathematically it is also one of the simpler models. Nonetheless
the mathematics of this model is not well understood, and many
of these problems are approximation-theoretic in character. Most
of the research we will discuss is of very recent vintage. We
will report on what has been done and on various unanswered
questions. We will not be presenting practical (algorithmic)
methods. We will, however, be exploring the capabilities and
limitations of this model.",
journal = "Acta Numer.",
publisher = "Cambridge University Press",
volume = 8,
pages = "143--195",
month = jan,
year = 1999,
issn = "0962-4929, 1474-0508",
doi = "10.1017/S0962492900002919"
}
@ARTICLE{Leshno1993-sj,
title = "Multilayer feedforward networks with a nonpolynomial activation
function can approximate any function",
author = "Leshno, Moshe and Lin, Vladimir Ya and Pinkus, Allan and
Schocken, Shimon",
abstract = "Several researchers characterized the activation function under
which multilayer feedforward networks can act as universal
approximators. We show that most of all the characterizations
that were reported thus far in the literature are special cases
of the following general result: A standard multilayer
feedforward network with a locally bounded piecewise continuous
activation function can approximate any continuous function to
any degree of accuracy if and only if the network's activation
function is not a polynomial. We also emphasize the important
role of the threshold, asserting that without it the last theorem
does not hold.",
journal = "Neural Netw.",
volume = 6,
number = 6,
pages = "861--867",
month = jan,
year = 1993,
keywords = "Multilayer feedforward networks; Activation functions; Role of
threshold; Universal approximation capabilities; ($\mu$)
approximation",
issn = "0893-6080",
doi = "10.1016/S0893-6080(05)80131-5"
}
@ARTICLE{Hornik1991-gh,
title = "Approximation capabilities of multilayer feedforward networks",
author = "Hornik, Kurt",
abstract = "We show that standard multilayer feedforward networks with as few
as a single hidden layer and arbitrary bounded and nonconstant
activation function are universal approximators with respect to
Lp($\mu$) performance criteria, for arbitrary finite input
environment measures $\mu$, provided only that sufficiently many
hidden units are available. If the activation function is
continuous, bounded and nonconstant, then continuous mappings can
be learned uniformly over compact input sets. We also give very
general conditions ensuring that networks with sufficiently
smooth activation functions are capable of arbitrarily accurate
approximation to a function and its derivatives.",
journal = "Neural Netw.",
volume = 4,
number = 2,
pages = "251--257",
month = jan,
year = 1991,
keywords = "Multilayer feedforward networks; Activation function; Universal
approximation capabilities; Input environment measure; ()
approximation; Uniform approximation; Sobolev spaces; Smooth
approximation",
issn = "0893-6080",
doi = "10.1016/0893-6080(91)90009-T"
}
@ARTICLE{Cybenko1989-dm,
title = "Approximation by superpositions of a sigmoidal function",
author = "Cybenko, G",
abstract = "In this paper we demonstrate that finite linear combinations of
compositions of a fixed, univariate function and a set of affine
functionals can uniformly approximate any continuous function
ofn real variables with support in the unit hypercube; only mild
conditions are imposed on the univariate function. Our results
settle an open question about representability in the class of
single hidden layer neural networks. In particular, we show that
arbitrary decision regions can be arbitrarily well approximated
by continuous feedforward neural networks with only a single
internal, hidden layer and any continuous sigmoidal
nonlinearity. The paper discusses approximation properties of
other possible types of nonlinearities that might be implemented
by artificial neural networks.",
journal = "Math. Control Signals Systems",
publisher = "Springer Science and Business Media LLC",
volume = 2,
number = 4,
pages = "303--314",
month = dec,
year = 1989,
language = "en",
issn = "0932-4194, 1435-568X",
doi = "10.1007/bf02551274"
}
@ARTICLE{McCulloch1943-py,
title = "A logical calculus of the ideas immanent in nervous activity",
author = "McCulloch, Warren S and Pitts, Walter",
abstract = "Because of the ``all-or-none'' character of nervous activity,
neural events and the relations among them can be treated by
means of propositional logic. It is found that the behavior of
every net can be described in these terms, with the addition of
more complicated logical means for nets containing circles; and
that for any logical expression satisfying certain conditions,
one can find a net behaving in the fashion it describes. It is
shown that many particular choices among possible
neurophysiological assumptions are equivalent, in the sense that
for every net behaving under one assumption, there exists another
net which behaves under the other and gives the same results,
although perhaps not in the same time. Various applications of
the calculus are discussed.",
journal = "Bull. Math. Biophys.",
volume = 5,
number = 4,
pages = "115--133",
month = dec,
year = 1943,
issn = "0007-4985, 1522-9602",
doi = "10.1007/BF02478259"
}
@ARTICLE{Rosenblatt1958-qg,
title = "The perceptron: A probabilistic model for information storage and
organization in the brain",
author = "Rosenblatt, F",
abstract = "To answer the questions of how information about the physical
world is sensed, in what form is information remembered, and how
does information retained in memory influence recognition and
behavior, a theory is developed for a hypothetical nervous system
called a perceptron. The theory serves as a bridge between
biophysics and psychology. It is possible to predict learning
curves from neurological variables and vice versa. The
quantitative statistical approach is fruitful in the
understanding of the organization of cognitive systems. 18
references. (PsycINFO Database Record (c) 2016 APA, all rights
reserved)",
journal = "Psychol. Rev.",
volume = 65,
number = 6,
pages = "386--408",
month = nov,
year = 1958,
issn = "0033-295X, 1939-1471",
doi = "10.1037/h0042519"
}
@ARTICLE{Hochreiter1997-mz,
title = "Long short-term memory",
author = "Hochreiter, S and Schmidhuber, J",
abstract = "Learning to store information over extended time intervals by
recurrent backpropagation takes a very long time, mostly because
of insufficient, decaying error backflow. We briefly review
Hochreiter's (1991) analysis of this problem, then address it by
introducing a novel, efficient, gradient-based method called long
short-term memory (LSTM). Truncating the gradient where this does
not do harm, LSTM can learn to bridge minimal time lags in excess
of 1000 discrete-time steps by enforcing constant error flow
through constant error carousels within special units.
Multiplicative gate units learn to open and close access to the
constant error flow. LSTM is local in space and time; its
computational complexity per time step and weight is O(1). Our
experiments with artificial data involve local, distributed,
real-valued, and noisy pattern representations. In comparisons
with real-time recurrent learning, back propagation through time,
recurrent cascade correlation, Elman nets, and neural sequence
chunking, LSTM leads to many more successful runs, and learns
much faster. LSTM also solves complex, artificial long-time-lag
tasks that have never been solved by previous recurrent network
algorithms.",
journal = "Neural Comput.",
volume = 9,
number = 8,
pages = "1735--1780",
month = nov,
year = 1997,
language = "en",
issn = "0899-7667",
pmid = "9377276",
doi = "10.1162/neco.1997.9.8.1735"
}
@INPROCEEDINGS{Reimers2019-uh,
title = "{Sentence-{BERT}}: Sentence Embeddings using {S}iamese
{{BERT}-Networks}",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in
Natural Language Processing and the 9th International Joint
Conference on Natural Language Processing ({EMNLP-IJCNLP})",
author = "Reimers, Nils and Gurevych, Iryna",
abstract = "BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) has
set a new state-of-the-art performance on sentence-pair
regression tasks like semantic textual similarity (STS).
However, it requires that both sentences are fed into the
network, which causes a massive computational overhead: Finding
the most similar pair in a collection of 10,000 sentences
requires about 50 million inference computations
                 (\textasciitilde{}65 hours) with BERT. The
construction of BERT makes it unsuitable for semantic similarity
search as well as for unsupervised tasks like clustering. In
this publication, we present Sentence-BERT (SBERT), a
modification of the pretrained BERT network that use siamese and
triplet network structures to derive semantically meaningful
sentence embeddings that can be compared using
cosine-similarity. This reduces the effort for finding the most
similar pair from 65 hours with BERT / RoBERTa to about 5
seconds with SBERT, while maintaining the accuracy from BERT. We
evaluate SBERT and SRoBERTa on common STS tasks and transfer
learning tasks, where it outperforms other state-of-the-art
sentence embeddings methods.",
publisher = "Association for Computational Linguistics",
pages = "3982--3992",
month = nov,
year = 2019,
address = "Hong Kong, China",
doi = "10.18653/v1/D19-1410"
}
@ARTICLE{Blei2012-zn,
title = "Probabilistic topic models",
author = "Blei, David M",
abstract = "Surveying a suite of algorithms that offer a solution to
managing large document archives.",
journal = "Commun. ACM",
publisher = "Association for Computing Machinery",
volume = 55,
number = 4,
pages = "77--84",
month = apr,
year = 2012,
address = "New York, NY, USA",
issn = "0001-0782",
doi = "10.1145/2133806.2133826"
}
@INPROCEEDINGS{Devlin2019-pa,
title = "{BERT:} Pre-training of Deep Bidirectional Transformers for
Language Understanding",
booktitle = "Proceedings of the 2019 Conference of the North {A}merican
Chapter of the Association for Computational Linguistics: Human
Language Technologies, Volume 1 (Long and Short Papers)",
author = "Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova,
Kristina",
abstract = "We introduce a new language representation model called BERT,
which stands for Bidirectional Encoder Representations from
Transformers. Unlike recent language representation models
(Peters et al., 2018a; Radford et al., 2018), BERT is designed
to pre-train deep bidirectional representations from unlabeled
text by jointly conditioning on both left and right context in
all layers. As a result, the pre-trained BERT model can be
fine-tuned with just one additional output layer to create
state-of-the-art models for a wide range of tasks, such as
question answering and language inference, without substantial
task-specific architecture modifications. BERT is conceptually
simple and empirically powerful. It obtains new state-of-the-art
results on eleven natural language processing tasks, including
pushing the GLUE score to 80.5 (7.7 point absolute improvement),
MultiNLI accuracy to 86.7\% (4.6\% absolute improvement), SQuAD
v1.1 question answering Test F1 to 93.2 (1.5 point absolute
improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute
improvement).",
publisher = "Association for Computational Linguistics",
pages = "4171--4186",
month = jun,
year = 2019,
address = "Minneapolis, Minnesota",
doi = "10.18653/v1/N19-1423"
}
@ARTICLE{Nandwani2021-ew,
title = "A review on sentiment analysis and emotion detection from text",
author = "Nandwani, Pansy and Verma, Rupali",
abstract = "Social networking platforms have become an essential means for
communicating feelings to the entire world due to rapid expansion
in the Internet era. Several people use textual content,
pictures, audio, and video to express their feelings or
viewpoints. Text communication via Web-based networking media, on
the other hand, is somewhat overwhelming. Every second, a massive
amount of unstructured data is generated on the Internet due to
social media platforms. The data must be processed as rapidly as
generated to comprehend human psychology, and it can be
accomplished using sentiment analysis, which recognizes polarity
in texts. It assesses whether the author has a negative,
positive, or neutral attitude toward an item, administration,
individual, or location. In some applications, sentiment analysis
is insufficient and hence requires emotion detection, which
determines an individual's emotional/mental state precisely. This
review paper provides understanding into levels of sentiment
analysis, various emotion models, and the process of sentiment
analysis and emotion detection from text. Finally, this paper
discusses the challenges faced during sentiment and emotion
analysis.",
journal = "Soc Netw Anal Min",
volume = 11,
number = 1,
pages = "81",
month = aug,
year = 2021,
keywords = "Affective computing; Natural language processing; Opinion mining;
Pre-processing; Word embedding",
language = "en",
issn = "1869-5450",
pmid = "34484462",
doi = "10.1007/s13278-021-00776-6",
pmc = "PMC8402961"
}
@ARTICLE{Almeida2019-dg,
title = "Word Embeddings: A Survey",
author = "Almeida, Felipe and Xex{\'e}o, Geraldo",
abstract = "The main recent strategies for building fixed-length, dense and
distributed representations for words, based on the
distributional hypothesis, are described, which are now commonly
called word embeddings. This work lists and describes the main
recent strategies for building fixed-length, dense and
distributed representations for words, based on the
distributional hypothesis. These representations are now commonly
called word embeddings and, in addition to encoding surprisingly
good syntactic and semantic information, have been proven useful
as extra features in many downstream NLP tasks.",
journal = "ArXiv",
year = 2019,
language = "en",
arxivid = "1901.09069"
}
@INPROCEEDINGS{plsi,
title = "Probabilistic latent semantic indexing",
booktitle = "Proceedings of the 22nd annual international {ACM} {SIGIR}
conference on Research and development in information retrieval",
author = "Hofmann, Thomas",
publisher = "Association for Computing Machinery",
pages = "50--57",
series = "SIGIR '99",
month = aug,
year = 1999,
address = "New York, NY, USA",
location = "Berkeley, California, USA",
isbn = "9781581130966",
doi = "10.1145/312624.312649"
}
@INPROCEEDINGS{Goodfellow2014-eg,
title = "Generative Adversarial Nets",
booktitle = "Advances in Neural Information Processing Systems",
author = "Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu,
Bing and Warde-Farley, David and Ozair, Sherjil and Courville,
Aaron and Bengio, Yoshua",
editor = "Ghahramani, Z and Welling, M and Cortes, C and Lawrence, N and
Weinberger, K Q",
publisher = "Curran Associates, Inc.",
volume = 27,
year = 2014
}
@ARTICLE{dropout,
title = "Dropout: A Simple Way to Prevent Neural Networks from Overfitting",
author = "Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and
Sutskever, Ilya and Salakhutdinov, Ruslan",
journal = "J. Mach. Learn. Res.",
volume = 15,
number = 56,
pages = "1929--1958",
year = 2014,
issn = "1532-4435, 1533-7928"
}
@ARTICLE{Murakami2022-ah,
title = "Investigating the Efficient Use of Word Embedding with
{Neural-Topic} Models for Interpretable Topics from Short Texts",
author = "Murakami, Riki and Chakraborty, Basabi",
abstract = "With the rapid proliferation of social networking sites (SNS),
automatic topic extraction from various text messages posted on
SNS are becoming an important source of information for
understanding current social trends or needs. Latent Dirichlet
Allocation (LDA), a probabilistic generative model, is one of the
popular topic models in the area of Natural Language Processing
(NLP) and has been widely used in information retrieval, topic
extraction, and document analysis. Unlike long texts from formal
documents, messages on SNS are generally short. Traditional topic
models such as LDA or pLSA (probabilistic latent semantic
analysis) suffer performance degradation for short-text analysis
due to a lack of word co-occurrence information in each short
text. To cope with this problem, various techniques are evolving
for interpretable topic modeling for short texts, pretrained word
embedding with an external corpus combined with topic models is
one of them. Due to recent developments of deep neural networks
(DNN) and deep generative models, neural-topic models (NTM) are
emerging to achieve flexibility and high performance in topic
modeling. However, there are very few research works on
neural-topic models with pretrained word embedding for generating
high-quality topics from short texts. In this work, in addition
to pretrained word embedding, a fine-tuning stage with an
original corpus is proposed for training neural-topic models in
order to generate semantically coherent, corpus-specific topics.
An extensive study with eight neural-topic models has been
completed to check the effectiveness of additional fine-tuning
and pretrained word embedding in generating interpretable topics
by simulation experiments with several benchmark datasets. The
extracted topics are evaluated by different metrics of topic
coherence and topic diversity. We have also studied the
performance of the models in classification and clustering tasks.
Our study concludes that though auxiliary word embedding with a
large external corpus improves the topic coherency of short
texts, an additional fine-tuning stage is needed for generating
more corpus-specific topics from short-text data.",
journal = "Sensors",
volume = 22,
number = 3,
month = jan,
year = 2022,
keywords = "coherent topic; fine-tuning; neural-topic model; pretrained word
embedding; short-text data",
language = "en",
issn = "1424-8220",
pmid = "35161598",
doi = "10.3390/s22030852",
pmc = "PMC8840106"
}
@ARTICLE{NSTM,
title = "Neural Topic Model via Optimal Transport",
author = "Zhao, He and Phung, Dinh and Huynh, Viet and Le, Trung and
Buntine, Wray",
abstract = "Recently, Neural Topic Models (NTMs) inspired by variational
autoencoders have obtained increasingly research interest
due to their promising results on text analysis. However, it
is usually hard for existing NTMs to achieve good document
representation and coherent/diverse topics at the same time.
Moreover, they often degrade their performance severely on
short documents. The requirement of reparameterisation could
also comprise their training quality and model flexibility.
To address these shortcomings, we present a new neural topic
model via the theory of optimal transport (OT).
Specifically, we propose to learn the topic distribution of