@InProceedings{Abadi.Barham.Chen.ea.2016,
  author    = {Abadi, Mart{\'\i}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and
               Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and
               Irving, Geoffrey and Isard, Michael and others},
  title     = {Tensor{F}low: A system for large-scale machine learning},
  booktitle = {12th {USENIX} {S}ymposium on {O}perating {S}ystems {D}esign and
               {I}mplementation ({OSDI} 16)},
  pages     = {265--283},
  year      = {2016},
}

@Article{Abdel-Hamid.Mohamed.Jiang.ea.2014,
  author    = {Abdel-Hamid, Ossama and Mohamed, Abdel-Rahman and Jiang, Hui and Deng, Li and
               Penn, Gerald and Yu, Dong},
  title     = {Convolutional neural networks for speech recognition},
  journal   = {IEEE/{ACM} {T}ransactions on {A}udio, {S}peech, and {L}anguage Processing},
  volume    = {22},
  number    = {10},
  pages     = {1533--1545},
  year      = {2014},
  publisher = {IEEE},
}

@InProceedings{Ahmed.Aly.Gonzalez.ea.2012,
  author       = {Ahmed, Amr and Aly, Mohamed and Gonzalez, Joseph and
                  Narayanamurthy, Shravan and Smola, Alexander J},
  title        = {Scalable inference in latent variable models},
  booktitle    = {Proceedings of the {F}ifth {ACM} {I}nternational {C}onference on
                  {W}eb {S}earch and {D}ata {M}ining},
  pages        = {123--132},
  year         = {2012},
  organization = {ACM},
}

@Book{guyon2008feature,
  author    = {Guyon, Isabelle and Gunn, Steve and Nikravesh, Masoud and Zadeh, Lotfi A},
  title     = {Feature {E}xtraction: {F}oundations and {A}pplications},
  year      = {2008},
  publisher = {Springer},
}

@Article{Aji.McEliece.2000,
  author    = {Aji, Srinivas M and McEliece, Robert J},
  title     = {The generalized distributive law},
  journal   = {IEEE {T}ransactions on {I}nformation {T}heory},
  volume    = {46},
  number    = {2},
  pages     = {325--343},
  year      = {2000},
  publisher = {IEEE},
}

@Article{Alsallakh.Kokhlikyan.Miglani.ea.2020,
  author    = {Alsallakh, Bilal and Kokhlikyan, Narine and Miglani, Vivek and Yuan, Jun and
               Reblitz-Richardson, Orion},
  title     = {Mind the {PAD} -- {CNN}s can develop blind spots},
  journal   = {Ar{X}iv:2010.02178},
  year      = {2020},
}

@Article{Aronszajn.1950,
  author    = {Aronszajn, Nachman},
  title     = {Theory of {r}eproducing {k}ernels},
  journal   = {{T}ransactions of the {A}merican {M}athematical {S}ociety},
  volume    = {68},
  number    = {3},
  pages     = {337--404},
  year      = {1950},
}

@Article{Ba.Kiros.Hinton.2016,
  author    = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
  title     = {Layer normalization},
  journal   = {Ar{X}iv:1607.06450},
  year      = {2016},
}

@Article{Bahdanau.Cho.Bengio.2014,
  author    = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  title     = {Neural machine translation by jointly learning to align and translate},
  journal   = {Ar{X}iv:1409.0473},
  year      = {2014},
}

@Article{Bartlett.Montanari.Rakhlin.2021,
  author    = {Bartlett, Peter L and Montanari, Andrea and Rakhlin, Alexander},
  title     = {Deep learning: a statistical viewpoint},
  journal   = {Ar{X}iv:2103.09177},
  year      = {2021},
}

@InProceedings{Bay.Tuytelaars.Van-Gool.2006,
  author       = {Bay, Herbert and Tuytelaars, Tinne and Van Gool, Luc},
  title        = {{SURF}: {S}peeded up robust features},
  booktitle    = {European {C}onference on {C}omputer {V}ision},
  pages        = {404--417},
  year         = {2006},
  organization = {Springer},
}

@Article{Bengio.Ducharme.Vincent.ea.2003,
  author    = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and
               Jauvin, Christian},
  title     = {A neural probabilistic language model},
  journal   = {{J}ournal of {M}achine {L}earning {R}esearch},
  volume    = {3},
  number    = {Feb},
  pages     = {1137--1155},
  year      = {2003},
}

@Article{bengio1994learning,
  author    = {Bengio, Yoshua and Simard, Patrice and Frasconi, Paolo},
  title     = {Learning long-term dependencies with gradient descent is difficult},
  journal   = {IEEE {T}ransactions on {N}eural {N}etworks},
  volume    = {5},
  number    = {2},
  pages     = {157--166},
  year      = {1994},
  publisher = {IEEE},
}


@InProceedings{Bergstra.Breuleux.Bastien.ea.2010,
  author    = {Bergstra, James and Breuleux, Olivier and Bastien, Fr{\'e}d{\'e}ric and
               Lamblin, Pascal and Pascanu, Razvan and Desjardins, Guillaume and
               Turian, Joseph and Warde-Farley, David and Bengio, Yoshua},
  title     = {Theano: A {CPU} and {GPU} math compiler in {P}ython},
  booktitle = {Proc. 9th {P}ython in {S}cience {C}onference},
  volume    = {1},
  pages     = {3--10},
  year      = {2010},
}

@InProceedings{Beutel.Murray.Faloutsos.ea.2014,
  author    = {Beutel, Alex and Murray, Kenton and Faloutsos, Christos and
               Smola, Alexander J},
  title     = {Co{B}a{F}i: collaborative {B}ayesian filtering},
  booktitle = {Proceedings of the 23rd {I}nternational {C}onference on {W}orld {W}ide {W}eb},
  pages     = {97--108},
  year      = {2014},
}

@Article{Bishop.1995,
  author    = {Bishop, Chris M},
  title     = {Training with noise is equivalent to {T}ikhonov regularization},
  journal   = {Neural {C}omputation},
  volume    = {7},
  number    = {1},
  pages     = {108--116},
  year      = {1995},
  publisher = {MIT Press},
}

@Book{Bishop.2006,
  author    = {Bishop, Christopher M},
  title     = {Pattern {R}ecognition and {M}achine {L}earning},
  year      = {2006},
  publisher = {Springer},
}

@Article{Black.Scholes.1973,
  author    = {Black, Fischer and Scholes, Myron},
  title     = {The pricing of options and corporate liabilities},
  journal   = {Journal of {P}olitical {E}conomy},
  volume    = {81},
  pages     = {637--654},
  year      = {1973},
  publisher = {JSTOR},
}

@InProceedings{Bodla.Singh.Chellappa.ea.2017,
  author    = {Bodla, Navaneeth and Singh, Bharat and Chellappa, Rama and Davis, Larry S},
  title     = {Soft-{NMS}-improving object detection with one line of code},
  booktitle = {Proceedings of the {IEEE} {I}nternational {C}onference on {C}omputer {V}ision},
  pages     = {5561--5569},
  year      = {2017},
}

@Article{Bojanowski.Grave.Joulin.ea.2017,
  author    = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  title     = {Enriching word vectors with subword information},
  journal   = {{T}ransactions of the {A}ssociation for {C}omputational {L}inguistics},
  volume    = {5},
  pages     = {135--146},
  year      = {2017},
  publisher = {MIT {P}ress},
}

@Book{Bollobas.1999,
  author    = {Bollob{\'a}s, B},
  title     = {Linear {A}nalysis},
  year      = {1999},
  publisher = {Cambridge {U}niversity {P}ress},
}

@InCollection{Bottou.2010,
  author    = {Bottou, L{\'e}on},
  title     = {Large-scale machine learning with stochastic gradient descent},
  booktitle = {Proceedings of {COMPSTAT}'2010},
  pages     = {177--186},
  year      = {2010},
  publisher = {Springer},
}

@InProceedings{Bottou.Le-Cun.1988,
  author    = {Bottou, L{\'e}on and {Le Cun}, Yann},
  title     = {{SN}: A simulator for connectionist models},
  booktitle = {Proceedings of {N}euro{N}imes 88},
  address   = {Nimes, France},
  pages     = {371--382},
  year      = {1988},
  url       = {http://leon.bottou.org/papers/bottou-lecun-88},
}

@Article{Bowman.Angeli.Potts.ea.2015,
  author    = {Bowman, Samuel R and Angeli, Gabor and Potts, Christopher and
               Manning, Christopher D},
  title     = {A large annotated corpus for learning natural language inference},
  journal   = {Ar{X}iv:1508.05326},
  year      = {2015},
}

@Book{Boyd.Vandenberghe.2004,
  author    = {Stephen Boyd and Lieven Vandenberghe},
  title     = {Convex {O}ptimization},
  year      = {2004},
  publisher = {Cambridge {U}niversity {P}ress},
  address   = {Cambridge, England},
}

@Article{Bradley.Terry.1952,
  author    = {Bradley, Ralph Allan and Terry, Milton E},
  title     = {Rank analysis of incomplete block designs: {I}. {T}he method of paired
               comparisons},
  journal   = {Biometrika},
  volume    = {39},
  number    = {3/4},
  pages     = {324--345},
  year      = {1952},
  publisher = {JSTOR},
}

@InProceedings{Brown.Cocke.Della-Pietra.ea.1988,
  author    = {Brown, Peter F and Cocke, John and Della Pietra, Stephen A and
               Della Pietra, Vincent J and Jelinek, Frederick and Mercer, Robert L and
               Roossin, Paul},
  title     = {A statistical approach to language translation},
  booktitle = {{COLING} {B}udapest 1988 {V}olume 1: {I}nternational {C}onference on
               {C}omputational {L}inguistics},
  year      = {1988},
}

@Article{Brown.Cocke.Della-Pietra.ea.1990,
  author    = {Brown, Peter F and Cocke, John and Della Pietra, Stephen A and
               Della Pietra, Vincent J and Jelinek, Frederick and Lafferty, John and
               Mercer, Robert L and Roossin, Paul S},
  title     = {A statistical approach to machine translation},
  journal   = {{C}omputational {L}inguistics},
  volume    = {16},
  number    = {2},
  pages     = {79--85},
  year      = {1990},
}

@InProceedings{Brown.Sandholm.2017,
  author    = {Brown, Noam and Sandholm, Tuomas},
  title     = {Libratus: The superhuman {AI} for no-limit poker},
  booktitle = {{IJCAI}},
  pages     = {5226--5228},
  year      = {2017},
}

@Article{Buslaev.Iglovikov.Khvedchenya.ea.2020,
  author    = {Buslaev, Alexander and Iglovikov, Vladimir I and Khvedchenya, Eugene and
               Parinov, Alex and Druzhinin, Mikhail and Kalinin, Alexandr A},
  title     = {Albumentations: {F}ast and flexible image augmentations},
  journal   = {Information},
  volume    = {11},
  number    = {2},
  pages     = {125},
  year      = {2020},
  publisher = {Multidisciplinary {D}igital {P}ublishing {I}nstitute},
}

@Book{Cajal.Azoulay.1894,
  author    = {Ram{\'o}n y {C}ajal, Santiago and Azoulay, L.},
  title     = {Les {N}ouvelles {I}d{\'e}es sur la {S}tructure du {S}yst{\`e}me {N}erveux chez
               l'{H}omme et chez les {V}ert{\'e}br{\'e}s},
  year      = {1894},
  publisher = {Paris, C. {R}einwald \& {C}ie},
}

@Article{Campbell.Hoane-Jr.Hsu.2002,
  author    = {Campbell, Murray and Hoane, Jr, A Joseph and Hsu, Feng-hsiung},
  title     = {Deep blue},
  journal   = {Artificial {I}ntelligence},
  volume    = {134},
  number    = {1-2},
  pages     = {57--83},
  year      = {2002},
  publisher = {Elsevier},
}

@InCollection{Canny.1987,
  author    = {Canny, John},
  title     = {A computational approach to edge detection},
  booktitle = {Readings in {C}omputer {V}ision},
  pages     = {184--203},
  year      = {1987},
  publisher = {Elsevier},
}

@Article{Cantelli.1933,
  author    = {F.~P. Cantelli},
  title     = {Sulla probabilit{\`a} come limite della frequenza},
  journal   = {Rend.\ {A}ccad.\ {L}incei},
  volume    = {26},
  number    = {1},
  pages     = {39},
  year      = {1933},
}

@InProceedings{Cer.Diab.Agirre.ea.2017,
  author    = {Cer, Daniel and Diab, Mona and Agirre, Eneko and Lopez-Gazpio, I{\~n}igo and
               Specia, Lucia},
  title     = {Sem{E}val-2017 {T}ask 1: Semantic textual similarity multilingual and
               crosslingual focused evaluation},
  booktitle = {Proceedings of the 11th {I}nternational {W}orkshop on {S}emantic
               {E}valuation ({S}em{E}val-2017)},
  pages     = {1--14},
  year      = {2017},
}

@Article{Chen.Li.Li.ea.2015,
  author    = {Chen, Tianqi and Li, Mu and Li, Yutian and Lin, Min and Wang, Naiyan and
               Wang, Minjie and Xiao, Tianjun and Xu, Bing and Zhang, Chiyuan and
               Zhang, Zheng},
  title     = {{MXNET}: A flexible and efficient machine learning library for heterogeneous
               distributed systems},
  journal   = {Ar{X}iv:1512.01274},
  year      = {2015},
}

@InProceedings{Cheng.Dong.Lapata.2016,
  author    = {Cheng, Jianpeng and Dong, Li and Lapata, Mirella},
  title     = {Long short-term memory-networks for machine reading},
  booktitle = {Proceedings of the 2016 {C}onference on {E}mpirical {M}ethods in
               {N}atural {L}anguage {P}rocessing},
  pages     = {551--561},
  year      = {2016},
}

@Article{Chetlur.Woolley.Vandermersch.ea.2014,
  author    = {Chetlur, Sharan and Woolley, Cliff and Vandermersch, Philippe and
               Cohen, Jonathan and Tran, John and Catanzaro, Bryan and Shelhamer, Evan},
  title     = {cu{DNN}: {E}fficient primitives for deep learning},
  journal   = {Ar{X}iv:1410.0759},
  year      = {2014},
}

@Article{Cho.Van-Merrienboer.Bahdanau.ea.2014,
  author    = {Cho, Kyunghyun and Van Merri{\"e}nboer, Bart and Bahdanau, Dzmitry and
               Bengio, Yoshua},
  title     = {On the properties of neural machine translation: {E}ncoder--decoder
               approaches},
  journal   = {Ar{X}iv:1409.1259},
  year      = {2014},
}

@Article{Cho.Van-Merrienboer.Gulcehre.ea.2014,
  author    = {Cho, Kyunghyun and Van Merri{\"e}nboer, Bart and Gulcehre, Caglar and
               Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger and
               Bengio, Yoshua},
  title     = {Learning phrase representations using {RNN} encoder--decoder for statistical
               machine translation},
  journal   = {Ar{X}iv:1406.1078},
  year      = {2014},
}

@Book{Chowdhury.2010,
  author    = {Chowdhury, Gobinda G},
  title     = {Introduction to {M}odern {I}nformation {R}etrieval},
  year      = {2010},
  publisher = {Facet {P}ublishing},
}

@Article{Chung.Gulcehre.Cho.ea.2014,
  author    = {Chung, Junyoung and Gulcehre, Caglar and Cho, KyungHyun and Bengio, Yoshua},
  title     = {Empirical evaluation of gated recurrent neural networks on sequence modeling},
  journal   = {Ar{X}iv:1412.3555},
  year      = {2014},
}

@Article{Collobert.Weston.Bottou.ea.2011,
  author    = {Collobert, Ronan and Weston, Jason and Bottou, L{\'e}on and Karlen, Michael and
               Kavukcuoglu, Koray and Kuksa, Pavel},
  title     = {Natural language processing (almost) from scratch},
  journal   = {{J}ournal of {M}achine {L}earning {R}esearch},
  volume    = {12},
  pages     = {2493--2537},
  year      = {2011},
}

@Article{Corfield.Scholkopf.Vapnik.2009,
  author    = {Corfield, David and Sch{\"o}lkopf, Bernhard and Vapnik, Vladimir},
  title     = {Falsificationism and statistical learning theory: Comparing the {P}opper and
               {V}apnik--{C}hervonenkis dimensions},
  journal   = {Journal for {G}eneral {P}hilosophy of {S}cience},
  volume    = {40},
  number    = {1},
  pages     = {51--58},
  year      = {2009},
  publisher = {Springer},
}

@Book{Cover.Thomas.1999,
  author    = {Cover, Thomas M. and Thomas, Joy A.},
  title     = {Elements of {I}nformation {T}heory},
  year      = {1999},
  publisher = {John {W}iley \& {S}ons},
}

@Book{Cramer.1946,
  author    = {Cram{\'e}r, Harald},
  title     = {Mathematical {M}ethods of {S}tatistics},
  year      = {1946},
  publisher = {Princeton {U}niversity {P}ress},
}

@Article{Csiszar.2008,
  author    = {Csisz{\'a}r, Imre},
  title     = {Axiomatic characterizations of information measures},
  journal   = {Entropy},
  volume    = {10},
  number    = {3},
  pages     = {261--273},
  year      = {2008},
  publisher = {Molecular {D}iversity {P}reservation {I}nternational},
}

@Article{Cybenko.1989,
  author    = {Cybenko, George},
  title     = {Approximation by superpositions of a sigmoidal function},
  journal   = {Mathematics of {C}ontrol, {S}ignals and {S}ystems},
  volume    = {2},
  number    = {4},
  pages     = {303--314},
  year      = {1989},
  publisher = {Springer},
}

@InProceedings{Dalal.Triggs.2005,
  author       = {Dalal, Navneet and Triggs, Bill},
  title        = {Histograms of oriented gradients for human detection},
  booktitle    = {2005 {IEEE} {C}omputer {S}ociety {C}onference on {C}omputer {V}ision and
                  {P}attern {R}ecognition ({CVPR}'05)},
  volume       = {1},
  pages        = {886--893},
  year         = {2005},
  organization = {{IEEE}},
}

@Article{De-Cock.2011,
  author    = {De Cock, Dean},
  title     = {Ames, {I}owa: Alternative to the {B}oston housing data as an end of semester
               regression project},
  journal   = {Journal of {S}tatistics {E}ducation},
  volume    = {19},
  number    = {3},
  year      = {2011},
  publisher = {Taylor \& {F}rancis},
}

@Article{De-Valois.Albrecht.Thorell.1982,
  author    = {De Valois, Russell L and Albrecht, Duane G and Thorell, Lisa G},
  title     = {Spatial frequency selectivity of cells in macaque visual cortex},
  journal   = {Vision {R}esearch},
  volume    = {22},
  number    = {5},
  pages     = {545--559},
  year      = {1982},
  publisher = {Elsevier},
}

@InProceedings{DeCandia.Hastorun.Jampani.ea.2007,
  author       = {DeCandia, Giuseppe and Hastorun, Deniz and Jampani, Madan and
                  Kakulapati, Gunavardhan and Lakshman, Avinash and Pilchin, Alex and
                  Sivasubramanian, Swaminathan and Vosshall, Peter and Vogels, Werner},
  title        = {Dynamo: {A}mazon's highly available key-value store},
  booktitle    = {{ACM} {SIGOPS} {O}perating {S}ystems {R}eview},
  volume       = {41},
  number       = {6},
  pages        = {205--220},
  year         = {2007},
  organization = {ACM},
}

@InProceedings{Dean.Corrado.Monga.ea.2012,
  author    = {Dean, Jeffrey and Corrado, Greg S and Monga, Rajat and Chen, Kai and
               Devin, Matthieu and Le, Quoc V and Mao, Mark Z and Ranzato, Marc'Aurelio and
               Senior, Andrew and Tucker, Paul and others},
  title     = {Large scale distributed deep networks},
  booktitle = {Proceedings of the 25th {I}nternational {C}onference on {N}eural
               {I}nformation {P}rocessing {S}ystems, {V}olume 1},
  pages     = {1223--1231},
  year      = {2012},
}

@InProceedings{Deng.Dong.Socher.ea.2009,
  author       = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and
                  Fei-Fei, Li},
  title        = {{I}mage{N}et: A large-scale hierarchical image database},
  booktitle    = {2009 {IEEE} {C}onference on {C}omputer {V}ision and {P}attern {R}ecognition},
  pages        = {248--255},
  year         = {2009},
  organization = {IEEE},
}

@Article{Der-Kiureghian.Ditlevsen.2009,
  author    = {Der Kiureghian, Armen and Ditlevsen, Ove},
  title     = {Aleatory or epistemic? Does it matter?},
  journal   = {Structural {S}afety},
  volume    = {31},
  number    = {2},
  pages     = {105--112},
  year      = {2009},
  publisher = {Elsevier},
}

@Article{Devlin.Chang.Lee.ea.2018,
  author    = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title     = {{BERT}: {P}re-training of deep bidirectional transformers for language
               understanding},
  journal   = {Ar{X}iv:1810.04805},
  year      = {2018},
}

@InProceedings{Doersch.Gupta.Efros.2015,
  author    = {Doersch, Carl and Gupta, Abhinav and Efros, Alexei A},
  title     = {Unsupervised visual representation learning by context prediction},
  booktitle = {Proceedings of the {IEEE} {I}nternational {C}onference on {C}omputer {V}ision},
  pages     = {1422--1430},
  year      = {2015},
}

@InProceedings{Dosovitskiy.Beyer.Kolesnikov.ea.2021,
  author    = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and
               Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and
               Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and
               Gelly, Sylvain and others},
  title     = {An image is worth 16 x 16 words: Transformers for image recognition at scale},
  booktitle = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year      = {2021},
}

@InProceedings{Dosovitskiy.Beyer.Kolesnikov.ea.2021*1,
  author    = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and
               Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and
               Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and
               Gelly, Sylvain and others},
  title     = {An image is worth $16\times 16$ words: Transformers for image recognition at
               scale},
  booktitle = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year      = {2021},
}

@InCollection{Doucet.De-Freitas.Gordon.2001,
  author    = {Doucet, Arnaud and De Freitas, Nando and Gordon, Neil},
  title     = {An introduction to sequential {M}onte {C}arlo methods},
  booktitle = {Sequential {M}onte {C}arlo {M}ethods in {P}ractice},
  pages     = {3--14},
  year      = {2001},
  publisher = {Springer},
}

@Article{Duchi.Hazan.Singer.2011,
  author    = {Duchi, John and Hazan, Elad and Singer, Yoram},
  title     = {Adaptive subgradient methods for online learning and stochastic optimization},
  journal   = {{J}ournal of {M}achine {L}earning {R}esearch},
  volume    = {12},
  pages     = {2121--2159},
  year      = {2011},
}

@Article{Dumoulin.Visin.2016,
  author    = {Dumoulin, Vincent and Visin, Francesco},
  title     = {A guide to convolution arithmetic for deep learning},
  journal   = {Ar{X}iv:1603.07285},
  year      = {2016},
}

@Article{Edelman.Ostrovsky.Schwarz.2007,
  author    = {Edelman, Benjamin and Ostrovsky, Michael and Schwarz, Michael},
  title     = {Internet advertising and the generalized second-price auction: Selling
               billions of dollars worth of keywords},
  journal   = {American {E}conomic {R}eview},
  volume    = {97},
  number    = {1},
  pages     = {242--259},
  year      = {2007},
}

@Article{elman1990finding,
  author    = {Elman, Jeffrey L},
  title     = {Finding structure in time},
  journal   = {Cognitive {S}cience},
  volume    = {14},
  number    = {2},
  pages     = {179--211},
  year      = {1990},
  publisher = {Wiley Online Library},
}


@Book{Fechner.1860,
  author    = {Fechner, Gustav Theodor},
  title     = {Elemente der {P}sychophysik},
  volume    = {2},
  year      = {1860},
  publisher = {Breitkopf u. {H}{\"a}rtel},
}

@Book{Fernando.2004,
  author    = {Fernando, Randima},
  title     = {{GPU} {G}ems: {P}rogramming {T}echniques, {T}ips, and {T}ricks for
               {R}eal-{T}ime {G}raphics},
  year      = {2004},
  publisher = {Addison-{W}esley},
}

@Article{Field.1987,
  author    = {Field, David J},
  title     = {Relations between the statistics of natural images and the response
               properties of cortical cells},
  journal   = {{JOSA} {A}},
  volume    = {4},
  number    = {12},
  pages     = {2379--2394},
  year      = {1987},
  publisher = {Optical {S}ociety of {A}merica},
}

@Book{Fisher.1928,
  author    = {Fisher, R A},
  title     = {Statistical {M}ethods for {R}esearch {W}orkers},
  year      = {1925},
  publisher = {Oliver \& {B}oyd},
}

@InProceedings{Flammarion.Bach.2015,
  author    = {Flammarion, Nicolas and Bach, Francis},
  title     = {From averaging to acceleration, there is only a step-size},
  booktitle = {{C}onference on {L}earning {T}heory},
  pages     = {658--695},
  year      = {2015},
}

@Article{Frankle.Carbin.2018,
  author    = {Frankle, Jonathan and Carbin, Michael},
  title     = {The lottery ticket hypothesis: Finding sparse, trainable neural networks},
  journal   = {Ar{X}iv:1803.03635},
  year      = {2018},
}

@Article{Frazier.2018,
  author    = {Frazier, Peter I},
  title     = {A tutorial on {B}ayesian optimization},
  journal   = {Ar{X}iv:1807.02811},
  year      = {2018},
}

@InProceedings{Freund.Schapire.ea.1996,
  author       = {Freund, Yoav and Schapire, Robert E},
  title        = {Experiments with a new boosting algorithm},
  booktitle    = {Proceedings of the {I}nternational {C}onference on {M}achine {L}earning},
  volume       = {96},
  pages        = {148--156},
  year         = {1996},
  organization = {Citeseer},
}

@Article{Friedman.1997,
  author    = {Friedman, Jerome H},
  title     = {On bias, variance, 0/1-loss, and the curse-of-dimensionality},
  journal   = {{D}ata {M}ining and {K}nowledge {D}iscovery},
  volume    = {1},
  number    = {1},
  pages     = {55--77},
  year      = {1997},
  publisher = {Springer},
}

@InProceedings{Frostig.Johnson.Leary.2018,
  author    = {Frostig, Roy and Johnson, Matthew James and Leary, Chris},
  title     = {Compiling machine learning programs via high-level tracing},
  booktitle = {Proceedings of {S}ystems for {M}achine {L}earning},
  year      = {2018},
}

@InCollection{Fukushima.1982,
  author    = {Fukushima, Kunihiko},
  title     = {Neocognitron: A self-organizing neural network model for a mechanism of
               visual pattern recognition},
  booktitle = {Competition and {C}ooperation in {N}eural {N}ets},
  pages     = {267--285},
  year      = {1982},
  publisher = {Springer},
}

@InProceedings{Gardner.Pleiss.Weinberger.Bindel.Wilson.2018,
  author    = {Gardner, Jacob and Pleiss, Geoff and Weinberger, Kilian Q and Bindel, David and
               Wilson, Andrew G},
  title     = {{GPyTorch}: Blackbox matrix--matrix {G}aussian process inference with {GPU}
               acceleration},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume    = {31},
  year      = {2018},
}


@InProceedings{Garg.Balakrishnan.Kolter.Lipton.2021,
  author       = {Garg, Saurabh and Balakrishnan, Sivaraman and Kolter, Zico and
                  Lipton, Zachary},
  title        = {{RATT}: Leveraging unlabeled data to guarantee generalization},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {3598--3609},
  year         = {2021},
  organization = {PMLR},
}


@InProceedings{Gatys.Ecker.Bethge.2016,
  author    = {Gatys, Leon A and Ecker, Alexander S and Bethge, Matthias},
  title     = {Image style transfer using convolutional neural networks},
  booktitle = {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and
               {P}attern {R}ecognition},
  pages     = {2414--2423},
  year      = {2016},
}

@InCollection{Gauss.1809,
  author    = {Gauss, Carl Friedrich},
  title     = {Theoria motus corporum coelestium},
  booktitle = {Werke},
  year      = {1809},
  publisher = {K{\"o}niglich {P}reussische {A}kademie der {W}issenschaften},
}

@Book{Gibbs.1902,
  author    = {Gibbs, Josiah Willard},
  title     = {Elementary {P}rinciples of {S}tatistical {M}echanics},
  year      = {1902},
  publisher = {Scribner's},
}

@Article{Ginibre.1965,
  author    = {Ginibre, Jean},
  title     = {Statistical ensembles of complex, quaternion, and real matrices},
  journal   = {Journal of {M}athematical {P}hysics},
  volume    = {6},
  number    = {3},
  pages     = {440--449},
  year      = {1965},
  publisher = {AIP},
}

@InProceedings{Girshick.2015,
  author    = {Girshick, Ross},
  title     = {Fast {R}-{CNN}},
  booktitle = {Proceedings of the {IEEE} {I}nternational {C}onference on {C}omputer {V}ision},
  pages     = {1440--1448},
  year      = {2015},
}

@InProceedings{Girshick.Donahue.Darrell.ea.2014,
  author    = {Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra},
  title     = {Rich feature hierarchies for accurate object detection and semantic
               segmentation},
  booktitle = {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and
               {P}attern {R}ecognition},
  pages     = {580--587},
  year      = {2014},
}

@Article{Glivenko.1933,
  author    = {V.~I. Glivenko},
  title     = {Sulla determinazione empirica delle leggi di probabilit{\`a}},
  journal   = {Giornale dell'{I}stituto {I}taliano degli {A}ttuari},
  volume    = {4},
  pages     = {92},
  year      = {1933},
}

@InProceedings{Glorot.Bengio.2010,
  author    = {Glorot, Xavier and Bengio, Yoshua},
  title     = {Understanding the difficulty of training deep feedforward neural networks},
  booktitle = {Proceedings of the 13th {I}nternational {C}onference on
               {A}rtificial {I}ntelligence and {S}tatistics},
  pages     = {249--256},
  year      = {2010},
}

@Article{Goh.2017,
  author    = {Goh, Gabriel},
  title     = {Why momentum really works},
  journal   = {Distill},
  year      = {2017},
  url       = {http://distill.pub/2017/momentum},
}

@Article{Goldberg.Nichols.Oki.ea.1992,
  author    = {Goldberg, David and Nichols, David and Oki, Brian M and Terry, Douglas},
  title     = {Using collaborative filtering to weave an information tapestry},
  journal   = {Communications of the {ACM}},
  volume    = {35},
  number    = {12},
  pages     = {61--71},
  year      = {1992},
  publisher = {{A}ssociation for Computing Machinery, Inc.},
}

@Book{Golub.Van-Loan.1996,
  author    = {Golub, Gene H and Van Loan, Charles F},
  title     = {Matrix {C}omputations},
  year      = {1996},
  publisher = {Johns {H}opkins {U}niversity {P}ress},
}

@Book{Goodfellow.Bengio.Courville.2016,
  author    = {Ian Goodfellow and Yoshua Bengio and Aaron Courville},
  title     = {Deep {L}earning},
  year      = {2016},
  publisher = {MIT {P}ress},
  note      = {\url{http://www.deeplearningbook.org}},
}

@InProceedings{Goodfellow.Pouget-Abadie.Mirza.ea.2014,
  author    = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and
               Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and
               Bengio, Yoshua},
  title     = {Generative adversarial nets},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages     = {2672--2680},
  year      = {2014},
}

@Article{Gotmare.Keskar.Xiong.ea.2018,
  author    = {Gotmare, Akhilesh and Keskar, Nitish Shirish and Xiong, Caiming and
               Socher, Richard},
  title     = {A closer look at deep learning heuristics: Learning rate restarts, warmup and
               distillation},
  journal   = {Ar{X}iv:1810.13243},
  year      = {2018},
}

@Article{Goyal.Bochkovskiy.Deng.ea.2021,
  author    = {Goyal, Ankit and Bochkovskiy, Alexey and Deng, Jia and Koltun, Vladlen},
  title     = {Non-deep networks},
  journal   = {Ar{X}iv:2110.07641},
  year      = {2021},
}

@Article{Graham.2014,
  author    = {Graham, Benjamin},
  title     = {Fractional max-pooling},
  journal   = {Ar{X}iv:1412.6071},
  year      = {2014},
}

@Article{Graves.2013,
  author    = {Graves, Alex},
  title     = {Generating sequences with recurrent neural networks},
  journal   = {Ar{X}iv:1308.0850},
  year      = {2013},
}

@Article{Graves.Schmidhuber.2005,
  author    = {Graves, Alex and Schmidhuber, J{\"u}rgen},
  title     = {Framewise phoneme classification with bidirectional {LSTM} and other neural
               network architectures},
  journal   = {Neural Networks},
  volume    = {18},
  number    = {5-6},
  pages     = {602--610},
  year      = {2005},
  publisher = {Elsevier},
}

@Article{graves2008novel,
  author    = {Graves, Alex and Liwicki, Marcus and Fern{\'a}ndez, Santiago and
               Bertolami, Roman and Bunke, Horst and Schmidhuber, J{\"u}rgen},
  title     = {A novel connectionist system for unconstrained handwriting recognition},
  journal   = {IEEE {T}ransactions on {P}attern {A}nalysis and {M}achine {I}ntelligence},
  volume    = {31},
  number    = {5},
  pages     = {855--868},
  year      = {2008},
  publisher = {IEEE},
}

@InCollection{Griewank.1989,
  author    = {Griewank, Andreas},
  title     = {On automatic differentiation},
  booktitle = {Mathematical {P}rogramming: {R}ecent {D}evelopments and {A}pplications},
  pages     = {83--107},
  year      = {1989},
  publisher = {Kluwer},
}

@InCollection{Gunawardana.Shani.2015,
  author    = {Gunawardana, Asela and Shani, Guy},
  title     = {Evaluating recommender systems},
  booktitle = {Recommender {S}ystems {H}andbook},
  pages     = {265--308},
  year      = {2015},
  publisher = {Springer},
}

@InProceedings{Guo.Tang.Ye.ea.2017,
  author       = {Guo, Huifeng and Tang, Ruiming and Ye, Yunming and Li, Zhenguo and
                  He, Xiuqiang},
  title        = {{D}eep{FM}: a factorization-machine based neural network for {CTR}
                  prediction},
  booktitle    = {Proceedings of the 26th {I}nternational {J}oint {C}onference on
                  {A}rtificial {I}ntelligence},
  pages        = {1725--1731},
  year         = {2017},
  organization = {AAAI {P}ress},
}

@Article{Hadjis.Zhang.Mitliagkas.ea.2016,
  author    = {Hadjis, Stefan and Zhang, Ce and Mitliagkas, Ioannis and Iter, Dan and
               R{\'e}, Christopher},
  title     = {Omnivore: An optimizer for multi-device deep learning on {CPU}s and {GPU}s},
  journal   = {Ar{X}iv:1606.04487},
  year      = {2016},
}

@Book{Hartley.Zisserman.2000,
  author    = {Hartley, Richard and Zisserman, Andrew},
  title     = {Multiple {V}iew {G}eometry in {C}omputer {V}ision},
  year      = {2000},
  publisher = {Cambridge {U}niversity {P}ress},
}

@InProceedings{Hazan.Rakhlin.Bartlett.2008,
  author    = {Hazan, Elad and Rakhlin, Alexander and Bartlett, Peter L},
  title     = {Adaptive online gradient descent},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages     = {65--72},
  year      = {2008},
}

@InProceedings{He.Chua.2017,
  author       = {He, Xiangnan and Chua, Tat-Seng},
  title        = {Neural factorization machines for sparse predictive analytics},
  booktitle    = {Proceedings of the 40th {I}nternational {ACM} {SIGIR} {C}onference on
                  {R}esearch and {D}evelopment in {I}nformation {R}etrieval},
  pages        = {355--364},
  year         = {2017},
  organization = {ACM},
}

@InProceedings{He.Gkioxari.Dollar.ea.2017,
  author    = {He, Kaiming and Gkioxari, Georgia and Doll{\'a}r, Piotr and Girshick, Ross},
  title     = {Mask {R}-{CNN}},
  booktitle = {Proceedings of the {IEEE} {I}nternational {C}onference on {C}omputer {V}ision},
  pages     = {2961--2969},
  year      = {2017},
}

@InProceedings{He.Liao.Zhang.ea.2017,
  author       = {He, Xiangnan and Liao, Lizi and Zhang, Hanwang and Nie, Liqiang and
                  Hu, Xia and Chua, Tat-Seng},
  title        = {Neural collaborative filtering},
  booktitle    = {Proceedings of the 26th {I}nternational {C}onference on {W}orld {W}ide {W}eb},
  pages        = {173--182},
  year         = {2017},
  organization = {International World Wide Web {C}onferences Steering Committee},
}

@InProceedings{He.Zhang.Ren.ea.2015,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Delving deep into rectifiers: Surpassing human-level performance on
               {I}mage{N}et classification},
  booktitle = {Proceedings of the {IEEE} {I}nternational {C}onference on {C}omputer {V}ision},
  pages     = {1026--1034},
  year      = {2015}
}

@InProceedings{He.Zhang.Ren.ea.2016,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and
               {P}attern {R}ecognition},
  pages     = {770--778},
  year      = {2016}
}

@InProceedings{He.Zhang.Ren.ea.2016*1,
  author       = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title        = {Identity mappings in deep residual networks},
  booktitle    = {European {C}onference on {C}omputer {V}ision},
  pages        = {630--645},
  organization = {Springer},
  year         = {2016}
}

@Book{Hebb.1949,
  author    = {Hebb, Donald Olding},
  title     = {The {O}rganization of {B}ehavior},
  publisher = {Wiley},
  year      = {1949}
}

@Article{Hendrycks.Gimpel.2016,
  author  = {Hendrycks, Dan and Gimpel, Kevin},
  title   = {Gaussian error linear units ({GELU}s)},
  journal = {Ar{X}iv:1606.08415},
  year    = {2016}
}

% Author list is truncated: `and others` is the BibTeX token that styles
% render as "et al."; a literal "et al." would be parsed as a person's name.
@Article{	  Henighan.Kaplan.Katz.ea.2020,
  title		= {Scaling laws for autoregressive generative modeling},
  author	= {Henighan, Tom and Kaplan, Jared and Katz, Mor and Chen,
		  Mark and Hesse, Christopher and Jackson, Jacob and Jun,
		  Heewoo and Brown, Tom B and Dhariwal, Prafulla and Gray,
		  Scott and others},
  journal	= {Ar{X}iv:2010.14701},
  year		= {2020}
}

@Book{Hennessy.Patterson.2011,
  author    = {Hennessy, John L and Patterson, David A},
  title     = {Computer {A}rchitecture: A {Q}uantitative {A}pproach},
  publisher = {Elsevier},
  year      = {2011}
}

@InProceedings{Herlocker.Konstan.Borchers.ea.1999,
  author       = {Herlocker, Jonathan L and Konstan, Joseph A and Borchers, Al and
                  Riedl, John},
  title        = {An algorithmic framework for performing collaborative filtering},
  booktitle    = {22nd {A}nnual {I}nternational {ACM} {C}onference on {R}esearch and
                  {D}evelopment in {I}nformation {R}etrieval, {SIGIR} 1999},
  pages        = {230--237},
  organization = {{A}ssociation for Computing Machinery, Inc},
  year         = {1999}
}

@Article{Hidasi.Karatzoglou.Baltrunas.ea.2015,
  author  = {Hidasi, Bal{\'a}zs and Karatzoglou, Alexandros and Baltrunas, Linas and
             Tikk, Domonkos},
  title   = {Session-based recommendations with recurrent neural networks},
  journal = {Ar{X}iv:1511.06939},
  year    = {2015}
}

@InCollection{Hochreiter.Bengio.Frasconi.ea.2001,
  author    = {Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo and
               Schmidhuber, J{\"u}rgen},
  title     = {Gradient flow in recurrent nets: the difficulty of learning long-term
               dependencies},
  booktitle = {A {F}ield {G}uide to {D}ynamical {R}ecurrent {N}eural {N}etworks},
  publisher = {{IEEE} {P}ress},
  year      = {2001}
}

@Article{Hochreiter.Schmidhuber.1997,
  author    = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
  title     = {Long short-term memory},
  journal   = {Neural {C}omputation},
  volume    = {9},
  number    = {8},
  pages     = {1735--1780},
  publisher = {MIT {P}ress},
  year      = {1997}
}

@InProceedings{Howard.Sandler.Chu.ea.2019,
  author    = {Howard, Andrew and Sandler, Mark and Chu, Grace and Chen, Liang-Chieh and
               Chen, Bo and Tan, Mingxing and Wang, Weijun and Zhu, Yukun and
               Pang, Ruoming and Vasudevan, Vijay and Le, Quoc V. and Adam, Hartwig},
  title     = {Searching for {M}obile{N}et{V}3},
  booktitle = {Proceedings of the {IEEE}/{CVF} {I}nternational {C}onference on
               {C}omputer {V}ision},
  pages     = {1314--1324},
  year      = {2019}
}

@InProceedings{Hoyer.Janzing.Mooij.ea.2009,
  author    = {Hoyer, Patrik O and Janzing, Dominik and Mooij, Joris M and
               Peters, Jonas and Sch{\"o}lkopf, Bernhard},
  title     = {Nonlinear causal discovery with additive noise models},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages     = {689--696},
  year      = {2009}
}

@InProceedings{Hu.Koren.Volinsky.2008,
  author       = {Hu, Yifan and Koren, Yehuda and Volinsky, Chris},
  title        = {Collaborative filtering for implicit feedback datasets},
  booktitle    = {2008 8th {IEEE} {I}nternational {C}onference on {D}ata {M}ining},
  pages        = {263--272},
  organization = {IEEE},
  year         = {2008}
}

@article{10.1145/3544903.3544906,
  title   = {Text style transfer: A review and experimental evaluation},
  author  = {Hu, Zhiqiang and Lee, Roy Ka-Wei and Aggarwal, Charu C. and Zhang, Aston},
  journal = {{SIGKDD} {E}xplor. {N}ewsl.},
  volume  = {24},
  number  = {1},
  year    = {2022},
  url     = {https://doi.org/10.1145/3544903.3544906}
}

@article{parzen1957consistent,
  author    = {Parzen, Emanuel},
  title     = {On consistent estimates of the spectrum of a stationary time series},
  journal   = {{A}nnals of {M}athematical {S}tatistics},
  volume    = {28},
  pages     = {329--348},
  publisher = {JSTOR},
  year      = {1957}
}

@article{mack1982weak,
  author    = {Mack, Yue-{P}ok and Silverman, Bernard W},
  title     = {Weak and strong uniform consistency of kernel regression estimates},
  journal   = {Zeitschrift f{\"u}r {W}ahrscheinlichkeitstheorie und verwandte {G}ebiete},
  volume    = {61},
  number    = {3},
  pages     = {405--415},
  publisher = {Springer},
  year      = {1982}
}

@book{Silverman86,
  author    = {B.~W. Silverman},
  title     = {Density {E}stimation for {S}tatistical and {D}ata {A}nalysis},
  publisher = {Chapman and {H}all},
  year      = 1986
}

@article{norelli2022asif,
  author  = {Norelli, Antonio and Fumero, Marco and Maiorca, Valentino and
             Moschella, Luca and Rodol{\`a}, Emanuele and Locatello, Francesco},
  title   = {{ASIF}: Coupled data turns unimodal models to multimodal without training},
  journal = {Ar{X}iv:2210.01738},
  year    = {2022}
}

@InProceedings{Hu.Shen.Sun.2018,
  author    = {Hu, Jie and Shen, Li and Sun, Gang},
  title     = {Squeeze-and-excitation networks},
  booktitle = {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and
               {P}attern {R}ecognition},
  pages     = {7132--7141},
  year      = {2018}
}

% The lowercase particle lets BibTeX parse "van der" as the von part of the
% surname instead of folding it into the last name.
@InProceedings{	  Huang.Liu.Van-Der-Maaten.ea.2017,
  title		= {Densely connected convolutional networks},
  author	= {Huang, Gao and Liu, Zhuang and van der Maaten, Laurens and
		  Weinberger, Kilian Q},
  booktitle	= {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and
		  {P}attern {R}ecognition},
  pages		= {4700--4708},
  year		= {2017}
}

@Article{Huang.Xu.Yu.2015,
  author  = {Huang, Zhiheng and Xu, Wei and Yu, Kai},
  title   = {Bidirectional {LSTM}--{CRF} models for sequence tagging},
  journal = {Ar{X}iv:1508.01991},
  year    = {2015}
}

@Article{Hubel.Wiesel.1959,
  author    = {Hubel, David H and Wiesel, Torsten N},
  title     = {Receptive fields of single neurones in the cat's striate cortex},
  journal   = {Journal of {P}hysiology},
  volume    = {148},
  number    = {3},
  pages     = {574--591},
  publisher = {Wiley Online Library},
  year      = {1959}
}

@Article{Hubel.Wiesel.1962,
  author    = {Hubel, David H and Wiesel, Torsten N},
  title     = {Receptive fields, binocular interaction and functional architecture in
               the cat's visual cortex},
  journal   = {Journal of {P}hysiology},
  volume    = {160},
  number    = {1},
  pages     = {106--154},
  publisher = {Wiley Online Library},
  year      = {1962}
}

@Article{Hubel.Wiesel.1968,
  author    = {Hubel, David H and Wiesel, Torsten N},
  title     = {Receptive fields and functional architecture of monkey striate cortex},
  journal   = {Journal of {P}hysiology},
  volume    = {195},
  number    = {1},
  pages     = {215--243},
  publisher = {Wiley Online Library},
  year      = {1968}
}

@InProceedings{Ioffe.2017,
  author    = {Ioffe, Sergey},
  title     = {Batch renormalization: Towards reducing minibatch dependence in
               batch-normalized models},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages     = {1945--1953},
  year      = {2017}
}

@Article{Ioffe.Szegedy.2015,
  author  = {Ioffe, Sergey and Szegedy, Christian},
  title   = {Batch normalization: Accelerating deep network training by reducing
             internal covariate shift},
  journal = {Ar{X}iv:1502.03167},
  year    = {2015}
}

@Article{Izmailov.Podoprikhin.Garipov.ea.2018,
  author  = {Izmailov, Pavel and Podoprikhin, Dmitrii and Garipov, Timur and
             Vetrov, Dmitry and Wilson, Andrew Gordon},
  title   = {Averaging weights leads to wider optima and better generalization},
  journal = {Ar{X}iv:1803.05407},
  year    = {2018}
}

@InProceedings{Jacot.Grabriel.Hongler.2018,
  author    = {Jacot, Arthur and Gabriel, Franck and Hongler, Cl{\'e}ment},
  title     = {Neural tangent kernel: Convergence and generalization in neural networks},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume    = {31},
  year      = {2018}
}


@Book{Jaeger.2002,
  author    = {Jaeger, Herbert},
  title     = {Tutorial on training recurrent neural networks, covering {BPPT}, {RTRL},
               {EKF} and the ``echo state network'' approach},
  publisher = {GMD-{F}orschungszentrum {I}nformationstechnik {B}onn},
  year      = {2002}
}

@Book{James.2007,
  author    = {James, William},
  title     = {The {P}rinciples of {P}sychology},
  publisher = {Cosimo, {I}nc.},
  year      = {2007}
}

@InProceedings{Jia.Shelhamer.Donahue.ea.2014,
  author    = {Jia, Yangqing and Shelhamer, Evan and Donahue, Jeff and Karayev, Sergey and
               Long, Jonathan and Girshick, Ross and Guadarrama, Sergio and
               Darrell, Trevor},
  title     = {Caffe: Convolutional architecture for fast feature embedding},
  booktitle = {Proceedings of the 22nd {ACM} {I}nternational {C}onference on Multimedia},
  pages     = {675--678},
  year      = {2014}
}

% Author list is truncated: `and others` is the BibTeX token that styles
% render as "et al."; a literal "et al." would be parsed as a person's name.
@Article{	  Jia.Song.He.ea.2018,
  title		= {Highly scalable deep learning training system with
		  mixed-precision: Training {I}mage{N}et in four minutes},
  author	= {Jia, Xianyan and Song, Shutao and He, Wei and Wang,
		  Yangzihao and Rong, Haidong and Zhou, Feihu and Xie,
		  Liqiang and Guo, Zhenyu and Yang, Yuanzhou and Yu, Liwei
		  and others},
  journal	= {Ar{X}iv:1807.11205},
  year		= {2018}
}

% Author list is truncated: `and others` is the BibTeX token that styles
% render as "et al."; a literal "et al." would be parsed as a person's name.
@InProceedings{	  Jouppi.Young.Patil.ea.2017,
  title		= {In-datacenter performance analysis of a tensor processing
		  unit},
  author	= {Jouppi, Norman P and Young, Cliff and Patil, Nishant and
		  Patterson, David and Agrawal, Gaurav and Bajwa, Raminder
		  and Bates, Sarah and Bhatia, Suresh and Boden, Nan and
		  Borchers, Al and others},
  booktitle	= {2017 {ACM}/{IEEE} 44th {A}nnual {I}nternational {S}ymposium on
		  {C}omputer {A}rchitecture ({ISCA})},
  pages		= {1--12},
  year		= {2017},
  organization	= {IEEE}
}

@Article{Kalchbrenner.Grefenstette.Blunsom.2014,
  author  = {Kalchbrenner, Nal and Grefenstette, Edward and Blunsom, Phil},
  title   = {A convolutional neural network for modelling sentences},
  journal = {Ar{X}iv:1404.2188},
  year    = {2014}
}

@InProceedings{Kalman.Kwasny.1992,
  author       = {Kalman, Barry L and Kwasny, Stan {C}},
  title        = {Why tanh: choosing a sigmoidal function},
  booktitle    = {Proceedings of the {I}nternational {J}oint {C}onference on
                  {N}eural {N}etworks (IJCNN)},
  pages        = {578--581},
  organization = {IEEE},
  year         = {1992}
}

@Article{Karras.Aila.Laine.ea.2017,
  author  = {Karras, Tero and Aila, Timo and Laine, Samuli and Lehtinen, Jaakko},
  title   = {Progressive growing of {GAN}s for improved quality, stability, and variation},
  journal = {Ar{X}iv:1710.10196},
  year    = {2017}
}

@Article{Kawaguchi.Kaelbling.Bengio.2017,
  author  = {Kawaguchi, Kenji and Kaelbling, Leslie Pack and Bengio, Yoshua},
  title   = {Generalization in deep learning},
  journal = {Ar{X}iv:1710.05468},
  year    = {2017}
}

@Article{Kim.2014,
  author  = {Kim, Yoon},
  title   = {Convolutional neural networks for sentence classification},
  journal = {Ar{X}iv:1408.5882},
  year    = {2014}
}

@Article{Kimeldorf.Wahba.1971,
  author  = {G.~S.~Kimeldorf and G.~Wahba},
  title   = {Some results on {T}chebycheffian spline functions},
  journal = {J. {M}ath. {A}nal. {A}ppl.},
  volume  = 33,
  pages   = {82--95},
  year    = 1971
}

@Article{Kingma.Ba.2014,
  author  = {Kingma, Diederik P and Ba, Jimmy},
  title   = {Adam: A method for stochastic optimization},
  journal = {Ar{X}iv:1412.6980},
  year    = {2014}
}

@inproceedings{Kingma.Welling.2014,
  title     = {Auto-encoding variational {B}ayes},
  author    = {Kingma, Diederik P. and Welling, Max},
  booktitle = {{I}nternational {C}onference on {L}earning {R}epresentations ({ICLR})},
  year      = 2014
}


@Article{Kipf.Welling.2016,
  author  = {Kipf, Thomas N and Welling, Max},
  title   = {Semi-supervised classification with graph convolutional networks},
  journal = {Ar{X}iv:1609.02907},
  year    = {2016}
}

@Book{Koller.Friedman.2009,
  author    = {Koller, Daphne and Friedman, Nir},
  title     = {Probabilistic {G}raphical {M}odels: {P}rinciples and {T}echniques},
  publisher = {MIT {P}ress},
  year      = {2009}
}

@Article{Kolmogorov.1933,
  author  = {Kolmogorov, Andrey},
  title   = {Sulla determinazione empirica di una legge di distribuzione},
  journal = {Inst. {I}tal. {A}ttuari, {G}iorn.},
  volume  = {4},
  pages   = {83--91},
  year    = {1933}
}

% Online course notes, not a journal article: the location goes in
% `howpublished` of a Misc entry so styles do not typeset it as a journal name.
@Misc{		  Kolter.2008,
  title		= {Linear algebra review and reference},
  author	= {Kolter, Zico},
  howpublished	= {Available online:
		  http://cs229.stanford.edu/section/cs229-linalg.pdf},
  year		= {2008}
}

@InProceedings{Koren.2009,
  author       = {Koren, Yehuda},
  title        = {Collaborative filtering with temporal dynamics},
  booktitle    = {Proceedings of the 15th {ACM} {SIGKDD} {I}nternational {C}onference on
                  {K}nowledge {D}iscovery and {D}ata {M}ining},
  pages        = {447--456},
  organization = {ACM},
  year         = {2009}
}

% An issue number needs its volume; without it styles print a bare "(8)".
@Article{	  Koren.Bell.Volinsky.2009,
  title		= {Matrix factorization techniques for recommender systems},
  author	= {Koren, Yehuda and Bell, Robert and Volinsky, Chris},
  journal	= {Computer},
  volume	= {42},
  number	= {8},
  pages		= {30--37},
  year		= {2009},
  publisher	= {IEEE}
}

@InProceedings{Krizhevsky.Sutskever.Hinton.2012,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
  title     = {Image{N}et classification with deep convolutional neural networks},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages     = {1097--1105},
  year      = {2012}
}

@InProceedings{Krogh.Hertz.1992,
  author    = {Krogh, Anders and Hertz, John A},
  title     = {A simple weight decay can improve generalization},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages     = {950--957},
  year      = {1992}
}

% A monograph: Prentice Hall is the publisher, not a journal, so this is a
% Book entry with a `publisher` field.
@Book{		  Kung.1988,
  title		= {{VLSI} {A}rray {P}rocessors},
  author	= {Kung, Sun Yuan},
  year		= {1988},
  publisher	= {Prentice {H}all}
}

@Article{Kuzovkin.Vicente.Petton.ea.2018,
  author    = {Kuzovkin, Ilya and Vicente, Raul and Petton, Mathilde and
               Lachaux, Jean-Philippe and Baciu, Monica and Kahane, Philippe and
               Rheims, Sylvain and Vidal, Juan R and Aru, Jaan},
  title     = {Activations of deep convolutional neural networks are aligned with gamma
               band activity of human visual cortex},
  journal   = {Communications {B}iology},
  volume    = {1},
  number    = {1},
  pages     = {1--12},
  publisher = {Nature Publishing Group},
  year      = {2018}
}

% `and others` is the BibTeX token that styles render as "et al."; a literal
% "et al." would be parsed as a person's name.
@InCollection{	  LeCun.Bengio.ea.1995,
  title		= {Convolutional networks for images, speech, and time
		  series},
  author	= {LeCun, Yann and Bengio, Yoshua and others},
  booktitle	= {The {H}andbook of {B}rain {T}heory and {N}eural {N}etworks},
  pages		= {3361},
  year		= {1995},
  publisher	= {MIT {P}ress}
}

@Article{LeCun.Boser.Denker.ea.1989,
  author    = {LeCun, Yann and Boser, Bernhard and Denker, John S and
               Henderson, Donnie and Howard, Richard E and Hubbard, Wayne and
               Jackel, Lawrence D},
  title     = {Backpropagation applied to handwritten zip code recognition},
  journal   = {Neural {C}omputation},
  volume    = {1},
  number    = {4},
  pages     = {541--551},
  publisher = {MIT Press},
  year      = {1989}
}

% `publisher` names the publishing body (IEEE), not a city.
@Article{	  LeCun.Bottou.Bengio.ea.1998,
  title		= {Gradient-based learning applied to document recognition},
  author	= {LeCun, Yann and Bottou, L{\'e}on and Bengio, Yoshua and
		  Haffner, Patrick},
  journal	= {Proceedings of the {IEEE}},
  volume	= {86},
  number	= {11},
  pages		= {2278--2324},
  year		= {1998},
  publisher	= {IEEE}
}

% Accents use the {\'e} / {\"u} brace form so classic BibTeX treats each as a
% single letter when sorting and labelling.
@InCollection{	  LeCun.Bottou.Orr.ea.1998,
  title		= {Efficient backprop},
  author	= {LeCun, Yann and Bottou, L{\'e}on and Orr, Genevieve B and
		  M{\"u}ller, Klaus-Robert},
  booktitle	= {Neural {N}etworks: {T}ricks of the {T}rade},
  year		= {1998},
  publisher	= {Springer}
}

% Author list is truncated: `and others` is the BibTeX token that styles
% render as "et al."; a literal "et al." would be parsed as a person's name.
@InProceedings{	  LeCun.Jackel.Bottou.ea.1995,
  title		= {Comparison of learning algorithms for handwritten digit
		  recognition},
  author	= {LeCun, Yann and Jackel, LD and Bottou, L{\'e}on and Brunot, A
		  and Cortes, Corinna and Denker, John and Drucker, Harris
		  and Guyon, Isabelle and Muller, UA and Sackinger, Eduard and
		  others},
  booktitle	= {{I}nternational {C}onference on {A}rtificial {N}eural {N}etworks},
  pages		= {53--60},
  year		= {1995}
}

@Book{Legendre.1805,
  author    = {Legendre, Adrien Marie},
  title     = {M{\'e}moire sur les {O}p{\'e}rations {T}rigonom{\'e}triques: dont les
               {R}{\'e}sultats {D}{\'e}pendent de la {F}igure de la {T}erre},
  publisher = {F. {D}idot},
  year      = {1805}
}

% `school` holds only the institution; PhDThesis styles print the
% "PhD thesis" label themselves.
@PhDThesis{	  Li.2017,
  title		= {Scaling {D}istributed {M}achine {L}earning with {S}ystem and
		  {A}lgorithm {C}o-design},
  author	= {Li, Mu},
  year		= {2017},
  school	= {Carnegie Mellon University}
}

% Venue named the same way as the other OSDI entry in this file
% ("12th {USENIX} Symposium on ... ({OSDI} 16)").
@InProceedings{	  Li.Andersen.Park.ea.2014,
  title		= {Scaling distributed machine learning with the parameter
		  server},
  author	= {Li, Mu and Andersen, David G and Park, Jun Woo and Smola,
		  Alexander J and Ahmed, Amr and Josifovski, Vanja and Long,
		  James and Shekita, Eugene J and Su, Bor-Yiing},
  booktitle	= {11th {USENIX} {S}ymposium on {O}perating {S}ystems {D}esign
		  and {I}mplementation ({OSDI} 14)},
  pages		= {583--598},
  year		= {2014}
}

@InProceedings{Li.Zhang.Chen.ea.2014,
  author    = {Li, Mu and Zhang, Tong and Chen, Yuqiang and Smola, Alexander J},
  title     = {Efficient mini-batch training for stochastic optimization},
  booktitle = {Proceedings of the 20th {ACM} {SIGKDD} {I}nternational {C}onference on
               {K}nowledge {D}iscovery and {D}ata {M}ining},
  pages     = {661--670},
  year      = {2014}
}

@Article{Lin.Chen.Yan.2013,
  author  = {Lin, Min and Chen, Qiang and Yan, Shuicheng},
  title   = {Network in network},
  journal = {Ar{X}iv:1312.4400},
  year    = {2013}
}

% The particle "dos" belongs in front of the surname ("von Last, First"
% form), not at the end of the given names.
@Article{	  Lin.Feng.Santos.ea.2017,
  title		= {A structured self-attentive sentence embedding},
  author	= {Lin, Zhouhan and Feng, Minwei and dos Santos, Cicero
		  Nogueira and Yu, Mo and Xiang, Bing and Zhou, Bowen and
		  Bengio, Yoshua},
  journal	= {Ar{X}iv:1703.03130},
  year		= {2017}
}

@InProceedings{Lin.Goyal.Girshick.ea.2017,
  author    = {Lin, Tsung-Yi and Goyal, Priya and Girshick, Ross and He, Kaiming and
               Doll{\'a}r, Piotr},
  title     = {Focal loss for dense object detection},
  booktitle = {Proceedings of the {IEEE} {I}nternational {C}onference on {C}omputer {V}ision},
  pages     = {2980--2988},
  year      = {2017}
}

% The acronym is protected as one brace group, matching {CNN}, {LSTM} and
% {BERT} elsewhere in this file.
@Article{	  Lin.Lv.Zhu.ea.2010,
  title		= {Image{N}et classification: fast descriptor coding and
		  large-scale {SVM} training},
  author	= {Lin, Yuanqing and Lv, F and Zhu, S and Yang, M and Cour, T
		  and Yu, K and Cao, L and Li, Z and Tsai, MH and Zhou, X and
		  others},
  journal	= {Large Scale Visual Recognition Challenge},
  year		= {2010}
}

@InProceedings{Lipton.Kale.2016,
  author    = {Lipton, Zachary C and Kale, David C and Elkan, Charles and Wetzel, Randall},
  title     = {Learning to diagnose with {LSTM} recurrent neural networks},
  booktitle = {{I}nternational {C}onference on {L}earning {R}epresentations ({ICLR})},
  year      = {2016}
}


% The issue number goes in `number`; standard BibTeX styles silently ignore
% an `issue` field.
@Article{	  Lipton.Steinhardt.2018,
  title		= {Troubling trends in machine learning scholarship},
  author	= {Lipton, Zachary C and Steinhardt, Jacob},
  journal	= {Communications of the {ACM}},
  volume	= {17},
  number	= {1},
  pages		= {45--77},
  year		= {2018}
}

@article{Lipton.Berkowitz.Elkan.2015,
  author  = {Lipton, Zachary C and Berkowitz, John and Elkan, Charles},
  title   = {A critical review of recurrent neural networks for sequence learning},
  journal = {Ar{X}iv:1506.00019},
  year    = {2015}
}


@InProceedings{Liu.Anguelov.Erhan.ea.2016,
  author       = {Liu, Wei and Anguelov, Dragomir and Erhan, Dumitru and
                  Szegedy, Christian and Reed, Scott and Fu, Cheng-Yang and
                  Berg, Alexander {C}},
  title        = {{SSD}: Single shot multibox detector},
  booktitle    = {European {C}onference on {C}omputer {V}ision},
  pages        = {21--37},
  organization = {Springer},
  year         = {2016}
}

@Article{Liu.Nocedal.1989,
  author    = {Liu, Dong C and Nocedal, Jorge},
  title     = {On the limited memory {BFGS} method for large scale optimization},
  journal   = {Mathematical {P}rogramming},
  volume    = {45},
  number    = {1},
  pages     = {503--528},
  publisher = {Springer},
  year      = {1989}
}

@Article{Liu.Ott.Goyal.ea.2019,
  author  = {Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and
             Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and
             Zettlemoyer, Luke and Stoyanov, Veselin},
  title   = {Ro{BERT}a: A robustly optimized {BERT} pretraining approach},
  journal = {Ar{X}iv:1907.11692},
  year    = {2019}
}

@InProceedings{Long.Shelhamer.Darrell.2015,
  author    = {Long, Jonathan and Shelhamer, Evan and Darrell, Trevor},
  title     = {Fully convolutional networks for semantic segmentation},
  booktitle = {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and
               {P}attern {R}ecognition},
  pages     = {3431--3440},
  year      = {2015}
}

@Article{Loshchilov.Hutter.2016,
  author  = {Loshchilov, Ilya and Hutter, Frank},
  title   = {{SGDR}: Stochastic gradient descent with warm restarts},
  journal = {Ar{X}iv:1608.03983},
  year    = {2016}
}

@Article{Lowe.2004,
  author    = {Lowe, David G},
  title     = {Distinctive image features from scale-invariant keypoints},
  journal   = {International {J}ournal of {C}omputer {V}ision},
  volume    = {60},
  number    = {2},
  pages     = {91--110},
  publisher = {Springer},
  year      = {2004}
}

@Article{Luo.Wang.Shao.ea.2018,
  author  = {Luo, Ping and Wang, Xinjiang and Shao, Wenqi and Peng, Zhanglin},
  title   = {Towards understanding regularization in batch normalization},
  journal = {Ar{X}iv:1809.00846},
  year    = {2018}
}

@InProceedings{Maas.Daly.Pham.ea.2011,
  author       = {Maas, Andrew L and Daly, Raymond E and Pham, Peter T and Huang, Dan and
                  Ng, Andrew Y and Potts, Christopher},
  title        = {Learning word vectors for sentiment analysis},
  booktitle    = {Proceedings of the 49th {A}nnual {M}eeting of the {A}ssociation for
                  {C}omputational {L}inguistics: {H}uman {L}anguage {T}echnologies,
                  {V}olume 1},
  pages        = {142--150},
  organization = {{A}ssociation for {C}omputational {L}inguistics},
  year         = {2011}
}

% Page ranges use a double hyphen so LaTeX typesets an en dash.
@Article{	  Mangasarian.1965,
  author	= {O.~L.~Mangasarian},
  journal	= {Oper. {R}es.},
  pages		= {444--452},
  title		= {Linear and nonlinear separation of patterns by linear
		  programming},
  volume	= 13,
  year		= 1965
}

@Article{Mangram.2013,
  author  = {Mangram, Myles E},
  title   = {A simplified perspective of the {M}arkowitz portfolio theory},
  journal = {Global {J}ournal of {B}usiness {R}esearch},
  volume  = {7},
  number  = {1},
  pages   = {59--70},
  year    = {2013}
}

@InProceedings{McCann.Bradbury.Xiong.ea.2017,
  author    = {McCann, Bryan and Bradbury, James and Xiong, Caiming and Socher, Richard},
  title     = {Learned in translation: {C}ontextualized word vectors},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages     = {6294--6305},
  year      = {2017}
}

@Article{McCulloch.Pitts.1943,
  author    = {McCulloch, Warren S and Pitts, Walter},
  title     = {A logical calculus of the ideas immanent in nervous activity},
  journal   = {Bulletin of {Mathematical} {Biophysics}},
  volume    = {5},
  number    = {4},
  pages     = {115--133},
  publisher = {Springer},
  year      = {1943}
}

% Author list is truncated: `and others` is the BibTeX token that styles
% render as "et al."; a literal "et al." would be parsed as a person's name.
@InProceedings{	  McMahan.Holt.Sculley.ea.2013,
  title		= {Ad click prediction: a view from the trenches},
  author	= {McMahan, H Brendan and Holt, Gary and Sculley, David and
		  Young, Michael and Ebner, Dietmar and Grady, Julian and
		  Nie, Lan and Phillips, Todd and Davydov, Eugene and
		  Golovin, Daniel and others},
  booktitle	= {Proceedings of the 19th {ACM} {SIGKDD} {I}nternational
		  {C}onference on {K}nowledge {D}iscovery and {D}ata {M}ining},
  pages		= {1222--1230},
  year		= {2013},
  organization	= {ACM}
}

@Article{Mead.1980,
  author    = {Mead, Carver},
  title     = {Introduction to {VLSI} systems},
  journal   = {IEE {P}roceedings I-{S}olid-{S}tate and {E}lectron {D}evices},
  volume    = {128},
  number    = {1},
  pages     = {18},
  publisher = {{IET}},
  year      = {1980}
}

@Article{Merity.Xiong.Bradbury.ea.2016,
  author  = {Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard},
  title   = {Pointer sentinel mixture models},
  journal = {Ar{X}iv:1609.07843},
  year    = {2016}
}

% InCollection requires a `publisher`; without one BibTeX warns about an
% empty field.
@InCollection{	  Micchelli.1986,
  author	= {C.~A.~Micchelli},
  booktitle	= {Proceedings of {S}ymposia in {A}pplied {M}athematics},
  pages		= {81--102},
  title		= {Algebraic aspects of interpolation},
  publisher	= {American Mathematical Society},
  year		= 1986
}

@Article{Mikolov.Chen.Corrado.ea.2013,
  author  = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
  title   = {Efficient estimation of word representations in vector space},
  journal = {Ar{X}iv:1301.3781},
  year    = {2013}
}

@InProceedings{Mikolov.Sutskever.Chen.ea.2013,
  author    = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S and
               Dean, Jeff},
  title     = {Distributed representations of words and phrases and their compositionality},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages     = {3111--3119},
  year      = {2013}
}

@Article{Miller.1995,
  author    = {Miller, George A},
  title     = {Word{N}et: a lexical database for {E}nglish},
  journal   = {Communications of the {ACM}},
  volume    = {38},
  number    = {11},
  pages     = {39--41},
  publisher = {ACM},
  year      = {1995}
}

@InProceedings{Mirhoseini.Pham.Le.ea.2017,
  author    = {Mirhoseini, Azalia and Pham, Hieu and Le, Quoc V and Steiner, Benoit and
               Larsen, Rasmus and Zhou, Yuefeng and Kumar, Naveen and
               Norouzi, Mohammad and Bengio, Samy and Dean, Jeff},
  title     = {Device placement optimization with reinforcement learning},
  booktitle = {Proceedings of the 34th {I}nternational {C}onference on {M}achine {L}earning},
  pages     = {2430--2439},
  year      = {2017}
}

@InProceedings{Mnih.Heess.Graves.ea.2014,
  author    = {Mnih, Volodymyr and Heess, Nicolas and Graves, Alex and others},
  title     = {Recurrent models of visual attention},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages     = {2204--2212},
  year      = {2014}
}

@InProceedings{Moon.Smola.Chang.ea.2010,
  author    = {Moon, Taesup and Smola, Alex and Chang, Yi and Zheng, Zhaohui},
  title     = {Intervalrank: isotonic regression with listwise and pairwise constraints},
  booktitle = {Proceedings of the 3rd {ACM} {I}nternational {C}onference on
               {W}eb {S}earch and {D}ata {M}ining},
  pages     = {151--160},
  year      = {2010}
}

@Article{Morey.Hoekstra.Rouder.ea.2016,
  author    = {Morey, Richard D and Hoekstra, Rink and Rouder, Jeffrey N and
               Lee, Michael D and Wagenmakers, Eric-Jan},
  title     = {The fallacy of placing confidence in confidence intervals},
  journal   = {Psychonomic {B}ulletin \& {R}eview},
  volume    = {23},
  number    = {1},
  pages     = {103--123},
  publisher = {Springer},
  year      = {2016}
}

@Article{Murata.Yoshizawa.Amari.1994,
  author    = {Murata, Noboru and Yoshizawa, Shuji and Amari, Shun-ichi},
  title     = {Network information criterion -- determining the number of hidden units
               for an artificial neural network model},
  journal   = {IEEE {T}ransactions on {N}eural {N}etworks},
  volume    = {5},
  number    = {6},
  pages     = {865--872},
  publisher = {IEEE},
  year      = {1994}
}

@Article{Nadaraya.1964,
  author    = {Nadaraya, Elizbar A},
  title     = {On estimating regression},
  journal   = {Theory of {P}robability \& its {A}pplications},
  volume    = {9},
  number    = {1},
  pages     = {141--142},
  publisher = {SIAM},
  year      = {1964}
}

@Article{Nagarajan.Kolter.2019,
  author  = {Nagarajan, Vaishnavh and Kolter, J Zico},
  title   = {Uniform convergence may be unable to explain generalization in deep learning},
  journal = {Ar{X}iv:1902.04742},
  year    = {2019}
}

% Venue spelled out in the same form as the other ICML entry in this file
% ("Proceedings of the 34th International Conference on Machine Learning").
@InProceedings{	  Nair.Hinton.2010,
  title		= {Rectified linear units improve restricted {B}oltzmann
		  machines},
  author	= {Nair, Vinod and Hinton, Geoffrey E},
  booktitle	= {Proceedings of the 27th {I}nternational {C}onference on
		  {M}achine {L}earning},
  pages		= {807--814},
  year		= {2010}
}

@Article{Naor.Reingold.1999,
  author    = {Naor, Moni and Reingold, Omer},
  title     = {On the construction of pseudorandom permutations: {L}uby--{R}ackoff
               revisited},
  journal   = {Journal of {C}ryptology},
  volume    = {12},
  number    = {1},
  pages     = {29--66},
  publisher = {Springer},
  year      = {1999}
}

@Book{Nesterov.2018,
  author    = {Nesterov, Yu},
  title     = {Lectures on {C}onvex {O}ptimization},
  publisher = {Springer},
  year      = {2018}
}

% The citation key keeps its historical "2000" suffix so existing citations
% still resolve; `year` is the publication year of Automatica volume 44.
@Article{	  Nesterov.Vial.2000,
  title		= {Confidence level solutions for stochastic programming},
  author	= {Nesterov, Yu and Vial, J-Ph},
  journal	= {Automatica},
  volume	= {44},
  number	= {6},
  pages		= {1559--1568},
  year		= {2008}
}

@Article{Neyman.1937,
  author    = {Neyman, Jerzy},
  title     = {Outline of a theory of statistical estimation based on the classical
               theory of probability},
  journal   = {Philosophical {T}ransactions of the {R}oyal {S}ociety of {L}ondon.
               {S}eries {A}, {M}athematical and {P}hysical {S}ciences},
  volume    = {236},
  number    = {767},
  pages     = {333--380},
  publisher = {The Royal Society London},
  year      = {1937}
}

@InProceedings{Papineni.Roukos.Ward.ea.2002,
  author    = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
  title     = {{BLEU}: a method for automatic evaluation of machine translation},
  booktitle = {Proceedings of the 40th {A}nnual {M}eeting of the {A}ssociation for
               {C}omputational {L}inguistics},
  pages     = {311--318},
  year      = {2002}
}

@Article{Parikh.Tackstrom.Das.ea.2016,
  author  = {Parikh, Ankur P and T{\"a}ckstr{\"o}m, Oscar and Das, Dipanjan and
             Uszkoreit, Jakob},
  title   = {A decomposable attention model for natural language inference},
  journal = {Ar{X}iv:1606.01933},
  year    = {2016}
}

@InProceedings{Park.Liu.Wang.ea.2019,
  author    = {Park, Taesung and Liu, Ming-Yu and Wang, Ting-Chun and Zhu, Jun-Yan},
  title     = {Semantic image synthesis with spatially-adaptive normalization},
  booktitle = {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and
               {P}attern {R}ecognition},
  pages     = {2337--2346},
  year      = {2019}
}

% Author list is truncated: `and others` is the BibTeX token that styles
% render as "et al."; a literal "et al." would be parsed as a person's name.
@Article{	  Paszke.Gross.Massa.ea.2019,
  title		= {Py{T}orch: An imperative style, high-performance deep
		  learning library},
  author	= {Paszke, Adam and Gross, Sam and Massa, Francisco and
		  Lerer, Adam and Bradbury, James and Chanan, Gregory and
		  Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and
		  Antiga, Luca and others},
  journal	= {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume	= {32},
  pages		= {8026--8037},
  year		= {2019}
}

@Article{Paulus.Xiong.Socher.2017,
  author  = {Paulus, Romain and Xiong, Caiming and Socher, Richard},
  title   = {A deep reinforced model for abstractive summarization},
  journal = {Ar{X}iv:1705.04304},
  year    = {2017}
}

@InProceedings{	  Pennington.Schoenholz.Ganguli.2017,
  title		= {Resurrecting the sigmoid in deep learning through
		  dynamical isometry: theory and practice},
  author	= {Pennington, Jeffrey and Schoenholz, Samuel and Ganguli,
		  Surya},
  booktitle	= {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages		= {4785--4795},
  year		= {2017}
}

@InProceedings{	  Pennington.Socher.Manning.2014,
  title		= {Glo{V}e: Global vectors for word representation},
  author	= {Pennington, Jeffrey and Socher, Richard and Manning,
		  Christopher},
  booktitle	= {Proceedings of the 2014 {C}onference on {E}mpirical {M}ethods in
		  {N}atural {L}anguage {P}rocessing ({EMNLP})},
  pages		= {1532--1543},
  year		= {2014}
}

@InProceedings{	  Peters.Ammar.Bhagavatula.ea.2017,
  title		= {Semi-supervised sequence tagging with bidirectional
		  language models},
  author	= {Peters, Matthew and Ammar, Waleed and Bhagavatula, Chandra
		  and Power, Russell},
  booktitle	= {Proceedings of the 55th {A}nnual {M}eeting of the {A}ssociation
		  for {C}omputational {L}inguistics, {V}olume 1},
  pages		= {1756--1765},
  year		= {2017}
}

@Book{		  Peters.Janzing.Scholkopf.2017,
  title		= {Elements of {C}ausal {I}nference: {F}oundations and {L}earning
		  {A}lgorithms},
  author	= {Peters, Jonas and Janzing, Dominik and Sch{\"o}lkopf,
		  Bernhard},
  year		= {2017},
  publisher	= {MIT {P}ress}
}

@InProceedings{	  Peters.Neumann.Iyyer.ea.2018,
  title		= {Deep contextualized word representations},
  author	= {Peters, Matthew and Neumann, Mark and Iyyer, Mohit and
		  Gardner, Matt and Clark, Christopher and Lee, Kenton and
		  Zettlemoyer, Luke},
  booktitle	= {Proceedings of the 2018 {C}onference of the {N}orth {A}merican
		  {C}hapter of the {A}ssociation for {C}omputational {L}inguistics:
		  {H}uman {L}anguage {T}echnologies, {V}olume 1},
  pages		= {2227--2237},
  year		= {2018}
}

@Book{	  Petersen.Pedersen.ea.2008,
  title		= {The {M}atrix {C}ookbook},
  author	= {Petersen, Kaare Brandt and Pedersen, Michael Syskind},
  publisher	= {Technical University of Denmark},
  year		= {2008}
}

@Article{	  Polyak.1964,
  title		= {Some methods of speeding up the convergence of iteration
		  methods},
  author	= {Polyak, Boris T},
  journal	= {{USSR} {C}omputational {M}athematics and {M}athematical {P}hysics},
  volume	= {4},
  number	= {5},
  pages		= {1--17},
  year		= {1964},
  publisher	= {Elsevier}
}

@InCollection{	  Prechelt.1998,
  title		= {Early stopping -- but when?},
  author	= {Prechelt, Lutz},
  booktitle	= {{N}eural {N}etworks: {T}ricks of the {T}rade},
  pages		= {55--69},
  year		= {1998},
  publisher	= {Springer}
}

@Article{	  Quadrana.Cremonesi.Jannach.2018,
  title		= {Sequence-aware recommender systems},
  author	= {Quadrana, Massimo and Cremonesi, Paolo and Jannach,
		  Dietmar},
  journal	= {{ACM} {C}omputing {S}urveys},
  volume	= {51},
  number	= {4},
  pages		= {66},
  year		= {2018},
  publisher	= {ACM}
}

@book{		  quinlan2014c4,
  title		= {C4.5: {P}rograms for {M}achine {L}earning},
  author	= {Quinlan, J Ross},
  year		= {1993},
  publisher	= {Morgan Kaufmann}
}

@Article{	  Radford.Metz.Chintala.2015,
  title		= {Unsupervised representation learning with deep
		  convolutional generative adversarial networks},
  author	= {Radford, Alec and Metz, Luke and Chintala, Soumith},
  journal	= {Ar{X}iv:1511.06434},
  year		= {2015}
}

@Article{	  Radford.Narasimhan.Salimans.ea.2018,
  title		= {Improving language understanding by generative
		  pre-training},
  author	= {Radford, Alec and Narasimhan, Karthik and Salimans, Tim
		  and Sutskever, Ilya},
  journal	= {Open{AI}},
  year		= {2018}
}

@Article{	  Radford.Wu.Child.ea.2019,
  title		= {Language models are unsupervised multitask learners},
  author	= {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan,
		  David and Amodei, Dario and Sutskever, Ilya},
  journal	= {Open{AI} {B}log},
  volume	= {1},
  number	= {8},
  pages		= {9},
  year		= {2019}
}

@Article{	  Radhakrishna-Rao.1945,
  title		= {Information and accuracy attainable in the estimation of
		  statistical parameters},
  author	= {Radhakrishna Rao, {C}},
  journal	= {Bulletin of the {C}alcutta {M}athematical {S}ociety},
  volume	= {37},
  number	= {3},
  pages		= {81--91},
  year		= {1945},
  publisher	= {Calcutta Mathematical Society}
}

@InProceedings{	  Radosavovic.Kosaraju.Girshick.ea.2020,
  title		= {Designing network design spaces},
  author	= {Radosavovic, Ilija and Kosaraju, Raj Prateek and Girshick,
		  Ross and He, Kaiming and Doll{\'a}r, Piotr},
  booktitle	= {Proceedings of the {IEEE}/{CVF} {C}onference on {C}omputer {V}ision
		  and {P}attern {R}ecognition},
  pages		= {10428--10436},
  year		= {2020}
}

@Article{	  Rajpurkar.Zhang.Lopyrev.ea.2016,
  title		= {{S}{Q}u{A}{D}: 100,000+ questions for machine comprehension of
		  text},
  author	= {Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin
		  and Liang, Percy},
  journal	= {Ar{X}iv:1606.05250},
  year		= {2016}
}

@Article{	  Ramachandran.Zoph.Le.2017,
  title		= {Searching for activation functions},
  author	= {Ramachandran, Prajit and Zoph, Barret and Le, Quoc V},
  journal	= {Ar{X}iv:1710.05941},
  year		= {2017}
}

@InProceedings{	  Ranzato.Boureau.Chopra.ea.2007,
  title		= {A unified energy-based framework for unsupervised
		  learning},
  author	= {Ranzato, Marc-Aurelio and Boureau, Y-Lan and Chopra,
		  Sumit and LeCun, Yann},
  booktitle	= {Artificial {I}ntelligence and {S}tatistics},
  pages		= {371--379},
  year		= {2007},
  organization	= {PMLR}
}

@Article{	  Reddi.Kale.Kumar.2019,
  title		= {On the convergence of {A}dam and beyond},
  author	= {Reddi, Sashank J and Kale, Satyen and Kumar, Sanjiv},
  journal	= {Ar{X}iv:1904.09237},
  year		= {2019}
}

@InProceedings{	  Redmon.Divvala.Girshick.ea.2016,
  title		= {You only look once: Unified, real-time object detection},
  author	= {Redmon, Joseph and Divvala, Santosh and Girshick, Ross and
		  Farhadi, Ali},
  booktitle	= {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and
		  {P}attern {R}ecognition},
  pages		= {779--788},
  year		= {2016}
}

@Article{	  Redmon.Farhadi.2018,
  title		= {Y{OLO}v3: An incremental improvement},
  author	= {Redmon, Joseph and Farhadi, Ali},
  journal	= {Ar{X}iv:1804.02767},
  year		= {2018}
}

@Article{	  Reed.De-Freitas.2015,
  title		= {Neural programmer-interpreters},
  author	= {Reed, Scott and De Freitas, Nando},
  journal	= {Ar{X}iv:1511.06279},
  year		= {2015}
}

@InProceedings{	  Ren.He.Girshick.ea.2015,
  title		= {Faster {R}-{CNN}: Towards real-time object detection with
		  region proposal networks},
  author	= {Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun,
		  Jian},
  booktitle	= {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages		= {91--99},
  year		= {2015}
}

@InProceedings{	  Rendle.2010,
  title		= {Factorization machines},
  author	= {Rendle, Steffen},
  booktitle	= {2010 {IEEE} {I}nternational {C}onference on {D}ata {M}ining},
  pages		= {995--1000},
  year		= {2010},
  organization	= {IEEE}
}

@InProceedings{	  Rendle.Freudenthaler.Gantner.ea.2009,
  title		= {{BPR}: {B}ayesian personalized ranking from implicit
		  feedback},
  author	= {Rendle, Steffen and Freudenthaler, Christoph and Gantner,
		  Zeno and Schmidt-Thieme, Lars},
  booktitle	= {Proceedings of the 25th {C}onference on {U}ncertainty
		  in {A}rtificial {I}ntelligence},
  pages		= {452--461},
  year		= {2009},
  organization	= {AUAI Press}
}

@Article{	  Revels.Lubin.Papamarkou.2016,
  title		= {Forward-mode automatic differentiation in {J}ulia},
  author	= {Revels, Jarrett and Lubin, Miles and Papamarkou,
		  Theodore},
  journal	= {Ar{X}iv:1607.07892},
  year		= {2016}
}

@Article{	  Riesenhuber.Poggio.1999,
  title		= {Hierarchical models of object recognition in cortex},
  author	= {Riesenhuber, Maximilian and Poggio, Tomaso},
  journal	= {Nature {N}euroscience},
  volume	= {2},
  number	= {11},
  pages		= {1019--1025},
  year		= {1999},
  publisher	= {Nature Publishing Group}
}

@Book{		  Rockafellar.1970,
  author	= {R.~T.~Rockafellar},
  publisher	= {Princeton {U}niversity {P}ress},
  title		= {Convex {A}nalysis},
  year		= 1970
}

@article{Rolnick.Veit.Belongie.Shavit.2017,
  title={Deep learning is robust to massive label noise},
  author={Rolnick, David and Veit, Andreas and Belongie, Serge and Shavit, Nir},
  journal={Ar{X}iv:1705.10694},
  year={2017}
}


@Article{	  Rosenbaum.Rubin.1983,
  title		= {The central role of the propensity score in observational
		  studies for causal effects},
  author	= {Rosenbaum, Paul R and Rubin, Donald B},
  journal	= {Biometrika},
  volume	= {70},
  number	= {1},
  pages		= {41--55},
  year		= {1983},
  publisher	= {Oxford University Press}
}

@Book{		  Rudin.1973,
  author	= {W.~Rudin},
  publisher	= {McGraw-Hill},
  title		= {Functional {A}nalysis},
  year		= 1973
}

@Article{	  Rumelhart.Hinton.Williams.ea.1988,
  title		= {Learning representations by back-propagating errors},
  author	= {Rumelhart, David E and Hinton, Geoffrey E and Williams,
		  Ronald J},
  journal	= {Cognitive {M}odeling},
  volume	= {5},
  number	= {3},
  pages		= {1},
  year		= {1988}
}

@InProceedings{	  Russakovsky.Deng.Huang.ea.2013,
  author	= {Olga Russakovsky and Jia Deng and Zhiheng Huang and
		  Alexander C. Berg and Li Fei-Fei},
  title		= {Detecting avocados to zucchinis: what have we done, and
		  where are we going?},
  booktitle	= {{I}nternational {C}onference on {C}omputer {V}ision ({ICCV})},
  year		= {2013}
}

@Book{		  Russell.Norvig.2016,
  title		= {Artificial {I}ntelligence: {A} {M}odern {A}pproach},
  author	= {Russell, Stuart J and Norvig, Peter},
  year		= {2016},
  publisher	= {Pearson {E}ducation {L}imited}
}

@Article{	  Salton.Wong.Yang.1975,
  title		= {A vector space model for automatic indexing},
  author	= {Salton, Gerard and Wong, Anita and Yang, Chung-Shu},
  journal	= {Communications of the {ACM}},
  volume	= {18},
  number	= {11},
  pages		= {613--620},
  year		= {1975},
  publisher	= {ACM}
}

@InProceedings{	  Santurkar.Tsipras.Ilyas.ea.2018,
  title		= {How does batch normalization help optimization?},
  author	= {Santurkar, Shibani and Tsipras, Dimitris and Ilyas, Andrew
		  and Madry, Aleksander},
  booktitle	= {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages		= {2483--2493},
  year		= {2018}
}

@InProceedings{	  Sarwar.Karypis.Konstan.ea.2001,
  title		= {Item-based collaborative filtering recommendation
		  algorithms},
  author	= {Sarwar, Badrul Munir and Karypis, George and Konstan,
		  Joseph A and Riedl, John},
  booktitle	= {Proceedings of the 10th {I}nternational {C}onference on {W}orld {W}ide {W}eb},
  pages		= {285--295},
  year		= {2001}
}

@InProceedings{	  Schein.Popescul.Ungar.ea.2002,
  title		= {Methods and metrics for cold-start recommendations},
  author	= {Schein, Andrew I and Popescul, Alexandrin and Ungar, Lyle
		  H and Pennock, David M},
  booktitle	= {Proceedings of the 25th {A}nnual {I}nternational {ACM} {SIGIR}
		  {C}onference on {R}esearch and {D}evelopment in {I}nformation
		  {R}etrieval},
  pages		= {253--260},
  year		= {2002},
  organization	= {ACM}
}

@InProceedings{	  Scholkopf.Burges.Vapnik.1996,
  title		= {Incorporating invariances in support vector learning
		  machines},
  author	= {Sch{\"o}lkopf, Bernhard and Burges, Chris and Vapnik,
		  Vladimir},
  booktitle	= {{I}nternational {C}onference on {A}rtificial {N}eural {N}etworks},
  pages		= {47--52},
  year		= {1996},
  organization	= {Springer}
}

@InProceedings{	  Scholkopf.Herbrich.Smola.2001,
  title = {A generalized representer theorem},
  author	= {B.~Sch{\"o}lkopf and R.~Herbrich and A. J. Smola},
  booktitle	= {Proceedings of the {A}nnual {C}onference on {C}omputational {L}earning {T}heory},
  editor	= {D.~P.~Helmbold and B.~Williamson},
  pages		= {416--426},
  publisher	= {Springer-Verlag},
  year		= 2001
}

@Book{		  Scholkopf.Smola.2002,
  title		= {Learning with {K}ernels: {S}upport {V}ector {M}achines,
		  {R}egularization, {O}ptimization, and {B}eyond},
  author	= {Sch{\"o}lkopf, Bernhard and Smola, Alexander J},
  year		= {2002},
  publisher	= {{MIT} {P}ress}
}

@Article{	  Schuster.Paliwal.1997,
  title		= {Bidirectional recurrent neural networks},
  author	= {Schuster, Mike and Paliwal, Kuldip K},
  journal	= {IEEE {T}ransactions on {S}ignal {P}rocessing},
  volume	= {45},
  number	= {11},
  pages		= {2673--2681},
  year		= {1997},
  publisher	= {IEEE}
}

@InProceedings{	  Sedhain.Menon.Sanner.ea.2015,
  title		= {Auto{R}ec: Autoencoders meet collaborative filtering},
  author	= {Sedhain, Suvash and Menon, Aditya Krishna and Sanner,
		  Scott and Xie, Lexing},
  booktitle	= {Proceedings of the 24th {I}nternational {C}onference on {W}orld
		  {W}ide {W}eb},
  pages		= {111--112},
  year		= {2015},
  organization	= {ACM}
}

@Article{	  Sennrich.Haddow.Birch.2015,
  title		= {Neural machine translation of rare words with subword
		  units},
  author	= {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
  journal	= {Ar{X}iv:1508.07909},
  year		= {2015}
}

@Article{	  Sergeev.Del-Balso.2018,
  title		= {Horovod: fast and easy distributed deep learning in
		  {T}ensor{F}low},
  author	= {Sergeev, Alexander and Del Balso, Mike},
  journal	= {Ar{X}iv:1802.05799},
  year		= {2018}
}

@Article{	  Shannon.1948,
  author	= {Shannon, Claude Elwood},
  journal	= {The {B}ell {S}ystem {T}echnical {J}ournal},
  number	= {3},
  pages		= {379--423},
  publisher	= {Nokia Bell Labs},
  title		= {A Mathematical Theory of Communication},
  volume	= {27},
  year		= 1948
}

@InProceedings{	  Shao.Yao.Sun.ea.2020,
  title		= {Control{VAE}: Controllable variational autoencoder},
  author	= {Shao, Huajie and Yao, Shuochao and Sun, Dachun and Zhang,
		  Aston and Liu, Shengzhong and Liu, Dongxin and Wang, Jun
		  and Abdelzaher, Tarek},
  booktitle	= {Proceedings of the 37th {I}nternational {C}onference on
		  {M}achine {L}earning},
  year		= {2020},
  organization	= {JMLR.org}
}

@Article{	  Silver.Huang.Maddison.ea.2016,
  title		= {Mastering the game of {G}o with deep neural networks and
		  tree search},
  author	= {Silver, David and Huang, Aja and Maddison, Chris J and
		  Guez, Arthur and Sifre, Laurent and Van Den Driessche,
		  George and Schrittwieser, Julian and Antonoglou, Ioannis
		  and Panneershelvam, Veda and Lanctot, Marc and others},
  journal	= {Nature},
  volume	= {529},
  number	= {7587},
  pages		= {484},
  year		= {2016},
  publisher	= {Nature Publishing Group}
}

@InCollection{	  Simard.LeCun.Denker.ea.1998,
  title		= {Transformation invariance in pattern recognition -- tangent
		  distance and tangent propagation},
  author	= {Simard, Patrice Y and LeCun, Yann A and Denker, John S and
		  Victorri, Bernard},
  booktitle	= {Neural {N}etworks: {T}ricks of the {T}rade},
  pages		= {239--274},
  year		= {1998},
  publisher	= {Springer}
}

@Article{	  Simonyan.Zisserman.2014,
  title		= {Very deep convolutional networks for large-scale image
		  recognition},
  author	= {Simonyan, Karen and Zisserman, Andrew},
  journal	= {Ar{X}iv:1409.1556},
  year		= {2014}
}

@InProceedings{	  Sivic.Zisserman.2003,
  title		= {Video {G}oogle: A text retrieval approach to object matching
		  in videos},
  author	= {Sivic, Josef and Zisserman, Andrew},
  booktitle	= {Proceedings of the {IEEE} {I}nternational {C}onference on {C}omputer {V}ision},
  volume	= {3},
  pages		= {1470--1477},
  year		= {2003},
  organization	= {IEEE {C}omputer {S}ociety}
}

@Article{	  Smola.Narayanamurthy.2010,
  title		= {An architecture for parallel topic models},
  author	= {Smola, Alexander and Narayanamurthy, Shravan},
  journal	= {Proceedings of the {VLDB} {E}ndowment},
  volume	= {3},
  number	= {1-2},
  pages		= {703--710},
  year		= {2010},
  publisher	= {VLDB Endowment}
}

@PhDThesis{	  Speelpenning.1980,
  title		= {Compiling fast partial derivatives of functions given by
		  algorithms},
  author	= {Speelpenning, Bert},
  year		= {1980},
  school	= {University of Illinois at Urbana-Champaign}
}

@Book{		  Spiegelhalter.2019,
  title		= {The {A}rt of {S}tatistics: {L}earning from {D}ata},
  author	= {Spiegelhalter, David},
  year		= {2019},
  publisher	= {Penguin}
}

@Article{	  Srivastava.Hinton.Krizhevsky.ea.2014,
  title		= {Dropout: a simple way to prevent neural networks from
		  overfitting},
  author	= {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky,
		  Alex and Sutskever, Ilya and Salakhutdinov, Ruslan},
  journal	= {{J}ournal of {M}achine {L}earning {R}esearch},
  volume	= {15},
  number	= {1},
  pages		= {1929--1958},
  year		= {2014},
  publisher	= {JMLR.org}
}

@Book{		  Strang.1993,
  title		= {Introduction to {L}inear {A}lgebra},
  author	= {Strang, Gilbert},
  year		= {1993},
  publisher	= {Wellesley--{C}ambridge {P}ress}
}

@Article{	  Su.Khoshgoftaar.2009,
  title		= {A survey of collaborative filtering techniques},
  author	= {Su, Xiaoyuan and Khoshgoftaar, Taghi M},
  journal	= {Advances in {A}rtificial {I}ntelligence},
  volume	= {2009},
  year		= {2009},
  publisher	= {Hindawi}
}

@InProceedings{	  Sukhbaatar.Weston.Fergus.ea.2015,
  title		= {End-to-end memory networks},
  author	= {Sukhbaatar, Sainbayar and Szlam, Arthur and Weston,
		  Jason and Fergus, Rob},
  booktitle	= {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages		= {2440--2448},
  year		= {2015}
}

@InProceedings{	  Sutskever.Martens.Dahl.ea.2013,
  title		= {On the importance of initialization and momentum in deep
		  learning},
  author	= {Sutskever, Ilya and Martens, James and Dahl, George and
		  Hinton, Geoffrey},
  booktitle	= {{I}nternational {C}onference on {M}achine {L}earning},
  pages		= {1139--1147},
  year		= {2013}
}

@article{chan2015listen,
  title={Listen, attend and spell},
  author={Chan, William and Jaitly, Navdeep and Le, Quoc V and Vinyals, Oriol},
  journal={Ar{X}iv:1508.01211},
  year={2015}
}

@book{rabiner1993fundamentals,
  title={Fundamentals of {S}peech {R}ecognition},
  author={Rabiner, Lawrence and Juang, Biing-Hwang},
  year={1993},
  publisher={Prentice-{H}all}
}

@article{yang2016neural,
  title={Neural machine translation with recurrent attention modeling},
  author={Yang, Zichao and Hu, Zhiting and Deng, Yuntian and Dyer, Chris and Smola, Alex},
  journal={Ar{X}iv:1607.05108},
  year={2016}
}


@InProceedings{	  Sutskever.Vinyals.Le.2014,
  title		= {Sequence to sequence learning with neural networks},
  author	= {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V},
  booktitle	= {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages		= {3104--3112},
  year		= {2014}
}

@InProceedings{	  Szegedy.Ioffe.Vanhoucke.ea.2017,
  title		= {Inception-v4, {I}nception-{R}es{N}et and the impact of residual
		  connections on learning},
  author	= {Szegedy, Christian and Ioffe, Sergey and Vanhoucke,
		  Vincent and Alemi, Alexander A},
  booktitle	= {31st {AAAI} {C}onference on {A}rtificial {I}ntelligence},
  year		= {2017}
}

@InProceedings{	  Szegedy.Liu.Jia.ea.2015,
  title		= {Going deeper with convolutions},
  author	= {Szegedy, Christian and Liu, Wei and Jia, Yangqing and
		  Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and
		  Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich,
		  Andrew},
  booktitle	= {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and
		  {P}attern {R}ecognition},
  pages		= {1--9},
  year		= {2015}
}

@InProceedings{	  Szegedy.Vanhoucke.Ioffe.ea.2016,
  title		= {Rethinking the {I}nception architecture for computer
		  vision},
  author	= {Szegedy, Christian and Vanhoucke, Vincent and Ioffe,
		  Sergey and Shlens, Jon and Wojna, Zbigniew},
  booktitle	= {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and
		  {P}attern {R}ecognition},
  pages		= {2818--2826},
  year		= {2016}
}

@Article{	  Tallec.Ollivier.2017,
  title		= {Unbiasing truncated backpropagation through time},
  author	= {Tallec, Corentin and Ollivier, Yann},
  journal	= {Ar{X}iv:1705.08209},
  year		= {2017}
}

@InProceedings{	  Tang.Wang.2018,
  title		= {Personalized top-{N} sequential recommendation via
		  convolutional sequence embedding},
  author	= {Tang, Jiaxi and Wang, Ke},
  booktitle	= {Proceedings of the Eleventh {ACM} {I}nternational {C}onference
		  on {W}eb {S}earch and {D}ata {M}ining},
  pages		= {565--573},
  year		= {2018},
  organization	= {ACM}
}

@Article{	  Taskar.Guestrin.Koller.2004,
  title		= {Max-margin {M}arkov networks},
  author	= {Taskar, Ben and Guestrin, Carlos and Koller, Daphne},
  journal	= {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume	= {16},
  pages		= {25},
  year		= {2004}
}

@Article{	  Tay.Dehghani.Bahri.ea.2020,
  title		= {Efficient transformers: A survey},
  author	= {Tay, Yi and Dehghani, Mostafa and Bahri, Dara and Metzler,
		  Donald},
  journal	= {Ar{X}iv:2009.06732},
  year		= {2020}
}

@Article{	  Teye.Azizpour.Smith.2018,
  title		= {Bayesian uncertainty estimation for batch normalized deep
		  networks},
  author	= {Teye, Mattias and Azizpour, Hossein and Smith, Kevin},
  journal	= {Ar{X}iv:1802.06455},
  year		= {2018}
}

@Article{	  Thomee.Shamma.Friedland.ea.2016,
  title		= {{YFCC}100{M}: The new data in multimedia research},
  author	= {Thomee, Bart and Shamma, David A and Friedland, Gerald and
		  Elizalde, Benjamin and Ni, Karl and Poland, Douglas and
		  Borth, Damian and Li, Li-Jia},
  journal	= {Communications of the {ACM}},
  volume	= {59},
  number	= {2},
  pages		= {64--73},
  year		= {2016},
  publisher	= {{ACM} New York, NY, USA}
}

@Book{		  Thompson.2015,
  title		= {Mathematical {S}tatistical {M}echanics},
  author	= {Thompson, Colin J},
  year		= {2015},
  publisher	= {Princeton {U}niversity {P}ress}
}

@inCollection{	  Tieleman.Hinton.2012,
  title		= {Divide the gradient by a running
		  average of its recent magnitude},
  author	= {Tieleman, Tijmen and Hinton, Geoffrey},
  booktitle	= {{COURSERA}: Neural {N}etworks for {M}achine {L}earning, Lecture 6.5-rmsprop},
  year		= {2012}
}

@Article{	  Torralba.Fergus.Freeman.2008,
  title		= {80 million tiny images: A large data set for nonparametric
		  object and scene recognition},
  author	= {Torralba, Antonio and Fergus, Rob and Freeman, William T},
  journal	= {{IEEE} {T}ransactions on {P}attern {A}nalysis and {M}achine
		  {I}ntelligence},
  volume	= {30},
  number	= {11},
  pages		= {1958--1970},
  year		= {2008},
  publisher	= {IEEE}
}

@misc{	  Toscher.Jahrer.Bell.2009,
  title		= {The {B}ig{C}haos solution to the {N}etflix grand prize},
  author	= {T{\"o}scher, Andreas and Jahrer, Michael and Bell, Robert
		  M},
  howpublished	= {Netflix {P}rize {D}ocumentation},
  pages		= {1--52},
  year		= {2009}
}

@Article{	  Treisman.Gelade.1980,
  title		= {A feature-integration theory of attention},
  author	= {Treisman, Anne M and Gelade, Garry},
  journal	= {Cognitive {P}sychology},
  volume	= {12},
  number	= {1},
  pages		= {97--136},
  year		= {1980},
  publisher	= {Elsevier}
}

@Article{	  Tsoumakas.Katakis.2007,
  title		= {Multi-label classification: An overview},
  author	= {Tsoumakas, Grigorios and Katakis, Ioannis},
  journal	= {International {J}ournal of {D}ata {W}arehousing and {M}ining},
  volume	= {3},
  number	= {3},
  pages		= {1--13},
  year		= {2007},
  publisher	= {IGI {G}lobal}
}

@Article{	  Turing.1950,
  title		= {Computing machinery and intelligence},
  author	= {Turing, Alan},
  journal	= {Mind},
  volume	= {59},
  number	= {236},
  pages		= {433},
  year		= {1950}
}

@Article{	  Uijlings.Van-De-Sande.Gevers.ea.2013,
  title		= {Selective search for object recognition},
  author	= {Uijlings, Jasper RR and Van De Sande, Koen EA and Gevers,
		  Theo and Smeulders, Arnold WM},
  journal	= {International {J}ournal of {C}omputer {V}ision},
  volume	= {104},
  number	= {2},
  pages		= {154--171},
  year		= {2013},
  publisher	= {Springer}
}

@InProceedings{	  Vapnik.1992,
  title		= {Principles of risk minimization for learning theory},
  author	= {Vapnik, Vladimir},
  booktitle	= {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages		= {831--838},
  year		= {1992}
}

@Book{		  Vapnik.1998,
  address	= {New York},
  author	= {V. Vapnik},
  publisher	= {John {W}iley \& {S}ons},
  title		= {Statistical {L}earning {T}heory},
  year		= 1998
}

@Article{	  Vapnik.Chervonenkis.1964,
  author	= {V. Vapnik and A. Chervonenkis},
  number	= {1},
  journal	= {Automation and {R}emote {C}ontrol},
  title		= {A note on one class of perceptrons},
  volume	= {25},
pages = {103--109},
  year		= {1964}
}

@Article{	  Vapnik.Chervonenkis.1968,
  author	= {V. Vapnik and A. Chervonenkis},
  journal	= {Dokl.\ {A}kad.\ {N}auk {SSSR}},
  pages		= {915--918},
  title		= {Uniform convergence of frequencies of occurrence of events
		  to their probabilities},
  volume	= 181,
  year		= 1968
}

@Article{	  Vapnik.Chervonenkis.1971,
  author	= {V. Vapnik and A. Chervonenkis},
  journal	= {Theory {P}robab. {A}ppl.},
  number	= 2,
  pages		= {264--281},
  title		= {On the uniform convergence of relative frequencies of
		  events to their probabilities},
  volume	= 16,
  year		= 1971
}

@Book{		  Vapnik.Chervonenkis.1974,
  address	= {Moscow},
  author	= {V. Vapnik and A. Chervonenkis},
  note		= {(German Translation: W. Wapnik \& A. Tscherwonenkis, {\em
		  Theorie der Zeichenerkennung}, Akademie-Verlag, Berlin,
		  1979)},
  publisher	= {Nauka},
  title		= {Theory of {P}attern {R}ecognition [in {R}ussian]},
  year		= 1974
}

@Article{	  Vapnik.Chervonenkis.1974*1,
  author	= {V.~N. Vapnik and A.~Y. Chervonenkis},
  journal	= {Automation and {R}emote {C}ontrol},
  pages		= {1226--1235; 1403--1412},
  title		= {Ordered risk minimization},
  volume	= 35,
  year		= 1974
}

@Book{		  Vapnik.Chervonenkis.1974*2,
  address	= {Moscow},
  author	= {V. Vapnik and A. Chervonenkis},
  publisher	= {Nauka},
  title		= {Teoriya {R}aspoznavaniya {O}brazov: {S}tatisticheskie {P}roblemy
		  {O}bucheniya. (in Russian) [{T}heory of {P}attern {R}ecognition:
		  {S}tatistical {P}roblems of {L}earning]},
  year		= 1974
}

@Article{	  Vapnik.Chervonenkis.1981,
  author	= {V. Vapnik and A. Chervonenkis},
  journal	= {Teoriya {V}eroyatnostei i {E}e {P}rimeneniya},
  number	= 3,
  pages		= {543--564},
  title		= {The necessary and sufficient conditions for the uniform
		  convergence of averages to their expected values},
  volume	= 26,
  year		= 1981
}

@Article{	  Vapnik.Chervonenkis.1991,
  author	= {V. Vapnik and A. Chervonenkis},
  journal	= {{P}attern {R}ecognition and {I}mage {A}nalysis},
  number	= 3,
  pages		= {283--305},
  title		= {The necessary and sufficient conditions for consistency in
		  the empirical risk minimization method},
  volume	= 1,
  year		= 1991
}

@Article{	  Vapnik.Levin.Le-Cun.1994,
  title		= {Measuring the {VC}-dimension of a learning machine},
  author	= {Vapnik, Vladimir and Levin, Esther and Le Cun, Yann},
  journal	= {Neural {C}omputation},
  volume	= {6},
  number	= {5},
  pages		= {851--876},
  year		= {1994},
  publisher	= {{MIT} {P}ress}
}

@InProceedings{	  Vaswani.Shazeer.Parmar.ea.2017,
  title		= {Attention is all you need},
  author	= {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and
		  Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and
		  Kaiser, {\L}ukasz and Polosukhin, Illia},
  booktitle	= {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages		= {5998--6008},
  year		= {2017}
}

@Book{		  Wahba.1990,
  title		= {Spline {M}odels for {O}bservational {D}ata},
  author	= {Wahba, Grace},
  year		= {1990},
  publisher	= {SIAM}
}

@Article{	  Waibel.Hanazawa.Hinton.ea.1989,
  title		= {Phoneme recognition using time-delay neural networks},
  author	= {Waibel, Alex and Hanazawa, Toshiyuki and Hinton, Geoffrey
		  and Shikano, Kiyohiro and Lang, Kevin J},
  journal	= {IEEE {T}ransactions on {A}coustics, {S}peech, and {S}ignal
		  {P}rocessing},
  volume	= {37},
  number	= {3},
  pages		= {328--339},
  year		= {1989},
  publisher	= {IEEE}
}

@InProceedings{	  Wang.Davidson.Pan.ea.2016,
  title		= {Gunrock: A high-performance graph processing library on
		  the {GPU}},
  author	= {Wang, Yangzihao and Davidson, Andrew and Pan, Yuechao and
		  Wu, Yuduo and Riffel, Andy and Owens, John D},
  booktitle	= {{ACM} {SIGPLAN} {N}otices},
  volume	= {51},
  number	= {8},
  pages		= {11},
  year		= {2016},
  organization	= {ACM}
}

@Article{	  Wang.Li.Liberty.ea.2018,
  title		= {Optimal message scheduling for aggregation},
  author	= {Wang, Leyuan and Li, Mu and Liberty, Edo and Smola, Alex
		  J},
  journal	= {Networks},
  volume	= {2},
  number	= {3},
  pages		= {2--3},
  year		= {2018}
}

@Article{	  Warstadt.Singh.Bowman.2019,
  title		= {Neural network acceptability judgments},
  author	= {Warstadt, Alex and Singh, Amanpreet and Bowman, Samuel R},
  journal	= {{T}ransactions of the {A}ssociation for {C}omputational
		  {L}inguistics},
  volume	= {7},
  pages		= {625--641},
  year		= {2019},
  publisher	= {MIT {P}ress}
}

@Book{		  Wasserman.2013,
  title		= {All of {S}tatistics: {A} {C}oncise {C}ourse in {S}tatistical
		  {I}nference},
  author	= {Wasserman, Larry},
  year		= {2013},
  publisher	= {Springer}
}

@Article{	  Watkins.Dayan.1992,
  title		= {Q-learning},
  author	= {Watkins, Christopher JCH and Dayan, Peter},
  journal	= {Machine {L}earning},
  volume	= {8},
  number	= {3--4},
  pages		= {279--292},
  year		= {1992},
  publisher	= {Springer}
}

@Article{	  Watson.1964,
  title		= {Smooth regression analysis},
  author	= {Watson, Geoffrey S},
  journal	= {Sankhy{\=a}: The {I}ndian {J}ournal of {S}tatistics, {S}eries {A}},
  pages		= {359--372},
  year		= {1964},
  publisher	= {JSTOR}
}

@InProceedings{	  Welling.Teh.2011,
  title		= {Bayesian learning via stochastic gradient {L}angevin
		  dynamics},
  author	= {Welling, Max and Teh, Yee W},
  booktitle	= {Proceedings of the 28th {I}nternational {C}onference on
		  {M}achine {L}earning ({ICML}-11)},
  pages		= {681--688},
  year		= {2011}
}

@Article{	  Wengert.1964,
  title		= {A simple automatic derivative evaluation program},
  author	= {Wengert, Robert Edwin},
  journal	= {Communications of the {ACM}},
  volume	= {7},
  number	= {8},
  pages		= {463--464},
  year		= {1964},
  publisher	= {{ACM} New York, NY, USA}
}

@Article{	  Werbos.1990,
  title		= {Backpropagation through time: what it does and how to do
		  it},
  author	= {Werbos, Paul J},
  journal	= {Proceedings of the {IEEE}},
  volume	= {78},
  number	= {10},
  pages		= {1550--1560},
  year		= {1990},
  publisher	= {IEEE}
}

@Article{	  Wigner.1958,
  title		= {On the distribution of the roots of certain symmetric
		  matrices},
  author	= {Wigner, Eugene P.},
  journal	= {Annals of {M}athematics},
  volume	= {67},
  number	= {2},
  pages		= {325--327},
  year		= {1958}
}

@TechReport{	  Williams.Waterman.Patterson.2009,
  title		= {Roofline: An insightful visual performance model for
		  floating-point programs and multicore architectures},
  author	= {Williams, Samuel and Waterman, Andrew and Patterson,
		  David},
  year		= {2009},
  institution	= {Lawrence {B}erkeley {N}ational {L}ab.}
}

@Article{	  Wood.Gasthaus.Archambeau.ea.2011,
  title		= {The sequence memoizer},
  author	= {Wood, Frank and Gasthaus, Jan and Archambeau, C{\'e}dric
		  and James, Lancelot and Teh, Yee Whye},
  journal	= {Communications of the {ACM}},
  volume	= {54},
  number	= {2},
  pages		= {91--98},
  year		= {2011},
  publisher	= {ACM}
}

@InProceedings{	  Wu.Ahmed.Beutel.ea.2017,
  title		= {Recurrent recommender networks},
  author	= {Wu, Chao-Yuan and Ahmed, Amr and Beutel, Alex and Smola,
		  Alexander J and Jing, How},
  booktitle	= {Proceedings of the 10th {ACM} {I}nternational {C}onference on
		  {W}eb {S}earch and {D}ata {M}ining},
  pages		= {495--503},
  year		= {2017},
  organization	= {ACM}
}

@article{Wu.Schuster.Chen.ea.2016,
  author       = {Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V and Norouzi, Mohammad and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and Macherey, Klaus and et al.},
  title        = {Google's neural machine translation system: Bridging the gap between human and machine translation},
  journal      = {Ar{X}iv:1609.08144},
  year         = {2016},
}

@inproceedings{Xiao.Bahri.Sohl-Dickstein.ea.2018,
  author       = {Xiao, Lechao and Bahri, Yasaman and Sohl-Dickstein, Jascha and Schoenholz, Samuel and Pennington, Jeffrey},
  title        = {Dynamical isometry and a mean field theory of {CNN}s: How to train 10,000-layer vanilla convolutional neural networks},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {5393--5402},
  year         = {2018},
}

@article{Xiao.Rasul.Vollgraf.2017,
  author       = {Xiao, Han and Rasul, Kashif and Vollgraf, Roland},
  title        = {Fashion-{MNIST}: a novel image dataset for benchmarking machine learning algorithms},
  journal      = {Ar{X}iv:1708.07747},
  year         = {2017},
}

@inproceedings{Xie.Girshick.Dollar.ea.2017,
  author       = {Xie, Saining and Girshick, Ross and Doll{\'a}r, Piotr and Tu, Zhuowen and He, Kaiming},
  title        = {Aggregated residual transformations for deep neural networks},
  booktitle    = {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and {P}attern {R}ecognition},
  pages        = {1492--1500},
  year         = {2017},
}

@inproceedings{Xiong.Wu.Alleva.ea.2018,
  author       = {Xiong, Wayne and Wu, Lingfeng and Alleva, Fil and Droppo, Jasha and Huang, Xuedong and Stolcke, Andreas},
  title        = {The {M}icrosoft 2017 conversational speech recognition system},
  booktitle    = {2018 {IEEE} {I}nternational {C}onference on {A}coustics, {S}peech and {S}ignal {P}rocessing ({ICASSP})},
  pages        = {5934--5938},
  year         = {2018},
  organization = {IEEE},
}

@inproceedings{Yamaguchi.Sakamoto.Akabane.ea.1990,
  author       = {Yamaguchi, Kouichi and Sakamoto, Kenji and Akabane, Toshio and Fujimoto, Yoshiji},
  title        = {A neural network for speaker-independent isolated word recognition},
  booktitle    = {First {I}nternational {C}onference on {S}poken {L}anguage {P}rocessing},
  year         = {1990},
}

@inproceedings{Yang.Moczulski.Denil.ea.2015,
  author       = {Yang, Zichao and Moczulski, Marcin and Denil, Misha and De Freitas, Nando and Smola, Alex and Song, Le and Wang, Ziyu},
  title        = {Deep fried convnets},
  booktitle    = {Proceedings of the {IEEE} {I}nternational {C}onference on {C}omputer {V}ision},
  pages        = {1476--1483},
  year         = {2015},
}

@inproceedings{Ye.Yin.Lee.ea.2011,
  author       = {Ye, Mao and Yin, Peifeng and Lee, Wang-Chien and Lee, Dik-Lun},
  title        = {Exploiting geographical influence for collaborative point-of-interest recommendation},
  booktitle    = {Proceedings of the 34th {I}nternational {ACM} {SIGIR} {C}onference on {R}esearch and {D}evelopment in {I}nformation {R}etrieval},
  pages        = {325--334},
  year         = {2011},
  organization = {ACM},
}

@article{You.Gitman.Ginsburg.2017,
  author       = {You, Yang and Gitman, Igor and Ginsburg, Boris},
  title        = {Large batch training of convolutional networks},
  journal      = {Ar{X}iv:1708.03888},
  year         = {2017},
}

@article{Yu.1994,
  author       = {Yu, Bin},
  title        = {Rates of convergence for empirical processes of stationary mixing sequences},
  journal      = {{A}nnals of {P}robability},
  pages        = {94--116},
  year         = {1994},
  publisher    = {JSTOR},
}

@inproceedings{Zaheer.Reddi.Sachan.ea.2018,
  author       = {Zaheer, Manzil and Reddi, Sashank and Sachan, Devendra and Kale, Satyen and Kumar, Sanjiv},
  title        = {Adaptive methods for nonconvex optimization},
  booktitle    = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  pages        = {9793--9803},
  year         = {2018},
}

@article{Zeiler.2012,
  author       = {Zeiler, Matthew D},
  title        = {{ADADELTA}: an adaptive learning rate method},
  journal      = {Ar{X}iv:1212.5701},
  year         = {2012},
}

@article{Zeiler.Fergus.2013,
  author       = {Zeiler, Matthew D and Fergus, Rob},
  title        = {Stochastic pooling for regularization of deep convolutional neural networks},
  journal      = {Ar{X}iv:1301.3557},
  year         = {2013},
}

@article{Zhang.2004,
  author       = {Zhang, Tong},
  title        = {Statistical behavior and consistency of classification methods based on convex risk minimization},
  journal      = {{A}nnals of {S}tatistics},
  volume       = {32},
  number       = {1},
  pages        = {56--85},
  year         = {2004},
  publisher    = {Institute of {M}athematical {S}tatistics},
}

@article{Zhang.Sun.Jiang.ea.2021,
  author       = {Zhang, Yifu and Sun, Peize and Jiang, Yi and Yu, Dongdong and Yuan, Zehuan and Luo, Ping and Liu, Wenyu and Wang, Xinggang},
  title        = {Byte{T}rack: Multi-object tracking by associating every detection box},
  journal      = {Ar{X}iv:2110.06864},
  year         = {2021},
}

@inproceedings{Zhang.Tay.Zhang.ea.2021,
  author       = {Zhang, Aston and Tay, Yi and Zhang, Shuai and Chan, Alvin and Luu, Anh Tuan and Hui, Siu Cheung and Fu, Jie},
  title        = {Beyond fully-connected layers with quaternions: Parameterization of hypercomplex multiplications with 1/n parameters},
  booktitle    = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year         = {2021},
}

@article{Zhang.Yao.Sun.ea.2019,
  author       = {Zhang, Shuai and Yao, Lina and Sun, Aixin and Tay, Yi},
  title        = {Deep learning based recommender system: A survey and new perspectives},
  journal      = {{ACM} {C}omputing {S}urveys},
  volume       = {52},
  number       = {1},
  pages        = {5},
  year         = {2019},
  publisher    = {ACM},
}

@inproceedings{Zhang.ea.1988,
  author       = {Zhang, Wei and Tanida, Jun and Itoh, Kazuyoshi and Ichioka, Yoshiki},
  title        = {Shift-invariant pattern recognition neural network and its optical architecture},
  booktitle    = {Proceedings of {A}nnual {C}onference of the {J}apan {S}ociety of {A}pplied {P}hysics},
  year         = {1988},
}

@article{Zhao.Zheng.Xu.ea.2019,
  author       = {Zhao, Zhong-Qiu and Zheng, Peng and Xu, Shou-tao and Wu, Xindong},
  title        = {Object detection with deep learning: A review},
  journal      = {{IEEE} {T}ransactions on {N}eural {N}etworks and {L}earning {S}ystems},
  volume       = {30},
  number       = {11},
  pages        = {3212--3232},
  year         = {2019},
  publisher    = {IEEE},
}

@inproceedings{Zhu.Kiros.Zemel.ea.2015,
  author       = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},
  title        = {Aligning books and movies: Towards story-like visual explanations by watching movies and reading books},
  booktitle    = {Proceedings of the {IEEE} {I}nternational {C}onference on {C}omputer {V}ision},
  pages        = {19--27},
  year         = {2015},
}

@inproceedings{Zhu.Park.Isola.ea.2017,
  author       = {Zhu, Jun-Yan and Park, Taesung and Isola, Phillip and Efros, Alexei A},
  title        = {Unpaired image-to-image translation using cycle-consistent adversarial networks},
  booktitle    = {Proceedings of the {IEEE} {I}nternational {C}onference on {C}omputer {V}ision},
  pages        = {2223--2232},
  year         = {2017},
}

@article{vapnik1994measuring,
  author       = {Vapnik, Vladimir and Levin, Esther and Le Cun, Yann},
  title        = {Measuring the {VC}-dimension of a learning machine},
  journal      = {Neural Computation},
  volume       = {6},
  number       = {5},
  pages        = {851--876},
  year         = {1994},
  publisher    = {MIT Press},
}

@article{friedman1987exploratory,
  author       = {Friedman, Jerome H},
  title        = {Exploratory projection pursuit},
  journal      = {Journal of the {A}merican {S}tatistical {A}ssociation},
  volume       = {82},
  number       = {397},
  pages        = {249--266},
  year         = {1987},
  publisher    = {Taylor \& {F}rancis},
}

@article{ong2005learning,
  author       = {Ong, Cheng Soon and Smola, Alexander and Williamson, Robert},
  title        = {Learning the kernel with hyperkernels},
  journal      = {{J}ournal of {M}achine {L}earning {R}esearch},
  volume       = {6},
  pages        = {1043--1071},
  year         = {2005},
  publisher    = {MIT Press},
}

@article{thomee2016yfcc100m,
  author       = {Thomee, Bart and Shamma, David A and Friedland, Gerald and Elizalde, Benjamin and Ni, Karl and Poland, Douglas and Borth, Damian and Li, Li-Jia},
  title        = {{YFCC100M}: The new data in multimedia research},
  journal      = {Communications of the {ACM}},
  volume       = {59},
  number       = {2},
  pages        = {64--73},
  year         = {2016},
  publisher    = {{ACM} New York, NY, USA},
}

@book{Vapnik98,
  author       = {V. Vapnik},
  title        = {Statistical {L}earning {T}heory},
  publisher    = {John Wiley and Sons},
  address      = {New York},
  year         = {1998},
}

@article{VapChe64,
  author       = {V. Vapnik and A. Chervonenkis},
  title        = {A note on one class of perceptrons},
  journal      = {Automation and Remote Control},
  volume       = {25},
  number       = {1},
  year         = {1964},
}

@article{VapChe68,
  author       = {V. Vapnik and A. Chervonenkis},
  title        = {Uniform convergence of frequencies of occurrence of events to their probabilities},
  journal      = {Dokl.\ Akad.\ Nauk SSSR},
  volume       = {181},
  pages        = {915--918},
  year         = {1968},
}

@article{VapChe71,
  author       = {V. Vapnik and A. Chervonenkis},
  title        = {On the uniform convergence of relative frequencies of events to their probabilities},
  journal      = {Theory Probab. Appl.},
  volume       = {16},
  number       = {2},
  pages        = {264--281},
  year         = {1971},
}

@book{VapChe74,
  author       = {V. Vapnik and A. Chervonenkis},
  title        = {Theory of {P}attern {R}ecognition [in Russian]},
  publisher    = {Nauka},
  address      = {Moscow},
  year         = {1974},
  note         = {(German Translation: W. Wapnik \& A. Tscherwonenkis, {\em Theorie der Zeichenerkennung}, Akademie-Verlag, Berlin, 1979)},
}

@article{VapChe74b,
  author       = {V.~N. Vapnik and A.~Y. Chervonenkis},
  title        = {Ordered risk minimization},
  journal      = {Automation and Remote Control},
  volume       = {35},
  pages        = {1226--1235, 1403--1412},
  year         = {1974},
}

@article{VapChe81,
  author       = {V. Vapnik and A. Chervonenkis},
  title        = {The necessary and sufficient conditions for the uniform convergence of averages to their expected values},
  journal      = {Teoriya Veroyatnostei i Ee Primeneniya},
  volume       = {26},
  number       = {3},
  pages        = {543--564},
  year         = {1981},
}

@article{VapChe91,
  author       = {V. Vapnik and A. Chervonenkis},
  title        = {The necessary and sufficient conditions for consistency in the empirical risk minimization method},
  journal      = {{P}attern {R}ecognition and {I}mage {A}nalysis},
  volume       = {1},
  number       = {3},
  pages        = {283--305},
  year         = {1991},
}

@article{boucheron2005theory,
  author       = {Boucheron, St{\'e}phane and Bousquet, Olivier and Lugosi, G{\'a}bor},
  title        = {Theory of classification: A survey of some recent advances},
  journal      = {ESAIM: {P}robability and {S}tatistics},
  volume       = {9},
  pages        = {323--375},
  year         = {2005},
  publisher    = {EDP Sciences},
}

@article{sindhwani2015structured,
  author       = {Sindhwani, Vikas and Sainath, Tara N and Kumar, Sanjiv},
  title        = {Structured transforms for small-footprint deep learning},
  journal      = {Ar{X}iv:1510.01722},
  year         = {2015},
}

@inproceedings{dwork2015preserving,
  author       = {Dwork, Cynthia and Feldman, Vitaly and Hardt, Moritz and Pitassi, Toniann and Reingold, Omer and Roth, Aaron Leon},
  title        = {Preserving statistical validity in adaptive data analysis},
  booktitle    = {Proceedings of the 47th {A}nnual {ACM} {S}ymposium on {T}heory of {C}omputing},
  pages        = {117--126},
  year         = {2015},
}

@incollection{micchelli1984interpolation,
  author       = {Micchelli, Charles A},
  title        = {Interpolation of scattered data: distance matrices and conditionally positive definite functions},
  booktitle    = {Approximation {T}heory and {S}pline {F}unctions},
  pages        = {143--145},
  year         = {1984},
  publisher    = {Springer},
}

@book{popper2005logic,
  author       = {Popper, Karl},
  title        = {The {L}ogic of {S}cientific {D}iscovery},
  publisher    = {Routledge},
  year         = {2005},
}

@book{mackay2003information,
  author       = {MacKay, David JC},
  title        = {Information {T}heory, {I}nference and {L}earning {A}lgorithms},
  publisher    = {Cambridge {U}niversity {P}ress},
  year         = {2003},
}

@book{rasmussen2006gaussian,
  author       = {Rasmussen, Carl Edward and Williams, Christopher KI},
  title        = {Gaussian {P}rocesses for {M}achine {L}earning},
  publisher    = {{MIT} {P}ress},
  year         = {2006},
}


@inproceedings{lavin2016fast,
  author       = {Lavin, Andrew and Gray, Scott},
  title        = {Fast algorithms for convolutional neural networks},
  booktitle    = {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and {P}attern {R}ecognition},
  pages        = {4013--4021},
  year         = {2016},
}

@inproceedings{tan2019efficientnet,
  author       = {Tan, Mingxing and Le, Quoc},
  title        = {Efficient{N}et: Rethinking model scaling for convolutional neural networks},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {6105--6114},
  year         = {2019},
  organization = {PMLR},
}

@article{zoph2016neural,
  author       = {Zoph, Barret and Le, Quoc V},
  title        = {Neural architecture search with reinforcement learning},
  journal      = {Ar{X}iv:1611.01578},
  year         = {2016},
}

@article{liu2018darts,
  author       = {Liu, Hanxiao and Simonyan, Karen and Yang, Yiming},
  title        = {{DARTS}: Differentiable architecture search},
  journal      = {Ar{X}iv:1806.09055},
  year         = {2018},
}

@article{liu2022convnet,
  author       = {Liu, Zhuang and Mao, Hanzi and Wu, Chao-Yuan and Feichtenhofer, Christoph and Darrell, Trevor and Xie, Saining},
  title        = {A Conv{N}et for the 2020s},
  journal      = {Ar{X}iv:2201.03545},
  year         = {2022},
}

@inproceedings{radosavovic2019network,
  author       = {Radosavovic, Ilija and Johnson, Justin and Xie, Saining and Lo, Wan-Yen and Doll{\'a}r, Piotr},
  title        = {On network design spaces for visual recognition},
  booktitle    = {Proceedings of the {IEEE}/{CVF} {I}nternational {C}onference on {C}omputer {V}ision},
  pages        = {1882--1890},
  year         = {2019},
}

@article{tolstikhin2021mlp,
  author       = {Tolstikhin, Ilya O and Houlsby, Neil and Kolesnikov, Alexander and Beyer, Lucas and Zhai, Xiaohua and Unterthiner, Thomas and Yung, Jessica and Steiner, Andreas and Keysers, Daniel and Uszkoreit, Jakob and et al.},
  title        = {{MLP}-{M}ixer: An all-{MLP} architecture for vision},
  journal      = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume       = {34},
  year         = {2021},
}

@inproceedings{liu2021swin,
  author       = {Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining},
  title        = {Swin transformer: Hierarchical vision transformer using shifted windows},
  booktitle    = {Proceedings of the {IEEE}/{CVF} {I}nternational {C}onference on {C}omputer {V}ision},
  pages        = {10012--10022},
  year         = {2021},
}

@inproceedings{touvron2021training,
  author       = {Touvron, Hugo and Cord, Matthieu and Douze, Matthijs and Massa, Francisco and Sablayrolles, Alexandre and J{\'e}gou, Herv{\'e}},
  title        = {Training data-efficient image transformers \& distillation through attention},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {10347--10357},
  year         = {2021},
  organization = {PMLR},
}

@inproceedings{huang2018music,
  author       = {Huang, Cheng-Zhi Anna and Vaswani, Ashish and Uszkoreit, Jakob and Simon, Ian and Hawthorne, Curtis and Shazeer, Noam and Dai, Andrew M and Hoffman, Matthew D and Dinculescu, Monica and Eck, Douglas},
  title        = {Music transformer: generating music with long-term structure},
  booktitle    = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year         = {2018},
}

@article{shaw2018self,
  author       = {Shaw, Peter and Uszkoreit, Jakob and Vaswani, Ashish},
  title        = {Self-attention with relative position representations},
  journal      = {Ar{X}iv:1803.02155},
  year         = {2018},
}

@book{neal1996bayesian,
  author       = {Neal, Radford M},
  title        = {Bayesian {L}earning for {N}eural {N}etworks},
  publisher    = {Springer},
  year         = {1996},
}

@article{matthews2018gaussian,
  author       = {Matthews, Alexander G de G and Rowland, Mark and Hron, Jiri and Turner, Richard E and Ghahramani, Zoubin},
  title        = {Gaussian process behaviour in wide deep neural networks},
  journal      = {Ar{X}iv:1804.11271},
  year         = {2018},
}

@article{novak2018bayesian,
  author       = {Novak, Roman and Xiao, Lechao and Lee, Jaehoon and Bahri, Yasaman and Yang, Greg and Hron, Jiri and Abolafia, Daniel A and Pennington, Jeffrey and Sohl-Dickstein, Jascha},
  title        = {Bayesian deep convolutional networks with many channels are {G}aussian processes},
  journal      = {Ar{X}iv:1810.05148},
  year         = {2018},
}

@article{zhang2021understanding,
  author       = {Zhang, Chiyuan and Bengio, Samy and Hardt, Moritz and Recht, Benjamin and Vinyals, Oriol},
  title        = {Understanding deep learning (still) requires rethinking generalization},
  journal      = {Communications of the {ACM}},
  volume       = {64},
  number       = {3},
  pages        = {107--115},
  year         = {2021},
  publisher    = {{ACM} New York, NY, USA},
}

@article{nakkiran2021deep,
  author       = {Nakkiran, Preetum and Kaplun, Gal and Bansal, Yamini and Yang, Tristan and Barak, Boaz and Sutskever, Ilya},
  title        = {Deep double descent: Where bigger models and more data hurt},
  journal      = {Journal of {S}tatistical {M}echanics: {T}heory and {E}xperiment},
  volume       = {2021},
  number       = {12},
  pages        = {124003},
  year         = {2021},
  publisher    = {IOP Publishing},
}

@book{tikhonov1977solutions,
  author       = {Tikhonov, A. N. and Arsenin, V. Y.},
  title        = {Solutions of {I}ll-{P}osed {P}roblems},
  publisher    = {W.{H}.~{W}inston},
  year         = {1977},
}

@book{morozov2012methods,
  author       = {Morozov, Vladimir Alekseevich},
  title        = {Methods for {S}olving {I}ncorrectly {P}osed {P}roblems},
  publisher    = {Springer},
  year         = {1984},
}

@article{srivastava2015highway,
  author       = {Srivastava, Rupesh Kumar and Greff, Klaus and Schmidhuber, J{\"u}rgen},
  title        = {Highway networks},
  journal      = {Ar{X}iv:1505.00387},
  year         = {2015},
}

@article{pleiss2017memory,
  author       = {Pleiss, Geoff and Chen, Danlu and Huang, Gao and Li, Tongcheng and Van Der Maaten, Laurens and Weinberger, Kilian Q},
  title        = {Memory-efficient implementation of {D}ense{N}ets},
  journal      = {Ar{X}iv:1707.06990},
  year         = {2017},
}

@article{ramachandran2019stand,
  author       = {Ramachandran, Prajit and Parmar, Niki and Vaswani, Ashish and Bello, Irwan and Levskaya, Anselm and Shlens, Jon},
  title        = {Stand-alone self-attention in vision models},
  journal      = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume       = {32},
  year         = {2019},
}


@inproceedings{cordonnier2020relationship,
  author       = {Cordonnier, Jean-Baptiste and Loukas, Andreas and Jaggi, Martin},
  title        = {On the relationship between self-attention and convolutional layers},
  booktitle    = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year         = {2020},
}

@article{brown2020language,
  author       = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and et al.},
  title        = {Language models are few-shot learners},
  journal      = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume       = {33},
  pages        = {1877--1901},
  year         = {2020},
}

@article{raffel2020exploring,
  author       = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J},
  title        = {Exploring the limits of transfer learning with a unified text-to-text transformer},
  journal      = {{J}ournal of {M}achine {L}earning {R}esearch},
  volume       = {21},
  pages        = {1--67},
  year         = {2020},
}

@inproceedings{xiong2020layer,
  author       = {Xiong, Ruibin and Yang, Yunchang and He, Di and Zheng, Kai and Zheng, Shuxin and Xing, Chen and Zhang, Huishuai and Lan, Yanyan and Wang, Liwei and Liu, Tieyan},
  title        = {On layer normalization in the transformer architecture},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {10524--10533},
  year         = {2020},
  organization = {PMLR},
}

@inproceedings{baevski2018adaptive,
  author       = {Baevski, Alexei and Auli, Michael},
  title        = {Adaptive input representations for neural language modeling},
  booktitle    = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year         = {2018},
}

@inproceedings{wang2019learning,
  author       = {Wang, Qiang and Li, Bei and Xiao, Tong and Zhu, Jingbo and Li, Changliang and Wong, Derek F and Chao, Lidia S},
  title        = {Learning deep transformer models for machine translation},
  booktitle    = {Proceedings of the 57th {A}nnual {M}eeting of the {A}ssociation for {C}omputational {L}inguistics},
  pages        = {1810--1822},
  year         = {2019},
}

@article{wilson2020bayesian,
  author       = {Wilson, Andrew G and Izmailov, Pavel},
  title        = {Bayesian deep learning and a probabilistic perspective of generalization},
  journal      = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume       = {33},
  pages        = {4697--4708},
  year         = {2020},
}

@article{lewis2019bart,
  author       = {Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Ves and Zettlemoyer, Luke},
  title        = {{BART}: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension},
  journal      = {Ar{X}iv:1910.13461},
  year         = {2019},
}

@inproceedings{clark2019electra,
  author       = {Clark, Kevin and Luong, Minh-Thang and Le, Quoc V and Manning, Christopher D},
  title        = {{ELECTRA}: Pre-training text encoders as discriminators rather than generators},
  booktitle    = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year         = {2020},
}

@article{yang2019xlnet,
  author       = {Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Russ R and Le, Quoc V},
  title        = {{XLN}et: Generalized autoregressive pretraining for language understanding},
  journal      = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume       = {32},
  year         = {2019},
}

@article{lan2019albert,
  author       = {Lan, Zhenzhong and Chen, Mingda and Goodman, Sebastian and Gimpel, Kevin and Sharma, Piyush and Soricut, Radu},
  title        = {{ALBERT}: A lite {BERT} for self-supervised learning of language representations},
  journal      = {Ar{X}iv:1909.11942},
  year         = {2019},
}

@article{sanh2019distilbert,
  author       = {Sanh, Victor and Debut, Lysandre and Chaumond, Julien and Wolf, Thomas},
  title        = {Distil{BERT}, a distilled version of {BERT}: smaller, faster, cheaper and lighter},
  journal      = {Ar{X}iv:1910.01108},
  year         = {2019},
}

@inproceedings{he2022masked,
  author       = {He, Kaiming and Chen, Xinlei and Xie, Saining and Li, Yanghao and Doll{\'a}r, Piotr and Girshick, Ross},
  title        = {Masked autoencoders are scalable vision learners},
  booktitle    = {Proceedings of the {IEEE}/{CVF} {C}onference on {C}omputer {V}ision and {P}attern {R}ecognition},
  pages        = {16000--16009},
  year         = {2022},
}

@inproceedings{chen2020generative,
  author       = {Chen, Mark and Radford, Alec and Child, Rewon and Wu, Jeffrey and Jun, Heewoo and Luan, David and Sutskever, Ilya},
  title        = {Generative pretraining from pixels},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {1691--1703},
  year         = {2020},
  organization = {PMLR},
}

@article{kaplan2020scaling,
  author       = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B and Chess, Benjamin and Child, Rewon and Gray, Scott and Radford, Alec and Wu, Jeffrey and Amodei, Dario},
  title        = {Scaling laws for neural language models},
  journal      = {Ar{X}iv:2001.08361},
  year         = {2020},
}

@article{rae2021scaling,
  author       = {Rae, Jack W and Borgeaud, Sebastian and Cai, Trevor and Millican, Katie and Hoffmann, Jordan and Song, Francis and Aslanides, John and Henderson, Sarah and Ring, Roman and Young, Susannah and et al.},
  title        = {Scaling language models: Methods, analysis \& insights from training {G}opher},
  journal      = {Ar{X}iv:2112.11446},
  year         = {2021},
}

@article{du2021glam,
  author       = {Du, Nan and Huang, Yanping and Dai, Andrew M and Tong, Simon and Lepikhin, Dmitry and Xu, Yuanzhong and Krikun, Maxim and Zhou, Yanqi and Yu, Adams Wei and Firat, Orhan and et al.},
  title        = {{GL}a{M}: Efficient scaling of language models with mixture-of-experts},
  journal      = {Ar{X}iv:2112.06905},
  year         = {2021},
}

@article{shoeybi2019megatron,
  author       = {Shoeybi, Mohammad and Patwary, Mostofa and Puri, Raul and LeGresley, Patrick and Casper, Jared and Catanzaro, Bryan},
  title        = {Megatron-{LM}: Training multi-billion parameter language models using model parallelism},
  journal      = {Ar{X}iv:1909.08053},
  year         = {2019},
}

@article{smith2022using,
  author       = {Smith, Shaden and Patwary, Mostofa and Norick, Brandon and LeGresley, Patrick and Rajbhandari, Samyam and Casper, Jared and Liu, Zhun and Prabhumoye, Shrimai and Zerveas, George and Korthikanti, Vijay and et al.},
  title        = {Using {D}eep{S}peed and {M}egatron to train {M}egatron-{T}uring {NLG} 530{B}, a large-scale generative language model},
  journal      = {Ar{X}iv:2201.11990},
  year         = {2022},
}

@article{thoppilan2022lamda,
  author       = {Thoppilan, Romal and De Freitas, Daniel and Hall, Jamie and Shazeer, Noam and Kulshreshtha, Apoorv and Cheng, Heng-Tze and Jin, Alicia and Bos, Taylor and Baker, Leslie and Du, Yu and et al.},
  title        = {La{MDA}: Language models for dialog applications},
  journal      = {Ar{X}iv:2201.08239},
  year         = {2022},
}

@article{hoffmann2022training,
  author       = {Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and Buchatskaya, Elena and Cai, Trevor and Rutherford, Eliza and Casas, Diego de Las and Hendricks, Lisa Anne and Welbl, Johannes and Clark, Aidan and et al.},
  title        = {Training compute-optimal large language models},
  journal      = {Ar{X}iv:2203.15556},
  year         = {2022},
}

@article{zhang2022opt,
  author       = {Zhang, Susan and Roller, Stephen and Goyal, Naman and Artetxe, Mikel and Chen, Moya and Chen, Shuohui and Dewan, Christopher and Diab, Mona and Li, Xian and Lin, Xi Victoria and et al.},
  title        = {{OPT}: Open pre-trained transformer language models},
  journal      = {Ar{X}iv:2205.01068},
  year         = {2022},
}

@article{chowdhery2022palm,
  author       = {Chowdhery, Aakanksha and Narang, Sharan and Devlin, Jacob and Bosma, Maarten and Mishra, Gaurav and Roberts, Adam and Barham, Paul and Chung, Hyung Won and Sutton, Charles and Gehrmann, Sebastian and et al.},
  title        = {Pa{LM}: Scaling language modeling with {P}athways},
  journal      = {Ar{X}iv:2204.02311},
  year         = {2022},
}

@article{hernandez2021scaling,
  author       = {Hernandez, Danny and Kaplan, Jared and Henighan, Tom and McCandlish, Sam},
  title        = {Scaling laws for transfer},
  journal      = {Ar{X}iv:2102.01293},
  year         = {2021},
}

@article{tay2021scale,
  author       = {Tay, Yi and Dehghani, Mostafa and Rao, Jinfeng and Fedus, William and Abnar, Samira and Chung, Hyung Won and Narang, Sharan and Yogatama, Dani and Vaswani, Ashish and Metzler, Donald},
  title        = {Scale efficiently: Insights from pre-training and fine-tuning transformers},
  journal      = {Ar{X}iv:2109.10686},
  year         = {2021},
}

@article{wei2022emergent,
  author       = {Wei, Jason and Tay, Yi and Bommasani, Rishi and Raffel, Colin and Zoph, Barret and Borgeaud, Sebastian and Yogatama, Dani and Bosma, Maarten and Zhou, Denny and Metzler, Donald and et al.},
  title        = {Emergent abilities of large language models},
  journal      = {Ar{X}iv:2206.07682},
  year         = {2022},
}

@inproceedings{radford2021learning,
  author       = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and et al.},
  title        = {Learning transferable visual models from natural language supervision},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {8748--8763},
  year         = {2021},
  organization = {PMLR},
}

@inproceedings{ramesh2021zero,
  author       = {Ramesh, Aditya and Pavlov, Mikhail and Goh, Gabriel and Gray, Scott and Voss, Chelsea and Radford, Alec and Chen, Mark and Sutskever, Ilya},
  title        = {Zero-shot text-to-image generation},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {8821--8831},
  year         = {2021},
  organization = {PMLR},
}

@article{ramesh2022hierarchical,
  author       = {Ramesh, Aditya and Dhariwal, Prafulla and Nichol, Alex and Chu, Casey and Chen, Mark},
  title        = {Hierarchical text-conditional image generation with {CLIP} latents},
  journal      = {Ar{X}iv:2204.06125},
  year         = {2022},
}

@article{alayrac2022flamingo,
  author       = {Alayrac, Jean-Baptiste and Donahue, Jeff and Luc, Pauline and Miech, Antoine and Barr, Iain and Hasson, Yana and Lenc, Karel and Mensch, Arthur and Millican, Katie and Reynolds, Malcolm and et al.},
  title        = {Flamingo: a visual language model for few-shot learning},
  journal      = {Ar{X}iv:2204.14198},
  year         = {2022},
}

@article{saharia2022photorealistic,
  author       = {Saharia, Chitwan and Chan, William and Saxena, Saurabh and Li, Lala and Whang, Jay and Denton, Emily and Ghasemipour, Seyed Kamyar Seyed and Ayan, Burcu Karagol and Mahdavi, S Sara and Lopes, Rapha Gontijo and et al.},
  title        = {Photorealistic text-to-image diffusion models with deep language understanding},
  journal      = {Ar{X}iv:2205.11487},
  year         = {2022},
}

@article{reed2022generalist,
  author       = {Reed, Scott and Zolna, Konrad and Parisotto, Emilio and Colmenarejo, Sergio Gomez and Novikov, Alexander and Barth-Maron, Gabriel and Gimenez, Mai and Sulsky, Yury and Kay, Jackie and Springenberg, Jost Tobias and et al.},
  title        = {A generalist agent},
  journal      = {Ar{X}iv:2205.06175},
  year         = {2022},
}

@article{fedus2022switch,
  author       = {Fedus, William and Zoph, Barret and Shazeer, Noam},
  title        = {Switch transformers: scaling to trillion parameter models with simple and efficient sparsity},
  journal      = {{J}ournal of {M}achine {L}earning {R}esearch},
  volume       = {23},
  number       = {120},
  pages        = {1--39},
  year         = {2022},
}

@article{child2019generating,
  author       = {Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya},
  title        = {Generating long sequences with sparse transformers},
  journal      = {Ar{X}iv:1904.10509},
  year         = {2019},
}

@article{joshi2020spanbert,
  author       = {Joshi, Mandar and Chen, Danqi and Liu, Yinhan and Weld, Daniel S and Zettlemoyer, Luke and Levy, Omer},
  title        = {Span{BERT}: Improving pre-training by representing and predicting spans},
  journal      = {{T}ransactions of the {A}ssociation for {C}omputational {L}inguistics},
  volume       = {8},
  pages        = {64--77},
  year         = {2020},
  publisher    = {MIT {P}ress},
}

@article{yu2022scaling,
  author       = {Yu, Jiahui and Xu, Yuanzhong and Koh, Jing Yu and Luong, Thang and Baid, Gunjan and Wang, Zirui and Vasudevan, Vijay and Ku, Alexander and Yang, Yinfei and Karagol Ayan, Burcu and Hutchinson, Ben and Han, Wei and Parekh, Zarana and Li, Xin and Zhang, Han and Baldridge, Jason and Wu, Yonghui},
  title        = {Scaling autoregressive models for content-rich text-to-image generation},
  journal      = {Ar{X}iv:2206.10789},
  year         = {2022},
}

@article{srivastava2022beyond,
  author       = {Srivastava, Aarohi and Rastogi, Abhinav and Rao, Abhishek and Shoeb, Abu Awal Md and Abid, Abubakar and Fisch, Adam and Brown, Adam R and Santoro, Adam and Gupta, Aditya and Garriga-Alonso, Adri{\`a} and et al.},
  title        = {Beyond the imitation game: quantifying and extrapolating the capabilities of language models},
  journal      = {Ar{X}iv:2206.04615},
  year         = {2022},
}

@article{lewkowycz2022solving,
  author       = {Lewkowycz, Aitor and Andreassen, Anders and Dohan, David and Dyer, Ethan and Michalewski, Henryk and Ramasesh, Vinay and Slone, Ambrose and Anil, Cem and Schlag, Imanol and Gutman-Solo, Theo and et al.},
  title        = {Solving quantitative reasoning problems with language models},
  journal      = {Ar{X}iv:2206.14858},
  year         = {2022},
}

@article{mnih2015human,
  author       = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A and Veness, Joel and Bellemare, Marc G and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K and Ostrovski, Georg and et al.},
  title        = {Human-level control through deep reinforcement learning},
  journal      = {{N}ature},
  volume       = {518},
  number       = {7540},
  pages        = {529--533},
  year         = {2015},
  publisher    = {Nature Publishing Group},
}

@article{hartley2009global,
  author       = {Hartley, Richard I and Kahl, Fredrik},
  title        = {Global optimization through rotation space search},
  journal      = {International {J}ournal of {C}omputer {V}ision},
  volume       = {82},
  number       = {1},
  pages        = {64--79},
  year         = {2009},
  publisher    = {Springer},
}

@inproceedings{wu2018shift,
  author    = {Wu, Bichen and Wan, Alvin and Yue, Xiangyu and Jin, Peter and Zhao, Sicheng and Golmant, Noah and Gholaminejad, Amir and Gonzalez, Joseph and Keutzer, Kurt},
  title     = {Shift: A zero {FLOP}, zero parameter alternative to spatial convolutions},
  booktitle = {Proceedings of the {IEEE} {C}onference on {C}omputer {V}ision and {P}attern {R}ecognition},
  pages     = {9127--9135},
  year      = {2018}
}

@article{bommasani2021opportunities,
  author  = {Bommasani, Rishi and Hudson, Drew A and Adeli, Ehsan and Altman, Russ and Arora, Simran and von Arx, Sydney and Bernstein, Michael S and Bohg, Jeannette and Bosselut, Antoine and Brunskill, Emma and others},
  title   = {On the opportunities and risks of foundation models},
  journal = {Ar{X}iv:2108.07258},
  year    = {2021}
}

@article{schuhmann2022laion,
  author  = {Schuhmann, Christoph and Beaumont, Romain and Vencu, Richard and Gordon, Cade and Wightman, Ross and Cherti, Mehdi and Coombes, Theo and Katta, Aarush and Mullis, Clayton and Wortsman, Mitchell and others},
  title   = {{LAION}-5{B}: An open large-scale dataset for training next generation image-text models},
  journal = {Ar{X}iv:2210.08402},
  year    = {2022}
}

@inproceedings{le2013building,
  author       = {Le, Quoc V},
  title        = {Building high-level features using large scale unsupervised learning},
  booktitle    = {Proceedings of the {IEEE} {I}nternational {C}onference on {A}coustics, {S}peech and {S}ignal {P}rocessing},
  pages        = {8595--8598},
  year         = {2013},
  organization = {IEEE}
}

@article{olshausen1996emergence,
  author    = {Olshausen, Bruno A and Field, David J},
  title     = {Emergence of simple-cell receptive field properties by learning a sparse code for natural images},
  journal   = {Nature},
  volume    = {381},
  number    = {6583},
  pages     = {607--609},
  year      = {1996},
  publisher = {Nature Publishing Group}
}

@book{Vapnik95,
  author    = {V. Vapnik},
  title     = {The {N}ature of {S}tatistical {L}earning {T}heory},
  publisher = {Springer},
  address   = {New York},
  year      = {1995}
}

@inproceedings{Novikoff62,
  author       = {A.~B.~J.~Novikoff},
  title        = {On convergence proofs on perceptrons},
  booktitle    = {Proceedings of the {S}ymposium on the {M}athematical {T}heory of {A}utomata},
  organization = {Polytechnic {I}nstitute of {B}rooklyn},
  pages        = {615--622},
  year         = {1962}
}

@article{ba2016layer,
  author  = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
  title   = {Layer normalization},
  journal = {Ar{X}iv:1607.06450},
  year    = {2016}
}

@article{anil2020scalable,
  author  = {Anil, Rohan and Gupta, Vineet and Koren, Tomer and Regan, Kevin and Singer, Yoram},
  title   = {Scalable second-order optimization for deep learning},
  journal = {Ar{X}iv:2002.09018},
  year    = {2020}
}

@article{prakash2016neural,
  author  = {Prakash, Aaditya and Hasan, Sadid A and Lee, Kathy and Datla, Vivek and Qadir, Ashequl and Liu, Joey and Farri, Oladimeji},
  title   = {Neural paraphrase generation with stacked residual {LSTM} networks},
  journal = {Ar{X}iv:1610.03098},
  year    = {2016}
}

@article{kim2017residual,
  author  = {Kim, Jaeyoung and El-Khamy, Mostafa and Lee, Jungwon},
  title   = {Residual {LSTM}: Design of a deep recurrent architecture for distant speech recognition},
  journal = {Ar{X}iv:1701.03360},
  year    = {2017}
}

@article{russakovsky2015imagenet,
  author    = {Russakovsky, Olga and Deng, Jia and Su, Hao and Krause, Jonathan and Satheesh, Sanjeev and Ma, Sean and Huang, Zhiheng and Karpathy, Andrej and Khosla, Aditya and Bernstein, Michael and others},
  title     = {Image{N}et large scale visual recognition challenge},
  journal   = {International {J}ournal of {C}omputer {V}ision},
  volume    = {115},
  number    = {3},
  pages     = {211--252},
  year      = {2015},
  publisher = {Springer}
}

@article{forrester2007multi,
  author    = {Forrester, Alexander I J and S{\'o}bester, Andr{\'a}s and Keane, Andy J},
  title     = {Multi-fidelity optimization via surrogate modelling},
  journal   = {Proceedings of the {R}oyal {S}ociety {A}: {M}athematical, {P}hysical and {E}ngineering {S}ciences},
  volume    = {463},
  number    = {2088},
  pages     = {3251--3269},
  year      = {2007},
  publisher = {The Royal Society London}
}

@article{feurer-arxiv22,
  title   = {Practical transfer learning for {B}ayesian optimization},
  author  = {M. Feurer and B. Letham and F. Hutter and E. Bakshy},
  year    = {2022},
  journal = {Ar{X}iv:1802.02219 [stat.ML]}
}

@article{wistuba-ml18,
  author  = {M. Wistuba and N. Schilling and L. Schmidt-Thieme},
  title   = {Scalable {G}aussian process-based transfer surrogates for hyperparameter optimization},
  journal = {Machine {L}earning},
  volume  = {107},
  number  = {1},
  pages   = {43--78},
  year    = {2018}
}

@inproceedings{bardenet-icml13a,
  title     = {Collaborative hyperparameter tuning},
  author    = {R. Bardenet and M. Brendel and B. K{\'e}gl and M. Sebag},
  year      = {2013},
  booktitle = {Proceedings of the 30th {I}nternational {C}onference on {M}achine {L}earning ({ICML}'13)}
}

@InProceedings{jenatton-icml17a,
  author    = {R. Jenatton and C. Archambeau and J. Gonz{\'a}lez and M. Seeger},
  title     = {Bayesian optimization with tree-structured dependencies},
  booktitle = {Proceedings of the 34th {I}nternational {C}onference on {M}achine {L}earning ({ICML}'17)},
  year      = {2017}
}

@article{li-arxiv18,
  author  = {L. Li and K. Jamieson and A. Rostamizadeh and K. Gonina and M. Hardt and B. Recht and A. Talwalkar},
  title   = {Massively parallel hyperparameter tuning},
  journal = {Ar{X}iv:1810.05934},
  year    = {2018}
}

@article{bellman-science66,
  author  = {R. Bellman},
  title   = {Dynamic programming},
  journal = {Science},
  volume  = {153},
  pages   = {34--37},
  year    = {1966}
}

@inproceedings{li-iclr17,
  title     = {Hyperband: Bandit-based configuration evaluation for hyperparameter optimization},
  author    = {L. Li and K. Jamieson and G. DeSalvo and A. Rostamizadeh and A. Talwalkar},
  year      = {2017},
  booktitle = {{I}nternational {C}onference on {L}earning {R}epresentations ({ICLR}'17)}
}

@inproceedings{karnin-icml13,
  title     = {Almost optimal exploration in multi-armed bandits},
  author    = {Z. Karnin and T. Koren and O. Somekh},
  year      = {2013},
  booktitle = {Proceedings of the 30th {I}nternational {C}onference on {M}achine {L}earning ({ICML}'13)}
}

@inproceedings{jamieson-aistats16,
  author    = {K. Jamieson and A. Talwalkar},
  title     = {Non-stochastic best arm identification and hyperparameter optimization},
  booktitle = {Proceedings of the 19th {I}nternational {C}onference on {A}rtificial {I}ntelligence and {S}tatistics},
  year      = {2016}
}

@inproceedings{akiba-sigkdd19,
  author    = {T. Akiba and S. Sano and T. Yanase and T. Ohta and M. Koyama},
  title     = {{Optuna}: A next-generation hyperparameter optimization framework},
  year      = {2019},
  booktitle = {Proceedings of the 25th {ACM} {SIGKDD} {I}nternational {C}onference on {K}nowledge {D}iscovery \& {D}ata {M}ining}
}

@article{liaw-arxiv18,
  title   = {{Tune}: A research platform for distributed model selection and training},
  author  = {R. Liaw and E. Liang and R. Nishihara and P. Moritz and J. Gonzalez and I. Stoica},
  year    = {2018},
  journal = {Ar{X}iv:1807.05118}
}

@inproceedings{salinas-automl22,
  author    = {D. Salinas and M. Seeger and A. Klein and V. Perrone and M. Wistuba and C. Archambeau},
  title     = {Syne {T}une: A library for Large Scale Hyperparameter Tuning and Reproducible Research},
  year      = {2022},
  booktitle = {First {C}onference on {A}utomated {M}achine {L}earning}
}

@inproceedings{baptista-icml18a,
  author    = {R. Baptista and M. Poloczek},
  title     = {{B}ayesian optimization of combinatorial structures},
  year      = {2018},
  booktitle = {Proceedings of the 35th {I}nternational {C}onference on {M}achine {L}earning}
}

@inproceedings{hutter-lion11a,
  title     = {Sequential Model-Based Optimization for General Algorithm Configuration},
  author    = {F. Hutter and H. Hoos and K. Leyton-Brown},
  year      = {2011},
  booktitle = {Proceedings of the Fifth {I}nternational {C}onference on {L}earning and {I}ntelligent {O}ptimization ({LION}'11)}
}

@inproceedings{franceschi-icml17a,
  author    = {L. Franceschi and M. Donini and P. Frasconi and M. Pontil},
  title     = {Forward and Reverse Gradient-Based Hyperparameter Optimization},
  year      = {2017},
  booktitle = {Proceedings of the 34th {I}nternational {C}onference on {M}achine {L}earning ({ICML}'17)}
}

@inproceedings{maclaurin-icml15,
  author    = {D. Maclaurin and D. Duvenaud and R. Adams},
  title     = {Gradient-based Hyperparameter Optimization through Reversible Learning},
  year      = {2015},
  booktitle = {Proceedings of the 32nd {I}nternational {C}onference on {M}achine {L}earning ({ICML}'15)}
}

@book{hutter-book19a,
  title     = {Automated {M}achine {L}earning: {M}ethods, {S}ystems, {C}hallenges},
  editor    = {F. Hutter and L. Kotthoff and J. Vanschoren},
  year      = {2019},
  publisher = {Springer}
}

@article{wistuba-arxiv19,
  title   = {A survey on neural architecture search},
  author  = {M. Wistuba and A. Rawat and T. Pedapati},
  year    = {2019},
  journal = {Ar{X}iv:1905.01392 [cs.LG]}
}

@article{elsken-arxiv18a,
  author  = {T. Elsken and J. H. Metzen and F. Hutter},
  title   = {Neural architecture search: A survey},
  journal = {Ar{X}iv:1808.05377 [stat.ML]},
  year    = {2018}
}

@incollection{feurer-automlbook18a,
  author    = {M. Feurer and F. Hutter},
  title     = {Hyperparameter optimization},
  booktitle = {Automatic {M}achine {L}earning: {M}ethods, {S}ystems, {C}hallenges},
  year      = {2018},
  publisher = {Springer}
}

@inproceedings{snoek-nips12,
  author    = {Snoek, J. and Larochelle, H. and Adams, R.},
  title     = {Practical {B}ayesian Optimization of Machine Learning Algorithms},
  booktitle = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems 25},
  pages     = {2951--2959},
  year      = {2012}
}

@article{bergstra2011algorithms,
  author  = {Bergstra, James and Bardenet, R{\'e}mi and Bengio, Yoshua and K{\'e}gl, Bal{\'a}zs},
  title   = {Algorithms for hyper-parameter optimization},
  journal = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume  = {24},
  year    = {2011}
}

@article{beltagy2020longformer,
  author  = {Beltagy, Iz and Peters, Matthew E and Cohan, Arman},
  title   = {Longformer: The long-document transformer},
  journal = {Ar{X}iv:2004.05150},
  year    = {2020}
}

@article{gulati2020conformer,
  author  = {Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and others},
  title   = {Conformer: Convolution-augmented Transformer for Speech Recognition},
  journal = {Proc. {I}nterspeech 2020},
  pages   = {5036--5040},
  year    = {2020}
}

@article{chen2021decision,
  author  = {Chen, Lili and Lu, Kevin and Rajeswaran, Aravind and Lee, Kimin and Grover, Aditya and Laskin, Misha and Abbeel, Pieter and Srinivas, Aravind and Mordatch, Igor},
  title   = {Decision transformer: Reinforcement learning via sequence modeling},
  journal = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume  = {34},
  pages   = {15084--15097},
  year    = {2021}
}

@article{dwivedi2020generalization,
  author  = {Dwivedi, Vijay Prakash and Bresson, Xavier},
  title   = {A generalization of transformer networks to graphs},
  journal = {Ar{X}iv:2012.09699},
  year    = {2020}
}

@techreport{wolpert1995no,
  author      = {Wolpert, David H and Macready, William G},
  title       = {No free lunch theorems for search},
  institution = {{S}anta {F}e {I}nstitute},
  number      = {SFI-TR-95-02-010},
  year        = {1995}
}

@inproceedings{rezende2015variational,
  author       = {Rezende, Danilo and Mohamed, Shakir},
  title        = {Variational inference with normalizing flows},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {1530--1538},
  year         = {2015},
  organization = {PMLR}
}

@inproceedings{sohl2015deep,
  author       = {Sohl-Dickstein, Jascha and Weiss, Eric and Maheswaranathan, Niru and Ganguli, Surya},
  title        = {Deep unsupervised learning using nonequilibrium thermodynamics},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {2256--2265},
  year         = {2015},
  organization = {PMLR}
}

@article{ho2020denoising,
  author  = {Ho, Jonathan and Jain, Ajay and Abbeel, Pieter},
  title   = {Denoising diffusion probabilistic models},
  journal = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume  = {33},
  pages   = {6840--6851},
  year    = {2020}
}

@article{song2019generative,
  author  = {Song, Yang and Ermon, Stefano},
  title   = {Generative modeling by estimating gradients of the data distribution},
  journal = {Advances in {N}eural {I}nformation {P}rocessing {S}ystems},
  volume  = {32},
  year    = {2019}
}

@article{BellmanDPPaper,
  title   = {On the Theory of Dynamic Programming},
  author  = {Richard Bellman},
  year    = {1952},
  journal = {Proceedings of the {N}ational {A}cademy of {S}ciences},
  volume  = {38},
  number  = {8},
  pages   = {716--719}
}

@article{BellmanMDP,
  author    = {Richard Bellman},
  title     = {A {M}arkovian Decision Process},
  journal   = {Journal of {M}athematics and {M}echanics},
  volume    = {6},
  number    = {5},
  pages     = {679--684},
  year      = {1957},
  publisher = {Indiana University Mathematics Department},
  issn      = {00959057, 19435274},
  url       = {http://www.jstor.org/stable/24900506},
  urldate   = {2022-11-28}
}

@book{BellmanDPBook,
  author    = {Richard Bellman},
  title     = {{Dynamic Programming}},
  publisher = {Dover Publications},
  series    = {Dover},
  year      = {1957},
  isbn      = {9780486317199}
}

@article{mnih2013playing,
  author  = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin},
  title   = {Playing {A}tari with deep reinforcement learning},
  journal = {Ar{X}iv:1312.5602},
  year    = {2013}
}

@article{ouyang2022training,
  author  = {Ouyang, Long and Wu, Jeff and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll L and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
  title   = {Training language models to follow instructions with human feedback},
  journal = {Ar{X}iv:2203.02155},
  year    = {2022}
}

@article{wei2022chain,
  author  = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Chi, Ed and Le, Quoc and Zhou, Denny},
  title   = {Chain of thought prompting elicits reasoning in large language models},
  journal = {Ar{X}iv:2201.11903},
  year    = {2022}
}

@inproceedings{zhang2023automatic,
  author    = {Zhang, Zhuosheng and Zhang, Aston and Li, Mu and Smola, Alex},
  title     = {Automatic chain of thought prompting in large language models},
  booktitle = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year      = {2023}
}

@inproceedings{wang2022removing,
  author       = {Wang, Haotao and Zhang, Aston and Zheng, Shuai and Shi, Xingjian and Li, Mu and Wang, Zhangyang},
  title        = {Removing batch normalization boosts adversarial training},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {23433--23445},
  year         = {2022},
  organization = {PMLR}
}

@article{zhang2023multicot,
  author  = {Zhang, Zhuosheng and Zhang, Aston and Li, Mu and Zhao, Hai and Karypis, George and Smola, Alex},
  title   = {Multimodal Chain-of-Thought Reasoning in Language Models},
  journal = {Ar{X}iv:2302.00923},
  year    = {2023}
}

@article{kojima2022large,
  author  = {Kojima, Takeshi and Gu, Shixiang Shane and Reid, Machel and Matsuo, Yutaka and Iwasawa, Yusuke},
  title   = {Large Language Models are Zero-Shot Reasoners},
  journal = {Ar{X}iv:2205.11916},
  year    = {2022}
}

@inproceedings{wang2023self,
  author    = {Wang, Xuezhi and Wei, Jason and Schuurmans, Dale and Le, Quoc and Chi, Ed and Zhou, Denny},
  title     = {Self-Consistency Improves Chain of Thought Reasoning in Language Models},
  booktitle = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year      = {2023}
}

@inproceedings{zhou2023least,
  author    = {Zhou, Denny and Sch{\"a}rli, Nathanael and Hou, Le and Wei, Jason and Scales, Nathan and Wang, Xuezhi and Schuurmans, Dale and Bousquet, Olivier and Le, Quoc and Chi, Ed},
  title     = {Least-to-most prompting enables complex reasoning in large language models},
  booktitle = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year      = {2023}
}

@inproceedings{rezende2014stochastic,
  author       = {Rezende, Danilo Jimenez and Mohamed, Shakir and Wierstra, Daan},
  title        = {Stochastic backpropagation and approximate inference in deep generative models},
  booktitle    = {{I}nternational {C}onference on {M}achine {L}earning},
  pages        = {1278--1286},
  year         = {2014},
  organization = {PMLR}
}

@inproceedings{song2021score,
  author    = {Song, Yang and Sohl-Dickstein, Jascha and Kingma, Diederik P and Kumar, Abhishek and Ermon, Stefano and Poole, Ben},
  title     = {Score-Based Generative Modeling through Stochastic Differential Equations},
  booktitle = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year      = {2021}
}

@article{dinh2014nice,
  author  = {Dinh, Laurent and Krueger, David and Bengio, Yoshua},
  title   = {{NICE}: Non-linear independent components estimation},
  journal = {Ar{X}iv:1410.8516},
  year    = {2014}
}

@InProceedings{dinh2017density,
  author    = {Dinh, Laurent and Sohl-Dickstein, Jascha and Bengio, Samy},
  title     = {Density estimation using {Real NVP}},
  booktitle = {{I}nternational {C}onference on {L}earning {R}epresentations},
  year      = {2017}
}

@article{openai2023gpt4,
  author  = {Open{AI}},
  title   = {{GPT}-4 {T}echnical {R}eport},
  journal = {Ar{X}iv:2303.08774},
  year    = {2023}
}

@article{anil2023palm,
  author  = {Anil, Rohan and Dai, Andrew M and Firat, Orhan and Johnson, Melvin and Lepikhin, Dmitry and Passos, Alexandre and Shakeri, Siamak and Taropa, Emanuel and Bailey, Paige and Chen, Zhifeng and others},
  title   = {Pa{LM} 2 {T}echnical {R}eport},
  journal = {Ar{X}iv:2305.10403},
  year    = {2023}
}

@article{wei2021finetuned,
  author  = {Wei, Jason and Bosma, Maarten and Zhao, Vincent Y and Guu, Kelvin and Yu, Adams Wei and Lester, Brian and Du, Nan and Dai, Andrew M and Le, Quoc V},
  title   = {Finetuned language models are zero-shot learners},
  journal = {Ar{X}iv:2109.01652},
  year    = {2021}
}

@article{sanh2021multitask,
  author  = {Sanh, Victor and Webson, Albert and Raffel, Colin and Bach, Stephen H and Sutawika, Lintang and Alyafeai, Zaid and Chaffin, Antoine and Stiegler, Arnaud and Le Scao, Teven and Raja, Arun and others},
  title   = {Multitask prompted training enables zero-shot task generalization},
  journal = {Ar{X}iv:2110.08207},
  year    = {2021}
}

@article{bai2022constitutional,
  author  = {Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and Kernion, Jackson and Jones, Andy and Chen, Anna and Goldie, Anna and Mirhoseini, Azalia and McKinnon, Cameron and others},
  title   = {Constitutional {AI}: Harmlessness from {AI} feedback},
  journal = {Ar{X}iv:2212.08073},
  year    = {2022}
}

@article{taylor2022galactica,
  author  = {Taylor, Ross and Kardas, Marcin and Cucurull, Guillem and Scialom, Thomas and Hartshorn, Anthony and Saravia, Elvis and Poulton, Andrew and Kerkez, Viktor and Stojnic, Robert},
  title   = {Galactica: A large language model for science},
  journal = {Ar{X}iv:2211.09085},
  year    = {2022}
}

@article{touvron2023llama,
  author  = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and others},
  title   = {L{L}a{MA}: Open and efficient foundation language models},
  journal = {Ar{X}iv:2302.13971},
  year    = {2023a}
}

@article{touvron2023llama2,
  author  = {Touvron, Hugo and Martin, Louis and Stone, Kevin and Albert, Peter and Almahairi, Amjad and Babaei, Yasmine and Bashlykov, Nikolay and Batra, Soumya and Bhargava, Prajjwal and Bhosale, Shruti and others},
  title   = {L{L}a{MA} 2: Open foundation and fine-tuned chat models},
  journal = {Ar{X}iv:2307.09288},
  year    = {2023b}
}

@article{qin2023chatgpt,
  author  = {Qin, Chengwei and Zhang, Aston and Zhang, Zhuosheng and Chen, Jiaao and Yasunaga, Michihiro and Yang, Diyi},
  title   = {Is {C}hat{GPT} a general-purpose natural language processing task solver?},
  journal = {Ar{X}iv:2302.06476},
  year    = {2023}
}

@article{scao2022bloom,
  author  = {Le Scao, Teven and Fan, Angela and Akiki, Christopher and Pavlick, Ellie and Ili{\'c}, Suzana and Hesslow, Daniel and Castagn{\'e}, Roman and Luccioni, Alexandra Sasha and Yvon, Fran{\c{c}}ois and Gall{\'e}, Matthias and others},
  title   = {{BLOOM}: A 176{B}-parameter open-access multilingual language model},
  journal = {Ar{X}iv:2211.05100},
  year    = {2022}
}

@article{penedo2023refinedweb,
  author  = {Penedo, Guilherme and Malartic, Quentin and Hesslow, Daniel and Cojocaru, Ruxandra and Cappelli, Alessandro and Alobeidli, Hamza and Pannier, Baptiste and Almazrouei, Ebtesam and Launay, Julien},
  title   = {The {R}efined{W}eb dataset for {F}alcon {LLM}: outperforming curated corpora with web data, and web data only},
  journal = {Ar{X}iv:2306.01116},
  year    = {2023}
}