d2l.bib


@inproceedings{Ahmed.Aly.Gonzalez.ea.2012,
  author       = {Ahmed, Amr and Aly, Mohamed and Gonzalez, Joseph and Narayanamurthy, Shravan and Smola, Alexander J},
  title        = {Scalable inference in latent variable models},
  booktitle    = {Proceedings of the fifth ACM international conference on Web search and data mining},
  pages        = {123--132},
  organization = {ACM},
  year         = {2012},
}

@article{Aji.McEliece.2000,
  author    = {Aji, Srinivas M and McEliece, Robert J},
  title     = {The generalized distributive law},
  journal   = {IEEE Transactions on Information Theory},
  volume    = {46},
  number    = {2},
  pages     = {325--343},
  publisher = {IEEE},
  year      = {2000},
}

@article{Ba.Kiros.Hinton.2016,
  author  = {Ba, Jimmy Lei and Kiros, Jamie Ryan and Hinton, Geoffrey E},
  title   = {Layer normalization},
  journal = {arXiv preprint arXiv:1607.06450},
  year    = {2016},
}

@article{Bahdanau.Cho.Bengio.2014,
  author  = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  title   = {Neural machine translation by jointly learning to align and translate},
  journal = {arXiv preprint arXiv:1409.0473},
  year    = {2014},
}

@inproceedings{Bay.Tuytelaars.Van-Gool.2006,
  author       = {Bay, Herbert and Tuytelaars, Tinne and Van Gool, Luc},
  title        = {{SURF}: Speeded up robust features},
  booktitle    = {European conference on computer vision},
  pages        = {404--417},
  organization = {Springer},
  year         = {2006},
}

@article{Bengio.Ducharme.Vincent.ea.2003,
  author  = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Jauvin, Christian},
  title   = {A neural probabilistic language model},
  journal = {Journal of Machine Learning Research},
  volume  = {3},
  number  = {Feb},
  pages   = {1137--1155},
  year    = {2003},
}

@article{Bishop.1995,
  author    = {Bishop, Chris M},
  title     = {Training with noise is equivalent to {Tikhonov} regularization},
  journal   = {Neural computation},
  volume    = {7},
  number    = {1},
  pages     = {108--116},
  publisher = {MIT Press},
  year      = {1995},
}

@book{Bishop.2006,
  author    = {Bishop, Christopher M},
  title     = {Pattern recognition and machine learning},
  publisher = {Springer},
  year      = {2006},
}

@inproceedings{Bodla.Singh.Chellappa.ea.2017,
  author    = {Bodla, Navaneeth and Singh, Bharat and Chellappa, Rama and Davis, Larry S},
  title     = {{Soft-NMS}--improving object detection with one line of code},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages     = {5561--5569},
  year      = {2017},
}

@article{Bojanowski.Grave.Joulin.ea.2017,
  author    = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
  title     = {Enriching word vectors with subword information},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {5},
  pages     = {135--146},
  publisher = {MIT Press},
  year      = {2017},
}

@book{Bollobas.1999,
  author    = {Bollob{\'a}s, B{\'e}la},
  title     = {Linear analysis},
  publisher = {Cambridge University Press},
  address   = {Cambridge},
  year      = {1999},
}

@article{Bowman.Angeli.Potts.ea.2015,
  author  = {Bowman, Samuel R and Angeli, Gabor and Potts, Christopher and Manning, Christopher D},
  title   = {A large annotated corpus for learning natural language inference},
  journal = {arXiv preprint arXiv:1508.05326},
  year    = {2015},
}

@book{Boyd.Vandenberghe.2004,
  author    = {Boyd, Stephen and Vandenberghe, Lieven},
  title     = {Convex Optimization},
  publisher = {Cambridge University Press},
  address   = {Cambridge, England},
  year      = {2004},
}

@inproceedings{Brown.Cocke.Della-Pietra.ea.1988,
  author    = {Brown, Peter F and Cocke, John and Della Pietra, Stephen A and Della Pietra, Vincent J and Jelinek, Frederick and Mercer, Robert L and Roossin, Paul},
  title     = {A statistical approach to language translation},
  booktitle = {Coling Budapest 1988 Volume 1: International Conference on Computational Linguistics},
  year      = {1988},
}

@article{Brown.Cocke.Della-Pietra.ea.1990,
  author  = {Brown, Peter F and Cocke, John and Della Pietra, Stephen A and Della Pietra, Vincent J and Jelinek, Frederick and Lafferty, John and Mercer, Robert L and Roossin, Paul S},
  title   = {A statistical approach to machine translation},
  journal = {Computational linguistics},
  volume  = {16},
  number  = {2},
  pages   = {79--85},
  year    = {1990},
}

@inproceedings{Brown.Sandholm.2017,
  author    = {Brown, Noam and Sandholm, Tuomas},
  title     = {{Libratus}: The Superhuman {AI} for No-Limit Poker},
  booktitle = {IJCAI},
  pages     = {5226--5228},
  year      = {2017},
}

@article{Campbell.Hoane-Jr.Hsu.2002,
  author    = {Campbell, Murray and Hoane, Jr, A Joseph and Hsu, Feng-hsiung},
  title     = {Deep {Blue}},
  journal   = {Artificial intelligence},
  volume    = {134},
  number    = {1--2},
  pages     = {57--83},
  publisher = {Elsevier},
  year      = {2002},
}

@incollection{Canny.1987,
  author    = {Canny, John},
  title     = {A computational approach to edge detection},
  booktitle = {Readings in computer vision},
  pages     = {184--203},
  publisher = {Elsevier},
  year      = {1987},
}

@inproceedings{Cer.Diab.Agirre.ea.2017,
  author    = {Cer, Daniel and Diab, Mona and Agirre, Eneko and Lopez-Gazpio, I{\~n}igo and Specia, Lucia},
  title     = {{SemEval-2017} Task 1: Semantic Textual Similarity Multilingual and Crosslingual Focused Evaluation},
  booktitle = {Proceedings of the 11th International Workshop on Semantic Evaluation (SemEval-2017)},
  pages     = {1--14},
  year      = {2017},
}

@inproceedings{Cheng.Dong.Lapata.2016,
  author    = {Cheng, Jianpeng and Dong, Li and Lapata, Mirella},
  title     = {Long Short-Term Memory-Networks for Machine Reading},
  booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing},
  pages     = {551--561},
  year      = {2016},
}

@article{Cho.Van-Merrienboer.Bahdanau.ea.2014,
  author  = {Cho, Kyunghyun and Van Merri{\"e}nboer, Bart and Bahdanau, Dzmitry and Bengio, Yoshua},
  title   = {On the properties of neural machine translation: Encoder-decoder approaches},
  journal = {arXiv preprint arXiv:1409.1259},
  year    = {2014},
}

@article{Cho.Van-Merrienboer.Gulcehre.ea.2014,
  author  = {Cho, Kyunghyun and Van Merri{\"e}nboer, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and Bougares, Fethi and Schwenk, Holger and Bengio, Yoshua},
  title   = {Learning phrase representations using {RNN} encoder-decoder for statistical machine translation},
  journal = {arXiv preprint arXiv:1406.1078},
  year    = {2014},
}

@book{Chowdhury.2010,
  author    = {Chowdhury, Gobinda G},
  title     = {Introduction to modern information retrieval},
  publisher = {Facet publishing},
  year      = {2010},
}

@article{Chung.Gulcehre.Cho.ea.2014,
  author  = {Chung, Junyoung and Gulcehre, Caglar and Cho, Kyunghyun and Bengio, Yoshua},
  title   = {Empirical evaluation of gated recurrent neural networks on sequence modeling},
  journal = {arXiv preprint arXiv:1412.3555},
  year    = {2014},
}

@article{Collobert.Weston.Bottou.ea.2011,
  author  = {Collobert, Ronan and Weston, Jason and Bottou, L{\'e}on and Karlen, Michael and Kavukcuoglu, Koray and Kuksa, Pavel},
  title   = {Natural language processing (almost) from scratch},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  pages   = {2493--2537},
  year    = {2011},
}

@article{Csiszar.2008,
  author    = {Csisz{\'a}r, Imre},
  title     = {Axiomatic characterizations of information measures},
  journal   = {Entropy},
  volume    = {10},
  number    = {3},
  pages     = {261--273},
  publisher = {Molecular Diversity Preservation International},
  year      = {2008},
}

@inproceedings{Dalal.Triggs.2005,
  author       = {Dalal, Navneet and Triggs, Bill},
  title        = {Histograms of oriented gradients for human detection},
  booktitle    = {2005 IEEE computer society conference on computer vision and pattern recognition (CVPR'05)},
  volume       = {1},
  pages        = {886--893},
  organization = {IEEE},
  year         = {2005},
}

@article{De-Cock.2011,
  author    = {De Cock, Dean},
  title     = {{Ames}, {Iowa}: Alternative to the {Boston} housing data as an end of semester regression project},
  journal   = {Journal of Statistics Education},
  volume    = {19},
  number    = {3},
  publisher = {Taylor \& Francis},
  year      = {2011},
}

@inproceedings{DeCandia.Hastorun.Jampani.ea.2007,
  author       = {DeCandia, Giuseppe and Hastorun, Deniz and Jampani, Madan and Kakulapati, Gunavardhan and Lakshman, Avinash and Pilchin, Alex and Sivasubramanian, Swaminathan and Vosshall, Peter and Vogels, Werner},
  title        = {Dynamo: Amazon's highly available key-value store},
  booktitle    = {ACM SIGOPS operating systems review},
  volume       = {41},
  number       = {6},
  pages        = {205--220},
  organization = {ACM},
  year         = {2007},
}

@article{Devlin.Chang.Lee.ea.2018,
  author  = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title   = {{BERT}: Pre-training of deep bidirectional transformers for language understanding},
  journal = {arXiv preprint arXiv:1810.04805},
  year    = {2018},
}

@inproceedings{Doersch.Gupta.Efros.2015,
  author    = {Doersch, Carl and Gupta, Abhinav and Efros, Alexei A},
  title     = {Unsupervised visual representation learning by context prediction},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages     = {1422--1430},
  year      = {2015},
}

@inproceedings{Dosovitskiy.Beyer.Kolesnikov.ea.2021,
  author    = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and others},
  title     = {An image is worth 16x16 words: Transformers for image recognition at scale},
  booktitle = {International Conference on Learning Representations},
  year      = {2021},
}

@incollection{Doucet.De-Freitas.Gordon.2001,
  author    = {Doucet, Arnaud and De Freitas, Nando and Gordon, Neil},
  title     = {An introduction to sequential {Monte Carlo} methods},
  booktitle = {Sequential Monte Carlo methods in practice},
  pages     = {3--14},
  publisher = {Springer},
  year      = {2001},
}

@article{Duchi.Hazan.Singer.2011,
  author  = {Duchi, John and Hazan, Elad and Singer, Yoram},
  title   = {Adaptive subgradient methods for online learning and stochastic optimization},
  journal = {Journal of Machine Learning Research},
  volume  = {12},
  number  = {Jul},
  pages   = {2121--2159},
  year    = {2011},
}

@article{Dumoulin.Visin.2016,
  author  = {Dumoulin, Vincent and Visin, Francesco},
  title   = {A guide to convolution arithmetic for deep learning},
  journal = {arXiv preprint arXiv:1603.07285},
  year    = {2016},
}

@article{Edelman.Ostrovsky.Schwarz.2007,
  author  = {Edelman, Benjamin and Ostrovsky, Michael and Schwarz, Michael},
  title   = {Internet advertising and the generalized second-price auction: Selling billions of dollars worth of keywords},
  journal = {American economic review},
  volume  = {97},
  number  = {1},
  pages   = {242--259},
  year    = {2007},
}

@inproceedings{Flammarion.Bach.2015,
  author    = {Flammarion, Nicolas and Bach, Francis},
  title     = {From averaging to acceleration, there is only a step-size},
  booktitle = {Conference on Learning Theory},
  pages     = {658--695},
  year      = {2015},
}

@inproceedings{Gatys.Ecker.Bethge.2016,
  author    = {Gatys, Leon A and Ecker, Alexander S and Bethge, Matthias},
  title     = {Image style transfer using convolutional neural networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {2414--2423},
  year      = {2016},
}

@article{Ginibre.1965,
  author    = {Ginibre, Jean},
  title     = {Statistical ensembles of complex, quaternion, and real matrices},
  journal   = {Journal of Mathematical Physics},
  volume    = {6},
  number    = {3},
  pages     = {440--449},
  publisher = {AIP},
  year      = {1965},
}

@inproceedings{Girshick.2015,
  author    = {Girshick, Ross},
  title     = {Fast {R-CNN}},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages     = {1440--1448},
  year      = {2015},
}

@inproceedings{Girshick.Donahue.Darrell.ea.2014,
  author    = {Girshick, Ross and Donahue, Jeff and Darrell, Trevor and Malik, Jitendra},
  title     = {Rich feature hierarchies for accurate object detection and semantic segmentation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {580--587},
  year      = {2014},
}

@inproceedings{Glorot.Bengio.2010,
  author    = {Glorot, Xavier and Bengio, Yoshua},
  title     = {Understanding the difficulty of training deep feedforward neural networks},
  booktitle = {Proceedings of the thirteenth international conference on artificial intelligence and statistics},
  pages     = {249--256},
  year      = {2010},
}

@article{Goh.2017,
  author  = {Goh, Gabriel},
  title   = {Why Momentum Really Works},
  journal = {Distill},
  doi     = {10.23915/distill.00006},
  url     = {http://distill.pub/2017/momentum},
  year    = {2017},
}

@article{Goldberg.Nichols.Oki.ea.1992,
  author    = {Goldberg, David and Nichols, David and Oki, Brian M and Terry, Douglas},
  title     = {Using collaborative filtering to weave an information tapestry},
  journal   = {Communications of the ACM},
  volume    = {35},
  number    = {12},
  pages     = {61--71},
  publisher = {Association for Computing Machinery, Inc.},
  year      = {1992},
}

@book{Goodfellow.Bengio.Courville.2016,
  author    = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
  title     = {Deep Learning},
  publisher = {MIT Press},
  note      = {\url{http://www.deeplearningbook.org}},
  year      = {2016},
}

@inproceedings{Goodfellow.Pouget-Abadie.Mirza.ea.2014,
  author    = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
  title     = {Generative adversarial nets},
  booktitle = {Advances in neural information processing systems},
  pages     = {2672--2680},
  year      = {2014},
}

@article{Gotmare.Keskar.Xiong.ea.2018,
  author  = {Gotmare, Akhilesh and Keskar, Nitish Shirish and Xiong, Caiming and Socher, Richard},
  title   = {A Closer Look at Deep Learning Heuristics: Learning rate restarts, Warmup and Distillation},
  journal = {arXiv preprint arXiv:1810.13243},
  year    = {2018},
}

@article{Graves.2013,
  author  = {Graves, Alex},
  title   = {Generating sequences with recurrent neural networks},
  journal = {arXiv preprint arXiv:1308.0850},
  year    = {2013},
}

@article{Graves.Schmidhuber.2005,
  author    = {Graves, Alex and Schmidhuber, J{\"u}rgen},
  title     = {Framewise phoneme classification with bidirectional {LSTM} and other neural network architectures},
  journal   = {Neural networks},
  volume    = {18},
  number    = {5--6},
  pages     = {602--610},
  publisher = {Elsevier},
  year      = {2005},
}

@incollection{Gunawardana.Shani.2015,
  author    = {Gunawardana, Asela and Shani, Guy},
  title     = {Evaluating recommender systems},
  booktitle = {Recommender systems handbook},
  pages     = {265--308},
  publisher = {Springer},
  year      = {2015},
}

@inproceedings{Guo.Tang.Ye.ea.2017,
  author       = {Guo, Huifeng and Tang, Ruiming and Ye, Yunming and Li, Zhenguo and He, Xiuqiang},
  title        = {{DeepFM}: a factorization-machine based neural network for {CTR} prediction},
  booktitle    = {Proceedings of the 26th International Joint Conference on Artificial Intelligence},
  pages        = {1725--1731},
  organization = {AAAI Press},
  year         = {2017},
}

@article{Hadjis.Zhang.Mitliagkas.ea.2016,
  author  = {Hadjis, Stefan and Zhang, Ce and Mitliagkas, Ioannis and Iter, Dan and R{\'e}, Christopher},
  title   = {Omnivore: An optimizer for multi-device deep learning on {CPUs} and {GPUs}},
  journal = {arXiv preprint arXiv:1606.04487},
  year    = {2016},
}

@inproceedings{Hazan.Rakhlin.Bartlett.2008,
  author    = {Hazan, Elad and Rakhlin, Alexander and Bartlett, Peter L},
  title     = {Adaptive online gradient descent},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {65--72},
  year      = {2008},
}

@inproceedings{He.Chua.2017,
  author       = {He, Xiangnan and Chua, Tat-Seng},
  title        = {Neural factorization machines for sparse predictive analytics},
  booktitle    = {Proceedings of the 40th International ACM SIGIR conference on Research and Development in Information Retrieval},
  pages        = {355--364},
  organization = {ACM},
  year         = {2017},
}

@inproceedings{He.Gkioxari.Dollar.ea.2017,
  author    = {He, Kaiming and Gkioxari, Georgia and Doll{\'a}r, Piotr and Girshick, Ross},
  title     = {{Mask R-CNN}},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages     = {2961--2969},
  year      = {2017},
}

@inproceedings{He.Liao.Zhang.ea.2017,
  author       = {He, Xiangnan and Liao, Lizi and Zhang, Hanwang and Nie, Liqiang and Hu, Xia and Chua, Tat-Seng},
  title        = {Neural collaborative filtering},
  booktitle    = {Proceedings of the 26th international conference on world wide web},
  pages        = {173--182},
  organization = {International World Wide Web Conferences Steering Committee},
  year         = {2017},
}

@inproceedings{He.Zhang.Ren.ea.2015,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Delving deep into rectifiers: Surpassing human-level performance on {ImageNet} classification},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages     = {1026--1034},
  year      = {2015},
}

@inproceedings{He.Zhang.Ren.ea.2016,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title     = {Deep residual learning for image recognition},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {770--778},
  year      = {2016},
}

@inproceedings{He.Zhang.Ren.ea.2016*1,
  author       = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title        = {Identity mappings in deep residual networks},
  booktitle    = {European conference on computer vision},
  pages        = {630--645},
  organization = {Springer},
  year         = {2016},
}

@book{Hebb.Hebb.1949,
  author    = {Hebb, Donald Olding},
  title     = {The organization of behavior},
  publisher = {Wiley},
  address   = {New York},
  year      = {1949},
}

@article{Hendrycks.Gimpel.2016,
  author  = {Hendrycks, Dan and Gimpel, Kevin},
  title   = {{Gaussian} error linear units ({GELUs})},
  journal = {arXiv preprint arXiv:1606.08415},
  year    = {2016},
}

@book{Hennessy.Patterson.2011,
  author    = {Hennessy, John L and Patterson, David A},
  title     = {Computer architecture: a quantitative approach},
  publisher = {Elsevier},
  year      = {2011},
}

@inproceedings{Herlocker.Konstan.Borchers.ea.1999,
  author       = {Herlocker, Jonathan L and Konstan, Joseph A and Borchers, Al and Riedl, John},
  title        = {An algorithmic framework for performing collaborative filtering},
  booktitle    = {22nd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR 1999},
  pages        = {230--237},
  organization = {Association for Computing Machinery, Inc},
  year         = {1999},
}

@article{Hidasi.Karatzoglou.Baltrunas.ea.2015,
  author  = {Hidasi, Bal{\'a}zs and Karatzoglou, Alexandros and Baltrunas, Linas and Tikk, Domonkos},
  title   = {Session-based recommendations with recurrent neural networks},
  journal = {arXiv preprint arXiv:1511.06939},
  year    = {2015},
}

@incollection{Hochreiter.Bengio.Frasconi.ea.2001,
  author    = {Hochreiter, Sepp and Bengio, Yoshua and Frasconi, Paolo and Schmidhuber, J{\"u}rgen},
  title     = {Gradient flow in recurrent nets: the difficulty of learning long-term dependencies},
  booktitle = {A Field Guide to Dynamical Recurrent Neural Networks},
  editor    = {Kremer, Stefan C and Kolen, John F},
  publisher = {IEEE Press},
  year      = {2001},
}

@article{Hochreiter.Schmidhuber.1997,
  author    = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
  title     = {Long short-term memory},
  journal   = {Neural computation},
  volume    = {9},
  number    = {8},
  pages     = {1735--1780},
  publisher = {MIT Press},
  year      = {1997},
}

@inproceedings{Hoyer.Janzing.Mooij.ea.2009,
  author    = {Hoyer, Patrik O and Janzing, Dominik and Mooij, Joris M and Peters, Jonas and Sch{\"o}lkopf, Bernhard},
  title     = {Nonlinear causal discovery with additive noise models},
  booktitle = {Advances in neural information processing systems},
  pages     = {689--696},
  year      = {2009},
}

@inproceedings{Hu.Koren.Volinsky.2008,
  author       = {Hu, Yifan and Koren, Yehuda and Volinsky, Chris},
  title        = {Collaborative filtering for implicit feedback datasets},
  booktitle    = {2008 Eighth IEEE International Conference on Data Mining},
  pages        = {263--272},
  organization = {IEEE},
  year         = {2008},
}

@article{Hu.Lee.Aggarwal.ea.2020,
  author  = {Hu, Zhiqiang and Lee, Roy Ka-Wei and Aggarwal, Charu C and Zhang, Aston},
  title   = {Text Style Transfer: A Review and Experimental Evaluation},
  journal = {arXiv preprint arXiv:2010.12742},
  year    = {2020},
}

@inproceedings{Hu.Shen.Sun.2018,
  author    = {Hu, Jie and Shen, Li and Sun, Gang},
  title     = {Squeeze-and-excitation networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {7132--7141},
  year      = {2018},
}

@inproceedings{Huang.Liu.Van-Der-Maaten.ea.2017,
  author    = {Huang, Gao and Liu, Zhuang and Van Der Maaten, Laurens and Weinberger, Kilian Q},
  title     = {Densely connected convolutional networks},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {4700--4708},
  year      = {2017},
}

@inproceedings{Ioffe.2017,
  author    = {Ioffe, Sergey},
  title     = {Batch renormalization: Towards reducing minibatch dependence in batch-normalized models},
  booktitle = {Advances in neural information processing systems},
  pages     = {1945--1953},
  year      = {2017},
}

@article{Ioffe.Szegedy.2015,
  author  = {Ioffe, Sergey and Szegedy, Christian},
  title   = {Batch normalization: Accelerating deep network training by reducing internal covariate shift},
  journal = {arXiv preprint arXiv:1502.03167},
  year    = {2015},
}

@article{Izmailov.Podoprikhin.Garipov.ea.2018,
  author  = {Izmailov, Pavel and Podoprikhin, Dmitrii and Garipov, Timur and Vetrov, Dmitry and Wilson, Andrew Gordon},
  title   = {Averaging weights leads to wider optima and better generalization},
  journal = {arXiv preprint arXiv:1803.05407},
  year    = {2018},
}

@book{Jaeger.2002,
  author    = {Jaeger, Herbert},
  title     = {Tutorial on training recurrent neural networks, covering {BPPT}, {RTRL}, {EKF} and the ``echo state network'' approach},
  volume    = {5},
  publisher = {GMD-Forschungszentrum Informationstechnik Bonn},
  year      = {2002},
}

@book{James.2007,
  author    = {James, William},
  title     = {The principles of psychology},
  volume    = {1},
  publisher = {Cosimo, Inc.},
  year      = {2007},
}

@article{Jia.Song.He.ea.2018,
  author  = {Jia, Xianyan and Song, Shutao and He, Wei and Wang, Yangzihao and Rong, Haidong and Zhou, Feihu and Xie, Liqiang and Guo, Zhenyu and Yang, Yuanzhou and Yu, Liwei and others},
  title   = {Highly scalable deep learning training system with mixed-precision: Training {ImageNet} in four minutes},
  journal = {arXiv preprint arXiv:1807.11205},
  year    = {2018},
}

@inproceedings{Jouppi.Young.Patil.ea.2017,
  author       = {Jouppi, Norman P and Young, Cliff and Patil, Nishant and Patterson, David and Agrawal, Gaurav and Bajwa, Raminder and Bates, Sarah and Bhatia, Suresh and Boden, Nan and Borchers, Al and others},
  title        = {In-datacenter performance analysis of a tensor processing unit},
  booktitle    = {2017 ACM/IEEE 44th Annual International Symposium on Computer Architecture (ISCA)},
  pages        = {1--12},
  organization = {IEEE},
  year         = {2017},
}

@article{Karras.Aila.Laine.ea.2017,
  author  = {Karras, Tero and Aila, Timo and Laine, Samuli and Lehtinen, Jaakko},
  title   = {Progressive growing of {GANs} for improved quality, stability, and variation},
  journal = {arXiv preprint arXiv:1710.10196},
  year    = {2017},
}

@article{Kim.2014,
  author  = {Kim, Yoon},
  title   = {Convolutional neural networks for sentence classification},
  journal = {arXiv preprint arXiv:1408.5882},
  year    = {2014},
}

@article{Kingma.Ba.2014,
  author  = {Kingma, Diederik P and Ba, Jimmy},
  title   = {Adam: A method for stochastic optimization},
  journal = {arXiv preprint arXiv:1412.6980},
  year    = {2014},
}

@book{Koller.Friedman.2009,
  author    = {Koller, Daphne and Friedman, Nir},
  title     = {Probabilistic graphical models: principles and techniques},
  publisher = {MIT Press},
  year      = {2009},
}

@misc{Kolter.2008,
  author       = {Kolter, Zico},
  title        = {Linear Algebra Review and Reference},
  howpublished = {Stanford University CS229 course notes},
  url          = {http://cs229.stanford.edu/section/cs229-linalg.pdf},
  year         = {2008},
}

@inproceedings{Koren.2009,
  author       = {Koren, Yehuda},
  title        = {Collaborative filtering with temporal dynamics},
  booktitle    = {Proceedings of the 15th ACM SIGKDD international conference on Knowledge discovery and data mining},
  pages        = {447--456},
  organization = {ACM},
  year         = {2009},
}

@article{Koren.Bell.Volinsky.2009,
  author    = {Koren, Yehuda and Bell, Robert and Volinsky, Chris},
  title     = {Matrix factorization techniques for recommender systems},
  journal   = {Computer},
  volume    = {42},
  number    = {8},
  pages     = {30--37},
  publisher = {IEEE},
  year      = {2009},
}

@inproceedings{Krizhevsky.Sutskever.Hinton.2012,
  author    = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
  title     = {{ImageNet} classification with deep convolutional neural networks},
  booktitle = {Advances in neural information processing systems},
  pages     = {1097--1105},
  year      = {2012},
}

@book{Kung.1988,
  author    = {Kung, Sun Yuan},
  title     = {{VLSI} array processors},
  publisher = {Prentice Hall},
  address   = {Englewood Cliffs, NJ},
  year      = {1988},
}

@article{LeCun.Bottou.Bengio.ea.1998,
  author    = {LeCun, Yann and Bottou, L{\'e}on and Bengio, Yoshua and Haffner, Patrick},
  title     = {Gradient-based learning applied to document recognition},
  journal   = {Proceedings of the IEEE},
  volume    = {86},
  number    = {11},
  pages     = {2278--2324},
  publisher = {IEEE},
  year      = {1998},
}

@phdthesis{Li.2017,
  author = {Li, Mu},
  title  = {Scaling Distributed Machine Learning with System and Algorithm Co-design},
  school = {Carnegie Mellon University},
  year   = {2017},
}

@inproceedings{Li.Andersen.Park.ea.2014,
  author    = {Li, Mu and Andersen, David G and Park, Jun Woo and Smola, Alexander J and Ahmed, Amr and Josifovski, Vanja and Long, James and Shekita, Eugene J and Su, Bor-Yiing},
  title     = {Scaling distributed machine learning with the parameter server},
  booktitle = {11th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 14)},
  pages     = {583--598},
  year      = {2014},
}

@article{Lin.Chen.Yan.2013,
  author  = {Lin, Min and Chen, Qiang and Yan, Shuicheng},
  title   = {Network in network},
  journal = {arXiv preprint arXiv:1312.4400},
  year    = {2013},
}

@article{Lin.Feng.Santos.ea.2017,
  author  = {Lin, Zhouhan and Feng, Minwei and dos Santos, Cicero Nogueira and Yu, Mo and Xiang, Bing and Zhou, Bowen and Bengio, Yoshua},
  title   = {A structured self-attentive sentence embedding},
  journal = {arXiv preprint arXiv:1703.03130},
  year    = {2017},
}

@inproceedings{Lin.Goyal.Girshick.ea.2017,
  author    = {Lin, Tsung-Yi and Goyal, Priya and Girshick, Ross and He, Kaiming and Doll{\'a}r, Piotr},
  title     = {Focal loss for dense object detection},
  booktitle = {Proceedings of the IEEE international conference on computer vision},
  pages     = {2980--2988},
  year      = {2017},
}

@article{Lin.Lv.Zhu.ea.2010,
  author  = {Lin, Yuanqing and Lv, F and Zhu, S and Yang, M and Cour, T and Yu, K and Cao, L and Li, Z and Tsai, MH and Zhou, X and others},
  title   = {{ImageNet} classification: fast descriptor coding and large-scale {SVM} training},
  journal = {Large scale visual recognition challenge},
  year    = {2010},
}

@article{Lipton.Steinhardt.2018,
  author  = {Lipton, Zachary C and Steinhardt, Jacob},
  title   = {Troubling trends in machine learning scholarship},
  journal = {arXiv preprint arXiv:1807.03341},
  year    = {2018},
}

@inproceedings{Liu.Anguelov.Erhan.ea.2016,
  author       = {Liu, Wei and Anguelov, Dragomir and Erhan, Dumitru and Szegedy, Christian and Reed, Scott and Fu, Cheng-Yang and Berg, Alexander C},
  title        = {{SSD}: Single shot multibox detector},
  booktitle    = {European conference on computer vision},
  pages        = {21--37},
  organization = {Springer},
  year         = {2016},
}

@article{Liu.Ott.Goyal.ea.2019,
  author  = {Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin},
  title   = {{RoBERTa}: A robustly optimized {BERT} pretraining approach},
  journal = {arXiv preprint arXiv:1907.11692},
  year    = {2019},
}

@inproceedings{Long.Shelhamer.Darrell.2015,
  author    = {Long, Jonathan and Shelhamer, Evan and Darrell, Trevor},
  title     = {Fully convolutional networks for semantic segmentation},
  booktitle = {Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages     = {3431--3440},
  year      = {2015},
}

@article{Loshchilov.Hutter.2016,
  author  = {Loshchilov, Ilya and Hutter, Frank},
  title   = {{SGDR}: Stochastic gradient descent with warm restarts},
  journal = {arXiv preprint arXiv:1608.03983},
  year    = {2016},
}

@article{Lowe.2004,
  author    = {Lowe, David G},
  title     = {Distinctive image features from scale-invariant keypoints},
  journal   = {International journal of computer vision},
  volume    = {60},
  number    = {2},
  pages     = {91--110},
  publisher = {Springer},
  year      = {2004},
}

@article{Luo.Wang.Shao.ea.2018,
  author  = {Luo, Ping and Wang, Xinjiang and Shao, Wenqi and Peng, Zhanglin},
  title   = {Towards understanding regularization in batch normalization},
  journal = {arXiv preprint arXiv:1809.00846},
  year    = {2018},
}

@inproceedings{Maas.Daly.Pham.ea.2011,
  author       = {Maas, Andrew L and Daly, Raymond E and Pham, Peter T and Huang, Dan and Ng, Andrew Y and Potts, Christopher},
  title        = {Learning word vectors for sentiment analysis},
  booktitle    = {Proceedings of the 49th annual meeting of the association for computational linguistics: Human language technologies-volume 1},
  pages        = {142--150},
  organization = {Association for Computational Linguistics},
  year         = {2011},
}

@inproceedings{McCann.Bradbury.Xiong.ea.2017,
  author    = {McCann, Bryan and Bradbury, James and Xiong, Caiming and Socher, Richard},
  title     = {Learned in translation: Contextualized word vectors},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {6294--6305},
  year      = {2017},
}

@article{McCulloch.Pitts.1943,
  author    = {McCulloch, Warren S and Pitts, Walter},
  title     = {A logical calculus of the ideas immanent in nervous activity},
  journal   = {The bulletin of mathematical biophysics},
  volume    = {5},
  number    = {4},
  pages     = {115--133},
  publisher = {Springer},
  year      = {1943},
}

@inproceedings{McMahan.Holt.Sculley.ea.2013,
  author       = {McMahan, H Brendan and Holt, Gary and Sculley, David
                  and Young, Michael and Ebner, Dietmar and Grady, Julian
                  and Nie, Lan and Phillips, Todd and Davydov, Eugene
                  and Golovin, Daniel and others},
  title        = {Ad click prediction: a view from the trenches},
  year         = {2013},
  booktitle    = {Proceedings of the 19th ACM SIGKDD international
                  conference on Knowledge discovery and data mining},
  pages        = {1222--1230},
  organization = {ACM},
}

@article{Merity.Xiong.Bradbury.ea.2016,
  author  = {Merity, Stephen and Xiong, Caiming and Bradbury, James
             and Socher, Richard},
  title   = {Pointer sentinel mixture models},
  year    = {2016},
  journal = {arXiv preprint arXiv:1609.07843},
}

@article{Mikolov.Chen.Corrado.ea.2013,
  author  = {Mikolov, Tomas and Chen, Kai and Corrado, Greg
             and Dean, Jeffrey},
  title   = {Efficient estimation of word representations in vector space},
  year    = {2013},
  journal = {arXiv preprint arXiv:1301.3781},
}

@inproceedings{Mikolov.Sutskever.Chen.ea.2013,
  author    = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai
               and Corrado, Greg S and Dean, Jeff},
  title     = {Distributed representations of words and phrases
               and their compositionality},
  year      = {2013},
  booktitle = {Advances in neural information processing systems},
  pages     = {3111--3119},
}

% The organization is written JMLR.org (no space inside the domain name).
@InProceedings{Mirhoseini.Pham.Le.ea.2017,
  title        = {Device placement optimization with reinforcement
                  learning},
  author       = {Mirhoseini, Azalia and Pham, Hieu and Le, Quoc V and
                  Steiner, Benoit and Larsen, Rasmus and Zhou, Yuefeng and
                  Kumar, Naveen and Norouzi, Mohammad and Bengio, Samy and
                  Dean, Jeff},
  booktitle    = {Proceedings of the 34th International Conference on
                  Machine Learning-Volume 70},
  pages        = {2430--2439},
  year         = {2017},
  organization = {JMLR.org}
}

% The paper has four authors, so all are listed instead of truncating with "and others".
@InProceedings{Mnih.Heess.Graves.ea.2014,
  title     = {Recurrent models of visual attention},
  author    = {Mnih, Volodymyr and Heess, Nicolas and Graves, Alex and
               Kavukcuoglu, Koray},
  booktitle = {Advances in neural information processing systems},
  pages     = {2204--2212},
  year      = {2014}
}

@article{Morey.Hoekstra.Rouder.ea.2016,
  author    = {Morey, Richard D and Hoekstra, Rink and Rouder, Jeffrey N
               and Lee, Michael D and Wagenmakers, Eric-Jan},
  title     = {The fallacy of placing confidence in confidence intervals},
  year      = {2016},
  journal   = {Psychonomic bulletin \& review},
  volume    = {23},
  number    = {1},
  pages     = {103--123},
  publisher = {Springer},
}

@article{Nadaraya.1964,
  author    = {Nadaraya, Elizbar A},
  title     = {On estimating regression},
  year      = {1964},
  journal   = {Theory of Probability \& Its Applications},
  volume    = {9},
  number    = {1},
  pages     = {141--142},
  publisher = {SIAM},
}

@book{Nesterov.2018,
  author    = {Nesterov, Yurii},
  title     = {Lectures on convex optimization},
  year      = {2018},
  volume    = {137},
  publisher = {Springer},
}

% The e-print series is the publication medium, so it lives in howpublished
% rather than being appended to the title.
@Misc{Nesterov.Vial.2000,
  title        = {Confidence level solutions for stochastic programming},
  author       = {Nesterov, Yu and Vial, J-Ph},
  howpublished = {Stochastic Programming E-Print Series},
  year         = {2000}
}

@article{Neyman.1937,
  author    = {Neyman, Jerzy},
  title     = {Outline of a theory of statistical estimation based
               on the classical theory of probability},
  year      = {1937},
  journal   = {Philosophical Transactions of the Royal Society
               of London. Series A, Mathematical and Physical Sciences},
  volume    = {236},
  number    = {767},
  pages     = {333--380},
  publisher = {The Royal Society London},
}

% The acronym BLEU is brace-protected so sentence-casing styles do not render it as "Bleu".
@InProceedings{Papineni.Roukos.Ward.ea.2002,
  title     = {{BLEU}: a method for automatic evaluation of machine
               translation},
  author    = {Papineni, Kishore and Roukos, Salim and Ward, Todd and
               Zhu, Wei-Jing},
  booktitle = {Proceedings of the 40th annual meeting of the Association
               for Computational Linguistics},
  pages     = {311--318},
  year      = {2002}
}

@article{Parikh.Tackstrom.Das.ea.2016,
  author  = {Parikh, Ankur P and T{\"a}ckstr{\"o}m, Oscar
             and Das, Dipanjan and Uszkoreit, Jakob},
  title   = {A decomposable attention model for natural language inference},
  year    = {2016},
  journal = {arXiv preprint arXiv:1606.01933},
}

@inproceedings{Park.Liu.Wang.ea.2019,
  author    = {Park, Taesung and Liu, Ming-Yu and Wang, Ting-Chun
               and Zhu, Jun-Yan},
  title     = {Semantic image synthesis with spatially-adaptive normalization},
  year      = {2019},
  booktitle = {Proceedings of the IEEE Conference on Computer Vision
               and Pattern Recognition},
  pages     = {2337--2346},
}

@article{Paulus.Xiong.Socher.2017,
  author  = {Paulus, Romain and Xiong, Caiming and Socher, Richard},
  title   = {A deep reinforced model for abstractive summarization},
  year    = {2017},
  journal = {arXiv preprint arXiv:1705.04304},
}

@inproceedings{Pennington.Schoenholz.Ganguli.2017,
  author    = {Pennington, Jeffrey and Schoenholz, Samuel
               and Ganguli, Surya},
  title     = {Resurrecting the sigmoid in deep learning through dynamical
               isometry: theory and practice},
  year      = {2017},
  booktitle = {Advances in neural information processing systems},
  pages     = {4785--4795},
}

% GloVe uses mixed capitalization, so it is brace-protected against style recasing.
@InProceedings{Pennington.Socher.Manning.2014,
  title     = {{GloVe}: Global vectors for word representation},
  author    = {Pennington, Jeffrey and Socher, Richard and Manning,
               Christopher},
  booktitle = {Proceedings of the 2014 conference on empirical methods in
               natural language processing (EMNLP)},
  pages     = {1532--1543},
  year      = {2014}
}

@inproceedings{Peters.Ammar.Bhagavatula.ea.2017,
  author    = {Peters, Matthew and Ammar, Waleed
               and Bhagavatula, Chandra and Power, Russell},
  title     = {Semi-supervised sequence tagging with bidirectional language models},
  year      = {2017},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association
               for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {1756--1765},
}

@book{Peters.Janzing.Scholkopf.2017,
  author    = {Peters, Jonas and Janzing, Dominik
               and Sch{\"o}lkopf, Bernhard},
  title     = {Elements of causal inference: foundations and learning algorithms},
  year      = {2017},
  publisher = {MIT press},
}

@inproceedings{Peters.Neumann.Iyyer.ea.2018,
  author    = {Peters, Matthew and Neumann, Mark and Iyyer, Mohit
               and Gardner, Matt and Clark, Christopher and Lee, Kenton
               and Zettlemoyer, Luke},
  title     = {Deep Contextualized Word Representations},
  year      = {2018},
  booktitle = {Proceedings of the 2018 Conference of the North American
               Chapter of the Association for Computational Linguistics:
               Human Language Technologies, Volume 1 (Long Papers)},
  pages     = {2227--2237},
}

% This is an institutional report, not a journal article: the university is
% the issuing institution, and the document has exactly two authors.
@TechReport{Petersen.Pedersen.ea.2008,
  title       = {The matrix cookbook},
  author      = {Petersen, Kaare Brandt and Pedersen, Michael Syskind},
  institution = {Technical University of Denmark},
  year        = {2008}
}

@article{Polyak.1964,
  author    = {Polyak, Boris T},
  title     = {Some methods of speeding up the convergence of iteration methods},
  year      = {1964},
  journal   = {USSR Computational Mathematics and Mathematical Physics},
  volume    = {4},
  number    = {5},
  pages     = {1--17},
  publisher = {Elsevier},
}

@article{Quadrana.Cremonesi.Jannach.2018,
  author    = {Quadrana, Massimo and Cremonesi, Paolo
               and Jannach, Dietmar},
  title     = {Sequence-aware recommender systems},
  year      = {2018},
  journal   = {ACM Computing Surveys (CSUR)},
  volume    = {51},
  number    = {4},
  pages     = {66},
  publisher = {ACM},
}

@article{Radford.Metz.Chintala.2015,
  author  = {Radford, Alec and Metz, Luke and Chintala, Soumith},
  title   = {Unsupervised representation learning with deep convolutional
             generative adversarial networks},
  year    = {2015},
  journal = {arXiv preprint arXiv:1511.06434},
}

@article{Radford.Narasimhan.Salimans.ea.2018,
  author  = {Radford, Alec and Narasimhan, Karthik
             and Salimans, Tim and Sutskever, Ilya},
  title   = {Improving language understanding by generative pre-training},
  year    = {2018},
  journal = {OpenAI},
}

@article{Radford.Wu.Child.ea.2019,
  author  = {Radford, Alec and Wu, Jeffrey and Child, Rewon
             and Luan, David and Amodei, Dario and Sutskever, Ilya},
  title   = {Language models are unsupervised multitask learners},
  year    = {2019},
  journal = {OpenAI Blog},
  volume  = {1},
  number  = {8},
  pages   = {9},
}

% SQuAD uses mixed capitalization, so it is brace-protected against style recasing.
@Article{Rajpurkar.Zhang.Lopyrev.ea.2016,
  title   = {{SQuAD}: 100,000+ questions for machine comprehension of
             text},
  author  = {Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin
             and Liang, Percy},
  journal = {arXiv preprint arXiv:1606.05250},
  year    = {2016}
}

% Adam is an algorithm name; braces keep its capital under sentence-casing styles.
@Article{Reddi.Kale.Kumar.2019,
  title   = {On the convergence of {Adam} and beyond},
  author  = {Reddi, Sashank J and Kale, Satyen and Kumar, Sanjiv},
  journal = {arXiv preprint arXiv:1904.09237},
  year    = {2019}
}

@inproceedings{Redmon.Divvala.Girshick.ea.2016,
  author    = {Redmon, Joseph and Divvala, Santosh
               and Girshick, Ross and Farhadi, Ali},
  title     = {You only look once: Unified, real-time object detection},
  year      = {2016},
  booktitle = {Proceedings of the IEEE conference on computer vision
               and pattern recognition},
  pages     = {779--788},
}

@article{Reed.De-Freitas.2015,
  author  = {Reed, Scott and De Freitas, Nando},
  title   = {Neural programmer-interpreters},
  year    = {2015},
  journal = {arXiv preprint arXiv:1511.06279},
}

% R-CNN is an acronym, so it is brace-protected against style recasing.
@InProceedings{Ren.He.Girshick.ea.2015,
  title     = {Faster {R-CNN}: Towards real-time object detection with
               region proposal networks},
  author    = {Ren, Shaoqing and He, Kaiming and Girshick, Ross and Sun,
               Jian},
  booktitle = {Advances in neural information processing systems},
  pages     = {91--99},
  year      = {2015}
}

@inproceedings{Rendle.2010,
  author       = {Rendle, Steffen},
  title        = {Factorization machines},
  year         = {2010},
  booktitle    = {2010 IEEE International Conference on Data Mining},
  pages        = {995--1000},
  organization = {IEEE},
}

% BPR (acronym) and Bayesian (proper adjective) are brace-protected against style recasing.
@InProceedings{Rendle.Freudenthaler.Gantner.ea.2009,
  title        = {{BPR}: {Bayesian} personalized ranking from implicit
                  feedback},
  author       = {Rendle, Steffen and Freudenthaler, Christoph and Gantner,
                  Zeno and Schmidt-Thieme, Lars},
  booktitle    = {Proceedings of the twenty-fifth conference on uncertainty
                  in artificial intelligence},
  pages        = {452--461},
  year         = {2009},
  organization = {AUAI Press}
}

% The paper has exactly three authors, so no "and others" is appended.
@Article{Rumelhart.Hinton.Williams.ea.1988,
  title   = {Learning representations by back-propagating errors},
  author  = {Rumelhart, David E and Hinton, Geoffrey E and Williams,
             Ronald J},
  journal = {Cognitive modeling},
  volume  = {5},
  number  = {3},
  pages   = {1},
  year    = {1988}
}

% Publisher name and place of publication are kept in separate fields.
@Book{Russell.Norvig.2016,
  title     = {Artificial intelligence: a modern approach},
  author    = {Russell, Stuart J and Norvig, Peter},
  year      = {2016},
  publisher = {Pearson Education Limited},
  address   = {Malaysia}
}

@article{Salton.Wong.Yang.1975,
  author    = {Salton, Gerard and Wong, Anita and Yang, Chung-Shu},
  title     = {A vector space model for automatic indexing},
  year      = {1975},
  journal   = {Communications of the ACM},
  volume    = {18},
  number    = {11},
  pages     = {613--620},
  publisher = {ACM},
}

@inproceedings{Santurkar.Tsipras.Ilyas.ea.2018,
  author    = {Santurkar, Shibani and Tsipras, Dimitris
               and Ilyas, Andrew and Madry, Aleksander},
  title     = {How does batch normalization help optimization?},
  year      = {2018},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {2483--2493},
}

% This is a conference paper (WWW 2001) with four authors; the venue is given
% as a proceedings title rather than as a journal abbreviation.
@InProceedings{Sarwar.Karypis.Konstan.ea.2001,
  title     = {Item-based collaborative filtering recommendation
               algorithms},
  author    = {Sarwar, Badrul Munir and Karypis, George and Konstan,
               Joseph A and Riedl, John},
  booktitle = {Proceedings of the 10th International Conference on
               World Wide Web},
  pages     = {285--295},
  year      = {2001}
}

@inproceedings{Schein.Popescul.Ungar.ea.2002,
  author       = {Schein, Andrew I and Popescul, Alexandrin
                  and Ungar, Lyle H and Pennock, David M},
  title        = {Methods and metrics for cold-start recommendations},
  year         = {2002},
  booktitle    = {Proceedings of the 25th annual international ACM SIGIR
                  conference on Research and development in information retrieval},
  pages        = {253--260},
  organization = {ACM},
}

@article{Schuster.Paliwal.1997,
  author    = {Schuster, Mike and Paliwal, Kuldip K},
  title     = {Bidirectional recurrent neural networks},
  year      = {1997},
  journal   = {IEEE Transactions on Signal Processing},
  volume    = {45},
  number    = {11},
  pages     = {2673--2681},
  publisher = {IEEE},
}

% AutoRec uses mixed capitalization, so it is brace-protected against style recasing.
@InProceedings{Sedhain.Menon.Sanner.ea.2015,
  title        = {{AutoRec}: Autoencoders meet collaborative filtering},
  author       = {Sedhain, Suvash and Menon, Aditya Krishna and Sanner,
                  Scott and Xie, Lexing},
  booktitle    = {Proceedings of the 24th International Conference on World
                  Wide Web},
  pages        = {111--112},
  year         = {2015},
  organization = {ACM}
}

@article{Sennrich.Haddow.Birch.2015,
  author  = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
  title   = {Neural machine translation of rare words with subword units},
  year    = {2015},
  journal = {arXiv preprint arXiv:1508.07909},
}

% Product names are brace-protected so sentence-casing styles do not render "Tensorflow".
@Article{Sergeev.Del-Balso.2018,
  title   = {{Horovod}: fast and easy distributed deep learning in
             {TensorFlow}},
  author  = {Sergeev, Alexander and Del Balso, Mike},
  journal = {arXiv preprint arXiv:1802.05799},
  year    = {2018}
}

% The month uses the predefined three-letter macro, unbraced, so styles can
% expand and localize it.
@Article{Shannon.1948,
  author    = {Shannon, Claude Elwood},
  journal   = {The Bell System Technical Journal},
  month     = jul,
  number    = 3,
  pages     = {379--423},
  publisher = {Nokia Bell Labs},
  title     = {A Mathematical Theory of Communication},
  volume    = 27,
  year      = 1948
}

% ControlVAE is brace-protected against style recasing; the organization is
% written JMLR.org (no space inside the domain name).
@InProceedings{Shao.Yao.Sun.ea.2020,
  title        = {{ControlVAE}: Controllable Variational Autoencoder},
  author       = {Shao, Huajie and Yao, Shuochao and Sun, Dachun and Zhang,
                  Aston and Liu, Shengzhong and Liu, Dongxin and Wang, Jun
                  and Abdelzaher, Tarek},
  booktitle    = {Proceedings of the 37th International Conference on
                  Machine Learning},
  year         = {2020},
  organization = {JMLR.org}
}

% Go (the game) is brace-protected against style recasing; the journal name is
% capitalized and the full page range is given.
@Article{Silver.Huang.Maddison.ea.2016,
  title     = {Mastering the game of {Go} with deep neural networks and
               tree search},
  author    = {Silver, David and Huang, Aja and Maddison, Chris J and
               Guez, Arthur and Sifre, Laurent and Van Den Driessche,
               George and Schrittwieser, Julian and Antonoglou, Ioannis
               and Panneershelvam, Veda and Lanctot, Marc and others},
  journal   = {Nature},
  volume    = {529},
  number    = {7587},
  pages     = {484--489},
  year      = {2016},
  publisher = {Nature Publishing Group}
}

@article{Simonyan.Zisserman.2014,
  author  = {Simonyan, Karen and Zisserman, Andrew},
  title   = {Very deep convolutional networks for large-scale image recognition},
  year    = {2014},
  journal = {arXiv preprint arXiv:1409.1556},
}

% Numeric ranges use a double hyphen, which typesets as an en dash.
@Article{Smola.Narayanamurthy.2010,
  title     = {An architecture for parallel topic models},
  author    = {Smola, Alexander and Narayanamurthy, Shravan},
  journal   = {Proceedings of the VLDB Endowment},
  volume    = {3},
  number    = {1--2},
  pages     = {703--710},
  year      = {2010},
  publisher = {VLDB Endowment}
}

% The publisher is written JMLR.org (no space inside the domain name).
@Article{Srivastava.Hinton.Krizhevsky.ea.2014,
  title     = {Dropout: a simple way to prevent neural networks from
               overfitting},
  author    = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky,
               Alex and Sutskever, Ilya and Salakhutdinov, Ruslan},
  journal   = {The Journal of Machine Learning Research},
  volume    = {15},
  number    = {1},
  pages     = {1929--1958},
  year      = {2014},
  publisher = {JMLR.org}
}

% Publisher name and publisher city are kept in separate fields.
@Book{Strang.1993,
  title     = {Introduction to linear algebra},
  author    = {Strang, Gilbert},
  volume    = {3},
  year      = {1993},
  publisher = {Wellesley-Cambridge Press},
  address   = {Wellesley, MA}
}

@article{Su.Khoshgoftaar.2009,
  author    = {Su, Xiaoyuan and Khoshgoftaar, Taghi M},
  title     = {A survey of collaborative filtering techniques},
  year      = {2009},
  journal   = {Advances in artificial intelligence},
  volume    = {2009},
  publisher = {Hindawi},
}

% All four authors are listed in publication order instead of truncating
% with "and others".
@InProceedings{Sukhbaatar.Weston.Fergus.ea.2015,
  title     = {End-to-end memory networks},
  author    = {Sukhbaatar, Sainbayar and Szlam, Arthur and Weston, Jason
               and Fergus, Rob},
  booktitle = {Advances in neural information processing systems},
  pages     = {2440--2448},
  year      = {2015}
}

@inproceedings{Sutskever.Martens.Dahl.ea.2013,
  author    = {Sutskever, Ilya and Martens, James and Dahl, George
               and Hinton, Geoffrey},
  title     = {On the importance of initialization and momentum in deep learning},
  year      = {2013},
  booktitle = {International conference on machine learning},
  pages     = {1139--1147},
}

@inproceedings{Sutskever.Vinyals.Le.2014,
  author    = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V},
  title     = {Sequence to sequence learning with neural networks},
  year      = {2014},
  booktitle = {Advances in neural information processing systems},
  pages     = {3104--3112},
}

% Architecture names are brace-protected so styles keep their capitalization.
@InProceedings{Szegedy.Ioffe.Vanhoucke.ea.2017,
  title     = {{Inception-v4}, {Inception-ResNet} and the impact of residual
               connections on learning},
  author    = {Szegedy, Christian and Ioffe, Sergey and Vanhoucke,
               Vincent and Alemi, Alexander A},
  booktitle = {Thirty-First AAAI Conference on Artificial Intelligence},
  year      = {2017}
}

@inproceedings{Szegedy.Liu.Jia.ea.2015,
  author    = {Szegedy, Christian and Liu, Wei and Jia, Yangqing
               and Sermanet, Pierre and Reed, Scott
               and Anguelov, Dragomir and Erhan, Dumitru
               and Vanhoucke, Vincent and Rabinovich, Andrew},
  title     = {Going deeper with convolutions},
  year      = {2015},
  booktitle = {Proceedings of the IEEE conference on computer vision
               and pattern recognition},
  pages     = {1--9},
}

% Inception is an architecture name; braces keep its capital under sentence-casing styles.
@InProceedings{Szegedy.Vanhoucke.Ioffe.ea.2016,
  title     = {Rethinking the {Inception} architecture for computer
               vision},
  author    = {Szegedy, Christian and Vanhoucke, Vincent and Ioffe,
               Sergey and Shlens, Jon and Wojna, Zbigniew},
  booktitle = {Proceedings of the IEEE conference on computer vision and
               pattern recognition},
  pages     = {2818--2826},
  year      = {2016}
}

@article{Tallec.Ollivier.2017,
  author  = {Tallec, Corentin and Ollivier, Yann},
  title   = {Unbiasing truncated backpropagation through time},
  year    = {2017},
  journal = {arXiv preprint arXiv:1705.08209},
}

@inproceedings{Tang.Wang.2018,
  author       = {Tang, Jiaxi and Wang, Ke},
  title        = {Personalized top-n sequential recommendation
                  via convolutional sequence embedding},
  year         = {2018},
  booktitle    = {Proceedings of the Eleventh ACM International
                  Conference on Web Search and Data Mining},
  pages        = {565--573},
  organization = {ACM},
}

@article{Tay.Dehghani.Bahri.ea.2020,
  author  = {Tay, Yi and Dehghani, Mostafa and Bahri, Dara
             and Metzler, Donald},
  title   = {Efficient transformers: A survey},
  year    = {2020},
  journal = {arXiv preprint arXiv:2009.06732},
}

@article{Teye.Azizpour.Smith.2018,
  author  = {Teye, Mattias and Azizpour, Hossein and Smith, Kevin},
  title   = {Bayesian uncertainty estimation for batch normalized deep networks},
  year    = {2018},
  journal = {arXiv preprint arXiv:1802.06455},
}

@article{Tieleman.Hinton.2012,
  author  = {Tieleman, Tijmen and Hinton, Geoffrey},
  title   = {Lecture 6.5-rmsprop: Divide the gradient by a running average
             of its recent magnitude},
  year    = {2012},
  journal = {COURSERA: Neural networks for machine learning},
  volume  = {4},
  number  = {2},
  pages   = {26--31},
}

% Team and company names are brace-protected so styles keep their capitalization.
@Article{Toscher.Jahrer.Bell.2009,
  title   = {The {BigChaos} solution to the {Netflix} grand prize},
  author  = {T{\"o}scher, Andreas and Jahrer, Michael and Bell, Robert
             M},
  journal = {Netflix prize documentation},
  pages   = {1--52},
  year    = {2009}
}

@article{Treisman.Gelade.1980,
  author    = {Treisman, Anne M and Gelade, Garry},
  title     = {A feature-integration theory of attention},
  year      = {1980},
  journal   = {Cognitive psychology},
  volume    = {12},
  number    = {1},
  pages     = {97--136},
  publisher = {Elsevier},
}

% The full page range is given rather than only the first page.
@Article{Turing.1950,
  title   = {Computing machinery and intelligence},
  author  = {Turing, Alan},
  journal = {Mind},
  volume  = {59},
  number  = {236},
  pages   = {433--460},
  year    = {1950}
}

@article{Uijlings.Van-De-Sande.Gevers.ea.2013,
  author    = {Uijlings, Jasper RR and Van De Sande, Koen EA
               and Gevers, Theo and Smeulders, Arnold WM},
  title     = {Selective search for object recognition},
  year      = {2013},
  journal   = {International journal of computer vision},
  volume    = {104},
  number    = {2},
  pages     = {154--171},
  publisher = {Springer},
}

% Authors are listed in the order printed on the book (Golub first). The
% citation key is left unchanged so existing citations keep resolving.
@Book{Van-Loan.Golub.1983,
  title     = {Matrix computations},
  author    = {Golub, Gene H and Van Loan, Charles F},
  year      = {1983},
  publisher = {Johns Hopkins University Press}
}

@inproceedings{Vaswani.Shazeer.Parmar.ea.2017,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki
               and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N
               and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is all you need},
  year      = {2017},
  booktitle = {Advances in neural information processing systems},
  pages     = {5998--6008},
}

% Gunrock (library name) and GPU (acronym) are brace-protected against style recasing.
@InProceedings{Wang.Davidson.Pan.ea.2016,
  title        = {{Gunrock}: A high-performance graph processing library on
                  the {GPU}},
  author       = {Wang, Yangzihao and Davidson, Andrew and Pan, Yuechao and
                  Wu, Yuduo and Riffel, Andy and Owens, John D},
  booktitle    = {ACM SIGPLAN Notices},
  volume       = {51},
  number       = {8},
  pages        = {11},
  year         = {2016},
  organization = {ACM}
}

@article{Wang.Li.Liberty.ea.2018,
  author  = {Wang, Leyuan and Li, Mu and Liberty, Edo
             and Smola, Alex J},
  title   = {Optimal Message Scheduling for Aggregation},
  year    = {2018},
  journal = {NETWORKS},
  volume  = {2},
  number  = {3},
  pages   = {2--3},
}

@article{Warstadt.Singh.Bowman.2019,
  author    = {Warstadt, Alex and Singh, Amanpreet and Bowman, Samuel R},
  title     = {Neural network acceptability judgments},
  year      = {2019},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {7},
  pages     = {625--641},
  publisher = {MIT Press},
}

@book{Wasserman.2013,
  author    = {Wasserman, Larry},
  title     = {All of statistics: a concise course in statistical inference},
  year      = {2013},
  publisher = {Springer Science \& Business Media},
}

% Numeric ranges use a double hyphen, which typesets as an en dash.
@Article{Watkins.Dayan.1992,
  title     = {Q-learning},
  author    = {Watkins, Christopher JCH and Dayan, Peter},
  journal   = {Machine learning},
  volume    = {8},
  number    = {3--4},
  pages     = {279--292},
  year      = {1992},
  publisher = {Springer}
}

@article{Watson.1964,
  author    = {Watson, Geoffrey S},
  title     = {Smooth regression analysis},
  year      = {1964},
  journal   = {Sankhy{\=a}: The Indian Journal of Statistics, Series A},
  pages     = {359--372},
  publisher = {JSTOR},
}

% Proper names in the title are brace-protected so styles do not render "langevin".
@InProceedings{Welling.Teh.2011,
  title     = {{Bayesian} learning via stochastic gradient {Langevin}
               dynamics},
  author    = {Welling, Max and Teh, Yee W},
  booktitle = {Proceedings of the 28th international conference on
               machine learning (ICML-11)},
  pages     = {681--688},
  year      = {2011}
}

@article{Werbos.1990,
  author    = {Werbos, Paul J},
  title     = {Backpropagation through time: what it does and how to do it},
  year      = {1990},
  journal   = {Proceedings of the IEEE},
  volume    = {78},
  number    = {10},
  pages     = {1550--1560},
  publisher = {IEEE},
}

% Annals of Mathematics is a journal, so this is an article entry with the
% full journal name plus volume and issue.
@Article{Wigner.1958,
  title   = {On the distribution of the roots of certain symmetric
             matrices},
  author  = {Wigner, Eugene P.},
  journal = {Annals of Mathematics},
  volume  = {67},
  number  = {2},
  pages   = {325--327},
  year    = {1958}
}

@techreport{Williams.Waterman.Patterson.2009,
  author      = {Williams, Samuel and Waterman, Andrew
                 and Patterson, David},
  title       = {Roofline: An insightful visual performance model for
                 floating-point programs and multicore architectures},
  year        = {2009},
  institution = {Lawrence Berkeley National Lab.(LBNL), Berkeley, CA (United States)},
}

@article{Wood.Gasthaus.Archambeau.ea.2011,
  author    = {Wood, Frank and Gasthaus, Jan
               and Archambeau, C{\'e}dric and James, Lancelot
               and Teh, Yee Whye},
  title     = {The sequence memoizer},
  year      = {2011},
  journal   = {Communications of the ACM},
  volume    = {54},
  number    = {2},
  pages     = {91--98},
  publisher = {ACM},
}

@inproceedings{Wu.Ahmed.Beutel.ea.2017,
  author       = {Wu, Chao-Yuan and Ahmed, Amr and Beutel, Alex
                  and Smola, Alexander J and Jing, How},
  title        = {Recurrent recommender networks},
  year         = {2017},
  booktitle    = {Proceedings of the tenth ACM international conference
                  on web search and data mining},
  pages        = {495--503},
  organization = {ACM},
}

@article{Wu.Schuster.Chen.ea.2016,
  author  = {Wu, Yonghui and Schuster, Mike and Chen, Zhifeng
             and Le, Quoc V and Norouzi, Mohammad
             and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan
             and Gao, Qin and Macherey, Klaus and others},
  title   = {Google's neural machine translation system: Bridging the gap
             between human and machine translation},
  year    = {2016},
  journal = {arXiv preprint arXiv:1609.08144},
}

% The acronym CNNs is brace-protected so sentence-casing styles do not render "cnns".
@InProceedings{Xiao.Bahri.Sohl-Dickstein.ea.2018,
  title     = {Dynamical Isometry and a Mean Field Theory of {CNNs}: How to
               Train 10,000-Layer Vanilla Convolutional Neural Networks},
  author    = {Xiao, Lechao and Bahri, Yasaman and Sohl-Dickstein, Jascha
               and Schoenholz, Samuel and Pennington, Jeffrey},
  booktitle = {International Conference on Machine Learning},
  pages     = {5393--5402},
  year      = {2018}
}

% The dataset name is brace-protected so MNIST stays in capitals under any style.
@Article{Xiao.Rasul.Vollgraf.2017,
  title   = {{Fashion-MNIST}: a novel image dataset for benchmarking
             machine learning algorithms},
  author  = {Xiao, Han and Rasul, Kashif and Vollgraf, Roland},
  journal = {arXiv preprint arXiv:1708.07747},
  year    = {2017}
}

% The company name is brace-protected so sentence-casing styles do not render "microsoft".
@InProceedings{Xiong.Wu.Alleva.ea.2018,
  title        = {The {Microsoft} 2017 conversational speech recognition
                  system},
  author       = {Xiong, Wayne and Wu, Lingfeng and Alleva, Fil and Droppo,
                  Jasha and Huang, Xuedong and Stolcke, Andreas},
  booktitle    = {2018 IEEE International Conference on Acoustics, Speech
                  and Signal Processing (ICASSP)},
  pages        = {5934--5938},
  year         = {2018},
  organization = {IEEE}
}

@inproceedings{Ye.Yin.Lee.ea.2011,
  author       = {Ye, Mao and Yin, Peifeng and Lee, Wang-Chien
                  and Lee, Dik-Lun},
  title        = {Exploiting geographical influence for collaborative
                  point-of-interest recommendation},
  year         = {2011},
  booktitle    = {Proceedings of the 34th international ACM SIGIR conference
                  on Research and development in Information Retrieval},
  pages        = {325--334},
  organization = {ACM},
}

@article{You.Gitman.Ginsburg.2017,
  author  = {You, Yang and Gitman, Igor and Ginsburg, Boris},
  title   = {Large batch training of convolutional networks},
  year    = {2017},
  journal = {arXiv preprint arXiv:1708.03888},
}

@inproceedings{Zaheer.Reddi.Sachan.ea.2018,
  author    = {Zaheer, Manzil and Reddi, Sashank
               and Sachan, Devendra and Kale, Satyen and Kumar, Sanjiv},
  title     = {Adaptive methods for nonconvex optimization},
  year      = {2018},
  booktitle = {Advances in Neural Information Processing Systems},
  pages     = {9793--9803},
}

% ADADELTA is brace-protected so sentence-casing styles do not render "Adadelta".
@Article{Zeiler.2012,
  title   = {{ADADELTA}: an adaptive learning rate method},
  author  = {Zeiler, Matthew D},
  journal = {arXiv preprint arXiv:1212.5701},
  year    = {2012}
}

@inproceedings{Zhang.Tay.Zhang.ea.2021,
  author    = {Zhang, Aston and Tay, Yi and Zhang, Shuai
               and Chan, Alvin and Luu, Anh Tuan and Hui, Siu Cheung
               and Fu, Jie},
  title     = {Beyond Fully-Connected Layers with Quaternions: Parameterization
               of Hypercomplex Multiplications with 1/n Parameters},
  year      = {2021},
  booktitle = {International Conference on Learning Representations},
}

@article{Zhang.Yao.Sun.ea.2019,
  author    = {Zhang, Shuai and Yao, Lina and Sun, Aixin and Tay, Yi},
  title     = {Deep learning based recommender system: A survey
               and new perspectives},
  year      = {2019},
  journal   = {ACM Computing Surveys (CSUR)},
  volume    = {52},
  number    = {1},
  pages     = {5},
  publisher = {ACM},
}

@article{Zhao.Zheng.Xu.ea.2019,
  author    = {Zhao, Zhong-Qiu and Zheng, Peng and Xu, Shou-tao
               and Wu, Xindong},
  title     = {Object detection with deep learning: A review},
  year      = {2019},
  journal   = {IEEE transactions on neural networks and learning systems},
  volume    = {30},
  number    = {11},
  pages     = {3212--3232},
  publisher = {IEEE},
}

@inproceedings{Zhu.Kiros.Zemel.ea.2015,
  author    = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich
               and Salakhutdinov, Ruslan and Urtasun, Raquel
               and Torralba, Antonio and Fidler, Sanja},
  title     = {Aligning books and movies: Towards story-like visual explanations
               by watching movies and reading books},
  year      = {2015},
  booktitle = {Proceedings of the IEEE international conference
               on computer vision},
  pages     = {19--27},
}

@inproceedings{Zhu.Park.Isola.ea.2017,
  author    = {Zhu, Jun-Yan and Park, Taesung and Isola, Phillip
               and Efros, Alexei A},
  title     = {Unpaired image-to-image translation using cycle-consistent
               adversarial networks},
  year      = {2017},
  booktitle = {Proceedings of the IEEE international conference
               on computer vision},
  pages     = {2223--2232},
}