% references.bib

% Please download the latest anthology.bib from the ACL Anthology:
%
% https://aclanthology.org/anthology.bib.gz

% Aho and Ullman (1972): book, volume 1.
@book{Aho:72,
  author    = {Aho, Alfred V. and Ullman, Jeffrey D.},
  title     = {The Theory of Parsing, Translation and Compiling},
  volume    = {1},
  publisher = {Prentice-Hall},
  address   = {Englewood Cliffs, NJ},
  year      = {1972}
}

% Corporate author: the double braces keep the name as one indivisible unit.
@book{APA:83,
  author    = {{American Psychological Association}},
  title     = {Publications Manual},
  publisher = {American Psychological Association},
  address   = {Washington, DC},
  year      = {1983}
}

% Chandra, Kozen and Stockmeyer (1981): journal article with DOI.
@article{Chandra:81,
  author  = {Chandra, Ashok K. and Kozen, Dexter C. and Stockmeyer, Larry J.},
  title   = {Alternation},
  journal = {Journal of the Association for Computing Machinery},
  volume  = {28},
  number  = {1},
  pages   = {114--133},
  year    = {1981},
  doi     = {10.1145/322234.322243}
}

% Andrew and Gao (2007): conference paper; {L1} is brace-protected in the title.
@inproceedings{andrew2007scalable,
  author    = {Andrew, Galen and Gao, Jianfeng},
  title     = {Scalable training of {L1}-regularized log-linear models},
  booktitle = {Proceedings of the 24th International Conference on Machine Learning},
  year      = {2007},
  pages     = {33--40}
}

% Gusfield (1997): book.
@book{Gusfield:97,
  author    = {Gusfield, Dan},
  title     = {Algorithms on Strings, Trees and Sequences},
  publisher = {Cambridge University Press},
  address   = {Cambridge, UK},
  year      = {1997}
}

% Rasooli and Tetreault (2015): arXiv report; the identifier is stored in volume.
@article{rasooli-tetrault-2015,
  author  = {Rasooli, Mohammad Sadegh and Tetreault, Joel R.},
  title   = {Yara Parser: {A} Fast and Accurate Dependency Parser},
  journal = {Computing Research Repository},
  volume  = {arXiv:1503.06733},
  year    = {2015},
  note    = {version 2},
  url     = {http://arxiv.org/abs/1503.06733}
}


% Cao and Wang (2021): arXiv preprint.
% The journal string is kept so classic styles render as before; the arXiv
% identifier is additionally exposed through structured eprint fields.
@article{cao2021attention,
  author        = {Cao, Shuyang and Wang, Lu},
  title         = {Attention Head Masking for Inference Time Content Selection in Abstractive Summarization},
  journal       = {arXiv preprint arXiv:2104.02205},
  year          = {2021},
  archivePrefix = {arXiv},
  eprint        = {2104.02205},
  url           = {https://arxiv.org/abs/2104.02205}
}

% Ando and Zhang (2005): journal article; field names normalised to lowercase.
@article{Ando2005,
  author     = {Ando, Rie Kubota and Zhang, Tong},
  title      = {A Framework for Learning Predictive Structures from Multiple Tasks and Unlabeled Data},
  journal    = {Journal of Machine Learning Research},
  volume     = {6},
  pages      = {1817--1853},
  numpages   = {37},
  month      = dec,
  year       = {2005},
  issue_date = {12/1/2005},
  publisher  = {JMLR.org},
  issn       = {1532-4435},
  acmid      = {1194905}
}

% Cheng and Lapata (2016): arXiv preprint.
% Structured eprint fields added alongside the legacy journal string.
@article{cheng2016neural,
  author        = {Cheng, Jianpeng and Lapata, Mirella},
  title         = {Neural summarization by extracting sentences and words},
  journal       = {arXiv preprint arXiv:1603.07252},
  year          = {2016},
  archivePrefix = {arXiv},
  eprint        = {1603.07252},
  url           = {https://arxiv.org/abs/1603.07252}
}

% Dlikman and Last (2016): workshop paper.
% The title carries no trailing period (the style adds its own punctuation),
% and the stray space inside the workshop acronym has been removed.
@inproceedings{dlikman2016using,
  author    = {Dlikman, Alexander and Last, Mark},
  title     = {Using Machine Learning Methods and Linguistic Features in Single-Document Extractive Summarization},
  booktitle = {DMNLP@PKDD/ECML},
  pages     = {1--8},
  year      = {2016}
}

% Nallapati, Zhou and Ma (2016): arXiv preprint.
% Structured eprint fields added alongside the legacy journal string.
@article{nallapati2016classify,
  author        = {Nallapati, Ramesh and Zhou, Bowen and Ma, Mingbo},
  title         = {Classify or select: Neural architectures for extractive document summarization},
  journal       = {arXiv preprint arXiv:1611.04244},
  year          = {2016},
  archivePrefix = {arXiv},
  eprint        = {1611.04244},
  url           = {https://arxiv.org/abs/1611.04244}
}

% Kasai et al. (2020): arXiv preprint.
% The middle initial now carries its period so unabbreviated styles print
% "Noah A. Smith"; structured eprint fields added.
@article{kasai2020deep,
  author        = {Kasai, Jungo and Pappas, Nikolaos and Peng, Hao and Cross, James and Smith, Noah A.},
  title         = {Deep encoder, shallow decoder: Reevaluating the speed-quality tradeoff in machine translation},
  journal       = {arXiv preprint arXiv:2006.10369},
  year          = {2020},
  archivePrefix = {arXiv},
  eprint        = {2006.10369},
  url           = {https://arxiv.org/abs/2006.10369}
}

% Liu, Shen and Lapata (2020): arXiv preprint.
% Structured eprint fields added alongside the legacy journal string.
@article{liu2020noisy,
  author        = {Liu, Yang and Shen, Sheng and Lapata, Mirella},
  title         = {Noisy Self-Knowledge Distillation for Text Summarization},
  journal       = {arXiv preprint arXiv:2009.07032},
  year          = {2020},
  archivePrefix = {arXiv},
  eprint        = {2009.07032},
  url           = {https://arxiv.org/abs/2009.07032}
}

% Shleifer and Rush (2020): arXiv preprint.
% The middle initial is written with its period, matching how the same author
% is spelled elsewhere in this file; structured eprint fields added.
@article{shleifer2020pre,
  author        = {Shleifer, Sam and Rush, Alexander M.},
  title         = {Pre-trained Summarization Distillation},
  journal       = {arXiv preprint arXiv:2010.13002},
  year          = {2020},
  archivePrefix = {arXiv},
  eprint        = {2010.13002},
  url           = {https://arxiv.org/abs/2010.13002}
}

% Ba and Caruana (2014): conference paper.
% booktitle spelled out in full instead of the bare venue acronym.
@inproceedings{ba2014deep,
  author    = {Ba, Jimmy and Caruana, Rich},
  title     = {Do Deep Nets Really Need to be Deep?},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2014}
}

% Romero et al. (2014): arXiv preprint.
% The model name is brace-protected in its camel-case form so styles cannot
% downcase it; structured eprint fields added.
@article{romero2014fitnets,
  author        = {Romero, Adriana and Ballas, Nicolas and Kahou, Samira Ebrahimi and Chassang, Antoine and Gatta, Carlo and Bengio, Yoshua},
  title         = {{FitNets}: Hints for thin deep nets},
  journal       = {arXiv preprint arXiv:1412.6550},
  year          = {2014},
  archivePrefix = {arXiv},
  eprint        = {1412.6550},
  url           = {https://arxiv.org/abs/1412.6550}
}


% Zagoruyko and Komodakis (2017): DBLP export; url kept exactly as exported.
@inproceedings{DBLP:conf/iclr/ZagoruykoK17,
  author    = {Zagoruyko, Sergey and Komodakis, Nikos},
  title     = {Paying More Attention to Attention: Improving the Performance of Convolutional Neural Networks via Attention Transfer},
  booktitle = {5th International Conference on Learning Representations, {ICLR} 2017, Toulon, France, April 24-26, 2017, Conference Track Proceedings},
  publisher = {OpenReview.net},
  year      = {2017},
  url       = {https://openreview.net/forum?id=Sks9\_ajex},
  timestamp = {Thu, 25 Jul 2019 14:25:41 +0200},
  biburl    = {https://dblp.org/rec/conf/iclr/ZagoruykoK17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Czarnecki et al. (2017): conference paper.
@inproceedings{czarnecki2017sobolev,
  author    = {Czarnecki, Wojciech Marian and Osindero, Simon and Jaderberg, Max and Swirszcz, Grzegorz and Pascanu, Razvan},
  title     = {Sobolev training for neural networks},
  booktitle = {Proceedings of the 31st International Conference on Neural Information Processing Systems},
  year      = {2017},
  pages     = {4281--4290}
}

% Hinton, Vinyals and Dean (2015): arXiv preprint.
% Structured eprint fields added alongside the legacy journal string.
@article{hinton2015distilling,
  author        = {Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff},
  title         = {Distilling the knowledge in a neural network},
  journal       = {arXiv preprint arXiv:1503.02531},
  year          = {2015},
  archivePrefix = {arXiv},
  eprint        = {1503.02531},
  url           = {https://arxiv.org/abs/1503.02531}
}

% Hermann et al. (2015): conference paper.
% The second author's diacritics are written as BibTeX special characters
% (hacek on c and s, acute on y and a); booktitle is spelled out in full.
@inproceedings{hermann2015teaching,
  author    = {Hermann, Karl Moritz and Ko{\v{c}}isk{\'y}, Tom{\'a}{\v{s}} and Grefenstette, Edward and Espeholt, Lasse and Kay, Will and Suleyman, Mustafa and Blunsom, Phil},
  title     = {Teaching Machines to Read and Comprehend},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2015}
}

% Sandhaus (2008): corpus release.
% The newspaper name is a proper noun and is brace-protected in the title.
% NOTE(review): volume/number/pages look like citation-scraper artefacts for
% a dataset record - verify against the publisher catalogue before relying on them.
@article{sandhaus2008new,
  author  = {Sandhaus, Evan},
  title   = {The {New York Times} Annotated Corpus},
  journal = {Linguistic Data Consortium, Philadelphia},
  volume  = {6},
  number  = {12},
  pages   = {e26752},
  year    = {2008}
}

% Napoles, Gormley and Van Durme (2012): workshop paper.
% The corpus name is capitalised and brace-protected; the middle initial
% carries its period.
@inproceedings{napoles2012annotated,
  author    = {Napoles, Courtney and Gormley, Matthew R. and Van Durme, Benjamin},
  title     = {Annotated {Gigaword}},
  booktitle = {Proceedings of the Joint Workshop on Automatic Knowledge Base Construction and Web-scale Knowledge Extraction (AKBC-WEKEX)},
  pages     = {95--100},
  year      = {2012}
}

% Pereyra et al. (2017): DBLP export.
% The fourth author's given name uses the {\L} special character so the same
% person is spelled identically across entries in this file.
@inproceedings{DBLP:conf/iclr/PereyraTCKH17,
  author    = {Gabriel Pereyra and
               George Tucker and
               Jan Chorowski and
               {\L}ukasz Kaiser and
               Geoffrey E. Hinton},
  title     = {Regularizing Neural Networks by Penalizing Confident Output Distributions},
  booktitle = {5th International Conference on Learning Representations, {ICLR} 2017,
               Toulon, France, April 24-26, 2017, Workshop Track Proceedings},
  publisher = {OpenReview.net},
  year      = {2017},
  url       = {https://openreview.net/forum?id=HyhbYrGYe},
  timestamp = {Thu, 04 Apr 2019 13:20:08 +0200},
  biburl    = {https://dblp.org/rec/conf/iclr/PereyraTCKH17.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Furlanello et al. (2018): conference paper.
% PMLR is recorded as the publisher rather than as a sponsoring organization.
@inproceedings{furlanello2018born,
  author    = {Furlanello, Tommaso and Lipton, Zachary and Tschannen, Michael and Itti, Laurent and Anandkumar, Anima},
  title     = {Born again neural networks},
  booktitle = {International Conference on Machine Learning},
  pages     = {1607--1616},
  year      = {2018},
  publisher = {PMLR}
}

% Xie et al. (2020): conference paper.
% The method name and the dataset name are proper nouns and are
% brace-protected; the last author's middle initial carries its period.
@inproceedings{xie2020self,
  author    = {Xie, Qizhe and Luong, Minh-Thang and Hovy, Eduard and Le, Quoc V.},
  title     = {Self-training with {Noisy Student} improves {ImageNet} classification},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {10687--10698},
  year      = {2020}
}

% He et al. (2019): conference paper.
@inproceedings{he2019revisiting,
  author    = {He, Junxian and Gu, Jiatao and Shen, Jiajun and Ranzato, Marc'Aurelio},
  title     = {Revisiting Self-Training for Neural Sequence Generation},
  booktitle = {International Conference on Learning Representations},
  year      = {2019}
}

% Zhang et al. (2020): conference paper.
% The model acronym is upper-cased and brace-protected; PMLR is recorded as
% the publisher rather than as a sponsoring organization.
@inproceedings{zhang2020pegasus,
  author    = {Zhang, Jingqing and Zhao, Yao and Saleh, Mohammad and Liu, Peter},
  title     = {{PEGASUS}: Pre-training with extracted gap-sentences for abstractive summarization},
  booktitle = {International Conference on Machine Learning},
  pages     = {11328--11339},
  year      = {2020},
  publisher = {PMLR}
}

% Bao et al. (2020): conference paper; the author list is truncated with "and others".
% The model name is brace-protected in its mixed-case form; PMLR is recorded
% as the publisher rather than as a sponsoring organization.
@inproceedings{bao2020unilmv2,
  author    = {Bao, Hangbo and Dong, Li and Wei, Furu and Wang, Wenhui and Yang, Nan and Liu, Xiaodong and Wang, Yu and Gao, Jianfeng and Piao, Songhao and Zhou, Ming and others},
  title     = {{UniLMv2}: Pseudo-masked language models for unified language model pre-training},
  booktitle = {International Conference on Machine Learning},
  pages     = {642--652},
  year      = {2020},
  publisher = {PMLR}
}

% Vaswani et al. (2017): conference paper.
% The middle initial of the sixth author carries its period so unabbreviated
% styles do not print a bare letter.
@inproceedings{vaswani2017attention,
  author    = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title     = {Attention is all you need},
  booktitle = {Proceedings of the 31st International Conference on Neural Information Processing Systems},
  pages     = {6000--6010},
  year      = {2017}
}

% Bahdanau, Cho and Bengio (2015): DBLP export, with proceedings editors.
@inproceedings{DBLP:journals/corr/BahdanauCB14,
  author    = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
  editor    = {Bengio, Yoshua and LeCun, Yann},
  title     = {Neural Machine Translation by Jointly Learning to Align and Translate},
  booktitle = {3rd International Conference on Learning Representations, {ICLR} 2015, San Diego, CA, USA, May 7-9, 2015, Conference Track Proceedings},
  year      = {2015},
  url       = {http://arxiv.org/abs/1409.0473},
  timestamp = {Wed, 17 Jul 2019 10:40:54 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/BahdanauCB14.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Raffel et al. (2020): journal article.
% The last author's middle initial carries its period, matching how the same
% author is spelled elsewhere in this file.
@article{raffel2020exploring,
  author  = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J.},
  title   = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
  journal = {Journal of Machine Learning Research},
  volume  = {21},
  pages   = {1--67},
  year    = {2020}
}

% Song et al. (2019): conference paper.
% The acronym is brace-protected so sentence-casing styles cannot downcase
% it; PMLR is recorded as the publisher rather than as a sponsoring organization.
@inproceedings{song2019mass,
  author    = {Song, Kaitao and Tan, Xu and Qin, Tao and Lu, Jianfeng and Liu, Tie-Yan},
  title     = {{MASS}: Masked Sequence to Sequence Pre-training for Language Generation},
  booktitle = {International Conference on Machine Learning},
  pages     = {5926--5936},
  year      = {2019},
  publisher = {PMLR}
}

% Sanh et al. (2019): DBLP export of an arXiv report.
% Both model names are brace-protected as whole words; the colon sits
% outside the brace group.
@article{DBLP:journals/corr/abs-1910-01108,
  author    = {Victor Sanh and
               Lysandre Debut and
               Julien Chaumond and
               Thomas Wolf},
  title     = {{DistilBERT}, a distilled version of {BERT}: smaller, faster, cheaper
               and lighter},
  journal   = {CoRR},
  volume    = {abs/1910.01108},
  year      = {2019},
  url       = {http://arxiv.org/abs/1910.01108},
  archivePrefix = {arXiv},
  eprint    = {1910.01108},
  timestamp = {Tue, 02 Jun 2020 12:48:59 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1910-01108.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

% Wang et al. (2020): arXiv preprint.
% The model name is brace-protected in its mixed-case form; structured
% eprint fields added alongside the legacy journal string.
@article{wang2020minilm,
  author        = {Wang, Wenhui and Wei, Furu and Dong, Li and Bao, Hangbo and Yang, Nan and Zhou, Ming},
  title         = {{MiniLM}: Deep self-attention distillation for task-agnostic compression of pre-trained transformers},
  journal       = {arXiv preprint arXiv:2002.10957},
  year          = {2020},
  archivePrefix = {arXiv},
  eprint        = {2002.10957},
  url           = {https://arxiv.org/abs/2002.10957}
}

% Wang et al. (2020): arXiv preprint.
% The model name is brace-protected so sentence-casing styles keep its mixed
% case; structured eprint fields added alongside the legacy journal string.
@article{wang2020minilmv2,
  author        = {Wang, Wenhui and Bao, Hangbo and Huang, Shaohan and Dong, Li and Wei, Furu},
  title         = {{MiniLMv2}: Multi-Head Self-Attention Relation Distillation for Compressing Pretrained Transformers},
  journal       = {arXiv preprint arXiv:2012.15828},
  year          = {2020},
  archivePrefix = {arXiv},
  eprint        = {2012.15828},
  url           = {https://arxiv.org/abs/2012.15828}
}


% Kingma and Ba (2014): arXiv preprint.
% The first author's middle initial carries its period; structured eprint
% fields added alongside the legacy journal string.
@article{kingma2014adam,
  author        = {Kingma, Diederik P. and Ba, Jimmy},
  title         = {Adam: A method for stochastic optimization},
  journal       = {arXiv preprint arXiv:1412.6980},
  year          = {2014},
  archivePrefix = {arXiv},
  eprint        = {1412.6980},
  url           = {https://arxiv.org/abs/1412.6980}
}


% Radford et al. (2019): recorded as an article in "OpenAI blog".
@article{radford2019language,
  author  = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
  title   = {Language models are unsupervised multitask learners},
  journal = {OpenAI blog},
  year    = {2019},
  volume  = {1},
  number  = {8},
  pages   = {9}
}


% Nenkova and McKeown (2011): journal article.
@article{Nenkova:McKeown:2011,
  author  = {Nenkova, Ani and McKeown, Kathleen},
  title   = {Automatic Summarization},
  journal = {Foundations and Trends in Information Retrieval},
  volume  = {5},
  number  = {2--3},
  pages   = {103--233},
  year    = {2011}
}

% Gu et al. (2017): arXiv preprint.
% The fourth author's glued initials are separated and given periods so
% BibTeX parses them as two initials; structured eprint fields added.
@article{gu2017non,
  author        = {Gu, Jiatao and Bradbury, James and Xiong, Caiming and Li, Victor O. K. and Socher, Richard},
  title         = {Non-autoregressive neural machine translation},
  journal       = {arXiv preprint arXiv:1711.02281},
  year          = {2017},
  archivePrefix = {arXiv},
  eprint        = {1711.02281},
  url           = {https://arxiv.org/abs/1711.02281}
}

% Deng et al. (2009): conference paper.
% The dataset name and the IEEE acronym are restored to their proper case and
% brace-protected; IEEE is recorded as the publisher.
@inproceedings{deng2009imagenet,
  author    = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
  title     = {{ImageNet}: A large-scale hierarchical image database},
  booktitle = {2009 {IEEE} Conference on Computer Vision and Pattern Recognition},
  pages     = {248--255},
  year      = {2009},
  publisher = {IEEE}
}

% Katsumata and Komachi (2020): ACL Anthology export.
@inproceedings{katsumata-komachi-2020-stronger,
  author    = {Katsumata, Satoru and Komachi, Mamoru},
  title     = {Stronger Baselines for Grammatical Error Correction Using a Pretrained Encoder-Decoder Model},
  booktitle = {Proceedings of the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing},
  year      = {2020},
  month     = dec,
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  pages     = {827--832},
  url       = {https://www.aclweb.org/anthology/2020.aacl-main.83}
}

% Devlin et al. (2019): ACL Anthology export.
@inproceedings{devlin-etal-2019-bert,
  author    = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title     = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)},
  year      = {2019},
  month     = jun,
  address   = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  pages     = {4171--4186},
  doi       = {10.18653/v1/N19-1423},
  url       = {https://www.aclweb.org/anthology/N19-1423}
}

% Durrett, Berg-Kirkpatrick and Klein (2016): ACL Anthology export.
@inproceedings{durrett-etal-2016-learning,
  author    = {Durrett, Greg and Berg-Kirkpatrick, Taylor and Klein, Dan},
  title     = {Learning-Based Single-Document Summarization with Compression and Anaphoricity Constraints},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  year      = {2016},
  month     = aug,
  address   = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  pages     = {1998--2008},
  doi       = {10.18653/v1/P16-1188},
  url       = {https://www.aclweb.org/anthology/P16-1188}
}

% Jiao et al. (2020): ACL Anthology export.
@inproceedings{jiao-etal-2020-tinybert,
  author    = {Jiao, Xiaoqi and Yin, Yichun and Shang, Lifeng and Jiang, Xin and Chen, Xiao and Li, Linlin and Wang, Fang and Liu, Qun},
  title     = {{T}iny{BERT}: Distilling {BERT} for Natural Language Understanding},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020},
  year      = {2020},
  month     = nov,
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {4163--4174},
  doi       = {10.18653/v1/2020.findings-emnlp.372},
  url       = {https://www.aclweb.org/anthology/2020.findings-emnlp.372}
}

% Kim and Rush (2016): ACL Anthology export.
@inproceedings{kim-rush-2016-sequence,
  author    = {Kim, Yoon and Rush, Alexander M.},
  title     = {Sequence-Level Knowledge Distillation},
  booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing},
  year      = {2016},
  month     = nov,
  address   = {Austin, Texas},
  publisher = {Association for Computational Linguistics},
  pages     = {1317--1327},
  doi       = {10.18653/v1/D16-1139},
  url       = {https://www.aclweb.org/anthology/D16-1139}
}

% Lewis et al. (2020): ACL Anthology export.
@inproceedings{lewis-etal-2020-bart,
  author    = {Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Veselin and Zettlemoyer, Luke},
  title     = {{BART}: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  year      = {2020},
  month     = jul,
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {7871--7880},
  doi       = {10.18653/v1/2020.acl-main.703},
  url       = {https://www.aclweb.org/anthology/2020.acl-main.703}
}

% Lin (2004): ACL Anthology export.
@inproceedings{lin-2004-rouge,
  author    = {Lin, Chin-Yew},
  title     = {{ROUGE}: A Package for Automatic Evaluation of Summaries},
  booktitle = {Text Summarization Branches Out},
  year      = {2004},
  month     = jul,
  address   = {Barcelona, Spain},
  publisher = {Association for Computational Linguistics},
  pages     = {74--81},
  url       = {https://www.aclweb.org/anthology/W04-1013}
}

% Liu and Lapata (2019): ACL Anthology export.
@inproceedings{liu-lapata-2019-text,
  author    = {Liu, Yang and Lapata, Mirella},
  title     = {Text Summarization with Pretrained Encoders},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  year      = {2019},
  month     = nov,
  address   = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  pages     = {3730--3740},
  doi       = {10.18653/v1/D19-1387},
  url       = {https://www.aclweb.org/anthology/D19-1387}
}

% Narayan, Cohen and Lapata (2018): ACL Anthology export; month is a macro concatenation.
@inproceedings{narayan-etal-2018-dont,
  author    = {Narayan, Shashi and Cohen, Shay B. and Lapata, Mirella},
  title     = {Don{'}t Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  year      = {2018},
  month     = oct # "-" # nov,
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  pages     = {1797--1807},
  doi       = {10.18653/v1/D18-1206},
  url       = {https://www.aclweb.org/anthology/D18-1206}
}

% Ott et al. (2019): ACL Anthology export.
@inproceedings{ott-etal-2019-fairseq,
  author    = {Ott, Myle and Edunov, Sergey and Baevski, Alexei and Fan, Angela and Gross, Sam and Ng, Nathan and Grangier, David and Auli, Michael},
  title     = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling},
  booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics (Demonstrations)},
  year      = {2019},
  month     = jun,
  address   = {Minneapolis, Minnesota},
  publisher = {Association for Computational Linguistics},
  pages     = {48--53},
  doi       = {10.18653/v1/N19-4009},
  url       = {https://www.aclweb.org/anthology/N19-4009}
}

% Rajpurkar et al. (2016): ACL Anthology export.
@inproceedings{rajpurkar-etal-2016-squad,
  author    = {Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy},
  title     = {{SQ}u{AD}: 100,000+ Questions for Machine Comprehension of Text},
  booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing},
  year      = {2016},
  month     = nov,
  address   = {Austin, Texas},
  publisher = {Association for Computational Linguistics},
  pages     = {2383--2392},
  doi       = {10.18653/v1/D16-1264},
  url       = {https://www.aclweb.org/anthology/D16-1264}
}

% See, Liu and Manning (2017): ACL Anthology export.
@inproceedings{see-etal-2017-get,
  author    = {See, Abigail and Liu, Peter J. and Manning, Christopher D.},
  title     = {Get To The Point: Summarization with Pointer-Generator Networks},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  year      = {2017},
  month     = jul,
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {1073--1083},
  doi       = {10.18653/v1/P17-1099},
  url       = {https://www.aclweb.org/anthology/P17-1099}
}

% Sennrich, Haddow and Birch (2016): ACL Anthology export.
@inproceedings{sennrich-etal-2016-neural,
  author    = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
  title     = {Neural Machine Translation of Rare Words with Subword Units},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  year      = {2016},
  month     = aug,
  address   = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  pages     = {1715--1725},
  doi       = {10.18653/v1/P16-1162},
  url       = {https://www.aclweb.org/anthology/P16-1162}
}

% Sun et al. (2020): ACL Anthology export.
@inproceedings{sun-etal-2020-mobilebert,
  author    = {Sun, Zhiqing and Yu, Hongkun and Song, Xiaodan and Liu, Renjie and Yang, Yiming and Zhou, Denny},
  title     = {{M}obile{BERT}: a Compact Task-Agnostic {BERT} for Resource-Limited Devices},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  year      = {2020},
  month     = jul,
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {2158--2170},
  doi       = {10.18653/v1/2020.acl-main.195},
  url       = {https://www.aclweb.org/anthology/2020.acl-main.195}
}

% Wang et al. (2018): ACL Anthology export.
@inproceedings{wang-etal-2018-glue,
  author    = {Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
  title     = {{GLUE}: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding},
  booktitle = {Proceedings of the 2018 {EMNLP} Workshop {B}lackbox{NLP}: Analyzing and Interpreting Neural Networks for {NLP}},
  year      = {2018},
  month     = nov,
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  pages     = {353--355},
  doi       = {10.18653/v1/W18-5446},
  url       = {https://www.aclweb.org/anthology/W18-5446}
}

% Xu et al. (2020): ACL Anthology export.
@inproceedings{xu-etal-2020-bert,
  author    = {Xu, Canwen and Zhou, Wangchunshu and Ge, Tao and Wei, Furu and Zhou, Ming},
  title     = {{BERT}-of-Theseus: Compressing {BERT} by Progressive Module Replacing},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year      = {2020},
  month     = nov,
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {7859--7869},
  doi       = {10.18653/v1/2020.emnlp-main.633},
  url       = {https://www.aclweb.org/anthology/2020.emnlp-main.633}
}

% Denkowski and Neubig (2017): ACL Anthology export.
@inproceedings{denkowski-neubig-2017-stronger,
  author    = {Denkowski, Michael and Neubig, Graham},
  title     = {Stronger Baselines for Trustable Results in Neural Machine Translation},
  booktitle = {Proceedings of the First Workshop on Neural Machine Translation},
  year      = {2017},
  month     = aug,
  address   = {Vancouver},
  publisher = {Association for Computational Linguistics},
  pages     = {18--27},
  doi       = {10.18653/v1/W17-3203},
  url       = {https://www.aclweb.org/anthology/W17-3203}
}

% Edunov et al. (2018): ACL Anthology export, including the abstract.
@inproceedings{edunov-etal-2018-understanding,
  author    = {Edunov, Sergey and Ott, Myle and Auli, Michael and Grangier, David},
  title     = {Understanding Back-Translation at Scale},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  year      = {2018},
  month     = oct # "-" # nov,
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  pages     = {489--500},
  doi       = {10.18653/v1/D18-1045},
  url       = {https://aclanthology.org/D18-1045},
  abstract  = {An effective method to improve neural machine translation with monolingual data is to augment the parallel training corpus with back-translations of target language sentences. This work broadens the understanding of back-translation and investigates a number of methods to generate synthetic source sentences. We find that in all but resource poor settings back-translations obtained via sampling or noised beam outputs are most effective. Our analysis shows that sampling or noisy synthetic data gives a much stronger training signal than data generated by beam or greedy search. We also compare how synthetic data compares to genuine bitext and study various domain effects. Finally, we scale to hundreds of millions of monolingual sentences and achieve a new state of the art of 35 BLEU on the WMT{'}14 English-German test set.}
}

% Holtzman et al. (2019): arXiv preprint.
% Structured eprint fields added alongside the legacy journal string.
@article{holtzman2019curious,
  author        = {Holtzman, Ari and Buys, Jan and Du, Li and Forbes, Maxwell and Choi, Yejin},
  title         = {The curious case of neural text degeneration},
  journal       = {arXiv preprint arXiv:1904.09751},
  year          = {2019},
  archivePrefix = {arXiv},
  eprint        = {1904.09751},
  url           = {https://arxiv.org/abs/1904.09751}
}


% Bojar et al. (2016): shared-task findings paper.
% month uses the predefined macro instead of a literal string, accented
% letters are written as BibTeX special characters, and the url points at the
% current anthology host.
@InProceedings{bojar-EtAl:2016:WMT1,
  author    = {Bojar, Ond{\v{r}}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huck, Matthias and Jimeno Yepes, Antonio and Koehn, Philipp and Logacheva, Varvara and Monz, Christof and Negri, Matteo and N{\'e}v{\'e}ol, Aur{\'e}lie and Neves, Mariana and Popel, Martin and Post, Matt and Rubino, Raphael and Scarton, Carolina and Specia, Lucia and Turchi, Marco and Verspoor, Karin and Zampieri, Marcos},
  title     = {Findings of the 2016 Conference on Machine Translation},
  booktitle = {Proceedings of the First Conference on Machine Translation},
  month     = aug,
  year      = {2016},
  address   = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  pages     = {131--198},
  url       = {https://aclanthology.org/W16-2301}
}

% Xu, Desai and Durrett (2020): ACL Anthology export.
@inproceedings{xu-etal-2020-understanding-neural,
  author    = {Xu, Jiacheng and Desai, Shrey and Durrett, Greg},
  title     = {Understanding Neural Abstractive Summarization Models via Uncertainty},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year      = {2020},
  month     = nov,
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {6275--6281},
  doi       = {10.18653/v1/2020.emnlp-main.508},
  url       = {https://aclanthology.org/2020.emnlp-main.508}
}

% Davison and Hinkley (1997): book.
% A series name accompanies the existing number (classic BibTeX warns about a
% number without a series), and title and publisher are fully capitalised.
% NOTE(review): series name supplied from memory - confirm against the
% publisher's catalogue.
@book{davison1997bootstrap,
  author    = {Davison, Anthony Christopher and Hinkley, David Victor},
  title     = {Bootstrap Methods and Their Application},
  series    = {Cambridge Series in Statistical and Probabilistic Mathematics},
  number    = {1},
  publisher = {Cambridge University Press},
  year      = {1997}
}

% Ahn et al. (2022): arXiv preprint; the author list is truncated with "and others".
% The pronoun "I" is capitalised and brace-protected so sentence-casing
% styles cannot downcase it; structured eprint fields added.
@article{ahn2022can,
  author        = {Ahn, Michael and Brohan, Anthony and Brown, Noah and Chebotar, Yevgen and Cortes, Omar and David, Byron and Finn, Chelsea and Gopalakrishnan, Keerthana and Hausman, Karol and Herzog, Alex and others},
  title         = {Do As {I} Can, Not As {I} Say: Grounding Language in Robotic Affordances},
  journal       = {arXiv preprint arXiv:2204.01691},
  year          = {2022},
  archivePrefix = {arXiv},
  eprint        = {2204.01691},
  url           = {https://arxiv.org/abs/2204.01691}
}