biblio.bib


@String{today = {2024-04-23}}

@book{art-mp,
	author = {Herlihy, Maurice and Shavit, Nir},
	title = {The Art of Multiprocessor Programming, Revised Reprint},
	year = {2012},
	isbn = {9780123973375},
	publisher = {Morgan Kaufmann Publishers Inc.},
	address = {San Francisco, CA, USA},
	edition = {1st edition},
	abstract = {Revised and updated with improvements conceived in parallel programming courses, The Art of Multiprocessor Programming is an authoritative guide to multicore programming. It introduces a higher level set of software development skills than that needed for efficient single-core programming. This book provides comprehensive coverage of the new principles, algorithms, and tools necessary for effective multiprocessor programming. Students and professionals alike will benefit from thorough coverage of key multiprocessor programming issues. This revised edition incorporates much-demanded updates throughout the book, based on feedback and corrections reported from classrooms since 2008 Learn the fundamentals of programming multiple threads accessing shared memory Explore mainstream concurrent data structures and the key elements of their design, as well as synchronization techniques from simple locks to transactional memory systems Visit the companion site and download source code, example Java programs, and materials to support and enhance the learning experience Table of Contents 1. Introduction 2. Mutual Exclusion 3. Concurrent Objects and Linearization 4. Foundations of Shared Memory 5. The Relative Power of Synchronization Methods 6. The Universality of Consensus 7. Spin Locks and Contention 8. Monitors and Blocking Synchronization 9. Linked Lists: the Role of Locking 10. Concurrent Queues and the ABA Problem 11. Concurrent Stacks and Elimination 12. Counting, Sorting and Distributed Coordination 13. Concurrent Hashing and Natural Parallelism 14. Skiplists and Balanced Search 15. Priority Queues 16. Futures, Scheduling and Work Distribution 17. Barriers 18. Transactional Memory Appendices}
}

@book{the-art-vol-2,
	author = {Knuth, Donald E.},
	title = {The art of computer programming,  volume 3: sorting and searching},
	edition = {2nd edition},
	year = {1998},
	isbn = {0201896850},
	publisher = {Addison Wesley Longman Publishing Co., Inc.},
	address = {USA}
}

@online{dabeaz-gil,
	author = {Dave Beazley},
	title = {An Inside Look at the GIL Removal Patch of Lore},
	date = {2011},
	url = {https://dabeaz.blogspot.com/2011/08/inside-look-at-gil-removal-patch-of.html},
	urldate = today,
}

@inproceedings{biased-refcounting,
	author = {Choi, Jiho and Shull, Thomas and Torrellas, Josep},
	title = {Biased reference counting: minimizing atomic operations in garbage collection},
	year = {2018},
	isbn = {9781450359863},
	publisher = {Association for Computing Machinery},
	address = {New York, NY, USA},
	url = {https://doi.org/10.1145/3243176.3243195},
	doi = {10.1145/3243176.3243195},
	abstract = {Reference counting (RC) is one of the two fundamental approaches to garbage collection. It has the desirable characteristics of low memory overhead and short pause times, which are key in today's interactive mobile platforms. However, RC has a higher execution time overhead than its counterpart, tracing garbage collection. The reason is that RC implementations maintain per-object counters, which must be continually updated. In particular, the execution time overhead is high in environments where low memory overhead is critical and, therefore, non-deferred RC is used. This is because the counter updates need to be performed atomically.To address this problem, this paper proposes a novel algorithm called Biased Reference Counting (BRC), which significantly improves the performance of non-deferred RC. BRC is based on the observation that most objects are only accessed by a single thread, which allows most RC operations to be performed non-atomically. BRC leverages this by biasing each object towards a specific thread, and keeping two counters for each object --- one updated by the owner thread and another updated by the other threads. This allows the owner thread to perform RC operations non-atomically, while the other threads update the second counter atomically.We implement BRC in the Swift programming language runtime, and evaluate it with client and server programs. We find that BRC makes each RC operation more than twice faster in the common case. As a result, BRC reduces the average execution time of client programs by 22.5\%, and boosts the average throughput of server programs by 7.3\%.},
	booktitle = {Proceedings of the 27th International Conference on Parallel Architectures and Compilation Techniques},
	articleno = {35},
	numpages = {12},
	keywords = {garbage collection, reference counting, swift},
	location = {Limassol, Cyprus},
	series = {PACT '18}
}

@inproceedings{deferred-refcounting,
	author = {Anderson, Daniel and Blelloch, Guy E. and Wei, Yuanhao},
	title = {Concurrent deferred reference counting with constant-time overhead},
	year = {2021},
	isbn = {9781450383912},
	publisher = {Association for Computing Machinery},
	address = {New York, NY, USA},
	url = {https://doi.org/10.1145/3453483.3454060},
	doi = {10.1145/3453483.3454060},
	abstract = {We present a safe automatic memory reclamation approach for concurrent programs, and show that it is both theoretically and practically efficient. Our approach combines ideas from referencing counting and hazard pointers in a novel way to implement concurrent reference counting with wait-free, constant-time overhead. It overcomes the limitations of previous approaches by significantly reducing modifications to, and hence contention on, the reference counts. Furthermore, it is safer and easier to use than manual approaches. Our technique involves using a novel generalization of hazard pointers to defer reference-count decrements until no other process can be incrementing them, and to defer or elide reference-count increments for short-lived references.  We have implemented the approach as a C++ library and compared it experimentally to several methods including existing atomic reference-counting libraries and state-of-the art manual techniques. Our results indicate that our technique is faster than existing reference-counting implementations, and competitive with manual memory reclamation techniques. More importantly, it is significantly safer than manual techniques since objects are reclaimed automatically.},
	booktitle = {Proceedings of the 42nd ACM SIGPLAN International Conference on Programming Language Design and Implementation},
	pages = {526–541},
	numpages = {16},
	keywords = {automatic memory reclamation, concurrent algorithms, wait-free},
	location = {Virtual, Canada},
	series = {PLDI 2021}
}

@online{qsbr,
	author = {Gross, Sam},
	title = {Add delayed reclamation mechanism for free-threaded build (QSBR)},
	date = {2024-02-06},
	url = {https://github.com/python/cpython/issues/115103},
	urldate = today,
}

@online{pep703,
	author = {Gross, Sam},
	title = {PEP 703 -- Making the Global Interpreter Lock Optional in CPython},
	date = {2023-05-04},
	url = {https://peps.python.org/pep-0703/},
	urldate = today,
}

@online{pep554,
	author = {Snow, Eric},
	title = {PEP 554 -- Multiple Interpreters in the Stdlib},
	date = {2017-09-07},
	url = {https://peps.python.org/pep-0554/},
	urldate = today,
}

@online{pep734,
	author = {Snow, Eric},
	title = {PEP 734 -- Multiple Interpreters in the Stdlib},
	date = {2023-11-06},
	url = {https://peps.python.org/pep-0734/},
	urldate = today,
}

@online{pep684,
	author = {Snow, Eric},
	title = {PEP 684 -- A Per-Interpreter GIL},
	date = {2022-03-08},
	url = {https://peps.python.org/pep-0684/},
	urldate = today,
}

@online{pep412,
	author = {Shannon, Mark},
	title = {PEP 412 -- Key-Sharing Dictionary},
	date = {2012-02-08},
	url = {https://peps.python.org/pep-0412/},
	urldate = today,
}

@online{gross-doc,
	author = {Gross, Sam},
	title = {Multithreaded Python without the GIL},
	url = {https://docs.google.com/document/d/18CXhDb1ygxg-YXNBJNzfzZsDFosB5e6BfnXLlejd9l0/edit},
	date = {2021-10-05},
	urldate = today,
}

@inproceedings{mimalloc,
	author = {Leijen, Daan and Zorn, Benjamin and de Moura, Leonardo},
	editor = {Lin, Anthony Widjaja},
	title = {Mimalloc: Free List Sharding in Action},
	booktitle = {Programming Languages and Systems},
	year = {2019},
	publisher = {Springer International Publishing},
	address = {Cham},
	pages = {244--265},
	abstract = {Modern memory allocators have to balance many simultaneous demands, including performance, security, the presence of concurrency, and application-specific demands depending on the context of their use. One increasing use-case for allocators is as back-end implementations of languages, such as Swift and Python, that use reference counting to automatically deallocate objects. We present mimalloc, a memory allocator that effectively balances these demands, shows significant performance advantages over existing allocators, and is tailored to support languages that rely on the memory allocator as a backend for reference counting. Mimalloc combines several innovations to achieve this result. First, it uses three page-local sharded free lists to increase locality, avoid contention, and support a highly-tuned allocate and free fast path. These free lists also support temporal cadence, which allows the allocator to predictably leave the fast path for regular maintenance tasks such as supporting deferred freeing, handling frees from non-local threads, etc. While influenced by the allocation workload of the reference-counted Lean and Koka programming language, we show that mimalloc has superior performance to modern commercial memory allocators, including tcmalloc and jemalloc, with speed improvements of 7{\%} and 14{\%}, respectively, on redis, and consistently out performs over a wide range of sequential and concurrent benchmarks. Allocators tailored to provide an efficient runtime for reference-counting languages reduce the implementation burden on developers and encourage the creation of innovative new language designs.},
	isbn = {978-3-030-34175-6},
	url={https://api.semanticscholar.org/CorpusID:198363081},
}

@online{hettinger-dict,
	author = {Hettinger, Raymond},
	title = {More compact dictionaries with faster iteration},
	date = {2021-12-09},
	url = {https://mail.python.org/archives/list/python-dev@python.org/thread/V342SDL2AUMVCTAODAALJWW2PLDYS5GX/},
	urldate = today,
}

@online{dict-notes,
	author = {{CPython contributors}},
	title = {dictnotes.txt},
	url = {https://github.com/python/cpython/blob/23950beff84c39d50f48011e930f4c6ebf32fc73/Objects/dictnotes.txt},
	urldate = today,
}

@online{dict-comment-design,
	author = {{CPython contributors}},
	title = {dictobject.c comment on its design},
	url = {https://github.com/python/cpython/blob/23950beff84c39d50f48011e930f4c6ebf32fc73/Objects/dictobject.c#L11},
	urldate = today,
}

@online{dict-comment-hash,
	author = {{CPython contributors}},
	title = {dictobject.c comment on hashing function},
	url = {https://github.com/python/cpython/blob/23950beff84c39d50f48011e930f4c6ebf32fc73/Objects/dictobject.c#L262},
	urldate = today,
}

@article{maier,
	author = {Maier, Tobias and Sanders, Peter and Dementiev, Roman},
	title = {Concurrent Hash Tables: Fast and General(?)!},
	year = {2019},
	issue_date = {December 2018},
	publisher = {Association for Computing Machinery},
	address = {New York, NY, USA},
	volume = {5},
	number = {4},
	issn = {2329-4949},
	url = {https://doi.org/10.1145/3309206},
	doi = {10.1145/3309206},
	abstract = {Concurrent hash tables are one of the most important concurrent data structures, which are used in numerous applications. For some applications, it is common that hash table accesses dominate the execution time. To efficiently solve these problems in parallel, we need implementations that achieve speedups in highly concurrent scenarios. Unfortunately, currently available concurrent hashing libraries are far away from this requirement, in particular, when adaptively sized tables are necessary or contention on some elements occurs.Our starting point for better performing data structures is a fast and simple lock-free concurrent hash table based on linear probing that is, however, limited to word-sized key-value types and does not support dynamic size adaptation. We explain how to lift these limitations in a provably scalable way and demonstrate that dynamic growing has a performance overhead comparable to the same generalization in sequential hash tables.We perform extensive experiments comparing the performance of our implementations with six of the most widely used concurrent hash tables. Ours are considerably faster than the best algorithms with similar restrictions and an order of magnitude faster than the best more general tables. In some extreme cases, the difference even approaches four orders of magnitude.All our implementations discussed in this publication can be found on github [17].},
	journal = {ACM Trans. Parallel Comput.},
	month = {2},
	articleno = {16},
	numpages = {32},
	keywords = {Concurrency, dynamic data structures, experimental analysis, hash table, lock-freedom, transactional memory}
}

@thesis{bolt,
	author = {Kahssay, Endrias},
	title = {A fast concurrent and resizable Robin Hood hash table},
	type = {Graduate Thesis},
	institution = {Massachusetts Institute of Technology. Department of Electrical Engineering and Computer Science},
	date = {2021},
	OPTurl = {https://hdl.handle.net/1721.1/130693},
	OPTurldate = today,
}

@thesis{robin-hood-eval,
	author = {Dahlberg, Patrick},
	title = {Time evaluation of lookups with Robin Hood Hashing and Linear Probing},
	type = {Bachelor Thesis},
	institution = {Umeå University},
	date = {2023},
	OPTurl = {https://hdl.handle.net/1721.1/130693},
	OPTurldate = today,
}

@article{robin-hood,
	title={Robin hood hashing},
	author={Pedro Celis and Per-{\AA}ke Larson and J. Ian Munro},
	journal={26th Annual Symposium on Foundations of Computer Science (sfcs 1985)},
	year={1985},
	pages={281-288},
	url={https://api.semanticscholar.org/CorpusID:17447073}
}

@inproceedings{dramhit,
	author = {Narayanan, Vikram and Detweiler, David and Huang, Tianjiao and Burtsev, Anton},
	title = {DRAMHiT: A Hash Table Architected for the Speed of DRAM},
	year = {2023},
	isbn = {9781450394871},
	publisher = {Association for Computing Machinery},
	address = {New York, NY, USA},
	url = {https://doi.org/10.1145/3552326.3587457},
	doi = {10.1145/3552326.3587457},
	abstract = {Despite decades of innovation, existing hash tables fail to achieve peak performance on modern hardware. Built around a relatively simple computation, i.e., a hash function, which in most cases takes only a handful of CPU cycles, hash tables should only be limited by the throughput of the memory subsystem. Unfortunately, due to the inherently random memory access pattern and the contention across multiple threads, existing hash tables spend most of their time waiting for the memory subsystem to serve cache misses and coherence requests.DRAMHiT is a new hash table designed to work at the speed of DRAM. Architecting for performance, we embrace the fact that modern machines are distributed systems---while the latency of communication between the cores is much lower than in a traditional network, it is still dominant for the hash table workload. We design DRAMHiT to apply a range of optimizations typical for a distributed system: asynchronous interface, fully-prefetched access, batching with out-of-order completion, and partitioned design with a low-overhead, scalable delegation scheme. DRAMHiT never touches unprefetched memory and minimizes the penalty of coherence requests and atomic instructions. These optimizations allow DRAMHiT to operate close to the speed of DRAM. On uniform key distributions, DRAMHiT achieves 973Mops for reads and 792Mops for writes on 64-thread Intel servers and 1192Mops and 1052Mops on 128-thread AMD machines; hence, outperforming existing lock-free designs by nearly a factor of two.},
	booktitle = {Proceedings of the Eighteenth European Conference on Computer Systems},
	pages = {817–834},
	numpages = {18},
	keywords = {hash tables, concurrency, memory subsystem},
	location = {Rome, Italy},
	series = {EuroSys '23}
}

@article{hesselink,
	title={Lock-free dynamic hash tables with open addressing},
	author={Hui Gao and Jan Friso Groote and Wim H. Hesselink},
	journal={Distributed Computing},
	year={2003},
	volume={18},
	pages={21-42},
	url={https://api.semanticscholar.org/CorpusID:17755299}
}

@online{cereggii,
	title = {cereggii},
	author = {Daniele Parmeggiani},
	subtitle = {Thread synchronization utilities for Python},
	url = {https://github.com/dpdani/cereggii},
	date = {2023-09-19},
}

@online{peniocereus-greggii,
	title = {Peniocereus greggii},
	author = {{Wikipedia contributors}},
	url = {https://en.wikipedia.org/wiki/Peniocereus_greggii},
	urldate = today,
}

@online{dict-metrics,
	title = {Collecting metrics on built-in dictionary usage},
	author = {Daniele Parmeggiani},
	date = {2023-08-29},
	url = {https://github.com/dpdani/cpython/tree/3.11},
	urldate = today,
}

@article{cas-costs,
	title={Evaluating the Cost of Atomic Operations on Modern Architectures},
	author={Hermann Schweizer and Maciej Besta and Torsten Hoefler},
	journal={2015 International Conference on Parallel Architecture and Compilation (PACT)},
	year={2015},
	pages={445-456},
	url={https://api.semanticscholar.org/CorpusID:6565322}
}

@manual{x86-64,
	author = {{Intel Corp.}},
	title = {Intel® 64 and IA-32 Architectures Software Developer's Manual Combined Volumes 2A, 2B, 2C, and 2D: Instruction Set Reference, A-Z},
	date = {2024-03},
	url = {https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html},
	urldate = today,
}

@manual{x86-instruction-tables,
	title = {Optimization Manuals, vol. 4, Instruction tables},
	author = {Agner Fog},
	year = {1996--2022},
	institution = {Technical University of Denmark},
	url = {https://www.agner.org/optimize/#manuals},
	urldate = today,
}

@inproceedings{michael-hash-tables,
	author = {Michael, Maged M.},
	title = {High performance dynamic lock-free hash tables and list-based sets},
	year = {2002},
	isbn = {1581135297},
	publisher = {Association for Computing Machinery},
	address = {New York, NY, USA},
	url = {https://doi.org/10.1145/564870.564881},
	doi = {10.1145/564870.564881},
	abstract = {Lock-free (non-blocking) shared data structures promise more robust performance and reliability than conventional lock-based implementations. However, all prior lock-free algorithms for sets and hash tables suffer from serious drawbacks that prevent or limit their use in practice. These drawbacks include size inflexibility, dependence on atomic primitives not supported on any current processor architecture, and dependence on highly-inefficient or blocking memory management techniques.Building on the results of prior researchers, this paper presents the first CAS-based lock-free list-based set algorithm that is compatible with all lock-free memory management methods. We use it as a building block of an algorithm for lock-free hash tables. In addition to being lock-free, the new algorithm is dynamic, linearizable, and space-efficient.Our experimental results show that the new algorithm outperforms the best known lock-free as well as lock-based hash table implementations by significant margins, and indicate that it is the algorithm of choice for implementing shared hash tables.},
	booktitle = {Proceedings of the Fourteenth Annual ACM Symposium on Parallel Algorithms and Architectures},
	pages = {73–82},
	numpages = {10},
	location = {Winnipeg, Manitoba, Canada},
	series = {SPAA '02}
}

@inproceedings{michael-safe-reclamation,
	author = {Michael, Maged M.},
	title = {Safe memory reclamation for dynamic lock-free objects using atomic reads and writes},
	year = {2002},
	isbn = {1581134851},
	publisher = {Association for Computing Machinery},
	address = {New York, NY, USA},
	url = {https://doi.org/10.1145/571825.571829},
	doi = {10.1145/571825.571829},
	abstract = {A major obstacle to the wide use of lock-free data structures, despite their many performance and reliability advantages, is the absence of a practical lock-free method for reclaiming the memory of dynamic nodes removed from dynamic lock-free objects for arbitrary reuse.The only prior lock-free memory reclamation method depends on the DCAS atomic primitive, which is not supported on any current processor architecture. Other memory management methods are blocking, require special operating system support, or do not allow arbitrary memory reuse.This paper presents the first lock-free memory management method for dynamic lock-free objects that allows arbitrary memory reuse, and does not require special operating system or hardware support. It guarantees an upper bound on the number of removed nodes not yet freed at any time, regardless of thread failures and delays. Furthermore, it is wait-free, it is only logarithmically contention-sensitive, and it uses only atomic reads and writes for its operations. In addition, it can be used to prevent the ABA problem for pointers to dynamic nodes in most algorithms, without requiring extra space per pointer or per node.},
	booktitle = {Proceedings of the Twenty-First Annual Symposium on Principles of Distributed Computing},
	pages = {21–30},
	numpages = {10},
	location = {Monterey, California},
	series = {PODC '02}
}

@online{python-summit-2023-nogil,
	author = {Alex Waygood},
	title = {The Python Language Summit 2023: Making the Global Interpreter Lock Optional},
	date = {2023-05-29},
	url = {https://pyfound.blogspot.com/2023/05/the-python-language-summit-2023-making.html},
	urldate = today,
}

@online{python-summit-2024-free-threading-ecosystems,
	author = {Seth Michael Larson},
	title = {The Python Language Summit 2024: Free-threading ecosystems},
	date = {2024-06-14},
	url = {https://pyfound.blogspot.com/2024/06/python-language-summit-2024-free-threading-ecosystems.html},
	urldate = today,
}