From a70f8403307bd711a61e2146d6ce70b5db58b60d Mon Sep 17 00:00:00 2001
From: kcleal <clealk@cardiff.ac.uk>
Date: Mon, 18 Apr 2022 19:08:31 +0100
Subject: [PATCH] v1.3.10 Fixed X/= style cigar compatibility issues.
 Improvements to merging

---
 dysgu/assembler.pyx      |   90 ++--
 dysgu/call_component.pyx |  131 +++--
 dysgu/cluster.pyx        |   72 ++-
 dysgu/graph.pyx          |  135 +++--
 dysgu/robin_hood.h       | 1063 +++++++++++++++++++++++++-------------
 setup.py                 |    2 +-
 6 files changed, 947 insertions(+), 546 deletions(-)

diff --git a/dysgu/assembler.pyx b/dysgu/assembler.pyx
index 50fa7d7..b4a450c 100644
--- a/dysgu/assembler.pyx
+++ b/dysgu/assembler.pyx
@@ -147,7 +147,7 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T
         cigar, current_pos, i = trim_cigar(cigar, pos, approx_position)
 
     for opp, length in cigar:
-        with nogil:
+        # with nogil:
             if done:
                 break
 
@@ -225,10 +225,10 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T
                         G.updateEdge(prev_node, n, qual)
                     prev_node = n
 
-                current_pos += 1  # <-- Reference pos increases 1
+                # current_pos += 1  # <-- Reference pos increases 1
 
             elif opp == 2: # deletion
-                current_pos += length + 1
+                current_pos += length # + 1
 
             elif opp == 0 or opp == 7 or opp == 8 or opp == 3:  # All match, match (=), mis-match (X), N's
 
@@ -270,10 +270,12 @@ cdef void add_to_graph(DiGraph& G, AlignedSegment r, cpp_vector[int]& nweight, T
                         G.updateEdge(prev_node, n, qual)
                     prev_node = n
 
+                current_pos += 1
+
             start = False
 
 
-cdef int topo_sort2(DiGraph& G, cpp_deque[int]& order):  # except -1:
+cdef int topo_sort2(DiGraph& G, cpp_deque[int]& order, r): #  except -1:
 
     cdef unordered_set[int] seen
     cdef unordered_set[int] explored
@@ -286,51 +288,53 @@ cdef int topo_sort2(DiGraph& G, cpp_deque[int]& order):  # except -1:
 
     cdef cpp_vector[int] debug_res
 
-    with nogil:
+    # with nogil:
 
-        for v in range(G.numberOfNodes()):  # process all vertices in G
-            if explored.find(v) != explored.end():
-                continue
+    for v in range(G.numberOfNodes()):  # process all vertices in G
+        if explored.find(v) != explored.end():
+            continue
 
-            fringe.clear()
-            fringe.push_back(v)  # nodes yet to look at
+        fringe.clear()
+        fringe.push_back(v)  # nodes yet to look at
 
-            while fringe.size() != 0:
+        while fringe.size() != 0:
 
-                w = fringe.back() # depth first search
-                if explored.find(w) != explored.end():  # already looked down this branch
-                    fringe.pop_back()
-                    continue
+            w = fringe.back() # depth first search
+            if explored.find(w) != explored.end():  # already looked down this branch
+                fringe.pop_back()
+                continue
 
-                seen.insert(w)
+            seen.insert(w)
 
-                # Check successors for cycles and for new nodes
-                if new_nodes.size() > 0:
-                    new_nodes.clear()
+            # Check successors for cycles and for new nodes
+            if new_nodes.size() > 0:
+                new_nodes.clear()
 
-                neighbors = G.neighbors(w)
-                for n in neighbors:
-                    if explored.find(n) == explored.end():
+            neighbors = G.neighbors(w)
+            for n in neighbors:
+                if explored.find(n) == explored.end():
 
-                        if seen.find(n) != seen.end(): #CYCLE !!
-                            order.clear()
-                            order.push_back(-1)
-                            order.push_back(n)
-                            order.push_back(w)
-                            # return order
-                            graph_node_2_vec(n, debug_res)
-                            raise ValueError("Graph contains a cycle. Please report this. n={}, w={}, v={}. Node info n was: {}, {}, {}, {}".format(n, w, v, debug_res[0], debug_res[1], debug_res[2], debug_res[4]))
+                    if seen.find(n) != seen.end(): #CYCLE !!
+                        order.clear()
+                        order.push_back(-1)
+                        order.push_back(n)
+                        order.push_back(w)
+                        # return order
+                        graph_node_2_vec(n, debug_res)
+                        # echo("Graph contains a cycle. Please report this. n={}, w={}, v={}. Node info n was: {}, {}, {}, {}".format(n, w, v, debug_res[0], debug_res[1], debug_res[2], debug_res[4]))
 
-                        new_nodes.push_back(n)
+                        raise ValueError("Graph contains a cycle. Please report this. n={}, w={}, v={}. Node info n was: {}, {}, {}, {}".format(n, w, v, debug_res[0], debug_res[1], debug_res[2], debug_res[4]))
 
-                if new_nodes.size() > 0:  # Add new_nodes to fringe
-                    fringe.insert(fringe.end(), new_nodes.begin(), new_nodes.end())  # Extend
+                    new_nodes.push_back(n)
 
-                else:  # No new nodes so w is fully explored
-                    explored.insert(w)
+            if new_nodes.size() > 0:  # Add new_nodes to fringe
+                fringe.insert(fringe.end(), new_nodes.begin(), new_nodes.end())  # Extend
 
-                    order.push_front(w)
-                    fringe.pop_back()  # done considering this node
+            else:  # No new nodes so w is fully explored
+                explored.insert(w)
+
+                order.push_front(w)
+                fringe.pop_back()  # done considering this node
 
 
 cdef cpp_deque[int] score_best_path(DiGraph& G, cpp_deque[int]& nodes_to_visit, cpp_vector[int]& n_weights):
@@ -418,7 +422,7 @@ cdef dict get_consensus(rd, int position, int max_distance):
     cdef DiGraph G = DiGraph()
     cdef TwoWayMap ndict_r2
     cdef cpp_vector[int] node_weights
-
+    r = None
     for r in rd:
         if r.seq is None:
             continue
@@ -428,7 +432,15 @@ cdef dict get_consensus(rd, int position, int max_distance):
         add_to_graph(G, r, node_weights, ndict_r2, position, max_distance)
 
     cdef cpp_deque[int] nodes_to_visit2
-    return_code = topo_sort2(G, nodes_to_visit2)
+
+    # try:
+    return_code = topo_sort2(G, nodes_to_visit2, r)
+
+    # except ValueError:
+    #     echo("was -1")
+    #     for r in rd:
+    #         echo(r.qname, r.pos)
+
     if return_code == -1 or nodes_to_visit2.size() < 50:
         return {}
 
diff --git a/dysgu/call_component.pyx b/dysgu/call_component.pyx
index c80fb98..2daf041 100644
--- a/dysgu/call_component.pyx
+++ b/dysgu/call_component.pyx
@@ -304,7 +304,7 @@ cdef guess_informative_pair(aligns):
         b_cigar_info, b = aligns[1]
 
         # check for paired-end read through with no SA tag
-        if a.flag & 1 and a_cigar_info[5] == -1 and b_cigar_info[5] == -1:
+        if a.flag & 1 and a_cigar_info.cigar_index == -1 and b_cigar_info.cigar_index == -1:
             if a.pos == b.pos and a.reference_end == b.reference_end:
                 extent_left_same = True
                 extent_right_same = True
@@ -316,9 +316,9 @@ cdef guess_informative_pair(aligns):
                     return None
 
         # within read sv
-        if 0 < a_cigar_info[5] < len(a.cigartuples) - 1:
-            cigar_index = a_cigar_info[5]
-            event_pos = a_cigar_info[6]
+        if 0 < a_cigar_info.cigar_index < len(a.cigartuples) - 1:
+            cigar_index = a_cigar_info.cigar_index
+            event_pos = a_cigar_info.event_pos
             ci = a.cigartuples[cigar_index]
             return (ci[0],
                     a.rname,
@@ -328,9 +328,9 @@ cdef guess_informative_pair(aligns):
                     a,
                     cigar_index)
 
-        elif 0 < b_cigar_info[5] < len(b.cigartuples) - 1:
-            cigar_index = b_cigar_info[5]
-            event_pos = b_cigar_info[6]
+        elif 0 < b_cigar_info.cigar_index < len(b.cigartuples) - 1:
+            cigar_index = b_cigar_info.cigar_index
+            event_pos = b_cigar_info.event_pos
             ci = b.cigartuples[cigar_index]
             return (ci[0],
                     b.rname,
@@ -340,8 +340,8 @@ cdef guess_informative_pair(aligns):
                     b,
                     cigar_index)
 
-        a_pos = a_cigar_info[6]  # Position may have been inferred from SA tag, use this if available
-        b_pos = b_cigar_info[6]
+        a_pos = a_cigar_info.event_pos  # Position may have been inferred from SA tag, use this if available
+        b_pos = b_cigar_info.event_pos
         if a_pos == b_pos:
             # make sure different breaks are mapped
             if (a.cigartuples[0][0] == 4 and b.cigartuples[0][0] == 4) or (a.cigartuples[-1][0] == 4 and b.cigartuples[-1][0] == 4):
@@ -802,7 +802,7 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l
 
     if n_templates == 1:
         # Filter uninteresting reads
-        if not any(not i.flag & 1 or not i.flag & 2 or i.rname != i.rnext or node_info[5] != 2 or
+        if not any(not i.flag & 1 or not i.flag & 2 or i.rname != i.rnext or node_info.cigar_index != 2 or
                    (i.flag & 1 and abs(i.tlen) > min_distance)
                    for node_info, i in rds):
 
@@ -842,10 +842,10 @@ cdef single(rds, int insert_size, int insert_stdev, float insert_ppf, int clip_l
 
         else:  # Single alignment, check spanning
             cigar_info, a = alignments[0]
-            cigar_index = cigar_info[5]
+            cigar_index = cigar_info.cigar_index #[5]
 
             if 0 < cigar_index < len(a.cigartuples) - 1:  # Alignment spans SV
-                event_pos = cigar_info[6]
+                event_pos = cigar_info.event_pos #[6]
                 ci = a.cigartuples[cigar_index]
                 spanning_alignments.append((ci[0],
                                             a.rname,
@@ -1261,14 +1261,14 @@ cdef void make_call(informative, breakA_precise, breakB_precise, svtype, jointyp
                         else:
                             lens.append(i.inferred_sv_len)
                 if len(lens) > 0:
-                    svlen = value_closest_to_mean(lens) #int(np.mean(lens))
+                    svlen = value_closest_to_mean(lens)
                     if main_svlen > 0 and (svlen / main_svlen) > 0.7:
                         svlen_precise = 1
                     else:
                         svlen = main_svlen
                 else:
                     if len(inferred_lens) > 0:
-                        svlen = value_closest_to_mean(inferred_lens) #int(np.mean(inferred_lens))
+                        svlen = value_closest_to_mean(inferred_lens)
                     else:
                         svlen = main_svlen
 
@@ -1581,7 +1581,9 @@ cdef call_from_reads(u_reads, v_reads, int insert_size, int insert_stdev, float
                 v_reads.append(v_item.read_b)
 
             er = EventResult()
-
+            # for item in sub_informative:
+            #     if item.read_a.qname == "D00360:18:H8VC6ADXX:1:1210:7039:44052":
+            #         echo("here")
             make_call(sub_informative, precise_a, precise_b, svtype, jointype, insert_size, insert_stdev, er)
 
             count_attributes2(u_reads, v_reads, [], extended_tags, insert_ppf, [], er)
@@ -1612,9 +1614,43 @@ cdef call_from_reads(u_reads, v_reads, int insert_size, int insert_stdev, float
     return results
 
 
+cdef filter_single_partitions(u_reads, v_reads):
+    # Split the (cigar_info, alignment) pairs of each side by (is_read1, qname). Rare, but a read with >2 alignments
+    # can have several alignments end up in one block; those are returned apart so they can be processed as singles
+    u_counts = defaultdict(list)
+    v_counts = defaultdict(list)
+    any_u_grouped = False
+    any_v_grouped = False
+
+    for cigar_info, a in u_reads:
+        u_counts[(a.is_read1, a.qname)].append((cigar_info, a))
+        if len(u_counts[(a.is_read1, a.qname)]) > 1:
+            any_u_grouped = True
+    for cigar_info, a in v_reads:
+        v_counts[(a.is_read1, a.qname)].append((cigar_info, a))
+        if len(v_counts[(a.is_read1, a.qname)]) > 1:
+            any_v_grouped = True
+    if not any_u_grouped and not any_v_grouped:
+        return u_reads, v_reads, None, None  # fast path: no key seen twice, inputs are returned unchanged
+
+    single_u, single_v, actual_u, actual_v = [], [], [], []
+    for k, v in u_counts.items():  # keys with exactly one alignment stay in the partition, the rest become singles
+        if len(v) == 1:
+            actual_u += v
+        else:
+            single_u += v
+    for k, v in v_counts.items():  # same split for the v side
+        if len(v) == 1:
+            actual_v += v
+        else:
+            single_v += v
+
+    return actual_u, actual_v, single_u, single_v  # (kept u, kept v, grouped u, grouped v)
+
+
 cdef one_edge(u_reads_info, v_reads_info, int clip_length, int insert_size, int insert_stdev, float insert_ppf,
                    int min_support, int block_edge, int assemble, int extended_tags, info):
-    #print("starting one edge")
+
     spanning_alignments = []
     u_reads = []
     v_reads = []
@@ -1624,9 +1660,9 @@ cdef one_edge(u_reads_info, v_reads_info, int clip_length, int insert_size, int
         if not a.cigartuples:
             continue
         u_reads.append(a)
-        cigar_index = cigar_info[5]
+        cigar_index = cigar_info.cigar_index
         if 0 < cigar_index < len(a.cigartuples) - 1:  # Alignment spans SV
-            event_pos = cigar_info[6]
+            event_pos = cigar_info.event_pos
             ci = a.cigartuples[cigar_index]
             spanning_alignments.append((ci[0],
                                         a.rname,
@@ -1640,9 +1676,9 @@ cdef one_edge(u_reads_info, v_reads_info, int clip_length, int insert_size, int
         if not a.cigartuples:
             continue
         v_reads.append(a)
-        cigar_index = cigar_info[5]
+        cigar_index = cigar_info.cigar_index
         if 0 < cigar_index < len(a.cigartuples) - 1:  # Alignment spans SV
-            event_pos = cigar_info[6]
+            event_pos = cigar_info.event_pos
             ci = a.cigartuples[cigar_index]
             spanning_alignments.append((ci[0],
                                         a.rname,
@@ -1748,7 +1784,7 @@ cdef one_edge(u_reads_info, v_reads_info, int clip_length, int insert_size, int
 
 
 def fpos_srt(x):
-    return x[0][4]
+    return x[0].tell  # sort key: the node's file offset, i.e. the value later handed to infile.seek()
 
 
 cdef get_reads(infile, nodes_info, buffered_reads, n2n, bint add_to_buffer, sites_index):
@@ -1770,28 +1806,27 @@ cdef get_reads(infile, nodes_info, buffered_reads, n2n, bint add_to_buffer, site
         if int_node in buffered_reads:
             aligns.append((n, buffered_reads[int_node]))
             continue
+        # # def as_tuple(self):
+        #     #     return self.h, self.f, self.p, self.c, self.t, self.cigar_index, self.event_pos
+        # p = n[4]
+        # node = (n[0], n[1], n[2], n[3], p)  # drop cigar index and event pos
 
-        p = n[4]
-        node = (n[0], n[1], n[2], n[3], p)  # drop cigar index and event pos
-
-        fpos.append((node, int_node, n))
-
-    # improve chances of reading nearby locations sequentially
-    fpos = sorted(fpos, key=fpos_srt)
+        fpos.append((n, int_node))
 
-    for node, int_node, save_node in fpos:
+    for node, int_node in sorted(fpos, key=fpos_srt):
 
-        p = node[4]
+        # p = node[4]
 
-        infile.seek(p)
+        infile.seek(node.tell)
         try:
             a = next(infile)
         except StopIteration:
             return aligns
         v = xxhasher(bam_get_qname(a._delegate), len(a.qname), 42)
 
-        if (v, a.flag, a.pos, a.rname, p) == node:
-            aligns.append((save_node, a))
+        # if (v, a.flag, a.pos, a.rname, p) == node:
+        if v == node.hash_name and a.flag == node.flag and a.pos == node.pos and a.rname == node.chrom:
+            aligns.append((node, a))
 
             if add_to_buffer:
                 buffered_reads[int_node] = a  # Add to buffer, then block nodes with multi-edges dont need collecting twice
@@ -1806,8 +1841,9 @@ cdef get_reads(infile, nodes_info, buffered_reads, n2n, bint add_to_buffer, site
                 steps += 1
                 v = xxhasher(bam_get_qname(a._delegate), len(a.qname), 42)
 
-                if (v, a.flag, a.pos, a.rname, p) == node:
-                    aligns.append((save_node, a))
+                # if (v, a.flag, a.pos, a.rname, p) == node:
+                if v == node.hash_name and a.flag == node.flag and a.pos == node.pos and a.rname == node.chrom:
+                    aligns.append((node, a))
 
                     if add_to_buffer:
                         buffered_reads[int_node] = a
@@ -1841,6 +1877,7 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf,
         sites_info = []
 
     # u and v are the part ids, d[0] and d[1] are the lists of nodes for those parts
+
     for (u, v), d in data["s_between"].items():
 
         rd_u = get_reads(bam, d[0], data["reads"], data["n2n"], add_to_buffer, info)   # [(Nodeinfo, alignment)..]
@@ -1862,8 +1899,28 @@ cdef list multi(data, bam, int insert_size, int insert_stdev, float insert_ppf,
         if v in seen:
             seen.remove(v)
 
-        events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs,
-                            extended_tags, sites_info)
+        # finds reads that should be a single partition
+        u_reads, v_reads, u_single, v_single = filter_single_partitions(rd_u, rd_v)
+
+        if len(u_reads) > 0 and len(v_reads) > 0:
+            events += one_edge(rd_u, rd_v, clip_length, insert_size, insert_stdev, insert_ppf, min_support, 1, assemble_contigs,
+                               extended_tags, sites_info)
+        if u_single:
+            res = single(u_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
+                         extended_tags, sites_info)
+            if res:
+                if isinstance(res, EventResult):
+                    events.append(res)
+                else:
+                    events += res
+        if v_single:
+            res = single(v_single, insert_size, insert_stdev, insert_ppf, clip_length, min_support, assemble_contigs,
+                         extended_tags, sites_info)
+            if res:
+                if isinstance(res, EventResult):
+                    events.append(res)
+                else:
+                    events += res
 
     # Process any singles / unconnected blocks
     if seen:
diff --git a/dysgu/cluster.pyx b/dysgu/cluster.pyx
index 5e3f48c..4e9df04 100644
--- a/dysgu/cluster.pyx
+++ b/dysgu/cluster.pyx
@@ -151,6 +151,7 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re
 
     seen = set([])
     pad = 100
+    disjoint_nodes = set([])  # if a component has more than one disjoint nodes it needs to be broken apart
 
     for ei, ej, idx, jdx in event_iter:
 
@@ -228,8 +229,13 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re
             continue
 
         recpi_overlap = is_reciprocal_overlapping(ei.posA, ei.posB, ej.posA, ej.posB)
-        overlap = max(ei.posA, ej.posA) - min(ei.posB, ej.posB)
+        overlap = max(0, min(ei.posA, ej.posA) - max(ei.posB, ej.posB))
 
+        if paired_end:
+            if ei.spanning > 0 and ej.spanning > 0 and overlap == 0:
+                disjoint_nodes.add(i_id)
+                disjoint_nodes.add(j_id)
+                continue
         # If long reads only rely on reciprocal overlap, seems to work better
         if paired_end:
             spd = span_similarity(ei, ej)
@@ -247,21 +253,6 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re
         else:
             l_ratio = min(ei.svlen, ej.svlen) / ml
 
-        # this didnt work
-        # merge events that might be deletion of tandem duplication
-        # if paired_end and ei.svtype == "DEL":
-        #     if l_ratio < 0.9:
-        #         if both_imprecise and min(ei.remap_score, ej.remap_score) > 50:
-        #             continue
-        #     elif overlap < -15:
-        #         continue
-
-        # this worked ok
-        # echo(ei.remap_score, ej.remap_score, recpi_overlap, l_ratio)
-        # if max(ei.remap_score, ej.remap_score) > 50 and not recpi_overlap:
-        # # if max(ei.remap_score, ej.remap_score) > 50 and max(ei.spanning, ej.spanning) > 0:
-        #     continue
-
         if ei.svtype == "INS":
             if aggressive_ins_merge:
                 m = True
@@ -287,8 +278,8 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re
         if not m:
             continue
         # if ei.posA == 66323 and ej.posA == 66323:
-        # echo(ml, l_ratio, one_is_imprecise, any_contigs_to_check, (ei.remap_score, ej.remap_score),
-        #      (ei.svlen, ej.svlen), (ei.event_id, ej.event_id), recpi_overlap, spd, loci_similar, loci_same, "overlap", overlap)
+        # echo((ei.event_id, ej.event_id), ml, l_ratio, one_is_imprecise, any_contigs_to_check, (ei.remap_score, ej.remap_score),
+        #      (ei.svlen, ej.svlen), recpi_overlap, spd, loci_similar, loci_same, "overlap", overlap)
 
         # Loci are similar, check contig match or reciprocal overlap
         if not any_contigs_to_check:
@@ -335,24 +326,48 @@ def enumerate_events(G, potential, max_dist, try_rev, tree, paired_end=False, re
                         G.add_edge(i_id, j_id, loci_same=True)
                         continue
 
-    return G
+    return G, disjoint_nodes
 
 
-def cut_components(G):
+def cut_components(G, disjoint_nodes):  # disjoint_nodes: ids that should not share a component with each other
     e = G.edges(data=True)
     G2 = nx.Graph([i for i in e if i[2]["loci_same"] == True])
     for u in G.nodes():
         if u not in G2:
             e0 = next(G.edges(u).__iter__())  # Use first edge out of u to connect
             G2.add_edge(*e0)
-    return nx.algorithms.components.connected_components(G2)
+    components = nx.algorithms.components.connected_components(G2)
+    if len(disjoint_nodes) > 0:
+        # Try to split every component holding more than one disjoint node into separate sets. This works for small
+        # clusters (most of the time) but can fail with many disjoint nodes; label propagation might be needed there
+        components2 = []
+        for c in components:
+            n_disjoin = set([])  # disjoint nodes that fall inside this component
+            for node in c:
+                if node in disjoint_nodes:
+                    n_disjoin.add(node)
+            if len(n_disjoin) <= 1:
+                components2.append(c)  # nothing to separate, keep the component as it is
+                continue
+
+            out_e = defaultdict(list)  # neighbour in G -> disjoint nodes of this component it is connected to
+            for node in n_disjoin:
+                for neigh in G.neighbors(node):
+                    out_e[neigh].append(node)
+
+            G3 = nx.Graph()
+            for k, v in out_e.items():
+                G3.add_edge(k, random.choice(v))  # randomly assign to one of the sets; NOTE(review): not reproducible unless seeded elsewhere
+
+            components2 += list(nx.algorithms.components.connected_components(G3))  # NOTE(review): nodes of c not adjacent to a disjoint node never enter G3 - confirm intended
+        return components2  # a list here, whereas the branch below returns the networkx generator
+    return components
 
 
 cpdef srt_func(c):
-    # keeper_bias = 0 if not c.site_info else 10000
     if c.type != "pe" and c.type != "":
-        return 100 + c.su # + keeper_bias
-    return c.su + (3 * c.spanning) # + keeper_bias
+        return 100 + c.su  # any type other than "pe" or "": c.su plus a fixed bonus of 100
+    return c.su + (3 * c.spanning)  # type "pe" or "": c.su plus three times c.spanning
 
 
 def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pick_best=False, add_partners=False,
@@ -367,9 +382,9 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic
 
     # Cluster events on graph
     G = nx.Graph()
-    G = enumerate_events(G, potential, max_dist, try_rev, tree, paired_end, rel_diffs, diffs, same_sample,
-                         aggressive_ins_merge=aggressive_ins_merge,
-                         debug=debug)
+    G, disjoint_nodes = enumerate_events(G, potential, max_dist, try_rev, tree, paired_end, rel_diffs, diffs, same_sample,
+                        aggressive_ins_merge=aggressive_ins_merge,
+                        debug=debug)
 
     found = []
     for item in potential:  # Add singletons, non-merged
@@ -378,8 +393,9 @@ def merge_events(potential, max_dist, tree, paired_end=False, try_rev=False, pic
 
     # Try and merge SVs with identical breaks, then merge ones with less accurate breaks - this helps prevent
     # over merging SVs that are close together
-    components = cut_components(G)
+    components = cut_components(G, disjoint_nodes)
     node_to_event = {i.event_id: i for i in potential}
+
     cdef int k
     # Only keep edges with loci_same==False if removing the edge leads to an isolated node
     for grp in components:
diff --git a/dysgu/graph.pyx b/dysgu/graph.pyx
index df84940..03c3a03 100644
--- a/dysgu/graph.pyx
+++ b/dysgu/graph.pyx
@@ -6,6 +6,7 @@ from collections import defaultdict, deque, namedtuple
 import numpy as np
 cimport numpy as np
 import sortedcontainers
+import cython
 from cpython cimport array
 import array
 import re
@@ -542,7 +543,7 @@ cdef class PairedEndScoper:
 
 
 cdef class TemplateEdges:
-    cdef unordered_map[string, vector[int]] templates_s  # robin map was buggy for iterating
+    cdef public unordered_map[string, vector[int]] templates_s  # robin map was buggy for iterating
     def __init__(self):
         pass
 
@@ -555,27 +556,29 @@ cdef class TemplateEdges:
         val.push_back(flag)
         self.templates_s[key].insert(self.templates_s[key].end(), val.begin(), val.end())
 
-    def iterate_map(self):
-
-        cdef unordered_map[string, vector[int]].iterator it = self.templates_s.begin()
-        cdef string first
-        cdef vector[int] second
-        while it != self.templates_s.end():
-            first = dereference(it).first
-            second = dereference(it).second
-            yield str(dereference(it).first), list(dereference(it).second)  # Array values are flag, node name, query start
-            postincrement(it)
-
 
 cdef void add_template_edges(G, TemplateEdges template_edges):
     # this function joins up template reads (read 1, read 2, plus any supplementary)
     cdef int ii, u_start, v_start, u, v, uflag, vflag
+
     # normally 2 reads for paired end, or >2 if supplementary reads
-    for qname, arr in template_edges.iterate_map():
+    cdef unordered_map[string, vector[int]].iterator it = template_edges.templates_s.begin()
+    # cdef string qname
+    cdef vector[int] arr
+    while it != template_edges.templates_s.end():
+
+        # qname = str(dereference(it).first)
+
+        arr = dereference(it).second
+        # Array values are query start, node-name, flag
+        # if qname == "D00360:18:H8VC6ADXX:1:1210:7039:44052":
+        #     echo(arr)
+        postincrement(it)
+
         read1_aligns = []
         read2_aligns = []
-        for ii in range(0, len(arr), 3):
-            if arr[ii + 2] & 64:
+        for ii in range(0, arr.size(), 3):
+            if arr[ii + 2] & 64:  # first in pair
                 read1_aligns.append(arr[ii:ii + 3])
             else:
                 read2_aligns.append(arr[ii:ii + 3])
@@ -607,7 +610,7 @@ cdef void add_template_edges(G, TemplateEdges template_edges):
                     primary2 = read2_aligns[0][1]
             else:
                 if len(read2_aligns) > 2:
-                    read2_aligns = sorted(read2_aligns)
+                    read2_aligns = sorted(read2_aligns)  # sorted by query pos
                 for ii in range(len(read2_aligns) - 1):
                     u_start, u, uflag = read2_aligns[ii]
                     if not uflag & 2304:  # Is primary
@@ -624,6 +627,27 @@ cdef void add_template_edges(G, TemplateEdges template_edges):
                 G.addEdge(primary1, primary2, w=1)
 
 
+@cython.auto_pickle(True)
+cdef class NodeName:  # record describing one graph node; replaces the 7-tuple NodeToName.__getitem__ used to return
+    cdef public uint64_t hash_name  # compared with xxhasher(qname) when the read is re-fetched from the file
+    cdef public uint64_t tell  # file offset, passed to infile.seek() to re-fetch the alignment
+    cdef public uint32_t pos  # compared with the alignment's a.pos
+    cdef public int32_t cigar_index  # index into a.cigartuples; callers also test it against -1, hence signed
+    cdef public uint32_t event_pos  # event position; callers note it may have been inferred from the SA tag
+    cdef public uint16_t flag  # compared with the alignment's a.flag
+    cdef public uint16_t chrom  # compared with a.rname; NOTE(review): unsigned 16-bit - confirm reference ids always fit
+    def __init__(self, h, f, p, c, t, cigar_index, event_pos):  # argument order matches the old tuple layout
+        self.hash_name = h
+        self.flag = f
+        self.pos = p
+        self.chrom = c
+        self.tell = t
+        self.cigar_index = cigar_index
+        self.event_pos = event_pos
+
+    # def as_tuple(self):
+    #     return self.hash_name, self.flag, self.pos, self.chrom, self.tell, self.cigar_index, self.event_pos
+
 cdef class NodeToName:
     # Index these vectors to get the unique 'template_name'
     cdef vector[uint64_t] h
@@ -650,7 +674,7 @@ cdef class NodeToName:
         self.event_pos.push_back(g)
 
     def __getitem__(self, idx):
-        return self.h[idx], self.f[idx], self.p[idx], self.c[idx], self.t[idx], self.cigar_index[idx], self.event_pos[idx]
+        return NodeName(self.h[idx], self.f[idx], self.p[idx], self.c[idx], self.t[idx], self.cigar_index[idx], self.event_pos[idx])
 
 
 cdef get_query_pos_from_cigarstring(cigar, pos):
@@ -660,19 +684,18 @@ cdef get_query_pos_from_cigarstring(cigar, pos):
     cdef bint i = 0
     cdef int ref_end = pos
     cdef int slen
-
     for slen, opp in cigar:
         if not i and opp in "SH":
             start += slen
             end += slen
             i = 1
-        elif opp == "M":
-            end += slen
-            ref_end += slen
         elif opp == "D":
             ref_end += slen
         elif opp == "I":
             end += slen
+        elif opp in "M=X":
+            end += slen
+            ref_end += slen
         i = 1
     return start, end, pos, ref_end
 
@@ -714,7 +737,7 @@ cdef alignments_from_sa_tag(r, gettid, thresh, paired_end, mapq_thresh):
         cigar = sa[3]
         matches = [(int(slen), opp) for slen, opp in re.findall(r'(\d+)([A-Z]{1})', sa[3])]  # parse cigar
 
-        query_start, query_end, ref_start, ref_end = get_query_pos_from_cigarstring(matches, start_pos2)
+        query_start, query_end, ref_start, ref_end = get_query_pos_from_cigarstring(matches, start_pos2) #, strand == current_strand)
 
         if current_strand != strand:  # count from end
             start_temp = query_length - query_end
@@ -722,9 +745,9 @@ cdef alignments_from_sa_tag(r, gettid, thresh, paired_end, mapq_thresh):
             query_start = start_temp
 
         # If another local alignment is found use only this, usually corresponds to the other side of an insertion/dup
-        if aln_chrom == chrom2 and position_distance(aln_start, aln_end, ref_start, ref_end) < thresh:
-            query_aligns = [query_aligns[0], (query_start, query_end, ref_start, ref_end, chrom2, mq, strand == current_strand)]
-            break
+        # if aln_chrom == chrom2 and position_distance(aln_start, aln_end, ref_start, ref_end) < thresh:
+        #     query_aligns = [query_aligns[0], (query_start, query_end, ref_start, ref_end, chrom2, mq, strand == current_strand)]
+            # break
 
         query_aligns.append((query_start, query_end, ref_start, ref_end, chrom2, mq, strand == current_strand))
 
@@ -848,7 +871,7 @@ cdef void add_to_graph(G, AlignedSegment r, PairedEndScoper_t pe_scope, Template
     # # if r.qname in look:
     # if node_name in node_look:
     #     echo(r.qname, r.pos)
-    # if r.qname == "m64004_190803_004451/154077992/ccs":
+    # if r.qname == "D00360:18:H8VC6ADXX:1:1210:7039:44052":
     #     echo("@", r.flag, node_name, chrom, event_pos, chrom2, pos2, list(other_nodes),
     #          count_sc_edges, cigar_index, length_from_cigar)
     #     echo()
@@ -994,7 +1017,7 @@ cdef void process_alignment(G, AlignedSegment r, int clip_l, int loci_dist, gett
 
         if read_enum == SPLIT:
             # Parse SA tag. For paired reads
-            if r.has_tag("SA") and good_clip:  # Parse SA, first alignment is the other read primary line
+            if r.has_tag("SA") and good_clip:  # Parse SA, first alignment is the other read primary alignment
                 all_aligns, index = alignments_from_sa_tag(r, gettid, loci_dist, paired_end, mapq_thresh)
                 event = all_aligns[index]
                 if len(all_aligns) == 1:
@@ -1392,41 +1415,16 @@ cpdef tuple construct_graph(genome_scanner, infile, int max_dist, int clustering
     return G, node_to_name, bad_clip_counter, site_adder
 
 
-cpdef dict get_reads(infile, sub_graph_reads):
-
-    rd = dict()
-    cdef int j, int_node
-    cdef long int p
-    cdef uint64_t v
-    cdef AlignedSegment a
-    for int_node, node in sub_graph_reads.items():
-        node = tuple(node[:-2])  # drop cigar index and event pos
-        p = node[4]
-        infile.seek(p)
-        a = next(infile)
-        v = xxhasher(bam_get_qname(a._delegate), len(a.qname), 42)
-        n1 = (v, a.flag, a.pos, a.rname, p)
-        # Try next few reads, sometimes they are on top of one another
-        if n1 != node:
-            for j in range(5):
-                a = next(infile)
-                n2 = (xxhasher(bam_get_qname(a._delegate), len(a.qname), 42), a.flag, a.pos, a.rname, p)
-                if n2 == node:
-                    rd[int_node] = a
-                    break
-        else:
-            rd[int_node] = a
-    return rd
-
-
 cdef BFS_local(G, int source, unordered_set[int]& visited ):
     # Create a queue for BFS
     cdef array.array queue = array.array("L", [source])
     nodes_found = set([])
     cdef int u, v
+    cdef vector[int] neighbors
     while queue:
         u = queue.pop(0)
-        for v in G.neighbors(u):
+        neighbors = G.neighbors(u)
+        for v in neighbors:
             if visited.find(v) == visited.end():
                 if G.weight(u, v) > 1:
                     if u not in nodes_found:
@@ -1442,12 +1440,13 @@ cdef get_partitions(G, nodes):
 
     cdef unordered_set[int] seen
     cdef int u, v, i
+    cdef vector[int] neighbors
     parts = []
     for u in nodes:
         if seen.find(u) != seen.end():
             continue
-
-        for v in G.neighbors(u):
+        neighbors = G.neighbors(u)
+        for v in neighbors:
             if seen.find(v) != seen.end():
                 continue
 
@@ -1464,17 +1463,16 @@ cdef tuple count_support_between(G, parts, int min_support):
 
     cdef int i, j, node, child, any_out_edges
     cdef tuple t
+    cdef unsigned long[:] p
 
     if len(parts) == 0:
         return {}, {}
     elif len(parts) == 1:
         return {}, {0: parts[0]}
-        # return {}, {list(parts.keys())[0]: array.array("L", list(parts.values())[0])}
 
     # Make a table to count from, int-int
     cdef Py_Int2IntMap p2i = map_set_utils.Py_Int2IntMap()
     for i, p in enumerate(parts):
-    # for i, p in parts.items():
         for node in p:
             p2i.insert(node, i)
 
@@ -1485,13 +1483,14 @@ cdef tuple count_support_between(G, parts, int min_support):
     self_counts = {}
 
     seen_t = set([])
+    cdef vector[int] neighbors
     for i, p in enumerate(parts):
-    # for i, p in parts.items():
         current_t = set([])
         for node in p:
             any_out_edges = 0  # Keeps track of number of outgoing pairs, or self edges
 
-            for child in G.neighbors(node):
+            neighbors = G.neighbors(node)
+            for child in neighbors:
 
                 if not p2i.has_key(child):
                     continue  # Exterior child, not in any partition
@@ -1529,7 +1528,7 @@ cdef tuple count_support_between(G, parts, int min_support):
 
         seen_t.update(current_t)  # Only count edge once
 
-        # save memory by converting support_between to 2d array
+        # save memory by converting support_between to array
         for t in current_t:
             counts[t] = [np.fromiter(m, dtype="uint32", count=len(m)) for m in counts[t]]
 
@@ -1552,7 +1551,6 @@ cpdef break_large_component(G, component, int min_support):
     # Make a table to count from, int-int
     cdef Py_Int2IntMap p2i = map_set_utils.Py_Int2IntMap()
     for i, p in enumerate(parts):
-    # for i, p in parts.items():
         for node in p:
             p2i.insert(node, i)
 
@@ -1563,13 +1561,14 @@ cpdef break_large_component(G, component, int min_support):
     self_counts = defaultdict(int)
 
     seen_t = set([])
+    cdef vector[int] neighbors
     for i, p in enumerate(parts):
-    # for i, p in parts.items():
         current_t = set([])
         for node in p:
             any_out_edges = 0  # Keeps track of number of outgoing pairs, or self edges
 
-            for child in G.neighbors(node):
+            neighbors = G.neighbors(node)
+            for child in neighbors:
 
                 if not p2i.has_key(child):
                     continue  # Exterior child, not in any partition
@@ -1636,7 +1635,7 @@ cpdef proc_component(node_to_name, component, read_buffer, infile, G, int min_su
         # Need to keep a record of all node info, and cigar indexes
         key = node_to_name[v]
 
-        if key[5] != -1:
+        if key.cigar_index != -1:
             support_estimate += 2
         else:
             support_estimate += 1
@@ -1647,10 +1646,8 @@ cpdef proc_component(node_to_name, component, read_buffer, infile, G, int min_su
 
     # Explore component for locally interacting nodes; create partitions using these
     partitions = get_partitions(G, component)
-    # partitions = {i: p for i, p in enumerate(partitions)}
-
     support_between, support_within = count_support_between(G, partitions, min_support)
-    # echo("support between", len(support_between), len(support_within), info, partitions, len(n2n), info)
+
     if len(support_between) == 0 and len(support_within) == 0:
         if not paired_end:
 
diff --git a/dysgu/robin_hood.h b/dysgu/robin_hood.h
index 481a084..0af031f 100755
--- a/dysgu/robin_hood.h
+++ b/dysgu/robin_hood.h
@@ -6,12 +6,11 @@
 //                                      _/_____/
 //
 // Fast & memory efficient hashtable based on robin hood hashing for C++11/14/17/20
-// version 3.6.0
 // https://github.com/martinus/robin-hood-hashing
 //
 // Licensed under the MIT License <http://opensource.org/licenses/MIT>.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2020 Martin Ankerl <http://martin.ankerl.com>
+// Copyright (c) 2018-2021 Martin Ankerl <http://martin.ankerl.com>
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -35,23 +34,29 @@
 #define ROBIN_HOOD_H_INCLUDED
 
 // see https://semver.org/
-#define ROBIN_HOOD_VERSION_MAJOR 3 // for incompatible API changes
-#define ROBIN_HOOD_VERSION_MINOR 6 // for adding functionality in a backwards-compatible manner
-#define ROBIN_HOOD_VERSION_PATCH 0 // for backwards-compatible bug fixes
+#define ROBIN_HOOD_VERSION_MAJOR 3  // for incompatible API changes
+#define ROBIN_HOOD_VERSION_MINOR 11 // for adding functionality in a backwards-compatible manner
+#define ROBIN_HOOD_VERSION_PATCH 5  // for backwards-compatible bug fixes
 
 #include <algorithm>
 #include <cstdlib>
 #include <cstring>
 #include <functional>
+#include <limits>
+#include <memory> // only to support hash of smart pointers
 #include <stdexcept>
 #include <string>
 #include <type_traits>
 #include <utility>
+#if __cplusplus >= 201703L
+#    include <string_view>
+#endif
 
 // #define ROBIN_HOOD_LOG_ENABLED
 #ifdef ROBIN_HOOD_LOG_ENABLED
 #    include <iostream>
-#    define ROBIN_HOOD_LOG(x) std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl
+#    define ROBIN_HOOD_LOG(...) \
+        std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ << std::endl;
 #else
 #    define ROBIN_HOOD_LOG(x)
 #endif
@@ -59,8 +64,8 @@
 // #define ROBIN_HOOD_TRACE_ENABLED
 #ifdef ROBIN_HOOD_TRACE_ENABLED
 #    include <iostream>
-#    define ROBIN_HOOD_TRACE(x) \
-        std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << x << std::endl
+#    define ROBIN_HOOD_TRACE(...) \
+        std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ << std::endl;
 #else
 #    define ROBIN_HOOD_TRACE(x)
 #endif
@@ -128,30 +133,32 @@ static Counts& counts() {
 #endif
 
 // count leading/trailing bits
-#ifdef _MSC_VER
-#    if ROBIN_HOOD(BITNESS) == 32
-#        define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward
-#    else
-#        define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward64
-#    endif
-#    include <intrin.h>
-#    pragma intrinsic(ROBIN_HOOD(BITSCANFORWARD))
-#    define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x)                                       \
-        [](size_t mask) noexcept -> int {                                             \
-            unsigned long index;                                                      \
-            return ROBIN_HOOD(BITSCANFORWARD)(&index, mask) ? static_cast<int>(index) \
-                                                            : ROBIN_HOOD(BITNESS);    \
-        }(x)
-#else
-#    if ROBIN_HOOD(BITNESS) == 32
-#        define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzl
-#        define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzl
+#if !defined(ROBIN_HOOD_DISABLE_INTRINSICS)
+#    ifdef _MSC_VER
+#        if ROBIN_HOOD(BITNESS) == 32
+#            define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward
+#        else
+#            define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward64
+#        endif
+#        include <intrin.h>
+#        pragma intrinsic(ROBIN_HOOD(BITSCANFORWARD))
+#        define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x)                                       \
+            [](size_t mask) noexcept -> int {                                             \
+                unsigned long index;                                                      \
+                return ROBIN_HOOD(BITSCANFORWARD)(&index, mask) ? static_cast<int>(index) \
+                                                                : ROBIN_HOOD(BITNESS);    \
+            }(x)
 #    else
-#        define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzll
-#        define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzll
+#        if ROBIN_HOOD(BITNESS) == 32
+#            define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzl
+#            define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzl
+#        else
+#            define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzll
+#            define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzll
+#        endif
+#        define ROBIN_HOOD_COUNT_LEADING_ZEROES(x) ((x) ? ROBIN_HOOD(CLZ)(x) : ROBIN_HOOD(BITNESS))
+#        define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) ((x) ? ROBIN_HOOD(CTZ)(x) : ROBIN_HOOD(BITNESS))
 #    endif
-#    define ROBIN_HOOD_COUNT_LEADING_ZEROES(x) ((x) ? ROBIN_HOOD(CLZ)(x) : ROBIN_HOOD(BITNESS))
-#    define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) ((x) ? ROBIN_HOOD(CTZ)(x) : ROBIN_HOOD(BITNESS))
 #endif
 
 // fallthrough
@@ -175,6 +182,28 @@ static Counts& counts() {
 #    define ROBIN_HOOD_UNLIKELY(condition) __builtin_expect(condition, 0)
 #endif
 
+// detect if native wchar_t type is available in MSVC
+#ifdef _MSC_VER
+#    ifdef _NATIVE_WCHAR_T_DEFINED
+#        define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1
+#    else
+#        define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 0
+#    endif
+#else
+#    define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1
+#endif
+
+// detect if MSVC supports the pair(std::piecewise_construct_t,...) constructor being constexpr
+#ifdef _MSC_VER
+#    if _MSC_VER <= 1900
+#        define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 1
+#    else
+#        define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0
+#    endif
+#else
+#    define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0
+#endif
+
 // workaround missing "is_trivially_copyable" in g++ < 5.0
 // See https://stackoverflow.com/a/31798726/48181
 #if defined(__GNUC__) && __GNUC__ < 5
@@ -274,39 +303,18 @@ using index_sequence_for = make_index_sequence<sizeof...(T)>;
 
 namespace detail {
 
-// umul
-#if defined(__SIZEOF_INT128__)
-#    define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_UMUL128() 1
-#    if defined(__GNUC__) || defined(__clang__)
-#        pragma GCC diagnostic push
-#        pragma GCC diagnostic ignored "-Wpedantic"
-using uint128_t = unsigned __int128;
-#        pragma GCC diagnostic pop
-#    endif
-inline uint64_t umul128(uint64_t a, uint64_t b, uint64_t* high) noexcept {
-    auto result = static_cast<uint128_t>(a) * static_cast<uint128_t>(b);
-    *high = static_cast<uint64_t>(result >> 64U);
-    return static_cast<uint64_t>(result);
-}
-#elif (defined(_MSC_VER) && ROBIN_HOOD(BITNESS) == 64)
-#    define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_UMUL128() 1
-#    include <intrin.h> // for __umulh
-#    pragma intrinsic(__umulh)
-#    ifndef _M_ARM64
-#        pragma intrinsic(_umul128)
-#    endif
-inline uint64_t umul128(uint64_t a, uint64_t b, uint64_t* high) noexcept {
-#    ifdef _M_ARM64
-    *high = __umulh(a, b);
-    return ((uint64_t)(a)) * (b);
-#    else
-    return _umul128(a, b, high);
-#    endif
-}
+// make sure we static_cast to the correct type for hash_int
+#if ROBIN_HOOD(BITNESS) == 64
+using SizeT = uint64_t;
 #else
-#    define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_UMUL128() 0
+using SizeT = uint32_t;
 #endif
 
+template <typename T>
+T rotr(T x, unsigned k) {
+    return (x >> k) | (x << (8U * sizeof(T) - k));
+}
+
 // This cast gets rid of warnings like "cast from 'uint8_t*' {aka 'unsigned char*'} to
 // 'uint64_t*' {aka 'long unsigned int*'} increases required alignment of target type". Use with
 // care!
@@ -323,14 +331,14 @@ inline T reinterpret_cast_no_cast_align_warning(void const* ptr) noexcept {
 // make sure this is not inlined as it is slow and dramatically enlarges code, thus making other
 // inlinings more difficult. Throws are also generally the slow path.
 template <typename E, typename... Args>
-ROBIN_HOOD(NOINLINE)
+[[noreturn]] ROBIN_HOOD(NOINLINE)
 #if ROBIN_HOOD(HAS_EXCEPTIONS)
-void doThrow(Args&&... args) {
+    void doThrow(Args&&... args) {
     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay)
     throw E(std::forward<Args>(args)...);
 }
 #else
-void doThrow(Args&&... ROBIN_HOOD_UNUSED(args) /*unused*/) {
+    void doThrow(Args&&... ROBIN_HOOD_UNUSED(args) /*unused*/) {
     abort();
 }
 #endif
@@ -396,7 +404,8 @@ class BulkPoolAllocator {
     void reset() noexcept {
         while (mListForFree) {
             T* tmp = *mListForFree;
-            free(mListForFree);
+            ROBIN_HOOD_LOG("std::free")
+            std::free(mListForFree);
             mListForFree = reinterpret_cast_no_cast_align_warning<T**>(tmp);
         }
         mHead = nullptr;
@@ -431,8 +440,10 @@ class BulkPoolAllocator {
         // calculate number of available elements in ptr
         if (numBytes < ALIGNMENT + ALIGNED_SIZE) {
             // not enough data for at least one element. Free and return.
-            free(ptr);
+            ROBIN_HOOD_LOG("std::free")
+            std::free(ptr);
         } else {
+            ROBIN_HOOD_LOG("add to buffer")
             add(ptr, numBytes);
         }
     }
@@ -473,10 +484,10 @@ class BulkPoolAllocator {
         mListForFree = data;
 
         // create linked list for newly allocated data
-        auto const headT =
+        auto* const headT =
             reinterpret_cast_no_cast_align_warning<T*>(reinterpret_cast<char*>(ptr) + ALIGNMENT);
 
-        auto const head = reinterpret_cast<char*>(headT);
+        auto* const head = reinterpret_cast<char*>(headT);
 
         // Visual Studio compiler automatically unrolls this loop, which is pretty cool
         for (size_t i = 0; i < numElements; ++i) {
@@ -496,9 +507,10 @@ class BulkPoolAllocator {
         size_t const numElementsToAlloc = calcNumElementsToAlloc();
 
         // alloc new memory: [prev |T, T, ... T]
-        // std::cout << (sizeof(T*) + ALIGNED_SIZE * numElementsToAlloc) << " bytes" << std::endl;
         size_t const bytes = ALIGNMENT + ALIGNED_SIZE * numElementsToAlloc;
-        add(assertNotNull<std::bad_alloc>(malloc(bytes)), bytes);
+        ROBIN_HOOD_LOG("std::malloc " << bytes << " = " << ALIGNMENT << " + " << ALIGNED_SIZE
+                                      << " * " << numElementsToAlloc)
+        add(assertNotNull<std::bad_alloc>(std::malloc(bytes)), bytes);
         return mHead;
     }
 
@@ -534,30 +546,29 @@ struct NodeAllocator<T, MinSize, MaxSize, true> {
 
     // we are not using the data, so just free it.
     void addOrFree(void* ptr, size_t ROBIN_HOOD_UNUSED(numBytes) /*unused*/) noexcept {
-        free(ptr);
+        ROBIN_HOOD_LOG("std::free")
+        std::free(ptr);
     }
 };
 
 template <typename T, size_t MinSize, size_t MaxSize>
 struct NodeAllocator<T, MinSize, MaxSize, false> : public BulkPoolAllocator<T, MinSize, MaxSize> {};
 
-// dummy hash, unsed as mixer when robin_hood::hash is already used
-template <typename T>
-struct identity_hash {
-    constexpr size_t operator()(T const& obj) const noexcept {
-        return static_cast<size_t>(obj);
-    }
-};
-
 // c++14 doesn't have is_nothrow_swappable, and clang++ 6.0.1 doesn't like it either, so I'm making
 // my own here.
 namespace swappable {
+#if ROBIN_HOOD(CXX) < ROBIN_HOOD(CXX17)
 using std::swap;
 template <typename T>
 struct nothrow {
     static const bool value = noexcept(swap(std::declval<T&>(), std::declval<T&>()));
 };
-
+#else
+template <typename T>
+struct nothrow {
+    static const bool value = std::is_nothrow_swappable<T>::value;
+};
+#endif
 } // namespace swappable
 
 } // namespace detail
@@ -586,44 +597,46 @@ struct pair {
         , second(o.second) {}
 
     // pair constructors are explicit so we don't accidentally call this ctor when we don't have to.
-    explicit constexpr pair(std::pair<T1, T2>&& o) noexcept(
-        noexcept(T1(std::move(std::declval<T1&&>()))) &&
-        noexcept(T2(std::move(std::declval<T2&&>()))))
+    explicit constexpr pair(std::pair<T1, T2>&& o) noexcept(noexcept(
+        T1(std::move(std::declval<T1&&>()))) && noexcept(T2(std::move(std::declval<T2&&>()))))
         : first(std::move(o.first))
         , second(std::move(o.second)) {}
 
-    constexpr pair(T1&& a, T2&& b) noexcept(noexcept(T1(std::move(std::declval<T1&&>()))) &&
-                                            noexcept(T2(std::move(std::declval<T2&&>()))))
+    constexpr pair(T1&& a, T2&& b) noexcept(noexcept(
+        T1(std::move(std::declval<T1&&>()))) && noexcept(T2(std::move(std::declval<T2&&>()))))
         : first(std::move(a))
         , second(std::move(b)) {}
 
     template <typename U1, typename U2>
-    constexpr pair(U1&& a, U2&& b) noexcept(noexcept(T1(std::forward<U1>(std::declval<U1&&>()))) &&
-                                            noexcept(T2(std::forward<U2>(std::declval<U2&&>()))))
+    constexpr pair(U1&& a, U2&& b) noexcept(noexcept(T1(std::forward<U1>(
+        std::declval<U1&&>()))) && noexcept(T2(std::forward<U2>(std::declval<U2&&>()))))
         : first(std::forward<U1>(a))
         , second(std::forward<U2>(b)) {}
 
     template <typename... U1, typename... U2>
-    constexpr pair(
-        std::piecewise_construct_t /*unused*/, std::tuple<U1...> a,
-        std::tuple<U2...> b) noexcept(noexcept(pair(std::declval<std::tuple<U1...>&>(),
-                                                    std::declval<std::tuple<U2...>&>(),
-                                                    ROBIN_HOOD_STD::index_sequence_for<U1...>(),
-                                                    ROBIN_HOOD_STD::index_sequence_for<U2...>())))
+    // MSVC 2015 produces error "C2476: ‘constexpr’ constructor does not initialize all members"
+    // if this constructor is constexpr
+#if !ROBIN_HOOD(BROKEN_CONSTEXPR)
+    constexpr
+#endif
+        pair(std::piecewise_construct_t /*unused*/, std::tuple<U1...> a,
+             std::tuple<U2...>
+                 b) noexcept(noexcept(pair(std::declval<std::tuple<U1...>&>(),
+                                           std::declval<std::tuple<U2...>&>(),
+                                           ROBIN_HOOD_STD::index_sequence_for<U1...>(),
+                                           ROBIN_HOOD_STD::index_sequence_for<U2...>())))
         : pair(a, b, ROBIN_HOOD_STD::index_sequence_for<U1...>(),
-               ROBIN_HOOD_STD::index_sequence_for<U2...>()) {}
+               ROBIN_HOOD_STD::index_sequence_for<U2...>()) {
+    }
 
     // constructor called from the std::piecewise_construct_t ctor
     template <typename... U1, size_t... I1, typename... U2, size_t... I2>
-    pair(std::tuple<U1...>& a, std::tuple<U2...>& b,
-         ROBIN_HOOD_STD::index_sequence<I1...> /*unused*/,
-         ROBIN_HOOD_STD::index_sequence<
-             I2...> /*unused*/) noexcept(noexcept(T1(std::
-                                                         forward<U1>(std::get<I1>(
-                                                             std::declval<
-                                                                 std::tuple<U1...>&>()))...)) &&
-                                         noexcept(T2(std::forward<U2>(
-                                             std::get<I2>(std::declval<std::tuple<U2...>&>()))...)))
+    pair(std::tuple<U1...>& a, std::tuple<U2...>& b, ROBIN_HOOD_STD::index_sequence<I1...> /*unused*/, ROBIN_HOOD_STD::index_sequence<I2...> /*unused*/) noexcept(
+        noexcept(T1(std::forward<U1>(std::get<I1>(
+            std::declval<std::tuple<
+                U1...>&>()))...)) && noexcept(T2(std::
+                                                     forward<U2>(std::get<I2>(
+                                                         std::declval<std::tuple<U2...>&>()))...)))
         : first(std::forward<U1>(std::get<I1>(a))...)
         , second(std::forward<U2>(std::get<I2>(b))...) {
         // make visual studio compiler happy about warning about unused a & b.
@@ -658,7 +671,9 @@ inline constexpr bool operator!=(pair<A, B> const& x, pair<A, B> const& y) {
     return !(x == y);
 }
 template <typename A, typename B>
-inline constexpr bool operator<(pair<A, B> const& x, pair<A, B> const& y) {
+inline constexpr bool operator<(pair<A, B> const& x, pair<A, B> const& y) noexcept(noexcept(
+    std::declval<A const&>() < std::declval<A const&>()) && noexcept(std::declval<B const&>() <
+                                                                     std::declval<B const&>())) {
     return x.first < y.first || (!(y.first < x.first) && x.second < y.second);
 }
 template <typename A, typename B>
@@ -674,14 +689,12 @@ inline constexpr bool operator>=(pair<A, B> const& x, pair<A, B> const& y) {
     return !(x < y);
 }
 
-// Hash an arbitrary amount of bytes. This is basically Murmur2 hash without caring about big
-// endianness. TODO(martinus) add a fallback for very large strings?
-static size_t hash_bytes(void const* ptr, size_t const len) noexcept {
+inline size_t hash_bytes(void const* ptr, size_t len) noexcept {
     static constexpr uint64_t m = UINT64_C(0xc6a4a7935bd1e995);
     static constexpr uint64_t seed = UINT64_C(0xe17a1465);
     static constexpr unsigned int r = 47;
 
-    auto const data64 = static_cast<uint64_t const*>(ptr);
+    auto const* const data64 = static_cast<uint64_t const*>(ptr);
     uint64_t h = seed ^ (len * m);
 
     size_t const n_blocks = len / 8;
@@ -696,7 +709,7 @@ static size_t hash_bytes(void const* ptr, size_t const len) noexcept {
         h *= m;
     }
 
-    auto const data8 = reinterpret_cast<uint8_t const*>(data64 + n_blocks);
+    auto const* const data8 = reinterpret_cast<uint8_t const*>(data64 + n_blocks);
     switch (len & 7U) {
     case 7:
         h ^= static_cast<uint64_t>(data8[6]) << 48U;
@@ -725,65 +738,87 @@ static size_t hash_bytes(void const* ptr, size_t const len) noexcept {
     }
 
     h ^= h >> r;
-    h *= m;
-    h ^= h >> r;
+
+    // not doing the final step here, because this will be done by keyToIdx anyways
+    // h *= m;
+    // h ^= h >> r;
     return static_cast<size_t>(h);
 }
 
-inline size_t hash_int(uint64_t obj) noexcept {
-#if ROBIN_HOOD(HAS_UMUL128)
-    // 167079903232 masksum, 120428523 ops best: 0xde5fb9d2630458e9
-    static constexpr uint64_t k = UINT64_C(0xde5fb9d2630458e9);
-    uint64_t h;
-    uint64_t l = detail::umul128(obj, k, &h);
-    return h + l;
-#elif ROBIN_HOOD(BITNESS) == 32
-    uint64_t const r = obj * UINT64_C(0xca4bcaa75ec3f625);
-    auto h = static_cast<uint32_t>(r >> 32U);
-    auto l = static_cast<uint32_t>(r);
-    return h + l;
-#else
-    // murmurhash 3 finalizer
-    uint64_t h = obj;
-    h ^= h >> 33;
-    h *= 0xff51afd7ed558ccd;
-    h ^= h >> 33;
-    h *= 0xc4ceb9fe1a85ec53;
-    h ^= h >> 33;
-    return static_cast<size_t>(h);
-#endif
+inline size_t hash_int(uint64_t x) noexcept {
+    // tried lots of different hashes, let's stick with murmurhash3. It's simple, fast, well tested,
+    // and doesn't need any special 128bit operations.
+    x ^= x >> 33U;
+    x *= UINT64_C(0xff51afd7ed558ccd);
+    x ^= x >> 33U;
+
+    // not doing the final step here, because this will be done by keyToIdx anyways
+    // x *= UINT64_C(0xc4ceb9fe1a85ec53);
+    // x ^= x >> 33U;
+    return static_cast<size_t>(x);
 }
 
 // A thin wrapper around std::hash, performing an additional simple mixing step of the result.
-template <typename T>
+template <typename T, typename Enable = void>
 struct hash : public std::hash<T> {
     size_t operator()(T const& obj) const
         noexcept(noexcept(std::declval<std::hash<T>>().operator()(std::declval<T const&>()))) {
         // call base hash
         auto result = std::hash<T>::operator()(obj);
         // return mixed of that, to be save against identity has
-        return hash_int(static_cast<uint64_t>(result));
+        return hash_int(static_cast<detail::SizeT>(result));
     }
 };
 
-template <>
-struct hash<std::string> {
-    size_t operator()(std::string const& str) const noexcept {
-        return hash_bytes(str.data(), str.size());
+template <typename CharT>
+struct hash<std::basic_string<CharT>> {
+    size_t operator()(std::basic_string<CharT> const& str) const noexcept {
+        return hash_bytes(str.data(), sizeof(CharT) * str.size());
     }
 };
 
+#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17)
+template <typename CharT>
+struct hash<std::basic_string_view<CharT>> {
+    size_t operator()(std::basic_string_view<CharT> const& sv) const noexcept {
+        return hash_bytes(sv.data(), sizeof(CharT) * sv.size());
+    }
+};
+#endif
+
 template <class T>
 struct hash<T*> {
     size_t operator()(T* ptr) const noexcept {
-        return hash_int(reinterpret_cast<size_t>(ptr));
+        return hash_int(reinterpret_cast<detail::SizeT>(ptr));
+    }
+};
+
+template <class T>
+struct hash<std::unique_ptr<T>> {
+    size_t operator()(std::unique_ptr<T> const& ptr) const noexcept {
+        return hash_int(reinterpret_cast<detail::SizeT>(ptr.get()));
+    }
+};
+
+template <class T>
+struct hash<std::shared_ptr<T>> {
+    size_t operator()(std::shared_ptr<T> const& ptr) const noexcept {
+        return hash_int(reinterpret_cast<detail::SizeT>(ptr.get()));
+    }
+};
+
+template <typename Enum>
+struct hash<Enum, typename std::enable_if<std::is_enum<Enum>::value>::type> {
+    size_t operator()(Enum e) const noexcept {
+        using Underlying = typename std::underlying_type<Enum>::type;
+        return hash<Underlying>{}(static_cast<Underlying>(e));
     }
 };
 
 #define ROBIN_HOOD_HASH_INT(T)                           \
     template <>                                          \
     struct hash<T> {                                     \
-        size_t operator()(T obj) const noexcept {        \
+        size_t operator()(T const& obj) const noexcept { \
             return hash_int(static_cast<uint64_t>(obj)); \
         }                                                \
     }
@@ -799,7 +834,9 @@ ROBIN_HOOD_HASH_INT(signed char);
 ROBIN_HOOD_HASH_INT(unsigned char);
 ROBIN_HOOD_HASH_INT(char16_t);
 ROBIN_HOOD_HASH_INT(char32_t);
+#if ROBIN_HOOD(HAS_NATIVE_WCHART)
 ROBIN_HOOD_HASH_INT(wchar_t);
+#endif
 ROBIN_HOOD_HASH_INT(short);
 ROBIN_HOOD_HASH_INT(unsigned short);
 ROBIN_HOOD_HASH_INT(int);
@@ -813,8 +850,20 @@ ROBIN_HOOD_HASH_INT(unsigned long long);
 #endif
 namespace detail {
 
-// using wrapper classes for hash and key_equal prevents the diamond problem when the same type is
-// used. see https://stackoverflow.com/a/28771920/48181
+template <typename T>
+struct void_type {
+    using type = void;
+};
+
+template <typename T, typename = void>
+struct has_is_transparent : public std::false_type {};
+
+template <typename T>
+struct has_is_transparent<T, typename void_type<typename T::is_transparent>::type>
+    : public std::true_type {};
+
+// using wrapper classes for hash and key_equal prevents the diamond problem when the same type
+// is used. see https://stackoverflow.com/a/28771920/48181
 template <typename T>
 struct WrapHash : public T {
     WrapHash() = default;
@@ -831,8 +880,8 @@ struct WrapKeyEqual : public T {
 
 // A highly optimized hashmap implementation, using the Robin Hood algorithm.
 //
-// In most cases, this map should be usable as a drop-in replacement for std::unordered_map, but be
-// about 2x faster in most cases and require much less allocations.
+// In most cases, this map should be usable as a drop-in replacement for std::unordered_map, but
+// be about 2x faster in most cases and require much less allocations.
 //
 // This implementation uses the following memory layout:
 //
@@ -840,8 +889,8 @@ struct WrapKeyEqual : public T {
 //
 // * Node: either a DataNode that directly has the std::pair<key, val> as member,
 //   or a DataNode with a pointer to std::pair<key,val>. Which DataNode representation to use
-//   depends on how fast the swap() operation is. Heuristically, this is automatically choosen based
-//   on sizeof(). there are always 2^n Nodes.
+//   depends on how fast the swap() operation is. Heuristically, this is automatically chosen
+//   based on sizeof(). there are always 2^n Nodes.
 //
 // * info: Each Node in the map has a corresponding info byte, so there are 2^n info bytes.
 //   Each byte is initialized to 0, meaning the corresponding Node is empty. Set to 1 means the
@@ -849,12 +898,11 @@ struct WrapKeyEqual : public T {
 //   actually belongs to the previous position and was pushed out because that place is already
 //   taken.
 //
-// * infoSentinel: Sentinel byte set to 1, so that iterator's ++ can stop at end() without the need
-// for a idx
-//   variable.
+// * infoSentinel: Sentinel byte set to 1, so that iterator's ++ can stop at end() without the
+//   need for a idx variable.
 //
-// According to STL, order of templates has effect on throughput. That's why I've moved the boolean
-// to the front.
+// According to STL, order of templates has effect on throughput. That's why I've moved the
+// boolean to the front.
 // https://www.reddit.com/r/cpp/comments/ahp6iu/compile_time_binary_size_reductions_and_cs_future/eeguck4/
 template <bool IsFlat, size_t MaxLoadFactor100, typename Key, typename T, typename Hash,
           typename KeyEqual>
@@ -870,6 +918,8 @@ class Table
     static constexpr bool is_flat = IsFlat;
     static constexpr bool is_map = !std::is_void<T>::value;
     static constexpr bool is_set = !is_map;
+    static constexpr bool is_transparent =
+        has_is_transparent<Hash>::value && has_is_transparent<KeyEqual>::value;
 
     using key_type = Key;
     using mapped_type = T;
@@ -894,7 +944,8 @@ class Table
     static constexpr size_t InitialNumElements = sizeof(uint64_t);
     static constexpr uint32_t InitialInfoNumBits = 5;
     static constexpr uint8_t InitialInfoInc = 1U << InitialInfoNumBits;
-    static constexpr uint8_t InitialInfoHashShift = sizeof(size_t) * 8 - InitialInfoNumBits;
+    static constexpr size_t InfoMask = InitialInfoInc - 1U;
+    static constexpr uint8_t InitialInfoHashShift = 0;
     using DataPool = detail::NodeAllocator<value_type, 4, 16384, IsFlat>;
 
     // type needs to be wider than uint8_t.
@@ -903,8 +954,8 @@ class Table
     // DataNode ////////////////////////////////////////////////////////
 
     // Primary template for the data node. We have special implementations for small and big
-    // objects. For large objects it is assumed that swap() is fairly slow, so we allocate these on
-    // the heap so swap merely swaps a pointer.
+    // objects. For large objects it is assumed that swap() is fairly slow, so we allocate these
+    // on the heap so swap merely swaps a pointer.
     template <typename M, bool>
     class DataNode {};
 
@@ -953,8 +1004,8 @@ class Table
 
         template <typename VT = value_type>
         ROBIN_HOOD(NODISCARD)
-        typename std::enable_if<is_map, typename VT::first_type const&>::type getFirst() const
-            noexcept {
+        typename std::enable_if<is_map, typename VT::first_type const&>::type
+            getFirst() const noexcept {
             return mData.first;
         }
         template <typename VT = value_type>
@@ -1036,8 +1087,8 @@ class Table
 
         template <typename VT = value_type>
         ROBIN_HOOD(NODISCARD)
-        typename std::enable_if<is_map, typename VT::first_type const&>::type getFirst() const
-            noexcept {
+        typename std::enable_if<is_map, typename VT::first_type const&>::type
+            getFirst() const noexcept {
             return mData->first;
         }
         template <typename VT = value_type>
@@ -1069,7 +1120,7 @@ class Table
 
     using Node = DataNode<Self, IsFlat>;
 
-    // helpers for doInsert: extract first entry (only const required)
+    // helpers for insertKeyPrepareEmptySpot: extract first entry (only const required)
     ROBIN_HOOD(NODISCARD) key_type const& getFirstConst(Node const& n) const noexcept {
         return n.getFirst();
     }
@@ -1097,8 +1148,8 @@ class Table
     template <typename M>
     struct Cloner<M, true> {
         void operator()(M const& source, M& target) const {
-            auto src = reinterpret_cast<char const*>(source.mKeyVals);
-            auto tgt = reinterpret_cast<char*>(target.mKeyVals);
+            auto const* const src = reinterpret_cast<char const*>(source.mKeyVals);
+            auto* tgt = reinterpret_cast<char*>(target.mKeyVals);
             auto const numElementsWithBuffer = target.calcNumElementsWithBuffer(target.mMask + 1);
             std::copy(src, src + target.calcNumBytesTotal(numElementsWithBuffer), tgt);
         }
@@ -1186,8 +1237,8 @@ class Table
         // compared to end().
         Iter() = default;
 
-        // Rule of zero: nothing specified. The conversion constructor is only enabled for iterator
-        // to const_iterator, so it doesn't accidentally work as a copy ctor.
+        // Rule of zero: nothing specified. The conversion constructor is only enabled for
+        // iterator to const_iterator, so it doesn't accidentally work as a copy ctor.
 
         // Conversion constructor from iterator to const_iterator.
         template <bool OtherIsConst,
@@ -1224,6 +1275,12 @@ class Table
             return *this;
         }
 
+        Iter operator++(int) noexcept {
+            Iter tmp = *this;
+            ++(*this);
+            return tmp;
+        }
+
         reference operator*() const {
             return **mKeyVals;
         }
@@ -1244,18 +1301,37 @@ class Table
 
     private:
         // fast forward to the next non-free info byte
+        // I've tried a few variants that don't depend on intrinsics, but unfortunately they are
+        // quite a bit slower than this one. So I've reverted that change again. See map_benchmark.
         void fastForward() noexcept {
-            int inc;
-            do {
-                auto const n = detail::unaligned_load<size_t>(mInfo);
-#if ROBIN_HOOD(LITTLE_ENDIAN)
-                inc = ROBIN_HOOD_COUNT_TRAILING_ZEROES(n) / 8;
+            size_t n = 0;
+            while (0U == (n = detail::unaligned_load<size_t>(mInfo))) {
+                mInfo += sizeof(size_t);
+                mKeyVals += sizeof(size_t);
+            }
+#if defined(ROBIN_HOOD_DISABLE_INTRINSICS)
+            // we know for certain that within the next 8 bytes we'll find a non-zero one.
+            if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load<uint32_t>(mInfo))) {
+                mInfo += 4;
+                mKeyVals += 4;
+            }
+            if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load<uint16_t>(mInfo))) {
+                mInfo += 2;
+                mKeyVals += 2;
+            }
+            if (ROBIN_HOOD_UNLIKELY(0U == *mInfo)) {
+                mInfo += 1;
+                mKeyVals += 1;
+            }
 #else
-                inc = ROBIN_HOOD_COUNT_LEADING_ZEROES(n) / 8;
+#    if ROBIN_HOOD(LITTLE_ENDIAN)
+            auto inc = ROBIN_HOOD_COUNT_TRAILING_ZEROES(n) / 8;
+#    else
+            auto inc = ROBIN_HOOD_COUNT_LEADING_ZEROES(n) / 8;
+#    endif
+            mInfo += inc;
+            mKeyVals += inc;
 #endif
-                mInfo += inc;
-                mKeyVals += inc;
-            } while (inc == static_cast<int>(sizeof(size_t)));
         }
 
         friend class Table<IsFlat, MaxLoadFactor100, key_type, mapped_type, hasher, key_equal>;
@@ -1270,17 +1346,17 @@ class Table
     // The upper 1-5 bits need to be a reasonable good hash, to save comparisons.
     template <typename HashKey>
     void keyToIdx(HashKey&& key, size_t* idx, InfoType* info) const {
-        // for a user-specified hash that is *not* robin_hood::hash, apply robin_hood::hash as an
-        // additional mixing step. This serves as a bad hash prevention, if the given data is badly
-        // mixed.
-        using Mix =
-            typename std::conditional<std::is_same<::robin_hood::hash<key_type>, hasher>::value,
-                                      ::robin_hood::detail::identity_hash<size_t>,
-                                      ::robin_hood::hash<size_t>>::type;
-        *idx = Mix{}(WHash::operator()(key));
+        // In addition to whatever hash is used, add another mul & shift so we get better hashing.
+        // This serves as a bad hash prevention, if the given data is
+        // badly mixed.
+        auto h = static_cast<uint64_t>(WHash::operator()(key));
+
+        h *= mHashMultiplier;
+        h ^= h >> 33U;
 
-        *info = mInfoInc + static_cast<InfoType>(*idx >> mInfoHashShift);
-        *idx &= mMask;
+        // the lower InitialInfoNumBits are reserved for info.
+        *info = mInfoInc + static_cast<InfoType>((h & InfoMask) >> mInfoHashShift);
+        *idx = (static_cast<size_t>(h) >> InitialInfoNumBits) & mMask;
     }
 
     // forwards the index by one, wrapping around at the end
@@ -1308,7 +1384,7 @@ class Table
 
         idx = startIdx;
         while (idx != insertion_idx) {
-            ROBIN_HOOD_COUNT(shiftUp);
+            ROBIN_HOOD_COUNT(shiftUp)
             mInfo[idx] = static_cast<uint8_t>(mInfo[idx - 1] + mInfoInc);
             if (ROBIN_HOOD_UNLIKELY(mInfo[idx] + mInfoInc > 0xFF)) {
                 mMaxNumElementsAllowed = 0;
@@ -1319,12 +1395,13 @@ class Table
 
     void shiftDown(size_t idx) noexcept(std::is_nothrow_move_assignable<Node>::value) {
         // until we find one that is either empty or has zero offset.
-        // TODO(martinus) we don't need to move everything, just the last one for the same bucket.
+        // TODO(martinus) we don't need to move everything, just the last one for the same
+        // bucket.
         mKeyVals[idx].destroy(*this);
 
         // until we find one that is either empty or has zero offset.
         while (mInfo[idx + 1] >= 2 * mInfoInc) {
-            ROBIN_HOOD_COUNT(shiftDown);
+            ROBIN_HOOD_COUNT(shiftDown)
             mInfo[idx] = static_cast<uint8_t>(mInfo[idx + 1] - mInfoInc);
             mKeyVals[idx] = std::move(mKeyVals[idx + 1]);
             ++idx;
@@ -1340,8 +1417,8 @@ class Table
     template <typename Other>
     ROBIN_HOOD(NODISCARD)
     size_t findIdx(Other const& key) const {
-        size_t idx;
-        InfoType info;
+        size_t idx{};
+        InfoType info{};
         keyToIdx(key, &idx, &info);
 
         do {
@@ -1369,16 +1446,16 @@ class Table
     }
 
     // inserts a keyval that is guaranteed to be new, e.g. when the hashmap is resized.
-    // @return index where the element was created
-    size_t insert_move(Node&& keyval) {
+    // No return value; calls throwOverflowError() if the info bits cannot be increased.
+    void insert_move(Node&& keyval) {
         // we don't retry, fail if overflowing
         // don't need to check max num elements
         if (0 == mMaxNumElementsAllowed && !try_increase_info()) {
-            throwOverflowError(); // impossible to reach LCOV_EXCL_LINE
+            throwOverflowError();
         }
 
-        size_t idx;
-        InfoType info;
+        size_t idx{};
+        InfoType info{};
         keyToIdx(keyval.getFirst(), &idx, &info);
 
         // skip forward. Use <= because we are certain that the element is not there.
@@ -1411,24 +1488,29 @@ class Table
         mInfo[insertion_idx] = insertion_info;
 
         ++mNumElements;
-        return insertion_idx;
     }
 
 public:
     using iterator = Iter<false>;
     using const_iterator = Iter<true>;
 
-    // Creates an empty hash map. Nothing is allocated yet, this happens at the first insert. This
-    // tremendously speeds up ctor & dtor of a map that never receives an element. The penalty is
-    // payed at the first insert, and not before. Lookup of this empty map works because everybody
-    // points to DummyInfoByte::b. parameter bucket_count is dictated by the standard, but we can
-    // ignore it.
-    explicit Table(size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, const Hash& h = Hash{},
-                   const KeyEqual& equal = KeyEqual{}) noexcept(noexcept(Hash(h)) &&
-                                                                noexcept(KeyEqual(equal)))
+    Table() noexcept(noexcept(Hash()) && noexcept(KeyEqual()))
+        : WHash()
+        , WKeyEqual() {
+        ROBIN_HOOD_TRACE(this)
+    }
+
+    // Creates an empty hash map. Nothing is allocated yet, this happens at the first insert.
+    // This tremendously speeds up ctor & dtor of a map that never receives an element. The
+    // penalty is paid at the first insert, and not before. Lookup of this empty map works
+    // because everybody points to DummyInfoByte::b. parameter bucket_count is dictated by the
+    // standard, but we can ignore it.
+    explicit Table(
+        size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/, const Hash& h = Hash{},
+        const KeyEqual& equal = KeyEqual{}) noexcept(noexcept(Hash(h)) && noexcept(KeyEqual(equal)))
         : WHash(h)
         , WKeyEqual(equal) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
     }
 
     template <typename Iter>
@@ -1436,7 +1518,7 @@ class Table
           const Hash& h = Hash{}, const KeyEqual& equal = KeyEqual{})
         : WHash(h)
         , WKeyEqual(equal) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         insert(first, last);
     }
 
@@ -1445,7 +1527,7 @@ class Table
           const KeyEqual& equal = KeyEqual{})
         : WHash(h)
         , WKeyEqual(equal) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         insert(initlist.begin(), initlist.end());
     }
 
@@ -1453,8 +1535,9 @@ class Table
         : WHash(std::move(static_cast<WHash&>(o)))
         , WKeyEqual(std::move(static_cast<WKeyEqual&>(o)))
         , DataPool(std::move(static_cast<DataPool&>(o))) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         if (o.mMask) {
+            mHashMultiplier = std::move(o.mHashMultiplier);
             mKeyVals = std::move(o.mKeyVals);
             mInfo = std::move(o.mInfo);
             mNumElements = std::move(o.mNumElements);
@@ -1468,11 +1551,12 @@ class Table
     }
 
     Table& operator=(Table&& o) noexcept {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         if (&o != this) {
             if (o.mMask) {
                 // only move stuff if the other map actually has some data
                 destroy();
+                mHashMultiplier = std::move(o.mHashMultiplier);
                 mKeyVals = std::move(o.mKeyVals);
                 mInfo = std::move(o.mInfo);
                 mNumElements = std::move(o.mNumElements);
@@ -1498,14 +1582,19 @@ class Table
         : WHash(static_cast<const WHash&>(o))
         , WKeyEqual(static_cast<const WKeyEqual&>(o))
         , DataPool(static_cast<const DataPool&>(o)) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         if (!o.empty()) {
             // not empty: create an exact copy. it is also possible to just iterate through all
             // elements and insert them, but copying is probably faster.
 
             auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1);
-            mKeyVals = static_cast<Node*>(detail::assertNotNull<std::bad_alloc>(
-                malloc(calcNumBytesTotal(numElementsWithBuffer))));
+            auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer);
+
+            ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal("
+                                          << numElementsWithBuffer << ")")
+            mHashMultiplier = o.mHashMultiplier;
+            mKeyVals = static_cast<Node*>(
+                detail::assertNotNull<std::bad_alloc>(std::malloc(numBytesTotal)));
             // no need for calloc because clonData does memcpy
             mInfo = reinterpret_cast<uint8_t*>(mKeyVals + numElementsWithBuffer);
             mNumElements = o.mNumElements;
@@ -1521,14 +1610,14 @@ class Table
     // Not sure why clang-tidy thinks this doesn't handle self assignment, it does
     // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp)
     Table& operator=(Table const& o) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         if (&o == this) {
             // prevent assigning of itself
             return *this;
         }
 
-        // we keep using the old allocator and not assign the new one, because we want to keep the
-        // memory available. when it is the same size.
+        // we keep using the old allocator and not assign the new one, because we want to keep
+        // the memory available. when it is the same size.
         if (o.empty()) {
             if (0 == mMask) {
                 // nothing to do, we are empty too
@@ -1553,12 +1642,16 @@ class Table
             // no luck: we don't have the same array size allocated, so we need to realloc.
             if (0 != mMask) {
                 // only deallocate if we actually have data!
-                free(mKeyVals);
+                ROBIN_HOOD_LOG("std::free")
+                std::free(mKeyVals);
             }
 
             auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1);
-            mKeyVals = static_cast<Node*>(detail::assertNotNull<std::bad_alloc>(
-                malloc(calcNumBytesTotal(numElementsWithBuffer))));
+            auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer);
+            ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal("
+                                          << numElementsWithBuffer << ")")
+            mKeyVals = static_cast<Node*>(
+                detail::assertNotNull<std::bad_alloc>(std::malloc(numBytesTotal)));
 
             // no need for calloc here because cloneData performs a memcpy.
             mInfo = reinterpret_cast<uint8_t*>(mKeyVals + numElementsWithBuffer);
@@ -1567,6 +1660,7 @@ class Table
         WHash::operator=(static_cast<const WHash&>(o));
         WKeyEqual::operator=(static_cast<const WKeyEqual&>(o));
         DataPool::operator=(static_cast<DataPool const&>(o));
+        mHashMultiplier = o.mHashMultiplier;
         mNumElements = o.mNumElements;
         mMask = o.mMask;
         mMaxNumElementsAllowed = o.mMaxNumElementsAllowed;
@@ -1579,17 +1673,17 @@ class Table
 
     // Swaps everything between the two maps.
     void swap(Table& o) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         using std::swap;
         swap(o, *this);
     }
 
     // Clears all data, without resizing.
     void clear() {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         if (empty()) {
-            // don't do anything! also important because we don't want to write to DummyInfoByte::b,
-            // even though we would just write 0 to it.
+            // don't do anything! also important because we don't want to write to
+            // DummyInfoByte::b, even though we would just write 0 to it.
             return;
         }
 
@@ -1607,13 +1701,13 @@ class Table
 
     // Destroys the map and all it's contents.
     ~Table() {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         destroy();
     }
 
     // Checks if both tables contain the same entries. Order is irrelevant.
     bool operator==(const Table& other) const {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         if (other.size() != size()) {
             return false;
         }
@@ -1627,20 +1721,61 @@ class Table
     }
 
     bool operator!=(const Table& other) const {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         return !operator==(other);
     }
 
     template <typename Q = mapped_type>
     typename std::enable_if<!std::is_void<Q>::value, Q&>::type operator[](const key_type& key) {
-        ROBIN_HOOD_TRACE(this);
-        return doCreateByKey(key);
+        ROBIN_HOOD_TRACE(this)
+        auto idxAndState = insertKeyPrepareEmptySpot(key);
+        switch (idxAndState.second) {
+        case InsertionState::key_found:
+            break;
+
+        case InsertionState::new_node:
+            ::new (static_cast<void*>(&mKeyVals[idxAndState.first]))
+                Node(*this, std::piecewise_construct, std::forward_as_tuple(key),
+                     std::forward_as_tuple());
+            break;
+
+        case InsertionState::overwrite_node:
+            mKeyVals[idxAndState.first] = Node(*this, std::piecewise_construct,
+                                               std::forward_as_tuple(key), std::forward_as_tuple());
+            break;
+
+        case InsertionState::overflow_error:
+            throwOverflowError();
+        }
+
+        return mKeyVals[idxAndState.first].getSecond();
     }
 
     template <typename Q = mapped_type>
     typename std::enable_if<!std::is_void<Q>::value, Q&>::type operator[](key_type&& key) {
-        ROBIN_HOOD_TRACE(this);
-        return doCreateByKey(std::move(key));
+        ROBIN_HOOD_TRACE(this)
+        auto idxAndState = insertKeyPrepareEmptySpot(key);
+        switch (idxAndState.second) {
+        case InsertionState::key_found:
+            break;
+
+        case InsertionState::new_node:
+            ::new (static_cast<void*>(&mKeyVals[idxAndState.first]))
+                Node(*this, std::piecewise_construct, std::forward_as_tuple(std::move(key)),
+                     std::forward_as_tuple());
+            break;
+
+        case InsertionState::overwrite_node:
+            mKeyVals[idxAndState.first] =
+                Node(*this, std::piecewise_construct, std::forward_as_tuple(std::move(key)),
+                     std::forward_as_tuple());
+            break;
+
+        case InsertionState::overflow_error:
+            throwOverflowError();
+        }
+
+        return mKeyVals[idxAndState.first].getSecond();
     }
 
     template <typename Iter>
@@ -1651,31 +1786,123 @@ class Table
         }
     }
 
+    void insert(std::initializer_list<value_type> ilist) {
+        for (auto&& vt : ilist) {
+            insert(std::move(vt));
+        }
+    }
+
     template <typename... Args>
     std::pair<iterator, bool> emplace(Args&&... args) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         Node n{*this, std::forward<Args>(args)...};
-        auto r = doInsert(std::move(n));
-        if (!r.second) {
-            // insertion not possible: destroy node
-            // NOLINTNEXTLINE(bugprone-use-after-move)
+        auto idxAndState = insertKeyPrepareEmptySpot(getFirstConst(n));
+        switch (idxAndState.second) {
+        case InsertionState::key_found:
+            n.destroy(*this);
+            break;
+
+        case InsertionState::new_node:
+            ::new (static_cast<void*>(&mKeyVals[idxAndState.first])) Node(*this, std::move(n));
+            break;
+
+        case InsertionState::overwrite_node:
+            mKeyVals[idxAndState.first] = std::move(n);
+            break;
+
+        case InsertionState::overflow_error:
             n.destroy(*this);
+            throwOverflowError();
+            break;
         }
-        return r;
+
+        return std::make_pair(iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first),
+                              InsertionState::key_found != idxAndState.second);
+    }
+
+    template <typename... Args>
+    iterator emplace_hint(const_iterator position, Args&&... args) {
+        (void)position;
+        return emplace(std::forward<Args>(args)...).first;
+    }
+
+    template <typename... Args>
+    std::pair<iterator, bool> try_emplace(const key_type& key, Args&&... args) {
+        return try_emplace_impl(key, std::forward<Args>(args)...);
+    }
+
+    template <typename... Args>
+    std::pair<iterator, bool> try_emplace(key_type&& key, Args&&... args) {
+        return try_emplace_impl(std::move(key), std::forward<Args>(args)...);
+    }
+
+    template <typename... Args>
+    iterator try_emplace(const_iterator hint, const key_type& key, Args&&... args) {
+        (void)hint;
+        return try_emplace_impl(key, std::forward<Args>(args)...).first;
+    }
+
+    template <typename... Args>
+    iterator try_emplace(const_iterator hint, key_type&& key, Args&&... args) {
+        (void)hint;
+        return try_emplace_impl(std::move(key), std::forward<Args>(args)...).first;
+    }
+
+    template <typename Mapped>
+    std::pair<iterator, bool> insert_or_assign(const key_type& key, Mapped&& obj) {
+        return insertOrAssignImpl(key, std::forward<Mapped>(obj));
+    }
+
+    template <typename Mapped>
+    std::pair<iterator, bool> insert_or_assign(key_type&& key, Mapped&& obj) {
+        return insertOrAssignImpl(std::move(key), std::forward<Mapped>(obj));
+    }
+
+    template <typename Mapped>
+    iterator insert_or_assign(const_iterator hint, const key_type& key, Mapped&& obj) {
+        (void)hint;
+        return insertOrAssignImpl(key, std::forward<Mapped>(obj)).first;
+    }
+
+    template <typename Mapped>
+    iterator insert_or_assign(const_iterator hint, key_type&& key, Mapped&& obj) {
+        (void)hint;
+        return insertOrAssignImpl(std::move(key), std::forward<Mapped>(obj)).first;
     }
 
     std::pair<iterator, bool> insert(const value_type& keyval) {
-        ROBIN_HOOD_TRACE(this);
-        return doInsert(keyval);
+        ROBIN_HOOD_TRACE(this)
+        return emplace(keyval);
+    }
+
+    iterator insert(const_iterator hint, const value_type& keyval) {
+        (void)hint;
+        return emplace(keyval).first;
     }
 
     std::pair<iterator, bool> insert(value_type&& keyval) {
-        return doInsert(std::move(keyval));
+        return emplace(std::move(keyval));
+    }
+
+    iterator insert(const_iterator hint, value_type&& keyval) {
+        (void)hint;
+        return emplace(std::move(keyval)).first;
     }
 
     // Returns 1 if key is found, 0 otherwise.
     size_t count(const key_type& key) const { // NOLINT(modernize-use-nodiscard)
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
+        auto kv = mKeyVals + findIdx(key);
+        if (kv != reinterpret_cast_no_cast_align_warning<Node*>(mInfo)) {
+            return 1;
+        }
+        return 0;
+    }
+
+    template <typename OtherKey, typename Self_ = Self>
+    // NOLINTNEXTLINE(modernize-use-nodiscard)
+    typename std::enable_if<Self_::is_transparent, size_t>::type count(const OtherKey& key) const {
+        ROBIN_HOOD_TRACE(this)
         auto kv = mKeyVals + findIdx(key);
         if (kv != reinterpret_cast_no_cast_align_warning<Node*>(mInfo)) {
             return 1;
@@ -1687,12 +1914,18 @@ class Table
         return 1U == count(key);
     }
 
+    template <typename OtherKey, typename Self_ = Self>
+    // NOLINTNEXTLINE(modernize-use-nodiscard)
+    typename std::enable_if<Self_::is_transparent, bool>::type contains(const OtherKey& key) const {
+        return 1U == count(key);
+    }
+
     // Returns a reference to the value found for key.
     // Throws std::out_of_range if element cannot be found
     template <typename Q = mapped_type>
     // NOLINTNEXTLINE(modernize-use-nodiscard)
     typename std::enable_if<!std::is_void<Q>::value, Q&>::type at(key_type const& key) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         auto kv = mKeyVals + findIdx(key);
         if (kv == reinterpret_cast_no_cast_align_warning<Node*>(mInfo)) {
             doThrow<std::out_of_range>("key not found");
@@ -1705,7 +1938,7 @@ class Table
     template <typename Q = mapped_type>
     // NOLINTNEXTLINE(modernize-use-nodiscard)
     typename std::enable_if<!std::is_void<Q>::value, Q const&>::type at(key_type const& key) const {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         auto kv = mKeyVals + findIdx(key);
         if (kv == reinterpret_cast_no_cast_align_warning<Node*>(mInfo)) {
             doThrow<std::out_of_range>("key not found");
@@ -1714,44 +1947,60 @@ class Table
     }
 
     const_iterator find(const key_type& key) const { // NOLINT(modernize-use-nodiscard)
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         const size_t idx = findIdx(key);
         return const_iterator{mKeyVals + idx, mInfo + idx};
     }
 
     template <typename OtherKey>
     const_iterator find(const OtherKey& key, is_transparent_tag /*unused*/) const {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
+        const size_t idx = findIdx(key);
+        return const_iterator{mKeyVals + idx, mInfo + idx};
+    }
+
+    template <typename OtherKey, typename Self_ = Self>
+    typename std::enable_if<Self_::is_transparent, // NOLINT(modernize-use-nodiscard)
+                            const_iterator>::type  // NOLINT(modernize-use-nodiscard)
+    find(const OtherKey& key) const {              // NOLINT(modernize-use-nodiscard)
+        ROBIN_HOOD_TRACE(this)
         const size_t idx = findIdx(key);
         return const_iterator{mKeyVals + idx, mInfo + idx};
     }
 
     iterator find(const key_type& key) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         const size_t idx = findIdx(key);
         return iterator{mKeyVals + idx, mInfo + idx};
     }
 
     template <typename OtherKey>
     iterator find(const OtherKey& key, is_transparent_tag /*unused*/) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
+        const size_t idx = findIdx(key);
+        return iterator{mKeyVals + idx, mInfo + idx};
+    }
+
+    template <typename OtherKey, typename Self_ = Self>
+    typename std::enable_if<Self_::is_transparent, iterator>::type find(const OtherKey& key) {
+        ROBIN_HOOD_TRACE(this)
         const size_t idx = findIdx(key);
         return iterator{mKeyVals + idx, mInfo + idx};
     }
 
     iterator begin() {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         if (empty()) {
             return end();
         }
         return iterator(mKeyVals, mInfo, fast_forward_tag{});
     }
     const_iterator begin() const { // NOLINT(modernize-use-nodiscard)
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         return cbegin();
     }
     const_iterator cbegin() const { // NOLINT(modernize-use-nodiscard)
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         if (empty()) {
             return cend();
         }
@@ -1759,22 +2008,22 @@ class Table
     }
 
     iterator end() {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         // no need to supply valid info pointer: end() must not be dereferenced, and only node
         // pointer is compared.
         return iterator{reinterpret_cast_no_cast_align_warning<Node*>(mInfo), nullptr};
     }
     const_iterator end() const { // NOLINT(modernize-use-nodiscard)
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         return cend();
     }
     const_iterator cend() const { // NOLINT(modernize-use-nodiscard)
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         return const_iterator{reinterpret_cast_no_cast_align_warning<Node*>(mInfo), nullptr};
     }
 
     iterator erase(const_iterator pos) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         // its safe to perform const cast here
         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
         return erase(iterator{const_cast<Node*>(pos.mKeyVals), const_cast<uint8_t*>(pos.mInfo)});
@@ -1782,7 +2031,7 @@ class Table
 
     // Erases element at pos, returns iterator to the next element.
     iterator erase(iterator pos) {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         // we assume that pos always points to a valid entry, and not end().
         auto const idx = static_cast<size_t>(pos.mKeyVals - mKeyVals);
 
@@ -1799,9 +2048,9 @@ class Table
     }
 
     size_t erase(const key_type& key) {
-        ROBIN_HOOD_TRACE(this);
-        size_t idx;
-        InfoType info;
+        ROBIN_HOOD_TRACE(this)
+        size_t idx{};
+        InfoType info{};
         keyToIdx(key, &idx, &info);
 
         // check while info matches with the source idx
@@ -1821,53 +2070,66 @@ class Table
     // reserves space for the specified number of elements. Makes sure the old data fits.
     // exactly the same as reserve(c).
     void rehash(size_t c) {
-        reserve(c);
+        // forces a reserve
+        reserve(c, true);
     }
 
     // reserves space for the specified number of elements. Makes sure the old data fits.
-    // Exactly the same as resize(c). Use resize(0) to shrink to fit.
+    // Unlike rehash(c), it only rehashes if the table must grow. Use rehash(0) to shrink.
     void reserve(size_t c) {
-        ROBIN_HOOD_TRACE(this);
-        auto const minElementsAllowed = (std::max)(c, mNumElements);
+        // reserve, but don't force rehash
+        reserve(c, false);
+    }
+
+    // If possible reallocates the map to a smaller one. This frees the underlying table.
+    // Does not do anything if load_factor is too large for decreasing the table's size.
+    void compact() {
+        ROBIN_HOOD_TRACE(this)
         auto newSize = InitialNumElements;
-        while (calcMaxNumElementsAllowed(newSize) < minElementsAllowed && newSize != 0) {
+        while (calcMaxNumElementsAllowed(newSize) < mNumElements && newSize != 0) {
             newSize *= 2;
         }
         if (ROBIN_HOOD_UNLIKELY(newSize == 0)) {
             throwOverflowError();
         }
 
-        rehashPowerOfTwo(newSize);
+        ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask << " + 1")
+
+        // only actually do anything when the new size is smaller than the old one. This avoids
+        // reallocating when compact() cannot shrink the table.
+        if (newSize < mMask + 1) {
+            rehashPowerOfTwo(newSize, true);
+        }
     }
 
     size_type size() const noexcept { // NOLINT(modernize-use-nodiscard)
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         return mNumElements;
     }
 
     size_type max_size() const noexcept { // NOLINT(modernize-use-nodiscard)
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         return static_cast<size_type>(-1);
     }
 
     ROBIN_HOOD(NODISCARD) bool empty() const noexcept {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         return 0 == mNumElements;
     }
 
     float max_load_factor() const noexcept { // NOLINT(modernize-use-nodiscard)
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         return MaxLoadFactor100 / 100.0F;
     }
 
     // Average number of elements per bucket. Since we allow only 1 per bucket
     float load_factor() const noexcept { // NOLINT(modernize-use-nodiscard)
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         return static_cast<float>(size()) / static_cast<float>(mMask + 1);
     }
 
     ROBIN_HOOD(NODISCARD) size_t mask() const noexcept {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         return mMask;
     }
 
@@ -1916,7 +2178,7 @@ class Table
     template <typename Q = mapped_type>
     ROBIN_HOOD(NODISCARD)
     typename std::enable_if<!std::is_void<Q>::value, bool>::type has(const value_type& e) const {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         auto it = find(e.first);
         return it != end() && it->second == e.second;
     }
@@ -1924,14 +2186,35 @@ class Table
     template <typename Q = mapped_type>
     ROBIN_HOOD(NODISCARD)
     typename std::enable_if<std::is_void<Q>::value, bool>::type has(const value_type& e) const {
-        ROBIN_HOOD_TRACE(this);
+        ROBIN_HOOD_TRACE(this)
         return find(e) != end();
     }
 
+    void reserve(size_t c, bool forceRehash) {
+        ROBIN_HOOD_TRACE(this)
+        auto const minElementsAllowed = (std::max)(c, mNumElements);
+        auto newSize = InitialNumElements;
+        while (calcMaxNumElementsAllowed(newSize) < minElementsAllowed && newSize != 0) {
+            newSize *= 2;
+        }
+        if (ROBIN_HOOD_UNLIKELY(newSize == 0)) {
+            throwOverflowError();
+        }
+
+        ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask << " + 1")
+
+        // unless forceRehash is set, only rehash when the new size is bigger than the old
+        // one. This prevents continuous reallocation on each reserve() call.
+        if (forceRehash || newSize > mMask + 1) {
+            rehashPowerOfTwo(newSize, false);
+        }
+    }
+
     // reserves space for at least the specified number of elements.
     // only works if numBuckets if power of two
-    void rehashPowerOfTwo(size_t numBuckets) {
-        ROBIN_HOOD_TRACE(this);
+    // forceFree: std::free the old storage instead of handing it to the DataPool
+    void rehashPowerOfTwo(size_t numBuckets, bool forceFree) {
+        ROBIN_HOOD_TRACE(this)
 
         Node* const oldKeyVals = mKeyVals;
         uint8_t const* const oldInfo = mInfo;
@@ -1939,18 +2222,29 @@ class Table
         const size_t oldMaxElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1);
 
         // resize operation: move stuff
-        init_data(numBuckets);
+        initData(numBuckets);
         if (oldMaxElementsWithBuffer > 1) {
             for (size_t i = 0; i < oldMaxElementsWithBuffer; ++i) {
                 if (oldInfo[i] != 0) {
+                    // might throw an exception, which is really bad since we are in the middle of
+                    // moving stuff.
                     insert_move(std::move(oldKeyVals[i]));
                     // destroy the node but DON'T destroy the data.
                     oldKeyVals[i].~Node();
                 }
             }
 
-            // don't destroy old data: put it into the pool instead
-            DataPool::addOrFree(oldKeyVals, calcNumBytesTotal(oldMaxElementsWithBuffer));
+            // this check is not necessary as it's guarded by the previous if, but it helps
+            // silence g++'s overeager "attempt to free a non-heap object 'map'
+            // [-Werror=free-nonheap-object]" warning.
+            if (oldKeyVals != reinterpret_cast_no_cast_align_warning<Node*>(&mMask)) {
+                // don't destroy old data: put it into the pool instead
+                if (forceFree) {
+                    std::free(oldKeyVals);
+                } else {
+                    DataPool::addOrFree(oldKeyVals, calcNumBytesTotal(oldMaxElementsWithBuffer));
+                }
+            }
         }
     }
 
@@ -1962,17 +2256,80 @@ class Table
 #endif
     }
 
-    void init_data(size_t max_elements) {
+    template <typename OtherKey, typename... Args>
+    std::pair<iterator, bool> try_emplace_impl(OtherKey&& key, Args&&... args) {
+        ROBIN_HOOD_TRACE(this)
+        auto idxAndState = insertKeyPrepareEmptySpot(key);
+        switch (idxAndState.second) {
+        case InsertionState::key_found:
+            break;
+
+        case InsertionState::new_node:
+            ::new (static_cast<void*>(&mKeyVals[idxAndState.first])) Node(
+                *this, std::piecewise_construct, std::forward_as_tuple(std::forward<OtherKey>(key)),
+                std::forward_as_tuple(std::forward<Args>(args)...));
+            break;
+
+        case InsertionState::overwrite_node:
+            mKeyVals[idxAndState.first] = Node(*this, std::piecewise_construct,
+                                               std::forward_as_tuple(std::forward<OtherKey>(key)),
+                                               std::forward_as_tuple(std::forward<Args>(args)...));
+            break;
+
+        case InsertionState::overflow_error:
+            throwOverflowError();
+            break;
+        }
+
+        return std::make_pair(iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first),
+                              InsertionState::key_found != idxAndState.second);
+    }
+
+    template <typename OtherKey, typename Mapped>
+    std::pair<iterator, bool> insertOrAssignImpl(OtherKey&& key, Mapped&& obj) {
+        ROBIN_HOOD_TRACE(this)
+        auto idxAndState = insertKeyPrepareEmptySpot(key);
+        switch (idxAndState.second) {
+        case InsertionState::key_found:
+            mKeyVals[idxAndState.first].getSecond() = std::forward<Mapped>(obj);
+            break;
+
+        case InsertionState::new_node:
+            ::new (static_cast<void*>(&mKeyVals[idxAndState.first])) Node(
+                *this, std::piecewise_construct, std::forward_as_tuple(std::forward<OtherKey>(key)),
+                std::forward_as_tuple(std::forward<Mapped>(obj)));
+            break;
+
+        case InsertionState::overwrite_node:
+            mKeyVals[idxAndState.first] = Node(*this, std::piecewise_construct,
+                                               std::forward_as_tuple(std::forward<OtherKey>(key)),
+                                               std::forward_as_tuple(std::forward<Mapped>(obj)));
+            break;
+
+        case InsertionState::overflow_error:
+            throwOverflowError();
+            break;
+        }
+
+        return std::make_pair(iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first),
+                              InsertionState::key_found != idxAndState.second);
+    }
+
+    void initData(size_t max_elements) {
         mNumElements = 0;
         mMask = max_elements - 1;
         mMaxNumElementsAllowed = calcMaxNumElementsAllowed(max_elements);
 
         auto const numElementsWithBuffer = calcNumElementsWithBuffer(max_elements);
 
-        // calloc also zeroes everything
-        mKeyVals = reinterpret_cast<Node*>(detail::assertNotNull<std::bad_alloc>(
-            calloc(1, calcNumBytesTotal(numElementsWithBuffer))));
+        // malloc & zero mInfo. Faster than calloc everything.
+        auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer);
+        ROBIN_HOOD_LOG("std::calloc " << numBytesTotal << " = calcNumBytesTotal("
+                                      << numElementsWithBuffer << ")")
+        mKeyVals = reinterpret_cast<Node*>(
+            detail::assertNotNull<std::bad_alloc>(std::malloc(numBytesTotal)));
         mInfo = reinterpret_cast<uint8_t*>(mKeyVals + numElementsWithBuffer);
+        std::memset(mInfo, 0, numBytesTotal - numElementsWithBuffer * sizeof(Node));
 
         // set sentinel
         mInfo[numElementsWithBuffer] = 1;
@@ -1981,86 +2338,34 @@ class Table
         mInfoHashShift = InitialInfoHashShift;
     }
 
-    template <typename Arg, typename Q = mapped_type>
-    typename std::enable_if<!std::is_void<Q>::value, Q&>::type doCreateByKey(Arg&& key) {
-        while (true) {
-            size_t idx;
-            InfoType info;
-            keyToIdx(key, &idx, &info);
-            nextWhileLess(&info, &idx);
-
-            // while we potentially have a match. Can't do a do-while here because when mInfo is 0
-            // we don't want to skip forward
-            while (info == mInfo[idx]) {
-                if (WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) {
-                    // key already exists, do not insert.
-                    return mKeyVals[idx].getSecond();
-                }
-                next(&info, &idx);
-            }
+    enum class InsertionState { overflow_error, key_found, new_node, overwrite_node };
 
-            // unlikely that this evaluates to true
-            if (ROBIN_HOOD_UNLIKELY(mNumElements >= mMaxNumElementsAllowed)) {
-                increase_size();
-                continue;
-            }
-
-            // key not found, so we are now exactly where we want to insert it.
-            auto const insertion_idx = idx;
-            auto const insertion_info = info;
-            if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) {
-                mMaxNumElementsAllowed = 0;
-            }
-
-            // find an empty spot
-            while (0 != mInfo[idx]) {
-                next(&info, &idx);
-            }
-
-            auto& l = mKeyVals[insertion_idx];
-            if (idx == insertion_idx) {
-                // put at empty spot. This forwards all arguments into the node where the object is
-                // constructed exactly where it is needed.
-                ::new (static_cast<void*>(&l))
-                    Node(*this, std::piecewise_construct,
-                         std::forward_as_tuple(std::forward<Arg>(key)), std::forward_as_tuple());
-            } else {
-                shiftUp(idx, insertion_idx);
-                l = Node(*this, std::piecewise_construct,
-                         std::forward_as_tuple(std::forward<Arg>(key)), std::forward_as_tuple());
-            }
-
-            // mKeyVals[idx].getFirst() = std::move(key);
-            mInfo[insertion_idx] = static_cast<uint8_t>(insertion_info);
-
-            ++mNumElements;
-            return mKeyVals[insertion_idx].getSecond();
-        }
-    }
-
-    // This is exactly the same code as operator[], except for the return values
-    template <typename Arg>
-    std::pair<iterator, bool> doInsert(Arg&& keyval) {
-        while (true) {
-            size_t idx;
-            InfoType info;
-            keyToIdx(getFirstConst(keyval), &idx, &info);
+    // Finds key, and if not already present prepares a spot where to put the key & value.
+    // This potentially shifts nodes out of the way, updates mInfo and number of inserted
+    // elements, so the only operation left to do is create/assign a new node at that spot.
+    template <typename OtherKey>
+    std::pair<size_t, InsertionState> insertKeyPrepareEmptySpot(OtherKey&& key) {
+        for (int i = 0; i < 256; ++i) {
+            size_t idx{};
+            InfoType info{};
+            keyToIdx(key, &idx, &info);
             nextWhileLess(&info, &idx);
 
             // while we potentially have a match
             while (info == mInfo[idx]) {
-                if (WKeyEqual::operator()(getFirstConst(keyval), mKeyVals[idx].getFirst())) {
+                if (WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) {
                     // key already exists, do NOT insert.
                     // see http://en.cppreference.com/w/cpp/container/unordered_map/insert
-                    return std::make_pair<iterator, bool>(iterator(mKeyVals + idx, mInfo + idx),
-                                                          false);
+                    return std::make_pair(idx, InsertionState::key_found);
                 }
                 next(&info, &idx);
             }
 
             // unlikely that this evaluates to true
             if (ROBIN_HOOD_UNLIKELY(mNumElements >= mMaxNumElementsAllowed)) {
-                increase_size();
+                if (!increase_size()) {
+                    return std::make_pair(size_t(0), InsertionState::overflow_error);
+                }
                 continue;
             }
 
@@ -2076,26 +2381,25 @@ class Table
                 next(&info, &idx);
             }
 
-            auto& l = mKeyVals[insertion_idx];
-            if (idx == insertion_idx) {
-                ::new (static_cast<void*>(&l)) Node(*this, std::forward<Arg>(keyval));
-            } else {
+            if (idx != insertion_idx) {
                 shiftUp(idx, insertion_idx);
-                l = Node(*this, std::forward<Arg>(keyval));
             }
-
             // put at empty spot
             mInfo[insertion_idx] = static_cast<uint8_t>(insertion_info);
-
             ++mNumElements;
-            return std::make_pair(iterator(mKeyVals + insertion_idx, mInfo + insertion_idx), true);
+            return std::make_pair(insertion_idx, idx == insertion_idx
+                                                     ? InsertionState::new_node
+                                                     : InsertionState::overwrite_node);
         }
+
+        // enough attempts failed, so finally give up.
+        return std::make_pair(size_t(0), InsertionState::overflow_error);
     }
 
     bool try_increase_info() {
         ROBIN_HOOD_LOG("mInfoInc=" << mInfoInc << ", numElements=" << mNumElements
                                    << ", maxNumElementsAllowed="
-                                   << calcMaxNumElementsAllowed(mMask + 1));
+                                   << calcMaxNumElementsAllowed(mMask + 1))
         if (mInfoInc <= 2) {
             // need to be > 2 so that shift works (otherwise undefined behavior!)
             return false;
@@ -2120,28 +2424,41 @@ class Table
         return true;
     }
 
-    void increase_size() {
+    // True if resize was possible, false otherwise
+    bool increase_size() {
         // nothing allocated yet? just allocate InitialNumElements
         if (0 == mMask) {
-            init_data(InitialNumElements);
-            return;
+            initData(InitialNumElements);
+            return true;
         }
 
         auto const maxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1);
         if (mNumElements < maxNumElementsAllowed && try_increase_info()) {
-            return;
+            return true;
         }
 
         ROBIN_HOOD_LOG("mNumElements=" << mNumElements << ", maxNumElementsAllowed="
                                        << maxNumElementsAllowed << ", load="
                                        << (static_cast<double>(mNumElements) * 100.0 /
-                                           (static_cast<double>(mMask) + 1)));
-        // it seems we have a really bad hash function! don't try to resize again
+                                           (static_cast<double>(mMask) + 1)))
+
         if (mNumElements * 2 < calcMaxNumElementsAllowed(mMask + 1)) {
-            throwOverflowError();
+            // we have to resize, even though there would still be plenty of space left!
+            // Try to rehash instead. Delete freed memory so we don't steadily increase mem in case
+            // we have to rehash a few times
+            nextHashMultiplier();
+            rehashPowerOfTwo(mMask + 1, true);
+        } else {
+            // we've reached the capacity of the map, so the hash seems to work nice. Keep using it.
+            rehashPowerOfTwo((mMask + 1) * 2, false);
         }
+        return true;
+    }
 
-        rehashPowerOfTwo((mMask + 1) * 2);
+    void nextHashMultiplier() {
+        // adding an *even* number, so that the multiplier will always stay odd. This is necessary
+        // so that the hash stays a mixing function (and thus doesn't have any information loss).
+        mHashMultiplier += UINT64_C(0xc4ceb9fe1a85ec54);
     }
 
     void destroy() {
@@ -2155,15 +2472,16 @@ class Table
 
         // This protection against not deleting mMask shouldn't be needed as it's sufficiently
         // protected with the 0==mMask check, but I have this anyways because g++ 7 otherwise
-        // reports a compile error: attempt to free a non-heap object ‘fm’
+        // reports a compile error: attempt to free a non-heap object 'fm'
         // [-Werror=free-nonheap-object]
-        if (mKeyVals != reinterpret_cast<Node*>(&mMask)) {
-            free(mKeyVals);
+        if (mKeyVals != reinterpret_cast_no_cast_align_warning<Node*>(&mMask)) {
+            ROBIN_HOOD_LOG("std::free")
+            std::free(mKeyVals);
         }
     }
 
     void init() noexcept {
-        mKeyVals = reinterpret_cast<Node*>(&mMask);
+        mKeyVals = reinterpret_cast_no_cast_align_warning<Node*>(&mMask);
         mInfo = reinterpret_cast<uint8_t*>(&mMask);
         mNumElements = 0;
         mMask = 0;
@@ -2173,14 +2491,15 @@ class Table
     }
 
     // members are sorted so no padding occurs
-    Node* mKeyVals = reinterpret_cast<Node*>(&mMask);    // 8 byte  8
-    uint8_t* mInfo = reinterpret_cast<uint8_t*>(&mMask); // 8 byte 16
-    size_t mNumElements = 0;                             // 8 byte 24
-    size_t mMask = 0;                                    // 8 byte 32
-    size_t mMaxNumElementsAllowed = 0;                   // 8 byte 40
-    InfoType mInfoInc = InitialInfoInc;                  // 4 byte 44
-    InfoType mInfoHashShift = InitialInfoHashShift;      // 4 byte 48
-                                                         // 16 byte 56 if NodeAllocator
+    uint64_t mHashMultiplier = UINT64_C(0xc4ceb9fe1a85ec53);                // 8 byte  8
+    Node* mKeyVals = reinterpret_cast_no_cast_align_warning<Node*>(&mMask); // 8 byte 16
+    uint8_t* mInfo = reinterpret_cast<uint8_t*>(&mMask);                    // 8 byte 24
+    size_t mNumElements = 0;                                                // 8 byte 32
+    size_t mMask = 0;                                                       // 8 byte 40
+    size_t mMaxNumElementsAllowed = 0;                                      // 8 byte 48
+    InfoType mInfoInc = InitialInfoInc;                                     // 4 byte 52
+    InfoType mInfoHashShift = InitialInfoHashShift;                         // 4 byte 56
+                                                    // 16 byte 56 if NodeAllocator
 };
 
 } // namespace detail
diff --git a/setup.py b/setup.py
index b218536..d8b2a22 100644
--- a/setup.py
+++ b/setup.py
@@ -172,7 +172,7 @@ def get_extra_args():
     url="https://github.com/kcleal/dysgu",
     description="Structural variant calling",
     license="MIT",
-    version='1.3.9',
+    version='1.3.10',
     python_requires='>=3.7',
     install_requires=[  # runtime requires
             'cython',