forked from scylladb/scylladb
-
Notifications
You must be signed in to change notification settings - Fork 0
/
database.hh
1291 lines (1133 loc) · 55 KB
/
database.hh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* Copyright (C) 2014 ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef DATABASE_HH_
#define DATABASE_HH_
#include "dht/i_partitioner.hh"
#include "locator/abstract_replication_strategy.hh"
#include "core/sstring.hh"
#include "core/shared_ptr.hh"
#include "net/byteorder.hh"
#include "utils/UUID_gen.hh"
#include "utils/UUID.hh"
#include "utils/hash.hh"
#include "db_clock.hh"
#include "gc_clock.hh"
#include <chrono>
#include "core/distributed.hh"
#include <functional>
#include <cstdint>
#include <unordered_map>
#include <map>
#include <set>
#include <iosfwd>
#include <boost/functional/hash.hpp>
#include <boost/range/algorithm/find.hpp>
#include <experimental/optional>
#include <string.h>
#include "types.hh"
#include "compound.hh"
#include "core/future.hh"
#include "core/gate.hh"
#include "cql3/column_specification.hh"
#include "db/commitlog/replay_position.hh"
#include <limits>
#include <cstddef>
#include "schema.hh"
#include "timestamp.hh"
#include "tombstone.hh"
#include "atomic_cell.hh"
#include "query-request.hh"
#include "keys.hh"
#include "mutation.hh"
#include "memtable.hh"
#include <list>
#include "mutation_reader.hh"
#include "row_cache.hh"
#include "compaction_strategy.hh"
#include "sstables/compaction_manager.hh"
#include "utils/exponential_backoff_retry.hh"
#include "utils/histogram.hh"
#include "utils/estimated_histogram.hh"
#include "sstables/compaction.hh"
#include "sstables/sstable_set.hh"
#include <seastar/core/rwlock.hh>
#include <seastar/core/shared_future.hh>
#include <seastar/core/metrics_registration.hh>
#include "tracing/trace_state.hh"
#include <boost/intrusive/parent_from_member.hpp>
#include "db/view/view.hh"
class cell_locker;
class locked_cell;
class frozen_mutation;
class reconcilable_result;
namespace service {
class storage_proxy;
}
namespace sstables {
class sstable;
class entry_descriptor;
}
namespace db {
template<typename T>
class serializer;
class commitlog;
class config;
namespace system_keyspace {
void make(database& db, bool durable, bool volatile_testing_only);
}
}
class replay_position_reordered_exception : public std::exception {};
using shared_memtable = lw_shared_ptr<memtable>;
class memtable_list;
class dirty_memory_manager: public logalloc::region_group_reclaimer {
// We need a separate boolean, because from the LSA point of view, pressure may still be
// mounting, in which case the pressure flag could be set back on if we force it off.
bool _db_shutdown_requested = false;
database* _db;
logalloc::region_group _region_group;
// We would like to serialize the flushing of memtables. While flushing many memtables
// simultaneously can sustain high levels of throughput, the memory is not freed until the
// memtable is totally gone. That means that if we have throttled requests, they will stay
// throttled for a long time. Even when we have virtual dirty, that only provides a rough
// estimate, and we can't release requests that early.
semaphore _flush_serializer;
// We will accept a new flush before another one ends, once it is done with the data write.
// That is so we can keep the disk always busy. But there is still some background work that is
// left to be done. Mostly, update the caches and seal the auxiliary components of the SSTable.
// This semaphore will cap the amount of background work that we have. Note that we're not
// overly concerned about memtable memory, because dirty memory will put a limit to that. This
// is mostly about dangling continuations. So that doesn't have to be a small number.
static constexpr unsigned _max_background_work = 20;
semaphore _background_work_flush_serializer = { _max_background_work };
condition_variable _should_flush;
int64_t _dirty_bytes_released_pre_accounted = 0;
future<> flush_when_needed();
struct flush_permit {
semaphore_units<> permit;
flush_permit(semaphore_units<>&& permit) : permit(std::move(permit)) {}
};
// We need to start a flush before the current one finishes, otherwise
// we'll have a period without significant disk activity when the current
// SSTable is being sealed, the caches are being updated, etc. To do that
// we need to keep track of who is it that we are flushing this memory from.
std::unordered_map<const logalloc::region*, flush_permit> _flush_manager;
future<> _waiting_flush;
virtual void start_reclaiming() noexcept override;
bool has_pressure() const {
return over_soft_limit();
}
seastar::metrics::metric_groups _metrics;
public:
void setup_collectd(sstring namestr);
future<> shutdown();
// Limits and pressure conditions:
// ===============================
//
// Virtual Dirty
// -------------
// We can't free memory until the whole memtable is flushed because we need to keep it in memory
// until the end, but we can fake freeing memory. When we are done with an element of the
// memtable, we will update the region group pretending memory just went down by that amount.
//
// Because the amount of memory that we pretend to free should be close enough to the actual
// memory used by the memtables, that effectively creates two sub-regions inside the dirty
// region group, of equal size. In the worst case, we will have <memtable_total_space> dirty
// bytes used, and half of that already virtually freed.
//
// Hard Limit
// ----------
// The total space that can be used by memtables in each group is defined by the threshold, but
// we will only allow the region_group to grow to half of that. This is because of virtual_dirty
// as explained above. Because virtual dirty is implemented by reducing the usage in the
// region_group directly on partition written, we want to throttle every time half of the memory
// as seen by the region_group. To achieve that we need to set the hard limit (first parameter
// of the region_group_reclaimer) to 1/2 of the user-supplied threshold
//
// Soft Limit
// ----------
// When the soft limit is hit, no throttle happens. The soft limit exists because we don't want
// to start flushing only when the limit is hit, but a bit earlier instead. If we were to start
// flushing only when the hard limit is hit, workloads in which the disk is fast enough to cope
// would see latency added to some requests unnecessarily.
//
// We then set the soft limit to 80 % of the virtual dirty hard limit, which is equal to 40 % of
// the user-supplied threshold.
dirty_memory_manager(database& db, size_t threshold)
: logalloc::region_group_reclaimer(threshold / 2, threshold * 0.40)
, _db(&db)
, _region_group(*this)
, _flush_serializer(1)
, _waiting_flush(flush_when_needed()) {}
dirty_memory_manager() : logalloc::region_group_reclaimer()
, _db(nullptr)
, _region_group(*this)
, _flush_serializer(1)
, _waiting_flush(make_ready_future<>()) {}
static dirty_memory_manager& from_region_group(logalloc::region_group *rg) {
return *(boost::intrusive::get_parent_from_member(rg, &dirty_memory_manager::_region_group));
}
logalloc::region_group& region_group() {
return _region_group;
}
const logalloc::region_group& region_group() const {
return _region_group;
}
void revert_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
_region_group.update(delta);
_dirty_bytes_released_pre_accounted -= delta;
}
void account_potentially_cleaned_up_memory(logalloc::region* from, int64_t delta) {
_region_group.update(-delta);
_dirty_bytes_released_pre_accounted += delta;
}
// This can be called multiple times during the lifetime of the region, and should always
// ultimately be called after the flush ends. However, some flushers may decide to call it
// earlier. For instance, the normal memtables sealing function will call this before updating
// the cache.
//
// Also, for sealing methods like the normal memtable sealing method - that may retry after a
// failed write, calling this method after the attempt is completed with success or failure is
// mandatory. That's because the new attempt will create a new flush reader for the same
// SSTable, so we need to make sure that we revert the old charges.
void remove_from_flush_manager(const logalloc::region *region) {
auto it = _flush_manager.find(region);
if (it != _flush_manager.end()) {
_flush_manager.erase(it);
}
}
void add_to_flush_manager(const logalloc::region *region, flush_permit&& permit) {
_flush_manager.emplace(region, std::move(permit));
}
size_t real_dirty_memory() const {
return _region_group.memory_used() + _dirty_bytes_released_pre_accounted;
}
size_t virtual_dirty_memory() const {
return _region_group.memory_used();
}
future<> flush_one(memtable_list& cf, semaphore_units<> permit);
future<semaphore_units<>> get_flush_permit() {
return get_units(_flush_serializer, 1);
}
};
extern thread_local dirty_memory_manager default_dirty_memory_manager;
// We could just add all memtables, regardless of types, to a single list, and
// then filter them out when we read them. Here's why I have chosen not to do
// it:
//
// First, some of the methods in which a memtable is involved (like seal) are
// assume a commitlog, and go through great care of updating the replay
// position, flushing the log, etc. We want to bypass those, and that has to
// be done either by sprikling the seal code with conditionals, or having a
// separate method for each seal.
//
// Also, if we ever want to put some of the memtables in as separate allocator
// region group to provide for extra QoS, having the classes properly wrapped
// will make that trivial: just pass a version of new_memtable() that puts it
// in a different region, while the list approach would require a lot of
// conditionals as well.
//
// If we are going to have different methods, better have different instances
// of a common class.
class memtable_list {
public:
enum class flush_behavior { delayed, immediate };
private:
std::vector<shared_memtable> _memtables;
std::function<future<> (flush_behavior)> _seal_fn;
std::function<schema_ptr()> _current_schema;
dirty_memory_manager* _dirty_memory_manager;
std::experimental::optional<shared_promise<>> _flush_coalescing;
public:
memtable_list(std::function<future<> (flush_behavior)> seal_fn, std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
: _memtables({})
, _seal_fn(seal_fn)
, _current_schema(cs)
, _dirty_memory_manager(dirty_memory_manager) {
add_memtable();
}
memtable_list(std::function<schema_ptr()> cs, dirty_memory_manager* dirty_memory_manager)
: _memtables({})
, _seal_fn()
, _current_schema(cs)
, _dirty_memory_manager(dirty_memory_manager) {
add_memtable();
}
bool may_flush() const {
return bool(_seal_fn);
}
shared_memtable back() {
return _memtables.back();
}
// The caller has to make sure the element exist before calling this.
void erase(const shared_memtable& element) {
_memtables.erase(boost::range::find(_memtables, element));
}
void clear() {
_memtables.clear();
}
size_t size() const {
return _memtables.size();
}
future<> seal_active_memtable(flush_behavior behavior) {
return _seal_fn(behavior);
}
auto begin() noexcept {
return _memtables.begin();
}
auto begin() const noexcept {
return _memtables.begin();
}
auto end() noexcept {
return _memtables.end();
}
auto end() const noexcept {
return _memtables.end();
}
memtable& active_memtable() {
return *_memtables.back();
}
void add_memtable() {
_memtables.emplace_back(new_memtable());
}
logalloc::region_group& region_group() {
return _dirty_memory_manager->region_group();
}
// This is used for explicit flushes. Will queue the memtable for flushing and proceed when the
// dirty_memory_manager allows us to. We will not seal at this time since the flush itself
// wouldn't happen anyway. Keeping the memtable in memory will potentially increase the time it
// spends in memory allowing for more coalescing opportunities.
future<> request_flush();
private:
lw_shared_ptr<memtable> new_memtable();
};
using sstable_list = sstables::sstable_list;
// The CF has a "stats" structure. But we don't want all fields here,
// since some of them are fairly complex for exporting to collectd. Also,
// that structure matches what we export via the API, so better leave it
// untouched. And we need more fields. We will summarize it in here what
// we need.
struct cf_stats {
int64_t pending_memtables_flushes_count = 0;
int64_t pending_memtables_flushes_bytes = 0;
// number of time the clustering filter was executed
int64_t clustering_filter_count = 0;
// sstables considered by the filter (so dividing this by the previous one we get average sstables per read)
int64_t sstables_checked_by_clustering_filter = 0;
// number of times the filter passed the fast-path checks
int64_t clustering_filter_fast_path_count = 0;
// how many sstables survived the clustering key checks
int64_t surviving_sstables_after_clustering_filter = 0;
};
class column_family {
public:
struct config {
sstring datadir;
bool enable_disk_writes = true;
bool enable_disk_reads = true;
bool enable_cache = true;
bool enable_commitlog = true;
bool enable_incremental_backups = false;
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;
restricted_mutation_reader_config read_concurrency_config;
restricted_mutation_reader_config streaming_read_concurrency_config;
::cf_stats* cf_stats = nullptr;
uint64_t max_cached_partition_size_in_bytes;
};
struct no_commitlog {};
struct stats {
/** Number of times flush has resulted in the memtable being switched out. */
int64_t memtable_switch_count = 0;
/** Estimated number of tasks pending for this column family */
int64_t pending_flushes = 0;
int64_t live_disk_space_used = 0;
int64_t total_disk_space_used = 0;
int64_t live_sstable_count = 0;
/** Estimated number of compactions pending for this column family */
int64_t pending_compactions = 0;
utils::timed_rate_moving_average_and_histogram reads{256};
utils::timed_rate_moving_average_and_histogram writes{256};
utils::estimated_histogram estimated_read;
utils::estimated_histogram estimated_write;
utils::estimated_histogram estimated_sstable_per_read{35};
utils::timed_rate_moving_average_and_histogram tombstone_scanned;
utils::timed_rate_moving_average_and_histogram live_scanned;
};
struct snapshot_details {
int64_t total;
int64_t live;
};
private:
schema_ptr _schema;
config _config;
mutable stats _stats;
uint64_t _failed_counter_applies_to_memtable = 0;
template<typename... Args>
void do_apply(Args&&... args);
lw_shared_ptr<memtable_list> _memtables;
// In older incarnations, we simply commited the mutations to memtables.
// However, doing that makes it harder for us to provide QoS within the
// disk subsystem. Keeping them in separate memtables allow us to properly
// classify those streams into its own I/O class
//
// We could write those directly to disk, but we still want the mutations
// coming through the wire to go to a memtable staging area. This has two
// major advantages:
//
// first, it will allow us to properly order the partitions. They are
// hopefuly sent in order but we can't really guarantee that without
// sacrificing sender-side parallelism.
//
// second, we will be able to coalesce writes from multiple plan_id's and
// even multiple senders, as well as automatically tapping into the dirty
// memory throttling mechanism, guaranteeing we will not overload the
// server.
lw_shared_ptr<memtable_list> _streaming_memtables;
utils::phased_barrier _streaming_flush_phaser;
// If mutations are fragmented during streaming the sstables cannot be made
// visible immediately after memtable flush, because that could cause
// readers to see only a part of a partition thus violating isolation
// guarantees.
// Mutations that are sent in fragments are kept separately in per-streaming
// plan memtables and the resulting sstables are not made visible until
// the streaming is complete.
struct streaming_memtable_big {
lw_shared_ptr<memtable_list> memtables;
std::vector<sstables::shared_sstable> sstables;
seastar::gate flush_in_progress;
};
std::unordered_map<utils::UUID, lw_shared_ptr<streaming_memtable_big>> _streaming_memtables_big;
future<> flush_streaming_big_mutations(utils::UUID plan_id);
void apply_streaming_big_mutation(schema_ptr m_schema, utils::UUID plan_id, const frozen_mutation& m);
future<> seal_active_streaming_memtable_big(streaming_memtable_big& smb);
lw_shared_ptr<memtable_list> make_memory_only_memtable_list();
lw_shared_ptr<memtable_list> make_memtable_list();
lw_shared_ptr<memtable_list> make_streaming_memtable_list();
lw_shared_ptr<memtable_list> make_streaming_memtable_big_list(streaming_memtable_big& smb);
sstables::compaction_strategy _compaction_strategy;
// generation -> sstable. Ordered by key so we can easily get the most recent.
lw_shared_ptr<sstables::sstable_set> _sstables;
// sstables that have been compacted (so don't look up in query) but
// have not been deleted yet, so must not GC any tombstones in other sstables
// that may delete data in these sstables:
std::vector<sstables::shared_sstable> _sstables_compacted_but_not_deleted;
// sstables that have been opened but not loaded yet, that's because refresh
// needs to load all opened sstables atomically, and now, we open a sstable
// in all shards at the same time, which makes it hard to store all sstables
// we need to load later on for all shards.
std::vector<sstables::shared_sstable> _sstables_opened_but_not_loaded;
// sstables that are shared between several shards so we want to rewrite
// them (split the data belonging to this shard to a separate sstable),
// but for correct compaction we need to start the compaction only after
// reading all sstables.
std::vector<sstables::shared_sstable> _sstables_need_rewrite;
// Control background fibers waiting for sstables to be deleted
seastar::gate _sstable_deletion_gate;
// There are situations in which we need to stop writing sstables. Flushers will take
// the read lock, and the ones that wish to stop that process will take the write lock.
rwlock _sstables_lock;
mutable row_cache _cache; // Cache covers only sstables.
std::experimental::optional<int64_t> _sstable_generation = {};
db::replay_position _highest_flushed_rp;
// Provided by the database that owns this commitlog
db::commitlog* _commitlog;
compaction_manager& _compaction_manager;
int _compaction_disabled = 0;
class memtable_flush_queue;
std::unique_ptr<memtable_flush_queue> _flush_queue;
// Because streaming mutations bypass the commitlog, there is
// no need for the complications of the flush queue. Besides, it
// is easier to just use a common gate than it is to modify the flush_queue
// to work both with and without a replay position.
//
// Last but not least, we seldom need to guarantee any ordering here: as long
// as all data is waited for, we're good.
seastar::gate _streaming_flush_gate;
std::unordered_map<sstring, lw_shared_ptr<db::view::view>> _views;
std::vector<view_ptr> _view_schemas;
semaphore _cache_update_sem{1};
std::unique_ptr<cell_locker> _counter_cell_locks;
void set_metrics();
seastar::metrics::metric_groups _metrics;
private:
void update_stats_for_new_sstable(uint64_t disk_space_used_by_sstable, std::vector<unsigned>&& shards_for_the_sstable);
// Adds new sstable to the set of sstables
// Doesn't update the cache.
// Doesn't trigger compaction.
void add_sstable(lw_shared_ptr<sstables::sstable> sstable, std::vector<unsigned>&& shards_for_the_sstable);
// returns an empty pointer if sstable doesn't belong to current shard.
future<lw_shared_ptr<sstables::sstable>> open_sstable(sstables::foreign_sstable_open_info info, sstring dir,
int64_t generation, sstables::sstable::version_types v, sstables::sstable::format_types f);
void load_sstable(lw_shared_ptr<sstables::sstable>& sstable, bool reset_level = false);
lw_shared_ptr<memtable> new_memtable();
lw_shared_ptr<memtable> new_streaming_memtable();
future<stop_iteration> try_flush_memtable_to_sstable(lw_shared_ptr<memtable> memt);
future<> update_cache(memtable&, lw_shared_ptr<sstables::sstable_set> old_sstables);
struct merge_comparator;
// update the sstable generation, making sure that new new sstables don't overwrite this one.
void update_sstables_known_generation(unsigned generation) {
if (!_sstable_generation) {
_sstable_generation = 1;
}
_sstable_generation = std::max<uint64_t>(*_sstable_generation, generation / smp::count + 1);
}
uint64_t calculate_generation_for_new_table() {
assert(_sstable_generation);
// FIXME: better way of ensuring we don't attempt to
// overwrite an existing table.
return (*_sstable_generation)++ * smp::count + engine().cpu_id();
}
// inverse of calculate_generation_for_new_table(), used to determine which
// shard a sstable should be opened at.
static int64_t calculate_shard_from_sstable_generation(int64_t sstable_generation) {
return sstable_generation % smp::count;
}
// Rebuild existing _sstables with new_sstables added to it and sstables_to_remove removed from it.
void rebuild_sstable_list(const std::vector<sstables::shared_sstable>& new_sstables,
const std::vector<sstables::shared_sstable>& sstables_to_remove);
void rebuild_statistics();
private:
using virtual_reader_type = std::function<mutation_reader(schema_ptr, const dht::partition_range&, const query::partition_slice&, const io_priority_class&, tracing::trace_state_ptr)>;
virtual_reader_type _virtual_reader;
// Creates a mutation reader which covers sstables.
// Caller needs to ensure that column_family remains live (FIXME: relax this).
// The 'range' parameter must be live as long as the reader is used.
// Mutations returned by the reader will all have given schema.
mutation_reader make_sstable_reader(schema_ptr schema,
const dht::partition_range& range,
const query::partition_slice& slice,
const io_priority_class& pc,
tracing::trace_state_ptr trace_state) const;
mutation_source sstables_as_mutation_source();
partition_presence_checker make_partition_presence_checker(lw_shared_ptr<sstables::sstable_set>);
std::chrono::steady_clock::time_point _sstable_writes_disabled_at;
void do_trigger_compaction();
public:
uint64_t failed_counter_applies_to_memtable() const {
return _failed_counter_applies_to_memtable;
}
// This function should be called when this column family is ready for writes, IOW,
// to produce SSTables. Extensive details about why this is important can be found
// in Scylla's Github Issue #1014
//
// Nothing should be writing to SSTables before we have the chance to populate the
// existing SSTables and calculate what should the next generation number be.
//
// However, if that happens, we want to protect against it in a way that does not
// involve overwriting existing tables. This is one of the ways to do it: every
// column family starts in an unwriteable state, and when it can finally be written
// to, we mark it as writeable.
//
// Note that this *cannot* be a part of add_column_family. That adds a column family
// to a db in memory only, and if anybody is about to write to a CF, that was most
// likely already called. We need to call this explicitly when we are sure we're ready
// to issue disk operations safely.
void mark_ready_for_writes() {
update_sstables_known_generation(0);
}
// Creates a mutation reader which covers all data sources for this column family.
// Caller needs to ensure that column_family remains live (FIXME: relax this).
// Note: for data queries use query() instead.
// The 'range' parameter must be live as long as the reader is used.
// Mutations returned by the reader will all have given schema.
// If I/O needs to be issued to read anything in the specified range, the operations
// will be scheduled under the priority class given by pc.
mutation_reader make_reader(schema_ptr schema,
const dht::partition_range& range = query::full_partition_range,
const query::partition_slice& slice = query::full_slice,
const io_priority_class& pc = default_priority_class(),
tracing::trace_state_ptr trace_state = nullptr) const;
// The streaming mutation reader differs from the regular mutation reader in that:
// - Reflects all writes accepted by replica prior to creation of the
// reader and a _bounded_ amount of writes which arrive later.
// - Does not populate the cache
mutation_reader make_streaming_reader(schema_ptr schema,
const dht::partition_range& range = query::full_partition_range) const;
// Requires ranges to be sorted and disjoint.
mutation_reader make_streaming_reader(schema_ptr schema,
const dht::partition_range_vector& ranges) const;
mutation_source as_mutation_source(tracing::trace_state_ptr trace_state) const;
void set_virtual_reader(virtual_reader_type virtual_reader) {
_virtual_reader = std::move(virtual_reader);
}
// Queries can be satisfied from multiple data sources, so they are returned
// as temporaries.
//
// FIXME: in case a query is satisfied from a single memtable, avoid a copy
using const_mutation_partition_ptr = std::unique_ptr<const mutation_partition>;
using const_row_ptr = std::unique_ptr<const row>;
memtable& active_memtable() { return _memtables->active_memtable(); }
const row_cache& get_row_cache() const {
return _cache;
}
row_cache& get_row_cache() {
return _cache;
}
future<std::vector<locked_cell>> lock_counter_cells(const mutation& m);
logalloc::occupancy_stats occupancy() const;
private:
column_family(schema_ptr schema, config cfg, db::commitlog* cl, compaction_manager&);
public:
column_family(schema_ptr schema, config cfg, db::commitlog& cl, compaction_manager& cm)
: column_family(schema, std::move(cfg), &cl, cm) {set_metrics();}
column_family(schema_ptr schema, config cfg, no_commitlog, compaction_manager& cm)
: column_family(schema, std::move(cfg), nullptr, cm) {set_metrics();}
column_family(column_family&&) = delete; // 'this' is being captured during construction
~column_family();
const schema_ptr& schema() const { return _schema; }
void set_schema(schema_ptr);
db::commitlog* commitlog() { return _commitlog; }
future<const_mutation_partition_ptr> find_partition(schema_ptr, const dht::decorated_key& key) const;
future<const_mutation_partition_ptr> find_partition_slow(schema_ptr, const partition_key& key) const;
future<const_row_ptr> find_row(schema_ptr, const dht::decorated_key& partition_key, clustering_key clustering_key) const;
// Applies given mutation to this column family
// The mutation is always upgraded to current schema.
void apply(const frozen_mutation& m, const schema_ptr& m_schema, const db::replay_position& = db::replay_position());
void apply(const mutation& m, const db::replay_position& = db::replay_position());
void apply_streaming_mutation(schema_ptr, utils::UUID plan_id, const frozen_mutation&, bool fragmented);
// Returns at most "cmd.limit" rows
future<lw_shared_ptr<query::result>> query(schema_ptr,
const query::read_command& cmd, query::result_request request,
const dht::partition_range_vector& ranges,
tracing::trace_state_ptr trace_state,
query::result_memory_limiter& memory_limiter,
uint64_t max_result_size);
void start();
future<> stop();
future<> flush();
future<> flush(const db::replay_position&);
future<> flush_streaming_mutations(utils::UUID plan_id, dht::partition_range_vector ranges = dht::partition_range_vector{});
future<> fail_streaming_mutations(utils::UUID plan_id);
future<> clear(); // discards memtable(s) without flushing them to disk.
future<db::replay_position> discard_sstables(db_clock::time_point);
// Important warning: disabling writes will only have an effect in the current shard.
// The other shards will keep writing tables at will. Therefore, you very likely need
// to call this separately in all shards first, to guarantee that none of them are writing
// new data before you can safely assume that the whole node is disabled.
future<int64_t> disable_sstable_write() {
_sstable_writes_disabled_at = std::chrono::steady_clock::now();
return _sstables_lock.write_lock().then([this] {
if (_sstables->all()->empty()) {
return make_ready_future<int64_t>(0);
}
int64_t max = 0;
for (auto&& s : *_sstables->all()) {
max = std::max(max, s->generation());
}
return make_ready_future<int64_t>(max);
});
}
// SSTable writes are now allowed again, and generation is updated to new_generation if != -1
// returns the amount of microseconds elapsed since we disabled writes.
std::chrono::steady_clock::duration enable_sstable_write(int64_t new_generation) {
if (new_generation != -1) {
update_sstables_known_generation(new_generation);
}
_sstables_lock.write_unlock();
return std::chrono::steady_clock::now() - _sstable_writes_disabled_at;
}
// Make sure the generation numbers are sequential, starting from "start".
// Generations before "start" are left untouched.
//
// Return the highest generation number seen so far
//
// Word of warning: although this function will reshuffle anything over "start", it is
// very dangerous to do that with live SSTables. This is meant to be used with SSTables
// that are not yet managed by the system.
//
// Parameter all_generations stores the generation of all SSTables in the system, so it
// will be easy to determine which SSTable is new.
// An example usage would query all shards asking what is the highest SSTable number known
// to them, and then pass that + 1 as "start".
future<std::vector<sstables::entry_descriptor>> reshuffle_sstables(std::set<int64_t> all_generations, int64_t start);
// FIXME: this is just an example, should be changed to something more
// general. compact_all_sstables() starts a compaction of all sstables.
// It doesn't flush the current memtable first. It's just a ad-hoc method,
// not a real compaction policy.
future<> compact_all_sstables();
// Compact all sstables provided in the vector.
// If cleanup is set to true, compaction_sstables will run on behalf of a cleanup job,
// meaning that irrelevant keys will be discarded.
future<> compact_sstables(sstables::compaction_descriptor descriptor, bool cleanup = false);
// Performs a cleanup on each sstable of this column family, excluding
// those ones that are irrelevant to this node or being compacted.
// Cleanup is about discarding keys that are no longer relevant for a
// given sstable, e.g. after node loses part of its token range because
// of a newly added node.
future<> cleanup_sstables(sstables::compaction_descriptor descriptor);
future<bool> snapshot_exists(sstring name);
future<> snapshot(sstring name);
future<> clear_snapshot(sstring name);
future<std::unordered_map<sstring, snapshot_details>> get_snapshot_details();
const bool incremental_backups_enabled() const {
return _config.enable_incremental_backups;
}
void set_incremental_backups(bool val) {
_config.enable_incremental_backups = val;
}
const sstables::sstable_set& get_sstable_set() const;
lw_shared_ptr<sstable_list> get_sstables() const;
lw_shared_ptr<sstable_list> get_sstables_including_compacted_undeleted() const;
const std::vector<sstables::shared_sstable>& compacted_undeleted_sstables() const;
std::vector<sstables::shared_sstable> select_sstables(const dht::partition_range& range) const;
size_t sstables_count() const;
std::vector<uint64_t> sstable_count_per_level() const;
int64_t get_unleveled_sstables() const;
void start_compaction();
void trigger_compaction();
future<> run_compaction(sstables::compaction_descriptor descriptor);
void set_compaction_strategy(sstables::compaction_strategy_type strategy);
const sstables::compaction_strategy& get_compaction_strategy() const {
return _compaction_strategy;
}
sstables::compaction_strategy& get_compaction_strategy() {
return _compaction_strategy;
}
const stats& get_stats() const {
return _stats;
}
::cf_stats* cf_stats() {
return _config.cf_stats;
}
compaction_manager& get_compaction_manager() const {
return _compaction_manager;
}
template<typename Func, typename Result = futurize_t<std::result_of_t<Func()>>>
Result run_with_compaction_disabled(Func && func) {
++_compaction_disabled;
return _compaction_manager.remove(this).then(std::forward<Func>(func)).finally([this] {
if (--_compaction_disabled == 0) {
// we're turning if on again, use function that does not increment
// the counter further.
do_trigger_compaction();
}
});
}
void add_or_update_view(view_ptr v);
void remove_view(view_ptr v);
const std::vector<view_ptr>& views() const;
future<> push_view_replica_updates(const schema_ptr& base, mutation&& m) const;
private:
void update_view_schemas();
std::vector<lw_shared_ptr<db::view::view>> affected_views(const schema_ptr& base, const mutation& update) const;
future<std::vector<mutation>> generate_view_updates(const schema_ptr& base,
std::vector<lw_shared_ptr<db::view::view>>&& views,
streamed_mutation updates,
streamed_mutation existings) const;
// One does not need to wait on this future if all we are interested in, is
// initiating the write. The writes initiated here will eventually
// complete, and the seastar::gate below will make sure they are all
// completed before we stop() this column_family.
//
// But it is possible to synchronously wait for the seal to complete by
// waiting on this future. This is useful in situations where we want to
// synchronously flush data to disk.
future<> seal_active_memtable(memtable_list::flush_behavior behavior = memtable_list::flush_behavior::delayed);
// I am assuming here that the repair process will potentially send ranges containing
// few mutations, definitely not enough to fill a memtable. It wants to know whether or
// not each of those ranges individually succeeded or failed, so we need a future for
// each.
//
// One of the ways to fix that, is changing the repair itself to send more mutations at
// a single batch. But relying on that is a bad idea for two reasons:
//
// First, the goals of the SSTable writer and the repair sender are at odds. The SSTable
// writer wants to write as few SSTables as possible, while the repair sender wants to
// break down the range in pieces as small as it can and checksum them individually, so
// it doesn't have to send a lot of mutations for no reason.
//
// Second, even if the repair process wants to process larger ranges at once, some ranges
// themselves may be small. So while most ranges would be large, we would still have
// potentially some fairly small SSTables lying around.
//
// The best course of action in this case is to coalesce the incoming streams write-side.
// repair can now choose whatever strategy - small or big ranges - it wants, resting assure
// that the incoming memtables will be coalesced together.
shared_promise<> _waiting_streaming_flushes;
timer<> _delayed_streaming_flush{[this] { _streaming_memtables->request_flush(); }};
future<> seal_active_streaming_memtable_delayed();
future<> seal_active_streaming_memtable_immediate();
future<> seal_active_streaming_memtable(memtable_list::flush_behavior behavior) {
if (behavior == memtable_list::flush_behavior::delayed) {
return seal_active_streaming_memtable_delayed();
} else if (behavior == memtable_list::flush_behavior::immediate) {
return seal_active_streaming_memtable_immediate();
} else {
// Impossible
assert(0);
}
}
// filter manifest.json files out
static bool manifest_json_filter(const sstring& fname);
// Iterate over all partitions. Protocol is the same as std::all_of(),
// so that iteration can be stopped by returning false.
// Func signature: bool (const decorated_key& dk, const mutation_partition& mp)
template <typename Func>
future<bool> for_all_partitions(schema_ptr, Func&& func) const;
void check_valid_rp(const db::replay_position&) const;
public:
void start_rewrite();
// Iterate over all partitions. Protocol is the same as std::all_of(),
// so that iteration can be stopped by returning false.
future<bool> for_all_partitions_slow(schema_ptr, std::function<bool (const dht::decorated_key&, const mutation_partition&)> func) const;
friend std::ostream& operator<<(std::ostream& out, const column_family& cf);
// Testing purposes.
friend class column_family_test;
friend class distributed_loader;
};
class user_types_metadata {
std::unordered_map<bytes, user_type> _user_types;
public:
user_type get_type(const bytes& name) const {
return _user_types.at(name);
}
const std::unordered_map<bytes, user_type>& get_all_types() const {
return _user_types;
}
void add_type(user_type type) {
auto i = _user_types.find(type->_name);
assert(i == _user_types.end() || type->is_compatible_with(*i->second));
_user_types[type->_name] = std::move(type);
}
void remove_type(user_type type) {
_user_types.erase(type->_name);
}
friend std::ostream& operator<<(std::ostream& os, const user_types_metadata& m);
};
class keyspace_metadata final {
sstring _name;
sstring _strategy_name;
std::map<sstring, sstring> _strategy_options;
std::unordered_map<sstring, schema_ptr> _cf_meta_data;
bool _durable_writes;
lw_shared_ptr<user_types_metadata> _user_types;
public:
keyspace_metadata(sstring name,
sstring strategy_name,
std::map<sstring, sstring> strategy_options,
bool durable_writes,
std::vector<schema_ptr> cf_defs = std::vector<schema_ptr>{},
lw_shared_ptr<user_types_metadata> user_types = make_lw_shared<user_types_metadata>())
: _name{std::move(name)}
, _strategy_name{strategy_name.empty() ? "NetworkTopologyStrategy" : strategy_name}
, _strategy_options{std::move(strategy_options)}
, _durable_writes{durable_writes}
, _user_types{std::move(user_types)}
{
for (auto&& s : cf_defs) {
_cf_meta_data.emplace(s->cf_name(), s);
}
}
static lw_shared_ptr<keyspace_metadata>
new_keyspace(sstring name,
sstring strategy_name,
std::map<sstring, sstring> options,
bool durables_writes,
std::vector<schema_ptr> cf_defs = std::vector<schema_ptr>{})
{
return ::make_lw_shared<keyspace_metadata>(name, strategy_name, options, durables_writes, cf_defs);
}
void validate() const;
const sstring& name() const {
return _name;
}
const sstring& strategy_name() const {
return _strategy_name;
}
const std::map<sstring, sstring>& strategy_options() const {
return _strategy_options;
}
const std::unordered_map<sstring, schema_ptr>& cf_meta_data() const {
return _cf_meta_data;
}
bool durable_writes() const {
return _durable_writes;
}
const lw_shared_ptr<user_types_metadata>& user_types() const {
return _user_types;
}
void add_or_update_column_family(const schema_ptr& s) {
_cf_meta_data[s->cf_name()] = s;
}
void remove_column_family(const schema_ptr& s) {
_cf_meta_data.erase(s->cf_name());
}
void add_user_type(const user_type ut) {
_user_types->add_type(ut);
}
void remove_user_type(const user_type ut) {
_user_types->remove_type(ut);
}
std::vector<schema_ptr> tables() const;
std::vector<view_ptr> views() const;
friend std::ostream& operator<<(std::ostream& os, const keyspace_metadata& m);
};
class keyspace {
public:
struct config {
sstring datadir;
bool enable_commitlog = true;
bool enable_disk_reads = true;
bool enable_disk_writes = true;
bool enable_cache = true;
bool enable_incremental_backups = false;
::dirty_memory_manager* dirty_memory_manager = &default_dirty_memory_manager;
::dirty_memory_manager* streaming_dirty_memory_manager = &default_dirty_memory_manager;