From a12be569af41b0a8c3cf4ec3b47cdf2b67e23372 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Sun, 20 Feb 2022 14:33:37 -0800 Subject: [PATCH 01/34] Update HISTORY.md and version.h for 7.0 release (#9609) Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/9609 Reviewed By: riversand963 Differential Revision: D34370309 Pulled By: ajkr fbshipit-source-id: 5fc9306439aefa4b2d61d847534ea6758c30b6a5 --- HISTORY.md | 2 +- include/rocksdb/version.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 714d1a1fe..f1908c56b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,5 @@ # Rocksdb Change Log -## Unreleased +## 7.0.0 (02/20/2022) ### Bug Fixes * Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.) * Fixed more cases of EventListener::OnTableFileCreated called with OK status, file_size==0, and no SST file kept. Now the status is Aborted. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 07c3ca3b4..0fb3de0b4 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -9,8 +9,8 @@ #include "rocksdb/rocksdb_namespace.h" -#define ROCKSDB_MAJOR 6 -#define ROCKSDB_MINOR 29 +#define ROCKSDB_MAJOR 7 +#define ROCKSDB_MINOR 0 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with From b251c4f5a9aed7ec6742e408c534d83317c1f734 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Sun, 27 Feb 2022 11:36:54 -0800 Subject: [PATCH 02/34] Dedicate cacheline for DB mutex (#9637) Summary: We found a case of cacheline bouncing due to writers locking/unlocking `mutex_` and readers accessing `block_cache_tracer_`. We discovered it only after the issue was fixed by https://github.com/facebook/rocksdb/issues/9462 shifting the `DBImpl` members such that `mutex_` and `block_cache_tracer_` were naturally placed in separate cachelines in our regression testing setup. This PR forces the cacheline alignment of `mutex_` so we don't accidentally reintroduce the problem. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9637 Reviewed By: riversand963 Differential Revision: D34502233 Pulled By: ajkr fbshipit-source-id: 46aa313b7fe83e80c3de254e332b6fb242434c07 --- db/db_impl/db_impl.h | 5 ++++- monitoring/instrumented_mutex.h | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 9c07e81fc..c37a46c3e 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -1187,7 +1187,10 @@ class DBImpl : public DB { // WriteToWAL need different synchronization: log_empty_, alive_log_files_, // logs_, logfile_number_. Refer to the definition of each variable below for // more description. - mutable InstrumentedMutex mutex_; + // + // `mutex_` can be a hot lock in some workloads, so it deserves dedicated + // cachelines. 
+  mutable CacheAlignedInstrumentedMutex mutex_;
  ColumnFamilyHandleImpl* default_cf_handle_;
  InternalStats* default_cf_internal_stats_;
diff --git a/monitoring/instrumented_mutex.h b/monitoring/instrumented_mutex.h
index 1e72815bf..e05c6addd 100644
--- a/monitoring/instrumented_mutex.h
+++ b/monitoring/instrumented_mutex.h
@@ -51,6 +51,14 @@ class InstrumentedMutex {
  int stats_code_;
};

+class ALIGN_AS(CACHE_LINE_SIZE) CacheAlignedInstrumentedMutex
+    : public InstrumentedMutex {
+  using InstrumentedMutex::InstrumentedMutex;
+  char padding[(CACHE_LINE_SIZE - sizeof(InstrumentedMutex) % CACHE_LINE_SIZE) %
+               CACHE_LINE_SIZE] ROCKSDB_FIELD_UNUSED;
+};
+static_assert(sizeof(CacheAlignedInstrumentedMutex) % CACHE_LINE_SIZE == 0);
+
// RAII wrapper for InstrumentedMutex
class InstrumentedMutexLock {
 public:

From a4bf9311da1180b8fbe023c8e5db746c9b9c49e1 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Mon, 28 Feb 2022 23:45:08 -0800
Subject: [PATCH 03/34] Handle failures in block-based table size/offset
 approximation (#9615)

Summary:
In crash test with fault injection, we were seeing stack traces like the following:

```
#3  0x00007f75f763c533 in __GI___assert_fail (assertion=assertion@entry=0x1c5b2a0 "end_offset >= start_offset", file=file@entry=0x1c580a0 "table/block_based/block_based_table_reader.cc", line=line@entry=3245, function=function@entry=0x1c60e60 "virtual uint64_t rocksdb::BlockBasedTable::ApproximateSize(const rocksdb::Slice&, const rocksdb::Slice&, rocksdb::TableReaderCaller)") at assert.c:101
#4  0x00000000010ea9b4 in rocksdb::BlockBasedTable::ApproximateSize (this=<optimized out>, start=..., end=..., caller=<optimized out>) at table/block_based/block_based_table_reader.cc:3224
#5  0x0000000000be61fb in rocksdb::TableCache::ApproximateSize (this=0x60f0000161b0, start=..., end=..., fd=..., caller=caller@entry=rocksdb::kCompaction, internal_comparator=..., prefix_extractor=...) at db/table_cache.cc:719
#6  0x0000000000c3eaec in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, v=<optimized out>, f=..., start=..., end=..., caller=<optimized out>) at ./db/version_set.h:850
#7  0x0000000000c6ebc3 in rocksdb::VersionSet::ApproximateSize (this=<optimized out>, options=..., v=v@entry=0x621000047500, start=..., end=..., start_level=start_level@entry=0, end_level=<optimized out>, caller=<optimized out>) at db/version_set.cc:5657
#8  0x000000000166e894 in rocksdb::CompactionJob::GenSubcompactionBoundaries (this=<optimized out>) at ./include/rocksdb/options.h:1869
#9  0x000000000168c526 in rocksdb::CompactionJob::Prepare (this=this@entry=0x7f75f3ffcf00) at db/compaction/compaction_job.cc:546
```

The problem occurred in `ApproximateSize()` when the index `Seek()` for the first `ApproximateDataOffsetOf()` encountered an I/O error, while the second `Seek()` did not. In the old code that scenario caused `start_offset == data_size`, thus it was easy to trip the assertion that `end_offset >= start_offset`.

The fix is to set `start_offset == 0` when the first index `Seek()` fails, and `end_offset == data_size` when the second index `Seek()` fails. I doubt these give an "on average correct" answer for how this function is used, but I/O errors in index seeks are hopefully rare, it looked consistent with what was already there, and it was easier to calculate.
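The net effect of the fallbacks, as a simplified compilable sketch (the `SeekResult` type below is a hypothetical stand-in for the index iterator state, not RocksDB's API):

```cpp
// Sketch of the fallback logic added by this patch (hypothetical types).
#include <cassert>
#include <cstdint>

struct SeekResult {
  bool ok;          // models index_iter->status().ok()
  uint64_t offset;  // models ApproximateDataOffsetOf() when ok
};

uint64_t ApproximateRangeBytes(SeekResult start_seek, SeekResult end_seek,
                               uint64_t data_size) {
  // Failed lower-bound seek: assume the file is involved from the start.
  uint64_t start_offset = start_seek.ok ? start_seek.offset : 0;
  // Failed upper-bound seek: assume the file is involved until the end.
  uint64_t end_offset = end_seek.ok ? end_seek.offset : data_size;
  // Unlike the old code, a failed seek can no longer leave
  // start_offset == data_size while end_offset is smaller.
  assert(end_offset >= start_offset);
  return end_offset - start_offset;
}
```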
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9615

Test Plan: ran the repro command for a while and stopped seeing coredumps -

```
$ while ! ./db_stress --block_size=128 --cache_size=32768 --clear_column_family_one_in=0 --column_families=1 --continuous_verification_interval=0 --db=/dev/shm/rocksdb_crashtest --delpercent=4 --delrangepercent=1 --destroy_db_initially=0 --expected_values_dir=/dev/shm/rocksdb_crashtest_expected --index_type=2 --iterpercent=10 --kill_random_test=18887 --max_key=1000000 --max_bytes_for_level_base=2048576 --nooverwritepercent=1 --open_files=-1 --open_read_fault_one_in=32 --ops_per_thread=1000000 --prefixpercent=5 --read_fault_one_in=0 --readpercent=45 --reopen=0 --skip_verifydb=1 --subcompactions=2 --target_file_size_base=524288 --test_batches_snapshots=0 --value_size_mult=32 --write_buffer_size=524288 --writepercent=35 ; do : ; done
```

Reviewed By: pdillinger

Differential Revision: D34383069

Pulled By: ajkr

fbshipit-source-id: fac26c3b20ea962e75387515ba5f2724dc48719f
---
 table/block_based/block_based_table_reader.cc | 34 ++++++++++++++++---
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 4908a03af..686f38ba1 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -3158,6 +3158,7 @@ Status BlockBasedTable::CreateIndexReader(
 uint64_t BlockBasedTable::ApproximateDataOffsetOf(
     const InternalIteratorBase<IndexValue>& index_iter,
     uint64_t data_size) const {
+  assert(index_iter.status().ok());
  if (index_iter.Valid()) {
    BlockHandle handle = index_iter.value().handle;
    return handle.offset();
@@ -3200,8 +3201,16 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key,
  }

  index_iter->Seek(key);
+  uint64_t offset;
+  if (index_iter->status().ok()) {
+    offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  } else {
+    // Split in half to avoid skewing one way or another,
+    // since we don't know whether we're operating on lower bound or
+    // upper bound.
+    return rep_->file_size / 2;
+  }

-  uint64_t offset = ApproximateDataOffsetOf(*index_iter, data_size);
  // Pro-rate file metadata (incl filters) size-proportionally across data
  // blocks.
  double size_ratio =
@@ -3217,7 +3226,9 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
  uint64_t data_size = GetApproximateDataSize();
  if (UNLIKELY(data_size == 0)) {
    // Hmm. Assume whole file is involved, since we have lower and upper
-    // bound.
+    // bound. This likely skews the estimate if we consider that this function
+    // is typically called with `[start, end]` fully contained in the file's
+    // key-range.
    return rep_->file_size;
  }

@@ -3235,9 +3246,24 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end,
  }

  index_iter->Seek(start);
-  uint64_t start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  uint64_t start_offset;
+  if (index_iter->status().ok()) {
+    start_offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  } else {
+    // Assume file is involved from the start. This likely skews the estimate
+    // but is consistent with the above error handling.
+    start_offset = 0;
+  }

  index_iter->Seek(end);
-  uint64_t end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  uint64_t end_offset;
+  if (index_iter->status().ok()) {
+    end_offset = ApproximateDataOffsetOf(*index_iter, data_size);
+  } else {
+    // Assume file is involved until the end. This likely skews the estimate
+    // but is consistent with the above error handling.
+    end_offset = data_size;
+  }
  assert(end_offset >= start_offset);

  // Pro-rate file metadata (incl filters) size-proportionally across data

From 0344b1d331cbd7baa0f3d9fd9626303764158150 Mon Sep 17 00:00:00 2001
From: Jay Zhuang
Date: Wed, 2 Mar 2022 13:43:00 -0800
Subject: [PATCH 04/34] Unschedule manual compaction from thread-pool queue
 (#9625)

Summary:
PR https://github.com/facebook/rocksdb/issues/9557 introduced a race condition between the manual compaction foreground thread and the background compaction thread. This PR adds the ability to really unschedule a manual compaction from the thread-pool queue by differentiating the tag name for manual compaction and other tasks. It also fixes an issue where db `close()` didn't cancel the manual compaction thread.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9625

Test Plan: unit tests no longer hang

Reviewed By: ajkr

Differential Revision: D34410811

Pulled By: jay-zhuang

fbshipit-source-id: cb14065eabb8cf1345fa042b5652d4f788c0c40c
---
 HISTORY.md                             |   4 +
 db/db_compaction_test.cc               |  61 +++++++-
 db/db_impl/db_impl.cc                  |  15 +-
 db/db_impl/db_impl.h                   |  20 ++-
 db/db_impl/db_impl_compaction_flush.cc | 193 +++++++++++++------------
 5 files changed, 191 insertions(+), 102 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index f1908c56b..352267328 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,4 +1,8 @@
 # Rocksdb Change Log
+## Unreleased
+### Bug Fixes
+* Fix a race condition when cancel manual compaction with `DisableManualCompaction`. Also DB close can cancel the manual compaction thread.
+
 ## 7.0.0 (02/20/2022)
 ### Bug Fixes
 * Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.)
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 6af4bcafd..94933e8f2 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -6951,7 +6951,7 @@ TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) { SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::RunManualCompaction:Scheduled", - "DBCompactionTest::DisableManualCompactionThreadQueueFull:" + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" "PreDisableManualCompaction"}}); SyncPoint::GetInstance()->EnableProcessing(); @@ -6979,7 +6979,7 @@ TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) { }); TEST_SYNC_POINT( - "DBCompactionTest::DisableManualCompactionThreadQueueFull:" + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" "PreDisableManualCompaction"); // Generate more files to trigger auto compaction which is scheduled after @@ -7006,6 +7006,63 @@ TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFullDBClose) { sleeping_task_low.WaitUntilDone(); } +TEST_F(DBCompactionTest, DBCloseWithManualCompaction) { + const int kNumL0Files = 4; + + SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::RunManualCompaction:Scheduled", + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"}}); + SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + // generate files, but avoid trigger auto compaction + for (int i = 0; i < kNumL0Files / 2; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + + port::Thread compact_thread([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = true; + auto s = db_->CompactRange(cro, nullptr, nullptr); + ASSERT_TRUE(s.IsIncomplete()); + }); + + TEST_SYNC_POINT( + "DBCompactionTest::DisableManualCompactionThreadQueueFullDBClose:" + "PreDisableManualCompaction"); + + // Generate more files to trigger auto compaction which is scheduled after + // manual compaction. Has to generate 4 more files because existing files are + // pending compaction + for (int i = 0; i < kNumL0Files; i++) { + ASSERT_OK(Put(Key(1), "value1")); + ASSERT_OK(Put(Key(2), "value2")); + ASSERT_OK(Flush()); + } + ASSERT_EQ(ToString(kNumL0Files + (kNumL0Files / 2)), FilesPerLevel(0)); + + // Close DB with manual compaction and auto triggered compaction in the queue. + auto s = db_->Close(); + ASSERT_OK(s); + + // manual compaction thread should return with Incomplete(). + compact_thread.join(); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + TEST_F(DBCompactionTest, DisableManualCompactionDoesNotWaitForDrainingAutomaticCompaction) { // When `CompactRangeOptions::exclusive_manual_compaction == true`, we wait diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 196b428a3..d57c992ce 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -527,10 +527,19 @@ Status DBImpl::CloseHelper() { // marker. 
After this we do a variant of the waiting and unschedule work
  // (to consider: moving all the waiting into CancelAllBackgroundWork(true))
  CancelAllBackgroundWork(false);
+
+  // Cancel manual compaction if there's any
+  if (HasPendingManualCompaction()) {
+    DisableManualCompaction();
+  }
  mutex_.Lock();
-  env_->UnSchedule(this, Env::Priority::BOTTOM);
-  env_->UnSchedule(this, Env::Priority::LOW);
-  env_->UnSchedule(this, Env::Priority::HIGH);
+  // Unschedule all tasks for this DB
+  for (uint8_t i = 0; i < static_cast<uint8_t>(TaskType::kCount); i++) {
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::BOTTOM);
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::LOW);
+    env_->UnSchedule(GetTaskTag(i), Env::Priority::HIGH);
+  }
+
  Status ret = Status::OK();

  // Wait for background work to finish
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index c37a46c3e..d1e95c75a 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1539,7 +1539,6 @@ class DBImpl : public DB {
    ManualCompactionState* manual_compaction_state;  // nullptr if non-manual
    // task limiter token is requested during compaction picking.
    std::unique_ptr<TaskLimiterToken> task_token;
-    bool is_canceled = false;
  };

  struct CompactionArg {
@@ -1734,6 +1733,25 @@ class DBImpl : public DB {
    }
  }

+  // TaskType is used to identify tasks in thread-pool, currently only
+  // differentiate manual compaction, which could be unscheduled from the
+  // thread-pool.
+  enum class TaskType : uint8_t {
+    kDefault = 0,
+    kManualCompaction = 1,
+    kCount = 2,
+  };
+
+  // Task tag is used to identity tasks in thread-pool, which is
+  // dbImpl obj address + type
+  inline void* GetTaskTag(TaskType type) {
+    return GetTaskTag(static_cast<uint8_t>(type));
+  }
+
+  inline void* GetTaskTag(uint8_t type) {
+    return static_cast<uint8_t*>(static_cast<void*>(this)) + type;
+  }
+
  // REQUIRES: mutex locked and in write thread.
  void AssignAtomicFlushSeq(const autovector<ColumnFamilyData*>& cfds);

diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index 0e1864788..05520d426 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1763,6 +1763,7 @@ Status DBImpl::RunManualCompaction(
  CompactionArg* ca = nullptr;

  bool scheduled = false;
+  Env::Priority thread_pool_priority = Env::Priority::TOTAL;
  bool manual_conflict = false;
  ManualCompactionState manual;
  manual.cfd = cfd;
@@ -1884,9 +1885,9 @@ Status DBImpl::RunManualCompaction(
      manual.done = true;
      manual.status =
          Status::Incomplete(Status::SubCode::kManualCompactionPaused);
-      if (ca && ca->prepicked_compaction) {
-        ca->prepicked_compaction->is_canceled = true;
-      }
+      assert(thread_pool_priority != Env::Priority::TOTAL);
+      env_->UnSchedule(GetTaskTag(TaskType::kManualCompaction),
+                       thread_pool_priority);
      break;
    }
    if (scheduled && manual.incomplete == true) {
@@ -1916,13 +1917,17 @@ Status DBImpl::RunManualCompaction(
        bg_bottom_compaction_scheduled_++;
        ca->compaction_pri_ = Env::Priority::BOTTOM;
        env_->Schedule(&DBImpl::BGWorkBottomCompaction, ca,
-                       Env::Priority::BOTTOM, this,
+                       Env::Priority::BOTTOM,
+                       GetTaskTag(TaskType::kManualCompaction),
                       &DBImpl::UnscheduleCompactionCallback);
+        thread_pool_priority = Env::Priority::BOTTOM;
      } else {
        bg_compaction_scheduled_++;
        ca->compaction_pri_ = Env::Priority::LOW;
-        env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW, this,
+        env_->Schedule(&DBImpl::BGWorkCompaction, ca, Env::Priority::LOW,
+                       GetTaskTag(TaskType::kManualCompaction),
                       &DBImpl::UnscheduleCompactionCallback);
+        thread_pool_priority = Env::Priority::LOW;
      }
      scheduled = true;
      TEST_SYNC_POINT("DBImpl::RunManualCompaction:Scheduled");
@@ -1933,6 +1938,13 @@ Status DBImpl::RunManualCompaction(
  assert(!manual.in_progress);
  assert(HasPendingManualCompaction());
  RemoveManualCompaction(&manual);
+  // if the manual job is unscheduled, try schedule other jobs in case there's
+  // any unscheduled compaction job which was blocked by exclusive manual
+  // compaction.
+  if (manual.status.IsIncomplete() &&
+      manual.status.subcode() == Status::SubCode::kManualCompactionPaused) {
+    MaybeScheduleFlushOrCompaction();
+  }
  bg_cv_.SignalAll();
  return manual.status;
}

@@ -2661,6 +2673,8 @@ void DBImpl::UnscheduleCompactionCallback(void* arg) {
  delete reinterpret_cast<CompactionArg*>(arg);
  if (ca.prepicked_compaction != nullptr) {
    if (ca.prepicked_compaction->compaction != nullptr) {
+      ca.prepicked_compaction->compaction->ReleaseCompactionFiles(
+          Status::Incomplete(Status::SubCode::kManualCompactionPaused));
      delete ca.prepicked_compaction->compaction;
    }
    delete ca.prepicked_compaction;
@@ -2851,106 +2865,93 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
                                      Env::Priority bg_thread_pri) {
  bool made_progress = false;
+  JobContext job_context(next_job_id_.fetch_add(1), true);
  TEST_SYNC_POINT("BackgroundCallCompaction:0");
  LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL,
                       immutable_db_options_.info_log.get());
  {
    InstrumentedMutexLock l(&mutex_);
-    if (prepicked_compaction && prepicked_compaction->is_canceled) {
-      assert(prepicked_compaction->compaction);
-      ROCKS_LOG_BUFFER(&log_buffer, "[%s] Skip canceled manual compaction job",
-                       prepicked_compaction->compaction->column_family_data()
-                           ->GetName()
-                           .c_str());
-      prepicked_compaction->compaction->ReleaseCompactionFiles(
-          Status::Incomplete(Status::SubCode::kManualCompactionPaused));
-      delete prepicked_compaction->compaction;
-    } else {
-      JobContext job_context(next_job_id_.fetch_add(1), true);
-      // This call will unlock/lock the mutex to wait for current running
-      // IngestExternalFile() calls to finish.
-      WaitForIngestFile();
-
-      num_running_compactions_++;
-
-      std::unique_ptr<std::list<uint64_t>::iterator>
-          pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
-              CaptureCurrentFileNumberInPendingOutputs()));
-
-      assert((bg_thread_pri == Env::Priority::BOTTOM &&
-              bg_bottom_compaction_scheduled_) ||
-             (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_));
-      Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer,
-                                      prepicked_compaction, bg_thread_pri);
-      TEST_SYNC_POINT("BackgroundCallCompaction:1");
-      if (s.IsBusy()) {
-        bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
-        mutex_.Unlock();
-        immutable_db_options_.clock->SleepForMicroseconds(
-            10000);  // prevent hot loop
-        mutex_.Lock();
-      } else if (!s.ok() && !s.IsShutdownInProgress() &&
-                 !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
-        // Wait a little bit before retrying background compaction in
-        // case this is an environmental problem and we do not want to
-        // chew up resources for failed compactions for the duration of
-        // the problem.
-        uint64_t error_cnt =
-            default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
-        bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
-        mutex_.Unlock();
-        log_buffer.FlushBufferToLog();
-        ROCKS_LOG_ERROR(immutable_db_options_.info_log,
-                        "Waiting after background compaction error: %s, "
-                        "Accumulated background error counts: %" PRIu64,
-                        s.ToString().c_str(), error_cnt);
-        LogFlush(immutable_db_options_.info_log);
-        immutable_db_options_.clock->SleepForMicroseconds(1000000);
-        mutex_.Lock();
-      } else if (s.IsManualCompactionPaused()) {
-        assert(prepicked_compaction);
-        ManualCompactionState* m =
-            prepicked_compaction->manual_compaction_state;
-        assert(m);
-        ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
-                         m->cfd->GetName().c_str(), job_context.job_id);
-      }
-
-      ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem);
-
-      // If compaction failed, we want to delete all temporary files that we
-      // might have created (they might not be all recorded in job_context in
-      // case of a failure). Thus, we force full scan in FindObsoleteFiles()
-      FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() &&
-                                          !s.IsManualCompactionPaused() &&
-                                          !s.IsColumnFamilyDropped() &&
-                                          !s.IsBusy());
-      TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles");
-
-      // delete unnecessary files if any, this is done outside the mutex
-      if (job_context.HaveSomethingToClean() ||
-          job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
-        mutex_.Unlock();
-        // Have to flush the info logs before bg_compaction_scheduled_--
-        // because if bg_flush_scheduled_ becomes 0 and the lock is
-        // released, the deconstructor of DB can kick in and destroy all the
-        // states of DB so info_log might not be available after that point.
-        // It also applies to access other states that DB owns.
-        log_buffer.FlushBufferToLog();
-        if (job_context.HaveSomethingToDelete()) {
-          PurgeObsoleteFiles(job_context);
-          TEST_SYNC_POINT(
-              "DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles");
-        }
-        job_context.Clean();
-        mutex_.Lock();
-      }
+    // This call will unlock/lock the mutex to wait for current running
+    // IngestExternalFile() calls to finish.
+    WaitForIngestFile();
+
+    num_running_compactions_++;
+
+    std::unique_ptr<std::list<uint64_t>::iterator>
+        pending_outputs_inserted_elem(new std::list<uint64_t>::iterator(
+            CaptureCurrentFileNumberInPendingOutputs()));
+
+    assert((bg_thread_pri == Env::Priority::BOTTOM &&
+            bg_bottom_compaction_scheduled_) ||
+           (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_));
+    Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer,
+                                    prepicked_compaction, bg_thread_pri);
+    TEST_SYNC_POINT("BackgroundCallCompaction:1");
+    if (s.IsBusy()) {
+      bg_cv_.SignalAll();  // In case a waiter can proceed despite the error
+      mutex_.Unlock();
+      immutable_db_options_.clock->SleepForMicroseconds(
+          10000);  // prevent hot loop
+      mutex_.Lock();
+    } else if (!s.ok() && !s.IsShutdownInProgress() &&
+               !s.IsManualCompactionPaused() && !s.IsColumnFamilyDropped()) {
+      // Wait a little bit before retrying background compaction in
+      // case this is an environmental problem and we do not want to
+      // chew up resources for failed compactions for the duration of
+      // the problem.
+ uint64_t error_cnt = + default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); + bg_cv_.SignalAll(); // In case a waiter can proceed despite the error + mutex_.Unlock(); + log_buffer.FlushBufferToLog(); + ROCKS_LOG_ERROR(immutable_db_options_.info_log, + "Waiting after background compaction error: %s, " + "Accumulated background error counts: %" PRIu64, + s.ToString().c_str(), error_cnt); + LogFlush(immutable_db_options_.info_log); + immutable_db_options_.clock->SleepForMicroseconds(1000000); + mutex_.Lock(); + } else if (s.IsManualCompactionPaused()) { + assert(prepicked_compaction); + ManualCompactionState* m = prepicked_compaction->manual_compaction_state; + assert(m); + ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused", + m->cfd->GetName().c_str(), job_context.job_id); + } + + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); - assert(num_running_compactions_ > 0); - num_running_compactions_--; + // If compaction failed, we want to delete all temporary files that we + // might have created (they might not be all recorded in job_context in + // case of a failure). Thus, we force full scan in FindObsoleteFiles() + FindObsoleteFiles(&job_context, !s.ok() && !s.IsShutdownInProgress() && + !s.IsManualCompactionPaused() && + !s.IsColumnFamilyDropped() && + !s.IsBusy()); + TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:FoundObsoleteFiles"); + + // delete unnecessary files if any, this is done outside the mutex + if (job_context.HaveSomethingToClean() || + job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + mutex_.Unlock(); + // Have to flush the info logs before bg_compaction_scheduled_-- + // because if bg_flush_scheduled_ becomes 0 and the lock is + // released, the deconstructor of DB can kick in and destroy all the + // states of DB so info_log might not be available after that point. + // It also applies to access other states that DB owns. + log_buffer.FlushBufferToLog(); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + TEST_SYNC_POINT("DBImpl::BackgroundCallCompaction:PurgedObsoleteFiles"); + } + job_context.Clean(); + mutex_.Lock(); } + assert(num_running_compactions_ > 0); + num_running_compactions_--; + if (bg_thread_pri == Env::Priority::LOW) { bg_compaction_scheduled_--; } else { From f278e0e2d325d0a4b320b8e647f9e48323ffae0b Mon Sep 17 00:00:00 2001 From: Hui Xiao Date: Fri, 25 Feb 2022 14:44:46 -0800 Subject: [PATCH 05/34] Deflake DBErrorHandlingFSTest.MultiCFWALWriteError (#9496) Summary: **Context:** As part of https://github.com/facebook/rocksdb/pull/6949, file deletion is disabled for faulty database on the IOError of MANIFEST write/sync and [re-enabled again during `DBImpl::Resume()` if all recovery is completed](https://github.com/facebook/rocksdb/commit/e66199d848cd484b816d07359f1dc0f0b99e5351#diff-d9341fbe2a5d4089b93b22c5ed7f666bc311b378c26d0786f4b50c290e460187R396). Before re-enabling file deletion, it `assert(versions_->io_status().ok());`, which IMO assumes `versions_` is **the** `version_` in the recovery process. However, this is not necessarily true due to `s = error_handler_.ClearBGError();` happening before that assertion can unblock some foreground thread by [`EventHelpers::NotifyOnErrorRecoveryEnd()`](https://github.com/facebook/rocksdb/blob/3122cb435875d720fc3d23a48eb7c0fa89d869aa/db/error_handler.cc#L552-L553) as part of the `ClearBGError()`. That foreground thread can do whatever it wants including closing/reopening the db and clean up that same `versions_`. 
As a consequence, `assert(versions_->io_status().ok());` will access `io_status()` of a nullptr and tests like `DBErrorHandlingFSTest.MultiCFWALWriteError` become flaky. The unblocked foreground thread (in this case, the testing thread) proceeds to [reopen the db](https://github.com/facebook/rocksdb/blob/6.29.fb/db/error_handler_fs_test.cc#L1494), where [`versions_` gets reset to nullptr](https://github.com/facebook/rocksdb/blob/6.29.fb/db/db_impl/db_impl.cc#L678) as part of the old db clean-up. If this happens right before `assert(versions_->io_status().ok());` gets executed in the background thread, then we can see errors like

```
db/db_impl/db_impl.cc:420:5: runtime error: member call on null pointer of type 'rocksdb::VersionSet'
assert(versions_->io_status().ok());
```

**Summary:**
- I proposed to call `s = error_handler_.ClearBGError();` after we know it's fine to wake up the foreground, which I think is right before we LOG `ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");`
- As context, the original https://github.com/facebook/rocksdb/pull/3997 introducing `DBImpl::Resume()` calls `s = error_handler_.ClearBGError();` very close to calling `ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");` while the later https://github.com/facebook/rocksdb/pull/6949 distances these two calls a bit.
- And it seems fine to me that `s = error_handler_.ClearBGError();` happens after `EnableFileDeletions(/*force=*/true);` at least syntax-wise since these two functions are orthogonal. And it also seems okay to me that we re-enable file deletion before `s = error_handler_.ClearBGError();`, which basically is resetting some state variables.
- In addition, to preserve the previous behavior of https://github.com/facebook/rocksdb/pull/6949 where the status of re-enabling file deletion is not taken into account in the general status of resuming the db, I separated `enable_file_deletion_s` from the general `s`
- In addition, to make `ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");` clearer, I separated it into its own if-block.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9496

Test Plan:
- Manually reproduce the assertion failure in `DBErrorHandlingFSTest.MultiCFWALWriteError` by injecting a sleep like below so that it's more likely for `assert(versions_->io_status().ok());` to execute after [reopening the db](https://github.com/facebook/rocksdb/blob/6.29.fb/db/error_handler_fs_test.cc#L1494) in the foreground (i.e., testing) thread
```
sleep(1);
assert(versions_->io_status().ok());
```
`python3 gtest-parallel/gtest_parallel.py -r 100 -w 100 rocksdb/error_handler_fs_test --gtest_filter=DBErrorHandlingFSTest.MultiCFWALWriteError`
```
[==========] Running 1 test from 1 test case.
[----------] Global test environment set-up.
[----------] 1 test from DBErrorHandlingFSTest
[ RUN      ] DBErrorHandlingFSTest.MultiCFWALWriteError
Received signal 11 (Segmentation fault)
#0   rocksdb/error_handler_fs_test() [0x5818a4] rocksdb::DBImpl::ResumeImpl(rocksdb::DBRecoverContext) /data/users/huixiao/rocksdb/db/db_impl/db_impl.cc:421
#1   rocksdb/error_handler_fs_test() [0x6379ff] rocksdb::ErrorHandler::RecoverFromBGError(bool) /data/users/huixiao/rocksdb/db/error_handler.cc:600
#2   rocksdb/error_handler_fs_test() [0x7c5362] rocksdb::SstFileManagerImpl::ClearError() /data/users/huixiao/rocksdb/file/sst_file_manager_impl.cc:310
#3   rocksdb/error_handler_fs_test()
```
- The assertion failure does not happen with PR
`python3 gtest-parallel/gtest_parallel.py -r 100 -w 100 rocksdb/error_handler_fs_test --gtest_filter=DBErrorHandlingFSTest.MultiCFWALWriteError`
`[100/100] DBErrorHandlingFSTest.MultiCFWALWriteError (43785 ms)`

Reviewed By: riversand963, anand1976

Differential Revision: D33990099

Pulled By: hx235

fbshipit-source-id: 2e0259a471fa8892ff177da91b3e1c0792dd7bab
---
 HISTORY.md            |  3 +++
 db/db_impl/db_impl.cc | 30 +++++++++++++++++++++---------
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 352267328..122029d52 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -3,6 +3,9 @@
 ### Bug Fixes
 * Fix a race condition when cancel manual compaction with `DisableManualCompaction`. Also DB close can cancel the manual compaction thread.

+### Bug Fixes
+* * Fixed a data race on `versions_` between `DBImpl::ResumeImpl()` and threads waiting for recovery to complete (#9496)
+
 ## 7.0.0 (02/20/2022)
 ### Bug Fixes
 * Fixed a major bug in which batched MultiGet could return old values for keys deleted by DeleteRange when memtable Bloom filter is enabled (memtable_prefix_bloom_size_ratio > 0). (The fix includes a substantial MultiGet performance improvement in the unusual case of both memtable_whole_key_filtering and prefix_extractor.)
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index d57c992ce..27dd5abf0 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -400,14 +400,6 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
    JobContext job_context(0);
    FindObsoleteFiles(&job_context, true);
-    if (s.ok()) {
-      s = error_handler_.ClearBGError();
-    } else {
-      // NOTE: this is needed to pass ASSERT_STATUS_CHECKED
-      // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
-      // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
-      error_handler_.GetRecoveryError().PermitUncheckedError();
-    }
    mutex_.Unlock();

    job_context.manifest_file_number = 1;
@@ -428,11 +420,31 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) {
          immutable_db_options_.info_log,
          "DB resume requested but could not enable file deletions [%s]",
          s.ToString().c_str());
+      assert(false);
    }
  }
-    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
  }
+
+  mutex_.Lock();
+  if (s.ok()) {
+    // This will notify and unblock threads waiting for error recovery to
+    // finish. Those previouly waiting threads can now proceed, which may
+    // include closing the db.
+    s = error_handler_.ClearBGError();
+  } else {
+    // NOTE: this is needed to pass ASSERT_STATUS_CHECKED
+    // in the DBSSTTest.DBWithMaxSpaceAllowedRandomized test.
+    // See https://github.com/facebook/rocksdb/pull/7715#issuecomment-754947952
+    error_handler_.GetRecoveryError().PermitUncheckedError();
+  }
+
+  if (s.ok()) {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Successfully resumed DB");
+  } else {
+    ROCKS_LOG_INFO(immutable_db_options_.info_log, "Failed to resume DB [%s]",
+                   s.ToString().c_str());
+  }
+
  // Check for shutdown again before scheduling further compactions,
  // since we released and re-acquired the lock above
  if (shutdown_initiated_) {

From a6fd3332e52b9872608e664247bb608521bdde3a Mon Sep 17 00:00:00 2001
From: "jingkai.yuan"
Date: Wed, 2 Mar 2022 16:35:21 -0800
Subject: [PATCH 06/34] Fix corruption error when compressing blob data with
 zlib. (#9572)

Summary:
The plain data length may not be big enough if the compression actually expands the data. So use deflateBound() to get an upper limit on the compressed output before calling deflate().

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9572

Reviewed By: riversand963

Differential Revision: D34326475

Pulled By: ajkr

fbshipit-source-id: 4b679cb7a83a62782a127785b4d5eb9aa4646449
---
 util/compression.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/util/compression.h b/util/compression.h
index 8f21807e6..1effb8605 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -744,9 +744,6 @@ inline bool Zlib_Compress(const CompressionInfo& info,
    output_header_len = compression::PutDecompressedSizeInfo(
        output, static_cast<uint32_t>(length));
  }
-  // Resize output to be the plain data length.
-  // This may not be big enough if the compression actually expands data.
-  output->resize(output_header_len + length);

  // The memLevel parameter specifies how much memory should be allocated for
  // the internal compression state.
@@ -780,12 +777,16 @@ inline bool Zlib_Compress(const CompressionInfo& info,
    }
  }

+  // Get an upper bound on the compressed size.
+  size_t upper_bound = deflateBound(&_stream, length);
+  output->resize(output_header_len + upper_bound);
+
  // Compress the input, and put compressed data in output.
  _stream.next_in = (Bytef*)input;
  _stream.avail_in = static_cast<unsigned int>(length);

  // Initialize the output size.
-  _stream.avail_out = static_cast<unsigned int>(length);
+  _stream.avail_out = static_cast<unsigned int>(upper_bound);
  _stream.next_out = reinterpret_cast<Bytef*>(&(*output)[output_header_len]);

  bool compressed = false;

From 596c606739900909f0725f04871f6bb604e656e7 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Wed, 2 Mar 2022 21:03:14 -0800
Subject: [PATCH 07/34] Fix bug causing incorrect data returned by snapshot
 read (#9648)

Summary:
This bug affects use cases that meet the following conditions:
- (has only the default column family or disables WAL) and
- has at least one event listener
- atomic flush is NOT affected.

If the above conditions are met, then RocksDB can release the db mutex before picking all the existing memtables to flush. In the meantime, a snapshot can be created and the db's sequence number can still be incremented. The upcoming flush will ignore this snapshot. A later read using this snapshot can return incorrect results.

To fix this issue, we call the listeners' callbacks after picking the memtables so that we avoid creating snapshots during this interval.
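A minimal standalone model of the race (the `MiniDB` type below is hypothetical, not RocksDB code; in RocksDB the mutex release happens inside the listener notification):

```cpp
// Model of the race: a flush computes the snapshots it must respect
// ("snapshot_seqs") and the updates it covers (the picked memtables).
// If a callback releases the mutex between those two steps, a snapshot
// can appear that the flush never learns about.
#include <cstdint>
#include <iostream>
#include <vector>

struct MiniDB {
  uint64_t seq = 0;                 // latest sequence number
  std::vector<uint64_t> snapshots;  // sequence numbers of live snapshots
  void Put() { ++seq; }
  uint64_t TakeSnapshot() {
    snapshots.push_back(seq);
    return seq;
  }
};

int main() {
  MiniDB db;
  db.Put();  // seq becomes 1

  // Buggy ordering: the snapshot list is computed first ...
  std::vector<uint64_t> snapshot_seqs = db.snapshots;  // empty
  // ... then a listener callback releases the mutex, letting a delete
  // and a snapshot sneak in ...
  db.Put();                           // e.g. Delete("foo") at seq 2
  uint64_t snap = db.TakeSnapshot();  // snapshot at seq 2
  // ... and only then are memtables picked, up to the current seq.
  uint64_t flush_covers_up_to = db.seq;

  // The flush covers seq 2 but snapshot_seqs is empty, so it may drop the
  // seq-2 tombstone that `snap` should still observe. Picking memtables
  // before notifying listeners closes this window.
  std::cout << "covers up to " << flush_covers_up_to << ", known snapshots "
            << snapshot_seqs.size() << ", missed snapshot at " << snap << '\n';
  return 0;
}
```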
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9648

Test Plan: make check

Reviewed By: ajkr

Differential Revision: D34555456

Pulled By: riversand963

fbshipit-source-id: 1438981e9f069a5916686b1a0ad7627f734cf0ee
---
 HISTORY.md                             |  5 +--
 db/db_flush_test.cc                    | 56 ++++++++++++++++++++++++++
 db/db_impl/db_impl_compaction_flush.cc | 36 ++++++++++++++---
 3 files changed, 89 insertions(+), 8 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 122029d52..4b2c8fada 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -2,9 +2,8 @@
 ## Unreleased
 ### Bug Fixes
 * Fix a race condition when cancel manual compaction with `DisableManualCompaction`. Also DB close can cancel the manual compaction thread.
-
-### Bug Fixes
-* * Fixed a data race on `versions_` between `DBImpl::ResumeImpl()` and threads waiting for recovery to complete (#9496)
+* Fixed a data race on `versions_` between `DBImpl::ResumeImpl()` and threads waiting for recovery to complete (#9496)
+* Fixed a bug caused by race among flush, incoming writes and taking snapshots. Queries to snapshots created with these race condition can return incorrect result, e.g. resurfacing deleted data.

 ## 7.0.0 (02/20/2022)
 ### Bug Fixes
diff --git a/db/db_flush_test.cc b/db/db_flush_test.cc
index b4ca7f019..7255aa8cd 100644
--- a/db/db_flush_test.cc
+++ b/db/db_flush_test.cc
@@ -676,6 +676,7 @@ class TestFlushListener : public EventListener {
  ~TestFlushListener() override {
    prev_fc_info_.status.PermitUncheckedError();  // Ignore the status
  }
+
  void OnTableFileCreated(const TableFileCreationInfo& info) override {
    // remember the info for later checking the FlushJobInfo.
    prev_fc_info_ = info;
@@ -1999,6 +2000,61 @@ TEST_P(DBFlushTestBlobError, FlushError) {
}

#ifndef ROCKSDB_LITE
+TEST_F(DBFlushTest, TombstoneVisibleInSnapshot) {
+  class SimpleTestFlushListener : public EventListener {
+   public:
+    explicit SimpleTestFlushListener(DBFlushTest* _test) : test_(_test) {}
+    ~SimpleTestFlushListener() override {}
+
+    void OnFlushBegin(DB* db, const FlushJobInfo& info) override {
+      ASSERT_EQ(static_cast<uint32_t>(0), info.cf_id);
+
+      ASSERT_OK(db->Delete(WriteOptions(), "foo"));
+      snapshot_ = db->GetSnapshot();
+      ASSERT_OK(db->Put(WriteOptions(), "foo", "value"));
+
+      auto* dbimpl = static_cast_with_check<DBImpl>(db);
+      assert(dbimpl);
+
+      ColumnFamilyHandle* cfh = db->DefaultColumnFamily();
+      auto* cfhi = static_cast_with_check<ColumnFamilyHandleImpl>(cfh);
+      assert(cfhi);
+      ASSERT_OK(dbimpl->TEST_SwitchMemtable(cfhi->cfd()));
+    }
+
+    DBFlushTest* test_ = nullptr;
+    const Snapshot* snapshot_ = nullptr;
+  };
+
+  Options options = CurrentOptions();
+  options.create_if_missing = true;
+  auto* listener = new SimpleTestFlushListener(this);
+  options.listeners.emplace_back(listener);
+  DestroyAndReopen(options);
+
+  ASSERT_OK(db_->Put(WriteOptions(), "foo", "value0"));
+
+  ManagedSnapshot snapshot_guard(db_);
+
+  ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily();
+  ASSERT_OK(db_->Flush(FlushOptions(), default_cf));
+
+  const Snapshot* snapshot = listener->snapshot_;
+  assert(snapshot);
+
+  ReadOptions read_opts;
+  read_opts.snapshot = snapshot;
+
+  // Using snapshot should not see "foo".
+ { + std::string value; + Status s = db_->Get(read_opts, "foo", &value); + ASSERT_TRUE(s.IsNotFound()); + } + + db_->ReleaseSnapshot(snapshot); +} + TEST_P(DBAtomicFlushTest, ManualFlushUnder2PC) { Options options = CurrentOptions(); options.create_if_missing = true; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 05520d426..e716bac9b 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -170,6 +170,7 @@ Status DBImpl::FlushMemTableToOutputFile( const bool needs_to_sync_closed_wals = logfile_number_ > 0 && versions_->GetColumnFamilySet()->NumberOfColumnFamilies() > 1; + // If needs_to_sync_closed_wals is true, we need to record the current // maximum memtable ID of this column family so that a later PickMemtables() // call will not pick memtables whose IDs are higher. This is due to the fact @@ -177,9 +178,33 @@ Status DBImpl::FlushMemTableToOutputFile( // happen for this column family in the meantime. The newly created memtables // have their data backed by unsynced WALs, thus they cannot be included in // this flush job. + // Another reason why we must record the current maximum memtable ID of this + // column family: SyncClosedLogs() may release db mutex, thus it's possible + // for application to continue to insert into memtables increasing db's + // sequence number. The application may take a snapshot, but this snapshot is + // not included in `snapshot_seqs` which will be passed to flush job because + // `snapshot_seqs` has already been computed before this function starts. + // Recording the max memtable ID ensures that the flush job does not flush + // a memtable without knowing such snapshot(s). uint64_t max_memtable_id = needs_to_sync_closed_wals ? cfd->imm()->GetLatestMemTableID() : port::kMaxUint64; + + // If needs_to_sync_closed_wals is false, then the flush job will pick ALL + // existing memtables of the column family when PickMemTable() is called + // later. Although we won't call SyncClosedLogs() in this case, we may still + // call the callbacks of the listeners, i.e. NotifyOnFlushBegin() which also + // releases and re-acquires the db mutex. In the meantime, the application + // can still insert into the memtables and increase the db's sequence number. + // The application can take a snapshot, hoping that the latest visible state + // to this snapshto is preserved. This is hard to guarantee since db mutex + // not held. This newly-created snapshot is not included in `snapshot_seqs` + // and the flush job is unaware of its presence. Consequently, the flush job + // may drop certain keys when generating the L0, causing incorrect data to be + // returned for snapshot read using this snapshot. + // To address this, we make sure NotifyOnFlushBegin() executes after memtable + // picking so that no new snapshot can be taken between the two functions. + FlushJob flush_job( dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, file_options_for_compaction_, versions_.get(), &mutex_, &shutting_down_, @@ -192,11 +217,6 @@ Status DBImpl::FlushMemTableToOutputFile( &blob_callback_); FileMetaData file_meta; -#ifndef ROCKSDB_LITE - // may temporarily unlock and lock the mutex. 
- NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id); -#endif // ROCKSDB_LITE - Status s; bool need_cancel = false; IOStatus log_io_s = IOStatus::OK(); @@ -221,6 +241,12 @@ Status DBImpl::FlushMemTableToOutputFile( } TEST_SYNC_POINT_CALLBACK( "DBImpl::FlushMemTableToOutputFile:AfterPickMemtables", &flush_job); + +#ifndef ROCKSDB_LITE + // may temporarily unlock and lock the mutex. + NotifyOnFlushBegin(cfd, &file_meta, mutable_cf_options, job_context->job_id); +#endif // ROCKSDB_LITE + bool switched_to_mempurge = false; // Within flush_job.Run, rocksdb may call event listener to notify // file creation and deletion. From 4ab2fe77f3885dd84d2fec2ed86dd9e6cd5aad78 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Wed, 2 Mar 2022 21:42:52 -0800 Subject: [PATCH 08/34] update HISTORY.md and version.h for 7.0.1 --- HISTORY.md | 2 +- include/rocksdb/version.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 4b2c8fada..d541fde13 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,5 @@ # Rocksdb Change Log -## Unreleased +## 7.0.1 (03/02/2022) ### Bug Fixes * Fix a race condition when cancel manual compaction with `DisableManualCompaction`. Also DB close can cancel the manual compaction thread. * Fixed a data race on `versions_` between `DBImpl::ResumeImpl()` and threads waiting for recovery to complete (#9496) diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 0fb3de0b4..fb0b89f0b 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -11,7 +11,7 @@ #define ROCKSDB_MAJOR 7 #define ROCKSDB_MINOR 0 -#define ROCKSDB_PATCH 0 +#define ROCKSDB_PATCH 1 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these From 3a4d73d337a828a0dcbeeb98174b3814cb50065f Mon Sep 17 00:00:00 2001 From: Jonathan Albrecht Date: Tue, 1 Mar 2022 13:50:41 -0800 Subject: [PATCH 09/34] Reenable s390x platform_dependent travis job (#9631) Summary: Fix g++ -march=native detection and reenable s390x in travis This PR fixes s390x assembler messages: ``` Error: invalid switch -march=z14 Error: unrecognized option -march=z14 ``` The s390x travis build was failing with gcc-7 because the assembler on ubuntu 16.04 is too old to recognize the z14 model so it doesn't work with -march=native on a z14 machine. It fixes the check for the -march=native flag so that the assembler will get called and correctly fail on ubuntu 16.04 which will cause the build to fall back to -march=z196 which works. The other changes are needed so builds work more consistently on s390x: 1. Set make parallelism to 1 for s390x: The default was 4 previously but I saw frequent internal compiler errors on travis probably due to low resources. The `platform_dependent` job works more consistently but is roughly 10 minutes slower although it varies. 2. Remove status_checked jobs, as we are relying on CircleCI for these now and do not really need platform coverage on them. 
Fixes https://github.com/facebook/rocksdb/issues/9524 Pull Request resolved: https://github.com/facebook/rocksdb/pull/9631 Test Plan: CI Reviewed By: ajkr Differential Revision: D34553989 Pulled By: pdillinger fbshipit-source-id: a6e3a7276446721c4c0bebc4ed217c2ca2b53f11 --- .travis.yml | 40 ++++++++++--------------------- build_tools/build_detect_platform | 2 +- 2 files changed, 13 insertions(+), 29 deletions(-) diff --git a/.travis.yml b/.travis.yml index d2dc33142..8c5ce5bb9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,7 +43,6 @@ env: - JOB_NAME=cmake-gcc9 # 3-5 minutes - JOB_NAME=cmake-gcc9-c++20 # 3-5 minutes - JOB_NAME=cmake-mingw # 3 minutes - - JOB_NAME=status_checked matrix: exclude: @@ -58,10 +57,6 @@ matrix: env: JOB_NAME=cmake-mingw - os: linux compiler: clang - - if: type = pull_request AND commit_message !~ /FULL_CI/ - os: linux - arch: s390x # Temporary while working through gcc-7 + assembler issue - env: TEST_GROUP=platform_dependent - if: type = pull_request AND commit_message !~ /FULL_CI/ os: linux arch: arm64 @@ -190,18 +185,6 @@ matrix: os: linux arch: s390x env: JOB_NAME=cmake-gcc9-c++20 - - if: type = pull_request AND commit_message !~ /FULL_CI/ - os : linux - arch: arm64 - env: JOB_NAME=status_checked - - if: type = pull_request AND commit_message !~ /FULL_CI/ - os: linux - arch: ppc64le - env: JOB_NAME=status_checked - - if: type = pull_request AND commit_message !~ /FULL_CI/ - os: linux - arch: s390x - env: JOB_NAME=status_checked install: - CC=gcc-7 && CXX=g++-7 @@ -247,21 +230,25 @@ before_script: script: - date; ${CXX} --version - if [ `command -v ccache` ]; then ccache -C; fi + - export MK_PARALLEL=4; + if [[ "$TRAVIS_CPU_ARCH" == s390x ]]; then + export MK_PARALLEL=1; + fi - case $TEST_GROUP in platform_dependent) - OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=only make -j4 all_but_some_tests check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=only make -j$MK_PARALLEL all_but_some_tests check_some ;; 1) - OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_END=backupable_db_test make -j4 check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_END=backupable_db_test make -j$MK_PARALLEL check_some ;; 2) - OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" LIB_MODE=shared V=1 make -j4 tools && OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_START=backupable_db_test ROCKSDBTESTS_END=db_universal_compaction_test make -j4 check_some + OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" LIB_MODE=shared V=1 make -j$MK_PARALLEL tools && OPT="-DTRAVIS -DROCKSDB_NAMESPACE=alternative_rocksdb_ns" LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_START=backupable_db_test ROCKSDBTESTS_END=db_universal_compaction_test make -j$MK_PARALLEL check_some ;; 3) - OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_START=db_universal_compaction_test ROCKSDBTESTS_END=table_properties_collector_test make -j4 check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_START=db_universal_compaction_test ROCKSDBTESTS_END=table_properties_collector_test make -j$MK_PARALLEL check_some ;; 4) - OPT=-DTRAVIS LIB_MODE=shared V=1 ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_START=table_properties_collector_test make -j4 check_some + OPT=-DTRAVIS LIB_MODE=shared V=1 
ROCKSDBTESTS_PLATFORM_DEPENDENT=exclude ROCKSDBTESTS_START=table_properties_collector_test make -j$MK_PARALLEL check_some ;; esac - case $JOB_NAME in @@ -269,10 +256,10 @@ script: OPT=-DTRAVIS LIB_MODE=shared V=1 make rocksdbjava jtest ;; lite_build) - OPT='-DTRAVIS -DROCKSDB_LITE' LIB_MODE=shared V=1 make -j4 all + OPT='-DTRAVIS -DROCKSDB_LITE' LIB_MODE=shared V=1 make -j$MK_PARALLEL all ;; examples) - OPT=-DTRAVIS LIB_MODE=shared V=1 make -j4 static_lib && cd examples && make -j4 + OPT=-DTRAVIS LIB_MODE=shared V=1 make -j$MK_PARALLEL static_lib && cd examples && make -j$MK_PARALLEL ;; cmake-mingw) sudo update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix; @@ -285,10 +272,7 @@ script: ;; esac - mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DWITH_TESTS=0 -DWITH_GFLAGS=0 -DWITH_BENCHMARK_TOOLS=0 -DWITH_TOOLS=0 -DWITH_CORE_TOOLS=1 .. && make -j4 && cd .. && rm -rf build && mkdir build && cd build && cmake -DJNI=1 .. -DCMAKE_BUILD_TYPE=Release $OPT && make -j4 rocksdb rocksdbjni - ;; - status_checked) - OPT=-DTRAVIS LIB_MODE=shared V=1 ASSERT_STATUS_CHECKED=1 make -j4 check_some + mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=Release -DWITH_TESTS=0 -DWITH_GFLAGS=0 -DWITH_BENCHMARK_TOOLS=0 -DWITH_TOOLS=0 -DWITH_CORE_TOOLS=1 .. && make -j$MK_PARALLEL && cd .. && rm -rf build && mkdir build && cd build && cmake -DJNI=1 .. -DCMAKE_BUILD_TYPE=Release $OPT && make -j$MK_PARALLEL rocksdb rocksdbjni ;; esac notifications: diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 1d89cba79..dd9890dbb 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -632,7 +632,7 @@ if test "0$PORTABLE" -eq 0; then COMMON_FLAGS="$COMMON_FLAGS" elif test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ \ - -fsyntax-only -march=native - -o /dev/null 2>/dev/null; then + -march=native - -o /dev/null 2>/dev/null; then COMMON_FLAGS="$COMMON_FLAGS -march=native " else COMMON_FLAGS="$COMMON_FLAGS -march=z196 " From fe9e363ba1774f1627a933b5115b3c05840712c1 Mon Sep 17 00:00:00 2001 From: Adam Retter Date: Mon, 7 Mar 2022 10:50:52 -0800 Subject: [PATCH 10/34] Fix RocksJava releases for macOS (#9662) Summary: Addresses the problems described in https://github.com/facebook/rocksdb/pull/9254#issuecomment-1054598516 and https://github.com/facebook/rocksdb/pull/9254#issuecomment-1059574837 that have blocked a RocksJava release **NOTE** Also needs to be ported to 6.29.fb branch. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9662 Reviewed By: ajkr Differential Revision: D34689200 Pulled By: pdillinger fbshipit-source-id: c62fe34c54f05be5a00ee1daec8ec7454baa5eb8 --- Makefile | 37 ++++-- .../java/org/rocksdb/NativeLibraryLoader.java | 112 ++++++++++++------ .../java/org/rocksdb/util/Environment.java | 25 +++- .../org/rocksdb/util/EnvironmentTest.java | 29 +++-- 4 files changed, 146 insertions(+), 57 deletions(-) diff --git a/Makefile b/Makefile index 7aee1b5b8..e57a36857 100644 --- a/Makefile +++ b/Makefile @@ -329,8 +329,8 @@ ifneq ($(MACHINE), arm64) # linking with jemalloc (as it won't be arm64-compatible) and remove some other options # set during platform detection DISABLE_JEMALLOC=1 -PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42, $(PLATFORM_CFLAGS)) -PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42, $(PLATFORM_CXXFLAGS)) +PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CFLAGS)) +PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS)) endif endif endif @@ -2116,7 +2116,9 @@ CURL_SSL_OPTS ?= --tlsv1 ifeq ($(PLATFORM), OS_MACOSX) ifeq (,$(findstring librocksdbjni-osx,$(ROCKSDBJNILIB))) ifeq ($(MACHINE),arm64) - ROCKSDBJNILIB = librocksdbjni-osx-aarch64.jnilib + ROCKSDBJNILIB = librocksdbjni-osx-arm64.jnilib +else ifeq ($(MACHINE),x86_64) + ROCKSDBJNILIB = librocksdbjni-osx-x86_64.jnilib else ROCKSDBJNILIB = librocksdbjni-osx.jnilib endif @@ -2247,15 +2249,20 @@ endif $(MAKE) rocksdbjavastatic_deps $(MAKE) rocksdbjavastatic_libobjects $(MAKE) rocksdbjavastatic_javalib - $(MAKE) rocksdbjavastatic_jar + $(MAKE) rocksdbjava_jar rocksdbjavastaticosx: rocksdbjavastaticosx_archs - mv java/target/librocksdbjni-osx-x86_64.jnilib java/target/librocksdbjni-osx.jnilib - mv java/target/librocksdbjni-osx-arm64.jnilib java/target/librocksdbjni-osx-aarch64.jnilib + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 rocksdbjavastaticosx_ub: rocksdbjavastaticosx_archs - lipo -create -output ./java/target/$(ROCKSDBJNILIB) java/target/librocksdbjni-osx-x86_64.jnilib java/target/librocksdbjni-osx-arm64.jnilib - $(MAKE) rocksdbjavastatic_jar + cd java/target; lipo -create -output librocksdbjni-osx.jnilib librocksdbjni-osx-x86_64.jnilib librocksdbjni-osx-arm64.jnilib + cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) librocksdbjni-osx.jnilib + cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 rocksdbjavastaticosx_archs: $(MAKE) rocksdbjavastaticosx_arch_x86_64 @@ -2289,28 +2296,32 @@ rocksdbjavastatic_javalib: strip $(STRIPFLAGS) $(ROCKSDBJNILIB); \ fi -rocksdbjavastatic_jar: +rocksdbjava_jar: cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class - cd java/target/apidocs; $(JAR_CMD) -cf ../$(ROCKSDB_JAVADOCS_JAR) * - cd java/src/main/java; $(JAR_CMD) -cf 
../../../target/$(ROCKSDB_SOURCES_JAR) org openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1 + +rocksdbjava_javadocs_jar: + cd java/target/apidocs; $(JAR_CMD) -cf ../$(ROCKSDB_JAVADOCS_JAR) * openssl sha1 java/target/$(ROCKSDB_JAVADOCS_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAVADOCS_JAR).sha1 + +rocksdbjava_sources_jar: + cd java/src/main/java; $(JAR_CMD) -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org openssl sha1 java/target/$(ROCKSDB_SOURCES_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_SOURCES_JAR).sha1 rocksdbjavastatic_deps: $(JAVA_COMPRESSIONS) rocksdbjavastatic_libobjects: $(LIB_OBJECTS) -rocksdbjavastaticrelease: rocksdbjavastaticosx +rocksdbjavastaticrelease: rocksdbjavastaticosx rocksdbjava_javadocs_jar rocksdbjava_sources_jar cd java/crossbuild && (vagrant destroy -f || true) && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 && vagrant up linux64-musl && vagrant halt linux64-musl cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class openssl sha1 java/target/$(ROCKSDB_JAR_ALL) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR_ALL).sha1 -rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl +rocksdbjavastaticreleasedocker: rocksdbjavastaticosx rocksdbjavastaticdockerx86 rocksdbjavastaticdockerx86_64 rocksdbjavastaticdockerx86musl rocksdbjavastaticdockerx86_64musl rocksdbjava_javadocs_jar rocksdbjava_sources_jar cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class diff --git a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java index e47eacb22..b97cf28b9 100644 --- a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java @@ -18,7 +18,11 @@ public class NativeLibraryLoader { private static final String sharedLibraryName = Environment.getSharedLibraryName("rocksdb"); private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb"); + private static final /* @Nullable */ String fallbackJniLibraryName = + Environment.getFallbackJniLibraryName("rocksdb"); private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb"); + private static final /* @Nullable */ String fallbackJniLibraryFileName = + Environment.getFallbackJniLibraryFileName("rocksdb"); private static final String tempFilePrefix = "librocksdbjni"; private static final String tempFileSuffix = Environment.getJniLibraryExtension(); @@ -49,14 +53,33 @@ public static NativeLibraryLoader getInstance() { */ public synchronized void loadLibrary(final String tmpDir) throws IOException { try { - System.loadLibrary(sharedLibraryName); - } catch(final UnsatisfiedLinkError ule1) { + // try dynamic library + System.loadLibrary(sharedLibraryName); + return; + } catch (final UnsatisfiedLinkError ule) { + // ignore - try from static library + } + + try { + // try static library + 
System.loadLibrary(jniLibraryName); + return; + } catch (final UnsatisfiedLinkError ule) { + // ignore - then try static library fallback or from jar + } + + if (fallbackJniLibraryName != null) { try { - System.loadLibrary(jniLibraryName); - } catch(final UnsatisfiedLinkError ule2) { - loadLibraryFromJar(tmpDir); + // try static library fallback + System.loadLibrary(fallbackJniLibraryName); + return; + } catch (final UnsatisfiedLinkError ule) { + // ignore - then try from jar } } + + // try jar + loadLibraryFromJar(tmpDir); } /** @@ -83,43 +106,62 @@ void loadLibraryFromJar(final String tmpDir) File loadLibraryFromJarToTemp(final String tmpDir) throws IOException { - final File temp; - if (tmpDir == null || tmpDir.isEmpty()) { - temp = File.createTempFile(tempFilePrefix, tempFileSuffix); - } else { - final File parentDir = new File(tmpDir); - if (!parentDir.exists()) { - throw new RuntimeException( - "Directory: " + parentDir.getAbsolutePath() + " does not exist!"); + InputStream is = null; + try { + // attempt to look up the static library in the jar file + String libraryFileName = jniLibraryFileName; + is = getClass().getClassLoader().getResourceAsStream(libraryFileName); + + if (is == null) { + // is there a fallback we can try + if (fallbackJniLibraryFileName == null) { + throw new RuntimeException(libraryFileName + " was not found inside JAR."); + } + + // attempt to look up the fallback static library in the jar file + libraryFileName = fallbackJniLibraryFileName; + is = getClass().getClassLoader().getResourceAsStream(libraryFileName); + if (is == null) { + throw new RuntimeException(libraryFileName + " was not found inside JAR."); + } } - temp = new File(parentDir, jniLibraryFileName); - if (temp.exists() && !temp.delete()) { - throw new RuntimeException("File: " + temp.getAbsolutePath() - + " already exists and cannot be removed."); + + // create a temporary file to copy the library to + final File temp; + if (tmpDir == null || tmpDir.isEmpty()) { + temp = File.createTempFile(tempFilePrefix, tempFileSuffix); + } else { + final File parentDir = new File(tmpDir); + if (!parentDir.exists()) { + throw new RuntimeException( + "Directory: " + parentDir.getAbsolutePath() + " does not exist!"); + } + temp = new File(parentDir, libraryFileName); + if (temp.exists() && !temp.delete()) { + throw new RuntimeException( + "File: " + temp.getAbsolutePath() + " already exists and cannot be removed."); + } + if (!temp.createNewFile()) { + throw new RuntimeException("File: " + temp.getAbsolutePath() + " could not be created."); + } } - if (!temp.createNewFile()) { - throw new RuntimeException("File: " + temp.getAbsolutePath() - + " could not be created."); + if (!temp.exists()) { + throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); + } else { + temp.deleteOnExit(); } - } - if (!temp.exists()) { - throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); - } else { - temp.deleteOnExit(); - } + // copy the library from the Jar file to the temp destination + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); - // attempt to copy the library from the Jar file to the temp destination - try (final InputStream is = getClass().getClassLoader(). 
- getResourceAsStream(jniLibraryFileName)) { - if (is == null) { - throw new RuntimeException(jniLibraryFileName + " was not found inside JAR."); - } else { - Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + // return the temporary library file + return temp; + + } finally { + if (is != null) { + is.close(); } } - - return temp; } /** diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java index 46357a38f..5da471f18 100644 --- a/java/src/main/java/org/rocksdb/util/Environment.java +++ b/java/src/main/java/org/rocksdb/util/Environment.java @@ -110,8 +110,14 @@ public static String getJniLibraryName(final String name) { return String.format("%sjni-linux%s%s", name, arch, getLibcPostfix()); } } else if (isMac()) { - if (isAarch64()) { - return String.format("%sjni-osx-%s", name, ARCH); + if (is64Bit()) { + final String arch; + if (isAarch64()) { + arch = "arm64"; + } else { + arch = "x86_64"; + } + return String.format("%sjni-osx-%s", name, arch); } else { return String.format("%sjni-osx", name); } @@ -131,10 +137,25 @@ public static String getJniLibraryName(final String name) { throw new UnsupportedOperationException(String.format("Cannot determine JNI library name for ARCH='%s' OS='%s' name='%s'", ARCH, OS, name)); } + public static /*@Nullable*/ String getFallbackJniLibraryName(final String name) { + if (isMac() && is64Bit()) { + return String.format("%sjni-osx", name); + } + return null; + } + public static String getJniLibraryFileName(final String name) { return appendLibOsSuffix("lib" + getJniLibraryName(name), false); } + public static /*@Nullable*/ String getFallbackJniLibraryFileName(final String name) { + final String fallbackJniLibraryName = getFallbackJniLibraryName(name); + if (fallbackJniLibraryName == null) { + return null; + } + return appendLibOsSuffix("lib" + fallbackJniLibraryName, false); + } + private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) { if (isUnix() || isAix() || isSolaris() || isFreeBSD() || isOpenBSD()) { return libraryFileName + ".so"; diff --git a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java index 43cd38a1c..301dec22f 100644 --- a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java +++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java @@ -9,7 +9,6 @@ import org.junit.Test; import java.lang.reflect.Field; -import java.lang.reflect.Modifier; import static org.assertj.core.api.Assertions.assertThat; @@ -37,18 +36,21 @@ public void mac32() { isEqualTo(".jnilib"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.dylib"); } @Test - public void mac64() { - setEnvironmentClassFields("mac", "x86-64"); + public void mac64_x86_64() { + setEnvironmentClassFields("mac", "x86_64"); assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".jnilib"); - assertThat(Environment.getJniLibraryFileName("rocksdb")). 
- isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-osx-x86_64.jnilib"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-osx.jnilib"); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.dylib"); } @@ -59,7 +61,9 @@ public void macAarch64() { assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()).isEqualTo(".jnilib"); assertThat(Environment.getJniLibraryFileName("rocksdb")) - .isEqualTo("librocksdbjni-osx-aarch64.jnilib"); + .isEqualTo("librocksdbjni-osx-arm64.jnilib"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")) + .isEqualTo("librocksdbjni-osx.jnilib"); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.dylib"); } @@ -73,6 +77,7 @@ public void nix32() { isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux32.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) @@ -103,7 +108,8 @@ public void aix32() { assertThat(Environment.isWindows()).isFalse(); assertThat(Environment.getJniLibraryExtension()). isEqualTo(".so"); - Environment.getJniLibraryFileName("rocksdb"); + assertThat(Environment.getJniLibraryFileName("rocksdb")).isEqualTo("blah"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); } @Test @@ -115,6 +121,7 @@ public void nix64() { isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) @@ -124,6 +131,7 @@ public void nix64() { isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64-musl.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // UNIX @@ -134,6 +142,7 @@ public void nix64() { isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); // AIX @@ -143,6 +152,7 @@ public void nix64() { isEqualTo(".so"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-aix64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). isEqualTo("librocksdbjni.so"); } @@ -161,6 +171,7 @@ public void win64() { isEqualTo(".dll"); assertThat(Environment.getJniLibraryFileName("rocksdb")). isEqualTo("librocksdbjni-win64.dll"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")). 
isEqualTo("librocksdbjni.dll"); } @@ -177,6 +188,7 @@ public void ppc64le() { assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-ppc64le.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); @@ -189,6 +201,7 @@ public void ppc64le() { assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-ppc64le-musl"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-ppc64le-musl.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); } @@ -205,6 +218,7 @@ public void linuxArch64() { assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-aarch64.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); // Linux musl-libc (Alpine) setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, true); @@ -217,6 +231,7 @@ public void linuxArch64() { assertThat(Environment.getJniLibraryName("rocksdb")).isEqualTo("rocksdbjni-linux-aarch64-musl"); assertThat(Environment.getJniLibraryFileName("rocksdb")) .isEqualTo("librocksdbjni-linux-aarch64-musl.so"); + assertThat(Environment.getFallbackJniLibraryFileName("rocksdb")).isNull(); assertThat(Environment.getSharedLibraryFileName("rocksdb")).isEqualTo("librocksdbjni.so"); setEnvironmentClassField(MUSL_LIBC_FIELD_NAME, false); } From bbae679ce7bd53d267ac8310f60fa86491716666 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Wed, 9 Mar 2022 21:07:31 -0800 Subject: [PATCH 11/34] Avoid popcnt on Windows when unavailable and in portable builds (#9680) Summary: Fixes https://github.com/facebook/rocksdb/issues/9560. Only use popcnt intrinsic when HAVE_SSE42 is set. Also avoid setting it based on compiler test in portable builds because such test will pass on MSVC even without proper arch flags (ref: https://devblogs.microsoft.com/oldnewthing/20201026-00/?p=104397). Pull Request resolved: https://github.com/facebook/rocksdb/pull/9680 Test Plan: verified the combinations of -DPORTABLE and -DFORCE_SSE42 produce expected compiler flags on Linux. Verified MSVC build using PORTABLE=1 (in CircleCI) does not set HAVE_SSE42. 
Reviewed By: pdillinger

Differential Revision: D34739033

Pulled By: ajkr

fbshipit-source-id: d10456f3392945fc3e59430a1777840f7b60b276
---
 CMakeLists.txt | 14 ++++++++------
 util/math.h    | 22 +++++++++++++++-------
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ac7b5628..774c7a188 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -321,7 +321,8 @@ if(NOT MSVC)
   set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul")
 endif()
 
-CHECK_CXX_SOURCE_COMPILES("
+if (NOT PORTABLE OR FORCE_SSE42)
+  CHECK_CXX_SOURCE_COMPILES("
 #include <cstdint>
 #include <nmmintrin.h>
 #include <wmmintrin.h>
@@ -333,11 +334,12 @@ int main() {
   auto d = _mm_cvtsi128_si64(c);
 }
 " HAVE_SSE42)
-if(HAVE_SSE42)
-  add_definitions(-DHAVE_SSE42)
-  add_definitions(-DHAVE_PCLMUL)
-elseif(FORCE_SSE42)
-  message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled")
+  if(HAVE_SSE42)
+    add_definitions(-DHAVE_SSE42)
+    add_definitions(-DHAVE_PCLMUL)
+  elseif(FORCE_SSE42)
+    message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled")
+  endif()
 endif()
 
 # Check if -latomic is required or not
diff --git a/util/math.h b/util/math.h
index c6dc2a29f..2254b4aab 100644
--- a/util/math.h
+++ b/util/math.h
@@ -92,18 +92,25 @@ inline int CountTrailingZeroBits(T v) {
 #endif
 }
 
-#if defined(_MSC_VER) && !defined(_M_X64)
+// Not all MSVC compile settings will use `BitsSetToOneFallback()`. We include
+// the following code at coarse granularity for simpler macros. It's important
+// to exclude at least so our non-MSVC unit test coverage tool doesn't see it.
+#ifdef _MSC_VER
+
 namespace detail {
+
 template <typename T>
 int BitsSetToOneFallback(T v) {
   const int kBits = static_cast<int>(sizeof(T)) * 8;
   static_assert((kBits & (kBits - 1)) == 0, "must be power of two bits");
   // we static_cast these bit patterns in order to truncate them to the correct
-  // size
+  // size. Warning C4309 dislikes this technique, so disable it here.
+#pragma warning(disable : 4309)
   v = static_cast<T>(v - ((v >> 1) & static_cast<T>(0x5555555555555555ull)));
   v = static_cast<T>((v & static_cast<T>(0x3333333333333333ull)) +
                      ((v >> 2) & static_cast<T>(0x3333333333333333ull)));
   v = static_cast<T>((v + (v >> 4)) & static_cast<T>(0x0F0F0F0F0F0F0F0Full));
+#pragma warning(default : 4309)
   for (int shift_bits = 8; shift_bits < kBits; shift_bits <<= 1) {
     v += static_cast<T>(v >> shift_bits);
   }
@@ -113,7 +120,8 @@ int BitsSetToOneFallback(T v) {
 }
 
 }  // namespace detail
-#endif
+
+#endif  // _MSC_VER
 
 // Number of bits set to 1. Also known as "population count".
 template <typename T>
@@ -126,21 +134,21 @@ inline int BitsSetToOne(T v) {
     constexpr auto mm = 8 * sizeof(uint32_t) - 1;
     // The bit mask is to neutralize sign extension on small signed types
     constexpr uint32_t m = (uint32_t{1} << ((8 * sizeof(T)) & mm)) - 1;
-#if defined(_M_X64) || defined(_M_IX86)
+#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86))
     return static_cast<int>(__popcnt(static_cast<uint32_t>(v) & m));
 #else
     return static_cast<int>(detail::BitsSetToOneFallback(v) & m);
 #endif
   } else if (sizeof(T) == sizeof(uint32_t)) {
-#if defined(_M_X64) || defined(_M_IX86)
+#if defined(HAVE_SSE42) && (defined(_M_X64) || defined(_M_IX86))
     return static_cast<int>(__popcnt(static_cast<uint32_t>(v)));
 #else
     return detail::BitsSetToOneFallback(static_cast<uint32_t>(v));
 #endif
   } else {
-#ifdef _M_X64
+#if defined(HAVE_SSE42) && defined(_M_X64)
     return static_cast<int>(__popcnt64(static_cast<uint64_t>(v)));
-#elif defined(_M_IX86)
+#elif defined(HAVE_SSE42) && defined(_M_IX86)
     return static_cast<int>(
         __popcnt(static_cast<uint32_t>(static_cast<uint64_t>(v) >> 32) +
                  __popcnt(static_cast<uint32_t>(v))));

From 22e011fe0a551545e901fdfdece6bec1091d45ed Mon Sep 17 00:00:00 2001
From: Jay Zhuang
Date: Sat, 12 Mar 2022 11:45:56 -0800
Subject: [PATCH 12/34] Fix a timer crash caused by invalid memory management (#9656)

Summary:
The timer could crash when multiple DB instances were doing heavy DB open and close operations concurrently. This was caused by adding a timer task with a smaller timestamp than the currently running task. Fix it by moving the computation of the new task's timestamp inside the timer mutex protection.

And other fixes:
- Disallow adding duplicated function names to the timer
- Fix a minor memory leak in the timer when a running task is cancelled

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9656

Reviewed By: ajkr

Differential Revision: D34626296

Pulled By: jay-zhuang

fbshipit-source-id: 6b6d96a5149746bf503546244912a9e41a0c5f6b
---
 db/column_family.cc                          |  1 +
 db/compaction/compaction_picker_universal.cc |  1 +
 db/db_impl/compacted_db_impl.cc              |  4 +-
 db/db_impl/db_impl.cc                        |  9 +--
 db/db_impl/db_impl.h                         |  2 +-
 db/db_impl/db_impl_compaction_flush.cc       |  2 +
 db/db_impl/db_impl_open.cc                   |  5 +-
 db/periodic_work_scheduler.cc                | 38 +++++++----
 db/periodic_work_scheduler.h                 |  4 +-
 util/timer.h                                 | 71 +++++++++++---------
 util/timer_test.cc                           | 21 +++---
 11 files changed, 92 insertions(+), 66 deletions(-)

diff --git a/db/column_family.cc b/db/column_family.cc
index a2885515c..1d1c0c0db 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -989,6 +989,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
                  GetL0ThresholdSpeedupCompaction(
                      mutable_cf_options.level0_file_num_compaction_trigger,
                      mutable_cf_options.level0_slowdown_writes_trigger)) {
+      fprintf(stdout, "JJJ2\n");
       write_controller_token_ =
           write_controller->GetCompactionPressureToken();
       ROCKS_LOG_INFO(
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc
index 7ec7d3b93..4a65433b5 100644
--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@@ -372,6 +372,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
   const int kLevel0 = 0;
   score_ = vstorage_->CompactionScore(kLevel0);
   sorted_runs_ = CalculateSortedRuns(*vstorage_);
+  fprintf(stdout, "JJJ1\n");
 
   if (sorted_runs_.size() == 0 ||
       (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
diff --git a/db/db_impl/compacted_db_impl.cc b/db/db_impl/compacted_db_impl.cc
index b65455437..e1c061c27 100644
--- a/db/db_impl/compacted_db_impl.cc
+++ b/db/db_impl/compacted_db_impl.cc
@@ -171,7 +171,9 @@ Status CompactedDBImpl::Open(const Options& options,
   std::unique_ptr<CompactedDBImpl> db(new CompactedDBImpl(db_options, dbname));
   Status s = db->Init(options);
   if (s.ok()) {
-    db->StartPeriodicWorkScheduler();
+    s = db->StartPeriodicWorkScheduler();
+  }
+  if (s.ok()) {
     ROCKS_LOG_INFO(db->immutable_db_options_.info_log,
                    "Opened the db as fully compacted mode");
     LogFlush(db->immutable_db_options_.info_log);
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 27dd5abf0..fb16122dd 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -770,7 +770,7 @@ void DBImpl::PrintStatistics() {
   }
 }
 
-void DBImpl::StartPeriodicWorkScheduler() {
+Status DBImpl::StartPeriodicWorkScheduler() {
 #ifndef ROCKSDB_LITE
 
 #ifndef NDEBUG
@@ -780,7 +780,7 @@ void DBImpl::StartPeriodicWorkScheduler() {
       "DBImpl::StartPeriodicWorkScheduler:DisableScheduler",
       &disable_scheduler);
   if (disable_scheduler) {
-    return;
+    return Status::OK();
   }
 #endif  // !NDEBUG
 
@@ -791,10 +791,11 @@ void DBImpl::StartPeriodicWorkScheduler() {
         &periodic_work_scheduler_);
   }
 
-  periodic_work_scheduler_->Register(
+  return periodic_work_scheduler_->Register(
       this, mutable_db_options_.stats_dump_period_sec,
       mutable_db_options_.stats_persist_period_sec);
 #endif  // !ROCKSDB_LITE
+  return Status::OK();
 }
 
 // esitmate the total size of stats_history_
@@ -1228,7 +1229,7 @@ Status DBImpl::SetDBOptions(
         mutable_db_options_.stats_persist_period_sec) {
       mutex_.Unlock();
       periodic_work_scheduler_->Unregister(this);
-      periodic_work_scheduler_->Register(
+      s = periodic_work_scheduler_->Register(
           this, new_options.stats_dump_period_sec,
           new_options.stats_persist_period_sec);
       mutex_.Lock();
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index d1e95c75a..4dfa5ebbe 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1866,7 +1866,7 @@ class DBImpl : public DB {
                        LogBuffer* log_buffer);
 
   // Schedule background tasks
-  void StartPeriodicWorkScheduler();
+  Status StartPeriodicWorkScheduler();
 
   void PrintStatistics();
 
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index e716bac9b..f8d018ceb 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -285,6 +285,7 @@ Status DBImpl::FlushMemTableToOutputFile(
     assert(storage_info);
     VersionStorageInfo::LevelSummaryStorage tmp;
+    fprintf(stdout, "JJJ4\n");
     ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
                      column_family_name.c_str(),
                      storage_info->LevelSummary(&tmp));
@@ -729,6 +730,7 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
     assert(storage_info);
     VersionStorageInfo::LevelSummaryStorage tmp;
+    fprintf(stdout, "JJJ3\n");
     ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
                      column_family_name.c_str(),
                      storage_info->LevelSummary(&tmp));
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index ee79eb9f0..840dc2bec 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -1889,8 +1889,9 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
                    persist_options_status.ToString().c_str());
   }
   if (s.ok()) {
-    impl->StartPeriodicWorkScheduler();
-  } else {
+    s = impl->StartPeriodicWorkScheduler();
+  }
+  if (!s.ok()) {
     for (auto* h : *handles) {
       delete h;
     }
diff --git a/db/periodic_work_scheduler.cc b/db/periodic_work_scheduler.cc
index 677eec90c..904b847f1 100644
--- a/db/periodic_work_scheduler.cc
+++ b/db/periodic_work_scheduler.cc
@@ -16,31 +16,41 @@ PeriodicWorkScheduler::PeriodicWorkScheduler(
   timer = std::unique_ptr<Timer>(new Timer(clock.get()));
 }
 
-void PeriodicWorkScheduler::Register(DBImpl* dbi,
-                                     unsigned int stats_dump_period_sec,
-                                     unsigned int stats_persist_period_sec) {
+Status PeriodicWorkScheduler::Register(DBImpl* dbi,
+                                       unsigned int stats_dump_period_sec,
+                                       unsigned int stats_persist_period_sec) {
   MutexLock l(&timer_mu_);
   static std::atomic<uint64_t> initial_delay(0);
   timer->Start();
   if (stats_dump_period_sec > 0) {
-    timer->Add([dbi]() { dbi->DumpStats(); }, GetTaskName(dbi, "dump_st"),
-               initial_delay.fetch_add(1) %
-                   static_cast<uint64_t>(stats_dump_period_sec) *
-                   kMicrosInSecond,
-               static_cast<uint64_t>(stats_dump_period_sec) * kMicrosInSecond);
+    bool succeeded = timer->Add(
+        [dbi]() { dbi->DumpStats(); }, GetTaskName(dbi, "dump_st"),
+        initial_delay.fetch_add(1) %
+            static_cast<uint64_t>(stats_dump_period_sec) * kMicrosInSecond,
+        static_cast<uint64_t>(stats_dump_period_sec) * kMicrosInSecond);
+    if (!succeeded) {
+      return Status::Aborted("Unable to add periodic task DumpStats");
+    }
   }
   if (stats_persist_period_sec > 0) {
-    timer->Add(
+    bool succeeded = timer->Add(
         [dbi]() { dbi->PersistStats(); }, GetTaskName(dbi, "pst_st"),
         initial_delay.fetch_add(1) %
            static_cast<uint64_t>(stats_persist_period_sec) * kMicrosInSecond,
        static_cast<uint64_t>(stats_persist_period_sec) * kMicrosInSecond);
+    if (!succeeded) {
+      return Status::Aborted("Unable to add periodic task PersistStats");
+    }
+  }
+  bool succeeded = timer->Add(
+      [dbi]() { dbi->FlushInfoLog(); }, GetTaskName(dbi, "flush_info_log"),
+      initial_delay.fetch_add(1) % kDefaultFlushInfoLogPeriodSec *
+          kMicrosInSecond,
+      kDefaultFlushInfoLogPeriodSec * kMicrosInSecond);
+  if (!succeeded) {
+    return Status::Aborted("Unable to add periodic task PersistStats");
   }
-  timer->Add([dbi]() { dbi->FlushInfoLog(); },
-             GetTaskName(dbi, "flush_info_log"),
-             initial_delay.fetch_add(1) % kDefaultFlushInfoLogPeriodSec *
-                 kMicrosInSecond,
-             kDefaultFlushInfoLogPeriodSec * kMicrosInSecond);
+  return Status::OK();
 }
 
 void PeriodicWorkScheduler::Unregister(DBImpl* dbi) {
diff --git a/db/periodic_work_scheduler.h b/db/periodic_work_scheduler.h
index fe89ff567..e494c7eee 100644
--- a/db/periodic_work_scheduler.h
+++ b/db/periodic_work_scheduler.h
@@ -30,8 +30,8 @@ class PeriodicWorkScheduler {
   PeriodicWorkScheduler& operator=(const PeriodicWorkScheduler&) = delete;
   PeriodicWorkScheduler& operator=(PeriodicWorkScheduler&&) = delete;
 
-  void Register(DBImpl* dbi, unsigned int stats_dump_period_sec,
-                unsigned int stats_persist_period_sec);
+  Status Register(DBImpl* dbi, unsigned int stats_dump_period_sec,
+                  unsigned int stats_persist_period_sec);
 
   void Unregister(DBImpl* dbi);
 
diff --git a/util/timer.h b/util/timer.h
index 736d0bf0a..6571dc7de 100644
--- a/util/timer.h
+++ b/util/timer.h
@@ -48,36 +48,38 @@ class Timer {
   ~Timer() { Shutdown(); }
 
   // Add a new function to run.
-  // fn_name has to be identical, otherwise, the new one overrides the existing
-  // one, regardless if the function is pending removed (invalid) or not.
+  // fn_name has to be identical, otherwise it will fail to add and return false
  // start_after_us is the initial delay.
  // repeat_every_us is the interval between ending time of the last call and
  // starting time of the next call. For example, repeat_every_us = 2000 and
  // the function takes 1000us to run. If it starts at time [now]us, then it
  // finishes at [now]+1000us, 2nd run starting time will be at [now]+3000us.
  // repeat_every_us == 0 means do not repeat.
-  void Add(std::function<void()> fn,
-           const std::string& fn_name,
-           uint64_t start_after_us,
-           uint64_t repeat_every_us) {
-    std::unique_ptr<FunctionInfo> fn_info(new FunctionInfo(
-        std::move(fn), fn_name, clock_->NowMicros() + start_after_us,
-        repeat_every_us));
-    {
-      InstrumentedMutexLock l(&mutex_);
-      auto it = map_.find(fn_name);
-      if (it == map_.end()) {
-        heap_.push(fn_info.get());
-        map_.emplace(std::make_pair(fn_name, std::move(fn_info)));
-      } else {
-        // If it already exists, overriding it.
-        it->second->fn = std::move(fn_info->fn);
-        it->second->valid = true;
-        it->second->next_run_time_us = clock_->NowMicros() + start_after_us;
-        it->second->repeat_every_us = repeat_every_us;
-      }
+  bool Add(std::function<void()> fn, const std::string& fn_name,
+           uint64_t start_after_us, uint64_t repeat_every_us) {
+    auto fn_info = std::make_unique<FunctionInfo>(std::move(fn), fn_name, 0,
+                                                  repeat_every_us);
+    InstrumentedMutexLock l(&mutex_);
+    // Assign time within mutex to make sure the next_run_time is larger than
+    // the current running one
+    fn_info->next_run_time_us = clock_->NowMicros() + start_after_us;
+    // the new task start time should never before the current task executing
+    // time, as the executing task can only be running if it's next_run_time_us
+    // is due (<= clock_->NowMicros()).
+    if (executing_task_ &&
+        fn_info->next_run_time_us < heap_.top()->next_run_time_us) {
+      return false;
+    }
+    auto it = map_.find(fn_name);
+    if (it == map_.end()) {
+      heap_.push(fn_info.get());
+      map_.try_emplace(fn_name, std::move(fn_info));
+    } else {
+      // timer doesn't support duplicated function name
+      return false;
     }
     cond_var_.SignalAll();
+    return true;
   }
 
   void Cancel(const std::string& fn_name) {
@@ -116,7 +118,7 @@ class Timer {
     }
 
     running_ = true;
-    thread_.reset(new port::Thread(&Timer::Run, this));
+    thread_ = std::make_unique<port::Thread>(&Timer::Run, this);
     return true;
   }
 
@@ -140,8 +142,8 @@ class Timer {
   bool HasPendingTask() const {
     InstrumentedMutexLock l(&mutex_);
-    for (auto it = map_.begin(); it != map_.end(); it++) {
-      if (it->second->IsValid()) {
+    for (const auto& fn_info : map_) {
+      if (fn_info.second->IsValid()) {
         return true;
       }
     }
@@ -155,7 +157,7 @@ class Timer {
   // here to bump current time and trigger Timer. See timer_test for example.
   //
   // Note: only support one caller of this method.
-  void TEST_WaitForRun(std::function<void()> callback = nullptr) {
+  void TEST_WaitForRun(const std::function<void()>& callback = nullptr) {
     InstrumentedMutexLock l(&mutex_);
     // It act as a spin lock
     while (executing_task_ ||
@@ -177,8 +179,8 @@ class Timer {
   size_t TEST_GetPendingTaskNum() const {
     InstrumentedMutexLock l(&mutex_);
     size_t ret = 0;
-    for (auto it = map_.begin(); it != map_.end(); it++) {
-      if (it->second->IsValid()) {
+    for (const auto& fn_info : map_) {
+      if (fn_info.second->IsValid()) {
         ret++;
       }
     }
@@ -220,10 +222,13 @@ class Timer {
         executing_task_ = false;
         cond_var_.SignalAll();
 
-        // Remove the work from the heap once it is done executing.
+        // Remove the work from the heap once it is done executing, make sure
+        // it's the same function after executing the work while mutex is
+        // released.
         // Note that we are just removing the pointer from the heap. Its
         // memory is still managed in the map (as it holds a unique ptr).
         // So current_fn is still a valid ptr.
+        assert(heap_.top() == current_fn);
         heap_.pop();
 
         // current_fn may be cancelled already.
@@ -234,6 +239,10 @@ class Timer {
           // Schedule new work into the heap with new time.
           heap_.push(current_fn);
+        } else {
+          // if current_fn is cancelled or no need to repeat, remove it from the
+          // map to avoid leak.
+          map_.erase(current_fn->name);
         }
       } else {
         cond_var_.TimedWait(current_fn->next_run_time_us);
@@ -280,10 +289,10 @@ class Timer {
     // calls `Cancel()`.
     bool valid;
 
-    FunctionInfo(std::function<void()>&& _fn, const std::string& _name,
+    FunctionInfo(std::function<void()>&& _fn, std::string _name,
                  const uint64_t _next_run_time_us, uint64_t _repeat_every_us)
         : fn(std::move(_fn)),
-          name(_name),
+          name(std::move(_name)),
           next_run_time_us(_next_run_time_us),
           repeat_every_us(_repeat_every_us),
           valid(true) {}
diff --git a/util/timer_test.cc b/util/timer_test.cc
index 9256fcd45..a04c098ea 100644
--- a/util/timer_test.cc
+++ b/util/timer_test.cc
@@ -273,33 +273,32 @@ TEST_F(TimerTest, AddSameFuncName) {
   ASSERT_TRUE(timer.Start());
 
   int func_counter1 = 0;
-  timer.Add([&] { func_counter1++; }, "duplicated_func", kInitDelayUs,
-            kRepeat1Us);
+  ASSERT_TRUE(timer.Add([&] { func_counter1++; }, "duplicated_func",
+                        kInitDelayUs, kRepeat1Us));
 
   int func2_counter = 0;
-  timer.Add([&] { func2_counter++; }, "func2", kInitDelayUs, kRepeat2Us);
+  ASSERT_TRUE(
+      timer.Add([&] { func2_counter++; }, "func2", kInitDelayUs, kRepeat2Us));
 
-  // New function with the same name should override the existing one
+  // New function with the same name should fail to add
   int func_counter2 = 0;
-  timer.Add([&] { func_counter2++; }, "duplicated_func", kInitDelayUs,
-            kRepeat1Us);
+  ASSERT_FALSE(timer.Add([&] { func_counter2++; }, "duplicated_func",
+                         kInitDelayUs, kRepeat1Us));
 
   ASSERT_EQ(0, func_counter1);
   ASSERT_EQ(0, func2_counter);
-  ASSERT_EQ(0, func_counter2);
 
   timer.TEST_WaitForRun(
       [&] { mock_clock_->SleepForMicroseconds(kInitDelayUs); });
 
-  ASSERT_EQ(0, func_counter1);
+  ASSERT_EQ(1, func_counter1);
   ASSERT_EQ(1, func2_counter);
-  ASSERT_EQ(1, func_counter2);
 
   timer.TEST_WaitForRun([&] { mock_clock_->SleepForMicroseconds(kRepeat1Us); });
 
-  ASSERT_EQ(0, func_counter1);
+  ASSERT_EQ(2, func_counter1);
   ASSERT_EQ(2, func2_counter);
-  ASSERT_EQ(2, func_counter2);
+  ASSERT_EQ(0, func_counter2);
 
   ASSERT_TRUE(timer.Shutdown());
 }

From 9fe3a534431b36dc08d65a58175eaef922dc80a3 Mon Sep 17 00:00:00 2001
From: Jay Zhuang
Date: Sat, 12 Mar 2022 20:07:04 -0800
Subject: [PATCH 13/34] DisableManualCompaction may fail to cancel an unscheduled task (#9659)

Summary:
https://github.com/facebook/rocksdb/issues/9625 didn't change the unschedule condition, which was waiting for the background thread to clean up the compaction. Make sure we only unschedule the task when it's actually scheduled.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9659

Reviewed By: ajkr

Differential Revision: D34651820

Pulled By: jay-zhuang

fbshipit-source-id: 23f42081b15ec8886cd81cbf131b116e0c74dc2f
---
 HISTORY.md                                   |   3 +
 db/column_family.cc                          |   1 -
 db/compaction/compaction_picker_universal.cc |   1 -
 db/db_compaction_test.cc                     | 106 +++++++++++++++++
 db/db_impl/db_impl.h                         |  31 +++--
 db/db_impl/db_impl_compaction_flush.cc       | 113 ++++++++++---------
 6 files changed, 189 insertions(+), 66 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index d541fde13..4a90fac95 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,4 +1,7 @@
 # Rocksdb Change Log
+## Unreleased
+* Fixed a bug that DisableManualCompaction may assert when disabling an unscheduled manual compaction.
+
 ## 7.0.1 (03/02/2022)
 ### Bug Fixes
 * Fix a race condition when cancel manual compaction with `DisableManualCompaction`. Also DB close can cancel the manual compaction thread.
diff --git a/db/column_family.cc b/db/column_family.cc index 1d1c0c0db..a2885515c 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -989,7 +989,6 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( GetL0ThresholdSpeedupCompaction( mutable_cf_options.level0_file_num_compaction_trigger, mutable_cf_options.level0_slowdown_writes_trigger)) { - fprintf(stdout, "JJJ2\n"); write_controller_token_ = write_controller->GetCompactionPressureToken(); ROCKS_LOG_INFO( diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 4a65433b5..7ec7d3b93 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -372,7 +372,6 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { const int kLevel0 = 0; score_ = vstorage_->CompactionScore(kLevel0); sorted_runs_ = CalculateSortedRuns(*vstorage_); - fprintf(stdout, "JJJ1\n"); if (sorted_runs_.size() == 0 || (vstorage_->FilesMarkedForPeriodicCompaction().empty() && diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 94933e8f2..fd0eb66eb 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -6889,6 +6889,112 @@ TEST_F(DBCompactionTest, FIFOWarm) { Destroy(options); } +TEST_F(DBCompactionTest, DisableMultiManualCompaction) { + const int kNumL0Files = 10; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = kNumL0Files; + Reopen(options); + + // Generate 2 levels of file to make sure the manual compaction is not skipped + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), "value")); + if (i % 2) { + ASSERT_OK(Flush()); + } + } + MoveFilesToLevel(2); + + for (int i = 0; i < 10; i++) { + ASSERT_OK(Put(Key(i), "value")); + if (i % 2) { + ASSERT_OK(Flush()); + } + } + MoveFilesToLevel(1); + + // Block compaction queue + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + port::Thread compact_thread1([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = false; + std::string begin_str = Key(0); + std::string end_str = Key(3); + Slice b = begin_str; + Slice e = end_str; + auto s = db_->CompactRange(cro, &b, &e); + ASSERT_TRUE(s.IsIncomplete()); + }); + + port::Thread compact_thread2([&]() { + CompactRangeOptions cro; + cro.exclusive_manual_compaction = false; + std::string begin_str = Key(4); + std::string end_str = Key(7); + Slice b = begin_str; + Slice e = end_str; + auto s = db_->CompactRange(cro, &b, &e); + ASSERT_TRUE(s.IsIncomplete()); + }); + + // Disable manual compaction should cancel both manual compactions and both + // compaction should return incomplete. 
+  db_->DisableManualCompaction();
+
+  compact_thread1.join();
+  compact_thread2.join();
+
+  sleeping_task_low.WakeUp();
+  sleeping_task_low.WaitUntilDone();
+  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+}
+
+TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) {
+  const int kNumL0Files = 4;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  // generate files, but avoid trigger auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  // make sure the manual compaction background is started but not yet set the
+  // status to in_progress, then cancel the manual compaction, which should not
+  // result in segfault
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BGWorkCompaction",
+        "DBCompactionTest::DisableJustStartedManualCompaction:"
+        "PreDisableManualCompaction"},
+       {"DBCompactionTest::DisableJustStartedManualCompaction:"
+        "ManualCompactionReturn",
+        "BackgroundCallCompaction:0"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+    TEST_SYNC_POINT(
+        "DBCompactionTest::DisableJustStartedManualCompaction:"
+        "ManualCompactionReturn");
+  });
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableJustStartedManualCompaction:"
+      "PreDisableManualCompaction");
+  db_->DisableManualCompaction();
+
+  compact_thread.join();
+}
+
 TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) {
   const int kNumL0Files = 4;
 
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 4dfa5ebbe..3fa039793 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1514,19 +1514,31 @@ class DBImpl : public DB {
 
   // Information for a manual compaction
   struct ManualCompactionState {
+    ManualCompactionState(ColumnFamilyData* _cfd, int _input_level,
+                          int _output_level, uint32_t _output_path_id,
+                          bool _exclusive, bool _disallow_trivial_move,
+                          std::atomic<bool>* _canceled)
+        : cfd(_cfd),
+          input_level(_input_level),
+          output_level(_output_level),
+          output_path_id(_output_path_id),
+          exclusive(_exclusive),
+          disallow_trivial_move(_disallow_trivial_move),
+          canceled(_canceled) {}
+
     ColumnFamilyData* cfd;
     int input_level;
     int output_level;
     uint32_t output_path_id;
     Status status;
-    bool done;
-    bool in_progress;            // compaction request being processed?
-    bool incomplete;             // only part of requested range compacted
-    bool exclusive;              // current behavior of only one manual
-    bool disallow_trivial_move;  // Force actual compaction to run
-    const InternalKey* begin;    // nullptr means beginning of key range
-    const InternalKey* end;      // nullptr means end of key range
-    InternalKey* manual_end;     // how far we are compacting
+    bool done = false;
+    bool in_progress = false;    // compaction request being processed?
+    bool incomplete = false;     // only part of requested range compacted
+    bool exclusive;              // current behavior of only one manual
+    bool disallow_trivial_move;  // Force actual compaction to run
+    const InternalKey* begin = nullptr;  // nullptr means beginning of key range
+    const InternalKey* end = nullptr;    // nullptr means end of key range
+    InternalKey* manual_end = nullptr;   // how far we are compacting
     InternalKey tmp_storage;     // Used to keep track of compaction progress
     InternalKey tmp_storage1;    // Used to keep track of compaction progress
     std::atomic<bool>* canceled;  // Compaction canceled by the user?
@@ -1536,7 +1548,8 @@ class DBImpl : public DB {
     Compaction* compaction;
     // caller retains ownership of `manual_compaction_state` as it is reused
     // across background compactions.
-    ManualCompactionState* manual_compaction_state;  // nullptr if non-manual
+    std::shared_ptr<ManualCompactionState>
+        manual_compaction_state;  // nullptr if non-manual
     // task limiter token is requested during compaction picking.
     std::unique_ptr<TaskLimiterToken> task_token;
   };
diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index f8d018ceb..c2135f732 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -285,7 +285,6 @@ Status DBImpl::FlushMemTableToOutputFile(
     assert(storage_info);
     VersionStorageInfo::LevelSummaryStorage tmp;
-    fprintf(stdout, "JJJ4\n");
     ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
                      column_family_name.c_str(),
                      storage_info->LevelSummary(&tmp));
@@ -730,7 +729,6 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles(
     assert(storage_info);
     VersionStorageInfo::LevelSummaryStorage tmp;
-    fprintf(stdout, "JJJ3\n");
     ROCKS_LOG_BUFFER(log_buffer, "[%s] Level summary: %s\n",
                      column_family_name.c_str(),
                      storage_info->LevelSummary(&tmp));
@@ -1793,34 +1791,27 @@ Status DBImpl::RunManualCompaction(
   bool scheduled = false;
   Env::Priority thread_pool_priority = Env::Priority::TOTAL;
   bool manual_conflict = false;
-  ManualCompactionState manual;
-  manual.cfd = cfd;
-  manual.input_level = input_level;
-  manual.output_level = output_level;
-  manual.output_path_id = compact_range_options.target_path_id;
-  manual.done = false;
-  manual.in_progress = false;
-  manual.incomplete = false;
-  manual.exclusive = exclusive;
-  manual.disallow_trivial_move = disallow_trivial_move;
-  manual.canceled = compact_range_options.canceled;
+
+  auto manual = std::make_shared<ManualCompactionState>(
+      cfd, input_level, output_level, compact_range_options.target_path_id,
+      exclusive, disallow_trivial_move, compact_range_options.canceled);
   // For universal compaction, we enforce every manual compaction to compact
   // all files.
   if (begin == nullptr ||
       cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
       cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
-    manual.begin = nullptr;
+    manual->begin = nullptr;
   } else {
     begin_storage.SetMinPossibleForUserKey(*begin);
-    manual.begin = &begin_storage;
+    manual->begin = &begin_storage;
   }
   if (end == nullptr ||
       cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
       cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
-    manual.end = nullptr;
+    manual->end = nullptr;
   } else {
     end_storage.SetMaxPossibleForUserKey(*end);
-    manual.end = &end_storage;
+    manual->end = &end_storage;
   }
 
   TEST_SYNC_POINT("DBImpl::RunManualCompaction:0");
@@ -1832,10 +1823,10 @@ Status DBImpl::RunManualCompaction(
     // `DisableManualCompaction()` just waited for the manual compaction queue
     // to drain. So return immediately.
TEST_SYNC_POINT("DBImpl::RunManualCompaction:PausedAtStart"); - manual.status = + manual->status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); - manual.done = true; - return manual.status; + manual->done = true; + return manual->status; } // When a manual compaction arrives, temporarily disable scheduling of @@ -1855,7 +1846,7 @@ Status DBImpl::RunManualCompaction( // However, only one of them will actually schedule compaction, while // others will wait on a condition variable until it completes. - AddManualCompaction(&manual); + AddManualCompaction(manual.get()); TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_); if (exclusive) { // Limitation: there's no way to wake up the below loop when user sets @@ -1864,11 +1855,11 @@ Status DBImpl::RunManualCompaction( while (bg_bottom_compaction_scheduled_ > 0 || bg_compaction_scheduled_ > 0) { if (manual_compaction_paused_ > 0 || - (manual.canceled != nullptr && *manual.canceled == true)) { + (manual->canceled != nullptr && *manual->canceled == true)) { // Pretend the error came from compaction so the below cleanup/error // handling code can process it. - manual.done = true; - manual.status = + manual->done = true; + manual->status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); break; } @@ -1890,56 +1881,64 @@ Status DBImpl::RunManualCompaction( // We don't check bg_error_ here, because if we get the error in compaction, // the compaction will set manual.status to bg_error_ and set manual.done to // true. - while (!manual.done) { + while (!manual->done) { assert(HasPendingManualCompaction()); manual_conflict = false; Compaction* compaction = nullptr; - if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) || - scheduled || - (((manual.manual_end = &manual.tmp_storage1) != nullptr) && - ((compaction = manual.cfd->CompactRange( - *manual.cfd->GetLatestMutableCFOptions(), mutable_db_options_, - manual.input_level, manual.output_level, compact_range_options, - manual.begin, manual.end, &manual.manual_end, &manual_conflict, - max_file_num_to_ignore)) == nullptr && + if (ShouldntRunManualCompaction(manual.get()) || + (manual->in_progress == true) || scheduled || + (((manual->manual_end = &manual->tmp_storage1) != nullptr) && + ((compaction = manual->cfd->CompactRange( + *manual->cfd->GetLatestMutableCFOptions(), mutable_db_options_, + manual->input_level, manual->output_level, compact_range_options, + manual->begin, manual->end, &manual->manual_end, + &manual_conflict, max_file_num_to_ignore)) == nullptr && manual_conflict))) { // exclusive manual compactions should not see a conflict during // CompactRange assert(!exclusive || !manual_conflict); // Running either this or some other manual compaction bg_cv_.Wait(); - if (manual_compaction_paused_ > 0 && !manual.done && - !manual.in_progress) { - manual.done = true; - manual.status = + if (manual_compaction_paused_ > 0) { + manual->done = true; + manual->status = Status::Incomplete(Status::SubCode::kManualCompactionPaused); - assert(thread_pool_priority != Env::Priority::TOTAL); - env_->UnSchedule(GetTaskTag(TaskType::kManualCompaction), - thread_pool_priority); + if (scheduled) { + assert(thread_pool_priority != Env::Priority::TOTAL); + auto unscheduled_task_num = env_->UnSchedule( + GetTaskTag(TaskType::kManualCompaction), thread_pool_priority); + if (unscheduled_task_num > 0) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[%s] Unscheduled %d number of manual compactions from the " + "thread-pool", + 
+                cfd->GetName().c_str(), unscheduled_task_num);
+          }
+        }
         break;
       }
-      if (scheduled && manual.incomplete == true) {
-        assert(!manual.in_progress);
+      if (scheduled && manual->incomplete == true) {
+        assert(!manual->in_progress);
         scheduled = false;
-        manual.incomplete = false;
+        manual->incomplete = false;
       }
     } else if (!scheduled) {
       if (compaction == nullptr) {
-        manual.done = true;
+        manual->done = true;
         bg_cv_.SignalAll();
         continue;
       }
       ca = new CompactionArg;
       ca->db = this;
       ca->prepicked_compaction = new PrepickedCompaction;
-      ca->prepicked_compaction->manual_compaction_state = &manual;
+      ca->prepicked_compaction->manual_compaction_state = manual;
       ca->prepicked_compaction->compaction = compaction;
       if (!RequestCompactionToken(
              cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) {
         // Don't throttle manual compaction, only count outstanding tasks.
         assert(false);
       }
-      manual.incomplete = false;
+      manual->incomplete = false;
       if (compaction->bottommost_level() &&
           env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
         bg_bottom_compaction_scheduled_++;
@@ -1963,18 +1962,18 @@ Status DBImpl::RunManualCompaction(
   }
 
   log_buffer.FlushBufferToLog();
-  assert(!manual.in_progress);
+  assert(!manual->in_progress);
   assert(HasPendingManualCompaction());
-  RemoveManualCompaction(&manual);
+  RemoveManualCompaction(manual.get());
   // if the manual job is unscheduled, try schedule other jobs in case there's
   // any unscheduled compaction job which was blocked by exclusive manual
   // compaction.
-  if (manual.status.IsIncomplete() &&
-      manual.status.subcode() == Status::SubCode::kManualCompactionPaused) {
+  if (manual->status.IsIncomplete() &&
+      manual->status.subcode() == Status::SubCode::kManualCompactionPaused) {
     MaybeScheduleFlushOrCompaction();
   }
   bg_cv_.SignalAll();
-  return manual.status;
+  return manual->status;
 }
 
 void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
@@ -2942,7 +2941,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
       mutex_.Lock();
     } else if (s.IsManualCompactionPaused()) {
       assert(prepicked_compaction);
-      ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
+      auto m = prepicked_compaction->manual_compaction_state;
       assert(m);
       ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
                        m->cfd->GetName().c_str(), job_context.job_id);
@@ -3024,7 +3023,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
                                     LogBuffer* log_buffer,
                                     PrepickedCompaction* prepicked_compaction,
                                     Env::Priority thread_pri) {
-  ManualCompactionState* manual_compaction =
+  std::shared_ptr<ManualCompactionState> manual_compaction =
       prepicked_compaction == nullptr
          ? nullptr
          : prepicked_compaction->manual_compaction_state;
@@ -3068,6 +3067,10 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
   if (!status.ok()) {
     if (is_manual) {
       manual_compaction->status = status;
+      manual_compaction->status
+          .PermitUncheckedError();  // the manual compaction thread may exit
+                                    // first, which won't be able to check the
+                                    // status
       manual_compaction->done = true;
       manual_compaction->in_progress = false;
       manual_compaction = nullptr;
@@ -3090,7 +3093,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
   // InternalKey* manual_end = &manual_end_storage;
   bool sfm_reserved_compact_space = false;
   if (is_manual) {
-    ManualCompactionState* m = manual_compaction;
+    auto m = manual_compaction;
     assert(m->in_progress);
     if (!c) {
       m->done = true;
@@ -3474,7 +3477,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
   c.reset();
 
   if (is_manual) {
-    ManualCompactionState* m = manual_compaction;
+    auto m = manual_compaction;
     if (!status.ok()) {
       m->status = status;
       m->done = true;

From 7b6ca63ed05ad92c00785e9086bc115e8e61fc39 Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Mon, 14 Mar 2022 09:38:23 -0700
Subject: [PATCH 14/34] update HISTORY.md and version.h for 7.0.2

---
 HISTORY.md                | 2 +-
 include/rocksdb/version.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 4a90fac95..1a7d9f577 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,5 @@
 # Rocksdb Change Log
-## Unreleased
+## 7.0.2 (03/12/2022)
 * Fixed a bug that DisableManualCompaction may assert when disabling an unscheduled manual compaction.
 
 ## 7.0.1 (03/02/2022)
diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h
index fb0b89f0b..e1b67be85 100644
--- a/include/rocksdb/version.h
+++ b/include/rocksdb/version.h
@@ -11,7 +11,7 @@
 
 #define ROCKSDB_MAJOR 7
 #define ROCKSDB_MINOR 0
-#define ROCKSDB_PATCH 1
+#define ROCKSDB_PATCH 2
 
 // Do not use these. We made the mistake of declaring macros starting with
 // double underscore. Now we have to live with our choice. We'll deprecate these

From 348414d5020ac8dc9ef5968fb5dcc7f32902ce38 Mon Sep 17 00:00:00 2001
From: Yuriy Chernyshov
Date: Sun, 13 Mar 2022 17:01:21 -0700
Subject: [PATCH 15/34] #include <winioctl.h> as MSDN prescribes (#9612)

Summary:
The recommendation can be found e.g. [here](https://docs.microsoft.com/en-us/windows/win32/api/winioctl/ns-winioctl-storage_property_query).

While `<windows.h>` transitively includes `<winioctl.h>` by default, this can be switched off by `/DWIN32_LEAN_AND_MEAN`, which forces the user to include-what-you-use.
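A hypothetical minimal repro of that compile-time dependency (not code from this patch): with `WIN32_LEAN_AND_MEAN` defined, `<winioctl.h>` must be included explicitly for types such as `STORAGE_PROPERTY_QUERY` (the structure on the MSDN page linked above) to be declared:

```cpp
// Windows-only sketch. Removing the explicit <winioctl.h> include below
// makes this fail to compile under /DWIN32_LEAN_AND_MEAN, because
// STORAGE_PROPERTY_QUERY is then no longer provided transitively.
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <winioctl.h>

int main() {
  STORAGE_PROPERTY_QUERY query = {};
  query.PropertyId = StorageDeviceProperty;  // enum from <winioctl.h>
  query.QueryType = PropertyStandardQuery;
  return static_cast<int>(query.QueryType);
}
```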
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9612 Reviewed By: riversand963 Differential Revision: D34845629 Pulled By: ajkr fbshipit-source-id: 1ef9273074e3d84864c6833a7de6eb9df81e29d9 --- port/win/env_win.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 3108cf6e7..7ee58a85b 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -9,6 +9,8 @@ #if defined(OS_WIN) +#include "port/win/env_win.h" + #include // _rmdir, _mkdir, _getcwd #include #include // _access @@ -17,6 +19,7 @@ #include #include #include +#include #include #include @@ -27,7 +30,6 @@ #include "monitoring/thread_status_util.h" #include "port/port.h" #include "port/port_dirent.h" -#include "port/win/env_win.h" #include "port/win/io_win.h" #include "port/win/win_logger.h" #include "rocksdb/env.h" From 9db48d6e08b0b81af59ebf12a2a0132d7c0a13ff Mon Sep 17 00:00:00 2001 From: Tomas Kolda Date: Mon, 14 Mar 2022 14:12:30 -0700 Subject: [PATCH 16/34] NPE in Java_org_rocksdb_ColumnFamilyOptions_setSstPartitionerFactory (#9622) Summary: There was a mistake that incorrectly cast SstPartitionerFactory (missed shared pointer). It worked for database (correct cast), but not for family. Trying to set it in family has caused Access violation. I have also added test and improved it. Older version was passing even without sst partitioner which is weird, because on Level1 we had two SST files with same key "aaaa1". I was not sure if it is a new feature and changed it to overlaping keys "aaaa0" - "aaaa2" overlaps "aaaa1". Pull Request resolved: https://github.com/facebook/rocksdb/pull/9622 Reviewed By: ajkr Differential Revision: D34871968 Pulled By: pdillinger fbshipit-source-id: a08009766da49fc198692a610e8beb19caf737e6 --- java/rocksjni/options.cc | 5 +-- .../java/org/rocksdb/SstPartitionerTest.java | 33 +++++++++++++++++-- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 42cb1e244..9d929e88c 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -4260,9 +4260,10 @@ void Java_org_rocksdb_ColumnFamilyOptions_setSstPartitionerFactory( JNIEnv*, jobject, jlong jhandle, jlong factory_handle) { auto* options = reinterpret_cast(jhandle); - auto* factory = reinterpret_cast( + auto factory = reinterpret_cast< + std::shared_ptr*>( factory_handle); - options->sst_partitioner_factory.reset(factory); + options->sst_partitioner_factory = *factory; } /* diff --git a/java/src/test/java/org/rocksdb/SstPartitionerTest.java b/java/src/test/java/org/rocksdb/SstPartitionerTest.java index 8593e0045..74816db93 100644 --- a/java/src/test/java/org/rocksdb/SstPartitionerTest.java +++ b/java/src/test/java/org/rocksdb/SstPartitionerTest.java @@ -5,6 +5,7 @@ package org.rocksdb; +import static java.nio.charset.StandardCharsets.UTF_8; import static org.assertj.core.api.Assertions.assertThat; import java.util.List; @@ -21,7 +22,7 @@ public class SstPartitionerTest { @Rule public TemporaryFolder dbFolder = new TemporaryFolder(); @Test - public void sstFixedPrefix() throws InterruptedException, RocksDBException { + public void sstFixedPrefix() throws RocksDBException { try (SstPartitionerFixedPrefixFactory factory = new SstPartitionerFixedPrefixFactory(4); final Options opt = new Options().setCreateIfMissing(true).setSstPartitionerFactory(factory); @@ -31,7 +32,8 @@ public void sstFixedPrefix() throws InterruptedException, RocksDBException { db.put("bbbb1".getBytes(), "B".getBytes()); 
      db.flush(new FlushOptions());
 
-      db.put("aaaa1".getBytes(), "A2".getBytes());
+      db.put("aaaa0".getBytes(), "A2".getBytes());
+      db.put("aaaa2".getBytes(), "A2".getBytes());
       db.flush(new FlushOptions());
 
       db.compactRange();
@@ -40,4 +42,31 @@ public void sstFixedPrefix() throws InterruptedException, RocksDBException {
       assertThat(metadata.size()).isEqualTo(2);
     }
   }
+
+  @Test
+  public void sstFixedPrefixFamily() throws RocksDBException {
+    final byte[] cfName = "new_cf".getBytes(UTF_8);
+    final ColumnFamilyDescriptor cfDescriptor = new ColumnFamilyDescriptor(cfName,
+        new ColumnFamilyOptions().setSstPartitionerFactory(
+            new SstPartitionerFixedPrefixFactory(4)));
+
+    try (final Options opt = new Options().setCreateIfMissing(true);
+         final RocksDB db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath())) {
+      final ColumnFamilyHandle columnFamilyHandle = db.createColumnFamily(cfDescriptor);
+
+      // writing (long)100 under key
+      db.put(columnFamilyHandle, "aaaa1".getBytes(), "A".getBytes());
+      db.put(columnFamilyHandle, "bbbb1".getBytes(), "B".getBytes());
+      db.flush(new FlushOptions(), columnFamilyHandle);
+
+      db.put(columnFamilyHandle, "aaaa0".getBytes(), "A2".getBytes());
+      db.put(columnFamilyHandle, "aaaa2".getBytes(), "A2".getBytes());
+      db.flush(new FlushOptions(), columnFamilyHandle);
+
+      db.compactRange(columnFamilyHandle);
+
+      List<LiveFileMetaData> metadata = db.getLiveFilesMetaData();
+      assertThat(metadata.size()).isEqualTo(2);
+    }
+  }
 }

From fddc1a79ddd215718920e280f60099b0b83c0d6e Mon Sep 17 00:00:00 2001
From: Jermy Li
Date: Tue, 15 Mar 2022 09:50:21 -0700
Subject: [PATCH 17/34] fix: Reusing-Iterator reads stale keys after
 DeleteRange() is performed (#9258)

Summary:
fix https://github.com/facebook/rocksdb/issues/9255

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9258

Reviewed By: pdillinger

Differential Revision: D34879684

Pulled By: ajkr

fbshipit-source-id: 5934f4b7524dc27ecdf1430e0456a0fc02958fc7
---
 HISTORY.md                  |  4 +++
 db/arena_wrapped_db_iter.cc | 69 +++++++++++++++++++++++++------------
 db/db_range_del_test.cc     | 28 +++++++++++++++
 3 files changed, 79 insertions(+), 22 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 1a7d9f577..91cf7ede9 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,4 +1,8 @@
 # Rocksdb Change Log
+## Unreleased
+### Bug Fixes
+* Fixed a bug that `Iterator::Refresh()` reads stale keys after DeleteRange() performed.
+
 ## 7.0.2 (03/12/2022)
 * Fixed a bug that DisableManualCompaction may assert when disable an unscheduled manual compaction.
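For context, the user-visible sequence this fixes, as a hedged standalone sketch of the RocksDB C++ API (the database path and the final checks are illustrative, not part of the patch):

```cpp
#include <cassert>
#include "rocksdb/db.h"

// Sketch of issue #9255: before this fix, a refreshed iterator could still
// surface keys covered by a DeleteRange() issued after the iterator was
// created, when no flush had advanced the superversion.
int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  assert(rocksdb::DB::Open(options, "/tmp/refresh_demo", &db).ok());

  assert(db->Put(rocksdb::WriteOptions(), "key1", "value1").ok());
  assert(db->Put(rocksdb::WriteOptions(), "key2", "value2").ok());

  rocksdb::Iterator* iter = db->NewIterator(rocksdb::ReadOptions());
  assert(db->DeleteRange(rocksdb::WriteOptions(), db->DefaultColumnFamily(),
                         "key2", "key3")
             .ok());

  assert(iter->Refresh().ok());
  iter->Seek("key2");
  // With the fix, the memtable's range tombstone is honored on Refresh(),
  // so "key2" must no longer be visible.
  assert(!iter->Valid() || iter->key().ToString() != "key2");

  delete iter;
  delete db;
  return 0;
}
```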
diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc
index 520588afe..20d4655be 100644
--- a/db/arena_wrapped_db_iter.cc
+++ b/db/arena_wrapped_db_iter.cc
@@ -58,30 +58,55 @@ Status ArenaWrappedDBIter::Refresh() {
   uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
   TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1");
   TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2");
-  if (sv_number_ != cur_sv_number) {
-    Env* env = db_iter_->env();
-    db_iter_->~DBIter();
-    arena_.~Arena();
-    new (&arena_) Arena();
+  while (true) {
+    if (sv_number_ != cur_sv_number) {
+      Env* env = db_iter_->env();
+      db_iter_->~DBIter();
+      arena_.~Arena();
+      new (&arena_) Arena();
 
-    SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
-    SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
-    if (read_callback_) {
-      read_callback_->Refresh(latest_seq);
-    }
-    Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
-         sv->current, latest_seq,
-         sv->mutable_cf_options.max_sequential_skip_in_iterations,
-         cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_,
-         allow_refresh_);
+      SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
+      SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+      if (read_callback_) {
+        read_callback_->Refresh(latest_seq);
+      }
+      Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
+           sv->current, latest_seq,
+           sv->mutable_cf_options.max_sequential_skip_in_iterations,
+           cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_,
+           allow_refresh_);
 
-    InternalIterator* internal_iter = db_impl_->NewInternalIterator(
-        read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(),
-        latest_seq, /* allow_unprepared_value */ true);
-    SetIterUnderDBIter(internal_iter);
-  } else {
-    db_iter_->set_sequence(db_impl_->GetLatestSequenceNumber());
-    db_iter_->set_valid(false);
+      InternalIterator* internal_iter = db_impl_->NewInternalIterator(
+          read_options_, cfd_, sv, &arena_, db_iter_->GetRangeDelAggregator(),
+          latest_seq, /* allow_unprepared_value */ true);
+      SetIterUnderDBIter(internal_iter);
+      break;
+    } else {
+      SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
+      // Refresh range-tombstones in MemTable
+      if (!read_options_.ignore_range_deletions) {
+        SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
+        ReadRangeDelAggregator* range_del_agg =
+            db_iter_->GetRangeDelAggregator();
+        std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter;
+        range_del_iter.reset(
+            sv->mem->NewRangeTombstoneIterator(read_options_, latest_seq));
+        range_del_agg->AddTombstones(std::move(range_del_iter));
+        cfd_->ReturnThreadLocalSuperVersion(sv);
+      }
+      // Refresh latest sequence number
+      db_iter_->set_sequence(latest_seq);
+      db_iter_->set_valid(false);
+      // Check again if the latest super version number is changed
+      uint64_t latest_sv_number = cfd_->GetSuperVersionNumber();
+      if (latest_sv_number != cur_sv_number) {
+        // If the super version number is changed after refreshing,
+        // fallback to Re-Init the InternalIterator
+        cur_sv_number = latest_sv_number;
+        continue;
+      }
+      break;
+    }
   }
   return Status::OK();
 }
diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc
index 8967ddef5..0a74d28d9 100644
--- a/db/db_range_del_test.cc
+++ b/db/db_range_del_test.cc
@@ -1724,6 +1724,34 @@ TEST_F(DBRangeDelTest, OverlappedKeys) {
   ASSERT_EQ(0, NumTableFilesAtLevel(1));
 }
 
+TEST_F(DBRangeDelTest, IteratorRefresh) {
+  // Refreshing an iterator after a range tombstone is added should cause the
+  // deleted range of keys to disappear.
+  for (bool sv_changed : {false, true}) {
+    ASSERT_OK(db_->Put(WriteOptions(), "key1", "value1"));
+    ASSERT_OK(db_->Put(WriteOptions(), "key2", "value2"));
+
+    auto* iter = db_->NewIterator(ReadOptions());
+    ASSERT_OK(iter->status());
+
+    ASSERT_OK(db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(),
+                               "key2", "key3"));
+
+    if (sv_changed) {
+      ASSERT_OK(db_->Flush(FlushOptions()));
+    }
+
+    ASSERT_OK(iter->Refresh());
+    ASSERT_OK(iter->status());
+    iter->SeekToFirst();
+    ASSERT_EQ("key1", iter->key());
+    iter->Next();
+    ASSERT_FALSE(iter->Valid());
+
+    delete iter;
+  }
+}
+
 #endif  // ROCKSDB_LITE
 }  // namespace ROCKSDB_NAMESPACE

From 0e4c063c166cbdd6e5c4bc51484b681c4d5324db Mon Sep 17 00:00:00 2001
From: gukaifeng
Date: Tue, 15 Mar 2022 09:55:49 -0700
Subject: [PATCH 18/34] fix a bug of the ticker NO_FILE_OPENS (#9677)

Summary:
In the original code, the value of the ticker `NO_FILE_OPENS` was increased regardless of whether the file was successfully opened or not. The count could even be repeated for a single logical open (once for the failed attempt and once for the retry under the legacy file name), which can lead to skewed counts.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9677

Reviewed By: jay-zhuang

Differential Revision: D34725733

Pulled By: ajkr

fbshipit-source-id: 841234ed03802c0105fd2107d82a740265ead576
---
 db/table_cache.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/db/table_cache.cc b/db/table_cache.cc
index c86f60750..e548fc6c3 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -114,15 +114,18 @@ Status TableCache::GetTableReader(
   if (s.ok()) {
     s = ioptions_.fs->NewRandomAccessFile(fname, fopts, &file, nullptr);
   }
-  RecordTick(ioptions_.stats, NO_FILE_OPENS);
-  if (s.IsPathNotFound()) {
+  if (s.ok()) {
+    RecordTick(ioptions_.stats, NO_FILE_OPENS);
+  } else if (s.IsPathNotFound()) {
     fname = Rocks2LevelTableFileName(fname);
     s = PrepareIOFromReadOptions(ro, ioptions_.clock, fopts.io_options);
     if (s.ok()) {
       s = ioptions_.fs->NewRandomAccessFile(fname, file_options, &file,
                                             nullptr);
     }
-    RecordTick(ioptions_.stats, NO_FILE_OPENS);
+    if (s.ok()) {
+      RecordTick(ioptions_.stats, NO_FILE_OPENS);
+    }
   }
 
   if (s.ok()) {

From 418e58e6a268c9f1cefbf4cc85832317b38b492b Mon Sep 17 00:00:00 2001
From: Peter Dillinger
Date: Wed, 23 Mar 2022 10:00:54 -0700
Subject: [PATCH 19/34] Fix a major performance bug in 7.0 re: filter
 compatibility (#9736)

Summary:
Bloom filters generated by pre-7.0 releases are not read by 7.0.x releases (and vice-versa) due to changes to FilterPolicy::Name() in https://github.com/facebook/rocksdb/issues/9590. This can severely impact read performance and read I/O on upgrade or downgrade with existing DB, but not data correctness.

To fix, we go back to using the old, unified name in SST metadata but (for a while anyway) recognize the aliases that could be generated by early 7.0.x releases. This unfortunately requires a public API change to avoid interfering with all the good changes from https://github.com/facebook/rocksdb/issues/9590, but the API change only affects users with custom FilterPolicy, which should be very few.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9736

Test Plan: manual

Generate DBs with
```
./db_bench.7.0 -db=/dev/shm/rocksdb.7.0 -bloom_bits=10 -cache_index_and_filter_blocks=1 -benchmarks=fillrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0
```
and similar.
Compare with
```
for IMPL in 6.29 7.0 fixed; do for DB in 6.29 7.0 fixed; do echo "Testing $IMPL on $DB:"; ./db_bench.$IMPL -db=/dev/shm/rocksdb.$DB -use_existing_db -readonly -bloom_bits=10 -benchmarks=readrandom -num=10000000 -compaction_style=2 -fifo_compaction_max_table_files_size_mb=10000 -fifo_compaction_allow_compaction=0 -duration=10 2>&1 | grep micros/op; done; done
```

Results:
```
Testing 6.29 on 6.29:
readrandom   :      34.381 micros/op 29085 ops/sec;    3.2 MB/s (291999 of 291999 found)
Testing 6.29 on 7.0:
readrandom   :     190.443 micros/op 5249 ops/sec;    0.6 MB/s (52999 of 52999 found)
Testing 6.29 on fixed:
readrandom   :      40.148 micros/op 24907 ops/sec;    2.8 MB/s (249999 of 249999 found)
Testing 7.0 on 6.29:
readrandom   :     229.430 micros/op 4357 ops/sec;    0.5 MB/s (43999 of 43999 found)
Testing 7.0 on 7.0:
readrandom   :      33.348 micros/op 29986 ops/sec;    3.3 MB/s (299999 of 299999 found)
Testing 7.0 on fixed:
readrandom   :     152.734 micros/op 6546 ops/sec;    0.7 MB/s (65999 of 65999 found)
Testing fixed on 6.29:
readrandom   :      32.024 micros/op 31224 ops/sec;    3.5 MB/s (312999 of 312999 found)
Testing fixed on 7.0:
readrandom   :      33.990 micros/op 29390 ops/sec;    3.3 MB/s (294999 of 294999 found)
Testing fixed on fixed:
readrandom   :      28.714 micros/op 34825 ops/sec;    3.9 MB/s (348999 of 348999 found)
```

Just paying attention to the order of magnitude of ops/sec (short test durations, lots of noise), it's clear that with the fix we can read both <= 6.29 and >= 7.0 DBs at full speed, whereas neither 6.29 nor 7.0 can do both. And the 6.29 release can properly read the fixed DB at full speed.

Reviewed By: siying, ajkr

Differential Revision: D35057844

Pulled By: pdillinger

fbshipit-source-id: a46893a6af4bf084375ebe4728066d00eb08f050
---
 HISTORY.md                                    |  4 +
 db/c.cc                                       |  6 ++
 db/db_bloom_filter_test.cc                    |  8 +-
 include/rocksdb/filter_policy.h               | 13 +++
 options/customizable_test.cc                  |  1 +
 .../block_based/block_based_table_builder.cc  |  2 +-
 table/block_based/block_based_table_reader.cc | 80 +++++++++++++------
 table/block_based/filter_policy.cc            | 12 ++-
 table/block_based/filter_policy_internal.h    |  3 +
 table/block_based/full_filter_block_test.cc   |  1 +
 10 files changed, 101 insertions(+), 29 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 91cf7ede9..e2c1ba74c 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,8 +1,12 @@
 # Rocksdb Change Log
 ## Unreleased
 ### Bug Fixes
+* Fixed a major performance bug in which Bloom filters generated by pre-7.0 releases are not read by early 7.0.x releases (and vice-versa) due to changes to FilterPolicy::Name() in #9590. This can severely impact read performance and read I/O on upgrade or downgrade with existing DB, but not data correctness.
 * Fixed a bug that `Iterator::Refresh()` reads stale keys after DeleteRange() performed.
 
+### Public API changes
+* Added pure virtual FilterPolicy::CompatibilityName(), which is needed for fixing major performance bug involving FilterPolicy naming in SST metadata without affecting Customizable aspect of FilterPolicy. This change only affects those with their own custom or wrapper FilterPolicy classes.
+
 ## 7.0.2 (03/12/2022)
 * Fixed a bug that DisableManualCompaction may assert when disable an unscheduled manual compaction.
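For maintainers of custom wrapper policies, a minimal sketch of the deferral the new API expects (the class name is hypothetical; the same pattern appears in the C API wrapper in the diff below):

```cpp
#include <memory>
#include "rocksdb/filter_policy.h"

// Hypothetical wrapper policy: by forwarding CompatibilityName() to the
// wrapped built-in policy, filters it writes stay readable under the shared
// family name, while Name() remains distinct for Customizable purposes.
class LoggingFilterPolicy : public rocksdb::FilterPolicy {
 public:
  explicit LoggingFilterPolicy(
      std::shared_ptr<const rocksdb::FilterPolicy> wrapped)
      : wrapped_(std::move(wrapped)) {}

  const char* Name() const override { return "LoggingFilterPolicy"; }

  // Defer to the wrapped policy, as this patch prescribes for wrappers.
  const char* CompatibilityName() const override {
    return wrapped_->CompatibilityName();
  }

  rocksdb::FilterBitsBuilder* GetBuilderWithContext(
      const rocksdb::FilterBuildingContext& context) const override {
    // A real wrapper might log or adjust settings here before delegating.
    return wrapped_->GetBuilderWithContext(context);
  }

 private:
  std::shared_ptr<const rocksdb::FilterPolicy> wrapped_;
};
```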
diff --git a/db/c.cc b/db/c.cc
index c6f23adf2..211caa9ac 100644
--- a/db/c.cc
+++ b/db/c.cc
@@ -3757,6 +3757,9 @@ rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_bloom_format(
     const FilterPolicy* rep_;
     ~Wrapper() override { delete rep_; }
     const char* Name() const override { return rep_->Name(); }
+    const char* CompatibilityName() const override {
+      return rep_->CompatibilityName();
+    }
     // No need to override GetFilterBitsBuilder if this one is overridden
     ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
         const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
@@ -3794,6 +3797,9 @@ rocksdb_filterpolicy_t* rocksdb_filterpolicy_create_ribbon_format(
     const FilterPolicy* rep_;
     ~Wrapper() override { delete rep_; }
     const char* Name() const override { return rep_->Name(); }
+    const char* CompatibilityName() const override {
+      return rep_->CompatibilityName();
+    }
     ROCKSDB_NAMESPACE::FilterBitsBuilder* GetBuilderWithContext(
         const ROCKSDB_NAMESPACE::FilterBuildingContext& context)
         const override {
diff --git a/db/db_bloom_filter_test.cc b/db/db_bloom_filter_test.cc
index c2630d2e5..c8f8d5320 100644
--- a/db/db_bloom_filter_test.cc
+++ b/db/db_bloom_filter_test.cc
@@ -1551,9 +1551,15 @@ class LevelAndStyleCustomFilterPolicy : public FilterPolicy {
         policy_l0_other_(NewBloomFilterPolicy(bpk_l0_other)),
         policy_otherwise_(NewBloomFilterPolicy(bpk_otherwise)) {}
 
+  const char* Name() const override {
+    return "LevelAndStyleCustomFilterPolicy";
+  }
   // OK to use built-in policy name because we are deferring to a
   // built-in builder. We aren't changing the serialized format.
-  const char* Name() const override { return policy_fifo_->Name(); }
+  const char* CompatibilityName() const override {
+    return policy_fifo_->CompatibilityName();
+  }
 
   FilterBitsBuilder* GetBuilderWithContext(
       const FilterBuildingContext& context) const override {
diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h
index 873520ee0..954d15b4a 100644
--- a/include/rocksdb/filter_policy.h
+++ b/include/rocksdb/filter_policy.h
@@ -90,6 +90,19 @@ class FilterPolicy : public Customizable {
   virtual ~FilterPolicy();
   static const char* Type() { return "FilterPolicy"; }
 
+  // The name used for identifying whether a filter on disk is readable
+  // by this FilterPolicy. If this FilterPolicy is part of a family that
+  // can read each other's filters, such as built-in BloomFilterPolicy and
+  // RibbonFilterPolicy, the CompatibilityName is a shared family name,
+  // while kinds of filters in the family can have distinct Customizable
+  // Names. This function is pure virtual so that wrappers around built-in
+  // policies are prompted to defer to CompatibilityName() of the wrapped
+  // policy, which is important for compatibility.
+  //
+  // For custom filter policies that are not part of a read-compatible
+  // family (rare), implementations may return Name().
+  virtual const char* CompatibilityName() const = 0;
+
   // Creates a new FilterPolicy based on the input value string and returns the
   // result The value might be an ID, and ID with properties, or an old-style
   // policy string.
diff --git a/options/customizable_test.cc b/options/customizable_test.cc
index 690b394ea..84f7f4656 100644
--- a/options/customizable_test.cc
+++ b/options/customizable_test.cc
@@ -1487,6 +1487,7 @@ class MockFilterPolicy : public FilterPolicy {
  public:
  static const char* kClassName() { return "MockFilterPolicy"; }
  const char* Name() const override { return kClassName(); }
+  const char* CompatibilityName() const override { return Name(); }
  FilterBitsBuilder* GetBuilderWithContext(
      const FilterBuildingContext&) const override {
    return nullptr;
diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc
index 6c622555d..41a49556b 100644
--- a/table/block_based/block_based_table_builder.cc
+++ b/table/block_based/block_based_table_builder.cc
@@ -1605,7 +1605,7 @@ void BlockBasedTableBuilder::WriteFilterBlock(
           ? BlockBasedTable::kPartitionedFilterBlockPrefix
           : BlockBasedTable::kFullFilterBlockPrefix;
     }
-    key.append(rep_->table_options.filter_policy->Name());
+    key.append(rep_->table_options.filter_policy->CompatibilityName());
     meta_index_builder->Add(key, filter_block_handle);
   }
 }
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 686f38ba1..388add9ed 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -12,6 +12,7 @@
 #include <array>
 #include <atomic>
 #include <cstdint>
+#include <cstring>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -50,6 +51,7 @@
 #include "table/block_based/block_prefix_index.h"
 #include "table/block_based/block_type.h"
 #include "table/block_based/filter_block.h"
+#include "table/block_based/filter_policy_internal.h"
 #include "table/block_based/full_filter_block.h"
 #include "table/block_based/hash_index_reader.h"
 #include "table/block_based/partitioned_filter_block.h"
@@ -899,33 +901,59 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
     const BlockBasedTableOptions& table_options, const int level,
     size_t file_size, size_t max_file_size_for_l0_meta_pin,
     BlockCacheLookupContext* lookup_context) {
-  Status s;
-
   // Find filter handle and filter type
   if (rep_->filter_policy) {
-    for (auto filter_type :
-         {Rep::FilterType::kFullFilter, Rep::FilterType::kPartitionedFilter,
-          Rep::FilterType::kBlockFilter}) {
-      std::string prefix;
-      switch (filter_type) {
-        case Rep::FilterType::kFullFilter:
-          prefix = kFullFilterBlockPrefix;
-          break;
-        case Rep::FilterType::kPartitionedFilter:
-          prefix = kPartitionedFilterBlockPrefix;
-          break;
-        case Rep::FilterType::kBlockFilter:
-          prefix = kFilterBlockPrefix;
+    auto name = rep_->filter_policy->CompatibilityName();
+    bool builtin_compatible =
+        strcmp(name, BuiltinFilterPolicy::kCompatibilityName()) == 0;
+
+    for (const auto& [filter_type, prefix] :
+         {std::make_pair(Rep::FilterType::kFullFilter, kFullFilterBlockPrefix),
+          std::make_pair(Rep::FilterType::kPartitionedFilter,
+                         kPartitionedFilterBlockPrefix),
+          std::make_pair(Rep::FilterType::kBlockFilter, kFilterBlockPrefix)}) {
+      if (builtin_compatible) {
+        // This code is only here to deal with a hiccup in early 7.0.x where
+        // there was an unintentional name change in the SST files metadata.
+        // It should be OK to remove this in the future (late 2022) and just
+        // have the 'else' code.
+        // NOTE: the test:: names below are likely not needed but included
+        // out of caution
+        static const std::unordered_set<std::string> kBuiltinNameAndAliases = {
+            BuiltinFilterPolicy::kCompatibilityName(),
+            test::LegacyBloomFilterPolicy::kClassName(),
+            test::FastLocalBloomFilterPolicy::kClassName(),
+            test::Standard128RibbonFilterPolicy::kClassName(),
+            DeprecatedBlockBasedBloomFilterPolicy::kClassName(),
+            BloomFilterPolicy::kClassName(),
+            RibbonFilterPolicy::kClassName(),
+        };
+
+        // For efficiency, do a prefix seek and see if the first match is
+        // good.
+        meta_iter->Seek(prefix);
+        if (meta_iter->status().ok() && meta_iter->Valid()) {
+          Slice key = meta_iter->key();
+          if (key.starts_with(prefix)) {
+            key.remove_prefix(prefix.size());
+            if (kBuiltinNameAndAliases.find(key.ToString()) !=
+                kBuiltinNameAndAliases.end()) {
+              Slice v = meta_iter->value();
+              Status s = rep_->filter_handle.DecodeFrom(&v);
+              if (s.ok()) {
+                rep_->filter_type = filter_type;
+                break;
+              }
+            }
+          }
+        }
+      } else {
+        std::string filter_block_key = prefix + name;
+        if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
+                .ok()) {
+          rep_->filter_type = filter_type;
           break;
-        default:
-          assert(0);
-      }
-      std::string filter_block_key = prefix;
-      filter_block_key.append(rep_->filter_policy->Name());
-      if (FindMetaBlock(meta_iter, filter_block_key, &rep_->filter_handle)
-              .ok()) {
-        rep_->filter_type = filter_type;
-        break;
+        }
       }
     }
   }
@@ -934,8 +962,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks(
          rep_->index_type == BlockBasedTableOptions::kTwoLevelIndexSearch);
 
   // Find compression dictionary handle
-  s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName,
-                            &rep_->compression_dict_handle);
+  Status s = FindOptionalMetaBlock(meta_iter, kCompressionDictBlockName,
+                                   &rep_->compression_dict_handle);
   if (!s.ok()) {
     return s;
   }
diff --git a/table/block_based/filter_policy.cc b/table/block_based/filter_policy.cc
index 6594bfb4d..cfbc658aa 100644
--- a/table/block_based/filter_policy.cc
+++ b/table/block_based/filter_policy.cc
@@ -1325,6 +1325,16 @@ bool BuiltinFilterPolicy::IsInstanceOf(const std::string& name) const {
   }
 }
 
+static const char* kBuiltinFilterMetadataName = "rocksdb.BuiltinBloomFilter";
+
+const char* BuiltinFilterPolicy::kCompatibilityName() {
+  return kBuiltinFilterMetadataName;
+}
+
+const char* BuiltinFilterPolicy::CompatibilityName() const {
+  return kBuiltinFilterMetadataName;
+}
+
 BloomLikeFilterPolicy::BloomLikeFilterPolicy(double bits_per_key)
     : warned_(false), aggregate_rounding_balance_(0) {
   // Sanitize bits_per_key
@@ -1372,7 +1382,7 @@ bool BloomLikeFilterPolicy::IsInstanceOf(const std::string& name) const {
 }
 
 const char* ReadOnlyBuiltinFilterPolicy::kClassName() {
-  return "rocksdb.BuiltinBloomFilter";
+  return kBuiltinFilterMetadataName;
 }
 
 const char* DeprecatedBlockBasedBloomFilterPolicy::kClassName() {
diff --git a/table/block_based/filter_policy_internal.h b/table/block_based/filter_policy_internal.h
index 4ceeed0d0..06566f871 100644
--- a/table/block_based/filter_policy_internal.h
+++ b/table/block_based/filter_policy_internal.h
@@ -135,6 +135,9 @@ class BuiltinFilterPolicy : public FilterPolicy {
   FilterBitsReader* GetFilterBitsReader(const Slice& contents) const override;
   static const char* kClassName();
   bool IsInstanceOf(const std::string& id) const override;
+  // All variants of BuiltinFilterPolicy can read each other's filters.
+ const char* CompatibilityName() const override; + static const char* kCompatibilityName(); public: // new // An internal function for the implementation of diff --git a/table/block_based/full_filter_block_test.cc b/table/block_based/full_filter_block_test.cc index 76f612728..24d870d4c 100644 --- a/table/block_based/full_filter_block_test.cc +++ b/table/block_based/full_filter_block_test.cc @@ -84,6 +84,7 @@ class TestFilterBitsReader : public FilterBitsReader { class TestHashFilter : public FilterPolicy { public: const char* Name() const override { return "TestHashFilter"; } + const char* CompatibilityName() const override { return Name(); } FilterBitsBuilder* GetBuilderWithContext( const FilterBuildingContext&) const override { From b7119ff818f83b54a46602ac1f23d4fe2add1bfc Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Wed, 23 Mar 2022 10:32:57 -0700 Subject: [PATCH 20/34] Update HISTORY.md and version.h for 7.0.3 Summary: Also clarified in HISTORY.md that this patch release breaks binary compatibility (because of FilterPolicy vtable) --- HISTORY.md | 4 ++-- include/rocksdb/version.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index e2c1ba74c..3b27a6a6a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,11 +1,11 @@ # Rocksdb Change Log -## Unreleased +## 7.0.3 (03/23/2022) ### Bug Fixes * Fixed a major performance bug in which Bloom filters generated by pre-7.0 releases are not read by early 7.0.x releases (and vice-versa) due to changes to FilterPolicy::Name() in #9590. This can severely impact read performance and read I/O on upgrade or downgrade with existing DB, but not data correctness. * Fixed a bug that `Iterator::Refresh()` reads stale keys after DeleteRange() performed. ### Public API changes -* Added pure virtual FilterPolicy::CompatibilityName(), which is needed for fixing major performance bug involving FilterPolicy naming in SST metadata without affecting Customizable aspect of FilterPolicy. This change only affects those with their own custom or wrapper FilterPolicy classes. +* Added pure virtual FilterPolicy::CompatibilityName(), which is needed for fixing major performance bug involving FilterPolicy naming in SST metadata without affecting Customizable aspect of FilterPolicy. For source code, this change only affects those with their own custom or wrapper FilterPolicy classes, but does break compiled library binary compatibility in a patch release. ## 7.0.2 (03/12/2022) * Fixed a bug that DisableManualCompaction may assert when disable an unscheduled manual compaction. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index e1b67be85..aaf177a18 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -11,7 +11,7 @@ #define ROCKSDB_MAJOR 7 #define ROCKSDB_MINOR 0 -#define ROCKSDB_PATCH 2 +#define ROCKSDB_PATCH 3 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. 
We'll deprecate these

From 6996495c1b0b6b362d4dbca8436a1d641fa36a1f Mon Sep 17 00:00:00 2001
From: Jay Zhuang
Date: Wed, 16 Mar 2022 21:16:12 -0700
Subject: [PATCH 21/34] Minor fix for Windows build with zlib (#9699)

Summary:
```
conversion from 'size_t' to 'uLong', possible loss of data
```

Fix https://github.com/facebook/rocksdb/issues/9688

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9699

Reviewed By: riversand963

Differential Revision: D34901116

Pulled By: jay-zhuang

fbshipit-source-id: 969148a7a8c023449bd85055a1f0eec71d0a9b3f
---
 util/compression.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/util/compression.h b/util/compression.h
index 1effb8605..fbe9edbb9 100644
--- a/util/compression.h
+++ b/util/compression.h
@@ -778,7 +778,8 @@ inline bool Zlib_Compress(const CompressionInfo& info,
   }
 
   // Get an upper bound on the compressed size.
-  size_t upper_bound = deflateBound(&_stream, length);
+  size_t upper_bound =
+      deflateBound(&_stream, static_cast<uLong>(length));
   output->resize(output_header_len + upper_bound);
 
   // Compress the input, and put compressed data in output.

From 63a39d911008caaca4995db95a77ac4e08b7e353 Mon Sep 17 00:00:00 2001
From: Jay Zhuang
Date: Tue, 15 Mar 2022 12:31:14 -0700
Subject: [PATCH 22/34] Fix a race condition when disable and enable manual
 compaction (#9694)

Summary:
In https://github.com/facebook/rocksdb/issues/9659, when `DisableManualCompaction()` is issued, the foreground manual compaction thread does not have to wait for the background compaction thread to finish. That could be a problem: if the user then re-enables manual compaction with `EnableManualCompaction()`, it may re-enable the BG compaction that was supposed to be cancelled.

This patch makes the FG compaction wait on `manual_compaction_state.done`, which is set either by the BG compaction or by the Unschedule callback. Then, when the FG manual compaction thread returns, no BG compaction can still be running, so a shared_ptr is no longer needed for `manual_compaction_state`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9694

Test Plan: a stress test and a unit test

Reviewed By: ajkr

Differential Revision: D34885472

Pulled By: jay-zhuang

fbshipit-source-id: e6476175b43e8c59cd49f5c09241036a0716c274
---
 HISTORY.md                             |   4 +
 db/db_compaction_test.cc               |  43 +++++++--
 db/db_impl/db_impl.h                   |   3 +-
 db/db_impl/db_impl_compaction_flush.cc | 116 +++++++++++++------------
 4 files changed, 103 insertions(+), 63 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 3b27a6a6a..f1205633a 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,4 +1,8 @@
 # Rocksdb Change Log
+## Unreleased
+### Bug Fixes
+* Fixed a race condition when disable and re-enable manual compaction.
+
 ## 7.0.3 (03/23/2022)
 ### Bug Fixes
 * Fixed a major performance bug in which Bloom filters generated by pre-7.0 releases are not read by early 7.0.x releases (and vice-versa) due to changes to FilterPolicy::Name() in #9590. This can severely impact read performance and read I/O on upgrade or downgrade with existing DB, but not data correctness.
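Stripped of RocksDB specifics, the coordination pattern this patch adopts looks roughly like the hedged standalone sketch below (illustrative names, not RocksDB code): the foreground thread blocks until `done` is set, whether by the worker or by the unschedule callback, so the state can safely live on the foreground thread's stack.

```cpp
#include <condition_variable>
#include <mutex>
#include <thread>

struct ManualTaskState {
  std::mutex mu;
  std::condition_variable cv;
  bool done = false;
};

// Runs on a background thread pool.
void BackgroundCompact(ManualTaskState* s) {
  // ... perform the compaction work ...
  std::lock_guard<std::mutex> lk(s->mu);
  s->done = true;
  s->cv.notify_all();
}

// Runs instead if the task is removed from the queue before it executes.
void UnscheduleCallback(ManualTaskState* s) {
  std::lock_guard<std::mutex> lk(s->mu);
  s->done = true;
  s->cv.notify_all();
}

// Foreground thread: returns only once no background work can be running.
void WaitForManualCompaction(ManualTaskState* s) {
  std::unique_lock<std::mutex> lk(s->mu);
  s->cv.wait(lk, [s] { return s->done; });
}

int main() {
  ManualTaskState state;  // stack lifetime is safe with this protocol
  std::thread bg(BackgroundCompact, &state);
  WaitForManualCompaction(&state);
  bg.join();
  return 0;
}
```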
diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc
index fd0eb66eb..03502157d 100644
--- a/db/db_compaction_test.cc
+++ b/db/db_compaction_test.cc
@@ -6973,8 +6973,7 @@ TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) {
       {{"DBImpl::BGWorkCompaction",
         "DBCompactionTest::DisableJustStartedManualCompaction:"
         "PreDisableManualCompaction"},
-       {"DBCompactionTest::DisableJustStartedManualCompaction:"
-        "ManualCompactionReturn",
+       {"DBImpl::RunManualCompaction:Unscheduled",
         "BackgroundCallCompaction:0"}});
   SyncPoint::GetInstance()->EnableProcessing();
 
@@ -6983,9 +6982,6 @@ TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) {
     cro.exclusive_manual_compaction = true;
     auto s = db_->CompactRange(cro, nullptr, nullptr);
     ASSERT_TRUE(s.IsIncomplete());
-    TEST_SYNC_POINT(
-        "DBCompactionTest::DisableJustStartedManualCompaction:"
-        "ManualCompactionReturn");
   });
 
   TEST_SYNC_POINT(
      "DBCompactionTest::DisableJustStartedManualCompaction:"
@@ -6995,6 +6991,43 @@ TEST_F(DBCompactionTest, DisableJustStartedManualCompaction) {
   compact_thread.join();
 }
 
+TEST_F(DBCompactionTest, DisableInProgressManualCompaction) {
+  const int kNumL0Files = 4;
+
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = kNumL0Files;
+  Reopen(options);
+
+  SyncPoint::GetInstance()->LoadDependency(
+      {{"DBImpl::BackgroundCompaction:InProgress",
+        "DBCompactionTest::DisableInProgressManualCompaction:"
+        "PreDisableManualCompaction"},
+       {"DBImpl::RunManualCompaction:Unscheduled",
+        "CompactionJob::Run():Start"}});
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  // generate files, but avoid trigger auto compaction
+  for (int i = 0; i < kNumL0Files / 2; i++) {
+    ASSERT_OK(Put(Key(1), "value1"));
+    ASSERT_OK(Put(Key(2), "value2"));
+    ASSERT_OK(Flush());
+  }
+
+  port::Thread compact_thread([&]() {
+    CompactRangeOptions cro;
+    cro.exclusive_manual_compaction = true;
+    auto s = db_->CompactRange(cro, nullptr, nullptr);
+    ASSERT_TRUE(s.IsIncomplete());
+  });
+
+  TEST_SYNC_POINT(
+      "DBCompactionTest::DisableInProgressManualCompaction:"
+      "PreDisableManualCompaction");
+  db_->DisableManualCompaction();
+
+  compact_thread.join();
+}
+
 TEST_F(DBCompactionTest, DisableManualCompactionThreadQueueFull) {
   const int kNumL0Files = 4;
 
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 3fa039793..f632d5e9b 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1548,8 +1548,7 @@ class DBImpl : public DB {
     Compaction* compaction;
     // caller retains ownership of `manual_compaction_state` as it is reused
     // across background compactions.
-    std::shared_ptr<ManualCompactionState>
-        manual_compaction_state;  // nullptr if non-manual
+    ManualCompactionState* manual_compaction_state;  // nullptr if non-manual
    // task limiter token is requested during compaction picking.
    std::unique_ptr<TaskLimiterToken> task_token;
  };

diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc
index c2135f732..a99025675 100644
--- a/db/db_impl/db_impl_compaction_flush.cc
+++ b/db/db_impl/db_impl_compaction_flush.cc
@@ -1789,10 +1789,11 @@ Status DBImpl::RunManualCompaction(
 
   CompactionArg* ca = nullptr;
   bool scheduled = false;
+  bool unscheduled = false;
   Env::Priority thread_pool_priority = Env::Priority::TOTAL;
   bool manual_conflict = false;
 
-  auto manual = std::make_shared<ManualCompactionState>(
+  ManualCompactionState manual(
       cfd, input_level, output_level, compact_range_options.target_path_id,
       exclusive, disallow_trivial_move, compact_range_options.canceled);
   // For universal compaction, we enforce every manual compaction to compact
@@ -1800,18 +1801,18 @@ Status DBImpl::RunManualCompaction(
   if (begin == nullptr ||
       cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
       cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
-    manual->begin = nullptr;
+    manual.begin = nullptr;
   } else {
     begin_storage.SetMinPossibleForUserKey(*begin);
-    manual->begin = &begin_storage;
+    manual.begin = &begin_storage;
   }
   if (end == nullptr ||
       cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
       cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
-    manual->end = nullptr;
+    manual.end = nullptr;
   } else {
     end_storage.SetMaxPossibleForUserKey(*end);
-    manual->end = &end_storage;
+    manual.end = &end_storage;
   }
 
   TEST_SYNC_POINT("DBImpl::RunManualCompaction:0");
@@ -1823,10 +1824,10 @@ Status DBImpl::RunManualCompaction(
     // `DisableManualCompaction()` just waited for the manual compaction queue
     // to drain. So return immediately.
     TEST_SYNC_POINT("DBImpl::RunManualCompaction:PausedAtStart");
-    manual->status =
+    manual.status =
         Status::Incomplete(Status::SubCode::kManualCompactionPaused);
-    manual->done = true;
-    return manual->status;
+    manual.done = true;
+    return manual.status;
   }
 
   // When a manual compaction arrives, temporarily disable scheduling of
@@ -1846,7 +1847,7 @@ Status DBImpl::RunManualCompaction(
   // However, only one of them will actually schedule compaction, while
   // others will wait on a condition variable until it completes.
 
-  AddManualCompaction(manual.get());
+  AddManualCompaction(&manual);
  TEST_SYNC_POINT_CALLBACK("DBImpl::RunManualCompaction:NotScheduled", &mutex_);
   if (exclusive) {
     // Limitation: there's no way to wake up the below loop when user sets
     while (bg_bottom_compaction_scheduled_ > 0 ||
            bg_compaction_scheduled_ > 0) {
       if (manual_compaction_paused_ > 0 ||
-          (manual->canceled != nullptr && *manual->canceled == true)) {
+          (manual.canceled != nullptr && *manual.canceled == true)) {
         // Pretend the error came from compaction so the below cleanup/error
         // handling code can process it.
-        manual->done = true;
-        manual->status =
+        manual.done = true;
+        manual.status =
             Status::Incomplete(Status::SubCode::kManualCompactionPaused);
         break;
       }
@@ -1881,64 +1882,63 @@ Status DBImpl::RunManualCompaction(
   // We don't check bg_error_ here, because if we get the error in compaction,
   // the compaction will set manual.status to bg_error_ and set manual.done to
   // true.
- while (!manual->done) { + while (!manual.done) { assert(HasPendingManualCompaction()); manual_conflict = false; Compaction* compaction = nullptr; - if (ShouldntRunManualCompaction(manual.get()) || - (manual->in_progress == true) || scheduled || - (((manual->manual_end = &manual->tmp_storage1) != nullptr) && - ((compaction = manual->cfd->CompactRange( - *manual->cfd->GetLatestMutableCFOptions(), mutable_db_options_, - manual->input_level, manual->output_level, compact_range_options, - manual->begin, manual->end, &manual->manual_end, - &manual_conflict, max_file_num_to_ignore)) == nullptr && + if (ShouldntRunManualCompaction(&manual) || (manual.in_progress == true) || + scheduled || + (((manual.manual_end = &manual.tmp_storage1) != nullptr) && + ((compaction = manual.cfd->CompactRange( + *manual.cfd->GetLatestMutableCFOptions(), mutable_db_options_, + manual.input_level, manual.output_level, compact_range_options, + manual.begin, manual.end, &manual.manual_end, &manual_conflict, + max_file_num_to_ignore)) == nullptr && manual_conflict))) { // exclusive manual compactions should not see a conflict during // CompactRange assert(!exclusive || !manual_conflict); // Running either this or some other manual compaction bg_cv_.Wait(); - if (manual_compaction_paused_ > 0) { - manual->done = true; - manual->status = - Status::Incomplete(Status::SubCode::kManualCompactionPaused); - if (scheduled) { - assert(thread_pool_priority != Env::Priority::TOTAL); - auto unscheduled_task_num = env_->UnSchedule( - GetTaskTag(TaskType::kManualCompaction), thread_pool_priority); - if (unscheduled_task_num > 0) { - ROCKS_LOG_INFO( - immutable_db_options_.info_log, - "[%s] Unscheduled %d number of manual compactions from the " - "thread-pool", - cfd->GetName().c_str(), unscheduled_task_num); - } + if (manual_compaction_paused_ > 0 && scheduled && !unscheduled) { + assert(thread_pool_priority != Env::Priority::TOTAL); + // unschedule all manual compactions + auto unscheduled_task_num = env_->UnSchedule( + GetTaskTag(TaskType::kManualCompaction), thread_pool_priority); + if (unscheduled_task_num > 0) { + ROCKS_LOG_INFO( + immutable_db_options_.info_log, + "[%s] Unscheduled %d number of manual compactions from the " + "thread-pool", + cfd->GetName().c_str(), unscheduled_task_num); + // it may unschedule other manual compactions, notify others. + bg_cv_.SignalAll(); } - break; + unscheduled = true; + TEST_SYNC_POINT("DBImpl::RunManualCompaction:Unscheduled"); } - if (scheduled && manual->incomplete == true) { - assert(!manual->in_progress); + if (scheduled && manual.incomplete == true) { + assert(!manual.in_progress); scheduled = false; - manual->incomplete = false; + manual.incomplete = false; } } else if (!scheduled) { if (compaction == nullptr) { - manual->done = true; + manual.done = true; bg_cv_.SignalAll(); continue; } ca = new CompactionArg; ca->db = this; ca->prepicked_compaction = new PrepickedCompaction; - ca->prepicked_compaction->manual_compaction_state = manual; + ca->prepicked_compaction->manual_compaction_state = &manual; ca->prepicked_compaction->compaction = compaction; if (!RequestCompactionToken( cfd, true, &ca->prepicked_compaction->task_token, &log_buffer)) { // Don't throttle manual compaction, only count outstanding tasks. 
        assert(false);
      }
-      manual->incomplete = false;
+      manual.incomplete = false;
      if (compaction->bottommost_level() &&
          env_->GetBackgroundThreads(Env::Priority::BOTTOM) > 0) {
        bg_bottom_compaction_scheduled_++;
@@ -1962,18 +1962,18 @@ Status DBImpl::RunManualCompaction(
  }
 
  log_buffer.FlushBufferToLog();
-  assert(!manual->in_progress);
+  assert(!manual.in_progress);
  assert(HasPendingManualCompaction());
-  RemoveManualCompaction(manual.get());
+  RemoveManualCompaction(&manual);
  // if the manual job is unscheduled, try schedule other jobs in case there's
  // any unscheduled compaction job which was blocked by exclusive manual
  // compaction.
-  if (manual->status.IsIncomplete() &&
-      manual->status.subcode() == Status::SubCode::kManualCompactionPaused) {
+  if (manual.status.IsIncomplete() &&
+      manual.status.subcode() == Status::SubCode::kManualCompactionPaused) {
    MaybeScheduleFlushOrCompaction();
  }
  bg_cv_.SignalAll();
-  return manual->status;
+  return manual.status;
 }
 
 void DBImpl::GenerateFlushRequest(const autovector<ColumnFamilyData*>& cfds,
@@ -2699,6 +2699,12 @@ void DBImpl::UnscheduleCompactionCallback(void* arg) {
  CompactionArg ca = *(ca_ptr);
  delete reinterpret_cast<CompactionArg*>(arg);
  if (ca.prepicked_compaction != nullptr) {
+    // if it's a manual compaction, set status to ManualCompactionPaused
+    if (ca.prepicked_compaction->manual_compaction_state) {
+      ca.prepicked_compaction->manual_compaction_state->done = true;
+      ca.prepicked_compaction->manual_compaction_state->status =
+          Status::Incomplete(Status::SubCode::kManualCompactionPaused);
+    }
    if (ca.prepicked_compaction->compaction != nullptr) {
      ca.prepicked_compaction->compaction->ReleaseCompactionFiles(
          Status::Incomplete(Status::SubCode::kManualCompactionPaused));
@@ -2941,7 +2947,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction,
      mutex_.Lock();
    } else if (s.IsManualCompactionPaused()) {
      assert(prepicked_compaction);
-      auto m = prepicked_compaction->manual_compaction_state;
+      ManualCompactionState* m = prepicked_compaction->manual_compaction_state;
      assert(m);
      ROCKS_LOG_BUFFER(&log_buffer, "[%s] [JOB %d] Manual compaction paused",
                       m->cfd->GetName().c_str(), job_context.job_id);
@@ -3023,7 +3029,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
                                    LogBuffer* log_buffer,
                                    PrepickedCompaction* prepicked_compaction,
                                    Env::Priority thread_pri) {
-  std::shared_ptr<ManualCompactionState> manual_compaction =
+  ManualCompactionState* manual_compaction =
      prepicked_compaction == nullptr ?
          nullptr : prepicked_compaction->manual_compaction_state;
@@ -3067,10 +3073,6 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
   if (!status.ok()) {
     if (is_manual) {
       manual_compaction->status = status;
-      manual_compaction->status
-          .PermitUncheckedError();  // the manual compaction thread may exit
-                                    // first, which won't be able to check the
-                                    // status
       manual_compaction->done = true;
       manual_compaction->in_progress = false;
       manual_compaction = nullptr;
@@ -3087,13 +3089,15 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
     manual_compaction->in_progress = true;
   }
 
+  TEST_SYNC_POINT("DBImpl::BackgroundCompaction:InProgress");
+
   std::unique_ptr<TaskLimiterToken> task_token;
 
   // InternalKey manual_end_storage;
   // InternalKey* manual_end = &manual_end_storage;
   bool sfm_reserved_compact_space = false;
   if (is_manual) {
-    auto m = manual_compaction;
+    ManualCompactionState* m = manual_compaction;
     assert(m->in_progress);
     if (!c) {
       m->done = true;
@@ -3477,7 +3481,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress,
   c.reset();
 
   if (is_manual) {
-    auto m = manual_compaction;
+    ManualCompactionState* m = manual_compaction;
     if (!status.ok()) {
       m->status = status;
       m->done = true;

From 802093fe44417a238d1e5173df28865b315fc9a7 Mon Sep 17 00:00:00 2001
From: anand76
Date: Mon, 7 Mar 2022 11:39:31 -0800
Subject: [PATCH 23/34] Fix issue #9627 (#9657)

Summary:
SMB mounts do not support hard links. The ENOTSUP error code is returned, which should be interpreted by PosixFileSystem as IOStatus::NotSupported().

Pull Request resolved: https://github.com/facebook/rocksdb/pull/9657

Reviewed By: mrambacher

Differential Revision: D34634783

Pulled By: anand1976

fbshipit-source-id: 0d57f5b2e6118e4c20e9ed1a293327428c3aecac
---
 env/fs_posix.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/env/fs_posix.cc b/env/fs_posix.cc
index e63381066..db5319615 100644
--- a/env/fs_posix.cc
+++ b/env/fs_posix.cc
@@ -753,8 +753,10 @@ class PosixFileSystem : public FileSystem {
                       const IOOptions& /*opts*/,
                       IODebugContext* /*dbg*/) override {
     if (link(src.c_str(), target.c_str()) != 0) {
-      if (errno == EXDEV) {
-        return IOStatus::NotSupported("No cross FS links allowed");
+      if (errno == EXDEV || errno == ENOTSUP) {
+        return IOStatus::NotSupported(errno == EXDEV
+                                          ? "No cross FS links allowed"
+                                          : "Links not supported by FS");
      }
      return IOError("while link file to " + target, src, errno);
    }

From 3e18c47db0d72c481475aa66416290ebd3fa72c4 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Mon, 14 Mar 2022 18:49:55 -0700
Subject: [PATCH 24/34] Fix a TSAN-reported bug caused by concurrent accesses
 to std::deque (#9686)

Summary:
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9686

According to https://www.cplusplus.com/reference/deque/deque/back/,
"
The container is accessed (neither the const nor the non-const versions modify the container).
The last element is potentially accessed or modified by the caller. Concurrently accessing or modifying other elements is safe.
"

Also according to https://www.cplusplus.com/reference/deque/deque/pop_front/,
"
The container is modified.
The first element is modified. Concurrently accessing or modifying other elements is safe (although see iterator validity above).
"

In RocksDB, we never pop the last element of `DBImpl::alive_log_files_`. We have been exploiting this fact and the above two properties when ensuring correctness when `DBImpl::alive_log_files_` may be accessed concurrently. Specifically, it can be accessed in the write path when db mutex is released.
Sometimes, the log_write_mutex_ is held. It can also be accessed in `FindObsoleteFiles()`, where db mutex is always held. It can also be accessed during recovery, where db mutex is also held.

Given the fact that we never pop the last element of alive_log_files_, we currently do not acquire additional locks when accessing it in `WriteToWAL()`, as follows
```
alive_log_files_.back().AddSize(log_entry.size());
```

This is problematic. Check the source code of deque.h
```
  back() _GLIBCXX_NOEXCEPT
  {
    __glibcxx_requires_nonempty();
    ...
  }

  pop_front() _GLIBCXX_NOEXCEPT
  {
    ...
    if (this->_M_impl._M_start._M_cur
        != this->_M_impl._M_start._M_last - 1)
      {
        ...
        ++this->_M_impl._M_start._M_cur;
      }
    ...
  }
```
`back()` will actually call `__glibcxx_requires_nonempty()` first. If `__glibcxx_requires_nonempty()` is enabled and not an empty macro, it will call `empty()`
```
  bool empty()
  {
    return this->_M_impl._M_finish == this->_M_impl._M_start;
  }
```
You can see that it will access `this->_M_impl._M_start`, racing with `pop_front()`. Therefore, TSAN will actually catch the bug in this case.

To be able to use TSAN on our library and unit tests, we should always coordinate concurrent accesses to STL containers properly. We need to pass information about db mutex and log mutex into `WriteToWAL()`, otherwise it's impossible to know which mutex to acquire inside the function.

To fix this, we can cache the tail of `alive_log_files_` (a reverse iterator to its last element), so that we do not have to call `back()` in `WriteToWAL()`.

Reviewed By: pdillinger

Differential Revision: D34780309

fbshipit-source-id: 1def9821f0c437f2736c6a26445d75890377889b
---
 HISTORY.md                  |  1 +
 db/db_impl/db_impl.h        | 12 ++++++++----
 db/db_impl/db_impl_open.cc  |  5 ++++-
 db/db_impl/db_impl_write.cc | 39 ++++++++++++++++++++++++++++++-----
 4 files changed, 47 insertions(+), 10 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index f1205633a..5e486ed72 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -2,6 +2,7 @@
 ## Unreleased
 ### Bug Fixes
 * Fixed a race condition when disable and re-enable manual compaction.
+* Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled.
 
 ## 7.0.3 (03/23/2022)
 ### Bug Fixes
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index f632d5e9b..b06cab521 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -1782,7 +1782,8 @@ class DBImpl : public DB {
                          WriteBatch** to_be_cached_state);
 
   IOStatus WriteToWAL(const WriteBatch& merged_batch, log::Writer* log_writer,
-                      uint64_t* log_used, uint64_t* log_size);
+                      uint64_t* log_used, uint64_t* log_size,
+                      bool with_db_mutex = false, bool with_log_mutex = false);
 
   IOStatus WriteToWAL(const WriteThread::WriteGroup& write_group,
                       log::Writer* log_writer, uint64_t* log_used,
@@ -2108,12 +2109,15 @@ class DBImpl : public DB {
   bool persistent_stats_cfd_exists_ = true;
 
   // Without two_write_queues, read and writes to alive_log_files_ are
-  // protected by mutex_. However since back() is never popped, and push_back()
-  // is done only from write_thread_, the same thread can access the item
-  // reffered by back() without mutex_. With two_write_queues_, writes
+  // protected by mutex_. With two_write_queues_, writes
   // are protected by locking both mutex_ and log_write_mutex_, and reads must
   // be under either mutex_ or log_write_mutex_.
  std::deque<LogFileNumberSize> alive_log_files_;
+  // Caching the result of `alive_log_files_.back()` so that we do not have to
+  // call `alive_log_files_.back()` in the write thread (WriteToWAL()) which
+  // requires locking db mutex if log_mutex_ is not already held in
+  // two-write-queues mode.
+  std::deque<LogFileNumberSize>::reverse_iterator alive_log_files_tail_;
  // Log files that aren't fully synced, and the current log file.
  // Synchronization:
  //  - push_back() is done from write_thread_ with locked mutex_ and
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 840dc2bec..5cb6beeb6 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -1364,6 +1364,7 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
     total_log_size_ += log.size;
     alive_log_files_.push_back(log);
   }
+  alive_log_files_tail_ = alive_log_files_.rbegin();
   if (two_write_queues_) {
     log_write_mutex_.Unlock();
   }
@@ -1709,6 +1710,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
     }
     impl->alive_log_files_.push_back(
         DBImpl::LogFileNumberSize(impl->logfile_number_));
+    impl->alive_log_files_tail_ = impl->alive_log_files_.rbegin();
     if (impl->two_write_queues_) {
       impl->log_write_mutex_.Unlock();
     }
@@ -1729,7 +1731,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
       WriteOptions write_options;
       uint64_t log_used, log_size;
       log::Writer* log_writer = impl->logs_.back().writer;
-      s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size);
+      s = impl->WriteToWAL(empty_batch, log_writer, &log_used, &log_size,
+                           /*with_db_mutex==*/true);
       if (s.ok()) {
         // Need to fsync, otherwise it might get lost after a power reset.
         s = impl->FlushWAL(false);
diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc
index a5bd89a7e..fd1fa1acb 100644
--- a/db/db_impl/db_impl_write.cc
+++ b/db/db_impl/db_impl_write.cc
@@ -1147,8 +1147,22 @@ WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
 // write thread. Otherwise this must be called holding log_write_mutex_.
 IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
                             log::Writer* log_writer, uint64_t* log_used,
-                            uint64_t* log_size) {
+                            uint64_t* log_size,
+                            bool with_db_mutex, bool with_log_mutex) {
   assert(log_size != nullptr);
+
+  // Assert mutex explicitly.
+  if (with_db_mutex) {
+    mutex_.AssertHeld();
+  } else if (two_write_queues_) {
+    log_write_mutex_.AssertHeld();
+    assert(with_log_mutex);
+  }
+
+#ifdef NDEBUG
+  (void)with_log_mutex;
+#endif
+
   Slice log_entry = WriteBatchInternal::Contents(&merged_batch);
   *log_size = log_entry.size();
   // When two_write_queues_ WriteToWAL has to be protected from concurretn calls
@@ -1171,9 +1185,20 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
     *log_used = logfile_number_;
   }
   total_log_size_ += log_entry.size();
-  // TODO(myabandeh): it might be unsafe to access alive_log_files_.back() here
-  // since alive_log_files_ might be modified concurrently
-  alive_log_files_.back().AddSize(log_entry.size());
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+  if (with_db_mutex || with_log_mutex) {
+#endif  // __has_feature(thread_sanitizer)
+#endif  // defined(__has_feature)
+    assert(alive_log_files_tail_ != alive_log_files_.rend());
+    assert(alive_log_files_tail_ == alive_log_files_.rbegin());
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+  }
+#endif  // __has_feature(thread_sanitizer)
+#endif  // defined(__has_feature)
+  LogFileNumberSize& last_alive_log = *alive_log_files_tail_;
+  last_alive_log.AddSize(*log_size);
   log_empty_ = false;
   return io_s;
 }
@@ -1183,6 +1208,7 @@ IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
                             bool need_log_sync, bool need_log_dir_sync,
                             SequenceNumber sequence) {
   IOStatus io_s;
+  assert(!two_write_queues_);
   assert(!write_group.leader->disable_wal);
   // Same holds for all in the batch group
   size_t write_with_wal = 0;
@@ -1270,6 +1296,7 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
     SequenceNumber* last_sequence, size_t seq_inc) {
   IOStatus io_s;
 
+  assert(two_write_queues_ || immutable_db_options_.unordered_write);
   assert(!write_group.leader->disable_wal);
   // Same holds for all in the batch group
   WriteBatch tmp_batch;
@@ -1294,7 +1321,8 @@ IOStatus DBImpl::ConcurrentWriteToWAL(
     log::Writer* log_writer = logs_.back().writer;
     uint64_t log_size;
-    io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
+    io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size,
+                      /*with_db_mutex=*/false, /*with_log_mutex=*/true);
     if (to_be_cached_state) {
       cached_recoverable_state_ = *to_be_cached_state;
       cached_recoverable_state_empty_ = false;
@@ -1948,6 +1976,7 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
       log_dir_synced_ = false;
       logs_.emplace_back(logfile_number_, new_log);
       alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
+      alive_log_files_tail_ = alive_log_files_.rbegin();
     }
     log_write_mutex_.Unlock();
   }

From 4ae8687e88a3f17e7e11437fe7bf0f275003cd74 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Tue, 15 Mar 2022 12:16:40 -0700
Subject: [PATCH 25/34] Fix a TSAN error caused by calling `rend()` and
 `pop_front()`. (#9698)

Summary:
PR 9686 makes `WriteToWAL()` call `assert(...!=rend())` while not holding the db mutex or the log mutex. Another thread may concurrently call `pop_front()`, causing a race condition.
To fix, assert only if the mutex is held.
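Stripped to its essentials, the rule this fix settles on looks like the hedged standalone sketch below (illustrative names, not RocksDB code): the cached tail iterator may be *used* lock-free because the last element is never popped, but comparing it against `rbegin()`/`rend()` reads the deque's internal start/finish pointers, which `pop_front()` mutates, so those checks only run while holding the protecting mutex.

```cpp
#include <cassert>
#include <cstdint>
#include <deque>
#include <mutex>

struct LogFile {
  uint64_t number = 0;
  uint64_t size = 0;
};

std::mutex db_mutex;                     // stands in for DBImpl::mutex_
std::deque<LogFile> alive_logs(1);       // front may be popped under db_mutex
auto alive_tail = alive_logs.rbegin();   // cached; the back element is never popped

void AddWalEntrySize(uint64_t entry_size, bool holding_mutex) {
  if (holding_mutex) {
    // Safe to touch rbegin()/rend() only under the mutex.
    assert(alive_tail == alive_logs.rbegin());
    assert(alive_tail != alive_logs.rend());
  }
  alive_tail->size += entry_size;  // the tail element itself is stable
}

int main() {
  {
    std::lock_guard<std::mutex> lk(db_mutex);
    AddWalEntrySize(128, /*holding_mutex=*/true);
  }
  AddWalEntrySize(64, /*holding_mutex=*/false);
  return alive_logs.back().size == 192 ? 0 : 1;
}
```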
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9698 Test Plan: COMPILE_WITH_TSAN=1 make check Reviewed By: jay-zhuang Differential Revision: D34898535 Pulled By: riversand963 fbshipit-source-id: 1ddfa5bf1b6ae8d409cab6ff6e1b5321c6803da9 --- db/db_impl/db_impl_write.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index fd1fa1acb..0975ae8bf 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1159,10 +1159,6 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, assert(with_log_mutex); } -#ifdef NDEBUG - (void)with_log_mutex; -#endif - Slice log_entry = WriteBatchInternal::Contents(&merged_batch); *log_size = log_entry.size(); // When two_write_queues_ WriteToWAL has to be protected from concurretn calls @@ -1190,13 +1186,15 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, if (with_db_mutex || with_log_mutex) { #endif // __has_feature(thread_sanitizer) #endif // defined(__has_feature) - assert(alive_log_files_tail_ != alive_log_files_.rend()); assert(alive_log_files_tail_ == alive_log_files_.rbegin()); #if defined(__has_feature) #if __has_feature(thread_sanitizer) } #endif // __has_feature(thread_sanitizer) #endif // defined(__has_feature) + if (with_db_mutex || with_log_mutex) { + assert(alive_log_files_tail_ != alive_log_files_.rend()); + } LogFileNumberSize& last_alive_log = *alive_log_files_tail_; last_alive_log.AddSize(*log_size); log_empty_ = false; From 1ac5d372762c74d4e563ebe3d96a1695a36345d1 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Thu, 17 Mar 2022 19:50:30 -0700 Subject: [PATCH 26/34] Fix race condition caused by concurrent accesses to forceMmapOff_ when opening Posix WritableFile (#9685) Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/9685 Our TSAN reports a race condition as follows when running test ``` gtest-parallel -r 100 ./external_sst_file_test --gtest_filter=ExternalSSTFileTest.MultiThreaded ``` leads to the following ``` WARNING: ThreadSanitizer: data race (pid=2683148) Write of size 1 at 0x556fede63340 by thread T7: #0 rocksdb::(anonymous namespace)::PosixFileSystem::OpenWritableFile(std::__cxx11::basic_string, std::allocator > const&, rocksdb::FileOptions const&, bool, std::unique_ptr >*, rocksdb::IODebugContext*) internal_repo_rocksdb/repo/env/fs_posix.cc:334 (external_sst_file_test+0xb61ac4) #1 rocksdb::(anonymous namespace)::PosixFileSystem::ReopenWritableFile(std::__cxx11::basic_string, std::allocator > const&, rocksdb::FileOptions const&, std::unique_ptr >*, rocksdb::IODebugContext*) internal_repo_rocksdb/repo/env/fs_posix.cc:382 (external_sst_file_test+0xb5ba96) #2 rocksdb::CompositeEnv::ReopenWritableFile(std::__cxx11::basic_string, std::allocator > const&, std::unique_ptr >*, rocksdb::EnvOptions const&) internal_repo_rocksdb/repo/env/composite_env.cc:334 (external_sst_file_test+0xa6ab7f) #3 rocksdb::EnvWrapper::ReopenWritableFile(std::__cxx11::basic_string, std::allocator > const&, std::unique_ptr >*, rocksdb::EnvOptions const&) internal_repo_rocksdb/repo/include/rocksdb/env.h:1428 (external_sst_file_test+0x561f3e) Previous read of size 1 at 0x556fede63340 by thread T4: #0 rocksdb::(anonymous namespace)::PosixFileSystem::OpenWritableFile(std::__cxx11::basic_string, std::allocator > const&, rocksdb::FileOptions const&, bool, std::unique_ptr >*, rocksdb::IODebugContext*) internal_repo_rocksdb/repo/env/fs_posix.cc:328 (external_sst_file_test+0xb61a70) #1 rocksdb::(anonymous 
namespace)::PosixFileSystem::ReopenWritableFile(std::__cxx11::basic_string, std::allocator ... ``` Fix by making sure the following block gets executed only once: ``` if (!checkedDiskForMmap_) { // this will be executed once in the program's lifetime. // do not use mmapWrite on non ext-3/xfs/tmpfs systems. if (!SupportsFastAllocate(fname)) { forceMmapOff_ = true; } checkedDiskForMmap_ = true; } ``` Reviewed By: pdillinger Differential Revision: D34780308 fbshipit-source-id: b761f66b24c8b5b8389d86ea371c8542b8d869d5 --- HISTORY.md | 1 + env/fs_posix.cc | 45 ++++++++++++++++++++++----------------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 5e486ed72..09a8c840d 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -3,6 +3,7 @@ ### Bug Fixes * Fixed a race condition when disable and re-enable manual compaction. * Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled. +* Fixed a race condition when mmaping a WritableFile on POSIX. ## 7.0.3 (03/23/2022) ### Bug Fixes diff --git a/env/fs_posix.cc b/env/fs_posix.cc index db5319615..ab4dead50 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -325,14 +325,7 @@ class PosixFileSystem : public FileSystem { SetFD_CLOEXEC(fd, &options); if (options.use_mmap_writes) { - if (!checkedDiskForMmap_) { - // this will be executed once in the program's lifetime. - // do not use mmapWrite on non ext-3/xfs/tmpfs systems. - if (!SupportsFastAllocate(fname)) { - forceMmapOff_ = true; - } - checkedDiskForMmap_ = true; - } + MaybeForceDisableMmap(fd); } if (options.use_mmap_writes && !forceMmapOff_) { result->reset(new PosixMmapFile(fname, fd, page_size_, options)); @@ -431,14 +424,7 @@ class PosixFileSystem : public FileSystem { } if (options.use_mmap_writes) { - if (!checkedDiskForMmap_) { - // this will be executed once in the program's lifetime. - // do not use mmapWrite on non ext-3/xfs/tmpfs systems. - if (!SupportsFastAllocate(fname)) { - forceMmapOff_ = true; - } - checkedDiskForMmap_ = true; - } + MaybeForceDisableMmap(fd); } if (options.use_mmap_writes && !forceMmapOff_) { result->reset(new PosixMmapFile(fname, fd, page_size_, options)); @@ -999,8 +985,7 @@ class PosixFileSystem : public FileSystem { } #endif private: - bool checkedDiskForMmap_; - bool forceMmapOff_; // do we override Env options? + bool forceMmapOff_ = false; // do we override Env options? // Returns true iff the named directory exists and is a directory. virtual bool DirExists(const std::string& dname) { @@ -1011,10 +996,10 @@ class PosixFileSystem : public FileSystem { return false; // stat() failed return false } - bool SupportsFastAllocate(const std::string& path) { + bool SupportsFastAllocate(int fd) { #ifdef ROCKSDB_FALLOCATE_PRESENT struct statfs s; - if (statfs(path.c_str(), &s)) { + if (fstatfs(fd, &s)) { return false; } switch (s.f_type) { @@ -1028,11 +1013,26 @@ class PosixFileSystem : public FileSystem { return false; } #else - (void)path; + (void)fd; return false; #endif } + void MaybeForceDisableMmap(int fd) { + static std::once_flag s_check_disk_for_mmap_once; + assert(this == FileSystem::Default().get()); + std::call_once( + s_check_disk_for_mmap_once, + [this](int fdesc) { + // this will be executed once in the program's lifetime. + // do not use mmapWrite on non ext-3/xfs/tmpfs systems. 
+ if (!SupportsFastAllocate(fdesc)) { + forceMmapOff_ = true; + } + }, + fd); + } + #ifdef ROCKSDB_IOURING_PRESENT bool IsIOUringEnabled() { if (RocksDbIOUringEnable && RocksDbIOUringEnable()) { @@ -1096,8 +1096,7 @@ size_t PosixFileSystem::GetLogicalBlockSizeForWriteIfNeeded( } PosixFileSystem::PosixFileSystem() - : checkedDiskForMmap_(false), - forceMmapOff_(false), + : forceMmapOff_(false), page_size_(getpagesize()), allow_non_owner_access_(true) { #if defined(ROCKSDB_IOURING_PRESENT) From 7789c45856408831674c3be2e7841e55b6c228eb Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Fri, 18 Mar 2022 13:11:57 -0700 Subject: [PATCH 27/34] Fix assertion error by doing comparison with mutex (#9717) Summary: On CircleCI MacOS instances, we have been seeing the following assertion error: ``` Assertion failed: (alive_log_files_tail_ == alive_log_files_.rbegin()), function WriteToWAL, file /Users/distiller/project/db/db_impl/db_impl_write.cc, line 1213. Received signal 6 (Abort trap: 6) #0 0x1 https://github.com/facebook/rocksdb/issues/1 abort (in libsystem_c.dylib) + 120 https://github.com/facebook/rocksdb/issues/2 err (in libsystem_c.dylib) + 0 https://github.com/facebook/rocksdb/issues/3 rocksdb::DBImpl::WriteToWAL(rocksdb::WriteBatch const&, rocksdb::log::Writer*, unsigned long long*, unsigned long long*, rocksdb::Env::IOPriority, bool, bool) (in librocksdb.7.0.0.dylib) (db_impl_write.cc:1213) https://github.com/facebook/rocksdb/issues/4 rocksdb::DBImpl::WriteToWAL(rocksdb::WriteThread::WriteGroup const&, rocksdb::log::Writer*, unsigned long long*, bool, bool, unsigned long long) (in librocksdb.7.0.0.dylib) (db_impl_write.cc:1251) https://github.com/facebook/rocksdb/issues/5 rocksdb::DBImpl::WriteImpl(rocksdb::WriteOptions const&, rocksdb::WriteBatch*, rocksdb::WriteCallback*, unsigned long long*, unsigned long long, bool, unsigned long long*, unsigned long, rocksdb::PreReleaseCallback*) (in librocksdb.7.0.0.dylib) (db_impl_ rite.cc:421) https://github.com/facebook/rocksdb/issues/6 rocksdb::DBImpl::Write(rocksdb::WriteOptions const&, rocksdb::WriteBatch*) (in librocksdb.7.0.0.dylib) (db_impl_write.cc:109) https://github.com/facebook/rocksdb/issues/7 rocksdb::DB::Put(rocksdb::WriteOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Slice const&, rocksdb::Slice const&, rocksdb::Slice const&) (in librocksdb.7.0.0.dylib) (db_impl_write.cc:2159) https://github.com/facebook/rocksdb/issues/8 rocksdb::DBImpl::Put(rocksdb::WriteOptions const&, rocksdb::ColumnFamilyHandle*, rocksdb::Slice const&, rocksdb::Slice const&, rocksdb::Slice const&) (in librocksdb.7.0.0.dylib) (db_impl_write.cc:37) https://github.com/facebook/rocksdb/issues/9 rocksdb::DB::Put(rocksdb::WriteOptions const&, rocksdb::Slice const&, rocksdb::Slice const&, rocksdb::Slice const&) (in librocksdb.7.0.0.dylib) (db.h:382) https://github.com/facebook/rocksdb/issues/10 rocksdb::DBBasicTestWithTimestampPrefixSeek_IterateWithPrefix_Test::TestBody() (in db_with_timestamp_basic_test) (db_with_timestamp_basic_test.cc:2926) https://github.com/facebook/rocksdb/issues/11 void testing::internal::HandleSehExceptionsInMethodIfSupported(testing::Test*, void (testing::Test::*)(), char const*) (in db_with_timestamp_basic_test) (gtest-all.cc:3899) https://github.com/facebook/rocksdb/issues/12 void testing::internal::HandleExceptionsInMethodIfSupported(testing::Test*, void (testing::Test::*)(), char const*) (in db_with_timestamp_basic_test) (gtest-all.cc:3935) https://github.com/facebook/rocksdb/issues/13 testing::Test::Run() (in 
db_with_timestamp_basic_test) (gtest-all.cc:3980) https://github.com/facebook/rocksdb/issues/14 testing::TestInfo::Run() (in db_with_timestamp_basic_test) (gtest-all.cc:4153) https://github.com/facebook/rocksdb/issues/15 testing::TestCase::Run() (in db_with_timestamp_basic_test) (gtest-all.cc:4266) https://github.com/facebook/rocksdb/issues/16 testing::internal::UnitTestImpl::RunAllTests() (in db_with_timestamp_basic_test) (gtest-all.cc:6632) https://github.com/facebook/rocksdb/issues/17 bool testing::internal::HandleSehExceptionsInMethodIfSupported(testing::internal::UnitTestImpl*, bool (testing::internal::UnitTestImpl::*)(), char const*) (in db_with_timestamp_basic_test) (gtest-all.cc:3899) https://github.com/facebook/rocksdb/issues/18 bool testing::internal::HandleExceptionsInMethodIfSupported(testing::internal::UnitTestImpl*, bool (testing::internal::UnitTestImpl::*)(), char const*) (in db_with_timestamp_basic_test) (gtest-all.cc:3935) https://github.com/facebook/rocksdb/issues/19 testing::UnitTest::Run() (in db_with_timestamp_basic_test) (gtest-all.cc:6242) https://github.com/facebook/rocksdb/issues/20 RUN_ALL_TESTS() (in db_with_timestamp_basic_test) (gtest.h:22110) https://github.com/facebook/rocksdb/issues/21 main (in db_with_timestamp_basic_test) (db_with_timestamp_basic_test.cc:3150) https://github.com/facebook/rocksdb/issues/22 start (in libdyld.dylib) + 1 ``` It's likely caused by concurrent, unprotected access to the deque, even though `back()` is never popped, and we are comparing `rbegin()` with a cached `riterator`. To be safe, do the comparison only if we have mutex. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9717 Test Plan: One example Ssh to one CircleCI MacOS instance. ``` gtest-parallel -r 1000 -w 8 ./db_test --gtest_filter=DBTest.FlushesInParallelWithCompactRange ``` Reviewed By: pdillinger Differential Revision: D34990696 Pulled By: riversand963 fbshipit-source-id: 62dd48ae6fedbda53d0a64d73de9b948b4c26eee --- db/db_impl/db_impl_write.cc | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 0975ae8bf..48252d331 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -1181,18 +1181,8 @@ IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch, *log_used = logfile_number_; } total_log_size_ += log_entry.size(); -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) if (with_db_mutex || with_log_mutex) { -#endif // __has_feature(thread_sanitizer) -#endif // defined(__has_feature) assert(alive_log_files_tail_ == alive_log_files_.rbegin()); -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) - } -#endif // __has_feature(thread_sanitizer) -#endif // defined(__has_feature) - if (with_db_mutex || with_log_mutex) { assert(alive_log_files_tail_ != alive_log_files_.rend()); } LogFileNumberSize& last_alive_log = *alive_log_files_tail_; From fdd0e3a8755cb7003c9fd94447910dd45007fec2 Mon Sep 17 00:00:00 2001 From: duyuqi Date: Mon, 21 Mar 2022 12:04:33 -0700 Subject: [PATCH 28/34] =?UTF-8?q?fix=20a=20bug,=20c=20api,=20if=20enable?= =?UTF-8?q?=20inplace=5Fupdate=5Fsupport,=20and=20use=20create=20sn?= =?UTF-8?q?=E2=80=A6=20(#9471)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: c api release snapshot will core dump when enable inplace_update_support and create snapshot Pull Request resolved: https://github.com/facebook/rocksdb/pull/9471 Reviewed By: akankshamahajan15 Differential Revision: 
D34965103 Pulled By: riversand963 fbshipit-source-id: c3aeeb9ea7126c2eda1466102794fecf57b6ab77 --- db/c_test.c | 42 ++++++++++++++++++++++++++++++------ db/db_impl/db_impl.cc | 6 ++++++ db/db_inplace_update_test.cc | 30 ++++++++++++++++++++++++++ include/rocksdb/db.h | 2 +- 4 files changed, 72 insertions(+), 8 deletions(-) diff --git a/db/c_test.c b/db/c_test.c index 044bbfd1b..980b5dd99 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -7,12 +7,13 @@ #ifndef ROCKSDB_LITE // Lite does not support C API -#include "rocksdb/c.h" - +#include <assert.h> #include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> + +#include "rocksdb/c.h" #ifndef OS_WIN #include <unistd.h> #endif @@ -89,10 +90,8 @@ static void CheckEqual(const char* expected, const char* v, size_t n) { // ok return; } else { - fprintf(stderr, "%s: expected '%s', got '%s'\n", - phase, - (expected ? expected : "(null)"), - (v ? v : "(null")); + fprintf(stderr, "%s: expected '%s', got '%s'\n", phase, + (expected ? expected : "(null)"), (v ? v : "(null)")); abort(); } } @@ -989,7 +988,36 @@ int main(int argc, char** argv) { CheckGet(db, roptions, "foo", NULL); rocksdb_release_snapshot(db, snap); } - + StartPhase("snapshot_with_memtable_inplace_update"); + { + rocksdb_close(db); + const rocksdb_snapshot_t* snap = NULL; + const char* s_key = "foo_snap"; + const char* value1 = "hello_s1"; + const char* value2 = "hello_s2"; + rocksdb_options_set_allow_concurrent_memtable_write(options, 0); + rocksdb_options_set_inplace_update_support(options, 1); + rocksdb_options_set_error_if_exists(options, 0); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_put(db, woptions, s_key, 8, value1, 8, &err); + snap = rocksdb_create_snapshot(db); + assert(snap != NULL); + rocksdb_put(db, woptions, s_key, 8, value2, 8, &err); + CheckNoError(err); + rocksdb_readoptions_set_snapshot(roptions, snap); + CheckGet(db, roptions, "foo", NULL); + // snapshot syntax is invalid, because of inplace update supported is set + CheckGet(db, roptions, s_key, value2); + // restore the data and options + rocksdb_delete(db, woptions, s_key, 8, &err); + CheckGet(db, roptions, s_key, NULL); + rocksdb_release_snapshot(db, snap); + rocksdb_readoptions_set_snapshot(roptions, NULL); + rocksdb_options_set_inplace_update_support(options, 0); + rocksdb_options_set_allow_concurrent_memtable_write(options, 1); + rocksdb_options_set_error_if_exists(options, 1); + } StartPhase("repair"); { // If we do not compact here, then the lazy deletion of diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index fb16122dd..2d95d493a 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -3207,6 +3207,12 @@ bool CfdListContains(const CfdList& list, ColumnFamilyData* cfd) { } // namespace void DBImpl::ReleaseSnapshot(const Snapshot* s) { + if (s == nullptr) { + // DBImpl::GetSnapshot() can return nullptr when snapshot + // not supported by specifying the condition: + // inplace_update_support enabled.
+ return; + } const SnapshotImpl* casted_s = reinterpret_cast(s); { InstrumentedMutexLock l(&mutex_); diff --git a/db/db_inplace_update_test.cc b/db/db_inplace_update_test.cc index 450782c89..2012cabf1 100644 --- a/db/db_inplace_update_test.cc +++ b/db/db_inplace_update_test.cc @@ -169,6 +169,36 @@ TEST_F(DBTestInPlaceUpdate, InPlaceUpdateCallbackNoAction) { ASSERT_EQ(Get(1, "key"), "NOT_FOUND"); } while (ChangeCompactOptions()); } + +TEST_F(DBTestInPlaceUpdate, InPlaceUpdateAndSnapshot) { + do { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.inplace_update_support = true; + options.env = env_; + options.write_buffer_size = 100000; + options.allow_concurrent_memtable_write = false; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + // Update key with values of smaller size, and + // run GetSnapshot and ReleaseSnapshot + int numValues = 2; + for (int i = numValues; i > 0; i--) { + const Snapshot* s = db_->GetSnapshot(); + ASSERT_EQ(nullptr, s); + std::string value = DummyString(i, 'a'); + ASSERT_OK(Put(1, "key", value)); + ASSERT_EQ(value, Get(1, "key")); + // release s (nullptr) + db_->ReleaseSnapshot(s); + } + + // Only 1 instance for that key. + validateNumberOfEntries(1, 1); + } while (ChangeCompactOptions()); +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index abd625746..10418eacb 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -780,7 +780,7 @@ class DB { // snapshot is no longer needed. // // nullptr will be returned if the DB fails to take a snapshot or does - // not support snapshot. + // not support snapshot (eg: inplace_update_support enabled). virtual const Snapshot* GetSnapshot() = 0; // Release a previously acquired snapshot. The caller must not From 6733f75086e79d811fb643260e063776093eaffb Mon Sep 17 00:00:00 2001 From: KNOEEE <1215892778@qq.com> Date: Mon, 21 Mar 2022 16:11:02 -0700 Subject: [PATCH 29/34] Fix a bug in PosixClock (#9695) Summary: Multiplier here should be 1e6 to get microseconds. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9695 Reviewed By: ajkr Differential Revision: D34897086 Pulled By: jay-zhuang fbshipit-source-id: 9c1d0811ea740ba0a007edc2da199edbd000b88b --- db/compaction/compaction_job.cc | 4 ++-- db/flush_job.cc | 8 ++++---- env/env_posix.cc | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 35ad06bac..32e8d06bc 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -1312,7 +1312,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } #endif // !ROCKSDB_LITE - uint64_t prev_cpu_micros = db_options_.clock->CPUNanos() / 1000; + uint64_t prev_cpu_micros = db_options_.clock->CPUMicros(); ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); @@ -1659,7 +1659,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } sub_compact->compaction_job_stats.cpu_micros = - db_options_.clock->CPUNanos() / 1000 - prev_cpu_micros; + db_options_.clock->CPUMicros() - prev_cpu_micros; if (measure_io_stats_) { sub_compact->compaction_job_stats.file_write_nanos += diff --git a/db/flush_job.cc b/db/flush_job.cc index 877b76025..9da6e69ca 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -354,7 +354,7 @@ Status FlushJob::MemPurge() { // Measure purging time. 
const uint64_t start_micros = clock_->NowMicros(); - const uint64_t start_cpu_micros = clock_->CPUNanos() / 1000; + const uint64_t start_cpu_micros = clock_->CPUMicros(); MemTable* new_mem = nullptr; // For performance/log investigation purposes: @@ -606,7 +606,7 @@ Status FlushJob::MemPurge() { TEST_SYNC_POINT("DBImpl::FlushJob:MemPurgeUnsuccessful"); } const uint64_t micros = clock_->NowMicros() - start_micros; - const uint64_t cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros; + const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros; ROCKS_LOG_INFO(db_options_.info_log, "[%s] [JOB %d] Mempurge lasted %" PRIu64 " microseconds, and %" PRIu64 @@ -792,7 +792,7 @@ Status FlushJob::WriteLevel0Table() { ThreadStatus::STAGE_FLUSH_WRITE_L0); db_mutex_->AssertHeld(); const uint64_t start_micros = clock_->NowMicros(); - const uint64_t start_cpu_micros = clock_->CPUNanos() / 1000; + const uint64_t start_cpu_micros = clock_->CPUMicros(); Status s; std::vector<BlobFileAddition> blob_file_additions; @@ -979,7 +979,7 @@ Status FlushJob::WriteLevel0Table() { // Note that here we treat flush as level 0 compaction in internal stats InternalStats::CompactionStats stats(CompactionReason::kFlush, 1); const uint64_t micros = clock_->NowMicros() - start_micros; - const uint64_t cpu_micros = clock_->CPUNanos() / 1000 - start_cpu_micros; + const uint64_t cpu_micros = clock_->CPUMicros() - start_cpu_micros; stats.micros = micros; stats.cpu_micros = cpu_micros; diff --git a/env/env_posix.cc b/env/env_posix.cc index ca54b01ca..609d169f2 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -166,7 +166,7 @@ class PosixClock : public SystemClock { defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) struct timespec ts; clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); - return static_cast<uint64_t>(ts.tv_sec) * 1000000000; + return (static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000; #endif return 0; } From 2de3bfc7d12ce29527c5221e607ab72c477a0332 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Wed, 23 Mar 2022 19:41:31 -0700 Subject: [PATCH 30/34] Fix a race condition in WAL tracking causing DB open failure (#9715) Summary: There is a race condition if WAL tracking in the MANIFEST is enabled in a database that disables 2PC. The race condition is between two background flush threads trying to install flush results to the MANIFEST. Consider an example database with two column families: "default" (cfd0) and "cf1" (cfd1). Initially, both column families have one mutable (active) memtable whose data is backed by 6.log.
1. Trigger a manual flush for "cf1", creating a 7.log
2. Insert another key to "default", and trigger flush for "default", creating 8.log
3. BgFlushThread1 finishes writing 9.sst
4. BgFlushThread2 finishes writing 10.sst
```
Time  BgFlushThread1                             BgFlushThread2
 |    mutex_.Lock()
 |    precompute min_wal_to_keep as 6
 |    mutex_.Unlock()
 |                                               mutex_.Lock()
 |                                               precompute min_wal_to_keep as 6
 |                                               join MANIFEST write queue and mutex_.Unlock()
 |    write to MANIFEST
 |    mutex_.Lock()
 |    cfd1->log_number = 7
 |    Signal bg_flush_2 and mutex_.Unlock()
 |                                               wake up and mutex_.Lock()
 |                                               cfd0->log_number = 8
 |                                               FindObsoleteFiles() with job_context->log_number == 7
 |                                               mutex_.Unlock()
 |                                               PurgeObsoleteFiles() deletes 6.log
 V
```
As shown in the above, BgFlushThread2 thinks that the min wal to keep is 6.log because "cf1" has unflushed data in 6.log (cf1.log_number=6). Similarly, BgFlushThread1 thinks that the min wal to keep is also 6.log because "default" has unflushed data (default.log_number=6).
No WAL deletion will be written to MANIFEST because 6 is equal to `versions_->wals_.min_wal_number_to_keep`, due to https://github.com/facebook/rocksdb/blob/7.1.fb/db/memtable_list.cc#L513:L514. The bg flush thread that finishes last will perform file purging. `job_context.log_number` will be evaluated as 7, i.e. the min wal that contains unflushed data, causing 6.log to be deleted. However, MANIFEST thinks 6.log should still exist. If you close the db at this point, you won't be able to re-open it if `track_and_verify_wal_in_manifest` is true. We must handle the case of multiple bg flush threads, and it is difficult for one bg flush thread to know the correct min wal number until the other bg flush threads have finished committing to the manifest and updated the `cfd::log_number`. To fix this issue, we rename an existing variable `min_log_number_to_keep_2pc` to `min_log_number_to_keep`, and use it to track WAL file deletion in non-2pc mode as well. This variable is updated only 1) during recovery with mutex held, or 2) in the MANIFEST write thread. `min_log_number_to_keep` means RocksDB will delete WALs below it, although there may be WALs above it which are also obsolete. Formally, we will have [min_wal_to_keep, max_obsolete_wal]. During recovery, we make sure that only WALs above max_obsolete_wal are checked and added back to `alive_log_files_`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9715 Test Plan: ``` make check ``` Also ran stress test below (with asan) to make sure it completes successfully. ``` TEST_TMPDIR=/dev/shm/rocksdb OPT=-g ASAN_OPTIONS=disable_coredump=0 \ CRASH_TEST_EXT_ARGS=--compression_type=zstd SKIP_FORMAT_BUCK_CHECKS=1 \ make J=52 -j52 blackbox_asan_crash_test ``` Reviewed By: ltamasi Differential Revision: D34984412 Pulled By: riversand963 fbshipit-source-id: c7b21a8d84751bb55ea79c9f387103d21b231005 --- HISTORY.md | 1 + db/db_impl/db_impl_files.cc | 8 +--- db/db_impl/db_impl_open.cc | 25 +++++++++-- db/db_wal_test.cc | 87 +++++++++++++++++++++++++++++++++++++ db/event_helpers.cc | 5 ++- db/memtable_list.cc | 26 ++++++----- db/version_edit_handler.cc | 7 ++- db/version_set.cc | 15 +++---- db/version_set.h | 15 ++++--- db/version_set_test.cc | 5 ++- 10 files changed, 150 insertions(+), 44 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 09a8c840d..f18d0a3f9 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,7 @@ * Fixed a race condition when disable and re-enable manual compaction. * Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled. * Fixed a race condition when mmaping a WritableFile on POSIX. +* Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail. 
## 7.0.3 (03/23/2022) ### Bug Fixes diff --git a/db/db_impl/db_impl_files.cc b/db/db_impl/db_impl_files.cc index b50380d92..1790ed836 100644 --- a/db/db_impl/db_impl_files.cc +++ b/db/db_impl/db_impl_files.cc @@ -23,11 +23,7 @@ namespace ROCKSDB_NAMESPACE { uint64_t DBImpl::MinLogNumberToKeep() { - if (allow_2pc()) { - return versions_->min_log_number_to_keep_2pc(); - } else { - return versions_->MinLogNumberWithUnflushedData(); - } + return versions_->min_log_number_to_keep(); } uint64_t DBImpl::MinObsoleteSstNumberToKeep() { @@ -224,7 +220,6 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, } // Add log files in wal_dir - if (!immutable_db_options_.IsWalDirSameAsDBPath(dbname_)) { std::vector<std::string> log_files; Status s = env_->GetChildren(immutable_db_options_.wal_dir, &log_files); @@ -234,6 +229,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, log_file, immutable_db_options_.wal_dir); } } + // Add info log files in db_log_dir if (!immutable_db_options_.db_log_dir.empty() && immutable_db_options_.db_log_dir != dbname_) { diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 5cb6beeb6..e7e8ad5e1 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -866,6 +866,11 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers, bool flushed = false; uint64_t corrupted_wal_number = kMaxSequenceNumber; uint64_t min_wal_number = MinLogNumberToKeep(); + if (!allow_2pc()) { + // In non-2pc mode, we skip WALs that do not back unflushed data. + min_wal_number = + std::max(min_wal_number, versions_->MinLogNumberWithUnflushedData()); + } for (auto wal_number : wal_numbers) { if (wal_number < min_wal_number) { ROCKS_LOG_INFO(immutable_db_options_.info_log, @@ -1270,9 +1275,16 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers, } std::unique_ptr<VersionEdit> wal_deletion; - if (immutable_db_options_.track_and_verify_wals_in_manifest) { - wal_deletion.reset(new VersionEdit); - wal_deletion->DeleteWalsBefore(max_wal_number + 1); + if (flushed) { + wal_deletion = std::make_unique<VersionEdit>(); + if (immutable_db_options_.track_and_verify_wals_in_manifest) { + wal_deletion->DeleteWalsBefore(max_wal_number + 1); + } + if (!allow_2pc()) { + // In non-2pc mode, flushing the memtables of the column families + // means we can advance min_log_number_to_keep. + wal_deletion->SetMinLogNumberToKeep(max_wal_number + 1); + } edit_lists.back().push_back(wal_deletion.get()); } @@ -1351,7 +1363,14 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) { // FindObsoleteFiles() total_log_size_ = 0; log_empty_ = false; + uint64_t min_wal_with_unflushed_data = + versions_->MinLogNumberWithUnflushedData(); for (auto wal_number : wal_numbers) { + if (!allow_2pc() && wal_number < min_wal_with_unflushed_data) { + // In non-2pc mode, the WAL files not backing unflushed data are not + // alive, thus should not be added to the alive_log_files_. + continue; + } // We preallocate space for wals, but then after a crash and restart, those // preallocated space are not needed anymore. It is likely only the last // log has such preallocated space, so we only truncate for the last log.
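Before the regression test below, here is a minimal sketch, with invented names, of the "advance-only" watermark discipline the patch adopts (in RocksDB proper this is `VersionSet::MarkMinLogNumberToKeep` acting on `min_log_number_to_keep_`): the value moves only forward, and only 1) during single-threaded recovery, or 2) in the MANIFEST write thread, so a stale minimum computed by a lagging flush thread can never drag the watermark backward.
```
#include <atomic>
#include <cstdint>

// Sketch only: WalWatermark is an invented stand-in for the VersionSet
// bookkeeping, not a RocksDB class.
class WalWatermark {
 public:
  // REQUIRES: called from single-threaded recovery or from the MANIFEST
  // write thread, mirroring the requirement on MarkMinLogNumberToKeep().
  void Advance(uint64_t number) {
    // Monotonic update: a stale, smaller candidate is simply ignored.
    if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) {
      min_log_number_to_keep_.store(number, std::memory_order_relaxed);
    }
  }
  // WALs numbered strictly below this value are safe to delete.
  uint64_t Get() const { return min_log_number_to_keep_.load(); }

 private:
  std::atomic<uint64_t> min_log_number_to_keep_{0};
};
```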
diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 8a61dadaf..f2b94ed07 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -1491,6 +1491,93 @@ TEST_F(DBWALTest, kPointInTimeRecoveryCFConsistency) { ASSERT_NOK(TryReopenWithColumnFamilies({"default", "one", "two"}, options)); } +TEST_F(DBWALTest, RaceInstallFlushResultsWithWalObsoletion) { + Options options = CurrentOptions(); + options.env = env_; + options.track_and_verify_wals_in_manifest = true; + // The following make sure there are two bg flush threads. + options.max_background_jobs = 8; + + const std::string cf1_name("cf1"); + CreateAndReopenWithCF({cf1_name}, options); + assert(handles_.size() == 2); + + { + dbfull()->TEST_LockMutex(); + ASSERT_LE(2, dbfull()->GetBGJobLimits().max_flushes); + dbfull()->TEST_UnlockMutex(); + } + + ASSERT_OK(dbfull()->PauseBackgroundWork()); + + ASSERT_OK(db_->Put(WriteOptions(), handles_[1], "foo", "value")); + ASSERT_OK(db_->Put(WriteOptions(), "foo", "value")); + + ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[1])); + + ASSERT_OK(db_->Put(WriteOptions(), "foo", "value")); + ASSERT_OK(dbfull()->TEST_FlushMemTable(false, true, handles_[0])); + + bool called = false; + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + // This callback will be called when the first bg flush thread reaches the + // point before entering the MANIFEST write queue after flushing the SST + // file. + // The purpose of the sync points here is to ensure both bg flush threads + // finish computing `min_wal_number_to_keep` before any of them updates the + // `log_number` for the column family that's being flushed. + SyncPoint::GetInstance()->SetCallBack( + "MemTableList::TryInstallMemtableFlushResults:AfterComputeMinWalToKeep", + [&](void* /*arg*/) { + dbfull()->mutex()->AssertHeld(); + if (!called) { + // We are the first bg flush thread in the MANIFEST write queue. + // We set up the dependency between sync points for two threads that + // will be executing the same code. + // For the interleaving of events, see + // https://github.com/facebook/rocksdb/pull/9715. + // bg flush thread1 will release the db mutex while in the MANIFEST + // write queue. In the meantime, bg flush thread2 locks db mutex and + // computes the min_wal_number_to_keep (before thread1 writes to + // MANIFEST thus before cf1->log_number is updated). Bg thread2 joins + // the MANIFEST write queue afterwards and bg flush thread1 proceeds + // with writing to MANIFEST. + called = true; + SyncPoint::GetInstance()->LoadDependency({ + {"VersionSet::LogAndApply:WriteManifestStart", + "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"}, + {"DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2", + "VersionSet::LogAndApply:WriteManifest"}, + }); + } else { + // The other bg flush thread has already been in the MANIFEST write + // queue, and we are after. 
+ TEST_SYNC_POINT( + "DBWALTest::RaceInstallFlushResultsWithWalObsoletion:BgFlush2"); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + ASSERT_OK(dbfull()->ContinueBackgroundWork()); + + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); + ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); + + ASSERT_TRUE(called); + + Close(); + + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + + DB* db1 = nullptr; + Status s = DB::OpenForReadOnly(options, dbname_, &db1); + ASSERT_OK(s); + assert(db1); + delete db1; +} + // Test scope: // - We expect to open data store under all circumstances // - We expect only data upto the point where the first error was encountered diff --git a/db/event_helpers.cc b/db/event_helpers.cc index 88bf8cc69..3ec0e8da1 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -95,8 +95,9 @@ void EventHelpers::LogAndNotifyTableFileCreationFinished( jwriter << "cf_name" << cf_name << "job" << job_id << "event" << "table_file_creation" << "file_number" << fd.GetNumber() << "file_size" - << fd.GetFileSize() << "file_checksum" << file_checksum - << "file_checksum_func_name" << file_checksum_func_name; + << fd.GetFileSize() << "file_checksum" + << Slice(file_checksum).ToString(true) << "file_checksum_func_name" + << file_checksum_func_name; // table_properties { diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 0955be675..bd186441d 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -494,8 +494,8 @@ Status MemTableList::TryInstallMemtableFlushResults( // TODO(myabandeh): Not sure how batch_count could be 0 here. if (batch_count > 0) { uint64_t min_wal_number_to_keep = 0; + assert(edit_list.size() > 0); if (vset->db_options()->allow_2pc) { - assert(edit_list.size() > 0); // Note that if mempurge is successful, the edit_list will // not be applicable (contains info of new min_log number to keep, // and level 0 file path of SST file created during normal flush, @@ -504,23 +504,26 @@ Status MemTableList::TryInstallMemtableFlushResults( min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( vset, *cfd, edit_list, memtables_to_flush, prep_tracker); - // We piggyback the information of earliest log file to keep in the + // We piggyback the information of earliest log file to keep in the // manifest entry for the last file flushed. 
- edit_list.back()->SetMinLogNumberToKeep(min_wal_number_to_keep); + } else { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list); } + edit_list.back()->SetMinLogNumberToKeep(min_wal_number_to_keep); std::unique_ptr wal_deletion; if (vset->db_options()->track_and_verify_wals_in_manifest) { - if (!vset->db_options()->allow_2pc) { - min_wal_number_to_keep = - PrecomputeMinLogNumberToKeepNon2PC(vset, *cfd, edit_list); - } if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) { wal_deletion.reset(new VersionEdit); wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); edit_list.push_back(wal_deletion.get()); } + TEST_SYNC_POINT_CALLBACK( + "MemTableList::TryInstallMemtableFlushResults:" + "AfterComputeMinWalToKeep", + nullptr); } const auto manifest_write_cb = [this, cfd, batch_count, log_buffer, @@ -805,15 +808,14 @@ Status InstallMemtableAtomicFlushResults( if (vset->db_options()->allow_2pc) { min_wal_number_to_keep = PrecomputeMinLogNumberToKeep2PC( vset, cfds, edit_lists, mems_list, prep_tracker); - edit_lists.back().back()->SetMinLogNumberToKeep(min_wal_number_to_keep); + } else { + min_wal_number_to_keep = + PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists); } + edit_lists.back().back()->SetMinLogNumberToKeep(min_wal_number_to_keep); std::unique_ptr wal_deletion; if (vset->db_options()->track_and_verify_wals_in_manifest) { - if (!vset->db_options()->allow_2pc) { - min_wal_number_to_keep = - PrecomputeMinLogNumberToKeepNon2PC(vset, cfds, edit_lists); - } if (min_wal_number_to_keep > vset->GetWalSet().GetMinWalNumberToKeep()) { wal_deletion.reset(new VersionEdit); wal_deletion->DeleteWalsBefore(min_wal_number_to_keep); diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 208527fb8..f3e24016b 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -394,7 +394,7 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, if (s->ok()) { version_set_->GetColumnFamilySet()->UpdateMaxColumnFamily( version_edit_params_.max_column_family_); - version_set_->MarkMinLogNumberToKeep2PC( + version_set_->MarkMinLogNumberToKeep( version_edit_params_.min_log_number_to_keep_); version_set_->MarkFileNumberUsed(version_edit_params_.prev_log_number_); version_set_->MarkFileNumberUsed(version_edit_params_.log_number_); @@ -970,12 +970,11 @@ void DumpManifestHandler::CheckIterationResult(const log::Reader& reader, fprintf(stdout, "next_file_number %" PRIu64 " last_sequence %" PRIu64 " prev_log_number %" PRIu64 " max_column_family %" PRIu32 - " min_log_number_to_keep " - "%" PRIu64 "\n", + " min_log_number_to_keep %" PRIu64 "\n", version_set_->current_next_file_number(), version_set_->LastSequence(), version_set_->prev_log_number(), version_set_->column_family_set_->GetMaxColumnFamily(), - version_set_->min_log_number_to_keep_2pc()); + version_set_->min_log_number_to_keep()); } } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_set.cc b/db/version_set.cc index daf1d9145..8c6c9fce1 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -4141,7 +4141,7 @@ void VersionSet::Reset() { } db_id_.clear(); next_file_number_.store(2); - min_log_number_to_keep_2pc_.store(0); + min_log_number_to_keep_.store(0); manifest_file_number_ = 0; options_file_number_ = 0; pending_manifest_file_number_ = 0; @@ -4610,8 +4610,7 @@ Status VersionSet::ProcessManifestWrites( } if (last_min_log_number_to_keep != 0) { - // Should only be set in 2PC mode. 
- MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep); + MarkMinLogNumberToKeep(last_min_log_number_to_keep); } for (int i = 0; i < static_cast(versions.size()); ++i) { @@ -4965,7 +4964,7 @@ Status VersionSet::Recover( ",min_log_number_to_keep is %" PRIu64 "\n", manifest_path.c_str(), manifest_file_number_, next_file_number_.load(), last_sequence_.load(), log_number, prev_log_number_, - column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep_2pc()); + column_family_set_->GetMaxColumnFamily(), min_log_number_to_keep()); for (auto cfd : *column_family_set_) { if (cfd->IsDropped()) { @@ -5378,9 +5377,9 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { } // Called only either from ::LogAndApply which is protected by mutex or during // recovery which is single-threaded. -void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { - if (min_log_number_to_keep_2pc_.load(std::memory_order_relaxed) < number) { - min_log_number_to_keep_2pc_.store(number, std::memory_order_relaxed); +void VersionSet::MarkMinLogNumberToKeep(uint64_t number) { + if (min_log_number_to_keep_.load(std::memory_order_relaxed) < number) { + min_log_number_to_keep_.store(number, std::memory_order_relaxed); } } @@ -5506,7 +5505,7 @@ Status VersionSet::WriteCurrentStateToManifest( // min_log_number_to_keep is for the whole db, not for specific column family. // So it does not need to be set for every column family, just need to be set once. // Since default CF can never be dropped, we set the min_log to the default CF here. - uint64_t min_log = min_log_number_to_keep_2pc(); + uint64_t min_log = min_log_number_to_keep(); if (min_log != 0) { edit.SetMinLogNumberToKeep(min_log); } diff --git a/db/version_set.h b/db/version_set.h index 38e6616cf..f30cc0b76 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1128,8 +1128,8 @@ class VersionSet { uint64_t current_next_file_number() const { return next_file_number_.load(); } - uint64_t min_log_number_to_keep_2pc() const { - return min_log_number_to_keep_2pc_.load(); + uint64_t min_log_number_to_keep() const { + return min_log_number_to_keep_.load(); } // Allocate and return a new file number @@ -1187,7 +1187,7 @@ class VersionSet { // Mark the specified log number as deleted // REQUIRED: this is only called during single-threaded recovery or repair, or // from ::LogAndApply where the global mutex is held. - void MarkMinLogNumberToKeep2PC(uint64_t number); + void MarkMinLogNumberToKeep(uint64_t number); // Return the log file number for the log file that is currently // being compacted, or zero if there is no such log file. @@ -1196,10 +1196,12 @@ class VersionSet { // Returns the minimum log number which still has data not flushed to any SST // file. // In non-2PC mode, all the log numbers smaller than this number can be safely - // deleted. + // deleted, although we still use `min_log_number_to_keep_` to determine when + // to delete a WAL file. uint64_t MinLogNumberWithUnflushedData() const { return PreComputeMinLogNumberWithUnflushedData(nullptr); } + // Returns the minimum log number which still has data not flushed to any SST // file. // Empty column families' log number is considered to be @@ -1399,9 +1401,8 @@ class VersionSet { const ImmutableDBOptions* const db_options_; std::atomic next_file_number_; // Any WAL number smaller than this should be ignored during recovery, - // and is qualified for being deleted in 2PC mode. In non-2PC mode, this - // number is ignored. 
- std::atomic<uint64_t> min_log_number_to_keep_2pc_ = {0}; + // and is qualified for being deleted. + std::atomic<uint64_t> min_log_number_to_keep_ = {0}; uint64_t manifest_file_number_; uint64_t options_file_number_; uint64_t options_file_size_; diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 422afaf49..c48547508 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -3424,6 +3424,7 @@ TEST_F(VersionSetTestMissingFiles, NoFileMissing) { } TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) { + db_options_.allow_2pc = true; NewDB(); SstInfo sst(100, kDefaultColumnFamilyName, "a"); @@ -3435,12 +3436,12 @@ edit.AddFile(0, file_metas[0]); edit.SetMinLogNumberToKeep(kMinWalNumberToKeep2PC); ASSERT_OK(LogAndApplyToDefaultCF(edit)); - ASSERT_EQ(versions_->min_log_number_to_keep_2pc(), kMinWalNumberToKeep2PC); + ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC); for (int i = 0; i < 3; i++) { CreateNewManifest(); ReopenDB(); - ASSERT_EQ(versions_->min_log_number_to_keep_2pc(), kMinWalNumberToKeep2PC); + ASSERT_EQ(versions_->min_log_number_to_keep(), kMinWalNumberToKeep2PC); } } From 8b2cfef9e017068519442e264a1757b833bcc9be Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Thu, 24 Mar 2022 13:05:17 -0700 Subject: [PATCH 31/34] Fix heap use-after-free race with DropColumnFamily (#9730) Summary: Although ColumnFamilySet comments say that DB mutex can be freed during iteration, as long as you hold a ref while releasing DB mutex, this is not quite true because UnrefAndTryDelete might delete cfd right before it is needed to get ->next_ for the next iteration of the loop. This change solves the problem by making a wrapper class that makes such iteration easier while handling the tricky details of UnrefAndTryDelete on the previous cfd only after getting next_ in operator++. FreeDeadColumnFamilies should already have been obsolete; this removes it for good. Similarly, ColumnFamilySet::iterator doesn't need to check for cfd with 0 refs, because those are immediately deleted. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9730 Test Plan: was reported with ASAN on unit tests like DBLogicalBlockSizeCacheTest.CreateColumnFamily (very rare); keep watching Reviewed By: ltamasi Differential Revision: D35038143 Pulled By: pdillinger fbshipit-source-id: 0a5478d5be96c135343a00603711b7df43ae19c9 --- HISTORY.md | 1 + db/column_family.cc | 14 ----- db/column_family.h | 73 ++++++++++++++++++++------ db/db_filesnapshot.cc | 5 +- db/db_impl/db_impl.cc | 31 ++++------- db/db_impl/db_impl_compaction_flush.cc | 2 - db/version_set.h | 4 ++ 7 files changed, 71 insertions(+), 59 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index f18d0a3f9..34f20138e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,7 @@ * Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. The race condition will be caught if `__glibcxx_requires_nonempty` is enabled. * Fixed a race condition when mmaping a WritableFile on POSIX. * Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail. +* Fixed a heap use-after-free race with DropColumnFamily.
## 7.0.3 (03/23/2022) ### Bug Fixes diff --git a/db/column_family.cc b/db/column_family.cc index a2885515c..c89ed24c0 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1562,20 +1562,6 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( return new_cfd; } -// REQUIRES: DB mutex held -void ColumnFamilySet::FreeDeadColumnFamilies() { - autovector to_delete; - for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) { - if (cfd->refs_.load(std::memory_order_relaxed) == 0) { - to_delete.push_back(cfd); - } - } - for (auto cfd : to_delete) { - // this is very rare, so it's not a problem that we do it under a mutex - delete cfd; - } -} - // under a DB mutex AND from a write thread void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) { auto cfd_iter = column_family_data_.find(cfd->GetID()); diff --git a/db/column_family.h b/db/column_family.h index a28f5d7e9..4de70290d 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -519,9 +519,10 @@ class ColumnFamilyData { ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); } WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; } + static const uint32_t kDummyColumnFamilyDataId; + private: friend class ColumnFamilySet; - static const uint32_t kDummyColumnFamilyDataId; ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, WriteBufferManager* write_buffer_manager, @@ -627,10 +628,8 @@ class ColumnFamilyData { // held and it needs to be executed from the write thread. SetDropped() also // guarantees that it will be called only from single-threaded LogAndApply(), // but this condition is not that important. -// * Iteration -- hold DB mutex, but you can release it in the body of -// iteration. If you release DB mutex in body, reference the column -// family before the mutex and unreference after you unlock, since the column -// family might get dropped when the DB mutex is released +// * Iteration -- hold DB mutex. If you want to release the DB mutex in the +// body of the iteration, wrap in a RefedColumnFamilySet. // * GetDefault() -- thread safe // * GetColumnFamily() -- either inside of DB mutex or from a write thread // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), @@ -642,17 +641,12 @@ class ColumnFamilySet { public: explicit iterator(ColumnFamilyData* cfd) : current_(cfd) {} + // NOTE: minimum operators for for-loop iteration iterator& operator++() { - // dropped column families might still be included in this iteration - // (we're only removing them when client drops the last reference to the - // column family). 
- // dummy is never dead, so this will never be infinite - do { - current_ = current_->next_; - } while (current_->refs_.load(std::memory_order_relaxed) == 0); + current_ = current_->next_; return *this; } - bool operator!=(const iterator& other) { + bool operator!=(const iterator& other) const { return this->current_ != other.current_; } ColumnFamilyData* operator*() { return current_; } @@ -691,10 +685,6 @@ class ColumnFamilySet { iterator begin() { return iterator(dummy_cfd_->next_); } iterator end() { return iterator(dummy_cfd_); } - // REQUIRES: DB mutex held - // Don't call while iterating over ColumnFamilySet - void FreeDeadColumnFamilies(); - Cache* get_table_cache() { return table_cache_; } WriteBufferManager* write_buffer_manager() { return write_buffer_manager_; } @@ -737,6 +727,55 @@ class ColumnFamilySet { std::string db_session_id_; }; +// A wrapper for ColumnFamilySet that supports releasing DB mutex during each +// iteration over the iterator, because the cfd is Refed and Unrefed during +// each iteration to prevent concurrent CF drop from destroying it (until +// Unref). +class RefedColumnFamilySet { + public: + explicit RefedColumnFamilySet(ColumnFamilySet* cfs) : wrapped_(cfs) {} + + class iterator { + public: + explicit iterator(ColumnFamilySet::iterator wrapped) : wrapped_(wrapped) { + MaybeRef(*wrapped_); + } + ~iterator() { MaybeUnref(*wrapped_); } + inline void MaybeRef(ColumnFamilyData* cfd) { + if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { + cfd->Ref(); + } + } + inline void MaybeUnref(ColumnFamilyData* cfd) { + if (cfd->GetID() != ColumnFamilyData::kDummyColumnFamilyDataId) { + cfd->UnrefAndTryDelete(); + } + } + // NOTE: minimum operators for for-loop iteration + inline iterator& operator++() { + ColumnFamilyData* old = *wrapped_; + ++wrapped_; + // Can only unref & potentially free cfd after accessing its next_ + MaybeUnref(old); + MaybeRef(*wrapped_); + return *this; + } + inline bool operator!=(const iterator& other) const { + return this->wrapped_ != other.wrapped_; + } + inline ColumnFamilyData* operator*() { return *wrapped_; } + + private: + ColumnFamilySet::iterator wrapped_; + }; + + iterator begin() { return iterator(wrapped_->begin()); } + iterator end() { return iterator(wrapped_->end()); } + + private: + ColumnFamilySet* wrapped_; +}; + // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access // memtables of different column families (specified by ID in the write batch) class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 7d4da8c22..c9144f8c9 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -45,17 +45,15 @@ Status DBImpl::FlushForGetLiveFiles() { } mutex_.Lock(); } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (cfd->IsDropped()) { continue; } - cfd->Ref(); mutex_.Unlock(); status = FlushMemTable(cfd, FlushOptions(), FlushReason::kGetLiveFiles); TEST_SYNC_POINT("DBImpl::GetLiveFiles:1"); TEST_SYNC_POINT("DBImpl::GetLiveFiles:2"); mutex_.Lock(); - cfd->UnrefAndTryDelete(); if (!status.ok() && !status.IsColumnFamilyDropped()) { break; } else if (status.IsColumnFamilyDropped()) { @@ -63,7 +61,6 @@ Status DBImpl::FlushForGetLiveFiles() { } } } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); return status; } diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 2d95d493a..a5a63fba6 100644 --- a/db/db_impl/db_impl.cc +++ 
b/db/db_impl/db_impl.cc @@ -377,15 +377,12 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { s = AtomicFlushMemTables(cfds, flush_opts, context.flush_reason); mutex_.Lock(); } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (cfd->IsDropped()) { continue; } - cfd->Ref(); - mutex_.Unlock(); + InstrumentedMutexUnlock u(&mutex_); s = FlushMemTable(cfd, flush_opts, context.flush_reason); - mutex_.Lock(); - cfd->UnrefAndTryDelete(); if (!s.ok()) { break; } @@ -497,18 +494,14 @@ void DBImpl::CancelAllBackgroundWork(bool wait) { s.PermitUncheckedError(); //**TODO: What to do on error? mutex_.Lock(); } else { - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (!cfd->IsDropped() && cfd->initialized() && !cfd->mem()->IsEmpty()) { - cfd->Ref(); - mutex_.Unlock(); + InstrumentedMutexUnlock u(&mutex_); Status s = FlushMemTable(cfd, FlushOptions(), FlushReason::kShutDown); s.PermitUncheckedError(); //**TODO: What to do on error? - mutex_.Lock(); - cfd->UnrefAndTryDelete(); } } } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); } shutting_down_.store(true, std::memory_order_release); @@ -971,18 +964,13 @@ void DBImpl::DumpStats() { TEST_SYNC_POINT("DBImpl::DumpStats:StartRunning"); { InstrumentedMutexLock l(&mutex_); - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto cfd : versions_->GetRefedColumnFamilySet()) { if (cfd->initialized()) { // Release DB mutex for gathering cache entry stats. Pass over all // column families for this first so that other stats are dumped // near-atomically. - // Get a ref before unlocking - cfd->Ref(); - { - InstrumentedMutexUnlock u(&mutex_); - cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false); - } - cfd->UnrefAndTryDelete(); + InstrumentedMutexUnlock u(&mutex_); + cfd->internal_stats()->CollectCacheEntryStats(/*foreground=*/false); } } @@ -3492,15 +3480,13 @@ bool DBImpl::GetAggregatedIntProperty(const Slice& property, // Needs mutex to protect the list of column families. InstrumentedMutexLock l(&mutex_); uint64_t value; - for (auto* cfd : *versions_->GetColumnFamilySet()) { + for (auto* cfd : versions_->GetRefedColumnFamilySet()) { if (!cfd->initialized()) { continue; } - cfd->Ref(); ret = GetIntPropertyInternal(cfd, *property_info, true, &value); // GetIntPropertyInternal may release db mutex and re-acquire it. mutex_.AssertHeld(); - cfd->UnrefAndTryDelete(); if (ret) { sum += value; } else { @@ -5091,6 +5077,7 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, } } + // TODO: simplify using GetRefedColumnFamilySet? 
std::vector cfd_list; { InstrumentedMutexLock l(&mutex_); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index a99025675..e75a5582f 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -2992,8 +2992,6 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, bg_bottom_compaction_scheduled_--; } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - // See if there's more work to be done MaybeScheduleFlushOrCompaction(); diff --git a/db/version_set.h b/db/version_set.h index f30cc0b76..090e416e8 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1299,6 +1299,10 @@ class VersionSet { uint64_t min_pending_output); ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } + RefedColumnFamilySet GetRefedColumnFamilySet() { + return RefedColumnFamilySet(GetColumnFamilySet()); + } + const FileOptions& file_options() { return file_options_; } void ChangeFileOptions(const MutableDBOptions& new_options) { file_options_.writable_file_max_buffer_size = From e7de808d35d155952677a58c54222f183bd87a6b Mon Sep 17 00:00:00 2001 From: myasuka Date: Thu, 24 Mar 2022 15:06:24 -0700 Subject: [PATCH 32/34] Enable READ_BLOCK_COMPACTION_MICROS to track stats (#9722) Summary: After commit [d642c60](https://github.com/facebook/rocksdb/commit/d642c60bdc100f7509ca77b383cd47b51d80d810), the stats `READ_BLOCK_COMPACTION_MICROS` cannot record any compaction read duration, and it always report zero. This PR targets to distinguish `READ_BLOCK_COMPACTION_MICROS` with `READ_BLOCK_GET_MICROS` so that `READ_BLOCK_COMPACTION_MICROS` could record the correct stats. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9722 Reviewed By: ajkr Differential Revision: D35021870 Pulled By: jay-zhuang fbshipit-source-id: f1a804994265e51465de64c2a08f2e0eeb6fc5a3 --- HISTORY.md | 1 + table/block_based/block_based_table_reader.cc | 23 +++++++++++-------- table/block_based/block_based_table_reader.h | 7 +++--- table/block_based/partitioned_filter_block.cc | 4 ++-- table/block_based/partitioned_index_reader.cc | 4 ++-- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 34f20138e..be8620c05 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -6,6 +6,7 @@ * Fixed a race condition when mmaping a WritableFile on POSIX. * Fixed a race condition when 2PC is disabled and WAL tracking in the MANIFEST is enabled. The race condition is between two background flush threads trying to install flush results, causing a WAL deletion not tracked in the MANIFEST. A future DB open may fail. * Fixed a heap use-after-free race with DropColumnFamily. +* Fixed a bug that `rocksdb.read.block.compaction.micros` cannot track compaction stats (#9722). 
## 7.0.3 (03/23/2022) ### Bug Fixes diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 388add9ed..f1f245f8a 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -1467,9 +1467,10 @@ template <typename TBlocklike> Status BlockBasedTable::MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - const bool wait, CachableEntry<TBlocklike>* block_entry, - BlockType block_type, GetContext* get_context, - BlockCacheLookupContext* lookup_context, BlockContents* contents) const { + const bool wait, const bool for_compaction, + CachableEntry<TBlocklike>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const { assert(block_entry != nullptr); const bool no_io = (ro.read_tier == kBlockCacheTier); Cache* block_cache = rep_->table_options.block_cache.get(); @@ -1522,7 +1523,9 @@ Status BlockBasedTable::MaybeReadBlockAndLoadToCache( CompressionType raw_block_comp_type; BlockContents raw_block_contents; if (!contents) { - StopWatch sw(rep_->ioptions.clock, statistics, READ_BLOCK_GET_MICROS); + Histograms histogram = for_compaction ? READ_BLOCK_COMPACTION_MICROS + : READ_BLOCK_GET_MICROS; + StopWatch sw(rep_->ioptions.clock, statistics, histogram); BlockFetcher block_fetcher( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &raw_block_contents, rep_->ioptions, do_uncompress, @@ -1880,8 +1883,9 @@ void BlockBasedTable::RetrieveMultipleBlocks( // avoid looking up the block cache s = MaybeReadBlockAndLoadToCache( nullptr, options, handle, uncompression_dict, /*wait=*/true, - block_entry, BlockType::kData, mget_iter->get_context, - &lookup_data_block_context, &raw_block_contents); + /*for_compaction=*/false, block_entry, BlockType::kData, + mget_iter->get_context, &lookup_data_block_context, + &raw_block_contents); // block_entry value could be null if no block cache is present, i.e // BlockBasedTableOptions::no_block_cache is true and no compressed @@ -1935,7 +1939,7 @@ Status BlockBasedTable::RetrieveBlock( if (use_cache) { s = MaybeReadBlockAndLoadToCache( prefetch_buffer, ro, handle, uncompression_dict, wait_for_cache, - block_entry, block_type, get_context, lookup_context, + for_compaction, block_entry, block_type, get_context, lookup_context, /*contents=*/nullptr); if (!s.ok()) { @@ -1964,8 +1968,9 @@ Status BlockBasedTable::RetrieveBlock( std::unique_ptr<TBlocklike> block; { - StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, - READ_BLOCK_GET_MICROS); + Histograms histogram = + for_compaction ? 
READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; + StopWatch sw(rep_->ioptions.clock, rep_->ioptions.stats, histogram); s = ReadBlockFromFile( rep_->file.get(), prefetch_buffer, rep_->footer, ro, handle, &block, rep_->ioptions, do_uncompress, maybe_compressed, block_type, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 1163449da..eac4fd38c 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -343,9 +343,10 @@ class BlockBasedTable : public TableReader { Status MaybeReadBlockAndLoadToCache( FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, const BlockHandle& handle, const UncompressionDict& uncompression_dict, - const bool wait, CachableEntry<TBlocklike>* block_entry, - BlockType block_type, GetContext* get_context, - BlockCacheLookupContext* lookup_context, BlockContents* contents) const; + const bool wait, const bool for_compaction, + CachableEntry<TBlocklike>* block_entry, BlockType block_type, + GetContext* get_context, BlockCacheLookupContext* lookup_context, + BlockContents* contents) const; // Similar to the above, with one crucial difference: it will retrieve the // block from the file even if there are no caches configured (assuming the diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index f9d53aba7..c33c383f1 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -519,8 +519,8 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, // filter blocks s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - /* wait */ true, &block, BlockType::kFilter, nullptr /* get_context */, - &lookup_context, nullptr /* contents */); + /* wait */ true, /* for_compaction */ false, &block, BlockType::kFilter, + nullptr /* get_context */, &lookup_context, nullptr /* contents */); if (!s.ok()) { return s; } diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index e295d41a4..9ed418f02 100644 --- a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -180,8 +180,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, // filter blocks Status s = table()->MaybeReadBlockAndLoadToCache( prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), - /*wait=*/true, &block, BlockType::kIndex, /*get_context=*/nullptr, - &lookup_context, /*contents=*/nullptr); + /*wait=*/true, /*for_compaction=*/false, &block, BlockType::kIndex, + /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr); if (!s.ok()) { return s; From 9622795deb870bd1f2c286ed929f84e522e950f8 Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Tue, 29 Mar 2022 12:50:05 -0700 Subject: [PATCH 33/34] update version.h and HISTORY.md for 7.0.4 --- HISTORY.md | 2 +- include/rocksdb/version.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index be8620c05..d744cf176 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,5 @@ # Rocksdb Change Log -## Unreleased +## 7.0.4 (03/29/2022) ### Bug Fixes * Fixed a race condition when disable and re-enable manual compaction. * Fixed a race condition for `alive_log_files_` in non-two-write-queues mode. The race is between the write_thread_ in WriteToWAL() and another thread executing `FindObsoleteFiles()`. 
The race condition will be caught if `__glibcxx_requires_nonempty` is enabled. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index aaf177a18..8a3d80e09 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -11,7 +11,7 @@ #define ROCKSDB_MAJOR 7 #define ROCKSDB_MINOR 0 -#define ROCKSDB_PATCH 3 +#define ROCKSDB_PATCH 4 // Do not use these. We made the mistake of declaring macros starting with // double underscore. Now we have to live with our choice. We'll deprecate these From 076ac983f40236171acfa6f8726eb79a51184727 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Wed, 2 Mar 2022 17:41:02 -0800 Subject: [PATCH 34/34] Do not rely on ADL when invoking std::max_element (#9608) Summary: Certain STLs use raw pointers as iterators, and ADL does not work for them. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9608 Reviewed By: ajkr Differential Revision: D34583012 Pulled By: riversand963 fbshipit-source-id: 7de6bbc8a080c3e7243ce0d758fe83f1663168aa --- db/external_sst_file_ingestion_job.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 40b5604d5..16ec558f2 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -777,10 +777,11 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(lvl); const SequenceNumber level_largest_seqno = - (*max_element(level_files.begin(), level_files.end(), - [](FileMetaData* f1, FileMetaData* f2) { - return f1->fd.largest_seqno < f2->fd.largest_seqno; - })) + (*std::max_element(level_files.begin(), level_files.end(), + [](FileMetaData* f1, FileMetaData* f2) { + return f1->fd.largest_seqno < + f2->fd.largest_seqno; + })) ->fd.largest_seqno; // should only assign seqno to current level's largest seqno when // the file fits
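A hedged illustration of the failure mode behind this last change (the `demo` namespace and struct below are stand-ins invented for the example, not RocksDB code): when an STL implements `std::vector` iterators as raw pointers, argument-dependent lookup for an unqualified `max_element(...)` call searches only the namespaces associated with the element type, never `std`, so the unqualified spelling compiles only by accident on STLs whose iterators are class types defined in `std`.
```
#include <algorithm>
#include <vector>

namespace demo {
struct FileMetaData {
  int largest_seqno;
};
}  // namespace demo

int main() {
  std::vector<demo::FileMetaData*> files;
  // If vector<T>::iterator is a plain T* (here demo::FileMetaData**), ADL
  // for an unqualified max_element(...) considers only namespace demo and
  // the global namespace -- not std -- and fails to find the algorithm.
  // Qualifying the call, as the patch does, does not depend on ADL at all:
  auto it = std::max_element(
      files.begin(), files.end(),
      [](demo::FileMetaData* f1, demo::FileMetaData* f2) {
        return f1->largest_seqno < f2->largest_seqno;
      });
  (void)it;  // end() for this empty vector; never dereferenced
  return 0;
}
```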