فهرست منبع

fix: race condition in connection pool causing "no such table" errors (#1441)

* chore: debug other errors

* fix: race condition in connection pool causing "no such table" errors

This fixes a race condition in the connection pool that caused
  intermittent "no such table: mint_quote_payments" errors, particularly
  on slower CI runners.

  ## Root Cause

  The `get_timeout()` function in `pool.rs` had a TOCTOU (time-of-check-
  time-of-use) race condition. When acquiring a connection:

  1. Thread A pops a connection from the queue
  2. Thread A releases the mutex BEFORE incrementing `in_use` counter
  3. Thread B acquires the mutex, sees empty queue, releases mutex
  4. Thread B checks `in_use < max_size` (0 < 1) - passes because
     Thread A hasn't incremented yet
  5. Thread B creates a NEW connection

  For in-memory SQLite, each `Connection::open_in_memory()` creates a
  SEPARATE, INDEPENDENT database. Migrations only run on the first
  connection created during `SQLMintDatabase::new()`. When the race
  condition triggers, additional connections point to fresh databases
  without any tables - hence "no such table: mint_quote_payments".

  ## The Fix

  Move `increment_connection_counter()` to BEFORE releasing the mutex
  in both code paths:

  1. When popping an existing resource from the queue
  2. When creating a new resource

  This ensures other threads immediately see the updated count and wait
  instead of creating duplicate connections. Added error handling to
  decrement the counter if resource creation fails, preserving the
  behavior from PR #1367.

  ## Why CI-only failures

  The race window is very small (between mutex release and counter
  increment). On faster local machines, threads rarely hit this window.
  On slower CI runners with more context switching, the probability of
  two threads interleaving in the problematic order increases
  significantly.
tsk 1 ماه پیش
والد
کامیت
26970e43a8
3فایلهای تغییر یافته به همراه30 افزوده شده و 17 حذف شده
  1. 5 5
      crates/cdk-integration-tests/tests/mint.rs
  2. 24 12
      crates/cdk-sql-common/src/pool.rs
  3. 1 0
      misc/itests.sh

+ 5 - 5
crates/cdk-integration-tests/tests/mint.rs

@@ -241,16 +241,16 @@ async fn test_concurrent_duplicate_payment_handling() {
         "Exactly one task should successfully process the payment (got {})",
         success_count
     );
-    assert_eq!(
-        duplicate_errors, 9,
-        "Nine tasks should receive Duplicate error (got {})",
-        duplicate_errors
-    );
     assert!(
         other_errors.is_empty(),
         "No unexpected errors should occur. Got: {:?}",
         other_errors
     );
+    assert_eq!(
+        duplicate_errors, 9,
+        "Nine tasks should receive Duplicate error (got {})",
+        duplicate_errors
+    );
 
     // Verify the quote was incremented exactly once
     let final_quote = MintQuotesDatabase::get_mint_quote(&*database, &mint_quote.id)

+ 24 - 12
crates/cdk-sql-common/src/pool.rs

@@ -192,9 +192,12 @@ where
         let time = Instant::now();
 
         loop {
-            if let Some((stale, resource)) = resources.pop() {
+            while let Some((stale, resource)) = resources.pop() {
                 if !stale.load(Ordering::SeqCst) {
-                    drop(resources);
+                    // Increment counter BEFORE releasing the mutex to prevent race condition
+                    // where another thread sees in_use < max_size and creates a duplicate connection.
+                    // For in-memory SQLite, each connection is a separate database, so this race
+                    // would cause some connections to miss migrations.
                     self.increment_connection_counter();
 
                     return Ok(PooledResource {
@@ -207,17 +210,26 @@ where
             }
 
             if self.in_use.load(Ordering::Relaxed) < self.max_size {
-                drop(resources);
-                let stale: Arc<AtomicBool> = Arc::new(false.into());
-                let new_resource = RM::new_resource(&self.config, stale.clone(), timeout)?;
+                // Increment counter BEFORE releasing the mutex to prevent race condition.
+                // This ensures other threads see the updated count and wait instead of
+                // creating additional connections beyond max_size.
                 self.increment_connection_counter();
-
-                return Ok(PooledResource {
-                    resource: Some((stale, new_resource)),
-                    pool: self.clone(),
-                    #[cfg(feature = "prometheus")]
-                    start_time: Instant::now(),
-                });
+                let stale: Arc<AtomicBool> = Arc::new(false.into());
+                match RM::new_resource(&self.config, stale.clone(), timeout) {
+                    Ok(new_resource) => {
+                        return Ok(PooledResource {
+                            resource: Some((stale, new_resource)),
+                            pool: self.clone(),
+                            #[cfg(feature = "prometheus")]
+                            start_time: Instant::now(),
+                        });
+                    }
+                    Err(e) => {
+                        // If resource creation fails, decrement the counter we just incremented
+                        self.in_use.fetch_sub(1, Ordering::AcqRel);
+                        return Err(e);
+                    }
+                }
             }
 
             resources = self

+ 1 - 0
misc/itests.sh

@@ -51,6 +51,7 @@ cleanup() {
 trap cleanup EXIT
 
 export CDK_TEST_REGTEST=1
+export RUST_BACKTRACE=full
 
 # Create a temporary directory
 export CDK_ITESTS_DIR=$(mktemp -d)