04-resharding.tcl 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
# Resharding stress test.
# In this test the cluster is resharded in the background while
# clients keep writing to it. The test checks that the written data
# is preserved across the resharding and across a crash & restart.
  5. source "../tests/includes/init-tests.tcl"
  6. source "../../../tests/support/cli.tcl"
  7. test "Create a 5 nodes cluster" {
  8. create_cluster 5 5
  9. }
  10. test "Cluster is up" {
  11. assert_cluster_state ok
  12. }
  13. test "Enable AOF in all the instances" {
  14. foreach_redis_id id {
  15. R $id config set appendonly yes
  16. # We use "appendfsync no" because it's fast but also guarantees that
  17. # write(2) is performed before replying to client.
  18. R $id config set appendfsync no
  19. }
  20. foreach_redis_id id {
  21. wait_for_condition 1000 500 {
  22. [RI $id aof_rewrite_in_progress] == 0 &&
  23. [RI $id aof_enabled] == 1
  24. } else {
  25. fail "Failed to enable AOF on instance #$id"
  26. }
  27. }
  28. }
  29. # Return non-zero if the specified PID is about a process still in execution,
  30. # otherwise 0 is returned.
  31. proc process_is_running {pid} {
  32. # PS should return with an error if PID is non existing,
  33. # and catch will return non-zero. We want to return non-zero if
  34. # the PID exists, so we invert the return value with expr not operator.
  35. expr {![catch {exec ps -p $pid}]}
  36. }
  37. # Our resharding test performs the following actions:
  38. #
  39. # - N commands are sent to the cluster in the course of the test.
  40. # - Every command selects a random key from key:0 to key:MAX-1.
  41. # - The operation RPUSH key <randomvalue> is performed.
  42. # - Tcl remembers into an array all the values pushed to each list.
  43. # - After N/2 commands, the resharding process is started in background.
  44. # - The test continues while the resharding is in progress.
  45. # - At the end of the test, we wait for the resharding process to stop.
  46. # - Finally the keys are checked to see if they contain the value they should.
  47. set numkeys 50000
  48. set numops 200000
  49. set start_node_port [get_instance_attrib redis 0 port]
  50. set cluster [redis_cluster 127.0.0.1:$start_node_port]
  51. if {$::tls} {
  52. # setup a non-TLS cluster client to the TLS cluster
  53. set plaintext_port [get_instance_attrib redis 0 plaintext-port]
  54. set cluster_plaintext [redis_cluster 127.0.0.1:$plaintext_port 0]
  55. puts "Testing TLS cluster on start node 127.0.0.1:$start_node_port, plaintext port $plaintext_port"
  56. } else {
  57. set cluster_plaintext $cluster
  58. puts "Testing using non-TLS cluster"
  59. }
  60. catch {unset content}
  61. array set content {}
  62. set tribpid {}
  63. test "Cluster consistency during live resharding" {
  64. set ele 0
  65. for {set j 0} {$j < $numops} {incr j} {
  66. # Trigger the resharding once we execute half the ops.
  67. if {$tribpid ne {} &&
  68. ($j % 10000) == 0 &&
  69. ![process_is_running $tribpid]} {
  70. set tribpid {}
  71. }
  72. if {$j >= $numops/2 && $tribpid eq {}} {
  73. puts -nonewline "...Starting resharding..."
  74. flush stdout
  75. set target [dict get [get_myself [randomInt 5]] id]
  76. set tribpid [lindex [exec \
  77. ../../../src/redis-cli --cluster reshard \
  78. 127.0.0.1:[get_instance_attrib redis 0 port] \
  79. --cluster-from all \
  80. --cluster-to $target \
  81. --cluster-slots 100 \
  82. --cluster-yes \
  83. {*}[rediscli_tls_config "../../../tests"] \
  84. | [info nameofexecutable] \
  85. ../tests/helpers/onlydots.tcl \
  86. &] 0]
  87. }
  88. # Write random data to random list.
  89. set listid [randomInt $numkeys]
  90. set key "key:$listid"
  91. incr ele
  92. # We write both with Lua scripts and with plain commands.
  93. # This way we are able to stress Lua -> Redis command invocation
  94. # as well, that has tests to prevent Lua to write into wrong
  95. # hash slots.
  96. # We also use both TLS and plaintext connections.
  97. if {$listid % 3 == 0} {
  98. $cluster rpush $key $ele
  99. } elseif {$listid % 3 == 1} {
  100. $cluster_plaintext rpush $key $ele
  101. } else {
  102. $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele
  103. }
  104. lappend content($key) $ele
  105. if {($j % 1000) == 0} {
  106. puts -nonewline W; flush stdout
  107. }
  108. }
  109. # Wait for the resharding process to end
  110. wait_for_condition 1000 500 {
  111. [process_is_running $tribpid] == 0
  112. } else {
  113. fail "Resharding is not terminating after some time."
  114. }
  115. }
  116. test "Verify $numkeys keys for consistency with logical content" {
  117. # Check that the Redis Cluster content matches our logical content.
  118. foreach {key value} [array get content] {
  119. if {[$cluster lrange $key 0 -1] ne $value} {
  120. fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
  121. }
  122. }
  123. }
  124. test "Crash and restart all the instances" {
  125. foreach_redis_id id {
  126. kill_instance redis $id
  127. restart_instance redis $id
  128. }
  129. }
  130. test "Cluster should eventually be up again" {
  131. assert_cluster_state ok
  132. }
  133. test "Verify $numkeys keys after the crash & restart" {
  134. # Check that the Redis Cluster content matches our logical content.
  135. foreach {key value} [array get content] {
  136. if {[$cluster lrange $key 0 -1] ne $value} {
  137. fail "Key $key expected to hold '$value' but actual content is [$cluster lrange $key 0 -1]"
  138. }
  139. }
  140. }
  141. test "Disable AOF in all the instances" {
  142. foreach_redis_id id {
  143. R $id config set appendonly no
  144. }
  145. }
  146. test "Verify slaves consistency" {
  147. set verified_masters 0
  148. foreach_redis_id id {
  149. set role [R $id role]
  150. lassign $role myrole myoffset slaves
  151. if {$myrole eq {slave}} continue
  152. set masterport [get_instance_attrib redis $id port]
  153. set masterdigest [R $id debug digest]
  154. foreach_redis_id sid {
  155. set srole [R $sid role]
  156. if {[lindex $srole 0] eq {master}} continue
  157. if {[lindex $srole 2] != $masterport} continue
  158. wait_for_condition 1000 500 {
  159. [R $sid debug digest] eq $masterdigest
  160. } else {
  161. fail "Master and slave data digest are different"
  162. }
  163. incr verified_masters
  164. }
  165. }
  166. assert {$verified_masters >= 5}
  167. }
  168. test "Dump sanitization was skipped for migrations" {
  169. set verified_masters 0
  170. foreach_redis_id id {
  171. assert {[RI $id dump_payload_sanitizations] == 0}
  172. }
  173. }