1
0

instances.tcl 22 KB


  1. # Multi-instance test framework.
  2. # This is used in order to test Sentinel and Redis Cluster, and provides
  3. # basic capabilities for spawning and handling N parallel Redis / Sentinel
  4. # instances.
  5. #
  6. # Copyright (C) 2014 Salvatore Sanfilippo antirez@gmail.com
  7. # This software is released under the BSD License. See the COPYING file for
  8. # more information.
  # --- Interpreter requirements -------------------------------------------
  package require Tcl 8.5
  set tcl_precision 17

  # --- Shared support code (paths are relative to the current directory) ---
  source ../support/redis.tcl
  source ../support/util.tcl
  source ../support/server.tcl
  source ../support/test.tcl

  # --- Run-time switches, most of them toggled by parse_options ------------
  set ::verbose 0
  set ::valgrind 0
  set ::tls 0
  set ::pause_on_error 0
  set ::dont_clean 0
  set ::simulate_error 0
  set ::run_matching "" ; # If non empty, only tests matching pattern are run.
  set ::global_config "" ; # Extra "name value" directives for every instance.
  set ::host 127.0.0.1

  # --- Bookkeeping ---------------------------------------------------------
  set ::failed 0
  set ::sentinel_instances ""
  set ::redis_instances ""
  set ::pids "" ; # We kill everything at exit
  set ::dirs "" ; # We remove all the temp dirs at exit

  # --- Port allocation -----------------------------------------------------
  set ::sentinel_base_port 20000
  set ::redis_base_port 30000
  set ::redis_port_count 1024

  # Resolved to an absolute path *before* changing directory below, so the
  # value stays valid after the "cd tmp".
  set ::leaked_fds_file [file normalize [file join tmp leaked_fds.txt]]

  # Everything else in this file works relative to the tmp directory.
  if {[catch {cd tmp}] != 0} {
      puts "tmp directory not found."
      puts "Please run this test from the Redis source root."
      exit 1
  }
  # Start one server process in the background.
  #
  #   type     - "redis" or "sentinel"; selects the binary to run. Any other
  #              value raises an error.
  #   dirname  - instance directory; stderr is appended to err.txt inside it.
  #   cfgfile  - configuration file passed to the binary.
  #
  # When ::valgrind is set the binary is run under valgrind.
  # Returns the PID reported by exec for the background process.
  proc exec_instance {type dirname cfgfile} {
      switch -exact -- $type {
          redis    { set prgname redis-server }
          sentinel { set prgname redis-sentinel }
          default  { error "Unknown instance type." }
      }

      set errfile [file join $dirname err.txt]

      # Build the command line as a list, optionally prefixed by valgrind.
      set cmdline [list ../../../src/${prgname} $cfgfile]
      if {$::valgrind} {
          set cmdline [linsert $cmdline 0 \
              valgrind --track-origins=yes \
              --suppressions=../../../src/valgrind.sup \
              --show-reachable=no --show-possibly-lost=no --leak-check=full]
      }

      return [exec {*}$cmdline 2>> $errfile &]
  }
  # Spawn 'count' redis or sentinel instances, depending on 'type'.
  #
  #   type            - "redis" or "sentinel" (forwarded to exec_instance).
  #   base_port       - first port handed to find_available_port.
  #   count           - number of instances to start.
  #   conf            - optional list of extra config directives (one per
  #                     element) appended to each instance config file.
  #   base_conf_file  - optional config file copied as the starting point.
  #
  # For every instance a directory "<type>_<index>" is created, a config file
  # is written, and the process is started (retrying on other ports when the
  # server does not come up). On success a dictionary with the keys pid, host,
  # port, plaintext-port and link is appended to ::<type>_instances.
  # If the instance cannot be reached the whole test run is aborted.
  proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} {
      for {set j 0} {$j < $count} {incr j} {
          set port [find_available_port $base_port $::redis_port_count]
          # plaintext port (only used for TLS cluster)
          set pport 0

          # Create a directory for this instance, dropping any leftover from
          # a previous run.
          set dirname "${type}_${j}"
          lappend ::dirs $dirname
          catch {file delete -force -- $dirname}
          file mkdir $dirname

          # Write the instance config file.
          set cfgfile [file join $dirname $type.conf]
          if {$base_conf_file ne ""} {
              file copy -- $base_conf_file $cfgfile
              set cfg [open $cfgfile a+]
          } else {
              set cfg [open $cfgfile w]
          }

          if {$::tls} {
              puts $cfg "tls-port $port"
              puts $cfg "tls-replication yes"
              puts $cfg "tls-cluster yes"
              # plaintext port, only used by plaintext clients in a TLS cluster
              set pport [find_available_port $base_port $::redis_port_count]
              puts $cfg "port $pport"
              puts $cfg [format "tls-cert-file %s/../../tls/server.crt" [pwd]]
              puts $cfg [format "tls-key-file %s/../../tls/server.key" [pwd]]
              puts $cfg [format "tls-client-cert-file %s/../../tls/client.crt" [pwd]]
              puts $cfg [format "tls-client-key-file %s/../../tls/client.key" [pwd]]
              puts $cfg [format "tls-dh-params-file %s/../../tls/redis.dh" [pwd]]
              puts $cfg [format "tls-ca-cert-file %s/../../tls/ca.crt" [pwd]]
              puts $cfg "loglevel debug"
          } else {
              puts $cfg "port $port"
          }

          puts $cfg "dir ./$dirname"
          puts $cfg "logfile log.txt"
          # Add additional config directives
          foreach directive $conf {
              puts $cfg $directive
          }
          dict for {name val} $::global_config {
              puts $cfg "$name $val"
          }
          close $cfg

          # Finally exec it and remember the pid for later cleanup.
          set retry 100
          while {$retry} {
              set pid [exec_instance $type $dirname $cfgfile]

              # Check availability
              if {[server_is_up 127.0.0.1 $port 100] != 0} {
                  puts "Starting $type #$j at port $port"
                  lappend ::pids $pid
                  break
              }

              puts "Starting $type #$j at port $port failed, try another"
              # The failed attempt is never recorded in ::pids, so get rid of
              # it here instead of leaving a stray process behind.
              catch {exec kill -KILL $pid}
              incr retry -1

              # Pick new port(s) and append them to the config file for the
              # next attempt.
              set port [find_available_port $base_port $::redis_port_count]
              set cfg [open $cfgfile a+]
              if {$::tls} {
                  puts $cfg "tls-port $port"
                  set pport [find_available_port $base_port $::redis_port_count]
                  puts $cfg "port $pport"
              } else {
                  puts $cfg "port $port"
              }
              close $cfg
          }

          # Check availability finally
          if {[server_is_up $::host $port 100] == 0} {
              set logfile [file join $dirname log.txt]
              # Showing the log tail is best effort: the file may not exist
              # if the server never started, and a failure here must not
              # prevent the abort (and its cleanup) from running.
              if {[catch {exec tail $logfile} logtail]} {
                  set logtail "(unable to read $logfile: $logtail)"
              }
              puts $logtail
              abort_sentinel_test "Problems starting $type #$j: ping timeout, maybe server start failed, check $logfile"
          }

          # Push the instance into the right list
          set link [redis $::host $port 0 $::tls]
          $link reconnect 1
          lappend ::${type}_instances [list \
              pid $pid \
              host $::host \
              port $port \
              plaintext-port $pport \
              link $link \
          ]
      }
  }
  # Scan the instance directories below the current directory for problems.
  #
  # Every */log.txt is searched for a "REDIS BUG REPORT START" marker; from
  # that line on the log is echoed to stdout and the log counts as one
  # failure. Every */err.txt is passed to find_valgrind_errors and a non-empty
  # result is printed and counted as one failure.
  # Failures are accumulated in ::failed. Returns nothing useful.
  proc log_crashes {} {
      set start_pattern {*REDIS BUG REPORT START*}

      # -nocomplain: having no instance directories at all is not an error.
      foreach log [glob -nocomplain */log.txt] {
          set fd [open $log]
          set found 0
          while {[gets $fd line] >= 0} {
              if {[string match $start_pattern $line]} {
                  puts "\n*** Crash report found in $log ***"
                  # Count the crashed log once, not once per printed line.
                  if {!$found} {
                      incr ::failed
                  }
                  set found 1
              }
              if {$found} {
                  puts $line
              }
          }
          close $fd
      }

      foreach log [glob -nocomplain */err.txt] {
          set res [find_valgrind_errors $log true]
          if {$res != ""} {
              puts $res
              incr ::failed
          }
      }
  }
  # Return 1 if "ps -p <pid>" succeeds, 0 if it fails.
  proc is_alive pid {
      set failed [catch {exec ps -p $pid} err]
      return [expr {$failed ? 0 : 1}]
  }
  # Ask the process 'pid' to terminate and wait until it is gone.
  #
  # A plain "kill" is sent first, followed by SIGCONT in case the test left
  # the process stopped. The proc then polls is_alive; 'wait' advances by 10
  # per poll and once it reaches the limit (10000, or 60000 under valgrind)
  # SIGKILL is sent on every further poll until the process disappears.
  proc stop_instance pid {
      catch {exec kill $pid}
      # Node might have been stopped in the test
      catch {exec kill -SIGCONT $pid}

      if {$::valgrind} {
          set max_wait 60000
      } else {
          set max_wait 10000
      }

      set wait 0
      set forced 0
      while {[is_alive $pid]} {
          incr wait 10

          if {$wait >= $max_wait} {
              # Announce the forced kill only once instead of on every poll.
              if {!$forced} {
                  puts "Forcing process $pid to exit..."
                  set forced 1
              }
              catch {exec kill -KILL $pid}
          } elseif {$wait % 1000 == 0} {
              puts "Waiting for process $pid to exit..."
          }

          after 10
      }
  }
  # Stop every process listed in ::pids, report crashes via log_crashes and,
  # unless ::dont_clean is set, remove every directory listed in ::dirs.
  proc cleanup {} {
      puts "Cleaning up..."

      foreach stale $::pids {
          puts "killing stale instance $stale"
          stop_instance $stale
      }

      log_crashes

      if {!$::dont_clean} {
          foreach tmpdir $::dirs {
              catch {exec rm -rf $tmpdir}
          }
      }
  }
  # Abort the whole run: count a failure, print 'msg', optionally drop into
  # the pause_on_error prompt, clean up and exit with status 1.
  proc abort_sentinel_test msg {
      incr ::failed
      puts "WARNING: Aborting the test.\n>>>>>>>> $msg"
      if {$::pause_on_error} {
          pause_on_error
      }
      cleanup
      exit 1
  }
  # Parse the command line in ::argv and set the matching global switches.
  #
  # Options taking a value consume the following argument; --config consumes
  # two (name and value) and stores them in the ::global_config dictionary.
  # --help prints the usage and exits 0; an unknown option exits 1.
  proc parse_options {} {
      set argc [llength $::argv]
      for {set j 0} {$j < $argc} {incr j} {
          set opt [lindex $::argv $j]
          set val [lindex $::argv [expr {$j+1}]]

          switch -exact -- $opt {
              --single {
                  incr j
                  set ::run_matching "*${val}*"
              }
              --pause-on-error {
                  set ::pause_on_error 1
              }
              --dont-clean {
                  set ::dont_clean 1
              }
              --fail {
                  set ::simulate_error 1
              }
              --valgrind {
                  set ::valgrind 1
              }
              --host {
                  incr j
                  set ::host ${val}
              }
              --tls {
                  package require tls 1.6
                  # NOTE(review): ::tlsdir is not set anywhere in this file.
                  # Fall back to the same ../../tls location spawn_instance
                  # uses for the server certificates, unless something else
                  # already defined it.
                  if {![info exists ::tlsdir]} {
                      set ::tlsdir "../../tls"
                  }
                  ::tls::init \
                      -cafile "$::tlsdir/ca.crt" \
                      -certfile "$::tlsdir/client.crt" \
                      -keyfile "$::tlsdir/client.key"
                  set ::tls 1
              }
              --config {
                  set val2 [lindex $::argv [expr {$j+2}]]
                  dict set ::global_config $val $val2
                  incr j 2
              }
              --help {
                  puts "--single <pattern> Only runs tests specified by pattern."
                  puts "--dont-clean Keep log files on exit."
                  puts "--pause-on-error Pause for manual inspection on error."
                  puts "--fail Simulate a test failure."
                  puts "--valgrind Run with valgrind."
                  puts "--tls Run tests in TLS mode."
                  puts "--host <host> Use hostname instead of 127.0.0.1."
                  puts "--config <k> <v> Extra config argument(s)."
                  puts "--help Shows this help."
                  exit 0
              }
              default {
                  puts "Unknown option $opt"
                  exit 1
              }
          }
      }
  }
  # If --pause-on-error option was passed at startup this function is called
  # on error in order to give the developer a chance to understand more about
  # the error condition while the instances are still running.
  #
  # Reads commands from stdin until "continue" is entered or stdin reaches
  # end of file. Built-in commands are listed by "help"; any other input is
  # evaluated as Tcl in this proc's scope and its result (or error message)
  # is printed.
  proc pause_on_error {} {
      puts ""
      puts [colorstr yellow "*** Please inspect the error now ***"]
      puts "\nType \"continue\" to resume the test, \"help\" for help screen.\n"
      while 1 {
          puts -nonewline "> "
          flush stdout
          # A negative count means stdin is exhausted (closed or redirected):
          # resume the test instead of spinning on empty reads forever.
          if {[gets stdin line] < 0} {
              break
          }
          set argv [split $line " "]
          set cmd [lindex $argv 0]
          if {$cmd eq {continue}} {
              break
          } elseif {$cmd eq {show-redis-logs}} {
              set count 10
              if {[lindex $argv 1] ne {}} {set count [lindex $argv 1]}
              foreach_redis_id id {
                  puts "=== REDIS $id ===="
                  puts [exec tail -$count redis_$id/log.txt]
                  puts "---------------------\n"
              }
          } elseif {$cmd eq {show-sentinel-logs}} {
              set count 10
              if {[lindex $argv 1] ne {}} {set count [lindex $argv 1]}
              foreach_sentinel_id id {
                  puts "=== SENTINEL $id ===="
                  puts [exec tail -$count sentinel_$id/log.txt]
                  puts "---------------------\n"
              }
          } elseif {$cmd eq {ls}} {
              foreach_redis_id id {
                  puts -nonewline "Redis $id"
                  # Any error while querying the instance is shown inline
                  # rather than interrupting the listing.
                  set errcode [catch {
                      set str {}
                      append str "@[RI $id tcp_port]: "
                      append str "[RI $id role] "
                      if {[RI $id role] eq {slave}} {
                          append str "[RI $id master_host]:[RI $id master_port]"
                      }
                      set str
                  } retval]
                  if {$errcode} {
                      puts " -- $retval"
                  } else {
                      puts $retval
                  }
              }
              foreach_sentinel_id id {
                  puts -nonewline "Sentinel $id"
                  set errcode [catch {
                      set str {}
                      append str "@[SI $id tcp_port]: "
                      append str "[join [S $id sentinel get-master-addr-by-name mymaster]]"
                      set str
                  } retval]
                  if {$errcode} {
                      puts " -- $retval"
                  } else {
                      puts $retval
                  }
              }
          } elseif {$cmd eq {help}} {
              puts "ls List Sentinel and Redis instances."
              puts "show-sentinel-logs \[N\] Show latest N lines of logs."
              puts "show-redis-logs \[N\] Show latest N lines of logs."
              puts "S <id> cmd ... arg Call command in Sentinel <id>."
              puts "R <id> cmd ... arg Call command in Redis <id>."
              puts "SI <id> <field> Show Sentinel <id> INFO <field>."
              puts "RI <id> <field> Show Redis <id> INFO <field>."
              puts "continue Resume test."
          } else {
              # Anything else is treated as a Tcl command typed by the
              # developer at the prompt.
              set errcode [catch {eval $line} retval]
              if {$retval ne {}} {puts "$retval"}
          }
      }
  }
  # Sequential replacement for 'test': here the units do not use the
  # server-client architecture, every test body runs inline.
  #
  #   descr - description printed (with a timestamp) before the body runs.
  #   code  - script evaluated in the caller's scope.
  #
  # On success "OK" is printed. On failure ::failed is incremented; an error
  # whose message starts with "assertion:" is printed and the proc returns
  # with the 'continue' code, any other error is raised again.
  proc test {descr code} {
      set ts [clock format [clock seconds] -format %H:%M:%S]
      puts -nonewline "$ts> $descr: "
      flush stdout

      set rc [catch {set retval [uplevel 1 $code]} err]
      if {$rc == 0} {
          puts [colorstr green OK]
          return
      }

      incr ::failed
      if {![string match "assertion:*" $err]} {
          # Re-raise, let handler up the stack take care of this.
          error $err $::errorInfo
      }

      # Strip the "assertion:" prefix before printing.
      puts [colorstr red [string range $err 10 end]]
      if {$::pause_on_error} {
          pause_on_error
      }
      puts "(Jumping to next unit after error)"
      return -code continue
  }
  # Run the "leaks" utility against every live instance of the given types.
  # Does nothing unless "uname -a" mentions Darwin (macOS).
  #
  #   instance_types - list of type names, e.g. {redis sentinel}.
  #
  # Each instance whose report does not contain "0 leaks" is printed and
  # counted in ::failed.
  proc check_leaks instance_types {
      if {![string match {*Darwin*} [exec uname -a]]} {
          return
      }

      puts -nonewline "Testing for memory leaks..."
      flush stdout

      foreach type $instance_types {
          foreach_instance_id [set ::${type}_instances] id {
              if {[instance_is_killed $type $id]} continue

              set pid [get_instance_attrib $type $id pid]
              set output {0 leaks}
              catch {exec leaks $pid} output

              set unreachable [expr {
                  [string match {*process does not exist*} $output] ||
                  [string match {*cannot examine*} $output]
              }]
              if {$unreachable} {
                  # In a few tests we kill the server process.
                  set output "0 leaks"
              } else {
                  puts -nonewline "$type/$pid "
                  flush stdout
              }

              if {[string match {*0 leaks*} $output]} continue

              puts [colorstr red "=== MEMORY LEAK DETECTED ==="]
              puts "Instance type $type, ID $id:"
              puts $output
              puts "==="
              incr ::failed
          }
      }

      puts ""
  }
  # Source, in sorted order, every unit file found in ../tests.
  #
  # Units not matching ::run_matching (when set) and directories are skipped.
  # After each unit check_leaks runs, and a leaked-fds file appearing at
  # ::leaked_fds_file is printed and counted as a failure.
  #
  # Note: the unit is sourced inside this proc, so the local variable names
  # used here are visible to it; they are intentionally left as they were.
  proc run_tests {} {
      set tests [lsort [glob ../tests/*]]
      foreach test $tests {
          # Remove leaked_fds file before starting
          if {$::leaked_fds_file ne "" && [file exists $::leaked_fds_file]} {
              file delete $::leaked_fds_file
          }

          if {$::run_matching ne {} && ![string match $::run_matching $test]} {
              continue
          }
          if {[file isdirectory $test]} {
              continue
          }

          puts [colorstr yellow "Testing unit: [file tail $test]"]
          source $test
          check_leaks {redis sentinel}

          # Check if a leaked fds file was created and count it as a failure.
          if {$::leaked_fds_file ne "" && [file exists $::leaked_fds_file]} {
              puts [colorstr red "ERROR: Sentinel has leaked fds to scripts:"]
              puts [exec cat $::leaked_fds_file]
              puts "----"
              incr ::failed
          }
      }
  }
  # Print a summary and exit with status 0 when ::failed is zero, 1 otherwise.
  proc end_tests {} {
      if {$::failed != 0} {
          puts "WARNING $::failed test(s) failed."
          exit 1
      }
      puts "GOOD! No errors."
      exit 0
  }
  # Send a command to the N-th Sentinel through its stored link.
  #
  # General form:
  #
  #   S <sentinel-id> command arg arg arg ...
  #
  # Example, ping Sentinel 0 (the first instance): S 0 PING
  proc S {n args} {
      set link [dict get [lindex $::sentinel_instances $n] link]
      return [$link {*}$args]
  }
  # Return the link (client command) of the N-th Redis instance.
  # Example:
  #   [Rn 0] info
  proc Rn {n} {
      set instance [lindex $::redis_instances $n]
      return [dict get $instance link]
  }
  # Like S, but talks to the N-th Redis instance:
  #   R <redis-id> command arg arg arg ...
  proc R {n args} {
      set link [Rn $n]
      return [$link {*}$args]
  }
  # Extract the value of 'field' from INFO-style text.
  #
  #   info  - text made of "name:value" lines.
  #   field - exact field name to look for.
  #
  # Each line is trimmed of surrounding CR, LF and spaces. Returns the text
  # after "field:" on the first matching line, or an empty string when no
  # line starts with "field:".
  proc get_info_field {info field} {
      set key "${field}:"
      set klen [string length $key]

      foreach raw [split $info "\n"] {
          set entry [string trim $raw "\r\n "]
          if {[string equal -length $klen $key $entry]} {
              return [string range $entry $klen end]
          }
      }

      return {}
  }
  # Return 'field' from the INFO output of Sentinel number 'n'.
  proc SI {n field} {
      set info [S $n info]
      return [get_info_field $info $field]
  }
  # Return 'field' from the INFO output of Redis instance number 'n'.
  proc RI {n field} {
      set info [R $n info]
      return [get_info_field $info $field]
  }
  # Return the port Redis instance 'n' reports through CONFIG GET: the
  # "tls-port" setting when ::tls is enabled, "port" otherwise.
  proc RPort {n} {
      set directive [expr {$::tls ? "tls-port" : "port"}]
      return [lindex [R $n config get $directive] 1]
  }
  # Loop construct: run 'code' in the caller's scope once per element of
  # 'instances', with the caller's variable 'idvar' set to the element index
  # (0, 1, 2, ...).
  #
  # break and continue inside 'code' behave as in a regular loop, errors are
  # raised again with their original errorInfo/errorCode, and any other
  # completion code (e.g. return) is passed on to the caller.
  proc foreach_instance_id {instances idvar code} {
      upvar 1 $idvar id
      set total [llength $instances]

      for {set id 0} {$id < $total} {incr id} {
          set errcode [catch {uplevel 1 $code} result]
          switch -exact -- $errcode {
              0 {
                  # Normal completion: go on with the next id.
              }
              1 {
                  error $result $::errorInfo $::errorCode
              }
              3 {
                  break
              }
              4 {
                  continue
              }
              default {
                  return -code $errcode $result
              }
          }
      }
  }
  # foreach_instance_id over ::sentinel_instances; 'code' runs in the
  # caller's scope and its completion code is handed back unchanged.
  proc foreach_sentinel_id {idvar code} {
      set script [list foreach_instance_id $::sentinel_instances $idvar $code]
      set errcode [catch {uplevel 1 $script} result]
      return -code $errcode $result
  }
  # foreach_instance_id over ::redis_instances; 'code' runs in the caller's
  # scope and its completion code is handed back unchanged.
  proc foreach_redis_id {idvar code} {
      set script [list foreach_instance_id $::redis_instances $idvar $code]
      set errcode [catch {uplevel 1 $script} result]
      return -code $errcode $result
  }
  # Return attribute 'attrib' of instance number 'id' from the global list
  # ::<type>_instances.
  proc get_instance_attrib {type id attrib} {
      set instances [set ::${type}_instances]
      set instance [lindex $instances $id]
      return [dict get $instance $attrib]
  }
  # Store 'newval' as attribute 'attrib' of instance number 'id' in the
  # global list ::<type>_instances.
  proc set_instance_attrib {type id attrib newval} {
      set varname ::${type}_instances
      set entry [lindex [set $varname] $id]
      dict set entry $attrib $newval
      lset $varname $id $entry
  }
  # Arrange the Redis instances as one master with n-1 slaves.
  #
  # Instance 0 becomes the master (and is flushed), instances 1..n-1 are made
  # slaves of instance 0, and any instance with id >= n is detached with
  # "slaveof no one". The proc then waits until the master reports n-1
  # connected slaves and fails the test otherwise.
  proc create_redis_master_slave_cluster n {
      foreach_redis_id id {
          if {$id > 0 && $id < $n} {
              R $id slaveof [get_instance_attrib redis 0 host] \
                  [get_instance_attrib redis 0 port]
          } else {
              # Either our master (id 0) or an instance not part of the
              # cluster: both must not replicate from anybody.
              R $id slaveof no one
              if {$id == 0} {
                  R $id flushall
              }
          }
      }

      # Wait for all the slaves to sync.
      wait_for_condition 1000 50 {
          [RI 0 connected_slaves] == ($n-1)
      } else {
          fail "Unable to create a master-slaves cluster."
      }
  }
  # Return the id of the first instance of 'type' whose stored port equals
  # 'port'. Calls fail when there is no such instance.
  proc get_instance_id_by_port {type port} {
      foreach_${type}_id id {
          if {[get_instance_attrib $type $id port] != $port} continue
          return $id
      }
      fail "Instance $type port $port not found."
  }
  # Stop the instance of the specified type/id through stop_instance.
  #
  # The instance PID is then recorded as -1 to remember that it is no longer
  # running, its link is replaced by a placeholder name, and the old PID is
  # removed from the list of pids that we kill at cleanup. Killing the same
  # instance twice raises an error.
  #
  # The instance can be restarted with restart_instance.
  proc kill_instance {type id} {
      set pid [get_instance_attrib $type $id pid]
      set port [get_instance_attrib $type $id port]
      if {$pid == -1} {
          error "You tried to kill $type $id twice."
      }

      stop_instance $pid
      set_instance_attrib $type $id pid -1
      set_instance_attrib $type $id link you_tried_to_talk_with_killed_instance

      # Remove the PID from the list of pids to kill at exit.
      set ::pids [lsearch -all -inline -not -exact $::pids $pid]

      # Wait for the port it was using to be available again, so that it is
      # not an issue to start a new server ASAP with the same port. A failed
      # connect means nothing is listening there any more.
      for {set retry 99} {$retry != 0} {incr retry -1} {
          if {[catch {set s [socket 127.0.0.1 $port]}]} {
              break
          }
          catch {close $s}
          after 100
      }
      if {$retry == 0} {
          error "Port $port does not return available after killing instance."
      }
  }
  # Return true if the instance of the specified type/id is marked as killed,
  # i.e. its stored pid is -1.
  proc instance_is_killed {type id} {
      return [expr {[get_instance_attrib $type $id pid] == -1}]
  }
  # Restart an instance previously killed by kill_instance.
  #
  # The process is started again from its existing directory and config
  # file, the new pid is stored in the instance and appended to ::pids, and a
  # fresh link is created. The test run is aborted if the server cannot be
  # reached. Returns only once PING no longer reports LOADING.
  proc restart_instance {type id} {
      set dirname "${type}_${id}"
      set cfgfile [file join $dirname $type.conf]
      # Use the host recorded for the instance, the same one the original
      # link was created with, rather than assuming the loopback address.
      set host [get_instance_attrib $type $id host]
      set port [get_instance_attrib $type $id port]

      # Execute the instance with its old setup and append the new pid
      # for cleanup.
      set pid [exec_instance $type $dirname $cfgfile]
      set_instance_attrib $type $id pid $pid
      lappend ::pids $pid

      # Check that the instance is running
      if {[server_is_up $host $port 100] == 0} {
          set logfile [file join $dirname log.txt]
          # Best effort: a missing log must not prevent the abort below.
          if {[catch {exec tail $logfile} logtail]} {
              set logtail "(unable to read $logfile: $logtail)"
          }
          puts $logtail
          abort_sentinel_test "Problems starting $type #$id: ping timeout, maybe server start failed, check $logfile"
      }

      # Connect with it with a fresh link
      set link [redis $host $port 0 $::tls]
      $link reconnect 1
      set_instance_attrib $type $id link $link

      # Make sure the instance is not loading the dataset when this
      # function returns. The reply (or error text) of PING ends up in
      # retval; only a LOADING answer keeps us waiting.
      while 1 {
          catch {$link ping} retval
          if {![string match {*LOADING*} $retval]} {
              break
          }
          after 100
      }
  }
  # Open a new connection to the instance type/id, passing 1 as the third
  # argument of 'redis' (the deferring flag, going by this proc's name), and
  # return it.
  proc redis_deferring_client {type id} {
      set host [get_instance_attrib $type $id host]
      set port [get_instance_attrib $type $id port]
      return [redis $host $port 1 $::tls]
  }
  # Open a new connection to the instance type/id, passing 0 as the third
  # argument of 'redis', and return it.
  proc redis_client {type id} {
      set host [get_instance_attrib $type $id host]
      set port [get_instance_attrib $type $id port]
      return [redis $host $port 0 $::tls]
  }