corrupt-dump-fuzzer.tcl 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217
  1. # tests of corrupt ziplist payload with valid CRC
  2. tags {"dump" "corruption" "external:skip"} {
  # When the Tclx extension can be loaded, have SIGTERM raise a Tcl error.
  # If one of the random commands hangs the test (usually because redis never
  # put a response in the output buffers), that error tells us which command
  # it was.
  if {[catch {package require Tclx} err] == 0} {
      signal error SIGTERM
  }
  # Fill one key of each collection type (hash, list, zset, set and stream)
  # with `elements` entries. Key names are the type name followed by `suffix`.
  #
  # suffix   - string appended to each key name
  # elements - number of entries added to every collection
  #
  # The commands are sent through a deferring client; once everything has been
  # sent, the replies are read and discarded and the client is closed.
  proc generate_collections {suffix elements} {
      set client [redis_deferring_client]
      for {set i 0} {$i < $elements} {incr i} {
          # odd indexes get a string value, even indexes an integer value
          if {$i % 2} {
              set val "_$i"
          } else {
              set val $i
          }
          $client hset hash$suffix $i $val
          $client lpush list$suffix $val
          $client zadd zset$suffix $i $val
          $client sadd set$suffix $val
          $client xadd stream$suffix * item 1 value $val
      }
      # five commands were sent per element: discard their replies
      set replies [expr {$elements * 5}]
      for {set n 0} {$n < $replies} {incr n} {
          $client read
      }
      $client close
  }
  # Populate the dataset with keys of various types and encodings.
  proc generate_types {} {
      # cap every compact encoding at 5 entries
      foreach {option limit} {
          list-max-ziplist-size    5
          hash-max-ziplist-entries 5
          zset-max-ziplist-entries 5
          stream-node-max-entries  5
      } {
          r config set $option $limit
      }

      # small (ziplist / listpack encoded) objects holding 3 items each
      generate_collections "" 3

      # attach some metadata to the small stream: create a consumer group,
      # read two entries through it, delete the second one and ack the first
      r xgroup create stream mygroup 0
      set records [r xreadgroup GROUP mygroup Alice COUNT 2 STREAMS stream >]
      set entries [lindex $records 0 1]
      r xdel stream [lindex $entries 1 0]
      r xack stream mygroup [lindex $entries 0 0]

      # non-collection types
      r incr int
      r set string str

      # bigger objects with 10 items (more than a single ziplist / listpack)
      generate_collections big 10

      # make sure the big stream also holds a listpack record whose field
      # names differ from the ones the master recorded
      r xadd streambig * item 1 value 1
      r xadd streambig * item 1 unique value
  }
  # Return a copy of `payload` in which one randomly chosen byte (two bytes,
  # roughly 10% of the time) has been overwritten with a random byte value.
  #
  # payload - binary string to corrupt, e.g. the reply of DUMP
  #
  # The result always has the same length as the input. An overwritten byte
  # may by chance receive the value it already had. An empty payload is
  # returned unchanged.
  proc corrupt_payload {payload} {
      set len [string length $payload]
      if {$len == 0} {
          # nothing to overwrite
          return $payload
      }
      set count 1 ;# usually corrupt only one byte
      if {rand() > 0.9} { set count 2 }
      while {$count > 0} {
          set idx [expr {int(rand() * $len)}]
          # scale by 256 (not 255) so every byte value 0x00-0xFF can be drawn
          set ch [binary format c [expr {int(rand() * 256)}]]
          set payload [string replace $payload $idx $idx $ch]
          incr count -1
      }
      return $payload
  }
  # Fuzz tester for corrupt RESTORE payloads, run once with
  # sanitize-dump-payload set to no and once with it set to yes.
  # Under valgrind this also makes sure the error handling code of the rdb
  # loader does not leak.
  foreach sanitize_dump {no yes} {
      if {$::accurate} {
          set min_duration 600 ;# run at least 10 minutes
          set min_cycles 1000 ;# run at least 1k cycles (max 16 minutes)
      } else {
          set min_duration 10 ;# run at least 10 seconds
          set min_cycles 10 ;# run at least 10 cycles
      }

      # Don't really execute this on FreeBSD, due to a yet-undiscovered memory
      # issue which causes tclsh to bloat.
      if {[exec uname] eq "FreeBSD"} {
          set min_cycles 1
          set min_duration 1
      }

      test "Fuzzer corrupt restore payloads - sanitize_dump: $sanitize_dump" {
          if {$min_duration * 2 > $::timeout} {
              fail "insufficient timeout"
          }

          # start a server, fill it with data and save an RDB file once (avoid re-save)
          set server_overrides [list "save" "" use-exit-on-panic yes crash-memcheck-enabled no loglevel verbose]
          start_server [list overrides $server_overrides] {
              set stdout [srv 0 stdout]
              r config set sanitize-dump-payload $sanitize_dump
              r debug set-skip-checksum-validation 1
              set start_time [clock seconds]
              generate_types
              set dbsize [r dbsize]
              r save

              set cycle 0
              foreach counter {
                  stat_terminated_in_restore
                  stat_terminated_in_traffic
                  stat_terminated_by_signal
                  stat_successful_restore
                  stat_rejected_restore
                  stat_traffic_commands_sent
              } {
                  set $counter 0
              }

              # each cycle: DUMP a random key, corrupt the payload and try to
              # RESTORE it into a new key
              while {1} {
                  set k [r randomkey]
                  set dump [corrupt_payload [r dump $k]]
                  set printable_dump [string2printable $dump]
                  set restore_failed false
                  set report_and_restart false
                  set sent {}

                  # RESTORE may fail, but hopefully it does not terminate the server
                  if {![catch {r restore "_$k" 0 $dump REPLACE} err]} {
                      # an attempt to check that the server didn't terminate
                      # (this will throw an error that will terminate the tests)
                      r ping
                  } else {
                      set restore_failed true
                      if {[string match "ERR*" $err]} {
                          # the payload was rejected with an error reply: just count it
                          incr stat_rejected_restore
                      } else {
                          set report_and_restart true
                          incr stat_terminated_in_restore
                          write_log_line 0 "corrupt payload: $printable_dump"
                          if {$sanitize_dump eq "yes"} {
                              puts "Server crashed in RESTORE with payload: $printable_dump"
                          }
                      }
                  }

                  set print_commands false
                  if {!$restore_failed} {
                      # RESTORE neither failed nor terminated the server:
                      # run some random traffic on the new key
                      incr stat_successful_restore
                      set traffic_failed [catch {
                          set sent [generate_fuzzy_traffic_on_key "_$k" 1] ;# traffic for 1 second
                          incr stat_traffic_commands_sent [llength $sent]
                          r del "_$k" ;# in case the server terminated, here's where we'll detect it.
                          if {$dbsize != [r dbsize]} {
                              puts "unexpected keys"
                              puts "keys: [r keys *]"
                              puts $sent
                              exit 1
                          }
                      } err]
                      if {$traffic_failed} {
                          # the server terminated: update the stats and restart it below
                          set report_and_restart true
                          incr stat_terminated_in_traffic
                          set by_signal [count_log_message 0 "crashed by signal"]
                          incr stat_terminated_by_signal $by_signal
                          if {$by_signal != 0 || $sanitize_dump eq "yes"} {
                              puts "Server crashed (by signal: $by_signal), with payload: $printable_dump"
                              set print_commands true
                          }
                      }
                  }

                  # look at the valgrind report after every RESTORE payload,
                  # so that a reported issue is easier to reproduce
                  set valgrind_errors [find_valgrind_errors [srv 0 stderr] false]
                  if {$valgrind_errors ne ""} {
                      puts "valgrind found an issue for payload: $printable_dump"
                      set report_and_restart true
                      set print_commands true
                  }

                  if {$report_and_restart} {
                      if {$print_commands} {
                          puts "violating commands:"
                          foreach cmd $sent {
                              set line ""
                              foreach arg $cmd {
                                  append line "[string2printable $arg] "
                              }
                              puts $line
                          }
                      }

                      # restart the server and re-apply the debug configuration
                      write_log_line 0 "corrupt payload: $printable_dump"
                      restart_server 0 true true
                      r config set sanitize-dump-payload $sanitize_dump
                      r debug set-skip-checksum-validation 1
                  }

                  incr cycle
                  # stop once both the minimum duration and the minimum cycle count are met
                  set elapsed [expr {[clock seconds] - $start_time}]
                  if {$elapsed >= $min_duration && $cycle >= $min_cycles} {
                      break
                  }
              }

              if {$::verbose} {
                  puts "Done $cycle cycles in [expr {[clock seconds]-$start_time}] seconds."
                  puts "RESTORE: successful: $stat_successful_restore, rejected: $stat_rejected_restore"
                  puts "Total commands sent in traffic: $stat_traffic_commands_sent, crashes during traffic: $stat_terminated_in_traffic ($stat_terminated_by_signal by signal)."
              }
          }

          # with sanitization enabled the server is never expected to crash at runtime
          if {$sanitize_dump eq "yes"} {
              assert_equal $stat_terminated_in_restore 0
              assert_equal $stat_terminated_in_traffic 0
          }

          # make sure all terminations were due to an assertion and not a SIGSEGV
          assert_equal $stat_terminated_by_signal 0
      }
  }
  194. } ;# tags