diff --git a/.gitignore b/.gitignore index 5b4676a24..bdc5b754d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,7 @@ /.vendor/ .idea/ *.tmp + +tools/ +*__failpoint_binding__.go +*.go__failpoint_stash__ diff --git a/go.mod b/go.mod index 11a57a8d8..764dd9a6f 100644 --- a/go.mod +++ b/go.mod @@ -55,6 +55,7 @@ require ( github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/pingcap/errors v0.11.5-0.20260310054046-9c8b3586e4b2 // indirect + github.com/pingcap/failpoint v0.0.0-20260521055755-e7642935314f // indirect github.com/pingcap/log v1.1.1-0.20260227082333-572e590d08f1 // indirect github.com/pingcap/tidb/pkg/parser v0.0.0-20260504140133-511dba1dbe17 // indirect github.com/pkg/errors v0.9.1 // indirect diff --git a/go.sum b/go.sum index dcb552f90..d4aae4ccd 100644 --- a/go.sum +++ b/go.sum @@ -117,6 +117,8 @@ github.com/pingcap/errors v0.11.5-0.20240311024730-e056997136bb h1:3pSi4EDG6hg0o github.com/pingcap/errors v0.11.5-0.20240311024730-e056997136bb/go.mod h1:X2r9ueLEUZgtx2cIogM0v4Zj5uvvzhuuiu7Pn8HzMPg= github.com/pingcap/errors v0.11.5-0.20260310054046-9c8b3586e4b2 h1:cLgCk5mwDG9lDH+dPK8TmEliTjyGJwwKN0qevWAl8IY= github.com/pingcap/errors v0.11.5-0.20260310054046-9c8b3586e4b2/go.mod h1:ktAJCA9lxrHHjVyVl2pKJFvzBnq2eZbb+CUOjBRPlXo= +github.com/pingcap/failpoint v0.0.0-20260521055755-e7642935314f h1:cDo4qNgaQc2POMWTXjNrMA7yySdIF/d1AaW8kOA7qOs= +github.com/pingcap/failpoint v0.0.0-20260521055755-e7642935314f/go.mod h1:jimwlLpI/XtwQdlZML15HS+j4rirvwZM0GLY07wwgOo= github.com/pingcap/log v1.1.1-0.20230317032135-a0d097d16e22 h1:2SOzvGvE8beiC1Y4g9Onkvu6UmuBBOeWRGQEjJaT/JY= github.com/pingcap/log v1.1.1-0.20230317032135-a0d097d16e22/go.mod h1:DWQW5jICDR7UJh4HtxXSM20Churx4CQL0fwL/SoOSA4= github.com/pingcap/log v1.1.1-0.20260227082333-572e590d08f1 h1:A2bEfgSb7hLwR9mxDszgGKweF+xY9YoTDG+8RjdFjDE= diff --git a/go/base/context.go b/go/base/context.go index cccf44f33..a21b5cb04 100644 --- a/go/base/context.go 
+++ b/go/base/context.go @@ -22,9 +22,10 @@ import ( "github.com/github/gh-ost/go/metrics" "github.com/github/gh-ost/go/mysql" "github.com/github/gh-ost/go/sql" - "github.com/openark/golib/log" "github.com/go-ini/ini" + "github.com/openark/golib/log" + "github.com/pingcap/failpoint" ) // RowsEstimateMethod is the type of row number estimation @@ -312,6 +313,8 @@ type MigrationContext struct { DrainGTID mysql.BinlogCoordinates // Source @@gtid_executed captured immediately after the source RENAME TABLE; the applier drains until it reaches this coordinate (move-tables only). } + UnsafeFailPointsEnabled bool // Test-only switch set by --unsafe-fail-points-enabled; NewFailPoint is a no-op unless it is true. + Log Logger } @@ -1206,3 +1209,41 @@ func SendWithContext[T any](ctx context.Context, ch chan<- T, val T) error { return ctx.Err() } } + +// failPointOpts holds the optional settings applied to a single fail point. +type failPointOpts struct { + wait time.Duration // delay between the fail point firing and the panic +} + +// FailPointOpt is a functional option that adjusts how a fail point behaves. +type FailPointOpt func(*failPointOpts) + +// WithFailPointWait sets the time for a fail point to wait before exiting. +func WithFailPointWait(wait time.Duration) FailPointOpt { + return func(opts *failPointOpts) { + opts.wait = wait + } +} + +// NewFailPoint marks a test-only fail point identified by name. It does nothing +// unless UnsafeFailPointsEnabled is set. When the fail point fires, it logs the +// hit, sleeps for the configured wait (if any), and then panics. Despite the +// "New" prefix, no value is constructed or returned. +func (mctx *MigrationContext) NewFailPoint(name string, opts ...FailPointOpt) { + if !mctx.UnsafeFailPointsEnabled { + return + } + + var fpo failPointOpts + for _, opt := range opts { + opt(&fpo) + } + + failpoint.Inject(name, func(_ failpoint.Value) { + mctx.Log.Debugf("[TEST] Encountered fail point: '%s'", name) + if fpo.wait > 0 { + time.Sleep(fpo.wait) + } + panic(fmt.Sprintf("[TEST] Encountered fail point: '%s'", name)) + }) +} diff --git a/go/cmd/gh-ost/main.go b/go/cmd/gh-ost/main.go index 0cf4f6121..6c9765b5f 100644 --- a/go/cmd/gh-ost/main.go +++ b/go/cmd/gh-ost/main.go @@ -197,6 +197,9 @@ func main() { flag.StringVar(&migrationContext.MoveTables.TargetDatabase, "target-database", "", "Target MySQL database name for --move-tables mode. 
If not provided, uses the same database name as the source connection") flag.BoolVar(&migrationContext.MoveTables.AllowOnSourcePrimary, "allow-on-source-primary", false, "allow --move-tables to read (schema, row copy, binlog) from the source cluster's primary. By default gh-ost stops if --host is the primary; prefer pointing --host at a replica to spare the primary the copy load.") + // unsafe fail points, for integration testing purposes + flag.BoolVar(&migrationContext.UnsafeFailPointsEnabled, "unsafe-fail-points-enabled", false, "UNSAFE: Enable fail points for integration testing purposes. Do not use in production.") + flag.CommandLine.SetOutput(os.Stdout) flag.Parse() cutOverLockTimeoutUserSpecified := false @@ -345,7 +348,9 @@ func main() { if *storageEngine == "rocksdb" { migrationContext.Log.Warning("RocksDB storage engine support is experimental") } - if migrationContext.CheckpointIntervalSeconds < 10 { + // ignore low checkpoint intervals in unsafe mode as frequent checkpoints are required to reliably + // reduce test duration + if migrationContext.CheckpointIntervalSeconds < 10 && !migrationContext.UnsafeFailPointsEnabled { migrationContext.Log.Fatalf("--checkpoint-seconds should be >=10") } if migrationContext.CountTableRows && migrationContext.PanicOnWarnings { diff --git a/go/logic/migrator.go b/go/logic/migrator.go index 0bae5e5a4..700031cc3 100644 --- a/go/logic/migrator.go +++ b/go/logic/migrator.go @@ -1386,6 +1386,9 @@ func (mgtr *Migrator) moveTablesCutOver() (err error) { } } + mgtr.migrationContext.NewFailPoint("move-tables-panic-before-drain-completion", base.WithFailPointWait(2*time.Second)) + + // ------ T3: draining applier to drain GTID ----------- if err := mgtr.drainMoveTablesCutOver(drainGTID); err != nil { return err } @@ -1403,6 +1406,8 @@ func (mgtr *Migrator) moveTablesCutOver() (err error) { atomic.StoreInt64(&mgtr.migrationContext.CutOverCompleteFlag, 1) mgtr.migrationContext.Log.Debugf("T4: CutOverCompleteFlag set") + 
mgtr.migrationContext.NewFailPoint("move-tables-panic-before-on-success-hook", base.WithFailPointWait(2*time.Second)) + // ----- T5: on-success hook ----- // Hook unlocks user_rw@target via db-user-management and flips the // write_cutover? feature flag. Standard env vars only — GH_OST_DRAIN_GTID + @@ -2509,6 +2514,8 @@ func (mgtr *Migrator) iterateChunks() error { } return terminateRowIteration(err) } + + mgtr.migrationContext.NewFailPoint("move-tables-panic-after-row-copy", base.WithFailPointWait(2*time.Second)) } } diff --git a/localtests/move-tables-test.sh b/localtests/move-tables-test.sh index e74db7545..a3b2ada46 100755 --- a/localtests/move-tables-test.sh +++ b/localtests/move-tables-test.sh @@ -36,6 +36,7 @@ original_sql_mode= current_gtid_mode= test_timeout=120 test_failure_log_tail_lines=50 +tables_to_migrate=() OPTIND=1 while getopts "b:s:dg" OPTION; do @@ -175,8 +176,13 @@ build_ghost_command() { # Build gh-ost command with all standard options # # expected $1 to be a comma-separated list of tables to move + + # build comma-separated list of tables to move + move_tables_arg=$(IFS=, ; echo "${tables_to_migrate[*]}") + + # NOTE(chriskirkland): fully qualified package name + failpoint name cmd="GOTRACEBACK=crash $ghost_binary \ - --move-tables=$1 \ + --move-tables=$move_tables_arg \ --user=root \ --password=opensesame \ --host=$source_replica_host \ @@ -197,7 +203,13 @@ build_ghost_command() { --stack \ --checkpoint \ --postpone-cut-over-flag-file=$postpone_cutover_flag_file \ + --checkpoint-seconds=1 \ + --unsafe-fail-points-enabled \ --execute ${extra_args[@]}" + + if [ -n "$GO_FAILPOINTS" ]; then + cmd="GO_FAILPOINTS=\"$GO_FAILPOINTS\" $cmd" + fi } print_log_excerpt() { @@ -348,46 +360,47 @@ test_single() { wait $test_pid 2>/dev/null execution_result=$? return $execution_result - fi - # kick off the on_test script for the test. this enables arbitrary custom logic - # concurrent with the gh-ost process. 
this enables additional scenarios like - # streaming of writes prior to the write cutover. - # - # IMPORTANT: The on-test script is executed in the background and will be killed as soon - # as the gh-ost process terminates. - if [ -f $tests_path/$test_name/on_test.sh ]; then - $tests_path/$test_name/on_test.sh &> /dev/null & - on_test_pid=$! - fi + else - # queue up removal of the postpone cutover flag, otherwise gh-ost hangs on the cutover - ( - sleep 1; - echo "⏩ Sending unpostpone cutover" - rm $postpone_cutover_flag_file &> /dev/null; - ) & + # kick off the on_test script for the test. this enables arbitrary custom logic + # concurrent with the gh-ost process. this enables additional scenarios like + # streaming of writes prior to the write cutover. + # + # IMPORTANT: The on-test script is executed in the background and will be killed as soon + # as the gh-ost process terminates. + if [ -f $tests_path/$test_name/on_test.sh ]; then + $tests_path/$test_name/on_test.sh &> /dev/null & + on_test_pid=$! + fi - # Build and execute gh-ost command - move_tables_arg=$(IFS=, ; echo "${tables_to_migrate[*]}") - build_ghost_command "$move_tables_arg" - echo_dot - echo $cmd >$exec_command_file - echo_dot - timeout $test_timeout bash $exec_command_file >$test_logfile 2>&1 + # queue up removal of the postpone cutover flag, otherwise gh-ost hangs on the cutover + ( + sleep 1; + echo "⏩ Sending unpostpone cutover" + rm $postpone_cutover_flag_file &> /dev/null; + ) & - execution_result=$? + # Build and execute gh-ost command + build_ghost_command + echo_dot + echo $cmd >$exec_command_file + echo_dot + timeout $test_timeout bash $exec_command_file >$test_logfile 2>&1 - if [ -n "$on_test_pid" ]; then - kill -KILL $on_test_pid &>/dev/null - fi + execution_result=$? 
- # Check for timeout (exit code 124) - if [ $execution_result -eq 124 ]; then - echo - echo "ERROR $test_name execution timed out" - print_log_excerpt - return 1 + if [ -n "$on_test_pid" ]; then + kill -KILL $on_test_pid &>/dev/null + fi + + # Check for timeout (exit code 124) + if [ $execution_result -eq 124 ]; then + echo + echo "ERROR $test_name execution timed out" + print_log_excerpt + return 1 + fi fi if [ -f $tests_path/$test_name/sql_mode ]; then @@ -453,7 +466,32 @@ test_single() { done } +# enable_failpoint installs failpoint-ctl into tools/bin when it is missing, then +# rewrites the Go sources under ./go so the failpoint markers become active code. +enable_failpoint() { + mkdir -p $repo_root/tools/bin + if [ ! -f $repo_root/tools/bin/failpoint-ctl ]; then + echo "⚙️ Installing failpoint" + GOBIN=$repo_root/tools/bin go install github.com/pingcap/failpoint/failpoint-ctl@v0.0.0-20220801062533-2eaa32854a6c || { echo "ERROR: failed to install failpoint-ctl"; exit 1; } + fi + + echo "⚙️ Enabling failpoint" + $repo_root/tools/bin/failpoint-ctl enable go || { echo "ERROR: failed to enable failpoint"; exit 1; } + + echo "✅ Successfully enabled failpoint" +} + +# disable_failpoint restores the Go sources under ./go that enable_failpoint rewrote. +disable_failpoint() { + echo "⚙️ Disabling failpoint" + $repo_root/tools/bin/failpoint-ctl disable go || { echo "ERROR: failed to disable failpoint"; exit 1; } + + echo "✅ Successfully disabled failpoint" +} + build_binary() { + enable_failpoint + echo "Building" rm -f $default_ghost_binary [ "$ghost_binary" == "" ] && ghost_binary="$default_ghost_binary" @@ -468,6 +506,9 @@ build_binary() { echo "Build failure" + disable_failpoint # undo the source rewrite before bailing out exit 1 fi + + disable_failpoint } test_all() { diff --git a/localtests/move-tables/resume-panic-before-drain-complete/create.sql b/localtests/move-tables/resume-panic-before-drain-complete/create.sql new file mode 100644 index 000000000..46e919003 --- /dev/null +++ b/localtests/move-tables/resume-panic-before-drain-complete/create.sql @@ -0,0 +1,34 @@ +drop table if exists gh_ost_test; +create table gh_ost_test ( + id bigint(20) NOT NULL AUTO_INCREMENT, + column1 int(11) NOT NULL, + column2 smallint(5) unsigned NOT NULL, + column3 mediumint(8) unsigned NOT NULL, + column4 tinyint(3) unsigned NOT NULL, + column5 int(11) NOT NULL, + column6 int(11) NOT NULL, + PRIMARY KEY (id), + KEY c12_ix (column1, column2) +) auto_increment=1; + 
+insert into gh_ost_test values + (NULL, 1001, 100, 500000, 10, 1700000001, 1700000002), + (NULL, 1002, 200, 600000, 20, 1700000003, 1700000004), + (NULL, 1003, 300, 700000, 30, 1700000005, 1700000006), + (NULL, 1004, 400, 800000, 40, 1700000007, 1700000008), + (NULL, 1005, 500, 900000, 50, 1700000009, 1700000010), + (NULL, 1006, 600, 1000000, 60, 1700000011, 1700000012), + (NULL, 1007, 700, 1100000, 70, 1700000013, 1700000014), + (NULL, 1008, 800, 1200000, 80, 1700000015, 1700000016), + (NULL, 1009, 900, 1300000, 90, 1700000017, 1700000018), + (NULL, 1010, 1000, 1400000, 100, 1700000019, 1700000020), + (NULL, 1011, 1100, 1500000, 110, 1700000021, 1700000022), + (NULL, 1012, 1200, 1600000, 120, 1700000023, 1700000024), + (NULL, 1013, 1300, 1700000, 130, 1700000025, 1700000026), + (NULL, 1014, 1400, 1800000, 140, 1700000027, 1700000028), + (NULL, 1015, 1500, 1900000, 150, 1700000029, 1700000030), + (NULL, 1016, 1600, 2000000, 160, 1700000031, 1700000032), + (NULL, 1017, 1700, 2100000, 170, 1700000033, 1700000034), + (NULL, 1018, 1800, 2200000, 180, 1700000035, 1700000036), + (NULL, 1019, 1900, 2300000, 190, 1700000037, 1700000038), + (NULL, 1020, 2000, 2400000, 200, 1700000039, 1700000040); \ No newline at end of file diff --git a/localtests/move-tables/resume-panic-before-drain-complete/tables.txt b/localtests/move-tables/resume-panic-before-drain-complete/tables.txt new file mode 100644 index 000000000..11fc5eef8 --- /dev/null +++ b/localtests/move-tables/resume-panic-before-drain-complete/tables.txt @@ -0,0 +1 @@ +gh_ost_test diff --git a/localtests/move-tables/resume-panic-before-drain-complete/test.sh b/localtests/move-tables/resume-panic-before-drain-complete/test.sh new file mode 100644 index 000000000..39392d53a --- /dev/null +++ b/localtests/move-tables/resume-panic-before-drain-complete/test.sh @@ -0,0 +1,121 @@ + +#!/bin/bash +# Custom test: +# - panic after RENAME (T1) and prior to drain completion (T3), prior to cutover completion +# - validate RENAME 
and source writes are not possible +# - resume and complete the migration + +set -x + +database=test +table_name=gh_ost_test + +# Build gh-ost command from scratch using framework function (required to inject failpoints) +rm $ghost_binary +build_binary + +###################################################################################################### +### Run #1: Should panic after RENAME (T1) and before drain completion (T3) +###################################################################################################### + +echo "⚙️ Starting migration with failpoint (run #1)..." + +# Build the gh-ost command using the framework function +GO_FAILPOINTS="github.com/github/gh-ost/go/base/move-tables-panic-before-drain-completion=return(true)" build_ghost_command + +# queue up removal of the postpone cutover flag, otherwise gh-ost hangs on the cutover +( + sleep 2; + echo "⏩ Sending unpostpone cutover" + rm $postpone_cutover_flag_file &> /dev/null; +) & + +# Run the gh-ost command, expecting panic on the failpoint the first time +echo_dot +echo > $test_logfile +bash -c "$cmd" >>$test_logfile 2>&1 +ghost_result=$? + +if [ $ghost_result -eq 0 ]; then + echo "ERROR: gh-ost should have failed but did not." + return 1 +fi + +echo -e "\n\n\n\n\n" + +###################################################################################################### +### Intermediate validation +###################################################################################################### + +echo "⚙️ Validating checkpointed state on unexpected exit..." + +# Table was renamed on source +mysql-exec source primary $database -sNe "SELECT 1 FROM ${table_name} LIMIT 1;" +if [ $? -eq 0 ]; then + echo "ERROR: Table '${table_name}' exists on source but show have been renamed." + return 1 +fi + +mysql-exec source primary $database -sNe "SELECT 1 FROM _${table_name}_del LIMIT 1;" +if [ $? -gt 0 ]; then + echo "ERROR: Renamed table '_${table_name}_del' does not exist on source." 
+ return 1 +fi + +# Table not writeable on source +mysql-exec source primary $database -e "INSERT INTO ${table_name} VALUES (NULL, 1021, 2001, 2400001, 201, 1700000041, 1700000041);" +if [ $? -eq 0 ]; then + echo "ERROR: Table '${table_name}' was writeable on source but should not be!." + return 1 +fi + +# Table still exists on target +mysql-exec target primary $database -sNe "SELECT 1 FROM ${table_name} LIMIT 1;" +if [ $? -gt 0 ]; then + echo "ERROR: Table '${table_name}' does not exist on target." + return 1 +fi + +# validate last checkpoint (cutover started and drain GTID are set) +cutover_started=$(mysql-exec target primary $database -Ne "SELECT gh_ost_move_tables_cutover_started FROM _${table_name}_ghk ORDER BY gh_ost_chk_id DESC LIMIT 1;") +if [ "$cutover_started" != 1 ]; then + echo "ERROR: Expected cutover started to be set in last checkpoint." + return 1 +fi + +drain_gtid=$(mysql-exec target primary $database -Ne "SELECT gh_ost_move_tables_drain_gtid FROM _${table_name}_ghk ORDER BY gh_ost_chk_id DESC LIMIT 1;") +if [ "$drain_gtid" == "" ]; then + echo "ERROR: Expected drain GTID to be set in last checkpoint." + return 1 +fi + +echo "✅ Validated checkpointed state on unexpected exit..." + +echo -e "\n\n\n\n\n" + +###################################################################################################### +### Run #2: Resume and complete the migration +###################################################################################################### + +echo "⚙️ Resuming migration (run #2)..." + +# resume migration +build_ghost_command +cmd="$cmd --resume" + +# queue up removal of the postpone cutover flag, otherwise gh-ost hangs on the cutover +( + sleep 2; + echo "⏩ Sending unpostpone cutover" + rm $postpone_cutover_flag_file &> /dev/null; +) & + +bash -c "$cmd" >>$test_logfile 2>&1 +ghost_result=$? + +if [ $ghost_result -ne 0 ]; then + echo "ERROR: gh-ost should have succeeded but did not. 
($ghost_result)" + return 1 +fi + +echo -e "\n\n\n\n\n" diff --git a/localtests/move-tables/resume-panic-before-on-success-hook/create.sql b/localtests/move-tables/resume-panic-before-on-success-hook/create.sql new file mode 100644 index 000000000..46e919003 --- /dev/null +++ b/localtests/move-tables/resume-panic-before-on-success-hook/create.sql @@ -0,0 +1,34 @@ +drop table if exists gh_ost_test; +create table gh_ost_test ( + id bigint(20) NOT NULL AUTO_INCREMENT, + column1 int(11) NOT NULL, + column2 smallint(5) unsigned NOT NULL, + column3 mediumint(8) unsigned NOT NULL, + column4 tinyint(3) unsigned NOT NULL, + column5 int(11) NOT NULL, + column6 int(11) NOT NULL, + PRIMARY KEY (id), + KEY c12_ix (column1, column2) +) auto_increment=1; + +insert into gh_ost_test values + (NULL, 1001, 100, 500000, 10, 1700000001, 1700000002), + (NULL, 1002, 200, 600000, 20, 1700000003, 1700000004), + (NULL, 1003, 300, 700000, 30, 1700000005, 1700000006), + (NULL, 1004, 400, 800000, 40, 1700000007, 1700000008), + (NULL, 1005, 500, 900000, 50, 1700000009, 1700000010), + (NULL, 1006, 600, 1000000, 60, 1700000011, 1700000012), + (NULL, 1007, 700, 1100000, 70, 1700000013, 1700000014), + (NULL, 1008, 800, 1200000, 80, 1700000015, 1700000016), + (NULL, 1009, 900, 1300000, 90, 1700000017, 1700000018), + (NULL, 1010, 1000, 1400000, 100, 1700000019, 1700000020), + (NULL, 1011, 1100, 1500000, 110, 1700000021, 1700000022), + (NULL, 1012, 1200, 1600000, 120, 1700000023, 1700000024), + (NULL, 1013, 1300, 1700000, 130, 1700000025, 1700000026), + (NULL, 1014, 1400, 1800000, 140, 1700000027, 1700000028), + (NULL, 1015, 1500, 1900000, 150, 1700000029, 1700000030), + (NULL, 1016, 1600, 2000000, 160, 1700000031, 1700000032), + (NULL, 1017, 1700, 2100000, 170, 1700000033, 1700000034), + (NULL, 1018, 1800, 2200000, 180, 1700000035, 1700000036), + (NULL, 1019, 1900, 2300000, 190, 1700000037, 1700000038), + (NULL, 1020, 2000, 2400000, 200, 1700000039, 1700000040); \ No newline at end of file diff --git 
a/localtests/move-tables/resume-panic-before-on-success-hook/hooks/gh-ost-on-success b/localtests/move-tables/resume-panic-before-on-success-hook/hooks/gh-ost-on-success new file mode 100755 index 000000000..a2c0b6926 --- /dev/null +++ b/localtests/move-tables/resume-panic-before-on-success-hook/hooks/gh-ost-on-success @@ -0,0 +1,4 @@ +#!/bin/bash + +# touch file to mark completion of on-success hook +touch /tmp/gh-ost-hooks/on-success \ No newline at end of file diff --git a/localtests/move-tables/resume-panic-before-on-success-hook/tables.txt b/localtests/move-tables/resume-panic-before-on-success-hook/tables.txt new file mode 100644 index 000000000..11fc5eef8 --- /dev/null +++ b/localtests/move-tables/resume-panic-before-on-success-hook/tables.txt @@ -0,0 +1 @@ +gh_ost_test diff --git a/localtests/move-tables/resume-panic-before-on-success-hook/test.sh b/localtests/move-tables/resume-panic-before-on-success-hook/test.sh new file mode 100644 index 000000000..14c68bd67 --- /dev/null +++ b/localtests/move-tables/resume-panic-before-on-success-hook/test.sh @@ -0,0 +1,149 @@ + +#!/bin/bash +# Custom test: +# - panic after drain (T4) and prior to on-success (T5), prior to cutover completion +# - validate RENAME and source writes are not possible +# - validate contents of source and target are the same +# - resume and complete the migration + +database=test +table_name=gh_ost_test + +# Build gh-ost command from scratch using framework function (required to inject failpoints) +rm $ghost_binary +build_binary + +# ensure hook files are executable +chmod +x $tests_path/$test_name/hooks/* + +# clean up any existing test hook files +rm -rf /tmp/gh-ost-hooks/ +mkdir -p /tmp/gh-ost-hooks/ + +###################################################################################################### +### Run #1: Should panic after drain (T4) and before on-success (T5) +###################################################################################################### + + +echo "⚙️ 
Starting migration with failpoint (run #1)..." + +# Build the gh-ost command using the framework function +GO_FAILPOINTS="github.com/github/gh-ost/go/base/move-tables-panic-before-on-success-hook=return(true)" build_ghost_command +cmd="$cmd --hooks-path=$tests_path/$test_name/hooks" + +# queue up removal of the postpone cutover flag, otherwise gh-ost hangs on the cutover +( + sleep 2; + echo "⏩ Sending unpostpone cutover" + rm $postpone_cutover_flag_file &> /dev/null; +) & + +# drive some concurrent writes to the table to exercise queue drain (T3/T4) +( + DATABASE=test script/move-tables/insert-source-primary-loop 100 0.1 10 &>/dev/null & + writes_pid=$! + sleep 3 + kill $writes_pid +) & + +# Run the gh-ost command, expecting panic on the failpoint the first time +echo_dot +echo > $test_logfile +bash -c "$cmd" >>$test_logfile 2>&1 +ghost_result=$? + +if [ $ghost_result -eq 0 ]; then + echo "ERROR: gh-ost should have failed but did not." + return 1 +fi + +echo -e "\n\n\n\n\n" + +###################################################################################################### +### Intermediate validation +###################################################################################################### + +echo "⚙️ Validating checkpointed state on unexpected exit..." + +# Table was renamed on source +mysql-exec source primary $database -sNe "SELECT 1 FROM ${table_name} LIMIT 1;" +if [ $? -eq 0 ]; then + echo "ERROR: Table '${table_name}' exists on source but show have been renamed." + return 1 +fi + +mysql-exec source primary $database -sNe "SELECT 1 FROM _${table_name}_del LIMIT 1;" +if [ $? -gt 0 ]; then + echo "ERROR: Renamed table '_${table_name}_del' does not exist on source." + return 1 +fi + +# Table not writeable on source +mysql-exec source primary $database -sNe "INSERT INTO ${table_name} VALUES (NULL, 1021, 2001, 2400001, 201, 1700000041, 1700000041);" +if [ $? 
-eq 0 ]; then + echo "ERROR: Table '${table_name}' was writeable on source but should not be!." + return 1 +fi + +# Table still exists on target +mysql-exec target primary $database -sNe "SELECT 1 FROM ${table_name} LIMIT 1;" +if [ $? -gt 0 ]; then + echo "ERROR: Table '${table_name}' does not exist on target." + return 1 +fi + +# contents of table on source and target are the same +source_contents_file=/tmp/gh-ost-test.resume-move-tables-panic-before-on-success-hook-source_contents.txt +target_contents_file=/tmp/gh-ost-test.resume-move-tables-panic-before-on-success-hook-target_contents.txt +mysql-exec source primary $database -sNe "SELECT * FROM _${table_name}_del;" > $source_contents_file +mysql-exec target primary $database -sNe "SELECT * FROM ${table_name};" > $target_contents_file + +if ! diff $source_contents_file $target_contents_file; then + echo "ERROR: Contents of table '${table_name}' are not the same on source and target." + echo "---- DIFF -----" + diff --side-by-side $source_contents_file $target_contents_file + echo "---------------" + return 1 +fi + +# validate on-success hook was not called +if [ -f /tmp/gh-ost-hooks/on-success ]; then + echo "ERROR: on-success hook was called when it should not have been." + return 1 +fi + +echo "✅ Validated checkpointed state on unexpected exit..." + +echo -e "\n\n\n\n\n" + +###################################################################################################### +### Run #2: Resume and complete the migration +###################################################################################################### + +echo "⚙️ Resuming migration (run #2)..." 
+ +# resume migration +build_ghost_command +cmd="$cmd --resume --hooks-path=$tests_path/$test_name/hooks" + +# queue up removal of the postpone cutover flag, otherwise gh-ost hangs on the cutover +( + sleep 2; + echo "⏩ Sending unpostpone cutover" + rm $postpone_cutover_flag_file &> /dev/null; +) & + +bash -c "$cmd" >>$test_logfile 2>&1 +ghost_result=$? + +if [ $ghost_result -ne 0 ]; then + echo "ERROR: gh-ost should have succeeded but did not. ($ghost_result)" + return 1 +fi + +# validate on-success hook was called; fail the test if its marker file is missing +if [ ! -f /tmp/gh-ost-hooks/on-success ]; then + echo "ERROR: on-success hook was not called when it should have been." + return 1 +fi +echo -e "\n\n\n\n\n" diff --git a/localtests/move-tables/resume-panic-on-row-copy/create.sql b/localtests/move-tables/resume-panic-on-row-copy/create.sql new file mode 100644 index 000000000..46e919003 --- /dev/null +++ b/localtests/move-tables/resume-panic-on-row-copy/create.sql @@ -0,0 +1,34 @@ +drop table if exists gh_ost_test; +create table gh_ost_test ( + id bigint(20) NOT NULL AUTO_INCREMENT, + column1 int(11) NOT NULL, + column2 smallint(5) unsigned NOT NULL, + column3 mediumint(8) unsigned NOT NULL, + column4 tinyint(3) unsigned NOT NULL, + column5 int(11) NOT NULL, + column6 int(11) NOT NULL, + PRIMARY KEY (id), + KEY c12_ix (column1, column2) +) auto_increment=1; + +insert into gh_ost_test values + (NULL, 1001, 100, 500000, 10, 1700000001, 1700000002), + (NULL, 1002, 200, 600000, 20, 1700000003, 1700000004), + (NULL, 1003, 300, 700000, 30, 1700000005, 1700000006), + (NULL, 1004, 400, 800000, 40, 1700000007, 1700000008), + (NULL, 1005, 500, 900000, 50, 1700000009, 1700000010), + (NULL, 1006, 600, 1000000, 60, 1700000011, 1700000012), + (NULL, 1007, 700, 1100000, 70, 1700000013, 1700000014), + (NULL, 1008, 800, 1200000, 80, 1700000015, 1700000016), + (NULL, 1009, 900, 1300000, 90, 1700000017, 1700000018), + (NULL, 1010, 1000, 1400000, 100, 1700000019, 1700000020), + (NULL, 1011, 1100, 1500000, 110, 1700000021, 
1700000022), + (NULL, 1012, 1200, 1600000, 120, 1700000023, 1700000024), + (NULL, 1013, 1300, 1700000, 130, 1700000025, 1700000026), + (NULL, 1014, 1400, 1800000, 140, 1700000027, 1700000028), + (NULL, 1015, 1500, 1900000, 150, 1700000029, 1700000030), + (NULL, 1016, 1600, 2000000, 160, 1700000031, 1700000032), + (NULL, 1017, 1700, 2100000, 170, 1700000033, 1700000034), + (NULL, 1018, 1800, 2200000, 180, 1700000035, 1700000036), + (NULL, 1019, 1900, 2300000, 190, 1700000037, 1700000038), + (NULL, 1020, 2000, 2400000, 200, 1700000039, 1700000040); \ No newline at end of file diff --git a/localtests/move-tables/resume-panic-on-row-copy/tables.txt b/localtests/move-tables/resume-panic-on-row-copy/tables.txt new file mode 100644 index 000000000..11fc5eef8 --- /dev/null +++ b/localtests/move-tables/resume-panic-on-row-copy/tables.txt @@ -0,0 +1 @@ +gh_ost_test diff --git a/localtests/move-tables/resume-panic-on-row-copy/test.sh b/localtests/move-tables/resume-panic-on-row-copy/test.sh new file mode 100644 index 000000000..03df29fca --- /dev/null +++ b/localtests/move-tables/resume-panic-on-row-copy/test.sh @@ -0,0 +1,117 @@ + +#!/bin/bash +# Custom test: +# - panic during row copy stage, prior to cutover +# - resume and complete the migration + +database=test +table_name=gh_ost_test + +# Build gh-ost command from scratch using framework function (required to inject failpoints) +rm $ghost_binary +build_binary + +###################################################################################################### +### Run #1: Should panic after first row copy and migration will not complete +###################################################################################################### + +echo "⚙️ Starting migration with failpoint (run #1)..." 
+ +# Build the gh-ost command using the framework function +GO_FAILPOINTS="github.com/github/gh-ost/go/base/move-tables-panic-after-row-copy=return(true)" build_ghost_command + +# Run the gh-ost command, expecting panic on the failpoint the first time +echo_dot +echo > $test_logfile +bash -c "$cmd" >>$test_logfile 2>&1 +ghost_result=$? + +if [ $ghost_result -eq 0 ]; then + echo "ERROR: gh-ost should have failed but did not." + return 1 +fi + +echo -e "\n\n\n\n\n" + +###################################################################################################### +### Intermediate validation +###################################################################################################### + +echo "⚙️ Validating checkpointed state on unexpected exit..." + +# checkpoint table exists on target and is non-empty +mysql-exec target primary $database -sNe "SELECT 1 FROM _${table_name}_ghk LIMIT 1;" +if [ $? -gt 0 ]; then + echo "ERROR: Checkpoint table is empty or does not exist." + return 1 +fi + +# original table still exists on source +mysql-exec source replica $database -sNe "SELECT 1 FROM ${table_name} LIMIT 1;" +if [ $? -gt 0 ]; then + echo "ERROR: Table '${table_name}' does not exist on the source cluster." + return 1 +fi + +# original table exists on the target +mysql-exec target replica $database -sNe "SELECT 1 FROM ${table_name} LIMIT 1;" +if [ $? -gt 0 ]; then + echo "ERROR: Table '${table_name}' does not exist on the target cluster." + return 1 +fi + +# validate we processed a single row-copy chunk (10 rows) and there are 20 total to process +rows_copied=$(mysql-exec target primary $database -Ne "SELECT gh_ost_rows_copied FROM _${table_name}_ghk ORDER BY gh_ost_chk_id DESC LIMIT 1;") +if ! [ "$rows_copied" -eq 10 ]; then # an empty or non-numeric result also fails the check + echo "ERROR: Expected last checkpoint to show 10 rows copied." + return 1 +fi + +echo "✅ Validating checkpointed state on unexpected exit..." 
+ +echo -e "\n\n\n\n\n" + +###################################################################################################### +### Run #2: Resume and complete the migration +###################################################################################################### + +echo "⚙️ Resuming migration (run #2)..." + +# resume migration +build_ghost_command +cmd="$cmd --resume" + +# queue up removal of the postpone cutover flag, otherwise gh-ost hangs on the cutover +( + sleep 2; + echo "⏩ Sending unpostpone cutover" + rm $postpone_cutover_flag_file &> /dev/null; +) & + +bash -c "$cmd" >>$test_logfile 2>&1 +ghost_result=$? + +if [ $ghost_result -ne 0 ]; then + echo "ERROR: gh-ost should have succeeded but did not. ($ghost_result)" + return 1 +fi + +echo -e "\n\n\n\n\n" + +###################################################################################################### +### post-migration validation +###################################################################################################### + +echo "⚙️ Validating checkpointed state after resumed migration..." + +# validate we processed the rest of the 20 rows to copy +rows_copied=$(mysql-exec target primary $database -Ne "SELECT gh_ost_rows_copied FROM _${table_name}_ghk ORDER BY gh_ost_chk_id DESC LIMIT 1;") +if ! [ "$rows_copied" -eq 20 ]; then # an empty or non-numeric result also fails the check + echo "ERROR: Expected last checkpoint to show 20 rows copied." + return 1 +fi + +echo "✅ Validating checkpointed state on resumed migration." + +echo -e "\n\n\n\n\n" + diff --git a/localtests/move-tables/single-concurrent-writes/on_test.sh b/localtests/move-tables/single-concurrent-writes/on_test.sh index 71120eb95..46849a09b 100755 --- a/localtests/move-tables/single-concurrent-writes/on_test.sh +++ b/localtests/move-tables/single-concurrent-writes/on_test.sh @@ -2,5 +2,5 @@ # insert data into the source primary, starting at ID 100 in batches of 10. 
kill # the process after 5 seconds -DATABASE=test script/move-tables/insert-source-primary-loop 100 0.1 10 & +DATABASE=test script/move-tables/insert-source-primary-loop 100 0.01 100 & sleep 5 && kill $! diff --git a/localtests/move-tables/single-with-hooks/create.sql b/localtests/move-tables/single-with-hooks/create.sql new file mode 100644 index 000000000..46e919003 --- /dev/null +++ b/localtests/move-tables/single-with-hooks/create.sql @@ -0,0 +1,34 @@ +drop table if exists gh_ost_test; +create table gh_ost_test ( + id bigint(20) NOT NULL AUTO_INCREMENT, + column1 int(11) NOT NULL, + column2 smallint(5) unsigned NOT NULL, + column3 mediumint(8) unsigned NOT NULL, + column4 tinyint(3) unsigned NOT NULL, + column5 int(11) NOT NULL, + column6 int(11) NOT NULL, + PRIMARY KEY (id), + KEY c12_ix (column1, column2) +) auto_increment=1; + +insert into gh_ost_test values + (NULL, 1001, 100, 500000, 10, 1700000001, 1700000002), + (NULL, 1002, 200, 600000, 20, 1700000003, 1700000004), + (NULL, 1003, 300, 700000, 30, 1700000005, 1700000006), + (NULL, 1004, 400, 800000, 40, 1700000007, 1700000008), + (NULL, 1005, 500, 900000, 50, 1700000009, 1700000010), + (NULL, 1006, 600, 1000000, 60, 1700000011, 1700000012), + (NULL, 1007, 700, 1100000, 70, 1700000013, 1700000014), + (NULL, 1008, 800, 1200000, 80, 1700000015, 1700000016), + (NULL, 1009, 900, 1300000, 90, 1700000017, 1700000018), + (NULL, 1010, 1000, 1400000, 100, 1700000019, 1700000020), + (NULL, 1011, 1100, 1500000, 110, 1700000021, 1700000022), + (NULL, 1012, 1200, 1600000, 120, 1700000023, 1700000024), + (NULL, 1013, 1300, 1700000, 130, 1700000025, 1700000026), + (NULL, 1014, 1400, 1800000, 140, 1700000027, 1700000028), + (NULL, 1015, 1500, 1900000, 150, 1700000029, 1700000030), + (NULL, 1016, 1600, 2000000, 160, 1700000031, 1700000032), + (NULL, 1017, 1700, 2100000, 170, 1700000033, 1700000034), + (NULL, 1018, 1800, 2200000, 180, 1700000035, 1700000036), + (NULL, 1019, 1900, 2300000, 190, 1700000037, 1700000038), + 
(NULL, 1020, 2000, 2400000, 200, 1700000039, 1700000040); \ No newline at end of file diff --git a/localtests/move-tables/single-with-hooks/hooks/gh-ost-on-before-cut-over b/localtests/move-tables/single-with-hooks/hooks/gh-ost-on-before-cut-over new file mode 100755 index 000000000..5c7a5b872 --- /dev/null +++ b/localtests/move-tables/single-with-hooks/hooks/gh-ost-on-before-cut-over @@ -0,0 +1,14 @@ +#!/bin/bash + +repo_root=$(git rev-parse --show-toplevel) +source $repo_root/localtests/move-tables/single-with-hooks/hooks/util.sh + +# dump environment variables on dirty exit +trap '[[ $? -eq 0 ]] || dump_env' EXIT + +set -e + +assert_common_envs + +# touch file to mark completion of on-before-cut-over hook +touch /tmp/gh-ost-hooks/on-before-cut-over diff --git a/localtests/move-tables/single-with-hooks/hooks/gh-ost-on-row-copy-complete b/localtests/move-tables/single-with-hooks/hooks/gh-ost-on-row-copy-complete new file mode 100755 index 000000000..0b2ea7d4c --- /dev/null +++ b/localtests/move-tables/single-with-hooks/hooks/gh-ost-on-row-copy-complete @@ -0,0 +1,14 @@ +#!/bin/bash + +repo_root=$(git rev-parse --show-toplevel) +source $repo_root/localtests/move-tables/single-with-hooks/hooks/util.sh + +# dump environment variables on dirty exit +trap '[[ $? -eq 0 ]] || dump_env' EXIT + +set -e + +assert_common_envs + +# touch file to mark completion of on-row-copy-complete hook +touch /tmp/gh-ost-hooks/on-row-copy-complete \ No newline at end of file diff --git a/localtests/move-tables/single-with-hooks/hooks/gh-ost-on-success b/localtests/move-tables/single-with-hooks/hooks/gh-ost-on-success new file mode 100755 index 000000000..2397c8aaa --- /dev/null +++ b/localtests/move-tables/single-with-hooks/hooks/gh-ost-on-success @@ -0,0 +1,16 @@ +#!/bin/bash + +repo_root=$(git rev-parse --show-toplevel) +source $repo_root/localtests/move-tables/single-with-hooks/hooks/util.sh + +# dump environment variables on dirty exit +trap '[[ $? 
-eq 0 ]] || dump_env' EXIT + +set -e + +assert_common_envs + +assert_env_present "GH_OST_DRAIN_GTID" + +# touch file to mark completion of on-success hook +touch /tmp/gh-ost-hooks/on-success \ No newline at end of file diff --git a/localtests/move-tables/single-with-hooks/hooks/util.sh b/localtests/move-tables/single-with-hooks/hooks/util.sh new file mode 100755 index 000000000..d5820acfc --- /dev/null +++ b/localtests/move-tables/single-with-hooks/hooks/util.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +assert_env_equal() { + env_name=$1 + expected=$2 + + if [ "${!env_name}" != "${expected}" ]; then + echo "ERROR: Expected '${expected}' for ${env_name}, but got '${!env_name}'" + exit 1 + fi +} + +assert_env_present() { + env_name=$1 + + echo "checking '${env_name}=${!env_name}'" + if [[ -z "${!env_name}" ]]; then + echo "ERROR: Expected '${env_name}' to be set but not present" + exit 1 + fi +} + +assert_common_envs() { + assert_env_present "GH_OST_TARGET_HOST" + + assert_env_equal "GH_OST_TARGET_DATABASE_NAME" "test" + assert_env_equal "GH_OST_TABLE_NAME" "gh_ost_test" + assert_env_equal "GH_OST_OLD_TABLE_NAME" "_gh_ost_test_del" + assert_env_equal "GH_OST_TARGET_TABLE_NAME" "gh_ost_test" + assert_env_equal "GH_OST_MOVE_TABLES" "true" + assert_env_equal "GH_OST_REVERT" "false" +} + +dump_env() { + echo "-----------------------------------------------------" + echo "----------------- ENVIRONS --------------------------" + echo "-----------------------------------------------------" + env | grep "GH_OST_" + echo "-----------------------------------------------------" + echo "-----------------------------------------------------" + echo "-----------------------------------------------------" +} \ No newline at end of file diff --git a/localtests/move-tables/single-with-hooks/tables.txt b/localtests/move-tables/single-with-hooks/tables.txt new file mode 100644 index 000000000..11fc5eef8 --- /dev/null +++ b/localtests/move-tables/single-with-hooks/tables.txt @@ -0,0 +1 @@
+gh_ost_test diff --git a/localtests/move-tables/single-with-hooks/test.sh b/localtests/move-tables/single-with-hooks/test.sh new file mode 100644 index 000000000..73fbb7771 --- /dev/null +++ b/localtests/move-tables/single-with-hooks/test.sh @@ -0,0 +1,67 @@ + +#!/bin/bash +# Custom test: +# Executes migration with custom hooks (on-row-copy-complete, on-before-cut-over, on-success) +# which are executed at different stages of the migration and validate the environment variables +# expected to be available to the respective hooks. + +database=test +table_name=gh_ost_test + +# Build gh-ost command from scratch using framework function +build_binary + +###################################################################################################### +### Run gh-ost with custom hooks enabled +###################################################################################################### + +echo "⚙️ Running gh-ost with custom hooks..." + +# ensure hook files are executable +chmod +x $tests_path/$test_name/hooks/* + +# clean up any existing test hook files +rm -rf /tmp/gh-ost-hooks/ +mkdir -p /tmp/gh-ost-hooks/ + +# Build the gh-ost command using the framework function +build_ghost_command +cmd="$cmd --hooks-path=$tests_path/$test_name/hooks" + +# queue up removal of the postpone cutover flag, otherwise gh-ost hangs on the cutover +( + sleep 2; + echo "⏩ Sending unpostpone cutover" + rm $postpone_cutover_flag_file &> /dev/null; +) & + +# Run the gh-ost command +echo_dot +echo > $test_logfile +bash -c "$cmd" >>$test_logfile 2>&1 +ghost_result=$? + +if [ $ghost_result -ne 0 ]; then + echo "ERROR: gh-ost failed unexpectedly." + return 1 +fi + +echo "✅ gh-ost move-tables succeeded!"
+ +echo -e "\n\n\n\n\n" + + +###################################################################################################### +### Validate hook status +###################################################################################################### + +echo "⚙️ Validating hook status after execution..." + +for expected in on-row-copy-complete on-before-cut-over on-success; do + if [ ! -f "/tmp/gh-ost-hooks/$expected" ]; then + echo "ERROR: Expected test hook file '/tmp/gh-ost-hooks/$expected' was not found." + return 1 + fi +done + +echo "✅ Hook status validated successfully." diff --git a/vendor/github.com/pingcap/failpoint/.codecov.yml b/vendor/github.com/pingcap/failpoint/.codecov.yml new file mode 100644 index 000000000..402988545 --- /dev/null +++ b/vendor/github.com/pingcap/failpoint/.codecov.yml @@ -0,0 +1,39 @@ +codecov: + notify: + require_ci_to_pass: yes + +coverage: + precision: 4 + round: down + range: "65...90" + + status: + project: + default: + threshold: 20 #Allow the coverage to drop by threshold%, and posting a success status. 
+ patch: + default: + target: 0% # trial operation + changes: no + +parsers: + gcov: + branch_detection: + conditional: yes + loop: yes + method: no + macro: no + +comment: + layout: "header, diff" + behavior: default + require_changes: no + +ignore: + - "LICENSES" + - "*_test.go" + - "marker.go" # This file only contains empty function stub + - "failpoint-ctl" # Ignore the `failpoint-ctl` command line tool + - ".git" + - "*.yml" + - "*.md" diff --git a/vendor/github.com/pingcap/failpoint/.gitignore b/vendor/github.com/pingcap/failpoint/.gitignore new file mode 100644 index 000000000..b1e5133d6 --- /dev/null +++ b/vendor/github.com/pingcap/failpoint/.gitignore @@ -0,0 +1,27 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, build with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +bin +coverage.out +.idea/ +*.iml +*.swp +*.txt +*.log +tags +profile.coverprofile +overalls.coverprofile +explain_test +*.fail.go +vendor +.DS_Store diff --git a/vendor/github.com/pingcap/failpoint/CONTRIBUTING.md b/vendor/github.com/pingcap/failpoint/CONTRIBUTING.md new file mode 100644 index 000000000..cfefb0ed5 --- /dev/null +++ b/vendor/github.com/pingcap/failpoint/CONTRIBUTING.md @@ -0,0 +1,94 @@ +# How to contribute + +This document outlines some of the conventions on development workflow, commit +message formatting, contact points and other resources to make it easier to get +your contribution accepted. + +## Getting started + +- Fork the repository on GitHub. +- Read the README.md for build instructions. +- Play with the project, submit bugs, submit patches! + +## Building Failpoint + +Developing Failpoint requires: + +* [Go 1.13](http://golang.org/doc/code.html) +* An internet connection to download the dependencies + +Simply run `make` to build the program. + +```sh +make +``` + +### Running tests + +This project contains unit tests and integration tests with coverage collection. 
+See [tests/README.md](./tests/README.md) for how to execute and add tests. + +### Updating dependencies + +Failpoint manages dependencies using [Go module](https://github.com/golang/go/wiki/Modules). +To add or update a dependency, either + +* Use the `go mod edit` command to change the dependency, or +* Edit `go.mod` and then run `make update` to update the checksum. + +## Contribution flow + +This is a rough outline of what a contributor's workflow looks like: + +- Create a topic branch from where you want to base your work. This is usually `master`. +- Make commits of logical units and add test case if the change fixes a bug or adds new functionality. +- Run tests and make sure all the tests are passed. +- Make sure your commit messages are in the proper format (see below). +- Push your changes to a topic branch in your fork of the repository. +- Submit a pull request. +- Your PR must receive LGTMs from two maintainers. + +Thanks for your contributions! + +### Code style + +The coding style suggested by the Golang community is used in `failpoint`. +See the [style doc](https://github.com/golang/go/wiki/CodeReviewComments) for details. + +Please follow this style to makeg `failpoint` easy to review, maintain and develop. + +### Format of the Commit Message + +We follow a rough convention for commit messages that is designed to answer two +questions: what changed and why. The subject line should feature the what and +the body of the commit should describe the why. + +``` +restore: add comment for variable declaration + +Improve documentation. +``` + +The format can be described more formally as follows: + +``` +: + + + +