Use commit_ordered() in XtraDB, to avoid the prepare_commit_mutex that breaks group commit. --- storage/xtradb/handler/ha_innodb.cc | 212 +++++++++++++++++------------------- 1 file changed, 102 insertions(+), 110 deletions(-) Index: work-5.1-groupcommit/storage/xtradb/handler/ha_innodb.cc =================================================================== --- work-5.1-groupcommit.orig/storage/xtradb/handler/ha_innodb.cc 2010-08-12 12:43:20.000000000 +0200 +++ work-5.1-groupcommit/storage/xtradb/handler/ha_innodb.cc 2010-08-12 12:43:34.000000000 +0200 @@ -138,8 +138,6 @@ bool check_global_access(THD *thd, ulong /** to protect innobase_open_files */ static pthread_mutex_t innobase_share_mutex; -/** to force correct commit order in binlog */ -static pthread_mutex_t prepare_commit_mutex; static ulong commit_threads = 0; static pthread_mutex_t commit_threads_m; static pthread_cond_t commit_cond; @@ -239,6 +237,7 @@ static const char* innobase_change_buffe static INNOBASE_SHARE *get_share(const char *table_name); static void free_share(INNOBASE_SHARE *share); static int innobase_close_connection(handlerton *hton, THD* thd); +static void innobase_commit_ordered(handlerton *hton, THD* thd, bool all); static int innobase_commit(handlerton *hton, THD* thd, bool all); static int innobase_rollback(handlerton *hton, THD* thd, bool all); static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd, @@ -1356,7 +1355,6 @@ innobase_trx_init( trx_t* trx) /*!< in/out: InnoDB transaction handle */ { DBUG_ENTER("innobase_trx_init"); - DBUG_ASSERT(EQ_CURRENT_THD(thd)); DBUG_ASSERT(thd == trx->mysql_thd); trx->check_foreigns = !thd_test_options( @@ -1416,8 +1414,6 @@ check_trx_exists( { trx_t*& trx = thd_to_trx(thd); - ut_ad(EQ_CURRENT_THD(thd)); - if (trx == NULL) { trx = innobase_trx_allocate(thd); } else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) { @@ -2024,6 +2020,7 @@ innobase_init( innobase_hton->savepoint_set=innobase_savepoint; innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint; innobase_hton->savepoint_release=innobase_release_savepoint; + innobase_hton->commit_ordered=innobase_commit_ordered; innobase_hton->commit=innobase_commit; innobase_hton->rollback=innobase_rollback; innobase_hton->prepare=innobase_xa_prepare; @@ -2492,7 +2489,6 @@ skip_overwrite: innobase_open_tables = hash_create(200); pthread_mutex_init(&innobase_share_mutex, MY_MUTEX_INIT_FAST); - pthread_mutex_init(&prepare_commit_mutex, MY_MUTEX_INIT_FAST); pthread_mutex_init(&commit_threads_m, MY_MUTEX_INIT_FAST); pthread_mutex_init(&commit_cond_m, MY_MUTEX_INIT_FAST); pthread_mutex_init(&analyze_mutex, MY_MUTEX_INIT_FAST); @@ -2547,7 +2543,6 @@ innobase_end( my_free(internal_innobase_data_file_path, MYF(MY_ALLOW_ZERO_PTR)); pthread_mutex_destroy(&innobase_share_mutex); - pthread_mutex_destroy(&prepare_commit_mutex); pthread_mutex_destroy(&commit_threads_m); pthread_mutex_destroy(&commit_cond_m); pthread_mutex_destroy(&analyze_mutex); @@ -2681,6 +2676,101 @@ innobase_start_trx_and_assign_read_view( } /*****************************************************************//** +Perform the first, fast part of InnoDB commit. + +Doing it in this call ensures that we get the same commit order here +as in binlog and any other participating transactional storage engines. + +Note that we want to do as little as really needed here, as we run +under a global mutex. The expensive fsync() is done later, in +innobase_commit(), without a lock so group commit can take place. + +Note also that this method can be called from a different thread than +the one handling the rest of the transaction. */ +static +void +innobase_commit_ordered( +/*============*/ + handlerton *hton, /*!< in: Innodb handlerton */ + THD* thd, /*!< in: MySQL thread handle of the user for whom + the transaction should be committed */ + bool all) /*!< in: TRUE - commit transaction + FALSE - the current SQL statement ended */ +{ + trx_t* trx; + DBUG_ENTER("innobase_commit_ordered"); + DBUG_ASSERT(hton == innodb_hton_ptr); + + trx = check_trx_exists(thd); + + if (trx->active_trans == 0 + && trx->conc_state != TRX_NOT_STARTED) { + /* We throw an error here; instead we will catch this error + again in innobase_commit() and report it from there. */ + DBUG_VOID_RETURN; + } + /* Since we will reserve the kernel mutex, we have to release + the search system latch first to obey the latching order. */ + + if (trx->has_search_latch) { + trx_search_latch_release_if_reserved(trx); + } + + /* commit_ordered is only called when committing the whole transaction + (or an SQL statement when autocommit is on). */ + DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))); + + /* We need current binlog position for ibbackup to work. + Note, the position is current because commit_ordered is guaranteed + to be called in same sequenece as writing to binlog. */ + +retry: + if (innobase_commit_concurrency > 0) { + pthread_mutex_lock(&commit_cond_m); + commit_threads++; + + if (commit_threads > innobase_commit_concurrency) { + commit_threads--; + pthread_cond_wait(&commit_cond, + &commit_cond_m); + pthread_mutex_unlock(&commit_cond_m); + goto retry; + } + else { + pthread_mutex_unlock(&commit_cond_m); + } + } + + /* The following calls to read the MySQL binary log + file name and the position return consistent results: + 1) We use commit_ordered() to get same commit order + in InnoDB as in binary log. + 2) A MySQL log file rotation cannot happen because + MySQL protects against this by having a counter of + transactions in prepared state and it only allows + a rotation when the counter drops to zero. See + LOCK_prep_xids and COND_prep_xids in log.cc. */ + trx->mysql_log_file_name = mysql_bin_log_file_name(); + trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos(); + + /* Don't do write + flush right now. For group commit + to work we want to do the flush in the innobase_commit() + method, which runs without holding any locks. */ + trx->flush_log_later = TRUE; + innobase_commit_low(trx); + trx->flush_log_later = FALSE; + + if (innobase_commit_concurrency > 0) { + pthread_mutex_lock(&commit_cond_m); + commit_threads--; + pthread_cond_signal(&commit_cond); + pthread_mutex_unlock(&commit_cond_m); + } + + DBUG_VOID_RETURN; +} + +/*****************************************************************//** Commits a transaction in an InnoDB database or marks an SQL statement ended. @return 0 */ @@ -2702,13 +2792,6 @@ innobase_commit( trx = check_trx_exists(thd); - /* Since we will reserve the kernel mutex, we have to release - the search system latch first to obey the latching order. */ - - if (trx->has_search_latch) { - trx_search_latch_release_if_reserved(trx); - } - /* The flag trx->active_trans is set to 1 in 1. ::external_lock(), @@ -2736,62 +2819,8 @@ innobase_commit( /* We were instructed to commit the whole transaction, or this is an SQL statement end and autocommit is on */ - /* We need current binlog position for ibbackup to work. - Note, the position is current because of - prepare_commit_mutex */ -retry: - if (innobase_commit_concurrency > 0) { - pthread_mutex_lock(&commit_cond_m); - commit_threads++; - - if (commit_threads > innobase_commit_concurrency) { - commit_threads--; - pthread_cond_wait(&commit_cond, - &commit_cond_m); - pthread_mutex_unlock(&commit_cond_m); - goto retry; - } - else { - pthread_mutex_unlock(&commit_cond_m); - } - } - - /* The following calls to read the MySQL binary log - file name and the position return consistent results: - 1) Other InnoDB transactions cannot intervene between - these calls as we are holding prepare_commit_mutex. - 2) Binary logging of other engines is not relevant - to InnoDB as all InnoDB requires is that committing - InnoDB transactions appear in the same order in the - MySQL binary log as they appear in InnoDB logs. - 3) A MySQL log file rotation cannot happen because - MySQL protects against this by having a counter of - transactions in prepared state and it only allows - a rotation when the counter drops to zero. See - LOCK_prep_xids and COND_prep_xids in log.cc. */ - trx->mysql_log_file_name = mysql_bin_log_file_name(); - trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos(); - - /* Don't do write + flush right now. For group commit - to work we want to do the flush after releasing the - prepare_commit_mutex. */ - trx->flush_log_later = TRUE; - innobase_commit_low(trx); - trx->flush_log_later = FALSE; - - if (innobase_commit_concurrency > 0) { - pthread_mutex_lock(&commit_cond_m); - commit_threads--; - pthread_cond_signal(&commit_cond); - pthread_mutex_unlock(&commit_cond_m); - } - - if (trx->active_trans == 2) { - - pthread_mutex_unlock(&prepare_commit_mutex); - } - - /* Now do a write + flush of logs. */ + /* We did the first part already in innobase_commit_ordered(), + Now finish by doing a write + flush of logs. */ trx_commit_complete_for_mysql(trx); trx->active_trans = 0; @@ -2866,10 +2895,6 @@ innobase_rollback( || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { error = trx_rollback_for_mysql(trx); - if (trx->active_trans == 2) { - - pthread_mutex_unlock(&prepare_commit_mutex); - } trx->active_trans = 0; } else { error = trx_rollback_last_sql_stat_for_mysql(trx); @@ -4625,6 +4650,7 @@ no_commit: no need to re-acquire locks on it. */ /* Altering to InnoDB format */ + innobase_commit_ordered(ht, user_thd, 1); innobase_commit(ht, user_thd, 1); /* Note that this transaction is still active. */ prebuilt->trx->active_trans = 1; @@ -4641,6 +4667,7 @@ no_commit: /* Commit the transaction. This will release the table locks, so they have to be acquired again. */ + innobase_commit_ordered(ht, user_thd, 1); innobase_commit(ht, user_thd, 1); /* Note that this transaction is still active. */ prebuilt->trx->active_trans = 1; @@ -8343,6 +8370,7 @@ ha_innobase::external_lock( if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { if (trx->active_trans != 0) { + innobase_commit_ordered(ht, thd, TRUE); innobase_commit(ht, thd, TRUE); } } else { @@ -9452,36 +9480,6 @@ innobase_xa_prepare( srv_active_wake_master_thread(); - if (thd_sql_command(thd) != SQLCOM_XA_PREPARE && - (all || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) - { - if (srv_enable_unsafe_group_commit && !THDVAR(thd, support_xa)) { - /* choose group commit rather than binlog order */ - return(error); - } - - /* For ibbackup to work the order of transactions in binlog - and InnoDB must be the same. Consider the situation - - thread1> prepare; write to binlog; ... - - thread2> prepare; write to binlog; commit - thread1> ... commit - - To ensure this will not happen we're taking the mutex on - prepare, and releasing it on commit. - - Note: only do it for normal commits, done via ha_commit_trans. - If 2pc protocol is executed by external transaction - coordinator, it will be just a regular MySQL client - executing XA PREPARE and XA COMMIT commands. - In this case we cannot know how many minutes or hours - will be between XA PREPARE and XA COMMIT, and we don't want - to block for undefined period of time. */ - pthread_mutex_lock(&prepare_commit_mutex); - trx->active_trans = 2; - } - return(error); } @@ -10673,11 +10671,6 @@ static MYSQL_SYSVAR_ENUM(adaptive_checkp "Enable/Disable flushing along modified age. (none, reflex, [estimate])", NULL, innodb_adaptive_checkpoint_update, 2, &adaptive_checkpoint_typelib); -static MYSQL_SYSVAR_ULONG(enable_unsafe_group_commit, srv_enable_unsafe_group_commit, - PLUGIN_VAR_RQCMDARG, - "Enable/Disable unsafe group commit when support_xa=OFF and use with binlog or other XA storage engine.", - NULL, NULL, 0, 0, 1, 0); - static MYSQL_SYSVAR_ULONG(expand_import, srv_expand_import, PLUGIN_VAR_RQCMDARG, "Enable/Disable converting automatically *.ibd files when import tablespace.", @@ -10767,7 +10760,6 @@ static struct st_mysql_sys_var* innobase MYSQL_SYSVAR(flush_neighbor_pages), MYSQL_SYSVAR(read_ahead), MYSQL_SYSVAR(adaptive_checkpoint), - MYSQL_SYSVAR(enable_unsafe_group_commit), MYSQL_SYSVAR(expand_import), MYSQL_SYSVAR(extra_rsegments), MYSQL_SYSVAR(dict_size_limit),